Update Linux to v5.4.2
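
Uprev kernel sources to the upstream v5.4.2 stable release. Notable
fs/ changes visible in this diff:

- treewide SPDX license identifiers replace the GPL boilerplate
  comments in fs/9p and fs/adfs
- 9p: add the "locktimeout" mount option and a per-session lock
  timeout for blocking locks
- 9p: serialize i_size updates through v9fs_i_size_write() and pass
  V9FS_STAT2INODE_KEEP_ISIZE to v9fs_stat2inode*() on cached mounts
- 9p: convert ->destroy_inode to ->free_inode
- 9p: drop ITER_BVEC/ITER_KVEC from iov_iter direction arguments
- adfs: track objects by indirect disc address (indaddr) and derive
  the RISC OS filetype from the load address
- fs: remove exofs; hook up erofs, fs/verity, io_uring and the new
  mount API objects in fs/Kconfig and fs/Makefile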

Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 11045d8..ac2ec45 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000)"
 	depends on INET && NET_9P
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227..6261719 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@
 	switch (handler->flags) {
 	case ACL_TYPE_ACCESS:
 		if (acl) {
-			struct iattr iattr;
+			struct iattr iattr = { 0 };
 			struct posix_acl *old_acl = acl;
 
 			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 9eb3470..eb2151f 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -1,23 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * V9FS cache definitions.
  *
  *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/jiffies.h>
@@ -66,6 +51,8 @@
 	if (!v9ses->cachetag) {
 		if (v9fs_random_cachetag(v9ses) < 0) {
 			v9ses->fscache = NULL;
+			kfree(v9ses->cachetag);
+			v9ses->cachetag = NULL;
 			return;
 		}
 	}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 247e47e..00f107a 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * V9FS cache definitions.
  *
  *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #ifndef _9P_CACHE_H
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index a9ef46f..3d681a2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -1,24 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * V9FS FID Management
  *
  *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
  *  Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 4491bca..928b109 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * V9FS FID Management
  *
  *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 #ifndef FS_9P_FID_H
 #define FS_9P_FID_H
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d..15a99f9 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/v9fs.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -61,6 +46,8 @@
 	Opt_cache_loose, Opt_fscache, Opt_mmap,
 	/* Access options */
 	Opt_access, Opt_posixacl,
+	/* Lock timeout option */
+	Opt_locktimeout,
 	/* Error token */
 	Opt_err
 };
@@ -80,6 +67,7 @@
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_posixacl, "posixacl"},
+	{Opt_locktimeout, "locktimeout=%u"},
 	{Opt_err, NULL}
 };
 
@@ -187,6 +175,7 @@
 #ifdef CONFIG_9P_FSCACHE
 	v9ses->cachetag = NULL;
 #endif
+	v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
 
 	if (!opts)
 		return 0;
@@ -359,6 +348,23 @@
 #endif
 			break;
 
+		case Opt_locktimeout:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			if (option < 1) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "locktimeout must be a greater than zero integer.\n");
+				ret = -EINVAL;
+				continue;
+			}
+			v9ses->session_lock_timeout = (long)option * HZ;
+			break;
+
 		default:
 			continue;
 		}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017..7b76377 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -1,24 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * V9FS definitions.
  *
  *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 #ifndef FS_9P_V9FS_H
 #define FS_9P_V9FS_H
@@ -116,6 +101,7 @@
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct rw_semaphore rename_sem;
+	long session_lock_timeout; /* retry interval for blocking locks */
 };
 
 /* cache_validity flags */
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6d..fd2a2b0 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -1,24 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * V9FS VFS extensions.
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 #ifndef FS_9P_V9FS_VFS_H
 #define FS_9P_V9FS_VFS_H
@@ -40,6 +25,9 @@
  */
 #define P9_LOCK_TIMEOUT (30*HZ)
 
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
@@ -55,14 +43,16 @@
 extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
-void v9fs_destroy_inode(struct inode *inode);
+void v9fs_free_inode(struct inode *inode);
 struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		    struct inode *inode, umode_t mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+		      struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+			   unsigned int flags);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +73,18 @@
 }
 
 int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+	/*
+	 * 32-bit need the lock, concurrent updates could break the
+	 * sequences and make i_size_read() loop forever.
+	 * 64-bit updates are atomic and can skip the locking.
+	 */
+	if (sizeof(i_size) > sizeof(long))
+		spin_lock(&inode->i_lock);
+	i_size_write(inode, i_size);
+	if (sizeof(i_size) > sizeof(long))
+		spin_unlock(&inode->i_lock);
+}
 #endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e1cbdfd..cce9ace 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_addr.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
@@ -50,8 +35,9 @@
  * @page: structure to page
  *
  */
-static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
+static int v9fs_fid_readpage(void *data, struct page *page)
 {
+	struct p9_fid *fid = data;
 	struct inode *inode = page->mapping->host;
 	struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
 	struct iov_iter to;
@@ -65,7 +51,7 @@
 	if (retval == 0)
 		return retval;
 
-	iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);
+	iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
 
 	retval = p9_client_read(fid, page_offset(page), &to, &err);
 	if (err) {
@@ -122,7 +108,8 @@
 	if (ret == 0)
 		return ret;
 
-	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	ret = read_cache_pages(mapping, pages, v9fs_fid_readpage,
+			filp->private_data);
 	p9_debug(P9_DEBUG_VFS, "  = %d\n", ret);
 	return ret;
 }
@@ -175,7 +162,7 @@
 	bvec.bv_page = page;
 	bvec.bv_offset = 0;
 	bvec.bv_len = len;
-	iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
+	iov_iter_bvec(&from, WRITE, &bvec, 1, len);
 
 	/* We should have writeback_fid always set */
 	BUG_ON(!v9inode->writeback_fid);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index bd456c6..7d6f69a 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_dentry.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 48db9a9..674d22b 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/9p/vfs_dir.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
@@ -105,7 +90,6 @@
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
 	struct kvec kvec;
 
@@ -124,7 +108,7 @@
 		if (rdir->tail == rdir->head) {
 			struct iov_iter to;
 			int n;
-			iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
+			iov_iter_kvec(&to, READ, &kvec, 1, buflen);
 			n = p9_client_read(file->private_data, ctx->pos, &to,
 					   &err);
 			if (err)
@@ -138,11 +122,10 @@
 		while (rdir->head < rdir->tail) {
 			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
 					  rdir->tail - rdir->head, &st);
-			if (err) {
+			if (err <= 0) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 				return -EIO;
 			}
-			reclen = st.size+2;
 
 			over = !dir_emit(ctx, st.name, strlen(st.name),
 					 v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -150,8 +133,8 @@
 			if (over)
 				return 0;
 
-			rdir->head += reclen;
-			ctx->pos += reclen;
+			rdir->head += err;
+			ctx->pos += err;
 		}
 	}
 }
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index ab3d5f5..fe7f0bd 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_file.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
@@ -154,6 +139,7 @@
 	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
 	unsigned char fl_type;
+	struct v9fs_session_info *v9ses;
 
 	fid = filp->private_data;
 	BUG_ON(fid == NULL);
@@ -189,6 +175,8 @@
 	if (IS_SETLKW(cmd))
 		flock.flags = P9_LOCK_FLAGS_BLOCK;
 
+	v9ses = v9fs_inode2v9ses(file_inode(filp));
+
 	/*
 	 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
 	 * for lock request, keep on trying
@@ -202,7 +190,8 @@
 			break;
 		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
 			break;
-		if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+		if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+				!= 0)
 			break;
 		/*
 		 * p9_client_lock_dotl overwrites flock.client_id with the
@@ -224,7 +213,7 @@
 		break;
 	default:
 		WARN_ONCE(1, "unknown lock status code: %d\n", status);
-		/* fallthough */
+		/* fall through */
 	case P9_LOCK_ERROR:
 	case P9_LOCK_GRACE:
 		res = -ENOLCK;
@@ -442,7 +431,11 @@
 		i_size = i_size_read(inode);
 		if (iocb->ki_pos > i_size) {
 			inode_add_bytes(inode, iocb->ki_pos - i_size);
-			i_size_write(inode, iocb->ki_pos);
+			/*
+			 * Need to serialize against i_size_write() in
+			 * v9fs_stat2inode()
+			 */
+			v9fs_i_size_write(inode, iocb->ki_pos);
 		}
 		return retval;
 	}
@@ -520,6 +513,7 @@
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
 	if (!v9inode->writeback_fid &&
+	    (vma->vm_flags & VM_SHARED) &&
 	    (vma->vm_flags & VM_WRITE)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -621,6 +615,8 @@
 			(vma->vm_end - vma->vm_start - 1),
 	};
 
+	if (!(vma->vm_flags & VM_SHARED))
+		return;
 
 	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859..b82423a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_inode.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -253,21 +238,15 @@
 }
 
 /**
- * v9fs_destroy_inode - destroy an inode
+ * v9fs_free_inode - destroy an inode
  *
  */
 
-static void v9fs_i_callback(struct rcu_head *head)
+void v9fs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
 }
 
-void v9fs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, v9fs_i_callback);
-}
-
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		    struct inode *inode, umode_t mode, dev_t rdev)
 {
@@ -538,7 +517,7 @@
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode(st, inode, sb);
+	v9fs_stat2inode(st, inode, sb, 0);
 	v9fs_cache_inode_get_cookie(inode);
 	unlock_new_inode(inode);
 	return inode;
@@ -1092,7 +1071,7 @@
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
 	generic_fillattr(d_inode(dentry), stat);
 
 	p9stat_free(st);
@@ -1170,12 +1149,13 @@
  * @stat: Plan 9 metadata (mistat) structure
  * @inode: inode to populate
  * @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
  *
  */
 
 void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
-	struct super_block *sb)
+		 struct super_block *sb, unsigned int flags)
 {
 	umode_t mode;
 	char ext[32];
@@ -1216,10 +1196,11 @@
 	mode = p9mode2perm(v9ses, stat);
 	mode |= inode->i_mode & ~S_IALLUGO;
 	inode->i_mode = mode;
-	i_size_write(inode, stat->length);
 
+	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+		v9fs_i_size_write(inode, stat->length);
 	/* not real number of blocks, but 512 byte ones ... */
-	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+	inode->i_blocks = (stat->length + 512 - 1) >> 9;
 	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 
@@ -1416,9 +1397,9 @@
 {
 	int umode;
 	dev_t rdev;
-	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
+	unsigned int flags;
 
 	v9ses = v9fs_inode2v9ses(inode);
 	st = p9_client_stat(fid);
@@ -1431,16 +1412,13 @@
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		goto out;
 
-	spin_lock(&inode->i_lock);
 	/*
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	i_size = inode->i_size;
-	v9fs_stat2inode(st, inode, inode->i_sb);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		inode->i_size = i_size;
-	spin_unlock(&inode->i_lock);
+	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+		V9FS_STAT2INODE_KEEP_ISIZE : 0;
+	v9fs_stat2inode(st, inode, inode->i_sb, flags);
 out:
 	p9stat_free(st);
 	kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c..60328b2 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_inode_dotl.c
  *
@@ -5,22 +6,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/module.h>
@@ -143,7 +128,7 @@
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode_dotl(st, inode);
+	v9fs_stat2inode_dotl(st, inode, 0);
 	v9fs_cache_inode_get_cookie(inode);
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
@@ -496,7 +481,7 @@
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode_dotl(st, d_inode(dentry));
+	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
 	generic_fillattr(d_inode(dentry), stat);
 	/* Change block size to what the server returned */
 	stat->blksize = st->st_blksize;
@@ -607,11 +592,13 @@
  * v9fs_stat2inode_dotl - populate an inode structure with stat info
  * @stat: stat structure
  * @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
  *
  */
 
 void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+		      unsigned int flags)
 {
 	umode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +618,8 @@
 		mode |= inode->i_mode & ~S_IALLUGO;
 		inode->i_mode = mode;
 
-		i_size_write(inode, stat->st_size);
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+			v9fs_i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
 	} else {
 		if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +649,9 @@
 		}
 		if (stat->st_result_mask & P9_STATS_RDEV)
 			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (stat->st_result_mask & P9_STATS_SIZE)
-			i_size_write(inode, stat->st_size);
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+		    stat->st_result_mask & P9_STATS_SIZE)
+			v9fs_i_size_write(inode, stat->st_size);
 		if (stat->st_result_mask & P9_STATS_BLOCKS)
 			inode->i_blocks = stat->st_blocks;
 	}
@@ -928,9 +917,9 @@
 
 int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 {
-	loff_t i_size;
 	struct p9_stat_dotl *st;
 	struct v9fs_session_info *v9ses;
+	unsigned int flags;
 
 	v9ses = v9fs_inode2v9ses(inode);
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +931,13 @@
 	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
 		goto out;
 
-	spin_lock(&inode->i_lock);
 	/*
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	i_size = inode->i_size;
-	v9fs_stat2inode_dotl(st, inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		inode->i_size = i_size;
-	spin_unlock(&inode->i_lock);
+	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+		V9FS_STAT2INODE_KEEP_ISIZE : 0;
+	v9fs_stat2inode_dotl(st, inode, flags);
 out:
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce504..74df32b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/9p/vfs_super.c
  *
@@ -6,22 +7,6 @@
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #include <linux/kernel.h>
@@ -73,7 +58,7 @@
 
 static int
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags, void *data)
+		int flags)
 {
 	int ret;
 
@@ -84,15 +69,19 @@
 	if (v9fs_proto_dotl(v9ses)) {
 		sb->s_op = &v9fs_super_ops_dotl;
 		sb->s_xattr = v9fs_xattr_handlers;
-	} else
+	} else {
 		sb->s_op = &v9fs_super_ops;
+		sb->s_time_max = U32_MAX;
+	}
+
+	sb->s_time_min = 0;
 
 	ret = super_setup_bdi(sb);
 	if (ret)
 		return ret;
 
 	if (v9ses->cache)
-		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+		sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 
 	sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
 	if (!v9ses->cache)
@@ -143,7 +132,7 @@
 		retval = PTR_ERR(sb);
 		goto clunk_fid;
 	}
-	retval = v9fs_fill_super(sb, v9ses, flags, data);
+	retval = v9fs_fill_super(sb, v9ses, flags);
 	if (retval)
 		goto release_sb;
 
@@ -172,7 +161,7 @@
 			goto release_sb;
 		}
 		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode_dotl(st, d_inode(root));
+		v9fs_stat2inode_dotl(st, d_inode(root), 0);
 		kfree(st);
 	} else {
 		struct p9_wstat *st = NULL;
@@ -183,7 +172,7 @@
 		}
 
 		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode(st, d_inode(root), sb);
+		v9fs_stat2inode(st, d_inode(root), sb, 0);
 
 		p9stat_free(st);
 		kfree(st);
@@ -344,7 +333,7 @@
 
 static const struct super_operations v9fs_super_ops = {
 	.alloc_inode = v9fs_alloc_inode,
-	.destroy_inode = v9fs_destroy_inode,
+	.free_inode = v9fs_free_inode,
 	.statfs = simple_statfs,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = v9fs_show_options,
@@ -354,7 +343,7 @@
 
 static const struct super_operations v9fs_super_ops_dotl = {
 	.alloc_inode = v9fs_alloc_inode,
-	.destroy_inode = v9fs_destroy_inode,
+	.free_inode = v9fs_free_inode,
 	.statfs = v9fs_statfs,
 	.drop_inode = v9fs_drop_inode,
 	.evict_inode = v9fs_evict_inode,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 352abc3..ac8ff8c 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,7 +32,7 @@
 	struct iov_iter to;
 	int err;
 
-	iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size);
+	iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
 
 	attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
 	if (IS_ERR(attr_fid)) {
@@ -107,7 +107,7 @@
 	struct iov_iter from;
 	int retval, err;
 
-	iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len);
+	iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
 
 	p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
 		 name, value_len, flags);
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a6..2501e6f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # File system configuration
 #
@@ -8,6 +9,12 @@
 config DCACHE_WORD_ACCESS
        bool
 
+config VALIDATE_FS_PARSER
+	bool "Validate filesystem parameter description"
+	help
+	  Enable this to perform validation of the parameter description for a
+	  filesystem when it is registered.
+
 if BLOCK
 
 config FS_IOMAP
@@ -105,6 +112,8 @@
 
 source "fs/crypto/Kconfig"
 
+source "fs/verity/Kconfig"
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -254,12 +263,10 @@
 source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
-source "fs/exofs/Kconfig"
+source "fs/erofs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
-source "fs/exofs/Kconfig.ore"
-
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
 	default y
@@ -313,5 +320,6 @@
 
 source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
+source "fs/unicode/Kconfig"
 
 endmenu
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b795f8d..62dc4f5 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 
 menu "Executable file formats"
 
@@ -90,12 +91,28 @@
 
 	  Most systems will not boot if you say M or N here.  If unsure, say Y.
 
+config ARCH_HAS_BINFMT_FLAT
+	bool
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
-	depends on !MMU || ARM || M68K
+	depends on ARCH_HAS_BINFMT_FLAT
 	help
 	  Support uClinux FLAT format binaries.
 
+config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
+	bool
+
+config BINFMT_FLAT_OLD_ALWAYS_RAM
+	bool
+
+config BINFMT_FLAT_OLD
+	bool "Enable support for very old legacy flat binaries"
+	depends on BINFMT_FLAT
+	help
+	  Support decade old uClinux FLAT format binaries.  Unless you know
+	  you have some of those say N here.
+
 config BINFMT_ZFLAT
 	bool "Enable ZFLAT support"
 	depends on BINFMT_FLAT
diff --git a/fs/Makefile b/fs/Makefile
index 293733f..14231b4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
-		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+		fs_types.o fs_context.o fs_parser.o fsopen.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
@@ -24,14 +25,16 @@
 
 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
-obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
+obj-y				+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_IO_URING)		+= io_uring.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
+obj-$(CONFIG_FS_VERITY)		+= verity/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
@@ -50,7 +53,7 @@
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
-obj-$(CONFIG_FS_IOMAP)		+= iomap.o
+obj-y				+= iomap/
 
 obj-y				+= quota/
 
@@ -90,6 +93,7 @@
 obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)		+= lockd/
 obj-$(CONFIG_NLS)		+= nls/
+obj-$(CONFIG_UNICODE)		+= unicode/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_CIFS)		+= cifs/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
@@ -124,7 +128,7 @@
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
-obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_EROFS_FS)		+= erofs/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index c5a7787..df4650d 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config ADFS_FS
 	tristate "ADFS file system support"
 	depends on BLOCK
diff --git a/fs/adfs/Makefile b/fs/adfs/Makefile
index 9b2d71a..cf7de6e 100644
--- a/fs/adfs/Makefile
+++ b/fs/adfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux adfs filesystem routines.
 #
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index c76db75..b7e844d 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/adfs_fs.h>
 
@@ -8,6 +9,15 @@
 #define ADFS_BAD_FRAG		 1
 #define ADFS_ROOT_FRAG		 2
 
+#define ADFS_FILETYPE_NONE	((u16)~0)
+
+/* RISC OS 12-bit filetype is stored in load_address[19:8] */
+static inline u16 adfs_filetype(u32 loadaddr)
+{
+	return (loadaddr & 0xfff00000) == 0xfff00000 ?
+	       (loadaddr >> 8) & 0xfff : ADFS_FILETYPE_NONE;
+}
+
 #define ADFS_NDA_OWNER_READ	(1 << 0)
 #define ADFS_NDA_OWNER_WRITE	(1 << 1)
 #define ADFS_NDA_LOCKED		(1 << 2)
@@ -18,22 +28,28 @@
 
 #include "dir_f.h"
 
-struct buffer_head;
-
 /*
  * adfs file system inode data in memory
  */
 struct adfs_inode_info {
 	loff_t		mmu_private;
-	unsigned long	parent_id;	/* object id of parent		*/
+	__u32		parent_id;	/* parent indirect disc address	*/
 	__u32		loadaddr;	/* RISC OS load address		*/
 	__u32		execaddr;	/* RISC OS exec address		*/
-	unsigned int	filetype;	/* RISC OS file type		*/
 	unsigned int	attr;		/* RISC OS permissions		*/
-	unsigned int	stamped:1;	/* RISC OS file has date/time	*/
 	struct inode vfs_inode;
 };
 
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+static inline bool adfs_inode_is_stamped(struct inode *inode)
+{
+	return (ADFS_I(inode)->loadaddr & 0xfff00000) == 0xfff00000;
+}
+
 /*
  * Forward-declare this
  */
@@ -59,10 +75,8 @@
 	__u32		s_ids_per_zone;	/* max. no ids in one zone */
 	__u32		s_idlen;	/* length of ID in map */
 	__u32		s_map_size;	/* sector size of a map	*/
-	unsigned long	s_size;		/* total size (in blocks) of this fs */
 	signed int	s_map2blk;	/* shift left by this for map->sector*/
 	unsigned int	s_log2sharesize;/* log2 share size */
-	__le32		s_version;	/* disc format version */
 	unsigned int	s_namelen;	/* maximum number of characters in name	 */
 };
 
@@ -71,11 +85,6 @@
 	return sb->s_fs_info;
 }
 
-static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
-{
-	return container_of(inode, struct adfs_inode_info, vfs_inode);
-}
-
 /*
  * Directory handling
  */
@@ -89,7 +98,7 @@
 	struct buffer_head	**bh_fplus;
 
 	unsigned int		pos;
-	unsigned int		parent_id;
+	__u32			parent_id;
 
 	struct adfs_dirheader	dirhead;
 	union  adfs_dirtail	dirtail;
@@ -101,33 +110,18 @@
 #define ADFS_MAX_NAME_LEN	(256 + 4) /* +4 for ,xyz hex filetype suffix */
 struct object_info {
 	__u32		parent_id;		/* parent object id	*/
-	__u32		file_id;		/* object id		*/
+	__u32		indaddr;		/* indirect disc addr	*/
 	__u32		loadaddr;		/* load address		*/
 	__u32		execaddr;		/* execution address	*/
 	__u32		size;			/* size			*/
 	__u8		attr;			/* RISC OS attributes	*/
 	unsigned int	name_len;		/* name length		*/
 	char		name[ADFS_MAX_NAME_LEN];/* file name		*/
-
-	/* RISC OS file type (12-bit: derived from loadaddr) */
-	__u16		filetype;
 };
 
-/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
-static inline int append_filetype_suffix(char *buf, __u16 filetype)
-{
-	if (filetype == 0xffff)	/* no explicit 12-bit file type was set */
-		return 0;
-
-	*buf++ = ',';
-	*buf++ = hex_asc_lo(filetype >> 8);
-	*buf++ = hex_asc_lo(filetype >> 4);
-	*buf++ = hex_asc_lo(filetype >> 0);
-	return 4;
-}
-
 struct adfs_dir_ops {
-	int	(*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
+	int	(*read)(struct super_block *sb, unsigned int indaddr,
+			unsigned int size, struct adfs_dir *dir);
 	int	(*setpos)(struct adfs_dir *dir, unsigned int fpos);
 	int	(*getnext)(struct adfs_dir *dir, struct object_info *obj);
 	int	(*update)(struct adfs_dir *dir, struct object_info *obj);
@@ -150,7 +144,7 @@
 int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
 
 /* map.c */
-extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset);
+int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset);
 extern unsigned int adfs_map_free(struct super_block *sb);
 
 /* Misc */
@@ -158,6 +152,7 @@
 void __adfs_error(struct super_block *sb, const char *function,
 		  const char *fmt, ...);
 #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
+void adfs_msg(struct super_block *sb, const char *pfx, const char *fmt, ...);
 
 /* super.c */
 
@@ -172,6 +167,7 @@
 extern const struct adfs_dir_ops adfs_f_dir_ops;
 extern const struct adfs_dir_ops adfs_fplus_dir_ops;
 
+void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj);
 extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
 			   int wait);
 
@@ -194,16 +190,28 @@
  *
  * The root directory ID should always be looked up in the map [3.4]
  */
-static inline int
-__adfs_block_map(struct super_block *sb, unsigned int object_id,
-		 unsigned int block)
+static inline int __adfs_block_map(struct super_block *sb, u32 indaddr,
+				   unsigned int block)
 {
-	if (object_id & 255) {
+	if (indaddr & 255) {
 		unsigned int off;
 
-		off = (object_id & 255) - 1;
+		off = (indaddr & 255) - 1;
 		block += off << ADFS_SB(sb)->s_log2sharesize;
 	}
 
-	return adfs_map_lookup(sb, object_id >> 8, block);
+	return adfs_map_lookup(sb, indaddr >> 8, block);
+}
+
+/* Return the disc record from the map */
+static inline
+struct adfs_discrecord *adfs_map_discrecord(struct adfs_discmap *dm)
+{
+	return (void *)(dm[0].dm_bh->b_data + 4);
+}
+
+static inline u64 adfs_disc_size(const struct adfs_discrecord *dr)
+{
+	return (u64)le32_to_cpu(dr->disc_size_high) << 32 |
+		    le32_to_cpu(dr->disc_size);
 }
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e18eff8..a54c532 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/dir.c
  *
  *  Copyright (C) 1999-2000 Russell King
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
  *  Common directory handling for ADFS
  */
 #include "adfs.h"
@@ -16,6 +13,44 @@
  */
 static DEFINE_RWLOCK(adfs_dir_lock);
 
+void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj)
+{
+	unsigned int dots, i;
+
+	/*
+	 * RISC OS allows the use of '/' in directory entry names, so we need
+	 * to fix these up.  '/' is typically used for FAT compatibility to
+	 * represent '.', so do the same conversion here.  In any case, '.'
+	 * will never be in a RISC OS name since it is used as the pathname
+	 * separator.  Handle the case where we may generate a '.' or '..'
+	 * name, replacing the first character with '^' (the RISC OS "parent
+	 * directory" character.)
+	 */
+	for (i = dots = 0; i < obj->name_len; i++)
+		if (obj->name[i] == '/') {
+			obj->name[i] = '.';
+			dots++;
+		}
+
+	if (obj->name_len <= 2 && dots == obj->name_len)
+		obj->name[0] = '^';
+
+	/*
+	 * If the object is a file, and the user requested the ,xyz hex
+	 * filetype suffix to the name, check the filetype and append.
+	 */
+	if (!(obj->attr & ADFS_NDA_DIRECTORY) && ADFS_SB(dir->sb)->s_ftsuffix) {
+		u16 filetype = adfs_filetype(obj->loadaddr);
+
+		if (filetype != ADFS_FILETYPE_NONE) {
+			obj->name[obj->name_len++] = ',';
+			obj->name[obj->name_len++] = hex_asc_lo(filetype >> 8);
+			obj->name[obj->name_len++] = hex_asc_lo(filetype >> 4);
+			obj->name[obj->name_len++] = hex_asc_lo(filetype >> 0);
+		}
+	}
+}
+
 static int
 adfs_readdir(struct file *file, struct dir_context *ctx)
 {
@@ -51,7 +86,7 @@
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
 		if (!dir_emit(ctx, obj.name, obj.name_len,
-			    obj.file_id, DT_UNKNOWN))
+			      obj.indaddr, DT_UNKNOWN))
 			break;
 		ctx->pos++;
 	}
@@ -72,8 +107,8 @@
 	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct adfs_dir dir;
 
-	printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
-		 obj->file_id, obj->parent_id);
+	printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n",
+		 obj->indaddr, obj->parent_id);
 
 	if (!ops->update) {
 		ret = -EINVAL;
@@ -100,37 +135,36 @@
 	return ret;
 }
 
-static int
-adfs_match(const struct qstr *name, struct object_info *obj)
+static unsigned char adfs_tolower(unsigned char c)
 {
-	int i;
-
-	if (name->len != obj->name_len)
-		return 0;
-
-	for (i = 0; i < name->len; i++) {
-		char c1, c2;
-
-		c1 = name->name[i];
-		c2 = obj->name[i];
-
-		if (c1 >= 'A' && c1 <= 'Z')
-			c1 += 'a' - 'A';
-		if (c2 >= 'A' && c2 <= 'Z')
-			c2 += 'a' - 'A';
-
-		if (c1 != c2)
-			return 0;
-	}
-	return 1;
+	if (c >= 'A' && c <= 'Z')
+		c += 'a' - 'A';
+	return c;
 }
 
-static int
-adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct object_info *obj)
+static int __adfs_compare(const unsigned char *qstr, u32 qlen,
+			  const char *str, u32 len)
+{
+	u32 i;
+
+	if (qlen != len)
+		return 1;
+
+	for (i = 0; i < qlen; i++)
+		if (adfs_tolower(qstr[i]) != adfs_tolower(str[i]))
+			return 1;
+
+	return 0;
+}
+
+static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr,
+				  struct object_info *obj)
 {
 	struct super_block *sb = inode->i_sb;
 	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	const unsigned char *name;
 	struct adfs_dir dir;
+	u32 name_len;
 	int ret;
 
 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
@@ -138,7 +172,8 @@
 		goto out;
 
 	if (ADFS_I(inode)->parent_id != dir.parent_id) {
-		adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
+		adfs_error(sb,
+			   "parent directory changed under me! (%06x but got %06x)\n",
 			   ADFS_I(inode)->parent_id, dir.parent_id);
 		ret = -EIO;
 		goto free_out;
@@ -153,8 +188,10 @@
 		goto unlock_out;
 
 	ret = -ENOENT;
+	name = qstr->name;
+	name_len = qstr->len;
 	while (ops->getnext(&dir, obj) == 0) {
-		if (adfs_match(name, obj)) {
+		if (!__adfs_compare(name, name_len, obj->name, obj->name_len)) {
 			ret = 0;
 			break;
 		}
@@ -179,30 +216,18 @@
 static int
 adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
-	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
 	unsigned long hash;
-	int i;
+	u32 len;
 
-	if (qstr->len < name_len)
-		return 0;
+	if (qstr->len > ADFS_SB(parent->d_sb)->s_namelen)
+		return -ENAMETOOLONG;
 
-	/*
-	 * Truncate the name in place, avoids
-	 * having to define a compare function.
-	 */
-	qstr->len = i = name_len;
+	len = qstr->len;
 	name = qstr->name;
 	hash = init_name_hash(parent);
-	while (i--) {
-		char c;
-
-		c = *name++;
-		if (c >= 'A' && c <= 'Z')
-			c += 'a' - 'A';
-
-		hash = partial_name_hash(c, hash);
-	}
+	while (len--)
+		hash = partial_name_hash(adfs_tolower(*name++), hash);
 	qstr->hash = end_name_hash(hash);
 
 	return 0;
@@ -212,30 +237,10 @@
  * Compare two names, taking note of the name length
  * requirements of the underlying filesystem.
  */
-static int
-adfs_compare(const struct dentry *dentry,
-		unsigned int len, const char *str, const struct qstr *name)
+static int adfs_compare(const struct dentry *dentry, unsigned int len,
+			const char *str, const struct qstr *qstr)
 {
-	int i;
-
-	if (len != name->len)
-		return 1;
-
-	for (i = 0; i < name->len; i++) {
-		char a, b;
-
-		a = str[i];
-		b = name->name[i];
-
-		if (a >= 'A' && a <= 'Z')
-			a += 'a' - 'A';
-		if (b >= 'A' && b <= 'Z')
-			b += 'a' - 'A';
-
-		if (a != b)
-			return 1;
-	}
-	return 0;
+	return __adfs_compare(qstr->name, qstr->len, str, len);
 }
 
 const struct dentry_operations adfs_dentry_operations = {
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 0fbfd0b..c1a950c 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/dir_f.c
  *
  * Copyright (C) 1997-1999 Russell King
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
  *  E and F format directory handling
  */
-#include <linux/buffer_head.h>
 #include "adfs.h"
 #include "dir_f.h"
 
@@ -24,8 +20,11 @@
 
 	switch (len) {
 	case 4:		val |= p[3] << 24;
+			/* fall through */
 	case 3:		val |= p[2] << 16;
+			/* fall through */
 	case 2:		val |= p[1] << 8;
+			/* fall through */
 	default:	val |= p[0];
 	}
 	return val;
@@ -35,27 +34,15 @@
 {
 	switch (len) {
 	case 4:		p[3] = val >> 24;
+			/* fall through */
 	case 3:		p[2] = val >> 16;
+			/* fall through */
 	case 2:		p[1] = val >> 8;
+			/* fall through */
 	default:	p[0] = val;
 	}
 }
 
-static inline int adfs_readname(char *buf, char *ptr, int maxlen)
-{
-	char *old_buf = buf;
-
-	while ((unsigned char)*ptr >= ' ' && maxlen--) {
-		if (*ptr == '/')
-			*buf++ = '.';
-		else
-			*buf++ = *ptr;
-		ptr++;
-	}
-
-	return buf - old_buf;
-}
-
 #define ror13(v) ((v >> 13) | (v << 19))
 
 #define dir_u8(idx)				\
@@ -136,12 +123,9 @@
 	return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff;
 }
 
-/*
- * Read and check that a directory is valid
- */
-static int
-adfs_dir_read(struct super_block *sb, unsigned long object_id,
-	      unsigned int size, struct adfs_dir *dir)
+/* Read and check that a directory is valid */
+static int adfs_dir_read(struct super_block *sb, u32 indaddr,
+			 unsigned int size, struct adfs_dir *dir)
 {
 	const unsigned int blocksize_bits = sb->s_blocksize_bits;
 	int blk = 0;
@@ -161,10 +145,10 @@
 	for (blk = 0; blk < size; blk++) {
 		int phys;
 
-		phys = __adfs_block_map(sb, object_id, blk);
+		phys = __adfs_block_map(sb, indaddr, blk);
 		if (!phys) {
-			adfs_error(sb, "dir object %lX has a hole at offset %d",
-				   object_id, blk);
+			adfs_error(sb, "dir %06x has a hole at offset %d",
+				   indaddr, blk);
 			goto release_buffers;
 		}
 
@@ -192,8 +176,7 @@
 	return 0;
 
 bad_dir:
-	adfs_error(sb, "corrupted directory fragment %lX",
-		   object_id);
+	adfs_error(sb, "dir %06x is corrupted", indaddr);
 release_buffers:
 	for (blk -= 1; blk >= 0; blk -= 1)
 		brelse(dir->bh[blk]);
@@ -210,29 +193,23 @@
 adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
 	struct adfs_direntry *de)
 {
-	obj->name_len =	adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
-	obj->file_id  = adfs_readval(de->dirinddiscadd, 3);
+	unsigned int name_len;
+
+	for (name_len = 0; name_len < ADFS_F_NAME_LEN; name_len++) {
+		if (de->dirobname[name_len] < ' ')
+			break;
+
+		obj->name[name_len] = de->dirobname[name_len];
+	}
+
+	obj->name_len =	name_len;
+	obj->indaddr  = adfs_readval(de->dirinddiscadd, 3);
 	obj->loadaddr = adfs_readval(de->dirload, 4);
 	obj->execaddr = adfs_readval(de->direxec, 4);
 	obj->size     = adfs_readval(de->dirlen,  4);
 	obj->attr     = de->newdiratts;
-	obj->filetype = -1;
 
-	/*
-	 * object is a file and is filetyped and timestamped?
-	 * RISC OS 12-bit filetype is stored in load_address[19:8]
-	 */
-	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
-		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
-		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
-
-		/* optionally append the ,xyz hex filetype suffix */
-		if (ADFS_SB(dir->sb)->s_ftsuffix)
-			obj->name_len +=
-				append_filetype_suffix(
-					&obj->name[obj->name_len],
-					obj->filetype);
-	}
+	adfs_object_fixup(dir, obj);
 }
 
 /*
@@ -241,7 +218,7 @@
 static inline void
 adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj)
 {
-	adfs_writeval(de->dirinddiscadd, 3, obj->file_id);
+	adfs_writeval(de->dirinddiscadd, 3, obj->indaddr);
 	adfs_writeval(de->dirload, 4, obj->loadaddr);
 	adfs_writeval(de->direxec, 4, obj->execaddr);
 	adfs_writeval(de->dirlen,  4, obj->size);
@@ -327,8 +304,7 @@
  * the caller is responsible for holding the necessary
  * locks.
  */
-static int
-adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id)
+static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr)
 {
 	int pos, ret;
 
@@ -340,7 +316,7 @@
 		if (!__adfs_dir_get(dir, pos, &obj))
 			break;
 
-		if (obj.file_id == object_id) {
+		if (obj.indaddr == indaddr) {
 			ret = pos;
 			break;
 		}
@@ -349,15 +325,15 @@
 	return ret;
 }
 
-static int
-adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir)
+static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size,
+		       struct adfs_dir *dir)
 {
 	int ret;
 
-	if (sz != ADFS_NEWDIR_SIZE)
+	if (size != ADFS_NEWDIR_SIZE)
 		return -EIO;
 
-	ret = adfs_dir_read(sb, id, sz, dir);
+	ret = adfs_dir_read(sb, indaddr, size, dir);
 	if (ret)
 		adfs_error(sb, "unable to read directory");
 	else
@@ -394,7 +370,7 @@
 	struct super_block *sb = dir->sb;
 	int ret, i;
 
-	ret = adfs_dir_find_entry(dir, obj->file_id);
+	ret = adfs_dir_find_entry(dir, obj->indaddr);
 	if (ret < 0) {
 		adfs_error(dir->sb, "unable to locate entry to update");
 		goto out;
diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h
index e471340..5aec332 100644
--- a/fs/adfs/dir_f.h
+++ b/fs/adfs/dir_f.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *  linux/fs/adfs/dir_f.h
  *
  *  Copyright (C) 1999 Russell King
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
  *  Structures of directories on the F format disk
  */
 #ifndef ADFS_DIR_F_H
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index c92cfb6..d56924c 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/dir_fplus.c
  *
  *  Copyright (C) 1997-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
-#include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include "adfs.h"
 #include "dir_fplus.h"
@@ -40,17 +36,15 @@
 	h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
 	size = le32_to_cpu(h->bigdirsize);
 	if (size != sz) {
-		printk(KERN_WARNING "adfs: adfs_fplus_read:"
-					" directory header size %X\n"
-					" does not match directory size %X\n",
-					size, sz);
+		adfs_msg(sb, KERN_WARNING,
+			 "directory header size %X does not match directory size %X",
+			 size, sz);
 	}
 
 	if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
 	    h->bigdirversion[2] != 0 || size & 2047 ||
 	    h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
-		printk(KERN_WARNING "adfs: dir object %X has"
-					" malformed dir header\n", id);
+		adfs_error(sb, "dir %06x has malformed header", id);
 		goto out;
 	}
 
@@ -61,9 +55,10 @@
 			kcalloc(size, sizeof(struct buffer_head *),
 				GFP_KERNEL);
 		if (!bh_fplus) {
+			adfs_msg(sb, KERN_ERR,
+				 "not enough memory for dir object %X (%d blocks)",
+				 id, size);
 			ret = -ENOMEM;
-			adfs_error(sb, "not enough memory for"
-					" dir object %X (%d blocks)", id, size);
 			goto out;
 		}
 		dir->bh_fplus = bh_fplus;
@@ -94,8 +89,7 @@
 	if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
 	    t->bigdirendmasseq != h->startmasseq ||
 	    t->reserved[0] != 0 || t->reserved[1] != 0) {
-		printk(KERN_WARNING "adfs: dir object %X has "
-					"malformed dir end\n", id);
+		adfs_error(sb, "dir %06x has malformed tail", id);
 		goto out;
 	}
 
@@ -169,7 +163,7 @@
 		(struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
 	struct adfs_bigdirentry bde;
 	unsigned int offset;
-	int i, ret = -ENOENT;
+	int ret = -ENOENT;
 
 	if (dir->pos >= le32_to_cpu(h->bigdirentries))
 		goto out;
@@ -183,7 +177,7 @@
 	obj->loadaddr = le32_to_cpu(bde.bigdirload);
 	obj->execaddr = le32_to_cpu(bde.bigdirexec);
 	obj->size     = le32_to_cpu(bde.bigdirlen);
-	obj->file_id  = le32_to_cpu(bde.bigdirindaddr);
+	obj->indaddr  = le32_to_cpu(bde.bigdirindaddr);
 	obj->attr     = le32_to_cpu(bde.bigdirattr);
 	obj->name_len = le32_to_cpu(bde.bigdirobnamelen);
 
@@ -193,27 +187,7 @@
 	offset += le32_to_cpu(bde.bigdirobnameptr);
 
 	dir_memcpy(dir, offset, obj->name, obj->name_len);
-	for (i = 0; i < obj->name_len; i++)
-		if (obj->name[i] == '/')
-			obj->name[i] = '.';
-
-	obj->filetype = -1;
-
-	/*
-	 * object is a file and is filetyped and timestamped?
-	 * RISC OS 12-bit filetype is stored in load_address[19:8]
-	 */
-	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
-		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
-		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
-
-		/* optionally append the ,xyz hex filetype suffix */
-		if (ADFS_SB(dir->sb)->s_ftsuffix)
-			obj->name_len +=
-				append_filetype_suffix(
-					&obj->name[obj->name_len],
-					obj->filetype);
-	}
+	adfs_object_fixup(dir, obj);
 
 	dir->pos += 1;
 	ret = 0;
diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h
index b55aa41..4ec0931 100644
--- a/fs/adfs/dir_fplus.h
+++ b/fs/adfs/dir_fplus.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *  linux/fs/adfs/dir_fplus.h
  *
  *  Copyright (C) 1999 Russell King
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
  *  Structures of directories on the F+ format disk
  */
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 66621e9..124de75 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/inode.c
  *
  *  Copyright (C) 1997-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -97,7 +94,7 @@
 		return S_IFDIR | S_IXUGO | mode;
 	}
 
-	switch (ADFS_I(inode)->filetype) {
+	switch (adfs_filetype(ADFS_I(inode)->loadaddr)) {
 	case 0xfc0:	/* LinkFS */
 		return S_IFLNK|S_IRWXUGO;
 
@@ -177,7 +174,7 @@
 							2208988800000000000LL;
 	s64 nsec;
 
-	if (ADFS_I(inode)->stamped == 0)
+	if (!adfs_inode_is_stamped(inode))
 		goto cur_time;
 
 	high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
@@ -216,7 +213,7 @@
 {
 	unsigned int high, low;
 
-	if (ADFS_I(inode)->stamped) {
+	if (adfs_inode_is_stamped(inode)) {
 		/* convert 32-bit seconds to 40-bit centi-seconds */
 		low  = (secs & 255) * 100;
 		high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
@@ -250,7 +247,7 @@
 
 	inode->i_uid	 = ADFS_SB(sb)->s_uid;
 	inode->i_gid	 = ADFS_SB(sb)->s_gid;
-	inode->i_ino	 = obj->file_id;
+	inode->i_ino	 = obj->indaddr;
 	inode->i_size	 = obj->size;
 	set_nlink(inode, 2);
 	inode->i_blocks	 = (inode->i_size + sb->s_blocksize - 1) >>
@@ -266,8 +263,6 @@
 	ADFS_I(inode)->loadaddr  = obj->loadaddr;
 	ADFS_I(inode)->execaddr  = obj->execaddr;
 	ADFS_I(inode)->attr      = obj->attr;
-	ADFS_I(inode)->filetype  = obj->filetype;
-	ADFS_I(inode)->stamped   = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
 
 	inode->i_mode	 = adfs_atts2mode(sb, inode);
 	adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -358,7 +353,7 @@
 	struct object_info obj;
 	int ret;
 
-	obj.file_id	= inode->i_ino;
+	obj.indaddr	= inode->i_ino;
 	obj.name_len	= 0;
 	obj.parent_id	= ADFS_I(inode)->parent_id;
 	obj.loadaddr	= ADFS_I(inode)->loadaddr;
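
The 0x336e996a constant in the timestamp hunk above is easy to sanity-check: RISC OS time is a 40-bit centi-second count from 1900-01-01, and the Unix epoch sits 2208988800 seconds (the 2208988800000000000LL nanoseconds quoted earlier) after that, i.e. 220898880000 == 0x336e996a00 centi-seconds, whose top 32 bits are the magic value. A standalone check with illustrative names, not from the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t secs = 1000000000;			/* any Unix time */
	uint32_t low  = (secs & 255) * 100;
	uint32_t high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
	uint64_t cs   = ((uint64_t)high << 8) | (low & 255);

	/* must equal (secs + 2208988800) * 100; prints 1 */
	printf("%d\n", cs == (secs + 2208988800ULL) * 100);
	return 0;
}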
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 6935f05..f44d12c 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/map.c
  *
  *  Copyright (C) 1997-2002 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
-#include <linux/buffer_head.h>
 #include <asm/unaligned.h>
 #include "adfs.h"
 
@@ -67,9 +63,8 @@
  * output of:
  *  gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c
  */
-static int
-lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
-	    const unsigned int frag_id, unsigned int *offset)
+static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
+		       const u32 frag_id, unsigned int *offset)
 {
 	const unsigned int mapsize = dm->dm_endbit;
 	const u32 idmask = (1 << idlen) - 1;
@@ -188,9 +183,8 @@
 	return 0;
 }
 
-static int
-scan_map(struct adfs_sb_info *asb, unsigned int zone,
-	 const unsigned int frag_id, unsigned int mapoff)
+static int scan_map(struct adfs_sb_info *asb, unsigned int zone,
+		    const u32 frag_id, unsigned int mapoff)
 {
 	const unsigned int idlen = asb->s_idlen;
 	struct adfs_discmap *dm, *dm_end;
@@ -244,9 +238,7 @@
 	return signed_asl(total, asb->s_map2blk);
 }
 
-int
-adfs_map_lookup(struct super_block *sb, unsigned int frag_id,
-		unsigned int offset)
+int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset)
 {
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 	unsigned int zone, mapoff;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7e099a7..65b04eb 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/adfs/super.c
  *
  *  Copyright (C) 1997-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/buffer_head.h>
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
@@ -20,25 +16,42 @@
 #include "dir_f.h"
 #include "dir_fplus.h"
 
+#define ADFS_SB_FLAGS SB_NOATIME
+
 #define ADFS_DEFAULT_OWNER_MASK S_IRWXU
 #define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO)
 
 void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...)
 {
-	char error_buf[128];
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
-	va_end(args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n",
+	printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %pV\n",
 		sb->s_id, function ? ": " : "",
-		function ? function : "", error_buf);
+		function ? function : "", &vaf);
+
+	va_end(args);
+}
+
+void adfs_msg(struct super_block *sb, const char *pfx, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sADFS-fs (%s): %pV\n", pfx, sb->s_id, &vaf);
+	va_end(args);
 }
 
 static int adfs_checkdiscrecord(struct adfs_discrecord *dr)
 {
+	unsigned int max_idlen;
 	int i;
 
 	/* sector size must be 256, 512 or 1024 bytes */
@@ -58,8 +71,13 @@
 	if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize)
 		return 1;
 
-	/* idlen must be no greater than 19 v2 [1.0] */
-	if (dr->idlen > 19)
+	/*
+	 * Maximum idlen is limited to 16 bits for new directories by
+	 * the three-byte storage of an indirect disc address.  For
+	 * big directories, idlen must be no greater than 19 v2 [1.0]
+	 */
+	max_idlen = dr->format_version ? 19 : 16;
+	if (dr->idlen > max_idlen)
 		return 1;
 
 	/* reserved bytes should be zero */
@@ -155,10 +173,10 @@
 	{Opt_err, NULL}
 };
 
-static int parse_options(struct super_block *sb, char *options)
+static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
+			 char *options)
 {
 	char *p;
-	struct adfs_sb_info *asb = ADFS_SB(sb);
 	int option;
 
 	if (!options)
@@ -202,8 +220,9 @@
 			asb->s_ftsuffix = option;
 			break;
 		default:
-			printk("ADFS-fs: unrecognised mount option \"%s\" "
-					"or missing value\n", p);
+			adfs_msg(sb, KERN_ERR,
+				 "unrecognised mount option \"%s\" or missing value",
+				 p);
 			return -EINVAL;
 		}
 	}
@@ -212,21 +231,31 @@
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	struct adfs_sb_info temp_asb;
+	int ret;
+
 	sync_filesystem(sb);
-	*flags |= SB_NODIRATIME;
-	return parse_options(sb, data);
+	*flags |= ADFS_SB_FLAGS;
+
+	temp_asb = *ADFS_SB(sb);
+	ret = parse_options(sb, &temp_asb, data);
+	if (ret == 0)
+		*ADFS_SB(sb) = temp_asb;
+
+	return ret;
 }
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map);
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
 	buf->f_namelen = sbi->s_namelen;
 	buf->f_bsize   = sb->s_blocksize;
-	buf->f_blocks  = sbi->s_size;
+	buf->f_blocks  = adfs_disc_size(dr) >> sb->s_blocksize_bits;
 	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
 	buf->f_bfree   = adfs_map_free(sb);
@@ -248,17 +277,11 @@
 	return &ei->vfs_inode;
 }
 
-static void adfs_i_callback(struct rcu_head *head)
+static void adfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void adfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, adfs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -290,7 +313,7 @@
 
 static const struct super_operations adfs_sops = {
 	.alloc_inode	= adfs_alloc_inode,
-	.destroy_inode	= adfs_destroy_inode,
+	.free_inode	= adfs_free_inode,
 	.drop_inode	= generic_delete_inode,
 	.write_inode	= adfs_write_inode,
 	.put_super	= adfs_put_super,
@@ -336,8 +359,7 @@
 	i = zone - 1;
 	dm[0].dm_startblk = 0;
 	dm[0].dm_startbit = ADFS_DR_SIZE_BITS;
-	dm[i].dm_endbit   = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) +
-			    (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) +
+	dm[i].dm_endbit   = (adfs_disc_size(dr) >> dr->log2bpmb) +
 			    (ADFS_DR_SIZE_BITS - i * zone_size);
 
 	if (adfs_checkmap(sb, dm))
@@ -353,27 +375,18 @@
 	return ERR_PTR(-EIO);
 }
 
-static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits)
-{
-	unsigned long discsize;
-
-	discsize  = le32_to_cpu(dr->disc_size_high) << (32 - block_bits);
-	discsize |= le32_to_cpu(dr->disc_size) >> block_bits;
-
-	return discsize;
-}
-
 static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct adfs_discrecord *dr;
 	struct buffer_head *bh;
 	struct object_info root_obj;
 	unsigned char *b_data;
+	unsigned int blocksize;
 	struct adfs_sb_info *asb;
 	struct inode *root;
 	int ret = -EINVAL;
 
-	sb->s_flags |= SB_NODIRATIME;
+	sb->s_flags |= ADFS_SB_FLAGS;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
 	if (!asb)
@@ -387,12 +400,12 @@
 	asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
 	asb->s_ftsuffix = 0;
 
-	if (parse_options(sb, data))
+	if (parse_options(sb, asb, data))
 		goto error;
 
 	sb_set_blocksize(sb, BLOCK_SIZE);
 	if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
-		adfs_error(sb, "unable to read superblock");
+		adfs_msg(sb, KERN_ERR, "error: unable to read superblock");
 		ret = -EIO;
 		goto error;
 	}
@@ -400,11 +413,8 @@
 	b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE);
 
 	if (adfs_checkbblk(b_data)) {
-		if (!silent)
-			printk("VFS: Can't find an adfs filesystem on dev "
-				"%s.\n", sb->s_id);
 		ret = -EINVAL;
-		goto error_free_bh;
+		goto error_badfs;
 	}
 
 	dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
@@ -413,33 +423,33 @@
 	 * Do some sanity checks on the ADFS disc record
 	 */
 	if (adfs_checkdiscrecord(dr)) {
-		if (!silent)
-			printk("VPS: Can't find an adfs filesystem on dev "
-				"%s.\n", sb->s_id);
 		ret = -EINVAL;
-		goto error_free_bh;
+		goto error_badfs;
 	}
 
+	blocksize = 1 << dr->log2secsize;
 	brelse(bh);
-	if (sb_set_blocksize(sb, 1 << dr->log2secsize)) {
+
+	if (sb_set_blocksize(sb, blocksize)) {
 		bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize);
 		if (!bh) {
-			adfs_error(sb, "couldn't read superblock on "
-				"2nd try.");
+			adfs_msg(sb, KERN_ERR,
+				 "error: couldn't read superblock on 2nd try.");
 			ret = -EIO;
 			goto error;
 		}
 		b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
 		if (adfs_checkbblk(b_data)) {
-			adfs_error(sb, "disc record mismatch, very weird!");
+			adfs_msg(sb, KERN_ERR,
+				 "error: disc record mismatch, very weird!");
 			ret = -EINVAL;
 			goto error_free_bh;
 		}
 		dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
 	} else {
 		if (!silent)
-			printk(KERN_ERR "VFS: Unsupported blocksize on dev "
-				"%s.\n", sb->s_id);
+			adfs_msg(sb, KERN_ERR,
+				 "error: unsupported blocksize");
 		ret = -EINVAL;
 		goto error;
 	}
@@ -452,8 +462,6 @@
 	asb->s_idlen		= dr->idlen;
 	asb->s_map_size		= dr->nzones | (dr->nzones_high << 8);
 	asb->s_map2blk		= dr->log2bpmb - dr->log2secsize;
-	asb->s_size    		= adfs_discsize(dr, sb->s_blocksize_bits);
-	asb->s_version 		= dr->format_version;
 	asb->s_log2sharesize	= dr->log2sharesize;
 
 	asb->s_map = adfs_read_map(sb, dr);
@@ -469,9 +477,9 @@
 	 */
 	sb->s_op = &adfs_sops;
 
-	dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4);
+	dr = adfs_map_discrecord(asb->s_map);
 
-	root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
+	root_obj.parent_id = root_obj.indaddr = le32_to_cpu(dr->root);
 	root_obj.name_len  = 0;
 	/* Set root object date as 01 Jan 1987 00:00:00 */
 	root_obj.loadaddr  = 0xfff0003f;
@@ -479,13 +487,12 @@
 	root_obj.size	   = ADFS_NEWDIR_SIZE;
 	root_obj.attr	   = ADFS_NDA_DIRECTORY   | ADFS_NDA_OWNER_READ |
 			     ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
-	root_obj.filetype  = -1;
 
 	/*
 	 * If this is a F+ disk with variable length directories,
 	 * get the root_size from the disc record.
 	 */
-	if (asb->s_version) {
+	if (dr->format_version) {
 		root_obj.size = le32_to_cpu(dr->root_size);
 		asb->s_dir     = &adfs_fplus_dir_ops;
 		asb->s_namelen = ADFS_FPLUS_NAME_LEN;
@@ -514,6 +521,11 @@
 	}
 	return 0;
 
+error_badfs:
+	if (!silent)
+		adfs_msg(sb, KERN_ERR,
+			 "error: can't find an ADFS filesystem on dev %s.",
+			 sb->s_id);
 error_free_bh:
 	brelse(bh);
 error:
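
Both helpers above use the printk %pV extension instead of a fixed 128-byte stack buffer, so long messages are no longer truncated. A minimal kernel-style sketch of the idiom (my_fs_msg() is a hypothetical example, not part of the patch); note that va_end() must follow printk(), because %pV consumes the va_list inside the call — which is also why the __adfs_error() hunk moves it:

#include <linux/kernel.h>	/* struct va_format, va_list */
#include <linux/fs.h>		/* struct super_block */

static void my_fs_msg(struct super_block *sb, const char *pfx,
		      const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* no intermediate buffer: vsnprintf() expands %pV in place */
	printk("%sMYFS (%s): %pV\n", pfx, sb->s_id, &vaf);
	va_end(args);		/* only after %pV has been consumed */
}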
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index a04d9e8..84c46b9 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config AFFS_FS
 	tristate "Amiga FFS file system support"
 	depends on BLOCK
diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index 3988b4a..f2c8114 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux affs filesystem routines.
 #
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 14a6c1b..f708c45 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -375,7 +375,7 @@
 	u32	 minute;
 	s32	 rem;
 
-	secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
+	secs -= sys_tz.tz_minuteswest * 60 + AFFS_EPOCH_DELTA;
 	if (secs < 0)
 		secs = 0;
 	days    = div_s64_rem(secs, 86400, &rem);
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
index f9bef90..81fb396 100644
--- a/fs/affs/amigaffs.h
+++ b/fs/affs/amigaffs.h
@@ -32,6 +32,9 @@
 
 #define AFFS_ROOT_BMAPS		25
 
+/* Seconds from the Amiga epoch of 1978/01/01 to the UNIX epoch */

+#define AFFS_EPOCH_DELTA ((8 * 365 + 2) * 86400LL)
+
 struct affs_date {
 	__be32 days;
 	__be32 mins;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 73598bf..a346cf7 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -150,10 +150,10 @@
 	}
 
 	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec
-		       = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) +
+		       = (be32_to_cpu(tail->change.days) * 86400LL +
 		         be32_to_cpu(tail->change.mins) * 60 +
 			 be32_to_cpu(tail->change.ticks) / 50 +
-			 ((8 * 365 + 2) * 24 * 60 * 60)) +
+			 AFFS_EPOCH_DELTA) +
 			 sys_tz.tz_minuteswest * 60;
 	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0;
 	affs_brelse(bh);
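
The same delta shows up in the conversion itself: an AFFS timestamp is a (days, mins, ticks) triple counted from 1978-01-01, with 50 ticks per second. A userspace restatement for reference (affs_to_unix() is an illustrative name, and the sys_tz correction applied by the kernel code is left out):

#include <stdint.h>
#include <stdio.h>

#define AFFS_EPOCH_DELTA ((8 * 365 + 2) * 86400LL)	/* 1970 -> 1978 */

static int64_t affs_to_unix(uint32_t days, uint32_t mins, uint32_t ticks)
{
	return days * 86400LL + mins * 60 + ticks / 50 + AFFS_EPOCH_DELTA;
}

int main(void)
{
	/* day 0, 00:00:00 on the Amiga == 1978-01-01T00:00:00Z */
	printf("%lld\n", (long long)affs_to_unix(0, 0, 0));	/* 252460800 */
	return 0;
}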
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d1ad11a..cc463ae 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/affs/inode.c
  *
@@ -111,17 +112,11 @@
 	return &i->vfs_inode;
 }
 
-static void affs_i_callback(struct rcu_head *head)
+static void affs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void affs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, affs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -155,7 +150,7 @@
 
 static const struct super_operations affs_sops = {
 	.alloc_inode	= affs_alloc_inode,
-	.destroy_inode	= affs_destroy_inode,
+	.free_inode	= affs_free_inode,
 	.write_inode	= affs_write_inode,
 	.evict_inode	= affs_evict_inode,
 	.put_super	= affs_put_super,
@@ -360,6 +355,10 @@
 	sb->s_op                = &affs_sops;
 	sb->s_flags |= SB_NODIRATIME;
 
+	sb->s_time_gran = NSEC_PER_SEC;
+	sb->s_time_min = sys_tz.tz_minuteswest * 60 + AFFS_EPOCH_DELTA;
+	sb->s_time_max = 86400LL * U32_MAX + 86400 + sb->s_time_min;
+
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
@@ -487,7 +486,7 @@
 		break;
 	case MUFS_OFS:
 		affs_set_opt(sbi->s_flags, SF_MUFS);
-		/* fall thru */
+		/* fall through */
 	case FS_OFS:
 		affs_set_opt(sbi->s_flags, SF_OFS);
 		sb->s_flags |= SB_NOEXEC;
@@ -495,6 +494,7 @@
 	case MUFS_DCOFS:
 	case MUFS_INTLOFS:
 		affs_set_opt(sbi->s_flags, SF_MUFS);
+		/* fall through */
 	case FS_DCOFS:
 	case FS_INTLOFS:
 		affs_set_opt(sbi->s_flags, SF_INTL);
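
For scale: struct affs_date counts days in 32 bits, so the latest representable stamp is s_time_min plus U32_MAX whole days plus at most one further day of mins/ticks — exactly the 86400LL * U32_MAX + 86400 + sb->s_time_min bound set above, some 11.8 million years out. The VFS uses these limits to clamp timestamps that the on-disk format cannot hold.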
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index ebba3b1..3fb1f55 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config AFS_FS
 	tristate "Andrew File System support (AFS)"
 	depends on INET
@@ -27,3 +28,15 @@
 	help
 	  Say Y here if you want AFS data to be cached locally on disk through
 	  the generic filesystem cache manager
+
+config AFS_DEBUG_CURSOR
+	bool "AFS server cursor debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to cause the contents of a server cursor to be dumped to
+	  the dmesg log if the server rotation algorithm fails to contact a
+	  server successfully.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 5468740..10359be 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -13,10 +13,12 @@
 	cmservice.o \
 	dir.o \
 	dir_edit.o \
+	dir_silly.o \
 	dynroot.o \
 	file.o \
 	flock.o \
 	fsclient.o \
+	fs_probe.o \
 	inode.o \
 	main.o \
 	misc.o \
@@ -27,11 +29,14 @@
 	server.o \
 	server_list.o \
 	super.o \
-	netdevices.o \
 	vlclient.o \
+	vl_list.o \
+	vl_probe.o \
+	vl_rotate.o \
 	volume.o \
 	write.o \
-	xattr.o
+	xattr.o \
+	yfsclient.o
 
 kafs-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_AFS_FS)  := kafs.o
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 025a9a5..df415c0 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Server address list management
  *
  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
@@ -17,11 +13,6 @@
 #include "internal.h"
 #include "afs_fs.h"
 
-//#define AFS_MAX_ADDRESSES
-//	((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) /
-//			sizeof(struct sockaddr_rxrpc)))
-#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
-
 /*
  * Release an address list.
  */
@@ -43,11 +34,15 @@
 
 	_enter("%u,%u,%u", nr, service, port);
 
+	if (nr > AFS_MAX_ADDRESSES)
+		nr = AFS_MAX_ADDRESSES;
+
 	alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL);
 	if (!alist)
 		return NULL;
 
 	refcount_set(&alist->usage, 1);
+	alist->max_addrs = nr;
 
 	for (i = 0; i < nr; i++) {
 		struct sockaddr_rxrpc *srx = &alist->addrs[i];
@@ -65,19 +60,25 @@
 /*
  * Parse a text string consisting of delimited addresses.
  */
-struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
-					   char delim,
-					   unsigned short service,
-					   unsigned short port)
+struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+					       const char *text, size_t len,
+					       char delim,
+					       unsigned short service,
+					       unsigned short port)
 {
+	struct afs_vlserver_list *vllist;
 	struct afs_addr_list *alist;
 	const char *p, *end = text + len;
+	const char *problem;
 	unsigned int nr = 0;
+	int ret = -ENOMEM;
 
 	_enter("%*.*s,%c", (int)len, (int)len, text, delim);
 
-	if (!len)
+	if (!len) {
+		_leave(" = -EDESTADDRREQ [empty]");
 		return ERR_PTR(-EDESTADDRREQ);
+	}
 
 	if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
 		delim = ',';
@@ -85,18 +86,24 @@
 	/* Count the addresses */
 	p = text;
 	do {
-		if (!*p)
-			return ERR_PTR(-EINVAL);
+		if (!*p) {
+			problem = "nul";
+			goto inval;
+		}
 		if (*p == delim)
 			continue;
 		nr++;
 		if (*p == '[') {
 			p++;
-			if (p == end)
-				return ERR_PTR(-EINVAL);
+			if (p == end) {
+				problem = "brace1";
+				goto inval;
+			}
 			p = memchr(p, ']', end - p);
-			if (!p)
-				return ERR_PTR(-EINVAL);
+			if (!p) {
+				problem = "brace2";
+				goto inval;
+			}
 			p++;
 			if (p >= end)
 				break;
@@ -109,18 +116,27 @@
 	} while (p < end);
 
 	_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
-	if (nr > AFS_MAX_ADDRESSES)
-		nr = AFS_MAX_ADDRESSES;
 
-	alist = afs_alloc_addrlist(nr, service, port);
-	if (!alist)
+	vllist = afs_alloc_vlserver_list(1);
+	if (!vllist)
 		return ERR_PTR(-ENOMEM);
 
+	vllist->nr_servers = 1;
+	vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT);
+	if (!vllist->servers[0].server)
+		goto error_vl;
+
+	alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+	if (!alist)
+		goto error;
+
 	/* Extract the addresses */
 	p = text;
 	do {
-		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
 		const char *q, *stop;
+		unsigned int xport = port;
+		__be32 x[4];
+		int family;
 
 		if (*p == delim) {
 			p++;
@@ -136,58 +152,74 @@
 					break;
 		}
 
-		if (in4_pton(p, q - p,
-			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
-			     -1, &stop)) {
-			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
-			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
-			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-		} else if (in6_pton(p, q - p,
-				    srx->transport.sin6.sin6_addr.s6_addr,
-				    -1, &stop)) {
-			/* Nothing to do */
+		if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) {
+			family = AF_INET;
+		} else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) {
+			family = AF_INET6;
 		} else {
+			problem = "family";
 			goto bad_address;
 		}
 
-		if (stop != q)
-			goto bad_address;
-
 		p = q;
+		if (stop != p) {
+			problem = "nostop";
+			goto bad_address;
+		}
+
 		if (q < end && *q == ']')
 			p++;
 
 		if (p < end) {
 			if (*p == '+') {
 				/* Port number specification "+1234" */
-				unsigned int xport = 0;
+				xport = 0;
 				p++;
-				if (p >= end || !isdigit(*p))
+				if (p >= end || !isdigit(*p)) {
+					problem = "port";
 					goto bad_address;
+				}
 				do {
 					xport *= 10;
 					xport += *p - '0';
-					if (xport > 65535)
+					if (xport > 65535) {
+						problem = "pval";
 						goto bad_address;
+					}
 					p++;
 				} while (p < end && isdigit(*p));
-				srx->transport.sin6.sin6_port = htons(xport);
 			} else if (*p == delim) {
 				p++;
 			} else {
+				problem = "weird";
 				goto bad_address;
 			}
 		}
 
-		alist->nr_addrs++;
-	} while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES);
+		if (family == AF_INET)
+			afs_merge_fs_addr4(alist, x[0], xport);
+		else
+			afs_merge_fs_addr6(alist, x, xport);
 
+	} while (p < end);
+
+	rcu_assign_pointer(vllist->servers[0].server->addresses, alist);
 	_leave(" = [nr %u]", alist->nr_addrs);
-	return alist;
+	return vllist;
 
-bad_address:
-	kfree(alist);
+inval:
+	_leave(" = -EINVAL [%s %zu %*.*s]",
+	       problem, p - text, (int)len, (int)len, text);
 	return ERR_PTR(-EINVAL);
+bad_address:
+	_leave(" = -EINVAL [%s %zu %*.*s]",
+	       problem, p - text, (int)len, (int)len, text);
+	ret = -EINVAL;
+error:
+	afs_put_addrlist(alist);
+error_vl:
+	afs_put_vlserverlist(net, vllist);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -206,30 +238,34 @@
 /*
  * Perform a DNS query for VL servers and build up an address list.
  */
-struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
+struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
 {
-	struct afs_addr_list *alist;
-	char *vllist = NULL;
+	struct afs_vlserver_list *vllist;
+	char *result = NULL;
 	int ret;
 
 	_enter("%s", cell->name);
 
-	ret = dns_query("afsdb", cell->name, cell->name_len,
-			"", &vllist, _expiry);
-	if (ret < 0)
+	ret = dns_query(cell->net->net, "afsdb", cell->name, cell->name_len,
+			"srv=1", &result, _expiry, true);
+	if (ret < 0) {
+		_leave(" = %d [dns]", ret);
 		return ERR_PTR(ret);
-
-	alist = afs_parse_text_addrs(vllist, strlen(vllist), ',',
-				     VL_SERVICE, AFS_VL_PORT);
-	if (IS_ERR(alist)) {
-		kfree(vllist);
-		if (alist != ERR_PTR(-ENOMEM))
-			pr_err("Failed to parse DNS data\n");
-		return alist;
 	}
 
-	kfree(vllist);
-	return alist;
+	if (*_expiry == 0)
+		*_expiry = ktime_get_real_seconds() + 60;
+
+	if (ret > 1 && result[0] == 0)
+		vllist = afs_extract_vlserver_list(cell, result, ret);
+	else
+		vllist = afs_parse_text_addrs(cell->net, result, ret, ',',
+					      VL_SERVICE, AFS_VL_PORT);
+	kfree(result);
+	if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM))
+		pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist));
+
+	return vllist;
 }
 
 /*
@@ -237,19 +273,23 @@
  */
 void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
 {
-	struct sockaddr_in6 *a;
-	__be16 xport = htons(port);
+	struct sockaddr_rxrpc *srx;
+	u32 addr = ntohl(xdr);
 	int i;
 
+	if (alist->nr_addrs >= alist->max_addrs)
+		return;
+
 	for (i = 0; i < alist->nr_ipv4; i++) {
-		a = &alist->addrs[i].transport.sin6;
-		if (xdr == a->sin6_addr.s6_addr32[3] &&
-		    xport == a->sin6_port)
+		struct sockaddr_in *a = &alist->addrs[i].transport.sin;
+		u32 a_addr = ntohl(a->sin_addr.s_addr);
+		u16 a_port = ntohs(a->sin_port);
+
+		if (addr == a_addr && port == a_port)
 			return;
-		if (xdr == a->sin6_addr.s6_addr32[3] &&
-		    (u16 __force)xport < (u16 __force)a->sin6_port)
+		if (addr == a_addr && port < a_port)
 			break;
-		if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3])
+		if (addr < a_addr)
 			break;
 	}
 
@@ -258,12 +298,13 @@
 			alist->addrs + i,
 			sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
 
-	a = &alist->addrs[i].transport.sin6;
-	a->sin6_port		  = xport;
-	a->sin6_addr.s6_addr32[0] = 0;
-	a->sin6_addr.s6_addr32[1] = 0;
-	a->sin6_addr.s6_addr32[2] = htonl(0xffff);
-	a->sin6_addr.s6_addr32[3] = xdr;
+	srx = &alist->addrs[i];
+	srx->srx_family = AF_RXRPC;
+	srx->transport_type = SOCK_DGRAM;
+	srx->transport_len = sizeof(srx->transport.sin);
+	srx->transport.sin.sin_family = AF_INET;
+	srx->transport.sin.sin_port = htons(port);
+	srx->transport.sin.sin_addr.s_addr = xdr;
 	alist->nr_ipv4++;
 	alist->nr_addrs++;
 }
@@ -273,18 +314,20 @@
  */
 void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
 {
-	struct sockaddr_in6 *a;
-	__be16 xport = htons(port);
+	struct sockaddr_rxrpc *srx;
 	int i, diff;
 
+	if (alist->nr_addrs >= alist->max_addrs)
+		return;
+
 	for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
-		a = &alist->addrs[i].transport.sin6;
+		struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
+		u16 a_port = ntohs(a->sin6_port);
+
 		diff = memcmp(xdr, &a->sin6_addr, 16);
-		if (diff == 0 &&
-		    xport == a->sin6_port)
+		if (diff == 0 && port == a_port)
 			return;
-		if (diff == 0 &&
-		    (u16 __force)xport < (u16 __force)a->sin6_port)
+		if (diff == 0 && port < a_port)
 			break;
 		if (diff < 0)
 			break;
@@ -295,12 +338,13 @@
 			alist->addrs + i,
 			sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
 
-	a = &alist->addrs[i].transport.sin6;
-	a->sin6_port		  = xport;
-	a->sin6_addr.s6_addr32[0] = xdr[0];
-	a->sin6_addr.s6_addr32[1] = xdr[1];
-	a->sin6_addr.s6_addr32[2] = xdr[2];
-	a->sin6_addr.s6_addr32[3] = xdr[3];
+	srx = &alist->addrs[i];
+	srx->srx_family = AF_RXRPC;
+	srx->transport_type = SOCK_DGRAM;
+	srx->transport_len = sizeof(srx->transport.sin6);
+	srx->transport.sin6.sin6_family = AF_INET6;
+	srx->transport.sin6.sin6_port = htons(port);
+	memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
 	alist->nr_addrs++;
 }
 
@@ -309,25 +353,33 @@
  */
 bool afs_iterate_addresses(struct afs_addr_cursor *ac)
 {
-	_enter("%hu+%hd", ac->start, (short)ac->index);
+	unsigned long set, failed;
+	int index;
 
 	if (!ac->alist)
 		return false;
 
-	if (ac->begun) {
-		ac->index++;
-		if (ac->index == ac->alist->nr_addrs)
-			ac->index = 0;
+	set = ac->alist->responded;
+	failed = ac->alist->failed;
+	_enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
 
-		if (ac->index == ac->start) {
-			ac->error = -EDESTADDRREQ;
-			return false;
-		}
-	}
+	ac->nr_iterations++;
 
-	ac->begun = true;
+	set &= ~(failed | ac->tried);
+
+	if (!set)
+		return false;
+
+	index = READ_ONCE(ac->alist->preferred);
+	if (test_bit(index, &set))
+		goto selected;
+
+	index = __ffs(set);
+
+selected:
+	ac->index = index;
+	set_bit(index, &ac->tried);
 	ac->responded = false;
-	ac->addr = &ac->alist->addrs[ac->index];
 	return true;
 }
 
@@ -340,53 +392,13 @@
 
 	alist = ac->alist;
 	if (alist) {
-		if (ac->responded && ac->index != ac->start)
-			WRITE_ONCE(alist->index, ac->index);
+		if (ac->responded &&
+		    ac->index != alist->preferred &&
+		    test_bit(ac->alist->preferred, &ac->tried))
+			WRITE_ONCE(alist->preferred, ac->index);
 		afs_put_addrlist(alist);
+		ac->alist = NULL;
 	}
 
-	ac->addr = NULL;
-	ac->alist = NULL;
-	ac->begun = false;
 	return ac->error;
 }
-
-/*
- * Set the address cursor for iterating over VL servers.
- */
-int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell)
-{
-	struct afs_addr_list *alist;
-	int ret;
-
-	if (!rcu_access_pointer(cell->vl_addrs)) {
-		ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
-				  TASK_INTERRUPTIBLE);
-		if (ret < 0)
-			return ret;
-
-		if (!rcu_access_pointer(cell->vl_addrs) &&
-		    ktime_get_real_seconds() < cell->dns_expiry)
-			return cell->error;
-	}
-
-	read_lock(&cell->vl_addrs_lock);
-	alist = rcu_dereference_protected(cell->vl_addrs,
-					  lockdep_is_held(&cell->vl_addrs_lock));
-	if (alist->nr_addrs > 0)
-		afs_get_addrlist(alist);
-	else
-		alist = NULL;
-	read_unlock(&cell->vl_addrs_lock);
-
-	if (!alist)
-		return -EDESTADDRREQ;
-
-	ac->alist = alist;
-	ac->addr = NULL;
-	ac->start = READ_ONCE(alist->index);
-	ac->index = ac->start;
-	ac->error = 0;
-	ac->begun = false;
-	return 0;
-}
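
The rewritten iterator above replaces a wrapping index walk with per-address bitmask bookkeeping. A compilable userspace model of the selection step (pick_addr() is illustrative; how the responded/failed bits get seeded is outside this hunk, and __builtin_ctzl() stands in for the kernel's __ffs()):

#include <stdio.h>

/*
 * Candidates are (responded & ~failed & ~tried); prefer the remembered
 * "preferred" slot if it is still a candidate, otherwise take the
 * lowest set bit, and mark whichever we pick as tried.
 */
static int pick_addr(unsigned long responded, unsigned long failed,
		     unsigned long *tried, int preferred)
{
	unsigned long set = responded & ~(failed | *tried);

	if (!set)
		return -1;			/* nothing left to try */
	if (!(set & (1UL << preferred)))
		preferred = __builtin_ctzl(set);
	*tried |= 1UL << preferred;
	return preferred;
}

int main(void)
{
	unsigned long tried = 0;
	int i;

	/* addrs 0, 1, 3 responded; addr 1 failed; preferred is 3 */
	while ((i = pick_addr(0x0b, 0x02, &tried, 3)) >= 0)
		printf("try addr %d\n", i);	/* 3, then 0 */
	return 0;
}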
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index b4ff1f7..b6d49d6 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* AFS common types
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #ifndef AFS_H
@@ -23,9 +19,12 @@
 #define AFSPATHMAX		1024	/* Maximum length of a pathname plus NUL */
 #define AFSOPAQUEMAX		1024	/* Maximum length of an opaque field */
 
-typedef unsigned			afs_volid_t;
-typedef unsigned			afs_vnodeid_t;
-typedef unsigned long long		afs_dataversion_t;
+#define AFS_VL_MAX_LIFESPAN	(120 * HZ)
+#define AFS_PROBE_MAX_LIFESPAN	(30 * HZ)
+
+typedef u64			afs_volid_t;
+typedef u64			afs_vnodeid_t;
+typedef u64			afs_dataversion_t;
 
 typedef enum {
 	AFSVL_RWVOL,			/* read/write volume */
@@ -52,8 +51,9 @@
  */
 struct afs_fid {
 	afs_volid_t	vid;		/* volume ID */
-	afs_vnodeid_t	vnode;		/* file index within volume */
-	unsigned	unique;		/* unique ID number (file index version) */
+	afs_vnodeid_t	vnode;		/* Lower 64-bits of file index within volume */
+	u32		vnode_hi;	/* Upper 32-bits of file index */
+	u32		unique;		/* unique ID number (file index version) */
 };
 
 /*
@@ -67,14 +67,14 @@
 } afs_callback_type_t;
 
 struct afs_callback {
-	unsigned		version;	/* Callback version */
-	unsigned		expiry;		/* Time at which expires */
-	afs_callback_type_t	type;		/* Type of callback */
+	time64_t		expires_at;	/* Time at which expires */
+	//unsigned		version;	/* Callback version */
+	//afs_callback_type_t	type;		/* Type of callback */
 };
 
 struct afs_callback_break {
 	struct afs_fid		fid;		/* File identifier */
-	struct afs_callback	cb;		/* Callback details */
+	//struct afs_callback	cb;		/* Callback details */
 };
 
 #define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
@@ -129,19 +129,27 @@
 struct afs_file_status {
 	u64			size;		/* file size */
 	afs_dataversion_t	data_version;	/* current data version */
-	time_t			mtime_client;	/* last time client changed data */
-	time_t			mtime_server;	/* last time server changed data */
-	unsigned		abort_code;	/* Abort if bulk-fetching this failed */
-
-	afs_file_type_t		type;		/* file type */
-	unsigned		nlink;		/* link count */
-	u32			author;		/* author ID */
-	u32			owner;		/* owner ID */
-	u32			group;		/* group ID */
+	struct timespec64	mtime_client;	/* Last time client changed data */
+	struct timespec64	mtime_server;	/* Last time server changed data */
+	s64			author;		/* author ID */
+	s64			owner;		/* owner ID */
+	s64			group;		/* group ID */
 	afs_access_t		caller_access;	/* access rights for authenticated caller */
 	afs_access_t		anon_access;	/* access rights for unauthenticated caller */
 	umode_t			mode;		/* UNIX mode */
+	afs_file_type_t		type;		/* file type */
+	u32			nlink;		/* link count */
 	s32			lock_count;	/* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK) */
+	u32			abort_code;	/* Abort if bulk-fetching this failed */
+};
+
+struct afs_status_cb {
+	struct afs_file_status	status;
+	struct afs_callback	callback;
+	unsigned int		cb_break;	/* Pre-op callback break counter */
+	bool			have_status;	/* True if status record was retrieved */
+	bool			have_cb;	/* True if cb record was retrieved */
+	bool			have_error;	/* True if status.abort_code indicates an error */
 };
 
 /*
@@ -158,25 +166,27 @@
  * AFS volume synchronisation information
  */
 struct afs_volsync {
-	time_t			creation;	/* volume creation time */
+	time64_t		creation;	/* volume creation time */
 };
 
 /*
  * AFS volume status record
  */
 struct afs_volume_status {
-	u32			vid;		/* volume ID */
-	u32			parent_id;	/* parent volume ID */
+	afs_volid_t		vid;		/* volume ID */
+	afs_volid_t		parent_id;	/* parent volume ID */
 	u8			online;		/* true if volume currently online and available */
 	u8			in_service;	/* true if volume currently in service */
 	u8			blessed;	/* same as in_service */
 	u8			needs_salvage;	/* true if consistency checking required */
 	u32			type;		/* volume type (afs_voltype_t) */
-	u32			min_quota;	/* minimum space set aside (blocks) */
-	u32			max_quota;	/* maximum space this volume may occupy (blocks) */
-	u32			blocks_in_use;	/* space this volume currently occupies (blocks) */
-	u32			part_blocks_avail; /* space available in volume's partition */
-	u32			part_max_blocks; /* size of volume's partition */
+	u64			min_quota;	/* minimum space set aside (blocks) */
+	u64			max_quota;	/* maximum space this volume may occupy (blocks) */
+	u64			blocks_in_use;	/* space this volume currently occupies (blocks) */
+	u64			part_blocks_avail; /* space available in volume's partition */
+	u64			part_max_blocks; /* size of volume's partition */
+	s64			vol_copy_date;
+	s64			vol_backup_date;
 };
 
 #define AFS_BLOCK_SIZE	1024
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
index 255f5dd..565cbe0 100644
--- a/fs/afs/afs_cm.h
+++ b/fs/afs/afs_cm.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* AFS Cache Manager definitions
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #ifndef AFS_CM_H
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index ddfa88a..20ab344 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* AFS File Service definitions
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #ifndef AFS_FS_H
@@ -17,8 +13,10 @@
 
 enum AFS_FS_Operations {
 	FSFETCHDATA		= 130,	/* AFS Fetch file data */
+	FSFETCHACL		= 131,	/* AFS Fetch file ACL */
 	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
 	FSSTOREDATA		= 133,	/* AFS Store file data */
+	FSSTOREACL		= 134,	/* AFS Store file ACL */
 	FSSTORESTATUS		= 135,	/* AFS Store file status */
 	FSREMOVEFILE		= 136,	/* AFS Remove a file */
 	FSCREATEFILE		= 137,	/* AFS Create a file */
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index e3c4688..e9b8029 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* AFS Volume Location Service client interface
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #ifndef AFS_VL_H
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index b1c31ec..037af93 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS caching stuff
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/sched.h>
@@ -49,7 +45,7 @@
 	struct afs_vnode *vnode = cookie_netfs_data;
 	struct afs_vnode_cache_aux aux;
 
-	_enter("{%x,%x,%llx},%p,%u",
+	_enter("{%llx,%x,%llx},%p,%u",
 	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
 	       buffer, buflen);
 
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 5f261fb..2dca8df 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -48,7 +48,7 @@
 	refcount_set(&new->usage, 1);
 	new->sb = vnode->vfs_inode.i_sb;
 	new->vid = vnode->volume->vid;
-	new->server = afs_get_server(server);
+	new->server = afs_get_server(server, afs_server_trace_get_new_cbi);
 	INIT_HLIST_NODE(&new->cb_vlink);
 
 	write_lock(&server->cb_break_lock);
@@ -94,15 +94,15 @@
 	struct afs_server *server = entry->server;
 
 again:
-	if (vnode->cb_interest &&
-	    likely(vnode->cb_interest == entry->cb_interest))
+	vcbi = rcu_dereference_protected(vnode->cb_interest,
+					 lockdep_is_held(&vnode->io_lock));
+	if (vcbi && likely(vcbi == entry->cb_interest))
 		return 0;
 
 	read_lock(&slist->lock);
 	cbi = afs_get_cb_interest(entry->cb_interest);
 	read_unlock(&slist->lock);
 
-	vcbi = vnode->cb_interest;
 	if (vcbi) {
 		if (vcbi == cbi) {
 			afs_put_cb_interest(afs_v2net(vnode), cbi);
@@ -114,8 +114,9 @@
 		 */
 		if (cbi && vcbi->server == cbi->server) {
 			write_seqlock(&vnode->cb_lock);
-			old = vnode->cb_interest;
-			vnode->cb_interest = cbi;
+			old = rcu_dereference_protected(vnode->cb_interest,
+							lockdep_is_held(&vnode->cb_lock.lock));
+			rcu_assign_pointer(vnode->cb_interest, cbi);
 			write_sequnlock(&vnode->cb_lock);
 			afs_put_cb_interest(afs_v2net(vnode), old);
 			return 0;
@@ -160,8 +161,9 @@
 	 */
 	write_seqlock(&vnode->cb_lock);
 
-	old = vnode->cb_interest;
-	vnode->cb_interest = cbi;
+	old = rcu_dereference_protected(vnode->cb_interest,
+					lockdep_is_held(&vnode->cb_lock.lock));
+	rcu_assign_pointer(vnode->cb_interest, cbi);
 	vnode->cb_s_break = cbi->server->cb_s_break;
 	vnode->cb_v_break = vnode->volume->cb_v_break;
 	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
@@ -191,10 +193,11 @@
 				vi = NULL;
 
 			write_unlock(&cbi->server->cb_break_lock);
-			kfree(vi);
-			afs_put_server(net, cbi->server);
+			if (vi)
+				kfree_rcu(vi, rcu);
+			afs_put_server(net, cbi->server, afs_server_trace_put_cbi);
 		}
-		kfree(cbi);
+		kfree_rcu(cbi, rcu);
 	}
 }
 
@@ -203,34 +206,34 @@
  */
 void afs_init_callback_state(struct afs_server *server)
 {
-	if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags))
-		server->cb_s_break++;
+	server->cb_s_break++;
 }
 
 /*
  * actually break a callback
  */
-void afs_break_callback(struct afs_vnode *vnode)
+void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason)
 {
 	_enter("");
 
-	write_seqlock(&vnode->cb_lock);
-
 	clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
 	if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		vnode->cb_break++;
 		afs_clear_permits(vnode);
 
-		spin_lock(&vnode->lock);
-
-		_debug("break callback");
-
-		if (list_empty(&vnode->granted_locks) &&
-		    !list_empty(&vnode->pending_locks))
+		if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
 			afs_lock_may_be_available(vnode);
-		spin_unlock(&vnode->lock);
-	}
 
+		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
+	} else {
+		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false);
+	}
+}
+
+void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason)
+{
+	write_seqlock(&vnode->cb_lock);
+	__afs_break_callback(vnode, reason);
 	write_sequnlock(&vnode->cb_lock);
 }
 
@@ -276,9 +279,11 @@
 			struct afs_super_info *as = AFS_FS_S(cbi->sb);
 			struct afs_volume *volume = as->volume;
 
-			write_lock(&volume->cb_break_lock);
+			write_lock(&volume->cb_v_break_lock);
 			volume->cb_v_break++;
-			write_unlock(&volume->cb_break_lock);
+			trace_afs_cb_break(fid, volume->cb_v_break,
+					   afs_cb_break_for_volume_callback, false);
+			write_unlock(&volume->cb_v_break_lock);
 		} else {
 			data.volume = NULL;
 			data.fid = *fid;
@@ -286,8 +291,10 @@
 						afs_iget5_test, &data);
 			if (inode) {
 				vnode = AFS_FS_I(inode);
-				afs_break_callback(vnode);
+				afs_break_callback(vnode, afs_cb_break_for_callback);
 				iput(inode);
+			} else {
+				trace_afs_cb_miss(fid, afs_cb_break_for_callback);
 			}
 		}
 	}
@@ -305,19 +312,14 @@
 	_enter("%p,%zu,", server, count);
 
 	ASSERT(server != NULL);
-	ASSERTCMP(count, <=, AFSCBMAX);
 
 	/* TODO: Sort the callback break list by volume ID */
 
 	for (; count > 0; callbacks++, count--) {
-		_debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
+		_debug("- Fid { vl=%08llx n=%llu u=%u }",
 		       callbacks->fid.vid,
 		       callbacks->fid.vnode,
-		       callbacks->fid.unique,
-		       callbacks->cb.version,
-		       callbacks->cb.expiry,
-		       callbacks->cb.type
-		       );
+		       callbacks->fid.unique);
 		afs_break_one_callback(server, &callbacks->fid);
 	}
 
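
The cb_interest conversion above follows the standard RCU publish/retire pattern: updaters read the old value with rcu_dereference_protected() (documenting the lock that makes it safe), publish with rcu_assign_pointer() (ordering initialisation before the pointer store), and retire through kfree_rcu() so lockless readers still holding the old pointer are not freed out from under them. A kernel-style sketch under assumed names (replace_thing(), struct thing; the real code guards the pointer with a seqlock's embedded spinlock rather than a plain one):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct thing {
	struct rcu_head rcu;
	int val;
};

static void replace_thing(struct thing __rcu **slot, struct thing *new,
			  spinlock_t *lock)
{
	struct thing *old;

	spin_lock(lock);
	old = rcu_dereference_protected(*slot, lockdep_is_held(lock));
	rcu_assign_pointer(*slot, new);
	spin_unlock(lock);
	if (old)
		kfree_rcu(old, rcu);	/* deferred until readers drain */
}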
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 6127f0f..fd5133e 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS cell and server record management
  *
  * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
@@ -20,6 +16,8 @@
 #include "internal.h"
 
 static unsigned __read_mostly afs_cell_gc_delay = 10;
+static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
+static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
 
 static void afs_manage_cell(struct work_struct *);
 
@@ -76,6 +74,7 @@
 			cell = rcu_dereference_raw(net->ws_cell);
 			if (cell) {
 				afs_get_cell(cell);
+				ret = 0;
 				break;
 			}
 			ret = -EDESTADDRREQ;
@@ -110,6 +109,9 @@
 
 	done_seqretry(&net->cells_lock, seq);
 
+	if (ret != 0 && cell)
+		afs_put_cell(net, cell);
+
 	return ret == 0 ? cell : ERR_PTR(ret);
 }
 
@@ -119,8 +121,9 @@
  */
 static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 				       const char *name, unsigned int namelen,
-				       const char *vllist)
+				       const char *addresses)
 {
+	struct afs_vlserver_list *vllist;
 	struct afs_cell *cell;
 	int i, ret;
 
@@ -134,7 +137,7 @@
 	if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
 		return ERR_PTR(-EINVAL);
 
-	_enter("%*.*s,%s", namelen, namelen, name, vllist);
+	_enter("%*.*s,%s", namelen, namelen, name, addresses);
 
 	cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL);
 	if (!cell) {
@@ -149,35 +152,48 @@
 
 	atomic_set(&cell->usage, 2);
 	INIT_WORK(&cell->manager, afs_manage_cell);
-	cell->flags = ((1 << AFS_CELL_FL_NOT_READY) |
-		       (1 << AFS_CELL_FL_NO_LOOKUP_YET));
 	INIT_LIST_HEAD(&cell->proc_volumes);
 	rwlock_init(&cell->proc_lock);
-	rwlock_init(&cell->vl_addrs_lock);
+	rwlock_init(&cell->vl_servers_lock);
 
-	/* Fill in the VL server list if we were given a list of addresses to
-	 * use.
+	/* Provide a VL server list, filling it in if we were given a list of
+	 * addresses to use.
 	 */
-	if (vllist) {
-		struct afs_addr_list *alist;
-
-		alist = afs_parse_text_addrs(vllist, strlen(vllist), ':',
-					     VL_SERVICE, AFS_VL_PORT);
-		if (IS_ERR(alist)) {
-			ret = PTR_ERR(alist);
+	if (addresses) {
+		vllist = afs_parse_text_addrs(net,
+					      addresses, strlen(addresses), ':',
+					      VL_SERVICE, AFS_VL_PORT);
+		if (IS_ERR(vllist)) {
+			ret = PTR_ERR(vllist);
 			goto parse_failed;
 		}
 
-		rcu_assign_pointer(cell->vl_addrs, alist);
+		vllist->source = DNS_RECORD_FROM_CONFIG;
+		vllist->status = DNS_LOOKUP_NOT_DONE;
 		cell->dns_expiry = TIME64_MAX;
+	} else {
+		ret = -ENOMEM;
+		vllist = afs_alloc_vlserver_list(0);
+		if (!vllist)
+			goto error;
+		vllist->source = DNS_RECORD_UNAVAILABLE;
+		vllist->status = DNS_LOOKUP_NOT_DONE;
+		cell->dns_expiry = ktime_get_real_seconds();
 	}
 
+	rcu_assign_pointer(cell->vl_servers, vllist);
+
+	cell->dns_source = vllist->source;
+	cell->dns_status = vllist->status;
+	smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
+
 	_leave(" = %p", cell);
 	return cell;
 
 parse_failed:
 	if (ret == -EINVAL)
 		printk(KERN_ERR "kAFS: bad VL server IP address\n");
+error:
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
@@ -202,6 +218,7 @@
 {
 	struct afs_cell *cell, *candidate, *cursor;
 	struct rb_node *parent, **pp;
+	enum afs_cell_state state;
 	int ret, n;
 
 	_enter("%s,%s", name, vllist);
@@ -261,18 +278,16 @@
 
 wait_for_cell:
 	_debug("wait_for_cell");
-	ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NOT_READY, TASK_INTERRUPTIBLE);
-	smp_rmb();
+	wait_var_event(&cell->state,
+		       ({
+			       state = smp_load_acquire(&cell->state); /* vs error */
+			       state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED;
+		       }));
 
-	switch (READ_ONCE(cell->state)) {
-	case AFS_CELL_FAILED:
+	/* Check the state obtained from the wait check. */
+	if (state == AFS_CELL_FAILED) {
 		ret = cell->error;
 		goto error;
-	default:
-		_debug("weird %u %d", cell->state, cell->error);
-		goto error;
-	case AFS_CELL_ACTIVE:
-		break;
 	}
 
 	_leave(" = %p [cell]", cell);
@@ -354,53 +369,96 @@
 /*
  * Update a cell's VL server address list from the DNS.
  */
-static void afs_update_cell(struct afs_cell *cell)
+static int afs_update_cell(struct afs_cell *cell)
 {
-	struct afs_addr_list *alist, *old;
-	time64_t now, expiry;
+	struct afs_vlserver_list *vllist, *old = NULL, *p;
+	unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl);
+	unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl);
+	time64_t now, expiry = 0;
+	int ret = 0;
 
 	_enter("%s", cell->name);
 
-	alist = afs_dns_query(cell, &expiry);
-	if (IS_ERR(alist)) {
-		switch (PTR_ERR(alist)) {
-		case -ENODATA:
-			/* The DNS said that the cell does not exist */
-			set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-			clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-			cell->dns_expiry = ktime_get_real_seconds() + 61;
-			break;
+	vllist = afs_dns_query(cell, &expiry);
+	if (IS_ERR(vllist)) {
+		ret = PTR_ERR(vllist);
 
+		_debug("%s: fail %d", cell->name, ret);
+		if (ret == -ENOMEM)
+			goto out_wake;
+
+		ret = -ENOMEM;
+		vllist = afs_alloc_vlserver_list(0);
+		if (!vllist)
+			goto out_wake;
+
+		switch (ret) {
+		case -ENODATA:
+		case -EDESTADDRREQ:
+			vllist->status = DNS_LOOKUP_GOT_NOT_FOUND;
+			break;
 		case -EAGAIN:
 		case -ECONNREFUSED:
+			vllist->status = DNS_LOOKUP_GOT_TEMP_FAILURE;
+			break;
 		default:
-			set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-			cell->dns_expiry = ktime_get_real_seconds() + 10;
+			vllist->status = DNS_LOOKUP_GOT_LOCAL_FAILURE;
 			break;
 		}
-
-		cell->error = -EDESTADDRREQ;
-	} else {
-		clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-		clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-
-		/* Exclusion on changing vl_addrs is achieved by a
-		 * non-reentrant work item.
-		 */
-		old = rcu_dereference_protected(cell->vl_addrs, true);
-		rcu_assign_pointer(cell->vl_addrs, alist);
-		cell->dns_expiry = expiry;
-
-		if (old)
-			afs_put_addrlist(old);
 	}
 
-	if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
-		wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET);
+	_debug("%s: got list %d %d", cell->name, vllist->source, vllist->status);
+	cell->dns_status = vllist->status;
 
 	now = ktime_get_real_seconds();
-	afs_set_cell_timer(cell->net, cell->dns_expiry - now);
-	_leave("");
+	if (min_ttl > max_ttl)
+		max_ttl = min_ttl;
+	if (expiry < now + min_ttl)
+		expiry = now + min_ttl;
+	else if (expiry > now + max_ttl)
+		expiry = now + max_ttl;
+
+	_debug("%s: status %d", cell->name, vllist->status);
+	if (vllist->source == DNS_RECORD_UNAVAILABLE) {
+		switch (vllist->status) {
+		case DNS_LOOKUP_GOT_NOT_FOUND:
+			/* The DNS said that the cell does not exist or there
+			 * weren't any addresses to be had.
+			 */
+			cell->dns_expiry = expiry;
+			break;
+
+		case DNS_LOOKUP_BAD:
+		case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+		case DNS_LOOKUP_GOT_TEMP_FAILURE:
+		case DNS_LOOKUP_GOT_NS_FAILURE:
+		default:
+			cell->dns_expiry = now + 10;
+			break;
+		}
+	} else {
+		cell->dns_expiry = expiry;
+	}
+
+	/* Replace the VL server list if the new record has servers or the old
+	 * record doesn't.
+	 */
+	write_lock(&cell->vl_servers_lock);
+	p = rcu_dereference_protected(cell->vl_servers, true);
+	if (vllist->nr_servers > 0 || p->nr_servers == 0) {
+		rcu_assign_pointer(cell->vl_servers, vllist);
+		cell->dns_source = vllist->source;
+		old = p;
+	}
+	write_unlock(&cell->vl_servers_lock);
+	afs_put_vlserverlist(cell->net, old);
+
+out_wake:
+	smp_store_release(&cell->dns_lookup_count,
+			  cell->dns_lookup_count + 1); /* vs source/status */
+	wake_up_var(&cell->dns_lookup_count);
+	_leave(" = %d", ret);
+	return ret;
 }
 
 /*
@@ -414,7 +472,7 @@
 
 	ASSERTCMP(atomic_read(&cell->usage), ==, 0);
 
-	afs_put_addrlist(rcu_access_pointer(cell->vl_addrs));
+	afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
@@ -471,8 +529,7 @@
 	now = ktime_get_real_seconds();
 	cell->last_inactive = now;
 	expire_delay = 0;
-	if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
-	    !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+	if (cell->vl_servers->nr_servers)
 		expire_delay = afs_cell_gc_delay;
 
 	if (atomic_dec_return(&cell->usage) > 1)
@@ -603,11 +660,13 @@
 			goto final_destruction;
 		if (cell->state == AFS_CELL_FAILED)
 			goto done;
-		cell->state = AFS_CELL_UNSET;
+		smp_store_release(&cell->state, AFS_CELL_UNSET);
+		wake_up_var(&cell->state);
 		goto again;
 
 	case AFS_CELL_UNSET:
-		cell->state = AFS_CELL_ACTIVATING;
+		smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
+		wake_up_var(&cell->state);
 		goto again;
 
 	case AFS_CELL_ACTIVATING:
@@ -615,28 +674,29 @@
 		if (ret < 0)
 			goto activation_failed;
 
-		cell->state = AFS_CELL_ACTIVE;
-		smp_wmb();
-		clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
-		wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+		smp_store_release(&cell->state, AFS_CELL_ACTIVE);
+		wake_up_var(&cell->state);
 		goto again;
 
 	case AFS_CELL_ACTIVE:
 		if (atomic_read(&cell->usage) > 1) {
-			time64_t now = ktime_get_real_seconds();
-			if (cell->dns_expiry <= now && net->live)
-				afs_update_cell(cell);
+			if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+				ret = afs_update_cell(cell);
+				if (ret < 0)
+					cell->error = ret;
+			}
 			goto done;
 		}
-		cell->state = AFS_CELL_DEACTIVATING;
+		smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
+		wake_up_var(&cell->state);
 		goto again;
 
 	case AFS_CELL_DEACTIVATING:
-		set_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
 		if (atomic_read(&cell->usage) > 1)
 			goto reverse_deactivation;
 		afs_deactivate_cell(net, cell);
-		cell->state = AFS_CELL_INACTIVE;
+		smp_store_release(&cell->state, AFS_CELL_INACTIVE);
+		wake_up_var(&cell->state);
 		goto again;
 
 	default:
@@ -649,17 +709,13 @@
 	cell->error = ret;
 	afs_deactivate_cell(net, cell);
 
-	cell->state = AFS_CELL_FAILED;
-	smp_wmb();
-	if (test_and_clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags))
-		wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+	smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
+	wake_up_var(&cell->state);
 	goto again;
 
 reverse_deactivation:
-	cell->state = AFS_CELL_ACTIVE;
-	smp_wmb();
-	clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
-	wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+	smp_store_release(&cell->state, AFS_CELL_ACTIVE);
+	wake_up_var(&cell->state);
 	_leave(" [deact->act]");
 	return;
 
@@ -719,11 +775,16 @@
 		}
 
 		if (usage == 1) {
+			struct afs_vlserver_list *vllist;
 			time64_t expire_at = cell->last_inactive;
 
-			if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
-			    !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+			read_lock(&cell->vl_servers_lock);
+			vllist = rcu_dereference_protected(
+				cell->vl_servers,
+				lockdep_is_held(&cell->vl_servers_lock));
+			if (vllist->nr_servers > 0)
 				expire_at += afs_cell_gc_delay;
+			read_unlock(&cell->vl_servers_lock);
 			if (purging || expire_at <= now)
 				sched_cell = true;
 			else if (expire_at < next_manage)
@@ -731,10 +792,8 @@
 		}
 
 		if (!purging) {
-			if (cell->dns_expiry <= now)
+			if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
 				sched_cell = true;
-			else if (cell->dns_expiry <= next_manage)
-				next_manage = cell->dns_expiry;
 		}
 
 		if (sched_cell)
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 9e51d6f..b86195e 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS Cache Manager Service
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/module.h>
@@ -16,6 +12,7 @@
 #include <linux/ip.h>
 #include "internal.h"
 #include "afs_cm.h"
+#include "protocol_yfs.h"
 
 static int afs_deliver_cb_init_call_back_state(struct afs_call *);
 static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -30,8 +27,10 @@
 static void SRXAFSCB_ProbeUuid(struct work_struct *);
 static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
 
+static int afs_deliver_yfs_cb_callback(struct afs_call *);
+
 #define CM_NAME(name) \
-	const char afs_SRXCB##name##_name[] __tracepoint_string =	\
+	char afs_SRXCB##name##_name[] __tracepoint_string =	\
 		"CB." #name
 
 /*
@@ -101,12 +100,25 @@
 };
 
 /*
+ * YFS CB.CallBack operation type
+ */
+static CM_NAME(YFS_CallBack);
+static const struct afs_call_type afs_SRXYFSCB_CallBack = {
+	.name		= afs_SRXCBYFS_CallBack_name,
+	.deliver	= afs_deliver_yfs_cb_callback,
+	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_CallBack,
+};
+
+/*
  * route an incoming cache manager call
  * - return T if supported, F if not
  */
 bool afs_cm_incoming_call(struct afs_call *call)
 {
-	_enter("{CB.OP %u}", call->operation_ID);
+	_enter("{%u, CB.OP %u}", call->service_id, call->operation_ID);
+
+	call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall);
 
 	switch (call->operation_ID) {
 	case CBCallBack:
@@ -127,12 +139,102 @@
 	case CBTellMeAboutYourself:
 		call->type = &afs_SRXCBTellMeAboutYourself;
 		return true;
+	case YFSCBCallBack:
+		if (call->service_id != YFS_CM_SERVICE)
+			return false;
+		call->type = &afs_SRXYFSCB_CallBack;
+		return true;
 	default:
 		return false;
 	}
 }
 
 /*
+ * Record a probe to the cache manager from a server.
+ */
+static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
+{
+	_enter("");
+
+	if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) &&
+	    !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) {
+		if (server->cm_epoch == call->epoch)
+			return 0;
+
+		if (!server->probe.said_rebooted) {
+			pr_notice("kAFS: FS rebooted %pU\n", &server->uuid);
+			server->probe.said_rebooted = true;
+		}
+	}
+
+	spin_lock(&server->probe_lock);
+
+	if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+		server->cm_epoch = call->epoch;
+		server->probe.cm_epoch = call->epoch;
+		goto out;
+	}
+
+	if (server->probe.cm_probed &&
+	    call->epoch != server->probe.cm_epoch &&
+	    !server->probe.said_inconsistent) {
+		pr_notice("kAFS: FS endpoints inconsistent %pU\n",
+			  &server->uuid);
+		server->probe.said_inconsistent = true;
+	}
+
+	if (!server->probe.cm_probed || call->epoch == server->cm_epoch)
+		server->probe.cm_epoch = server->cm_epoch;
+
+out:
+	server->probe.cm_probed = true;
+	spin_unlock(&server->probe_lock);
+	return 0;
+}
+
+/*
+ * Find the server record by peer address and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_peer(struct afs_call *call)
+{
+	struct sockaddr_rxrpc srx;
+	struct afs_server *server;
+
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+
+	server = afs_find_server(call->net, &srx);
+	if (!server) {
+		trace_afs_cm_no_server(call, &srx);
+		return 0;
+	}
+
+	call->server = server;
+	return afs_record_cm_probe(call, server);
+}
+
+/*
+ * Find the server record by server UUID and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_uuid(struct afs_call *call,
+				      struct afs_uuid *uuid)
+{
+	struct afs_server *server;
+
+	rcu_read_lock();
+	server = afs_find_server_by_uuid(call->net, call->request);
+	rcu_read_unlock();
+	if (!server) {
+		trace_afs_cm_no_server_u(call, call->request);
+		return 0;
+	}
+
+	call->server = server;
+	return afs_record_cm_probe(call, server);
+}
+
+/*
  * Clean up a cache manager call.
  */
 static void afs_cm_destructor(struct afs_call *call)
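afs_record_cm_probe() above turns the RxRPC epoch stamped on each incoming call into a reboot detector: the first call from a server records its epoch, and a later call carrying a different epoch means the fileserver instance has changed. A self-contained user-space model of that decision, assuming the epoch changes exactly when the server restarts:

    #include <stdbool.h>

    struct server_state {
    	bool     have_epoch;	/* At least one call seen from this server */
    	unsigned cm_epoch;	/* Epoch stamped on the last call seen */
    };

    /* Returns true if the server appears to have rebooted. */
    static bool record_probe(struct server_state *s, unsigned call_epoch)
    {
    	if (!s->have_epoch) {
    		s->have_epoch = true;	/* First sighting: just record it */
    		s->cm_epoch = call_epoch;
    		return false;
    	}
    	if (s->cm_epoch == call_epoch)
    		return false;		/* Same server instance as before */
    	s->cm_epoch = call_epoch;	/* Epoch changed: it restarted */
    	return true;
    }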
@@ -154,8 +256,11 @@
 	 * server holds up change visibility till it receives our reply so as
 	 * to maintain cache coherency.
 	 */
-	if (call->cm_server)
-		afs_break_callbacks(call->cm_server, call->count, call->request);
+	if (call->server) {
+		trace_afs_server(call->server, atomic_read(&call->server->usage),
+				 afs_server_trace_callback);
+		afs_break_callbacks(call->server, call->count, call->request);
+	}
 
 	afs_send_empty_reply(call);
 	afs_put_call(call);
@@ -168,7 +273,6 @@
 static int afs_deliver_cb_callback(struct afs_call *call)
 {
 	struct afs_callback_break *cb;
-	struct sockaddr_rxrpc srx;
 	__be32 *bp;
 	int ret, loop;
 
@@ -176,32 +280,34 @@
 
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
 
 		/* extract the FID array and its count in two steps */
+		/* fall through */
 	case 1:
 		_debug("extract FID count");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("FID count: %u", call->count);
 		if (call->count > AFSCBMAX)
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_cb_fid_count);
 
 		call->buffer = kmalloc(array3_size(call->count, 3, 4),
 				       GFP_KERNEL);
 		if (!call->buffer)
 			return -ENOMEM;
-		call->offset = 0;
+		afs_extract_to_buf(call, call->count * 3 * 4);
 		call->unmarshall++;
 
+		/* Fall through */
 	case 2:
 		_debug("extract FID array");
-		ret = afs_extract_data(call, call->buffer,
-				       call->count * 3 * 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
@@ -218,59 +324,48 @@
 			cb->fid.vid	= ntohl(*bp++);
 			cb->fid.vnode	= ntohl(*bp++);
 			cb->fid.unique	= ntohl(*bp++);
-			cb->cb.type	= AFSCM_CB_UNTYPED;
 		}
 
-		call->offset = 0;
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
 
 		/* extract the callback array and its count in two steps */
+		/* fall through */
 	case 3:
 		_debug("extract CB count");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		call->count2 = ntohl(call->tmp);
 		_debug("CB count: %u", call->count2);
 		if (call->count2 != call->count && call->count2 != 0)
-			return afs_protocol_error(call, -EBADMSG);
-		call->offset = 0;
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_cb_count);
+		call->_iter = &call->iter;
+		iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
 		call->unmarshall++;
 
+		/* Fall through */
 	case 4:
-		_debug("extract CB array");
-		ret = afs_extract_data(call, call->buffer,
-				       call->count2 * 3 * 4, false);
+		_debug("extract discard %zu/%u",
+		       iov_iter_count(&call->iter), call->count2 * 3 * 4);
+
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
-		_debug("unmarshall CB array");
-		cb = call->request;
-		bp = call->buffer;
-		for (loop = call->count2; loop > 0; loop--, cb++) {
-			cb->cb.version	= ntohl(*bp++);
-			cb->cb.expiry	= ntohl(*bp++);
-			cb->cb.type	= ntohl(*bp++);
-		}
-
-		call->offset = 0;
 		call->unmarshall++;
 	case 5:
 		break;
 	}
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
-		return -EIO;
+		return afs_io_error(call, afs_io_error_cm_reply);
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-	call->cm_server = afs_find_server(call->net, &srx);
-	if (!call->cm_server)
-		trace_afs_cm_no_server(call, &srx);
-
-	return afs_queue_call_work(call);
+	return afs_find_cm_server_by_peer(call);
 }
 
 /*
@@ -280,10 +375,10 @@
 {
 	struct afs_call *call = container_of(work, struct afs_call, work);
 
-	_enter("{%p}", call->cm_server);
+	_enter("{%p}", call->server);
 
-	if (call->cm_server)
-		afs_init_callback_state(call->cm_server);
+	if (call->server)
+		afs_init_callback_state(call->server);
 	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
@@ -294,24 +389,18 @@
  */
 static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
-	struct sockaddr_rxrpc srx;
 	int ret;
 
 	_enter("");
 
-	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-
-	ret = afs_extract_data(call, NULL, 0, false);
+	afs_extract_discard(call, 0);
+	ret = afs_extract_data(call, false);
 	if (ret < 0)
 		return ret;
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	call->cm_server = afs_find_server(call->net, &srx);
-	if (!call->cm_server)
-		trace_afs_cm_no_server(call, &srx);
-
-	return afs_queue_call_work(call);
+	return afs_find_cm_server_by_peer(call);
 }
 
 /*
@@ -330,16 +419,16 @@
 
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
 		call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
 		if (!call->buffer)
 			return -ENOMEM;
+		afs_extract_to_buf(call, 11 * sizeof(__be32));
 		call->unmarshall++;
 
+		/* Fall through */
 	case 1:
 		_debug("extract UUID");
-		ret = afs_extract_data(call, call->buffer,
-				       11 * sizeof(__be32), false);
+		ret = afs_extract_data(call, false);
 		switch (ret) {
 		case 0:		break;
 		case -EAGAIN:	return 0;
@@ -362,7 +451,6 @@
 		for (loop = 0; loop < 6; loop++)
 			r->node[loop] = ntohl(b[loop + 5]);
 
-		call->offset = 0;
 		call->unmarshall++;
 
 	case 2:
@@ -370,17 +458,11 @@
 	}
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
-		return -EIO;
+		return afs_io_error(call, afs_io_error_cm_reply);
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	rcu_read_lock();
-	call->cm_server = afs_find_server_by_uuid(call->net, call->request);
-	rcu_read_unlock();
-	if (!call->cm_server)
-		trace_afs_cm_no_server_u(call, call->request);
-
-	return afs_queue_call_work(call);
+	return afs_find_cm_server_by_uuid(call, call->request);
 }
 
 /*
@@ -405,14 +487,14 @@
 
 	_enter("");
 
-	ret = afs_extract_data(call, NULL, 0, false);
+	afs_extract_discard(call, 0);
+	ret = afs_extract_data(call, false);
 	if (ret < 0)
 		return ret;
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
-		return -EIO;
-
-	return afs_queue_call_work(call);
+		return afs_io_error(call, afs_io_error_cm_reply);
+	return afs_find_cm_server_by_peer(call);
 }
 
 /*
@@ -423,18 +505,14 @@
 	struct afs_call *call = container_of(work, struct afs_call, work);
 	struct afs_uuid *r = call->request;
 
-	struct {
-		__be32	match;
-	} reply;
-
 	_enter("");
 
 	if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
-		reply.match = htonl(0);
+		afs_send_empty_reply(call);
 	else
-		reply.match = htonl(1);
+		rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+					1, 1, "K-1");
 
-	afs_send_simple_reply(call, &reply, sizeof(reply));
 	afs_put_call(call);
 	_leave("");
 }
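Note the semantic shift in SRXAFSCB_ProbeUuid() above: the old code answered with an in-band match word (0 for a match, 1 otherwise), while the new code signals the result out of band, sending an empty success reply when the UUID is ours and aborting the call with code 1 (tagged "K-1") when it is not. Condensed:

    if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
    	afs_send_empty_reply(call);		/* It's us: plain success */
    else
    	rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
    				1, 1, "K-1");	/* Not us: abort code 1 */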
@@ -453,16 +531,16 @@
 
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
 		call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
 		if (!call->buffer)
 			return -ENOMEM;
+		afs_extract_to_buf(call, 11 * sizeof(__be32));
 		call->unmarshall++;
 
+		/* Fall through */
 	case 1:
 		_debug("extract UUID");
-		ret = afs_extract_data(call, call->buffer,
-				       11 * sizeof(__be32), false);
+		ret = afs_extract_data(call, false);
 		switch (ret) {
 		case 0:		break;
 		case -EAGAIN:	return 0;
@@ -485,7 +563,6 @@
 		for (loop = 0; loop < 6; loop++)
 			r->node[loop] = ntohl(b[loop + 5]);
 
-		call->offset = 0;
 		call->unmarshall++;
 
 	case 2:
@@ -493,9 +570,8 @@
 	}
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
-		return -EIO;
-
-	return afs_queue_call_work(call);
+		return afs_io_error(call, afs_io_error_cm_reply);
+	return afs_find_cm_server_by_uuid(call, call->request);
 }
 
 /*
@@ -503,9 +579,8 @@
  */
 static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 {
-	struct afs_interface *ifs;
 	struct afs_call *call = container_of(work, struct afs_call, work);
-	int loop, nifs;
+	int loop;
 
 	struct {
 		struct /* InterfaceAddr */ {
@@ -523,19 +598,7 @@
 
 	_enter("");
 
-	nifs = 0;
-	ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
-	if (ifs) {
-		nifs = afs_get_ipv4_interfaces(call->net, ifs, 32, false);
-		if (nifs < 0) {
-			kfree(ifs);
-			ifs = NULL;
-			nifs = 0;
-		}
-	}
-
 	memset(&reply, 0, sizeof(reply));
-	reply.ia.nifs = htonl(nifs);
 
 	reply.ia.uuid[0] = call->net->uuid.time_low;
 	reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
@@ -545,15 +608,6 @@
 	for (loop = 0; loop < 6; loop++)
 		reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]);
 
-	if (ifs) {
-		for (loop = 0; loop < nifs; loop++) {
-			reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
-			reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
-			reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
-		}
-		kfree(ifs);
-	}
-
 	reply.cap.capcount = htonl(1);
 	reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
 	afs_send_simple_reply(call, &reply, sizeof(reply));
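The reply built above packs the cache manager's UUID into eleven big-endian 32-bit XDR words, widening each sub-word field individually (the uuid[2]..uuid[4] assignments sit in unchanged context, so they don't appear in the hunk). A sketch of the full packing, assuming the usual struct afs_uuid field layout:

    __be32 xdr[11];
    int i;

    xdr[0] = uuid->time_low;			/* Already a full __be32 */
    xdr[1] = htonl(ntohs(uuid->time_mid));	/* 16 bits -> 32-bit word */
    xdr[2] = htonl(ntohs(uuid->time_hi_and_version));
    xdr[3] = htonl((s8)uuid->clock_seq_hi_and_reserved); /* Sign-extended */
    xdr[4] = htonl((s8)uuid->clock_seq_low);
    for (i = 0; i < 6; i++)
    	xdr[i + 5] = htonl((s8)uuid->node[i]);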
@@ -570,12 +624,90 @@
 
 	_enter("");
 
-	ret = afs_extract_data(call, NULL, 0, false);
+	afs_extract_discard(call, 0);
+	ret = afs_extract_data(call, false);
 	if (ret < 0)
 		return ret;
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
-		return -EIO;
+		return afs_io_error(call, afs_io_error_cm_reply);
+	return afs_find_cm_server_by_peer(call);
+}
 
-	return afs_queue_call_work(call);
+/*
+ * deliver request data to a YFS CB.CallBack call
+ */
+static int afs_deliver_yfs_cb_callback(struct afs_call *call)
+{
+	struct afs_callback_break *cb;
+	struct yfs_xdr_YFSFid *bp;
+	size_t size;
+	int ret, loop;
+
+	_enter("{%u}", call->unmarshall);
+
+	switch (call->unmarshall) {
+	case 0:
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+
+		/* extract the FID array and its count in two steps */
+		/* Fall through */
+	case 1:
+		_debug("extract FID count");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		call->count = ntohl(call->tmp);
+		_debug("FID count: %u", call->count);
+		if (call->count > YFSCBMAX)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_cb_fid_count);
+
+		size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid));
+		call->buffer = kmalloc(size, GFP_KERNEL);
+		if (!call->buffer)
+			return -ENOMEM;
+		afs_extract_to_buf(call, size);
+		call->unmarshall++;
+
+		/* Fall through */
+	case 2:
+		_debug("extract FID array");
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		_debug("unmarshall FID array");
+		call->request = kcalloc(call->count,
+					sizeof(struct afs_callback_break),
+					GFP_KERNEL);
+		if (!call->request)
+			return -ENOMEM;
+
+		cb = call->request;
+		bp = call->buffer;
+		for (loop = call->count; loop > 0; loop--, cb++) {
+			cb->fid.vid	= xdr_to_u64(bp->volume);
+			cb->fid.vnode	= xdr_to_u64(bp->vnode.lo);
+			cb->fid.vnode_hi = ntohl(bp->vnode.hi);
+			cb->fid.unique	= ntohl(bp->vnode.unique);
+			bp++;
+		}
+
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+
+		/* Fall through */
+	case 3:
+		break;
+	}
+
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return afs_io_error(call, afs_io_error_cm_reply);
+
+	/* We'll need the file server record as that tells us which set of
+	 * vnodes to operate upon.
+	 */
+	return afs_find_cm_server_by_peer(call);
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 855bf2b..497f979 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* dir.c: AFS filesystem directory handling
  *
  * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -18,6 +14,7 @@
 #include <linux/sched.h>
 #include <linux/task_io_accounting_ops.h>
 #include "internal.h"
+#include "afs_fs.h"
 #include "xdr_fs.h"
 
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -26,6 +23,7 @@
 static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
+static void afs_d_iput(struct dentry *dentry, struct inode *inode);
 static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
 				  loff_t fpos, u64 ino, unsigned dtype);
 static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
@@ -85,6 +83,7 @@
 	.d_delete	= afs_d_delete,
 	.d_release	= afs_d_release,
 	.d_automount	= afs_d_automount,
+	.d_iput		= afs_d_iput,
 };
 
 struct afs_lookup_one_cookie {
@@ -100,8 +99,8 @@
 	bool			found;
 	bool			one_only;
 	unsigned short		nr_fids;
-	struct afs_file_status	*statuses;
-	struct afs_callback	*callbacks;
+	struct inode		**inodes;
+	struct afs_status_cb	*statuses;
 	struct afs_fid		fids[50];
 };
 
@@ -138,6 +137,7 @@
 			       ntohs(dbuf->blocks[tmp].hdr.magic));
 			trace_afs_dir_check_failed(dvnode, off, i_size);
 			kunmap(page);
+			trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
 			goto error;
 		}
 
@@ -159,6 +159,38 @@
 }
 
 /*
+ * Check the contents of a directory that we've just read.
+ */
+static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req)
+{
+	struct afs_xdr_dir_page *dbuf;
+	unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+
+	for (i = 0; i < req->nr_pages; i++)
+		if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len))
+			goto bad;
+	return true;
+
+bad:
+	pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n",
+		dvnode->fid.vid, dvnode->fid.vnode,
+		req->file_size, req->len, req->actual_len, req->remain);
+	pr_warn("DIR %llx %x %x %x\n",
+		req->pos, req->index, req->nr_pages, req->offset);
+
+	for (i = 0; i < req->nr_pages; i++) {
+		dbuf = kmap(req->pages[i]);
+		for (j = 0; j < qty; j++) {
+			union afs_xdr_dir_block *block = &dbuf->blocks[j];
+
+			pr_warn("[%02x] %32phN\n", i * qty + j, block);
+		}
+		kunmap(req->pages[i]);
+	}
+	return false;
+}
+
+/*
  * open an AFS directory file
  */
 static int afs_dir_open(struct inode *inode, struct file *file)
@@ -190,9 +222,11 @@
 retry:
 	i_size = i_size_read(&dvnode->vfs_inode);
 	if (i_size < 2048)
-		return ERR_PTR(-EIO);
-	if (i_size > 2048 * 1024)
+		return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small));
+	if (i_size > 2048 * 1024) {
+		trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
 		return ERR_PTR(-EFBIG);
+	}
 
 	_enter("%llu", i_size);
 
@@ -204,8 +238,7 @@
 	if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *))
 		nr_inline = 0;
 
-	req = kzalloc(sizeof(*req) + sizeof(struct page *) * nr_inline,
-		      GFP_KERNEL);
+	req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
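The allocation above trades the open-coded sizeof(*req) + sizeof(struct page *) * nr_inline for struct_size(), which sizes a structure ending in a flexible array member and saturates on overflow, so a bad count produces a failed allocation rather than an undersized one. The shape of the idiom, with an illustrative struct:

    /* struct_size(p, member, n) == sizeof(*p) + n * sizeof(p->member[0]),
     * computed with overflow checking. */
    struct afs_read_like {
    	unsigned int	nr_pages;
    	struct page	**pages;	/* Points into array[] below */
    	struct page	*array[];	/* Flexible array member */
    };

    struct afs_read_like *req;

    req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL);
    if (!req)
    	return ERR_PTR(-ENOMEM);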
 
@@ -274,6 +307,7 @@
 		goto error;
 
 	if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
+		trace_afs_reload_dir(dvnode);
 		ret = afs_fetch_data(dvnode, key, req);
 		if (ret < 0)
 			goto error_unlock;
@@ -285,10 +319,8 @@
 
 		/* Validate the data we just read. */
 		ret = -EIO;
-		for (i = 0; i < req->nr_pages; i++)
-			if (!afs_dir_check_page(dvnode, req->pages[i],
-						req->actual_len))
-				goto error_unlock;
+		if (!afs_dir_check_pages(dvnode, req))
+			goto error_unlock;
 
 		// TODO: Trim excess pages
 
@@ -315,7 +347,8 @@
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(struct dir_context *ctx,
+static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+				 struct dir_context *ctx,
 				 union afs_xdr_dir_block *block,
 				 unsigned blkoff)
 {
@@ -365,7 +398,7 @@
 				       " (len %u/%zu)",
 				       blkoff / sizeof(union afs_xdr_dir_block),
 				       offset, next, tmp, nlen);
-				return -EIO;
+				return afs_bad(dvnode, afs_file_error_dir_over_end);
 			}
 			if (!(block->hdr.bitmap[next / 8] &
 			      (1 << (next % 8)))) {
@@ -373,7 +406,7 @@
 				       " %u unmarked extension (len %u/%zu)",
 				       blkoff / sizeof(union afs_xdr_dir_block),
 				       offset, next, tmp, nlen);
-				return -EIO;
+				return afs_bad(dvnode, afs_file_error_dir_unmarked_ext);
 			}
 
 			_debug("ENT[%zu.%u]: ext %u/%zu",
@@ -407,7 +440,7 @@
  * iterate through the data blob that lists the contents of an AFS directory
  */
 static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
-			   struct key *key)
+			   struct key *key, afs_dataversion_t *_dir_version)
 {
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct afs_xdr_dir_page *dbuf;
@@ -427,6 +460,7 @@
 	req = afs_read_dir(dvnode, key);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	*_dir_version = req->data_version;
 
 	/* round the file position up to the next entry boundary */
 	ctx->pos += sizeof(union afs_xdr_dirent) - 1;
@@ -442,7 +476,7 @@
 		 */
 		page = req->pages[blkoff / PAGE_SIZE];
 		if (!page) {
-			ret = -EIO;
+			ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
 			break;
 		}
 		mark_page_accessed(page);
@@ -455,7 +489,7 @@
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_xdr_dir_block)];
-			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
+			ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff);
 			if (ret != 1) {
 				kunmap(page);
 				goto out;
@@ -481,7 +515,10 @@
  */
 static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file));
+	afs_dataversion_t dir_version;
+
+	return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file),
+			       &dir_version);
 }
 
 /*
@@ -522,7 +559,8 @@
  * - just returns the FID the dentry name maps to if found
  */
 static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
-			     struct afs_fid *fid, struct key *key)
+			     struct afs_fid *fid, struct key *key,
+			     afs_dataversion_t *_dir_version)
 {
 	struct afs_super_info *as = dir->i_sb->s_fs_info;
 	struct afs_lookup_one_cookie cookie = {
@@ -535,7 +573,7 @@
 	_enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
 
 	/* search the directory */
-	ret = afs_dir_iterate(dir, &cookie.ctx, key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
@@ -548,7 +586,7 @@
 	}
 
 	*fid = cookie.fid;
-	_leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+	_leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique);
 	return 0;
 }
 
@@ -601,12 +639,15 @@
 				   struct key *key)
 {
 	struct afs_lookup_cookie *cookie;
-	struct afs_cb_interest *cbi = NULL;
+	struct afs_cb_interest *dcbi, *cbi = NULL;
 	struct afs_super_info *as = dir->i_sb->s_fs_info;
-	struct afs_iget_data data;
+	struct afs_status_cb *scb;
+	struct afs_iget_data iget_data;
 	struct afs_fs_cursor fc;
-	struct afs_vnode *dvnode = AFS_FS_I(dir);
-	struct inode *inode = NULL;
+	struct afs_server *server;
+	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+	struct inode *inode = NULL, *ti;
+	afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
 	int ret, i;
 
 	_enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
@@ -620,45 +661,70 @@
 	cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */
 
 	read_seqlock_excl(&dvnode->cb_lock);
-	if (dvnode->cb_interest &&
-	    dvnode->cb_interest->server &&
-	    test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags))
-		cookie->one_only = true;
+	dcbi = rcu_dereference_protected(dvnode->cb_interest,
+					 lockdep_is_held(&dvnode->cb_lock.lock));
+	if (dcbi) {
+		server = dcbi->server;
+		if (server &&
+		    test_bit(AFS_SERVER_FL_NO_IBULK, &server->flags))
+			cookie->one_only = true;
+	}
 	read_sequnlock_excl(&dvnode->cb_lock);
 
 	for (i = 0; i < 50; i++)
 		cookie->fids[i].vid = as->volume->vid;
 
 	/* search the directory */
-	ret = afs_dir_iterate(dir, &cookie->ctx, key);
+	ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version);
 	if (ret < 0) {
 		inode = ERR_PTR(ret);
 		goto out;
 	}
 
+	dentry->d_fsdata = (void *)(unsigned long)data_version;
+
 	inode = ERR_PTR(-ENOENT);
 	if (!cookie->found)
 		goto out;
 
 	/* Check to see if we already have an inode for the primary fid. */
-	data.volume = dvnode->volume;
-	data.fid = cookie->fids[0];
-	inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data);
+	iget_data.fid = cookie->fids[0];
+	iget_data.volume = dvnode->volume;
+	iget_data.cb_v_break = dvnode->volume->cb_v_break;
+	iget_data.cb_s_break = 0;
+	inode = ilookup5(dir->i_sb, cookie->fids[0].vnode,
+			 afs_iget5_test, &iget_data);
 	if (inode)
 		goto out;
 
 	/* Need space for examining all the selected files */
 	inode = ERR_PTR(-ENOMEM);
-	cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status),
-				   GFP_KERNEL);
+	cookie->statuses = kvcalloc(cookie->nr_fids, sizeof(struct afs_status_cb),
+				    GFP_KERNEL);
 	if (!cookie->statuses)
 		goto out;
 
-	cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback),
-				    GFP_KERNEL);
-	if (!cookie->callbacks)
+	cookie->inodes = kcalloc(cookie->nr_fids, sizeof(struct inode *),
+				 GFP_KERNEL);
+	if (!cookie->inodes)
 		goto out_s;
 
+	for (i = 1; i < cookie->nr_fids; i++) {
+		scb = &cookie->statuses[i];
+
+		/* Find any inodes that already exist and get their
+		 * callback counters.
+		 */
+		iget_data.fid = cookie->fids[i];
+		ti = ilookup5_nowait(dir->i_sb, iget_data.fid.vnode,
+				     afs_iget5_test, &iget_data);
+		if (!IS_ERR_OR_NULL(ti)) {
+			vnode = AFS_FS_I(ti);
+			scb->cb_break = afs_calc_vnode_cb_break(vnode);
+			cookie->inodes[i] = ti;
+		}
+	}
+
 	/* Try FS.InlineBulkStatus first.  Abort codes for the individual
 	 * lookups contained therein are stored in the reply without aborting
 	 * the whole operation.
@@ -667,7 +733,7 @@
 		goto no_inline_bulk_status;
 
 	inode = ERR_PTR(-ERESTARTSYS);
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
 		while (afs_select_fileserver(&fc)) {
 			if (test_bit(AFS_SERVER_FL_NO_IBULK,
 				      &fc.cbi->server->flags)) {
@@ -675,11 +741,12 @@
 				fc.ac.error = -ECONNABORTED;
 				break;
 			}
+			iget_data.cb_v_break = dvnode->volume->cb_v_break;
+			iget_data.cb_s_break = fc.cbi->server->cb_s_break;
 			afs_fs_inline_bulk_status(&fc,
 						  afs_v2net(dvnode),
 						  cookie->fids,
 						  cookie->statuses,
-						  cookie->callbacks,
 						  cookie->nr_fids, NULL);
 		}
 
@@ -700,15 +767,16 @@
 	 * any of the lookups fails - so, for the moment, revert to
 	 * FS.FetchStatus for just the primary fid.
 	 */
-	cookie->nr_fids = 1;
 	inode = ERR_PTR(-ERESTARTSYS);
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
 		while (afs_select_fileserver(&fc)) {
+			iget_data.cb_v_break = dvnode->volume->cb_v_break;
+			iget_data.cb_s_break = fc.cbi->server->cb_s_break;
+			scb = &cookie->statuses[0];
 			afs_fs_fetch_status(&fc,
 					    afs_v2net(dvnode),
 					    cookie->fids,
-					    cookie->statuses,
-					    cookie->callbacks,
+					    scb,
 					    NULL);
 		}
 
@@ -720,26 +788,41 @@
 	if (IS_ERR(inode))
 		goto out_c;
 
-	for (i = 0; i < cookie->nr_fids; i++)
-		cookie->statuses[i].abort_code = 0;
-
 success:
 	/* Turn all the files into inodes and save the first one - which is the
 	 * one we actually want.
 	 */
-	if (cookie->statuses[0].abort_code != 0)
-		inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code));
+	scb = &cookie->statuses[0];
+	if (scb->status.abort_code != 0)
+		inode = ERR_PTR(afs_abort_to_error(scb->status.abort_code));
 
 	for (i = 0; i < cookie->nr_fids; i++) {
-		struct inode *ti;
+		struct afs_status_cb *scb = &cookie->statuses[i];
 
-		if (cookie->statuses[i].abort_code != 0)
+		if (!scb->have_status && !scb->have_error)
 			continue;
 
-		ti = afs_iget(dir->i_sb, key, &cookie->fids[i],
-			      &cookie->statuses[i],
-			      &cookie->callbacks[i],
-			      cbi);
+		if (cookie->inodes[i]) {
+			struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]);
+
+			if (test_bit(AFS_VNODE_UNSET, &iv->flags))
+				continue;
+
+			afs_vnode_commit_status(&fc, iv,
+						scb->cb_break, NULL, scb);
+			continue;
+		}
+
+		if (scb->status.abort_code != 0)
+			continue;
+
+		iget_data.fid = cookie->fids[i];
+		ti = afs_iget(dir->i_sb, key, &iget_data, scb, cbi, dvnode);
+		if (!IS_ERR(ti))
+			afs_cache_permit(AFS_FS_I(ti), key,
+					 0 /* Assume vnode->cb_break is 0 */ +
+					 iget_data.cb_v_break,
+					 scb);
 		if (i == 0) {
 			inode = ti;
 		} else {
@@ -750,9 +833,13 @@
 
 out_c:
 	afs_put_cb_interest(afs_v2net(dvnode), cbi);
-	kfree(cookie->callbacks);
+	if (cookie->inodes) {
+		for (i = 0; i < cookie->nr_fids; i++)
+			iput(cookie->inodes[i]);
+		kfree(cookie->inodes);
+	}
 out_s:
-	kfree(cookie->statuses);
+	kvfree(cookie->statuses);
 out:
 	kfree(cookie);
 	return inode;
@@ -826,7 +913,7 @@
 	struct key *key;
 	int ret;
 
-	_enter("{%x:%u},%p{%pd},",
+	_enter("{%llx:%llu},%p{%pd},",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry);
 
 	ASSERTCMP(d_inode(dentry), ==, NULL);
@@ -871,12 +958,71 @@
 			(void *)(unsigned long)dvnode->status.data_version;
 	}
 	d = d_splice_alias(inode, dentry);
-	if (!IS_ERR_OR_NULL(d))
+	if (!IS_ERR_OR_NULL(d)) {
 		d->d_fsdata = dentry->d_fsdata;
+		trace_afs_lookup(dvnode, &d->d_name,
+				 inode ? AFS_FS_I(inode) : NULL);
+	} else {
+		trace_afs_lookup(dvnode, &dentry->d_name,
+				 IS_ERR_OR_NULL(inode) ? NULL
+				 : AFS_FS_I(inode));
+	}
 	return d;
 }
 
 /*
+ * Check the validity of a dentry under RCU conditions.
+ */
+static int afs_d_revalidate_rcu(struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct dentry *parent;
+	struct inode *dir, *inode;
+	long dir_version, de_version;
+
+	_enter("%p", dentry);
+
+	/* Check the parent directory is still valid first. */
+	parent = READ_ONCE(dentry->d_parent);
+	dir = d_inode_rcu(parent);
+	if (!dir)
+		return -ECHILD;
+	dvnode = AFS_FS_I(dir);
+	if (test_bit(AFS_VNODE_DELETED, &dvnode->flags))
+		return -ECHILD;
+
+	if (!afs_check_validity(dvnode))
+		return -ECHILD;
+
+	/* We only need to invalidate a dentry if the server's copy changed
+	 * behind our back.  If we made the change, it's no problem.  Note that
+	 * on a 32-bit system, we only have 32 bits in the dentry to store the
+	 * version.
+	 */
+	dir_version = (long)READ_ONCE(dvnode->status.data_version);
+	de_version = (long)READ_ONCE(dentry->d_fsdata);
+	if (de_version != dir_version) {
+		dir_version = (long)READ_ONCE(dvnode->invalid_before);
+		if (de_version - dir_version < 0)
+			return -ECHILD;
+	}
+
+	/* Check to see if the vnode referred to by the dentry still
+	 * has a callback.
+	 */
+	if (d_really_is_positive(dentry)) {
+		inode = d_inode_rcu(dentry);
+		if (inode) {
+			vnode = AFS_FS_I(inode);
+			if (!afs_check_validity(vnode))
+				return -ECHILD;
+		}
+	}
+
+	return 1; /* Still valid */
+}
+
+/*
  * check that a dentry lookup hit has found a valid entry
  * - NOTE! the hit can be a negative hit too, so we can't assume we have an
  *   inode
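afs_d_revalidate_rcu() above reduces to a version comparison: d_fsdata carries the directory data version that was current when the dentry was last validated, and the dentry stays trusted if that stamp matches the directory's present version or at least postdates its invalid_before mark. A self-contained model of the check, keeping the deliberate signed arithmetic that copes with the stamp being truncated to a long on 32-bit systems:

    #include <stdbool.h>

    /* de_version: version stamped on the dentry at lookup time.
     * dir_version: the directory's current data version.
     * invalid_before: stamps older than this are untrusted. */
    static bool dentry_version_ok(long de_version, long dir_version,
    			      long invalid_before)
    {
    	if (de_version == dir_version)
    		return true;	/* Directory unchanged since the stamp */
    	/* Signed difference tolerates wrap of the truncated stamp. */
    	return de_version - invalid_before >= 0;
    }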
@@ -888,15 +1034,16 @@
 	struct dentry *parent;
 	struct inode *inode;
 	struct key *key;
-	long dir_version, de_version;
+	afs_dataversion_t dir_version;
+	long de_version;
 	int ret;
 
 	if (flags & LOOKUP_RCU)
-		return -ECHILD;
+		return afs_d_revalidate_rcu(dentry);
 
 	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
-		_enter("{v={%x:%u} n=%pd fl=%lx},",
+		_enter("{v={%llx:%llu} n=%pd fl=%lx},",
 		       vnode->fid.vid, vnode->fid.vnode, dentry,
 		       vnode->flags);
 	} else {
@@ -934,20 +1081,20 @@
 	 * on a 32-bit system, we only have 32 bits in the dentry to store the
 	 * version.
 	 */
-	dir_version = (long)dir->status.data_version;
+	dir_version = dir->status.data_version;
 	de_version = (long)dentry->d_fsdata;
-	if (de_version == dir_version)
-		goto out_valid;
+	if (de_version == (long)dir_version)
+		goto out_valid_noupdate;
 
-	dir_version = (long)dir->invalid_before;
-	if (de_version - dir_version >= 0)
+	dir_version = dir->invalid_before;
+	if (de_version - (long)dir_version >= 0)
 		goto out_valid;
 
 	_debug("dir modified");
 	afs_stat_v(dir, n_reval);
 
 	/* search the directory for this vnode */
-	ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key);
+	ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version);
 	switch (ret) {
 	case 0:
 		/* the filename maps to something */
@@ -965,7 +1112,7 @@
 		/* if the vnode ID has changed, then the dirent points to a
 		 * different file */
 		if (fid.vnode != vnode->fid.vnode) {
-			_debug("%pd: dirent changed [%u != %u]",
+			_debug("%pd: dirent changed [%llu != %llu]",
 			       dentry, fid.vnode,
 			       vnode->fid.vnode);
 			goto not_found;
@@ -1000,7 +1147,8 @@
 	}
 
 out_valid:
-	dentry->d_fsdata = (void *)dir_version;
+	dentry->d_fsdata = (void *)(unsigned long)dir_version;
+out_valid_noupdate:
 	dput(parent);
 	key_put(key);
 	_leave(" = 1 [valid]");
@@ -1049,6 +1197,16 @@
 }
 
 /*
+ * Clean up sillyrename files on dentry removal.
+ */
+static void afs_d_iput(struct dentry *dentry, struct inode *inode)
+{
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		afs_silly_iput(dentry, inode);
+	iput(inode);
+}
+
+/*
  * handle dentry release
  */
 void afs_d_release(struct dentry *dentry)
@@ -1061,9 +1219,8 @@
  */
 static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
 				struct dentry *new_dentry,
-				struct afs_fid *newfid,
-				struct afs_file_status *newstatus,
-				struct afs_callback *newcb)
+				struct afs_iget_data *new_data,
+				struct afs_status_cb *new_scb)
 {
 	struct afs_vnode *vnode;
 	struct inode *inode;
@@ -1071,10 +1228,8 @@
 	if (fc->ac.error < 0)
 		return;
 
-	d_drop(new_dentry);
-
 	inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
-			 newfid, newstatus, newcb, fc->cbi);
+			 new_data, new_scb, fc->cbi, fc->vnode);
 	if (IS_ERR(inode)) {
 		/* ENOMEM or EINTR at a really inconvenient time - just abandon
 		 * the new directory on the server.
@@ -1085,7 +1240,31 @@
 
 	vnode = AFS_FS_I(inode);
 	set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-	d_add(new_dentry, inode);
+	if (fc->ac.error == 0)
+		afs_cache_permit(vnode, fc->key, vnode->cb_break, new_scb);
+	d_instantiate(new_dentry, inode);
+}
+
+static void afs_prep_for_new_inode(struct afs_fs_cursor *fc,
+				   struct afs_iget_data *iget_data)
+{
+	iget_data->volume = fc->vnode->volume;
+	iget_data->cb_v_break = fc->vnode->volume->cb_v_break;
+	iget_data->cb_s_break = fc->cbi->server->cb_s_break;
+}
+
+/*
+ * Note that a dentry got changed.  We need to set d_fsdata to the data version
+ * number derived from the result of the operation.  It doesn't matter if
+ * d_fsdata goes backwards as we'll just revalidate.
+ */
+static void afs_update_dentry_version(struct afs_fs_cursor *fc,
+				      struct dentry *dentry,
+				      struct afs_status_cb *scb)
+{
+	if (fc->ac.error == 0)
+		dentry->d_fsdata =
+			(void *)(unsigned long)scb->status.data_version;
 }
 
 /*
@@ -1093,37 +1272,45 @@
  */
 static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct afs_file_status newstatus;
+	struct afs_iget_data iget_data;
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
-	struct afs_callback newcb;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
-	struct afs_fid newfid;
 	struct key *key;
-	u64 data_version = dvnode->status.data_version;
 	int ret;
 
 	mode |= S_IFDIR;
 
-	_enter("{%x:%u},{%pd},%ho",
+	_enter("{%llx:%llu},{%pd},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
-		goto error;
+		goto error_scb;
 	}
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
-				      &newfid, &newstatus, &newcb);
+			afs_prep_for_new_inode(&fc, &iget_data);
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &scb[0], &iget_data.fid, &scb[1]);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		afs_check_for_remote_deletion(&fc, dvnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, &scb[0]);
+		afs_update_dentry_version(&fc, dentry, &scb[0]);
+		afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret < 0)
 			goto error_key;
@@ -1133,15 +1320,18 @@
 
 	if (ret == 0 &&
 	    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
 				 afs_edit_dir_for_create);
 
 	key_put(key);
+	kfree(scb);
 	_leave(" = 0");
 	return 0;
 
 error_key:
 	key_put(key);
+error_scb:
+	kfree(scb);
 error:
 	d_drop(dentry);
 	_leave(" = %d", ret);
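afs_mkdir() above establishes the status-pair pattern that the create, symlink, link and unlink paths below repeat: a kcalloc()'d pair of afs_status_cb records, with scb[0] receiving the parent directory's post-op status (committed against the expected data_version bump) and scb[1] the status of the newly created vnode, which then seeds the new inode. The shape, condensed from the code above with error paths trimmed:

    struct afs_status_cb *scb;

    scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
    if (!scb)
    	return -ENOMEM;

    /* ... RPC runs: parent dir status -> &scb[0], new vnode -> &scb[1] ... */

    afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
    			&data_version, &scb[0]);	/* Parent directory */
    afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);	/* New vnode */
    kfree(scb);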
@@ -1168,30 +1358,51 @@
  */
 static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 {
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
-	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
 	struct key *key;
-	u64 data_version = dvnode->status.data_version;
 	int ret;
 
-	_enter("{%x:%u},{%pd}",
+	_enter("{%llx:%llu},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
+	/* Try to make sure we have a callback promise on the victim. */
+	if (d_really_is_positive(dentry)) {
+		vnode = AFS_FS_I(d_inode(dentry));
+		ret = afs_validate(vnode, key);
+		if (ret < 0)
+			goto error_key;
+	}
+
+	if (vnode) {
+		ret = down_write_killable(&vnode->rmdir_lock);
+		if (ret < 0)
+			goto error_key;
+	}
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-			afs_fs_remove(&fc, dentry->d_name.name, true,
-				      data_version);
+			afs_fs_remove(&fc, vnode, dentry->d_name.name, true, scb);
 		}
 
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, scb);
+		afs_update_dentry_version(&fc, dentry, scb);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret == 0) {
 			afs_dir_remove_subdir(dentry);
@@ -1201,8 +1412,12 @@
 		}
 	}
 
+	if (vnode)
+		up_write(&vnode->rmdir_lock);
+error_key:
 	key_put(key);
 error:
+	kfree(scb);
 	return ret;
 }
 
@@ -1216,30 +1431,27 @@
  * However, if we didn't have a callback promise outstanding, or it was
  * outstanding on a different server, then it won't break it either...
  */
-static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
-			       unsigned long d_version_before,
-			       unsigned long d_version_after)
+static int afs_dir_remove_link(struct afs_vnode *dvnode, struct dentry *dentry,
+			       struct key *key)
 {
-	bool dir_valid;
 	int ret = 0;
 
-	/* There were no intervening changes on the server if the version
-	 * number we got back was incremented by exactly 1.
-	 */
-	dir_valid = (d_version_after == d_version_before + 1);
-
 	if (d_really_is_positive(dentry)) {
 		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 
-		if (dir_valid) {
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+			/* Already done */
+		} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
+			write_seqlock(&vnode->cb_lock);
 			drop_nlink(&vnode->vfs_inode);
 			if (vnode->vfs_inode.i_nlink == 0) {
 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
-				clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+				__afs_break_callback(vnode, afs_cb_break_for_unlink);
 			}
+			write_sequnlock(&vnode->cb_lock);
 			ret = 0;
 		} else {
-			clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+			afs_break_callback(vnode, afs_cb_break_for_unlink);
 
 			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 				kdebug("AFS_VNODE_DELETED");
@@ -1260,54 +1472,93 @@
 static int afs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct afs_fs_cursor fc;
-	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+	struct afs_status_cb *scb;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 	struct key *key;
-	unsigned long d_version = (unsigned long)dentry->d_fsdata;
-	u64 data_version = dvnode->status.data_version;
+	bool need_rehash = false;
 	int ret;
 
-	_enter("{%x:%u},{%pd}",
+	_enter("{%llx:%llu},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		return -ENAMETOOLONG;
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
-		goto error;
+		goto error_scb;
 	}
 
 	/* Try to make sure we have a callback promise on the victim. */
-	if (d_really_is_positive(dentry)) {
-		vnode = AFS_FS_I(d_inode(dentry));
-		ret = afs_validate(vnode, key);
-		if (ret < 0)
-			goto error_key;
+	ret = afs_validate(vnode, key);
+	if (ret < 0)
+		goto error_key;
+
+	spin_lock(&dentry->d_lock);
+	if (d_count(dentry) > 1) {
+		spin_unlock(&dentry->d_lock);
+		/* Start asynchronous writeout of the inode */
+		write_inode_now(d_inode(dentry), 0);
+		ret = afs_sillyrename(dvnode, vnode, dentry, key);
+		goto error_key;
 	}
+	if (!d_unhashed(dentry)) {
+		/* Prevent a race with RCU lookup. */
+		__d_drop(dentry);
+		need_rehash = true;
+	}
+	spin_unlock(&dentry->d_lock);
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+		afs_dataversion_t data_version_2 = vnode->status.data_version;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-			afs_fs_remove(&fc, dentry->d_name.name, false,
-				      data_version);
+			fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
+
+			if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
+			    !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
+				yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
+						    &scb[0], &scb[1]);
+				if (fc.ac.error != -ECONNABORTED ||
+				    fc.ac.abort_code != RXGEN_OPCODE)
+					continue;
+				set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
+			}
+
+			afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]);
 		}
 
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, &scb[0]);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break_2,
+					&data_version_2, &scb[1]);
+		afs_update_dentry_version(&fc, dentry, &scb[0]);
 		ret = afs_end_vnode_operation(&fc);
-		if (ret == 0)
-			ret = afs_dir_remove_link(
-				dentry, key, d_version,
-				(unsigned long)dvnode->status.data_version);
+		if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
+			ret = afs_dir_remove_link(dvnode, dentry, key);
 		if (ret == 0 &&
 		    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
 			afs_edit_dir_remove(dvnode, &dentry->d_name,
 					    afs_edit_dir_for_unlink);
 	}
 
+	if (need_rehash && ret < 0 && ret != -ENOENT)
+		d_rehash(dentry);
+
 error_key:
 	key_put(key);
+error_scb:
+	kfree(scb);
 error:
 	_leave(" = %d", ret);
 	return ret;
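The reworked afs_unlink() above splits on whether the target is busy: if other references hold the dentry (d_count() > 1), the file is silly-renamed so that open file descriptors keep working, and otherwise the dentry is unhashed up front so a concurrent RCU lookup can't race with the server-side remove. The decision, condensed:

    spin_lock(&dentry->d_lock);
    if (d_count(dentry) > 1) {
    	spin_unlock(&dentry->d_lock);
    	write_inode_now(d_inode(dentry), 0);	/* Flush dirty pages first */
    	return afs_sillyrename(dvnode, vnode, dentry, key);
    }
    __d_drop(dentry);	/* Hide from RCU lookup during the remove */
    spin_unlock(&dentry->d_lock);
    /* ... remove RPC runs; the dentry is rehashed if that fails with
     * anything other than -ENOENT ... */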
@@ -1319,18 +1570,16 @@
 static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      bool excl)
 {
+	struct afs_iget_data iget_data;
 	struct afs_fs_cursor fc;
-	struct afs_file_status newstatus;
-	struct afs_callback newcb;
+	struct afs_status_cb *scb;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
-	struct afs_fid newfid;
 	struct key *key;
-	u64 data_version = dvnode->status.data_version;
 	int ret;
 
 	mode |= S_IFREG;
 
-	_enter("{%x:%u},{%pd},%ho,",
+	_enter("{%llx:%llu},{%pd},%ho,",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
 
 	ret = -ENAMETOOLONG;
@@ -1343,17 +1592,27 @@
 		goto error;
 	}
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error_scb;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-			afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
-				      &newfid, &newstatus, &newcb);
+			afs_prep_for_new_inode(&fc, &iget_data);
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &scb[0], &iget_data.fid, &scb[1]);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		afs_check_for_remote_deletion(&fc, dvnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, &scb[0]);
+		afs_update_dentry_version(&fc, dentry, &scb[0]);
+		afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret < 0)
 			goto error_key;
@@ -1362,13 +1621,16 @@
 	}
 
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
 				 afs_edit_dir_for_create);
 
+	kfree(scb);
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
+error_scb:
+	kfree(scb);
 error_key:
 	key_put(key);
 error:
@@ -1384,16 +1646,13 @@
 		    struct dentry *dentry)
 {
 	struct afs_fs_cursor fc;
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_status_cb *scb;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
 	struct key *key;
-	u64 data_version;
 	int ret;
 
-	vnode = AFS_FS_I(d_inode(from));
-	dvnode = AFS_FS_I(dir);
-	data_version = dvnode->status.data_version;
-
-	_enter("{%x:%u},{%x:%u},{%pd}",
+	_enter("{%llx:%llu},{%llx:%llu},{%pd}",
 	       vnode->fid.vid, vnode->fid.vnode,
 	       dvnode->fid.vid, dvnode->fid.vnode,
 	       dentry);
@@ -1402,14 +1661,21 @@
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
-		goto error;
+		goto error_scb;
 	}
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
 		if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
 			afs_end_vnode_operation(&fc);
 			goto error_key;
@@ -1418,12 +1684,16 @@
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 			fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
-			afs_fs_link(&fc, vnode, dentry->d_name.name, data_version);
+			afs_fs_link(&fc, vnode, dentry->d_name.name,
+				    &scb[0], &scb[1]);
 		}
 
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break_2);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, &scb[0]);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break_2,
+					NULL, &scb[1]);
 		ihold(&vnode->vfs_inode);
+		afs_update_dentry_version(&fc, dentry, &scb[0]);
 		d_instantiate(dentry, &vnode->vfs_inode);
 
 		mutex_unlock(&vnode->io_lock);
@@ -1439,11 +1709,14 @@
 				 afs_edit_dir_for_link);
 
 	key_put(key);
+	kfree(scb);
 	_leave(" = 0");
 	return 0;
 
 error_key:
 	key_put(key);
+error_scb:
+	kfree(scb);
 error:
 	d_drop(dentry);
 	_leave(" = %d", ret);
@@ -1456,15 +1729,14 @@
 static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *content)
 {
+	struct afs_iget_data iget_data;
 	struct afs_fs_cursor fc;
-	struct afs_file_status newstatus;
+	struct afs_status_cb *scb;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
-	struct afs_fid newfid;
 	struct key *key;
-	u64 data_version = dvnode->status.data_version;
 	int ret;
 
-	_enter("{%x:%u},{%pd},%s",
+	_enter("{%llx:%llu},{%pd},%s",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry,
 	       content);
 
@@ -1476,24 +1748,33 @@
 	if (strlen(content) >= AFSPATHMAX)
 		goto error;
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
-		goto error;
+		goto error_scb;
 	}
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-			afs_fs_symlink(&fc, dentry->d_name.name,
-				       content, data_version,
-				       &newfid, &newstatus);
+			afs_prep_for_new_inode(&fc, &iget_data);
+			afs_fs_symlink(&fc, dentry->d_name.name, content,
+				       &scb[0], &iget_data.fid, &scb[1]);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL);
+		afs_check_for_remote_deletion(&fc, dvnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&data_version, &scb[0]);
+		afs_update_dentry_version(&fc, dentry, &scb[0]);
+		afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret < 0)
 			goto error_key;
@@ -1502,15 +1783,18 @@
 	}
 
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-		afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+		afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
 				 afs_edit_dir_for_symlink);
 
 	key_put(key);
+	kfree(scb);
 	_leave(" = 0");
 	return 0;
 
 error_key:
 	key_put(key);
+error_scb:
+	kfree(scb);
 error:
 	d_drop(dentry);
 	_leave(" = %d", ret);
@@ -1525,75 +1809,178 @@
 		      unsigned int flags)
 {
 	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
 	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+	struct dentry *tmp = NULL, *rehash = NULL;
+	struct inode *new_inode;
 	struct key *key;
-	u64 orig_data_version, new_data_version;
 	bool new_negative = d_is_negative(new_dentry);
 	int ret;
 
 	if (flags)
 		return -EINVAL;
 
+	/* Don't allow silly-rename files to be moved around. */
+	if (old_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EINVAL;
+
 	vnode = AFS_FS_I(d_inode(old_dentry));
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
-	orig_data_version = orig_dvnode->status.data_version;
-	new_data_version = new_dvnode->status.data_version;
 
-	_enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
+	_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
 	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
 	       vnode->fid.vid, vnode->fid.vnode,
 	       new_dvnode->fid.vid, new_dvnode->fid.vnode,
 	       new_dentry);
 
+	ret = -ENOMEM;
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	key = afs_request_key(orig_dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
-		goto error;
+		goto error_scb;
 	}
 
+	/* For non-directories, check whether the target is busy and if so,
+	 * make a copy of the dentry and then do a silly-rename.  If the
+	 * silly-rename succeeds, the copied dentry is hashed and becomes the
+	 * new target.
+	 */
+	if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+		/* To prevent any new references to the target during the
+		 * rename, we unhash the dentry in advance.
+		 */
+		if (!d_unhashed(new_dentry)) {
+			d_drop(new_dentry);
+			rehash = new_dentry;
+		}
+
+		if (d_count(new_dentry) > 2) {
+			/* copy the target dentry's name */
+			ret = -ENOMEM;
+			tmp = d_alloc(new_dentry->d_parent,
+				      &new_dentry->d_name);
+			if (!tmp)
+				goto error_rehash;
+
+			ret = afs_sillyrename(new_dvnode,
+					      AFS_FS_I(d_inode(new_dentry)),
+					      new_dentry, key);
+			if (ret)
+				goto error_rehash;
+
+			new_dentry = tmp;
+			rehash = NULL;
+			new_negative = true;
+		}
+	}
+
+	/* This bit is potentially nasty as there's a potential race with
+	 * afs_d_revalidate{,_rcu}().  We have to change d_fsdata on the dentry
+	 * to reflect its new parent's new data_version after the op, but
+	 * d_revalidate may see old_dentry between the op having taken place
+	 * and the version being updated.
+	 *
+	 * So drop the old_dentry for now to make other threads go through
+	 * lookup instead - which we hold a lock against.
+	 */
+	d_drop(old_dentry);
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) {
+	if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
+		afs_dataversion_t orig_data_version;
+		afs_dataversion_t new_data_version;
+		struct afs_status_cb *new_scb = &scb[1];
+
+		orig_data_version = orig_dvnode->status.data_version + 1;
+
 		if (orig_dvnode != new_dvnode) {
 			if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) {
 				afs_end_vnode_operation(&fc);
-				goto error_key;
+				goto error_rehash_old;
 			}
+			new_data_version = new_dvnode->status.data_version + 1;
+		} else {
+			new_data_version = orig_data_version;
+			new_scb = &scb[0];
 		}
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode);
 			fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
 			afs_fs_rename(&fc, old_dentry->d_name.name,
 				      new_dvnode, new_dentry->d_name.name,
-				      orig_data_version, new_data_version);
+				      &scb[0], new_scb);
 		}
 
-		afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break);
-		afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2);
-		if (orig_dvnode != new_dvnode)
+		afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
+					&orig_data_version, &scb[0]);
+		if (new_dvnode != orig_dvnode) {
+			afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2,
+						&new_data_version, &scb[1]);
 			mutex_unlock(&new_dvnode->io_lock);
+		}
 		ret = afs_end_vnode_operation(&fc);
 		if (ret < 0)
-			goto error_key;
+			goto error_rehash_old;
 	}
 
 	if (ret == 0) {
+		if (rehash)
+			d_rehash(rehash);
 		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
 		    afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
-					afs_edit_dir_for_rename);
+					afs_edit_dir_for_rename_0);
 
 		if (!new_negative &&
 		    test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
 			afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
-					    afs_edit_dir_for_rename);
+					    afs_edit_dir_for_rename_1);
 
 		if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
 			afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
-					 &vnode->fid,  afs_edit_dir_for_rename);
+					 &vnode->fid, afs_edit_dir_for_rename_2);
+
+		new_inode = d_inode(new_dentry);
+		if (new_inode) {
+			spin_lock(&new_inode->i_lock);
+			if (new_inode->i_nlink > 0)
+				drop_nlink(new_inode);
+			spin_unlock(&new_inode->i_lock);
+		}
+
+		/* Now we can update d_fsdata on the dentries to reflect their
+		 * new parent's data_version.
+		 *
+		 * Note that if we ever implement RENAME_EXCHANGE, we'll have
+		 * to update both dentries with opposing dir versions.
+		 */
+		if (new_dvnode != orig_dvnode) {
+			afs_update_dentry_version(&fc, old_dentry, &scb[1]);
+			afs_update_dentry_version(&fc, new_dentry, &scb[1]);
+		} else {
+			afs_update_dentry_version(&fc, old_dentry, &scb[0]);
+			afs_update_dentry_version(&fc, new_dentry, &scb[0]);
+		}
+		d_move(old_dentry, new_dentry);
+		goto error_tmp;
 	}
 
-error_key:
+error_rehash_old:
+	d_rehash(new_dentry);
+error_rehash:
+	if (rehash)
+		d_rehash(rehash);
+error_tmp:
+	if (tmp)
+		dput(tmp);
 	key_put(key);
+error_scb:
+	kfree(scb);
 error:
 	_leave(" = %d", ret);
 	return ret;
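The rename path above is careful about ordering around the d_fsdata race it documents: the old dentry is dropped before the RPC so other threads serialise through lookup, and only once the operation has succeeded are both dentries restamped with the destination directory's new data version and moved into place. In outline (dst_scb here stands for whichever of scb[0]/scb[1] tracked the destination directory):

    d_drop(old_dentry);		/* 1: force concurrent lookups to block */
    /* 2: FS.Rename runs; the server bumps the directory data version(s) */
    afs_update_dentry_version(&fc, old_dentry, dst_scb);	/* 3: restamp */
    afs_update_dentry_version(&fc, new_dentry, dst_scb);
    d_move(old_dentry, new_dentry);	/* 4: expose the name again */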
@@ -1607,7 +1994,7 @@
 {
 	struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
 
-	_enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
 
 	set_page_private(page, 0);
 	ClearPagePrivate(page);
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 8b400f5..d4fbe5f 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS filesystem directory editing
  *
  * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
new file mode 100644
index 0000000..361088a
--- /dev/null
+++ b/fs/afs/dir_silly.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* AFS silly rename handling
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * - Derived from NFS's sillyrename.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include "internal.h"
+
+/*
+ * Actually perform the silly rename step.
+ */
+static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			       struct dentry *old, struct dentry *new,
+			       struct key *key)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	int ret = -ERESTARTSYS;
+
+	_enter("%pd,%pd", old, new);
+
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
+	trace_afs_silly_rename(vnode, false);
+	if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+		afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
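+			/* A silly rename stays within one directory, so the
+			 * same status record stands in for both the source
+			 * and destination directory statuses below. */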
+			afs_fs_rename(&fc, old->d_name.name,
+				      dvnode, new->d_name.name,
+				      scb, scb);
+		}
+
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&dir_data_version, scb);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	if (ret == 0) {
+		spin_lock(&old->d_lock);
+		old->d_flags |= DCACHE_NFSFS_RENAMED;
+		spin_unlock(&old->d_lock);
+		if (dvnode->silly_key != key) {
+			key_put(dvnode->silly_key);
+			dvnode->silly_key = key_get(key);
+		}
+
+		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+			afs_edit_dir_remove(dvnode, &old->d_name,
+					    afs_edit_dir_for_silly_0);
+		if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+			afs_edit_dir_add(dvnode, &new->d_name,
+					 &vnode->fid, afs_edit_dir_for_silly_1);
+	}
+
+	kfree(scb);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/**
+ * afs_sillyrename - Perform a silly-rename of a dentry
+ *
+ * AFS is stateless and the server doesn't know when the client is holding a
+ * file open.  To prevent application problems when a file is unlinked while
+ * it's still open, the client performs a "silly-rename".  That is, it renames
+ * the file to a hidden file in the same directory, and only performs the
+ * unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ */
+int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+		    struct dentry *dentry, struct key *key)
+{
+	static unsigned int sillycounter;
+	struct dentry *sdentry = NULL;
+	unsigned char silly[16];
+	int ret = -EBUSY;
+
+	_enter("");
+
+	/* We don't allow a dentry to be silly-renamed twice. */
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EBUSY;
+
+	sdentry = NULL;
+	do {
+		int slen;
+
+		dput(sdentry);
+		sillycounter++;
+
+		/* Create a silly name.  Note that the ".__afs" prefix is
+		 * understood by the salvager and must not be changed.
+		 */
+		slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+
+		/* N.B. Better to return EBUSY here ... it could be dangerous
+		 * to delete the file while it's in use.
+		 */
+		if (IS_ERR(sdentry))
+			goto out;
+	} while (!d_is_negative(sdentry));
+
+	ihold(&vnode->vfs_inode);
+
+	ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);
+	switch (ret) {
+	case 0:
+		/* The rename succeeded. */
+		d_move(dentry, sdentry);
+		break;
+	case -ERESTARTSYS:
+		/* The result of the rename is unknown. Play it safe by forcing
+		 * a new lookup.
+		 */
+		d_drop(dentry);
+		d_drop(sdentry);
+	}
+
+	iput(&vnode->vfs_inode);
+	dput(sdentry);
+out:
+	_leave(" = %d", ret);
+	return ret;
+}
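
The do/while loop above keeps generating candidate names until lookup_one_len() yields a negative dentry, i.e. a name not yet present in the directory. A stand-alone illustration of the name format (the counter value is arbitrary; ".__afs" plus at most eight hex digits and a NUL always fits the 16-byte buffer):

	#include <stdio.h>

	int main(void)
	{
		char silly[16];
		unsigned int sillycounter = 0x1a2b;	/* arbitrary example */
		int slen = snprintf(silly, sizeof(silly), ".__afs%04X",
				    sillycounter);

		printf("%s (%d chars)\n", silly, slen);	/* .__afs1A2B (10 chars) */
		return 0;
	}
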
+
+/*
+ * Tell the server to remove a sillyrename file.
+ */
+static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			       struct dentry *dentry, struct key *key)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	int ret = -ERESTARTSYS;
+
+	_enter("");
+
+	scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
+	trace_afs_silly_rename(vnode, true);
+	if (afs_begin_vnode_operation(&fc, dvnode, key, false)) {
+		afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+
+			if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
+			    !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
+				yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
+						    &scb[0], &scb[1]);
+				if (fc.ac.error != -ECONNABORTED ||
+				    fc.ac.abort_code != RXGEN_OPCODE)
+					continue;
+				set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
+			}
+
+			afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]);
+		}
+
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+					&dir_data_version, &scb[0]);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret == 0) {
+			drop_nlink(&vnode->vfs_inode);
+			if (vnode->vfs_inode.i_nlink == 0) {
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+			}
+		}
+		if (ret == 0 &&
+		    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+			afs_edit_dir_remove(dvnode, &dentry->d_name,
+					    afs_edit_dir_for_unlink);
+	}
+
+	kfree(scb);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Remove sillyrename file on iput.
+ */
+int afs_silly_iput(struct dentry *dentry, struct inode *inode)
+{
+	struct afs_vnode *dvnode = AFS_FS_I(d_inode(dentry->d_parent));
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct dentry *alias;
+	int ret;
+
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+	_enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode);
+
+	down_read(&dvnode->rmdir_lock);
+
+	alias = d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq);
+	if (IS_ERR(alias)) {
+		up_read(&dvnode->rmdir_lock);
+		return 0;
+	}
+
+	if (!d_in_lookup(alias)) {
+		/* We raced with lookup...  See if we need to transfer the
+		 * sillyrename information to the aliased dentry.
+		 */
+		ret = 0;
+		spin_lock(&alias->d_lock);
+		if (d_really_is_positive(alias) &&
+		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+			alias->d_flags |= DCACHE_NFSFS_RENAMED;
+			ret = 1;
+		}
+		spin_unlock(&alias->d_lock);
+		up_read(&dvnode->rmdir_lock);
+		dput(alias);
+		return ret;
+	}
+
+	/* Stop lock-release from complaining. */
+	spin_lock(&vnode->lock);
+	vnode->lock_state = AFS_VNODE_LOCK_DELETED;
+	trace_afs_flock_ev(vnode, NULL, afs_flock_silly_delete, 0);
+	spin_unlock(&vnode->lock);
+
+	afs_do_silly_unlink(dvnode, vnode, dentry, dvnode->silly_key);
+	up_read(&dvnode->rmdir_lock);
+	d_lookup_done(alias);
+	dput(alias);
+	return 1;
+}
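
afs_silly_iput() returns 1 when it owns the cleanup (or has handed the DCACHE_NFSFS_RENAMED state to a racing alias) and 0 when the parallel-lookup allocation fails. The caller is not part of this file; a hypothetical caller-side sketch, assuming the dentry teardown path checks the flag before the final iput:

	#include <linux/dcache.h>
	#include <linux/fs.h>
	#include "internal.h"	/* for afs_silly_iput() */

	/* Hypothetical sketch: defer the real unlink of a silly-renamed file
	 * to afs_silly_iput() when the last dentry reference goes away. */
	static void afs_dentry_iput_sketch(struct dentry *dentry,
					   struct inode *inode)
	{
		if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
			afs_silly_iput(dentry, inode);
		iput(inode);
	}
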
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f29c6da..4150280 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS dynamic root handling
  *
  * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/fs.h>
@@ -14,13 +10,6 @@
 #include <linux/dns_resolver.h>
 #include "internal.h"
 
-const struct file_operations afs_dynroot_file_operations = {
-	.open		= dcache_dir_open,
-	.release	= dcache_dir_close,
-	.iterate_shared	= dcache_readdir,
-	.llseek		= dcache_dir_lseek,
-};
-
 /*
  * Probe to see if a cell may exist.  This prevents positive dentries from
  * being created unnecessarily.
@@ -28,6 +17,7 @@
 static int afs_probe_cell_name(struct dentry *dentry)
 {
 	struct afs_cell *cell;
+	struct afs_net *net = afs_d2net(dentry);
 	const char *name = dentry->d_name.name;
 	size_t len = dentry->d_name.len;
 	int ret;
@@ -40,13 +30,14 @@
 		len--;
 	}
 
-	cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len);
+	cell = afs_lookup_cell_rcu(net, name, len);
 	if (!IS_ERR(cell)) {
-		afs_put_cell(afs_d2net(dentry), cell);
+		afs_put_cell(net, cell);
 		return 0;
 	}
 
-	ret = dns_query("afsdb", name, len, "", NULL, NULL);
+	ret = dns_query(net->net, "afsdb", name, len, "srv=1",
+			NULL, NULL, false);
 	if (ret == -ENODATA)
 		ret = -EDESTADDRREQ;
 	return ret;
@@ -62,7 +53,7 @@
 	struct inode *inode;
 	int ret = -ENOENT;
 
-	_enter("%p{%pd}, {%x:%u}",
+	_enter("%p{%pd}, {%llx:%llu}",
 	       dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
 
 	if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
@@ -261,8 +252,7 @@
 	struct afs_net *net = afs_sb2net(sb);
 	int ret;
 
-	if (mutex_lock_interruptible(&net->proc_cells_lock) < 0)
-		return -ERESTARTSYS;
+	mutex_lock(&net->proc_cells_lock);
 
 	net->dynroot_sb = sb;
 	hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7d4f261..dd3c55c 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS filesystem file handling
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -17,6 +13,7 @@
 #include <linux/writeback.h>
 #include <linux/gfp.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/mm.h>
 #include "internal.h"
 
 static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -121,7 +118,7 @@
 	struct key *key;
 	int ret;
 
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
 
 	key = afs_request_key(vnode->volume->cell);
 	if (IS_ERR(key)) {
@@ -169,11 +166,12 @@
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	struct afs_file *af = file->private_data;
+	int ret = 0;
 
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
 
 	if ((file->f_mode & FMODE_WRITE))
-		return vfs_fsync(file, 0);
+		ret = vfs_fsync(file, 0);
 
 	file->private_data = NULL;
 	if (af->wb)
@@ -181,8 +179,8 @@
 	key_put(af->key);
 	kfree(af);
 	afs_prune_wb_keys(vnode);
-	_leave(" = 0");
-	return 0;
+	_leave(" = %d", ret);
+	return ret;
 }
 
 /*
@@ -193,11 +191,13 @@
 	int i;
 
 	if (refcount_dec_and_test(&req->usage)) {
-		for (i = 0; i < req->nr_pages; i++)
-			if (req->pages[i])
-				put_page(req->pages[i]);
-		if (req->pages != req->array)
-			kfree(req->pages);
+		if (req->pages) {
+			for (i = 0; i < req->nr_pages; i++)
+				if (req->pages[i])
+					put_page(req->pages[i]);
+			if (req->pages != req->array)
+				kfree(req->pages);
+		}
 		kfree(req);
 	}
 }
@@ -226,24 +226,32 @@
 int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
 {
 	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
 	int ret;
 
-	_enter("%s{%x:%u.%u},%x,,,",
+	_enter("%s{%llx:%llu.%u},%x,,,",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
 	       key_serial(key));
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_fetch_data(&fc, desc);
+			afs_fs_fetch_data(&fc, scb, desc);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
@@ -253,6 +261,7 @@
 				&afs_v2net(vnode)->n_fetch_bytes);
 	}
 
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -299,10 +308,11 @@
 		/* page will not be cached */
 	case -ENOBUFS:
 		_debug("cache said ENOBUFS");
+
+		/* fall through */
 	default:
 	go_on:
-		req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
-			      GFP_KERNEL);
+		req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
 		if (!req)
 			goto enomem;
 
@@ -401,10 +411,10 @@
 /*
  * Make pages available as they're filled.
  */
-static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
+static void afs_readpages_page_done(struct afs_read *req)
 {
 #ifdef CONFIG_AFS_FSCACHE
-	struct afs_vnode *vnode = call->reply[0];
+	struct afs_vnode *vnode = req->vnode;
 #endif
 	struct page *page = req->pages[req->index];
 
@@ -441,7 +451,7 @@
 	/* Count the number of contiguous pages at the front of the list.  Note
 	 * that the list goes prev-wards rather than next-wards.
 	 */
-	first = list_entry(pages->prev, struct page, lru);
+	first = lru_to_page(pages);
 	index = first->index + 1;
 	n = 1;
 	for (p = first->lru.prev; p != pages; p = p->prev) {
@@ -452,12 +462,12 @@
 		n++;
 	}
 
-	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *) * n,
-		      GFP_NOFS);
+	req = kzalloc(struct_size(req, array, n), GFP_NOFS);
 	if (!req)
 		return -ENOMEM;
 
 	refcount_set(&req->usage, 1);
+	req->vnode = vnode;
 	req->page_done = afs_readpages_page_done;
 	req->pos = first->index;
 	req->pos <<= PAGE_SHIFT;
@@ -473,7 +483,7 @@
 	 * page at the end of the file.
 	 */
 	do {
-		page = list_entry(pages->prev, struct page, lru);
+		page = lru_to_page(pages);
 		list_del(&page->lru);
 		index = page->index;
 		if (add_to_page_cache_lru(page, mapping, index,
@@ -634,7 +644,7 @@
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
 	unsigned long priv;
 
-	_enter("{{%x:%u}[%lu],%lx},%x",
+	_enter("{{%llx:%llu}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
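Two hunks above replace open-coded sizeof arithmetic with struct_size(). The helper computes sizeof(*req) + n * sizeof(req->array[0]) but saturates to SIZE_MAX on overflow, so an oversized page count makes the allocation fail cleanly instead of returning an undersized buffer. A self-contained sketch of the pattern (demo type, not the real struct afs_read):

	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct demo_read {
		size_t		nr_pages;
		struct page	*array[];	/* flexible array member */
	};

	static struct demo_read *demo_alloc(size_t n)
	{
		/* Saturates on overflow, so kzalloc() sees SIZE_MAX and fails. */
		struct demo_read *d = kzalloc(struct_size(d, array, n),
					      GFP_KERNEL);

		if (d)
			d->nr_pages = n;
		return d;
	}
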
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index dc62d15..d5e5a6d 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -1,21 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS file locking support
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include "internal.h"
 
 #define AFS_LOCK_GRANTED	0
 #define AFS_LOCK_PENDING	1
+#define AFS_LOCK_YOUR_TRY	2
 
 struct workqueue_struct *afs_lock_manager;
 
+static void afs_next_locker(struct afs_vnode *vnode, int error);
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
 static void afs_fl_release_private(struct file_lock *fl);
 
@@ -24,14 +22,26 @@
 	.fl_release_private	= afs_fl_release_private,
 };
 
+static inline void afs_set_lock_state(struct afs_vnode *vnode, enum afs_lock_state state)
+{
+	_debug("STATE %u -> %u", vnode->lock_state, state);
+	vnode->lock_state = state;
+}
+
+static atomic_t afs_file_lock_debug_id;
+
 /*
  * if the callback is broken on this vnode, then the lock may now be available
  */
 void afs_lock_may_be_available(struct afs_vnode *vnode)
 {
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 
-	queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+	spin_lock(&vnode->lock);
+	if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
+		afs_next_locker(vnode, 0);
+	trace_afs_flock_ev(vnode, NULL, afs_flock_callback_break, 0);
+	spin_unlock(&vnode->lock);
 }
 
 /*
@@ -40,8 +50,35 @@
  */
 static void afs_schedule_lock_extension(struct afs_vnode *vnode)
 {
-	queue_delayed_work(afs_lock_manager, &vnode->lock_work,
-			   AFS_LOCKWAIT * HZ / 2);
+	ktime_t expires_at, now, duration;
+	u64 duration_j;
+
+	expires_at = ktime_add_ms(vnode->locked_at, AFS_LOCKWAIT * 1000 / 2);
+	now = ktime_get_real();
+	duration = ktime_sub(expires_at, now);
+	if (duration <= 0)
+		duration_j = 0;
+	else
+		duration_j = nsecs_to_jiffies(ktime_to_ns(duration));
+
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, duration_j);
+}
+
+/*
+ * In the case of successful completion of a lock operation, record the time
+ * the reply appeared and start the lock extension timer.
+ */
+void afs_lock_op_done(struct afs_call *call)
+{
+	struct afs_vnode *vnode = call->lvnode;
+
+	if (call->error == 0) {
+		spin_lock(&vnode->lock);
+		trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
+		vnode->locked_at = call->reply_time;
+		afs_schedule_lock_extension(vnode);
+		spin_unlock(&vnode->lock);
+	}
 }
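
afs_lock_op_done() stamps the grant with the RPC reply time and re-arms the extension timer; afs_schedule_lock_extension() above targets the halfway point of the server lock's lifetime, so with the 5-minute server timeout an extension normally fires about 150 seconds after the grant. A worked sketch of the arithmetic, assuming AFS_LOCKWAIT is 5 * 60 seconds:

	#include <linux/jiffies.h>
	#include <linux/ktime.h>

	/* Jiffies until the half-life of a lock granted at @locked_at. */
	static u64 lock_extension_delay(ktime_t locked_at)
	{
		ktime_t expires_at = ktime_add_ms(locked_at, 5 * 60 * 1000 / 2);
		ktime_t remaining = ktime_sub(expires_at, ktime_get_real());

		return remaining <= 0 ? 0 : nsecs_to_jiffies(ktime_to_ns(remaining));
	}
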
 
 /*
@@ -49,52 +86,126 @@
  * first lock in the queue is itself a readlock)
  * - the caller must hold the vnode lock
  */
-static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl)
+static void afs_grant_locks(struct afs_vnode *vnode)
 {
 	struct file_lock *p, *_p;
+	bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
 
-	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
-	if (fl->fl_type == F_RDLCK) {
-		list_for_each_entry_safe(p, _p, &vnode->pending_locks,
-					 fl_u.afs.link) {
-			if (p->fl_type == F_RDLCK) {
-				p->fl_u.afs.state = AFS_LOCK_GRANTED;
-				list_move_tail(&p->fl_u.afs.link,
-					       &vnode->granted_locks);
-				wake_up(&p->fl_wait);
-			}
-		}
+	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
+		if (!exclusive && p->fl_type == F_WRLCK)
+			continue;
+
+		list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
+		p->fl_u.afs.state = AFS_LOCK_GRANTED;
+		trace_afs_flock_op(vnode, p, afs_flock_op_grant);
+		wake_up(&p->fl_wait);
 	}
 }
 
 /*
+ * If an error is specified, reject every pending lock that matches the
+ * authentication and type of the lock we failed to get.  If there are any
+ * remaining lockers, try to wake up one of them to have a go.
+ */
+static void afs_next_locker(struct afs_vnode *vnode, int error)
+{
+	struct file_lock *p, *_p, *next = NULL;
+	struct key *key = vnode->lock_key;
+	unsigned int fl_type = F_RDLCK;
+
+	_enter("");
+
+	if (vnode->lock_type == AFS_LOCK_WRITE)
+		fl_type = F_WRLCK;
+
+	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
+		if (error &&
+		    p->fl_type == fl_type &&
+		    afs_file_key(p->fl_file) == key) {
+			list_del_init(&p->fl_u.afs.link);
+			p->fl_u.afs.state = error;
+			wake_up(&p->fl_wait);
+		}
+
+		/* Select the next locker to hand off to. */
+		if (next &&
+		    (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+			continue;
+		next = p;
+	}
+
+	vnode->lock_key = NULL;
+	key_put(key);
+
+	if (next) {
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
+		next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
+		trace_afs_flock_op(vnode, next, afs_flock_op_wake);
+		wake_up(&next->fl_wait);
+	} else {
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
+		trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
+	}
+
+	_leave("");
+}
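
As far as the selection loop reads, the chosen next locker is the first write-lock waiter if any exists and otherwise the first waiter of any kind: a writer candidate is never displaced, and a reader candidate is displaced only by a writer. A user-space model of that rule (types hypothetical):

	#include <stdbool.h>
	#include <stddef.h>

	struct waiter {
		bool is_writer;
	};

	/* Mirrors the condition in afs_next_locker():
	 * (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK) -> keep next. */
	static struct waiter *pick_next(struct waiter *queue, size_t n)
	{
		struct waiter *next = NULL;
		size_t i;

		for (i = 0; i < n; i++) {
			if (next && (next->is_writer || !queue[i].is_writer))
				continue;
			next = &queue[i];
		}
		return next;	/* first writer if any, else queue[0] */
	}
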
+
+/*
+ * Kill off all waiters in the pending lock queue due to the vnode being
+ * deleted.
+ */
+static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
+{
+	struct file_lock *p;
+
+	afs_set_lock_state(vnode, AFS_VNODE_LOCK_DELETED);
+
+	while (!list_empty(&vnode->pending_locks)) {
+		p = list_entry(vnode->pending_locks.next,
+			       struct file_lock, fl_u.afs.link);
+		list_del_init(&p->fl_u.afs.link);
+		p->fl_u.afs.state = -ENOENT;
+		wake_up(&p->fl_wait);
+	}
+
+	key_put(vnode->lock_key);
+	vnode->lock_key = NULL;
+}
+
+/*
  * Get a lock on a file
  */
 static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
 			afs_lock_type_t type)
 {
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
 	int ret;
 
-	_enter("%s{%x:%u.%u},%x,%u",
+	_enter("%s{%llx:%llu.%u},%x,%u",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
 	       key_serial(key), type);
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_set_lock(&fc, type);
+			afs_fs_set_lock(&fc, type, scb);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -104,28 +215,34 @@
  */
 static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
 {
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
 	int ret;
 
-	_enter("%s{%x:%u.%u},%x",
+	_enter("%s{%llx:%llu.%u},%x",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
 	       key_serial(key));
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
 		while (afs_select_current_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_extend_lock(&fc);
+			afs_fs_extend_lock(&fc, scb);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -135,28 +252,34 @@
  */
 static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
 {
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
 	int ret;
 
-	_enter("%s{%x:%u.%u},%x",
+	_enter("%s{%llx:%llu.%u},%x",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
 	       key_serial(key));
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
 		while (afs_select_current_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_release_lock(&fc);
+			afs_fs_release_lock(&fc, scb);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -170,12 +293,10 @@
 {
 	struct afs_vnode *vnode =
 		container_of(work, struct afs_vnode, lock_work.work);
-	struct file_lock *fl, *next;
-	afs_lock_type_t type;
 	struct key *key;
 	int ret;
 
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 
 	spin_lock(&vnode->lock);
 
@@ -183,35 +304,28 @@
 	_debug("wstate %u for %p", vnode->lock_state, vnode);
 	switch (vnode->lock_state) {
 	case AFS_VNODE_LOCK_NEED_UNLOCK:
-		_debug("unlock");
-		vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING;
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_UNLOCKING);
+		trace_afs_flock_ev(vnode, NULL, afs_flock_work_unlocking, 0);
 		spin_unlock(&vnode->lock);
 
 		/* attempt to release the server lock; if it fails, we just
 		 * wait 5 minutes and it'll expire anyway */
 		ret = afs_release_lock(vnode, vnode->lock_key);
-		if (ret < 0)
+		if (ret < 0 && vnode->lock_state != AFS_VNODE_LOCK_DELETED) {
+			trace_afs_flock_ev(vnode, NULL, afs_flock_release_fail,
+					   ret);
 			printk(KERN_WARNING "AFS:"
-			       " Failed to release lock on {%x:%x} error %d\n",
+			       " Failed to release lock on {%llx:%llx} error %d\n",
 			       vnode->fid.vid, vnode->fid.vnode, ret);
-
-		spin_lock(&vnode->lock);
-		key_put(vnode->lock_key);
-		vnode->lock_key = NULL;
-		vnode->lock_state = AFS_VNODE_LOCK_NONE;
-
-		if (list_empty(&vnode->pending_locks)) {
-			spin_unlock(&vnode->lock);
-			return;
 		}
 
-		/* The new front of the queue now owns the state variables. */
-		next = list_entry(vnode->pending_locks.next,
-				  struct file_lock, fl_u.afs.link);
-		vnode->lock_key = afs_file_key(next->fl_file);
-		vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-		vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-		goto again;
+		spin_lock(&vnode->lock);
+		if (ret == -ENOENT)
+			afs_kill_lockers_enoent(vnode);
+		else
+			afs_next_locker(vnode, 0);
+		spin_unlock(&vnode->lock);
+		return;
 
 	/* If we've already got a lock, then it must be time to extend that
 	 * lock as AFS locks time out after 5 minutes.
@@ -222,87 +336,57 @@
 		ASSERT(!list_empty(&vnode->granted_locks));
 
 		key = key_get(vnode->lock_key);
-		vnode->lock_state = AFS_VNODE_LOCK_EXTENDING;
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_EXTENDING);
+		trace_afs_flock_ev(vnode, NULL, afs_flock_work_extending, 0);
 		spin_unlock(&vnode->lock);
 
 		ret = afs_extend_lock(vnode, key); /* RPC */
 		key_put(key);
 
-		if (ret < 0)
-			pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n",
+		if (ret < 0) {
+			trace_afs_flock_ev(vnode, NULL, afs_flock_extend_fail,
+					   ret);
+			pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
 				   vnode->fid.vid, vnode->fid.vnode, ret);
+		}
 
 		spin_lock(&vnode->lock);
 
+		if (ret == -ENOENT) {
+			afs_kill_lockers_enoent(vnode);
+			spin_unlock(&vnode->lock);
+			return;
+		}
+
 		if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING)
 			goto again;
-		vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED);
 
-		if (ret == 0)
-			afs_schedule_lock_extension(vnode);
-		else
+		if (ret != 0)
 			queue_delayed_work(afs_lock_manager, &vnode->lock_work,
 					   HZ * 10);
 		spin_unlock(&vnode->lock);
 		_leave(" [ext]");
 		return;
 
-		/* If we don't have a granted lock, then we must've been called
-		 * back by the server, and so if might be possible to get a
-		 * lock we're currently waiting for.
-		 */
+	/* If we're waiting for a callback to indicate lock release, we can't
+	 * actually rely on this, so we need to recheck at regular intervals.  The
+	 * problem is that the server might not notify us if the lock just
+	 * expires (say because a client died) rather than being explicitly
+	 * released.
+	 */
 	case AFS_VNODE_LOCK_WAITING_FOR_CB:
-		_debug("get");
-
-		key = key_get(vnode->lock_key);
-		type = vnode->lock_type;
-		vnode->lock_state = AFS_VNODE_LOCK_SETTING;
+		_debug("retry");
+		afs_next_locker(vnode, 0);
 		spin_unlock(&vnode->lock);
+		return;
 
-		ret = afs_set_lock(vnode, key, type); /* RPC */
-		key_put(key);
+	case AFS_VNODE_LOCK_DELETED:
+		afs_kill_lockers_enoent(vnode);
+		spin_unlock(&vnode->lock);
+		return;
 
-		spin_lock(&vnode->lock);
-		switch (ret) {
-		case -EWOULDBLOCK:
-			_debug("blocked");
-			break;
-		case 0:
-			_debug("acquired");
-			vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
-			/* Fall through */
-		default:
-			/* Pass the lock or the error onto the first locker in
-			 * the list - if they're looking for this type of lock.
-			 * If they're not, we assume that whoever asked for it
-			 * took a signal.
-			 */
-			if (list_empty(&vnode->pending_locks)) {
-				_debug("withdrawn");
-				vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-				goto again;
-			}
-
-			fl = list_entry(vnode->pending_locks.next,
-					struct file_lock, fl_u.afs.link);
-			type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-			if (vnode->lock_type != type) {
-				_debug("changed");
-				vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-				goto again;
-			}
-
-			fl->fl_u.afs.state = ret;
-			if (ret == 0)
-				afs_grant_locks(vnode, fl);
-			else
-				list_del_init(&fl->fl_u.afs.link);
-			wake_up(&fl->fl_wait);
-			spin_unlock(&vnode->lock);
-			_leave(" [granted]");
-			return;
-		}
-
+		/* Fall through */
 	default:
 		/* Looks like a lock request was withdrawn. */
 		spin_unlock(&vnode->lock);
@@ -319,14 +403,16 @@
  */
 static void afs_defer_unlock(struct afs_vnode *vnode)
 {
-	_enter("");
+	_enter("%u", vnode->lock_state);
 
-	if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
-	    vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) {
+	if (list_empty(&vnode->granted_locks) &&
+	    (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
+	     vnode->lock_state == AFS_VNODE_LOCK_EXTENDING)) {
 		cancel_delayed_work(&vnode->lock_work);
 
-		vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-		afs_lock_may_be_available(vnode);
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_NEED_UNLOCK);
+		trace_afs_flock_ev(vnode, NULL, afs_flock_defer_unlock, 0);
+		queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
 	}
 }
 
@@ -335,7 +421,7 @@
  * whether we think that we have a locking permit.
  */
 static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key,
-			      afs_lock_type_t type, bool can_sleep)
+			      enum afs_flock_mode mode, afs_lock_type_t type)
 {
 	afs_access_t access;
 	int ret;
@@ -363,160 +449,177 @@
 	if (type == AFS_LOCK_READ) {
 		if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK)))
 			return -EACCES;
-		if (vnode->status.lock_count == -1 && !can_sleep)
-			return -EAGAIN; /* Write locked */
 	} else {
 		if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE)))
 			return -EACCES;
-		if (vnode->status.lock_count != 0 && !can_sleep)
-			return -EAGAIN; /* Locked */
 	}
 
 	return 0;
 }
 
 /*
- * Remove the front runner from the pending queue.
- * - The caller must hold vnode->lock.
- */
-static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
-{
-	struct file_lock *next;
-
-	_enter("");
-
-	/* ->lock_type, ->lock_key and ->lock_state only belong to this
-	 * file_lock if we're at the front of the pending queue or if we have
-	 * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING.
-	 */
-	if (vnode->granted_locks.next == &fl->fl_u.afs.link &&
-	    vnode->granted_locks.prev == &fl->fl_u.afs.link) {
-		list_del_init(&fl->fl_u.afs.link);
-		afs_defer_unlock(vnode);
-		return;
-	}
-
-	if (!list_empty(&vnode->granted_locks) ||
-	    vnode->pending_locks.next != &fl->fl_u.afs.link) {
-		list_del_init(&fl->fl_u.afs.link);
-		return;
-	}
-
-	list_del_init(&fl->fl_u.afs.link);
-	key_put(vnode->lock_key);
-	vnode->lock_key = NULL;
-	vnode->lock_state = AFS_VNODE_LOCK_NONE;
-
-	if (list_empty(&vnode->pending_locks))
-		return;
-
-	/* The new front of the queue now owns the state variables. */
-	next = list_entry(vnode->pending_locks.next,
-			  struct file_lock, fl_u.afs.link);
-	vnode->lock_key = afs_file_key(next->fl_file);
-	vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-	vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-	afs_lock_may_be_available(vnode);
-}
-
-/*
  * request a lock on a file on the server
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
 	struct inode *inode = locks_inode(file);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode;
 	afs_lock_type_t type;
 	struct key *key = afs_file_key(file);
+	bool partial, no_server_lock = false;
 	int ret;
 
-	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+	if (mode == afs_flock_mode_unset)
+		mode = afs_flock_mode_openafs;
 
-	/* only whole-file locks are supported */
-	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
-		return -EINVAL;
+	_enter("{%llx:%llu},%llu-%llu,%u,%u",
+	       vnode->fid.vid, vnode->fid.vnode,
+	       fl->fl_start, fl->fl_end, fl->fl_type, mode);
 
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
 
+	partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+	if (mode == afs_flock_mode_write && partial)
+		type = AFS_LOCK_WRITE;
 
-	ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP);
+	ret = afs_do_setlk_check(vnode, key, mode, type);
 	if (ret < 0)
 		return ret;
 
-	spin_lock(&vnode->lock);
+	trace_afs_flock_op(vnode, fl, afs_flock_op_set_lock);
 
-	/* If we've already got a readlock on the server then we instantly
-	 * grant another readlock, irrespective of whether there are any
-	 * pending writelocks.
+	/* AFS3 protocol only supports full-file locks and doesn't provide any
+	 * method of upgrade/downgrade, so we need to emulate for partial-file
+	 * locks.
+	 *
+	 * The OpenAFS client only gets a server lock for a full-file lock and
+	 * keeps partial-file locks local.  Allow this behaviour to be emulated
+	 * (as the default).
 	 */
-	if (type == AFS_LOCK_READ &&
-	    vnode->lock_state == AFS_VNODE_LOCK_GRANTED &&
-	    vnode->lock_type == AFS_LOCK_READ) {
-		_debug("instant readlock");
-		ASSERT(!list_empty(&vnode->granted_locks));
-		goto share_existing_lock;
+	if (mode == afs_flock_mode_local ||
+	    (partial && mode == afs_flock_mode_openafs)) {
+		no_server_lock = true;
+		goto skip_server_lock;
 	}
 
+	spin_lock(&vnode->lock);
 	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
 
+	ret = -ENOENT;
+	if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
+		goto error_unlock;
+
+	/* If we've already got a lock on the server then try to move to having
+	 * the VFS grant the requested lock.  Note that this means that other
+	 * clients may get starved out.
+	 */
+	_debug("try %u", vnode->lock_state);
+	if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED) {
+		if (type == AFS_LOCK_READ) {
+			_debug("instant readlock");
+			list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+			fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+			goto vnode_is_locked_u;
+		}
+
+		if (vnode->lock_type == AFS_LOCK_WRITE) {
+			_debug("instant writelock");
+			list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+			fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+			goto vnode_is_locked_u;
+		}
+	}
+
+	if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
+	    !(fl->fl_flags & FL_SLEEP)) {
+		ret = -EAGAIN;
+		if (type == AFS_LOCK_READ) {
+			if (vnode->status.lock_count == -1)
+				goto lock_is_contended; /* Write locked */
+		} else {
+			if (vnode->status.lock_count != 0)
+				goto lock_is_contended; /* Locked */
+		}
+	}
+
 	if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
 		goto need_to_wait;
 
+try_to_lock:
 	/* We don't have a lock on this vnode and we aren't currently waiting
 	 * for one either, so ask the server for a lock.
 	 *
 	 * Note that we need to be careful if we get interrupted by a signal
 	 * after dispatching the request as we may still get the lock, even
 	 * though we don't wait for the reply (it's not too bad a problem - the
-	 * lock will expire in 10 mins anyway).
+	 * lock will expire in 5 mins anyway).
 	 */
-	_debug("not locked");
+	trace_afs_flock_ev(vnode, fl, afs_flock_try_to_lock, 0);
 	vnode->lock_key = key_get(key);
 	vnode->lock_type = type;
-	vnode->lock_state = AFS_VNODE_LOCK_SETTING;
+	afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
 	spin_unlock(&vnode->lock);
 
 	ret = afs_set_lock(vnode, key, type); /* RPC */
 
 	spin_lock(&vnode->lock);
 	switch (ret) {
+	case -EKEYREJECTED:
+	case -EKEYEXPIRED:
+	case -EKEYREVOKED:
+	case -EPERM:
+	case -EACCES:
+		fl->fl_u.afs.state = ret;
+		trace_afs_flock_ev(vnode, fl, afs_flock_fail_perm, ret);
+		list_del_init(&fl->fl_u.afs.link);
+		afs_next_locker(vnode, ret);
+		goto error_unlock;
+
+	case -ENOENT:
+		fl->fl_u.afs.state = ret;
+		trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret);
+		list_del_init(&fl->fl_u.afs.link);
+		afs_kill_lockers_enoent(vnode);
+		goto error_unlock;
+
 	default:
-		goto abort_attempt;
+		fl->fl_u.afs.state = ret;
+		trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret);
+		list_del_init(&fl->fl_u.afs.link);
+		afs_next_locker(vnode, 0);
+		goto error_unlock;
 
 	case -EWOULDBLOCK:
 		/* The server doesn't have a lock-waiting queue, so the client
 		 * will have to retry.  The server will break the outstanding
 		 * callbacks on a file when a lock is released.
 		 */
-		_debug("would block");
 		ASSERT(list_empty(&vnode->granted_locks));
 		ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link);
-		vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-		goto need_to_wait;
+		goto lock_is_contended;
 
 	case 0:
-		_debug("acquired");
-		break;
+		afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED);
+		trace_afs_flock_ev(vnode, fl, afs_flock_acquired, type);
+		afs_grant_locks(vnode);
+		goto vnode_is_locked_u;
 	}
 
-	/* we've acquired a server lock, but it needs to be renewed after 5
-	 * mins */
-	vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
-	afs_schedule_lock_extension(vnode);
-
-share_existing_lock:
-	/* the lock has been granted as far as we're concerned... */
-	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
-	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
-
-given_lock:
-	/* ... but we do still need to get the VFS's blessing */
+vnode_is_locked_u:
 	spin_unlock(&vnode->lock);
+vnode_is_locked:
+	/* the lock has been granted by the server... */
+	ASSERTCMP(fl->fl_u.afs.state, ==, AFS_LOCK_GRANTED);
 
-	ret = posix_lock_file(file, fl, NULL);
+skip_server_lock:
+	/* ... but the VFS still needs to distribute access on this client. */
+	trace_afs_flock_ev(vnode, fl, afs_flock_vfs_locking, 0);
+	ret = locks_lock_file_wait(file, fl);
+	trace_afs_flock_ev(vnode, fl, afs_flock_vfs_lock, ret);
 	if (ret < 0)
 		goto vfs_rejected_lock;
 
@@ -528,38 +631,62 @@
 	_leave(" = 0");
 	return 0;
 
+lock_is_contended:
+	if (!(fl->fl_flags & FL_SLEEP)) {
+		list_del_init(&fl->fl_u.afs.link);
+		afs_next_locker(vnode, 0);
+		ret = -EAGAIN;
+		goto error_unlock;
+	}
+
+	afs_set_lock_state(vnode, AFS_VNODE_LOCK_WAITING_FOR_CB);
+	trace_afs_flock_ev(vnode, fl, afs_flock_would_block, ret);
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 5);
+
 need_to_wait:
 	/* We're going to have to wait.  Either this client doesn't have a lock
 	 * on the server yet and we need to wait for a callback to occur, or
-	 * the client does have a lock on the server, but it belongs to some
-	 * other process(es) and is incompatible with the lock we want.
+	 * the client does have a lock on the server, but it's shared and we
+	 * need an exclusive lock.
 	 */
-	ret = -EAGAIN;
-	if (fl->fl_flags & FL_SLEEP) {
-		spin_unlock(&vnode->lock);
+	spin_unlock(&vnode->lock);
 
-		_debug("sleep");
-		ret = wait_event_interruptible(fl->fl_wait,
-					       fl->fl_u.afs.state != AFS_LOCK_PENDING);
+	trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
+	ret = wait_event_interruptible(fl->fl_wait,
+				       fl->fl_u.afs.state != AFS_LOCK_PENDING);
+	trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
 
+	if (fl->fl_u.afs.state >= 0 && fl->fl_u.afs.state != AFS_LOCK_GRANTED) {
 		spin_lock(&vnode->lock);
+
+		switch (fl->fl_u.afs.state) {
+		case AFS_LOCK_YOUR_TRY:
+			fl->fl_u.afs.state = AFS_LOCK_PENDING;
+			goto try_to_lock;
+		case AFS_LOCK_PENDING:
+			if (ret > 0) {
+				/* We need to retry the lock.  We may not be
+				 * notified by the server if it just expired
+				 * rather than being released.
+				 */
+				ASSERTCMP(vnode->lock_state, ==, AFS_VNODE_LOCK_WAITING_FOR_CB);
+				afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
+				fl->fl_u.afs.state = AFS_LOCK_PENDING;
+				goto try_to_lock;
+			}
+			goto error_unlock;
+		case AFS_LOCK_GRANTED:
+		default:
+			break;
+		}
+
+		spin_unlock(&vnode->lock);
 	}
 
 	if (fl->fl_u.afs.state == AFS_LOCK_GRANTED)
-		goto given_lock;
-	if (fl->fl_u.afs.state < 0)
-		ret = fl->fl_u.afs.state;
-
-abort_attempt:
-	/* we aren't going to get the lock, either because we're unwilling to
-	 * wait, or because some signal happened */
-	_debug("abort");
-	afs_dequeue_lock(vnode, fl);
-
-error_unlock:
-	spin_unlock(&vnode->lock);
-	_leave(" = %d", ret);
-	return ret;
+		goto vnode_is_locked;
+	ret = fl->fl_u.afs.state;
+	goto error;
 
 vfs_rejected_lock:
 	/* The VFS rejected the lock we just obtained, so we have to discard
@@ -567,11 +694,17 @@
 	 * deal with.
 	 */
 	_debug("vfs refused %d", ret);
+	if (no_server_lock)
+		goto error;
 	spin_lock(&vnode->lock);
 	list_del_init(&fl->fl_u.afs.link);
-	if (list_empty(&vnode->granted_locks))
-		afs_defer_unlock(vnode);
-	goto error_unlock;
+	afs_defer_unlock(vnode);
+
+error_unlock:
+	spin_unlock(&vnode->lock);
+error:
+	_leave(" = %d", ret);
+	return ret;
 }
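
Whether a server lock is attempted at all now depends on the mount's flock mode, per the comment near the top of afs_do_setlk(): local mode keeps every lock client-side, the OpenAFS-compatible default mirrors only whole-file locks to the server, and the remaining modes always take a server lock. A compact user-space restatement of that decision (enum names hypothetical):

	#include <stdbool.h>

	enum demo_flock_mode { DEMO_LOCAL, DEMO_OPENAFS, DEMO_STRICT };

	/* @partial: fl_start/fl_end do not cover the whole file. */
	static bool demo_needs_server_lock(enum demo_flock_mode mode,
					   bool partial)
	{
		if (mode == DEMO_LOCAL)
			return false;	/* all locks stay local */
		if (partial && mode == DEMO_OPENAFS)
			return false;	/* OpenAFS keeps partial locks local */
		return true;
	}
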
 
 /*
@@ -582,16 +715,14 @@
 	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
 	int ret;
 
-	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+	_enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
 
 	/* Flush all pending writes before doing anything with locks. */
 	vfs_fsync(file, 0);
 
-	/* only whole-file unlocks are supported */
-	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
-		return -EINVAL;
-
-	ret = posix_lock_file(file, fl, NULL);
+	ret = locks_lock_file_wait(file, fl);
 	_leave(" = %d [%u]", ret, vnode->lock_state);
 	return ret;
 }
@@ -607,23 +738,29 @@
 
 	_enter("");
 
+	if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
+		return -ENOENT;
+
 	fl->fl_type = F_UNLCK;
 
 	/* check local lock records first */
 	posix_test_lock(file, fl);
 	if (fl->fl_type == F_UNLCK) {
 		/* no local locks; consult the server */
-		ret = afs_fetch_status(vnode, key, false);
+		ret = afs_fetch_status(vnode, key, false, NULL);
 		if (ret < 0)
 			goto error;
 
 		lock_count = READ_ONCE(vnode->status.lock_count);
-		if (lock_count > 0)
-			fl->fl_type = F_RDLCK;
-		else
-			fl->fl_type = F_WRLCK;
-		fl->fl_start = 0;
-		fl->fl_end = OFFSET_MAX;
+		if (lock_count != 0) {
+			if (lock_count > 0)
+				fl->fl_type = F_RDLCK;
+			else
+				fl->fl_type = F_WRLCK;
+			fl->fl_start = 0;
+			fl->fl_end = OFFSET_MAX;
+			fl->fl_pid = 0;
+		}
 	}
 
 	ret = 0;
@@ -638,8 +775,10 @@
 int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
+	enum afs_flock_operation op;
+	int ret;
 
-	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+	_enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
 	       fl->fl_type, fl->fl_flags,
 	       (long long) fl->fl_start, (long long) fl->fl_end);
@@ -650,9 +789,23 @@
 
 	if (IS_GETLK(cmd))
 		return afs_do_getlk(file, fl);
+
+	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+	trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
+
 	if (fl->fl_type == F_UNLCK)
-		return afs_do_unlk(file, fl);
-	return afs_do_setlk(file, fl);
+		ret = afs_do_unlk(file, fl);
+	else
+		ret = afs_do_setlk(file, fl);
+
+	switch (ret) {
+	case 0:		op = afs_flock_op_return_ok; break;
+	case -EAGAIN:	op = afs_flock_op_return_eagain; break;
+	case -EDEADLK:	op = afs_flock_op_return_edeadlk; break;
+	default:	op = afs_flock_op_return_error; break;
+	}
+	trace_afs_flock_op(vnode, fl, op);
+	return ret;
 }
 
 /*
@@ -661,8 +814,10 @@
 int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
+	enum afs_flock_operation op;
+	int ret;
 
-	_enter("{%x:%u},%d,{t=%x,fl=%x}",
+	_enter("{%llx:%llu},%d,{t=%x,fl=%x}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
 	       fl->fl_type, fl->fl_flags);
 
@@ -676,10 +831,23 @@
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 
+	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+	trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
+
 	/* we're simulating flock() locks using posix locks on the server */
 	if (fl->fl_type == F_UNLCK)
-		return afs_do_unlk(file, fl);
-	return afs_do_setlk(file, fl);
+		ret = afs_do_unlk(file, fl);
+	else
+		ret = afs_do_setlk(file, fl);
+
+	switch (ret) {
+	case 0:		op = afs_flock_op_return_ok; break;
+	case -EAGAIN:	op = afs_flock_op_return_eagain; break;
+	case -EDEADLK:	op = afs_flock_op_return_edeadlk; break;
+	default:	op = afs_flock_op_return_error; break;
+	}
+	trace_afs_flock_op(vnode, fl, op);
+	return ret;
 }
 
 /*
@@ -694,7 +862,10 @@
 
 	_enter("");
 
+	new->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+
 	spin_lock(&vnode->lock);
+	trace_afs_flock_op(vnode, new, afs_flock_op_copy_lock);
 	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
 	spin_unlock(&vnode->lock);
 }
@@ -710,7 +881,12 @@
 	_enter("");
 
 	spin_lock(&vnode->lock);
-	afs_dequeue_lock(vnode, fl);
+
+	trace_afs_flock_op(vnode, fl, afs_flock_op_release_lock);
+	list_del_init(&fl->fl_u.afs.link);
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode);
+
 	_debug("state %u for %p", vnode->lock_state, vnode);
 	spin_unlock(&vnode->lock);
 }
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
new file mode 100644
index 0000000..cfe62b1
--- /dev/null
+++ b/fs/afs/fs_probe.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* AFS fileserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
+static bool afs_fs_probe_done(struct afs_server *server)
+{
+	if (!atomic_dec_and_test(&server->probe_outstanding))
+		return false;
+
+	wake_up_var(&server->probe_outstanding);
+	clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags);
+	wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING);
+	return true;
+}
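
afs_fs_probe_done() uses the classic last-one-out pattern: each completed probe decrements probe_outstanding, and only the caller that takes the counter to zero clears the probing flag and wakes waiters, so the completion work runs exactly once however many probes race. A user-space analogue with C11 atomics:

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int outstanding;	/* set to nr_addrs before probing */

	/* Only the thread that drops the counter to zero reports completion. */
	static bool probe_done(void)
	{
		if (atomic_fetch_sub(&outstanding, 1) != 1)
			return false;
		/* ...clear the "probing" flag and wake any waiters here... */
		return true;
	}
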
+
+/*
+ * Process the result of probing a fileserver.  This is called after successful
+ * or failed delivery of an FS.GetCapabilities operation.
+ */
+void afs_fileserver_probe_result(struct afs_call *call)
+{
+	struct afs_addr_list *alist = call->alist;
+	struct afs_server *server = call->server;
+	unsigned int server_index = call->server_index;
+	unsigned int index = call->addr_ix;
+	unsigned int rtt = UINT_MAX;
+	bool have_result = false;
+	u64 _rtt;
+	int ret = call->error;
+
+	_enter("%pU,%u", &server->uuid, index);
+
+	spin_lock(&server->probe_lock);
+
+	switch (ret) {
+	case 0:
+		server->probe.error = 0;
+		goto responded;
+	case -ECONNABORTED:
+		if (!server->probe.responded) {
+			server->probe.abort_code = call->abort_code;
+			server->probe.error = ret;
+		}
+		goto responded;
+	case -ENOMEM:
+	case -ENONET:
+		server->probe.local_failure = true;
+		afs_io_error(call, afs_io_error_fs_probe_fail);
+		goto out;
+	case -ECONNRESET: /* Responded, but call expired. */
+	case -ERFKILL:
+	case -EADDRNOTAVAIL:
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -EHOSTDOWN:
+	case -ECONNREFUSED:
+	case -ETIMEDOUT:
+	case -ETIME:
+	default:
+		clear_bit(index, &alist->responded);
+		set_bit(index, &alist->failed);
+		if (!server->probe.responded &&
+		    (server->probe.error == 0 ||
+		     server->probe.error == -ETIMEDOUT ||
+		     server->probe.error == -ETIME))
+			server->probe.error = ret;
+		afs_io_error(call, afs_io_error_fs_probe_fail);
+		goto out;
+	}
+
+responded:
+	set_bit(index, &alist->responded);
+	clear_bit(index, &alist->failed);
+
+	if (call->service_id == YFS_FS_SERVICE) {
+		server->probe.is_yfs = true;
+		set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+		alist->addrs[index].srx_service = call->service_id;
+	} else {
+		server->probe.not_yfs = true;
+		if (!server->probe.is_yfs) {
+			clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+			alist->addrs[index].srx_service = call->service_id;
+		}
+	}
+
+	/* Get the RTT and scale it to fit into a 32-bit value that represents
+	 * over a minute of time so that we can access it with one instruction
+	 * on a 32-bit system.
+	 */
+	_rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+	_rtt /= 64;
+	rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
+	if (rtt < server->probe.rtt) {
+		server->probe.rtt = rtt;
+		alist->preferred = index;
+		have_result = true;
+	}
+
+	smp_wmb(); /* Set rtt before responded. */
+	server->probe.responded = true;
+	set_bit(AFS_SERVER_FL_PROBED, &server->flags);
+out:
+	spin_unlock(&server->probe_lock);
+
+	_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+	       server_index, index, &alist->addrs[index].transport,
+	       (unsigned int)rtt, ret);
+
+	have_result |= afs_fs_probe_done(server);
+	if (have_result) {
+		server->probe.have_result = true;
+		wake_up_var(&server->probe.have_result);
+		wake_up_all(&server->probe_wq);
+	}
+}
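
The scaling above divides the nanosecond RTT by 64 so it fits a 32-bit field: UINT_MAX * 64 ns is roughly 274 seconds, the "over a minute of time" the comment mentions. A quick check of that bound:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Largest RTT representable once nanoseconds are divided by 64. */
		uint64_t max_ns = (uint64_t)UINT32_MAX * 64;

		printf("%.1f seconds\n", max_ns / 1e9);	/* ~274.9 seconds */
		return 0;
	}
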
+
+/*
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
+ */
+static int afs_do_probe_fileserver(struct afs_net *net,
+				   struct afs_server *server,
+				   struct key *key,
+				   unsigned int server_index,
+				   struct afs_error *_e)
+{
+	struct afs_addr_cursor ac = {
+		.index = 0,
+	};
+	struct afs_call *call;
+	bool in_progress = false;
+
+	_enter("%pU", &server->uuid);
+
+	read_lock(&server->fs_lock);
+	ac.alist = rcu_dereference_protected(server->addresses,
+					     lockdep_is_held(&server->fs_lock));
+	read_unlock(&server->fs_lock);
+
+	atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+	memset(&server->probe, 0, sizeof(server->probe));
+	server->probe.rtt = UINT_MAX;
+
+	for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+		call = afs_fs_get_capabilities(net, server, &ac, key, server_index);
+		if (!IS_ERR(call)) {
+			afs_put_call(call);
+			in_progress = true;
+		} else {
+			afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+		}
+	}
+
+	if (!in_progress)
+		afs_fs_probe_done(server);
+	return in_progress;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ */
+int afs_probe_fileservers(struct afs_net *net, struct key *key,
+			  struct afs_server_list *list)
+{
+	struct afs_server *server;
+	struct afs_error e;
+	bool in_progress = false;
+	int i;
+
+	e.error = 0;
+	e.responded = false;
+	for (i = 0; i < list->nr_servers; i++) {
+		server = list->servers[i].server;
+		if (test_bit(AFS_SERVER_FL_PROBED, &server->flags))
+			continue;
+
+		if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) &&
+		    afs_do_probe_fileserver(net, server, key, i, &e))
+			in_progress = true;
+	}
+
+	return in_progress ? 0 : e.error;
+}
+
+/*
+ * Wait for the first as-yet untried fileserver to respond.
+ */
+int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+{
+	struct wait_queue_entry *waits;
+	struct afs_server *server;
+	unsigned int rtt = UINT_MAX;
+	bool have_responders = false;
+	int pref = -1, i;
+
+	_enter("%u,%lx", slist->nr_servers, untried);
+
+	/* Only wait for servers that have a probe outstanding. */
+	for (i = 0; i < slist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = slist->servers[i].server;
+			if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+				__clear_bit(i, &untried);
+			if (server->probe.responded)
+				have_responders = true;
+		}
+	}
+	if (have_responders || !untried)
+		return 0;
+
+	waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
+	if (!waits)
+		return -ENOMEM;
+
+	for (i = 0; i < slist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = slist->servers[i].server;
+			init_waitqueue_entry(&waits[i], current);
+			add_wait_queue(&server->probe_wq, &waits[i]);
+		}
+	}
+
+	for (;;) {
+		bool still_probing = false;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		for (i = 0; i < slist->nr_servers; i++) {
+			if (test_bit(i, &untried)) {
+				server = slist->servers[i].server;
+				if (server->probe.responded)
+					goto stop;
+				if (test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+					still_probing = true;
+			}
+		}
+
+		if (!still_probing || signal_pending(current))
+			goto stop;
+		schedule();
+	}
+
+stop:
+	set_current_state(TASK_RUNNING);
+
+	for (i = 0; i < slist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = slist->servers[i].server;
+			if (server->probe.responded &&
+			    server->probe.rtt < rtt) {
+				pref = i;
+				rtt = server->probe.rtt;
+			}
+
+			remove_wait_queue(&server->probe_wq, &waits[i]);
+		}
+	}
+
+	kfree(waits);
+
+	if (pref == -1 && signal_pending(current))
+		return -ERESTARTSYS;
+
+	if (pref >= 0)
+		slist->preferred = pref;
+	return 0;
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 50929cb..6f84231 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS File Server client stubs
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/init.h>
@@ -17,14 +13,7 @@
 #include "internal.h"
 #include "afs_fs.h"
 #include "xdr_fs.h"
-
-static const struct afs_fid afs_zero_fid;
-
-/*
- * We need somewhere to discard into in case the server helpfully returns more
- * than we asked for in FS.FetchData{,64}.
- */
-static u8 afs_discard_buffer[64];
+#include "protocol_yfs.h"
 
 static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
 {
@@ -65,79 +54,17 @@
 }
 
 /*
- * Update the core inode struct from a returned status record.
- */
-void afs_update_inode_from_status(struct afs_vnode *vnode,
-				  struct afs_file_status *status,
-				  const afs_dataversion_t *expected_version,
-				  u8 flags)
-{
-	struct timespec64 t;
-	umode_t mode;
-
-	t.tv_sec = status->mtime_client;
-	t.tv_nsec = 0;
-	vnode->vfs_inode.i_ctime = t;
-	vnode->vfs_inode.i_mtime = t;
-	vnode->vfs_inode.i_atime = t;
-
-	if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) {
-		vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
-		vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
-		set_nlink(&vnode->vfs_inode, status->nlink);
-
-		mode = vnode->vfs_inode.i_mode;
-		mode &= ~S_IALLUGO;
-		mode |= status->mode;
-		barrier();
-		vnode->vfs_inode.i_mode = mode;
-	}
-
-	if (!(flags & AFS_VNODE_NOT_YET_SET)) {
-		if (expected_version &&
-		    *expected_version != status->data_version) {
-			_debug("vnode modified %llx on {%x:%u} [exp %llx]",
-			       (unsigned long long) status->data_version,
-			       vnode->fid.vid, vnode->fid.vnode,
-			       (unsigned long long) *expected_version);
-			vnode->invalid_before = status->data_version;
-			if (vnode->status.type == AFS_FTYPE_DIR) {
-				if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-					afs_stat_v(vnode, n_inval);
-			} else {
-				set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
-			}
-		} else if (vnode->status.type == AFS_FTYPE_DIR) {
-			/* Expected directory change is handled elsewhere so
-			 * that we can locally edit the directory and save on a
-			 * download.
-			 */
-			if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-				flags &= ~AFS_VNODE_DATA_CHANGED;
-		}
-	}
-
-	if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) {
-		inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
-		i_size_write(&vnode->vfs_inode, status->size);
-	}
-}
-
-/*
  * decode an AFSFetchStatus block
  */
-static int xdr_decode_AFSFetchStatus(struct afs_call *call,
-				     const __be32 **_bp,
-				     struct afs_file_status *status,
-				     struct afs_vnode *vnode,
-				     const afs_dataversion_t *expected_version,
-				     struct afs_read *read_req)
+static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
+				     struct afs_call *call,
+				     struct afs_status_cb *scb)
 {
 	const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
+	struct afs_file_status *status = &scb->status;
 	bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
 	u64 data_version, size;
 	u32 type, abort_code;
-	u8 flags = 0;
 
 	abort_code = ntohl(xdr->abort_code);
 
@@ -150,6 +77,7 @@
 			 * case.
 			 */
 			status->abort_code = abort_code;
+			scb->have_error = true;
 			return 0;
 		}
 
@@ -167,42 +95,25 @@
 	case AFS_FTYPE_FILE:
 	case AFS_FTYPE_DIR:
 	case AFS_FTYPE_SYMLINK:
-		if (type != status->type &&
-		    vnode &&
-		    !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
-			pr_warning("Vnode %x:%x:%x changed type %u to %u\n",
-				   vnode->fid.vid,
-				   vnode->fid.vnode,
-				   vnode->fid.unique,
-				   status->type, type);
-			goto bad;
-		}
 		status->type = type;
 		break;
 	default:
 		goto bad;
 	}
 
-#define EXTRACT_M(FIELD)					\
-	do {							\
-		u32 x = ntohl(xdr->FIELD);			\
-		if (status->FIELD != x) {			\
-			flags |= AFS_VNODE_META_CHANGED;	\
-			status->FIELD = x;			\
-		}						\
-	} while (0)
+	status->nlink		= ntohl(xdr->nlink);
+	status->author		= ntohl(xdr->author);
+	status->owner		= ntohl(xdr->owner);
+	status->caller_access	= ntohl(xdr->caller_access); /* Ticket dependent */
+	status->anon_access	= ntohl(xdr->anon_access);
+	status->mode		= ntohl(xdr->mode) & S_IALLUGO;
+	status->group		= ntohl(xdr->group);
+	status->lock_count	= ntohl(xdr->lock_count);
 
-	EXTRACT_M(nlink);
-	EXTRACT_M(author);
-	EXTRACT_M(owner);
-	EXTRACT_M(caller_access); /* call ticket dependent */
-	EXTRACT_M(anon_access);
-	EXTRACT_M(mode);
-	EXTRACT_M(group);
-
-	status->mtime_client = ntohl(xdr->mtime_client);
-	status->mtime_server = ntohl(xdr->mtime_server);
-	status->lock_count   = ntohl(xdr->lock_count);
+	status->mtime_client.tv_sec = ntohl(xdr->mtime_client);
+	status->mtime_client.tv_nsec = 0;
+	status->mtime_server.tv_sec = ntohl(xdr->mtime_server);
+	status->mtime_server.tv_nsec = 0;
 
 	size  = (u64)ntohl(xdr->size_lo);
 	size |= (u64)ntohl(xdr->size_hi) << 32;
@@ -210,97 +121,33 @@
 
 	data_version  = (u64)ntohl(xdr->data_version_lo);
 	data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
-	if (data_version != status->data_version) {
-		status->data_version = data_version;
-		flags |= AFS_VNODE_DATA_CHANGED;
-	}
-
-	if (read_req) {
-		read_req->data_version = data_version;
-		read_req->file_size = size;
-	}
+	status->data_version = data_version;
+	scb->have_status = true;
 
 	*_bp = (const void *)*_bp + sizeof(*xdr);
-
-	if (vnode) {
-		if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
-			flags |= AFS_VNODE_NOT_YET_SET;
-		afs_update_inode_from_status(vnode, status, expected_version,
-					     flags);
-	}
-
 	return 0;
 
 bad:
 	xdr_dump_bad(*_bp);
-	return afs_protocol_error(call, -EBADMSG);
+	return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
 }
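
The size and data_version fields above arrive as a pair of 32-bit XDR words that the decoder splices back into a u64. A minimal userspace sketch of that splice; xdr_to_u64() is an illustrative name, not a kernel helper:

	#include <arpa/inet.h>	/* ntohl()/htonl() */
	#include <stdint.h>
	#include <stdio.h>

	/* Combine an XDR hi/lo pair into a host-order u64, as done for
	 * size and data_version in xdr_decode_AFSFetchStatus() above.
	 */
	static uint64_t xdr_to_u64(uint32_t be_hi, uint32_t be_lo)
	{
		return ((uint64_t)ntohl(be_hi) << 32) | ntohl(be_lo);
	}

	int main(void)
	{
		/* hi=1, lo=2 on the wire -> 0x100000002 */
		printf("%#llx\n",
		       (unsigned long long)xdr_to_u64(htonl(1), htonl(2)));
		return 0;
	}
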
 
-/*
- * Decode the file status.  We need to lock the target vnode if we're going to
- * update its status so that stat() sees the attributes update atomically.
- */
-static int afs_decode_status(struct afs_call *call,
-			     const __be32 **_bp,
-			     struct afs_file_status *status,
-			     struct afs_vnode *vnode,
-			     const afs_dataversion_t *expected_version,
-			     struct afs_read *read_req)
+static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
 {
-	int ret;
-
-	if (!vnode)
-		return xdr_decode_AFSFetchStatus(call, _bp, status, vnode,
-						 expected_version, read_req);
-
-	write_seqlock(&vnode->cb_lock);
-	ret = xdr_decode_AFSFetchStatus(call, _bp, status, vnode,
-					expected_version, read_req);
-	write_sequnlock(&vnode->cb_lock);
-	return ret;
+	return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
 }
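
xdr_decode_expiry() turns the server's relative expiry (seconds) into an absolute deadline anchored at the time the reply was issued, so slow delivery or processing doesn't quietly stretch the callback promise. A userspace sketch of the same arithmetic, assuming only a nanosecond reply timestamp:

	#include <stdint.h>
	#include <stdio.h>

	#define NSEC_PER_SEC 1000000000LL

	/* Hypothetical stand-in for xdr_decode_expiry(): anchor the
	 * relative expiry at the reply timestamp, not at "now".
	 */
	static int64_t expiry_to_abs(int64_t reply_time_ns, uint32_t expiry_sec)
	{
		return reply_time_ns / NSEC_PER_SEC + expiry_sec;
	}

	int main(void)
	{
		/* reply sent at t=1000s, 600s lease -> expires at t=1600s */
		printf("%lld\n",
		       (long long)expiry_to_abs(1000 * NSEC_PER_SEC, 600));
		return 0;
	}
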
 
-/*
- * decode an AFSCallBack block
- */
-static void xdr_decode_AFSCallBack(struct afs_call *call,
-				   struct afs_vnode *vnode,
-				   const __be32 **_bp)
+static void xdr_decode_AFSCallBack(const __be32 **_bp,
+				   struct afs_call *call,
+				   struct afs_status_cb *scb)
 {
-	struct afs_cb_interest *old, *cbi = call->cbi;
-	const __be32 *bp = *_bp;
-	u32 cb_expiry;
-
-	write_seqlock(&vnode->cb_lock);
-
-	if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
-		vnode->cb_version	= ntohl(*bp++);
-		cb_expiry		= ntohl(*bp++);
-		vnode->cb_type		= ntohl(*bp++);
-		vnode->cb_expires_at	= cb_expiry + ktime_get_real_seconds();
-		old = vnode->cb_interest;
-		if (old != call->cbi) {
-			vnode->cb_interest = cbi;
-			cbi = old;
-		}
-		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-	} else {
-		bp += 3;
-	}
-
-	write_sequnlock(&vnode->cb_lock);
-	call->cbi = cbi;
-	*_bp = bp;
-}
-
-static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
-				       struct afs_callback *cb)
-{
+	struct afs_callback *cb = &scb->callback;
 	const __be32 *bp = *_bp;
 
-	cb->version	= ntohl(*bp++);
-	cb->expiry	= ntohl(*bp++);
-	cb->type	= ntohl(*bp++);
+	bp++; /* version */
+	cb->expires_at	= xdr_decode_expiry(call, ntohl(*bp++));
+	bp++; /* type */
+	scb->have_cb	= true;
 	*_bp = bp;
 }
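
These decoders share a cursor: each takes the address of the buffer pointer and leaves it aimed at the next block, which is what lets FetchStatus, CallBack and VolSync be peeled off one flat reply in sequence. A self-contained sketch of the convention (the typedef and helper name are illustrative, not the kernel's):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t be32;

	/* Cursor-style decode of an AFSCallBack-shaped triple: skip the
	 * version and type words, keep the expiry, and advance *_bp past
	 * the whole block for the next decoder.
	 */
	static uint32_t decode_callback_expiry(const be32 **_bp)
	{
		const be32 *bp = *_bp;
		uint32_t expiry;

		bp++;			/* version: skipped */
		expiry = ntohl(*bp++);
		bp++;			/* type: skipped */
		*_bp = bp;
		return expiry;
	}

	int main(void)
	{
		be32 wire[3] = { htonl(1), htonl(600), htonl(2) };
		const be32 *bp = wire;

		printf("expiry=%u, consumed=%zu words\n",
		       decode_callback_expiry(&bp), (size_t)(bp - wire));
		return 0;
	}
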
 
@@ -311,14 +158,18 @@
 				  struct afs_volsync *volsync)
 {
 	const __be32 *bp = *_bp;
+	u32 creation;
 
-	volsync->creation = ntohl(*bp++);
+	creation = ntohl(*bp++);
 	bp++; /* spare2 */
 	bp++; /* spare3 */
 	bp++; /* spare4 */
 	bp++; /* spare5 */
 	bp++; /* spare6 */
 	*_bp = bp;
+
+	if (volsync)
+		volsync->creation = creation;
 }
 
 /*
@@ -379,6 +230,8 @@
 	vs->blocks_in_use	= ntohl(*bp++);
 	vs->part_blocks_avail	= ntohl(*bp++);
 	vs->part_max_blocks	= ntohl(*bp++);
+	vs->vol_copy_date	= 0;
+	vs->vol_backup_date	= 0;
 	*_bp = bp;
 }
 
@@ -387,7 +240,6 @@
  */
 static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -395,16 +247,13 @@
 	if (ret < 0)
 		return ret;
 
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
-
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	xdr_decode_AFSCallBack(call, vnode, &bp);
-	if (call->reply[1])
-		xdr_decode_AFSVolSync(&bp, call->reply[1]);
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -423,15 +272,18 @@
 /*
  * fetch the status information for a file
  */
-int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
-			     bool new_inode)
+int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+			     struct afs_volsync *volsync)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_fetch_file_status(fc, scb, volsync);
+
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode,
@@ -442,9 +294,8 @@
 	}
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = volsync;
-	call->expected_version = new_inode ? 1 : vnode->status.data_version;
+	call->out_scb = scb;
+	call->out_volsync = volsync;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -453,10 +304,12 @@
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
 
-	call->cb_break = fc->cb_break;
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
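
The request side is equally plain XDR: one opcode word followed by the three-word fid, each converted with htonl(). A hedged sketch of that layout; struct fid, the helper name and the opcode value below are stand-ins, not the kernel's definitions:

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stddef.h>

	typedef uint32_t be32;

	struct fid { uint32_t vid, vnode, unique; };

	/* Marshal an FS.FetchStatus-style request: opcode, then fid. */
	static size_t marshal_fetch_status(be32 *bp, uint32_t opcode,
					   const struct fid *fid)
	{
		be32 *start = bp;

		*bp++ = htonl(opcode);
		*bp++ = htonl(fid->vid);
		*bp++ = htonl(fid->vnode);
		*bp++ = htonl(fid->unique);
		return bp - start;	/* 32-bit words written */
	}

	int main(void)
	{
		be32 buf[4];
		struct fid fid = { .vid = 1, .vnode = 2, .unique = 3 };

		/* 0x1234 is a placeholder, not the real FSFETCHSTATUS value */
		return marshal_fetch_status(buf, 0x1234, &fid) == 4 ? 0 : 1;
	}
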
 
 /*
@@ -464,143 +317,127 @@
  */
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
-	struct afs_read *req = call->reply[2];
+	struct afs_read *req = call->read_request;
 	const __be32 *bp;
 	unsigned int size;
-	void *buffer;
 	int ret;
 
-	_enter("{%u,%zu/%u;%llu/%llu}",
-	       call->unmarshall, call->offset, call->count,
-	       req->remain, req->actual_len);
+	_enter("{%u,%zu/%llu}",
+	       call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
 		req->actual_len = 0;
-		call->offset = 0;
-		call->unmarshall++;
-		if (call->operation_ID != FSFETCHDATA64) {
-			call->unmarshall++;
-			goto no_msw;
-		}
-
-		/* extract the upper part of the returned data length of an
-		 * FSFETCHDATA64 op (which should always be 0 using this
-		 * client) */
-	case 1:
-		_debug("extract data length (MSW)");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
-		if (ret < 0)
-			return ret;
-
-		req->actual_len = ntohl(call->tmp);
-		req->actual_len <<= 32;
-		call->offset = 0;
-		call->unmarshall++;
-
-	no_msw:
-		/* extract the returned data length */
-	case 2:
-		_debug("extract data length");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
-		if (ret < 0)
-			return ret;
-
-		req->actual_len |= ntohl(call->tmp);
-		_debug("DATA length: %llu", req->actual_len);
-
-		req->remain = req->actual_len;
-		call->offset = req->pos & (PAGE_SIZE - 1);
 		req->index = 0;
-		if (req->actual_len == 0)
+		req->offset = req->pos & (PAGE_SIZE - 1);
+		call->unmarshall++;
+		if (call->operation_ID == FSFETCHDATA64) {
+			afs_extract_to_tmp64(call);
+		} else {
+			call->tmp_u = htonl(0);
+			afs_extract_to_tmp(call);
+		}
+		/* Fall through */
+
+		/* extract the returned data length */
+	case 1:
+		_debug("extract data length");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		req->actual_len = be64_to_cpu(call->tmp64);
+		_debug("DATA length: %llu", req->actual_len);
+		req->remain = min(req->len, req->actual_len);
+		if (req->remain == 0)
 			goto no_more_data;
+
 		call->unmarshall++;
 
 	begin_page:
 		ASSERTCMP(req->index, <, req->nr_pages);
-		if (req->remain > PAGE_SIZE - call->offset)
-			size = PAGE_SIZE - call->offset;
+		if (req->remain > PAGE_SIZE - req->offset)
+			size = PAGE_SIZE - req->offset;
 		else
 			size = req->remain;
-		call->count = call->offset + size;
-		ASSERTCMP(call->count, <=, PAGE_SIZE);
-		req->remain -= size;
+		call->bvec[0].bv_len = size;
+		call->bvec[0].bv_offset = req->offset;
+		call->bvec[0].bv_page = req->pages[req->index];
+		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+		ASSERTCMP(size, <=, PAGE_SIZE);
+		/* Fall through */
 
 		/* extract the returned data */
-	case 3:
-		_debug("extract data %llu/%llu %zu/%u",
-		       req->remain, req->actual_len, call->offset, call->count);
+	case 2:
+		_debug("extract data %zu/%llu",
+		       iov_iter_count(&call->iter), req->remain);
 
-		buffer = kmap(req->pages[req->index]);
-		ret = afs_extract_data(call, buffer, call->count, true);
-		kunmap(req->pages[req->index]);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
-		if (call->offset == PAGE_SIZE) {
+		req->remain -= call->bvec[0].bv_len;
+		req->offset += call->bvec[0].bv_len;
+		ASSERTCMP(req->offset, <=, PAGE_SIZE);
+		if (req->offset == PAGE_SIZE) {
+			req->offset = 0;
 			if (req->page_done)
-				req->page_done(call, req);
+				req->page_done(req);
 			req->index++;
-			if (req->remain > 0) {
-				call->offset = 0;
-				if (req->index >= req->nr_pages) {
-					call->unmarshall = 4;
-					goto begin_discard;
-				}
+			if (req->remain > 0)
 				goto begin_page;
-			}
 		}
-		goto no_more_data;
+
+		ASSERTCMP(req->remain, ==, 0);
+		if (req->actual_len <= req->len)
+			goto no_more_data;
 
 		/* Discard any excess data the server gave us */
-	begin_discard:
-	case 4:
-		size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain);
-		call->count = size;
-		_debug("extract discard %llu/%llu %zu/%u",
-		       req->remain, req->actual_len, call->offset, call->count);
+		afs_extract_discard(call, req->actual_len - req->len);
+		call->unmarshall = 3;
+		/* Fall through */
 
-		call->offset = 0;
-		ret = afs_extract_data(call, afs_discard_buffer, call->count, true);
-		req->remain -= call->offset;
+	case 3:
+		_debug("extract discard %zu/%llu",
+		       iov_iter_count(&call->iter), req->actual_len - req->len);
+
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
-		if (req->remain > 0)
-			goto begin_discard;
 
 	no_more_data:
-		call->offset = 0;
-		call->unmarshall = 5;
+		call->unmarshall = 4;
+		afs_extract_to_buf(call, (21 + 3 + 6) * 4);
+		/* Fall through */
 
 		/* extract the metadata */
-	case 5:
-		ret = afs_extract_data(call, call->buffer,
-				       (21 + 3 + 6) * 4, false);
+	case 4:
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
 		bp = call->buffer;
-		if (afs_decode_status(call, &bp, &vnode->status, vnode,
-				      &vnode->status.data_version, req) < 0)
-			return afs_protocol_error(call, -EBADMSG);
-		xdr_decode_AFSCallBack(call, vnode, &bp);
-		if (call->reply[1])
-			xdr_decode_AFSVolSync(&bp, call->reply[1]);
+		ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+		xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+		xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
-		call->offset = 0;
+		req->data_version = call->out_scb->status.data_version;
+		req->file_size = call->out_scb->status.size;
+
 		call->unmarshall++;
 
-	case 6:
+	case 5:
 		break;
 	}
 
 	for (; req->index < req->nr_pages; req->index++) {
-		if (call->count < PAGE_SIZE)
+		if (req->offset < PAGE_SIZE)
 			zero_user_segment(req->pages[req->index],
-					  call->count, PAGE_SIZE);
+					  req->offset, PAGE_SIZE);
 		if (req->page_done)
-			req->page_done(call, req);
-		call->count = 0;
+			req->page_done(req);
+		req->offset = 0;
 	}
 
 	_leave(" = 0 [done]");
@@ -609,7 +446,7 @@
 
 static void afs_fetch_data_destructor(struct afs_call *call)
 {
-	struct afs_read *req = call->reply[2];
+	struct afs_read *req = call->read_request;
 
 	afs_put_read(req);
 	afs_flat_call_destructor(call);
@@ -635,7 +472,9 @@
 /*
  * fetch data from a very large file
  */
-static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
+static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
+			       struct afs_status_cb *scb,
+			       struct afs_read *req)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
@@ -649,10 +488,9 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = NULL; /* volsync */
-	call->reply[2] = req;
-	call->expected_version = vnode->status.data_version;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+	call->read_request = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -666,26 +504,32 @@
 	bp[7] = htonl(lower_32_bits(req->len));
 
 	refcount_inc(&req->usage);
-	call->cb_break = fc->cb_break;
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
  * fetch data from a file
  */
-int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
+int afs_fs_fetch_data(struct afs_fs_cursor *fc,
+		      struct afs_status_cb *scb,
+		      struct afs_read *req)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_fetch_data(fc, scb, req);
+
 	if (upper_32_bits(req->pos) ||
 	    upper_32_bits(req->len) ||
 	    upper_32_bits(req->pos + req->len))
-		return afs_fs_fetch_data64(fc, req);
+		return afs_fs_fetch_data64(fc, scb, req);
 
 	_enter("");
 
@@ -694,10 +538,9 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = NULL; /* volsync */
-	call->reply[2] = req;
-	call->expected_version = vnode->status.data_version;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+	call->read_request = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -709,10 +552,11 @@
 	bp[5] = htonl(lower_32_bits(req->len));
 
 	refcount_inc(&req->usage);
-	call->cb_break = fc->cb_break;
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -720,25 +564,24 @@
  */
 static int afs_deliver_fs_create_vnode(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply[1]);
-	if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 ||
-	    afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	xdr_decode_AFSFid(&bp, call->out_fid);
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -767,17 +610,25 @@
 int afs_fs_create(struct afs_fs_cursor *fc,
 		  const char *name,
 		  umode_t mode,
-		  u64 current_data_version,
+		  struct afs_status_cb *dvnode_scb,
 		  struct afs_fid *newfid,
-		  struct afs_file_status *newstatus,
-		  struct afs_callback *newcb)
+		  struct afs_status_cb *new_scb)
 {
-	struct afs_vnode *vnode = fc->vnode;
+	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
-	struct afs_net *net = afs_v2net(vnode);
+	struct afs_net *net = afs_v2net(dvnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) {
+		if (S_ISDIR(mode))
+			return yfs_fs_make_dir(fc, name, mode, dvnode_scb,
+					       newfid, new_scb);
+		else
+			return yfs_fs_create_file(fc, name, mode, dvnode_scb,
+						  newfid, new_scb);
+	}
+
 	_enter("");
 
 	namesz = strlen(name);
@@ -791,18 +642,16 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = newfid;
-	call->reply[2] = newstatus;
-	call->reply[3] = newcb;
-	call->expected_version = current_data_version + 1;
+	call->out_dir_scb = dvnode_scb;
+	call->out_fid = newfid;
+	call->out_scb = new_scb;
 
 	/* marshall the parameters */
 	bp = call->request;
 	*bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
-	*bp++ = htonl(vnode->fid.vid);
-	*bp++ = htonl(vnode->fid.vnode);
-	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
 	*bp++ = htonl(namesz);
 	memcpy(bp, name, namesz);
 	bp = (void *) bp + namesz;
@@ -811,38 +660,38 @@
 		bp = (void *) bp + padsz;
 	}
 	*bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME);
-	*bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */
+	*bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */
 	*bp++ = 0; /* owner */
 	*bp++ = 0; /* group */
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
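
Names go on the wire as XDR strings: a length word, the raw bytes, then zero padding out to a 4-byte boundary (the padsz handling above). A runnable sketch of that layout; marshal_name() is an illustrative helper, not part of the AFS code:

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <string.h>

	typedef uint32_t be32;

	/* XDR string: length word + bytes + zero pad to a 4-byte boundary.
	 * Returns the number of bytes consumed in the request buffer.
	 */
	static size_t marshal_name(be32 *bp, const char *name)
	{
		size_t namesz = strlen(name);
		size_t padsz = (4 - (namesz & 3)) & 3;

		*bp++ = htonl(namesz);
		memcpy(bp, name, namesz);
		memset((char *)bp + namesz, 0, padsz);
		return 4 + namesz + padsz;
	}

	int main(void)
	{
		be32 buf[8];

		/* "hello": 4-byte length + 5 bytes + 3 pad bytes = 12 */
		return marshal_name(buf, "hello") == 12 ? 0 : 1;
	}
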
 
 /*
- * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ * Deliver reply data to any operation that returns directory status and volume
+ * sync.
  */
-static int afs_deliver_fs_remove(struct afs_call *call)
+static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -854,29 +703,32 @@
 static const struct afs_call_type afs_RXFSRemoveFile = {
 	.name		= "FS.RemoveFile",
 	.op		= afs_FS_RemoveFile,
-	.deliver	= afs_deliver_fs_remove,
+	.deliver	= afs_deliver_fs_dir_status_and_vol,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSRemoveDir = {
 	.name		= "FS.RemoveDir",
 	.op		= afs_FS_RemoveDir,
-	.deliver	= afs_deliver_fs_remove,
+	.deliver	= afs_deliver_fs_dir_status_and_vol,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * remove a file or directory
  */
-int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
-		  u64 current_data_version)
+int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		  const char *name, bool isdir, struct afs_status_cb *dvnode_scb)
 {
-	struct afs_vnode *vnode = fc->vnode;
+	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
-	struct afs_net *net = afs_v2net(vnode);
+	struct afs_net *net = afs_v2net(dvnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_remove(fc, vnode, name, isdir, dvnode_scb);
+
 	_enter("");
 
 	namesz = strlen(name);
@@ -890,15 +742,14 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->expected_version = current_data_version + 1;
+	call->out_dir_scb = dvnode_scb;
 
 	/* marshall the parameters */
 	bp = call->request;
 	*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
-	*bp++ = htonl(vnode->fid.vid);
-	*bp++ = htonl(vnode->fid.vnode);
-	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
 	*bp++ = htonl(namesz);
 	memcpy(bp, name, namesz);
 	bp = (void *) bp + namesz;
@@ -908,8 +759,10 @@
 	}
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -917,7 +770,6 @@
  */
 static int afs_deliver_fs_link(struct afs_call *call)
 {
-	struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
@@ -929,11 +781,13 @@
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 ||
-	    afs_decode_status(call, &bp, &dvnode->status, dvnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -953,7 +807,9 @@
  * make a hard link
  */
 int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-		const char *name, u64 current_data_version)
+		const char *name,
+		struct afs_status_cb *dvnode_scb,
+		struct afs_status_cb *vnode_scb)
 {
 	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
@@ -961,6 +817,9 @@
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_link(fc, vnode, name, dvnode_scb, vnode_scb);
+
 	_enter("");
 
 	namesz = strlen(name);
@@ -972,9 +831,8 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = dvnode;
-	call->reply[1] = vnode;
-	call->expected_version = current_data_version + 1;
+	call->out_dir_scb = dvnode_scb;
+	call->out_scb = vnode_scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -994,8 +852,10 @@
 	*bp++ = htonl(vnode->fid.unique);
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_call1(call, &vnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1003,7 +863,6 @@
  */
 static int afs_deliver_fs_symlink(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1015,12 +874,14 @@
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply[1]);
-	if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) ||
-	    afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	xdr_decode_AFSFid(&bp, call->out_fid);
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1042,16 +903,20 @@
 int afs_fs_symlink(struct afs_fs_cursor *fc,
 		   const char *name,
 		   const char *contents,
-		   u64 current_data_version,
+		   struct afs_status_cb *dvnode_scb,
 		   struct afs_fid *newfid,
-		   struct afs_file_status *newstatus)
+		   struct afs_status_cb *new_scb)
 {
-	struct afs_vnode *vnode = fc->vnode;
+	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
-	struct afs_net *net = afs_v2net(vnode);
+	struct afs_net *net = afs_v2net(dvnode);
 	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_symlink(fc, name, contents, dvnode_scb,
+				      newfid, new_scb);
+
 	_enter("");
 
 	namesz = strlen(name);
@@ -1068,17 +933,16 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = newfid;
-	call->reply[2] = newstatus;
-	call->expected_version = current_data_version + 1;
+	call->out_dir_scb = dvnode_scb;
+	call->out_fid = newfid;
+	call->out_scb = new_scb;
 
 	/* marshall the parameters */
 	bp = call->request;
 	*bp++ = htonl(FSSYMLINK);
-	*bp++ = htonl(vnode->fid.vid);
-	*bp++ = htonl(vnode->fid.vnode);
-	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
 	*bp++ = htonl(namesz);
 	memcpy(bp, name, namesz);
 	bp = (void *) bp + namesz;
@@ -1094,15 +958,17 @@
 		bp = (void *) bp + c_padsz;
 	}
 	*bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME);
-	*bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */
+	*bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */
 	*bp++ = 0; /* owner */
 	*bp++ = 0; /* group */
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1110,26 +976,24 @@
  */
 static int afs_deliver_fs_rename(struct afs_call *call)
 {
-	struct afs_vnode *orig_dvnode = call->reply[0], *new_dvnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	if (new_dvnode != orig_dvnode &&
-	    afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
-			      &call->expected_version_2, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	if (call->out_dir_scb != call->out_scb) {
+		ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+	}
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1146,14 +1010,14 @@
 };
 
 /*
- * create a symbolic link
+ * Rename/move a file or directory.
  */
 int afs_fs_rename(struct afs_fs_cursor *fc,
 		  const char *orig_name,
 		  struct afs_vnode *new_dvnode,
 		  const char *new_name,
-		  u64 current_orig_data_version,
-		  u64 current_new_data_version)
+		  struct afs_status_cb *orig_dvnode_scb,
+		  struct afs_status_cb *new_dvnode_scb)
 {
 	struct afs_vnode *orig_dvnode = fc->vnode;
 	struct afs_call *call;
@@ -1161,6 +1025,12 @@
 	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_rename(fc, orig_name,
+				     new_dvnode, new_name,
+				     orig_dvnode_scb,
+				     new_dvnode_scb);
+
 	_enter("");
 
 	o_namesz = strlen(orig_name);
@@ -1179,10 +1049,8 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = orig_dvnode;
-	call->reply[1] = new_dvnode;
-	call->expected_version = current_orig_data_version + 1;
-	call->expected_version_2 = current_new_data_version + 1;
+	call->out_dir_scb = orig_dvnode_scb;
+	call->out_scb = new_dvnode_scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1210,8 +1078,10 @@
 	}
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &orig_dvnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1219,7 +1089,6 @@
  */
 static int afs_deliver_fs_store_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1231,12 +1100,10 @@
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
-
-	afs_pages_written_back(vnode, call);
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1266,14 +1133,15 @@
 			       struct address_space *mapping,
 			       pgoff_t first, pgoff_t last,
 			       unsigned offset, unsigned to,
-			       loff_t size, loff_t pos, loff_t i_size)
+			       loff_t size, loff_t pos, loff_t i_size,
+			       struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
@@ -1284,13 +1152,12 @@
 
 	call->key = fc->key;
 	call->mapping = mapping;
-	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
 	call->last_to = to;
 	call->send_pages = true;
-	call->expected_version = vnode->status.data_version + 1;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1314,7 +1181,9 @@
 	*bp++ = htonl((u32) i_size);
 
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1322,7 +1191,8 @@
  */
 int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 		      pgoff_t first, pgoff_t last,
-		      unsigned offset, unsigned to)
+		      unsigned offset, unsigned to,
+		      struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
@@ -1330,7 +1200,10 @@
 	loff_t size, pos, i_size;
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_store_data(fc, mapping, first, last, offset, to, scb);
+
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	size = (loff_t)to - (loff_t)offset;
@@ -1349,7 +1222,7 @@
 
 	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
 		return afs_fs_store_data64(fc, mapping, first, last, offset, to,
-					   size, pos, i_size);
+					   size, pos, i_size, scb);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
 				   (4 + 6 + 3) * 4,
@@ -1359,13 +1232,12 @@
 
 	call->key = fc->key;
 	call->mapping = mapping;
-	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
 	call->last_to = to;
 	call->send_pages = true;
-	call->expected_version = vnode->status.data_version + 1;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1387,7 +1259,9 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1395,7 +1269,6 @@
  */
 static int afs_deliver_fs_store_status(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1407,10 +1280,10 @@
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	if (afs_decode_status(call, &bp, &vnode->status, vnode,
-			      &call->expected_version, NULL) < 0)
-		return afs_protocol_error(call, -EBADMSG);
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1444,14 +1317,15 @@
  * set the attributes on a very large file, using FS.StoreData rather than
  * FS.StoreStatus so as to alter the file size also
  */
-static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
+static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr,
+				 struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
@@ -1463,8 +1337,7 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->expected_version = vnode->status.data_version + 1;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1475,8 +1348,8 @@
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	*bp++ = 0;				/* position of start of write */
-	*bp++ = 0;
+	*bp++ = htonl(attr->ia_size >> 32);	/* position of start of write */
+	*bp++ = htonl((u32) attr->ia_size);
 	*bp++ = 0;				/* size of write */
 	*bp++ = 0;
 	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
@@ -1484,26 +1357,29 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
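
Note the behavioural fix above: the write position is now encoded as the truncation point rather than zero. The 64-bit value itself goes out as two network-order words, most significant first, the encode-side counterpart of the hi/lo splice sketched earlier (helper name illustrative):

	#include <arpa/inet.h>
	#include <stdint.h>

	typedef uint32_t be32;

	/* Write a u64 as two network-order words, MSW first, as done for
	 * the write position and new file length above.
	 */
	static be32 *xdr_put_u64(be32 *bp, uint64_t x)
	{
		*bp++ = htonl(x >> 32);
		*bp++ = htonl((uint32_t)x);
		return bp;
	}

	int main(void)
	{
		be32 buf[2];

		xdr_put_u64(buf, 0x100000002ULL);	/* -> 1, 2 on the wire */
		return ntohl(buf[0]) == 1 && ntohl(buf[1]) == 2 ? 0 : 1;
	}
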
 
 /*
  * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
  * so as to alter the file size also
  */
-static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
+static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr,
+			       struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 	if (attr->ia_size >> 32)
-		return afs_fs_setattr_size64(fc, attr);
+		return afs_fs_setattr_size64(fc, attr, scb);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status,
 				   (4 + 6 + 3) * 4,
@@ -1512,8 +1388,7 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->expected_version = vnode->status.data_version + 1;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1524,30 +1399,36 @@
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	*bp++ = 0;				/* position of start of write */
+	*bp++ = htonl(attr->ia_size);		/* position of start of write */
 	*bp++ = 0;				/* size of write */
 	*bp++ = htonl(attr->ia_size);		/* new file length */
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
  * set the attributes on a file, using FS.StoreData if there's a change in file
  * size, and FS.StoreStatus otherwise
  */
-int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
+int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr,
+		   struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
-	if (attr->ia_valid & ATTR_SIZE)
-		return afs_fs_setattr_size(fc, attr);
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_setattr(fc, attr, scb);
 
-	_enter(",%x,{%x:%u},,",
+	if (attr->ia_valid & ATTR_SIZE)
+		return afs_fs_setattr_size(fc, attr, scb);
+
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
@@ -1557,8 +1438,7 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->expected_version = vnode->status.data_version;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1571,7 +1451,9 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1581,164 +1463,121 @@
 {
 	const __be32 *bp;
 	char *p;
+	u32 size;
 	int ret;
 
 	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
 		call->unmarshall++;
+		afs_extract_to_buf(call, 12 * 4);
+		/* Fall through */
 
 		/* extract the returned status record */
 	case 1:
 		_debug("extract status");
-		ret = afs_extract_data(call, call->buffer,
-				       12 * 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		bp = call->buffer;
-		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
-		call->offset = 0;
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->out_volstatus);
 		call->unmarshall++;
+		afs_extract_to_tmp(call);
+		/* Fall through */
 
 		/* extract the volume name length */
 	case 2:
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("volname length: %u", call->count);
 		if (call->count >= AFSNAMEMAX)
-			return afs_protocol_error(call, -EBADMSG);
-		call->offset = 0;
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_volname_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the volume name */
 	case 3:
 		_debug("extract volname");
-		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply[2],
-					       call->count, true);
-			if (ret < 0)
-				return ret;
-		}
-
-		p = call->reply[2];
-		p[call->count] = 0;
-		_debug("volname '%s'", p);
-
-		call->offset = 0;
-		call->unmarshall++;
-
-		/* extract the volume name padding */
-		if ((call->count & 3) == 0) {
-			call->unmarshall++;
-			goto no_volname_padding;
-		}
-		call->count = 4 - (call->count & 3);
-
-	case 4:
-		ret = afs_extract_data(call, call->buffer,
-				       call->count, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
-		call->offset = 0;
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("volname '%s'", p);
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
-	no_volname_padding:
+		/* Fall through */
 
 		/* extract the offline message length */
-	case 5:
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+	case 4:
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("offline msg length: %u", call->count);
 		if (call->count >= AFSNAMEMAX)
-			return afs_protocol_error(call, -EBADMSG);
-		call->offset = 0;
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_offline_msg_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the offline message */
-	case 6:
+	case 5:
 		_debug("extract offline");
-		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply[2],
-					       call->count, true);
-			if (ret < 0)
-				return ret;
-		}
-
-		p = call->reply[2];
-		p[call->count] = 0;
-		_debug("offline '%s'", p);
-
-		call->offset = 0;
-		call->unmarshall++;
-
-		/* extract the offline message padding */
-		if ((call->count & 3) == 0) {
-			call->unmarshall++;
-			goto no_offline_padding;
-		}
-		call->count = 4 - (call->count & 3);
-
-	case 7:
-		ret = afs_extract_data(call, call->buffer,
-				       call->count, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
-		call->offset = 0;
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("offline '%s'", p);
+
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
-	no_offline_padding:
+		/* Fall through */
 
 		/* extract the message of the day length */
-	case 8:
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+	case 6:
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		call->count = ntohl(call->tmp);
 		_debug("motd length: %u", call->count);
 		if (call->count >= AFSNAMEMAX)
-			return afs_protocol_error(call, -EBADMSG);
-		call->offset = 0;
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_motd_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* extract the message of the day */
-	case 9:
+	case 7:
 		_debug("extract motd");
-		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply[2],
-					       call->count, true);
-			if (ret < 0)
-				return ret;
-		}
-
-		p = call->reply[2];
-		p[call->count] = 0;
-		_debug("motd '%s'", p);
-
-		call->offset = 0;
-		call->unmarshall++;
-
-		/* extract the message of the day padding */
-		call->count = (4 - (call->count & 3)) & 3;
-
-	case 10:
-		ret = afs_extract_data(call, call->buffer,
-				       call->count, false);
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
-		call->offset = 0;
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("motd '%s'", p);
+
 		call->unmarshall++;
-	case 11:
+
+	case 8:
 		break;
 	}
 
@@ -1747,23 +1586,13 @@
 }
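
Each string in the reply (volume name, offline message, MOTD) is preceded by a length word and padded to a multiple of four bytes, so the extractor must consume the padding as well; that is the round-up the new code applies before each afs_extract_to_buf(). The rule in isolation:

	#include <stdint.h>
	#include <stdio.h>

	/* XDR pad: round a byte count up to the next multiple of four. */
	static uint32_t xdr_padded_len(uint32_t count)
	{
		return (count + 3) & ~3;
	}

	int main(void)
	{
		uint32_t n;

		for (n = 0; n <= 5; n++)
			printf("count %u -> extract %u bytes\n",
			       n, xdr_padded_len(n));
		return 0;
	}
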
 
 /*
- * destroy an FS.GetVolumeStatus call
- */
-static void afs_get_volume_status_call_destructor(struct afs_call *call)
-{
-	kfree(call->reply[2]);
-	call->reply[2] = NULL;
-	afs_flat_call_destructor(call);
-}
-
-/*
  * FS.GetVolumeStatus operation type
  */
 static const struct afs_call_type afs_RXFSGetVolumeStatus = {
 	.name		= "FS.GetVolumeStatus",
 	.op		= afs_FS_GetVolumeStatus,
 	.deliver	= afs_deliver_fs_get_volume_status,
-	.destructor	= afs_get_volume_status_call_destructor,
+	.destructor	= afs_flat_call_destructor,
 };
 
 /*
@@ -1776,24 +1605,19 @@
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
-	void *tmpbuf;
+
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_get_volume_status(fc, vs);
 
 	_enter("");
 
-	tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
-	if (!tmpbuf)
+	call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4,
+				   max(12 * 4, AFSOPAQUEMAX + 1));
+	if (!call)
 		return -ENOMEM;
 
-	call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
-	if (!call) {
-		kfree(tmpbuf);
-		return -ENOMEM;
-	}
-
 	call->key = fc->key;
-	call->reply[0] = vnode;
-	call->reply[1] = vs;
-	call->reply[2] = tmpbuf;
+	call->out_volstatus = vs;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1802,7 +1626,9 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1821,7 +1647,7 @@
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1834,6 +1660,7 @@
 	.name		= "FS.SetLock",
 	.op		= afs_FS_SetLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
+	.done		= afs_lock_op_done,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1844,6 +1671,7 @@
 	.name		= "FS.ExtendLock",
 	.op		= afs_FS_ExtendLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
+	.done		= afs_lock_op_done,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1860,13 +1688,17 @@
 /*
  * Set a lock on a file
  */
-int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
+int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type,
+		    struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_set_lock(fc, type, scb);
+
 	_enter("");
 
 	call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
@@ -1874,7 +1706,8 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
+	call->lvnode = vnode;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1885,20 +1718,25 @@
 	*bp++ = htonl(type);
 
 	afs_use_fs_server(call, fc->cbi);
-	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	trace_afs_make_fs_calli(call, &vnode->fid, type);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
  * extend a lock on a file
  */
-int afs_fs_extend_lock(struct afs_fs_cursor *fc)
+int afs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_extend_lock(fc, scb);
+
 	_enter("");
 
 	call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
@@ -1906,7 +1744,8 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
+	call->lvnode = vnode;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1917,19 +1756,24 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
  * release a lock on a file
  */
-int afs_fs_release_lock(struct afs_fs_cursor *fc)
+int afs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
 	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
 	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_release_lock(fc, scb);
+
 	_enter("");
 
 	call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4);
@@ -1937,7 +1781,8 @@
 		return -ENOMEM;
 
 	call->key = fc->key;
-	call->reply[0] = vnode;
+	call->lvnode = vnode;
+	call->out_scb = scb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1948,7 +1793,9 @@
 
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -1993,7 +1840,8 @@
 	*bp++ = htonl(FSGIVEUPALLCALLBACKS);
 
 	/* Can't take a ref on server */
-	return afs_make_call(ac, call, GFP_NOFS, false);
+	afs_make_call(ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, ac);
 }
 
 /*
@@ -2004,19 +1852,17 @@
 	u32 count;
 	int ret;
 
-	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+	_enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
 
-again:
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the capabilities word count */
 	case 1:
-		ret = afs_extract_data(call, &call->tmp,
-				       1 * sizeof(__be32),
-				       true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
@@ -2024,24 +1870,18 @@
 
 		call->count = count;
 		call->count2 = count;
-		call->offset = 0;
+		afs_extract_discard(call, count * sizeof(__be32));
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract capabilities words */
 	case 2:
-		count = min(call->count, 16U);
-		ret = afs_extract_data(call, call->buffer,
-				       count * sizeof(__be32),
-				       call->count > 16);
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
 		/* TODO: Examine capabilities */
 
-		call->count -= count;
-		if (call->count > 0)
-			goto again;
-		call->offset = 0;
 		call->unmarshall++;
 		break;
 	}
@@ -2057,6 +1897,7 @@
 	.name		= "FS.GetCapabilities",
 	.op		= afs_FS_GetCapabilities,
 	.deliver	= afs_deliver_fs_get_capabilities,
+	.done		= afs_fileserver_probe_result,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -2064,10 +1905,11 @@
  * Probe a fileserver for the capabilities that it supports.  This can
  * return up to 196 words.
  */
-int afs_fs_get_capabilities(struct afs_net *net,
-			    struct afs_server *server,
-			    struct afs_addr_cursor *ac,
-			    struct key *key)
+struct afs_call *afs_fs_get_capabilities(struct afs_net *net,
+					 struct afs_server *server,
+					 struct afs_addr_cursor *ac,
+					 struct key *key,
+					 unsigned int server_index)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -2076,9 +1918,14 @@
 
 	call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4);
 	if (!call)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	call->key = key;
+	call->server = afs_get_server(server, afs_server_trace_get_caps);
+	call->server_index = server_index;
+	call->upgrade = true;
+	call->async = true;
+	call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -2086,7 +1933,8 @@
 
 	trace_afs_make_fs_call(call, NULL);
-	return afs_make_call(ac, call, GFP_NOFS, false);
+	afs_make_call(ac, call, GFP_NOFS);
+	return call;
 }
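
The probe call now returns the struct afs_call itself, reporting allocation failure through the kernel's ERR_PTR convention (<linux/err.h>): a small negative errno encoded in the pointer, so one return value carries either the object or the error. Re-implemented below for userspace illustration only:

	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO 4095

	/* Userspace re-creation of the <linux/err.h> idiom. */
	static inline void *ERR_PTR(long error)
	{
		return (void *)error;
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;
	}

	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	int main(void)
	{
		void *call = ERR_PTR(-ENOMEM);

		if (IS_ERR(call))
			printf("allocation failed: %ld\n", PTR_ERR(call));
		return 0;
	}
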
 
 /*
@@ -2094,10 +1942,6 @@
  */
 static int afs_deliver_fs_fetch_status(struct afs_call *call)
 {
-	struct afs_file_status *status = call->reply[1];
-	struct afs_callback *callback = call->reply[2];
-	struct afs_volsync *volsync = call->reply[3];
-	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -2105,21 +1949,13 @@
 	if (ret < 0)
 		return ret;
 
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
-
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	afs_decode_status(call, &bp, status, vnode,
-			  &call->expected_version, NULL);
-	callback[call->count].version	= ntohl(bp[0]);
-	callback[call->count].expiry	= ntohl(bp[1]);
-	callback[call->count].type	= ntohl(bp[2]);
-	if (vnode)
-		xdr_decode_AFSCallBack(call, vnode, &bp);
-	else
-		bp += 3;
-	if (volsync)
-		xdr_decode_AFSVolSync(&bp, volsync);
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -2141,14 +1977,16 @@
 int afs_fs_fetch_status(struct afs_fs_cursor *fc,
 			struct afs_net *net,
 			struct afs_fid *fid,
-			struct afs_file_status *status,
-			struct afs_callback *callback,
+			struct afs_status_cb *scb,
 			struct afs_volsync *volsync)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
-	_enter(",%x,{%x:%u},,",
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_fetch_status(fc, net, fid, scb, volsync);
+
+	_enter(",%x,{%llx:%llu},,",
 	       key_serial(fc->key), fid->vid, fid->vnode);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -2158,11 +1996,9 @@
 	}
 
 	call->key = fc->key;
-	call->reply[0] = NULL; /* vnode for fid[0] */
-	call->reply[1] = status;
-	call->reply[2] = callback;
-	call->reply[3] = volsync;
-	call->expected_version = 1; /* vnode->status.data_version */
+	call->out_fid = fid;
+	call->out_scb = scb;
+	call->out_volsync = volsync;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -2171,10 +2007,11 @@
 	bp[2] = htonl(fid->vnode);
 	bp[3] = htonl(fid->unique);
 
-	call->cb_break = fc->cb_break;
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, fid);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
@@ -2182,9 +2019,7 @@
  */
 static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 {
-	struct afs_file_status *statuses;
-	struct afs_callback *callbacks;
-	struct afs_vnode *vnode = call->reply[0];
+	struct afs_status_cb *scb;
 	const __be32 *bp;
 	u32 tmp;
 	int ret;
@@ -2193,38 +2028,40 @@
 
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
+		/* Fall through */
 
 		/* Extract the file status count and array in two steps */
 	case 1:
 		_debug("extract status count");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		tmp = ntohl(call->tmp);
 		_debug("status count: %u/%u", tmp, call->count2);
 		if (tmp != call->count2)
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_ibulkst_count);
 
 		call->count = 0;
 		call->unmarshall++;
 	more_counts:
-		call->offset = 0;
+		afs_extract_to_buf(call, 21 * sizeof(__be32));
+		/* Fall through */
 
 	case 2:
 		_debug("extract status array %u", call->count);
-		ret = afs_extract_data(call, call->buffer, 21 * 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		bp = call->buffer;
-		statuses = call->reply[1];
-		if (afs_decode_status(call, &bp, &statuses[call->count],
-				      call->count == 0 ? vnode : NULL,
-				      NULL, NULL) < 0)
-			return afs_protocol_error(call, -EBADMSG);
+		scb = &call->out_scb[call->count];
+		ret = xdr_decode_AFSFetchStatus(&bp, call, scb);
+		if (ret < 0)
+			return ret;
 
 		call->count++;
 		if (call->count < call->count2)
@@ -2232,56 +2069,53 @@
 
 		call->count = 0;
 		call->unmarshall++;
-		call->offset = 0;
+		afs_extract_to_tmp(call);
+		/* Fall through */
 
 		/* Extract the callback count and array in two steps */
 	case 3:
 		_debug("extract CB count");
-		ret = afs_extract_data(call, &call->tmp, 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		tmp = ntohl(call->tmp);
 		_debug("CB count: %u", tmp);
 		if (tmp != call->count2)
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_ibulkst_cb_count);
 		call->count = 0;
 		call->unmarshall++;
 	more_cbs:
-		call->offset = 0;
+		afs_extract_to_buf(call, 3 * sizeof(__be32));
+		/* Fall through */
 
 	case 4:
 		_debug("extract CB array");
-		ret = afs_extract_data(call, call->buffer, 3 * 4, true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		_debug("unmarshall CB array");
 		bp = call->buffer;
-		callbacks = call->reply[2];
-		callbacks[call->count].version	= ntohl(bp[0]);
-		callbacks[call->count].expiry	= ntohl(bp[1]);
-		callbacks[call->count].type	= ntohl(bp[2]);
-		statuses = call->reply[1];
-		if (call->count == 0 && vnode && statuses[0].abort_code == 0)
-			xdr_decode_AFSCallBack(call, vnode, &bp);
+		scb = &call->out_scb[call->count];
+		xdr_decode_AFSCallBack(&bp, call, scb);
 		call->count++;
 		if (call->count < call->count2)
 			goto more_cbs;
 
-		call->offset = 0;
+		afs_extract_to_buf(call, 6 * sizeof(__be32));
 		call->unmarshall++;
+		/* Fall through */
 
 	case 5:
-		ret = afs_extract_data(call, call->buffer, 6 * 4, false);
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
 		bp = call->buffer;
-		if (call->reply[3])
-			xdr_decode_AFSVolSync(&bp, call->reply[3]);
+		xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
-		call->offset = 0;
 		call->unmarshall++;
 
 	case 6:
@@ -2308,8 +2142,7 @@
 int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
 			      struct afs_net *net,
 			      struct afs_fid *fids,
-			      struct afs_file_status *statuses,
-			      struct afs_callback *callbacks,
+			      struct afs_status_cb *statuses,
 			      unsigned int nr_fids,
 			      struct afs_volsync *volsync)
 {
@@ -2317,7 +2150,11 @@
 	__be32 *bp;
 	int i;
 
-	_enter(",%x,{%x:%u},%u",
+	if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+		return yfs_fs_inline_bulk_status(fc, net, fids, statuses,
+						 nr_fids, volsync);
+
+	_enter(",%x,{%llx:%llu},%u",
 	       key_serial(fc->key), fids[0].vid, fids[0].vnode, nr_fids);
 
 	call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus,
@@ -2329,10 +2166,8 @@
 	}
 
 	call->key = fc->key;
-	call->reply[0] = NULL; /* vnode for fid[0] */
-	call->reply[1] = statuses;
-	call->reply[2] = callbacks;
-	call->reply[3] = volsync;
+	call->out_scb = statuses;
+	call->out_volsync = volsync;
 	call->count2 = nr_fids;
 
 	/* marshall the parameters */
@@ -2345,8 +2180,207 @@
 		*bp++ = htonl(fids[i].unique);
 	}
 
-	call->cb_break = fc->cb_break;
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &fids[0]);
-	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * deliver reply data to an FS.FetchACL
+ */
+static int afs_deliver_fs_fetch_acl(struct afs_call *call)
+{
+	struct afs_acl *acl;
+	const __be32 *bp;
+	unsigned int size;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	switch (call->unmarshall) {
+	case 0:
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the returned data length */
+	case 1:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		size = call->count2 = ntohl(call->tmp);
+		size = round_up(size, 4);
+
+		acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL);
+		if (!acl)
+			return -ENOMEM;
+		call->ret_acl = acl;
+		acl->size = call->count2;
+		afs_extract_begin(call, acl->data, size);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the returned data */
+	case 2:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		afs_extract_to_buf(call, (21 + 6) * 4);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the metadata */
+	case 3:
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+		xdr_decode_AFSVolSync(&bp, call->out_volsync);
+
+		call->unmarshall++;
+
+	case 4:
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
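
The five delivery states above consume the FS.FetchACL reply in wire order; a
sketch of the layout, with sizes inferred from the extraction calls (this
listing is illustrative only, not a real kernel definition):

	/* FS.FetchACL reply, as unmarshalled by afs_deliver_fs_fetch_acl():
	 *	__be32	length;			- case 1: ACL data length
	 *	u8	data[length];		- case 2: opaque body, padded
	 *					  to a 4-byte boundary
	 *	__be32	fetch_status[21];	- case 3: AFSFetchStatus
	 *	__be32	volsync[6];		- case 3: AFSVolSync
	 */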
+
+static void afs_destroy_fs_fetch_acl(struct afs_call *call)
+{
+	kfree(call->ret_acl);
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * FS.FetchACL operation type
+ */
+static const struct afs_call_type afs_RXFSFetchACL = {
+	.name		= "FS.FetchACL",
+	.op		= afs_FS_FetchACL,
+	.deliver	= afs_deliver_fs_fetch_acl,
+	.destructor	= afs_destroy_fs_fetch_acl,
+};
+
+/*
+ * Fetch the ACL for a file.
+ */
+struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc,
+				 struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchACL, 16, (21 + 6) * 4);
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	call->key = fc->key;
+	call->ret_acl = NULL;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHACL);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_make_call(&fc->ac, call, GFP_KERNEL);
+	return (struct afs_acl *)afs_wait_for_call_to_complete(call, &fc->ac);
+}
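
For illustration, a minimal sketch of a caller driving afs_fs_fetch_acl()
through the fileserver rotation machinery, modelled on the afs_fetch_status()
loop later in this patch; the wrapper name afs_acl_get() and its exact error
handling are assumptions, not part of the patch:

	static struct afs_acl *afs_acl_get(struct afs_vnode *vnode,
					   struct key *key)
	{
		struct afs_status_cb *scb;
		struct afs_fs_cursor fc;
		struct afs_acl *acl = NULL;
		int ret = -ERESTARTSYS;

		scb = kzalloc(sizeof(*scb), GFP_KERNEL);
		if (!scb)
			return ERR_PTR(-ENOMEM);

		if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
			afs_dataversion_t data_version = vnode->status.data_version;

			while (afs_select_fileserver(&fc)) {
				fc.cb_break = afs_calc_vnode_cb_break(vnode);
				acl = afs_fs_fetch_acl(&fc, scb);
			}

			afs_check_for_remote_deletion(&fc, vnode);
			afs_vnode_commit_status(&fc, vnode, fc.cb_break,
						&data_version, scb);
			ret = afs_end_vnode_operation(&fc);
		}

		kfree(scb);
		return ret == 0 ? acl : ERR_PTR(ret);
	}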
+
+/*
+ * Deliver reply data to any operation that returns file status and volume
+ * sync.
+ */
+static int afs_deliver_fs_file_status_and_vol(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_AFSVolSync(&bp, call->out_volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.StoreACL operation type
+ */
+static const struct afs_call_type afs_RXFSStoreACL = {
+	.name		= "FS.StoreACL",
+	.op		= afs_FS_StoreACL,
+	.deliver	= afs_deliver_fs_file_status_and_vol,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Store the ACL for a file.
+ */
+int afs_fs_store_acl(struct afs_fs_cursor *fc, const struct afs_acl *acl,
+		     struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	size_t size;
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	size = round_up(acl->size, 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreACL,
+				   5 * 4 + size, (21 + 6) * 4);
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSSTOREACL);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(acl->size);
+	memcpy(&bp[5], acl->data, acl->size);
+	if (acl->size != size)
+		memset((void *)&bp[5] + acl->size, 0, size - acl->size);
+
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_make_call(&fc->ac, call, GFP_KERNEL);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
 }
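
The marshalling above keeps the RX payload 32-bit aligned: bp[4] carries the
true ACL length, the opaque body is rounded up to a multiple of 4 bytes, and
the pad bytes are zeroed so uninitialised heap memory never reaches the wire.
A worked example with assumed values:

	size_t acl_size = 10;			/* caller-supplied ACL bytes */
	size_t wire = round_up(acl_size, 4);	/* 12 bytes in the request */
	/* memcpy() copies bytes 0-9; memset() zeroes bytes 10-11 */
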
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 071075d..46d2d7c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -23,47 +23,96 @@
 #include <linux/namei.h>
 #include <linux/iversion.h>
 #include "internal.h"
+#include "afs_fs.h"
 
 static const struct inode_operations afs_symlink_inode_operations = {
 	.get_link	= page_get_link,
 	.listxattr	= afs_listxattr,
 };
 
+static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
+{
+	static unsigned long once_only;
+
+	pr_warn("kAFS: AFS vnode with undefined type %u\n",
+		vnode->status.type);
+	pr_warn("kAFS: A=%d m=%o s=%llx v=%llx\n",
+		vnode->status.abort_code,
+		vnode->status.mode,
+		vnode->status.size,
+		vnode->status.data_version);
+	pr_warn("kAFS: vnode %llx:%llx:%x\n",
+		vnode->fid.vid,
+		vnode->fid.vnode,
+		vnode->fid.unique);
+	if (parent_vnode)
+		pr_warn("kAFS: dir %llx:%llx:%x\n",
+			parent_vnode->fid.vid,
+			parent_vnode->fid.vnode,
+			parent_vnode->fid.unique);
+
+	if (!test_and_set_bit(0, &once_only))
+		dump_stack();
+}
+
+/*
+ * Set the file size and block count.  Estimate the number of 512-byte blocks
+ * used, rounded up to the nearest 1K for consistency with other AFS clients.
+ */
+static void afs_set_i_size(struct afs_vnode *vnode, u64 size)
+{
+	i_size_write(&vnode->vfs_inode, size);
+	vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+}
+
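A worked example of the i_blocks formula above (st_blocks counts 512-byte
units, but kAFS reports them in whole-1K steps, matching other AFS clients):

	/* ((size + 1023) >> 10) << 1 == DIV_ROUND_UP(size, 1024) * 2 */
	/* size =    1  ->  i_blocks = 2   (1K used = two 512-byte blocks) */
	/* size = 1024  ->  i_blocks = 2                                   */
	/* size = 1025  ->  i_blocks = 4   (rounds up to the next 1K)      */
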
 /*
  * Initialise an inode from the vnode status.
  */
-static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key)
+static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
+				      struct afs_cb_interest *cbi,
+				      struct afs_vnode *parent_vnode,
+				      struct afs_status_cb *scb)
 {
+	struct afs_cb_interest *old_cbi = NULL;
+	struct afs_file_status *status = &scb->status;
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
+	struct timespec64 t;
 
 	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
-	       vnode->status.type,
-	       vnode->status.nlink,
-	       (unsigned long long) vnode->status.size,
-	       vnode->status.data_version,
-	       vnode->status.mode);
+	       status->type,
+	       status->nlink,
+	       (unsigned long long) status->size,
+	       status->data_version,
+	       status->mode);
 
-	read_seqlock_excl(&vnode->cb_lock);
+	write_seqlock(&vnode->cb_lock);
 
-	afs_update_inode_from_status(vnode, &vnode->status, NULL,
-				     AFS_VNODE_NOT_YET_SET);
+	vnode->status = *status;
 
-	switch (vnode->status.type) {
+	t = status->mtime_client;
+	inode->i_ctime = t;
+	inode->i_mtime = t;
+	inode->i_atime = t;
+	inode->i_uid = make_kuid(&init_user_ns, status->owner);
+	inode->i_gid = make_kgid(&init_user_ns, status->group);
+	set_nlink(&vnode->vfs_inode, status->nlink);
+
+	switch (status->type) {
 	case AFS_FTYPE_FILE:
-		inode->i_mode	= S_IFREG | vnode->status.mode;
+		inode->i_mode	= S_IFREG | status->mode;
 		inode->i_op	= &afs_file_inode_operations;
 		inode->i_fop	= &afs_file_operations;
 		inode->i_mapping->a_ops	= &afs_fs_aops;
 		break;
 	case AFS_FTYPE_DIR:
-		inode->i_mode	= S_IFDIR | vnode->status.mode;
+		inode->i_mode	= S_IFDIR | status->mode;
 		inode->i_op	= &afs_dir_inode_operations;
 		inode->i_fop	= &afs_dir_file_operations;
 		inode->i_mapping->a_ops	= &afs_dir_aops;
 		break;
 	case AFS_FTYPE_SYMLINK:
 		/* Symlinks with a mode of 0644 are actually mountpoints. */
-		if ((vnode->status.mode & 0777) == 0644) {
+		if ((status->mode & 0777) == 0644) {
 			inode->i_flags |= S_AUTOMOUNT;
 
 			set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
@@ -73,50 +122,229 @@
 			inode->i_fop	= &afs_mntpt_file_operations;
 			inode->i_mapping->a_ops	= &afs_fs_aops;
 		} else {
-			inode->i_mode	= S_IFLNK | vnode->status.mode;
+			inode->i_mode	= S_IFLNK | status->mode;
 			inode->i_op	= &afs_symlink_inode_operations;
 			inode->i_mapping->a_ops	= &afs_fs_aops;
 		}
 		inode_nohighmem(inode);
 		break;
 	default:
-		printk("kAFS: AFS vnode with undefined type\n");
-		read_sequnlock_excl(&vnode->cb_lock);
-		return afs_protocol_error(NULL, -EBADMSG);
+		dump_vnode(vnode, parent_vnode);
+		write_sequnlock(&vnode->cb_lock);
+		return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type);
 	}
 
-	inode->i_blocks		= 0;
-	vnode->invalid_before	= vnode->status.data_version;
+	afs_set_i_size(vnode, status->size);
 
-	read_sequnlock_excl(&vnode->cb_lock);
+	vnode->invalid_before	= status->data_version;
+	inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+
+	if (!scb->have_cb) {
+		/* it's a symlink we just created (the fileserver
+		 * didn't give us a callback) */
+		vnode->cb_expires_at = ktime_get_real_seconds();
+	} else {
+		vnode->cb_expires_at = scb->callback.expires_at;
+		old_cbi = rcu_dereference_protected(vnode->cb_interest,
+						    lockdep_is_held(&vnode->cb_lock.lock));
+		if (cbi != old_cbi)
+			rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(cbi));
+		else
+			old_cbi = NULL;
+		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	}
+
+	write_sequnlock(&vnode->cb_lock);
+	afs_put_cb_interest(afs_v2net(vnode), old_cbi);
 	return 0;
 }
 
 /*
+ * Update the core inode struct from a returned status record.
+ */
+static void afs_apply_status(struct afs_fs_cursor *fc,
+			     struct afs_vnode *vnode,
+			     struct afs_status_cb *scb,
+			     const afs_dataversion_t *expected_version)
+{
+	struct afs_file_status *status = &scb->status;
+	struct timespec64 t;
+	umode_t mode;
+	bool data_changed = false;
+
+	BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags));
+
+	if (status->type != vnode->status.type) {
+		pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
+			   vnode->fid.vid,
+			   vnode->fid.vnode,
+			   vnode->fid.unique,
+			   vnode->status.type, status->type);
+		afs_protocol_error(NULL, -EBADMSG, afs_eproto_bad_status);
+		return;
+	}
+
+	if (status->nlink != vnode->status.nlink)
+		set_nlink(&vnode->vfs_inode, status->nlink);
+
+	if (status->owner != vnode->status.owner)
+		vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+
+	if (status->group != vnode->status.group)
+		vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+
+	if (status->mode != vnode->status.mode) {
+		mode = vnode->vfs_inode.i_mode;
+		mode &= ~S_IALLUGO;
+		mode |= status->mode;
+		WRITE_ONCE(vnode->vfs_inode.i_mode, mode);
+	}
+
+	t = status->mtime_client;
+	vnode->vfs_inode.i_ctime = t;
+	vnode->vfs_inode.i_mtime = t;
+	vnode->vfs_inode.i_atime = t;
+
+	if (vnode->status.data_version != status->data_version)
+		data_changed = true;
+
+	vnode->status = *status;
+
+	if (expected_version &&
+	    *expected_version != status->data_version) {
+		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+				vnode->fid.vid, vnode->fid.vnode,
+				(unsigned long long)*expected_version,
+				(unsigned long long)status->data_version,
+				fc->type ? fc->type->name : "???");
+
+		vnode->invalid_before = status->data_version;
+		if (vnode->status.type == AFS_FTYPE_DIR) {
+			if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+				afs_stat_v(vnode, n_inval);
+		} else {
+			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+		}
+	} else if (vnode->status.type == AFS_FTYPE_DIR) {
+		/* Expected directory change is handled elsewhere so
+		 * that we can locally edit the directory and save on a
+		 * download.
+		 */
+		if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+			data_changed = false;
+	}
+
+	if (data_changed) {
+		inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+		afs_set_i_size(vnode, status->size);
+	}
+}
+
+/*
+ * Apply a callback to a vnode.
+ */
+static void afs_apply_callback(struct afs_fs_cursor *fc,
+			       struct afs_vnode *vnode,
+			       struct afs_status_cb *scb,
+			       unsigned int cb_break)
+{
+	struct afs_cb_interest *old;
+	struct afs_callback *cb = &scb->callback;
+
+	if (!afs_cb_is_broken(cb_break, vnode, fc->cbi)) {
+		vnode->cb_expires_at	= cb->expires_at;
+		old = rcu_dereference_protected(vnode->cb_interest,
+						lockdep_is_held(&vnode->cb_lock.lock));
+		if (old != fc->cbi) {
+			rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(fc->cbi));
+			afs_put_cb_interest(afs_v2net(vnode), old);
+		}
+		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	}
+}
+
+/*
+ * Apply the received status and callback to an inode all in the same critical
+ * section to avoid races with afs_validate().
+ */
+void afs_vnode_commit_status(struct afs_fs_cursor *fc,
+			     struct afs_vnode *vnode,
+			     unsigned int cb_break,
+			     const afs_dataversion_t *expected_version,
+			     struct afs_status_cb *scb)
+{
+	if (fc->ac.error != 0)
+		return;
+
+	write_seqlock(&vnode->cb_lock);
+
+	if (scb->have_error) {
+		if (scb->status.abort_code == VNOVNODE) {
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			clear_nlink(&vnode->vfs_inode);
+			__afs_break_callback(vnode, afs_cb_break_for_deleted);
+		}
+	} else {
+		if (scb->have_status)
+			afs_apply_status(fc, vnode, scb, expected_version);
+		if (scb->have_cb)
+			afs_apply_callback(fc, vnode, scb, cb_break);
+	}
+
+	write_sequnlock(&vnode->cb_lock);
+
+	if (fc->ac.error == 0 && scb->have_status)
+		afs_cache_permit(vnode, fc->key, cb_break, scb);
+}
+
+/*
  * Fetch file status from the volume.
  */
-int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
+int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool is_new,
+		     afs_access_t *_caller_access)
 {
+	struct afs_status_cb *scb;
 	struct afs_fs_cursor fc;
 	int ret;
 
-	_enter("%s,{%x:%u.%u,S=%lx}",
+	_enter("%s,{%llx:%llu.%u,S=%lx}",
 	       vnode->volume->name,
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
 	       vnode->flags);
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		return -ENOMEM;
+
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_fetch_file_status(&fc, NULL, new_inode);
+			afs_fs_fetch_file_status(&fc, scb, NULL);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		if (fc.error) {
+			/* Do nothing. */
+		} else if (is_new) {
+			ret = afs_inode_init_from_status(vnode, key, fc.cbi,
+							 NULL, scb);
+			fc.error = ret;
+			if (ret == 0)
+				afs_cache_permit(vnode, key, fc.cb_break, scb);
+		} else {
+			afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+						&data_version, scb);
+		}
+		afs_check_for_remote_deletion(&fc, vnode);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
+	if (ret == 0 && _caller_access)
+		*_caller_access = scb->status.caller_access;
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -126,10 +354,10 @@
  */
 int afs_iget5_test(struct inode *inode, void *opaque)
 {
-	struct afs_iget_data *data = opaque;
+	struct afs_iget_data *iget_data = opaque;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	return inode->i_ino == data->fid.vnode &&
-		inode->i_generation == data->fid.unique;
+	return memcmp(&vnode->fid, &iget_data->fid, sizeof(iget_data->fid)) == 0;
 }
 
 /*
@@ -147,14 +375,19 @@
  */
 static int afs_iget5_set(struct inode *inode, void *opaque)
 {
-	struct afs_iget_data *data = opaque;
+	struct afs_iget_data *iget_data = opaque;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	inode->i_ino = data->fid.vnode;
-	inode->i_generation = data->fid.unique;
-	vnode->fid = data->fid;
-	vnode->volume = data->volume;
+	vnode->fid		= iget_data->fid;
+	vnode->volume		= iget_data->volume;
+	vnode->cb_v_break	= iget_data->cb_v_break;
+	vnode->cb_s_break	= iget_data->cb_s_break;
 
+	/* YFS supports 96-bit vnode IDs, but Linux only supports
+	 * 64-bit inode numbers.
+	 */
+	inode->i_ino		= iget_data->fid.vnode;
+	inode->i_generation	= iget_data->fid.unique;
 	return 0;
 }
 
@@ -164,38 +397,42 @@
  */
 struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 {
-	struct afs_iget_data data;
 	struct afs_super_info *as;
 	struct afs_vnode *vnode;
 	struct inode *inode;
 	static atomic_t afs_autocell_ino;
 
+	struct afs_iget_data iget_data = {
+		.cb_v_break = 0,
+		.cb_s_break = 0,
+	};
+
 	_enter("");
 
 	as = sb->s_fs_info;
 	if (as->volume) {
-		data.volume = as->volume;
-		data.fid.vid = as->volume->vid;
+		iget_data.volume = as->volume;
+		iget_data.fid.vid = as->volume->vid;
 	}
 	if (root) {
-		data.fid.vnode = 1;
-		data.fid.unique = 1;
+		iget_data.fid.vnode = 1;
+		iget_data.fid.unique = 1;
 	} else {
-		data.fid.vnode = atomic_inc_return(&afs_autocell_ino);
-		data.fid.unique = 0;
+		iget_data.fid.vnode = atomic_inc_return(&afs_autocell_ino);
+		iget_data.fid.unique = 0;
 	}
 
-	inode = iget5_locked(sb, data.fid.vnode,
+	inode = iget5_locked(sb, iget_data.fid.vnode,
 			     afs_iget5_pseudo_dir_test, afs_iget5_set,
-			     &data);
+			     &iget_data);
 	if (!inode) {
 		_leave(" = -ENOMEM");
 		return ERR_PTR(-ENOMEM);
 	}
 
-	_debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
-	       inode, inode->i_ino, data.fid.vid, data.fid.vnode,
-	       data.fid.unique);
+	_debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }",
+	       inode, inode->i_ino, iget_data.fid.vid, iget_data.fid.vnode,
+	       iget_data.fid.unique);
 
 	vnode = AFS_FS_I(inode);
 
@@ -206,16 +443,14 @@
 	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
 	if (root) {
 		inode->i_op	= &afs_dynroot_inode_operations;
-		inode->i_fop	= &afs_dynroot_file_operations;
+		inode->i_fop	= &simple_dir_operations;
 	} else {
 		inode->i_op	= &afs_autocell_inode_operations;
 	}
 	set_nlink(inode, 2);
 	inode->i_uid		= GLOBAL_ROOT_UID;
 	inode->i_gid		= GLOBAL_ROOT_GID;
-	inode->i_ctime.tv_sec	= get_seconds();
-	inode->i_ctime.tv_nsec	= 0;
-	inode->i_atime		= inode->i_mtime = inode->i_ctime;
+	inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
 	inode->i_blocks		= 0;
 	inode_set_iversion_raw(inode, 0);
 	inode->i_generation	= 0;
@@ -252,8 +487,8 @@
 
 	key.vnode_id		= vnode->fid.vnode;
 	key.unique		= vnode->fid.unique;
-	key.vnode_id_ext[0]	= 0;
-	key.vnode_id_ext[1]	= 0;
+	key.vnode_id_ext[0]	= vnode->fid.vnode >> 32;
+	key.vnode_id_ext[1]	= vnode->fid.vnode_hi;
 	aux.data_version	= vnode->status.data_version;
 
 	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
@@ -268,28 +503,30 @@
  * inode retrieval
  */
 struct inode *afs_iget(struct super_block *sb, struct key *key,
-		       struct afs_fid *fid, struct afs_file_status *status,
-		       struct afs_callback *cb, struct afs_cb_interest *cbi)
+		       struct afs_iget_data *iget_data,
+		       struct afs_status_cb *scb,
+		       struct afs_cb_interest *cbi,
+		       struct afs_vnode *parent_vnode)
 {
-	struct afs_iget_data data = { .fid = *fid };
 	struct afs_super_info *as;
 	struct afs_vnode *vnode;
+	struct afs_fid *fid = &iget_data->fid;
 	struct inode *inode;
 	int ret;
 
-	_enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
+	_enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique);
 
 	as = sb->s_fs_info;
-	data.volume = as->volume;
+	iget_data->volume = as->volume;
 
 	inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set,
-			     &data);
+			     iget_data);
 	if (!inode) {
 		_leave(" = -ENOMEM");
 		return ERR_PTR(-ENOMEM);
 	}
 
-	_debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+	_debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }",
 	       inode, fid->vid, fid->vnode, fid->unique);
 
 	vnode = AFS_FS_I(inode);
@@ -300,43 +537,25 @@
 		return inode;
 	}
 
-	if (!status) {
+	if (!scb) {
 		/* it's a remotely extant inode */
-		ret = afs_fetch_status(vnode, key, true);
+		ret = afs_fetch_status(vnode, key, true, NULL);
 		if (ret < 0)
 			goto bad_inode;
 	} else {
-		/* it's an inode we just created */
-		memcpy(&vnode->status, status, sizeof(vnode->status));
-
-		if (!cb) {
-			/* it's a symlink we just created (the fileserver
-			 * didn't give us a callback) */
-			vnode->cb_version = 0;
-			vnode->cb_type = 0;
-			vnode->cb_expires_at = 0;
-		} else {
-			vnode->cb_version = cb->version;
-			vnode->cb_type = cb->type;
-			vnode->cb_expires_at = cb->expiry;
-			vnode->cb_interest = afs_get_cb_interest(cbi);
-			set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-		}
-
-		vnode->cb_expires_at += ktime_get_real_seconds();
+		ret = afs_inode_init_from_status(vnode, key, cbi, parent_vnode,
+						 scb);
+		if (ret < 0)
+			goto bad_inode;
 	}
 
-	ret = afs_inode_init_from_status(vnode, key);
-	if (ret < 0)
-		goto bad_inode;
-
 	afs_get_inode_cache(vnode);
 
 	/* success */
 	clear_bit(AFS_VNODE_UNSET, &vnode->flags);
 	inode->i_flags |= S_NOATIME;
 	unlock_new_inode(inode);
-	_leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+	_leave(" = %p", inode);
 	return inode;
 
 	/* failure */
@@ -352,7 +571,7 @@
  */
 void afs_zap_data(struct afs_vnode *vnode)
 {
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_invalidate(vnode->cache);
@@ -368,6 +587,69 @@
 }
 
 /*
+ * Check the validity of a vnode/inode.
+ */
+bool afs_check_validity(struct afs_vnode *vnode)
+{
+	struct afs_cb_interest *cbi;
+	struct afs_server *server;
+	struct afs_volume *volume = vnode->volume;
+	enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
+	time64_t now = ktime_get_real_seconds();
+	bool valid;
+	unsigned int cb_break, cb_s_break, cb_v_break;
+	int seq = 0;
+
+	do {
+		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+		cb_v_break = READ_ONCE(volume->cb_v_break);
+		cb_break = vnode->cb_break;
+
+		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+			cbi = rcu_dereference(vnode->cb_interest);
+			server = rcu_dereference(cbi->server);
+			cb_s_break = READ_ONCE(server->cb_s_break);
+
+			if (vnode->cb_s_break != cb_s_break ||
+			    vnode->cb_v_break != cb_v_break) {
+				vnode->cb_s_break = cb_s_break;
+				vnode->cb_v_break = cb_v_break;
+				need_clear = afs_cb_break_for_vsbreak;
+				valid = false;
+			} else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+				need_clear = afs_cb_break_for_zap;
+				valid = false;
+			} else if (vnode->cb_expires_at - 10 <= now) {
+				need_clear = afs_cb_break_for_lapsed;
+				valid = false;
+			} else {
+				valid = true;
+			}
+		} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+			valid = true;
+		} else {
+			vnode->cb_v_break = cb_v_break;
+			valid = false;
+		}
+
+	} while (need_seqretry(&vnode->cb_lock, seq));
+
+	done_seqretry(&vnode->cb_lock, seq);
+
+	if (need_clear != afs_cb_break_no_break) {
+		write_seqlock(&vnode->cb_lock);
+		if (cb_break == vnode->cb_break)
+			__afs_break_callback(vnode, need_clear);
+		else
+			trace_afs_cb_miss(&vnode->fid, need_clear);
+		write_sequnlock(&vnode->cb_lock);
+		valid = false;
+	}
+
+	return valid;
+}
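
afs_check_validity() uses the seqlock "or_lock" idiom: the first pass reads
the state locklessly under a seqcount, and a pass that raced with a writer is
meant to be retried with the lock actually held (by going round again with an
odd seq value).  A generic sketch of the idiom, independent of this patch:

	int seq = 0;			/* even: lockless pass */
retry:
	read_seqbegin_or_lock(&lock, &seq);
	/* ... read the shared state; the lockless pass must tolerate
	 * running concurrently with a writer ... */
	if (need_seqretry(&lock, seq)) {
		seq = 1;		/* odd: take the lock and redo */
		goto retry;
	}
	done_seqretry(&lock, seq);	/* drops the lock if it was taken */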
+
+/*
  * validate a vnode/inode
  * - there are several things we need to check
  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -378,45 +660,16 @@
  */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
-	time64_t now = ktime_get_real_seconds();
 	bool valid;
 	int ret;
 
-	_enter("{v={%x:%u} fl=%lx},%x",
+	_enter("{v={%llx:%llu} fl=%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));
 
-	/* Quickly check the callback state.  Ideally, we'd use read_seqbegin
-	 * here, but we have no way to pass the net namespace to the RCU
-	 * cleanup for the server record.
-	 */
-	read_seqlock_excl(&vnode->cb_lock);
-
-	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break ||
-		    vnode->cb_v_break != vnode->volume->cb_v_break) {
-			vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
-			vnode->cb_v_break = vnode->volume->cb_v_break;
-			valid = false;
-		} else if (vnode->status.type == AFS_FTYPE_DIR &&
-			   (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) ||
-			    vnode->cb_expires_at - 10 <= now)) {
-			valid = false;
-		} else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) ||
-			   vnode->cb_expires_at - 10 <= now) {
-			valid = false;
-		} else {
-			valid = true;
-		}
-	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-		valid = true;
-	} else {
-		vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
-		vnode->cb_v_break = vnode->volume->cb_v_break;
-		valid = false;
-	}
-
-	read_sequnlock_excl(&vnode->cb_lock);
+	rcu_read_lock();
+	valid = afs_check_validity(vnode);
+	rcu_read_unlock();
 
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		clear_nlink(&vnode->vfs_inode);
@@ -432,7 +685,7 @@
 	 * access */
 	if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		_debug("not promised");
-		ret = afs_fetch_status(vnode, key, false);
+		ret = afs_fetch_status(vnode, key, false, NULL);
 		if (ret < 0) {
 			if (ret == -ENOENT) {
 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
@@ -503,11 +756,12 @@
  */
 void afs_evict_inode(struct inode *inode)
 {
+	struct afs_cb_interest *cbi;
 	struct afs_vnode *vnode;
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("{%x:%u.%d}",
+	_enter("{%llx:%llu.%d}",
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique);
@@ -519,10 +773,14 @@
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
-	if (vnode->cb_interest) {
-		afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest);
-		vnode->cb_interest = NULL;
+	write_seqlock(&vnode->cb_lock);
+	cbi = rcu_dereference_protected(vnode->cb_interest,
+					lockdep_is_held(&vnode->cb_lock.lock));
+	if (cbi) {
+		afs_put_cb_interest(afs_i2net(inode), cbi);
+		rcu_assign_pointer(vnode->cb_interest, NULL);
 	}
+	write_sequnlock(&vnode->cb_lock);
 
 	while (!list_empty(&vnode->wb_keys)) {
 		struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next,
@@ -542,7 +800,12 @@
 	}
 #endif
 
+	afs_prune_wb_keys(vnode);
 	afs_put_permits(rcu_access_pointer(vnode->permit_cache));
+	key_put(vnode->silly_key);
+	vnode->silly_key = NULL;
+	key_put(vnode->lock_key);
+	vnode->lock_key = NULL;
 	_leave("");
 }
 
@@ -552,11 +815,12 @@
 int afs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 	struct key *key;
-	int ret;
+	int ret = -ENOMEM;
 
-	_enter("{%x:%u},{n=%pd},%x",
+	_enter("{%llx:%llu},{n=%pd},%x",
 	       vnode->fid.vid, vnode->fid.vnode, dentry,
 	       attr->ia_valid);
 
@@ -566,6 +830,10 @@
 		return 0;
 	}
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+	if (!scb)
+		goto error;
+
 	/* flush any dirty data outstanding on a regular file */
 	if (S_ISREG(vnode->vfs_inode.i_mode))
 		filemap_write_and_wait(vnode->vfs_inode.i_mapping);
@@ -576,25 +844,33 @@
 		key = afs_request_key(vnode->volume->cell);
 		if (IS_ERR(key)) {
 			ret = PTR_ERR(key);
-			goto error;
+			goto error_scb;
 		}
 	}
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
+		if (attr->ia_valid & ATTR_SIZE)
+			data_version++;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_setattr(&fc, attr);
+			afs_fs_setattr(&fc, attr, scb);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
 	if (!(attr->ia_valid & ATTR_FILE))
 		key_put(key);
 
+error_scb:
+	kfree(scb);
 error:
 	_leave(" = %d", ret);
 	return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 34c02fd..759e057 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* internal AFS stuff
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/compiler.h>
@@ -22,6 +18,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uuid.h>
 #include <linux/mm_types.h>
+#include <linux/dns_resolver.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/sock.h>
@@ -35,15 +32,27 @@
 struct pagevec;
 struct afs_call;
 
-struct afs_mount_params {
-	bool			rwpath;		/* T if the parent should be considered R/W */
+/*
+ * Partial file-locking emulation mode.  (The problem being that AFS3 only
+ * allows whole-file locks and does not support upgrading/downgrading them.)
+ */
+enum afs_flock_mode {
+	afs_flock_mode_unset,
+	afs_flock_mode_local,	/* Local locking only */
+	afs_flock_mode_openafs,	/* Don't get server lock for a partial lock */
+	afs_flock_mode_strict,	/* Always get a server lock for a partial lock */
+	afs_flock_mode_write,	/* Get an exclusive server lock for a partial lock */
+};
+
+struct afs_fs_context {
 	bool			force;		/* T to force cell type */
 	bool			autocell;	/* T if set auto mount operation */
 	bool			dyn_root;	/* T if dynamic root */
+	bool			no_cell;	/* T if the source is "none" (for dynroot) */
+	enum afs_flock_mode	flock_mode;	/* Partial file-locking emulation mode */
 	afs_voltype_t		type;		/* type of volume requested */
-	int			volnamesz;	/* size of volume name */
+	unsigned int		volnamesz;	/* size of volume name */
 	const char		*volname;	/* name of volume to mount */
-	struct net		*net_ns;	/* Network namespace in effect */
 	struct afs_net		*net;		/* the AFS net namespace stuff */
 	struct afs_cell		*cell;		/* cell in which to find volume */
 	struct afs_volume	*volume;	/* volume record */
@@ -53,6 +62,8 @@
 struct afs_iget_data {
 	struct afs_fid		fid;
 	struct afs_volume	*volume;	/* volume on which resides */
+	unsigned int		cb_v_break;	/* Pre-fetch volume break count */
+	unsigned int		cb_s_break;	/* Pre-fetch server break count */
 };
 
 enum afs_call_state {
@@ -73,12 +84,17 @@
 	struct rcu_head		rcu;		/* Must be first */
 	refcount_t		usage;
 	u32			version;	/* Version */
-	unsigned short		nr_addrs;
-	unsigned short		index;		/* Address currently in use */
-	unsigned short		nr_ipv4;	/* Number of IPv4 addresses */
+	unsigned char		max_addrs;
+	unsigned char		nr_addrs;
+	unsigned char		preferred;	/* Preferred address */
+	unsigned char		nr_ipv4;	/* Number of IPv4 addresses */
+	enum dns_record_source	source:8;
+	enum dns_lookup_status	status:8;
 	unsigned long		probed;		/* Mask of servers that have been probed */
-	unsigned long		yfs;		/* Mask of servers that are YFS */
+	unsigned long		failed;		/* Mask of addrs that failed locally/ICMP */
+	unsigned long		responded;	/* Mask of addrs that responded */
 	struct sockaddr_rxrpc	addrs[];
+#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
 };
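
The probed/failed/responded fields are per-address bitmasks indexed by an
address's position in addrs[], which is why AFS_MAX_ADDRESSES is capped at
the bit width of an unsigned long.  A sketch of the implied usage (assumed
from the field comments, not copied from the patch):

	set_bit(i, &alist->responded);		/* address i answered */
	if (test_bit(i, &alist->failed))	/* skip broken addresses */
		continue;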
 
 /*
@@ -86,48 +102,77 @@
  */
 struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
+	struct afs_addr_list	*alist;		/* Address is alist[addr_ix] */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
 	struct work_struct	async_work;	/* async I/O processor */
 	struct work_struct	work;		/* actual work processor */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct key		*key;		/* security for this call */
 	struct afs_net		*net;		/* The network namespace */
-	struct afs_server	*cm_server;	/* Server affected by incoming CM call */
+	struct afs_server	*server;	/* The fileserver record if fs op (pins ref) */
+	struct afs_vlserver	*vlserver;	/* The vlserver record if vl op */
 	struct afs_cb_interest	*cbi;		/* Callback interest for server used */
+	struct afs_vnode	*lvnode;	/* vnode being locked */
 	void			*request;	/* request data (first part) */
 	struct address_space	*mapping;	/* Pages being written from */
+	struct iov_iter		iter;		/* Buffer iterator */
+	struct iov_iter		*_iter;		/* Iterator currently in use */
+	union {	/* Convenience for ->iter */
+		struct kvec	kvec[1];
+		struct bio_vec	bvec[1];
+	};
 	void			*buffer;	/* reply receive buffer */
-	void			*reply[4];	/* Where to put the reply */
+	union {
+		long			ret0;	/* Value to reply with instead of 0 */
+		struct afs_addr_list	*ret_alist;
+		struct afs_vldb_entry	*ret_vldb;
+		struct afs_acl		*ret_acl;
+	};
+	struct afs_fid		*out_fid;
+	struct afs_status_cb	*out_dir_scb;
+	struct afs_status_cb	*out_scb;
+	struct yfs_acl		*out_yacl;
+	struct afs_volsync	*out_volsync;
+	struct afs_volume_status *out_volstatus;
+	struct afs_read		*read_request;
+	unsigned int		server_index;
 	pgoff_t			first;		/* first page in mapping to deal with */
 	pgoff_t			last;		/* last page in mapping to deal with */
-	size_t			offset;		/* offset into received data store */
 	atomic_t		usage;
 	enum afs_call_state	state;
 	spinlock_t		state_lock;
 	int			error;		/* error code */
 	u32			abort_code;	/* Remote abort ID or 0 */
+	u32			epoch;
+	unsigned int		max_lifespan;	/* Maximum lifespan to set if not 0 */
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
 	unsigned		first_offset;	/* offset into mapping[first] */
-	unsigned int		cb_break;	/* cb_break + cb_s_break before the call */
 	union {
 		unsigned	last_to;	/* amount of mapping[last] */
 		unsigned	count2;		/* count used in unmarshalling */
 	};
 	unsigned char		unmarshall;	/* unmarshalling phase */
+	unsigned char		addr_ix;	/* Address in ->alist */
 	bool			incoming;	/* T if incoming call */
 	bool			send_pages;	/* T if data from mapping should be sent */
 	bool			need_attention;	/* T if RxRPC poked us */
 	bool			async;		/* T if asynchronous */
-	bool			ret_reply0;	/* T if should return reply[0] on success */
 	bool			upgrade;	/* T to request service upgrade */
+	bool			have_reply_time; /* T if have got reply_time */
+	bool			intr;		/* T if interruptible */
 	u16			service_id;	/* Actual service ID (after upgrade) */
 	unsigned int		debug_id;	/* Trace ID */
 	u32			operation_ID;	/* operation ID for an incoming call */
 	u32			count;		/* count for use in unmarshalling */
-	__be32			tmp;		/* place to extract temporary data */
-	afs_dataversion_t	expected_version; /* Updated version expected from store */
-	afs_dataversion_t	expected_version_2; /* 2nd updated version expected from store */
+	union {					/* place to extract temporary data */
+		struct {
+			__be32	tmp_u;
+			__be32	tmp;
+		} __attribute__((packed));
+		__be64		tmp64;
+	};
+	ktime_t			reply_time;	/* Time of first reply packet */
 };
 
 struct afs_call_type {
@@ -144,6 +189,9 @@
 
 	/* Work function */
 	void (*work)(struct work_struct *work);
+
+	/* Call done function (gets called immediately on success or failure) */
+	void (*done)(struct afs_call *call);
 };
 
 /*
@@ -183,7 +231,9 @@
 	refcount_t		usage;
 	unsigned int		index;		/* Which page we're reading into */
 	unsigned int		nr_pages;
-	void (*page_done)(struct afs_call *, struct afs_read *);
+	unsigned int		offset;		/* offset into current page */
+	struct afs_vnode	*vnode;
+	void (*page_done)(struct afs_read *);
 	struct page		**pages;
 	struct page		*array[];
 };
@@ -196,6 +246,7 @@
 	struct net		*net_ns;	/* Network namespace */
 	struct afs_cell		*cell;		/* The cell in which the volume resides */
 	struct afs_volume	*volume;	/* volume record */
+	enum afs_flock_mode	flock_mode:8;	/* File locking emulation mode */
 	bool			dyn_root;	/* True if dynamic root */
 };
 
@@ -328,26 +379,83 @@
 	time64_t		last_inactive;	/* Time of last drop of usage count */
 	atomic_t		usage;
 	unsigned long		flags;
-#define AFS_CELL_FL_NOT_READY	0		/* The cell record is not ready for use */
-#define AFS_CELL_FL_NO_GC	1		/* The cell was added manually, don't auto-gc */
-#define AFS_CELL_FL_NOT_FOUND	2		/* Permanent DNS error */
-#define AFS_CELL_FL_DNS_FAIL	3		/* Failed to access DNS */
-#define AFS_CELL_FL_NO_LOOKUP_YET 4		/* Not completed first DNS lookup yet */
+#define AFS_CELL_FL_NO_GC	0		/* The cell was added manually, don't auto-gc */
+#define AFS_CELL_FL_DO_LOOKUP	1		/* DNS lookup requested */
 	enum afs_cell_state	state;
 	short			error;
+	enum dns_record_source	dns_source:8;	/* Latest source of data from lookup */
+	enum dns_lookup_status	dns_status:8;	/* Latest status of data from lookup */
+	unsigned int		dns_lookup_count; /* Counter of DNS lookups */
 
 	/* Active fileserver interaction state. */
 	struct list_head	proc_volumes;	/* procfs volume list */
 	rwlock_t		proc_lock;
 
 	/* VL server list. */
-	rwlock_t		vl_addrs_lock;	/* Lock on vl_addrs */
-	struct afs_addr_list	__rcu *vl_addrs; /* List of VL servers */
+	rwlock_t		vl_servers_lock; /* Lock on vl_servers */
+	struct afs_vlserver_list __rcu *vl_servers;
+
 	u8			name_len;	/* Length of name */
 	char			name[64 + 1];	/* Cell name, case-flattened and NUL-padded */
 };
 
 /*
+ * Volume Location server record.
+ */
+struct afs_vlserver {
+	struct rcu_head		rcu;
+	struct afs_addr_list	__rcu *addresses; /* List of addresses for this VL server */
+	unsigned long		flags;
+#define AFS_VLSERVER_FL_PROBED	0		/* The VL server has been probed */
+#define AFS_VLSERVER_FL_PROBING	1		/* VL server is being probed */
+#define AFS_VLSERVER_FL_IS_YFS	2		/* Server is YFS not AFS */
+	rwlock_t		lock;		/* Lock on addresses */
+	atomic_t		usage;
+
+	/* Probe state */
+	wait_queue_head_t	probe_wq;
+	atomic_t		probe_outstanding;
+	spinlock_t		probe_lock;
+	struct {
+		unsigned int	rtt;		/* RTT as ktime/64 */
+		u32		abort_code;
+		short		error;
+		bool		have_result;
+		bool		responded:1;
+		bool		is_yfs:1;
+		bool		not_yfs:1;
+		bool		local_failure:1;
+	} probe;
+
+	u16			port;
+	u16			name_len;	/* Length of name */
+	char			name[];		/* Server name, case-flattened */
+};
+
+/*
+ * Weighted list of Volume Location servers.
+ */
+struct afs_vlserver_entry {
+	u16			priority;	/* Preference (as SRV) */
+	u16			weight;		/* Weight (as SRV) */
+	enum dns_record_source	source:8;
+	enum dns_lookup_status	status:8;
+	struct afs_vlserver	*server;
+};
+
+struct afs_vlserver_list {
+	struct rcu_head		rcu;
+	atomic_t		usage;
+	u8			nr_servers;
+	u8			index;		/* Server currently in use */
+	u8			preferred;	/* Preferred server */
+	enum dns_record_source	source:8;
+	enum dns_lookup_status	status:8;
+	rwlock_t		lock;
+	struct afs_vlserver_entry servers[];
+};
+
+/*
  * Cached VLDB entry.
  *
  * This is pointed to by cell->vldb_entries, indexed by name.
@@ -392,7 +500,6 @@
 	time64_t		put_time;	/* Time at which last put */
 	time64_t		update_at;	/* Time at which to next update the record */
 	unsigned long		flags;
-#define AFS_SERVER_FL_NEW	0		/* New server, don't inc cb_s_break */
 #define AFS_SERVER_FL_NOT_READY	1		/* The record is not ready for use */
 #define AFS_SERVER_FL_NOT_FOUND	2		/* VL server says no such server */
 #define AFS_SERVER_FL_VL_FAIL	3		/* Failed to access VL server */
@@ -401,8 +508,13 @@
 #define AFS_SERVER_FL_PROBING	6		/* Fileserver is being probed */
 #define AFS_SERVER_FL_NO_IBULK	7		/* Fileserver doesn't support FS.InlineBulkStatus */
 #define AFS_SERVER_FL_MAY_HAVE_CB 8		/* May have callbacks on this fileserver */
+#define AFS_SERVER_FL_IS_YFS	9		/* Server is YFS not AFS */
+#define AFS_SERVER_FL_NO_RM2	10		/* Fileserver doesn't support YFS.RemoveFile2 */
+#define AFS_SERVER_FL_HAVE_EPOCH 11		/* ->epoch is valid */
 	atomic_t		usage;
 	u32			addr_version;	/* Address list version */
+	u32			cm_epoch;	/* Server RxRPC epoch */
+	unsigned int		debug_id;	/* Debugging ID for traces */
 
 	/* file service access */
 	rwlock_t		fs_lock;	/* access lock */
@@ -411,6 +523,26 @@
 	struct hlist_head	cb_volumes;	/* List of volume interests on this server */
 	unsigned		cb_s_break;	/* Break-everything counter. */
 	rwlock_t		cb_break_lock;	/* Volume finding lock */
+
+	/* Probe state */
+	wait_queue_head_t	probe_wq;
+	atomic_t		probe_outstanding;
+	spinlock_t		probe_lock;
+	struct {
+		unsigned int	rtt;		/* RTT as ktime/64 */
+		u32		abort_code;
+		u32		cm_epoch;
+		short		error;
+		bool		have_result;
+		bool		responded:1;
+		bool		is_yfs:1;
+		bool		not_yfs:1;
+		bool		local_failure:1;
+		bool		no_epoch:1;
+		bool		cm_probed:1;
+		bool		said_rebooted:1;
+		bool		said_inconsistent:1;
+	} probe;
 };
 
 /*
@@ -419,7 +551,10 @@
 struct afs_vol_interest {
 	struct hlist_node	srv_link;	/* Link in server->cb_volumes */
 	struct hlist_head	cb_interests;	/* List of callback interests on the server */
-	afs_volid_t		vid;		/* Volume ID to match */
+	union {
+		struct rcu_head	rcu;
+		afs_volid_t	vid;		/* Volume ID to match */
+	};
 	unsigned int		usage;
 };
 
@@ -431,7 +566,10 @@
 	struct afs_vol_interest	*vol_interest;
 	struct afs_server	*server;	/* Server on which this interest resides */
 	struct super_block	*sb;		/* Superblock on which inodes reside */
-	afs_volid_t		vid;		/* Volume ID to match */
+	union {
+		struct rcu_head	rcu;
+		afs_volid_t	vid;		/* Volume ID to match */
+	};
 	refcount_t		usage;
 };
 
@@ -445,8 +583,8 @@
 
 struct afs_server_list {
 	refcount_t		usage;
-	unsigned short		nr_servers;
-	unsigned short		index;		/* Server currently in use */
+	unsigned char		nr_servers;
+	unsigned char		preferred;	/* Preferred server */
 	unsigned short		vnovol_mask;	/* Servers to be skipped due to VNOVOL */
 	unsigned int		seq;		/* Set to ->servers_seq when installed */
 	rwlock_t		lock;
@@ -477,7 +615,7 @@
 	unsigned int		servers_seq;	/* Incremented each time ->servers changes */
 
 	unsigned		cb_v_break;	/* Break-everything counter. */
-	rwlock_t		cb_break_lock;
+	rwlock_t		cb_v_break_lock;
 
 	afs_voltype_t		type;		/* type of volume */
 	short			error;
@@ -494,6 +632,7 @@
 	AFS_VNODE_LOCK_EXTENDING,	/* We're extending a lock on the server */
 	AFS_VNODE_LOCK_NEED_UNLOCK,	/* We need to unlock on the server */
 	AFS_VNODE_LOCK_UNLOCKING,	/* We're telling the server to unlock */
+	AFS_VNODE_LOCK_DELETED,		/* The vnode has been deleted whilst we have a lock */
 };
 
 /*
@@ -515,6 +654,8 @@
 	struct afs_permits __rcu *permit_cache;	/* cache of permits so far obtained */
 	struct mutex		io_lock;	/* Lock for serialising I/O on this mutex */
 	struct rw_semaphore	validate_lock;	/* lock for validating this vnode */
+	struct rw_semaphore	rmdir_lock;	/* Lock for rmdir vs sillyrename */
+	struct key		*silly_key;	/* Silly rename key */
 	spinlock_t		wb_lock;	/* lock for wb_keys */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
@@ -533,21 +674,29 @@
 	struct list_head	granted_locks;	/* locks granted on this file */
 	struct delayed_work	lock_work;	/* work to be done in locking */
 	struct key		*lock_key;	/* Key to be used in lock ops */
+	ktime_t			locked_at;	/* Time at which lock obtained */
 	enum afs_lock_state	lock_state : 8;
 	afs_lock_type_t		lock_type : 8;
 
 	/* outstanding callback notification on this file */
-	struct afs_cb_interest	*cb_interest;	/* Server on which this resides */
+	struct afs_cb_interest __rcu *cb_interest; /* Server on which this resides */
 	unsigned int		cb_s_break;	/* Mass break counter on ->server */
 	unsigned int		cb_v_break;	/* Mass break counter on ->volume */
 	unsigned int		cb_break;	/* Break counter on vnode */
 	seqlock_t		cb_lock;	/* Lock for ->cb_interest, ->status, ->cb_*break */
 
 	time64_t		cb_expires_at;	/* time at which callback expires */
-	unsigned		cb_version;	/* callback version */
-	afs_callback_type_t	cb_type;	/* type of callback */
 };
 
+static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
+{
+#ifdef CONFIG_AFS_FSCACHE
+	return vnode->cache;
+#else
+	return NULL;
+#endif
+}
+
 /*
  * cached security record for one user's attempt to access a vnode
  */
@@ -571,12 +720,11 @@
 };
 
 /*
- * record of one of a system's set of network interfaces
+ * Error prioritisation and accumulation.
  */
-struct afs_interface {
-	struct in_addr	address;	/* IPv4 address bound to interface */
-	struct in_addr	netmask;	/* netmask applied to address */
-	unsigned	mtu;		/* MTU of interface */
+struct afs_error {
+	short	error;			/* Accumulated error */
+	bool	responded;		/* T if server responded */
 };
 
 /*
@@ -584,28 +732,48 @@
  */
 struct afs_addr_cursor {
 	struct afs_addr_list	*alist;		/* Current address list (pins ref) */
-	struct sockaddr_rxrpc	*addr;
-	u32			abort_code;
-	unsigned short		start;		/* Starting point in alist->addrs[] */
-	unsigned short		index;		/* Wrapping offset from start to current addr */
-	short			error;
-	bool			begun;		/* T if we've begun iteration */
+	unsigned long		tried;		/* Tried addresses */
+	signed char		index;		/* Current address */
 	bool			responded;	/* T if the current address responded */
+	unsigned short		nr_iterations;	/* Number of address iterations */
+	short			error;
+	u32			abort_code;
+};
+
+/*
+ * Cursor for iterating over a set of volume location servers.
+ */
+struct afs_vl_cursor {
+	struct afs_addr_cursor	ac;
+	struct afs_cell		*cell;		/* The cell we're querying */
+	struct afs_vlserver_list *server_list;	/* Current server list (pins ref) */
+	struct afs_vlserver	*server;	/* Server on which this resides */
+	struct key		*key;		/* Key for the server */
+	unsigned long		untried;	/* Bitmask of untried servers */
+	short			index;		/* Current server */
+	short			error;
+	unsigned short		flags;
+#define AFS_VL_CURSOR_STOP	0x0001		/* Set to cease iteration */
+#define AFS_VL_CURSOR_RETRY	0x0002		/* Set to do a retry */
+#define AFS_VL_CURSOR_RETRIED	0x0004		/* Set if started a retry */
+	unsigned short		nr_iterations;	/* Number of server iterations */
 };
 
 /*
  * Cursor for iterating over a set of fileservers.
  */
 struct afs_fs_cursor {
+	const struct afs_call_type *type;	/* Type of call done */
 	struct afs_addr_cursor	ac;
 	struct afs_vnode	*vnode;
 	struct afs_server_list	*server_list;	/* Current server list (pins ref) */
 	struct afs_cb_interest	*cbi;		/* Server on which this resides (pins ref) */
 	struct key		*key;		/* Key for the server */
+	unsigned long		untried;	/* Bitmask of untried servers */
 	unsigned int		cb_break;	/* cb_break + cb_s_break before the call */
 	unsigned int		cb_break_2;	/* cb_break + cb_s_break (2nd vnode) */
-	unsigned char		start;		/* Initial index in server list */
-	unsigned char		index;		/* Number of servers tried beyond start */
+	short			index;		/* Current server */
+	short			error;
 	unsigned short		flags;
 #define AFS_FS_CURSOR_STOP	0x0001		/* Set to cease iteration */
 #define AFS_FS_CURSOR_VBUSY	0x0002		/* Set if seen VBUSY */
@@ -613,6 +781,8 @@
 #define AFS_FS_CURSOR_VNOVOL	0x0008		/* Set if seen VNOVOL */
 #define AFS_FS_CURSOR_CUR_ONLY	0x0010		/* Set if current server only (file lock held) */
 #define AFS_FS_CURSOR_NO_VSLEEP	0x0020		/* Set to prevent sleep on VBUSY, VOFFLINE, ... */
+#define AFS_FS_CURSOR_INTR	0x0040		/* Set if op is interruptible */
+	unsigned short		nr_iterations;	/* Number of server iterations */
 };
 
 /*
@@ -638,12 +808,12 @@
 						unsigned short,
 						unsigned short);
 extern void afs_put_addrlist(struct afs_addr_list *);
-extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char,
-						  unsigned short, unsigned short);
-extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *);
+extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+						      const char *, size_t, char,
+						      unsigned short, unsigned short);
+extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
 extern bool afs_iterate_addresses(struct afs_addr_cursor *);
 extern int afs_end_cursor(struct afs_addr_cursor *);
-extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *);
 
 extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
 extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
@@ -666,8 +836,9 @@
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
-extern void afs_break_callback(struct afs_vnode *);
-extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
+extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
+extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
+extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *);
 
 extern int afs_register_server_cb_interest(struct afs_vnode *,
 					   struct afs_server_list *, unsigned int);
@@ -683,13 +854,15 @@
 
 static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
 {
-	return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+	return vnode->cb_break + vnode->cb_v_break;
 }
 
-static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
-					    struct afs_cb_interest *cbi)
+static inline bool afs_cb_is_broken(unsigned int cb_break,
+				    const struct afs_vnode *vnode,
+				    const struct afs_cb_interest *cbi)
 {
-	return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+	return !cbi || cb_break != (vnode->cb_break +
+				    vnode->volume->cb_v_break);
 }
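
The rewritten helper pairs with afs_calc_vnode_cb_break() above: a caller
snapshots the counter sum before issuing an RPC and only trusts the returned
callback promise if no break bumped either counter in the meantime (this is
exactly what afs_apply_callback() does in the inode.c changes).  Sketch:

	unsigned int cb_break = afs_calc_vnode_cb_break(vnode);

	/* ... issue the fileserver RPC and wait for the reply ... */

	if (!afs_cb_is_broken(cb_break, vnode, fc->cbi)) {
		/* no callback break arrived during the call; the
		 * server's promise can be cached */
	}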
 
 /*
@@ -728,9 +901,15 @@
 extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
 
 /*
+ * dir_silly.c
+ */
+extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *,
+			   struct dentry *, struct key *);
+extern int afs_silly_iput(struct dentry *, struct inode *);
+
+/*
  * dynroot.c
  */
-extern const struct file_operations afs_dynroot_file_operations;
 extern const struct inode_operations afs_dynroot_inode_operations;
 extern const struct dentry_operations afs_dynroot_dentry_operations;
 
@@ -760,6 +939,7 @@
  */
 extern struct workqueue_struct *afs_lock_manager;
 
+extern void afs_lock_op_done(struct afs_call *);
 extern void afs_lock_work(struct work_struct *);
 extern void afs_lock_may_be_available(struct afs_vnode *);
 extern int afs_lock(struct file *, int, struct file_lock *);
@@ -768,53 +948,73 @@
 /*
  * fsclient.c
  */
-#define AFS_VNODE_NOT_YET_SET	0x01
-#define AFS_VNODE_META_CHANGED	0x02
-#define AFS_VNODE_DATA_CHANGED	0x04
-extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *,
-					 const afs_dataversion_t *, u8);
-
-extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
+extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *,
+				    struct afs_volsync *);
 extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
-extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
-extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64,
-			 struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64);
-extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
-extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
-			  struct afs_fid *, struct afs_file_status *);
+extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *);
+extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t,
+			 struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
+extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool,
+			 struct afs_status_cb *);
+extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+		       struct afs_status_cb *, struct afs_status_cb *);
+extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+			  struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
 extern int afs_fs_rename(struct afs_fs_cursor *, const char *,
-			 struct afs_vnode *, const char *, u64, u64);
+			 struct afs_vnode *, const char *,
+			 struct afs_status_cb *, struct afs_status_cb *);
 extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
-			     pgoff_t, pgoff_t, unsigned, unsigned);
-extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+			     pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
 extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
-extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
-extern int afs_fs_extend_lock(struct afs_fs_cursor *);
-extern int afs_fs_release_lock(struct afs_fs_cursor *);
+extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
+extern int afs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int afs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *);
 extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
 					struct afs_addr_cursor *, struct key *);
-extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
-				   struct afs_addr_cursor *, struct key *);
+extern struct afs_call *afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
+						struct afs_addr_cursor *, struct key *,
+						unsigned int);
 extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
-				     struct afs_fid *, struct afs_file_status *,
-				     struct afs_callback *, unsigned int,
-				     struct afs_volsync *);
+				     struct afs_fid *, struct afs_status_cb *,
+				     unsigned int, struct afs_volsync *);
 extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
-			       struct afs_fid *, struct afs_file_status *,
-			       struct afs_callback *, struct afs_volsync *);
+			       struct afs_fid *, struct afs_status_cb *,
+			       struct afs_volsync *);
+
+struct afs_acl {
+	u32	size;
+	u8	data[];
+};
+
+extern struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int afs_fs_store_acl(struct afs_fs_cursor *, const struct afs_acl *,
+			    struct afs_status_cb *);
+
+/*
+ * fs_probe.c
+ */
+extern void afs_fileserver_probe_result(struct afs_call *);
+extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *);
+extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
 
 /*
  * inode.c
  */
-extern int afs_fetch_status(struct afs_vnode *, struct key *, bool);
+extern void afs_vnode_commit_status(struct afs_fs_cursor *,
+				    struct afs_vnode *,
+				    unsigned int,
+				    const afs_dataversion_t *,
+				    struct afs_status_cb *);
+extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
 extern int afs_iget5_test(struct inode *, void *);
 extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
 extern struct inode *afs_iget(struct super_block *, struct key *,
-			      struct afs_fid *, struct afs_file_status *,
-			      struct afs_callback *,
-			      struct afs_cb_interest *);
+			      struct afs_iget_data *, struct afs_status_cb *,
+			      struct afs_cb_interest *,
+			      struct afs_vnode *);
 extern void afs_zap_data(struct afs_vnode *);
+extern bool afs_check_validity(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int afs_setattr(struct dentry *, struct iattr *);
@@ -868,6 +1068,7 @@
  * misc.c
  */
 extern int afs_abort_to_error(u32);
+extern void afs_prioritise_error(struct afs_error *, int, u32);
 
 /*
  * mntpt.c
@@ -880,12 +1081,6 @@
 extern void afs_mntpt_kill_timer(void);
 
 /*
- * netdevices.c
- */
-extern int afs_get_ipv4_interfaces(struct afs_net *, struct afs_interface *,
-				   size_t, bool);
-
-/*
  * proc.c
  */
 #ifdef CONFIG_PROC_FS
@@ -906,7 +1101,7 @@
  * rotate.c
  */
 extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *,
-				      struct key *);
+				      struct key *, bool);
 extern bool afs_select_fileserver(struct afs_fs_cursor *);
 extern bool afs_select_current_fileserver(struct afs_fs_cursor *);
 extern int afs_end_vnode_operation(struct afs_fs_cursor *);
@@ -920,20 +1115,53 @@
 extern void __net_exit afs_close_socket(struct afs_net *);
 extern void afs_charge_preallocation(struct work_struct *);
 extern void afs_put_call(struct afs_call *);
-extern int afs_queue_call_work(struct afs_call *);
-extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool);
+extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
+extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
 extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
 					    const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
 extern void afs_send_empty_reply(struct afs_call *);
 extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
-extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
-extern int afs_protocol_error(struct afs_call *, int);
+extern int afs_extract_data(struct afs_call *, bool);
+extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause);
+
+static inline void afs_set_fc_call(struct afs_call *call, struct afs_fs_cursor *fc)
+{
+	call->intr = fc->flags & AFS_FS_CURSOR_INTR;
+	fc->type = call->type;
+}
+
+static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
+{
+	call->kvec[0].iov_base = buf;
+	call->kvec[0].iov_len = size;
+	iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
+}
+
+static inline void afs_extract_to_tmp(struct afs_call *call)
+{
+	afs_extract_begin(call, &call->tmp, sizeof(call->tmp));
+}
+
+static inline void afs_extract_to_tmp64(struct afs_call *call)
+{
+	afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64));
+}
+
+static inline void afs_extract_discard(struct afs_call *call, size_t size)
+{
+	iov_iter_discard(&call->iter, READ, size);
+}
+
+static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
+{
+	afs_extract_begin(call, call->buffer, size);
+}
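/*
 * Illustrative only (not part of the header): delivery routines typically
 * chain these helpers into a resumable decoder.  The call->unmarshall and
 * call->count usage below follows the pattern of the fsclient.c deliver
 * functions; afs_extract_data() returns -EAGAIN until enough of the reply
 * has arrived, at which point the switch resumes where it left off:
 *
 *	switch (call->unmarshall) {
 *	case 0:
 *		afs_extract_to_tmp(call);		// expect a 32-bit count
 *		call->unmarshall++;
 *		// Fall through
 *	case 1:
 *		ret = afs_extract_data(call, true);	// more data follows
 *		if (ret < 0)
 *			return ret;			// may be -EAGAIN
 *		call->count = ntohl(call->tmp);
 *		afs_extract_to_buf(call, call->count);
 *		call->unmarshall++;
 *		// Fall through
 *	case 2:
 *		ret = afs_extract_data(call, false);	// last piece
 *		if (ret < 0)
 *			return ret;
 *		break;
 *	}
 */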
 
 static inline int afs_transfer_reply(struct afs_call *call)
 {
-	return afs_extract_data(call, call->buffer, call->reply_max, false);
+	return afs_extract_data(call, false);
 }
 
 static inline bool afs_check_call_state(struct afs_call *call,
@@ -984,9 +1212,11 @@
  */
 extern void afs_put_permits(struct afs_permits *);
 extern void afs_clear_permits(struct afs_vnode *);
-extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int,
+			     struct afs_status_cb *);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
+extern struct key *afs_request_key_rcu(struct afs_cell *);
 extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
 extern int afs_permission(struct inode *, int);
 extern void __exit afs_clean_up_permit_cache(void);
@@ -996,21 +1226,15 @@
  */
 extern spinlock_t afs_server_peer_lock;
 
-static inline struct afs_server *afs_get_server(struct afs_server *server)
-{
-	atomic_inc(&server->usage);
-	return server;
-}
-
 extern struct afs_server *afs_find_server(struct afs_net *,
 					  const struct sockaddr_rxrpc *);
 extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
 extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *);
-extern void afs_put_server(struct afs_net *, struct afs_server *);
+extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
+extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
 extern void afs_manage_servers(struct work_struct *);
 extern void afs_servers_timer(struct timer_list *);
 extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_probe_fileserver(struct afs_fs_cursor *);
 extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *);
 
 /*
@@ -1037,14 +1261,51 @@
 /*
  * vlclient.c
  */
-extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *,
-							 struct afs_addr_cursor *,
-							 struct key *, const char *, int);
-extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *,
-						struct key *, const uuid_t *);
-extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *);
-extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *,
-						     struct key *, const uuid_t *);
+extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
+							 const char *, int);
+extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
+extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *,
+						struct key *, struct afs_vlserver *, unsigned int);
+extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
+
+/*
+ * vl_probe.c
+ */
+extern void afs_vlserver_probe_result(struct afs_call *);
+extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *);
+extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long);
+
+/*
+ * vl_rotate.c
+ */
+extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *,
+					 struct afs_cell *, struct key *);
+extern bool afs_select_vlserver(struct afs_vl_cursor *);
+extern bool afs_select_current_vlserver(struct afs_vl_cursor *);
+extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
+
+/*
+ * vlserver_list.c
+ */
+static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
+{
+	atomic_inc(&vlserver->usage);
+	return vlserver;
+}
+
+static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
+{
+	if (vllist)
+		atomic_inc(&vllist->usage);
+	return vllist;
+}
+
+extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short);
+extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *);
+extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int);
+extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *);
+extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
+							   const void *, size_t);
 
 /*
  * volume.c
@@ -1056,7 +1317,7 @@
 	return volume;
 }
 
-extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
 extern void afs_activate_volume(struct afs_volume *);
 extern void afs_deactivate_volume(struct afs_volume *);
 extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
@@ -1074,7 +1335,6 @@
 			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
 extern int afs_fsync(struct file *, loff_t, loff_t, int);
 extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
@@ -1087,6 +1347,55 @@
 extern const struct xattr_handler *afs_xattr_handlers[];
 extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 
+/*
+ * yfsclient.c
+ */
+extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *,
+				    struct afs_volsync *);
+extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *);
+extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *,
+			      struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *,
+			   struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+			       struct afs_status_cb *, struct afs_status_cb *);
+extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool,
+			 struct afs_status_cb *);
+extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+		       struct afs_status_cb *, struct afs_status_cb *);
+extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+			  struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_rename(struct afs_fs_cursor *, const char *, struct afs_vnode *, const char *,
+			 struct afs_status_cb *, struct afs_status_cb *);
+extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
+			     pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
+extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
+extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
+extern int yfs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int yfs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
+			       struct afs_fid *, struct afs_status_cb *,
+			       struct afs_volsync *);
+extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
+				     struct afs_fid *, struct afs_status_cb *,
+				     unsigned int, struct afs_volsync *);
+
+struct yfs_acl {
+	struct afs_acl	*acl;		/* Dir/file/symlink ACL */
+	struct afs_acl	*vol_acl;	/* Whole volume ACL */
+	u32		inherit_flag;	/* True if ACL is inherited from parent dir */
+	u32		num_cleaned;	/* Number of ACEs removed due to subject removal */
+	unsigned int	flags;
+#define YFS_ACL_WANT_ACL	0x01	/* Set if caller wants ->acl */
+#define YFS_ACL_WANT_VOL_ACL	0x02	/* Set if caller wants ->vol_acl */
+};
+
+extern void yfs_free_opaque_acl(struct yfs_acl *);
+extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *, struct yfs_acl *,
+					       struct afs_status_cb *);
+extern int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *, const struct afs_acl *,
+				    struct afs_status_cb *);
 
 /*
  * Miscellaneous inline functions.
@@ -1101,23 +1410,26 @@
 	return &vnode->vfs_inode;
 }
 
-static inline void afs_vnode_commit_status(struct afs_fs_cursor *fc,
-					   struct afs_vnode *vnode,
-					   unsigned int cb_break)
-{
-	if (fc->ac.error == 0)
-		afs_cache_permit(vnode, fc->key, cb_break);
-}
-
 static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc,
 						 struct afs_vnode *vnode)
 {
 	if (fc->ac.error == -ENOENT) {
 		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		afs_break_callback(vnode);
+		afs_break_callback(vnode, afs_cb_break_for_deleted);
 	}
 }
 
+static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
+{
+	trace_afs_io_error(call->debug_id, -EIO, where);
+	return -EIO;
+}
+
+static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
+{
+	trace_afs_file_error(vnode, -EIO, where);
+	return -EIO;
+}
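/*
 * Usage sketch (illustrative): both helpers fold the tracepoint and the
 * -EIO return into a single expression, e.g. the mount-point read path
 * later in this patch essentially does:
 *
 *	if (PageError(page))
 *		return afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
 */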
 
 /*****************************************************************************/
 /*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1074276..c9c45d7 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS client file system
  *
  * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 700a5fa..52b19e9 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* miscellaneous bits
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -14,6 +10,7 @@
 #include <linux/errno.h>
 #include "internal.h"
 #include "afs_fs.h"
+#include "protocol_uae.h"
 
 /*
  * convert an AFS abort code to a Linux error number
@@ -69,34 +66,25 @@
 	case AFSVL_PERM:		return -EACCES;
 	case AFSVL_NOMEM:		return -EREMOTEIO;
 
-		/* Unified AFS error table; ET "uae" == 0x2f6df00 */
-	case 0x2f6df00:		return -EPERM;
-	case 0x2f6df01:		return -ENOENT;
-	case 0x2f6df04:		return -EIO;
-	case 0x2f6df0a:		return -EAGAIN;
-	case 0x2f6df0b:		return -ENOMEM;
-	case 0x2f6df0c:		return -EACCES;
-	case 0x2f6df0f:		return -EBUSY;
-	case 0x2f6df10:		return -EEXIST;
-	case 0x2f6df11:		return -EXDEV;
-	case 0x2f6df12:		return -ENODEV;
-	case 0x2f6df13:		return -ENOTDIR;
-	case 0x2f6df14:		return -EISDIR;
-	case 0x2f6df15:		return -EINVAL;
-	case 0x2f6df1a:		return -EFBIG;
-	case 0x2f6df1b:		return -ENOSPC;
-	case 0x2f6df1d:		return -EROFS;
-	case 0x2f6df1e:		return -EMLINK;
-	case 0x2f6df20:		return -EDOM;
-	case 0x2f6df21:		return -ERANGE;
-	case 0x2f6df22:		return -EDEADLK;
-	case 0x2f6df23:		return -ENAMETOOLONG;
-	case 0x2f6df24:		return -ENOLCK;
-	case 0x2f6df26:		return -ENOTEMPTY;
-	case 0x2f6df28:		return -EWOULDBLOCK;
-	case 0x2f6df69:		return -ENOTCONN;
-	case 0x2f6df6c:		return -ETIMEDOUT;
-	case 0x2f6df78:		return -EDQUOT;
+		/* Unified AFS error table */
+	case UAEPERM:			return -EPERM;
+	case UAENOENT:			return -ENOENT;
+	case UAEACCES:			return -EACCES;
+	case UAEBUSY:			return -EBUSY;
+	case UAEEXIST:			return -EEXIST;
+	case UAENOTDIR:			return -ENOTDIR;
+	case UAEISDIR:			return -EISDIR;
+	case UAEFBIG:			return -EFBIG;
+	case UAENOSPC:			return -ENOSPC;
+	case UAEROFS:			return -EROFS;
+	case UAEMLINK:			return -EMLINK;
+	case UAEDEADLK:			return -EDEADLK;
+	case UAENAMETOOLONG:		return -ENAMETOOLONG;
+	case UAENOLCK:			return -ENOLCK;
+	case UAENOTEMPTY:		return -ENOTEMPTY;
+	case UAELOOP:			return -ELOOP;
+	case UAENOMEDIUM:		return -ENOMEDIUM;
+	case UAEDQUOT:			return -EDQUOT;
 
 		/* RXKAD abort codes; from include/rxrpc/packet.h.  ET "RXK" == 0x1260B00 */
 	case RXKADINCONSISTENCY: return -EPROTO;
@@ -118,3 +106,64 @@
 	default:		return -EREMOTEIO;
 	}
 }
+
+/*
+ * Select the error to report from a set of errors.
+ */
+void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+{
+	switch (error) {
+	case 0:
+		return;
+	default:
+		if (e->error == -ETIMEDOUT ||
+		    e->error == -ETIME)
+			return;
+		/* Fall through */
+	case -ETIMEDOUT:
+	case -ETIME:
+		if (e->error == -ENOMEM ||
+		    e->error == -ENONET)
+			return;
+		/* Fall through */
+	case -ENOMEM:
+	case -ENONET:
+		if (e->error == -ERFKILL)
+			return;
+		/* Fall through */
+	case -ERFKILL:
+		if (e->error == -EADDRNOTAVAIL)
+			return;
+		/* Fall through */
+	case -EADDRNOTAVAIL:
+		if (e->error == -ENETUNREACH)
+			return;
+		/* Fall through */
+	case -ENETUNREACH:
+		if (e->error == -EHOSTUNREACH)
+			return;
+		/* Fall through */
+	case -EHOSTUNREACH:
+		if (e->error == -EHOSTDOWN)
+			return;
+		/* Fall through */
+	case -EHOSTDOWN:
+		if (e->error == -ECONNREFUSED)
+			return;
+		/* Fall through */
+	case -ECONNREFUSED:
+		if (e->error == -ECONNRESET)
+			return;
+		/* Fall through */
+	case -ECONNRESET: /* Responded, but call expired. */
+		if (e->responded)
+			return;
+		e->error = error;
+		return;
+
+	case -ECONNABORTED:
+		e->responded = true;
+		e->error = afs_abort_to_error(abort_code);
+		return;
+	}
+}
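The switch above encodes a fixed preference order: an incoming error only
displaces the stored one if the stored error does not already sit further
down the cascade, and an abort from a server that actually responded beats
any transport failure.  A condensed sketch of the aggregation loop this
enables (mirroring the rotate.c caller added later in this patch; `slist`
is a hypothetical server list):

	struct afs_error e = {
		.error		= -EDESTADDRREQ,	/* weakest default */
		.responded	= false,
	};
	int i;

	/* Fold per-server probe results into one representative error. */
	for (i = 0; i < slist->nr_servers; i++) {
		struct afs_server *s = slist->servers[i].server;

		afs_prioritise_error(&e, READ_ONCE(s->probe.error),
				     s->probe.abort_code);
	}
	return e.error;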
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 99fd135..f532d6d 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* mountpoint management
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -17,6 +13,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/gfp.h>
+#include <linux/fs_context.h>
 #include "internal.h"
 
 
@@ -47,6 +44,8 @@
 
 static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
 
+static const char afs_root_volume[] = "root.cell";
+
 /*
  * no valid lookup procedure on this sort of dir
  */
@@ -68,107 +67,112 @@
 }
 
 /*
+ * Set the parameters for the proposed superblock.
+ */
+static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
+{
+	struct afs_fs_context *ctx = fc->fs_private;
+	struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb);
+	struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt));
+	struct afs_cell *cell;
+	const char *p;
+	int ret;
+
+	if (fc->net_ns != src_as->net_ns) {
+		put_net(fc->net_ns);
+		fc->net_ns = get_net(src_as->net_ns);
+	}
+
+	if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) {
+		ctx->type = AFSVL_RWVOL;
+		ctx->force = true;
+	}
+	if (ctx->cell) {
+		afs_put_cell(ctx->net, ctx->cell);
+		ctx->cell = NULL;
+	}
+	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
+		/* if the directory is a pseudo directory, use the d_name */
+		unsigned size = mntpt->d_name.len;
+
+		if (size < 2)
+			return -ENOENT;
+
+		p = mntpt->d_name.name;
+		if (mntpt->d_name.name[0] == '.') {
+			size--;
+			p++;
+			ctx->type = AFSVL_RWVOL;
+			ctx->force = true;
+		}
+		if (size > AFS_MAXCELLNAME)
+			return -ENAMETOOLONG;
+
+		cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+		if (IS_ERR(cell)) {
+			pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
+			return PTR_ERR(cell);
+		}
+		ctx->cell = cell;
+
+		ctx->volname = afs_root_volume;
+		ctx->volnamesz = sizeof(afs_root_volume) - 1;
+	} else {
+		/* read the contents of the AFS special symlink */
+		struct page *page;
+		loff_t size = i_size_read(d_inode(mntpt));
+		char *buf;
+
+		if (src_as->cell)
+			ctx->cell = afs_get_cell(src_as->cell);
+
+		if (size > PAGE_SIZE - 1)
+			return -EINVAL;
+
+		page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		if (PageError(page)) {
+			ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
+			put_page(page);
+			return ret;
+		}
+
+		buf = kmap(page);
+		ret = vfs_parse_fs_string(fc, "source", buf, size);
+		kunmap(page);
+		put_page(page);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
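In the non-pseudo case, the symlink body handed to vfs_parse_fs_string() as
"source" keeps the classic AFS mount-point syntax, for example
(illustrative):

	#example.com:root.cell.		prefer a read-only volume instance
	%example.com:root.cell.		force the read-write volume

which replaces the hand-built devname/options pair shown being removed
below.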
+
+/*
  * create a vfsmount to be automounted
  */
 static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
-	struct afs_super_info *as;
+	struct fs_context *fc;
 	struct vfsmount *mnt;
-	struct afs_vnode *vnode;
-	struct page *page;
-	char *devname, *options;
-	bool rwpath = false;
 	int ret;
 
-	_enter("{%pd}", mntpt);
-
 	BUG_ON(!d_inode(mntpt));
 
-	ret = -ENOMEM;
-	devname = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!devname)
-		goto error_no_devname;
+	fc = fs_context_for_submount(&afs_fs_type, mntpt);
+	if (IS_ERR(fc))
+		return ERR_CAST(fc);
 
-	options = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!options)
-		goto error_no_options;
+	ret = afs_mntpt_set_params(fc, mntpt);
+	if (!ret)
+		mnt = fc_mount(fc);
+	else
+		mnt = ERR_PTR(ret);
 
-	vnode = AFS_FS_I(d_inode(mntpt));
-	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
-		/* if the directory is a pseudo directory, use the d_name */
-		static const char afs_root_cell[] = ":root.cell.";
-		unsigned size = mntpt->d_name.len;
-
-		ret = -ENOENT;
-		if (size < 2 || size > AFS_MAXCELLNAME)
-			goto error_no_page;
-
-		if (mntpt->d_name.name[0] == '.') {
-			devname[0] = '%';
-			memcpy(devname + 1, mntpt->d_name.name + 1, size - 1);
-			memcpy(devname + size, afs_root_cell,
-			       sizeof(afs_root_cell));
-			rwpath = true;
-		} else {
-			devname[0] = '#';
-			memcpy(devname + 1, mntpt->d_name.name, size);
-			memcpy(devname + size + 1, afs_root_cell,
-			       sizeof(afs_root_cell));
-		}
-	} else {
-		/* read the contents of the AFS special symlink */
-		loff_t size = i_size_read(d_inode(mntpt));
-		char *buf;
-
-		ret = -EINVAL;
-		if (size > PAGE_SIZE - 1)
-			goto error_no_page;
-
-		page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
-		if (IS_ERR(page)) {
-			ret = PTR_ERR(page);
-			goto error_no_page;
-		}
-
-		ret = -EIO;
-		if (PageError(page))
-			goto error;
-
-		buf = kmap_atomic(page);
-		memcpy(devname, buf, size);
-		kunmap_atomic(buf);
-		put_page(page);
-		page = NULL;
-	}
-
-	/* work out what options we want */
-	as = AFS_FS_S(mntpt->d_sb);
-	if (as->cell) {
-		memcpy(options, "cell=", 5);
-		strcpy(options + 5, as->cell->name);
-		if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath)
-			strcat(options, ",rwpath");
-	}
-
-	/* try and do the mount */
-	_debug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
-	_debug("--- mount result %p ---", mnt);
-
-	free_page((unsigned long) devname);
-	free_page((unsigned long) options);
-	_leave(" = %p", mnt);
+	put_fs_context(fc);
 	return mnt;
-
-error:
-	put_page(page);
-error_no_page:
-	free_page((unsigned long) options);
-error_no_options:
-	free_page((unsigned long) devname);
-error_no_devname:
-	_leave(" = %d", ret);
-	return ERR_PTR(ret);
 }
 
 /*
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
deleted file mode 100644
index 2a009d1..0000000
--- a/fs/afs/netdevices.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* AFS network device helpers
- *
- * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
- */
-
-#include <linux/string.h>
-#include <linux/rtnetlink.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <net/net_namespace.h>
-#include "internal.h"
-
-/*
- * get a list of this system's interface IPv4 addresses, netmasks and MTUs
- * - maxbufs must be at least 1
- * - returns the number of interface records in the buffer
- */
-int afs_get_ipv4_interfaces(struct afs_net *net, struct afs_interface *bufs,
-			    size_t maxbufs, bool wantloopback)
-{
-	struct net_device *dev;
-	struct in_device *idev;
-	int n = 0;
-
-	ASSERT(maxbufs > 0);
-
-	rtnl_lock();
-	for_each_netdev(net->net, dev) {
-		if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
-			continue;
-		idev = __in_dev_get_rtnl(dev);
-		if (!idev)
-			continue;
-		for_primary_ifa(idev) {
-			bufs[n].address.s_addr = ifa->ifa_address;
-			bufs[n].netmask.s_addr = ifa->ifa_mask;
-			bufs[n].mtu = dev->mtu;
-			n++;
-			if (n >= maxbufs)
-				goto out;
-		} endfor_ifa(idev);
-	}
-out:
-	rtnl_unlock();
-	return n;
-}
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 9101f62..fba2ec3 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* /proc interface for AFS
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
@@ -17,6 +13,11 @@
 #include <linux/uaccess.h>
 #include "internal.h"
 
+struct afs_vl_seq_net_private {
+	struct seq_net_private		seq;	/* Must be first */
+	struct afs_vlserver_list	*vllist;
+};
+
 static inline struct afs_net *afs_seq2net(struct seq_file *m)
 {
 	return afs_net(seq_file_net(m));
@@ -32,16 +33,24 @@
  */
 static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
-	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
+	struct afs_vlserver_list *vllist;
+	struct afs_cell *cell;
 
 	if (v == SEQ_START_TOKEN) {
 		/* display header on line 1 */
-		seq_puts(m, "USE NAME\n");
+		seq_puts(m, "USE    TTL SV NAME\n");
 		return 0;
 	}
 
+	cell = list_entry(v, struct afs_cell, proc_link);
+	vllist = rcu_dereference(cell->vl_servers);
+
 	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name);
+	seq_printf(m, "%3u %6lld %2u %s\n",
+		   atomic_read(&cell->usage),
+		   cell->dns_expiry - ktime_get_real_seconds(),
+		   vllist->nr_servers,
+		   cell->name);
 	return 0;
 }
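With the extra columns, a read of /proc/fs/afs/cells now looks roughly like
this (values illustrative):

	USE    TTL SV NAME
	  3   3600  2 example.com
	  1     59  1 grand.central.org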
 
@@ -208,7 +217,7 @@
 		return 0;
 	}
 
-	seq_printf(m, "%3d %08x %s\n",
+	seq_printf(m, "%3d %08llx %s\n",
 		   atomic_read(&vol->usage), vol->vid,
 		   afs_vol_types[vol->type]);
 
@@ -247,61 +256,102 @@
 	.show	= afs_proc_cell_volumes_show,
 };
 
+static const char *const dns_record_sources[NR__dns_record_source + 1] = {
+	[DNS_RECORD_UNAVAILABLE]	= "unav",
+	[DNS_RECORD_FROM_CONFIG]	= "cfg",
+	[DNS_RECORD_FROM_DNS_A]		= "A",
+	[DNS_RECORD_FROM_DNS_AFSDB]	= "AFSDB",
+	[DNS_RECORD_FROM_DNS_SRV]	= "SRV",
+	[DNS_RECORD_FROM_NSS]		= "nss",
+	[NR__dns_record_source]		= "[weird]"
+};
+
+static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = {
+	[DNS_LOOKUP_NOT_DONE]		= "no-lookup",
+	[DNS_LOOKUP_GOOD]		= "good",
+	[DNS_LOOKUP_GOOD_WITH_BAD]	= "good/bad",
+	[DNS_LOOKUP_BAD]		= "bad",
+	[DNS_LOOKUP_GOT_NOT_FOUND]	= "not-found",
+	[DNS_LOOKUP_GOT_LOCAL_FAILURE]	= "local-failure",
+	[DNS_LOOKUP_GOT_TEMP_FAILURE]	= "temp-failure",
+	[DNS_LOOKUP_GOT_NS_FAILURE]	= "ns-failure",
+	[NR__dns_lookup_status]		= "[weird]"
+};
+
 /*
  * Display the list of Volume Location servers we're using for a cell.
  */
 static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 {
-	struct sockaddr_rxrpc *addr = v;
+	const struct afs_vl_seq_net_private *priv = m->private;
+	const struct afs_vlserver_list *vllist = priv->vllist;
+	const struct afs_vlserver_entry *entry;
+	const struct afs_vlserver *vlserver;
+	const struct afs_addr_list *alist;
+	int i;
 
-	/* display header on line 1 */
-	if (v == (void *)1) {
-		seq_puts(m, "ADDRESS\n");
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(m, "# source %s, status %s\n",
+			   dns_record_sources[vllist ? vllist->source : 0],
+			   dns_lookup_statuses[vllist ? vllist->status : 0]);
 		return 0;
 	}
 
-	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%pISp\n", &addr->transport);
+	entry = v;
+	vlserver = entry->server;
+	alist = rcu_dereference(vlserver->addresses);
+
+	seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n",
+		   vlserver->name, entry->priority, entry->weight,
+		   dns_record_sources[alist ? alist->source : entry->source],
+		   dns_lookup_statuses[alist ? alist->status : entry->status]);
+	if (alist) {
+		for (i = 0; i < alist->nr_addrs; i++)
+			seq_printf(m, " %c %pISpc\n",
+				   alist->preferred == i ? '>' : '-',
+				   &alist->addrs[i].transport);
+	}
 	return 0;
 }
 
 static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 	__acquires(rcu)
 {
-	struct afs_addr_list *alist;
+	struct afs_vl_seq_net_private *priv = m->private;
+	struct afs_vlserver_list *vllist;
 	struct afs_cell *cell = PDE_DATA(file_inode(m->file));
 	loff_t pos = *_pos;
 
 	rcu_read_lock();
 
-	alist = rcu_dereference(cell->vl_addrs);
+	vllist = rcu_dereference(cell->vl_servers);
+	priv->vllist = vllist;
 
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
+	if (pos < 0)
+		*_pos = pos = 0;
+	if (pos == 0)
+		return SEQ_START_TOKEN;
 
-	if (!alist || pos >= alist->nr_addrs)
+	if (pos - 1 >= vllist->nr_servers)
 		return NULL;
 
-	return alist->addrs + pos;
+	return &vllist->servers[pos - 1];
 }
 
 static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v,
 					  loff_t *_pos)
 {
-	struct afs_addr_list *alist;
-	struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+	struct afs_vl_seq_net_private *priv = m->private;
+	struct afs_vlserver_list *vllist = priv->vllist;
 	loff_t pos;
 
-	alist = rcu_dereference(cell->vl_addrs);
-
 	pos = *_pos;
-	(*_pos)++;
-	if (!alist || pos >= alist->nr_addrs)
+	pos++;
+	*_pos = pos;
+	if (!vllist || pos - 1 >= vllist->nr_servers)
 		return NULL;
 
-	return alist->addrs + pos;
+	return &vllist->servers[pos - 1];
 }
 
 static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v)
@@ -337,11 +387,11 @@
 		   &server->uuid,
 		   atomic_read(&server->usage),
 		   &alist->addrs[0].transport,
-		   alist->index == 0 ? "*" : "");
+		   alist->preferred == 0 ? "*" : "");
 	for (i = 1; i < alist->nr_addrs; i++)
 		seq_printf(m, "                                         %pISpc%s\n",
 			   &alist->addrs[i].transport,
-			   alist->index == i ? "*" : "");
+			   alist->preferred == i ? "*" : "");
 	return 0;
 }
 
@@ -562,7 +612,7 @@
 
 	if (!proc_create_net_data("vlservers", 0444, dir,
 				  &afs_proc_cell_vlservers_ops,
-				  sizeof(struct seq_net_private),
+				  sizeof(struct afs_vl_seq_net_private),
 				  cell) ||
 	    !proc_create_net_data("volumes", 0444, dir,
 				  &afs_proc_cell_volumes_ops,
diff --git a/fs/afs/protocol_uae.h b/fs/afs/protocol_uae.h
new file mode 100644
index 0000000..1b3d106
--- /dev/null
+++ b/fs/afs/protocol_uae.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
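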
+/* Universal AFS Error codes (UAE).
+ *
+ * Copyright (C) 2003, Daria Phoebe Brashear
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ */
+
+enum {
+	UAEPERM			= 0x2f6df00, /* Operation not permitted */
+	UAENOENT		= 0x2f6df01, /* No such file or directory */
+	UAESRCH			= 0x2f6df02, /* No such process */
+	UAEINTR			= 0x2f6df03, /* Interrupted system call */
+	UAEIO			= 0x2f6df04, /* I/O error */
+	UAENXIO			= 0x2f6df05, /* No such device or address */
+	UAE2BIG			= 0x2f6df06, /* Arg list too long */
+	UAENOEXEC		= 0x2f6df07, /* Exec format error */
+	UAEBADF			= 0x2f6df08, /* Bad file number */
+	UAECHILD		= 0x2f6df09, /* No child processes */
+	UAEAGAIN		= 0x2f6df0a, /* Try again */
+	UAENOMEM		= 0x2f6df0b, /* Out of memory */
+	UAEACCES		= 0x2f6df0c, /* Permission denied */
+	UAEFAULT		= 0x2f6df0d, /* Bad address */
+	UAENOTBLK		= 0x2f6df0e, /* Block device required */
+	UAEBUSY			= 0x2f6df0f, /* Device or resource busy */
+	UAEEXIST		= 0x2f6df10, /* File exists */
+	UAEXDEV			= 0x2f6df11, /* Cross-device link */
+	UAENODEV		= 0x2f6df12, /* No such device */
+	UAENOTDIR		= 0x2f6df13, /* Not a directory */
+	UAEISDIR		= 0x2f6df14, /* Is a directory */
+	UAEINVAL		= 0x2f6df15, /* Invalid argument */
+	UAENFILE		= 0x2f6df16, /* File table overflow */
+	UAEMFILE		= 0x2f6df17, /* Too many open files */
+	UAENOTTY		= 0x2f6df18, /* Not a typewriter */
+	UAETXTBSY		= 0x2f6df19, /* Text file busy */
+	UAEFBIG			= 0x2f6df1a, /* File too large */
+	UAENOSPC		= 0x2f6df1b, /* No space left on device */
+	UAESPIPE		= 0x2f6df1c, /* Illegal seek */
+	UAEROFS			= 0x2f6df1d, /* Read-only file system */
+	UAEMLINK		= 0x2f6df1e, /* Too many links */
+	UAEPIPE			= 0x2f6df1f, /* Broken pipe */
+	UAEDOM			= 0x2f6df20, /* Math argument out of domain of func */
+	UAERANGE		= 0x2f6df21, /* Math result not representable */
+	UAEDEADLK		= 0x2f6df22, /* Resource deadlock would occur */
+	UAENAMETOOLONG		= 0x2f6df23, /* File name too long */
+	UAENOLCK		= 0x2f6df24, /* No record locks available */
+	UAENOSYS		= 0x2f6df25, /* Function not implemented */
+	UAENOTEMPTY		= 0x2f6df26, /* Directory not empty */
+	UAELOOP			= 0x2f6df27, /* Too many symbolic links encountered */
+	UAEWOULDBLOCK		= 0x2f6df28, /* Operation would block */
+	UAENOMSG		= 0x2f6df29, /* No message of desired type */
+	UAEIDRM			= 0x2f6df2a, /* Identifier removed */
+	UAECHRNG		= 0x2f6df2b, /* Channel number out of range */
+	UAEL2NSYNC		= 0x2f6df2c, /* Level 2 not synchronized */
+	UAEL3HLT		= 0x2f6df2d, /* Level 3 halted */
+	UAEL3RST		= 0x2f6df2e, /* Level 3 reset */
+	UAELNRNG		= 0x2f6df2f, /* Link number out of range */
+	UAEUNATCH		= 0x2f6df30, /* Protocol driver not attached */
+	UAENOCSI		= 0x2f6df31, /* No CSI structure available */
+	UAEL2HLT		= 0x2f6df32, /* Level 2 halted */
+	UAEBADE			= 0x2f6df33, /* Invalid exchange */
+	UAEBADR			= 0x2f6df34, /* Invalid request descriptor */
+	UAEXFULL		= 0x2f6df35, /* Exchange full */
+	UAENOANO		= 0x2f6df36, /* No anode */
+	UAEBADRQC		= 0x2f6df37, /* Invalid request code */
+	UAEBADSLT		= 0x2f6df38, /* Invalid slot */
+	UAEBFONT		= 0x2f6df39, /* Bad font file format */
+	UAENOSTR		= 0x2f6df3a, /* Device not a stream */
+	UAENODATA		= 0x2f6df3b, /* No data available */
+	UAETIME			= 0x2f6df3c, /* Timer expired */
+	UAENOSR			= 0x2f6df3d, /* Out of streams resources */
+	UAENONET		= 0x2f6df3e, /* Machine is not on the network */
+	UAENOPKG		= 0x2f6df3f, /* Package not installed */
+	UAEREMOTE		= 0x2f6df40, /* Object is remote */
+	UAENOLINK		= 0x2f6df41, /* Link has been severed */
+	UAEADV			= 0x2f6df42, /* Advertise error */
+	UAESRMNT		= 0x2f6df43, /* Srmount error */
+	UAECOMM			= 0x2f6df44, /* Communication error on send */
+	UAEPROTO		= 0x2f6df45, /* Protocol error */
+	UAEMULTIHOP		= 0x2f6df46, /* Multihop attempted */
+	UAEDOTDOT		= 0x2f6df47, /* RFS specific error */
+	UAEBADMSG		= 0x2f6df48, /* Not a data message */
+	UAEOVERFLOW		= 0x2f6df49, /* Value too large for defined data type */
+	UAENOTUNIQ		= 0x2f6df4a, /* Name not unique on network */
+	UAEBADFD		= 0x2f6df4b, /* File descriptor in bad state */
+	UAEREMCHG		= 0x2f6df4c, /* Remote address changed */
+	UAELIBACC		= 0x2f6df4d, /* Can not access a needed shared library */
+	UAELIBBAD		= 0x2f6df4e, /* Accessing a corrupted shared library */
+	UAELIBSCN		= 0x2f6df4f, /* .lib section in a.out corrupted */
+	UAELIBMAX		= 0x2f6df50, /* Attempting to link in too many shared libraries */
+	UAELIBEXEC		= 0x2f6df51, /* Cannot exec a shared library directly */
+	UAEILSEQ		= 0x2f6df52, /* Illegal byte sequence */
+	UAERESTART		= 0x2f6df53, /* Interrupted system call should be restarted */
+	UAESTRPIPE		= 0x2f6df54, /* Streams pipe error */
+	UAEUSERS		= 0x2f6df55, /* Too many users */
+	UAENOTSOCK		= 0x2f6df56, /* Socket operation on non-socket */
+	UAEDESTADDRREQ		= 0x2f6df57, /* Destination address required */
+	UAEMSGSIZE		= 0x2f6df58, /* Message too long */
+	UAEPROTOTYPE		= 0x2f6df59, /* Protocol wrong type for socket */
+	UAENOPROTOOPT		= 0x2f6df5a, /* Protocol not available */
+	UAEPROTONOSUPPORT	= 0x2f6df5b, /* Protocol not supported */
+	UAESOCKTNOSUPPORT	= 0x2f6df5c, /* Socket type not supported */
+	UAEOPNOTSUPP		= 0x2f6df5d, /* Operation not supported on transport endpoint */
+	UAEPFNOSUPPORT		= 0x2f6df5e, /* Protocol family not supported */
+	UAEAFNOSUPPORT		= 0x2f6df5f, /* Address family not supported by protocol */
+	UAEADDRINUSE		= 0x2f6df60, /* Address already in use */
+	UAEADDRNOTAVAIL		= 0x2f6df61, /* Cannot assign requested address */
+	UAENETDOWN		= 0x2f6df62, /* Network is down */
+	UAENETUNREACH		= 0x2f6df63, /* Network is unreachable */
+	UAENETRESET		= 0x2f6df64, /* Network dropped connection because of reset */
+	UAECONNABORTED		= 0x2f6df65, /* Software caused connection abort */
+	UAECONNRESET		= 0x2f6df66, /* Connection reset by peer */
+	UAENOBUFS		= 0x2f6df67, /* No buffer space available */
+	UAEISCONN		= 0x2f6df68, /* Transport endpoint is already connected */
+	UAENOTCONN		= 0x2f6df69, /* Transport endpoint is not connected */
+	UAESHUTDOWN		= 0x2f6df6a, /* Cannot send after transport endpoint shutdown */
+	UAETOOMANYREFS		= 0x2f6df6b, /* Too many references: cannot splice */
+	UAETIMEDOUT		= 0x2f6df6c, /* Connection timed out */
+	UAECONNREFUSED		= 0x2f6df6d, /* Connection refused */
+	UAEHOSTDOWN		= 0x2f6df6e, /* Host is down */
+	UAEHOSTUNREACH		= 0x2f6df6f, /* No route to host */
+	UAEALREADY		= 0x2f6df70, /* Operation already in progress */
+	UAEINPROGRESS		= 0x2f6df71, /* Operation now in progress */
+	UAESTALE		= 0x2f6df72, /* Stale NFS file handle */
+	UAEUCLEAN		= 0x2f6df73, /* Structure needs cleaning */
+	UAENOTNAM		= 0x2f6df74, /* Not a XENIX named type file */
+	UAENAVAIL		= 0x2f6df75, /* No XENIX semaphores available */
+	UAEISNAM		= 0x2f6df76, /* Is a named type file */
+	UAEREMOTEIO		= 0x2f6df77, /* Remote I/O error */
+	UAEDQUOT		= 0x2f6df78, /* Quota exceeded */
+	UAENOMEDIUM		= 0x2f6df79, /* No medium found */
+	UAEMEDIUMTYPE		= 0x2f6df7a, /* Wrong medium type */
+};
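Every entry shares the 0x2f6df00 base recorded in the old misc.c comment
(the com_err packing of the table name "uae"); only the low byte
distinguishes the codes.  A trivial sanity sketch (illustrative, not part
of the patch):

	BUILD_BUG_ON(UAEPERM != 0x2f6df00);
	BUILD_BUG_ON((UAEDQUOT & ~0xff) != 0x2f6df00);

	/* And the misc.c table maps them back onto local errnos: */
	WARN_ON(afs_abort_to_error(UAEDQUOT) != -EDQUOT);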
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
new file mode 100644
index 0000000..32be9c6
--- /dev/null
+++ b/fs/afs/protocol_yfs.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* YFS protocol bits
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define YFS_FS_SERVICE	2500
+#define YFS_CM_SERVICE	2501
+
+#define YFSCBMAX 1024
+
+enum YFS_CM_Operations {
+	YFSCBProbe		= 206,	/* probe client */
+	YFSCBGetLock		= 207,	/* get contents of CM lock table */
+	YFSCBXStatsVersion	= 209,	/* get version of extended statistics */
+	YFSCBGetXStats		= 210,	/* get contents of extended statistics data */
+	YFSCBInitCallBackState3	= 213,	/* initialise callback state, version 3 */
+	YFSCBProbeUuid		= 214,	/* check the client hasn't rebooted */
+	YFSCBGetServerPrefs	= 215,
+	YFSCBGetCellServDV	= 216,
+	YFSCBGetLocalCell	= 217,
+	YFSCBGetCacheConfig	= 218,
+	YFSCBGetCellByNum	= 65537,
+	YFSCBTellMeAboutYourself = 65538, /* get client capabilities */
+	YFSCBCallBack		= 64204,
+};
+
+enum YFS_FS_Operations {
+	YFSFETCHACL		= 64131, /* YFS Fetch file AFS3 ACL */
+	YFSFETCHSTATUS		= 64132, /* YFS Fetch file status */
+	YFSSTOREACL		= 64134, /* YFS Store file AFS3 ACL */
+	YFSSTORESTATUS		= 64135, /* YFS Store file status */
+	YFSREMOVEFILE		= 64136, /* YFS Remove a file */
+	YFSCREATEFILE		= 64137, /* YFS Create a file */
+	YFSRENAME		= 64138, /* YFS Rename or move a file or directory */
+	YFSSYMLINK		= 64139, /* YFS Create a symbolic link */
+	YFSLINK			= 64140, /* YFS Create a hard link */
+	YFSMAKEDIR		= 64141, /* YFS Create a directory */
+	YFSREMOVEDIR		= 64142, /* YFS Remove a directory */
+	YFSGETVOLUMESTATUS	= 64149, /* YFS Get volume status information */
+	YFSSETVOLUMESTATUS	= 64150, /* YFS Set volume status information */
+	YFSSETLOCK		= 64156, /* YFS Request a file lock */
+	YFSEXTENDLOCK		= 64157, /* YFS Extend a file lock */
+	YFSRELEASELOCK		= 64158, /* YFS Release a file lock */
+	YFSLOOKUP		= 64161, /* YFS lookup file in directory */
+	YFSFLUSHCPS		= 64165,
+	YFSFETCHOPAQUEACL	= 64168, /* YFS Fetch file YFS ACL */
+	YFSWHOAMI		= 64170,
+	YFSREMOVEACL		= 64171,
+	YFSREMOVEFILE2		= 64173,
+	YFSSTOREOPAQUEACL2	= 64174,
+	YFSINLINEBULKSTATUS	= 64536, /* YFS Fetch multiple file statuses with errors */
+	YFSFETCHDATA64		= 64537, /* YFS Fetch file data */
+	YFSSTOREDATA64		= 64538, /* YFS Store file data */
+	YFSUPDATESYMLINK	= 64540,
+};
+
+struct yfs_xdr_u64 {
+	__be32			msw;
+	__be32			lsw;
+} __packed;
+
+static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x)
+{
+	return ((u64)ntohl(x.msw) << 32) | ntohl(x.lsw);
+}
+
+static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x)
+{
+	return (struct yfs_xdr_u64){ .msw = htonl(x >> 32), .lsw = htonl(x) };
+}
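/*
 * Worked example (illustrative): the value 0x0000000100000002 travels as
 * msw = htonl(0x00000001), lsw = htonl(0x00000002); xdr_to_u64() then
 * reassembles ((u64)1 << 32) | 2 regardless of host byte order.
 */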
+
+struct yfs_xdr_vnode {
+	struct yfs_xdr_u64	lo;
+	__be32			hi;
+	__be32			unique;
+} __packed;
+
+struct yfs_xdr_YFSFid {
+	struct yfs_xdr_u64	volume;
+	struct yfs_xdr_vnode	vnode;
+} __packed;
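/*
 * Layout note (illustrative): with __packed, yfs_xdr_u64 is 8 bytes and
 * yfs_xdr_vnode 16, so a YFSFid is exactly 24 octets on the wire, e.g.:
 *
 *	_Static_assert(sizeof(struct yfs_xdr_YFSFid) == 24, "YFSFid XDR size");
 */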
+
+struct yfs_xdr_YFSFetchStatus {
+	__be32			type;
+	__be32			nlink;
+	struct yfs_xdr_u64	size;
+	struct yfs_xdr_u64	data_version;
+	struct yfs_xdr_u64	author;
+	struct yfs_xdr_u64	owner;
+	struct yfs_xdr_u64	group;
+	__be32			mode;
+	__be32			caller_access;
+	__be32			anon_access;
+	struct yfs_xdr_vnode	parent;
+	__be32			data_access_protocol;
+	struct yfs_xdr_u64	mtime_client;
+	struct yfs_xdr_u64	mtime_server;
+	__be32			lock_count;
+	__be32			abort_code;
+} __packed;
+
+struct yfs_xdr_YFSCallBack {
+	__be32			version;
+	struct yfs_xdr_u64	expiration_time;
+	__be32			type;
+} __packed;
+
+struct yfs_xdr_YFSStoreStatus {
+	__be32			mask;
+	__be32			mode;
+	struct yfs_xdr_u64	mtime_client;
+	struct yfs_xdr_u64	owner;
+	struct yfs_xdr_u64	group;
+} __packed;
+
+struct yfs_xdr_RPCFlags {
+	__be32			rpc_flags;
+} __packed;
+
+struct yfs_xdr_YFSVolSync {
+	struct yfs_xdr_u64	vol_creation_date;
+	struct yfs_xdr_u64	vol_update_date;
+	struct yfs_xdr_u64	max_quota;
+	struct yfs_xdr_u64	blocks_in_use;
+	struct yfs_xdr_u64	blocks_avail;
+} __packed;
+
+enum yfs_volume_type {
+	yfs_volume_type_ro = 0,
+	yfs_volume_type_rw = 1,
+};
+
+#define yfs_FVSOnline		0x1
+#define yfs_FVSInservice	0x2
+#define yfs_FVSBlessed		0x4
+#define yfs_FVSNeedsSalvage	0x8
+
+struct yfs_xdr_YFSFetchVolumeStatus {
+	struct yfs_xdr_u64	vid;
+	struct yfs_xdr_u64	parent_id;
+	__be32			flags;
+	__be32			type;
+	struct yfs_xdr_u64	max_quota;
+	struct yfs_xdr_u64	blocks_in_use;
+	struct yfs_xdr_u64	part_blocks_avail;
+	struct yfs_xdr_u64	part_max_blocks;
+	struct yfs_xdr_u64	vol_copy_date;
+	struct yfs_xdr_u64	vol_backup_date;
+} __packed;
+
+struct yfs_xdr_YFSStoreVolumeStatus {
+	__be32			mask;
+	struct yfs_xdr_u64	min_quota;
+	struct yfs_xdr_u64	max_quota;
+	struct yfs_xdr_u64	file_quota;
+} __packed;
+
+enum yfs_lock_type {
+	yfs_LockNone		= -1,
+	yfs_LockRead		= 0,
+	yfs_LockWrite		= 1,
+	yfs_LockExtend		= 2,
+	yfs_LockRelease		= 3,
+	yfs_LockMandatoryRead	= 0x100,
+	yfs_LockMandatoryWrite	= 0x101,
+	yfs_LockMandatoryExtend	= 0x102,
+};
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 1faef56..172ba56 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Handle fileserver selection and rotation.
  *
  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -19,31 +15,29 @@
 #include "afs_fs.h"
 
 /*
- * Initialise a filesystem server cursor for iterating over FS servers.
- */
-static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
-{
-	memset(fc, 0, sizeof(*fc));
-}
-
-/*
  * Begin an operation on the fileserver.
  *
  * Fileserver operations are serialised on the server by vnode, so we serialise
  * them here also using the io_lock.
  */
 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-			       struct key *key)
+			       struct key *key, bool intr)
 {
-	afs_init_fs_cursor(fc, vnode);
+	memset(fc, 0, sizeof(*fc));
 	fc->vnode = vnode;
 	fc->key = key;
 	fc->ac.error = SHRT_MAX;
+	fc->error = -EDESTADDRREQ;
 
-	if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
-		fc->ac.error = -EINTR;
-		fc->flags |= AFS_FS_CURSOR_STOP;
-		return false;
+	if (intr) {
+		fc->flags |= AFS_FS_CURSOR_INTR;
+		if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+			fc->error = -EINTR;
+			fc->flags |= AFS_FS_CURSOR_STOP;
+			return false;
+		}
+	} else {
+		mutex_lock(&vnode->io_lock);
 	}
 
 	if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
@@ -65,12 +59,16 @@
 	fc->server_list = afs_get_serverlist(vnode->volume->servers);
 	read_unlock(&vnode->volume->servers_lock);
 
-	cbi = vnode->cb_interest;
+	fc->untried = (1UL << fc->server_list->nr_servers) - 1;
+	fc->index = READ_ONCE(fc->server_list->preferred);
+
+	cbi = rcu_dereference_protected(vnode->cb_interest,
+					lockdep_is_held(&vnode->io_lock));
 	if (cbi) {
 		/* See if the vnode's preferred record is still available */
 		for (i = 0; i < fc->server_list->nr_servers; i++) {
 			if (fc->server_list->servers[i].cb_interest == cbi) {
-				fc->start = i;
+				fc->index = i;
 				goto found_interest;
 			}
 		}
@@ -80,26 +78,23 @@
 		 * and have to return an error.
 		 */
 		if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
-			fc->ac.error = -ESTALE;
+			fc->error = -ESTALE;
 			return false;
 		}
 
 		/* Note that the callback promise is effectively broken */
 		write_seqlock(&vnode->cb_lock);
-		ASSERTCMP(cbi, ==, vnode->cb_interest);
-		vnode->cb_interest = NULL;
+		ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
+		rcu_assign_pointer(vnode->cb_interest, NULL);
 		if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
 			vnode->cb_break++;
 		write_sequnlock(&vnode->cb_lock);
 
 		afs_put_cb_interest(afs_v2net(vnode), cbi);
 		cbi = NULL;
-	} else {
-		fc->start = READ_ONCE(fc->server_list->index);
 	}
 
 found_interest:
-	fc->index = fc->start;
 	return true;
 }
 
@@ -117,7 +112,7 @@
 	default:		m = "busy";		break;
 	}
 
-	pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m);
+	pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
 }
 
 /*
@@ -125,10 +120,14 @@
  */
 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
 {
-	msleep_interruptible(1000);
-	if (signal_pending(current)) {
-		fc->ac.error = -ERESTARTSYS;
-		return false;
+	if (fc->flags & AFS_FS_CURSOR_INTR) {
+		msleep_interruptible(1000);
+		if (signal_pending(current)) {
+			fc->error = -ERESTARTSYS;
+			return false;
+		}
+	} else {
+		msleep(1000);
 	}
 
 	return true;
@@ -143,27 +142,33 @@
 	struct afs_addr_list *alist;
 	struct afs_server *server;
 	struct afs_vnode *vnode = fc->vnode;
+	struct afs_error e;
+	u32 rtt;
+	int error = fc->ac.error, i;
 
-	_enter("%u/%u,%u/%u,%d,%d",
-	       fc->index, fc->start,
-	       fc->ac.index, fc->ac.start,
-	       fc->ac.error, fc->ac.abort_code);
+	_enter("%lx[%d],%lx[%d],%d,%d",
+	       fc->untried, fc->index,
+	       fc->ac.tried, fc->ac.index,
+	       error, fc->ac.abort_code);
 
 	if (fc->flags & AFS_FS_CURSOR_STOP) {
 		_leave(" = f [stopped]");
 		return false;
 	}
 
+	fc->nr_iterations++;
+
 	/* Evaluate the result of the previous operation, if there was one. */
-	switch (fc->ac.error) {
+	switch (error) {
 	case SHRT_MAX:
 		goto start;
 
 	case 0:
 	default:
 		/* Success or local failure.  Stop. */
+		fc->error = error;
 		fc->flags |= AFS_FS_CURSOR_STOP;
-		_leave(" = f [okay/local %d]", fc->ac.error);
+		_leave(" = f [okay/local %d]", error);
 		return false;
 
 	case -ECONNABORTED:
@@ -178,7 +183,7 @@
 			 * - May indicate that the fileserver couldn't attach to the vol.
 			 */
 			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
-				fc->ac.error = -EREMOTEIO;
+				fc->error = -EREMOTEIO;
 				goto next_server;
 			}
 
@@ -187,12 +192,12 @@
 			write_unlock(&vnode->volume->servers_lock);
 
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
-			if (fc->ac.error < 0)
-				goto failed;
+			error = afs_check_volume_status(vnode->volume, fc->key);
+			if (error < 0)
+				goto failed_set_error;
 
 			if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
-				fc->ac.error = -ENOMEDIUM;
+				fc->error = -ENOMEDIUM;
 				goto failed;
 			}
 
@@ -200,7 +205,7 @@
 			 * it's the fileserver having trouble.
 			 */
 			if (vnode->volume->servers == fc->server_list) {
-				fc->ac.error = -EREMOTEIO;
+				fc->error = -EREMOTEIO;
 				goto next_server;
 			}
 
@@ -215,7 +220,7 @@
 		case VONLINE:
 		case VDISKFULL:
 		case VOVERQUOTA:
-			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+			fc->error = afs_abort_to_error(fc->ac.abort_code);
 			goto next_server;
 
 		case VOFFLINE:
@@ -224,11 +229,11 @@
 				clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
 			}
 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
-				fc->ac.error = -EADV;
+				fc->error = -EADV;
 				goto failed;
 			}
 			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
-				fc->ac.error = -ESTALE;
+				fc->error = -ESTALE;
 				goto failed;
 			}
 			goto busy;
@@ -240,7 +245,7 @@
 			 * have a file lock we need to maintain.
 			 */
 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
-				fc->ac.error = -EBUSY;
+				fc->error = -EBUSY;
 				goto failed;
 			}
 			if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
@@ -269,16 +274,16 @@
 			 * honour, just in case someone sets up a loop.
 			 */
 			if (fc->flags & AFS_FS_CURSOR_VMOVED) {
-				fc->ac.error = -EREMOTEIO;
+				fc->error = -EREMOTEIO;
 				goto failed;
 			}
 			fc->flags |= AFS_FS_CURSOR_VMOVED;
 
 			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
-			if (fc->ac.error < 0)
-				goto failed;
+			error = afs_check_volume_status(vnode->volume, fc->key);
+			if (error < 0)
+				goto failed_set_error;
 
 			/* If the server list didn't change, then the VLDB is
 			 * out of sync with the fileservers.  This is hopefully
@@ -290,7 +295,7 @@
 			 * TODO: Retry a few times with sleeps.
 			 */
 			if (vnode->volume->servers == fc->server_list) {
-				fc->ac.error = -ENOMEDIUM;
+				fc->error = -ENOMEDIUM;
 				goto failed;
 			}
 
@@ -299,20 +304,28 @@
 		default:
 			clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
 			clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
-			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+			fc->error = afs_abort_to_error(fc->ac.abort_code);
 			goto failed;
 		}
 
-	case -ENETUNREACH:
-	case -EHOSTUNREACH:
-	case -ECONNREFUSED:
 	case -ETIMEDOUT:
 	case -ETIME:
+		if (fc->error != -EDESTADDRREQ)
+			goto iterate_address;
+		/* Fall through */
+	case -ERFKILL:
+	case -EADDRNOTAVAIL:
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -EHOSTDOWN:
+	case -ECONNREFUSED:
 		_debug("no conn");
+		fc->error = error;
 		goto iterate_address;
 
 	case -ECONNRESET:
 		_debug("call reset");
+		fc->error = error;
 		goto failed;
 	}
 
@@ -328,15 +341,57 @@
 	/* See if we need to do an update of the volume record.  Note that the
 	 * volume may have moved or even have been deleted.
 	 */
-	fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
-	if (fc->ac.error < 0)
-		goto failed;
+	error = afs_check_volume_status(vnode->volume, fc->key);
+	if (error < 0)
+		goto failed_set_error;
 
 	if (!afs_start_fs_iteration(fc, vnode))
 		goto failed;
 
-use_server:
-	_debug("use");
+	_debug("__ VOL %llx __", vnode->volume->vid);
+	error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
+	if (error < 0)
+		goto failed_set_error;
+
+pick_server:
+	_debug("pick [%lx]", fc->untried);
+
+	error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
+	if (error < 0)
+		goto failed_set_error;
+
+	/* Pick the untried server with the lowest RTT.  If we have outstanding
+	 * callbacks, we stick with the server we're already using if we can.
+	 */
+	if (fc->cbi) {
+		_debug("cbi %u", fc->index);
+		if (test_bit(fc->index, &fc->untried))
+			goto selected_server;
+		afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
+		fc->cbi = NULL;
+		_debug("nocbi");
+	}
+
+	fc->index = -1;
+	rtt = U32_MAX;
+	for (i = 0; i < fc->server_list->nr_servers; i++) {
+		struct afs_server *s = fc->server_list->servers[i].server;
+
+		if (!test_bit(i, &fc->untried) || !s->probe.responded)
+			continue;
+		if (s->probe.rtt < rtt) {
+			fc->index = i;
+			rtt = s->probe.rtt;
+		}
+	}
+
+	if (fc->index == -1)
+		goto no_more_servers;
+
+selected_server:
+	_debug("use %d", fc->index);
+	__clear_bit(fc->index, &fc->untried);
+
 	/* We're starting on a different fileserver from the list.  We need to
 	 * check it, create a callback intercept, find its address list and
 	 * probe its capabilities before we use it.
@@ -354,12 +409,14 @@
 	 * break request before we've finished decoding the reply and
 	 * installing the vnode.
 	 */
-	fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
-						       fc->index);
-	if (fc->ac.error < 0)
-		goto failed;
+	error = afs_register_server_cb_interest(vnode, fc->server_list,
+						fc->index);
+	if (error < 0)
+		goto failed_set_error;
 
-	fc->cbi = afs_get_cb_interest(vnode->cb_interest);
+	fc->cbi = afs_get_cb_interest(
+		rcu_dereference_protected(vnode->cb_interest,
+					  lockdep_is_held(&vnode->io_lock)));
 
 	read_lock(&server->fs_lock);
 	alist = rcu_dereference_protected(server->addresses,
@@ -369,66 +426,55 @@
 
 	memset(&fc->ac, 0, sizeof(fc->ac));
 
-	/* Probe the current fileserver if we haven't done so yet. */
-	if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
-		fc->ac.alist = afs_get_addrlist(alist);
-
-		if (!afs_probe_fileserver(fc)) {
-			switch (fc->ac.error) {
-			case -ENOMEM:
-			case -ERESTARTSYS:
-			case -EINTR:
-				goto failed;
-			default:
-				goto next_server;
-			}
-		}
-	}
-
 	if (!fc->ac.alist)
 		fc->ac.alist = alist;
 	else
 		afs_put_addrlist(alist);
 
-	fc->ac.start = READ_ONCE(alist->index);
-	fc->ac.index = fc->ac.start;
+	fc->ac.index = -1;
 
 iterate_address:
 	ASSERT(fc->ac.alist);
-	_debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs);
 	/* Iterate over the current server's address list to try and find an
 	 * address on which it will respond to us.
 	 */
 	if (!afs_iterate_addresses(&fc->ac))
 		goto next_server;
 
+	_debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
+
 	_leave(" = t");
 	return true;
 
 next_server:
 	_debug("next");
 	afs_end_cursor(&fc->ac);
-	afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
-	fc->cbi = NULL;
-	fc->index++;
-	if (fc->index >= fc->server_list->nr_servers)
-		fc->index = 0;
-	if (fc->index != fc->start)
-		goto use_server;
+	goto pick_server;
 
+no_more_servers:
 	/* That's all the servers poked to no good effect.  Try again if some
 	 * of them were busy.
 	 */
 	if (fc->flags & AFS_FS_CURSOR_VBUSY)
 		goto restart_from_beginning;
 
-	fc->ac.error = -EDESTADDRREQ;
-	goto failed;
+	e.error = -EDESTADDRREQ;
+	e.responded = false;
+	for (i = 0; i < fc->server_list->nr_servers; i++) {
+		struct afs_server *s = fc->server_list->servers[i].server;
 
+		afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+				     s->probe.abort_code);
+	}
+
+	error = e.error;
+
+failed_set_error:
+	fc->error = error;
 failed:
 	fc->flags |= AFS_FS_CURSOR_STOP;
 	afs_end_cursor(&fc->ac);
-	_leave(" = f [failed %d]", fc->ac.error);
+	_leave(" = f [failed %d]", fc->error);
 	return false;
 }
 
@@ -440,20 +486,24 @@
 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
 {
 	struct afs_vnode *vnode = fc->vnode;
-	struct afs_cb_interest *cbi = vnode->cb_interest;
+	struct afs_cb_interest *cbi;
 	struct afs_addr_list *alist;
+	int error = fc->ac.error;
 
 	_enter("");
 
-	switch (fc->ac.error) {
+	cbi = rcu_dereference_protected(vnode->cb_interest,
+					lockdep_is_held(&vnode->io_lock));
+
+	switch (error) {
 	case SHRT_MAX:
 		if (!cbi) {
-			fc->ac.error = -ESTALE;
+			fc->error = -ESTALE;
 			fc->flags |= AFS_FS_CURSOR_STOP;
 			return false;
 		}
 
-		fc->cbi = afs_get_cb_interest(vnode->cb_interest);
+		fc->cbi = afs_get_cb_interest(cbi);
 
 		read_lock(&cbi->server->fs_lock);
 		alist = rcu_dereference_protected(cbi->server->addresses,
@@ -461,35 +511,40 @@
 		afs_get_addrlist(alist);
 		read_unlock(&cbi->server->fs_lock);
 		if (!alist) {
-			fc->ac.error = -ESTALE;
+			fc->error = -ESTALE;
 			fc->flags |= AFS_FS_CURSOR_STOP;
 			return false;
 		}
 
 		memset(&fc->ac, 0, sizeof(fc->ac));
 		fc->ac.alist = alist;
-		fc->ac.start = READ_ONCE(alist->index);
-		fc->ac.index = fc->ac.start;
+		fc->ac.index = -1;
 		goto iterate_address;
 
 	case 0:
 	default:
 		/* Success or local failure.  Stop. */
+		fc->error = error;
 		fc->flags |= AFS_FS_CURSOR_STOP;
-		_leave(" = f [okay/local %d]", fc->ac.error);
+		_leave(" = f [okay/local %d]", error);
 		return false;
 
 	case -ECONNABORTED:
+		fc->error = afs_abort_to_error(fc->ac.abort_code);
 		fc->flags |= AFS_FS_CURSOR_STOP;
 		_leave(" = f [abort]");
 		return false;
 
+	case -ERFKILL:
+	case -EADDRNOTAVAIL:
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
+	case -EHOSTDOWN:
 	case -ECONNREFUSED:
 	case -ETIMEDOUT:
 	case -ETIME:
 		_debug("no conn");
+		fc->error = error;
 		goto iterate_address;
 	}
 
@@ -507,12 +562,66 @@
 }
 
 /*
+ * Dump cursor state in the case of the error being EDESTADDRREQ.
+ */
+static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
+{
+	static int count;
+	int i;
+
+	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+		return;
+	count++;
+
+	rcu_read_lock();
+
+	pr_notice("EDESTADDR occurred\n");
+	pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
+		  fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
+	pr_notice("FC: ut=%lx ix=%d ni=%u\n",
+		  fc->untried, fc->index, fc->nr_iterations);
+
+	if (fc->server_list) {
+		const struct afs_server_list *sl = fc->server_list;
+		pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
+			  sl->nr_servers, sl->preferred, sl->vnovol_mask);
+		for (i = 0; i < sl->nr_servers; i++) {
+			const struct afs_server *s = sl->servers[i].server;
+			pr_notice("FC: server fl=%lx av=%u %pU\n",
+				  s->flags, s->addr_version, &s->uuid);
+			if (s->addresses) {
+				const struct afs_addr_list *a =
+					rcu_dereference(s->addresses);
+				pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
+					  a->version,
+					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
+					  a->preferred);
+				pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
+					  a->probed, a->responded, a->failed);
+				if (a == fc->ac.alist)
+					pr_notice("FC:  - current\n");
+			}
+		}
+	}
+
+	pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+		  fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
+		  fc->ac.responded, fc->ac.nr_iterations);
+	rcu_read_unlock();
+}
+
+/*
  * Tidy up a filesystem cursor and unlock the vnode.
  */
 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
 {
 	struct afs_net *net = afs_v2net(fc->vnode);
-	int ret;
+
+	if (fc->error == -EDESTADDRREQ ||
+	    fc->error == -EADDRNOTAVAIL ||
+	    fc->error == -ENETUNREACH ||
+	    fc->error == -EHOSTUNREACH)
+		afs_dump_edestaddrreq(fc);
 
 	mutex_unlock(&fc->vnode->io_lock);
 
@@ -520,9 +629,8 @@
 	afs_put_cb_interest(net, fc->cbi);
 	afs_put_serverlist(net, fc->server_list);
 
-	ret = fc->ac.error;
-	if (ret == -ECONNABORTED)
-		afs_abort_to_error(fc->ac.abort_code);
+	if (fc->error == -ECONNABORTED)
+		fc->error = afs_abort_to_error(fc->ac.abort_code);
 
-	return fc->ac.error;
+	return fc->error;
 }
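
The no_more_servers path above folds every server's recorded probe result
into a single errno via afs_prioritise_error() (fs/afs/misc.c, not part of
this hunk).  A minimal sketch of that style of ranking; the ordering below
is an illustrative assumption, not the real table:

	struct afs_error { short error; bool responded; };  /* as used by the hunk */

	static void sketch_prioritise_error(struct afs_error *e, int error,
					    u32 abort_code)
	{
		if (error == 0)
			return;			/* nothing new to record */
		if (error == -ECONNABORTED) {
			/* The server actually replied with an abort: the most
			 * informative outcome, so it always wins. */
			e->error = afs_abort_to_error(abort_code);
			e->responded = true;
		} else if (!e->responded) {
			/* Any concrete transport failure beats the
			 * -EDESTADDRREQ the caller seeded e.error with. */
			e->error = error;
		}
	}
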
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 2543f24..61498d9 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Maintain an RxRPC server socket to do AFS communications through
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
@@ -16,12 +12,13 @@
 #include <net/af_rxrpc.h>
 #include "internal.h"
 #include "afs_cm.h"
+#include "protocol_yfs.h"
 
 struct workqueue_struct *afs_async_calls;
 
 static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
-static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
 static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
+static void afs_delete_async_call(struct work_struct *);
 static void afs_process_async_call(struct work_struct *);
 static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -75,6 +72,18 @@
 	if (ret < 0)
 		goto error_2;
 
+	srx.srx_service = YFS_CM_SERVICE;
+	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	if (ret < 0)
+		goto error_2;
+
+	/* Ideally, we'd turn on service upgrade here, but we can't because
+	 * OpenAFS is buggy and leaks the userStatus field from packet to
+	 * packet and between FS packets and CB packets - so if we try to do an
+	 * upgrade on an FS packet, OpenAFS will leak that into the CB packet
+	 * it sends back to us.
+	 */
+
 	rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
 					   afs_rx_discard_new_call);
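
With the socket bound to both the AFS and YFS cache-manager service IDs,
incoming calls are told apart by the service ID that rxrpc_kernel_recv_data()
writes back through &call->service_id (see the extraction paths later in this
file).  A hedged dispatcher sketch; the real routing happens in
fs/afs/cmservice.c:

	static void sketch_route_incoming(struct afs_call *call)
	{
		switch (call->service_id) {
		case YFS_CM_SERVICE:	/* from protocol_yfs.h, bound above */
			/* YFS peers speak the wide-ID variants of the CM ops. */
			break;
		default:
			/* Classic AFS cache-manager service. */
			break;
		}
	}
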
 
@@ -143,6 +152,7 @@
 	INIT_WORK(&call->async_work, afs_process_async_call);
 	init_waitqueue_head(&call->waitq);
 	spin_lock_init(&call->state_lock);
+	call->_iter = &call->iter;
 
 	o = atomic_inc_return(&net->nr_outstanding_calls);
 	trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -174,8 +184,9 @@
 		if (call->type->destructor)
 			call->type->destructor(call);
 
-		afs_put_server(call->net, call->cm_server);
+		afs_put_server(call->net, call->server, afs_server_trace_put_call);
 		afs_put_cb_interest(call->net, call->cbi);
+		afs_put_addrlist(call->alist);
 		kfree(call->request);
 
 		trace_afs_call(call, afs_call_trace_free, 0, o,
@@ -188,22 +199,29 @@
 	}
 }
 
-/*
- * Queue the call for actual work.  Returns 0 unconditionally for convenience.
- */
-int afs_queue_call_work(struct afs_call *call)
+static struct afs_call *afs_get_call(struct afs_call *call,
+				     enum afs_call_trace why)
 {
 	int u = atomic_inc_return(&call->usage);
 
-	trace_afs_call(call, afs_call_trace_work, u,
+	trace_afs_call(call, why, u,
 		       atomic_read(&call->net->nr_outstanding_calls),
 		       __builtin_return_address(0));
+	return call;
+}
 
-	INIT_WORK(&call->work, call->type->work);
+/*
+ * Queue the call for actual work.
+ */
+static void afs_queue_call_work(struct afs_call *call)
+{
+	if (call->type->work) {
+		INIT_WORK(&call->work, call->type->work);
 
-	if (!queue_work(afs_wq, &call->work))
-		afs_put_call(call);
-	return 0;
+		afs_get_call(call, afs_call_trace_work);
+		if (!queue_work(afs_wq, &call->work))
+			afs_put_call(call);
+	}
 }
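
afs_queue_call_work() follows the usual take-then-queue idiom: grab a
reference on behalf of the work item, then hand it straight back if
queue_work() reports the item was already pending.  The same shape with a
hypothetical kref-counted object:

	kref_get(&obj->ref);			/* the work item's reference */
	if (!queue_work(wq, &obj->work))
		kref_put(&obj->ref, obj_release);	/* already queued: it
							 * holds a ref, drop ours */
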
 
 /*
@@ -233,6 +251,7 @@
 			goto nomem_free;
 	}
 
+	afs_extract_to_buf(call, call->reply_max);
 	call->operation_ID = type->op;
 	init_waitqueue_head(&call->waitq);
 	return call;
@@ -286,7 +305,7 @@
 		offset = 0;
 	}
 
-	iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes);
+	iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
 }
 
 /*
@@ -337,12 +356,12 @@
 }
 
 /*
- * initiate a call
+ * Initiate a call and synchronously queue up the parameters for dispatch.  Any
+ * error is stored into the call struct, which the caller must check for.
  */
-long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
-		   gfp_t gfp, bool async)
+void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
 {
-	struct sockaddr_rxrpc *srx = ac->addr;
+	struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
 	struct rxrpc_call *rxcall;
 	struct msghdr msg;
 	struct kvec iov[1];
@@ -358,7 +377,8 @@
 	       call, call->type->name, key_serial(call->key),
 	       atomic_read(&call->net->nr_outstanding_calls));
 
-	call->async = async;
+	call->addr_ix = ac->index;
+	call->alist = afs_get_addrlist(ac->alist);
 
 	/* Work out the length we're going to transmit.  This is awkward for
 	 * calls such as FS.StoreData where there's an extra injection of data
@@ -379,30 +399,41 @@
 		}
 	}
 
+	/* If the call is going to be asynchronous, we need an extra ref for
+	 * the call to hold itself so the caller need not hang on to its ref.
+	 */
+	if (call->async)
+		afs_get_call(call, afs_call_trace_get);
+
 	/* create a call */
 	rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
 					 (unsigned long)call,
 					 tx_total_len, gfp,
-					 (async ?
+					 (call->async ?
 					  afs_wake_up_async_call :
 					  afs_wake_up_call_waiter),
 					 call->upgrade,
+					 call->intr,
 					 call->debug_id);
 	if (IS_ERR(rxcall)) {
 		ret = PTR_ERR(rxcall);
+		call->error = ret;
 		goto error_kill_call;
 	}
 
 	call->rxcall = rxcall;
 
+	if (call->max_lifespan)
+		rxrpc_kernel_set_max_life(call->net->socket, rxcall,
+					  call->max_lifespan);
+
 	/* send the request */
 	iov[0].iov_base	= call->request;
 	iov[0].iov_len	= call->request_size;
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
-		      call->request_size);
+	iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
@@ -419,20 +450,20 @@
 			goto error_do_abort;
 	}
 
-	/* at this point, an async call may no longer exist as it may have
-	 * already completed */
-	if (call->async)
-		return -EINPROGRESS;
-
-	return afs_wait_for_call_to_complete(call, ac);
+	/* Note that at this point we may already have received the reply or
+	 * an abort, and an asynchronous call may already have completed.
+	 *
+	 * afs_wait_for_call_to_complete(call, ac)
+	 * must be called to clean up synchronously.
+	 */
+	return;
 
 error_do_abort:
-	call->state = AFS_CALL_COMPLETE;
 	if (ret != -ECONNABORTED) {
 		rxrpc_kernel_abort_call(call->net->socket, rxcall,
 					RX_USER_ABORT, ret, "KSD");
 	} else {
-		iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0);
+		iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
 		rxrpc_kernel_recv_data(call->net->socket, rxcall,
 				       &msg.msg_iter, false,
 				       &call->abort_code, &call->service_id);
@@ -442,10 +473,26 @@
 	call->error = ret;
 	trace_afs_call_done(call);
 error_kill_call:
-	afs_put_call(call);
+	if (call->type->done)
+		call->type->done(call);
+
+	/* We need to dispose of the extra ref we grabbed for an async call.
+	 * The call, however, might be queued on afs_async_calls and we need to
+	 * make sure we don't get any more notifications that might requeue it.
+	 */
+	if (call->rxcall) {
+		rxrpc_kernel_end_call(call->net->socket, call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->async) {
+		if (cancel_work_sync(&call->async_work))
+			afs_put_call(call);
+		afs_put_call(call);
+	}
+
 	ac->error = ret;
+	call->state = AFS_CALL_COMPLETE;
 	_leave(" = %d", ret);
-	return ret;
 }
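
With afs_make_call() now returning void and stashing setup failures in
call->error, a synchronous caller pairs it with
afs_wait_for_call_to_complete() (made non-static later in this file).  The
resulting convention, wrapper name invented:

	static long sketch_sync_rpc(struct afs_addr_cursor *ac,
				    struct afs_call *call)
	{
		afs_make_call(ac, call, GFP_NOFS);
		return afs_wait_for_call_to_complete(call, ac);
	}
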
 
 /*
@@ -466,14 +513,12 @@
 	       state == AFS_CALL_SV_AWAIT_ACK
 	       ) {
 		if (state == AFS_CALL_SV_AWAIT_ACK) {
-			struct iov_iter iter;
-
-			iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0);
+			iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
 			ret = rxrpc_kernel_recv_data(call->net->socket,
-						     call->rxcall, &iter, false,
-						     &remote_abort,
+						     call->rxcall, &call->iter,
+						     false, &remote_abort,
 						     &call->service_id);
-			trace_afs_recv_data(call, 0, 0, false, ret);
+			trace_afs_receive_data(call, &call->iter, false, ret);
 
 			if (ret == -EINPROGRESS || ret == -EAGAIN)
 				return;
@@ -485,10 +530,17 @@
 			return;
 		}
 
+		if (!call->have_reply_time &&
+		    rxrpc_kernel_get_reply_time(call->net->socket,
+						call->rxcall,
+						&call->reply_time))
+			call->have_reply_time = true;
+
 		ret = call->type->deliver(call);
 		state = READ_ONCE(call->state);
 		switch (ret) {
 		case 0:
+			afs_queue_call_work(call);
 			if (state == AFS_CALL_CL_PROC_REPLY) {
 				if (call->cbi)
 					set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
@@ -515,17 +567,23 @@
 		case -ENODATA:
 		case -EBADMSG:
 		case -EMSGSIZE:
-		default:
 			abort_code = RXGEN_CC_UNMARSHAL;
 			if (state != AFS_CALL_CL_AWAIT_REPLY)
 				abort_code = RXGEN_SS_UNMARSHAL;
 			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
-						abort_code, -EBADMSG, "KUM");
+						abort_code, ret, "KUM");
+			goto local_abort;
+		default:
+			abort_code = RX_USER_ABORT;
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+						abort_code, ret, "KER");
 			goto local_abort;
 		}
 	}
 
 done:
+	if (call->type->done)
+		call->type->done(call);
 	if (state == AFS_CALL_COMPLETE && call->incoming)
 		afs_put_call(call);
 out:
@@ -541,27 +599,33 @@
 }
 
 /*
- * wait synchronously for a call to complete
+ * Wait synchronously for a call to complete and clean up the call struct.
  */
-static long afs_wait_for_call_to_complete(struct afs_call *call,
-					  struct afs_addr_cursor *ac)
+long afs_wait_for_call_to_complete(struct afs_call *call,
+				   struct afs_addr_cursor *ac)
 {
 	signed long rtt2, timeout;
 	long ret;
+	bool stalled = false;
 	u64 rtt;
 	u32 life, last_life;
+	bool rxrpc_complete = false;
 
 	DECLARE_WAITQUEUE(myself, current);
 
 	_enter("");
 
+	ret = call->error;
+	if (ret < 0)
+		goto out;
+
 	rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
 	rtt2 = nsecs_to_jiffies64(rtt) * 2;
 	if (rtt2 < 2)
 		rtt2 = 2;
 
 	timeout = rtt2;
-	last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+	rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
 
 	add_wait_queue(&call->waitq, &myself);
 	for (;;) {
@@ -573,20 +637,34 @@
 			call->need_attention = false;
 			__set_current_state(TASK_RUNNING);
 			afs_deliver_to_call(call);
+			timeout = rtt2;
 			continue;
 		}
 
 		if (afs_check_call_state(call, AFS_CALL_COMPLETE))
 			break;
 
-		life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
-		if (timeout == 0 &&
-		    life == last_life && signal_pending(current))
+		if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+			/* rxrpc terminated the call. */
+			rxrpc_complete = true;
+			break;
+		}
+
+		if (call->intr && timeout == 0 &&
+		    life == last_life && signal_pending(current)) {
+			if (stalled)
 				break;
+			__set_current_state(TASK_RUNNING);
+			rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
+			timeout = rtt2;
+			stalled = true;
+			continue;
+		}
 
 		if (life != last_life) {
 			timeout = rtt2;
 			last_life = life;
+			stalled = false;
 		}
 
 		timeout = schedule_timeout(timeout);
@@ -595,12 +673,16 @@
 	remove_wait_queue(&call->waitq, &myself);
 	__set_current_state(TASK_RUNNING);
 
-	/* Kill off the call if it's still live. */
 	if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
-		_debug("call interrupted");
-		if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
-					    RX_USER_ABORT, -EINTR, "KWI"))
-			afs_set_call_complete(call, -EINTR, 0);
+		if (rxrpc_complete) {
+			afs_set_call_complete(call, call->error, call->abort_code);
+		} else {
+			/* Kill off the call if it's still live. */
+			_debug("call interrupted");
+			if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+						    RX_USER_ABORT, -EINTR, "KWI"))
+				afs_set_call_complete(call, -EINTR, 0);
+		}
 	}
 
 	spin_lock_bh(&call->state_lock);
@@ -611,16 +693,16 @@
 	ret = ac->error;
 	switch (ret) {
 	case 0:
-		if (call->ret_reply0) {
-			ret = (long)call->reply[0];
-			call->reply[0] = NULL;
-		}
+		ret = call->ret0;
+		call->ret0 = 0;
+
 		/* Fall through */
 	case -ECONNABORTED:
 		ac->responded = true;
 		break;
 	}
 
+out:
 	_debug("call complete");
 	afs_put_call(call);
 	_leave(" = %p", (void *)ret);
@@ -731,6 +813,7 @@
 			call->async = true;
 			call->state = AFS_CALL_SV_AWAIT_OP_ID;
 			init_waitqueue_head(&call->waitq);
+			afs_extract_to_tmp(call);
 		}
 
 		if (rxrpc_kernel_charge_accept(net->socket,
@@ -776,18 +859,15 @@
 {
 	int ret;
 
-	_enter("{%zu}", call->offset);
-
-	ASSERTCMP(call->offset, <, 4);
+	_enter("{%zu}", iov_iter_count(call->_iter));
 
 	/* the operation ID forms the first four bytes of the request data */
-	ret = afs_extract_data(call, &call->tmp, 4, true);
+	ret = afs_extract_data(call, true);
 	if (ret < 0)
 		return ret;
 
 	call->operation_ID = ntohl(call->tmp);
 	afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST);
-	call->offset = 0;
 
 	/* ask the cache manager to route the call (it'll change the call type
 	 * if successful) */
@@ -828,7 +908,7 @@
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+	iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -843,6 +923,7 @@
 		_debug("oom");
 		rxrpc_kernel_abort_call(net->socket, call->rxcall,
 					RX_USER_ABORT, -ENOMEM, "KOO");
+		/* Fall through */
 	default:
 		_leave(" [error]");
 		return;
@@ -867,7 +948,7 @@
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
+	iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -891,30 +972,19 @@
 /*
  * Extract a piece of data from the received data socket buffers.
  */
-int afs_extract_data(struct afs_call *call, void *buf, size_t count,
-		     bool want_more)
+int afs_extract_data(struct afs_call *call, bool want_more)
 {
 	struct afs_net *net = call->net;
-	struct iov_iter iter;
-	struct kvec iov;
+	struct iov_iter *iter = call->_iter;
 	enum afs_call_state state;
 	u32 remote_abort = 0;
 	int ret;
 
-	_enter("{%s,%zu},,%zu,%d",
-	       call->type->name, call->offset, count, want_more);
+	_enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more);
 
-	ASSERTCMP(call->offset, <=, count);
-
-	iov.iov_base = buf + call->offset;
-	iov.iov_len = count - call->offset;
-	iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset);
-
-	ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter,
+	ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
 				     want_more, &remote_abort,
 				     &call->service_id);
-	call->offset += (count - call->offset) - iov_iter_count(&iter);
-	trace_afs_recv_data(call, count, call->offset, want_more, ret);
 	if (ret == 0 || ret == -EAGAIN)
 		return ret;
 
@@ -929,7 +999,7 @@
 			break;
 		case AFS_CALL_COMPLETE:
 			kdebug("prem complete %d", call->error);
-			return -EIO;
+			return afs_io_error(call, afs_io_error_extract);
 		default:
 			break;
 		}
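
afs_extract_data() now drains into the call's long-lived iterator
(call->_iter) rather than building a throwaway kvec each time, which is why
the per-call offset bookkeeping disappears.  The setup helpers named earlier
(afs_extract_to_tmp(), afs_extract_to_buf()) live in fs/afs/internal.h; a
simplified sketch of their assumed shape, including the assumption that the
call embeds a one-element kvec for the purpose:

	static void sketch_extract_begin(struct afs_call *call,
					 void *buf, size_t size)
	{
		call->kvec[0].iov_base = buf;		/* assumed field */
		call->kvec[0].iov_len = size;
		iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
	}
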
@@ -943,8 +1013,9 @@
 /*
  * Log protocol error production.
  */
-noinline int afs_protocol_error(struct afs_call *call, int error)
+noinline int afs_protocol_error(struct afs_call *call, int error,
+				enum afs_eproto_cause cause)
 {
-	trace_afs_protocol_error(call, error, __builtin_return_address(0));
+	trace_afs_protocol_error(call, error, cause);
 	return error;
 }
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 81dfedb..ce9de1e 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS security handling
  *
  * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/init.h>
@@ -31,8 +27,37 @@
 	_enter("{%x}", key_serial(cell->anonymous_key));
 
 	_debug("key %s", cell->anonymous_key->description);
-	key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
-			  NULL);
+	key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description,
+			      cell->net->net, NULL);
+	if (IS_ERR(key)) {
+		if (PTR_ERR(key) != -ENOKEY) {
+			_leave(" = %ld", PTR_ERR(key));
+			return key;
+		}
+
+		/* act as anonymous user */
+		_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
+		return key_get(cell->anonymous_key);
+	} else {
+		/* act as authorised user */
+		_leave(" = {%x} [auth]", key_serial(key));
+		return key;
+	}
+}
+
+/*
+ * Get a key when pathwalk is in rcuwalk mode.
+ */
+struct key *afs_request_key_rcu(struct afs_cell *cell)
+{
+	struct key *key;
+
+	_enter("{%x}", key_serial(cell->anonymous_key));
+
+	_debug("key %s", cell->anonymous_key->description);
+	key = request_key_net_rcu(&key_type_rxrpc,
+				  cell->anonymous_key->description,
+				  cell->net->net);
 	if (IS_ERR(key)) {
 		if (PTR_ERR(key) != -ENOKEY) {
 			_leave(" = %ld", PTR_ERR(key));
@@ -87,11 +112,9 @@
 	permits = rcu_dereference_protected(vnode->permit_cache,
 					    lockdep_is_held(&vnode->lock));
 	RCU_INIT_POINTER(vnode->permit_cache, NULL);
-	vnode->cb_break++;
 	spin_unlock(&vnode->lock);
 
-	if (permits)
-		afs_put_permits(permits);
+	afs_put_permits(permits);
 }
 
 /*
@@ -118,15 +141,15 @@
  * as the ACL *may* have changed.
  */
 void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
-		      unsigned int cb_break)
+		      unsigned int cb_break, struct afs_status_cb *scb)
 {
 	struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL;
-	afs_access_t caller_access = READ_ONCE(vnode->status.caller_access);
+	afs_access_t caller_access = scb->status.caller_access;
 	size_t size = 0;
 	bool changed = false;
 	int i, j;
 
-	_enter("{%x:%u},%x,%x",
+	_enter("{%llx:%llu},%x,%x",
 	       vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access);
 
 	rcu_read_lock();
@@ -147,7 +170,8 @@
 					break;
 				}
 
-				if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
+				if (afs_cb_is_broken(cb_break, vnode,
+						     rcu_dereference(vnode->cb_interest))) {
 					changed = true;
 					break;
 				}
@@ -177,7 +201,7 @@
 		}
 	}
 
-	if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
+	if (afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest)))
 		goto someone_else_changed_it;
 
 	/* We need a ref on any permits list we want to copy as we'll have to
@@ -254,14 +278,16 @@
 
 	kfree(new);
 
+	rcu_read_lock();
 	spin_lock(&vnode->lock);
 	zap = rcu_access_pointer(vnode->permit_cache);
-	if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
+	if (!afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest)) &&
 	    zap == permits)
 		rcu_assign_pointer(vnode->permit_cache, replacement);
 	else
 		zap = replacement;
 	spin_unlock(&vnode->lock);
+	rcu_read_unlock();
 	afs_put_permits(zap);
 out_put:
 	afs_put_permits(permits);
@@ -277,6 +303,40 @@
 	return;
 }
 
+static bool afs_check_permit_rcu(struct afs_vnode *vnode, struct key *key,
+				 afs_access_t *_access)
+{
+	const struct afs_permits *permits;
+	int i;
+
+	_enter("{%llx:%llu},%x",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key));
+
+	/* check the permits to see if we've got one yet */
+	if (key == vnode->volume->cell->anonymous_key) {
+		*_access = vnode->status.anon_access;
+		_leave(" = t [anon %x]", *_access);
+		return true;
+	}
+
+	permits = rcu_dereference(vnode->permit_cache);
+	if (permits) {
+		for (i = 0; i < permits->nr_permits; i++) {
+			if (permits->permits[i].key < key)
+				continue;
+			if (permits->permits[i].key > key)
+				break;
+
+			*_access = permits->permits[i].access;
+			_leave(" = %u [perm %x]", !permits->invalidated, *_access);
+			return !permits->invalidated;
+		}
+	}
+
+	_leave(" = f");
+	return false;
+}
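
afs_check_permit_rcu() is shaped for pathwalk's RCU mode: no locks, no
sleeping, and an early break that relies on the permits array being kept
sorted by key pointer.  Its caller (afs_permission(), below) uses it roughly
like this sketch; under MAY_NOT_BLOCK the VFS already holds rcu_read_lock():

	if (!afs_check_permit_rcu(vnode, key, &access))
		return -ECHILD;		/* cannot answer locklessly:
					 * the VFS retries in ref-walk mode */
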
+
 /*
  * check with the fileserver to see if the directory or parent directory is
  * permitted to be accessed with this authorisation, and if so, what access it
@@ -289,7 +349,7 @@
 	bool valid = false;
 	int i, ret;
 
-	_enter("{%x:%u},%x",
+	_enter("{%llx:%llu},%x",
 	       vnode->fid.vid, vnode->fid.vnode, key_serial(key));
 
 	/* check the permits to see if we've got one yet */
@@ -321,13 +381,12 @@
 		 */
 		_debug("no valid permit");
 
-		ret = afs_fetch_status(vnode, key, false);
+		ret = afs_fetch_status(vnode, key, false, _access);
 		if (ret < 0) {
 			*_access = 0;
 			_leave(" = %d", ret);
 			return ret;
 		}
-		*_access = vnode->status.caller_access;
 	}
 
 	_leave(" = 0 [access %x]", *_access);
@@ -344,33 +403,42 @@
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
-	int ret;
+	int ret = 0;
 
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	_enter("{{%x:%u},%lx},%x,",
+	_enter("{{%llx:%llu},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
-	key = afs_request_key(vnode->volume->cell);
-	if (IS_ERR(key)) {
-		_leave(" = %ld [key]", PTR_ERR(key));
-		return PTR_ERR(key);
+	if (mask & MAY_NOT_BLOCK) {
+		key = afs_request_key_rcu(vnode->volume->cell);
+		if (IS_ERR(key))
+			return -ECHILD;
+
+		ret = -ECHILD;
+		if (!afs_check_validity(vnode) ||
+		    !afs_check_permit_rcu(vnode, key, &access))
+			goto error;
+	} else {
+		key = afs_request_key(vnode->volume->cell);
+		if (IS_ERR(key)) {
+			_leave(" = %ld [key]", PTR_ERR(key));
+			return PTR_ERR(key);
+		}
+
+		ret = afs_validate(vnode, key);
+		if (ret < 0)
+			goto error;
+
+		/* check the permits to see if we've got one yet */
+		ret = afs_check_permit(vnode, key, &access);
+		if (ret < 0)
+			goto error;
 	}
 
-	ret = afs_validate(vnode, key);
-	if (ret < 0)
-		goto error;
-
-	/* check the permits to see if we've got one yet */
-	ret = afs_check_permit(vnode, key, &access);
-	if (ret < 0)
-		goto error;
-
 	/* interpret the access mask */
 	_debug("REQ %x ACC %x on %s",
 	       mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
 
+	ret = 0;
 	if (S_ISDIR(inode->i_mode)) {
 		if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) {
 			if (!(access & AFS_ACE_LOOKUP))
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 1d329e6..64d440a 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,21 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS server record management
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include "afs_fs.h"
 #include "internal.h"
+#include "protocol_yfs.h"
 
 static unsigned afs_server_gc_delay = 10;	/* Server record timeout in seconds */
 static unsigned afs_server_update_delay = 30;	/* Time till VLDB recheck in secs */
+static atomic_t afs_server_debug_id;
 
 static void afs_inc_servers_outstanding(struct afs_net *net)
 {
@@ -50,7 +48,7 @@
 
 	do {
 		if (server)
-			afs_put_server(net, server);
+			afs_put_server(net, server, afs_server_trace_put_find_rsq);
 		server = NULL;
 		read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
 
@@ -115,7 +113,7 @@
 		 * changes.
 		 */
 		if (server)
-			afs_put_server(net, server);
+			afs_put_server(net, server, afs_server_trace_put_uuid_rsq);
 		server = NULL;
 
 		read_seqbegin_or_lock(&net->fs_lock, &seq);
@@ -130,7 +128,7 @@
 			} else if (diff > 0) {
 				p = p->rb_right;
 			} else {
-				afs_get_server(server);
+				afs_get_server(server, afs_server_trace_get_by_uuid);
 				break;
 			}
 
@@ -201,7 +199,7 @@
 	ret = 0;
 
 exists:
-	afs_get_server(server);
+	afs_get_server(server, afs_server_trace_get_install);
 	write_sequnlock(&net->fs_lock);
 	return server;
 }
@@ -222,16 +220,19 @@
 		goto enomem;
 
 	atomic_set(&server->usage, 1);
+	server->debug_id = atomic_inc_return(&afs_server_debug_id);
 	RCU_INIT_POINTER(server->addresses, alist);
 	server->addr_version = alist->version;
 	server->uuid = *uuid;
-	server->flags = (1UL << AFS_SERVER_FL_NEW);
 	server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
 	rwlock_init(&server->fs_lock);
 	INIT_HLIST_HEAD(&server->cb_volumes);
 	rwlock_init(&server->cb_break_lock);
+	init_waitqueue_head(&server->probe_wq);
+	spin_lock_init(&server->probe_lock);
 
 	afs_inc_servers_outstanding(net);
+	trace_afs_server(server, 1, afs_server_trace_alloc);
 	_leave(" = %p", server);
 	return server;
 
@@ -246,41 +247,23 @@
 static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
 						 struct key *key, const uuid_t *uuid)
 {
-	struct afs_addr_cursor ac;
-	struct afs_addr_list *alist;
+	struct afs_vl_cursor vc;
+	struct afs_addr_list *alist = NULL;
 	int ret;
 
-	ret = afs_set_vl_cursor(&ac, cell);
-	if (ret < 0)
-		return ERR_PTR(ret);
-
-	while (afs_iterate_addresses(&ac)) {
-		if (test_bit(ac.index, &ac.alist->yfs))
-			alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid);
-		else
-			alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid);
-		switch (ac.error) {
-		case 0:
-			afs_end_cursor(&ac);
-			return alist;
-		case -ECONNABORTED:
-			ac.error = afs_abort_to_error(ac.abort_code);
-			goto error;
-		case -ENOMEM:
-		case -ENONET:
-			goto error;
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			break;
-		default:
-			ac.error = -EIO;
-			goto error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vlserver_operation(&vc, cell, key)) {
+		while (afs_select_vlserver(&vc)) {
+			if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
+				alist = afs_yfsvl_get_endpoints(&vc, uuid);
+			else
+				alist = afs_vl_get_addrs_u(&vc, uuid);
 		}
+
+		ret = afs_end_vlserver_operation(&vc);
 	}
 
-error:
-	return ERR_PTR(afs_end_cursor(&ac));
+	return ret < 0 ? ERR_PTR(ret) : alist;
 }
 
 /*
@@ -345,9 +328,22 @@
 }
 
 /*
+ * Get a reference on a server object.
+ */
+struct afs_server *afs_get_server(struct afs_server *server,
+				  enum afs_server_trace reason)
+{
+	unsigned int u = atomic_inc_return(&server->usage);
+
+	trace_afs_server(server, u, reason);
+	return server;
+}
+
+/*
  * Release a reference on a server record.
  */
-void afs_put_server(struct afs_net *net, struct afs_server *server)
+void afs_put_server(struct afs_net *net, struct afs_server *server,
+		    enum afs_server_trace reason)
 {
 	unsigned int usage;
 
@@ -358,7 +354,7 @@
 
 	usage = atomic_dec_return(&server->usage);
 
-	_enter("{%u}", usage);
+	trace_afs_server(server, usage, reason);
 
 	if (likely(usage > 0))
 		return;
@@ -370,6 +366,8 @@
 {
 	struct afs_server *server = container_of(rcu, struct afs_server, rcu);
 
+	trace_afs_server(server, atomic_read(&server->usage),
+			 afs_server_trace_free);
 	afs_put_addrlist(rcu_access_pointer(server->addresses));
 	kfree(server);
 }
@@ -382,16 +380,21 @@
 	struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
 	struct afs_addr_cursor ac = {
 		.alist	= alist,
-		.start	= alist->index,
-		.index	= 0,
-		.addr	= &alist->addrs[alist->index],
+		.index	= alist->preferred,
 		.error	= 0,
 	};
-	_enter("%p", server);
+
+	trace_afs_server(server, atomic_read(&server->usage),
+			 afs_server_trace_give_up_cb);
 
 	if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
 		afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
 
+	wait_var_event(&server->probe_outstanding,
+		       atomic_read(&server->probe_outstanding) == 0);
+
+	trace_afs_server(server, atomic_read(&server->usage),
+			 afs_server_trace_destroy);
 	call_rcu(&server->rcu, afs_server_rcu);
 	afs_dec_servers_outstanding(net);
 }
@@ -411,6 +414,7 @@
 		write_seqlock(&net->fs_lock);
 		usage = 1;
 		deleted = atomic_try_cmpxchg(&server->usage, &usage, 0);
+		trace_afs_server(server, usage, afs_server_trace_gc);
 		if (deleted) {
 			rb_erase(&server->uuid_rb, &net->fs_servers);
 			hlist_del_rcu(&server->proc_link);
@@ -525,99 +529,6 @@
 }
 
 /*
- * Probe a fileserver to find its capabilities.
- *
- * TODO: Try service upgrade.
- */
-static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
-{
-	_enter("");
-
-	fc->ac.addr = NULL;
-	fc->ac.start = READ_ONCE(fc->ac.alist->index);
-	fc->ac.index = fc->ac.start;
-	fc->ac.error = 0;
-	fc->ac.begun = false;
-
-	while (afs_iterate_addresses(&fc->ac)) {
-		afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
-					&fc->ac, fc->key);
-		switch (fc->ac.error) {
-		case 0:
-			afs_end_cursor(&fc->ac);
-			set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
-			return true;
-		case -ECONNABORTED:
-			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
-			goto error;
-		case -ENOMEM:
-		case -ENONET:
-			goto error;
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-		case -ETIMEDOUT:
-		case -ETIME:
-			break;
-		default:
-			fc->ac.error = -EIO;
-			goto error;
-		}
-	}
-
-error:
-	afs_end_cursor(&fc->ac);
-	return false;
-}
-
-/*
- * If we haven't already, try probing the fileserver to get its capabilities.
- * We try not to instigate parallel probes, but it's possible that the parallel
- * probes will fail due to authentication failure when ours would succeed.
- *
- * TODO: Try sending an anonymous probe if an authenticated probe fails.
- */
-bool afs_probe_fileserver(struct afs_fs_cursor *fc)
-{
-	bool success;
-	int ret, retries = 0;
-
-	_enter("");
-
-retry:
-	if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
-		_leave(" = t");
-		return true;
-	}
-
-	if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
-		success = afs_do_probe_fileserver(fc);
-		clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
-		wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
-		_leave(" = t");
-		return success;
-	}
-
-	_debug("wait");
-	ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
-			  TASK_INTERRUPTIBLE);
-	if (ret == -ERESTARTSYS) {
-		fc->ac.error = ret;
-		_leave(" = f [%d]", ret);
-		return false;
-	}
-
-	retries++;
-	if (retries == 4) {
-		fc->ac.error = -ESTALE;
-		_leave(" = f [stale]");
-		return false;
-	}
-	_debug("retry");
-	goto retry;
-}
-
-/*
  * Get an update for a server's address list.
  */
 static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
@@ -626,11 +537,20 @@
 
 	_enter("");
 
+	trace_afs_server(server, atomic_read(&server->usage), afs_server_trace_update);
+
 	alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key,
 				    &server->uuid);
 	if (IS_ERR(alist)) {
-		fc->ac.error = PTR_ERR(alist);
-		_leave(" = f [%d]", fc->ac.error);
+		if ((PTR_ERR(alist) == -ERESTARTSYS ||
+		     PTR_ERR(alist) == -EINTR) &&
+		    !(fc->flags & AFS_FS_CURSOR_INTR) &&
+		    server->addresses) {
+			_leave(" = t [intr]");
+			return true;
+		}
+		fc->error = PTR_ERR(alist);
+		_leave(" = f [%d]", fc->error);
 		return false;
 	}
 
@@ -682,7 +602,11 @@
 	ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
 			  TASK_INTERRUPTIBLE);
 	if (ret == -ERESTARTSYS) {
-		fc->ac.error = ret;
+		if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
+			_leave(" = t [intr]");
+			return true;
+		}
+		fc->error = ret;
 		_leave(" = f [intr]");
 		return false;
 	}
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 8a5760a..888d91d 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS fileserver list management.
  *
  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -20,7 +16,8 @@
 	if (slist && refcount_dec_and_test(&slist->usage)) {
 		for (i = 0; i < slist->nr_servers; i++) {
 			afs_put_cb_interest(net, slist->servers[i].cb_interest);
-			afs_put_server(net, slist->servers[i].server);
+			afs_put_server(net, slist->servers[i].server,
+				       afs_server_trace_put_slist);
 		}
 		kfree(slist);
 	}
@@ -42,9 +39,7 @@
 		if (vldb->fs_mask[i] & type_mask)
 			nr_servers++;
 
-	slist = kzalloc(sizeof(struct afs_server_list) +
-			sizeof(struct afs_server_entry) * nr_servers,
-			GFP_KERNEL);
+	slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
 	if (!slist)
 		goto error;
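
struct_size() from <linux/overflow.h> computes the size of a structure with a
trailing flexible array and saturates rather than wrapping on overflow.  What
the call above stands for, minus the overflow handling:

	size_t size = sizeof(struct afs_server_list) +
		      nr_servers * sizeof(struct afs_server_entry);
	/* struct_size(slist, servers, nr_servers) yields the same value but
	 * returns SIZE_MAX on overflow, so the kzalloc() fails cleanly. */
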
 
@@ -73,7 +68,8 @@
 				break;
 		if (j < slist->nr_servers) {
 			if (slist->servers[j].server == server) {
-				afs_put_server(cell->net, server);
+				afs_put_server(cell->net, server,
+					       afs_server_trace_put_slist_isort);
 				continue;
 			}
 
@@ -118,11 +114,11 @@
 	return false;
 
 changed:
-	/* Maintain the same current server as before if possible. */
-	cur = old->servers[old->index].server;
+	/* Maintain the same preferred server as before if possible. */
+	cur = old->servers[old->preferred].server;
 	for (j = 0; j < new->nr_servers; j++) {
 		if (new->servers[j].server == cur) {
-			new->index = j;
+			new->preferred = j;
 			break;
 		}
 	}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 4d3e274..488641b 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,6 +1,6 @@
 /* AFS superblock handling
  *
- * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved.
  *
  * This software may be freely redistributed under the terms of the
  * GNU General Public License.
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
 #include <linux/statfs.h>
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
@@ -30,21 +30,23 @@
 #include "internal.h"
 
 static void afs_i_init_once(void *foo);
-static struct dentry *afs_mount(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data);
 static void afs_kill_super(struct super_block *sb);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
+static void afs_free_inode(struct inode *inode);
 static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int afs_show_devname(struct seq_file *m, struct dentry *root);
 static int afs_show_options(struct seq_file *m, struct dentry *root);
+static int afs_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_description afs_fs_parameters;
 
 struct file_system_type afs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "afs",
-	.mount		= afs_mount,
-	.kill_sb	= afs_kill_super,
-	.fs_flags	= 0,
+	.owner			= THIS_MODULE,
+	.name			= "afs",
+	.init_fs_context	= afs_init_fs_context,
+	.parameters		= &afs_fs_parameters,
+	.kill_sb		= afs_kill_super,
+	.fs_flags		= FS_RENAME_DOES_D_MOVE,
 };
 MODULE_ALIAS_FS("afs");
 
@@ -55,6 +57,7 @@
 	.alloc_inode	= afs_alloc_inode,
 	.drop_inode	= afs_drop_inode,
 	.destroy_inode	= afs_destroy_inode,
+	.free_inode	= afs_free_inode,
 	.evict_inode	= afs_evict_inode,
 	.show_devname	= afs_show_devname,
 	.show_options	= afs_show_options,
@@ -63,22 +66,33 @@
 static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
 
-enum {
-	afs_no_opt,
-	afs_opt_cell,
-	afs_opt_dyn,
-	afs_opt_rwpath,
-	afs_opt_vol,
-	afs_opt_autocell,
+enum afs_param {
+	Opt_autocell,
+	Opt_dyn,
+	Opt_flock,
+	Opt_source,
 };
 
-static const match_table_t afs_options_list = {
-	{ afs_opt_cell,		"cell=%s"	},
-	{ afs_opt_dyn,		"dyn"		},
-	{ afs_opt_rwpath,	"rwpath"	},
-	{ afs_opt_vol,		"vol=%s"	},
-	{ afs_opt_autocell,	"autocell"	},
-	{ afs_no_opt,		NULL		},
+static const struct fs_parameter_spec afs_param_specs[] = {
+	fsparam_flag  ("autocell",	Opt_autocell),
+	fsparam_flag  ("dyn",		Opt_dyn),
+	fsparam_enum  ("flock",		Opt_flock),
+	fsparam_string("source",	Opt_source),
+	{}
+};
+
+static const struct fs_parameter_enum afs_param_enums[] = {
+	{ Opt_flock,	"local",	afs_flock_mode_local },
+	{ Opt_flock,	"openafs",	afs_flock_mode_openafs },
+	{ Opt_flock,	"strict",	afs_flock_mode_strict },
+	{ Opt_flock,	"write",	afs_flock_mode_write },
+	{}
+};
+
+static const struct fs_parameter_description afs_fs_parameters = {
+	.name		= "kAFS",
+	.specs		= afs_param_specs,
+	.enums		= afs_param_enums,
 };
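
fs_parse() resolves an enum parameter by looking the value string up in
afs_param_enums and handing back the matching constant; afs_parse_param()
(added below) stores it.  A worked example of the expected round trip:

	/*
	 * "-o flock=openafs" arrives as param->key == "flock",
	 * param->string == "openafs"; fs_parse() returns Opt_flock with
	 * result.uint_32 == afs_flock_mode_openafs, which afs_parse_param()
	 * copies into ctx->flock_mode.
	 */
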
 
 /*
@@ -181,93 +195,43 @@
 static int afs_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct afs_super_info *as = AFS_FS_S(root->d_sb);
+	const char *p = NULL;
 
 	if (as->dyn_root)
 		seq_puts(m, ",dyn");
 	if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
 		seq_puts(m, ",autocell");
-	return 0;
-}
-
-/*
- * parse the mount options
- * - this function has been shamelessly adapted from the ext3 fs which
- *   shamelessly adapted it from the msdos fs
- */
-static int afs_parse_options(struct afs_mount_params *params,
-			     char *options, const char **devname)
-{
-	struct afs_cell *cell;
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-	int token;
-
-	_enter("%s", options);
-
-	options[PAGE_SIZE - 1] = 0;
-
-	while ((p = strsep(&options, ","))) {
-		if (!*p)
-			continue;
-
-		token = match_token(p, afs_options_list, args);
-		switch (token) {
-		case afs_opt_cell:
-			rcu_read_lock();
-			cell = afs_lookup_cell_rcu(params->net,
-						   args[0].from,
-						   args[0].to - args[0].from);
-			rcu_read_unlock();
-			if (IS_ERR(cell))
-				return PTR_ERR(cell);
-			afs_put_cell(params->net, params->cell);
-			params->cell = cell;
-			break;
-
-		case afs_opt_rwpath:
-			params->rwpath = true;
-			break;
-
-		case afs_opt_vol:
-			*devname = args[0].from;
-			break;
-
-		case afs_opt_autocell:
-			params->autocell = true;
-			break;
-
-		case afs_opt_dyn:
-			params->dyn_root = true;
-			break;
-
-		default:
-			printk(KERN_ERR "kAFS:"
-			       " Unknown or invalid mount option: '%s'\n", p);
-			return -EINVAL;
-		}
+	switch (as->flock_mode) {
+	case afs_flock_mode_unset:	break;
+	case afs_flock_mode_local:	p = "local";	break;
+	case afs_flock_mode_openafs:	p = "openafs";	break;
+	case afs_flock_mode_strict:	p = "strict";	break;
+	case afs_flock_mode_write:	p = "write";	break;
 	}
+	if (p)
+		seq_printf(m, ",flock=%s", p);
 
-	_leave(" = 0");
 	return 0;
 }
 
 /*
- * parse a device name to get cell name, volume name, volume type and R/W
- * selector
- * - this can be one of the following:
+ * Parse the source name to get cell name, volume name, volume type and R/W
+ * selector.
+ *
+ * This can be one of the following:
  *	"%[cell:]volume[.]"		R/W volume
- *	"#[cell:]volume[.]"		R/O or R/W volume (rwpath=0),
- *					 or R/W (rwpath=1) volume
+ *	"#[cell:]volume[.]"		R/O or R/W volume (R/O parent),
+ *					 or R/W (R/W parent) volume
  *	"%[cell:]volume.readonly"	R/O volume
  *	"#[cell:]volume.readonly"	R/O volume
  *	"%[cell:]volume.backup"		Backup volume
  *	"#[cell:]volume.backup"		Backup volume
  */
-static int afs_parse_device_name(struct afs_mount_params *params,
-				 const char *name)
+static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
 {
+	struct afs_fs_context *ctx = fc->fs_private;
 	struct afs_cell *cell;
-	const char *cellname, *suffix;
+	const char *cellname, *suffix, *name = param->string;
 	int cellnamesz;
 
 	_enter(",%s", name);
@@ -278,69 +242,153 @@
 	}
 
 	if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+		/* To use dynroot, we don't want to have to provide a source */
+		if (strcmp(name, "none") == 0) {
+			ctx->no_cell = true;
+			return 0;
+		}
 		printk(KERN_ERR "kAFS: unparsable volume name\n");
 		return -EINVAL;
 	}
 
 	/* determine the type of volume we're looking for */
-	params->type = AFSVL_ROVOL;
-	params->force = false;
-	if (params->rwpath || name[0] == '%') {
-		params->type = AFSVL_RWVOL;
-		params->force = true;
+	if (name[0] == '%') {
+		ctx->type = AFSVL_RWVOL;
+		ctx->force = true;
 	}
 	name++;
 
 	/* split the cell name out if there is one */
-	params->volname = strchr(name, ':');
-	if (params->volname) {
+	ctx->volname = strchr(name, ':');
+	if (ctx->volname) {
 		cellname = name;
-		cellnamesz = params->volname - name;
-		params->volname++;
+		cellnamesz = ctx->volname - name;
+		ctx->volname++;
 	} else {
-		params->volname = name;
+		ctx->volname = name;
 		cellname = NULL;
 		cellnamesz = 0;
 	}
 
 	/* the volume type is further affected by a possible suffix */
-	suffix = strrchr(params->volname, '.');
+	suffix = strrchr(ctx->volname, '.');
 	if (suffix) {
 		if (strcmp(suffix, ".readonly") == 0) {
-			params->type = AFSVL_ROVOL;
-			params->force = true;
+			ctx->type = AFSVL_ROVOL;
+			ctx->force = true;
 		} else if (strcmp(suffix, ".backup") == 0) {
-			params->type = AFSVL_BACKVOL;
-			params->force = true;
+			ctx->type = AFSVL_BACKVOL;
+			ctx->force = true;
 		} else if (suffix[1] == 0) {
 		} else {
 			suffix = NULL;
 		}
 	}
 
-	params->volnamesz = suffix ?
-		suffix - params->volname : strlen(params->volname);
+	ctx->volnamesz = suffix ?
+		suffix - ctx->volname : strlen(ctx->volname);
 
 	_debug("cell %*.*s [%p]",
-	       cellnamesz, cellnamesz, cellname ?: "", params->cell);
+	       cellnamesz, cellnamesz, cellname ?: "", ctx->cell);
 
 	/* lookup the cell record */
-	if (cellname || !params->cell) {
-		cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+	if (cellname) {
+		cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
 				       NULL, false);
 		if (IS_ERR(cell)) {
-			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			pr_err("kAFS: unable to lookup cell '%*.*s'\n",
 			       cellnamesz, cellnamesz, cellname ?: "");
 			return PTR_ERR(cell);
 		}
-		afs_put_cell(params->net, params->cell);
-		params->cell = cell;
+		afs_put_cell(ctx->net, ctx->cell);
+		ctx->cell = cell;
 	}
 
 	_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
-	       params->cell->name, params->cell,
-	       params->volnamesz, params->volnamesz, params->volname,
-	       suffix ?: "-", params->type, params->force ? " FORCE" : "");
+	       ctx->cell->name, ctx->cell,
+	       ctx->volnamesz, ctx->volnamesz, ctx->volname,
+	       suffix ?: "-", ctx->type, ctx->force ? " FORCE" : "");
+
+	fc->source = param->string;
+	param->string = NULL;
+	return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct fs_parse_result result;
+	struct afs_fs_context *ctx = fc->fs_private;
+	int opt;
+
+	opt = fs_parse(fc, &afs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_source:
+		return afs_parse_source(fc, param);
+
+	case Opt_autocell:
+		ctx->autocell = true;
+		break;
+
+	case Opt_dyn:
+		ctx->dyn_root = true;
+		break;
+
+	case Opt_flock:
+		ctx->flock_mode = result.uint_32;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Validate the options, get the cell key and look up the volume.
+ */
+static int afs_validate_fc(struct fs_context *fc)
+{
+	struct afs_fs_context *ctx = fc->fs_private;
+	struct afs_volume *volume;
+	struct key *key;
+
+	if (!ctx->dyn_root) {
+		if (ctx->no_cell) {
+			pr_warn("kAFS: Can only specify source 'none' with -o dyn\n");
+			return -EINVAL;
+		}
+
+		if (!ctx->cell) {
+			pr_warn("kAFS: No cell specified\n");
+			return -EDESTADDRREQ;
+		}
+
+		/* We try to do the mount securely. */
+		key = afs_request_key(ctx->cell);
+		if (IS_ERR(key))
+			return PTR_ERR(key);
+
+		ctx->key = key;
+
+		if (ctx->volume) {
+			afs_put_volume(ctx->cell, ctx->volume);
+			ctx->volume = NULL;
+		}
+
+		volume = afs_create_volume(ctx);
+		if (IS_ERR(volume))
+			return PTR_ERR(volume);
+
+		ctx->volume = volume;
+	}
 
 	return 0;
 }
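
Worked examples of the source grammar documented above, assuming a cell named
example.org:

	/*
	 * "%example.org:root.cell"	cell example.org, volume "root.cell",
	 *				AFSVL_RWVOL, forced
	 * "#root.cell"			default cell, volume "root.cell",
	 *				AFSVL_ROVOL preferred, not forced
	 * "#root.cell.readonly"	AFSVL_ROVOL, forced by the suffix
	 * "none"			no volume lookup at all; only valid
	 *				together with dyn (see afs_validate_fc)
	 */
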
@@ -348,42 +396,37 @@
 /*
  * check a superblock to see if it's the one we're looking for
  */
-static int afs_test_super(struct super_block *sb, void *data)
+static int afs_test_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct afs_super_info *as1 = data;
+	struct afs_fs_context *ctx = fc->fs_private;
 	struct afs_super_info *as = AFS_FS_S(sb);
 
-	return (as->net_ns == as1->net_ns &&
+	return (as->net_ns == fc->net_ns &&
 		as->volume &&
-		as->volume->vid == as1->volume->vid &&
+		as->volume->vid == ctx->volume->vid &&
 		!as->dyn_root);
 }
 
-static int afs_dynroot_test_super(struct super_block *sb, void *data)
+static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct afs_super_info *as1 = data;
 	struct afs_super_info *as = AFS_FS_S(sb);
 
-	return (as->net_ns == as1->net_ns &&
+	return (as->net_ns == fc->net_ns &&
 		as->dyn_root);
 }
 
-static int afs_set_super(struct super_block *sb, void *data)
+static int afs_set_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct afs_super_info *as = data;
-
-	sb->s_fs_info = as;
 	return set_anon_super(sb, NULL);
 }
 
 /*
  * fill in the superblock
  */
-static int afs_fill_super(struct super_block *sb,
-			  struct afs_mount_params *params)
+static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 {
 	struct afs_super_info *as = AFS_FS_S(sb);
-	struct afs_fid fid;
+	struct afs_iget_data iget_data;
 	struct inode *inode = NULL;
 	int ret;
 
@@ -392,6 +435,7 @@
 	/* fill in the superblock */
 	sb->s_blocksize		= PAGE_SIZE;
 	sb->s_blocksize_bits	= PAGE_SHIFT;
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	if (!as->dyn_root)
@@ -399,25 +443,28 @@
 	ret = super_setup_bdi(sb);
 	if (ret)
 		return ret;
-	sb->s_bdi->ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+	sb->s_bdi->ra_pages	= VM_READAHEAD_PAGES;
 
 	/* allocate the root inode and dentry */
 	if (as->dyn_root) {
 		inode = afs_iget_pseudo_dir(sb, true);
 		sb->s_flags	|= SB_RDONLY;
 	} else {
-		sprintf(sb->s_id, "%u", as->volume->vid);
+		sprintf(sb->s_id, "%llu", as->volume->vid);
 		afs_activate_volume(as->volume);
-		fid.vid		= as->volume->vid;
-		fid.vnode	= 1;
-		fid.unique	= 1;
-		inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
+		iget_data.fid.vid	= as->volume->vid;
+		iget_data.fid.vnode	= 1;
+		iget_data.fid.vnode_hi	= 0;
+		iget_data.fid.unique	= 1;
+		iget_data.cb_v_break	= as->volume->cb_v_break;
+		iget_data.cb_s_break	= 0;
+		inode = afs_iget(sb, ctx->key, &iget_data, NULL, NULL, NULL);
 	}
 
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	if (params->autocell || params->dyn_root)
+	if (ctx->autocell || as->dyn_root)
 		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
 
 	ret = -ENOMEM;
@@ -442,17 +489,21 @@
 	return ret;
 }
 
-static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
 {
+	struct afs_fs_context *ctx = fc->fs_private;
 	struct afs_super_info *as;
 
 	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
 	if (as) {
-		as->net_ns = get_net(params->net_ns);
-		if (params->dyn_root)
+		as->net_ns = get_net(fc->net_ns);
+		as->flock_mode = ctx->flock_mode;
+		if (ctx->dyn_root) {
 			as->dyn_root = true;
-		else
-			as->cell = afs_get_cell(params->cell);
+		} else {
+			as->cell = afs_get_cell(ctx->cell);
+			as->volume = __afs_get_volume(ctx->volume);
+		}
 	}
 	return as;
 }
@@ -474,7 +525,7 @@
 
 	if (as->dyn_root)
 		afs_dynroot_depopulate(sb);
-	
+
 	/* Clear the callback interests (which will do ilookup5) before
 	 * deactivating the superblock.
 	 */
@@ -487,111 +538,104 @@
 }
 
 /*
- * get an AFS superblock
+ * Get an AFS superblock and root directory.
  */
-static struct dentry *afs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *options)
+static int afs_get_tree(struct fs_context *fc)
 {
-	struct afs_mount_params params;
+	struct afs_fs_context *ctx = fc->fs_private;
 	struct super_block *sb;
-	struct afs_volume *candidate;
-	struct key *key;
 	struct afs_super_info *as;
 	int ret;
 
-	_enter(",,%s,%p", dev_name, options);
-
-	memset(&params, 0, sizeof(params));
-
-	ret = -EINVAL;
-	if (current->nsproxy->net_ns != &init_net)
+	ret = afs_validate_fc(fc);
+	if (ret)
 		goto error;
-	params.net_ns = current->nsproxy->net_ns;
-	params.net = afs_net(params.net_ns);
-	
-	/* parse the options and device name */
-	if (options) {
-		ret = afs_parse_options(&params, options, &dev_name);
-		if (ret < 0)
-			goto error;
-	}
 
-	if (!params.dyn_root) {
-		ret = afs_parse_device_name(&params, dev_name);
-		if (ret < 0)
-			goto error;
-
-		/* try and do the mount securely */
-		key = afs_request_key(params.cell);
-		if (IS_ERR(key)) {
-			_leave(" = %ld [key]", PTR_ERR(key));
-			ret = PTR_ERR(key);
-			goto error;
-		}
-		params.key = key;
-	}
+	_enter("");
 
 	/* allocate a superblock info record */
 	ret = -ENOMEM;
-	as = afs_alloc_sbi(&params);
+	as = afs_alloc_sbi(fc);
 	if (!as)
-		goto error_key;
-
-	if (!params.dyn_root) {
-		/* Assume we're going to need a volume record; at the very
-		 * least we can use it to update the volume record if we have
-		 * one already.  This checks that the volume exists within the
-		 * cell.
-		 */
-		candidate = afs_create_volume(&params);
-		if (IS_ERR(candidate)) {
-			ret = PTR_ERR(candidate);
-			goto error_as;
-		}
-
-		as->volume = candidate;
-	}
+		goto error;
+	fc->s_fs_info = as;
 
 	/* allocate a deviceless superblock */
-	sb = sget(fs_type,
-		  as->dyn_root ? afs_dynroot_test_super : afs_test_super,
-		  afs_set_super, flags, as);
+	sb = sget_fc(fc,
+		     as->dyn_root ? afs_dynroot_test_super : afs_test_super,
+		     afs_set_super);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
-		goto error_as;
+		goto error;
 	}
 
 	if (!sb->s_root) {
 		/* initial superblock/root creation */
 		_debug("create");
-		ret = afs_fill_super(sb, &params);
+		ret = afs_fill_super(sb, ctx);
 		if (ret < 0)
 			goto error_sb;
-		as = NULL;
 		sb->s_flags |= SB_ACTIVE;
 	} else {
 		_debug("reuse");
 		ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
-		afs_destroy_sbi(as);
-		as = NULL;
 	}
 
-	afs_put_cell(params.net, params.cell);
-	key_put(params.key);
+	fc->root = dget(sb->s_root);
+	trace_afs_get_tree(as->cell, as->volume);
 	_leave(" = 0 [%p]", sb);
-	return dget(sb->s_root);
+	return 0;
 
 error_sb:
 	deactivate_locked_super(sb);
-	goto error_key;
-error_as:
-	afs_destroy_sbi(as);
-error_key:
-	key_put(params.key);
 error:
-	afs_put_cell(params.net, params.cell);
 	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	return ret;
+}
+
+static void afs_free_fc(struct fs_context *fc)
+{
+	struct afs_fs_context *ctx = fc->fs_private;
+
+	afs_destroy_sbi(fc->s_fs_info);
+	afs_put_volume(ctx->cell, ctx->volume);
+	afs_put_cell(ctx->net, ctx->cell);
+	key_put(ctx->key);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations afs_context_ops = {
+	.free		= afs_free_fc,
+	.parse_param	= afs_parse_param,
+	.get_tree	= afs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int afs_init_fs_context(struct fs_context *fc)
+{
+	struct afs_fs_context *ctx;
+	struct afs_cell *cell;
+
+	ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->type = AFSVL_ROVOL;
+	ctx->net = afs_net(fc->net_ns);
+
+	/* Default to the workstation cell. */
+	rcu_read_lock();
+	cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
+	rcu_read_unlock();
+	if (IS_ERR(cell))
+		cell = NULL;
+	ctx->cell = cell;
+
+	fc->fs_private = ctx;
+	fc->ops = &afs_context_ops;
+	return 0;
 }
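
Taken together, the conversion gives kAFS the standard fs_context life cycle.
In outline, with an example invocation using the source grammar above:

	/*
	 * mount(2) or fsconfig(2)
	 *   -> afs_init_fs_context()	allocate ctx; default to AFSVL_ROVOL
	 *				and the workstation cell
	 *   -> afs_parse_param()	once per option, "source" included
	 *   -> afs_get_tree()		afs_validate_fc(), sget_fc(), and
	 *				afs_fill_super() on first use
	 *   -> afs_free_fc()		drops the ctx references whether or
	 *				not the mount succeeded
	 *
	 * e.g.  mount -t afs "#example.org:root.cell" /mnt -o flock=openafs
	 */
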
 
 /*
@@ -636,24 +680,23 @@
 	vnode->volume		= NULL;
 	vnode->lock_key		= NULL;
 	vnode->permit_cache	= NULL;
-	vnode->cb_interest	= NULL;
+	RCU_INIT_POINTER(vnode->cb_interest, NULL);
 #ifdef CONFIG_AFS_FSCACHE
 	vnode->cache		= NULL;
 #endif
 
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
-	vnode->cb_type		= 0;
 	vnode->lock_state	= AFS_VNODE_LOCK_NONE;
 
+	init_rwsem(&vnode->rmdir_lock);
+
 	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
 }
 
-static void afs_i_callback(struct rcu_head *head)
+static void afs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	kmem_cache_free(afs_inode_cachep, vnode);
+	kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode));
 }
 
 /*
@@ -663,13 +706,12 @@
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+	_enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode);
 
 	_debug("DESTROY INODE %p", inode);
 
-	ASSERTCMP(vnode->cb_interest, ==, NULL);
+	ASSERTCMP(rcu_access_pointer(vnode->cb_interest), ==, NULL);
 
-	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
 }
 
@@ -701,7 +743,7 @@
 		return PTR_ERR(key);
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
 		fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
@@ -709,7 +751,6 @@
 		}
 
 		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
new file mode 100644
index 0000000..21eb0c0
--- /dev/null
+++ b/fs/afs/vl_list.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* AFS vlserver list management.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
+					unsigned short port)
+{
+	struct afs_vlserver *vlserver;
+
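+	/* Allocate room for a NUL after the name; kzalloc() zeroes the
+	 * record, so the terminator is in place once the name is copied in.
+	 */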
+	vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
+			   GFP_KERNEL);
+	if (vlserver) {
+		atomic_set(&vlserver->usage, 1);
+		rwlock_init(&vlserver->lock);
+		init_waitqueue_head(&vlserver->probe_wq);
+		spin_lock_init(&vlserver->probe_lock);
+		vlserver->name_len = name_len;
+		vlserver->port = port;
+		memcpy(vlserver->name, name, name_len);
+	}
+	return vlserver;
+}
+
+static void afs_vlserver_rcu(struct rcu_head *rcu)
+{
+	struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
+
+	afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+	kfree_rcu(vlserver, rcu);
+}
+
+void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
+{
+	if (vlserver) {
+		unsigned int u = atomic_dec_return(&vlserver->usage);
+		//_debug("VL PUT %p{%u}", vlserver, u);
+
+		if (u == 0)
+			call_rcu(&vlserver->rcu, afs_vlserver_rcu);
+	}
+}
+
+struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
+{
+	struct afs_vlserver_list *vllist;
+
+	vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
+	if (vllist) {
+		atomic_set(&vllist->usage, 1);
+		rwlock_init(&vllist->lock);
+	}
+
+	return vllist;
+}
+
+void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
+{
+	if (vllist) {
+		unsigned int u = atomic_dec_return(&vllist->usage);
+
+		//_debug("VLLS PUT %p{%u}", vllist, u);
+		if (u == 0) {
+			int i;
+
+			for (i = 0; i < vllist->nr_servers; i++) {
+				afs_put_vlserver(net, vllist->servers[i].server);
+			}
+			kfree_rcu(vllist, rcu);
+		}
+	}
+}
+
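+/*
+ * Pull the next little-endian 16-bit value out of the DNS-supplied record
+ * and advance the caller's cursor past it.
+ */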
+static u16 afs_extract_le16(const u8 **_b)
+{
+	u16 val;
+
+	val  = (u16)*(*_b)++ << 0;
+	val |= (u16)*(*_b)++ << 8;
+	return val;
+}
+
+/*
+ * Build a VL server address list from a DNS-queried server list.
+ */
+static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+						  u8 nr_addrs, u16 port)
+{
+	struct afs_addr_list *alist;
+	const u8 *b = *_b;
+	int ret = -EINVAL;
+
+	alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+	if (!alist)
+		return ERR_PTR(-ENOMEM);
+	if (nr_addrs == 0)
+		return alist;
+
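+	/* Every remaining address needs at least its one-byte type header, so
+	 * stop early if fewer bytes than addresses are left.
+	 */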
+	for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) {
+		struct dns_server_list_v1_address hdr;
+		__be32 x[4];
+
+		hdr.address_type = *b++;
+
+		switch (hdr.address_type) {
+		case DNS_ADDRESS_IS_IPV4:
+			if (end - b < 4) {
+				_leave(" = -EINVAL [short inet]");
+				goto error;
+			}
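+			/* Bounce through an aligned buffer as the payload
+			 * bytes may not be aligned for a __be32 load.
+			 */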
+			memcpy(x, b, 4);
+			afs_merge_fs_addr4(alist, x[0], port);
+			b += 4;
+			break;
+
+		case DNS_ADDRESS_IS_IPV6:
+			if (end - b < 16) {
+				_leave(" = -EINVAL [short inet6]");
+				goto error;
+			}
+			memcpy(x, b, 16);
+			afs_merge_fs_addr6(alist, x, port);
+			b += 16;
+			break;
+
+		default:
+			_leave(" = -EADDRNOTAVAIL [unknown af %u]",
+			       hdr.address_type);
+			ret = -EADDRNOTAVAIL;
+			goto error;
+		}
+	}
+
+	/* Start with IPv6 if available.  The list keeps IPv4 addresses first,
+	 * so nr_ipv4 indexes the first IPv6 entry, if there is one.
+	 */
+	if (alist->nr_ipv4 < alist->nr_addrs)
+		alist->preferred = alist->nr_ipv4;
+
+	*_b = b;
+	return alist;
+
+error:
+	*_b = b;
+	afs_put_addrlist(alist);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Build a VL server list from a DNS-queried server list.
+ */
+struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
+						    const void *buffer,
+						    size_t buffer_size)
+{
+	const struct dns_server_list_v1_header *hdr = buffer;
+	struct dns_server_list_v1_server bs;
+	struct afs_vlserver_list *vllist, *previous;
+	struct afs_addr_list *addrs;
+	struct afs_vlserver *server;
+	const u8 *b = buffer, *end = buffer + buffer_size;
+	int ret = -ENOMEM, nr_servers, i, j;
+
+	_enter("");
+
+	/* Check that it's a server list, v1 */
+	if (end - b < sizeof(*hdr) ||
+	    hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST ||
+	    hdr->hdr.version != 1) {
+		pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n",
+			  hdr->hdr.content, hdr->hdr.version, end - b);
+		ret = -EDESTADDRREQ;
+		goto dump;
+	}
+
+	nr_servers = hdr->nr_servers;
+
+	vllist = afs_alloc_vlserver_list(nr_servers);
+	if (!vllist)
+		return ERR_PTR(-ENOMEM);
+
+	vllist->source = (hdr->source < NR__dns_record_source) ?
+		hdr->source : NR__dns_record_source;
+	vllist->status = (hdr->status < NR__dns_lookup_status) ?
+		hdr->status : NR__dns_lookup_status;
+
+	read_lock(&cell->vl_servers_lock);
+	previous = afs_get_vlserverlist(
+		rcu_dereference_protected(cell->vl_servers,
+					  lockdep_is_held(&cell->vl_servers_lock)));
+	read_unlock(&cell->vl_servers_lock);
+
+	b += sizeof(*hdr);
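+	/* Each record is a fixed-size block of four 16-bit LE fields and four
+	 * single bytes, followed by the server name and its address list.
+	 */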
+	while (end - b >= sizeof(bs)) {
+		bs.name_len	= afs_extract_le16(&b);
+		bs.priority	= afs_extract_le16(&b);
+		bs.weight	= afs_extract_le16(&b);
+		bs.port		= afs_extract_le16(&b);
+		bs.source	= *b++;
+		bs.status	= *b++;
+		bs.protocol	= *b++;
+		bs.nr_addrs	= *b++;
+
+		_debug("extract %u %u %u %u %u %u %*.*s",
+		       bs.name_len, bs.priority, bs.weight,
+		       bs.port, bs.protocol, bs.nr_addrs,
+		       bs.name_len, bs.name_len, b);
+
+		if (end - b < bs.name_len)
+			break;
+
+		ret = -EPROTONOSUPPORT;
+		if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) {
+			bs.protocol = DNS_SERVER_PROTOCOL_UDP;
+		} else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) {
+			_leave(" = [proto %u]", bs.protocol);
+			goto error;
+		}
+
+		if (bs.port == 0)
+			bs.port = AFS_VL_PORT;
+		if (bs.source > NR__dns_record_source)
+			bs.source = NR__dns_record_source;
+		if (bs.status > NR__dns_lookup_status)
+			bs.status = NR__dns_lookup_status;
+
+		/* See if we can update an old server record */
+		server = NULL;
+		for (i = 0; i < previous->nr_servers; i++) {
+			struct afs_vlserver *p = previous->servers[i].server;
+
+			if (p->name_len == bs.name_len &&
+			    p->port == bs.port &&
+			    strncasecmp(b, p->name, bs.name_len) == 0) {
+				server = afs_get_vlserver(p);
+				break;
+			}
+		}
+
+		if (!server) {
+			ret = -ENOMEM;
+			server = afs_alloc_vlserver(b, bs.name_len, bs.port);
+			if (!server)
+				goto error;
+		}
+
+		b += bs.name_len;
+
+		/* Extract the addresses - note that we can't skip this as we
+		 * have to advance the payload pointer.
+		 */
+		addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+		if (IS_ERR(addrs)) {
+			ret = PTR_ERR(addrs);
+			goto error_2;
+		}
+
+		if (vllist->nr_servers >= nr_servers) {
+			_debug("skip %u >= %u", vllist->nr_servers, nr_servers);
+			afs_put_addrlist(addrs);
+			afs_put_vlserver(cell->net, server);
+			continue;
+		}
+
+		addrs->source = bs.source;
+		addrs->status = bs.status;
+
+		if (addrs->nr_addrs == 0) {
+			afs_put_addrlist(addrs);
+			if (!rcu_access_pointer(server->addresses)) {
+				afs_put_vlserver(cell->net, server);
+				continue;
+			}
+		} else {
+			struct afs_addr_list *old = addrs;
+
+			write_lock(&server->lock);
+			rcu_swap_protected(server->addresses, old,
+					   lockdep_is_held(&server->lock));
+			write_unlock(&server->lock);
+			afs_put_addrlist(old);
+		}
+
+		/* TODO: Might want to check for duplicates */
+
+		/* Insertion-sort by priority and weight */
+		for (j = 0; j < vllist->nr_servers; j++) {
+			if (bs.priority < vllist->servers[j].priority)
+				break; /* Lower preferable */
+			if (bs.priority == vllist->servers[j].priority &&
+			    bs.weight > vllist->servers[j].weight)
+				break; /* Higher preferable */
+		}
+
+		if (j < vllist->nr_servers) {
+			memmove(vllist->servers + j + 1,
+				vllist->servers + j,
+				(vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry));
+		}
+
+		clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+
+		vllist->servers[j].priority = bs.priority;
+		vllist->servers[j].weight = bs.weight;
+		vllist->servers[j].server = server;
+		vllist->nr_servers++;
+	}
+
+	if (b != end) {
+		_debug("parse error %zd", b - end);
+		goto error;
+	}
+
+	afs_put_vlserverlist(cell->net, previous);
+	_leave(" = ok [%u]", vllist->nr_servers);
+	return vllist;
+
+error_2:
+	afs_put_vlserver(cell->net, server);
+error:
+	afs_put_vlserverlist(cell->net, vllist);
+	afs_put_vlserverlist(cell->net, previous);
+dump:
+	if (ret != -ENOMEM) {
+		printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer);
+		print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size);
+	}
+	return ERR_PTR(ret);
+}
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
new file mode 100644
index 0000000..858498c
--- /dev/null
+++ b/fs/afs/vl_probe.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* AFS vlserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
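+/*
+ * Note the completion of one probe call.  Whoever retires the last
+ * outstanding probe clears the PROBING flag and wakes any waiters.
+ */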
+static bool afs_vl_probe_done(struct afs_vlserver *server)
+{
+	if (!atomic_dec_and_test(&server->probe_outstanding))
+		return false;
+
+	wake_up_var(&server->probe_outstanding);
+	clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
+	wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
+	return true;
+}
+
+/*
+ * Process the result of probing a vlserver.  This is called after successful
+ * or failed delivery of a VL.GetCapabilities operation.
+ */
+void afs_vlserver_probe_result(struct afs_call *call)
+{
+	struct afs_addr_list *alist = call->alist;
+	struct afs_vlserver *server = call->vlserver;
+	unsigned int server_index = call->server_index;
+	unsigned int index = call->addr_ix;
+	unsigned int rtt = UINT_MAX;
+	bool have_result = false;
+	u64 _rtt;
+	int ret = call->error;
+
+	_enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
+
+	spin_lock(&server->probe_lock);
+
+	switch (ret) {
+	case 0:
+		server->probe.error = 0;
+		goto responded;
+	case -ECONNABORTED:
+		if (!server->probe.responded) {
+			server->probe.abort_code = call->abort_code;
+			server->probe.error = ret;
+		}
+		goto responded;
+	case -ENOMEM:
+	case -ENONET:
+		server->probe.local_failure = true;
+		afs_io_error(call, afs_io_error_vl_probe_fail);
+		goto out;
+	case -ECONNRESET: /* Responded, but call expired. */
+	case -ERFKILL:
+	case -EADDRNOTAVAIL:
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -EHOSTDOWN:
+	case -ECONNREFUSED:
+	case -ETIMEDOUT:
+	case -ETIME:
+	default:
+		clear_bit(index, &alist->responded);
+		set_bit(index, &alist->failed);
+		if (!server->probe.responded &&
+		    (server->probe.error == 0 ||
+		     server->probe.error == -ETIMEDOUT ||
+		     server->probe.error == -ETIME))
+			server->probe.error = ret;
+		afs_io_error(call, afs_io_error_vl_probe_fail);
+		goto out;
+	}
+
+responded:
+	set_bit(index, &alist->responded);
+	clear_bit(index, &alist->failed);
+
+	if (call->service_id == YFS_VL_SERVICE) {
+		server->probe.is_yfs = true;
+		set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+		alist->addrs[index].srx_service = call->service_id;
+	} else {
+		server->probe.not_yfs = true;
+		if (!server->probe.is_yfs) {
+			clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+			alist->addrs[index].srx_service = call->service_id;
+		}
+	}
+
+	/* Get the RTT and scale it to fit into a 32-bit value that represents
+	 * over a minute of time so that we can access it with one instruction
+	 * on a 32-bit system.
+	 */
+	_rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+	_rtt /= 64;
+	rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
+	if (rtt < server->probe.rtt) {
+		server->probe.rtt = rtt;
+		alist->preferred = index;
+		have_result = true;
+	}
+
+	smp_wmb(); /* Set rtt before responded. */
+	server->probe.responded = true;
+	set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+out:
+	spin_unlock(&server->probe_lock);
+
+	_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+	       server_index, index, &alist->addrs[index].transport,
+	       rtt, ret);
+
+	have_result |= afs_vl_probe_done(server);
+	if (have_result) {
+		server->probe.have_result = true;
+		wake_up_var(&server->probe.have_result);
+		wake_up_all(&server->probe_wq);
+	}
+}
+
+/*
+ * Probe all of a vlserver's addresses to find out the best route and to
+ * query its capabilities.
+ */
+static bool afs_do_probe_vlserver(struct afs_net *net,
+				  struct afs_vlserver *server,
+				  struct key *key,
+				  unsigned int server_index,
+				  struct afs_error *_e)
+{
+	struct afs_addr_cursor ac = {
+		.index = 0,
+	};
+	struct afs_call *call;
+	bool in_progress = false;
+
+	_enter("%s", server->name);
+
+	read_lock(&server->lock);
+	ac.alist = rcu_dereference_protected(server->addresses,
+					     lockdep_is_held(&server->lock));
+	read_unlock(&server->lock);
+
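+	/* One asynchronous call is fired at each address; probe_outstanding
+	 * counts them back in as the replies or failures arrive.
+	 */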
+	atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+	memset(&server->probe, 0, sizeof(server->probe));
+	server->probe.rtt = UINT_MAX;
+
+	for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+		call = afs_vl_get_capabilities(net, &ac, key, server,
+					       server_index);
+		if (!IS_ERR(call)) {
+			afs_put_call(call);
+			in_progress = true;
+		} else {
+			afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+		}
+	}
+
+	if (!in_progress)
+		afs_vl_probe_done(server);
+	return in_progress;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ */
+int afs_send_vl_probes(struct afs_net *net, struct key *key,
+		       struct afs_vlserver_list *vllist)
+{
+	struct afs_vlserver *server;
+	struct afs_error e;
+	bool in_progress = false;
+	int i;
+
+	e.error = 0;
+	e.responded = false;
+	for (i = 0; i < vllist->nr_servers; i++) {
+		server = vllist->servers[i].server;
+		if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
+			continue;
+
+		if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) &&
+		    afs_do_probe_vlserver(net, server, key, i, &e))
+			in_progress = true;
+	}
+
+	return in_progress ? 0 : e.error;
+}
+
+/*
+ * Wait for the first as-yet untried server to respond.
+ */
+int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
+			   unsigned long untried)
+{
+	struct wait_queue_entry *waits;
+	struct afs_vlserver *server;
+	unsigned int rtt = UINT_MAX;
+	bool have_responders = false;
+	int pref = -1, i;
+
+	_enter("%u,%lx", vllist->nr_servers, untried);
+
+	/* Only wait for servers that have a probe outstanding. */
+	for (i = 0; i < vllist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = vllist->servers[i].server;
+			if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+				__clear_bit(i, &untried);
+			if (server->probe.responded)
+				have_responders = true;
+		}
+	}
+	if (have_responders || !untried)
+		return 0;
+
+	waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL);
+	if (!waits)
+		return -ENOMEM;
+
+	for (i = 0; i < vllist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = vllist->servers[i].server;
+			init_waitqueue_entry(&waits[i], current);
+			add_wait_queue(&server->probe_wq, &waits[i]);
+		}
+	}
+
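+	/* Sleep until one of the untried servers responds, every probe has
+	 * completed or a signal arrives; recheck the set after each wake-up.
+	 */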
+	for (;;) {
+		bool still_probing = false;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		for (i = 0; i < vllist->nr_servers; i++) {
+			if (test_bit(i, &untried)) {
+				server = vllist->servers[i].server;
+				if (server->probe.responded)
+					goto stop;
+				if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+					still_probing = true;
+			}
+		}
+
+		if (!still_probing || signal_pending(current))
+			goto stop;
+		schedule();
+	}
+
+stop:
+	set_current_state(TASK_RUNNING);
+
+	for (i = 0; i < vllist->nr_servers; i++) {
+		if (test_bit(i, &untried)) {
+			server = vllist->servers[i].server;
+			if (server->probe.responded &&
+			    server->probe.rtt < rtt) {
+				pref = i;
+				rtt = server->probe.rtt;
+			}
+
+			remove_wait_queue(&server->probe_wq, &waits[i]);
+		}
+	}
+
+	kfree(waits);
+
+	if (pref == -1 && signal_pending(current))
+		return -ERESTARTSYS;
+
+	if (pref >= 0)
+		vllist->preferred = pref;
+
+	_leave(" = 0 [%u]", pref);
+	return 0;
+}
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
new file mode 100644
index 0000000..9a5ce96
--- /dev/null
+++ b/fs/afs/vl_rotate.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Handle vlserver selection and rotation.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include "internal.h"
+#include "afs_vl.h"
+
+/*
+ * Begin an operation on a volume location server.
+ */
+bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
+				  struct key *key)
+{
+	memset(vc, 0, sizeof(*vc));
+	vc->cell = cell;
+	vc->key = key;
+	vc->error = -EDESTADDRREQ;
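+	/* SHRT_MAX is a marker meaning "no call made yet"; it sends the first
+	 * afs_select_vlserver() pass to its start label.
+	 */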
+	vc->ac.error = SHRT_MAX;
+
+	if (signal_pending(current)) {
+		vc->error = -EINTR;
+		vc->flags |= AFS_VL_CURSOR_STOP;
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Begin iteration through a server list, starting with the last used server if
+ * possible, or the last recorded good server if not.
+ */
+static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+{
+	struct afs_cell *cell = vc->cell;
+	unsigned int dns_lookup_count;
+
+	if (cell->dns_source == DNS_RECORD_UNAVAILABLE ||
+	    cell->dns_expiry <= ktime_get_real_seconds()) {
+		dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
+		set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+		queue_work(afs_wq, &cell->manager);
+
+		if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+			if (wait_var_event_interruptible(
+				    &cell->dns_lookup_count,
+				    smp_load_acquire(&cell->dns_lookup_count)
+				    != dns_lookup_count) < 0) {
+				vc->error = -ERESTARTSYS;
+				return false;
+			}
+		}
+
+		/* Status load is ordered after lookup counter load */
+		if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+			vc->error = -EDESTADDRREQ;
+			return false;
+		}
+	}
+
+	read_lock(&cell->vl_servers_lock);
+	vc->server_list = afs_get_vlserverlist(
+		rcu_dereference_protected(cell->vl_servers,
+					  lockdep_is_held(&cell->vl_servers_lock)));
+	read_unlock(&cell->vl_servers_lock);
+	if (!vc->server_list->nr_servers)
+		return false;
+
+	vc->untried = (1UL << vc->server_list->nr_servers) - 1;
+	vc->index = -1;
+	return true;
+}
+
+/*
+ * Select the vlserver to use.  May be called multiple times to rotate
+ * through the vlservers.
+ */
+bool afs_select_vlserver(struct afs_vl_cursor *vc)
+{
+	struct afs_addr_list *alist;
+	struct afs_vlserver *vlserver;
+	struct afs_error e;
+	u32 rtt;
+	int error = vc->ac.error, i;
+
+	_enter("%lx[%d],%lx[%d],%d,%d",
+	       vc->untried, vc->index,
+	       vc->ac.tried, vc->ac.index,
+	       error, vc->ac.abort_code);
+
+	if (vc->flags & AFS_VL_CURSOR_STOP) {
+		_leave(" = f [stopped]");
+		return false;
+	}
+
+	vc->nr_iterations++;
+
+	/* Evaluate the result of the previous operation, if there was one. */
+	switch (error) {
+	case SHRT_MAX:
+		goto start;
+
+	default:
+	case 0:
+		/* Success or local failure.  Stop. */
+		vc->error = error;
+		vc->flags |= AFS_VL_CURSOR_STOP;
+		_leave(" = f [okay/local %d]", vc->ac.error);
+		return false;
+
+	case -ECONNABORTED:
+		/* The far side rejected the operation on some grounds.  This
+		 * might involve the server being busy or the volume having
+		 * been moved.
+		 */
+		switch (vc->ac.abort_code) {
+		case AFSVL_IO:
+		case AFSVL_BADVOLOPER:
+		case AFSVL_NOMEM:
+			/* The server went weird. */
+			vc->error = -EREMOTEIO;
+			//write_lock(&vc->cell->vl_servers_lock);
+			//vc->server_list->weird_mask |= 1 << vc->index;
+			//write_unlock(&vc->cell->vl_servers_lock);
+			goto next_server;
+
+		default:
+			vc->error = afs_abort_to_error(vc->ac.abort_code);
+			goto failed;
+		}
+
+	case -ERFKILL:
+	case -EADDRNOTAVAIL:
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -EHOSTDOWN:
+	case -ECONNREFUSED:
+	case -ETIMEDOUT:
+	case -ETIME:
+		_debug("no conn %d", error);
+		vc->error = error;
+		goto iterate_address;
+
+	case -ECONNRESET:
+		_debug("call reset");
+		vc->error = error;
+		vc->flags |= AFS_VL_CURSOR_RETRY;
+		goto next_server;
+	}
+
+restart_from_beginning:
+	_debug("restart");
+	afs_end_cursor(&vc->ac);
+	afs_put_vlserverlist(vc->cell->net, vc->server_list);
+	vc->server_list = NULL;
+	if (vc->flags & AFS_VL_CURSOR_RETRIED)
+		goto failed;
+	vc->flags |= AFS_VL_CURSOR_RETRIED;
+start:
+	_debug("start");
+
+	if (!afs_start_vl_iteration(vc))
+		goto failed;
+
+	error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
+	if (error < 0)
+		goto failed_set_error;
+
+pick_server:
+	_debug("pick [%lx]", vc->untried);
+
+	error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
+	if (error < 0)
+		goto failed_set_error;
+
+	/* Pick the untried server with the lowest RTT. */
+	vc->index = vc->server_list->preferred;
+	if (test_bit(vc->index, &vc->untried))
+		goto selected_server;
+
+	vc->index = -1;
+	rtt = U32_MAX;
+	for (i = 0; i < vc->server_list->nr_servers; i++) {
+		struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+		if (!test_bit(i, &vc->untried) || !s->probe.responded)
+			continue;
+		if (s->probe.rtt < rtt) {
+			vc->index = i;
+			rtt = s->probe.rtt;
+		}
+	}
+
+	if (vc->index == -1)
+		goto no_more_servers;
+
+selected_server:
+	_debug("use %d", vc->index);
+	__clear_bit(vc->index, &vc->untried);
+
+	/* We're starting on a different vlserver from the list.  We need to
+	 * check it, find its address list and probe its capabilities before we
+	 * use it.
+	 */
+	ASSERTCMP(vc->ac.alist, ==, NULL);
+	vlserver = vc->server_list->servers[vc->index].server;
+	vc->server = vlserver;
+
+	_debug("USING VLSERVER: %s", vlserver->name);
+
+	read_lock(&vlserver->lock);
+	alist = rcu_dereference_protected(vlserver->addresses,
+					  lockdep_is_held(&vlserver->lock));
+	afs_get_addrlist(alist);
+	read_unlock(&vlserver->lock);
+
+	memset(&vc->ac, 0, sizeof(vc->ac));
+
+	if (!vc->ac.alist)
+		vc->ac.alist = alist;
+	else
+		afs_put_addrlist(alist);
+
+	vc->ac.index = -1;
+
+iterate_address:
+	ASSERT(vc->ac.alist);
+	/* Iterate over the current server's address list to try and find an
+	 * address on which it will respond to us.
+	 */
+	if (!afs_iterate_addresses(&vc->ac))
+		goto next_server;
+
+	_debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+	_leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+	return true;
+
+next_server:
+	_debug("next");
+	afs_end_cursor(&vc->ac);
+	goto pick_server;
+
+no_more_servers:
+	/* That's all the servers poked to no good effect.  Try again if some
+	 * of them were busy.
+	 */
+	if (vc->flags & AFS_VL_CURSOR_RETRY)
+		goto restart_from_beginning;
+
+	e.error = -EDESTADDRREQ;
+	e.responded = false;
+	for (i = 0; i < vc->server_list->nr_servers; i++) {
+		struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+		afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+				     s->probe.abort_code);
+	}
+
+failed_set_error:
+	vc->error = error;
+failed:
+	vc->flags |= AFS_VL_CURSOR_STOP;
+	afs_end_cursor(&vc->ac);
+	_leave(" = f [failed %d]", vc->error);
+	return false;
+}
+
+/*
+ * Dump cursor state in the case of the error being EDESTADDRREQ.
+ */
+static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+{
+	static int count;
+	int i;
+
+	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+		return;
+	count++;
+
+	rcu_read_lock();
+	pr_notice("EDESTADDRREQ occurred\n");
+	pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
+		  vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+
+	if (vc->server_list) {
+		const struct afs_vlserver_list *sl = vc->server_list;
+		pr_notice("VC: SL nr=%u ix=%u\n",
+			  sl->nr_servers, sl->index);
+		for (i = 0; i < sl->nr_servers; i++) {
+			const struct afs_vlserver *s = sl->servers[i].server;
+			pr_notice("VC: server %s+%hu fl=%lx E=%hd\n",
+				  s->name, s->port, s->flags, s->probe.error);
+			if (s->addresses) {
+				const struct afs_addr_list *a =
+					rcu_dereference(s->addresses);
+				pr_notice("VC:  - nr=%u/%u/%u pf=%u\n",
+					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
+					  a->preferred);
+				pr_notice("VC:  - pr=%lx R=%lx F=%lx\n",
+					  a->probed, a->responded, a->failed);
+				if (a == vc->ac.alist)
+					pr_notice("VC:  - current\n");
+			}
+		}
+	}
+
+	pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+		  vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
+		  vc->ac.responded, vc->ac.nr_iterations);
+	rcu_read_unlock();
+}
+
+/*
+ * Tidy up a volume location server cursor at the end of an operation.
+ */
+int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
+{
+	struct afs_net *net = vc->cell->net;
+
+	if (vc->error == -EDESTADDRREQ ||
+	    vc->error == -EADDRNOTAVAIL ||
+	    vc->error == -ENETUNREACH ||
+	    vc->error == -EHOSTUNREACH)
+		afs_vl_dump_edestaddrreq(vc);
+
+	afs_end_cursor(&vc->ac);
+	afs_put_vlserverlist(net, vc->server_list);
+
+	if (vc->error == -ECONNABORTED)
+		vc->error = afs_abort_to_error(vc->ac.abort_code);
+
+	return vc->error;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index c3b7408..cfb0ac4 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS Volume Location Service client
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/gfp.h>
@@ -34,7 +30,7 @@
 
 	/* unmarshall the reply once we've received all of it */
 	uvldb = call->buffer;
-	entry = call->reply[0];
+	entry = call->ret_vldb;
 
 	nr_servers = ntohl(uvldb->nServers);
 	if (nr_servers > AFS_NMAXNSERVERS)
@@ -60,23 +56,24 @@
 		struct afs_uuid__xdr *xdr;
 		struct afs_uuid *uuid;
 		int j;
+		int n = entry->nr_servers;
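+		/* Usable servers are packed towards the front of the arrays,
+		 * so index by the count accepted so far rather than by i.
+		 */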
 
 		tmp = ntohl(uvldb->serverFlags[i]);
 		if (tmp & AFS_VLSF_DONTUSE ||
 		    (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
 			continue;
 		if (tmp & AFS_VLSF_RWVOL) {
-			entry->fs_mask[i] |= AFS_VOL_VTM_RW;
+			entry->fs_mask[n] |= AFS_VOL_VTM_RW;
 			if (vlflags & AFS_VLF_BACKEXISTS)
-				entry->fs_mask[i] |= AFS_VOL_VTM_BAK;
+				entry->fs_mask[n] |= AFS_VOL_VTM_BAK;
 		}
 		if (tmp & AFS_VLSF_ROVOL)
-			entry->fs_mask[i] |= AFS_VOL_VTM_RO;
-		if (!entry->fs_mask[i])
+			entry->fs_mask[n] |= AFS_VOL_VTM_RO;
+		if (!entry->fs_mask[n])
 			continue;
 
 		xdr = &uvldb->serverNumber[i];
-		uuid = (struct afs_uuid *)&entry->fs_server[i];
+		uuid = (struct afs_uuid *)&entry->fs_server[n];
 		uuid->time_low			= xdr->time_low;
 		uuid->time_mid			= htons(ntohl(xdr->time_mid));
 		uuid->time_hi_and_version	= htons(ntohl(xdr->time_hi_and_version));
@@ -110,7 +107,7 @@
 
 static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
 {
-	kfree(call->reply[0]);
+	kfree(call->ret_vldb);
 	afs_flat_call_destructor(call);
 }
 
@@ -128,14 +125,13 @@
  * Dispatch a get volume entry by name or ID operation (uuid variant).  If the
  * volname is a decimal number then it's a volume ID not a volume name.
  */
-struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
-						  struct afs_addr_cursor *ac,
-						  struct key *key,
+struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
 						  const char *volname,
 						  int volnamesz)
 {
 	struct afs_vldb_entry *entry;
 	struct afs_call *call;
+	struct afs_net *net = vc->cell->net;
 	size_t reqsz, padsz;
 	__be32 *bp;
 
@@ -155,9 +151,9 @@
 		return ERR_PTR(-ENOMEM);
 	}
 
-	call->key = key;
-	call->reply[0] = entry;
-	call->ret_reply0 = true;
+	call->key = vc->key;
+	call->ret_vldb = entry;
+	call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
 	/* Marshall the parameters */
 	bp = call->request;
@@ -168,7 +164,8 @@
 		memset((void *)bp + volnamesz, 0, padsz);
 
 	trace_afs_make_vl_call(call);
-	return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false);
+	afs_make_call(&vc->ac, call, GFP_KERNEL);
+	return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
 }
 
 /*
@@ -187,19 +184,20 @@
 	u32 uniquifier, nentries, count;
 	int i, ret;
 
-	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+	_enter("{%u,%zu/%u}",
+	       call->unmarshall, iov_iter_count(call->_iter), call->count);
 
-again:
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_buf(call,
+				   sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32));
 		call->unmarshall++;
 
-		/* Extract the returned uuid, uniquifier, nentries and blkaddrs size */
+		/* Extract the returned uuid, uniquifier, nentries and
+		 * blkaddrs size */
+		/* Fall through */
 	case 1:
-		ret = afs_extract_data(call, call->buffer,
-				       sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32),
-				       true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
@@ -213,31 +211,31 @@
 		if (!alist)
 			return -ENOMEM;
 		alist->version = uniquifier;
-		call->reply[0] = alist;
+		call->ret_alist = alist;
 		call->count = count;
 		call->count2 = nentries;
-		call->offset = 0;
 		call->unmarshall++;
 
-		/* Extract entries */
-	case 2:
+	more_entries:
 		count = min(call->count, 4U);
-		ret = afs_extract_data(call, call->buffer,
-				       count * sizeof(__be32),
-				       call->count > 4);
+		afs_extract_to_buf(call, count * sizeof(__be32));
+
+		/* Fall through - and extract entries */
+	case 2:
+		ret = afs_extract_data(call, call->count > 4);
 		if (ret < 0)
 			return ret;
 
-		alist = call->reply[0];
+		alist = call->ret_alist;
 		bp = call->buffer;
+		count = min(call->count, 4U);
 		for (i = 0; i < count; i++)
 			if (alist->nr_addrs < call->count2)
 				afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
 
 		call->count -= count;
 		if (call->count > 0)
-			goto again;
-		call->offset = 0;
+			goto more_entries;
 		call->unmarshall++;
 		break;
 	}
@@ -248,8 +246,7 @@
 
 static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
 {
-	afs_put_server(call->net, (struct afs_server *)call->reply[0]);
-	kfree(call->reply[1]);
+	afs_put_addrlist(call->ret_alist);
 	return afs_flat_call_destructor(call);
 }
 
@@ -267,14 +264,13 @@
  * Dispatch an operation to get the addresses for a server, where the server is
  * nominated by UUID.
  */
-struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
-					 struct afs_addr_cursor *ac,
-					 struct key *key,
+struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
 					 const uuid_t *uuid)
 {
 	struct afs_ListAddrByAttributes__xdr *r;
 	const struct afs_uuid *u = (const struct afs_uuid *)uuid;
 	struct afs_call *call;
+	struct afs_net *net = vc->cell->net;
 	__be32 *bp;
 	int i;
 
@@ -286,9 +282,9 @@
 	if (!call)
 		return ERR_PTR(-ENOMEM);
 
-	call->key = key;
-	call->reply[0] = NULL;
-	call->ret_reply0 = true;
+	call->key = vc->key;
+	call->ret_alist = NULL;
+	call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
 	/* Marshall the parameters */
 	bp = call->request;
@@ -307,7 +303,8 @@
 		r->uuid.node[i] = htonl(u->node[i]);
 
 	trace_afs_make_vl_call(call);
-	return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+	afs_make_call(&vc->ac, call, GFP_KERNEL);
+	return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
 }
 
 /*
@@ -318,54 +315,49 @@
 	u32 count;
 	int ret;
 
-	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+	_enter("{%u,%zu/%u}",
+	       call->unmarshall, iov_iter_count(call->_iter), call->count);
 
-again:
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_tmp(call);
 		call->unmarshall++;
 
-		/* Extract the capabilities word count */
+		/* Fall through - and extract the capabilities word count */
 	case 1:
-		ret = afs_extract_data(call, &call->tmp,
-				       1 * sizeof(__be32),
-				       true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		count = ntohl(call->tmp);
-
 		call->count = count;
 		call->count2 = count;
-		call->offset = 0;
-		call->unmarshall++;
 
-		/* Extract capabilities words */
+		call->unmarshall++;
+		afs_extract_discard(call, count * sizeof(__be32));
+
+		/* Fall through - and extract capabilities words */
 	case 2:
-		count = min(call->count, 16U);
-		ret = afs_extract_data(call, call->buffer,
-				       count * sizeof(__be32),
-				       call->count > 16);
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 
 		/* TODO: Examine capabilities */
 
-		call->count -= count;
-		if (call->count > 0)
-			goto again;
-		call->offset = 0;
 		call->unmarshall++;
 		break;
 	}
 
-	call->reply[0] = (void *)(unsigned long)call->service_id;
-
 	_leave(" = 0 [done]");
 	return 0;
 }
 
+static void afs_destroy_vl_get_capabilities(struct afs_call *call)
+{
+	afs_put_vlserver(call->net, call->vlserver);
+	afs_flat_call_destructor(call);
+}
+
 /*
  * VL.GetCapabilities operation type
  */
@@ -373,19 +365,22 @@
 	.name		= "VL.GetCapabilities",
 	.op		= afs_VL_GetCapabilities,
 	.deliver	= afs_deliver_vl_get_capabilities,
-	.destructor	= afs_flat_call_destructor,
+	.done		= afs_vlserver_probe_result,
+	.destructor	= afs_destroy_vl_get_capabilities,
 };
 
 /*
- * Probe a fileserver for the capabilities that it supports.  This can
+ * Probe a volume server for the capabilities that it supports.  This can
  * return up to 196 words.
  *
  * We use this to probe for service upgrade to determine what the server at the
  * other end supports.
  */
-int afs_vl_get_capabilities(struct afs_net *net,
-			    struct afs_addr_cursor *ac,
-			    struct key *key)
+struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
+					 struct afs_addr_cursor *ac,
+					 struct key *key,
+					 struct afs_vlserver *server,
+					 unsigned int server_index)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -394,12 +389,14 @@
 
 	call = afs_alloc_flat_call(net, &afs_RXVLGetCapabilities, 1 * 4, 16 * 4);
 	if (!call)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	call->key = key;
-	call->upgrade = true; /* Let's see if this is a YFS server */
-	call->reply[0] = (void *)VLGETCAPABILITIES;
-	call->ret_reply0 = true;
+	call->vlserver = afs_get_vlserver(server);
+	call->server_index = server_index;
+	call->upgrade = true;
+	call->async = true;
+	call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -407,7 +404,8 @@
 
 	/* Can't take a ref on server */
 	trace_afs_make_vl_call(call);
-	return afs_make_call(ac, call, GFP_KERNEL, false);
+	afs_make_call(ac, call, GFP_KERNEL);
+	return call;
 }
 
 /*
@@ -426,22 +424,20 @@
 	u32 uniquifier, size;
 	int ret;
 
-	_enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2);
+	_enter("{%u,%zu,%u}",
+	       call->unmarshall, iov_iter_count(call->_iter), call->count2);
 
-again:
 	switch (call->unmarshall) {
 	case 0:
-		call->offset = 0;
+		afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32));
 		call->unmarshall = 1;
 
 		/* Extract the returned uuid, uniquifier, fsEndpoints count and
 		 * either the first fsEndpoint type or the volEndpoints
 		 * count if there are no fsEndpoints. */
+		/* Fall through */
 	case 1:
-		ret = afs_extract_data(call, call->buffer,
-				       sizeof(uuid_t) +
-				       3 * sizeof(__be32),
-				       true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
@@ -451,22 +447,19 @@
 		call->count2	= ntohl(*bp); /* Type or next count */
 
 		if (call->count > YFS_MAXENDPOINTS)
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_fsendpt_num);
 
 		alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
 		if (!alist)
 			return -ENOMEM;
 		alist->version = uniquifier;
-		call->reply[0] = alist;
-		call->offset = 0;
+		call->ret_alist = alist;
 
 		if (call->count == 0)
 			goto extract_volendpoints;
 
-		call->unmarshall = 2;
-
-		/* Extract fsEndpoints[] entries */
-	case 2:
+	next_fsendpoint:
 		switch (call->count2) {
 		case YFS_ENDPOINT_IPV4:
 			size = sizeof(__be32) * (1 + 1 + 1);
@@ -475,31 +468,40 @@
 			size = sizeof(__be32) * (1 + 4 + 1);
 			break;
 		default:
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_fsendpt_type);
 		}
 
 		size += sizeof(__be32);
-		ret = afs_extract_data(call, call->buffer, size, true);
+		afs_extract_to_buf(call, size);
+		call->unmarshall = 2;
+
+		/* Fall through - and extract fsEndpoints[] entries */
+	case 2:
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
-		alist = call->reply[0];
+		alist = call->ret_alist;
 		bp = call->buffer;
 		switch (call->count2) {
 		case YFS_ENDPOINT_IPV4:
 			if (ntohl(bp[0]) != sizeof(__be32) * 2)
-				return afs_protocol_error(call, -EBADMSG);
+				return afs_protocol_error(call, -EBADMSG,
+							  afs_eproto_yvl_fsendpt4_len);
 			afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
 			bp += 3;
 			break;
 		case YFS_ENDPOINT_IPV6:
 			if (ntohl(bp[0]) != sizeof(__be32) * 5)
-				return afs_protocol_error(call, -EBADMSG);
+				return afs_protocol_error(call, -EBADMSG,
+							  afs_eproto_yvl_fsendpt6_len);
 			afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
 			bp += 6;
 			break;
 		default:
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_fsendpt_type);
 		}
 
 		/* Got either the type of the next entry or the count of
@@ -507,10 +509,9 @@
 		 */
 		call->count2 = ntohl(*bp++);
 
-		call->offset = 0;
 		call->count--;
 		if (call->count > 0)
-			goto again;
+			goto next_fsendpoint;
 
 	extract_volendpoints:
 		/* Extract the list of volEndpoints. */
@@ -518,26 +519,26 @@
 		if (!call->count)
 			goto end;
 		if (call->count > YFS_MAXENDPOINTS)
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_vlendpt_type);
 
+		afs_extract_to_buf(call, 1 * sizeof(__be32));
 		call->unmarshall = 3;
 
 		/* Extract the type of volEndpoints[0].  Normally we would
 		 * extract the type of the next endpoint when we extract the
 		 * data of the current one, but this is the first...
 		 */
+		/* Fall through */
 	case 3:
-		ret = afs_extract_data(call, call->buffer, sizeof(__be32), true);
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
 		bp = call->buffer;
-		call->count2 = ntohl(*bp++);
-		call->offset = 0;
-		call->unmarshall = 4;
 
-		/* Extract volEndpoints[] entries */
-	case 4:
+	next_volendpoint:
+		call->count2 = ntohl(*bp++);
 		switch (call->count2) {
 		case YFS_ENDPOINT_IPV4:
 			size = sizeof(__be32) * (1 + 1 + 1);
@@ -546,12 +547,18 @@
 			size = sizeof(__be32) * (1 + 4 + 1);
 			break;
 		default:
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_vlendpt_type);
 		}
 
 		if (call->count > 1)
-			size += sizeof(__be32);
-		ret = afs_extract_data(call, call->buffer, size, true);
+			size += sizeof(__be32); /* Get next type too */
+		afs_extract_to_buf(call, size);
+		call->unmarshall = 4;
+
+		/* Fall through - and extract volEndpoints[] entries */
+	case 4:
+		ret = afs_extract_data(call, true);
 		if (ret < 0)
 			return ret;
 
@@ -559,34 +566,35 @@
 		switch (call->count2) {
 		case YFS_ENDPOINT_IPV4:
 			if (ntohl(bp[0]) != sizeof(__be32) * 2)
-				return afs_protocol_error(call, -EBADMSG);
+				return afs_protocol_error(call, -EBADMSG,
+							  afs_eproto_yvl_vlendpt4_len);
 			bp += 3;
 			break;
 		case YFS_ENDPOINT_IPV6:
 			if (ntohl(bp[0]) != sizeof(__be32) * 5)
-				return afs_protocol_error(call, -EBADMSG);
+				return afs_protocol_error(call, -EBADMSG,
+							  afs_eproto_yvl_vlendpt6_len);
 			bp += 6;
 			break;
 		default:
-			return afs_protocol_error(call, -EBADMSG);
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_yvl_vlendpt_type);
 		}
 
 		/* Got either the type of the next entry or the count of
 		 * volEndpoints if no more fsEndpoints.
 		 */
-		call->offset = 0;
 		call->count--;
-		if (call->count > 0) {
-			call->count2 = ntohl(*bp++);
-			goto again;
-		}
+		if (call->count > 0)
+			goto next_volendpoint;
 
 	end:
+		afs_extract_discard(call, 0);
 		call->unmarshall = 5;
 
-		/* Done */
+		/* Fall through - Done */
 	case 5:
-		ret = afs_extract_data(call, call->buffer, 0, false);
+		ret = afs_extract_data(call, false);
 		if (ret < 0)
 			return ret;
 		call->unmarshall = 6;
@@ -595,12 +603,6 @@
 		break;
 	}
 
-	alist = call->reply[0];
-
-	/* Start with IPv6 if available. */
-	if (alist->nr_ipv4 < alist->nr_addrs)
-		alist->index = alist->nr_ipv4;
-
 	_leave(" = 0 [done]");
 	return 0;
 }
@@ -619,12 +621,11 @@
  * Dispatch an operation to get the addresses for a server, where the server is
  * nominated by UUID.
  */
-struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
-					      struct afs_addr_cursor *ac,
-					      struct key *key,
+struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
 					      const uuid_t *uuid)
 {
 	struct afs_call *call;
+	struct afs_net *net = vc->cell->net;
 	__be32 *bp;
 
 	_enter("");
@@ -635,9 +636,9 @@
 	if (!call)
 		return ERR_PTR(-ENOMEM);
 
-	call->key = key;
-	call->reply[0] = NULL;
-	call->ret_reply0 = true;
+	call->key = vc->key;
+	call->ret_alist = NULL;
+	call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
 	/* Marshall the parameters */
 	bp = call->request;
@@ -646,5 +647,6 @@
 	memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
 
 	trace_afs_make_vl_call(call);
-	return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+	afs_make_call(&vc->ac, call, GFP_KERNEL);
+	return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 3037bd0..92ca5e2 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* AFS volume management
  *
  * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
@@ -16,12 +12,10 @@
 unsigned __read_mostly afs_volume_gc_delay = 10;
 unsigned __read_mostly afs_volume_record_life = 60 * 60;
 
-static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
-
 /*
  * Allocate a volume record and load it up from a vldb record.
  */
-static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
 					   struct afs_vldb_entry *vldb,
 					   unsigned long type_mask)
 {
@@ -47,6 +41,7 @@
 	atomic_set(&volume->usage, 1);
 	INIT_LIST_HEAD(&volume->proc_link);
 	rwlock_init(&volume->servers_lock);
+	rwlock_init(&volume->cb_v_break_lock);
 	memcpy(volume->name, vldb->name, vldb->name_len + 1);
 
 	slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
@@ -74,55 +69,19 @@
 						 const char *volname,
 						 size_t volnamesz)
 {
-	struct afs_addr_cursor ac;
-	struct afs_vldb_entry *vldb;
+	struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ);
+	struct afs_vl_cursor vc;
 	int ret;
 
-	ret = afs_set_vl_cursor(&ac, cell);
-	if (ret < 0)
-		return ERR_PTR(ret);
+	if (!afs_begin_vlserver_operation(&vc, cell, key))
+		return ERR_PTR(-ERESTARTSYS);
 
-	while (afs_iterate_addresses(&ac)) {
-		if (!test_bit(ac.index, &ac.alist->probed)) {
-			ret = afs_vl_get_capabilities(cell->net, &ac, key);
-			switch (ret) {
-			case VL_SERVICE:
-				clear_bit(ac.index, &ac.alist->yfs);
-				set_bit(ac.index, &ac.alist->probed);
-				ac.addr->srx_service = ret;
-				break;
-			case YFS_VL_SERVICE:
-				set_bit(ac.index, &ac.alist->yfs);
-				set_bit(ac.index, &ac.alist->probed);
-				ac.addr->srx_service = ret;
-				break;
-			}
-		}
-		
-		vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key,
-						  volname, volnamesz);
-		switch (ac.error) {
-		case 0:
-			afs_end_cursor(&ac);
-			return vldb;
-		case -ECONNABORTED:
-			ac.error = afs_abort_to_error(ac.abort_code);
-			goto error;
-		case -ENOMEM:
-		case -ENONET:
-			goto error;
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			break;
-		default:
-			ac.error = -EIO;
-			goto error;
-		}
+	while (afs_select_vlserver(&vc)) {
+		vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz);
 	}
 
-error:
-	return ERR_PTR(afs_end_cursor(&ac));
+	ret = afs_end_vlserver_operation(&vc);
+	return ret < 0 ? ERR_PTR(ret) : vldb;
 }
 
 /*
@@ -149,7 +108,7 @@
  * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
  *           explicitly told otherwise
  */
-struct afs_volume *afs_create_volume(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_fs_context *params)
 {
 	struct afs_vldb_entry *vldb;
 	struct afs_volume *volume;
@@ -270,7 +229,7 @@
 	/* We look up an ID by passing it as a decimal string in the
 	 * operation's name parameter.
 	 */
-	idsz = sprintf(idbuf, "%u", volume->vid);
+	idsz = sprintf(idbuf, "%llu", volume->vid);
 
 	vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
 	if (IS_ERR(vldb)) {
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 19c04ca..cb76566 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* handling of writes to regular files and writing back to the server
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/backing-dev.h>
@@ -33,12 +29,22 @@
 			 loff_t pos, unsigned int len, struct page *page)
 {
 	struct afs_read *req;
+	size_t p;
+	void *data;
 	int ret;
 
 	_enter(",,%llu", (unsigned long long)pos);
 
-	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
-		      GFP_KERNEL);
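+	/* The region lies entirely beyond EOF, so there's nothing to fetch
+	 * from the server; just clear the piece being prepared for write.
+	 */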
+	if (pos >= vnode->vfs_inode.i_size) {
+		p = pos & ~PAGE_MASK;
+		ASSERTCMP(p + len, <=, PAGE_SIZE);
+		data = kmap(page);
+		memset(data + p, 0, len);
+		kunmap(page);
+		return 0;
+	}
+
+	req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
 	if (!req)
 		return -ENOMEM;
 
@@ -81,7 +87,7 @@
 	pgoff_t index = pos >> PAGE_SHIFT;
 	int ret;
 
-	_enter("{%x:%u},{%lx},%u,%u",
+	_enter("{%llx:%llu},{%lx},%u,%u",
 	       vnode->fid.vid, vnode->fid.vnode, index, from, to);
 
 	/* We want to store information about how much of a page is altered in
@@ -181,7 +187,7 @@
 	loff_t i_size, maybe_i_size;
 	int ret;
 
-	_enter("{%x:%u},{%lx}",
+	_enter("{%llx:%llu},{%lx}",
 	       vnode->fid.vid, vnode->fid.vnode, page->index);
 
 	maybe_i_size = pos + copied;
@@ -230,7 +236,7 @@
 	struct pagevec pv;
 	unsigned count, loop;
 
-	_enter("{%x:%u},%lx-%lx",
+	_enter("{%llx:%llu},%lx-%lx",
 	       vnode->fid.vid, vnode->fid.vnode, first, last);
 
 	pagevec_init(&pv);
@@ -253,6 +259,7 @@
 				first = page->index + 1;
 			lock_page(page);
 			generic_error_remove_page(mapping, page);
+			unlock_page(page);
 		}
 
 		__pagevec_release(&pv);
@@ -272,7 +279,7 @@
 	struct pagevec pv;
 	unsigned count, loop;
 
-	_enter("{%x:%u},%lx-%lx",
+	_enter("{%llx:%llu},%lx-%lx",
 	       vnode->fid.vid, vnode->fid.vnode, first, last);
 
 	pagevec_init(&pv);
@@ -302,6 +309,46 @@
 }
 
 /*
+ * completion of write to server
+ */
+static void afs_pages_written_back(struct afs_vnode *vnode,
+				   pgoff_t first, pgoff_t last)
+{
+	struct pagevec pv;
+	unsigned long priv;
+	unsigned count, loop;
+
+	_enter("{%llx:%llu},{%lx-%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	pagevec_init(&pv);
+
+	do {
+		_debug("done %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+					      first, count, pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		for (loop = 0; loop < count; loop++) {
+			priv = page_private(pv.pages[loop]);
+			trace_afs_page_dirty(vnode, tracepoint_string("clear"),
+					     pv.pages[loop]->index, priv);
+			set_page_private(pv.pages[loop], 0);
+			end_page_writeback(pv.pages[loop]);
+		}
+		first += count;
+		__pagevec_release(&pv);
+	} while (first <= last);
+
+	afs_prune_wb_keys(vnode);
+	_leave("");
+}
+
+/*
  * write to a file
  */
 static int afs_store_data(struct address_space *mapping,
@@ -310,17 +357,22 @@
 {
 	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
 	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
 	struct afs_wb_key *wbk = NULL;
 	struct list_head *p;
 	int ret = -ENOKEY, ret2;
 
-	_enter("%s{%x:%u.%u},%lx,%lx,%x,%x",
+	_enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
 	       first, last, offset, to);
 
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+	if (!scb)
+		return -ENOMEM;
+
 	spin_lock(&vnode->wb_lock);
 	p = vnode->wb_keys.next;
 
@@ -339,6 +391,7 @@
 
 	spin_unlock(&vnode->wb_lock);
 	afs_put_wb_key(wbk);
+	kfree(scb);
 	_leave(" = %d [no keys]", ret);
 	return ret;
 
@@ -349,14 +402,19 @@
 	_debug("USE WB KEY %u", key_serial(wbk->key));
 
 	ret = -ERESTARTSYS;
-	if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
+	if (afs_begin_vnode_operation(&fc, vnode, wbk->key, false)) {
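+		/* A successful store bumps the data version by exactly one,
+		 * so note the value we expect the server to report back.
+		 */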
+		afs_dataversion_t data_version = vnode->status.data_version + 1;
+
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_store_data(&fc, mapping, first, last, offset, to);
+			afs_fs_store_data(&fc, mapping, first, last, offset, to, scb);
 		}
 
-		afs_check_for_remote_deletion(&fc, fc.vnode);
-		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		afs_check_for_remote_deletion(&fc, vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
+		if (fc.ac.error == 0)
+			afs_pages_written_back(vnode, first, last);
 		ret = afs_end_vnode_operation(&fc);
 	}
 
@@ -381,6 +439,7 @@
 	}
 
 	afs_put_wb_key(wbk);
+	kfree(scb);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -533,6 +592,7 @@
 	case -ENOENT:
 	case -ENOMEDIUM:
 	case -ENXIO:
+		trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
 		afs_kill_pages(mapping, first, last);
 		mapping_set_error(mapping, ret);
 		break;
@@ -666,46 +726,6 @@
 }
 
 /*
- * completion of write to server
- */
-void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
-{
-	struct pagevec pv;
-	unsigned long priv;
-	unsigned count, loop;
-	pgoff_t first = call->first, last = call->last;
-
-	_enter("{%x:%u},{%lx-%lx}",
-	       vnode->fid.vid, vnode->fid.vnode, first, last);
-
-	pagevec_init(&pv);
-
-	do {
-		_debug("done %lx-%lx", first, last);
-
-		count = last - first + 1;
-		if (count > PAGEVEC_SIZE)
-			count = PAGEVEC_SIZE;
-		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
-					      first, count, pv.pages);
-		ASSERTCMP(pv.nr, ==, count);
-
-		for (loop = 0; loop < count; loop++) {
-			priv = page_private(pv.pages[loop]);
-			trace_afs_page_dirty(vnode, tracepoint_string("clear"),
-					     pv.pages[loop]->index, priv);
-			set_page_private(pv.pages[loop], 0);
-			end_page_writeback(pv.pages[loop]);
-		}
-		first += count;
-		__pagevec_release(&pv);
-	} while (first <= last);
-
-	afs_prune_wb_keys(vnode);
-	_leave("");
-}
-
-/*
  * write to an AFS file
  */
 ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
@@ -714,7 +734,7 @@
 	ssize_t result;
 	size_t count = iov_iter_count(from);
 
-	_enter("{%x.%u},{%zu},",
+	_enter("{%llx:%llu},{%zu},",
 	       vnode->fid.vid, vnode->fid.vnode, count);
 
 	if (IS_SWAPFILE(&vnode->vfs_inode)) {
@@ -742,7 +762,7 @@
 	struct inode *inode = file_inode(file);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	_enter("{%x:%u},{n=%pD},%d",
+	_enter("{%llx:%llu},{n=%pD},%d",
 	       vnode->fid.vid, vnode->fid.vnode, file,
 	       datasync);
 
@@ -760,7 +780,7 @@
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	unsigned long priv;
 
-	_enter("{{%x:%u}},{%lx}",
+	_enter("{{%llx:%llu}},{%lx}",
 	       vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
 
 	sb_start_pagefault(inode->i_sb);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index cfcc674..5552d03 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Extended attribute handling for AFS.  We use xattrs to get and set metadata
  * instead of providing pioctl().
  *
  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
@@ -16,9 +12,14 @@
 #include "internal.h"
 
 static const char afs_xattr_list[] =
+	"afs.acl\0"
 	"afs.cell\0"
 	"afs.fid\0"
-	"afs.volume";
+	"afs.volume\0"
+	"afs.yfs.acl\0"
+	"afs.yfs.acl_inherited\0"
+	"afs.yfs.acl_num_cleaned\0"
+	"afs.yfs.vol_acl";
 
 /*
  * Retrieve a list of the supported xattrs.
@@ -34,6 +35,300 @@
 }
 
 /*
+ * Get a file's ACL.
+ */
+static int afs_xattr_get_acl(const struct xattr_handler *handler,
+			     struct dentry *dentry,
+			     struct inode *inode, const char *name,
+			     void *buffer, size_t size)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_acl *acl = NULL;
+	struct key *key;
+	int ret = -ENOMEM;
+
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+	if (!scb)
+		goto error;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error_scb;
+	}
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
+			acl = afs_fs_fetch_acl(&fc, scb);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
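+	/* A zero size is the getxattr convention for querying the length. */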
+	if (ret == 0) {
+		ret = acl->size;
+		if (size > 0) {
+			if (acl->size <= size)
+				memcpy(buffer, acl->data, acl->size);
+			else
+				ret = -ERANGE;
+		}
+		kfree(acl);
+	}
+
+	key_put(key);
+error_scb:
+	kfree(scb);
+error:
+	return ret;
+}
+
+/*
+ * Set a file's AFS3 ACL.
+ */
+static int afs_xattr_set_acl(const struct xattr_handler *handler,
+                             struct dentry *dentry,
+                             struct inode *inode, const char *name,
+                             const void *buffer, size_t size, int flags)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_acl *acl = NULL;
+	struct key *key;
+	int ret = -ENOMEM;
+
+	if (flags == XATTR_CREATE)
+		return -EINVAL;
+
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+	if (!scb)
+		goto error;
+
+	acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL);
+	if (!acl)
+		goto error_scb;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error_acl;
+	}
+
+	acl->size = size;
+	memcpy(acl->data, buffer, size);
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
+			afs_fs_store_acl(&fc, acl, scb);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	key_put(key);
+error_acl:
+	kfree(acl);
+error_scb:
+	kfree(scb);
+error:
+	return ret;
+}
+
+static const struct xattr_handler afs_xattr_afs_acl_handler = {
+	.name   = "afs.acl",
+	.get    = afs_xattr_get_acl,
+	.set    = afs_xattr_set_acl,
+};
+
+/*
+ * Get a file's YFS ACL.
+ */
+static int afs_xattr_get_yfs(const struct xattr_handler *handler,
+			     struct dentry *dentry,
+			     struct inode *inode, const char *name,
+			     void *buffer, size_t size)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct yfs_acl *yacl = NULL;
+	struct key *key;
+	char buf[16], *data;
+	int which = 0, dsize, ret = -ENOMEM;
+
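+	/* The "afs.yfs." prefix has already been stripped by the xattr core,
+	 * so only the suffix is matched here.
+	 */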
+	if (strcmp(name, "acl") == 0)
+		which = 0;
+	else if (strcmp(name, "acl_inherited") == 0)
+		which = 1;
+	else if (strcmp(name, "acl_num_cleaned") == 0)
+		which = 2;
+	else if (strcmp(name, "vol_acl") == 0)
+		which = 3;
+	else
+		return -EOPNOTSUPP;
+
+	yacl = kzalloc(sizeof(struct yfs_acl), GFP_KERNEL);
+	if (!yacl)
+		goto error;
+
+	if (which == 0)
+		yacl->flags |= YFS_ACL_WANT_ACL;
+	else if (which == 3)
+		yacl->flags |= YFS_ACL_WANT_VOL_ACL;
+
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+	if (!scb)
+		goto error_yacl;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error_scb;
+	}
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
+			yfs_fs_fetch_opaque_acl(&fc, yacl, scb);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	if (ret < 0)
+		goto error_key;
+
+	switch (which) {
+	case 0:
+		data = yacl->acl->data;
+		dsize = yacl->acl->size;
+		break;
+	case 1:
+		data = buf;
+		dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
+		break;
+	case 2:
+		data = buf;
+		dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
+		break;
+	case 3:
+		data = yacl->vol_acl->data;
+		dsize = yacl->vol_acl->size;
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		goto error_key;
+	}
+
+	ret = dsize;
+	if (size > 0) {
+		if (dsize > size) {
+			ret = -ERANGE;
+			goto error_key;
+		}
+		memcpy(buffer, data, dsize);
+	}
+
+error_key:
+	key_put(key);
+error_scb:
+	kfree(scb);
+error_yacl:
+	yfs_free_opaque_acl(yacl);
+error:
+	return ret;
+}
+
+/*
+ * Set a file's YFS ACL.
+ */
+static int afs_xattr_set_yfs(const struct xattr_handler *handler,
+                             struct dentry *dentry,
+                             struct inode *inode, const char *name,
+                             const void *buffer, size_t size, int flags)
+{
+	struct afs_fs_cursor fc;
+	struct afs_status_cb *scb;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_acl *acl = NULL;
+	struct key *key;
+	int ret = -ENOMEM;
+
+	if (flags == XATTR_CREATE ||
+	    strcmp(name, "acl") != 0)
+		return -EINVAL;
+
+	scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+	if (!scb)
+		goto error;
+
+	acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL);
+	if (!acl)
+		goto error_scb;
+
+	acl->size = size;
+	memcpy(acl->data, buffer, size);
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error_acl;
+	}
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+		afs_dataversion_t data_version = vnode->status.data_version;
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = afs_calc_vnode_cb_break(vnode);
+			yfs_fs_store_opaque_acl2(&fc, acl, scb);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+					&data_version, scb);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	key_put(key);
+error_acl:
+	kfree(acl);
+error_scb:
+	kfree(scb);
+error:
+	return ret;
+}
+
+static const struct xattr_handler afs_xattr_yfs_handler = {
+	.prefix	= "afs.yfs.",
+	.get	= afs_xattr_get_yfs,
+	.set	= afs_xattr_set_yfs,
+};
+
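+/*
+ * Illustrative note: because this handler registers the "afs.yfs." prefix
+ * rather than a fixed name, the VFS strips that prefix before calling
+ * get/set, so the names compared in afs_xattr_get_yfs() arrive without it:
+ *
+ *	getxattr(path, "afs.yfs.acl", buf, len);      // name == "acl"
+ *	getxattr(path, "afs.yfs.vol_acl", buf, len);  // name == "vol_acl"
+ */
+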
+/*
  * Get the name of the cell on which a file resides.
  */
 static int afs_xattr_get_cell(const struct xattr_handler *handler,
@@ -50,7 +345,7 @@
 		return namelen;
 	if (namelen > size)
 		return -ERANGE;
-	memcpy(buffer, cell->name, size);
+	memcpy(buffer, cell->name, namelen);
 	return namelen;
 }
 
@@ -69,11 +364,20 @@
 			     void *buffer, size_t size)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
-	char text[8 + 1 + 8 + 1 + 8 + 1];
+	char text[16 + 1 + 24 + 1 + 8 + 1];
 	size_t len;
 
-	len = sprintf(text, "%x:%x:%x",
-		      vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+	/* The volume ID is 64-bit, the vnode ID is 96-bit and the
+	 * uniquifier is 32-bit.
+	 */
+	len = sprintf(text, "%llx:", vnode->fid.vid);
+	if (vnode->fid.vnode_hi)
+		len += sprintf(text + len, "%x%016llx",
+			       vnode->fid.vnode_hi, vnode->fid.vnode);
+	else
+		len += sprintf(text + len, "%llx", vnode->fid.vnode);
+	len += sprintf(text + len, ":%x", vnode->fid.unique);
+
 	if (size == 0)
 		return len;
 	if (len > size)
@@ -104,7 +408,7 @@
 		return namelen;
 	if (namelen > size)
 		return -ERANGE;
-	memcpy(buffer, volname, size);
+	memcpy(buffer, volname, namelen);
 	return namelen;
 }
 
@@ -114,8 +418,10 @@
 };
 
 const struct xattr_handler *afs_xattr_handlers[] = {
+	&afs_xattr_afs_acl_handler,
 	&afs_xattr_afs_cell_handler,
 	&afs_xattr_afs_fid_handler,
 	&afs_xattr_afs_volume_handler,
+	&afs_xattr_yfs_handler,		/* afs.yfs. prefix */
 	NULL
 };
diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h
index aa21f30..94f1f39 100644
--- a/fs/afs/xdr_fs.h
+++ b/fs/afs/xdr_fs.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* AFS fileserver XDR types
  *
  * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #ifndef XDR_FS_H
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
new file mode 100644
index 0000000..3ee7abf
--- /dev/null
+++ b/fs/afs/yfsclient.c
@@ -0,0 +1,2203 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* YFS File Server client stubs
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include <linux/iversion.h>
+#include "internal.h"
+#include "afs_fs.h"
+#include "xdr_fs.h"
+#include "protocol_yfs.h"
+
+static const struct afs_fid afs_zero_fid;
+
+static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
+{
+	call->cbi = afs_get_cb_interest(cbi);
+}
+
+#define xdr_size(x) (sizeof(*x) / sizeof(__be32))
+
+static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+	const struct yfs_xdr_YFSFid *x = (const void *)*_bp;
+
+	fid->vid	= xdr_to_u64(x->volume);
+	fid->vnode	= xdr_to_u64(x->vnode.lo);
+	fid->vnode_hi	= ntohl(x->vnode.hi);
+	fid->unique	= ntohl(x->vnode.unique);
+	*_bp += xdr_size(x);
+}
+
+static __be32 *xdr_encode_u32(__be32 *bp, u32 n)
+{
+	*bp++ = htonl(n);
+	return bp;
+}
+
+static __be32 *xdr_encode_u64(__be32 *bp, u64 n)
+{
+	struct yfs_xdr_u64 *x = (void *)bp;
+
+	*x = u64_to_xdr(n);
+	return bp + xdr_size(x);
+}
+
+static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid)
+{
+	struct yfs_xdr_YFSFid *x = (void *)bp;
+
+	x->volume	= u64_to_xdr(fid->vid);
+	x->vnode.lo	= u64_to_xdr(fid->vnode);
+	x->vnode.hi	= htonl(fid->vnode_hi);
+	x->vnode.unique	= htonl(fid->unique);
+	return bp + xdr_size(x);
+}
+
+static size_t xdr_strlen(unsigned int len)
+{
+	return sizeof(__be32) + round_up(len, sizeof(__be32));
+}
+
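+/*
+ * Worked example (illustrative): a 5-byte string needs a 4-byte length
+ * word plus its payload rounded up to a __be32 boundary, so
+ * xdr_strlen(5) == 4 + 8 == 12.
+ */
+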
+static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len)
+{
+	bp = xdr_encode_u32(bp, len);
+	bp = memcpy(bp, p, len);
+	if (len & 3) {
+		unsigned int pad = 4 - (len & 3);
+
+		memset((u8 *)bp + len, 0, pad);
+		len += pad;
+	}
+
+	return bp + len / sizeof(__be32);
+}
+
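+/*
+ * For example (illustrative): encoding the 5-byte string "fred!" emits the
+ * length word, the five payload bytes and three NUL pad bytes, advancing
+ * bp by 12 bytes in total, matching xdr_strlen(5).
+ */
+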
+static s64 linux_to_yfs_time(const struct timespec64 *t)
+{
+	/* Convert to 100ns intervals. */
+	return (u64)t->tv_sec * 10000000 + t->tv_nsec/100;
+}
+
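+/*
+ * Worked example (illustrative): { .tv_sec = 1, .tv_nsec = 250 } maps to
+ * 1 * 10000000 + 250 / 100 == 10000002 hundred-nanosecond intervals.
+ */
+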
+static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
+{
+	struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+
+	x->mask		= htonl(AFS_SET_MODE);
+	x->mode		= htonl(mode & S_IALLUGO);
+	x->mtime_client	= u64_to_xdr(0);
+	x->owner	= u64_to_xdr(0);
+	x->group	= u64_to_xdr(0);
+	return bp + xdr_size(x);
+}
+
+static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+{
+	struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+	s64 mtime = linux_to_yfs_time(t);
+
+	x->mask		= htonl(AFS_SET_MTIME);
+	x->mode		= htonl(0);
+	x->mtime_client	= u64_to_xdr(mtime);
+	x->owner	= u64_to_xdr(0);
+	x->group	= u64_to_xdr(0);
+	return bp + xdr_size(x);
+}
+
+/*
+ * Convert a signed 100ns-resolution 64-bit time into a timespec.
+ */
+static struct timespec64 yfs_time_to_linux(s64 t)
+{
+	struct timespec64 ts;
+	u64 abs_t;
+
+	/*
+	 * Unfortunately we cannot use normal 64-bit division on a 32-bit arch;
+	 * the alternative, do_div(), does not work with negative numbers, so
+	 * negative times have to be special-cased.
+	 */
+	if (t < 0) {
+		abs_t = -t;
+		ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100);
+		ts.tv_nsec = -ts.tv_nsec;
+		ts.tv_sec = -abs_t;
+	} else {
+		abs_t = t;
+		ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100;
+		ts.tv_sec = abs_t;
+	}
+
+	return ts;
+}
+
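+/*
+ * Worked example (illustrative): t = -10000002 gives abs_t = 10000002;
+ * do_div() leaves quotient 1 and remainder 2, so the result is
+ * { .tv_sec = -1, .tv_nsec = -200 }.
+ */
+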
+static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr)
+{
+	s64 t = xdr_to_u64(xdr);
+
+	return yfs_time_to_linux(t);
+}
+
+static void yfs_check_req(struct afs_call *call, __be32 *bp)
+{
+	size_t len = (void *)bp - call->request;
+
+	if (len > call->request_size)
+		pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n",
+		       call->type->name, len, call->request_size);
+	else if (len < call->request_size)
+		pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n",
+			   call->type->name, len, call->request_size);
+}
+
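+/*
+ * Illustrative note: every xdr_encode_*() helper advances bp, so comparing
+ * the final bp against the preallocated request_size catches a marshalling
+ * routine that wrote more or fewer bytes than were sized for in
+ * afs_alloc_flat_call().
+ */
+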
+/*
+ * Dump a bad file status record.
+ */
+static void xdr_dump_bad(const __be32 *bp)
+{
+	__be32 x[4];
+	int i;
+
+	pr_notice("YFS XDR: Bad status record\n");
+	for (i = 0; i < 5 * 4 * 4; i += 16) {
+		memcpy(x, bp, 16);
+		bp += 4;
+		pr_notice("%03x: %08x %08x %08x %08x\n",
+			  i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
+	}
+
+	memcpy(x, bp, 4);
+	pr_notice("0x50: %08x\n", ntohl(x[0]));
+}
+
+/*
+ * Decode a YFSFetchStatus block
+ */
+static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
+				     struct afs_call *call,
+				     struct afs_status_cb *scb)
+{
+	const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
+	struct afs_file_status *status = &scb->status;
+	u32 type;
+
+	status->abort_code = ntohl(xdr->abort_code);
+	if (status->abort_code != 0) {
+		if (status->abort_code == VNOVNODE)
+			status->nlink = 0;
+		scb->have_error = true;
+		return 0;
+	}
+
+	type = ntohl(xdr->type);
+	switch (type) {
+	case AFS_FTYPE_FILE:
+	case AFS_FTYPE_DIR:
+	case AFS_FTYPE_SYMLINK:
+		status->type = type;
+		break;
+	default:
+		goto bad;
+	}
+
+	status->nlink		= ntohl(xdr->nlink);
+	status->author		= xdr_to_u64(xdr->author);
+	status->owner		= xdr_to_u64(xdr->owner);
+	status->caller_access	= ntohl(xdr->caller_access); /* Ticket dependent */
+	status->anon_access	= ntohl(xdr->anon_access);
+	status->mode		= ntohl(xdr->mode) & S_IALLUGO;
+	status->group		= xdr_to_u64(xdr->group);
+	status->lock_count	= ntohl(xdr->lock_count);
+
+	status->mtime_client	= xdr_to_time(xdr->mtime_client);
+	status->mtime_server	= xdr_to_time(xdr->mtime_server);
+	status->size		= xdr_to_u64(xdr->size);
+	status->data_version	= xdr_to_u64(xdr->data_version);
+	scb->have_status	= true;
+
+	*_bp += xdr_size(xdr);
+	return 0;
+
+bad:
+	xdr_dump_bad(*_bp);
+	return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+}
+
+/*
+ * Decode a YFSCallBack block
+ */
+static void xdr_decode_YFSCallBack(const __be32 **_bp,
+				   struct afs_call *call,
+				   struct afs_status_cb *scb)
+{
+	struct yfs_xdr_YFSCallBack *x = (void *)*_bp;
+	struct afs_callback *cb = &scb->callback;
+	ktime_t cb_expiry;
+
+	cb_expiry = call->reply_time;
+	cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+	cb->expires_at	= ktime_divns(cb_expiry, NSEC_PER_SEC);
+	scb->have_cb	= true;
+	*_bp += xdr_size(x);
+}
+
+/*
+ * Decode a YFSVolSync block
+ */
+static void xdr_decode_YFSVolSync(const __be32 **_bp,
+				  struct afs_volsync *volsync)
+{
+	struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
+	u64 creation;
+
+	if (volsync) {
+		creation = xdr_to_u64(x->vol_creation_date);
+		do_div(creation, 10 * 1000 * 1000);
+		volsync->creation = creation;
+	}
+
+	*_bp += xdr_size(x);
+}
+
+/*
+ * Encode the requested attributes into a YFSStoreStatus block
+ */
+static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr)
+{
+	struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+	s64 mtime = 0, owner = 0, group = 0;
+	u32 mask = 0, mode = 0;
+
+	mask = 0;
+	if (attr->ia_valid & ATTR_MTIME) {
+		mask |= AFS_SET_MTIME;
+		mtime = linux_to_yfs_time(&attr->ia_mtime);
+	}
+
+	if (attr->ia_valid & ATTR_UID) {
+		mask |= AFS_SET_OWNER;
+		owner = from_kuid(&init_user_ns, attr->ia_uid);
+	}
+
+	if (attr->ia_valid & ATTR_GID) {
+		mask |= AFS_SET_GROUP;
+		group = from_kgid(&init_user_ns, attr->ia_gid);
+	}
+
+	if (attr->ia_valid & ATTR_MODE) {
+		mask |= AFS_SET_MODE;
+		mode = attr->ia_mode & S_IALLUGO;
+	}
+
+	x->mask		= htonl(mask);
+	x->mode		= htonl(mode);
+	x->mtime_client	= u64_to_xdr(mtime);
+	x->owner	= u64_to_xdr(owner);
+	x->group	= u64_to_xdr(group);
+	return bp + xdr_size(x);
+}
+
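+/*
+ * For example (illustrative): an iattr with ATTR_MODE | ATTR_MTIME set
+ * encodes mask AFS_SET_MODE | AFS_SET_MTIME, the mode masked with
+ * S_IALLUGO, the mtime in 100ns units, and zeroed owner/group.
+ */
+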
+/*
+ * Decode a YFSFetchVolumeStatus block.
+ */
+static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
+					    struct afs_volume_status *vs)
+{
+	const struct yfs_xdr_YFSFetchVolumeStatus *x = (const void *)*_bp;
+	u32 flags;
+
+	vs->vid			= xdr_to_u64(x->vid);
+	vs->parent_id		= xdr_to_u64(x->parent_id);
+	flags			= ntohl(x->flags);
+	vs->online		= flags & yfs_FVSOnline;
+	vs->in_service		= flags & yfs_FVSInservice;
+	vs->blessed		= flags & yfs_FVSBlessed;
+	vs->needs_salvage	= flags & yfs_FVSNeedsSalvage;
+	vs->type		= ntohl(x->type);
+	vs->min_quota		= 0;
+	vs->max_quota		= xdr_to_u64(x->max_quota);
+	vs->blocks_in_use	= xdr_to_u64(x->blocks_in_use);
+	vs->part_blocks_avail	= xdr_to_u64(x->part_blocks_avail);
+	vs->part_max_blocks	= xdr_to_u64(x->part_max_blocks);
+	vs->vol_copy_date	= xdr_to_u64(x->vol_copy_date);
+	vs->vol_backup_date	= xdr_to_u64(x->vol_backup_date);
+	*_bp += xdr_size(x);
+}
+
+/*
+ * Deliver a reply that's a status, callback and volsync.
+ */
+static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * Deliver reply data to operations that just return a file status and a volume
+ * sync record.
+ */
+static int yfs_deliver_status_and_volsync(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.FetchStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
+	.name		= "YFS.FetchStatus(vnode)",
+	.op		= yfs_FS_FetchStatus,
+	.deliver	= yfs_deliver_fs_status_cb_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a file.
+ */
+int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+			     struct afs_volsync *volsync)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSCallBack) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->out_scb = scb;
+	call->out_volsync = volsync;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to an YFS.FetchData64.
+ */
+static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
+{
+	struct afs_read *req = call->read_request;
+	const __be32 *bp;
+	unsigned int size;
+	int ret;
+
+	_enter("{%u,%zu/%llu}",
+	       call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+
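+	/* Descriptive summary of the phases below: 0/1 extract the returned
+	 * data length, 2 fills the request pages, 3 discards any excess the
+	 * server sent, 4 decodes the trailing status, callback and volsync
+	 * records, and 5 is terminal.
+	 */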
+	switch (call->unmarshall) {
+	case 0:
+		req->actual_len = 0;
+		req->index = 0;
+		req->offset = req->pos & (PAGE_SIZE - 1);
+		afs_extract_to_tmp64(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the returned data length */
+	case 1:
+		_debug("extract data length");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		req->actual_len = be64_to_cpu(call->tmp64);
+		_debug("DATA length: %llu", req->actual_len);
+		req->remain = min(req->len, req->actual_len);
+		if (req->remain == 0)
+			goto no_more_data;
+
+		call->unmarshall++;
+
+	begin_page:
+		ASSERTCMP(req->index, <, req->nr_pages);
+		if (req->remain > PAGE_SIZE - req->offset)
+			size = PAGE_SIZE - req->offset;
+		else
+			size = req->remain;
+		call->bvec[0].bv_len = size;
+		call->bvec[0].bv_offset = req->offset;
+		call->bvec[0].bv_page = req->pages[req->index];
+		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+		ASSERTCMP(size, <=, PAGE_SIZE);
+		/* Fall through */
+
+		/* extract the returned data */
+	case 2:
+		_debug("extract data %zu/%llu",
+		       iov_iter_count(&call->iter), req->remain);
+
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+		req->remain -= call->bvec[0].bv_len;
+		req->offset += call->bvec[0].bv_len;
+		ASSERTCMP(req->offset, <=, PAGE_SIZE);
+		if (req->offset == PAGE_SIZE) {
+			req->offset = 0;
+			if (req->page_done)
+				req->page_done(req);
+			req->index++;
+			if (req->remain > 0)
+				goto begin_page;
+		}
+
+		ASSERTCMP(req->remain, ==, 0);
+		if (req->actual_len <= req->len)
+			goto no_more_data;
+
+		/* Discard any excess data the server gave us */
+		afs_extract_discard(call, req->actual_len - req->len);
+		call->unmarshall = 3;
+		/* Fall through */
+
+	case 3:
+		_debug("extract discard %zu/%llu",
+		       iov_iter_count(&call->iter), req->actual_len - req->len);
+
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+	no_more_data:
+		call->unmarshall = 4;
+		afs_extract_to_buf(call,
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSCallBack) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+		/* Fall through */
+
+		/* extract the metadata */
+	case 4:
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+		xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+		xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+		req->data_version = call->out_scb->status.data_version;
+		req->file_size = call->out_scb->status.size;
+
+		call->unmarshall++;
+		/* Fall through */
+
+	case 5:
+		break;
+	}
+
+	for (; req->index < req->nr_pages; req->index++) {
+		if (req->offset < PAGE_SIZE)
+			zero_user_segment(req->pages[req->index],
+					  req->offset, PAGE_SIZE);
+		if (req->page_done)
+			req->page_done(req);
+		req->offset = 0;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+static void yfs_fetch_data_destructor(struct afs_call *call)
+{
+	afs_put_read(call->read_request);
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * YFS.FetchData64 operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchData64 = {
+	.name		= "YFS.FetchData64",
+	.op		= yfs_FS_FetchData64,
+	.deliver	= yfs_deliver_fs_fetch_data64,
+	.destructor	= yfs_fetch_data_destructor,
+};
+
+/*
+ * Fetch data from a file.
+ */
+int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+		      struct afs_read *req)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},%llx,%llx",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode,
+	       req->pos, req->len);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_u64) * 2,
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSCallBack) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+	call->read_request = req;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHDATA64);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_u64(bp, req->pos);
+	bp = xdr_encode_u64(bp, req->len);
+	yfs_check_req(call, bp);
+
+	refcount_inc(&req->usage);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data for YFS.CreateFile or YFS.MakeDir.
+ */
+static int yfs_deliver_fs_create_vnode(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_YFSFid(&bp, call->out_fid);
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.CreateFile operation type
+ */
+static const struct afs_call_type afs_RXFSCreateFile = {
+	.name		= "YFS.CreateFile",
+	.op		= yfs_FS_CreateFile,
+	.deliver	= yfs_deliver_fs_create_vnode,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Create a file.
+ */
+int yfs_fs_create_file(struct afs_fs_cursor *fc,
+		       const char *name,
+		       umode_t mode,
+		       struct afs_status_cb *dvnode_scb,
+		       struct afs_fid *newfid,
+		       struct afs_status_cb *new_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(dvnode);
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	reqsz = (sizeof(__be32) +
+		 sizeof(__be32) +
+		 sizeof(struct yfs_xdr_YFSFid) +
+		 xdr_strlen(namesz) +
+		 sizeof(struct yfs_xdr_YFSStoreStatus) +
+		 sizeof(__be32));
+	rplsz = (sizeof(struct yfs_xdr_YFSFid) +
+		 sizeof(struct yfs_xdr_YFSFetchStatus) +
+		 sizeof(struct yfs_xdr_YFSFetchStatus) +
+		 sizeof(struct yfs_xdr_YFSCallBack) +
+		 sizeof(struct yfs_xdr_YFSVolSync));
+
+	call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+	call->out_fid = newfid;
+	call->out_scb = new_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSCREATEFILE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+	bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
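+/*
+ * YFS.MakeDir operation type
+ */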
+static const struct afs_call_type yfs_RXFSMakeDir = {
+	.name		= "YFS.MakeDir",
+	.op		= yfs_FS_MakeDir,
+	.deliver	= yfs_deliver_fs_create_vnode,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Make a directory.
+ */
+int yfs_fs_make_dir(struct afs_fs_cursor *fc,
+		    const char *name,
+		    umode_t mode,
+		    struct afs_status_cb *dvnode_scb,
+		    struct afs_fid *newfid,
+		    struct afs_status_cb *new_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(dvnode);
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	reqsz = (sizeof(__be32) +
+		 sizeof(struct yfs_xdr_RPCFlags) +
+		 sizeof(struct yfs_xdr_YFSFid) +
+		 xdr_strlen(namesz) +
+		 sizeof(struct yfs_xdr_YFSStoreStatus));
+	rplsz = (sizeof(struct yfs_xdr_YFSFid) +
+		 sizeof(struct yfs_xdr_YFSFetchStatus) +
+		 sizeof(struct yfs_xdr_YFSFetchStatus) +
+		 sizeof(struct yfs_xdr_YFSCallBack) +
+		 sizeof(struct yfs_xdr_YFSVolSync));
+
+	call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+	call->out_fid = newfid;
+	call->out_scb = new_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSMAKEDIR);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile2 operation.
+ */
+static int yfs_deliver_fs_remove_file2(struct afs_call *call)
+{
+	struct afs_fid fid;
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSFid(&bp, &fid);
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	/* Was deleted if vnode->status.abort_code == VNOVNODE. */
+
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+	return 0;
+}
+
+/*
+ * YFS.RemoveFile2 operation type.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile2 = {
+	.name		= "YFS.RemoveFile2",
+	.op		= yfs_FS_RemoveFile2,
+	.deliver	= yfs_deliver_fs_remove_file2,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Remove a file and retrieve new file status.
+ */
+int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+			const char *name, struct afs_status_cb *dvnode_scb,
+			struct afs_status_cb *vnode_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(dvnode);
+	size_t namesz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(namesz),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+	call->out_scb = vnode_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSREMOVEFILE2);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation.
+ */
+static int yfs_deliver_fs_remove(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+	return 0;
+}
+
+/*
+ * YFS.RemoveFile and YFS.RemoveDir operation types.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile = {
+	.name		= "YFS.RemoveFile",
+	.op		= yfs_FS_RemoveFile,
+	.deliver	= yfs_deliver_fs_remove,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSRemoveDir = {
+	.name		= "YFS.RemoveDir",
+	.op		= yfs_FS_RemoveDir,
+	.deliver	= yfs_deliver_fs_remove,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * remove a file or directory
+ */
+int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		  const char *name, bool isdir,
+		  struct afs_status_cb *dvnode_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(dvnode);
+	size_t namesz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	call = afs_alloc_flat_call(
+		net, isdir ? &yfs_RXYFSRemoveDir : &yfs_RXYFSRemoveFile,
+		sizeof(__be32) +
+		sizeof(struct yfs_xdr_RPCFlags) +
+		sizeof(struct yfs_xdr_YFSFid) +
+		xdr_strlen(namesz),
+		sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, isdir ? YFSREMOVEDIR : YFSREMOVEFILE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.Link operation.
+ */
+static int yfs_deliver_fs_link(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.Link operation type.
+ */
+static const struct afs_call_type yfs_RXYFSLink = {
+	.name		= "YFS.Link",
+	.op		= yfs_FS_Link,
+	.deliver	= yfs_deliver_fs_link,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Make a hard link.
+ */
+int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		const char *name,
+		struct afs_status_cb *dvnode_scb,
+		struct afs_status_cb *vnode_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	size_t namesz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	call = afs_alloc_flat_call(net, &yfs_RXYFSLink,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(namesz) +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+	call->out_scb = vnode_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSLINK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &vnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.Symlink operation.
+ */
+static int yfs_deliver_fs_symlink(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_YFSFid(&bp, call->out_fid);
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+	if (ret < 0)
+		return ret;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.Symlink operation type
+ */
+static const struct afs_call_type yfs_RXYFSSymlink = {
+	.name		= "YFS.Symlink",
+	.op		= yfs_FS_Symlink,
+	.deliver	= yfs_deliver_fs_symlink,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Create a symbolic link.
+ */
+int yfs_fs_symlink(struct afs_fs_cursor *fc,
+		   const char *name,
+		   const char *contents,
+		   struct afs_status_cb *dvnode_scb,
+		   struct afs_fid *newfid,
+		   struct afs_status_cb *vnode_scb)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(dvnode);
+	size_t namesz, contents_sz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	contents_sz = strlen(contents);
+	call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(namesz) +
+				   xdr_strlen(contents_sz) +
+				   sizeof(struct yfs_xdr_YFSStoreStatus),
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = dvnode_scb;
+	call->out_fid = newfid;
+	call->out_scb = vnode_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSYMLINK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_string(bp, contents, contents_sz);
+	bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call1(call, &dvnode->fid, name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.Rename operation.
+ */
+static int yfs_deliver_fs_rename(struct afs_call *call)
+{
+	const __be32 *bp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	bp = call->buffer;
+	ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+	if (ret < 0)
+		return ret;
+	if (call->out_dir_scb != call->out_scb) {
+		ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+	}
+
+	xdr_decode_YFSVolSync(&bp, call->out_volsync);
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.Rename operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename = {
+	.name		= "YFS.Rename",
+	.op		= yfs_FS_Rename,
+	.deliver	= yfs_deliver_fs_rename,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory.
+ */
+int yfs_fs_rename(struct afs_fs_cursor *fc,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  struct afs_status_cb *orig_dvnode_scb,
+		  struct afs_status_cb *new_dvnode_scb)
+{
+	struct afs_vnode *orig_dvnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(orig_dvnode);
+	size_t o_namesz, n_namesz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	n_namesz = strlen(new_name);
+	call = afs_alloc_flat_call(net, &yfs_RXYFSRename,
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_RPCFlags) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(o_namesz) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   xdr_strlen(n_namesz),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_dir_scb = orig_dvnode_scb;
+	call->out_scb = new_dvnode_scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSRENAME);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid);
+	bp = xdr_encode_string(bp, orig_name, o_namesz);
+	bp = xdr_encode_YFSFid(bp, &new_dvnode->fid);
+	bp = xdr_encode_string(bp, new_name, n_namesz);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * YFS.StoreData64 operation type.
+ */
+static const struct afs_call_type yfs_RXYFSStoreData64 = {
+	.name		= "YFS.StoreData64",
+	.op		= yfs_FS_StoreData64,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Store a set of pages to a large file.
+ */
+int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
+		      pgoff_t first, pgoff_t last,
+		      unsigned offset, unsigned to,
+		      struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	loff_t size, pos, i_size;
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	size = (loff_t)to - (loff_t)offset;
+	if (first != last)
+		size += (loff_t)(last - first) << PAGE_SHIFT;
+	pos = (loff_t)first << PAGE_SHIFT;
+	pos += offset;
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (pos + size > i_size)
+		i_size = size + pos;
+
+	_debug("size %llx, at %llx, i_size %llx",
+	       (unsigned long long)size, (unsigned long long)pos,
+	       (unsigned long long)i_size);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64,
+				   sizeof(__be32) +
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSStoreStatus) +
+				   sizeof(struct yfs_xdr_u64) * 3,
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->mapping = mapping;
+	call->first = first;
+	call->last = last;
+	call->first_offset = offset;
+	call->last_to = to;
+	call->send_pages = true;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime);
+	bp = xdr_encode_u64(bp, pos);
+	bp = xdr_encode_u64(bp, size);
+	bp = xdr_encode_u64(bp, i_size);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
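+/*
+ * Illustrative arithmetic (assuming 4KiB pages): storing pages 2..3 with
+ * offset 0x100 and to 0x800 gives pos = (2 << 12) + 0x100 = 0x2100 and
+ * size = (0x800 - 0x100) + (1 << 12) = 0x1700.
+ */
+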
+/*
+ * YFS.StoreStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSStoreStatus = {
+	.name		= "YFS.StoreStatus",
+	.op		= yfs_FS_StoreStatus,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = {
+	.name		= "YFS.StoreData64",
+	.op		= yfs_FS_StoreData64,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 rather than
+ * YFS.StoreStatus so as to alter the file size also.
+ */
+static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr,
+			       struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSStoreStatus) +
+				   sizeof(struct yfs_xdr_u64) * 3,
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_YFS_StoreStatus(bp, attr);
+	bp = xdr_encode_u64(bp, attr->ia_size);	/* position of start of write */
+	bp = xdr_encode_u64(bp, 0);		/* size of write */
+	bp = xdr_encode_u64(bp, attr->ia_size);	/* new file length */
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 if there's a change in
+ * file size, and YFS.StoreStatus otherwise.
+ */
+int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr,
+		   struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		return yfs_fs_setattr_size(fc, attr, scb);
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(struct yfs_xdr_YFSStoreStatus),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSTORESTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_YFS_StoreStatus(bp, attr);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to a YFS.GetVolumeStatus operation.
+ */
+static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
+{
+	const __be32 *bp;
+	char *p;
+	u32 size;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->unmarshall++;
+		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
+		/* Fall through */
+
+		/* extract the returned status record */
+	case 1:
+		_debug("extract status");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		xdr_decode_YFSFetchVolumeStatus(&bp, call->out_volstatus);
+		call->unmarshall++;
+		afs_extract_to_tmp(call);
+		/* Fall through */
+
+		/* extract the volume name length */
+	case 2:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		call->count = ntohl(call->tmp);
+		_debug("volname length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_volname_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the volume name */
+	case 3:
+		_debug("extract volname");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("volname '%s'", p);
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the offline message length */
+	case 4:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		call->count = ntohl(call->tmp);
+		_debug("offline msg length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_offline_msg_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the offline message */
+	case 5:
+		_debug("extract offline");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("offline '%s'", p);
+
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the message of the day length */
+	case 6:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		call->count = ntohl(call->tmp);
+		_debug("motd length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_motd_len);
+		size = (call->count + 3) & ~3; /* It's padded */
+		afs_extract_to_buf(call, size);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* extract the message of the day */
+	case 7:
+		_debug("extract motd");
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		p = call->buffer;
+		p[call->count] = 0;
+		_debug("motd '%s'", p);
+
+		call->unmarshall++;
+		/* Fall through */
+
+	case 8:
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSGetVolumeStatus = {
+	.name		= "YFS.GetVolumeStatus",
+	.op		= yfs_FS_GetVolumeStatus,
+	.deliver	= yfs_deliver_fs_get_volume_status,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch the status of a volume
+ */
+int yfs_fs_get_volume_status(struct afs_fs_cursor *fc,
+			     struct afs_volume_status *vs)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_u64),
+				   max_t(size_t,
+					 sizeof(struct yfs_xdr_YFSFetchVolumeStatus) +
+					 sizeof(__be32),
+					 AFSOPAQUEMAX + 1));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->out_volstatus = vs;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_u64(bp, vnode->fid.vid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * YFS.SetLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSSetLock = {
+	.name		= "YFS.SetLock",
+	.op		= yfs_FS_SetLock,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.done		= afs_lock_op_done,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ExtendLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSExtendLock = {
+	.name		= "YFS.ExtendLock",
+	.op		= yfs_FS_ExtendLock,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.done		= afs_lock_op_done,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ReleaseLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSReleaseLock = {
+	.name		= "YFS.ReleaseLock",
+	.op		= yfs_FS_ReleaseLock,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Set a lock on a file
+ */
+int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type,
+		    struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSSetLock,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(__be32),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->lvnode = vnode;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSETLOCK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_u32(bp, type);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_calli(call, &vnode->fid, type);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * extend a lock on a file
+ */
+int yfs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->lvnode = vnode;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSEXTENDLOCK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * release a lock on a file
+ */
+int yfs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call)
+		return -ENOMEM;
+
+	call->key = fc->key;
+	call->lvnode = vnode;
+	call->out_scb = scb;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSRELEASELOCK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * YFS.FetchStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus = {
+	.name		= "YFS.FetchStatus",
+	.op		= yfs_FS_FetchStatus,
+	.deliver	= yfs_deliver_fs_status_cb_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a fid without needing a vnode handle.
+ */
+int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
+			struct afs_net *net,
+			struct afs_fid *fid,
+			struct afs_status_cb *scb,
+			struct afs_volsync *volsync)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), fid->vid, fid->vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSCallBack) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->out_scb = scb;
+	call->out_volsync = volsync;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, fid);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
+
+/*
+ * Deliver reply data to an YFS.InlineBulkStatus call
+ */
+static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
+{
+	struct afs_status_cb *scb;
+	const __be32 *bp;
+	u32 tmp;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
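+	/* Descriptive summary of the phases below: 0/1 extract the status
+	 * count, 2 loops over the status array, 3/4 do the same for the
+	 * callback array, 5 decodes the volsync record, and 6 is terminal.
+	 */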
+	switch (call->unmarshall) {
+	case 0:
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the file status count and array in two steps */
+	case 1:
+		_debug("extract status count");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		tmp = ntohl(call->tmp);
+		_debug("status count: %u/%u", tmp, call->count2);
+		if (tmp != call->count2)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_ibulkst_count);
+
+		call->count = 0;
+		call->unmarshall++;
+	more_counts:
+		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
+		/* Fall through */
+
+	case 2:
+		_debug("extract status array %u", call->count);
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		scb = &call->out_scb[call->count];
+		ret = xdr_decode_YFSFetchStatus(&bp, call, scb);
+		if (ret < 0)
+			return ret;
+
+		call->count++;
+		if (call->count < call->count2)
+			goto more_counts;
+
+		call->count = 0;
+		call->unmarshall++;
+		afs_extract_to_tmp(call);
+		/* Fall through */
+
+		/* Extract the callback count and array in two steps */
+	case 3:
+		_debug("extract CB count");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		tmp = ntohl(call->tmp);
+		_debug("CB count: %u", tmp);
+		if (tmp != call->count2)
+			return afs_protocol_error(call, -EBADMSG,
+						  afs_eproto_ibulkst_cb_count);
+		call->count = 0;
+		call->unmarshall++;
+	more_cbs:
+		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
+		/* Fall through */
+
+	case 4:
+		_debug("extract CB array");
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		_debug("unmarshall CB array");
+		bp = call->buffer;
+		scb = &call->out_scb[call->count];
+		xdr_decode_YFSCallBack(&bp, call, scb);
+		call->count++;
+		if (call->count < call->count2)
+			goto more_cbs;
+
+		afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
+		call->unmarshall++;
+		/* Fall through */
+
+	case 5:
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+		call->unmarshall++;
+		/* Fall through */
+
+	case 6:
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.InlineBulkStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSInlineBulkStatus = {
+	.name		= "YFS.InlineBulkStatus",
+	.op		= yfs_FS_InlineBulkStatus,
+	.deliver	= yfs_deliver_fs_inline_bulk_status,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for up to 1024 files
+ */
+int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
+			      struct afs_net *net,
+			      struct afs_fid *fids,
+			      struct afs_status_cb *statuses,
+			      unsigned int nr_fids,
+			      struct afs_volsync *volsync)
+{
+	struct afs_call *call;
+	__be32 *bp;
+	int i;
+
+	_enter(",%x,{%llx:%llu},%u",
+	       key_serial(fc->key), fids[0].vid, fids[0].vnode, nr_fids);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus,
+				   sizeof(__be32) +
+				   sizeof(__be32) +
+				   sizeof(__be32) +
+				   sizeof(struct yfs_xdr_YFSFid) * nr_fids,
+				   sizeof(struct yfs_xdr_YFSFetchStatus));
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->out_scb = statuses;
+	call->out_volsync = volsync;
+	call->count2 = nr_fids;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPCFlags */
+	bp = xdr_encode_u32(bp, nr_fids);
+	for (i = 0; i < nr_fids; i++)
+		bp = xdr_encode_YFSFid(bp, &fids[i]);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &fids[0]);
+	afs_set_fc_call(call, fc);
+	afs_make_call(&fc->ac, call, GFP_NOFS);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
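
The marshalling above uses bump-pointer XDR helpers: each xdr_encode_*() writes network-order 32-bit words and returns the advanced cursor, which is why the request buffer can be sized as exactly three __be32 words (operation, RPC flags, FID count) plus nr_fids encoded FIDs. A user-space sketch of that idiom (the helpers and the opcode value below are illustrative, not the kernel's):

#include <arpa/inet.h>		/* htonl() */
#include <stdint.h>

struct fid { uint64_t vid; uint64_t vnode; uint32_t unique; };

static uint32_t *xdr_encode_u32(uint32_t *bp, uint32_t n)
{
	*bp++ = htonl(n);			/* XDR is big-endian */
	return bp;
}

static uint32_t *xdr_encode_u64(uint32_t *bp, uint64_t n)
{
	bp = xdr_encode_u32(bp, (uint32_t)(n >> 32));	/* high word first */
	return xdr_encode_u32(bp, (uint32_t)n);
}

static uint32_t *xdr_encode_fid(uint32_t *bp, const struct fid *fid)
{
	bp = xdr_encode_u64(bp, fid->vid);
	bp = xdr_encode_u64(bp, fid->vnode);
	return xdr_encode_u32(bp, fid->unique);
}

int main(void)
{
	uint32_t buf[16], *bp = buf;
	struct fid f = { .vid = 0x1234, .vnode = 5, .unique = 1 };

	bp = xdr_encode_u32(bp, 0x1234);	/* operation id (arbitrary here) */
	bp = xdr_encode_u32(bp, 0);		/* RPC flags */
	bp = xdr_encode_u32(bp, 1);		/* FID count */
	bp = xdr_encode_fid(bp, &f);
	return (bp - buf) == 8 ? 0 : 1;		/* 3 words + one 5-word FID */
}
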
+
+/*
+ * Deliver reply data to a YFS.FetchOpaqueACL operation.
+ */
+static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
+{
+	struct yfs_acl *yacl = call->out_yacl;
+	struct afs_acl *acl;
+	const __be32 *bp;
+	unsigned int size;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	switch (call->unmarshall) {
+	case 0:
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the file ACL length */
+	case 1:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		size = call->count2 = ntohl(call->tmp);
+		size = round_up(size, 4);
+
+		if (yacl->flags & YFS_ACL_WANT_ACL) {
+			acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL);
+			if (!acl)
+				return -ENOMEM;
+			yacl->acl = acl;
+			acl->size = call->count2;
+			afs_extract_begin(call, acl->data, size);
+		} else {
+			afs_extract_discard(call, size);
+		}
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the file ACL */
+	case 2:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		afs_extract_to_tmp(call);
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the volume ACL length */
+	case 3:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		size = call->count2 = ntohl(call->tmp);
+		size = round_up(size, 4);
+
+		if (yacl->flags & YFS_ACL_WANT_VOL_ACL) {
+			acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL);
+			if (!acl)
+				return -ENOMEM;
+			yacl->vol_acl = acl;
+			acl->size = call->count2;
+			afs_extract_begin(call, acl->data, size);
+		} else {
+			afs_extract_discard(call, size);
+		}
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the volume ACL */
+	case 4:
+		ret = afs_extract_data(call, true);
+		if (ret < 0)
+			return ret;
+
+		afs_extract_to_buf(call,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+		call->unmarshall++;
+		/* Fall through */
+
+		/* Extract the metadata */
+	case 5:
+		ret = afs_extract_data(call, false);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		yacl->inherit_flag = ntohl(*bp++);
+		yacl->num_cleaned = ntohl(*bp++);
+		ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+		if (ret < 0)
+			return ret;
+		xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+		call->unmarshall++;
+		/* Fall through */
+
+	case 6:
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+void yfs_free_opaque_acl(struct yfs_acl *yacl)
+{
+	if (yacl) {
+		kfree(yacl->acl);
+		kfree(yacl->vol_acl);
+		kfree(yacl);
+	}
+}
+
+/*
+ * YFS.FetchOpaqueACL operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchOpaqueACL = {
+	.name		= "YFS.FetchOpaqueACL",
+	.op		= yfs_FS_FetchOpaqueACL,
+	.deliver	= yfs_deliver_fs_fetch_opaque_acl,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the YFS advanced ACLs for a file.
+ */
+struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *fc,
+					struct yfs_acl *yacl,
+					struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchOpaqueACL,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid),
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	call->key = fc->key;
+	call->out_yacl = yacl;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHOPAQUEACL);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_make_call(&fc->ac, call, GFP_KERNEL);
+	return (struct yfs_acl *)afs_wait_for_call_to_complete(call, &fc->ac);
+}
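
yfs_fs_fetch_opaque_acl() reports failure through the pointer itself: ERR_PTR(-ENOMEM) on allocation failure, while the cast of afs_wait_for_call_to_complete()'s long result performs the reverse encoding. A self-contained sketch of the ERR_PTR/IS_ERR idiom, mirroring include/linux/err.h (illustrative):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

static inline int IS_ERR(const void *ptr)
{
	/* The top 4095 addresses are reserved for errno encoding. */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *fetch_acl(int fail)
{
	static int acl = 42;			/* stands in for a real object */
	return fail ? ERR_PTR(-ENOMEM) : &acl;
}

int main(void)
{
	void *p = fetch_acl(1);

	if (IS_ERR(p))
		printf("error %ld\n", PTR_ERR(p));	/* prints "error -12" */
	return 0;
}
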
+
+/*
+ * YFS.StoreOpaqueACL2 operation type
+ */
+static const struct afs_call_type yfs_RXYFSStoreOpaqueACL2 = {
+	.name		= "YFS.StoreOpaqueACL2",
+	.op		= yfs_FS_StoreOpaqueACL2,
+	.deliver	= yfs_deliver_status_and_volsync,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Store the YFS ACL for a file.
+ */
+int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl,
+			     struct afs_status_cb *scb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
+	size_t size;
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	size = round_up(acl->size, 4);
+	call = afs_alloc_flat_call(net, &yfs_RXYFSStoreOpaqueACL2,
+				   sizeof(__be32) * 2 +
+				   sizeof(struct yfs_xdr_YFSFid) +
+				   sizeof(__be32) + size,
+				   sizeof(struct yfs_xdr_YFSFetchStatus) +
+				   sizeof(struct yfs_xdr_YFSVolSync));
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->out_scb = scb;
+	call->out_volsync = NULL;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSSTOREOPAQUEACL2);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_u32(bp, acl->size);
+	memcpy(bp, acl->data, acl->size);
+	if (acl->size != size)
+		memset((void *)bp + acl->size, 0, size - acl->size);
+	yfs_check_req(call, bp);
+
+	trace_afs_make_fs_call(call, &vnode->fid);
+	afs_make_call(&fc->ac, call, GFP_KERNEL);
+	return afs_wait_for_call_to_complete(call, &fc->ac);
+}
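
The memcpy/memset pair above implements XDR opaque padding: the payload is rounded up to a 4-byte boundary and the slack is zeroed so no uninitialised bytes reach the wire, matching the round_up(acl->size, 4) used to size the request. A sketch of that rule (illustrative helper, not kernel code):

#include <stdint.h>
#include <string.h>

#define XDR_ROUND_UP(x)	(((x) + 3) & ~(size_t)3)

static size_t xdr_encode_opaque(void *dst, const void *src, size_t len)
{
	size_t padded = XDR_ROUND_UP(len);

	memcpy(dst, src, len);
	if (padded != len)
		memset((char *)dst + len, 0, padded - len);	/* zero the tail */
	return padded;			/* bytes consumed in the request */
}

int main(void)
{
	char req[8];

	/* A 5-byte payload occupies 8 bytes on the wire. */
	return xdr_encode_opaque(req, "abcde", 5) == 8 ? 0 : 1;
}
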
diff --git a/fs/aio.c b/fs/aio.c
index 44551d9..0d9a559 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,6 +42,7 @@
 #include <linux/ramfs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 
 #include <asm/kmap_types.h>
 #include <linux/uaccess.h>
@@ -70,6 +71,12 @@
 	struct io_event		io_events[0];
 }; /* 128 bytes + ring size */
 
+/*
+ * Plugging is meant to work with larger batches of IOs. If we don't
+ * have more than the threshold below, don't bother setting up a plug.
+ */
+#define AIO_PLUG_THRESHOLD	2
+
 #define AIO_RING_PAGES	8
 
 struct kioctx_table {
@@ -161,9 +168,13 @@
 	unsigned		id;
 };
 
+/*
+ * The first field must be the file pointer in every member of
+ * the iocb union! See also 'struct kiocb' in <linux/fs.h>.
+ */
 struct fsync_iocb {
-	struct work_struct	work;
 	struct file		*file;
+	struct work_struct	work;
 	bool			datasync;
 };
 
@@ -171,14 +182,21 @@
 	struct file		*file;
 	struct wait_queue_head	*head;
 	__poll_t		events;
-	bool			woken;
+	bool			done;
 	bool			cancelled;
 	struct wait_queue_entry	wait;
 	struct work_struct	work;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in its struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct aio_kiocb {
 	union {
+		struct file		*ki_filp;
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
 		struct poll_iocb	poll;
@@ -187,8 +205,7 @@
 	struct kioctx		*ki_ctx;
 	kiocb_cancel_fn		*ki_cancel;
 
-	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
-	__u64			ki_user_data;	/* user's data for completion */
+	struct io_event		ki_res;
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
@@ -233,15 +250,12 @@
 	return file;
 }
 
-static struct dentry *aio_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int aio_init_fs_context(struct fs_context *fc)
 {
-	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
-					   AIO_RING_MAGIC);
-
-	if (!IS_ERR(root))
-		root->d_sb->s_iflags |= SB_I_NOEXEC;
-	return root;
+	if (!init_pseudo(fc, AIO_RING_MAGIC))
+		return -ENOMEM;
+	fc->s_iflags |= SB_I_NOEXEC;
+	return 0;
 }
 
 /* aio_setup
@@ -252,7 +266,7 @@
 {
 	static struct file_system_type aio_fs = {
 		.name		= "aio",
-		.mount		= aio_mount,
+		.init_fs_context = aio_init_fs_context,
 		.kill_sb	= kill_anon_super,
 	};
 	aio_mnt = kern_mount(&aio_fs);
@@ -409,7 +423,7 @@
 	BUG_ON(PageWriteback(old));
 	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
+	rc = migrate_page_move_mapping(mapping, new, old, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
 		goto out_unlock;
@@ -902,7 +916,7 @@
 	local_irq_restore(flags);
 }
 
-static bool get_reqs_available(struct kioctx *ctx)
+static bool __get_reqs_available(struct kioctx *ctx)
 {
 	struct kioctx_cpu *kcpu;
 	bool ret = false;
@@ -994,32 +1008,40 @@
 	spin_unlock_irq(&ctx->completion_lock);
 }
 
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	if (__get_reqs_available(ctx))
+		return true;
+	user_refill_reqs_available(ctx);
+	return __get_reqs_available(ctx);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
+ *
+ * The refcount is initialized to 2 - one for the async op completion,
+ * one for the synchronous code that does this.
  */
 static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct aio_kiocb *req;
 
-	if (!get_reqs_available(ctx)) {
-		user_refill_reqs_available(ctx);
-		if (!get_reqs_available(ctx))
-			return NULL;
+	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+	if (unlikely(!req))
+		return NULL;
+
+	if (unlikely(!get_reqs_available(ctx))) {
+		kmem_cache_free(kiocb_cachep, req);
+		return NULL;
 	}
 
-	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
-	if (unlikely(!req))
-		goto out_put;
-
 	percpu_ref_get(&ctx->reqs);
-	INIT_LIST_HEAD(&req->ki_list);
-	refcount_set(&req->ki_refcnt, 0);
 	req->ki_ctx = ctx;
+	INIT_LIST_HEAD(&req->ki_list);
+	refcount_set(&req->ki_refcnt, 2);
+	req->ki_eventfd = NULL;
 	return req;
-out_put:
-	put_reqs_available(ctx, 1);
-	return NULL;
 }
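
As the comment on aio_get_req() notes, every request now starts with a reference count of two: the asynchronous completion drops one and the submitting code drops the other, so whichever side finishes last both publishes the completion and frees the request, exactly once. A user-space sketch of that handoff (C11 atomics plus pthreads, build with -pthread; all names illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
	atomic_int refcnt;
	int	   res;		/* result, published at completion */
};

static void req_put(struct req *r)
{
	if (atomic_fetch_sub(&r->refcnt, 1) == 1) {	/* dropped the last ref */
		printf("complete, res=%d\n", r->res);
		free(r);
	}
}

static void *async_side(void *p)
{
	struct req *r = p;

	r->res = 42;		/* the operation "completes" */
	req_put(r);		/* drop the completion reference */
	return NULL;
}

int main(void)
{
	struct req *r = malloc(sizeof(*r));
	pthread_t t;

	if (!r)
		return 1;
	atomic_init(&r->refcnt, 2);	/* one async, one synchronous */
	pthread_create(&t, NULL, async_side, r);
	req_put(r);			/* drop the synchronous reference */
	pthread_join(t, NULL);
	return 0;
}
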
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
@@ -1050,19 +1072,20 @@
 	return ret;
 }
 
-static inline void iocb_put(struct aio_kiocb *iocb)
+static inline void iocb_destroy(struct aio_kiocb *iocb)
 {
-	if (refcount_read(&iocb->ki_refcnt) == 0 ||
-	    refcount_dec_and_test(&iocb->ki_refcnt)) {
-		percpu_ref_put(&iocb->ki_ctx->reqs);
-		kmem_cache_free(kiocb_cachep, iocb);
-	}
+	if (iocb->ki_eventfd)
+		eventfd_ctx_put(iocb->ki_eventfd);
+	if (iocb->ki_filp)
+		fput(iocb->ki_filp);
+	percpu_ref_put(&iocb->ki_ctx->reqs);
+	kmem_cache_free(kiocb_cachep, iocb);
 }
 
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
@@ -1086,17 +1109,14 @@
 	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 
-	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
-	event->data = iocb->ki_user_data;
-	event->res = res;
-	event->res2 = res2;
+	*event = iocb->ki_res;
 
 	kunmap_atomic(ev_page);
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 
-	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
-		 res, res2);
+	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
+		 (void __user *)(unsigned long)iocb->ki_res.obj,
+		 iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
 
 	/* after flagging the request as done, we
 	 * must never even look at it again
@@ -1123,10 +1143,8 @@
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (iocb->ki_eventfd) {
+	if (iocb->ki_eventfd)
 		eventfd_signal(iocb->ki_eventfd, 1);
-		eventfd_ctx_put(iocb->ki_eventfd);
-	}
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1138,7 +1156,14 @@
 
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
-	iocb_put(iocb);
+}
+
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+	if (refcount_dec_and_test(&iocb->ki_refcnt)) {
+		aio_complete(iocb);
+		iocb_destroy(iocb);
+	}
 }
 
 /* aio_read_events_ring
@@ -1412,18 +1437,17 @@
 		file_end_write(kiocb->ki_filp);
 	}
 
-	fput(kiocb->ki_filp);
-	aio_complete(iocb, res, res2);
+	iocb->ki_res.res = res;
+	iocb->ki_res.res2 = res2;
+	iocb_put(iocb);
 }
 
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 {
 	int ret;
 
-	req->ki_filp = fget(iocb->aio_fildes);
-	if (unlikely(!req->ki_filp))
-		return -EBADF;
 	req->ki_complete = aio_complete_rw;
+	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
 	req->ki_flags = iocb_flags(req->ki_filp);
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
@@ -1438,22 +1462,24 @@
 		ret = ioprio_check_cap(iocb->aio_reqprio);
 		if (ret) {
 			pr_debug("aio ioprio check cap error: %d\n", ret);
-			fput(req->ki_filp);
 			return ret;
 		}
 
 		req->ki_ioprio = iocb->aio_reqprio;
 	} else
-		req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+		req->ki_ioprio = get_current_ioprio();
 
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
-		fput(req->ki_filp);
-	return ret;
+		return ret;
+
+	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+	return 0;
 }
 
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
 	size_t len = iocb->aio_nbytes;
@@ -1487,66 +1513,59 @@
 		ret = -EINTR;
 		/*FALLTHRU*/
 	default:
-		aio_complete_rw(req, ret, 0);
+		req->ki_complete(req, ret, 0);
 	}
 }
 
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
-		bool compat)
+static int aio_read(struct kiocb *req, const struct iocb *iocb,
+			bool vectored, bool compat)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 	struct file *file;
-	ssize_t ret;
+	int ret;
 
 	ret = aio_prep_rw(req, iocb);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
-
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
+		return -EBADF;
 	ret = -EINVAL;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
-		goto out_fput;
+	if (ret < 0)
+		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
 		aio_rw_done(req, call_read_iter(file, req, &iter));
 	kfree(iovec);
-out_fput:
-	if (unlikely(ret))
-		fput(file);
 	return ret;
 }
 
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
-		bool compat)
+static int aio_write(struct kiocb *req, const struct iocb *iocb,
+			 bool vectored, bool compat)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 	struct file *file;
-	ssize_t ret;
+	int ret;
 
 	ret = aio_prep_rw(req, iocb);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
 
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
-		goto out_fput;
+	if (ret < 0)
+		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
 		/*
@@ -1564,35 +1583,26 @@
 		aio_rw_done(req, call_write_iter(file, req, &iter));
 	}
 	kfree(iovec);
-out_fput:
-	if (unlikely(ret))
-		fput(file);
 	return ret;
 }
 
 static void aio_fsync_work(struct work_struct *work)
 {
-	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
-	int ret;
+	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
 
-	ret = vfs_fsync(req->file, req->datasync);
-	fput(req->file);
-	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+	iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+	iocb_put(iocb);
 }
 
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+		     bool datasync)
 {
 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
 			iocb->aio_rw_flags))
 		return -EINVAL;
 
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
-	if (unlikely(!req->file->f_op->fsync)) {
-		fput(req->file);
+	if (unlikely(!req->file->f_op->fsync))
 		return -EINVAL;
-	}
 
 	req->datasync = datasync;
 	INIT_WORK(&req->work, aio_fsync_work);
@@ -1600,14 +1610,6 @@
 	return 0;
 }
 
-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
-	struct file *file = iocb->poll.file;
-
-	aio_complete(iocb, mangle_poll(mask), 0);
-	fput(file);
-}
-
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1633,9 +1635,11 @@
 		return;
 	}
 	list_del_init(&iocb->ki_list);
+	iocb->ki_res.res = mangle_poll(mask);
+	req->done = true;
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	aio_poll_complete(iocb, mask);
+	iocb_put(iocb);
 }
 
 /* assumes we are called with irqs disabled */
@@ -1661,27 +1665,29 @@
 	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
 	__poll_t mask = key_to_poll(key);
-
-	req->woken = true;
+	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		if (!(mask & req->events))
-			return 0;
-
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
-			list_del(&iocb->ki_list);
-			spin_unlock(&iocb->ki_ctx->ctx_lock);
-
-			list_del_init(&req->wait.entry);
-			aio_poll_complete(iocb, mask);
-			return 1;
-		}
-	}
+	if (mask && !(mask & req->events))
+		return 0;
 
 	list_del_init(&req->wait.entry);
-	schedule_work(&req->work);
+
+	if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+		/*
+		 * Try to complete the iocb inline if we can. Use
+		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
+		 * call this function with IRQs disabled and because IRQs
+		 * have to be disabled before ctx_lock is obtained.
+		 */
+		list_del(&iocb->ki_list);
+		iocb->ki_res.res = mangle_poll(mask);
+		req->done = true;
+		spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+		iocb_put(iocb);
+	} else {
+		schedule_work(&req->work);
+	}
 	return 1;
 }
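
The rewritten wake callback completes the iocb inline only when it can take ctx_lock without spinning, and otherwise punts to a workqueue, since it may be invoked from interrupt context with other locks held. A sketch of that trylock-or-defer shape (a pthread mutex standing in for the spinlock, a stub for schedule_work(); illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

static void complete_locked(int res)
{
	printf("completed inline, res=%d\n", res);
}

static void schedule_worker(int res)
{
	/* In the kernel: schedule_work(&req->work); the worker takes
	 * ctx_lock properly and completes there. */
	printf("deferred to worker, res=%d\n", res);
}

static void wake(int mask)
{
	if (pthread_mutex_trylock(&ctx_lock) == 0) {
		complete_locked(mask);		/* fast path */
		pthread_mutex_unlock(&ctx_lock);
	} else {
		schedule_worker(mask);		/* contended: defer */
	}
}

int main(void)
{
	wake(1);				/* lock free: inline */
	pthread_mutex_lock(&ctx_lock);
	wake(2);				/* lock held: deferred */
	pthread_mutex_unlock(&ctx_lock);
	return 0;
}
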
 
@@ -1708,11 +1714,12 @@
 	add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
 	struct kioctx *ctx = aiocb->ki_ctx;
 	struct poll_iocb *req = &aiocb->poll;
 	struct aio_poll_table apt;
+	bool cancel = false;
 	__poll_t mask;
 
 	/* reject any unknown events outside the normal event mask. */
@@ -1724,9 +1731,10 @@
 
 	INIT_WORK(&req->work, aio_poll_complete_work);
 	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
+
+	req->head = NULL;
+	req->done = false;
+	req->cancelled = false;
 
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
@@ -1737,43 +1745,88 @@
 	INIT_LIST_HEAD(&req->wait.entry);
 	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
 
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&aiocb->ki_refcnt, 2);
-
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
-	if (unlikely(!req->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
-
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	if (req->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
+	if (likely(req->head)) {
+		spin_lock(&req->head->lock);
+		if (unlikely(list_empty(&req->wait.entry))) {
+			if (apt.error)
+				cancel = true;
+			apt.error = 0;
+			mask = 0;
+		}
+		if (mask || apt.error) {
+			list_del_init(&req->wait.entry);
+		} else if (cancel) {
+			WRITE_ONCE(req->cancelled, true);
+		} else if (!req->done) { /* actually waiting for an event */
+			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+			aiocb->ki_cancel = aio_poll_cancel;
+		}
+		spin_unlock(&req->head->lock);
+	}
+	if (mask) { /* no async, we've stolen it */
+		aiocb->ki_res.res = mangle_poll(mask);
 		apt.error = 0;
-	} else if (mask || apt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&req->wait.entry));
-		list_del_init(&req->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-		aiocb->ki_cancel = aio_poll_cancel;
 	}
-	spin_unlock(&req->head->lock);
 	spin_unlock_irq(&ctx->ctx_lock);
+	if (mask)
+		iocb_put(aiocb);
+	return apt.error;
+}
 
-out:
-	if (unlikely(apt.error)) {
-		fput(req->file);
-		return apt.error;
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+			   struct iocb __user *user_iocb, struct aio_kiocb *req,
+			   bool compat)
+{
+	req->ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->ki_filp))
+		return -EBADF;
+
+	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+		struct eventfd_ctx *eventfd;
+		/*
+		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+		 * instance of the file* now. The file descriptor must be
+		 * an eventfd() fd, and will be signaled for each completed
+		 * event using the eventfd_signal() function.
+		 */
+		eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
+		if (IS_ERR(eventfd))
+			return PTR_ERR(eventfd);
+
+		req->ki_eventfd = eventfd;
 	}
 
-	if (mask)
-		aio_poll_complete(aiocb, mask);
-	iocb_put(aiocb);
-	return 0;
+	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
+		pr_debug("EFAULT: aio_key\n");
+		return -EFAULT;
+	}
+
+	req->ki_res.obj = (u64)(unsigned long)user_iocb;
+	req->ki_res.data = iocb->aio_data;
+	req->ki_res.res = 0;
+	req->ki_res.res2 = 0;
+
+	switch (iocb->aio_lio_opcode) {
+	case IOCB_CMD_PREAD:
+		return aio_read(&req->rw, iocb, false, compat);
+	case IOCB_CMD_PWRITE:
+		return aio_write(&req->rw, iocb, false, compat);
+	case IOCB_CMD_PREADV:
+		return aio_read(&req->rw, iocb, true, compat);
+	case IOCB_CMD_PWRITEV:
+		return aio_write(&req->rw, iocb, true, compat);
+	case IOCB_CMD_FSYNC:
+		return aio_fsync(&req->fsync, iocb, false);
+	case IOCB_CMD_FDSYNC:
+		return aio_fsync(&req->fsync, iocb, true);
+	case IOCB_CMD_POLL:
+		return aio_poll(req, iocb);
+	default:
+		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
+		return -EINVAL;
+	}
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
@@ -1781,7 +1834,7 @@
 {
 	struct aio_kiocb *req;
 	struct iocb iocb;
-	ssize_t ret;
+	int err;
 
 	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
 		return -EFAULT;
@@ -1806,73 +1859,21 @@
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
-		/*
-		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
-		 * instance of the file* now. The file descriptor must be
-		 * an eventfd() fd, and will be signaled for each completed
-		 * event using the eventfd_signal() function.
-		 */
-		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
-		if (IS_ERR(req->ki_eventfd)) {
-			ret = PTR_ERR(req->ki_eventfd);
-			req->ki_eventfd = NULL;
-			goto out_put_req;
-		}
-	}
+	err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
 
-	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
-	if (unlikely(ret)) {
-		pr_debug("EFAULT: aio_key\n");
-		goto out_put_req;
-	}
-
-	req->ki_user_iocb = user_iocb;
-	req->ki_user_data = iocb.aio_data;
-
-	switch (iocb.aio_lio_opcode) {
-	case IOCB_CMD_PREAD:
-		ret = aio_read(&req->rw, &iocb, false, compat);
-		break;
-	case IOCB_CMD_PWRITE:
-		ret = aio_write(&req->rw, &iocb, false, compat);
-		break;
-	case IOCB_CMD_PREADV:
-		ret = aio_read(&req->rw, &iocb, true, compat);
-		break;
-	case IOCB_CMD_PWRITEV:
-		ret = aio_write(&req->rw, &iocb, true, compat);
-		break;
-	case IOCB_CMD_FSYNC:
-		ret = aio_fsync(&req->fsync, &iocb, false);
-		break;
-	case IOCB_CMD_FDSYNC:
-		ret = aio_fsync(&req->fsync, &iocb, true);
-		break;
-	case IOCB_CMD_POLL:
-		ret = aio_poll(req, &iocb);
-		break;
-	default:
-		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
-		ret = -EINVAL;
-		break;
-	}
+	/* Done with the synchronous reference */
+	iocb_put(req);
 
 	/*
-	 * If ret is 0, we'd either done aio_complete() ourselves or have
+	 * If err is 0, we've either done aio_complete() ourselves or have
 	 * arranged for that to be done asynchronously.  Anything non-zero
 	 * means that we need to destroy req ourselves.
 	 */
-	if (ret)
-		goto out_put_req;
-	return 0;
-out_put_req:
-	put_reqs_available(ctx, 1);
-	percpu_ref_put(&ctx->reqs);
-	if (req->ki_eventfd)
-		eventfd_ctx_put(req->ki_eventfd);
-	kmem_cache_free(kiocb_cachep, req);
-	return ret;
+	if (unlikely(err)) {
+		iocb_destroy(req);
+		put_reqs_available(ctx, 1);
+	}
+	return err;
 }
 
 /* sys_io_submit:
@@ -1907,7 +1908,8 @@
 	if (nr > ctx->nr_events)
 		nr = ctx->nr_events;
 
-	blk_start_plug(&plug);
+	if (nr > AIO_PLUG_THRESHOLD)
+		blk_start_plug(&plug);
 	for (i = 0; i < nr; i++) {
 		struct iocb __user *user_iocb;
 
@@ -1920,7 +1922,8 @@
 		if (ret)
 			break;
 	}
-	blk_finish_plug(&plug);
+	if (nr > AIO_PLUG_THRESHOLD)
+		blk_finish_plug(&plug);
 
 	percpu_ref_put(&ctx->users);
 	return i ? i : ret;
@@ -1947,7 +1950,8 @@
 	if (nr > ctx->nr_events)
 		nr = ctx->nr_events;
 
-	blk_start_plug(&plug);
+	if (nr > AIO_PLUG_THRESHOLD)
+		blk_start_plug(&plug);
 	for (i = 0; i < nr; i++) {
 		compat_uptr_t user_iocb;
 
@@ -1960,31 +1964,14 @@
 		if (ret)
 			break;
 	}
-	blk_finish_plug(&plug);
+	if (nr > AIO_PLUG_THRESHOLD)
+		blk_finish_plug(&plug);
 
 	percpu_ref_put(&ctx->users);
 	return i ? i : ret;
 }
 #endif
 
-/* lookup_kiocb
- *	Finds a given iocb for cancellation.
- */
-static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
-{
-	struct aio_kiocb *kiocb;
-
-	assert_spin_locked(&ctx->ctx_lock);
-
-	/* TODO: use a hash or array, this sucks. */
-	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
-		if (kiocb->ki_user_iocb == iocb)
-			return kiocb;
-	}
-	return NULL;
-}
-
 /* sys_io_cancel:
  *	Attempts to cancel an iocb previously passed to io_submit.  If
  *	the operation is successfully cancelled, the resulting event is
@@ -2002,6 +1989,7 @@
 	struct aio_kiocb *kiocb;
 	int ret = -EINVAL;
 	u32 key;
+	u64 obj = (u64)(unsigned long)iocb;
 
 	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
@@ -2013,10 +2001,13 @@
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	kiocb = lookup_kiocb(ctx, iocb);
-	if (kiocb) {
-		ret = kiocb->ki_cancel(&kiocb->rw);
-		list_del_init(&kiocb->ki_list);
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+		if (kiocb->ki_res.obj == obj) {
+			ret = kiocb->ki_cancel(&kiocb->rw);
+			list_del_init(&kiocb->ki_list);
+			break;
+		}
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
@@ -2065,11 +2056,13 @@
  *	specifies an infinite timeout. Note that the timeout pointed to by
  *	timeout is relative.  Will fail with -ENOSYS if not implemented.
  */
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
 		long, nr,
 		struct io_event __user *, events,
-		struct timespec __user *, timeout)
+		struct __kernel_timespec __user *, timeout)
 {
 	struct timespec64	ts;
 	int			ret;
@@ -2083,6 +2076,8 @@
 	return ret;
 }
 
+#endif
+
 struct __aio_sigset {
 	const sigset_t __user	*sigmask;
 	size_t		sigsetsize;
@@ -2093,12 +2088,12 @@
 		long, min_nr,
 		long, nr,
 		struct io_event __user *, events,
-		struct timespec __user *, timeout,
+		struct __kernel_timespec __user *, timeout,
 		const struct __aio_sigset __user *, usig)
 {
 	struct __aio_sigset	ksig = { NULL, };
-	sigset_t		ksigmask, sigsaved;
 	struct timespec64	ts;
+	bool interrupted;
 	int ret;
 
 	if (timeout && unlikely(get_timespec64(&ts, timeout)))
@@ -2107,43 +2102,70 @@
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	if (ksig.sigmask) {
-		if (ksig.sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
-	if (signal_pending(current)) {
-		if (ksig.sigmask) {
-			current->saved_sigmask = sigsaved;
-			set_restore_sigmask();
-		}
 
-		if (!ret)
-			ret = -ERESTARTNOHAND;
-	} else {
-		if (ksig.sigmask)
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
+	interrupted = signal_pending(current);
+	restore_saved_sigmask_unless(interrupted);
+	if (interrupted && !ret)
+		ret = -ERESTARTNOHAND;
 
 	return ret;
 }
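
The sigmask handling is now the common set_user_sigmask()/restore_saved_sigmask_unless() pair: the caller's mask is installed for the duration of the wait, and if a signal interrupted the wait the original mask is restored only after that signal is delivered. The user-visible contract is the one pselect(2) provides over select(2); a sketch of that analogue (illustrative):

#include <signal.h>
#include <stdio.h>
#include <sys/select.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };
	sigset_t mask;
	fd_set rfds;
	int fds[2];

	if (pipe(fds))
		return 1;

	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR1);	/* this mask applies only during the wait */

	FD_ZERO(&rfds);
	FD_SET(fds[0], &rfds);

	/*
	 * pselect() installs "mask" atomically for the wait and restores
	 * the old mask on return; if a now-unblocked signal is pending,
	 * its handler runs first.  The helpers in the hunk above implement
	 * that same "restore unless interrupted" rule for io_pgetevents().
	 */
	if (pselect(fds[0] + 1, &rfds, NULL, NULL, &ts, &mask) < 0)
		perror("pselect");
	return 0;
}
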
 
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
-		       compat_long_t, min_nr,
-		       compat_long_t, nr,
-		       struct io_event __user *, events,
-		       struct compat_timespec __user *, timeout)
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE6(io_pgetevents_time32,
+		aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct old_timespec32 __user *, timeout,
+		const struct __aio_sigset __user *, usig)
+{
+	struct __aio_sigset	ksig = { NULL, };
+	struct timespec64	ts;
+	bool interrupted;
+	int ret;
+
+	if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
+		return -EFAULT;
+
+	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+		return -EFAULT;
+
+	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+	if (ret)
+		return ret;
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+
+	interrupted = signal_pending(current);
+	restore_saved_sigmask_unless(interrupted);
+	if (interrupted && !ret)
+		ret = -ERESTARTNOHAND;
+
+	return ret;
+}
+
+#endif
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+		__s32, min_nr,
+		__s32, nr,
+		struct io_event __user *, events,
+		struct old_timespec32 __user *, timeout)
 {
 	struct timespec64 t;
 	int ret;
 
-	if (timeout && compat_get_timespec64(&t, timeout))
+	if (timeout && get_old_timespec32(&t, timeout))
 		return -EFAULT;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
@@ -2152,52 +2174,81 @@
 	return ret;
 }
 
+#endif
+
+#ifdef CONFIG_COMPAT
 
 struct __compat_aio_sigset {
-	compat_sigset_t __user	*sigmask;
+	compat_uptr_t		sigmask;
 	compat_size_t		sigsetsize;
 };
 
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
 COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 		compat_aio_context_t, ctx_id,
 		compat_long_t, min_nr,
 		compat_long_t, nr,
 		struct io_event __user *, events,
-		struct compat_timespec __user *, timeout,
+		struct old_timespec32 __user *, timeout,
 		const struct __compat_aio_sigset __user *, usig)
 {
-	struct __compat_aio_sigset ksig = { NULL, };
-	sigset_t ksigmask, sigsaved;
+	struct __compat_aio_sigset ksig = { 0, };
 	struct timespec64 t;
+	bool interrupted;
 	int ret;
 
-	if (timeout && compat_get_timespec64(&t, timeout))
+	if (timeout && get_old_timespec32(&t, timeout))
 		return -EFAULT;
 
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	if (ksig.sigmask) {
-		if (ksig.sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, ksig.sigmask))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
-	if (signal_pending(current)) {
-		if (ksig.sigmask) {
-			current->saved_sigmask = sigsaved;
-			set_restore_sigmask();
-		}
-		if (!ret)
-			ret = -ERESTARTNOHAND;
-	} else {
-		if (ksig.sigmask)
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
+
+	interrupted = signal_pending(current);
+	restore_saved_sigmask_unless(interrupted);
+	if (interrupted && !ret)
+		ret = -ERESTARTNOHAND;
+
+	return ret;
+}
+
+#endif
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
+		compat_aio_context_t, ctx_id,
+		compat_long_t, min_nr,
+		compat_long_t, nr,
+		struct io_event __user *, events,
+		struct __kernel_timespec __user *, timeout,
+		const struct __compat_aio_sigset __user *, usig)
+{
+	struct __compat_aio_sigset ksig = { 0, };
+	struct timespec64 t;
+	bool interrupted;
+	int ret;
+
+	if (timeout && get_timespec64(&t, timeout))
+		return -EFAULT;
+
+	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+		return -EFAULT;
+
+	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
+	if (ret)
+		return ret;
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+
+	interrupted = signal_pending(current);
+	restore_saved_sigmask_unless(interrupted);
+	if (interrupted && !ret)
+		ret = -ERESTARTNOHAND;
 
 	return ret;
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 91262c3..8971430 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  fs/anon_inodes.c
  *
@@ -19,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/magic.h>
 #include <linux/anon_inodes.h>
+#include <linux/pseudo_fs.h>
 
 #include <linux/uaccess.h>
 
@@ -38,16 +40,18 @@
 	.d_dname	= anon_inodefs_dname,
 };
 
-static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int anon_inodefs_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, "anon_inode:", NULL,
-			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+	struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->dops = &anon_inodefs_dentry_operations;
+	return 0;
 }
 
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.mount		= anon_inodefs_mount,
+	.init_fs_context = anon_inodefs_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
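
Both aio and anon_inodefs follow the same conversion recipe here: replace .mount/mount_pseudo() with .init_fs_context, call init_pseudo() with the filesystem's magic number, and adjust the returned pseudo_fs_context (or the fs_context flags) instead of the mounted root dentry. A condensed, hypothetical sketch of the converted shape (kernel-style code, only buildable in-tree; "examplefs" and its magic value are made up):

#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>

#define EXAMPLEFS_MAGIC	0x45584653	/* made-up magic number */

static const struct dentry_operations examplefs_dops;	/* optional, as in anon_inodefs */

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->dops = &examplefs_dops;	/* what anon_inodefs customises */
	fc->s_iflags |= SB_I_NOEXEC;	/* what aio customises */
	return 0;
}

static struct file_system_type examplefs_type = {
	.name		 = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	 = kill_anon_super,
};
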
 
diff --git a/fs/attr.c b/fs/attr.c
index d22e818..df28035 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -183,15 +183,18 @@
 		inode->i_uid = attr->ia_uid;
 	if (ia_valid & ATTR_GID)
 		inode->i_gid = attr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		inode->i_atime = timespec64_trunc(attr->ia_atime,
-						  inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		inode->i_mtime = timespec64_trunc(attr->ia_mtime,
-						  inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		inode->i_ctime = timespec64_trunc(attr->ia_ctime,
-						  inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_ATIME) {
+		inode->i_atime = timestamp_truncate(attr->ia_atime,
+						  inode);
+	}
+	if (ia_valid & ATTR_MTIME) {
+		inode->i_mtime = timestamp_truncate(attr->ia_mtime,
+						  inode);
+	}
+	if (ia_valid & ATTR_CTIME) {
+		inode->i_ctime = timestamp_truncate(attr->ia_ctime,
+						  inode);
+	}
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index eaebcd4..3b3a6b1 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config AUTOFS4_FS
 	tristate "Old Kconfig name for Kernel automounter support"
 	select AUTOFS_FS
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
index 1f85d35..495ac54 100644
--- a/fs/autofs/Makefile
+++ b/fs/autofs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux autofs-filesystem routines.
 #
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 9f9cadb..8bcec8d 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *  Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
  *  Copyright 2005-2006 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 /* Internal header file for autofs */
@@ -42,6 +39,8 @@
 #endif
 #define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
 
+extern struct file_system_type autofs_fs_type;
+
 /*
  * Unified info structure.  This is pointed to by both the dentry and
  * inode structures.  Each file in the filesystem has an instance of this
@@ -59,7 +58,6 @@
 	struct completion expire_complete;
 
 	struct list_head active;
-	int active_count;
 
 	struct list_head expiring;
 
@@ -69,6 +67,7 @@
 
 	kuid_t uid;
 	kgid_t gid;
+	struct rcu_head rcu;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
@@ -101,16 +100,20 @@
 
 #define AUTOFS_SBI_MAGIC 0x6d4a556d
 
+#define AUTOFS_SBI_CATATONIC	0x0001
+#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE	0x0004
+
 struct autofs_sb_info {
 	u32 magic;
 	int pipefd;
 	struct file *pipe;
 	struct pid *oz_pgrp;
-	int catatonic;
 	int version;
 	int sub_version;
 	int min_proto;
 	int max_proto;
+	unsigned int flags;
 	unsigned long exp_timeout;
 	unsigned int type;
 	struct super_block *sb;
@@ -126,8 +129,7 @@
 
 static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
 {
-	return sb->s_magic != AUTOFS_SUPER_MAGIC ?
-		NULL : (struct autofs_sb_info *)(sb->s_fs_info);
+	return (struct autofs_sb_info *)(sb->s_fs_info);
 }
 
 static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
@@ -141,7 +143,8 @@
  */
 static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
 {
-	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
+	return ((sbi->flags & AUTOFS_SBI_CATATONIC) ||
+		 task_pgrp(current) == sbi->oz_pgrp);
 }
 
 struct inode *autofs_get_inode(struct super_block *, umode_t);
@@ -210,6 +213,8 @@
 		return -EINVAL;
 	/* We want a packet pipe */
 	pipe->f_flags |= O_DIRECT;
+	/* We don't expect -EAGAIN */
+	pipe->f_flags &= ~O_NONBLOCK;
 	return 0;
 }
 
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 86eafda..a3cdb00 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 2008 Red Hat, Inc. All rights reserved.
  * Copyright 2008 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include <linux/miscdevice.h>
@@ -151,22 +148,6 @@
 	return err;
 }
 
-/*
- * Get the autofs super block info struct from the file opened on
- * the autofs mount point.
- */
-static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
-{
-	struct autofs_sb_info *sbi = NULL;
-	struct inode *inode;
-
-	if (f) {
-		inode = file_inode(f);
-		sbi = autofs_sbi(inode->i_sb);
-	}
-	return sbi;
-}
-
 /* Return autofs dev ioctl version */
 static int autofs_dev_ioctl_version(struct file *fp,
 				    struct autofs_sb_info *sbi,
@@ -366,7 +347,7 @@
 	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
-	if (!sbi->catatonic) {
+	if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) {
 		mutex_unlock(&sbi->wq_mutex);
 		return -EBUSY;
 	} else {
@@ -393,7 +374,7 @@
 		swap(sbi->oz_pgrp, new_pid);
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
-		sbi->catatonic = 0;
+		sbi->flags &= ~AUTOFS_SBI_CATATONIC;
 	}
 out:
 	put_pid(new_pid);
@@ -658,6 +639,8 @@
 	if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
 	    cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
 	    cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+		struct super_block *sb;
+
 		fp = fget(param->ioctlfd);
 		if (!fp) {
 			if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
@@ -666,12 +649,13 @@
 			goto out;
 		}
 
-		sbi = autofs_dev_ioctl_sbi(fp);
-		if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+		sb = file_inode(fp)->i_sb;
+		if (sb->s_type != &autofs_fs_type) {
 			err = -EINVAL;
 			fput(fp);
 			goto out;
 		}
+		sbi = autofs_sbi(sb);
 
 		/*
 		 * Admin needs to be able to set the mount catatonic in
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index d441244..91f5787 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
  * Copyright 2001-2006 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include "autofs_i.h"
@@ -73,6 +70,27 @@
 	return status;
 }
 
+/* p->d_lock held */
+static struct dentry *positive_after(struct dentry *p, struct dentry *child)
+{
+	if (child)
+		child = list_next_entry(child, d_child);
+	else
+		child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
+
+	list_for_each_entry_from(child, &p->d_subdirs, d_child) {
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		if (simple_positive(child)) {
+			dget_dlock(child);
+			spin_unlock(&child->d_lock);
+			return child;
+		}
+		spin_unlock(&child->d_lock);
+	}
+
+	return NULL;
+}
+
 /*
  * Calculate and dget next entry in the subdirs list under root.
  */
@@ -80,43 +98,14 @@
 					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
-	struct list_head *next;
 	struct dentry *q;
 
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&root->d_lock);
-
-	if (prev)
-		next = prev->d_child.next;
-	else {
-		prev = dget_dlock(root);
-		next = prev->d_subdirs.next;
-	}
-
-cont:
-	if (next == &root->d_subdirs) {
-		spin_unlock(&root->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		dput(prev);
-		return NULL;
-	}
-
-	q = list_entry(next, struct dentry, d_child);
-
-	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
-	/* Already gone or negative dentry (under construction) - try next */
-	if (!d_count(q) || !simple_positive(q)) {
-		spin_unlock(&q->d_lock);
-		next = q->d_child.next;
-		goto cont;
-	}
-	dget_dlock(q);
-	spin_unlock(&q->d_lock);
+	q = positive_after(root, prev);
 	spin_unlock(&root->d_lock);
 	spin_unlock(&sbi->lookup_lock);
-
 	dput(prev);
-
 	return q;
 }
 
@@ -127,59 +116,28 @@
 					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
-	struct list_head *next;
-	struct dentry *p, *ret;
+	struct dentry *p = prev, *ret = NULL, *d = NULL;
 
 	if (prev == NULL)
 		return dget(root);
 
 	spin_lock(&sbi->lookup_lock);
-relock:
-	p = prev;
 	spin_lock(&p->d_lock);
-again:
-	next = p->d_subdirs.next;
-	if (next == &p->d_subdirs) {
-		while (1) {
-			struct dentry *parent;
+	while (1) {
+		struct dentry *parent;
 
-			if (p == root) {
-				spin_unlock(&p->d_lock);
-				spin_unlock(&sbi->lookup_lock);
-				dput(prev);
-				return NULL;
-			}
-
-			parent = p->d_parent;
-			if (!spin_trylock(&parent->d_lock)) {
-				spin_unlock(&p->d_lock);
-				cpu_relax();
-				goto relock;
-			}
-			spin_unlock(&p->d_lock);
-			next = p->d_child.next;
-			p = parent;
-			if (next != &parent->d_subdirs)
-				break;
-		}
-	}
-	ret = list_entry(next, struct dentry, d_child);
-
-	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
-	/* Negative dentry - try next */
-	if (!simple_positive(ret)) {
+		ret = positive_after(p, d);
+		if (ret || p == root)
+			break;
+		parent = p->d_parent;
 		spin_unlock(&p->d_lock);
-		lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
-		p = ret;
-		goto again;
+		spin_lock(&parent->d_lock);
+		d = p;
+		p = parent;
 	}
-	dget_dlock(ret);
-	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
 	spin_unlock(&sbi->lookup_lock);
-
 	dput(prev);
-
 	return ret;
 }
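
positive_after() collapses the two open-coded cursor walks into one resumable scan: given a parent and an optional previous child, it continues from just after that child and returns the next positive entry with a reference held (list_for_each_entry_from() iterates from a given entry to the end of the list). A user-space sketch of the resumable-scan idiom, with a plain linked list standing in for d_subdirs (illustrative):

#include <stdio.h>

struct node {
	struct node *next;
	int positive;		/* stands in for simple_positive() */
	int id;
};

static struct node *positive_after(struct node *head, struct node *prev)
{
	struct node *n = prev ? prev->next : head;	/* resume after prev */

	for (; n; n = n->next)
		if (n->positive)
			return n;
	return NULL;
}

int main(void)
{
	struct node c = { NULL, 1, 3 }, b = { &c, 0, 2 }, a = { &b, 1, 1 };
	struct node *p = NULL;

	while ((p = positive_after(&a, p)))
		printf("positive child %d\n", p->id);	/* prints 1 then 3 */
	return 0;
}
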
 
@@ -501,9 +459,10 @@
 		 */
 		how &= ~AUTOFS_EXP_LEAVES;
 		found = should_expire(expired, mnt, timeout, how);
-		if (!found || found != expired)
-			/* Something has changed, continue */
+		if (found != expired) { /* something has changed, continue */
+			dput(found);
 			goto next;
+		}
 
 		if (expired != dentry)
 			dput(dentry);
@@ -596,7 +555,6 @@
 	pkt.len = dentry->d_name.len;
 	memcpy(pkt.name, dentry->d_name.name, pkt.len);
 	pkt.name[pkt.len] = '\0';
-	dput(dentry);
 
 	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		ret = -EFAULT;
@@ -609,6 +567,8 @@
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
+	dput(dentry);
+
 	return ret;
 }
 
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index 79ae07d..d3f55e8 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include <linux/module.h>
@@ -16,7 +13,7 @@
 	return mount_nodev(fs_type, flags, data, autofs_fill_super);
 }
 
-static struct file_system_type autofs_fs_type = {
+struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.mount		= autofs_mount,
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 846c052..9edf243 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  * Copyright 2005-2006 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include <linux/seq_file.h>
@@ -36,7 +33,7 @@
 
 void autofs_free_ino(struct autofs_info *ino)
 {
-	kfree(ino);
+	kfree_rcu(ino, rcu);
 }
 
 void autofs_kill_sb(struct super_block *sb)
@@ -82,16 +79,20 @@
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
 	if (autofs_type_offset(sbi->type))
-		seq_printf(m, ",offset");
+		seq_puts(m, ",offset");
 	else if (autofs_type_direct(sbi->type))
-		seq_printf(m, ",direct");
+		seq_puts(m, ",direct");
 	else
-		seq_printf(m, ",indirect");
+		seq_puts(m, ",indirect");
+	if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
+		seq_puts(m, ",strictexpire");
+	if (sbi->flags & AUTOFS_SBI_IGNORE)
+		seq_puts(m, ",ignore");
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	if (sbi->pipe)
 		seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
 	else
-		seq_printf(m, ",pipe_ino=-1");
+		seq_puts(m, ",pipe_ino=-1");
 #endif
 	return 0;
 }
@@ -109,7 +110,8 @@
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
-	Opt_indirect, Opt_direct, Opt_offset};
+	Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+	Opt_ignore};
 
 static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
@@ -121,24 +123,29 @@
 	{Opt_indirect, "indirect"},
 	{Opt_direct, "direct"},
 	{Opt_offset, "offset"},
+	{Opt_strictexpire, "strictexpire"},
+	{Opt_ignore, "ignore"},
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
-			 int *pgrp, bool *pgrp_set, unsigned int *type,
-			 int *minproto, int *maxproto)
+static int parse_options(char *options,
+			 struct inode *root, int *pgrp, bool *pgrp_set,
+			 struct autofs_sb_info *sbi)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
+	int pipefd = -1;
+	kuid_t uid;
+	kgid_t gid;
 
-	*uid = current_uid();
-	*gid = current_gid();
+	root->i_uid = current_uid();
+	root->i_gid = current_gid();
 
-	*minproto = AUTOFS_MIN_PROTO_VERSION;
-	*maxproto = AUTOFS_MAX_PROTO_VERSION;
+	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
 
-	*pipefd = -1;
+	sbi->pipefd = -1;
 
 	if (!options)
 		return 1;
@@ -152,22 +159,25 @@
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_fd:
-			if (match_int(args, pipefd))
+			if (match_int(args, &pipefd))
 				return 1;
+			sbi->pipefd = pipefd;
 			break;
 		case Opt_uid:
 			if (match_int(args, &option))
 				return 1;
-			*uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(*uid))
+			uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(uid))
 				return 1;
+			root->i_uid = uid;
 			break;
 		case Opt_gid:
 			if (match_int(args, &option))
 				return 1;
-			*gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(*gid))
+			gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(gid))
 				return 1;
+			root->i_gid = gid;
 			break;
 		case Opt_pgrp:
 			if (match_int(args, &option))
@@ -178,27 +188,33 @@
 		case Opt_minproto:
 			if (match_int(args, &option))
 				return 1;
-			*minproto = option;
+			sbi->min_proto = option;
 			break;
 		case Opt_maxproto:
 			if (match_int(args, &option))
 				return 1;
-			*maxproto = option;
+			sbi->max_proto = option;
 			break;
 		case Opt_indirect:
-			set_autofs_type_indirect(type);
+			set_autofs_type_indirect(&sbi->type);
 			break;
 		case Opt_direct:
-			set_autofs_type_direct(type);
+			set_autofs_type_direct(&sbi->type);
 			break;
 		case Opt_offset:
-			set_autofs_type_offset(type);
+			set_autofs_type_offset(&sbi->type);
+			break;
+		case Opt_strictexpire:
+			sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+			break;
+		case Opt_ignore:
+			sbi->flags |= AUTOFS_SBI_IGNORE;
 			break;
 		default:
 			return 1;
 		}
 	}
-	return (*pipefd < 0);
+	return (sbi->pipefd < 0);
 }
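
parse_options() keeps the usual match_token() pattern while now writing results straight into the sb_info and root inode; the final return enforces that fd= was supplied. A user-space sketch of token-table option parsing in the same spirit (strtok_r/atoi standing in for match_token/match_int; illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char opts[] = "fd=4,strictexpire";	/* e.g. from mount(2) data */
	int pipefd = -1, strictexpire = 0;
	char *p, *save;

	for (p = strtok_r(opts, ",", &save); p; p = strtok_r(NULL, ",", &save)) {
		if (!strncmp(p, "fd=", 3))		/* like {Opt_fd, "fd=%u"} */
			pipefd = atoi(p + 3);
		else if (!strcmp(p, "strictexpire"))	/* boolean flag */
			strictexpire = 1;
		else
			return 1;			/* unknown option: reject */
	}
	printf("pipefd=%d strictexpire=%d\n", pipefd, strictexpire);
	return pipefd < 0;	/* parse_options() likewise requires fd= */
}
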
 
 int autofs_fill_super(struct super_block *s, void *data, int silent)
@@ -206,7 +222,6 @@
 	struct inode *root_inode;
 	struct dentry *root;
 	struct file *pipe;
-	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
 	int pgrp = 0;
@@ -222,12 +237,12 @@
 	sbi->magic = AUTOFS_SBI_MAGIC;
 	sbi->pipefd = -1;
 	sbi->pipe = NULL;
-	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
 	sbi->oz_pgrp = NULL;
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
+	sbi->flags = AUTOFS_SBI_CATATONIC;
 	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
@@ -255,16 +270,16 @@
 	}
 	root_inode = autofs_get_inode(s, S_IFDIR | 0755);
 	root = d_make_root(root_inode);
-	if (!root)
+	if (!root) {
+		ret = -ENOMEM;
 		goto fail_ino;
+	}
 	pipe = NULL;
 
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
-	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
-			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
-			  &sbi->max_proto)) {
+	if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
 		pr_err("called with bogus options\n");
 		goto fail_dput;
 	}
@@ -303,8 +318,9 @@
 	root_inode->i_fop = &autofs_root_operations;
 	root_inode->i_op = &autofs_dir_inode_operations;
 
-	pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
-	pipe = fget(pipefd);
+	pr_debug("pipe fd = %d, pgrp = %u\n",
+		 sbi->pipefd, pid_nr(sbi->oz_pgrp));
+	pipe = fget(sbi->pipefd);
 
 	if (!pipe) {
 		pr_err("could not open pipe file descriptor\n");
@@ -314,8 +330,7 @@
 	if (ret < 0)
 		goto fail_fput;
 	sbi->pipe = pipe;
-	sbi->pipefd = pipefd;
-	sbi->catatonic = 0;
+	sbi->flags &= ~AUTOFS_SBI_CATATONIC;
 
 	/*
 	 * Success! Install the root dentry now to indicate completion.
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 782e57b..29abafc 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
  * Copyright 2001-2006 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include <linux/capability.h>
@@ -63,38 +60,15 @@
 	.d_release	= autofs_dentry_release,
 };
 
-static void autofs_add_active(struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
-	struct autofs_info *ino;
-
-	ino = autofs_dentry_ino(dentry);
-	if (ino) {
-		spin_lock(&sbi->lookup_lock);
-		if (!ino->active_count) {
-			if (list_empty(&ino->active))
-				list_add(&ino->active, &sbi->active_list);
-		}
-		ino->active_count++;
-		spin_unlock(&sbi->lookup_lock);
-	}
-}
-
 static void autofs_del_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
 	struct autofs_info *ino;
 
 	ino = autofs_dentry_ino(dentry);
-	if (ino) {
-		spin_lock(&sbi->lookup_lock);
-		ino->active_count--;
-		if (!ino->active_count) {
-			if (!list_empty(&ino->active))
-				list_del_init(&ino->active);
-		}
-		spin_unlock(&sbi->lookup_lock);
-	}
+	spin_lock(&sbi->lookup_lock);
+	list_del_init(&ino->active);
+	spin_unlock(&sbi->lookup_lock);
 }
 
 static int autofs_dir_open(struct inode *inode, struct file *file)
@@ -275,8 +249,11 @@
 		pr_debug("waiting for mount name=%pd\n", path->dentry);
 		status = autofs_wait(sbi, path, NFY_MOUNT);
 		pr_debug("mount wait done status=%d\n", status);
+		ino->last_used = jiffies;
+		return status;
 	}
-	ino->last_used = jiffies;
+	if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE))
+		ino->last_used = jiffies;
 	return status;
 }
 
@@ -510,7 +487,8 @@
 	sbi = autofs_sbi(dir->i_sb);
 
 	pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
-		 current->pid, task_pgrp_nr(current), sbi->catatonic,
+		 current->pid, task_pgrp_nr(current),
+		 sbi->flags & AUTOFS_SBI_CATATONIC,
 		 autofs_oz_mode(sbi));
 
 	active = autofs_lookup_active(dentry);
@@ -526,19 +504,22 @@
 		if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
 			return ERR_PTR(-ENOENT);
 
-		/* Mark entries in the root as mount triggers */
-		if (IS_ROOT(dentry->d_parent) &&
-		    autofs_type_indirect(sbi->type))
-			__managed_dentry_set_managed(dentry);
-
 		ino = autofs_new_ino(sbi);
 		if (!ino)
 			return ERR_PTR(-ENOMEM);
 
+		spin_lock(&sbi->lookup_lock);
+		spin_lock(&dentry->d_lock);
+		/* Mark entries in the root as mount triggers */
+		if (IS_ROOT(dentry->d_parent) &&
+		    autofs_type_indirect(sbi->type))
+			__managed_dentry_set_managed(dentry);
 		dentry->d_fsdata = ino;
 		ino->dentry = dentry;
 
-		autofs_add_active(dentry);
+		list_add(&ino->active, &sbi->active_list);
+		spin_unlock(&sbi->lookup_lock);
+		spin_unlock(&dentry->d_lock);
 	}
 	return NULL;
 }
@@ -563,7 +544,7 @@
 	 * autofs mount is catatonic but the state of an autofs
 	 * file system needs to be preserved over restarts.
 	 */
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
 	BUG_ON(!ino);
@@ -626,7 +607,7 @@
 	 * autofs mount is catatonic but the state of an autofs
 	 * file system needs to be preserved over restarts.
 	 */
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
 	if (atomic_dec_and_test(&ino->count)) {
@@ -714,7 +695,7 @@
 	 * autofs mount is catatonic but the state of an autofs
 	 * file system needs to be preserved over restarts.
 	 */
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
 	spin_lock(&sbi->lookup_lock);
@@ -759,7 +740,7 @@
 	 * autofs mount is catatonic but the state of an autofs
 	 * file system needs to be preserved over restarts.
 	 */
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
 	pr_debug("dentry %p, creating %pd\n", dentry, dentry);
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
index aad3902..7ac67dc 100644
--- a/fs/autofs/symlink.c
+++ b/fs/autofs/symlink.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include "autofs_i.h"
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index f6385c6..b04c528 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  * Copyright 2001-2006 Ian Kent <raven@themaw.net>
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
  */
 
 #include <linux/sched/signal.h>
@@ -20,14 +17,14 @@
 	struct autofs_wait_queue *wq, *nwq;
 
 	mutex_lock(&sbi->wq_mutex);
-	if (sbi->catatonic) {
+	if (sbi->flags & AUTOFS_SBI_CATATONIC) {
 		mutex_unlock(&sbi->wq_mutex);
 		return;
 	}
 
 	pr_debug("entering catatonic mode\n");
 
-	sbi->catatonic = 1;
+	sbi->flags |= AUTOFS_SBI_CATATONIC;
 	wq = sbi->queues;
 	sbi->queues = NULL;	/* Erase all wait queues */
 	while (wq) {
@@ -255,7 +252,7 @@
 	struct autofs_wait_queue *wq;
 	struct autofs_info *ino;
 
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -ENOENT;
 
 	/* Wait in progress, continue; */
@@ -290,7 +287,7 @@
 			if (mutex_lock_interruptible(&sbi->wq_mutex))
 				return -EINTR;
 
-			if (sbi->catatonic)
+			if (sbi->flags & AUTOFS_SBI_CATATONIC)
 				return -ENOENT;
 
 			wq = autofs_find_wait(sbi, qstr);
@@ -359,7 +356,7 @@
 	pid_t tgid;
 
 	/* In catatonic mode, we don't wait for anybody */
-	if (sbi->catatonic)
+	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -ENOENT;
 
 	/*
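
autofs_catatonic_mode(), whose opening is visible above, detaches the whole
wait-queue list under wq_mutex and then walks it, failing every outstanding
waiter. Loosely modeled in plain C below; the real function also frees
wq->name.name and wakes each waiter with wake_up_interruptible():

	#include <stdlib.h>

	struct autofs_wait_queue {
		int status;
		int wait_ctr;
		struct autofs_wait_queue *next;
	};

	static void catatonic_drain(struct autofs_wait_queue **queues)
	{
		struct autofs_wait_queue *wq = *queues, *nwq;

		*queues = NULL;			/* erase all wait queues */
		while (wq) {
			nwq = wq->next;
			wq->status = -2;	/* -ENOENT: the daemon is gone */
			if (--wq->wait_ctr == 0)	/* last reference */
				free(wq);
			wq = nwq;
		}
	}

	int main(void)
	{
		struct autofs_wait_queue *head = calloc(1, sizeof(*head));

		if (head) {
			head->wait_ctr = 1;
			catatonic_drain(&head);
		}
		return 0;
	}
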
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index edc5cc2..9550b64 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only)"
 	depends on BLOCK
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 8b9f666..6c9c3cb 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux BeOS filesystem routines.
 #
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 4700b45..64cdf4d8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/befs/linuxvfs.c
  *
@@ -44,7 +45,7 @@
 				  unsigned int);
 static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
-static void befs_destroy_inode(struct inode *inode);
+static void befs_free_inode(struct inode *inode);
 static void befs_destroy_inodecache(void);
 static int befs_symlink_readpage(struct file *, struct page *);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
@@ -64,7 +65,7 @@
 
 static const struct super_operations befs_sops = {
 	.alloc_inode	= befs_alloc_inode,	/* allocate a new inode */
-	.destroy_inode	= befs_destroy_inode, /* deallocate an inode */
+	.free_inode	= befs_free_inode, /* deallocate an inode */
 	.put_super	= befs_put_super,	/* uninit super */
 	.statfs		= befs_statfs,	/* statfs */
 	.remount_fs	= befs_remount,
@@ -281,17 +282,11 @@
 	return &bi->vfs_inode;
 }
 
-static void befs_i_callback(struct rcu_head *head)
+static void befs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void befs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, befs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -898,6 +893,8 @@
 	sb_set_blocksize(sb, (ulong) befs_sb->block_size);
 	sb->s_op = &befs_sops;
 	sb->s_export_op = &befs_export_operations;
+	sb->s_time_min = 0;
+	sb->s_time_max = 0xffffffffffffll;
 	root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
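
befs is the first of several filesystems in this patch (bfs and block_dev
follow) to drop the destroy_inode + call_rcu() boilerplate in favor of the
->free_inode hook introduced in v5.2. The VFS core now owns the RCU grace
period, roughly like this sketch, simplified from fs/inode.c (not verbatim;
the real code also honors a coexisting ->destroy_inode):

	static void i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		if (inode->i_sb->s_op->free_inode)
			inode->i_sb->s_op->free_inode(inode); /* befs_free_inode() */
		else
			free_inode_nonrcu(inode);
	}

	static void destroy_inode(struct inode *inode)
	{
		/* One call_rcu() in core VFS instead of one per filesystem. */
		call_rcu(&inode->i_rcu, i_callback);
	}
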
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index 3728a64..3e1247f 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config BFS_FS
 	tristate "BFS file system support"
 	depends on BLOCK
diff --git a/fs/bfs/Makefile b/fs/bfs/Makefile
index c787b36..2b6bc5e 100644
--- a/fs/bfs/Makefile
+++ b/fs/bfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for BFS filesystem.
 #
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 67aef3b..606f937 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,13 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  *	fs/bfs/bfs.h
- *	Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  */
 #ifndef _FS_BFS_BFS_H
 #define _FS_BFS_BFS_H
 
 #include <linux/bfs_fs.h>
 
+/*
+ * In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to
+ * 513 inclusive. In actual fact, attempting to create the 512th inode
+ * (i.e. inode No. 513 or file No. 511) will fail with ENOSPC in
+ * bfs_add_entry(): the root directory cannot contain so many entries,
+ * counting '..'. So, mkfs.bfs(8) should really limit its -N option to
+ * 511 and not 512. For now, we just print a warning if a filesystem is
+ * mounted with such an "impossible to fill up" number of inodes.
+ */
+#define BFS_MAX_LASTI	513
+
 /*
  * BFS file system in-core superblock info
  */
@@ -17,7 +24,7 @@
 	unsigned long si_freei;
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
-	unsigned long *si_imap;
+	DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1);
 	struct mutex bfs_lock;
 };
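
DECLARE_BITMAP turns the previously kmalloc'ed si_imap into an array sized
at compile time, which is why the inode.c hunks below can drop both
kfree(info->si_imap) calls. A userspace rendering of the macro arithmetic
(same math as the kernel's, with CHAR_BIT * sizeof(long) standing in for
BITS_PER_LONG):

	#include <limits.h>
	#include <stdio.h>

	#define BITS_TO_LONGS(n) \
		(((n) + CHAR_BIT * sizeof(long) - 1) / (CHAR_BIT * sizeof(long)))
	#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

	#define BFS_MAX_LASTI 513

	int main(void)
	{
		DECLARE_BITMAP(si_imap, BFS_MAX_LASTI + 1);	/* 514 bits */

		/* 9 longs (72 bytes) on LP64: always big enough, never freed. */
		printf("si_imap occupies %zu bytes\n", sizeof(si_imap));
		return 0;
	}
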
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index f32f21c..d8dfe3a 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,8 +2,8 @@
 /*
  *	fs/bfs/dir.c
  *	BFS directory operations.
- *	Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
- *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ *	Copyright (C) 1999-2018  Tigran Aivazian <aivazian.tigran@gmail.com>
+ *  Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
  */
 
 #include <linux/time.h>
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 1476cdd..0dceefc 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,7 +2,7 @@
 /*
  *	fs/bfs/file.c
  *	BFS file operations.
- *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  *
  *	Make the file block allocation algorithm understand the size
  *	of the underlying block device.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d81c148..f8ce136 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *	fs/bfs/inode.c
  *	BFS superblock and inode operations.
- *	Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
- *
- *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ *	Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
  */
 
 #include <linux/module.h>
@@ -118,12 +118,12 @@
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
-        unsigned long i_sblock;
+	unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
 	int err = 0;
 
-        dprintf("ino=%08x\n", ino);
+	dprintf("ino=%08x\n", ino);
 
 	di = find_inode(inode->i_sb, ino, &bh);
 	if (IS_ERR(di))
@@ -144,7 +144,7 @@
 	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
 	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
 	di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-        i_sblock = BFS_I(inode)->i_sblock;
+	i_sblock = BFS_I(inode)->i_sblock;
 	di->i_sblock = cpu_to_le32(i_sblock);
 	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
@@ -188,13 +188,13 @@
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
-        if (bi->i_dsk_ino) {
+	if (bi->i_dsk_ino) {
 		if (bi->i_sblock)
 			info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
 		info->si_freei++;
 		clear_bit(ino, info->si_imap);
-		bfs_dump_imap("delete_inode", s);
-        }
+		bfs_dump_imap("evict_inode", s);
+	}
 
 	/*
 	 * If this was the last file, make the previous block
@@ -214,7 +214,6 @@
 		return;
 
 	mutex_destroy(&info->bfs_lock);
-	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
 }
@@ -247,17 +246,11 @@
 	return &bi->vfs_inode;
 }
 
-static void bfs_i_callback(struct rcu_head *head)
+static void bfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void bfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, bfs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct bfs_inode_info *bi = foo;
@@ -289,7 +282,7 @@
 
 static const struct super_operations bfs_sops = {
 	.alloc_inode	= bfs_alloc_inode,
-	.destroy_inode	= bfs_destroy_inode,
+	.free_inode	= bfs_free_inode,
 	.write_inode	= bfs_write_inode,
 	.evict_inode	= bfs_evict_inode,
 	.put_super	= bfs_put_super,
@@ -311,8 +304,7 @@
 		else
 			strcat(tmpbuf, "0");
 	}
-	printf("BFS-fs: %s: lasti=%08lx <%s>\n",
-				prefix, BFS_SB(s)->si_lasti, tmpbuf);
+	printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
 	free_page((unsigned long)tmpbuf);
 #endif
 }
@@ -322,7 +314,7 @@
 	struct buffer_head *bh, *sbh;
 	struct bfs_super_block *bfs_sb;
 	struct inode *inode;
-	unsigned i, imap_len;
+	unsigned i;
 	struct bfs_sb_info *info;
 	int ret = -EINVAL;
 	unsigned long i_sblock, i_eblock, i_eoff, s_size;
@@ -332,6 +324,8 @@
 		return -ENOMEM;
 	mutex_init(&info->bfs_lock);
 	s->s_fs_info = info;
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
 
 	sb_set_blocksize(s, BFS_BSIZE);
 
@@ -341,8 +335,7 @@
 	bfs_sb = (struct bfs_super_block *)sbh->b_data;
 	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
-			printf("No BFS filesystem on %s (magic=%08x)\n", 
-				s->s_id,  le32_to_cpu(bfs_sb->s_magic));
+			printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id,  le32_to_cpu(bfs_sb->s_magic));
 		goto out1;
 	}
 	if (BFS_UNCLEAN(bfs_sb, s) && !silent)
@@ -351,18 +344,16 @@
 	s->s_magic = BFS_MAGIC;
 
 	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
-	    le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
-		printf("Superblock is corrupted\n");
+	    le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) {
+		printf("Superblock is corrupted on %s\n", s->s_id);
 		goto out1;
 	}
 
-	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
-					sizeof(struct bfs_inode)
-					+ BFS_ROOT_INO - 1;
-	imap_len = (info->si_lasti / 8) + 1;
-	info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
-	if (!info->si_imap) {
-		printf("Cannot allocate %u bytes\n", imap_len);
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
+	if (info->si_lasti == BFS_MAX_LASTI)
+		printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+	else if (info->si_lasti > BFS_MAX_LASTI) {
+		printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
 		goto out1;
 	}
 	for (i = 0; i < BFS_ROOT_INO; i++)
@@ -372,26 +363,25 @@
 	inode = bfs_iget(s, BFS_ROOT_INO);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
-		goto out2;
+		goto out1;
 	}
 	s->s_root = d_make_root(inode);
 	if (!s->s_root) {
 		ret = -ENOMEM;
-		goto out2;
+		goto out1;
 	}
 
 	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
-	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
-			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
 
 	/* can we read the last block? */
 	bh = sb_bread(s, info->si_blocks - 1);
 	if (!bh) {
-		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1);
 		ret = -EIO;
-		goto out3;
+		goto out2;
 	}
 	brelse(bh);
 
@@ -425,11 +415,11 @@
 			(i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
 			i_sblock * BFS_BSIZE > i_eoff) {
 
-			printf("Inode 0x%08x corrupted\n", i);
+			printf("Inode 0x%08x corrupted on %s\n", i, s->s_id);
 
 			brelse(bh);
 			ret = -EIO;
-			goto out3;
+			goto out2;
 		}
 
 		if (!di->i_ino) {
@@ -445,14 +435,12 @@
 	}
 	brelse(bh);
 	brelse(sbh);
-	bfs_dump_imap("read_super", s);
+	bfs_dump_imap("fill_super", s);
 	return 0;
 
-out3:
+out2:
 	dput(s->s_root);
 	s->s_root = NULL;
-out2:
-	kfree(info->si_imap);
 out1:
 	brelse(sbh);
 out:
@@ -482,7 +470,7 @@
 	int err = init_inodecache();
 	if (err)
 		goto out1;
-        err = register_filesystem(&bfs_fs_type);
+	err = register_filesystem(&bfs_fs_type);
 	if (err)
 		goto out;
 	return 0;
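
The new bound check works because si_lasti is fully determined by s_start:
everything between the superblock block and the first data block is inode
table, at 64 bytes per on-disk inode (sizeof(struct bfs_inode)). A compilable
back-of-the-envelope check for the mkfs.bfs -N 512 case that the warning
above is about:

	#include <stdio.h>

	#define BFS_BSIZE	512
	#define BFS_ROOT_INO	2
	#define BFS_INODE_SIZE	64	/* sizeof(struct bfs_inode) on disk */
	#define BFS_MAX_LASTI	513

	int main(void)
	{
		/* One superblock block plus 512 * 64 bytes of inode table. */
		unsigned long s_start = BFS_BSIZE + 512 * BFS_INODE_SIZE;
		unsigned long lasti = (s_start - BFS_BSIZE) / BFS_INODE_SIZE
				      + BFS_ROOT_INO - 1;

		/* Prints: si_lasti = 513 (BFS_MAX_LASTI = 513) */
		printf("si_lasti = %lu (BFS_MAX_LASTI = %d)\n",
		       lasti, BFS_MAX_LASTI);
		return 0;
	}
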
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index c3deb2e..8e8346a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/binfmt_aout.c
  *
@@ -29,97 +30,14 @@
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
 
 static int load_aout_binary(struct linux_binprm *);
 static int load_aout_library(struct file*);
 
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
-	mm_segment_t fs;
-	int has_dumped = 0;
-	void __user *dump_start;
-	int dump_size;
-	struct user dump;
-#ifdef __alpha__
-#       define START_DATA(u)	((void __user *)u.start_data)
-#else
-#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
-				 u.start_code))
-#endif
-#       define START_STACK(u)   ((void __user *)u.start_stack)
-
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	has_dumped = 1;
-       	strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
-	dump.u_ar0 = offsetof(struct user, regs);
-	dump.signal = cprm->siginfo->si_signo;
-	aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
-   if we wrote the stack, but not the data area.  */
-	if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
-		dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
-	if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
-		dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
-	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
-		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
-		dump.u_ssize = 0;
-
-	set_fs(KERNEL_DS);
-/* struct user */
-	if (!dump_emit(cprm, &dump, sizeof(dump)))
-		goto end_coredump;
-/* Now dump all of the user data.  Include malloced stuff as well */
-	if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
-		goto end_coredump;
-/* now we start writing out the user space info */
-	set_fs(USER_DS);
-/* Dump the data area */
-	if (dump.u_dsize != 0) {
-		dump_start = START_DATA(dump);
-		dump_size = dump.u_dsize << PAGE_SHIFT;
-		if (!dump_emit(cprm, dump_start, dump_size))
-			goto end_coredump;
-	}
-/* Now prepare to dump the stack area */
-	if (dump.u_ssize != 0) {
-		dump_start = START_STACK(dump);
-		dump_size = dump.u_ssize << PAGE_SHIFT;
-		if (!dump_emit(cprm, dump_start, dump_size))
-			goto end_coredump;
-	}
-end_coredump:
-	set_fs(fs);
-	return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
 static struct linux_binfmt aout_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_aout_binary,
 	.load_shlib	= load_aout_library,
-	.core_dump	= aout_core_dump,
-	.min_coredump	= PAGE_SIZE
 };
 
 #define BAD_ADDR(x)	((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index efae2fb..c5642bc 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/binfmt_elf.c
  *
@@ -57,8 +58,6 @@
 #endif
 
 static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
-				int, int, unsigned long);
 
 #ifdef CONFIG_USELIB
 static int load_elf_library(struct file *);
@@ -347,7 +346,7 @@
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-		struct elf_phdr *eppnt, int prot, int type,
+		const struct elf_phdr *eppnt, int prot, int type,
 		unsigned long total_size)
 {
 	unsigned long map_addr;
@@ -387,7 +386,7 @@
 
 #endif /* !elf_map */
 
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
 {
 	int i, first_idx = -1, last_idx = -1;
 
@@ -414,12 +413,13 @@
  * header pointed to by elf_ex, into a newly allocated array. The caller is
  * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
  */
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
 				       struct file *elf_file)
 {
 	struct elf_phdr *elf_phdata = NULL;
-	int retval, size, err = -1;
+	int retval, err = -1;
 	loff_t pos = elf_ex->e_phoff;
+	unsigned int size;
 
 	/*
 	 * If the size of this structure has changed, then punt, since
@@ -429,13 +429,9 @@
 		goto out;
 
 	/* Sanity check the number of program headers... */
-	if (elf_ex->e_phnum < 1 ||
-		elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
-		goto out;
-
 	/* ...and their total size. */
 	size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
-	if (size > ELF_MIN_ALIGN)
+	if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
 		goto out;
 
 	elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -529,6 +525,19 @@
 
 #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
 
+static inline int make_prot(u32 p_flags)
+{
+	int prot = 0;
+
+	if (p_flags & PF_R)
+		prot |= PROT_READ;
+	if (p_flags & PF_W)
+		prot |= PROT_WRITE;
+	if (p_flags & PF_X)
+		prot |= PROT_EXEC;
+	return prot;
+}
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
@@ -568,16 +577,10 @@
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
 			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
-			int elf_prot = 0;
+			int elf_prot = make_prot(eppnt->p_flags);
 			unsigned long vaddr = 0;
 			unsigned long k, map_addr;
 
-			if (eppnt->p_flags & PF_R)
-		    		elf_prot = PROT_READ;
-			if (eppnt->p_flags & PF_W)
-				elf_prot |= PROT_WRITE;
-			if (eppnt->p_flags & PF_X)
-				elf_prot |= PROT_EXEC;
 			vaddr = eppnt->p_vaddr;
 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
 				elf_type |= MAP_FIXED_NOREPLACE;
@@ -667,32 +670,11 @@
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-#ifndef STACK_RND_MASK
-#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
-#endif
-
-static unsigned long randomize_stack_top(unsigned long stack_top)
-{
-	unsigned long random_variable = 0;
-
-	if (current->flags & PF_RANDOMIZE) {
-		random_variable = get_random_long();
-		random_variable &= STACK_RND_MASK;
-		random_variable <<= PAGE_SHIFT;
-	}
-#ifdef CONFIG_STACK_GROWSUP
-	return PAGE_ALIGN(stack_top) + random_variable;
-#else
-	return PAGE_ALIGN(stack_top) - random_variable;
-#endif
-}
-
 static int load_elf_binary(struct linux_binprm *bprm)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
  	unsigned long load_addr = 0, load_bias = 0;
 	int load_addr_set = 0;
-	char * elf_interpreter = NULL;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
 	unsigned long elf_bss, elf_brk;
@@ -703,13 +685,12 @@
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
-	struct pt_regs *regs = current_pt_regs();
 	struct {
 		struct elfhdr elf_ex;
 		struct elfhdr interp_elf_ex;
 	} *loc;
 	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
-	loff_t pos;
+	struct pt_regs *regs;
 
 	loc = kmalloc(sizeof(*loc), GFP_KERNEL);
 	if (!loc) {
@@ -739,69 +720,66 @@
 		goto out;
 
 	elf_ppnt = elf_phdata;
-	elf_bss = 0;
-	elf_brk = 0;
+	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+		char *elf_interpreter;
+		loff_t pos;
 
-	start_code = ~0UL;
-	end_code = 0;
-	start_data = 0;
-	end_data = 0;
+		if (elf_ppnt->p_type != PT_INTERP)
+			continue;
 
-	for (i = 0; i < loc->elf_ex.e_phnum; i++) {
-		if (elf_ppnt->p_type == PT_INTERP) {
-			/* This is the program interpreter used for
-			 * shared libraries - for now assume that this
-			 * is an a.out format binary
-			 */
-			retval = -ENOEXEC;
-			if (elf_ppnt->p_filesz > PATH_MAX || 
-			    elf_ppnt->p_filesz < 2)
-				goto out_free_ph;
+		/*
+		 * This is the program interpreter used for shared libraries -
+		 * for now assume that this is an a.out format binary.
+		 */
+		retval = -ENOEXEC;
+		if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
+			goto out_free_ph;
 
-			retval = -ENOMEM;
-			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
-						  GFP_KERNEL);
-			if (!elf_interpreter)
-				goto out_free_ph;
+		retval = -ENOMEM;
+		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
+		if (!elf_interpreter)
+			goto out_free_ph;
 
-			pos = elf_ppnt->p_offset;
-			retval = kernel_read(bprm->file, elf_interpreter,
-					     elf_ppnt->p_filesz, &pos);
-			if (retval != elf_ppnt->p_filesz) {
-				if (retval >= 0)
-					retval = -EIO;
-				goto out_free_interp;
-			}
-			/* make sure path is NULL terminated */
-			retval = -ENOEXEC;
-			if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
-				goto out_free_interp;
-
-			interpreter = open_exec(elf_interpreter);
-			retval = PTR_ERR(interpreter);
-			if (IS_ERR(interpreter))
-				goto out_free_interp;
-
-			/*
-			 * If the binary is not readable then enforce
-			 * mm->dumpable = 0 regardless of the interpreter's
-			 * permissions.
-			 */
-			would_dump(bprm, interpreter);
-
-			/* Get the exec headers */
-			pos = 0;
-			retval = kernel_read(interpreter, &loc->interp_elf_ex,
-					     sizeof(loc->interp_elf_ex), &pos);
-			if (retval != sizeof(loc->interp_elf_ex)) {
-				if (retval >= 0)
-					retval = -EIO;
-				goto out_free_dentry;
-			}
-
-			break;
+		pos = elf_ppnt->p_offset;
+		retval = kernel_read(bprm->file, elf_interpreter,
+				     elf_ppnt->p_filesz, &pos);
+		if (retval != elf_ppnt->p_filesz) {
+			if (retval >= 0)
+				retval = -EIO;
+			goto out_free_interp;
 		}
-		elf_ppnt++;
+		/* make sure path is NULL terminated */
+		retval = -ENOEXEC;
+		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
+			goto out_free_interp;
+
+		interpreter = open_exec(elf_interpreter);
+		kfree(elf_interpreter);
+		retval = PTR_ERR(interpreter);
+		if (IS_ERR(interpreter))
+			goto out_free_ph;
+
+		/*
+		 * If the binary is not readable then enforce mm->dumpable = 0
+		 * regardless of the interpreter's permissions.
+		 */
+		would_dump(bprm, interpreter);
+
+		/* Get the exec headers */
+		pos = 0;
+		retval = kernel_read(interpreter, &loc->interp_elf_ex,
+				     sizeof(loc->interp_elf_ex), &pos);
+		if (retval != sizeof(loc->interp_elf_ex)) {
+			if (retval >= 0)
+				retval = -EIO;
+			goto out_free_dentry;
+		}
+
+		break;
+
+out_free_interp:
+		kfree(elf_interpreter);
+		goto out_free_ph;
 	}
 
 	elf_ppnt = elf_phdata;
@@ -824,7 +802,7 @@
 		}
 
 	/* Some simple consistency checks for the interpreter */
-	if (elf_interpreter) {
+	if (interpreter) {
 		retval = -ELIBBAD;
 		/* Not an ELF interpreter */
 		if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -889,13 +867,19 @@
 	if (retval < 0)
 		goto out_free_dentry;
 	
-	current->mm->start_stack = bprm->p;
+	elf_bss = 0;
+	elf_brk = 0;
+
+	start_code = ~0UL;
+	end_code = 0;
+	start_data = 0;
+	end_data = 0;
 
 	/* Now we do a little grungy work by mmapping the ELF image into
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
-		int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
+		int elf_prot, elf_flags;
 		unsigned long k, vaddr;
 		unsigned long total_size = 0;
 
@@ -927,21 +911,9 @@
 					 */
 				}
 			}
-
-			/*
-			 * Some binaries have overlapping elf segments and then
-			 * we have to forcefully map over an existing mapping
-			 * e.g. over this newly established brk mapping.
-			 */
-			elf_fixed = MAP_FIXED;
 		}
 
-		if (elf_ppnt->p_flags & PF_R)
-			elf_prot |= PROT_READ;
-		if (elf_ppnt->p_flags & PF_W)
-			elf_prot |= PROT_WRITE;
-		if (elf_ppnt->p_flags & PF_X)
-			elf_prot |= PROT_EXEC;
+		elf_prot = make_prot(elf_ppnt->p_flags);
 
 		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
 
@@ -951,7 +923,7 @@
 		 * the ET_DYN load_addr calculations, proceed normally.
 		 */
 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
-			elf_flags |= elf_fixed;
+			elf_flags |= MAP_FIXED;
 		} else if (loc->elf_ex.e_type == ET_DYN) {
 			/*
 			 * This logic is run once for the first LOAD Program
@@ -983,11 +955,11 @@
 			 * independently randomized mmap region (0 load_bias
 			 * without MAP_FIXED).
 			 */
-			if (elf_interpreter) {
+			if (interpreter) {
 				load_bias = ELF_ET_DYN_BASE;
 				if (current->flags & PF_RANDOMIZE)
 					load_bias += arch_mmap_rnd();
-				elf_flags |= elf_fixed;
+				elf_flags |= MAP_FIXED;
 			} else
 				load_bias = 0;
 
@@ -1081,7 +1053,7 @@
 		goto out_free_dentry;
 	}
 
-	if (elf_interpreter) {
+	if (interpreter) {
 		unsigned long interp_map_addr = 0;
 
 		elf_entry = load_elf_interp(&loc->interp_elf_ex,
@@ -1105,7 +1077,6 @@
 
 		allow_write_access(interpreter);
 		fput(interpreter);
-		kfree(elf_interpreter);
 	} else {
 		elf_entry = loc->elf_ex.e_entry;
 		if (BAD_ADDR(elf_entry)) {
@@ -1120,7 +1091,7 @@
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
-	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	retval = arch_setup_additional_pages(bprm, !!interpreter);
 	if (retval < 0)
 		goto out;
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
@@ -1129,7 +1100,6 @@
 			  load_addr, interp_load_addr);
 	if (retval < 0)
 		goto out;
-	/* N.B. passed_fileno might not be initialized? */
 	current->mm->end_code = end_code;
 	current->mm->start_code = start_code;
 	current->mm->start_data = start_data;
@@ -1137,6 +1107,18 @@
 	current->mm->start_stack = bprm->p;
 
 	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+		/*
+		 * For architectures with ELF randomization, when executing
+		 * a loader directly (i.e. no interpreter listed in ELF
+		 * headers), move the brk area out of the mmap region
+		 * (since it grows up, and may collide early with the stack
+		 * growing down), and into the unused ELF_ET_DYN_BASE region.
+		 */
+		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+		    loc->elf_ex.e_type == ET_DYN && !interpreter)
+			current->mm->brk = current->mm->start_brk =
+				ELF_ET_DYN_BASE;
+
 		current->mm->brk = current->mm->start_brk =
 			arch_randomize_brk(current->mm);
 #ifdef compat_brk_randomized
@@ -1153,6 +1135,7 @@
 				MAP_FIXED | MAP_PRIVATE, 0);
 	}
 
+	regs = current_pt_regs();
 #ifdef ELF_PLAT_INIT
 	/*
 	 * The ABI may specify that certain registers be set up in special
@@ -1181,8 +1164,6 @@
 	allow_write_access(interpreter);
 	if (interpreter)
 		fput(interpreter);
-out_free_interp:
-	kfree(elf_interpreter);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
@@ -1461,8 +1442,6 @@
 	elf->e_ehsize = sizeof(struct elfhdr);
 	elf->e_phentsize = sizeof(struct elf_phdr);
 	elf->e_phnum = segs;
-
-	return;
 }
 
 static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
@@ -1475,7 +1454,6 @@
 	phdr->p_memsz = 0;
 	phdr->p_flags = 0;
 	phdr->p_align = 0;
-	return;
 }
 
 static void fill_note(struct memelfnote *note, const char *name, int type, 
@@ -1485,7 +1463,6 @@
 	note->type = type;
 	note->datasz = sz;
 	note->data = data;
-	return;
 }
 
 /*
@@ -1580,7 +1557,7 @@
 }
 
 static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
-		const siginfo_t *siginfo)
+		const kernel_siginfo_t *siginfo)
 {
 	mm_segment_t old_fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -1782,7 +1759,7 @@
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
 			  struct elf_note_info *info,
-			  const siginfo_t *siginfo, struct pt_regs *regs)
+			  const kernel_siginfo_t *siginfo, struct pt_regs *regs)
 {
 	struct task_struct *dump_task = current;
 	const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -2031,9 +2008,8 @@
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
 			  struct elf_note_info *info,
-			  const siginfo_t *siginfo, struct pt_regs *regs)
+			  const kernel_siginfo_t *siginfo, struct pt_regs *regs)
 {
-	struct list_head *t;
 	struct core_thread *ct;
 	struct elf_thread_status *ets;
 
@@ -2050,10 +2026,9 @@
 		list_add(&ets->list, &info->thread_list);
 	}
 
-	list_for_each(t, &info->thread_list) {
+	list_for_each_entry(ets, &info->thread_list, list) {
 		int sz;
 
-		ets = list_entry(t, struct elf_thread_status, list);
 		sz = elf_dump_thread_status(siginfo->si_signo, ets);
 		info->thread_status_size += sz;
 	}
@@ -2117,20 +2092,17 @@
 static int write_note_info(struct elf_note_info *info,
 			   struct coredump_params *cprm)
 {
+	struct elf_thread_status *ets;
 	int i;
-	struct list_head *t;
 
 	for (i = 0; i < info->numnote; i++)
 		if (!writenote(info->notes + i, cprm))
 			return 0;
 
 	/* write out the thread status notes section */
-	list_for_each(t, &info->thread_list) {
-		struct elf_thread_status *tmp =
-				list_entry(t, struct elf_thread_status, list);
-
-		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], cprm))
+	list_for_each_entry(ets, &info->thread_list, list) {
+		for (i = 0; i < ets->num_notes; i++)
+			if (!writenote(&ets->notes[i], cprm))
 				return 0;
 	}
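
The three open-coded PF_R/PF_W/PF_X conversions in this file collapse into
the new make_prot() helper. The mapping is easy to sanity-check in userspace,
since <elf.h> and <sys/mman.h> expose the same constants:

	#include <elf.h>
	#include <stdio.h>
	#include <sys/mman.h>

	/* Same body as the make_prot() helper added above. */
	static int make_prot(unsigned int p_flags)
	{
		int prot = 0;

		if (p_flags & PF_R)
			prot |= PROT_READ;
		if (p_flags & PF_W)
			prot |= PROT_WRITE;
		if (p_flags & PF_X)
			prot |= PROT_EXEC;
		return prot;
	}

	int main(void)
	{
		/* Text segment: r-x.  Data segment: rw-. */
		printf("PF_R|PF_X -> %#x\n", (unsigned int)make_prot(PF_R | PF_X));
		printf("PF_R|PF_W -> %#x\n", (unsigned int)make_prot(PF_R | PF_W));
		return 0;
	}
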
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index b53bb37..d86ebd0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* binfmt_elf_fdpic.c: FDPIC ELF binary format
  *
  * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  * Derived from binfmt_elf.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index dd2d3f0..4664978 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/binfmt_em86.c
  *
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 82a48e8..831a2b2 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -42,6 +42,11 @@
 #include <asm/unaligned.h>
 #include <asm/cacheflush.h>
 #include <asm/page.h>
+#include <asm/flat.h>
+
+#ifndef flat_get_relocate_addr
+#define flat_get_relocate_addr(rel)	(rel)
+#endif
 
 /****************************************************************************/
 
@@ -63,6 +68,12 @@
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
 
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+#define	MAX_SHARED_LIBS			(4)
+#else
+#define	MAX_SHARED_LIBS			(1)
+#endif
+
 struct lib_info {
 	struct {
 		unsigned long start_code;		/* Start of text segment */
@@ -120,14 +131,15 @@
 
 	sp -= bprm->envc + 1;
 	sp -= bprm->argc + 1;
-	sp -= flat_argvp_envp_on_stack() ? 2 : 0;
+	if (IS_ENABLED(CONFIG_BINFMT_FLAT_ARGVP_ENVP_ON_STACK))
+		sp -= 2; /* argvp + envp */
 	sp -= 1;  /* &argc */
 
 	current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN;
 	sp = (unsigned long __user *)current->mm->start_stack;
 
 	__put_user(bprm->argc, sp++);
-	if (flat_argvp_envp_on_stack()) {
+	if (IS_ENABLED(CONFIG_BINFMT_FLAT_ARGVP_ENVP_ON_STACK)) {
 		unsigned long argv, envp;
 		argv = (unsigned long)(sp + 2);
 		envp = (unsigned long)(sp + 2 + bprm->argc + 1);
@@ -345,7 +357,7 @@
 	start_code = p->lib_list[id].start_code;
 	text_len = p->lib_list[id].text_len;
 
-	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
+	if (r > start_brk - start_data + text_len) {
 		pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
 		       r, start_brk-start_data+text_len, text_len);
 		goto failed;
@@ -368,6 +380,7 @@
 
 /****************************************************************************/
 
+#ifdef CONFIG_BINFMT_FLAT_OLD
 static void old_reloc(unsigned long rl)
 {
 	static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
@@ -405,6 +418,7 @@
 
 	pr_debug("Relocation became %lx\n", val);
 }
+#endif /* CONFIG_BINFMT_FLAT_OLD */
 
 /****************************************************************************/
 
@@ -415,8 +429,8 @@
 	unsigned long textpos, datapos, realdatastart;
 	u32 text_len, data_len, bss_len, stack_len, full_data, flags;
 	unsigned long len, memp, memp_size, extra, rlim;
-	u32 __user *reloc, *rp;
-	struct inode *inode;
+	__be32 __user *reloc;
+	u32 __user *rp;
 	int i, rev, relocs;
 	loff_t fpos;
 	unsigned long start_code, end_code;
@@ -424,7 +438,6 @@
 	int ret;
 
 	hdr = ((struct flat_hdr *) bprm->buf);		/* exec-header */
-	inode = file_inode(bprm->file);
 
 	text_len  = ntohl(hdr->data_start);
 	data_len  = ntohl(hdr->data_end) - ntohl(hdr->data_start);
@@ -454,6 +467,7 @@
 	if (flags & FLAT_FLAG_KTRACE)
 		pr_info("Loading file: %s\n", bprm->filename);
 
+#ifdef CONFIG_BINFMT_FLAT_OLD
 	if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
 		pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n",
 		       rev, FLAT_VERSION, OLD_FLAT_VERSION);
@@ -470,6 +484,23 @@
 	}
 
 	/*
+	 * Fix up the flags for the older format; there were all kinds
+	 * of endian hacks, so this only works for the simple cases.
+	 */
+	if (rev == OLD_FLAT_VERSION &&
+	   (flags || IS_ENABLED(CONFIG_BINFMT_FLAT_OLD_ALWAYS_RAM)))
+		flags = FLAT_FLAG_RAM;
+
+#else /* CONFIG_BINFMT_FLAT_OLD */
+	if (rev != FLAT_VERSION) {
+		pr_err("bad flat file version 0x%x (supported 0x%lx)\n",
+		       rev, FLAT_VERSION);
+		ret = -ENOEXEC;
+		goto err;
+	}
+#endif /* !CONFIG_BINFMT_FLAT_OLD */
+
+	/*
 	 * Make sure the header params are sane.
 	 * 28 bits (256 MB) is way more than reasonable in this case.
 	 * If some top bits are set we have probable binary corruption.
@@ -480,13 +511,6 @@
 		goto err;
 	}
 
-	/*
-	 * fix up the flags for the older format,  there were all kinds
-	 * of endian hacks,  this only works for the simple cases
-	 */
-	if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags))
-		flags = FLAT_FLAG_RAM;
-
 #ifndef CONFIG_BINFMT_ZFLAT
 	if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) {
 		pr_err("Support for ZFLAT executables is not enabled.\n");
@@ -547,7 +571,7 @@
 			goto err;
 		}
 
-		len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = data_len + extra;
 		len = PAGE_ALIGN(len);
 		realdatastart = vm_mmap(NULL, 0, len,
 			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -561,9 +585,7 @@
 			vm_munmap(textpos, text_len);
 			goto err;
 		}
-		datapos = ALIGN(realdatastart +
-				MAX_SHARED_LIBS * sizeof(unsigned long),
-				FLAT_DATA_ALIGN);
+		datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
 
 		pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
 			 data_len + bss_len + stack_len, datapos);
@@ -587,13 +609,13 @@
 			goto err;
 		}
 
-		reloc = (u32 __user *)
+		reloc = (__be32 __user *)
 			(datapos + (ntohl(hdr->reloc_start) - text_len));
 		memp = realdatastart;
 		memp_size = len;
 	} else {
 
-		len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
+		len = text_len + data_len + extra;
 		len = PAGE_ALIGN(len);
 		textpos = vm_mmap(NULL, 0, len,
 			PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -608,11 +630,9 @@
 		}
 
 		realdatastart = textpos + ntohl(hdr->data_start);
-		datapos = ALIGN(realdatastart +
-				MAX_SHARED_LIBS * sizeof(u32),
-				FLAT_DATA_ALIGN);
+		datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
 
-		reloc = (u32 __user *)
+		reloc = (__be32 __user *)
 			(datapos + (ntohl(hdr->reloc_start) - text_len));
 		memp = textpos;
 		memp_size = len;
@@ -627,8 +647,9 @@
 					 (text_len + full_data
 						  - sizeof(struct flat_hdr)),
 					 0);
-			memmove((void *) datapos, (void *) realdatastart,
-					full_data);
+			if (datapos != realdatastart)
+				memmove((void *)datapos, (void *)realdatastart,
+						full_data);
 #else
 			/*
 			 * This is used on MMU systems mainly for testing.
@@ -684,8 +705,7 @@
 		if (IS_ERR_VALUE(result)) {
 			ret = result;
 			pr_err("Unable to read code+data+bss, errno %d\n", ret);
-			vm_munmap(textpos, text_len + data_len + extra +
-				MAX_SHARED_LIBS * sizeof(u32));
+			vm_munmap(textpos, text_len + data_len + extra);
 			goto err;
 		}
 	}
@@ -775,20 +795,18 @@
 	 * __start to address 4 so that is okay).
 	 */
 	if (rev > OLD_FLAT_VERSION) {
-		u32 __maybe_unused persistent = 0;
 		for (i = 0; i < relocs; i++) {
 			u32 addr, relval;
+			__be32 tmp;
 
 			/*
 			 * Get the address of the pointer to be
 			 * relocated (of course, the address has to be
 			 * relocated first).
 			 */
-			if (get_user(relval, reloc + i))
+			if (get_user(tmp, reloc + i))
 				return -EFAULT;
-			relval = ntohl(relval);
-			if (flat_set_persistent(relval, &persistent))
-				continue;
+			relval = ntohl(tmp);
 			addr = flat_get_relocate_addr(relval);
 			rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1);
 			if (rp == (u32 __user *)RELOC_FAILED) {
@@ -797,8 +815,7 @@
 			}
 
 			/* Get the pointer's value.  */
-			ret = flat_get_addr_from_rp(rp, relval, flags,
-							&addr, &persistent);
+			ret = flat_get_addr_from_rp(rp, relval, flags, &addr);
 			if (unlikely(ret))
 				goto err;
 
@@ -807,8 +824,13 @@
 				 * Do the relocation.  PIC relocs in the data section are
 				 * already in target order
 				 */
-				if ((flags & FLAT_FLAG_GOTPIC) == 0)
-					addr = ntohl(addr);
+				if ((flags & FLAT_FLAG_GOTPIC) == 0) {
+					/*
+					 * Meh, the same value can have a different
+					 * byte order based on a flag.
+					 */
+					addr = ntohl((__force __be32)addr);
+				}
 				addr = calc_reloc(addr, libinfo, id, 0);
 				if (addr == RELOC_FAILED) {
 					ret = -ENOEXEC;
@@ -821,14 +843,15 @@
 					goto err;
 			}
 		}
+#ifdef CONFIG_BINFMT_FLAT_OLD
 	} else {
 		for (i = 0; i < relocs; i++) {
-			u32 relval;
+			__be32 relval;
 			if (get_user(relval, reloc + i))
 				return -EFAULT;
-			relval = ntohl(relval);
-			old_reloc(relval);
+			old_reloc(ntohl(relval));
 		}
+#endif /* CONFIG_BINFMT_FLAT_OLD */
 	}
 
 	flush_icache_range(start_code, end_code);
@@ -856,9 +879,14 @@
 
 static int load_flat_shared_library(int id, struct lib_info *libs)
 {
+	/*
+	 * This is a fake bprm struct; only the members "buf", "file" and
+	 * "filename" are actually used.
+	 */
 	struct linux_binprm bprm;
 	int res;
 	char buf[16];
+	loff_t pos = 0;
 
 	memset(&bprm, 0, sizeof(bprm));
 
@@ -872,25 +900,11 @@
 	if (IS_ERR(bprm.file))
 		return res;
 
-	bprm.cred = prepare_exec_creds();
-	res = -ENOMEM;
-	if (!bprm.cred)
-		goto out;
+	res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
 
-	/* We don't really care about recalculating credentials at this point
-	 * as we're past the point of no return and are dealing with shared
-	 * libraries.
-	 */
-	bprm.called_set_creds = 1;
-
-	res = prepare_binprm(&bprm);
-
-	if (!res)
+	if (res >= 0)
 		res = load_flat_file(&bprm, libs, id, NULL);
 
-	abort_creds(bprm.cred);
-
-out:
 	allow_write_access(bprm.file);
 	fput(bprm.file);
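
The relocation loop above now reads each on-disk word into a __be32
temporary and byte-swaps it exactly once, so sparse can type-check the
endianness. A compilable model with a toy two-entry table;
flat_get_relocate_addr() falls back to the identity, matching the #ifndef
default added at the top of the file:

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>

	#define flat_get_relocate_addr(rel)	(rel)	/* identity fallback */

	int main(void)
	{
		/* bFLT stores relocations big-endian, whatever the host is. */
		uint32_t reloc[] = { htonl(0x10), htonl(0x424) };

		for (unsigned int i = 0; i < 2; i++) {
			uint32_t tmp = reloc[i];	/* __be32 in the kernel */
			uint32_t relval = ntohl(tmp);	/* host order from here */

			printf("reloc %u -> addr %#x\n",
			       i, flat_get_relocate_addr(relval));
		}
		return 0;
	}
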
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index aa4a7a2..cdb4582 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * binfmt_misc.c
  *
@@ -22,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -820,7 +822,7 @@
 	.evict_inode	= bm_evict_inode,
 };
 
-static int bm_fill_super(struct super_block *sb, void *data, int silent)
+static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	int err;
 	static const struct tree_descr bm_files[] = {
@@ -835,10 +837,19 @@
 	return err;
 }
 
-static struct dentry *bm_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, data, bm_fill_super);
+	return get_tree_single(fc, bm_fill_super);
+}
+
+static const struct fs_context_operations bm_context_ops = {
+	.get_tree	= bm_get_tree,
+};
+
+static int bm_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &bm_context_ops;
+	return 0;
 }
 
 static struct linux_binfmt misc_format = {
@@ -849,7 +860,7 @@
 static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
-	.mount		= bm_mount,
+	.init_fs_context = bm_init_fs_context,
 	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("binfmt_misc");
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f4..e9e6a6f 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/binfmt_script.c
  *
@@ -14,13 +15,30 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+	for (; first <= last; first++)
+		if (!spacetab(*first))
+			return first;
+	return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+	for (; first <= last; first++)
+		if (spacetab(*first) || !*first)
+			return first;
+	return NULL;
+}
+
 static int load_script(struct linux_binprm *bprm)
 {
 	const char *i_arg, *i_name;
-	char *cp;
+	char *cp, *buf_end;
 	struct file *file;
 	int retval;
 
+	/* Not ours to exec if we don't start with "#!". */
 	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
 		return -ENOEXEC;
 
@@ -33,18 +51,40 @@
 	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
 		return -ENOENT;
 
-	/*
-	 * This section does the #! interpretation.
-	 * Sorta complicated, but hopefully it will work.  -TYT
-	 */
-
+	/* Release since we are not mapping a binary into memory. */
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
 
-	bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
-	if ((cp = strchr(bprm->buf, '\n')) == NULL)
-		cp = bprm->buf+BINPRM_BUF_SIZE-1;
+	/*
+	 * This section handles parsing the #! line into separate
+	 * interpreter path and argument strings. We must be careful
+	 * because bprm->buf is not yet guaranteed to be NUL-terminated
+	 * (though the buffer will have trailing NUL padding when the
+	 * file size was smaller than the buffer size).
+	 *
+	 * We do not want to exec a truncated interpreter path, so either
+	 * we find a newline (which indicates nothing is truncated), or
+	 * we find a space/tab/NUL after the interpreter path (which
+	 * itself may be preceded by spaces/tabs). Truncating the
+	 * arguments is fine: the interpreter can re-read the script to
+	 * parse them on its own.
+	 */
+	buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+	cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+	if (!cp) {
+		cp = next_non_spacetab(bprm->buf + 2, buf_end);
+		if (!cp)
+			return -ENOEXEC; /* Entire buf is spaces/tabs */
+		/*
+		 * If there is no later space/tab/NUL we must assume the
+		 * interpreter path is truncated.
+		 */
+		if (!next_terminator(cp, buf_end))
+			return -ENOEXEC;
+		cp = buf_end;
+	}
+	/* NUL-terminate the buffer and any trailing spaces/tabs. */
 	*cp = '\0';
 	while (cp > bprm->buf) {
 		cp--;
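
The rewritten #! parser refuses to run a script whose interpreter path may
have been cut off at the end of the exec buffer: either a newline proves
nothing was truncated, or a space/tab/NUL terminator after the path does.
That decision logic, modeled as a compilable predicate (BUF_SIZE stands in
for BINPRM_BUF_SIZE):

	#include <stdio.h>
	#include <string.h>

	#define BUF_SIZE 128	/* stand-in for BINPRM_BUF_SIZE */

	static int spacetab(char c) { return c == ' ' || c == '\t'; }

	/* 1: safe to parse; 0: reject with -ENOEXEC, as the kernel does. */
	static int shebang_parseable(const char *buf, size_t len)
	{
		const char *end = buf + len - 1;
		const char *cp;

		if (memchr(buf, '\n', len))
			return 1;	/* newline: nothing was truncated */
		for (cp = buf + 2; cp <= end; cp++)
			if (!spacetab(*cp))
				break;
		if (cp > end)
			return 0;	/* only spaces/tabs after "#!" */
		for (; cp <= end; cp++)
			if (spacetab(*cp) || !*cp)
				return 1;	/* path ends inside buf */
		return 0;	/* path runs into the truncation point */
	}

	int main(void)
	{
		char ok[BUF_SIZE] = "#!/bin/sh\n";
		char cut[BUF_SIZE];

		memset(cut, 'x', sizeof(cut));	/* path longer than buf */
		memcpy(cut, "#!", 2);
		printf("ok=%d cut=%d\n", shebang_parseable(ok, sizeof(ok)),
		       shebang_parseable(cut, sizeof(cut)));
		return 0;
	}
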
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38b8ce0..9c073db 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/block_dev.c
  *
@@ -25,12 +26,11 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
-#include <linux/dax.h>
-#include <linux/badblocks.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <linux/uaccess.h>
@@ -104,6 +104,20 @@
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
+static void set_init_blocksize(struct block_device *bdev)
+{
+	unsigned bsize = bdev_logical_block_size(bdev);
+	loff_t size = i_size_read(bdev->bd_inode);
+
+	while (bsize < PAGE_SIZE) {
+		if (size & bsize)
+			break;
+		bsize <<= 1;
+	}
+	bdev->bd_block_size = bsize;
+	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
 int set_blocksize(struct block_device *bdev, int size)
 {
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
@@ -181,7 +195,7 @@
 	struct task_struct *waiter = bio->bi_private;
 
 	WRITE_ONCE(bio->bi_private, NULL);
-	wake_up_process(waiter);
+	blk_wake_io_task(waiter);
 }
 
 static ssize_t
@@ -190,13 +204,12 @@
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
 	ssize_t ret;
 	blk_qc_t qc;
-	int i;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
@@ -232,6 +245,8 @@
 		bio.bi_opf = dio_bio_write_op(iocb);
 		task_io_account_write(ret);
 	}
+	if (iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(&bio, iocb);
 
 	qc = submit_bio(&bio);
 	for (;;) {
@@ -239,17 +254,12 @@
 		if (!READ_ONCE(bio.bi_private))
 			break;
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 
-	bio_for_each_segment_all(bvec, &bio, i) {
-		if (should_dirty && !PageCompound(bvec->bv_page))
-			set_page_dirty_lock(bvec->bv_page);
-		put_page(bvec->bv_page);
-	}
-
+	bio_release_pages(&bio, should_dirty);
 	if (unlikely(bio.bi_status))
 		ret = blk_status_to_errno(bio.bi_status);
 
@@ -277,15 +287,23 @@
 
 static struct bio_set blkdev_dio_pool;
 
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
 	bool should_dirty = dio->should_dirty;
 
-	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
-		if (bio->bi_status && !dio->bio.bi_status)
-			dio->bio.bi_status = bio->bi_status;
-	} else {
+	if (bio->bi_status && !dio->bio.bi_status)
+		dio->bio.bi_status = bio->bi_status;
+
+	if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
@@ -298,23 +316,20 @@
 			}
 
 			dio->iocb->ki_complete(iocb, ret, 0);
-			bio_put(&dio->bio);
+			if (dio->multi_bio)
+				bio_put(&dio->bio);
 		} else {
 			struct task_struct *waiter = dio->waiter;
 
 			WRITE_ONCE(dio->waiter, NULL);
-			wake_up_process(waiter);
+			blk_wake_io_task(waiter);
 		}
 	}
 
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-
-		bio_for_each_segment_all(bvec, bio, i)
-			put_page(bvec->bv_page);
+		bio_release_pages(bio, false);
 		bio_put(bio);
 	}
 }
@@ -328,6 +343,7 @@
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
+	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
@@ -338,20 +354,27 @@
 		return -EINVAL;
 
 	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-	bio_get(bio); /* extra ref for the completion handler */
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
-	if (dio->is_sync)
+	if (dio->is_sync) {
 		dio->waiter = current;
-	else
+		bio_get(bio);
+	} else {
 		dio->iocb = iocb;
+	}
 
 	dio->size = 0;
 	dio->multi_bio = false;
-	dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+	dio->should_dirty = is_read && iter_is_iovec(iter);
 
-	blk_start_plug(&plug);
+	/*
+	 * Don't plug for HIPRI/polled IO, as those should go straight
+	 * to issue
+	 */
+	if (!is_poll)
+		blk_start_plug(&plug);
+
 	for (;;) {
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
@@ -381,11 +404,28 @@
 
 		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
 		if (!nr_pages) {
+			bool polled = false;
+
+			if (iocb->ki_flags & IOCB_HIPRI) {
+				bio_set_polled(bio, iocb);
+				polled = true;
+			}
+
 			qc = submit_bio(bio);
+
+			if (polled)
+				WRITE_ONCE(iocb->ki_cookie, qc);
 			break;
 		}
 
 		if (!dio->multi_bio) {
+			/*
+			 * AIO needs an extra reference to ensure the dio
+			 * structure which is embedded into the first bio
+			 * stays around.
+			 */
+			if (!is_sync)
+				bio_get(bio);
 			dio->multi_bio = true;
 			atomic_set(&dio->ref, 2);
 		} else {
@@ -395,7 +435,9 @@
 		submit_bio(bio);
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}
-	blk_finish_plug(&plug);
+
+	if (!is_poll)
+		blk_finish_plug(&plug);
 
 	if (!is_sync)
 		return -EIOCBQUEUED;
@@ -406,7 +448,7 @@
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -733,17 +775,9 @@
 	return &ei->vfs_inode;
 }
 
-static void bdev_i_callback(struct rcu_head *head)
+static void bdev_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	struct bdev_inode *bdi = BDEV_I(inode);
-
-	kmem_cache_free(bdev_cachep, bdi);
-}
-
-static void bdev_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, bdev_i_callback);
+	kmem_cache_free(bdev_cachep, BDEV_I(inode));
 }
 
 static void init_once(void *foo)
@@ -783,24 +817,24 @@
 static const struct super_operations bdev_sops = {
 	.statfs = simple_statfs,
 	.alloc_inode = bdev_alloc_inode,
-	.destroy_inode = bdev_destroy_inode,
+	.free_inode = bdev_free_inode,
 	.drop_inode = generic_delete_inode,
 	.evict_inode = bdev_evict_inode,
 };
 
-static struct dentry *bd_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_init_fs_context(struct fs_context *fc)
 {
-	struct dentry *dent;
-	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-	if (!IS_ERR(dent))
-		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
-	return dent;
+	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	fc->s_iflags |= SB_I_CGROUPWB;
+	ctx->ops = &bdev_sops;
+	return 0;
 }
 
 static struct file_system_type bd_type = {
 	.name		= "bdev",
-	.mount		= bd_mount,
+	.init_fs_context = bd_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 
@@ -1105,8 +1139,7 @@
  * Pointer to the block device containing @bdev on success, ERR_PTR()
  * value on failure.
  */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
-					      void *holder)
+struct block_device *bd_start_claiming(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk;
 	struct block_device *whole;
@@ -1153,6 +1186,62 @@
 		return ERR_PTR(err);
 	}
 }
+EXPORT_SYMBOL(bd_start_claiming);
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+	lockdep_assert_held(&bdev_lock);
+	/* tell others that we're done */
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+void bd_finish_claiming(struct block_device *bdev, struct block_device *whole,
+			void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	/*
+	 * Note that for a whole device bd_holders will be incremented twice,
+	 * and bd_holder will be set to bd_may_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_may_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_finish_claiming);
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device (returned from bd_start_claiming())
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can
+ * also be used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+		       void *holder)
+{
+	spin_lock(&bdev_lock);
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
 
 #ifdef CONFIG_SYSFS
 struct bd_holder_disk {
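
With bd_start_claiming() exported and the finish/abort halves factored out,
exclusive open becomes a three-call protocol around the actual open attempt,
as the caller hunk further down shows. A hedged usage sketch; the
do_exclusive_open() step is hypothetical shorthand for the real open path:

	struct block_device *whole;
	int err;

	whole = bd_start_claiming(bdev, holder);	/* blocks rival claimers */
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	err = do_exclusive_open(bdev);			/* hypothetical step */
	if (err)
		bd_abort_claiming(bdev, whole, holder);	/* wake waiters, give up */
	else
		bd_finish_claiming(bdev, whole, holder);/* record the holder */
	return err;
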
@@ -1360,20 +1449,27 @@
  */
 int revalidate_disk(struct gendisk *disk)
 {
-	struct block_device *bdev;
 	int ret = 0;
 
 	if (disk->fops->revalidate_disk)
 		ret = disk->fops->revalidate_disk(disk);
-	bdev = bdget_disk(disk, 0);
-	if (!bdev)
-		return ret;
 
-	mutex_lock(&bdev->bd_mutex);
-	check_disk_size_change(disk, bdev, ret == 0);
-	bdev->bd_invalidated = 0;
-	mutex_unlock(&bdev->bd_mutex);
-	bdput(bdev);
+	/*
+	 * Hidden disks don't have associated bdev so there's no point in
+	 * revalidating it.
+	 */
+	if (!(disk->flags & GENHD_FL_HIDDEN)) {
+		struct block_device *bdev = bdget_disk(disk, 0);
+
+		if (!bdev)
+			return ret;
+
+		mutex_lock(&bdev->bd_mutex);
+		check_disk_size_change(disk, bdev, ret == 0);
+		bdev->bd_invalidated = 0;
+		mutex_unlock(&bdev->bd_mutex);
+		bdput(bdev);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(revalidate_disk);
@@ -1408,18 +1504,9 @@
 
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-	unsigned bsize = bdev_logical_block_size(bdev);
-
 	inode_lock(bdev->bd_inode);
 	i_size_write(bdev->bd_inode, size);
 	inode_unlock(bdev->bd_inode);
-	while (bsize < PAGE_SIZE) {
-		if (size & bsize)
-			break;
-		bsize <<= 1;
-	}
-	bdev->bd_block_size = bsize;
-	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 }
 EXPORT_SYMBOL(bd_set_size);
 
@@ -1496,8 +1583,10 @@
 				}
 			}
 
-			if (!ret)
+			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+				set_init_blocksize(bdev);
+			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1532,6 +1621,7 @@
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+			set_init_blocksize(bdev);
 		}
 
 		if (bdev->bd_bdi == &noop_backing_dev_info)
@@ -1621,29 +1711,10 @@
 
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
-		spin_lock(&bdev_lock);
-
-		if (!res) {
-			BUG_ON(!bd_may_claim(bdev, whole, holder));
-			/*
-			 * Note that for a whole device bd_holders
-			 * will be incremented twice, and bd_holder
-			 * will be set to bd_may_claim before being
-			 * set to holder
-			 */
-			whole->bd_holders++;
-			whole->bd_holder = bd_may_claim;
-			bdev->bd_holders++;
-			bdev->bd_holder = holder;
-		}
-
-		/* tell others that we're done */
-		BUG_ON(whole->bd_claiming != holder);
-		whole->bd_claiming = NULL;
-		wake_up_bit(&whole->bd_claiming, 0);
-
-		spin_unlock(&bdev_lock);
-
+		if (!res)
+			bd_finish_claiming(bdev, whole, holder);
+		else
+			bd_abort_claiming(bdev, whole, holder);
 		/*
 		 * Block event polling for write claims if requested.  Any
 		 * write holder makes the write_holder state stick until
@@ -1901,6 +1972,9 @@
 	if (bdev_read_only(I_BDEV(bd_inode)))
 		return -EPERM;
 
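+	/* Refuse writes to a block device that is in use as a swapfile */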
+	if (IS_SWAPFILE(bd_inode))
+		return -ETXTBSY;
+
 	if (!iov_iter_count(from))
 		return 0;
 
@@ -1966,6 +2040,7 @@
 	.writepages	= blkdev_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.migratepage	= buffer_migrate_page_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
@@ -2044,6 +2119,7 @@
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
+	.iopoll		= blkdev_iopoll,
 	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 23537bc..38651fa 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,6 +2,8 @@
 
 config BTRFS_FS
 	tristate "Btrfs filesystem support"
+	select CRYPTO
+	select CRYPTO_CRC32C
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd..82200db 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,8 @@
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-	   uuid-tree.o props.o free-space-tree.o tree-checker.o
+	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
+	   block-rsv.o delalloc-space.o block-group.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c95..a0af1b9 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 
 #include "ctree.h"
@@ -72,8 +73,16 @@
 	}
 
 	if (acl) {
+		unsigned int nofs_flag;
+
 		size = posix_acl_xattr_size(acl->a_count);
+		/*
+		 * We're holding a transaction handle, so use a NOFS memory
+		 * allocation context to avoid deadlock if reclaim happens.
+		 */
+		nofs_flag = memalloc_nofs_save();
 		value = kmalloc(size, GFP_KERNEL);
+		memalloc_nofs_restore(nofs_flag);
 		if (!value) {
 			ret = -ENOMEM;
 			goto out;
@@ -84,7 +93,11 @@
 			goto out;
 	}
 
-	ret = btrfs_setxattr(trans, inode, name, value, size, 0);
+	if (trans)
+		ret = btrfs_setxattr(trans, inode, name, value, size, 0);
+	else
+		ret = btrfs_setxattr_trans(inode, name, value, size, 0);
+
 out:
 	kfree(value);
 
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494..2e9e13f 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -12,9 +12,11 @@
 #include "async-thread.h"
 #include "ctree.h"
 
-#define WORK_DONE_BIT 0
-#define WORK_ORDER_DONE_BIT 1
-#define WORK_HIGH_PRIO_BIT 2
+enum {
+	WORK_DONE_BIT,
+	WORK_ORDER_DONE_BIT,
+	WORK_HIGH_PRIO_BIT,
+};
 
 #define NO_THRESHOLD (-1)
 #define DFT_THRESHOLD (32)
@@ -139,13 +141,11 @@
 	}
 
 	if (flags & WQ_HIGHPRI)
-		ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
-						 ret->current_active, "btrfs",
-						 name);
+		ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+						 ret->current_active, name);
 	else
-		ret->normal_wq = alloc_workqueue("%s-%s", flags,
-						 ret->current_active, "btrfs",
-						 name);
+		ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+						 ret->current_active, name);
 	if (!ret->normal_wq) {
 		kfree(ret);
 		return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1..e5d8531 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@
 }
 
 struct preftree {
-	struct rb_root root;
+	struct rb_root_cached root;
 	unsigned int count;
 };
 
-#define PREFTREE_INIT	{ .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }
 
 struct preftrees {
 	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@
 			      struct prelim_ref *newref,
 			      struct share_check *sc)
 {
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct prelim_ref *ref;
 	int result;
+	bool leftmost = true;
 
 	root = &preftree->root;
-	p = &root->rb_node;
+	p = &root->rb_root.rb_node;
 
 	while (*p) {
 		parent = *p;
@@ -242,6 +243,7 @@
 			p = &(*p)->rb_left;
 		} else if (result > 0) {
 			p = &(*p)->rb_right;
+			leftmost = false;
 		} else {
 			/* Identical refs, merge them and free @newref */
 			struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@
 	preftree->count++;
 	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
 	rb_link_node(&newref->rbnode, parent, p);
-	rb_insert_color(&newref->rbnode, root);
+	rb_insert_color_cached(&newref->rbnode, root, leftmost);
 }
 
 /*
@@ -283,11 +285,11 @@
 {
 	struct prelim_ref *ref, *next_ref;
 
-	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
-					     rbnode)
+	rbtree_postorder_for_each_entry_safe(ref, next_ref,
+					     &preftree->root.rb_root, rbnode)
 		free_pref(ref);
 
-	preftree->root = RB_ROOT;
+	preftree->root = RB_ROOT_CACHED;
 	preftree->count = 0;
 }
 
@@ -589,7 +591,7 @@
 }
 
 /*
- * We maintain three seperate rbtrees: one for direct refs, one for
+ * We maintain three separate rbtrees: one for direct refs, one for
  * indirect refs which have a key, and one for indirect refs which do not
  * have a key. Each tree does merge on insertion.
  *
@@ -627,7 +629,7 @@
 	 * freeing the entire indirect tree when we're done.  In some test
 	 * cases, the tree can grow quite large (~200k objects).
 	 */
-	while ((rnode = rb_first(&preftrees->indirect.root))) {
+	while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
 		struct prelim_ref *ref;
 
 		ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@
 			goto out;
 		}
 
-		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+		rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
 		preftrees->indirect.count--;
 
 		if (ref->count == 0) {
@@ -693,7 +695,7 @@
 		}
 
 		/*
-		 * Now it's a direct ref, put it in the the direct tree. We must
+		 * Now it's a direct ref, put it in the direct tree. We must
 		 * do this last because the ref could be merged/freed here.
 		 */
 		prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
@@ -710,16 +712,16 @@
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct preftrees *preftrees)
+			    struct preftrees *preftrees, bool lock)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
 	struct preftree *tree = &preftrees->indirect_missing_keys;
 	struct rb_node *node;
 
-	while ((node = rb_first(&tree->root))) {
+	while ((node = rb_first_cached(&tree->root))) {
 		ref = rb_entry(node, struct prelim_ref, rbnode);
-		rb_erase(node, &tree->root);
+		rb_erase_cached(node, &tree->root);
 
 		BUG_ON(ref->parent);	/* should not be a direct ref */
 		BUG_ON(ref->key_for_search.type);
@@ -735,12 +737,14 @@
 			free_extent_buffer(eb);
 			return -EIO;
 		}
-		btrfs_tree_read_lock(eb);
+		if (lock)
+			btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-		btrfs_tree_read_unlock(eb);
+		if (lock)
+			btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
@@ -769,7 +773,7 @@
 		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
 
 	spin_lock(&head->lock);
-	for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
 		node = rb_entry(n, struct btrfs_delayed_ref_node,
 				ref_node);
 		if (node->seq > seq)
@@ -787,7 +791,7 @@
 			count = node->ref_mod * -1;
 			break;
 		default:
-			BUG_ON(1);
+			BUG();
 		}
 		*total_refs += count;
 		switch (node->type) {
@@ -1225,18 +1229,18 @@
 
 	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &preftrees);
+	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
 	if (ret)
 		goto out;
 
-	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
 
 	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
 				    extent_item_pos, total_refs, sc, ignore_offset);
 	if (ret)
 		goto out;
 
-	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
 
 	/*
 	 * This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1249,7 @@
 	 *
 	 * We release the entire tree in one go before returning.
 	 */
-	node = rb_first(&preftrees.direct.root);
+	node = rb_first_cached(&preftrees.direct.root);
 	while (node) {
 		ref = rb_entry(node, struct prelim_ref, rbnode);
 		node = rb_next(&ref->rbnode);
@@ -1286,11 +1290,15 @@
 					ret = -EIO;
 					goto out;
 				}
-				btrfs_tree_read_lock(eb);
-				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+				if (!path->skip_locking) {
+					btrfs_tree_read_lock(eb);
+					btrfs_set_lock_blocking_read(eb);
+				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie, ignore_offset);
-				btrfs_tree_read_unlock_blocking(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
@@ -1452,37 +1460,35 @@
  * callers (such as fiemap) which want to know whether the extent is
  * shared but do not need a ref count.
  *
- * This attempts to allocate a transaction in order to account for
- * delayed refs, but continues on even when the alloc fails.
+ * This attempts to attach to the running transaction in order to account for
+ * delayed refs, but continues on even when no running transaction exists.
  *
  * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
  */
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+		struct ulist *roots, struct ulist *tmp)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
-	struct ulist *tmp = NULL;
-	struct ulist *roots = NULL;
 	struct ulist_iterator uiter;
 	struct ulist_node *node;
 	struct seq_list elem = SEQ_LIST_INIT(elem);
 	int ret = 0;
 	struct share_check shared = {
-		.root_objectid = root->objectid,
+		.root_objectid = root->root_key.objectid,
 		.inum = inum,
 		.share_count = 0,
 	};
 
-	tmp = ulist_alloc(GFP_NOFS);
-	roots = ulist_alloc(GFP_NOFS);
-	if (!tmp || !roots) {
-		ulist_free(tmp);
-		ulist_free(roots);
-		return -ENOMEM;
-	}
+	ulist_init(roots);
+	ulist_init(tmp);
 
-	trans = btrfs_join_transaction(root);
+	trans = btrfs_join_transaction_nostart(root);
 	if (IS_ERR(trans)) {
+		if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
 		trans = NULL;
 		down_read(&fs_info->commit_root_sem);
 	} else {
@@ -1515,8 +1521,9 @@
 	} else {
 		up_read(&fs_info->commit_root_sem);
 	}
-	ulist_free(tmp);
-	ulist_free(roots);
+out:
+	ulist_release(roots);
+	ulist_release(tmp);
 	return ret;
 }
 
@@ -1648,7 +1655,7 @@
 		/* make sure we can use eb after releasing the path */
 		if (eb != eb_in) {
 			if (!path->skip_locking)
-				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				btrfs_set_lock_blocking_read(eb);
 			path->nodes[0] = NULL;
 			path->locks[0] = 0;
 		}
@@ -1739,7 +1746,7 @@
 		else if (flags & BTRFS_EXTENT_FLAG_DATA)
 			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
 		else
-			BUG_ON(1);
+			BUG();
 		return 0;
 	}
 
@@ -1904,14 +1911,20 @@
 			extent_item_objectid);
 
 	if (!search_commit_root) {
-		trans = btrfs_join_transaction(fs_info->extent_root);
-		if (IS_ERR(trans))
-			return PTR_ERR(trans);
-		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-	} else {
-		down_read(&fs_info->commit_root_sem);
+		trans = btrfs_attach_transaction(fs_info->extent_root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) != -ENOENT &&
+			    PTR_ERR(trans) != -EROFS)
+				return PTR_ERR(trans);
+			trans = NULL;
+		}
 	}
 
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
 				   tree_mod_seq_elem.seq, &refs,
 				   &extent_item_pos, ignore_offset);
@@ -1943,7 +1956,7 @@
 
 	free_leaf_list(refs);
 out:
-	if (!search_commit_root) {
+	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 		btrfs_end_transaction(trans);
 	} else {
@@ -2018,9 +2031,6 @@
 			ret = -ENOMEM;
 			break;
 		}
-		extent_buffer_get(eb);
-		btrfs_tree_read_lock(eb);
-		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
 		item = btrfs_item_nr(slot);
@@ -2031,7 +2041,8 @@
 			/* path must be released before calling iterate()! */
 			btrfs_debug(fs_root->fs_info,
 				"following ref at offset %u for inode %llu in tree %llu",
-				cur, found_key.objectid, fs_root->objectid);
+				cur, found_key.objectid,
+				fs_root->root_key.objectid);
 			ret = iterate(parent, name_len,
 				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
@@ -2039,7 +2050,6 @@
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
-		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 	}
 
@@ -2080,10 +2090,6 @@
 			ret = -ENOMEM;
 			break;
 		}
-		extent_buffer_get(eb);
-
-		btrfs_tree_read_lock(eb);
-		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
 		item_size = btrfs_item_size_nr(eb, slot);
@@ -2104,7 +2110,6 @@
 			cur_offset += btrfs_inode_extref_name_len(eb, extref);
 			cur_offset += sizeof(*extref);
 		}
-		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 
 		offset++;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 54d5898..777f61d 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -57,7 +57,8 @@
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+		struct ulist *roots, struct ulist *tmp_ulist);
 
 int __init btrfs_prelim_ref_init(void);
 void __cold btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
new file mode 100644
index 0000000..670700c
--- /dev/null
+++ b/fs/btrfs/block-group.c
@@ -0,0 +1,3174 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "disk-io.h"
+#include "free-space-cache.h"
+#include "free-space-tree.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "transaction.h"
+#include "ref-verify.h"
+#include "sysfs.h"
+#include "tree-log.h"
+#include "delalloc-space.h"
+
+/*
+ * Return target flags in extended format or 0 if restripe for this chunk_type
+ * is not in progress
+ *
+ * Should be called with balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	u64 target = 0;
+
+	if (!bctl)
+		return 0;
+
+	if (flags & BTRFS_BLOCK_GROUP_DATA &&
+	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+	}
+
+	return target;
+}
+
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Return reduced profile in chunk format.  If profile changing is in progress
+ * (either running or paused) picks the target profile (if it's already
+ * available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 num_devices = fs_info->fs_devices->rw_devices;
+	u64 target;
+	u64 raid_type;
+	u64 allowed = 0;
+
+	/*
+	 * See if restripe for this chunk_type is in progress, if so try to
+	 * reduce to the target profile
+	 */
+	spin_lock(&fs_info->balance_lock);
+	target = get_restripe_target(fs_info, flags);
+	if (target) {
+		/* Pick target profile only if it's already available */
+		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+			spin_unlock(&fs_info->balance_lock);
+			return extended_to_chunk(target);
+		}
+	}
+	spin_unlock(&fs_info->balance_lock);
+
+	/* First, mask out the RAID levels which aren't possible */
+	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+			allowed |= btrfs_raid_array[raid_type].bg_flag;
+	}
+	allowed &= flags;
+
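+	/* Of the profiles left, prefer the one with the most redundancy */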
+	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+		allowed = BTRFS_BLOCK_GROUP_RAID5;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+		allowed = BTRFS_BLOCK_GROUP_RAID10;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+		allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	return extended_to_chunk(flags | allowed);
+}
+
+static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+	unsigned seq;
+	u64 flags;
+
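+	/* Resample if the available alloc bits change while we read them */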
+	do {
+		flags = orig_flags;
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
+
+	return btrfs_reduce_alloc_profile(fs_info, flags);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+	return get_alloc_profile(fs_info, orig_flags);
+}
+
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+	atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+
+		/*
+		 * If not empty, someone is still holding the full_stripe_lock
+		 * mutex, which can only be released by the caller, and that
+		 * will definitely cause a use-after-free when the caller
+		 * tries to release the full stripe lock.
+		 *
+		 * There is no better way to resolve this, so we only warn.
+		 */
+		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+	}
+}
+
+/*
+ * This adds the block group to the fs_info rb tree for the block group cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+				struct btrfs_block_group_cache *block_group)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct btrfs_block_group_cache *cache;
+
+	spin_lock(&info->block_group_cache_lock);
+	p = &info->block_group_cache_tree.rb_node;
+
+	while (*p) {
+		parent = *p;
+		cache = rb_entry(parent, struct btrfs_block_group_cache,
+				 cache_node);
+		if (block_group->key.objectid < cache->key.objectid) {
+			p = &(*p)->rb_left;
+		} else if (block_group->key.objectid > cache->key.objectid) {
+			p = &(*p)->rb_right;
+		} else {
+			spin_unlock(&info->block_group_cache_lock);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&block_group->cache_node, parent, p);
+	rb_insert_color(&block_group->cache_node,
+			&info->block_group_cache_tree);
+
+	if (info->first_logical_byte > block_group->key.objectid)
+		info->first_logical_byte = block_group->key.objectid;
+
+	spin_unlock(&info->block_group_cache_lock);
+
+	return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *block_group_cache_tree_search(
+		struct btrfs_fs_info *info, u64 bytenr, int contains)
+{
+	struct btrfs_block_group_cache *cache, *ret = NULL;
+	struct rb_node *n;
+	u64 end, start;
+
+	spin_lock(&info->block_group_cache_lock);
+	n = info->block_group_cache_tree.rb_node;
+
+	while (n) {
+		cache = rb_entry(n, struct btrfs_block_group_cache,
+				 cache_node);
+		end = cache->key.objectid + cache->key.offset - 1;
+		start = cache->key.objectid;
+
+		if (bytenr < start) {
+			if (!contains && (!ret || start < ret->key.objectid))
+				ret = cache;
+			n = n->rb_left;
+		} else if (bytenr > start) {
+			if (contains && bytenr <= end) {
+				ret = cache;
+				break;
+			}
+			n = n->rb_right;
+		} else {
+			ret = cache;
+			break;
+		}
+	}
+	if (ret) {
+		btrfs_get_block_group(ret);
+		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+			info->first_logical_byte = ret->key.objectid;
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	return ret;
+}
+
+/*
+ * Return the block group that starts at or after bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+		struct btrfs_fs_info *info, u64 bytenr)
+{
+	return block_group_cache_tree_search(info, bytenr, 0);
+}
+
+/*
+ * Return the block group that contains the given bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+		struct btrfs_fs_info *info, u64 bytenr)
+{
+	return block_group_cache_tree_search(info, bytenr, 1);
+}
+
+struct btrfs_block_group_cache *btrfs_next_block_group(
+		struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->block_group_cache_lock);
+
+	/* If our block group was removed, we need a full search. */
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
+		return cache;
+	}
+	node = rb_next(&cache->cache_node);
+	btrfs_put_block_group(cache);
+	if (node) {
+		cache = rb_entry(node, struct btrfs_block_group_cache,
+				 cache_node);
+		btrfs_get_block_group(cache);
+	} else
+		cache = NULL;
+	spin_unlock(&fs_info->block_group_cache_lock);
+	return cache;
+}
+
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+	bool ret = true;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg)
+		return false;
+
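+	/* A read-only block group must not take any new nocow writes */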
+	spin_lock(&bg->lock);
+	if (bg->ro)
+		ret = false;
+	else
+		atomic_inc(&bg->nocow_writers);
+	spin_unlock(&bg->lock);
+
+	/* No put on block group, done by btrfs_dec_nocow_writers */
+	if (!ret)
+		btrfs_put_block_group(bg);
+
+	return ret;
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->nocow_writers))
+		wake_up_var(&bg->nocow_writers);
+	/*
+	 * Once for our lookup and once for the lookup done by a previous call
+	 * to btrfs_inc_nocow_writers()
+	 */
+	btrfs_put_block_group(bg);
+	btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+					const u64 start)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, start);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->reservations))
+		wake_up_var(&bg->reservations);
+	btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+	struct btrfs_space_info *space_info = bg->space_info;
+
+	ASSERT(bg->ro);
+
+	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+		return;
+
+	/*
+	 * Our block group is read only but before we set it to read only,
+	 * some task might have already allocated an extent from it, but it
+	 * has not yet created a respective ordered extent (and added it to a
+	 * root's list of ordered extents).
+	 * Therefore wait for any task currently allocating extents, since the
+	 * block group's reservations counter is incremented while a read lock
+	 * on the groups' semaphore is held and decremented after releasing
+	 * the read access on that semaphore and creating the ordered extent.
+	 */
+	down_write(&space_info->groups_sem);
+	up_write(&space_info->groups_sem);
+
+	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
+}
+
+struct btrfs_caching_control *btrfs_get_caching_control(
+		struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *ctl;
+
+	spin_lock(&cache->lock);
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
+	ctl = cache->caching_ctl;
+	refcount_inc(&ctl->count);
+	spin_unlock(&cache->lock);
+	return ctl;
+}
+
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+{
+	if (refcount_dec_and_test(&ctl->count))
+		kfree(ctl);
+}
+
+/*
+ * When we wait for progress in the block group caching, it's because our
+ * allocation attempt failed at least once.  So, we must sleep and let some
+ * progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to show
+ * up, and then it will check the block group free space numbers for our min
+ * num_bytes.  Another option is to have it go ahead and look in the rbtree for
+ * a free extent of a given size, but this is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+					   u64 num_bytes)
+{
+	struct btrfs_caching_control *caching_ctl;
+
+	caching_ctl = btrfs_get_caching_control(cache);
+	if (!caching_ctl)
+		return;
+
+	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
+		   (cache->free_space_ctl->free_space >= num_bytes));
+
+	btrfs_put_caching_control(caching_ctl);
+}
+
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = btrfs_get_caching_control(cache);
+	if (!caching_ctl)
+		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+
+	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
+	if (cache->cached == BTRFS_CACHE_ERROR)
+		ret = -EIO;
+	btrfs_put_caching_control(caching_ctl);
+	return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	u64 start = block_group->key.objectid;
+	u64 len = block_group->key.offset;
+	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+		fs_info->nodesize : fs_info->sectorsize;
+	u64 step = chunk << 1;
+
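+	/* Remove every other chunk so the remaining free space is fragmented */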
+	while (len > chunk) {
+		btrfs_remove_free_space(block_group, start, chunk);
+		start += step;
+		if (len < step)
+			len = 0;
+		else
+			len -= step;
+	}
+}
+#endif
+
+/*
+ * This is only called by btrfs_cache_block_group.  Since we could have freed
+ * extents, we need to check the pinned_extents for any extents that can't be
+ * used yet since their free space will be released as soon as the transaction
+ * commits.
+ */
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+		       u64 start, u64 end)
+{
+	struct btrfs_fs_info *info = block_group->fs_info;
+	u64 extent_start, extent_end, size, total_added = 0;
+	int ret;
+
+	while (start < end) {
+		ret = find_first_extent_bit(info->pinned_extents, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
+		if (ret)
+			break;
+
+		if (extent_start <= start) {
+			start = extent_end + 1;
+		} else if (extent_start > start && extent_start < end) {
+			size = extent_start - start;
+			total_added += size;
+			ret = btrfs_add_free_space(block_group, start,
+						   size);
+			BUG_ON(ret); /* -ENOMEM or logic error */
+			start = extent_end + 1;
+		} else {
+			break;
+		}
+	}
+
+	if (start < end) {
+		size = end - start;
+		total_added += size;
+		ret = btrfs_add_free_space(block_group, start, size);
+		BUG_ON(ret); /* -ENOMEM or logic error */
+	}
+
+	return total_added;
+}
+
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
+{
+	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u64 total_found = 0;
+	u64 last = 0;
+	u32 nritems;
+	int ret;
+	bool wakeup = true;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
+#ifdef CONFIG_BTRFS_DEBUG
+	/*
+	 * If we're fragmenting we don't want to make anybody think we can
+	 * allocate from this block group until we've had a chance to fragment
+	 * the free space.
+	 */
+	if (btrfs_should_fragment_free_space(block_group))
+		wakeup = false;
+#endif
+	/*
+	 * We don't want to deadlock with somebody trying to allocate a new
+	 * extent for the extent root while also trying to search the extent
+	 * root to add free space.  So we skip locking and search the commit
+	 * root, since its read-only
+	 */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = READA_FORWARD;
+
+	key.objectid = last;
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+
+next:
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
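+		/* Bail out if the filesystem is in the final phase of unmount */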
+		if (btrfs_fs_closing(fs_info) > 1) {
+			last = (u64)-1;
+			break;
+		}
+
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		} else {
+			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
+			if (ret)
+				break;
+
+			if (need_resched() ||
+			    rwsem_is_contended(&fs_info->commit_root_sem)) {
+				if (wakeup)
+					caching_ctl->progress = last;
+				btrfs_release_path(path);
+				up_read(&fs_info->commit_root_sem);
+				mutex_unlock(&caching_ctl->mutex);
+				cond_resched();
+				mutex_lock(&caching_ctl->mutex);
+				down_read(&fs_info->commit_root_sem);
+				goto next;
+			}
+
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		if (key.objectid < last) {
+			key.objectid = last;
+			key.offset = 0;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+
+			if (wakeup)
+				caching_ctl->progress = last;
+			btrfs_release_path(path);
+			goto next;
+		}
+
+		if (key.objectid < block_group->key.objectid) {
+			path->slots[0]++;
+			continue;
+		}
+
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset)
+			break;
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
+			total_found += add_new_free_space(block_group, last,
+							  key.objectid);
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				last = key.objectid +
+					fs_info->nodesize;
+			else
+				last = key.objectid + key.offset;
+
+			if (total_found > CACHING_CTL_WAKE_UP) {
+				total_found = 0;
+				if (wakeup)
+					wake_up(&caching_ctl->wait);
+			}
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+
+	total_found += add_new_free_space(block_group, last,
+					  block_group->key.objectid +
+					  block_group->key.offset);
+	caching_ctl->progress = (u64)-1;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	int ret;
+
+	caching_ctl = container_of(work, struct btrfs_caching_control, work);
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+
+	mutex_lock(&caching_ctl->mutex);
+	down_read(&fs_info->commit_root_sem);
+
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		ret = load_free_space_tree(caching_ctl);
+	else
+		ret = load_extent_tree_free(caching_ctl);
+
+	spin_lock(&block_group->lock);
+	block_group->caching_ctl = NULL;
+	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
+	spin_unlock(&block_group->lock);
+
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(block_group)) {
+		u64 bytes_used;
+
+		spin_lock(&block_group->space_info->lock);
+		spin_lock(&block_group->lock);
+		bytes_used = block_group->key.offset -
+			btrfs_block_group_used(&block_group->item);
+		block_group->space_info->bytes_used += bytes_used >> 1;
+		spin_unlock(&block_group->lock);
+		spin_unlock(&block_group->space_info->lock);
+		fragment_free_space(block_group);
+	}
+#endif
+
+	caching_ctl->progress = (u64)-1;
+
+	up_read(&fs_info->commit_root_sem);
+	btrfs_free_excluded_extents(block_group);
+	mutex_unlock(&caching_ctl->mutex);
+
+	wake_up(&caching_ctl->wait);
+
+	btrfs_put_caching_control(caching_ctl);
+	btrfs_put_block_group(block_group);
+}
+
+int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
+			    int load_cache_only)
+{
+	DEFINE_WAIT(wait);
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	if (!caching_ctl)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	refcount_set(&caching_ctl->count, 1);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but it can happen in the case where
+	 * one thread starts to load the space cache info, and then
+	 * some other thread starts a transaction commit which tries to do an
+	 * allocation while the other thread is still loading the space cache
+	 * info.  The previous loop should have kept us from choosing this block
+	 * group, but if we've moved to the state where we will wait on caching
+	 * block groups we need to first check if we're doing a fast load here,
+	 * so we can wait for it to finish, otherwise we could end up allocating
+	 * from a block group whose cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		refcount_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		btrfs_put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
+		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
+
+	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+		mutex_lock(&caching_ctl->mutex);
+		ret = load_free_space_cache(cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
+		} else {
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
+			}
+		}
+		spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+		if (ret == 1 &&
+		    btrfs_should_fragment_free_space(cache)) {
+			u64 bytes_used;
+
+			spin_lock(&cache->space_info->lock);
+			spin_lock(&cache->lock);
+			bytes_used = cache->key.offset -
+				btrfs_block_group_used(&cache->item);
+			cache->space_info->bytes_used += bytes_used >> 1;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+			fragment_free_space(cache);
+		}
+#endif
+		mutex_unlock(&caching_ctl->mutex);
+
+		wake_up(&caching_ctl->wait);
+		if (ret == 1) {
+			btrfs_put_caching_control(caching_ctl);
+			btrfs_free_excluded_extents(cache);
+			return 0;
+		}
+	} else {
+		/*
+		 * We're either using the free space tree or no caching at all.
+		 * Set cached to the appropriate value and wakeup any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+			cache->has_caching_ctl = 1;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
+	}
+
+	if (load_cache_only) {
+		btrfs_put_caching_control(caching_ctl);
+		return 0;
+	}
+
+	down_write(&fs_info->commit_root_sem);
+	refcount_inc(&caching_ctl->count);
+	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+	up_write(&fs_info->commit_root_sem);
+
+	btrfs_get_block_group(cache);
+
+	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+
+	return ret;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * Clear incompat bits for the following feature(s):
+ *
+ * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
+ *            in the whole filesystem
+ */
+static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		struct list_head *head = &fs_info->space_info;
+		struct btrfs_space_info *sinfo;
+
+		list_for_each_entry_rcu(sinfo, head, list) {
+			bool found = false;
+
+			down_read(&sinfo->groups_sem);
+			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
+				found = true;
+			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
+				found = true;
+			up_read(&sinfo->groups_sem);
+
+			if (found)
+				return;
+		}
+		btrfs_clear_fs_incompat(fs_info, RAID56);
+	}
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     u64 group_start, struct extent_map *em)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_path *path;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_key key;
+	struct inode *inode;
+	struct kobject *kobj = NULL;
+	int ret;
+	int index;
+	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;
+	bool remove_rsv = false;
+
+	block_group = btrfs_lookup_block_group(fs_info, group_start);
+	BUG_ON(!block_group);
+	BUG_ON(!block_group->ro);
+
+	trace_btrfs_remove_block_group(block_group);
+	/*
+	 * Free the reserved super bytes from this block group before
+	 * remove it.
+	 */
+	btrfs_free_excluded_extents(block_group);
+	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
+				  block_group->key.offset);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+	index = btrfs_bg_flags_to_raid_index(block_group->flags);
+	factor = btrfs_bg_type_to_factor(block_group->flags);
+
+	/* make sure this block group isn't part of an allocation cluster */
+	cluster = &fs_info->data_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
+	/*
+	 * make sure this block group isn't part of a metadata
+	 * allocation cluster
+	 */
+	cluster = &fs_info->meta_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * get the inode first so any iput calls done for the io_list
+	 * aren't the final iput (no unlinks allowed now)
+	 */
+	inode = lookup_free_space_inode(block_group, path);
+
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	/*
+	 * Make sure our free space cache IO is done before removing the
+	 * free space inode
+	 */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->io_list)) {
+		list_del_init(&block_group->io_list);
+
+		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		btrfs_wait_cache_io(trans, block_group, path);
+		btrfs_put_block_group(block_group);
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+	}
+
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		remove_rsv = true;
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
+	if (!IS_ERR(inode)) {
+		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+		if (ret) {
+			btrfs_add_delayed_iput(inode);
+			goto out;
+		}
+		clear_nlink(inode);
+		/* One for the block groups ref */
+		spin_lock(&block_group->lock);
+		if (block_group->iref) {
+			block_group->iref = 0;
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			iput(inode);
+		} else {
+			spin_unlock(&block_group->lock);
+		}
+		/* One for our lookup ref */
+		btrfs_add_delayed_iput(inode);
+	}
+
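+	/* Delete the free space cache item for this block group, if any */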
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0)
+		btrfs_release_path(path);
+	if (ret == 0) {
+		ret = btrfs_del_item(trans, tree_root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	spin_lock(&fs_info->block_group_cache_lock);
+	rb_erase(&block_group->cache_node,
+		 &fs_info->block_group_cache_tree);
+	RB_CLEAR_NODE(&block_group->cache_node);
+
+	if (fs_info->first_logical_byte == block_group->key.objectid)
+		fs_info->first_logical_byte = (u64)-1;
+	spin_unlock(&fs_info->block_group_cache_lock);
+
+	down_write(&block_group->space_info->groups_sem);
+	/*
+	 * we must use list_del_init so people can check to see if they
+	 * are still on the list after taking the semaphore
+	 */
+	list_del_init(&block_group->list);
+	if (list_empty(&block_group->space_info->block_groups[index])) {
+		kobj = block_group->space_info->block_group_kobjs[index];
+		block_group->space_info->block_group_kobjs[index] = NULL;
+		clear_avail_alloc_bits(fs_info, block_group->flags);
+	}
+	up_write(&block_group->space_info->groups_sem);
+	clear_incompat_bg_bits(fs_info, block_group->flags);
+	if (kobj) {
+		kobject_del(kobj);
+		kobject_put(kobj);
+	}
+
+	if (block_group->has_caching_ctl)
+		caching_ctl = btrfs_get_caching_control(block_group);
+	if (block_group->cached == BTRFS_CACHE_STARTED)
+		btrfs_wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					refcount_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			btrfs_put_caching_control(caching_ctl);
+			btrfs_put_caching_control(caching_ctl);
+		}
+	}
+
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	WARN_ON(!list_empty(&block_group->dirty_list));
+	WARN_ON(!list_empty(&block_group->io_list));
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+	btrfs_remove_free_space_cache(block_group);
+
+	spin_lock(&block_group->space_info->lock);
+	list_del_init(&block_group->ro_list);
+
+	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		WARN_ON(block_group->space_info->total_bytes
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->bytes_readonly
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->disk_total
+			< block_group->key.offset * factor);
+	}
+	block_group->space_info->total_bytes -= block_group->key.offset;
+	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	block_group->space_info->disk_total -= block_group->key.offset * factor;
+
+	spin_unlock(&block_group->space_info->lock);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+
+	mutex_lock(&fs_info->chunk_mutex);
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can find it anymore, and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 *
+	 * There may also be an implicit trim operation if the file system
+	 * is mounted with -odiscard. The same protections must remain
+	 * in place until the extents have been discarded completely when
+	 * the transaction commit has completed.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	spin_unlock(&block_group->lock);
+
+	mutex_unlock(&fs_info->chunk_mutex);
+
+	ret = remove_block_group_free_space(trans, block_group);
+	if (ret)
+		goto out;
+
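+	/* Once for the block group rbtree and once for our earlier lookup */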
+	btrfs_put_block_group(block_group);
+	btrfs_put_block_group(block_group);
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &fs_info->mapping_tree;
+		write_lock(&em_tree->lock);
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+out:
+	if (remove_rsv)
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
+	btrfs_free_path(path);
+	return ret;
+}
+
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	unsigned int num_items;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	read_unlock(&em_tree->lock);
+	ASSERT(em && em->start == chunk_offset);
+
+	/*
+	 * We need to reserve 3 + N units from the metadata space info in order
+	 * to remove a block group (done at btrfs_remove_chunk() and at
+	 * btrfs_remove_block_group()), which are used for:
+	 *
+	 * 1 unit for adding the free space inode's orphan (located in the tree
+	 * of tree roots).
+	 * 1 unit for deleting the block group item (located in the extent
+	 * tree).
+	 * 1 unit for deleting the free space item (located in tree of tree
+	 * roots).
+	 * N units for deleting N device extent items corresponding to each
+	 * stripe (located in the device tree).
+	 *
+	 * In order to remove a block group we also need to reserve units in the
+	 * system space info in order to update the chunk tree (update one or
+	 * more device items and remove one chunk item), but this is done at
+	 * btrfs_remove_chunk() through a call to check_system_chunk().
+	 */
+	map = em->map_lookup;
+	num_items = 3 + map->num_stripes;
+	free_extent_map(em);
+
+	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+							   num_items, 1);
+}
+
+/*
+ * Mark block group @cache read-only, so later write won't happen to block
+ * group @cache.
+ *
+ * If @force is not set, this function will only mark the block group readonly
+ * if we have enough free space (1M) in other metadata/system block groups.
+ * If @force is set, this function will mark the block group readonly
+ * without checking free space.
+ *
+ * NOTE: This function doesn't care if other block groups can contain all the
+ * data in this block group. That check should be done by the relocation
+ * routine, not this function.
+ */
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+{
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+	u64 sinfo_used;
+	u64 min_allocable_bytes;
+	int ret = -ENOSPC;
+
+	/*
+	 * We need some metadata space and system metadata space for
+	 * allocating chunks in some corner cases, unless we're forced to
+	 * mark it read-only.
+	 */
+	if ((sinfo->flags &
+	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+	    !force)
+		min_allocable_bytes = SZ_1M;
+	else
+		min_allocable_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		cache->ro++;
+		ret = 0;
+		goto out;
+	}
+
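+	/* Bytes not yet used, reserved, pinned or occupied by super stripes */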
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+	sinfo_used = btrfs_space_info_used(sinfo, true);
+
+	/*
+	 * sinfo_used + num_bytes should always be <= sinfo->total_bytes.
+	 *
+	 * Here we make sure that if we mark this bg RO, we still have enough
+	 * free space left as a buffer (if min_allocable_bytes is not 0).
+	 */
+	if (sinfo_used + num_bytes + min_allocable_bytes <=
+	    sinfo->total_bytes) {
+		sinfo->bytes_readonly += num_bytes;
+		cache->ro++;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+		ret = 0;
+	}
+out:
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(cache->fs_info,
+			"unable to make block group %llu ro",
+			cache->key.objectid);
+		btrfs_info(cache->fs_info,
+			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+			sinfo_used, num_bytes, min_allocable_bytes);
+		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+	}
+	return ret;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+		int trimming;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+
+		space_info = block_group->space_info;
+
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved || block_group->pinned ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro ||
+		    list_is_singular(&block_group->list)) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			trace_btrfs_skip_unused_block_group(block_group);
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = inc_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		trans = btrfs_start_trans_remove_block_group(fs_info,
+						     block_group->key.objectid);
+		if (IS_ERR(trans)) {
+			btrfs_dec_block_group_ro(block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group,
+		 * just delete them, we don't care about them anymore.
+		 */
+		start = block_group->key.objectid;
+		end = start + block_group->key.offset - 1;
+		/*
+		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
+		 * btrfs_finish_extent_commit(). If we are at transaction N,
+		 * another task might be running finish_extent_commit() for the
+		 * previous transaction N - 1, and have seen a range belonging
+		 * to the block group in freed_extents[] before we were able to
+		 * clear the whole block group range from freed_extents[]. This
+		 * means that task can lookup for the block group after we
+		 * unpinned it from freed_extents[] and removed it, leading to
+		 * a BUG_ON() at btrfs_unpin_extent_range().
+		 */
+		mutex_lock(&fs_info->unused_bg_unpin_mutex);
+		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+				  EXTENT_DIRTY);
+		if (ret) {
+			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+			btrfs_dec_block_group_ro(block_group);
+			goto end_trans;
+		}
+		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+				  EXTENT_DIRTY);
+		if (ret) {
+			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+			btrfs_dec_block_group_ro(block_group);
+			goto end_trans;
+		}
+		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+
+		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
+						     -block_group->pinned);
+		space_info->bytes_readonly += block_group->pinned;
+		percpu_counter_add_batch(&space_info->total_bytes_pinned,
+				   -block_group->pinned,
+				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
+		block_group->pinned = 0;
+
+		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
+
+		/* DISCARD can flip during remount */
+		trimming = btrfs_test_opt(fs_info, DISCARD);
+
+		/* Implicit trim during transaction commit. */
+		if (trimming)
+			btrfs_get_block_group_trimming(block_group);
+
+		/*
+		 * btrfs_remove_chunk() will abort the transaction if things go
+		 * horribly wrong.
+		 */
+		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
+
+		if (ret) {
+			if (trimming)
+				btrfs_put_block_group_trimming(block_group);
+			goto end_trans;
+		}
+
+		/*
+		 * If we're not mounted with -odiscard, we can just forget
+		 * about this block group. Otherwise we'll need to wait
+		 * until transaction commit to do the actual discard.
+		 */
+		if (trimming) {
+			spin_lock(&fs_info->unused_bgs_lock);
+			/*
+			 * A concurrent scrub might have added us to the list
+			 * fs_info->unused_bgs, so use a list_move operation
+			 * to add the block group to the deleted_bgs list.
+			 */
+			list_move(&block_group->bg_list,
+				  &trans->transaction->deleted_bgs);
+			spin_unlock(&fs_info->unused_bgs_lock);
+			btrfs_get_block_group(block_group);
+		}
+end_trans:
+		btrfs_end_transaction(trans);
+next:
+		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+		btrfs_put_block_group(block_group);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
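+/*
+ * Queue a now-unused block group on fs_info->unused_bgs so the cleaner
+ * thread can consider deleting it; an extra reference is taken while the
+ * group sits on the list.
+ */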
+void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	if (list_empty(&bg->bg_list)) {
+		btrfs_get_block_group(bg);
+		trace_btrfs_add_unused_block_group(bg);
+		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
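+/*
+ * Find the block group item at or after @key->objectid and cross-check it
+ * against the chunk mapping: a matching chunk must exist, cover exactly the
+ * same range and carry the same type flags, otherwise -ENOENT or -EUCLEAN
+ * is returned.
+ */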
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+				  struct btrfs_path *path,
+				  struct btrfs_key *key)
+{
+	struct btrfs_root *root = fs_info->extent_root;
+	int ret = 0;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	struct btrfs_block_group_item bg;
+	u64 flags;
+	int slot;
+
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (1) {
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid >= key->objectid &&
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			struct extent_map_tree *em_tree;
+			struct extent_map *em;
+
+			em_tree = &root->fs_info->mapping_tree;
+			read_lock(&em_tree->lock);
+			em = lookup_extent_mapping(em_tree, found_key.objectid,
+						   found_key.offset);
+			read_unlock(&em_tree->lock);
+			if (!em) {
+				btrfs_err(fs_info,
+			"logical %llu len %llu found bg but no related chunk",
+					  found_key.objectid, found_key.offset);
+				ret = -ENOENT;
+			} else if (em->start != found_key.objectid ||
+				   em->len != found_key.offset) {
+				btrfs_err(fs_info,
+		"block group %llu len %llu mismatch with chunk %llu len %llu",
+					  found_key.objectid, found_key.offset,
+					  em->start, em->len);
+				ret = -EUCLEAN;
+			} else {
+				read_extent_buffer(leaf, &bg,
+					btrfs_item_ptr_offset(leaf, slot),
+					sizeof(bg));
+				flags = btrfs_block_group_flags(&bg) &
+					BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+				if (flags != (em->map_lookup->type &
+					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+					btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+						found_key.objectid,
+						found_key.offset, flags,
+						(BTRFS_BLOCK_GROUP_TYPE_MASK &
+						 em->map_lookup->type));
+					ret = -EUCLEAN;
+				} else {
+					ret = 0;
+				}
+			}
+			free_extent_map(em);
+			goto out;
+		}
+		path->slots[0]++;
+	}
+out:
+	return ret;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits |= extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
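+/*
+ * Account the superblock mirror copies that live inside this block group:
+ * their ranges are added to ->bytes_super and marked as excluded so the
+ * free space caching code never hands them out for allocation.
+ */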
+static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+		cache->bytes_super += stripe_len;
+		ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid,
+						stripe_len);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
+				       bytenr, &logical, &nr, &stripe_len);
+		if (ret)
+			return ret;
+
+		while (nr--) {
+			u64 start, len;
+
+			if (logical[nr] > cache->key.objectid +
+			    cache->key.offset)
+				continue;
+
+			if (logical[nr] + stripe_len <= cache->key.objectid)
+				continue;
+
+			start = logical[nr];
+			if (start < cache->key.objectid) {
+				start = cache->key.objectid;
+				len = (logical[nr] + stripe_len) - start;
+			} else {
+				len = min_t(u64, stripe_len,
+					    cache->key.objectid +
+					    cache->key.offset - start);
+			}
+
+			cache->bytes_super += len;
+			ret = btrfs_add_excluded_extent(fs_info, start, len);
+			if (ret) {
+				kfree(logical);
+				return ret;
+			}
+		}
+
+		kfree(logical);
+	}
+	return 0;
+}
+
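+/*
+ * Add the block group to its space_info's list for the matching RAID index;
+ * the first group of a given type also creates the sysfs entries for that
+ * type.
+ */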
+static void link_block_group(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+	int index = btrfs_bg_flags_to_raid_index(cache->flags);
+	bool first = false;
+
+	down_write(&space_info->groups_sem);
+	if (list_empty(&space_info->block_groups[index]))
+		first = true;
+	list_add_tail(&cache->list, &space_info->block_groups[index]);
+	up_write(&space_info->groups_sem);
+
+	if (first)
+		btrfs_sysfs_add_block_group_type(cache);
+}
+
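+/*
+ * Allocate and initialize the in-memory descriptor for the block group at
+ * [@start, @start + @size), returning it with a single reference held.
+ */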
+static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
+		struct btrfs_fs_info *fs_info, u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = start;
+	cache->key.offset = size;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+	cache->fs_info = fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
+	set_free_space_tree_thresholds(cache);
+
+	atomic_set(&cache->count, 1);
+	spin_lock_init(&cache->lock);
+	init_rwsem(&cache->data_rwsem);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->bg_list);
+	INIT_LIST_HEAD(&cache->ro_list);
+	INIT_LIST_HEAD(&cache->dirty_list);
+	INIT_LIST_HEAD(&cache->io_list);
+	btrfs_init_free_space_ctl(cache);
+	atomic_set(&cache->trimming, 0);
+	mutex_init(&cache->free_space_lock);
+	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+
+	return cache;
+}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct btrfs_block_group_cache *bg;
+	u64 start = 0;
+	int ret = 0;
+
+	while (1) {
+		read_lock(&map_tree->lock);
+		/*
+		 * lookup_extent_mapping will return the first extent map
+		 * intersecting the range, so setting @len to 1 is enough to
+		 * get the first chunk.
+		 */
+		em = lookup_extent_mapping(map_tree, start, 1);
+		read_unlock(&map_tree->lock);
+		if (!em)
+			break;
+
+		bg = btrfs_lookup_block_group(fs_info, em->start);
+		if (!bg) {
+			btrfs_err(fs_info,
+	"chunk start=%llu len=%llu doesn't have corresponding block group",
+				     em->start, em->len);
+			ret = -EUCLEAN;
+			free_extent_map(em);
+			break;
+		}
+		if (bg->key.objectid != em->start ||
+		    bg->key.offset != em->len ||
+		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+			btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+				em->start, em->len,
+				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+				bg->key.objectid, bg->key.offset,
+				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+			ret = -EUCLEAN;
+			free_extent_map(em);
+			btrfs_put_block_group(bg);
+			break;
+		}
+		start = em->start + em->len;
+		free_extent_map(em);
+		btrfs_put_block_group(bg);
+	}
+	return ret;
+}
+
+int btrfs_read_block_groups(struct btrfs_fs_info *info)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_space_info *space_info;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int need_clear = 0;
+	u64 cache_gen;
+	u64 feature;
+	int mixed;
+
+	feature = btrfs_super_incompat_flags(info->super_copy);
+	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
+
+	key.objectid = 0;
+	key.offset = 0;
+	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = READA_FORWARD;
+
+	cache_gen = btrfs_super_cache_generation(info->super_copy);
+	if (btrfs_test_opt(info, SPACE_CACHE) &&
+	    btrfs_super_generation(info->super_copy) != cache_gen)
+		need_clear = 1;
+	if (btrfs_test_opt(info, CLEAR_CACHE))
+		need_clear = 1;
+
+	while (1) {
+		ret = find_first_block_group(info, path, &key);
+		if (ret > 0)
+			break;
+		if (ret != 0)
+			goto error;
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		cache = btrfs_create_block_group_cache(info, found_key.objectid,
+						       found_key.offset);
+		if (!cache) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (need_clear) {
+			/*
+			 * When we mount with old space cache, we need to
+			 * set BTRFS_DC_CLEAR and set dirty flag.
+			 *
+			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+			 *    truncate the old free space cache inode and
+			 *    setup a new one.
+			 * b) Setting 'dirty flag' makes sure that we flush
+			 *    the new space cache info onto disk.
+			 */
+			if (btrfs_test_opt(info, SPACE_CACHE))
+				cache->disk_cache_state = BTRFS_DC_CLEAR;
+		}
+
+		read_extent_buffer(leaf, &cache->item,
+				   btrfs_item_ptr_offset(leaf, path->slots[0]),
+				   sizeof(cache->item));
+		cache->flags = btrfs_block_group_flags(&cache->item);
+		if (!mixed &&
+		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+			btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+				  cache->key.objectid);
+			btrfs_put_block_group(cache);
+			ret = -EINVAL;
+			goto error;
+		}
+
+		key.objectid = found_key.objectid + found_key.offset;
+		btrfs_release_path(path);
+
+		/*
+		 * We need to exclude the super stripes now so that the space
+		 * info has super bytes accounted for, otherwise we'll think
+		 * we have more space than we actually do.
+		 */
+		ret = exclude_super_stripes(cache);
+		if (ret) {
+			/*
+			 * We may have excluded something, so call this just in
+			 * case.
+			 */
+			btrfs_free_excluded_extents(cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
+		/*
+		 * Check for two cases: either we are full, and therefore
+		 * don't need to bother with the caching work since we won't
+		 * find any space, or we are empty, and we can just add all
+		 * the space in and be done with it.  This saves us _a_lot_ of
+		 * time, particularly in the full case.
+		 */
+		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+			cache->last_byte_to_unpin = (u64)-1;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			btrfs_free_excluded_extents(cache);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			cache->last_byte_to_unpin = (u64)-1;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			add_new_free_space(cache, found_key.objectid,
+					   found_key.objectid +
+					   found_key.offset);
+			btrfs_free_excluded_extents(cache);
+		}
+
+		ret = btrfs_add_block_group_cache(info, cache);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
+		trace_btrfs_add_block_group(info, cache, 0);
+		btrfs_update_space_info(info, cache->flags, found_key.offset,
+					btrfs_block_group_used(&cache->item),
+					cache->bytes_super, &space_info);
+
+		cache->space_info = space_info;
+
+		link_block_group(cache);
+
+		set_avail_alloc_bits(info, cache->flags);
+		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
+			inc_block_group_ro(cache, 1);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			ASSERT(list_empty(&cache->bg_list));
+			btrfs_mark_bg_unused(cache);
+		}
+	}
+
+	list_for_each_entry_rcu(space_info, &info->space_info, list) {
+		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
+		      (BTRFS_BLOCK_GROUP_RAID10 |
+		       BTRFS_BLOCK_GROUP_RAID1_MASK |
+		       BTRFS_BLOCK_GROUP_RAID56_MASK |
+		       BTRFS_BLOCK_GROUP_DUP)))
+			continue;
+		/*
+		 * Avoid allocating from un-mirrored block group if there are
+		 * mirrored block groups.
+		 */
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_RAID0],
+				list)
+			inc_block_group_ro(cache, 1);
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_SINGLE],
+				list)
+			inc_block_group_ro(cache, 1);
+	}
+
+	btrfs_init_global_block_rsv(info);
+	ret = check_chunk_block_group_mappings(info);
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
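+/*
+ * Insert the block group items for all block groups created in this
+ * transaction and finish the second phase of their chunk allocation.
+ */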
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_block_group_item item;
+	struct btrfs_key key;
+	int ret = 0;
+
+	if (!trans->can_flush_pending_bgs)
+		return;
+
+	while (!list_empty(&trans->new_bgs)) {
+		block_group = list_first_entry(&trans->new_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		if (ret)
+			goto next;
+
+		spin_lock(&block_group->lock);
+		memcpy(&item, &block_group->item, sizeof(item));
+		memcpy(&key, &block_group->key, sizeof(key));
+		spin_unlock(&block_group->lock);
+
+		ret = btrfs_insert_item(trans, extent_root, &key, &item,
+					sizeof(item));
+		if (ret)
+			btrfs_abort_transaction(trans, ret);
+		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
+		if (ret)
+			btrfs_abort_transaction(trans, ret);
+		add_block_group_free_space(trans, block_group);
+		/* Already aborted the transaction if it failed. */
+next:
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		list_del_init(&block_group->bg_list);
+	}
+	btrfs_trans_release_chunk_metadata(trans);
+}
+
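+/*
+ * Create the block group for a freshly allocated chunk: build the in-memory
+ * descriptor, exclude the super stripes, add its free space, link it into
+ * its space_info and queue it on the transaction's new_bgs list so the
+ * on-disk item is inserted by btrfs_create_pending_block_groups().
+ */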
+int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+			   u64 type, u64 chunk_offset, u64 size)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *cache;
+	int ret;
+
+	btrfs_set_log_full_commit(trans);
+
+	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
+	if (!cache)
+		return -ENOMEM;
+
+	btrfs_set_block_group_used(&cache->item, bytes_used);
+	btrfs_set_block_group_chunk_objectid(&cache->item,
+					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+	btrfs_set_block_group_flags(&cache->item, type);
+
+	cache->flags = type;
+	cache->last_byte_to_unpin = (u64)-1;
+	cache->cached = BTRFS_CACHE_FINISHED;
+	cache->needs_free_space = 1;
+	ret = exclude_super_stripes(cache);
+	if (ret) {
+		/* We may have excluded something, so call this just in case */
+		btrfs_free_excluded_extents(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
+	add_new_free_space(cache, chunk_offset, chunk_offset + size);
+
+	btrfs_free_excluded_extents(cache);
+
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(cache)) {
+		u64 new_bytes_used = size - bytes_used;
+
+		bytes_used += new_bytes_used >> 1;
+		fragment_free_space(cache);
+	}
+#endif
+	/*
+	 * Ensure the corresponding space_info object is created and
+	 * assigned to our block group. We want our bg to be added to the rbtree
+	 * with its ->space_info set.
+	 */
+	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+	ASSERT(cache->space_info);
+
+	ret = btrfs_add_block_group_cache(fs_info, cache);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
+	/*
+	 * Now that our block group has its ->space_info set and is inserted in
+	 * the rbtree, update the space info's counters.
+	 */
+	trace_btrfs_add_block_group(fs_info, cache, 1);
+	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
+				cache->bytes_super, &cache->space_info);
+	btrfs_update_global_block_rsv(fs_info);
+
+	link_block_group(cache);
+
+	list_add_tail(&cache->bg_list, &trans->new_bgs);
+	trans->delayed_ref_updates++;
+	btrfs_update_delayed_refs_rsv(trans);
+
+	set_avail_alloc_bits(fs_info, type);
+	return 0;
+}
+
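+/*
+ * Map @flags to a profile that is achievable with the current number of
+ * writable devices, or to the restripe target if a balance is in progress.
+ */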
+static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 num_devices;
+	u64 stripped;
+
+	/*
+	 * If restripe for this chunk_type is on, pick the target profile and
+	 * return; otherwise do the usual balance.
+	 */
+	stripped = get_restripe_target(fs_info, flags);
+	if (stripped)
+		return extended_to_chunk(stripped);
+
+	num_devices = fs_info->fs_devices->rw_devices;
+
+	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
+		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
+
+	if (num_devices == 1) {
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* turn raid0 into single device chunks */
+		if (flags & BTRFS_BLOCK_GROUP_RAID0)
+			return stripped;
+
+		/* turn mirroring into duplication */
+		if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
+			     BTRFS_BLOCK_GROUP_RAID10))
+			return stripped | BTRFS_BLOCK_GROUP_DUP;
+	} else {
+		/* they already had raid on here, just return */
+		if (flags & stripped)
+			return flags;
+
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* switch duplicated blocks with raid1 */
+		if (flags & BTRFS_BLOCK_GROUP_DUP)
+			return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+		/* this is drive concat, leave it alone */
+	}
+
+	return flags;
+}
+
+int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_trans_handle *trans;
+	u64 alloc_flags;
+	int ret;
+
+again:
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	/*
+	 * we're not allowed to set block groups readonly after the dirty
+	 * block groups cache has started writing.  If it already started,
+	 * back off and let this transaction commit
+	 */
+	mutex_lock(&fs_info->ro_block_group_mutex);
+	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+		u64 transid = trans->transid;
+
+		mutex_unlock(&fs_info->ro_block_group_mutex);
+		btrfs_end_transaction(trans);
+
+		ret = btrfs_wait_for_commit(fs_info, transid);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+	/*
+	 * if we are changing raid levels, try to allocate a corresponding
+	 * block group with the new raid level.
+	 */
+	alloc_flags = update_block_group_flags(fs_info, cache->flags);
+	if (alloc_flags != cache->flags) {
+		ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+		/*
+		 * ENOSPC is allowed here, we may have enough space
+		 * already allocated at the new raid level to
+		 * carry on
+		 */
+		if (ret == -ENOSPC)
+			ret = 0;
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = inc_block_group_ro(cache, 0);
+	if (!ret)
+		goto out;
+	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
+	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+	if (ret < 0)
+		goto out;
+	ret = inc_block_group_ro(cache, 0);
+out:
+	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+		alloc_flags = update_block_group_flags(fs_info, cache->flags);
+		mutex_lock(&fs_info->chunk_mutex);
+		check_system_chunk(trans, alloc_flags);
+		mutex_unlock(&fs_info->chunk_mutex);
+	}
+	mutex_unlock(&fs_info->ro_block_group_mutex);
+
+	btrfs_end_transaction(trans);
+	return ret;
+}
+
+void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+
+	BUG_ON(!cache->ro);
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+	if (!--cache->ro) {
+		num_bytes = cache->key.offset - cache->reserved -
+			    cache->pinned - cache->bytes_super -
+			    btrfs_block_group_used(&cache->item);
+		sinfo->bytes_readonly -= num_bytes;
+		list_del_init(&cache->ro_list);
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+}
+
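+/*
+ * Write the updated block group item for @cache back to the extent tree;
+ * returns -ENOENT if the item does not exist there yet.
+ */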
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+				 struct btrfs_path *path,
+				 struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	int ret;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	unsigned long bi;
+	struct extent_buffer *leaf;
+
+	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto fail;
+	}
+
+	leaf = path->nodes[0];
+	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+	btrfs_mark_buffer_dirty(leaf);
+fail:
+	btrfs_release_path(path);
+	return ret;
+}
+
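+/*
+ * Prepare the v1 free space cache inode of a block group for writeout:
+ * create or truncate the inode and preallocate enough space for the cache,
+ * recording the outcome in ->disk_cache_state.
+ */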
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_path *path)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct inode *inode = NULL;
+	struct extent_changeset *data_reserved = NULL;
+	u64 alloc_hint = 0;
+	int dcs = BTRFS_DC_ERROR;
+	u64 num_pages = 0;
+	int retries = 0;
+	int ret = 0;
+
+	/*
+	 * If this block group is smaller than 100 megs, don't bother caching
+	 * the block group.
+	 */
+	if (block_group->key.offset < (100 * SZ_1M)) {
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+
+	if (trans->aborted)
+		return 0;
+again:
+	inode = lookup_free_space_inode(block_group, path);
+	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+		ret = PTR_ERR(inode);
+		btrfs_release_path(path);
+		goto out;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retries);
+		retries++;
+
+		if (block_group->ro)
+			goto out_free;
+
+		ret = create_free_space_inode(trans, block_group, path);
+		if (ret)
+			goto out_free;
+		goto again;
+	}
+
+	/*
+	 * We want to set the generation to 0, that way if anything goes wrong
+	 * from here on out we know not to trust this cache when we load up next
+	 * time.
+	 */
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		/*
+		 * So theoretically we could recover from this, simply set the
+		 * super cache generation to 0 so we know to invalidate the
+		 * cache, but then we'd have to keep track of the block groups
+		 * that fail this way so we know we _have_ to reset this cache
+		 * before the next commit or risk reading stale cache.  So to
+		 * limit our exposure to horrible edge cases let's just abort
+		 * the transaction; this only happens in really bad situations
+		 * anyway.
+		 */
+		btrfs_abort_transaction(trans, ret);
+		goto out_put;
+	}
+	WARN_ON(ret);
+
+	/* We've already set up this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_check_trunc_cache_free_space(fs_info,
+					&fs_info->global_block_rsv);
+		if (ret)
+			goto out_put;
+
+		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
+		if (ret)
+			goto out_put;
+	}
+
+	spin_lock(&block_group->lock);
+	if (block_group->cached != BTRFS_CACHE_FINISHED ||
+	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
+		/*
+		 * don't bother trying to write stuff out _if_
+		 * a) we're not cached,
+		 * b) we're mounted with the nospace_cache option,
+		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
+		 */
+		dcs = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		goto out_put;
+	}
+	spin_unlock(&block_group->lock);
+
+	/*
+	 * We hit an ENOSPC when setting up the cache in this transaction, just
+	 * skip doing the setup, we've already cleared the cache so we're safe.
+	 */
+	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+		ret = -ENOSPC;
+		goto out_put;
+	}
+
+	/*
+	 * Try to preallocate enough space based on how big the block group is.
+	 * Keep in mind this has to include any pinned space which could end up
+	 * taking up quite a bit since it's not folded into the other space
+	 * cache.
+	 */
+	num_pages = div_u64(block_group->key.offset, SZ_256M);
+	if (!num_pages)
+		num_pages = 1;
+
+	num_pages *= 16;
+	num_pages *= PAGE_SIZE;
+
+	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+					      num_pages, num_pages,
+					      &alloc_hint);
+	/*
+	 * Our cache requires contiguous chunks so that we don't modify a bunch
+	 * of metadata or split extents when writing the cache out, which means
+	 * we can enospc if we are heavily fragmented in addition to just normal
+	 * out of space conditions.  So if we hit this just skip setting up any
+	 * other block groups for this transaction, maybe we'll unpin enough
+	 * space the next time around.
+	 */
+	if (!ret)
+		dcs = BTRFS_DC_SETUP;
+	else if (ret == -ENOSPC)
+		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+
+out_put:
+	iput(inode);
+out_free:
+	btrfs_release_path(path);
+out:
+	spin_lock(&block_group->lock);
+	if (!ret && dcs == BTRFS_DC_SETUP)
+		block_group->cache_generation = trans->transid;
+	block_group->disk_cache_state = dcs;
+	spin_unlock(&block_group->lock);
+
+	extent_changeset_free(data_reserved);
+	return ret;
+}
+
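+/*
+ * Run cache_save_setup() for every dirty block group whose v1 space cache
+ * was invalidated (BTRFS_DC_CLEAR) so a fresh cache inode is prepared before
+ * writeout.
+ */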
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *cache, *tmp;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_path *path;
+
+	if (list_empty(&cur_trans->dirty_bgs) ||
+	    !btrfs_test_opt(fs_info, SPACE_CACHE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* Could add new block groups, use _safe just in case */
+	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+				 dirty_list) {
+		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+			cache_save_setup(cache, trans, path);
+	}
+
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * Transaction commit does final block group cache writeback during a critical
+ * section where nothing is allowed to change the FS.  This is required in
+ * order for the cache to actually match the block group, but can introduce a
+ * lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
+ * There's a chance we'll have to redo some of it if the block group changes
+ * again during the commit, but it greatly reduces the commit latency by
+ * getting rid of the easy block groups while we're still allowing others to
+ * join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path = NULL;
+	LIST_HEAD(dirty);
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
+	int loops = 0;
+
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	if (list_empty(&cur_trans->dirty_bgs)) {
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		return 0;
+	}
+	list_splice_init(&cur_trans->dirty_bgs, &dirty);
+	spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+	/* Make sure all the block groups on our dirty list actually exist */
+	btrfs_create_pending_block_groups(trans);
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
+
+	/*
+	 * cache_write_mutex is here only to save us from balance or automatic
+	 * removal of empty block groups deleting this block group while we are
+	 * writing out the cache
+	 */
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	while (!list_empty(&dirty)) {
+		bool drop_reserve = true;
+
+		cache = list_first_entry(&dirty,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+		/*
+		 * This can happen if something re-dirties a block group that
+		 * is already under IO.  Just wait for it to finish and then do
+		 * it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(trans, cache, path);
+			btrfs_put_block_group(cache);
+		}
+
+		/*
+		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
+		 * it should update the cache_state.  Don't delete until after
+		 * we wait.
+		 *
+		 * Since we're not running in the commit critical section
+		 * we need the dirty_bgs_lock to protect from update_block_group
+		 */
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+
+				/*
+				 * The cache_write_mutex is protecting the
+				 * io_list, also refer to the definition of
+				 * btrfs_transaction::io_bgs for more details
+				 */
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * If we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret) {
+			ret = write_one_cache_group(trans, path, cache);
+			/*
+			 * Our block group might still be attached to the list
+			 * of new block groups in the transaction handle of some
+			 * other task (struct btrfs_trans_handle->new_bgs). This
+			 * means its block group item isn't yet in the extent
+			 * tree. If this happens ignore the error, as we will
+			 * try again later in the critical section of the
+			 * transaction commit.
+			 */
+			if (ret == -ENOENT) {
+				ret = 0;
+				spin_lock(&cur_trans->dirty_bgs_lock);
+				if (list_empty(&cache->dirty_list)) {
+					list_add_tail(&cache->dirty_list,
+						      &cur_trans->dirty_bgs);
+					btrfs_get_block_group(cache);
+					drop_reserve = false;
+				}
+				spin_unlock(&cur_trans->dirty_bgs_lock);
+			} else if (ret) {
+				btrfs_abort_transaction(trans, ret);
+			}
+		}
+
+		/* If it's not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+		if (drop_reserve)
+			btrfs_delayed_refs_rsv_release(fs_info, 1);
+
+		if (ret)
+			break;
+
+		/*
+		 * Avoid blocking other tasks for too long. It might even save
+		 * us from writing caches for block groups that are going to be
+		 * removed.
+		 */
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+		mutex_lock(&trans->transaction->cache_write_mutex);
+	}
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
+	/*
+	 * Go through delayed refs for all the stuff we've just kicked off
+	 * and then loop back (just once)
+	 */
+	ret = btrfs_run_delayed_refs(trans, 0);
+	if (!ret && loops == 0) {
+		loops++;
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+		/*
+		 * dirty_bgs_lock protects us from concurrent block group
+		 * deletes too (not just cache_write_mutex).
+		 */
+		if (!list_empty(&dirty)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
+			goto again;
+		}
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+	} else if (ret < 0) {
+		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path;
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
+	 * delayed refs to make sure we have the best chance at doing this all
+	 * in one shot.
+	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	while (!list_empty(&cur_trans->dirty_bgs)) {
+		cache = list_first_entry(&cur_trans->dirty_bgs,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+
+		/*
+		 * This can happen if cache_save_setup re-dirties a block group
+		 * that is already under IO.  Just wait for it to finish and
+		 * then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(trans, cache, path);
+			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
+		}
+
+		/*
+		 * Don't remove from the dirty list until after we've waited on
+		 * any pending IO
+		 */
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (!ret)
+			ret = btrfs_run_delayed_refs(trans,
+						     (unsigned long) -1);
+
+		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * If we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret) {
+			ret = write_one_cache_group(trans, path, cache);
+			/*
+			 * One of the free space endio workers might have
+			 * created a new block group while updating a free space
+			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
+			 * and hasn't released its transaction handle yet, in
+			 * which case the new block group is still attached to
+			 * its transaction handle and its creation has not
+			 * finished yet (no block group item in the extent tree
+			 * yet, etc). If this is the case, wait for all free
+			 * space endio workers to finish and retry. This is a
+			 * very rare case so no need for a more efficient and
+			 * complex approach.
+			 */
+			if (ret == -ENOENT) {
+				wait_event(cur_trans->writer_wait,
+				   atomic_read(&cur_trans->num_writers) == 1);
+				ret = write_one_cache_group(trans, path, cache);
+			}
+			if (ret)
+				btrfs_abort_transaction(trans, ret);
+		}
+
+		/* If it's not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		spin_lock(&cur_trans->dirty_bgs_lock);
+	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
+
+	/*
+	 * Refer to the definition of io_bgs member for details why it's safe
+	 * to use it without any locking
+	 */
+	while (!list_empty(io)) {
+		cache = list_first_entry(io, struct btrfs_block_group_cache,
+					 io_list);
+		list_del_init(&cache->io_list);
+		btrfs_wait_cache_io(trans, cache, path);
+		btrfs_put_block_group(cache);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+			     u64 bytenr, u64 num_bytes, int alloc)
+{
+	struct btrfs_fs_info *info = trans->fs_info;
+	struct btrfs_block_group_cache *cache = NULL;
+	u64 total = num_bytes;
+	u64 old_val;
+	u64 byte_in_group;
+	int factor;
+	int ret = 0;
+
+	/* Block accounting for super block */
+	spin_lock(&info->delalloc_root_lock);
+	old_val = btrfs_super_bytes_used(info->super_copy);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_super_bytes_used(info->super_copy, old_val);
+	spin_unlock(&info->delalloc_root_lock);
+
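+	/* Walk the range, updating the usage of every block group it touches. */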
+	while (total) {
+		cache = btrfs_lookup_block_group(info, bytenr);
+		if (!cache) {
+			ret = -ENOENT;
+			break;
+		}
+		factor = btrfs_bg_type_to_factor(cache->flags);
+
+		/*
+		 * If this block group has free space cache written out, we
+		 * need to make sure to load it if we are removing space.  This
+		 * is because we need the unpinning stage to actually add the
+		 * space back to the block group, otherwise we will leak space.
+		 */
+		if (!alloc && cache->cached == BTRFS_CACHE_NO)
+			btrfs_cache_block_group(cache, 1);
+
+		byte_in_group = bytenr - cache->key.objectid;
+		WARN_ON(byte_in_group > cache->key.offset);
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
+
+		if (btrfs_test_opt(info, SPACE_CACHE) &&
+		    cache->disk_cache_state < BTRFS_DC_CLEAR)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
+		old_val = btrfs_block_group_used(&cache->item);
+		num_bytes = min(total, cache->key.offset - byte_in_group);
+		if (alloc) {
+			old_val += num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->reserved -= num_bytes;
+			cache->space_info->bytes_reserved -= num_bytes;
+			cache->space_info->bytes_used += num_bytes;
+			cache->space_info->disk_used += num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+		} else {
+			old_val -= num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->pinned += num_bytes;
+			btrfs_space_info_update_bytes_pinned(info,
+					cache->space_info, num_bytes);
+			cache->space_info->bytes_used -= num_bytes;
+			cache->space_info->disk_used -= num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+
+			percpu_counter_add_batch(
+					&cache->space_info->total_bytes_pinned,
+					num_bytes,
+					BTRFS_TOTAL_BYTES_PINNED_BATCH);
+			set_extent_dirty(info->pinned_extents,
+					 bytenr, bytenr + num_bytes - 1,
+					 GFP_NOFS | __GFP_NOFAIL);
+		}
+
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+		if (list_empty(&cache->dirty_list)) {
+			list_add_tail(&cache->dirty_list,
+				      &trans->transaction->dirty_bgs);
+			trans->delayed_ref_updates++;
+			btrfs_get_block_group(cache);
+		}
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+		/*
+		 * No longer have used bytes in this block group, queue it for
+		 * deletion. We do this after adding the block group to the
+		 * dirty list to avoid races between cleaner kthread and space
+		 * cache writeout.
+		 */
+		if (!alloc && old_val == 0)
+			btrfs_mark_bg_unused(cache);
+
+		btrfs_put_block_group(cache);
+		total -= num_bytes;
+		bytenr += num_bytes;
+	}
+
+	/* Modified block groups are accounted for in the delayed_refs_rsv. */
+	btrfs_update_delayed_refs_rsv(trans);
+	return ret;
+}
+
+/**
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
+ * @cache:	The cache we are manipulating
+ * @ram_bytes:  The number of bytes of file content; the same as @num_bytes
+ *              except for the compression path.
+ * @num_bytes:	The number of bytes in question
+ * @delalloc:   The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space. If this is a
+ * reservation and the block group has become read only we cannot make the
+ * reservation and return -EAGAIN, otherwise this function always succeeds.
+ */
+int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+			     u64 ram_bytes, u64 num_bytes, int delalloc)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+	int ret = 0;
+
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+	if (cache->ro) {
+		ret = -EAGAIN;
+	} else {
+		cache->reserved += num_bytes;
+		space_info->bytes_reserved += num_bytes;
+		trace_btrfs_space_reservation(cache->fs_info, "space_info",
+					      space_info->flags, num_bytes, 1);
+		btrfs_space_info_update_bytes_may_use(cache->fs_info,
+						      space_info, -ram_bytes);
+		if (delalloc)
+			cache->delalloc_bytes += num_bytes;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
+	return ret;
+}
+
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache:      The cache we are manipulating
+ * @num_bytes:  The number of bytes in question
+ * @delalloc:   The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk.  For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this to
+ * clear the reservation.
+ */
+void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+			       u64 num_bytes, int delalloc)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+	if (cache->ro)
+		space_info->bytes_readonly += num_bytes;
+	cache->reserved -= num_bytes;
+	space_info->bytes_reserved -= num_bytes;
+	space_info->max_extent_size = 0;
+
+	if (delalloc)
+		cache->delalloc_bytes -= num_bytes;
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
+}
+
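+/*
+ * Force the next metadata chunk allocation by flagging every metadata
+ * space_info with CHUNK_ALLOC_FORCE.
+ */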
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+			found->force_alloc = CHUNK_ALLOC_FORCE;
+	}
+	rcu_read_unlock();
+}
+
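+/*
+ * Decide whether a new chunk is needed for @sinfo under the @force policy:
+ * always for CHUNK_ALLOC_FORCE, when free space drops below ~1% of the
+ * filesystem for CHUNK_ALLOC_LIMITED, and when more than ~80% of the
+ * allocated bytes are used otherwise.
+ */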
+static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
+			      struct btrfs_space_info *sinfo, int force)
+{
+	u64 bytes_used = btrfs_space_info_used(sinfo, false);
+	u64 thresh;
+
+	if (force == CHUNK_ALLOC_FORCE)
+		return 1;
+
+	/*
+	 * in limited mode, we want to have some free space up to
+	 * about 1% of the FS size.
+	 */
+	if (force == CHUNK_ALLOC_LIMITED) {
+		thresh = btrfs_super_total_bytes(fs_info->super_copy);
+		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+
+		if (sinfo->total_bytes - bytes_used < thresh)
+			return 1;
+	}
+
+	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+		return 0;
+	return 1;
+}
+
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
+{
+	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+
+	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+}
+
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ *    - return 0 if it doesn't need to allocate a new chunk,
+ *    - return 1 if it successfully allocates a chunk,
+ *    - return errors including -ENOSPC otherwise.
+ */
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+		      enum btrfs_chunk_alloc_enum force)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_space_info *space_info;
+	bool wait_for_alloc = false;
+	bool should_alloc = false;
+	int ret = 0;
+
+	/* Don't re-enter if we're already allocating a chunk */
+	if (trans->allocating_chunk)
+		return -ENOSPC;
+
+	space_info = btrfs_find_space_info(fs_info, flags);
+	ASSERT(space_info);
+
+	do {
+		spin_lock(&space_info->lock);
+		if (force < space_info->force_alloc)
+			force = space_info->force_alloc;
+		should_alloc = should_alloc_chunk(fs_info, space_info, force);
+		if (space_info->full) {
+			/* No more free physical space */
+			if (should_alloc)
+				ret = -ENOSPC;
+			else
+				ret = 0;
+			spin_unlock(&space_info->lock);
+			return ret;
+		} else if (!should_alloc) {
+			spin_unlock(&space_info->lock);
+			return 0;
+		} else if (space_info->chunk_alloc) {
+			/*
+			 * Someone is already allocating, so we need to block
+			 * until this someone is finished and then loop to
+			 * recheck if we should continue with our allocation
+			 * attempt.
+			 */
+			wait_for_alloc = true;
+			spin_unlock(&space_info->lock);
+			mutex_lock(&fs_info->chunk_mutex);
+			mutex_unlock(&fs_info->chunk_mutex);
+		} else {
+			/* Proceed with allocation */
+			space_info->chunk_alloc = 1;
+			wait_for_alloc = false;
+			spin_unlock(&space_info->lock);
+		}
+
+		cond_resched();
+	} while (wait_for_alloc);
+
+	mutex_lock(&fs_info->chunk_mutex);
+	trans->allocating_chunk = true;
+
+	/*
+	 * If we have mixed data/metadata chunks we want to make sure we keep
+	 * allocating mixed chunks instead of individual chunks.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+	/*
+	 * if we're doing a data chunk, go ahead and make sure that
+	 * we keep a reasonable number of metadata chunks allocated in the
+	 * FS as well.
+	 */
+	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+		fs_info->data_chunk_allocations++;
+		if (!(fs_info->data_chunk_allocations %
+		      fs_info->metadata_ratio))
+			force_metadata_allocation(fs_info);
+	}
+
+	/*
+	 * Check if we have enough space in SYSTEM chunk because we may need
+	 * to update devices.
+	 */
+	check_system_chunk(trans, flags);
+
+	ret = btrfs_alloc_chunk(trans, flags);
+	trans->allocating_chunk = false;
+
+	spin_lock(&space_info->lock);
+	if (ret < 0) {
+		if (ret == -ENOSPC)
+			space_info->full = 1;
+		else
+			goto out;
+	} else {
+		ret = 1;
+		space_info->max_extent_size = 0;
+	}
+
+	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
+	space_info->chunk_alloc = 0;
+	spin_unlock(&space_info->lock);
+	mutex_unlock(&fs_info->chunk_mutex);
+	/*
+	 * When we allocate a new chunk we reserve space in the chunk block
+	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
+	 * add new nodes/leafs to it if we end up needing to do it when
+	 * inserting the chunk item and updating device items as part of the
+	 * second phase of chunk allocation, performed by
+	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+	 * large number of new block groups to create in our transaction
+	 * handle's new_bgs list to avoid exhausting the chunk block reserve
+	 * in extreme cases - like having a single transaction create many new
+	 * block groups when starting to write out the free space caches of all
+	 * the block groups that were made dirty during the lifetime of the
+	 * transaction.
+	 */
+	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
+		btrfs_create_pending_block_groups(trans);
+
+	return ret;
+}
+
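+/*
+ * Number of device items a chunk of @type needs to touch: the profile's
+ * devs_max, or all writable devices when the profile has no fixed maximum.
+ */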
+static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+{
+	u64 num_dev;
+
+	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
+	if (!num_dev)
+		num_dev = fs_info->fs_devices->rw_devices;
+
+	return num_dev;
+}
+
+/*
+ * Reserve space in the system space info necessary for allocating or removing
+ * a chunk: both need to update device items and add or remove a chunk item in
+ * the chunk btree.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_space_info *info;
+	u64 left;
+	u64 thresh;
+	int ret = 0;
+	u64 num_devs;
+
+	/*
+	 * Needed because we can end up allocating a system chunk and need an
+	 * atomic, race-free space reservation in the chunk block reserve.
+	 */
+	lockdep_assert_held(&fs_info->chunk_mutex);
+
+	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	spin_lock(&info->lock);
+	left = info->total_bytes - btrfs_space_info_used(info, true);
+	spin_unlock(&info->lock);
+
+	num_devs = get_profile_num_devs(fs_info, type);
+
+	/* num_devs device items to update and 1 chunk item to add or remove */
+	thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
+		btrfs_calc_insert_metadata_size(fs_info, 1);
+
+	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
+			   left, thresh, type);
+		btrfs_dump_space_info(fs_info, info, 0, 0);
+	}
+
+	if (left < thresh) {
+		u64 flags = btrfs_system_alloc_profile(fs_info);
+
+		/*
+		 * Ignore failure to create system chunk. We might end up not
+		 * needing it, as we might not need to COW all nodes/leafs from
+		 * the paths we visit in the chunk tree (they were already COWed
+		 * or created in the current transaction for example).
+		 */
+		ret = btrfs_alloc_chunk(trans, flags);
+	}
+
+	if (!ret) {
+		ret = btrfs_block_rsv_add(fs_info->chunk_root,
+					  &fs_info->chunk_block_rsv,
+					  thresh, BTRFS_RESERVE_NO_FLUSH);
+		if (!ret)
+			trans->chunk_bytes_reserved += thresh;
+	}
+}
+
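+/*
+ * Called on unmount to drop the reference (iref) each block group may still
+ * hold on its free space cache inode.
+ */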
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 last = 0;
+
+	while (1) {
+		struct inode *inode;
+
+		block_group = btrfs_lookup_first_block_group(info, last);
+		while (block_group) {
+			btrfs_wait_block_group_cache_done(block_group);
+			spin_lock(&block_group->lock);
+			if (block_group->iref)
+				break;
+			spin_unlock(&block_group->lock);
+			block_group = btrfs_next_block_group(block_group);
+		}
+		if (!block_group) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		inode = block_group->inode;
+		block_group->iref = 0;
+		block_group->inode = NULL;
+		spin_unlock(&block_group->lock);
+		ASSERT(block_group->io_ctl.inode == NULL);
+		iput(inode);
+		last = block_group->key.objectid + block_group->key.offset;
+		btrfs_put_block_group(block_group);
+	}
+}
+
+/*
+ * Must be called only after stopping all workers, since we could have block
+ * group caching kthreads running, and therefore they could race with us if we
+ * freed the block groups before stopping them.
+ */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_caching_control *caching_ctl;
+	struct rb_node *n;
+
+	down_write(&info->commit_root_sem);
+	while (!list_empty(&info->caching_block_groups)) {
+		caching_ctl = list_entry(info->caching_block_groups.next,
+					 struct btrfs_caching_control, list);
+		list_del(&caching_ctl->list);
+		btrfs_put_caching_control(caching_ctl);
+	}
+	up_write(&info->commit_root_sem);
+
+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->unused_bgs)) {
+		block_group = list_first_entry(&info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
+	spin_lock(&info->block_group_cache_lock);
+	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+		block_group = rb_entry(n, struct btrfs_block_group_cache,
+				       cache_node);
+		rb_erase(&block_group->cache_node,
+			 &info->block_group_cache_tree);
+		RB_CLEAR_NODE(&block_group->cache_node);
+		spin_unlock(&info->block_group_cache_lock);
+
+		down_write(&block_group->space_info->groups_sem);
+		list_del(&block_group->list);
+		up_write(&block_group->space_info->groups_sem);
+
+		/*
+		 * We haven't cached this block group, which means we could
+		 * possibly have excluded extents on this block group.
+		 */
+		if (block_group->cached == BTRFS_CACHE_NO ||
+		    block_group->cached == BTRFS_CACHE_ERROR)
+			btrfs_free_excluded_extents(block_group);
+
+		btrfs_remove_free_space_cache(block_group);
+		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
+		ASSERT(list_empty(&block_group->dirty_list));
+		ASSERT(list_empty(&block_group->io_list));
+		ASSERT(list_empty(&block_group->bg_list));
+		ASSERT(atomic_read(&block_group->count) == 1);
+		btrfs_put_block_group(block_group);
+
+		spin_lock(&info->block_group_cache_lock);
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	/*
+	 * Now that all the block groups are freed, go through and free all the
+	 * space_info structs.  This is only called during the final stages of
+	 * unmount, and so we know nobody is using them.  We call
+	 * synchronize_rcu() once before we start, just to be on the safe side.
+	 */
+	synchronize_rcu();
+
+	btrfs_release_global_block_rsv(info);
+
+	while (!list_empty(&info->space_info)) {
+		space_info = list_entry(info->space_info.next,
+					struct btrfs_space_info,
+					list);
+
+		/*
+		 * Do not hide this behind enospc_debug, this is actually
+		 * important and indicates a real bug if this happens.
+		 */
+		if (WARN_ON(space_info->bytes_pinned > 0 ||
+			    space_info->bytes_reserved > 0 ||
+			    space_info->bytes_may_use > 0))
+			btrfs_dump_space_info(info, space_info, 0, 0);
+		list_del(&space_info->list);
+		btrfs_sysfs_remove_space_info(space_info);
+	}
+	return 0;
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
new file mode 100644
index 0000000..c391800
--- /dev/null
+++ b/fs/btrfs/block-group.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_GROUP_H
+#define BTRFS_BLOCK_GROUP_H
+
+#include "free-space-cache.h"
+
+enum btrfs_disk_cache_state {
+	BTRFS_DC_WRITTEN,
+	BTRFS_DC_ERROR,
+	BTRFS_DC_CLEAR,
+	BTRFS_DC_SETUP,
+};
+
+/*
+ * Control flags for do_chunk_alloc's force field.
+ *
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few
+ * chunks already allocated.  This is used as part of the clustering code to
+ * help make sure we have a good pool of storage to cluster in, without filling
+ * the FS with empty chunks.
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one.
+ */
+enum btrfs_chunk_alloc_enum {
+	CHUNK_ALLOC_NO_FORCE,
+	CHUNK_ALLOC_LIMITED,
+	CHUNK_ALLOC_FORCE,
+};
+
+struct btrfs_caching_control {
+	struct list_head list;
+	struct mutex mutex;
+	wait_queue_head_t wait;
+	struct btrfs_work work;
+	struct btrfs_block_group_cache *block_group;
+	u64 progress;
+	refcount_t count;
+};
+
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP SZ_2M
+
+struct btrfs_block_group_cache {
+	struct btrfs_key key;
+	struct btrfs_block_group_item item;
+	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
+	spinlock_t lock;
+	u64 pinned;
+	u64 reserved;
+	u64 delalloc_bytes;
+	u64 bytes_super;
+	u64 flags;
+	u64 cache_generation;
+
+	/*
+	 * If the free space extent count exceeds this number, convert the block
+	 * group to bitmaps.
+	 */
+	u32 bitmap_high_thresh;
+
+	/*
+	 * If the free space extent count drops below this number, convert the
+	 * block group back to extents.
+	 */
+	u32 bitmap_low_thresh;
+
+	/*
+	 * It is only used for the delayed data space allocation because
+	 * only the data space allocation and the related metadata update
+	 * can be done across transactions.
+	 */
+	struct rw_semaphore data_rwsem;
+
+	/* For raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
+	unsigned int ro;
+	unsigned int iref:1;
+	unsigned int has_caching_ctl:1;
+	unsigned int removed:1;
+
+	int disk_cache_state;
+
+	/* Cache tracking stuff */
+	int cached;
+	struct btrfs_caching_control *caching_ctl;
+	u64 last_byte_to_unpin;
+
+	struct btrfs_space_info *space_info;
+
+	/* Free space cache stuff */
+	struct btrfs_free_space_ctl *free_space_ctl;
+
+	/* Block group cache stuff */
+	struct rb_node cache_node;
+
+	/* For block groups in the same raid type */
+	struct list_head list;
+
+	/* Usage count */
+	atomic_t count;
+
+	/*
+	 * List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 * Today it will only have one thing on it, but that may change.
+	struct list_head cluster_list;
+
+	/* For delayed block group creation or deletion of empty block groups */
+	struct list_head bg_list;
+
+	/* For read-only block groups */
+	struct list_head ro_list;
+
+	atomic_t trimming;
+
+	/* For dirty block groups */
+	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
+
+	/*
+	 * Incremented when doing extent allocations and holding a read lock
+	 * on the space_info's groups_sem semaphore.
+	 * Decremented when an ordered extent that represents an IO against this
+	 * block group's range is created (after it's added to its inode's
+	 * root's list of ordered extents) or immediately after the allocation
+	 * if it's a metadata extent or fallocate extent (for these cases we
+	 * don't create ordered extents).
+	 */
+	atomic_t reservations;
+
+	/*
+	 * Incremented while holding the spinlock *lock* by a task checking if
+	 * it can perform a nocow write (incremented if the value for the *ro*
+	 * field is 0). Decremented by such tasks once they create an ordered
+	 * extent or before that if some error happens before reaching that step.
+	 * This is to prevent races between block group relocation and nocow
+	 * writes through direct IO.
+	 */
+	atomic_t nocow_writers;
+
+	/* Lock for free space tree operations. */
+	struct mutex free_space_lock;
+
+	/*
+	 * Does the block group need to be added to the free space tree?
+	 * Protected by free_space_lock.
+	 */
+	int needs_free_space;
+
+	/* Record locked full stripes for RAID5/6 block group */
+	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int btrfs_should_fragment_free_space(
+		struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
+struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+		struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+		struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group_cache *btrfs_next_block_group(
+		struct btrfs_block_group_cache *cache);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+					const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+				           u64 num_bytes);
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache);
+int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
+			    int load_cache_only);
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
+struct btrfs_caching_control *btrfs_get_caching_control(
+		struct btrfs_block_group_cache *cache);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+		       u64 start, u64 end);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+				struct btrfs_fs_info *fs_info,
+				const u64 chunk_offset);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     u64 group_start, struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
+int btrfs_read_block_groups(struct btrfs_fs_info *info);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+			   u64 type, u64 chunk_offset, u64 size);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
+void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+			     u64 bytenr, u64 num_bytes, int alloc);
+int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+			     u64 ram_bytes, u64 num_bytes, int delalloc);
+void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+			       u64 num_bytes, int delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+		      enum btrfs_chunk_alloc_enum force);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
+void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+
+static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
+}
+
+static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+}
+
+static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+	return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+}
+
+static inline int btrfs_block_group_cache_done(
+		struct btrfs_block_group_cache *cache)
+{
+	smp_mb();
+	return cache->cached == BTRFS_CACHE_FINISHED ||
+		cache->cached == BTRFS_CACHE_ERROR;
+}
+
+#endif /* BTRFS_BLOCK_GROUP_H */
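
A small usage sketch for the lookup API above, assuming
btrfs_lookup_block_group() returns the group with its usage count already
elevated (the example function name is hypothetical):

	static u64 example_block_group_flags(struct btrfs_fs_info *fs_info,
					     u64 bytenr)
	{
		struct btrfs_block_group_cache *cache;
		u64 flags = 0;

		cache = btrfs_lookup_block_group(fs_info, bytenr);
		if (cache) {
			flags = cache->flags;
			btrfs_put_block_group(cache);	/* drop lookup ref */
		}
		return flags;
	}
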
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 0000000..d07bd41
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "block-rsv.h"
+#include "space-info.h"
+#include "transaction.h"
+
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_rsv *block_rsv,
+				    struct btrfs_block_rsv *dest, u64 num_bytes,
+				    u64 *qgroup_to_release_ret)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+	u64 qgroup_to_release = 0;
+	u64 ret;
+
+	spin_lock(&block_rsv->lock);
+	if (num_bytes == (u64)-1) {
+		num_bytes = block_rsv->size;
+		qgroup_to_release = block_rsv->qgroup_rsv_size;
+	}
+	block_rsv->size -= num_bytes;
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	} else {
+		num_bytes = 0;
+	}
+	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+				    block_rsv->qgroup_rsv_size;
+		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+	} else {
+		qgroup_to_release = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+
+	ret = num_bytes;
+	if (num_bytes > 0) {
+		if (dest) {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes)
+			btrfs_space_info_free_bytes_may_use(fs_info,
+							    space_info,
+							    num_bytes);
+	}
+	if (qgroup_to_release_ret)
+		*qgroup_to_release_ret = qgroup_to_release;
+	return ret;
+}
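
A worked example of the release path above: with size = 10MiB, reserved =
12MiB and num_bytes = 1MiB, size drops to 9MiB, the excess reserved - size =
3MiB is trimmed from reserved, and those 3MiB are first offered to @dest (up
to dest->size - dest->reserved) with any remainder returned to the space_info
as freed may_use bytes.
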
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+			    struct btrfs_block_rsv *dst, u64 num_bytes,
+			    bool update_size)
+{
+	int ret;
+
+	ret = btrfs_block_rsv_use_bytes(src, num_bytes);
+	if (ret)
+		return ret;
+
+	btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
+	return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+{
+	memset(rsv, 0, sizeof(*rsv));
+	spin_lock_init(&rsv->lock);
+	rsv->type = type;
+}
+
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type)
+{
+	btrfs_init_block_rsv(rsv, type);
+	rsv->space_info = btrfs_find_space_info(fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+					      unsigned short type)
+{
+	struct btrfs_block_rsv *block_rsv;
+
+	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+	if (!block_rsv)
+		return NULL;
+
+	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
+	return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_rsv *rsv)
+{
+	if (!rsv)
+		return;
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+	kfree(rsv);
+}
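
A minimal lifecycle sketch for the helpers above, assuming the
BTRFS_RESERVE_FLUSH_ALL mode of btrfs_reserve_flush_enum (the example
function is hypothetical):

	static int example_temp_rsv(struct btrfs_root *root, u64 bytes)
	{
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct btrfs_block_rsv *rsv;
		int ret;

		rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
		if (!rsv)
			return -ENOMEM;

		ret = btrfs_block_rsv_add(root, rsv, bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (!ret) {
			/* ... consume the reservation ... */
		}

		/* releases anything still reserved, then frees the struct */
		btrfs_free_block_rsv(fs_info, rsv);
		return ret;
	}
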
+
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret)
+		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
+
+	return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
+
+	return ret;
+}
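
For reference, assuming div_factor() is the usual btrfs misc.h helper that
computes num * factor / 10, btrfs_block_rsv_check(rsv, 5) returns 0 only when
at least half of rsv->size is actually reserved (e.g. size = 64MiB requires
reserved >= 32MiB).
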
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	else
+		num_bytes -= block_rsv->reserved;
+	spin_unlock(&block_rsv->lock);
+
+	if (!ret)
+		return 0;
+
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret) {
+		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
+		return 0;
+	}
+
+	return ret;
+}
+
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_rsv *block_rsv,
+			      u64 num_bytes, u64 *qgroup_to_release)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *target = NULL;
+
+	/*
+	 * If we are the delayed_rsv, then push to the global rsv; otherwise
+	 * dump into the delayed rsv if it is not full.
+	 */
+	if (block_rsv == delayed_rsv)
+		target = global_rsv;
+	else if (block_rsv != global_rsv && !delayed_rsv->full)
+		target = delayed_rsv;
+
+	if (target && block_rsv->space_info != target->space_info)
+		target = NULL;
+
+	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+				       qgroup_to_release);
+}
+
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
+{
+	int ret = -ENOSPC;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved >= num_bytes) {
+		block_rsv->reserved -= num_bytes;
+		if (block_rsv->reserved < block_rsv->size)
+			block_rsv->full = 0;
+		ret = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+	return ret;
+}
+
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes, bool update_size)
+{
+	spin_lock(&block_rsv->lock);
+	block_rsv->reserved += num_bytes;
+	if (update_size)
+		block_rsv->size += num_bytes;
+	else if (block_rsv->reserved >= block_rsv->size)
+		block_rsv->full = 1;
+	spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != dest->space_info)
+		return -ENOSPC;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, min_factor);
+	if (global_rsv->reserved < min_bytes + num_bytes) {
+		spin_unlock(&global_rsv->lock);
+		return -ENOSPC;
+	}
+	global_rsv->reserved -= num_bytes;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	btrfs_block_rsv_add_bytes(dest, num_bytes, true);
+	return 0;
+}
+
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	struct btrfs_space_info *sinfo = block_rsv->space_info;
+	u64 num_bytes;
+	unsigned min_items;
+
+	/*
+	 * The global block rsv is based on the size of the extent tree, the
+	 * checksum tree and the root tree.  If the fs is empty we want to set
+	 * it to a minimal amount for safety.
+	 */
+	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+		btrfs_root_used(&fs_info->csum_root->root_item) +
+		btrfs_root_used(&fs_info->tree_root->root_item);
+
+	/*
+	 * We at a minimum are going to modify the csum root, the tree root, and
+	 * the extent root.
+	 */
+	min_items = 3;
+
+	/*
+	 * But we also want to reserve enough space so we can do the fallback
+	 * global reserve for an unlink, which is an additional 5 items (see the
+	 * comment in __unlink_start_trans for what we're modifying).
+	 *
+	 * We also need space for the delayed ref updates from the unlink, so
+	 * it's 10 items in total: 5 for the actual operation and 5 for the
+	 * delayed ref updates.
+	 */
+	min_items += 10;
+
+	num_bytes = max_t(u64, num_bytes,
+			  btrfs_calc_insert_metadata_size(fs_info, min_items));
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
+
+	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
+
+	if (block_rsv->reserved < block_rsv->size) {
+		num_bytes = block_rsv->size - block_rsv->reserved;
+		block_rsv->reserved += num_bytes;
+		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+						      num_bytes);
+	} else if (block_rsv->reserved > block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+						      -num_bytes);
+		block_rsv->reserved = block_rsv->size;
+		btrfs_try_granting_tickets(fs_info, sinfo);
+	}
+
+	if (block_rsv->reserved == block_rsv->size)
+		block_rsv->full = 1;
+	else
+		block_rsv->full = 0;
+
+	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
+}
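
A rough worked example, assuming btrfs_calc_insert_metadata_size() reserves
nodesize * 2 * BTRFS_MAX_LEVEL bytes per item: with 16KiB nodes and the 13
items above, the floor is 16KiB * 16 * 13 ≈ 3.25MiB, and the reserve then
grows with the sizes of the three trees up to the SZ_512M cap.
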
+
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *space_info;
+
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	fs_info->chunk_block_rsv.space_info = space_info;
+
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	fs_info->global_block_rsv.space_info = space_info;
+	fs_info->trans_block_rsv.space_info = space_info;
+	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->delayed_block_rsv.space_info = space_info;
+	fs_info->delayed_refs_rsv.space_info = space_info;
+
+	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
+	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+	btrfs_update_global_block_rsv(fs_info);
+}
+
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+	WARN_ON(fs_info->trans_block_rsv.size > 0);
+	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+	WARN_ON(fs_info->chunk_block_rsv.size > 0);
+	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+					const struct btrfs_trans_handle *trans,
+					const struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *block_rsv = NULL;
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    (root == fs_info->csum_root && trans->adding_csums) ||
+	    (root == fs_info->uuid_root))
+		block_rsv = trans->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = root->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = &fs_info->empty_block_rsv;
+
+	return block_rsv;
+}
+
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u32 blocksize)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	int ret;
+	bool global_updated = false;
+
+	block_rsv = get_block_rsv(trans, root);
+
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
+	ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
+		return block_rsv;
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		btrfs_update_global_block_rsv(fs_info);
+		goto again;
+	}
+
+	/*
+	 * The global reserve still exists to save us from ourselves, so don't
+	 * warn_on if we are short on our delayed refs reserve.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL * 10,
+				/*DEFAULT_RATELIMIT_BURST*/ 1);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG
+				"BTRFS: block rsv returned %d\n", ret);
+	}
+try_reserve:
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+					   BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		return block_rsv;
+	/*
+	 * If we couldn't reserve metadata bytes, try to use some from the
+	 * global reserve, provided this reserve's space type matches the
+	 * global reserve's.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
+	}
+	return ERR_PTR(ret);
+}
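
A hedged sketch of the expected caller pattern, mirroring how tree block
allocation pairs this with btrfs_unuse_block_rsv() on its unwind path
(do_allocation() is a placeholder):

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = do_allocation(trans, root, blocksize);	/* consumes blocksize */
	if (ret)
		btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
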
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 0000000..d1428bb
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_RSV_H
+#define BTRFS_BLOCK_RSV_H
+
+struct btrfs_trans_handle;
+enum btrfs_reserve_flush_enum;
+
+/*
+ * Types of block reserves
+ */
+enum {
+	BTRFS_BLOCK_RSV_GLOBAL,
+	BTRFS_BLOCK_RSV_DELALLOC,
+	BTRFS_BLOCK_RSV_TRANS,
+	BTRFS_BLOCK_RSV_CHUNK,
+	BTRFS_BLOCK_RSV_DELOPS,
+	BTRFS_BLOCK_RSV_DELREFS,
+	BTRFS_BLOCK_RSV_EMPTY,
+	BTRFS_BLOCK_RSV_TEMP,
+};
+
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	struct btrfs_space_info *space_info;
+	spinlock_t lock;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
+
+	/*
+	 * Qgroup equivalent of @size and @reserved
+	 *
+	 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+	 * about things like csum size or how many tree blocks it will need to
+	 * reserve.
+	 *
+	 * Qgroup cares more about the net change of the extent usage.
+	 *
+	 * So for one newly inserted file extent, even if it causes a leaf
+	 * split and a level increase in the worst case, one nodesize per file
+	 * extent is already more than enough.
+	 *
+	 * In short, qgroup_rsv_size/qgroup_rsv_reserved is the upper limit of
+	 * the qgroup metadata reservation that may be needed.
+	 */
+	u64 qgroup_rsv_size;
+	u64 qgroup_rsv_reserved;
+};
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+					      unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type);
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+			    bool update_size);
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes, bool update_size);
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_rsv *block_rsv,
+			      u64 num_bytes, u64 *qgroup_to_release);
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u32 blocksize);
+
+static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+					   struct btrfs_block_rsv *block_rsv,
+					   u64 num_bytes)
+{
+	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
+static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
+					 struct btrfs_block_rsv *block_rsv,
+					 u32 blocksize)
+{
+	btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
+	btrfs_block_rsv_release(fs_info, block_rsv, 0);
+}
+
+#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac5..f853835 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,7 +20,7 @@
  * new data the application may have written before commit.
  */
 enum {
-	BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+	BTRFS_INODE_ORDERED_DATA_CLOSE,
 	BTRFS_INODE_DUMMY,
 	BTRFS_INODE_IN_DEFRAG,
 	BTRFS_INODE_HAS_ASYNC_EXTENT,
@@ -29,6 +29,7 @@
 	BTRFS_INODE_IN_DELALLOC_LIST,
 	BTRFS_INODE_READDIO_NEED_LOCK,
 	BTRFS_INODE_HAS_PROPS,
+	BTRFS_INODE_SNAPSHOT_FLUSH,
 };
 
 /* in memory btrfs inode */
@@ -196,8 +197,6 @@
 	struct inode vfs_inode;
 };
 
-extern unsigned char btrfs_filetype_table[];
-
 static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -206,7 +205,7 @@
 static inline unsigned long btrfs_inode_hash(u64 objectid,
 					     const struct btrfs_root *root)
 {
-	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+	u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
 
 #if BITS_PER_LONG == 32
 	h = (h >> 32) ^ (h & 0xffffffff);
@@ -253,6 +252,11 @@
 	return false;
 }
 
+static inline bool is_data_inode(struct inode *inode)
+{
+	return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+}
+
 static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
 						 int mod)
 {
@@ -333,22 +337,34 @@
 	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
 }
 
+/* Variable-length array of bytes, printed in hexadecimal format (0x1234) */
+#define CSUM_FMT				"0x%*phN"
+#define CSUM_FMT_VALUE(size, bytes)		size, bytes
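
A quick illustration of how the format expands; %*phN is the kernel's
variable-width hex byte-array specifier, so for a 4-byte crc32c checksum:

	u8 csum[BTRFS_CSUM_SIZE];

	/* expands to pr_debug("csum 0x%*phN\n", 4, csum), printing e.g.
	 * "csum 0x1a2b3c4d" */
	pr_debug("csum " CSUM_FMT "\n", CSUM_FMT_VALUE(4, csum));
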
+
 static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
-		u64 logical_start, u32 csum, u32 csum_expected, int mirror_num)
+		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 {
 	struct btrfs_root *root = inode->root;
+	struct btrfs_super_block *sb = root->fs_info->super_copy;
+	const u16 csum_size = btrfs_super_csum_size(sb);
 
 	/* Output minus objectid, which is more meaningful */
-	if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
+	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
 		btrfs_warn_rl(root->fs_info,
-	"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
-			root->objectid, btrfs_ino(inode),
-			logical_start, csum, csum_expected, mirror_num);
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+			root->root_key.objectid, btrfs_ino(inode),
+			logical_start,
+			CSUM_FMT_VALUE(csum_size, csum),
+			CSUM_FMT_VALUE(csum_size, csum_expected),
+			mirror_num);
 	else
 		btrfs_warn_rl(root->fs_info,
-	"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
-			root->objectid, btrfs_ino(inode),
-			logical_start, csum, csum_expected, mirror_num);
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+			root->root_key.objectid, btrfs_ino(inode),
+			logical_start,
+			CSUM_FMT_VALUE(csum_size, csum),
+			CSUM_FMT_VALUE(csum_size, csum_expected),
+			mirror_num);
 }
 
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c..0b52ab4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -83,7 +83,7 @@
 #include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/string.h>
-#include <linux/crc32c.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -940,7 +940,7 @@
 	kfree(sf);
 }
 
-static int btrfsic_process_metablock(
+static noinline_for_stack int btrfsic_process_metablock(
 		struct btrfsic_state *state,
 		struct btrfsic_block *const first_block,
 		struct btrfsic_block_data_ctx *const first_block_ctx,
@@ -1202,24 +1202,24 @@
 	void *dstv, u32 offset, size_t len)
 {
 	size_t cur;
-	size_t offset_in_page;
+	size_t pgoff;
 	char *kaddr;
 	char *dst = (char *)dstv;
-	size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(block_ctx->start);
 	unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
 
 	WARN_ON(offset + len > block_ctx->len);
-	offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
+	pgoff = offset_in_page(start_offset + offset);
 
 	while (len > 0) {
-		cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+		cur = min(len, ((size_t)PAGE_SIZE - pgoff));
 		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
 		kaddr = block_ctx->datav[i];
-		memcpy(dst, kaddr + offset_in_page, cur);
+		memcpy(dst, kaddr + pgoff, cur);
 
 		dst += cur;
 		len -= cur;
-		offset_in_page = 0;
+		pgoff = 0;
 		i++;
 	}
 }
@@ -1594,13 +1594,14 @@
 {
 	unsigned int num_pages;
 	unsigned int i;
+	size_t size;
 	u64 dev_bytenr;
 	int ret;
 
 	BUG_ON(block_ctx->datav);
 	BUG_ON(block_ctx->pagev);
 	BUG_ON(block_ctx->mem_to_free);
-	if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
+	if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
 		pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
 		       block_ctx->dev_bytenr);
 		return -1;
@@ -1608,9 +1609,8 @@
 
 	num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
 		    PAGE_SHIFT;
-	block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
-						sizeof(*block_ctx->pagev),
-					 num_pages, GFP_NOFS);
+	size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+	block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
 	if (!block_ctx->mem_to_free)
 		return -ENOMEM;
 	block_ctx->datav = block_ctx->mem_to_free;
@@ -1706,13 +1706,14 @@
  * Test whether the disk block contains a tree block (leaf or node)
  * (note that this test fails for the super block)
  */
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-				     char **datav, unsigned int num_pages)
+static noinline_for_stack int btrfsic_test_for_metadata(
+		struct btrfsic_state *state,
+		char **datav, unsigned int num_pages)
 {
 	struct btrfs_fs_info *fs_info = state->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	struct btrfs_header *h;
 	u8 csum[BTRFS_CSUM_SIZE];
-	u32 crc = ~(u32)0;
 	unsigned int i;
 
 	if (num_pages * PAGE_SIZE < state->metablock_size)
@@ -1720,17 +1721,20 @@
 	num_pages = state->metablock_size >> PAGE_SHIFT;
 	h = (struct btrfs_header *)datav[0];
 
-	if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE))
+	if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
 		return 1;
 
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+
 	for (i = 0; i < num_pages; i++) {
 		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
 		size_t sublen = i ? PAGE_SIZE :
 				    (PAGE_SIZE - BTRFS_CSUM_SIZE);
 
-		crc = crc32c(crc, data, sublen);
+		crypto_shash_update(shash, data, sublen);
 	}
-	btrfs_csum_final(crc, csum);
+	crypto_shash_final(shash, csum);
 	if (memcmp(csum, h->csum, state->csum_size))
 		return 1;
 
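The hunk above follows the generic crypto_shash pattern; a minimal standalone
sketch, assuming a crc32c transform allocated once (error handling elided,
data/len are placeholders):

	struct crypto_shash *tfm = crypto_alloc_shash("crc32c", 0, 0);
	SHASH_DESC_ON_STACK(shash, tfm);
	u8 out[4];

	shash->tfm = tfm;
	crypto_shash_init(shash);
	crypto_shash_update(shash, data, len);
	crypto_shash_final(shash, out);
	crypto_free_shash(tfm);
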
@@ -1778,7 +1782,7 @@
 				return;
 			}
 			is_metadata = 1;
-			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
+			BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
 			processed_len = BTRFS_SUPER_INFO_SIZE;
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -2327,7 +2331,7 @@
 		 * write operations. Therefore it keeps the linkage
 		 * information for a block until a block is
 		 * rewritten. This can temporarily cause incorrect
-		 * and even circular linkage informations. This
+		 * and even circular linkage information. This
 		 * causes no harm unless such blocks are referenced
 		 * by the most recent super block.
 		 */
@@ -2892,12 +2896,12 @@
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) {
+	if (!PAGE_ALIGNED(fs_info->nodesize)) {
 		pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
 		       fs_info->nodesize, PAGE_SIZE);
 		return -1;
 	}
-	if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) {
+	if (!PAGE_ALIGNED(fs_info->sectorsize)) {
 		pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
 		       fs_info->sectorsize, PAGE_SIZE);
 		return -1;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa665..b05b361 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -17,6 +17,8 @@
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
 #include <linux/log2.h>
+#include <crypto/hash.h>
+#include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -42,6 +44,22 @@
 	return NULL;
 }
 
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+	int i;
+
+	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+		size_t comp_len = strlen(btrfs_compress_types[i]);
+
+		if (len < comp_len)
+			continue;
+
+		if (!strncmp(btrfs_compress_types[i], str, comp_len))
+			return true;
+	}
+	return false;
+}
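
Because the comparison above is a prefix match, a mount option value that
carries a level suffix validates too, for example:

	btrfs_compress_is_valid_type("zlib:9", 6);	/* true: "zlib" prefix */
	btrfs_compress_is_valid_type("gzip", 4);	/* false: unknown type */
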
+
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -57,32 +75,37 @@
 				 struct compressed_bio *cb,
 				 u64 disk_start)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	int ret;
 	struct page *page;
 	unsigned long i;
 	char *kaddr;
-	u32 csum;
-	u32 *cb_sum = &cb->sums;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u8 *cb_sum = cb->sums;
 
 	if (inode->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
+	shash->tfm = fs_info->csum_shash;
+
 	for (i = 0; i < cb->nr_pages; i++) {
 		page = cb->compressed_pages[i];
-		csum = ~(u32)0;
 
+		crypto_shash_init(shash);
 		kaddr = kmap_atomic(page);
-		csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
-		btrfs_csum_final(csum, (u8 *)&csum);
+		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 		kunmap_atomic(kaddr);
+		crypto_shash_final(shash, (u8 *)&csum);
 
-		if (csum != *cb_sum) {
-			btrfs_print_data_csum_error(inode, disk_start, csum,
-					*cb_sum, cb->mirror_num);
+		if (memcmp(&csum, cb_sum, csum_size)) {
+			btrfs_print_data_csum_error(inode, disk_start,
+					csum, cb_sum, cb->mirror_num);
 			ret = -EIO;
 			goto fail;
 		}
-		cb_sum++;
+		cb_sum += csum_size;
 
 	}
 	ret = 0;
@@ -160,15 +183,15 @@
 	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
 	} else {
-		int i;
 		struct bio_vec *bvec;
+		struct bvec_iter_all iter_all;
 
 		/*
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
 		ASSERT(!bio_flagged(bio, BIO_CLONED));
-		bio_for_each_segment_all(bvec, cb->orig_bio, i)
+		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
 			SetPageChecked(bvec->bv_page);
 
 		bio_endio(cb->orig_bio);
@@ -229,7 +252,6 @@
  */
 static void end_compressed_bio_write(struct bio *bio)
 {
-	struct extent_io_tree *tree;
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
@@ -248,14 +270,10 @@
 	 * call back into the FS and do all the end_io operations
 	 */
 	inode = cb->inode;
-	tree = &BTRFS_I(inode)->io_tree;
 	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
-	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
-					 cb->start,
-					 cb->start + cb->len - 1,
-					 NULL,
-					 bio->bi_status ?
-					 BLK_STS_OK : BLK_STS_NOTSUPP);
+	btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+			cb->start, cb->start + cb->len - 1,
+			bio->bi_status == BLK_STS_OK);
 	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb);
@@ -306,7 +324,7 @@
 	blk_status_t ret;
 	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	WARN_ON(start & ((u64)PAGE_SIZE - 1));
+	WARN_ON(!PAGE_ALIGNED(start));
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
 		return BLK_STS_RESOURCE;
@@ -323,7 +341,8 @@
 
 	bdev = fs_info->fs_devices->latest_bdev;
 
-	bio = btrfs_bio_alloc(bdev, first_byte);
+	bio = btrfs_bio_alloc(first_byte);
+	bio_set_dev(bio, bdev);
 	bio->bi_opf = REQ_OP_WRITE | write_flags;
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
@@ -337,7 +356,8 @@
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_iter.bi_size)
-			submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0);
+			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
+							  0);
 
 		page->mapping = NULL;
 		if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
@@ -364,7 +384,8 @@
 				bio_endio(bio);
 			}
 
-			bio = btrfs_bio_alloc(bdev, first_byte);
+			bio = btrfs_bio_alloc(first_byte);
+			bio_set_dev(bio, bdev);
 			bio->bi_opf = REQ_OP_WRITE | write_flags;
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
@@ -437,10 +458,8 @@
 		if (pg_index > end_index)
 			break;
 
-		rcu_read_lock();
-		page = radix_tree_lookup(&mapping->i_pages, pg_index);
-		rcu_read_unlock();
-		if (page && !radix_tree_exceptional_entry(page)) {
+		page = xa_load(&mapping->i_pages, pg_index);
+		if (page && !xa_is_value(page)) {
 			misses++;
 			if (misses > 4)
 				break;
@@ -483,7 +502,7 @@
 
 		if (page->index == end_index) {
 			char *userpage;
-			size_t zero_offset = isize & (PAGE_SIZE - 1);
+			size_t zero_offset = offset_in_page(isize);
 
 			if (zero_offset) {
 				int zeros;
@@ -528,7 +547,6 @@
 				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct extent_io_tree *tree;
 	struct extent_map_tree *em_tree;
 	struct compressed_bio *cb;
 	unsigned long compressed_len;
@@ -543,9 +561,9 @@
 	struct extent_map *em;
 	blk_status_t ret = BLK_STS_RESOURCE;
 	int faili = 0;
-	u32 *sums;
+	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	u8 *sums;
 
-	tree = &BTRFS_I(inode)->io_tree;
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
 	/* we need the actual starting offset of this extent in the file */
@@ -566,7 +584,7 @@
 	cb->errors = 0;
 	cb->inode = inode;
 	cb->mirror_num = mirror_num;
-	sums = &cb->sums;
+	sums = cb->sums;
 
 	cb->start = em->orig_start;
 	em_len = em->len;
@@ -605,7 +623,8 @@
 	/* include any pages we added in add_ra-bio_pages */
 	cb->len = bio->bi_iter.bi_size;
 
-	comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
+	comp_bio = btrfs_bio_alloc(cur_disk_byte);
+	bio_set_dev(comp_bio, bdev);
 	comp_bio->bi_opf = REQ_OP_READ;
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
@@ -619,12 +638,14 @@
 		page->index = em_start >> PAGE_SHIFT;
 
 		if (comp_bio->bi_iter.bi_size)
-			submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE,
-					comp_bio, 0);
+			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
+							  comp_bio, 0);
 
 		page->mapping = NULL;
 		if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
 		    PAGE_SIZE) {
+			unsigned int nr_sectors;
+
 			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
 						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
@@ -642,8 +663,10 @@
 							    sums);
 				BUG_ON(ret); /* -ENOMEM */
 			}
-			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
-					     fs_info->sectorsize);
+
+			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+						  fs_info->sectorsize);
+			sums += csum_size * nr_sectors;
 
 			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
 			if (ret) {
@@ -651,7 +674,8 @@
 				bio_endio(comp_bio);
 			}
 
-			comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
+			comp_bio = btrfs_bio_alloc(cur_disk_byte);
+			bio_set_dev(comp_bio, bdev);
 			comp_bio->bi_opf = REQ_OP_READ;
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
@@ -738,6 +762,28 @@
 	struct list_head list;
 };
 
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+	btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+	btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+	return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+	btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
 static void free_heuristic_ws(struct list_head *ws)
 {
 	struct heuristic_ws *workspace;
@@ -750,7 +796,7 @@
 	kfree(workspace);
 }
 
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
 {
 	struct heuristic_ws *ws;
 
@@ -777,65 +823,59 @@
 	return ERR_PTR(-ENOMEM);
 }
 
-struct workspaces_list {
-	struct list_head idle_ws;
-	spinlock_t ws_lock;
-	/* Number of free workspaces */
-	int free_ws;
-	/* Total number of allocated workspaces */
-	atomic_t total_ws;
-	/* Waiters for a free workspace */
-	wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+	.init_workspace_manager = heuristic_init_workspace_manager,
+	.cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+	.get_workspace = heuristic_get_workspace,
+	.put_workspace = heuristic_put_workspace,
+	.alloc_workspace = alloc_heuristic_ws,
+	.free_workspace = free_heuristic_ws,
 };
 
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+	/* The heuristic is represented as compression type 0 */
+	&btrfs_heuristic_compress,
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 	&btrfs_zstd_compress,
 };
 
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+				  const struct btrfs_compress_op *ops)
 {
 	struct list_head *workspace;
-	int i;
 
-	INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
-	spin_lock_init(&btrfs_heuristic_ws.ws_lock);
-	atomic_set(&btrfs_heuristic_ws.total_ws, 0);
-	init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+	wsm->ops = ops;
 
-	workspace = alloc_heuristic_ws();
+	INIT_LIST_HEAD(&wsm->idle_ws);
+	spin_lock_init(&wsm->ws_lock);
+	atomic_set(&wsm->total_ws, 0);
+	init_waitqueue_head(&wsm->ws_wait);
+
+	/*
+	 * Preallocate one workspace for each compression type so we can
+	 * guarantee forward progress in the worst case
+	 */
+	workspace = wsm->ops->alloc_workspace(0);
 	if (IS_ERR(workspace)) {
 		pr_warn(
-	"BTRFS: cannot preallocate heuristic workspace, will try later\n");
+	"BTRFS: cannot preallocate compression workspace, will try later\n");
 	} else {
-		atomic_set(&btrfs_heuristic_ws.total_ws, 1);
-		btrfs_heuristic_ws.free_ws = 1;
-		list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+		atomic_set(&wsm->total_ws, 1);
+		wsm->free_ws = 1;
+		list_add(workspace, &wsm->idle_ws);
 	}
+}
 
-	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
-		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
-		atomic_set(&btrfs_comp_ws[i].total_ws, 0);
-		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+	struct list_head *ws;
 
-		/*
-		 * Preallocate one workspace for each compression type so
-		 * we can guarantee forward progress in the worst case
-		 */
-		workspace = btrfs_compress_op[i]->alloc_workspace();
-		if (IS_ERR(workspace)) {
-			pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
-		} else {
-			atomic_set(&btrfs_comp_ws[i].total_ws, 1);
-			btrfs_comp_ws[i].free_ws = 1;
-			list_add(workspace, &btrfs_comp_ws[i].idle_ws);
-		}
+	while (!list_empty(&wsman->idle_ws)) {
+		ws = wsman->idle_ws.next;
+		list_del(ws);
+		wsman->ops->free_workspace(ws);
+		atomic_dec(&wsman->total_ws);
 	}
 }
 
@@ -845,11 +885,11 @@
  * Preallocation makes forward progress guarantees and we do not return
  * errors.
  */
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+				      unsigned int level)
 {
 	struct list_head *workspace;
 	int cpus = num_online_cpus();
-	int idx = type - 1;
 	unsigned nofs_flag;
 	struct list_head *idle_ws;
 	spinlock_t *ws_lock;
@@ -857,19 +897,11 @@
 	wait_queue_head_t *ws_wait;
 	int *free_ws;
 
-	if (heuristic) {
-		idle_ws	 = &btrfs_heuristic_ws.idle_ws;
-		ws_lock	 = &btrfs_heuristic_ws.ws_lock;
-		total_ws = &btrfs_heuristic_ws.total_ws;
-		ws_wait	 = &btrfs_heuristic_ws.ws_wait;
-		free_ws	 = &btrfs_heuristic_ws.free_ws;
-	} else {
-		idle_ws	 = &btrfs_comp_ws[idx].idle_ws;
-		ws_lock	 = &btrfs_comp_ws[idx].ws_lock;
-		total_ws = &btrfs_comp_ws[idx].total_ws;
-		ws_wait	 = &btrfs_comp_ws[idx].ws_wait;
-		free_ws	 = &btrfs_comp_ws[idx].free_ws;
-	}
+	idle_ws	 = &wsm->idle_ws;
+	ws_lock	 = &wsm->ws_lock;
+	total_ws = &wsm->total_ws;
+	ws_wait	 = &wsm->ws_wait;
+	free_ws	 = &wsm->free_ws;
 
 again:
 	spin_lock(ws_lock);
@@ -900,10 +932,7 @@
 	 * context of btrfs_compress_bio/btrfs_compress_pages
 	 */
 	nofs_flag = memalloc_nofs_save();
-	if (heuristic)
-		workspace = alloc_heuristic_ws();
-	else
-		workspace = btrfs_compress_op[idx]->alloc_workspace();
+	workspace = wsm->ops->alloc_workspace(level);
 	memalloc_nofs_restore(nofs_flag);
 
 	if (IS_ERR(workspace)) {
@@ -934,85 +963,47 @@
 	return workspace;
 }
 
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
 {
-	return __find_workspace(type, false);
+	return btrfs_compress_op[type]->get_workspace(level);
 }
 
 /*
  * put a workspace struct back on the list or free it if we have enough
  * idle ones sitting around
  */
-static void __free_workspace(int type, struct list_head *workspace,
-			     bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
 {
-	int idx = type - 1;
 	struct list_head *idle_ws;
 	spinlock_t *ws_lock;
 	atomic_t *total_ws;
 	wait_queue_head_t *ws_wait;
 	int *free_ws;
 
-	if (heuristic) {
-		idle_ws	 = &btrfs_heuristic_ws.idle_ws;
-		ws_lock	 = &btrfs_heuristic_ws.ws_lock;
-		total_ws = &btrfs_heuristic_ws.total_ws;
-		ws_wait	 = &btrfs_heuristic_ws.ws_wait;
-		free_ws	 = &btrfs_heuristic_ws.free_ws;
-	} else {
-		idle_ws	 = &btrfs_comp_ws[idx].idle_ws;
-		ws_lock	 = &btrfs_comp_ws[idx].ws_lock;
-		total_ws = &btrfs_comp_ws[idx].total_ws;
-		ws_wait	 = &btrfs_comp_ws[idx].ws_wait;
-		free_ws	 = &btrfs_comp_ws[idx].free_ws;
-	}
+	idle_ws	 = &wsm->idle_ws;
+	ws_lock	 = &wsm->ws_lock;
+	total_ws = &wsm->total_ws;
+	ws_wait	 = &wsm->ws_wait;
+	free_ws	 = &wsm->free_ws;
 
 	spin_lock(ws_lock);
 	if (*free_ws <= num_online_cpus()) {
-		list_add(workspace, idle_ws);
+		list_add(ws, idle_ws);
 		(*free_ws)++;
 		spin_unlock(ws_lock);
 		goto wake;
 	}
 	spin_unlock(ws_lock);
 
-	if (heuristic)
-		free_heuristic_ws(workspace);
-	else
-		btrfs_compress_op[idx]->free_workspace(workspace);
+	wsm->ops->free_workspace(ws);
 	atomic_dec(total_ws);
 wake:
 	cond_wake_up(ws_wait);
 }
 
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
 {
-	return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
-	struct list_head *workspace;
-	int i;
-
-	while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
-		workspace = btrfs_heuristic_ws.idle_ws.next;
-		list_del(workspace);
-		free_heuristic_ws(workspace);
-		atomic_dec(&btrfs_heuristic_ws.total_ws);
-	}
-
-	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
-			workspace = btrfs_comp_ws[i].idle_ws.next;
-			list_del(workspace);
-			btrfs_compress_op[i]->free_workspace(workspace);
-			atomic_dec(&btrfs_comp_ws[i].total_ws);
-		}
-	}
+	return btrfs_compress_op[type]->put_workspace(ws);
 }
 
 /*
@@ -1044,18 +1035,18 @@
 			 unsigned long *total_in,
 			 unsigned long *total_out)
 {
+	int type = btrfs_compress_type(type_level);
+	int level = btrfs_compress_level(type_level);
 	struct list_head *workspace;
 	int ret;
-	int type = type_level & 0xF;
 
-	workspace = find_workspace(type);
-
-	btrfs_compress_op[type - 1]->set_level(workspace, type_level);
-	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+	level = btrfs_compress_set_level(type, level);
+	workspace = get_workspace(type, level);
+	ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
 						      start, pages,
 						      out_pages,
 						      total_in, total_out);
-	free_workspace(type, workspace);
+	put_workspace(type, workspace);
 	return ret;
 }
 
@@ -1079,9 +1070,9 @@
 	int ret;
 	int type = cb->compress_type;
 
-	workspace = find_workspace(type);
-	ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
-	free_workspace(type, workspace);
+	workspace = get_workspace(type, 0);
+	ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+	put_workspace(type, workspace);
 
 	return ret;
 }
@@ -1097,19 +1088,29 @@
 	struct list_head *workspace;
 	int ret;
 
-	workspace = find_workspace(type);
-
-	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+	workspace = get_workspace(type, 0);
+	ret = btrfs_compress_op[type]->decompress(workspace, data_in,
 						  dest_page, start_byte,
 						  srclen, destlen);
+	put_workspace(type, workspace);
 
-	free_workspace(type, workspace);
 	return ret;
 }
 
+void __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+		btrfs_compress_op[i]->init_workspace_manager();
+}
+
 void __cold btrfs_exit_compress(void)
 {
-	free_workspaces();
+	int i;
+
+	for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+		btrfs_compress_op[i]->cleanup_workspace_manager();
 }
 
 /*
@@ -1211,7 +1212,7 @@
 /*
  * Shannon Entropy calculation
  *
- * Pure byte distribution analysis fails to determine compressiability of data.
+ * Pure byte distribution analysis fails to determine compressibility of data.
  * Try calculating entropy to estimate the average minimum number of bits
  * needed to encode the sampled data.
  *
@@ -1275,7 +1276,7 @@
 
 /*
  * Use 4 bits as radix base
- * Use 16 u32 counters for calculating new possition in buf array
+ * Use 16 u32 counters for calculating new position in buf array
  *
  * @array     - array that will be sorted
  * @array_buf - buffer array to store sorting results
@@ -1520,7 +1521,7 @@
  */
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 {
-	struct list_head *ws_list = __find_workspace(0, true);
+	struct list_head *ws_list = get_workspace(0, 0);
 	struct heuristic_ws *ws;
 	u32 i;
 	u8 byte;
@@ -1589,18 +1590,45 @@
 	}
 
 out:
-	__free_workspace(0, ws_list, true);
+	put_workspace(0, ws_list);
 	return ret;
 }
 
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (e.g. after "zlib", starting with ":") to a
+ * level; an unrecognized string will set the default level.
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
 {
-	if (strncmp(str, "zlib", 4) != 0)
+	unsigned int level = 0;
+	int ret;
+
+	if (!type)
 		return 0;
 
-	/* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
-	if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
-		return str[5] - '0';
+	if (str[0] == ':') {
+		ret = kstrtouint(str + 1, 10, &level);
+		if (ret)
+			level = 0;
+	}
 
-	return BTRFS_ZLIB_DEFAULT_LEVEL;
+	level = btrfs_compress_set_level(type, level);
+
+	return level;
+}
+
+/*
+ * Adjust @level according to the limits of the compression algorithm or
+ * fall back to the default.
+ */
+unsigned int btrfs_compress_set_level(int type, unsigned level)
+{
+	const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+	if (level == 0)
+		level = ops->default_level;
+	else
+		level = min(level, ops->max_level);
+
+	return level;
 }
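
Illustrative results of the two helpers above, assuming zlib's default level
is 3 (BTRFS_ZLIB_DEFAULT_LEVEL) and its maximum is 9:

	btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, ":9");	/* -> 9 */
	btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, "");	/* -> 3, default */
	btrfs_compress_str2level(BTRFS_COMPRESS_NONE, ":5");	/* -> 0 */
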
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b8..4cb8be9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,9 +61,19 @@
 	 * the start of a variable length array of checksums only
 	 * used by reads
 	 */
-	u32 sums;
+	u8 sums[];
 };
 
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+	return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+	return ((type_level & 0xF0) >> 4);
+}
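
A quick illustration of the packing these accessors decode: the type sits in
the low nibble and the level in the high nibble, so zlib (type 1) at level 9
is encoded as 0x91:

	unsigned int type_level = (9 << 4) | BTRFS_COMPRESS_ZLIB;	/* 0x91 */

	btrfs_compress_type(type_level);	/* -> 1, BTRFS_COMPRESS_ZLIB */
	btrfs_compress_level(type_level);	/* -> 9 */
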
+
 void __init btrfs_init_compress(void);
 void __cold btrfs_exit_compress(void);
 
@@ -87,7 +97,7 @@
 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
 
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
 
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
@@ -97,8 +107,35 @@
 	BTRFS_COMPRESS_TYPES = 3,
 };
 
+struct workspace_manager {
+	const struct btrfs_compress_op *ops;
+	struct list_head idle_ws;
+	spinlock_t ws_lock;
+	/* Number of free workspaces */
+	int free_ws;
+	/* Total number of allocated workspaces */
+	atomic_t total_ws;
+	/* Waiters for a free workspace */
+	wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+				  const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+				      unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
 struct btrfs_compress_op {
-	struct list_head *(*alloc_workspace)(void);
+	void (*init_workspace_manager)(void);
+
+	void (*cleanup_workspace_manager)(void);
+
+	struct list_head *(*get_workspace)(unsigned int level);
+
+	void (*put_workspace)(struct list_head *ws);
+
+	struct list_head *(*alloc_workspace)(unsigned int level);
 
 	void (*free_workspace)(struct list_head *workspace);
 
@@ -119,14 +156,23 @@
 			  unsigned long start_byte,
 			  size_t srclen, size_t destlen);
 
-	void (*set_level)(struct list_head *ws, unsigned int type);
+	/* Maximum level supported by the compression algorithm */
+	unsigned int max_level;
+	unsigned int default_level;
 };
 
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS	(BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
 extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
 extern const struct btrfs_compress_op btrfs_zstd_compress;
 
 const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
+
+unsigned int btrfs_compress_set_level(int type, unsigned level);
 
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 089b46c..e59cde2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -12,6 +12,8 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
@@ -19,16 +21,36 @@
 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
 		      int data_size, int extend);
 static int push_node_left(struct btrfs_trans_handle *trans,
-			  struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *dst,
 			  struct extent_buffer *src, int empty);
 static int balance_node_right(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 		    int level, int slot);
 
+static const struct btrfs_csums {
+	u16		size;
+	const char	*name;
+} btrfs_csums[] = {
+	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+};
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
+{
+	u16 t = btrfs_super_csum_type(s);
+	/*
+	 * csum type is validated at mount time
+	 */
+	return btrfs_csums[t].size;
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+	/* csum type is validated at mount time */
+	return btrfs_csums[csum_type].name;
+}
+
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
@@ -44,48 +66,19 @@
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i] || !p->locks[i])
 			continue;
-		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
-		if (p->locks[i] == BTRFS_READ_LOCK)
+		/*
+		 * If we currently have a spinning reader or writer lock this
+		 * will bump the count of blocking holders and drop the
+		 * spinlock.
+		 */
+		if (p->locks[i] == BTRFS_READ_LOCK) {
+			btrfs_set_lock_blocking_read(p->nodes[i]);
 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
-		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+		} else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+			btrfs_set_lock_blocking_write(p->nodes[i]);
 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
-	}
-}
-
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path.  You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held, int held_rw)
-{
-	int i;
-
-	if (held) {
-		btrfs_set_lock_blocking_rw(held, held_rw);
-		if (held_rw == BTRFS_WRITE_LOCK)
-			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
-		else if (held_rw == BTRFS_READ_LOCK)
-			held_rw = BTRFS_READ_LOCK_BLOCKING;
-	}
-	btrfs_set_path_blocking(p);
-
-	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i]) {
-			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
-			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_WRITE_LOCK;
-			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_READ_LOCK;
 		}
 	}
-
-	if (held)
-		btrfs_clear_lock_blocking_rw(held, held_rw);
 }
 
 /* this also releases the path */
@@ -207,7 +200,7 @@
 	spin_lock(&fs_info->trans_lock);
 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 		/* Want the extent tree to be the last on the list */
-		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
 			list_move_tail(&root->dirty_list,
 				       &fs_info->dirty_cowonly_roots);
 		else
@@ -260,7 +253,7 @@
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -405,8 +398,6 @@
  * The 'start address' is the logical address of the *new* root node
  * for root replace operations, or the logical address of the affected
  * block for all other operations.
- *
- * Note: must be called with write lock for fs_info::tree_mod_log_lock.
  */
 static noinline int
 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -416,6 +407,8 @@
 	struct rb_node *parent = NULL;
 	struct tree_mod_elem *cur;
 
+	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
 	tm_root = &fs_info->tree_mod_log;
@@ -753,11 +746,11 @@
 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }
 
-static noinline int
-tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
 		     struct extent_buffer *src, unsigned long dst_offset,
 		     unsigned long src_offset, int nr_items)
 {
+	struct btrfs_fs_info *fs_info = dst->fs_info;
 	int ret = 0;
 	struct tree_mod_elem **tm_list = NULL;
 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
@@ -977,7 +970,7 @@
 		if (new_flags != 0) {
 			int level = btrfs_header_level(buf);
 
-			ret = btrfs_set_disk_extent_flags(trans, fs_info,
+			ret = btrfs_set_disk_extent_flags(trans,
 							  buf->start,
 							  buf->len,
 							  new_flags, level, 0);
@@ -997,12 +990,54 @@
 			if (ret)
 				return ret;
 		}
-		clean_tree_block(fs_info, buf);
+		btrfs_clean_tree_block(buf);
 		*last_ref = 1;
 	}
 	return 0;
 }
 
+static struct extent_buffer *alloc_tree_block_no_bg_flush(
+					  struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  u64 parent_start,
+					  const struct btrfs_disk_key *disk_key,
+					  int level,
+					  u64 hint,
+					  u64 empty_size)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_buffer *ret;
+
+	/*
+	 * If we are COWing a node/leaf from the extent, chunk, device or free
+	 * space trees, make sure that we do not finish block group creation of
+	 * pending block groups. We do this to avoid a deadlock.
+	 * COWing can result in allocation of a new chunk, and flushing pending
+	 * block groups (btrfs_create_pending_block_groups()) can be triggered
+	 * when finishing allocation of a new chunk. Creation of a pending block
+	 * group modifies the extent, chunk, device and free space trees,
+	 * therefore we could deadlock with ourselves since we are holding a
+	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
+	 * try to COW later.
+	 * For similar reasons, we also need to delay flushing pending block
+	 * groups when splitting a leaf or node from one of those trees, since
+	 * we are holding a write lock on it and on its parent, and when
+	 * inserting a new root node for one of those trees.
+	 */
+	if (root == fs_info->extent_root ||
+	    root == fs_info->chunk_root ||
+	    root == fs_info->dev_root ||
+	    root == fs_info->free_space_root)
+		trans->can_flush_pending_bgs = false;
+
+	ret = btrfs_alloc_tree_block(trans, root, parent_start,
+				     root->root_key.objectid, disk_key, level,
+				     hint, empty_size);
+	trans->can_flush_pending_bgs = true;
+
+	return ret;
+}
+
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -1050,26 +1085,8 @@
 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
 		parent_start = parent->start;
 
-	/*
-	 * If we are COWing a node/leaf from the extent, chunk or device trees,
-	 * make sure that we do not finish block group creation of pending block
-	 * groups. We do this to avoid a deadlock.
-	 * COWing can result in allocation of a new chunk, and flushing pending
-	 * block groups (btrfs_create_pending_block_groups()) can be triggered
-	 * when finishing allocation of a new chunk. Creation of a pending block
-	 * group modifies the extent, chunk and device trees, therefore we could
-	 * deadlock with ourselves since we are holding a lock on an extent
-	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
-	 */
-	if (root == fs_info->extent_root ||
-	    root == fs_info->chunk_root ||
-	    root == fs_info->dev_root)
-		trans->can_flush_pending_bgs = false;
-
-	cow = btrfs_alloc_tree_block(trans, root, parent_start,
-			root->root_key.objectid, &disk_key, level,
-			search_start, empty_size);
-	trans->can_flush_pending_bgs = true;
+	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+					   level, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -1086,7 +1103,7 @@
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
 
 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
 	if (ret) {
@@ -1299,7 +1316,7 @@
 		return eb;
 
 	btrfs_set_path_blocking(path);
-	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+	btrfs_set_lock_blocking_read(eb);
 
 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
@@ -1323,11 +1340,9 @@
 		}
 	}
 
-	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
 	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);
 
-	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_lock(eb_rewin);
 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1350,6 +1365,7 @@
 	struct tree_mod_elem *tm;
 	struct extent_buffer *eb = NULL;
 	struct extent_buffer *eb_root;
+	u64 eb_root_owner = 0;
 	struct extent_buffer *old;
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
@@ -1387,11 +1403,12 @@
 			free_extent_buffer(old);
 		}
 	} else if (old_root) {
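+		/*
+		 * Save the owner now: eb_root is unlocked and freed before the
+		 * value is applied to the rewound copy below.
+		 */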
+		eb_root_owner = btrfs_header_owner(eb_root);
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
 		eb = alloc_dummy_extent_buffer(fs_info, logical);
 	} else {
-		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+		btrfs_set_lock_blocking_read(eb_root);
 		eb = btrfs_clone_extent_buffer(eb_root);
 		btrfs_tree_read_unlock_blocking(eb_root);
 		free_extent_buffer(eb_root);
@@ -1399,12 +1416,11 @@
 
 	if (!eb)
 		return NULL;
-	extent_buffer_get(eb);
 	btrfs_tree_read_lock(eb);
 	if (old_root) {
 		btrfs_set_header_bytenr(eb, eb->start);
 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
-		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+		btrfs_set_header_owner(eb, eb_root_owner);
 		btrfs_set_header_level(eb, old_root->level);
 		btrfs_set_header_generation(eb, old_generation);
 	}
@@ -1452,7 +1468,7 @@
 	 *
 	 * What is forced COW:
 	 *    when we create snapshot during committing the transaction,
-	 *    after we've finished coping src root, we must COW the shared
+	 *    after we've finished copying src root, we must COW the shared
 	 *    block to ensure the metadata consistency.
 	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
@@ -1478,6 +1494,10 @@
 	u64 search_start;
 	int ret;
 
+	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
+		btrfs_err(fs_info,
+			"COW'ing blocks on a fs root that's being dropped");
+
 	if (trans->transaction != fs_info->running_transaction)
 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
 		       trans->transid,
@@ -1496,9 +1516,16 @@
 	search_start = buf->start & ~((u64)SZ_1G - 1);
 
 	if (parent)
-		btrfs_set_lock_blocking(parent);
-	btrfs_set_lock_blocking(buf);
+		btrfs_set_lock_blocking_write(parent);
+	btrfs_set_lock_blocking_write(buf);
 
+	/*
+	 * Before CoWing this block for later modification, check if it's
+	 * the subtree root and do the delayed subtree trace if needed.
+	 *
+	 * Also, we don't care about the error, as it's handled internally.
+	 */
+	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
 
@@ -1592,7 +1619,7 @@
 	if (parent_nritems <= 1)
 		return 0;
 
-	btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking_write(parent);
 
 	for (i = start_slot; i <= end_slot; i++) {
 		struct btrfs_key first_key;
@@ -1651,7 +1678,7 @@
 			search_start = last_block;
 
 		btrfs_tree_lock(cur);
-		btrfs_set_lock_blocking(cur);
+		btrfs_set_lock_blocking_write(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
@@ -1787,9 +1814,8 @@
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  */
-static noinline struct extent_buffer *
-read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
-	       int slot)
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+					   int slot)
 {
 	int level = btrfs_header_level(parent);
 	struct extent_buffer *eb;
@@ -1801,7 +1827,7 @@
 	BUG_ON(level == 0);
 
 	btrfs_node_key_to_cpu(parent, &first_key, slot);
-	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
+	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
 			     btrfs_node_ptr_generation(parent, slot),
 			     level - 1, &first_key);
 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
@@ -1832,8 +1858,7 @@
 	int orig_slot = path->slots[level];
 	u64 orig_ptr;
 
-	if (level == 0)
-		return 0;
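+	/* Callers only balance internal nodes; a leaf (level 0) never gets here. */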
+	ASSERT(level > 0);
 
 	mid = path->nodes[level];
 
@@ -1859,7 +1884,7 @@
 			return 0;
 
 		/* promote the child to a root */
-		child = read_node_slot(fs_info, mid, 0);
+		child = btrfs_read_node_slot(mid, 0);
 		if (IS_ERR(child)) {
 			ret = PTR_ERR(child);
 			btrfs_handle_fs_error(fs_info, ret, NULL);
@@ -1867,7 +1892,7 @@
 		}
 
 		btrfs_tree_lock(child);
-		btrfs_set_lock_blocking(child);
+		btrfs_set_lock_blocking_write(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
 		if (ret) {
 			btrfs_tree_unlock(child);
@@ -1884,7 +1909,7 @@
 
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
-		clean_tree_block(fs_info, mid);
+		btrfs_clean_tree_block(mid);
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
@@ -1899,13 +1924,13 @@
 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
 		return 0;
 
-	left = read_node_slot(fs_info, parent, pslot - 1);
+	left = btrfs_read_node_slot(parent, pslot - 1);
 	if (IS_ERR(left))
 		left = NULL;
 
 	if (left) {
 		btrfs_tree_lock(left);
-		btrfs_set_lock_blocking(left);
+		btrfs_set_lock_blocking_write(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left);
 		if (wret) {
@@ -1914,13 +1939,13 @@
 		}
 	}
 
-	right = read_node_slot(fs_info, parent, pslot + 1);
+	right = btrfs_read_node_slot(parent, pslot + 1);
 	if (IS_ERR(right))
 		right = NULL;
 
 	if (right) {
 		btrfs_tree_lock(right);
-		btrfs_set_lock_blocking(right);
+		btrfs_set_lock_blocking_write(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right);
 		if (wret) {
@@ -1932,7 +1957,7 @@
 	/* first, try to make some room in the middle buffer */
 	if (left) {
 		orig_slot += btrfs_header_nritems(left);
-		wret = push_node_left(trans, fs_info, left, mid, 1);
+		wret = push_node_left(trans, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
 	}
@@ -1941,11 +1966,11 @@
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right) {
-		wret = push_node_left(trans, fs_info, mid, right, 1);
+		wret = push_node_left(trans, mid, right, 1);
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			clean_tree_block(fs_info, right);
+			btrfs_clean_tree_block(right);
 			btrfs_tree_unlock(right);
 			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
@@ -1977,20 +2002,20 @@
 			btrfs_handle_fs_error(fs_info, ret, NULL);
 			goto enospc;
 		}
-		wret = balance_node_right(trans, fs_info, mid, left);
+		wret = balance_node_right(trans, mid, left);
 		if (wret < 0) {
 			ret = wret;
 			goto enospc;
 		}
 		if (wret == 1) {
-			wret = push_node_left(trans, fs_info, left, mid, 1);
+			wret = push_node_left(trans, left, mid, 1);
 			if (wret < 0)
 				ret = wret;
 		}
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(mid) == 0) {
-		clean_tree_block(fs_info, mid);
+		btrfs_clean_tree_block(mid);
 		btrfs_tree_unlock(mid);
 		del_ptr(root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
@@ -2074,7 +2099,7 @@
 	if (!parent)
 		return 1;
 
-	left = read_node_slot(fs_info, parent, pslot - 1);
+	left = btrfs_read_node_slot(parent, pslot - 1);
 	if (IS_ERR(left))
 		left = NULL;
 
@@ -2083,7 +2108,7 @@
 		u32 left_nr;
 
 		btrfs_tree_lock(left);
-		btrfs_set_lock_blocking(left);
+		btrfs_set_lock_blocking_write(left);
 
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2094,8 +2119,7 @@
 			if (ret)
 				wret = 1;
 			else {
-				wret = push_node_left(trans, fs_info,
-						      left, mid, 0);
+				wret = push_node_left(trans, left, mid, 0);
 			}
 		}
 		if (wret < 0)
@@ -2127,7 +2151,7 @@
 		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 	}
-	right = read_node_slot(fs_info, parent, pslot + 1);
+	right = btrfs_read_node_slot(parent, pslot + 1);
 	if (IS_ERR(right))
 		right = NULL;
 
@@ -2138,7 +2162,7 @@
 		u32 right_nr;
 
 		btrfs_tree_lock(right);
-		btrfs_set_lock_blocking(right);
+		btrfs_set_lock_blocking_write(right);
 
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2150,8 +2174,7 @@
 			if (ret)
 				wret = 1;
 			else {
-				wret = balance_node_right(trans, fs_info,
-							  right, mid);
+				wret = balance_node_right(trans, right, mid);
 			}
 		}
 		if (wret < 0)
@@ -2412,6 +2435,16 @@
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			/*
+			 * Do an extra check on first_key: eb can be stale due
+			 * to being cached, read from scrub, or having multiple
+			 * parents (shared tree blocks).
+			 */
+			if (btrfs_verify_level_key(tmp,
+					parent_level - 1, &first_key, gen)) {
+				free_extent_buffer(tmp);
+				return -EUCLEAN;
+			}
 			*eb_ret = tmp;
 			return 0;
 		}
@@ -2500,7 +2533,6 @@
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -2521,7 +2553,6 @@
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -2542,26 +2573,6 @@
 	return ret;
 }
 
-static void key_search_validate(struct extent_buffer *b,
-				const struct btrfs_key *key,
-				int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
-	struct btrfs_disk_key disk_key;
-
-	btrfs_cpu_key_to_disk(&disk_key, key);
-
-	if (level == 0)
-		ASSERT(!memcmp_extent_buffer(b, &disk_key,
-		    offsetof(struct btrfs_leaf, items[0].key),
-		    sizeof(disk_key)));
-	else
-		ASSERT(!memcmp_extent_buffer(b, &disk_key,
-		    offsetof(struct btrfs_node, ptrs[0].key),
-		    sizeof(disk_key)));
-#endif
-}
-
 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
 		      int level, int *prev_cmp, int *slot)
 {
@@ -2570,7 +2581,6 @@
 		return *prev_cmp;
 	}
 
-	key_search_validate(b, key, level);
 	*slot = 0;
 
 	return 0;
@@ -2624,14 +2634,27 @@
 	root_lock = BTRFS_READ_LOCK;
 
 	if (p->search_commit_root) {
-		/* The commit roots are read only so we always do read locks */
-		if (p->need_commit_sem)
+		/*
+		 * The commit roots are read only so we always do read locks,
+		 * and we must always hold the commit_root_sem when searching
+		 * them. The only exception is send, where we don't want to
+		 * block transaction commits for a long time, so we need to
+		 * clone the commit root in order to avoid races with
+		 * transaction commits that create a snapshot of one of the
+		 * roots used by a send operation.
+		 */
+		if (p->need_commit_sem) {
 			down_read(&fs_info->commit_root_sem);
-		b = root->commit_root;
-		extent_buffer_get(b);
-		level = btrfs_header_level(b);
-		if (p->need_commit_sem)
+			b = btrfs_clone_extent_buffer(root->commit_root);
 			up_read(&fs_info->commit_root_sem);
+			if (!b)
+				return ERR_PTR(-ENOMEM);
+
+		} else {
+			b = root->commit_root;
+			extent_buffer_get(b);
+		}
+		level = btrfs_header_level(b);
 		/*
 		 * Ensure that all callers have set skip_locking when
 		 * p->search_commit_root = 1.
@@ -2712,7 +2735,6 @@
 		      const struct btrfs_key *key, struct btrfs_path *p,
 		      int ins_len, int cow)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *b;
 	int slot;
 	int ret;
@@ -2757,6 +2779,10 @@
 again:
 	prev_cmp = -1;
 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
+	if (IS_ERR(b)) {
+		ret = PTR_ERR(b);
+		goto done;
+	}
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -2806,7 +2832,10 @@
 		}
 cow_done:
 		p->nodes[level] = b;
-		btrfs_clear_path_blocking(p, NULL, 0);
+		/*
+		 * Leave the path with blocking locks to avoid a massive
+		 * number of lock context switches; this is done on purpose.
+		 */
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -2884,21 +2913,15 @@
 			if (!p->skip_locking) {
 				level = btrfs_header_level(b);
 				if (level <= write_lock_level) {
-					err = btrfs_try_tree_write_lock(b);
-					if (!err) {
+					if (!btrfs_try_tree_write_lock(b)) {
 						btrfs_set_path_blocking(p);
 						btrfs_tree_lock(b);
-						btrfs_clear_path_blocking(p, b,
-								  BTRFS_WRITE_LOCK);
 					}
 					p->locks[level] = BTRFS_WRITE_LOCK;
 				} else {
-					err = btrfs_tree_read_lock_atomic(b);
-					if (!err) {
+					if (!btrfs_tree_read_lock_atomic(b)) {
 						btrfs_set_path_blocking(p);
 						btrfs_tree_read_lock(b);
-						btrfs_clear_path_blocking(p, b,
-								  BTRFS_READ_LOCK);
 					}
 					p->locks[level] = BTRFS_READ_LOCK;
 				}
@@ -2907,7 +2930,7 @@
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
-			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
+			    btrfs_leaf_free_space(b) < ins_len) {
 				if (write_lock_level < 1) {
 					write_lock_level = 1;
 					btrfs_release_path(p);
@@ -2917,7 +2940,6 @@
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
@@ -2927,7 +2949,7 @@
 			}
 			if (!p->search_for_split)
 				unlock_up(p, level, lowest_unlock,
-					  min_write_lock_level, &write_lock_level);
+					  min_write_lock_level, NULL);
 			goto done;
 		}
 	}
@@ -2978,13 +3000,16 @@
 
 again:
 	b = get_old_root(root, time_seq);
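+	/* A NULL return means the old root could not be read back or cloned. */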
+	if (!b) {
+		ret = -EIO;
+		goto done;
+	}
 	level = btrfs_header_level(b);
 	p->locks[level] = BTRFS_READ_LOCK;
 
 	while (b) {
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -3000,6 +3025,8 @@
 		 */
 		prev_cmp = -1;
 		ret = key_search(b, key, level, &prev_cmp, &slot);
+		if (ret < 0)
+			goto done;
 
 		if (level != 0) {
 			int dec = 0;
@@ -3026,12 +3053,9 @@
 			}
 
 			level = btrfs_header_level(b);
-			err = btrfs_tree_read_lock_atomic(b);
-			if (!err) {
+			if (!btrfs_tree_read_lock_atomic(b)) {
 				btrfs_set_path_blocking(p);
 				btrfs_tree_read_lock(b);
-				btrfs_clear_path_blocking(p, b,
-							  BTRFS_READ_LOCK);
 			}
 			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
 			if (!b) {
@@ -3182,11 +3206,31 @@
 	slot = path->slots[0];
 	if (slot > 0) {
 		btrfs_item_key(eb, &disk_key, slot - 1);
-		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
+		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+			btrfs_crit(fs_info,
+		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+				   slot, btrfs_disk_key_objectid(&disk_key),
+				   btrfs_disk_key_type(&disk_key),
+				   btrfs_disk_key_offset(&disk_key),
+				   new_key->objectid, new_key->type,
+				   new_key->offset);
+			btrfs_print_leaf(eb);
+			BUG();
+		}
 	}
 	if (slot < btrfs_header_nritems(eb) - 1) {
 		btrfs_item_key(eb, &disk_key, slot + 1);
-		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
+		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+			btrfs_crit(fs_info,
+		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+				   slot, btrfs_disk_key_objectid(&disk_key),
+				   btrfs_disk_key_type(&disk_key),
+				   btrfs_disk_key_offset(&disk_key),
+				   new_key->objectid, new_key->type,
+				   new_key->offset);
+			btrfs_print_leaf(eb);
+			BUG();
+		}
 	}
 
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3204,10 +3248,10 @@
  * error, and > 0 if there was no room in the left hand block.
  */
 static int push_node_left(struct btrfs_trans_handle *trans,
-			  struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *dst,
 			  struct extent_buffer *src, int empty)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int push_items = 0;
 	int src_nritems;
 	int dst_nritems;
@@ -3240,8 +3284,7 @@
 	} else
 		push_items = min(src_nritems - 8, push_items);
 
-	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
-				   push_items);
+	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
@@ -3279,10 +3322,10 @@
  * this will  only push up to 1/2 the contents of the left node over
  */
 static int balance_node_right(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
 			      struct extent_buffer *dst,
 			      struct extent_buffer *src)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int push_items = 0;
 	int max_push;
 	int src_nritems;
@@ -3316,8 +3359,8 @@
 				      (dst_nritems) *
 				      sizeof(struct btrfs_key_ptr));
 
-	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
-				   src_nritems - push_items, push_items);
+	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+				   push_items);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
@@ -3364,8 +3407,8 @@
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-				   &lower_key, level, root->node->start, 0);
+	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+					 root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -3405,7 +3448,7 @@
  * blocknr is the block the key points to.
  */
 static void insert_ptr(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		       struct btrfs_path *path,
 		       struct btrfs_disk_key *key, u64 bytenr,
 		       int slot, int level)
 {
@@ -3418,7 +3461,7 @@
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
-	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
 	if (slot != nritems) {
 		if (level) {
 			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
@@ -3494,15 +3537,15 @@
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-			&disk_key, level, c->start, 0);
+	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+					     c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
 	root_add_used(root, fs_info->nodesize);
 	ASSERT(btrfs_header_level(c) == level);
 
-	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
+	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
@@ -3518,7 +3561,7 @@
 	btrfs_mark_buffer_dirty(c);
 	btrfs_mark_buffer_dirty(split);
 
-	insert_ptr(trans, fs_info, path, &disk_key, split->start,
+	insert_ptr(trans, path, &disk_key, split->start,
 		   path->slots[level + 1] + 1, level + 1);
 
 	if (path->slots[level] >= mid) {
@@ -3550,7 +3593,7 @@
 
 	if (!nr)
 		return 0;
-	btrfs_init_map_token(&token);
+	btrfs_init_map_token(&token, l);
 	start_item = btrfs_item_nr(start);
 	end_item = btrfs_item_nr(end);
 	data_len = btrfs_token_item_offset(l, start_item, &token) +
@@ -3566,9 +3609,9 @@
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
-				   struct extent_buffer *leaf)
+noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
 
@@ -3587,13 +3630,13 @@
  * min slot controls the lowest index we're willing to push to the
  * right.  We'll push up to and including min_slot, but no lower
  */
-static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
-				      struct btrfs_path *path,
+static noinline int __push_leaf_right(struct btrfs_path *path,
 				      int data_size, int empty,
 				      struct extent_buffer *right,
 				      int free_space, u32 left_nritems,
 				      u32 min_slot)
 {
+	struct btrfs_fs_info *fs_info = right->fs_info;
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *upper = path->nodes[1];
 	struct btrfs_map_token token;
@@ -3608,8 +3651,6 @@
 	u32 data_end;
 	u32 this_item_size;
 
-	btrfs_init_map_token(&token);
-
 	if (empty)
 		nr = 0;
 	else
@@ -3627,7 +3668,8 @@
 			if (path->slots[0] > i)
 				break;
 			if (path->slots[0] == i) {
-				int space = btrfs_leaf_free_space(fs_info, left);
+				int space = btrfs_leaf_free_space(left);
+
 				if (space + push_space * 2 > free_space)
 					break;
 			}
@@ -3656,10 +3698,10 @@
 	right_nritems = btrfs_header_nritems(right);
 
 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
-	push_space -= leaf_data_end(fs_info, left);
+	push_space -= leaf_data_end(left);
 
 	/* make room in the right data area */
-	data_end = leaf_data_end(fs_info, right);
+	data_end = leaf_data_end(right);
 	memmove_extent_buffer(right,
 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
 			      BTRFS_LEAF_DATA_OFFSET + data_end,
@@ -3668,7 +3710,7 @@
 	/* copy from the left data area */
 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
-		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
+		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
 		     push_space);
 
 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3681,6 +3723,7 @@
 		   push_items * sizeof(struct btrfs_item));
 
 	/* update the item pointers */
+	btrfs_init_map_token(&token, right);
 	right_nritems += push_items;
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
@@ -3696,7 +3739,7 @@
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
 	else
-		clean_tree_block(fs_info, left);
+		btrfs_clean_tree_block(left);
 
 	btrfs_mark_buffer_dirty(right);
 
@@ -3708,7 +3751,7 @@
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
 		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(fs_info, path->nodes[0]);
+			btrfs_clean_tree_block(path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
@@ -3740,7 +3783,6 @@
 			   int min_data_size, int data_size,
 			   int empty, u32 min_slot)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *right;
 	struct extent_buffer *upper;
@@ -3759,7 +3801,7 @@
 
 	btrfs_assert_tree_locked(path->nodes[1]);
 
-	right = read_node_slot(fs_info, upper, slot + 1);
+	right = btrfs_read_node_slot(upper, slot + 1);
 	/*
 	 * slot + 1 is not valid or we fail to read the right node,
 	 * no big deal, just return.
@@ -3768,9 +3810,9 @@
 		return 1;
 
 	btrfs_tree_lock(right);
-	btrfs_set_lock_blocking(right);
+	btrfs_set_lock_blocking_write(right);
 
-	free_space = btrfs_leaf_free_space(fs_info, right);
+	free_space = btrfs_leaf_free_space(right);
 	if (free_space < data_size)
 		goto out_unlock;
 
@@ -3780,7 +3822,7 @@
 	if (ret)
 		goto out_unlock;
 
-	free_space = btrfs_leaf_free_space(fs_info, right);
+	free_space = btrfs_leaf_free_space(right);
 	if (free_space < data_size)
 		goto out_unlock;
 
@@ -3792,7 +3834,7 @@
 		/* Key greater than all keys in the leaf, right neighbor has
 		 * enough room for it and we're not emptying our leaf to delete
 		 * it, therefore use right neighbor to insert the new item and
-		 * no need to touch/dirty our left leaft. */
+		 * no need to touch/dirty our left leaf. */
 		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 		path->nodes[0] = right;
@@ -3801,7 +3843,7 @@
 		return 0;
 	}
 
-	return __push_leaf_right(fs_info, path, min_data_size, empty,
+	return __push_leaf_right(path, min_data_size, empty,
 				right, free_space, left_nritems, min_slot);
 out_unlock:
 	btrfs_tree_unlock(right);
@@ -3817,12 +3859,12 @@
  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
  * items
  */
-static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
-				     struct btrfs_path *path, int data_size,
+static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 				     int empty, struct extent_buffer *left,
 				     int free_space, u32 right_nritems,
 				     u32 max_slot)
 {
+	struct btrfs_fs_info *fs_info = left->fs_info;
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
 	int i;
@@ -3836,8 +3878,6 @@
 	u32 old_left_item_size;
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
-
 	if (empty)
 		nr = min(right_nritems, max_slot);
 	else
@@ -3850,7 +3890,8 @@
 			if (path->slots[0] < i)
 				break;
 			if (path->slots[0] == i) {
-				int space = btrfs_leaf_free_space(fs_info, right);
+				int space = btrfs_leaf_free_space(right);
+
 				if (space + push_space * 2 > free_space)
 					break;
 			}
@@ -3883,13 +3924,14 @@
 		     btrfs_item_offset_nr(right, push_items - 1);
 
 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
-		     leaf_data_end(fs_info, left) - push_space,
+		     leaf_data_end(left) - push_space,
 		     BTRFS_LEAF_DATA_OFFSET +
 		     btrfs_item_offset_nr(right, push_items - 1),
 		     push_space);
 	old_left_nritems = btrfs_header_nritems(left);
 	BUG_ON(old_left_nritems <= 0);
 
+	btrfs_init_map_token(&token, left);
 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
@@ -3910,17 +3952,19 @@
 
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
-						  leaf_data_end(fs_info, right);
+						  leaf_data_end(right);
 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
 				      BTRFS_LEAF_DATA_OFFSET +
-				      leaf_data_end(fs_info, right), push_space);
+				      leaf_data_end(right), push_space);
 
 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
 			      btrfs_item_nr_offset(push_items),
 			     (btrfs_header_nritems(right) - push_items) *
 			     sizeof(struct btrfs_item));
 	}
+
+	btrfs_init_map_token(&token, right);
 	right_nritems -= push_items;
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
@@ -3936,7 +3980,7 @@
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 	else
-		clean_tree_block(fs_info, right);
+		btrfs_clean_tree_block(right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	fixup_low_keys(path, &disk_key, 1);
@@ -3973,7 +4017,6 @@
 			  *root, struct btrfs_path *path, int min_data_size,
 			  int data_size, int empty, u32 max_slot)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *right = path->nodes[0];
 	struct extent_buffer *left;
 	int slot;
@@ -3993,7 +4036,7 @@
 
 	btrfs_assert_tree_locked(path->nodes[1]);
 
-	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
+	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
 	/*
 	 * slot - 1 is not valid or we fail to read the left node,
 	 * no big deal, just return.
@@ -4002,9 +4045,9 @@
 		return 1;
 
 	btrfs_tree_lock(left);
-	btrfs_set_lock_blocking(left);
+	btrfs_set_lock_blocking_write(left);
 
-	free_space = btrfs_leaf_free_space(fs_info, left);
+	free_space = btrfs_leaf_free_space(left);
 	if (free_space < data_size) {
 		ret = 1;
 		goto out;
@@ -4020,13 +4063,13 @@
 		goto out;
 	}
 
-	free_space = btrfs_leaf_free_space(fs_info, left);
+	free_space = btrfs_leaf_free_space(left);
 	if (free_space < data_size) {
 		ret = 1;
 		goto out;
 	}
 
-	return __push_leaf_left(fs_info, path, min_data_size,
+	return __push_leaf_left(path, min_data_size,
 			       empty, left, free_space, right_nritems,
 			       max_slot);
 out:
@@ -4040,23 +4083,21 @@
  * available for the resulting leaf level of the path.
  */
 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
 				    struct btrfs_path *path,
 				    struct extent_buffer *l,
 				    struct extent_buffer *right,
 				    int slot, int mid, int nritems)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int data_copy_size;
 	int rt_data_off;
 	int i;
 	struct btrfs_disk_key disk_key;
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
-
 	nritems = nritems - mid;
 	btrfs_set_header_nritems(right, nritems);
-	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
+	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
 
 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
 			   btrfs_item_nr_offset(mid),
@@ -4065,10 +4106,11 @@
 	copy_extent_buffer(right, l,
 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
-		     leaf_data_end(fs_info, l), data_copy_size);
+		     leaf_data_end(l), data_copy_size);
 
 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
 
+	btrfs_init_map_token(&token, right);
 	for (i = 0; i < nritems; i++) {
 		struct btrfs_item *item = btrfs_item_nr(i);
 		u32 ioff;
@@ -4080,8 +4122,7 @@
 
 	btrfs_set_header_nritems(l, mid);
 	btrfs_item_key(right, &disk_key, 0);
-	insert_ptr(trans, fs_info, path, &disk_key, right->start,
-		   path->slots[1] + 1, 1);
+	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
 
 	btrfs_mark_buffer_dirty(right);
 	btrfs_mark_buffer_dirty(l);
@@ -4116,7 +4157,6 @@
 					  struct btrfs_path *path,
 					  int data_size)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int progress = 0;
 	int slot;
@@ -4125,7 +4165,7 @@
 
 	slot = path->slots[0];
 	if (slot < btrfs_header_nritems(path->nodes[0]))
-		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
+		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
 
 	/*
 	 * try to push all the items after our slot into the
@@ -4146,14 +4186,14 @@
 	if (path->slots[0] == 0 || path->slots[0] == nritems)
 		return 0;
 
-	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
+	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
 		return 0;
 
 	/* try to push all the items before our slot into the next leaf */
 	slot = path->slots[0];
 	space_needed = data_size;
 	if (slot > 0)
-		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
+		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
 	if (ret < 0)
 		return ret;
@@ -4202,7 +4242,7 @@
 		int space_needed = data_size;
 
 		if (slot < btrfs_header_nritems(l))
-			space_needed -= btrfs_leaf_free_space(fs_info, l);
+			space_needed -= btrfs_leaf_free_space(l);
 
 		wret = push_leaf_right(trans, root, path, space_needed,
 				       space_needed, 0, 0);
@@ -4211,8 +4251,7 @@
 		if (wret) {
 			space_needed = data_size;
 			if (slot > 0)
-				space_needed -= btrfs_leaf_free_space(fs_info,
-								      l);
+				space_needed -= btrfs_leaf_free_space(l);
 			wret = push_leaf_left(trans, root, path, space_needed,
 					      space_needed, 0, (u32)-1);
 			if (wret < 0)
@@ -4221,7 +4260,7 @@
 		l = path->nodes[0];
 
 		/* did the pushes work? */
-		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
+		if (btrfs_leaf_free_space(l) >= data_size)
 			return 0;
 	}
 
@@ -4279,8 +4318,8 @@
 	else
 		btrfs_item_key(l, &disk_key, mid);
 
-	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-			&disk_key, 0, l->start, 0);
+	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+					     l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
@@ -4289,7 +4328,7 @@
 	if (split == 0) {
 		if (mid <= slot) {
 			btrfs_set_header_nritems(right, 0);
-			insert_ptr(trans, fs_info, path, &disk_key,
+			insert_ptr(trans, path, &disk_key,
 				   right->start, path->slots[1] + 1, 1);
 			btrfs_tree_unlock(path->nodes[0]);
 			free_extent_buffer(path->nodes[0]);
@@ -4298,7 +4337,7 @@
 			path->slots[1] += 1;
 		} else {
 			btrfs_set_header_nritems(right, 0);
-			insert_ptr(trans, fs_info, path, &disk_key,
+			insert_ptr(trans, path, &disk_key,
 				   right->start, path->slots[1], 1);
 			btrfs_tree_unlock(path->nodes[0]);
 			free_extent_buffer(path->nodes[0]);
@@ -4315,7 +4354,7 @@
 		return ret;
 	}
 
-	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
+	copy_for_split(trans, path, l, right, slot, mid, nritems);
 
 	if (split == 2) {
 		BUG_ON(num_doubles != 0);
@@ -4328,7 +4367,7 @@
 push_for_double:
 	push_for_double_split(trans, root, path, data_size);
 	tried_avoid_double = 1;
-	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
+	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
 		return 0;
 	goto again;
 }
@@ -4337,7 +4376,6 @@
 					 struct btrfs_root *root,
 					 struct btrfs_path *path, int ins_len)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -4351,7 +4389,7 @@
 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
 	       key.type != BTRFS_EXTENT_CSUM_KEY);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
+	if (btrfs_leaf_free_space(leaf) >= ins_len)
 		return 0;
 
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -4378,7 +4416,7 @@
 		goto err;
 
 	/* the leaf has  changed, it now has room.  return now */
-	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
+	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
 		goto err;
 
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
@@ -4401,8 +4439,7 @@
 	return ret;
 }
 
-static noinline int split_item(struct btrfs_fs_info *fs_info,
-			       struct btrfs_path *path,
+static noinline int split_item(struct btrfs_path *path,
 			       const struct btrfs_key *new_key,
 			       unsigned long split_offset)
 {
@@ -4417,7 +4454,7 @@
 	struct btrfs_disk_key disk_key;
 
 	leaf = path->nodes[0];
-	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
+	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
 
 	btrfs_set_path_blocking(path);
 
@@ -4466,7 +4503,7 @@
 			    item_size - split_offset);
 	btrfs_mark_buffer_dirty(leaf);
 
-	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
+	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
 	kfree(buf);
 	return 0;
 }
@@ -4498,7 +4535,7 @@
 	if (ret)
 		return ret;
 
-	ret = split_item(root->fs_info, path, new_key, split_offset);
+	ret = split_item(path, new_key, split_offset);
 	return ret;
 }
 
@@ -4544,8 +4581,7 @@
  * off the end of the item or if we shift the item to chop bytes off
  * the front.
  */
-void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
-			 struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -4558,8 +4594,6 @@
 	int i;
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
-
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
@@ -4568,7 +4602,7 @@
 		return;
 
 	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(fs_info, leaf);
+	data_end = leaf_data_end(leaf);
 
 	old_data_start = btrfs_item_offset_nr(leaf, slot);
 
@@ -4581,6 +4615,7 @@
 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 	 */
 	/* first correct the data pointers */
+	btrfs_init_map_token(&token, leaf);
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 		item = btrfs_item_nr(i);
@@ -4634,7 +4669,7 @@
 	btrfs_set_item_size(leaf, item, new_size);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
 		BUG();
 	}
@@ -4643,8 +4678,7 @@
 /*
  * make the item pointed to by the path bigger, data_size is the added size.
  */
-void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-		       u32 data_size)
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -4656,14 +4690,12 @@
 	int i;
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
-
 	leaf = path->nodes[0];
 
 	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(fs_info, leaf);
+	data_end = leaf_data_end(leaf);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
+	if (btrfs_leaf_free_space(leaf) < data_size) {
 		btrfs_print_leaf(leaf);
 		BUG();
 	}
@@ -4673,15 +4705,16 @@
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
 		btrfs_print_leaf(leaf);
-		btrfs_crit(fs_info, "slot %d too large, nritems %d",
+		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
 			   slot, nritems);
-		BUG_ON(1);
+		BUG();
 	}
 
 	/*
 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 	 */
 	/* first correct the data pointers */
+	btrfs_init_map_token(&token, leaf);
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 		item = btrfs_item_nr(i);
@@ -4702,7 +4735,7 @@
 	btrfs_set_item_size(leaf, item, old_size + data_size);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
 		BUG();
 	}
@@ -4733,21 +4766,20 @@
 	}
 	btrfs_unlock_up_safe(path, 1);
 
-	btrfs_init_map_token(&token);
-
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
 	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(fs_info, leaf);
+	data_end = leaf_data_end(leaf);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
+	if (btrfs_leaf_free_space(leaf) < total_size) {
 		btrfs_print_leaf(leaf);
 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
-			   total_size, btrfs_leaf_free_space(fs_info, leaf));
+			   total_size, btrfs_leaf_free_space(leaf));
 		BUG();
 	}
 
+	btrfs_init_map_token(&token, leaf);
 	if (slot != nritems) {
 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
@@ -4755,7 +4787,7 @@
 			btrfs_print_leaf(leaf);
 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
 				   slot, old_data, data_end);
-			BUG_ON(1);
+			BUG();
 		}
 		/*
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
@@ -4795,7 +4827,7 @@
 	btrfs_set_header_nritems(leaf, nritems + nr);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
 		BUG();
 	}
@@ -4954,9 +4986,6 @@
 	int wret;
 	int i;
 	u32 nritems;
-	struct btrfs_map_token token;
-
-	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -4967,13 +4996,15 @@
 	nritems = btrfs_header_nritems(leaf);
 
 	if (slot + nr != nritems) {
-		int data_end = leaf_data_end(fs_info, leaf);
+		int data_end = leaf_data_end(leaf);
+		struct btrfs_map_token token;
 
 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
 			      data_end + dsize,
 			      BTRFS_LEAF_DATA_OFFSET + data_end,
 			      last_off - data_end);
 
+		btrfs_init_map_token(&token, leaf);
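+		/* Adjust the data offsets of the items whose data was just moved. */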
 		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
@@ -4997,7 +5028,7 @@
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			btrfs_set_path_blocking(path);
-			clean_tree_block(fs_info, leaf);
+			btrfs_clean_tree_block(leaf);
 			btrfs_del_leaf(trans, root, path, leaf);
 		}
 	} else {
@@ -5127,7 +5158,6 @@
 			 struct btrfs_path *path,
 			 u64 min_trans)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *cur;
 	struct btrfs_key found_key;
 	int slot;
@@ -5153,6 +5183,10 @@
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
 		sret = btrfs_bin_search(cur, min_key, level, &slot);
+		if (sret < 0) {
+			ret = sret;
+			goto out;
+		}
 
 		/* at the lowest level, we're done, setup the path and exit */
 		if (level == path->lowest_level) {
@@ -5204,7 +5238,7 @@
 			goto out;
 		}
 		btrfs_set_path_blocking(path);
-		cur = read_node_slot(fs_info, cur, slot);
+		cur = btrfs_read_node_slot(cur, slot);
 		if (IS_ERR(cur)) {
 			ret = PTR_ERR(cur);
 			goto out;
@@ -5215,7 +5249,6 @@
 		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1, 0, NULL);
-		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
 	path->keep_locks = keep_locks;
@@ -5227,373 +5260,6 @@
 	return ret;
 }
 
-static int tree_move_down(struct btrfs_fs_info *fs_info,
-			   struct btrfs_path *path,
-			   int *level)
-{
-	struct extent_buffer *eb;
-
-	BUG_ON(*level == 0);
-	eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
-	if (IS_ERR(eb))
-		return PTR_ERR(eb);
-
-	path->nodes[*level - 1] = eb;
-	path->slots[*level - 1] = 0;
-	(*level)--;
-	return 0;
-}
-
-static int tree_move_next_or_upnext(struct btrfs_path *path,
-				    int *level, int root_level)
-{
-	int ret = 0;
-	int nritems;
-	nritems = btrfs_header_nritems(path->nodes[*level]);
-
-	path->slots[*level]++;
-
-	while (path->slots[*level] >= nritems) {
-		if (*level == root_level)
-			return -1;
-
-		/* move upnext */
-		path->slots[*level] = 0;
-		free_extent_buffer(path->nodes[*level]);
-		path->nodes[*level] = NULL;
-		(*level)++;
-		path->slots[*level]++;
-
-		nritems = btrfs_header_nritems(path->nodes[*level]);
-		ret = 1;
-	}
-	return ret;
-}
-
-/*
- * Returns 1 if it had to move up and next. 0 is returned if it moved only next
- * or down.
- */
-static int tree_advance(struct btrfs_fs_info *fs_info,
-			struct btrfs_path *path,
-			int *level, int root_level,
-			int allow_down,
-			struct btrfs_key *key)
-{
-	int ret;
-
-	if (*level == 0 || !allow_down) {
-		ret = tree_move_next_or_upnext(path, level, root_level);
-	} else {
-		ret = tree_move_down(fs_info, path, level);
-	}
-	if (ret >= 0) {
-		if (*level == 0)
-			btrfs_item_key_to_cpu(path->nodes[*level], key,
-					path->slots[*level]);
-		else
-			btrfs_node_key_to_cpu(path->nodes[*level], key,
-					path->slots[*level]);
-	}
-	return ret;
-}
-
-static int tree_compare_item(struct btrfs_path *left_path,
-			     struct btrfs_path *right_path,
-			     char *tmp_buf)
-{
-	int cmp;
-	int len1, len2;
-	unsigned long off1, off2;
-
-	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
-	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
-	if (len1 != len2)
-		return 1;
-
-	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
-	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
-				right_path->slots[0]);
-
-	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
-
-	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
-	if (cmp)
-		return 1;
-	return 0;
-}
-
-#define ADVANCE 1
-#define ADVANCE_ONLY_NEXT -1
-
-/*
- * This function compares two trees and calls the provided callback for
- * every changed/new/deleted item it finds.
- * If shared tree blocks are encountered, whole subtrees are skipped, making
- * the compare pretty fast on snapshotted subvolumes.
- *
- * This currently works on commit roots only. As commit roots are read only,
- * we don't do any locking. The commit roots are protected with transactions.
- * Transactions are ended and rejoined when a commit is tried in between.
- *
- * This function checks for modifications done to the trees while comparing.
- * If it detects a change, it aborts immediately.
- */
-int btrfs_compare_trees(struct btrfs_root *left_root,
-			struct btrfs_root *right_root,
-			btrfs_changed_cb_t changed_cb, void *ctx)
-{
-	struct btrfs_fs_info *fs_info = left_root->fs_info;
-	int ret;
-	int cmp;
-	struct btrfs_path *left_path = NULL;
-	struct btrfs_path *right_path = NULL;
-	struct btrfs_key left_key;
-	struct btrfs_key right_key;
-	char *tmp_buf = NULL;
-	int left_root_level;
-	int right_root_level;
-	int left_level;
-	int right_level;
-	int left_end_reached;
-	int right_end_reached;
-	int advance_left;
-	int advance_right;
-	u64 left_blockptr;
-	u64 right_blockptr;
-	u64 left_gen;
-	u64 right_gen;
-
-	left_path = btrfs_alloc_path();
-	if (!left_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	right_path = btrfs_alloc_path();
-	if (!right_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
-	if (!tmp_buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	left_path->search_commit_root = 1;
-	left_path->skip_locking = 1;
-	right_path->search_commit_root = 1;
-	right_path->skip_locking = 1;
-
-	/*
-	 * Strategy: Go to the first items of both trees. Then do
-	 *
-	 * If both trees are at level 0
-	 *   Compare keys of current items
-	 *     If left < right treat left item as new, advance left tree
-	 *       and repeat
-	 *     If left > right treat right item as deleted, advance right tree
-	 *       and repeat
-	 *     If left == right do deep compare of items, treat as changed if
-	 *       needed, advance both trees and repeat
-	 * If both trees are at the same level but not at level 0
-	 *   Compare keys of current nodes/leafs
-	 *     If left < right advance left tree and repeat
-	 *     If left > right advance right tree and repeat
-	 *     If left == right compare blockptrs of the next nodes/leafs
-	 *       If they match advance both trees but stay at the same level
-	 *         and repeat
-	 *       If they don't match advance both trees while allowing to go
-	 *         deeper and repeat
-	 * If tree levels are different
-	 *   Advance the tree that needs it and repeat
-	 *
-	 * Advancing a tree means:
-	 *   If we are at level 0, try to go to the next slot. If that's not
-	 *   possible, go one level up and repeat. Stop when we found a level
-	 *   where we could go to the next slot. We may at this point be on a
-	 *   node or a leaf.
-	 *
-	 *   If we are not at level 0 and not on shared tree blocks, go one
-	 *   level deeper.
-	 *
-	 *   If we are not at level 0 and on shared tree blocks, go one slot to
-	 *   the right if possible or go up and right.
-	 */
-
-	down_read(&fs_info->commit_root_sem);
-	left_level = btrfs_header_level(left_root->commit_root);
-	left_root_level = left_level;
-	left_path->nodes[left_level] =
-			btrfs_clone_extent_buffer(left_root->commit_root);
-	if (!left_path->nodes[left_level]) {
-		up_read(&fs_info->commit_root_sem);
-		ret = -ENOMEM;
-		goto out;
-	}
-	extent_buffer_get(left_path->nodes[left_level]);
-
-	right_level = btrfs_header_level(right_root->commit_root);
-	right_root_level = right_level;
-	right_path->nodes[right_level] =
-			btrfs_clone_extent_buffer(right_root->commit_root);
-	if (!right_path->nodes[right_level]) {
-		up_read(&fs_info->commit_root_sem);
-		ret = -ENOMEM;
-		goto out;
-	}
-	extent_buffer_get(right_path->nodes[right_level]);
-	up_read(&fs_info->commit_root_sem);
-
-	if (left_level == 0)
-		btrfs_item_key_to_cpu(left_path->nodes[left_level],
-				&left_key, left_path->slots[left_level]);
-	else
-		btrfs_node_key_to_cpu(left_path->nodes[left_level],
-				&left_key, left_path->slots[left_level]);
-	if (right_level == 0)
-		btrfs_item_key_to_cpu(right_path->nodes[right_level],
-				&right_key, right_path->slots[right_level]);
-	else
-		btrfs_node_key_to_cpu(right_path->nodes[right_level],
-				&right_key, right_path->slots[right_level]);
-
-	left_end_reached = right_end_reached = 0;
-	advance_left = advance_right = 0;
-
-	while (1) {
-		if (advance_left && !left_end_reached) {
-			ret = tree_advance(fs_info, left_path, &left_level,
-					left_root_level,
-					advance_left != ADVANCE_ONLY_NEXT,
-					&left_key);
-			if (ret == -1)
-				left_end_reached = ADVANCE;
-			else if (ret < 0)
-				goto out;
-			advance_left = 0;
-		}
-		if (advance_right && !right_end_reached) {
-			ret = tree_advance(fs_info, right_path, &right_level,
-					right_root_level,
-					advance_right != ADVANCE_ONLY_NEXT,
-					&right_key);
-			if (ret == -1)
-				right_end_reached = ADVANCE;
-			else if (ret < 0)
-				goto out;
-			advance_right = 0;
-		}
-
-		if (left_end_reached && right_end_reached) {
-			ret = 0;
-			goto out;
-		} else if (left_end_reached) {
-			if (right_level == 0) {
-				ret = changed_cb(left_path, right_path,
-						&right_key,
-						BTRFS_COMPARE_TREE_DELETED,
-						ctx);
-				if (ret < 0)
-					goto out;
-			}
-			advance_right = ADVANCE;
-			continue;
-		} else if (right_end_reached) {
-			if (left_level == 0) {
-				ret = changed_cb(left_path, right_path,
-						&left_key,
-						BTRFS_COMPARE_TREE_NEW,
-						ctx);
-				if (ret < 0)
-					goto out;
-			}
-			advance_left = ADVANCE;
-			continue;
-		}
-
-		if (left_level == 0 && right_level == 0) {
-			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
-			if (cmp < 0) {
-				ret = changed_cb(left_path, right_path,
-						&left_key,
-						BTRFS_COMPARE_TREE_NEW,
-						ctx);
-				if (ret < 0)
-					goto out;
-				advance_left = ADVANCE;
-			} else if (cmp > 0) {
-				ret = changed_cb(left_path, right_path,
-						&right_key,
-						BTRFS_COMPARE_TREE_DELETED,
-						ctx);
-				if (ret < 0)
-					goto out;
-				advance_right = ADVANCE;
-			} else {
-				enum btrfs_compare_tree_result result;
-
-				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
-				ret = tree_compare_item(left_path, right_path,
-							tmp_buf);
-				if (ret)
-					result = BTRFS_COMPARE_TREE_CHANGED;
-				else
-					result = BTRFS_COMPARE_TREE_SAME;
-				ret = changed_cb(left_path, right_path,
-						 &left_key, result, ctx);
-				if (ret < 0)
-					goto out;
-				advance_left = ADVANCE;
-				advance_right = ADVANCE;
-			}
-		} else if (left_level == right_level) {
-			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
-			if (cmp < 0) {
-				advance_left = ADVANCE;
-			} else if (cmp > 0) {
-				advance_right = ADVANCE;
-			} else {
-				left_blockptr = btrfs_node_blockptr(
-						left_path->nodes[left_level],
-						left_path->slots[left_level]);
-				right_blockptr = btrfs_node_blockptr(
-						right_path->nodes[right_level],
-						right_path->slots[right_level]);
-				left_gen = btrfs_node_ptr_generation(
-						left_path->nodes[left_level],
-						left_path->slots[left_level]);
-				right_gen = btrfs_node_ptr_generation(
-						right_path->nodes[right_level],
-						right_path->slots[right_level]);
-				if (left_blockptr == right_blockptr &&
-				    left_gen == right_gen) {
-					/*
-					 * As we're on a shared block, don't
-					 * allow to go deeper.
-					 */
-					advance_left = ADVANCE_ONLY_NEXT;
-					advance_right = ADVANCE_ONLY_NEXT;
-				} else {
-					advance_left = ADVANCE;
-					advance_right = ADVANCE;
-				}
-			}
-		} else if (left_level < right_level) {
-			advance_right = ADVANCE;
-		} else {
-			advance_left = ADVANCE;
-		}
-	}
-
-out:
-	btrfs_free_path(left_path);
-	btrfs_free_path(right_path);
-	kvfree(tmp_buf);
-	return ret;
-}
-
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path.  It looks for and returns the next key in the
@@ -5611,7 +5277,7 @@
 	int slot;
 	struct extent_buffer *c;
 
-	WARN_ON(!path->keep_locks);
+	WARN_ON(!path->keep_locks && !path->skip_locking);
 	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
@@ -5627,7 +5293,7 @@
 			    !path->nodes[level + 1])
 				return 1;
 
-			if (path->locks[level + 1]) {
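+			/*
+			 * When locking is skipped no locks are recorded, so
+			 * keep walking up as if the level were locked.
+			 */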
+			if (path->locks[level + 1] || path->skip_locking) {
 				level++;
 				continue;
 			}
@@ -5800,8 +5466,6 @@
 			if (!ret) {
 				btrfs_set_path_blocking(path);
 				btrfs_tree_read_lock(next);
-				btrfs_clear_path_blocking(path, next,
-							  BTRFS_READ_LOCK);
 			}
 			next_rw_lock = BTRFS_READ_LOCK;
 		}
@@ -5837,8 +5501,6 @@
 			if (!ret) {
 				btrfs_set_path_blocking(path);
 				btrfs_tree_read_lock(next);
-				btrfs_clear_path_blocking(path, next,
-							  BTRFS_READ_LOCK);
 			}
 			next_rw_lock = BTRFS_READ_LOCK;
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7..fe2b876 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,9 +16,9 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
-#include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <asm/unaligned.h>
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>
 #include <linux/btrfs_tree.h>
@@ -31,25 +31,34 @@
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
+#include "block-rsv.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
 struct btrfs_pending_snapshot;
+struct btrfs_delayed_ref_root;
+struct btrfs_space_info;
+struct btrfs_block_group_cache;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
+extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
 struct btrfs_ordered_sum;
-
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define STATIC noinline
-#else
-#define STATIC static noinline
-#endif
+struct btrfs_ref;
 
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
-#define BTRFS_MAX_MIRRORS 3
+/*
+ * Maximum number of mirrors that can be available for all profiles counting
+ * the target device of dev-replace as one. During an active device replace
+ * procedure, the target device of the copy operation is a mirror for the
+ * filesystem data as well that can be used to read data in order to repair
+ * read errors on other disks.
+ *
+ * Current value is derived from RAID1 with 2 copies.
+ */
+#define BTRFS_MAX_MIRRORS (2 + 1)
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -74,9 +83,6 @@
  */
 #define BTRFS_LINK_MAX 65535U
 
-/* four bytes for CRC32 */
-static const int btrfs_csum_sizes[] = { 4 };
-
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 /* ioprio of readahead is set to idle */
@@ -103,10 +109,6 @@
 	return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
 }
 
-struct btrfs_mapping_tree {
-	struct extent_map_tree map_tree;
-};
-
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
 	BUG_ON(num_stripes == 0);
@@ -115,13 +117,26 @@
 }
 
 /*
- * File system states
+ * Runtime (in-memory) states of filesystem
  */
-#define BTRFS_FS_STATE_ERROR		0
-#define BTRFS_FS_STATE_REMOUNTING	1
-#define BTRFS_FS_STATE_TRANS_ABORTED	2
-#define BTRFS_FS_STATE_DEV_REPLACING	3
-#define BTRFS_FS_STATE_DUMMY_FS_INFO	4
+enum {
+	/* Global indicator of serious filesystem errors */
+	BTRFS_FS_STATE_ERROR,
+	/*
+	 * Filesystem is being remounted, allow to skip some operations, like
+	 * defrag
+	 */
+	BTRFS_FS_STATE_REMOUNTING,
+	/* Track if a transaction abort has been reported on this filesystem */
+	BTRFS_FS_STATE_TRANS_ABORTED,
+	/*
+	 * Bio operations should be blocked on this filesystem because a source
+	 * or target device is being destroyed as part of a device replace
+	 */
+	BTRFS_FS_STATE_DEV_REPLACING,
+	/* The btrfs_fs_info created for self-tests */
+	BTRFS_FS_STATE_DUMMY_FS_INFO,
+};
 
 #define BTRFS_BACKREF_REV_MAX		256
 #define BTRFS_BACKREF_REV_SHIFT		56
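The #define-to-enum conversion above is a readability change only: the constants keep their values 0..4 in declaration order and remain plain bit numbers for the bitops on fs_info->fs_state. A hypothetical caller is untouched by it:

    static int check_fs_usable(struct btrfs_fs_info *fs_info)
    {
            /* same bitop as before; only the constant's definition moved */
            if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                    return -EROFS;
            return 0;
    }
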
@@ -201,9 +216,10 @@
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
-	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 4 fields must match struct btrfs_header */
-	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
+	u8 csum[BTRFS_CSUM_SIZE];
+	/* FS specific UUID, visible to user */
+	u8 fsid[BTRFS_FSID_SIZE];
 	__le64 bytenr; /* this block number */
 	__le64 flags;
 
@@ -240,8 +256,11 @@
 	__le64 cache_generation;
 	__le64 uuid_tree_generation;
 
+	/* the UUID written into btree blocks */
+	u8 metadata_uuid[BTRFS_FSID_SIZE];
+
 	/* future expansion */
-	__le64 reserved[30];
+	__le64 reserved[28];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
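The new metadata_uuid field is carved out of the reserved area: its 16 bytes equal exactly the two __le64 slots given up (reserved[30] becomes reserved[28]), so the packed superblock keeps its on-disk size and every later field stays at the same offset. A compile-time guard for that invariant could look like this (an assumption for illustration, not part of the patch):

    /* metadata_uuid must consume exactly the reserved slots it replaced */
    BUILD_BUG_ON(BTRFS_FSID_SIZE != 2 * sizeof(__le64));
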
@@ -271,7 +290,8 @@
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
 	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
-	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
+	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
 
 #define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
 	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -322,7 +342,7 @@
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD };
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
@@ -366,122 +386,12 @@
 	struct btrfs_device *tgtdev;
 
 	struct mutex lock_finishing_cancel_unmount;
-	rwlock_t lock;
-	atomic_t read_locks;
-	atomic_t blocking_readers;
-	wait_queue_head_t read_lock_wq;
+	struct rw_semaphore rwsem;
 
 	struct btrfs_scrub_progress scrub_progress;
-};
 
-/* For raid type sysfs entries */
-struct raid_kobject {
-	u64 flags;
-	struct kobject kobj;
-	struct list_head list;
-};
-
-struct btrfs_space_info {
-	spinlock_t lock;
-
-	u64 total_bytes;	/* total bytes in the space,
-				   this doesn't take mirrors into account */
-	u64 bytes_used;		/* total bytes used,
-				   this doesn't take mirrors into account */
-	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
-				   transaction finishes */
-	u64 bytes_reserved;	/* total bytes the allocator has reserved for
-				   current allocations */
-	u64 bytes_may_use;	/* number of bytes that may be used for
-				   delalloc/allocations */
-	u64 bytes_readonly;	/* total bytes that are read only */
-
-	u64 max_extent_size;	/* This will hold the maximum extent size of
-				   the space info if we had an ENOSPC in the
-				   allocator. */
-
-	unsigned int full:1;	/* indicates that we cannot allocate any more
-				   chunks for this space */
-	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
-
-	unsigned int flush:1;		/* set if we are trying to make space */
-
-	unsigned int force_alloc;	/* set if we need to force a chunk
-					   alloc for this space */
-
-	u64 disk_used;		/* total bytes used on disk */
-	u64 disk_total;		/* total bytes on disk, takes mirrors into
-				   account */
-
-	u64 flags;
-
-	/*
-	 * bytes_pinned is kept in line with what is actually pinned, as in
-	 * we've called update_block_group and dropped the bytes_used counter
-	 * and increased the bytes_pinned counter.  However this means that
-	 * bytes_pinned does not reflect the bytes that will be pinned once the
-	 * delayed refs are flushed, so this counter is inc'ed every time we
-	 * call btrfs_free_extent so it is a realtime count of what will be
-	 * freed once the transaction is committed.  It will be zeroed every
-	 * time the transaction commits.
-	 */
-	struct percpu_counter total_bytes_pinned;
-
-	struct list_head list;
-	/* Protected by the spinlock 'lock'. */
-	struct list_head ro_bgs;
-	struct list_head priority_tickets;
-	struct list_head tickets;
-	/*
-	 * tickets_id just indicates the next ticket will be handled, so note
-	 * it's not stored per ticket.
-	 */
-	u64 tickets_id;
-
-	struct rw_semaphore groups_sem;
-	/* for block groups in our same type */
-	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
-	wait_queue_head_t wait;
-
-	struct kobject kobj;
-	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
-};
-
-#define	BTRFS_BLOCK_RSV_GLOBAL		1
-#define	BTRFS_BLOCK_RSV_DELALLOC	2
-#define	BTRFS_BLOCK_RSV_TRANS		3
-#define	BTRFS_BLOCK_RSV_CHUNK		4
-#define	BTRFS_BLOCK_RSV_DELOPS		5
-#define	BTRFS_BLOCK_RSV_EMPTY		6
-#define	BTRFS_BLOCK_RSV_TEMP		7
-
-struct btrfs_block_rsv {
-	u64 size;
-	u64 reserved;
-	struct btrfs_space_info *space_info;
-	spinlock_t lock;
-	unsigned short full;
-	unsigned short type;
-	unsigned short failfast;
-
-	/*
-	 * Qgroup equivalent for @size @reserved
-	 *
-	 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
-	 * about things like csum size nor how many tree blocks it will need to
-	 * reserve.
-	 *
-	 * Qgroup cares more about net change of the extent usage.
-	 *
-	 * So for one newly inserted file extent, in worst case it will cause
-	 * leaf split and level increase, nodesize for each file extent is
-	 * already too much.
-	 *
-	 * In short, qgroup_size/reserved is the upper limit of possible needed
-	 * qgroup metadata reservation.
-	 */
-	u64 qgroup_rsv_size;
-	u64 qgroup_rsv_reserved;
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
 };
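Most of what this hunk deletes was relocated rather than removed: struct btrfs_block_rsv moved to the new block-rsv.h (now included at the top of this header), struct btrfs_space_info moved to space-info.h, and the bio_counter/replace_wait pair reappears inside struct btrfs_dev_replace just above (the fs_info copies are removed in a later hunk). The following hunks similarly move struct btrfs_block_group_cache and btrfs_caching_control to the new block-group.h. The dev-replace lock itself collapses from an rwlock plus reader and blocking-reader counters into a single rw_semaphore; a hypothetical reader under the new scheme (replace_state is a btrfs_dev_replace field assumed from fs/btrfs/dev-replace.c):

    static bool replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
    {
            bool ongoing;

            down_read(&dev_replace->rwsem);
            ongoing = dev_replace->replace_state ==
                      BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
            up_read(&dev_replace->rwsem);
            return ongoing;
    }
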
 
 /*
@@ -513,45 +423,11 @@
 };
 
 enum btrfs_caching_type {
-	BTRFS_CACHE_NO		= 0,
-	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FAST	= 2,
-	BTRFS_CACHE_FINISHED	= 3,
-	BTRFS_CACHE_ERROR	= 4,
-};
-
-enum btrfs_disk_cache_state {
-	BTRFS_DC_WRITTEN	= 0,
-	BTRFS_DC_ERROR		= 1,
-	BTRFS_DC_CLEAR		= 2,
-	BTRFS_DC_SETUP		= 3,
-};
-
-struct btrfs_caching_control {
-	struct list_head list;
-	struct mutex mutex;
-	wait_queue_head_t wait;
-	struct btrfs_work work;
-	struct btrfs_block_group_cache *block_group;
-	u64 progress;
-	refcount_t count;
-};
-
-/* Once caching_thread() finds this much free space, it will wake up waiters. */
-#define CACHING_CTL_WAKE_UP SZ_2M
-
-struct btrfs_io_ctl {
-	void *cur, *orig;
-	struct page *page;
-	struct page **pages;
-	struct btrfs_fs_info *fs_info;
-	struct inode *inode;
-	unsigned long size;
-	int index;
-	int num_pages;
-	int entries;
-	int bitmaps;
-	unsigned check_crcs:1;
+	BTRFS_CACHE_NO,
+	BTRFS_CACHE_STARTED,
+	BTRFS_CACHE_FAST,
+	BTRFS_CACHE_FINISHED,
+	BTRFS_CACHE_ERROR,
 };
 
 /*
@@ -562,120 +438,6 @@
 	struct mutex lock;
 };
 
-struct btrfs_block_group_cache {
-	struct btrfs_key key;
-	struct btrfs_block_group_item item;
-	struct btrfs_fs_info *fs_info;
-	struct inode *inode;
-	spinlock_t lock;
-	u64 pinned;
-	u64 reserved;
-	u64 delalloc_bytes;
-	u64 bytes_super;
-	u64 flags;
-	u64 cache_generation;
-
-	/*
-	 * If the free space extent count exceeds this number, convert the block
-	 * group to bitmaps.
-	 */
-	u32 bitmap_high_thresh;
-
-	/*
-	 * If the free space extent count drops below this number, convert the
-	 * block group back to extents.
-	 */
-	u32 bitmap_low_thresh;
-
-	/*
-	 * It is just used for the delayed data space allocation because
-	 * only the data space allocation and the relative metadata update
-	 * can be done cross the transaction.
-	 */
-	struct rw_semaphore data_rwsem;
-
-	/* for raid56, this is a full stripe, without parity */
-	unsigned long full_stripe_len;
-
-	unsigned int ro;
-	unsigned int iref:1;
-	unsigned int has_caching_ctl:1;
-	unsigned int removed:1;
-
-	int disk_cache_state;
-
-	/* cache tracking stuff */
-	int cached;
-	struct btrfs_caching_control *caching_ctl;
-	u64 last_byte_to_unpin;
-
-	struct btrfs_space_info *space_info;
-
-	/* free space cache stuff */
-	struct btrfs_free_space_ctl *free_space_ctl;
-
-	/* block group cache stuff */
-	struct rb_node cache_node;
-
-	/* for block groups in the same raid type */
-	struct list_head list;
-
-	/* usage count */
-	atomic_t count;
-
-	/* List of struct btrfs_free_clusters for this block group.
-	 * Today it will only have one thing on it, but that may change
-	 */
-	struct list_head cluster_list;
-
-	/* For delayed block group creation or deletion of empty block groups */
-	struct list_head bg_list;
-
-	/* For read-only block groups */
-	struct list_head ro_list;
-
-	atomic_t trimming;
-
-	/* For dirty block groups */
-	struct list_head dirty_list;
-	struct list_head io_list;
-
-	struct btrfs_io_ctl io_ctl;
-
-	/*
-	 * Incremented when doing extent allocations and holding a read lock
-	 * on the space_info's groups_sem semaphore.
-	 * Decremented when an ordered extent that represents an IO against this
-	 * block group's range is created (after it's added to its inode's
-	 * root's list of ordered extents) or immediately after the allocation
-	 * if it's a metadata extent or fallocate extent (for these cases we
-	 * don't create ordered extents).
-	 */
-	atomic_t reservations;
-
-	/*
-	 * Incremented while holding the spinlock *lock* by a task checking if
-	 * it can perform a nocow write (incremented if the value for the *ro*
-	 * field is 0). Decremented by such tasks once they create an ordered
-	 * extent or before that if some error happens before reaching that step.
-	 * This is to prevent races between block group relocation and nocow
-	 * writes through direct IO.
-	 */
-	atomic_t nocow_writers;
-
-	/* Lock for free space tree operations. */
-	struct mutex free_space_lock;
-
-	/*
-	 * Does the block group need to be added to the free space tree?
-	 * Protected by free_space_lock.
-	 */
-	int needs_free_space;
-
-	/* Record locked full stripes for RAID5/6 block group */
-	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
-};
-
 /* delayed seq elem */
 struct seq_list {
 	struct list_head list;
@@ -691,22 +453,6 @@
 	ORPHAN_CLEANUP_DONE	= 2,
 };
 
-/* used by the raid56 code to lock stripes for read/modify/write */
-struct btrfs_stripe_hash {
-	struct list_head hash_list;
-	spinlock_t lock;
-};
-
-/* used by the raid56 code to lock stripes for read/modify/write */
-struct btrfs_stripe_hash_table {
-	struct list_head stripe_cache;
-	spinlock_t cache_lock;
-	int cache_size;
-	struct btrfs_stripe_hash table[];
-};
-
-#define BTRFS_STRIPE_HASH_TABLE_BITS 11
-
 void btrfs_init_async_reclaim_work(struct work_struct *work);
 
 /* fs_info */
@@ -716,41 +462,71 @@
 struct btrfs_balance_control;
 struct btrfs_delayed_root;
 
-#define BTRFS_FS_BARRIER			1
-#define BTRFS_FS_CLOSING_START			2
-#define BTRFS_FS_CLOSING_DONE			3
-#define BTRFS_FS_LOG_RECOVERING			4
-#define BTRFS_FS_OPEN				5
-#define BTRFS_FS_QUOTA_ENABLED			6
-#define BTRFS_FS_UPDATE_UUID_TREE_GEN		9
-#define BTRFS_FS_CREATING_FREE_SPACE_TREE	10
-#define BTRFS_FS_BTREE_ERR			11
-#define BTRFS_FS_LOG1_ERR			12
-#define BTRFS_FS_LOG2_ERR			13
-#define BTRFS_FS_QUOTA_OVERRIDE			14
-/* Used to record internally whether fs has been frozen */
-#define BTRFS_FS_FROZEN				15
-
 /*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
  */
-#define BTRFS_FS_EXCL_OP			16
+struct btrfs_swapfile_pin {
+	struct rb_node node;
+	void *ptr;
+	struct inode *inode;
+	/*
+	 * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
+	 * ptr points to a struct btrfs_device.
+	 */
+	bool is_block_group;
+};
 
-/*
- * To info transaction_kthread we need an immediate commit so it doesn't
- * need to wait for commit_interval
- */
-#define BTRFS_FS_NEED_ASYNC_COMMIT		17
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
 
-/*
- * Indicate that balance has been set up from the ioctl and is in the main
- * phase. The fs_info::balance_ctl is initialized.
- */
-#define BTRFS_FS_BALANCE_RUNNING		18
+enum {
+	BTRFS_FS_BARRIER,
+	BTRFS_FS_CLOSING_START,
+	BTRFS_FS_CLOSING_DONE,
+	BTRFS_FS_LOG_RECOVERING,
+	BTRFS_FS_OPEN,
+	BTRFS_FS_QUOTA_ENABLED,
+	BTRFS_FS_UPDATE_UUID_TREE_GEN,
+	BTRFS_FS_CREATING_FREE_SPACE_TREE,
+	BTRFS_FS_BTREE_ERR,
+	BTRFS_FS_LOG1_ERR,
+	BTRFS_FS_LOG2_ERR,
+	BTRFS_FS_QUOTA_OVERRIDE,
+	/* Used to record internally whether fs has been frozen */
+	BTRFS_FS_FROZEN,
+	/*
+	 * Indicate that a whole-filesystem exclusive operation is running
+	 * (device replace, resize, device add/delete, balance)
+	 */
+	BTRFS_FS_EXCL_OP,
+	/*
+	 * To info transaction_kthread we need an immediate commit so it
+	 * doesn't need to wait for commit_interval
+	 */
+	BTRFS_FS_NEED_ASYNC_COMMIT,
+	/*
+	 * Indicate that balance has been set up from the ioctl and is in the
+	 * main phase. The fs_info::balance_ctl is initialized.
+	 * Set and cleared while holding fs_info::balance_mutex.
+	 */
+	BTRFS_FS_BALANCE_RUNNING,
+
+	/* Indicate that the cleaner thread is awake and doing something. */
+	BTRFS_FS_CLEANER_RUNNING,
+
+	/*
+	 * The checksumming has an optimized version and is considered fast,
+	 * so we don't need to offload checksums to workqueues.
+	 */
+	BTRFS_FS_CSUM_IMPL_FAST,
+};
 
 struct btrfs_fs_info {
-	u8 fsid[BTRFS_FSID_SIZE];
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	unsigned long flags;
 	struct btrfs_root *extent_root;
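Two things happen at the top of btrfs_fs_info here: the private fsid copy is dropped (callers now go through fs_info->fs_devices, which also understands metadata_uuid), and the swapfile pin tree described above gets its home. The (ptr, inode) ordering keeps lookups to plain pointer comparisons; a sketch of the comparator (the real one lives with the swapfile code in fs/btrfs/inode.c):

    static int swapfile_pin_cmp(const struct btrfs_swapfile_pin *a,
                                const struct btrfs_swapfile_pin *b)
    {
            if (a->ptr != b->ptr)
                    return a->ptr < b->ptr ? -1 : 1;        /* primary key */
            if (a->inode != b->inode)
                    return a->inode < b->inode ? -1 : 1;    /* tie-breaker */
            return 0;
    }
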
@@ -781,7 +557,7 @@
 	struct extent_io_tree *pinned_extents;
 
 	/* logical->physical extent mapping */
-	struct btrfs_mapping_tree mapping_tree;
+	struct extent_map_tree mapping_tree;
 
 	/*
 	 * block reservation for extent, checksum, root tree and
@@ -794,6 +570,8 @@
 	struct btrfs_block_rsv chunk_block_rsv;
 	/* block reservation for delayed operations */
 	struct btrfs_block_rsv delayed_block_rsv;
+	/* block reservation for delayed refs */
+	struct btrfs_block_rsv delayed_refs_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
@@ -890,7 +668,8 @@
 
 	spinlock_t delayed_iput_lock;
 	struct list_head delayed_iputs;
-	struct mutex cleaner_delayed_iput_mutex;
+	atomic_t nr_delayed_iputs;
+	wait_queue_head_t delayed_iputs_wait;
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
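The cleaner_delayed_iput_mutex is replaced by a counter and a waitqueue, so ENOSPC flushing can wait for pending iputs to drain without serializing against the cleaner thread. The waiter side, matching the btrfs_wait_on_delayed_iputs() declaration added near the end of this header, mirrors the v5.4 implementation in fs/btrfs/inode.c:

    int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
    {
            int ret = wait_event_killable(fs_info->delayed_iputs_wait,
                            atomic_read(&fs_info->nr_delayed_iputs) == 0);
            if (ret)
                    return -EINTR;
            return 0;
    }
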
@@ -955,21 +734,18 @@
 	struct btrfs_workqueue *fixup_workers;
 	struct btrfs_workqueue *delayed_workers;
 
-	/* the extent workers do delayed refs on the extent allocation tree */
-	struct btrfs_workqueue *extent_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	u32 thread_pool_size;
 
 	struct kobject *space_info_kobj;
-	struct list_head pending_raid_kobjs;
-	spinlock_t pending_raid_kobjs_lock; /* uncontended */
 
 	u64 total_pinned;
 
 	/* used to keep from writing metadata until there is a nice batch */
 	struct percpu_counter dirty_metadata_bytes;
 	struct percpu_counter delalloc_bytes;
+	struct percpu_counter dio_bytes;
 	s32 dirty_metadata_batch;
 	s32 delalloc_batch;
 
@@ -1030,10 +806,13 @@
 	atomic_t scrubs_paused;
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
-	int scrub_workers_refcnt;
+	/*
+	 * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+	 * running.
+	 */
+	refcount_t scrub_workers_refcnt;
 	struct btrfs_workqueue *scrub_workers;
 	struct btrfs_workqueue *scrub_wr_completion_workers;
-	struct btrfs_workqueue *scrub_nocow_workers;
 	struct btrfs_workqueue *scrub_parity_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1044,10 +823,7 @@
 
 	/* holds configuration and tracking. Protected by qgroup_lock */
 	struct rb_root qgroup_tree;
-	struct rb_root qgroup_op_tree;
 	spinlock_t qgroup_lock;
-	spinlock_t qgroup_op_lock;
-	atomic_t qgroup_op_seq;
 
 	/*
 	 * used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1094,9 +870,6 @@
 	/* device replace state */
 	struct btrfs_dev_replace dev_replace;
 
-	struct percpu_counter bio_counter;
-	wait_queue_head_t replace_wait;
-
 	struct semaphore uuid_tree_rescan_sem;
 
 	/* Used to reclaim the metadata space in the background. */
@@ -1107,20 +880,23 @@
 	struct mutex unused_bg_unpin_mutex;
 	struct mutex delete_unused_bgs_mutex;
 
-	/* For btrfs to record security options */
-	struct security_mnt_opts security_opts;
-
-	/*
-	 * Chunks that can't be freed yet (under a trim/discard operation)
-	 * and will be latter freed. Protected by fs_info->chunk_mutex.
-	 */
-	struct list_head pinned_chunks;
-
 	/* Cached block sizes */
 	u32 nodesize;
 	u32 sectorsize;
 	u32 stripesize;
 
+	/* Block groups and devices containing active swapfiles. */
+	spinlock_t swapfile_pins_lock;
+	struct rb_root swapfile_pins;
+
+	struct crypto_shash *csum_shash;
+
+	/*
+	 * Number of send operations in progress.
+	 * Updated while holding fs_info::balance_mutex.
+	 */
+	int send_in_progress;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	spinlock_t ref_verify_lock;
 	struct rb_root block_tree;
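The csum_shash handle added above is allocated once at mount from the checksum type recorded in the superblock; v5.4 still only wires up crc32c, but this is the hook later checksum algorithms plug into. A sketch of the setup, mirroring btrfs_init_csum_hash() in fs/btrfs/disk-io.c and using the btrfs_super_csum_name() helper declared further down:

    static int init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
    {
            struct crypto_shash *csum_shash;

            csum_shash = crypto_alloc_shash(btrfs_super_csum_name(csum_type),
                                            0, 0);
            if (IS_ERR(csum_shash))
                    return PTR_ERR(csum_shash);
            fs_info->csum_shash = csum_shash;
            return 0;
    }
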
@@ -1140,22 +916,44 @@
 /*
  * The state of btrfs root
  */
+enum {
+	/*
+	 * btrfs_record_root_in_trans is a multi-step process, and it can race
+	 * with the balancing code.   But the race is very small, and only the
+	 * first time the root is added to each transaction.  So IN_TRANS_SETUP
+	 * is used to tell us when more checks are required
+	 */
+	BTRFS_ROOT_IN_TRANS_SETUP,
+	BTRFS_ROOT_REF_COWS,
+	BTRFS_ROOT_TRACK_DIRTY,
+	BTRFS_ROOT_IN_RADIX,
+	BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+	BTRFS_ROOT_DEFRAG_RUNNING,
+	BTRFS_ROOT_FORCE_COW,
+	BTRFS_ROOT_MULTI_LOG_TASKS,
+	BTRFS_ROOT_DIRTY,
+	BTRFS_ROOT_DELETING,
+
+	/*
+	 * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
+	 *
+	 * Set for the subvolume tree owning the reloc tree.
+	 */
+	BTRFS_ROOT_DEAD_RELOC_TREE,
+	/* Mark dead root stored on device whose cleanup needs to be resumed */
+	BTRFS_ROOT_DEAD_TREE,
+};
+
 /*
- * btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code.   But the
- * race is very small, and only the first time the root
- * is added to each transaction.  So IN_TRANS_SETUP
- * is used to tell us when more checks are required
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
  */
-#define BTRFS_ROOT_IN_TRANS_SETUP	0
-#define BTRFS_ROOT_REF_COWS		1
-#define BTRFS_ROOT_TRACK_DIRTY		2
-#define BTRFS_ROOT_IN_RADIX		3
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	4
-#define BTRFS_ROOT_DEFRAG_RUNNING	5
-#define BTRFS_ROOT_FORCE_COW		6
-#define BTRFS_ROOT_MULTI_LOG_TASKS	7
-#define BTRFS_ROOT_DIRTY		8
+struct btrfs_qgroup_swapped_blocks {
+	spinlock_t lock;
+	/* RM_EMPTY_ROOT() of above blocks[] */
+	bool swapped;
+	struct rb_root blocks[BTRFS_MAX_LEVEL];
+};
 
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
@@ -1202,18 +1000,12 @@
 	int last_log_commit;
 	pid_t log_start_pid;
 
-	u64 objectid;
 	u64 last_trans;
 
 	u32 type;
 
 	u64 highest_objectid;
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
-	u64 alloc_bytenr;
-#endif
-
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
@@ -1274,10 +1066,24 @@
 	u64 nr_ordered_extents;
 
 	/*
+	 * Not empty if this subvolume root has gone through tree block swap
+	 * (relocation)
+	 *
+	 * Will be used by reloc_control::dirty_subvol_roots.
+	 */
+	struct list_head reloc_dirty_list;
+
+	/*
 	 * Number of currently running SEND ioctls to prevent
 	 * manipulation with the read-only status via SUBVOL_SETFLAGS
 	 */
 	int send_in_progress;
+	/*
+	 * Number of currently running deduplication operations that have a
+	 * destination inode belonging to this root. Protected by the lock
+	 * root_item_lock.
+	 */
+	int dedupe_in_progress;
 	struct btrfs_subvolume_writers *subv_writers;
 	atomic_t will_be_snapshotted;
 	atomic_t snapshot_force_cow;
@@ -1286,6 +1092,26 @@
 	spinlock_t qgroup_meta_rsv_lock;
 	u64 qgroup_meta_rsv_pertrans;
 	u64 qgroup_meta_rsv_prealloc;
+
+	/* Number of active swapfiles */
+	atomic_t nr_swapfiles;
+
+	/* Record pairs of swapped blocks for qgroup */
+	struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	u64 alloc_bytenr;
+#endif
+};
+
+struct btrfs_clone_extent_info {
+	u64 disk_offset;
+	u64 disk_len;
+	u64 data_offset;
+	u64 data_len;
+	u64 file_offset;
+	char *extent_buf;
+	u32 item_size;
 };
 
 struct btrfs_file_private {
@@ -1386,19 +1212,6 @@
 	btrfs_clear_opt(fs_info->mount_opt, opt);			\
 }
 
-#ifdef CONFIG_BTRFS_DEBUG
-static inline int
-btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group)
-{
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-
-	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
-		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
-	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
-		block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
-}
-#endif
-
 /*
  * Requests for changes that need to be done during transaction commit.
  *
@@ -1460,6 +1273,21 @@
 
 #define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
 
+#define BTRFS_INODE_FLAG_MASK						\
+	(BTRFS_INODE_NODATASUM |					\
+	 BTRFS_INODE_NODATACOW |					\
+	 BTRFS_INODE_READONLY |						\
+	 BTRFS_INODE_NOCOMPRESS |					\
+	 BTRFS_INODE_PREALLOC |						\
+	 BTRFS_INODE_SYNC |						\
+	 BTRFS_INODE_IMMUTABLE |					\
+	 BTRFS_INODE_APPEND |						\
+	 BTRFS_INODE_NODUMP |						\
+	 BTRFS_INODE_NOATIME |						\
+	 BTRFS_INODE_DIRSYNC |						\
+	 BTRFS_INODE_COMPRESS |						\
+	 BTRFS_INODE_ROOT_ITEM_INIT)
+
 struct btrfs_map_token {
 	const struct extent_buffer *eb;
 	char *kaddr;
@@ -1469,8 +1297,10 @@
 #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
 				((bytes) >> (fs_info)->sb->s_blocksize_bits)
 
-static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+static inline void btrfs_init_map_token(struct btrfs_map_token *token,
+					struct extent_buffer *eb)
 {
+	token->eb = eb;
 	token->kaddr = NULL;
 }
 
@@ -1501,17 +1331,10 @@
 void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr,	\
 			    unsigned long off, u##bits val,		\
 			    struct btrfs_map_token *token);		\
-static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb,	\
-				       const void *ptr,			\
-				       unsigned long off)		\
-{									\
-	return btrfs_get_token_##bits(eb, ptr, off, NULL);		\
-}									\
-static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\
-				    unsigned long off, u##bits val)	\
-{									\
-       btrfs_set_token_##bits(eb, ptr, off, val, NULL);			\
-}
+u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
+			 const void *ptr, unsigned long off);		\
+void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,		\
+		      unsigned long off, u##bits val);
 
 DECLARE_BTRFS_SETGET_BITS(8)
 DECLARE_BTRFS_SETGET_BITS(16)
@@ -2053,16 +1876,6 @@
 	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline u8 btrfs_key_type(const struct btrfs_key *key)
-{
-	return key->type;
-}
-
-static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
-{
-	key->type = val;
-}
-
 /* struct btrfs_header */
 BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -2083,18 +1896,16 @@
 	return (btrfs_header_flags(eb) & flag) == flag;
 }
 
-static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
 {
 	u64 flags = btrfs_header_flags(eb);
 	btrfs_set_header_flags(eb, flags | flag);
-	return (flags & flag) == flag;
 }
 
-static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 {
 	u64 flags = btrfs_header_flags(eb);
 	btrfs_set_header_flags(eb, flags & ~flag);
-	return (flags & flag) == flag;
 }
 
 static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
@@ -2350,28 +2161,20 @@
 BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
 			 uuid_tree_generation, 64);
 
-static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
-{
-	u16 t = btrfs_super_csum_type(s);
-	/*
-	 * csum type is validated at mount time
-	 */
-	return btrfs_csum_sizes[t];
-}
-
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
 
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
  * which is the stop of the leaf data stack
  */
-static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info,
-					 const struct extent_buffer *leaf)
+static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
 {
 	u32 nr = btrfs_header_nritems(leaf);
 
 	if (nr == 0)
-		return BTRFS_LEAF_DATA_SIZE(fs_info);
+		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
@@ -2432,30 +2235,6 @@
 	return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
 
-/* btrfs_dev_stats_item */
-static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
-					const struct btrfs_dev_stats_item *ptr,
-					int index)
-{
-	u64 val;
-
-	read_extent_buffer(eb, &val,
-			   offsetof(struct btrfs_dev_stats_item, values) +
-			    ((unsigned long)ptr) + (index * sizeof(u64)),
-			   sizeof(val));
-	return val;
-}
-
-static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
-					     struct btrfs_dev_stats_item *ptr,
-					     int index, u64 val)
-{
-	write_extent_buffer(eb, &val,
-			    offsetof(struct btrfs_dev_stats_item, values) +
-			     ((unsigned long)ptr) + (index * sizeof(u64)),
-			    sizeof(val));
-}
-
 /* btrfs_qgroup_status_item */
 BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
 		   generation, 64);
@@ -2551,6 +2330,16 @@
 	((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
 	btrfs_item_offset_nr(leaf, slot)))
 
+static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
+{
+	return crc32c(crc, address, length);
+}
+
+static inline void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+	put_unaligned_le32(~crc, result);
+}
+
 static inline u64 btrfs_name_hash(const char *name, int len)
 {
        return crc32c((u32)~1, name, len);
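btrfs_crc32c()/btrfs_crc32c_final() keep an open-coded crc32c path for users that do not go through the new csum_shash, such as the free-space-cache checksums. The usual data-checksum pattern seeds with ~0, folds in the data, then stores the inverted result little-endian, which is the on-disk format; a sketch:

    static void csum_buffer(const void *data, unsigned int len, u8 *out)
    {
            u32 crc = ~(u32)0;                      /* conventional seed */

            crc = btrfs_crc32c(crc, data, len);
            btrfs_crc32c_final(crc, out);           /* writes ~crc as LE32 */
    }
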
@@ -2565,12 +2354,6 @@
        return (u64) crc32c(parent_objectid, name, len);
 }
 
-static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
-{
-	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
-		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
-}
-
 static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 {
 	return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -2579,49 +2362,47 @@
 /* extent-tree.c */
 
 enum btrfs_inline_ref_type {
-	BTRFS_REF_TYPE_INVALID =	 0,
-	BTRFS_REF_TYPE_BLOCK =		 1,
-	BTRFS_REF_TYPE_DATA =		 2,
-	BTRFS_REF_TYPE_ANY =		 3,
+	BTRFS_REF_TYPE_INVALID,
+	BTRFS_REF_TYPE_BLOCK,
+	BTRFS_REF_TYPE_DATA,
+	BTRFS_REF_TYPE_ANY,
 };
 
 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 				     struct btrfs_extent_inline_ref *iref,
 				     enum btrfs_inline_ref_type is_data);
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
 
 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
 
-static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
-						 unsigned num_items)
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+						  unsigned num_items)
 {
 	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
 }
 
 /*
- * Doing a truncate won't result in new nodes or leaves, just what we need for
- * COW.
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
  */
-static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
 						 unsigned num_items)
 {
 	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_fs_info *fs_info);
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
-					 const u64 start);
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+			      u64 start, u64 num_bytes);
+void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   unsigned long count);
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
-				 unsigned long count, u64 transid, int wait);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+				  struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_delayed_ref_head *head);
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
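The rename above makes the two reservation flavours explicit: insertions may split a node at every level while COWing down, hence the extra factor of two, while plain modifications only pay for the COW itself. With the common 16KiB nodesize and BTRFS_MAX_LEVEL of 8 the difference per item is easy to see:

    static void reservation_example(struct btrfs_fs_info *fs_info)
    {
            /* 16KiB nodesize: 16384 * 8 * 2 vs 16384 * 8 per item */
            u64 insert_rsv = btrfs_calc_insert_metadata_size(fs_info, 1);
            u64 modify_rsv = btrfs_calc_metadata_size(fs_info, 1);

            pr_info("insert=%llu modify=%llu\n", insert_rsv, modify_rsv);
            /* prints insert=262144 modify=131072 on such a filesystem */
    }
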
@@ -2630,15 +2411,9 @@
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
 				    u64 bytenr, u64 num_bytes);
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb);
+int btrfs_exclude_logged_extents(struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
-						 struct btrfs_fs_info *info,
-						 u64 bytenr);
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u64 parent, u64 root_objectid,
@@ -2664,13 +2439,9 @@
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
 				u64 bytenr, u64 num_bytes, u64 flags,
 				int level, int is_data);
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
 
 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 			       u64 start, u64 len, int delalloc);
@@ -2679,35 +2450,11 @@
 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset);
+			 struct btrfs_ref *generic_ref);
 
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info);
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info);
 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_free_block_groups(struct btrfs_fs_info *info);
-int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans,
-			   u64 bytes_used, u64 type, u64 chunk_offset,
-			   u64 size);
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info);
-struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
-				struct btrfs_fs_info *fs_info,
-				const u64 chunk_offset);
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em);
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
 enum btrfs_reserve_flush_enum {
@@ -2718,75 +2465,36 @@
 	 * case, use FLUSH LIMIT
 	 */
 	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_EVICT,
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
 enum btrfs_flush_state {
 	FLUSH_DELAYED_ITEMS_NR	=	1,
 	FLUSH_DELAYED_ITEMS	=	2,
-	FLUSH_DELALLOC		=	3,
-	FLUSH_DELALLOC_WAIT	=	4,
-	ALLOC_CHUNK		=	5,
-	COMMIT_TRANS		=	6,
+	FLUSH_DELAYED_REFS_NR	=	3,
+	FLUSH_DELAYED_REFS	=	4,
+	FLUSH_DELALLOC		=	5,
+	FLUSH_DELALLOC_WAIT	=	6,
+	ALLOC_CHUNK		=	7,
+	ALLOC_CHUNK_FORCE	=	8,
+	RUN_DELAYED_IPUTS	=	9,
+	COMMIT_TRANS		=	10,
 };
 
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
-			struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
-			struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
-				  struct extent_changeset *reserved,
-				  u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
-					    u64 len);
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int nitems, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 				      struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-				    bool qgroup_free);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-				     bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
-			struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
-					      unsigned short type);
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
-				   struct btrfs_block_rsv *rsv,
-				   unsigned short type);
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
-			  struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
-			enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
-			   enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
-			    struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
-			    int update_size);
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_rsv *dest, u64 num_bytes,
-			     int min_factor);
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_rsv *block_rsv,
-			     u64 num_bytes);
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
 				   u64 start, u64 end);
 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 			 u64 num_bytes, u64 *actual_bytes);
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
@@ -2795,10 +2503,6 @@
 int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
-void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-		       u64 start, u64 end);
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
 
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
@@ -2821,20 +2525,9 @@
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_path *path,
 			 u64 min_trans);
-enum btrfs_compare_tree_result {
-	BTRFS_COMPARE_TREE_NEW,
-	BTRFS_COMPARE_TREE_DELETED,
-	BTRFS_COMPARE_TREE_CHANGED,
-	BTRFS_COMPARE_TREE_SAME,
-};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
-				  struct btrfs_path *right_path,
-				  struct btrfs_key *key,
-				  enum btrfs_compare_tree_result result,
-				  void *ctx);
-int btrfs_compare_trees(struct btrfs_root *left_root,
-			struct btrfs_root *right_root,
-			btrfs_changed_cb_t cb, void *ctx);
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+					   int slot);
+
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -2845,10 +2538,8 @@
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_block_can_be_shared(struct btrfs_root *root,
 			      struct extent_buffer *buf);
-void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-		       u32 data_size);
-void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
-			 struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
 int btrfs_split_item(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_path *path,
@@ -2877,8 +2568,6 @@
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2926,8 +2615,7 @@
 {
 	return btrfs_next_old_item(root, p, 0);
 }
-int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *leaf);
+int btrfs_leaf_free_space(struct extent_buffer *leaf);
 int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
 				     struct btrfs_block_rsv *block_rsv,
 				     int update_ref, int for_reloc);
@@ -2972,7 +2660,6 @@
 	kfree(fs_info->free_space_root);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
-	security_free_mnt_opts(&fs_info->security_opts);
 	kvfree(fs_info);
 }
 
@@ -3021,8 +2708,7 @@
 /* dir-item.c */
 int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 			  const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, const char *name,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
 			  int name_len, struct btrfs_inode *dir,
 			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@ -3090,19 +2776,18 @@
 			  u64 inode_objectid, u64 ref_objectid, int ins_len,
 			  int cow);
 
-int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
-			       const char *name,
-			       int name_len, struct btrfs_inode_ref **ref_ret);
-int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
-				   u64 ref_objectid, const char *name,
-				   int name_len,
-				   struct btrfs_inode_extref **extref_ret);
-
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+						   int slot, const char *name,
+						   int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+		struct extent_buffer *leaf, int slot, u64 ref_objectid,
+		const char *name, int name_len);
 /* file-item.c */
 struct btrfs_dio_private;
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+				   u8 *dst);
 blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
 			      u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
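btrfs_find_name_in_backref() and its extref sibling now return the found ref directly instead of an int plus an out parameter, removing the "found, but which out-param is valid?" ambiguity at call sites. A hypothetical caller:

    static int name_exists_in_backref(struct extent_buffer *leaf, int slot,
                                      const char *name, int name_len)
    {
            struct btrfs_inode_ref *ref;

            ref = btrfs_find_name_in_backref(leaf, slot, name, name_len);
            return ref ? 0 : -ENOENT;       /* the pointer signals the hit */
    }
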
@@ -3130,8 +2815,7 @@
 
 /* inode.c */
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-		struct page *page, size_t pg_offset, u64 start,
-		u64 len, int create);
+					   u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
@@ -3155,18 +2839,25 @@
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
-			      struct extent_state **cached_state, int dedupe);
+			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root,
 			     struct btrfs_root *parent_root,
 			     u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags);
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+			       unsigned *bits);
+void btrfs_clear_delalloc_extent(struct inode *inode,
+				 struct extent_state *state, unsigned *bits);
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+				 struct extent_state *other);
+void btrfs_split_delalloc_extent(struct inode *inode,
+				 struct extent_state *orig, u64 split);
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+			     unsigned long bio_flags);
 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
@@ -3174,14 +2865,18 @@
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
+void btrfs_free_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
 int __init btrfs_init_cachep(void);
 void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+			      struct btrfs_root *root, int *new,
+			      struct btrfs_path *path);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 			 struct btrfs_root *root, int *was_new);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-		struct page *page, size_t pg_offset,
-		u64 start, u64 end, int create);
+				    struct page *page, size_t pg_offset,
+				    u64 start, u64 end, int create);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode);
@@ -3193,6 +2888,7 @@
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
 			      u64 start, u64 num_bytes, u64 min_size,
 			      loff_t actual_len, u64 *alloc_hint);
@@ -3200,10 +2896,13 @@
 				    struct btrfs_trans_handle *trans, int mode,
 				    u64 start, u64 num_bytes, u64 min_size,
 				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+		u64 start, u64 end, int *page_started, unsigned long *nr_written,
+		struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+					  u64 end, int uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
-#endif
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3218,9 +2917,6 @@
 				struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 			       struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-			    struct file *dst_file, loff_t dst_loff,
-			    u64 olen);
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
@@ -3243,6 +2939,10 @@
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
 		       u64 end, int drop_cache);
+int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+			   const u64 start, const u64 end,
+			   struct btrfs_clone_extent_info *clone_info,
+			   struct btrfs_trans_handle **trans_out);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
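btrfs_punch_hole_range() factors the extent-dropping loop out of the hole-punch path so reflink can reuse it: passing a btrfs_clone_extent_info (the struct added earlier in this header) makes the helper insert the cloned extent into the range it just dropped, while passing NULL keeps plain hole-punch semantics. A sketch of the clone-side call; the local names (disk_bytenr, buf, size, ...) are hypothetical stand-ins for what the reflink code computes:

    struct btrfs_clone_extent_info clone_info = {
            .disk_offset = disk_bytenr,     /* where the shared extent lives */
            .disk_len    = disk_num_bytes,
            .data_offset = data_offset,     /* offset into that extent */
            .data_len    = data_len,
            .file_offset = dest_offset,     /* position in the target file */
            .extent_buf  = buf,             /* copy of the file extent item */
            .item_size   = size,
    };

    ret = btrfs_punch_hole_range(inode, path, drop_start, drop_end,
                                 &clone_info, &trans);
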
@@ -3250,19 +2950,14 @@
 		      size_t num_pages, loff_t pos, size_t write_bytes,
 		      struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-			   struct file *file_out, loff_t pos_out, u64 len);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t len, unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root);
 
-/* sysfs.c */
-int __init btrfs_init_sysfs(void);
-void __cold btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-
 /* super.c */
 int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			unsigned long new_flags);
@@ -3353,31 +3048,17 @@
 
 #if defined(CONFIG_DYNAMIC_DEBUG)
 #define btrfs_debug(fs_info, fmt, args...)				\
-do {									\
-        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);         	\
-        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))  	\
-		btrfs_printk(fs_info, KERN_DEBUG fmt, ##args);		\
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) 			\
-do {									\
-        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); 	        \
-        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) 		\
-		btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args);	\
-} while (0)
+	_dynamic_func_call_no_desc(fmt, btrfs_printk,			\
+				   fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...)			\
+	_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu,		\
+				   fs_info, KERN_DEBUG fmt, ##args)
 #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...)			\
-do {									\
-        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);         	\
-        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))  	\
-		btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt,		\
-				       ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) 				\
-do {									\
-        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);         	\
-        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))  	\
-		btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt,	\
-					 ##args);			\
-} while (0)
+	_dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu,		\
+				   fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...)				\
+	_dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited,	\
+				   fs_info, KERN_DEBUG fmt, ##args)
 #elif defined(DEBUG)
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -3428,20 +3109,26 @@
 	rcu_read_unlock();					\
 } while (0)
 
-#ifdef CONFIG_BTRFS_ASSERT
-
 __cold
 static inline void assfail(const char *expr, const char *file, int line)
 {
-	pr_err("assertion failed: %s, file: %s, line: %d\n",
-	       expr, file, line);
-	BUG();
+	if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+		pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+		BUG();
+	}
 }
 
 #define ASSERT(expr)	\
 	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+/*
+ * Use that for functions that are conditionally exported for sanity tests but
+ * otherwise static
+ */
+#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define EXPORT_FOR_TESTS static
 #else
-#define ASSERT(expr)	((void)0)
+#define EXPORT_FOR_TESTS
 #endif
 
 __cold
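Folding the config check into assfail() means ASSERT() now compiles (and evaluates) its expression in every configuration; with CONFIG_BTRFS_ASSERT=n the empty assfail() lets the compiler drop the branch, but a bit-rotted assertion can no longer hide behind the old ((void)0) stub. For example, this must now build even on non-debug configs:

    static void check_level(struct extent_buffer *eb)
    {
            /* always type-checked; a no-op branch when asserts are off */
            ASSERT(btrfs_header_level(eb) < BTRFS_MAX_LEVEL);
    }
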
@@ -3510,10 +3197,11 @@
 /* compatibility and incompatibility defines */
 
 #define btrfs_set_fs_incompat(__fs_info, opt) \
-	__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+	__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+				#opt)
 
 static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
-					   u64 flag)
+					   u64 flag, const char* name)
 {
 	struct btrfs_super_block *disk_super;
 	u64 features;
@@ -3526,18 +3214,20 @@
 		if (!(features & flag)) {
 			features |= flag;
 			btrfs_set_super_incompat_flags(disk_super, features);
-			btrfs_info(fs_info, "setting %llu feature flag",
-					 flag);
+			btrfs_info(fs_info,
+				"setting incompat feature flag for %s (0x%llx)",
+				name, flag);
 		}
 		spin_unlock(&fs_info->super_lock);
 	}
 }
 
 #define btrfs_clear_fs_incompat(__fs_info, opt) \
-	__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+	__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+				  #opt)
 
 static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
-					     u64 flag)
+					     u64 flag, const char* name)
 {
 	struct btrfs_super_block *disk_super;
 	u64 features;
@@ -3550,8 +3240,9 @@
 		if (features & flag) {
 			features &= ~flag;
 			btrfs_set_super_incompat_flags(disk_super, features);
-			btrfs_info(fs_info, "clearing %llu feature flag",
-					 flag);
+			btrfs_info(fs_info,
+				"clearing incompat feature flag for %s (0x%llx)",
+				name, flag);
 		}
 		spin_unlock(&fs_info->super_lock);
 	}
@@ -3568,10 +3259,11 @@
 }
 
 #define btrfs_set_fs_compat_ro(__fs_info, opt) \
-	__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+	__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+				 #opt)
 
 static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
-					    u64 flag)
+					    u64 flag, const char *name)
 {
 	struct btrfs_super_block *disk_super;
 	u64 features;
@@ -3584,18 +3276,20 @@
 		if (!(features & flag)) {
 			features |= flag;
 			btrfs_set_super_compat_ro_flags(disk_super, features);
-			btrfs_info(fs_info, "setting %llu ro feature flag",
-				   flag);
+			btrfs_info(fs_info,
+				"setting compat-ro feature flag for %s (0x%llx)",
+				name, flag);
 		}
 		spin_unlock(&fs_info->super_lock);
 	}
 }
 
 #define btrfs_clear_fs_compat_ro(__fs_info, opt) \
-	__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+	__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+				   #opt)
 
 static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
-					      u64 flag)
+					      u64 flag, const char *name)
 {
 	struct btrfs_super_block *disk_super;
 	u64 features;
@@ -3608,8 +3302,9 @@
 		if (features & flag) {
 			features &= ~flag;
 			btrfs_set_super_compat_ro_flags(disk_super, features);
-			btrfs_info(fs_info, "clearing %llu ro feature flag",
-				   flag);
+			btrfs_info(fs_info,
+				"clearing compat-ro feature flag for %s (0x%llx)",
+				name, flag);
 		}
 		spin_unlock(&fs_info->super_lock);
 	}
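With #opt stringified and threaded through, the feature-flag messages name the feature instead of printing a bare number. Setting the metadata_uuid incompat bit from earlier in this header would, for instance, log a line along these lines (device name illustrative; 0x400 is BTRFS_FEATURE_INCOMPAT_METADATA_UUID):

    BTRFS: info (device sda1): setting incompat feature flag for METADATA_UUID (0x400)
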
@@ -3664,8 +3359,7 @@
 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
-			   struct btrfs_device *dev);
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 static inline void btrfs_init_full_stripe_locks_tree(
@@ -3714,41 +3408,22 @@
 	return signal_pending(current);
 }
 
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+
 /* Sanity test specific functions */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
 void btrfs_test_destroy_inode(struct inode *inode);
-#endif
 
 static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
 {
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
-			      &fs_info->fs_state)))
-		return 1;
-#endif
+	return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
 	return 0;
 }
-
-static inline void cond_wake_up(struct wait_queue_head *wq)
-{
-	/*
-	 * This implies a full smp_mb barrier, see comments for
-	 * waitqueue_active why.
-	 */
-	if (wq_has_sleeper(wq))
-		wake_up(wq);
-}
-
-static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
-{
-	/*
-	 * Special case for conditional wakeup where the barrier required for
-	 * waitqueue_active is implied by some of the preceding code. Eg. one
-	 * of such atomic operations (atomic_dec_and_return, ...), or a
-	 * unlock/lock sequence, etc.
-	 */
-	if (waitqueue_active(wq))
-		wake_up(wq);
-}
+#endif
 
 #endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
deleted file mode 100644
index 90281a7..0000000
--- a/fs/btrfs/dedupe.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2016 Fujitsu.  All rights reserved.
- */
-
-#ifndef BTRFS_DEDUPE_H
-#define BTRFS_DEDUPE_H
-
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
-
-#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
new file mode 100644
index 0000000..db9f2c5
--- /dev/null
+++ b/fs/btrfs/delalloc-space.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "block-rsv.h"
+#include "btrfs_inode.h"
+#include "space-info.h"
+#include "transaction.h"
+#include "qgroup.h"
+#include "block-group.h"
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+	u64 used;
+	int ret = 0;
+	int need_commit = 2;
+	int have_pinned_space;
+
+	/* Make sure bytes are sectorsize aligned */
+	bytes = ALIGN(bytes, fs_info->sectorsize);
+
+	if (btrfs_is_free_space_inode(inode)) {
+		need_commit = 0;
+		ASSERT(current->journal_info);
+	}
+
+again:
+	/* Make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	used = btrfs_space_info_used(data_sinfo, true);
+
+	if (used + bytes > data_sinfo->total_bytes) {
+		struct btrfs_trans_handle *trans;
+
+		/*
+		 * If we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+
+			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
+			spin_unlock(&data_sinfo->lock);
+
+			alloc_target = btrfs_data_alloc_profile(fs_info);
+			/*
+			 * It is ugly that we don't call a nolock join
+			 * transaction for the free space inode case here.
+			 * But it is safe because we only do the data space
+			 * reservation for the free space cache in the
+			 * transaction context; the common join transaction
+			 * just increases the counter of the current transaction
+			 * handle and doesn't try to acquire the trans_lock of
+			 * the fs.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
+
+			ret = btrfs_chunk_alloc(trans, alloc_target,
+						CHUNK_ALLOC_NO_FORCE);
+			btrfs_end_transaction(trans);
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					return ret;
+				else {
+					have_pinned_space = 1;
+					goto commit_trans;
+				}
+			}
+
+			goto again;
+		}
+
+		/*
+		 * If we don't have enough pinned space to deal with this
+		 * allocation, and no chunk has been removed in the current
+		 * transaction, don't bother committing the transaction.
+		 */
+		have_pinned_space = __percpu_counter_compare(
+			&data_sinfo->total_bytes_pinned,
+			used + bytes - data_sinfo->total_bytes,
+			BTRFS_TOTAL_BYTES_PINNED_BATCH);
+		spin_unlock(&data_sinfo->lock);
+
+		/* Commit the current transaction and try again */
+commit_trans:
+		if (need_commit) {
+			need_commit--;
+
+			if (need_commit > 0) {
+				btrfs_start_delalloc_roots(fs_info, -1);
+				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
+							 (u64)-1);
+			}
+
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
+			if (have_pinned_space >= 0 ||
+			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+				     &trans->transaction->flags) ||
+			    need_commit > 0) {
+				ret = btrfs_commit_transaction(trans);
+				if (ret)
+					return ret;
+				/*
+				 * The cleaner kthread might still be doing iput
+				 * operations. Wait for it to finish so that
+				 * more space is released.  We don't need to
+				 * explicitly run the delayed iputs here because
+				 * the commit_transaction would have woken up
+				 * the cleaner.
+				 */
+				ret = btrfs_wait_on_delayed_iputs(fs_info);
+				if (ret)
+					return ret;
+				goto again;
+			} else {
+				btrfs_end_transaction(trans);
+			}
+		}
+
+		trace_btrfs_space_reservation(fs_info,
+					      "space_info:enospc",
+					      data_sinfo->flags, bytes, 1);
+		return -ENOSPC;
+	}
+	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
+	spin_unlock(&data_sinfo->lock);
+
+	return 0;
+}
+
+int btrfs_check_data_free_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	int ret;
+
+	/* align the range */
+	len = round_up(start + len, fs_info->sectorsize) -
+	      round_down(start, fs_info->sectorsize);
+	start = round_down(start, fs_info->sectorsize);
+
+	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+	if (ret < 0)
+		return ret;
+
+	/* Use new btrfs_qgroup_reserve_data to reserve precise data space. */
+	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+	if (ret < 0)
+		btrfs_free_reserved_data_space_noquota(inode, start, len);
+	else
+		ret = 0;
+	return ret;
+}
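+
+/*
+ * Minimal usage sketch (illustrative only; @pos and @count are assumed to
+ * come from the caller's write path):
+ *
+ *	struct extent_changeset *data_reserved = NULL;
+ *	int ret;
+ *
+ *	ret = btrfs_check_data_free_space(inode, &data_reserved, pos, count);
+ *	if (ret < 0)
+ *		return ret;
+ *	(... do the write; on failure undo the reservation with ...)
+ *	btrfs_free_reserved_data_space(inode, data_reserved, pos, count);
+ *	extent_changeset_free(data_reserved);
+ */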
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will *NOT* use the accurate qgroup reserved space API, just for the
+ * case where we can't sleep and are sure it won't affect qgroup reserved
+ * space, like clear_bit_hook().
+ */
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+					    u64 len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_space_info *data_sinfo;
+
+	/* Make sure the range is aligned to sectorsize */
+	len = round_up(start + len, fs_info->sectorsize) -
+	      round_down(start, fs_info->sectorsize);
+	start = round_down(start, fs_info->sectorsize);
+
+	data_sinfo = fs_info->data_sinfo;
+	spin_lock(&data_sinfo->lock);
+	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
+	spin_unlock(&data_sinfo->lock);
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will handle the per-inode data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode,
+			struct extent_changeset *reserved, u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	/* Make sure the range is aligned to sectorsize */
+	len = round_up(start + len, root->fs_info->sectorsize) -
+	      round_down(start, root->fs_info->sectorsize);
+	start = round_down(start, root->fs_info->sectorsize);
+
+	btrfs_free_reserved_data_space_noquota(inode, start, len);
+	btrfs_qgroup_free_data(inode, reserved, start, len);
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ *   Unlike normal operation, qgroup meta reservation needs to know if we are
+ *   freeing qgroup reservation or just converting it into per-trans.  Normally
+ *   @qgroup_free is true for error handling, and false for normal release.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 released = 0;
+	u64 qgroup_to_release = 0;
+
+	/*
+	 * Since we statically set the block_rsv->size we just want to say we
+	 * are releasing 0 bytes, and then we'll just get the reservation over
+	 * the size free'd.
+	 */
+	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
+					     &qgroup_to_release);
+	if (released > 0)
+		trace_btrfs_space_reservation(fs_info, "delalloc",
+					      btrfs_ino(inode), released, 0);
+	if (qgroup_free)
+		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
+	else
+		btrfs_qgroup_convert_reserved_meta(inode->root,
+						   qgroup_to_release);
+}
+
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+						 struct btrfs_inode *inode)
+{
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 reserve_size = 0;
+	u64 qgroup_rsv_size = 0;
+	u64 csum_leaves;
+	unsigned outstanding_extents;
+
+	lockdep_assert_held(&inode->lock);
+	outstanding_extents = inode->outstanding_extents;
+
+	/*
+	 * Insert size for the number of outstanding extents, 1 normal size for
+	 * updating the inode.
+	 */
+	if (outstanding_extents) {
+		reserve_size = btrfs_calc_insert_metadata_size(fs_info,
+						outstanding_extents);
+		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
+	}
+	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+						 inode->csum_bytes);
+	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
+							csum_leaves);
+	/*
+	 * For qgroup rsv, the calculation is very simple:
+	 * account one nodesize for each outstanding extent
+	 *
+	 * This is an overestimate in most cases.
+	 */
+	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
+
+	spin_lock(&block_rsv->lock);
+	block_rsv->size = reserve_size;
+	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
+	spin_unlock(&block_rsv->lock);
+}
+
+static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+				    u64 num_bytes, u64 *meta_reserve,
+				    u64 *qgroup_reserve)
+{
+	u64 nr_extents = count_max_extents(num_bytes);
+	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+
+	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
+						nr_extents + csum_leaves);
+
+	/*
+	 * finish_ordered_io has to update the inode, so add the space required
+	 * for an inode update.
+	 */
+	*meta_reserve += inode_update;
+	*qgroup_reserve = nr_extents * fs_info->nodesize;
+}
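+
+/*
+ * Worked example (illustrative; assumes a 4KiB sectorsize, 16KiB nodesize
+ * and crc32c checksums): for a 1MiB write, count_max_extents() yields 1 and
+ * the 1KiB of csum bytes fits in a single leaf, so *meta_reserve covers two
+ * item insertions plus one inode update, while *qgroup_reserve is one
+ * nodesize, i.e. 16KiB.
+ */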
+
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 meta_reserve, qgroup_reserve;
+	unsigned nr_extents;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+	int ret = 0;
+	bool delalloc_lock = true;
+
+	/*
+	 * If we are a free space inode we need to not flush since we will be in
+	 * the middle of a transaction commit.  We also don't need the delalloc
+	 * mutex since we won't race with anybody.  We need this mostly to make
+	 * lockdep shut its filthy mouth.
+	 *
+	 * If we have a transaction open (can happen if we call truncate_block
+	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
+	 */
+	if (btrfs_is_free_space_inode(inode)) {
+		flush = BTRFS_RESERVE_NO_FLUSH;
+		delalloc_lock = false;
+	} else {
+		if (current->journal_info)
+			flush = BTRFS_RESERVE_FLUSH_LIMIT;
+
+		if (btrfs_transaction_in_commit(fs_info))
+			schedule_timeout(1);
+	}
+
+	if (delalloc_lock)
+		mutex_lock(&inode->delalloc_mutex);
+
+	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+
+	/*
+	 * We always want to do it this way, every other way is wrong and ends
+	 * in tears.  Pre-reserving the amount we are going to add will always
+	 * be the right way, because otherwise if we have enough parallelism we
+	 * could end up with thousands of inodes all holding little bits of
+	 * reservations they were able to make previously and the only way to
+	 * reclaim that space is to ENOSPC out the operations and clear
+	 * everything out and try again, which is bad.  This way we just
+	 * over-reserve slightly, and clean up the mess when we are done.
+	 */
+	calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
+				&qgroup_reserve);
+	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+	if (ret)
+		goto out_fail;
+	ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+	if (ret)
+		goto out_qgroup;
+
+	/*
+	 * Now we need to update our outstanding extents and csum bytes _first_
+	 * and then add the reservation to the block_rsv.  This keeps us from
+	 * racing with an ordered completion or some such that would think it
+	 * needs to free the reservation we just made.
+	 */
+	spin_lock(&inode->lock);
+	nr_extents = count_max_extents(num_bytes);
+	btrfs_mod_outstanding_extents(inode, nr_extents);
+	inode->csum_bytes += num_bytes;
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+	spin_unlock(&inode->lock);
+
+	/* Now we can safely add our space to our block rsv */
+	btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
+	trace_btrfs_space_reservation(root->fs_info, "delalloc",
+				      btrfs_ino(inode), meta_reserve, 1);
+
+	spin_lock(&block_rsv->lock);
+	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
+	spin_unlock(&block_rsv->lock);
+
+	if (delalloc_lock)
+		mutex_unlock(&inode->delalloc_mutex);
+	return 0;
+out_qgroup:
+	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+out_fail:
+	if (delalloc_lock)
+		mutex_unlock(&inode->delalloc_mutex);
+	return ret;
+}
+
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
+ *
+ * This will release the metadata reservation for an inode.  This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations, or on error for the same reason.
+ */
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+				     bool qgroup_free)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+	spin_lock(&inode->lock);
+	inode->csum_bytes -= num_bytes;
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+	spin_unlock(&inode->lock);
+
+	if (btrfs_is_testing(fs_info))
+		return;
+
+	btrfs_inode_rsv_release(inode, qgroup_free);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add.  Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	unsigned num_extents;
+
+	spin_lock(&inode->lock);
+	num_extents = count_max_extents(num_bytes);
+	btrfs_mod_outstanding_extents(inode, -num_extents);
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+	spin_unlock(&inode->lock);
+
+	if (btrfs_is_testing(fs_info))
+		return;
+
+	btrfs_inode_rsv_release(inode, true);
+}
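+
+/*
+ * Sketch of the pairing (illustrative; error handling elided):
+ *
+ *	btrfs_delalloc_reserve_metadata(inode, len);
+ *	(... mark the range delalloc / create the ordered extent ...)
+ *	btrfs_delalloc_release_extents(inode, len);
+ *	(... later, when the ordered extent completes or on error ...)
+ *	btrfs_delalloc_release_metadata(inode, len, ret != 0);
+ */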
+
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start of the range we are writing to
+ * @len: length of the range we are writing to
+ * @reserved: mandatory parameter, records the actually reserved qgroup ranges
+ * 	      of the current reservation.
+ *
+ * This will do the following things:
+ *
+ * - reserve space in the data space info for num bytes
+ *   and reserve the precise corresponding qgroup space
+ *   (done in check_data_free_space)
+ *
+ * - reserve space for metadata, based on the number of outstanding
+ *   extents and how many csums will be needed;
+ *   also reserve metadata space in a per-root over-reserve method.
+ * - add to the inode's delalloc_bytes
+ * - add it to the fs_info's delalloc inodes list.
+ *   (the above 3 are all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error (-ENOSPC or -EDQUOT)
+ */
+int btrfs_delalloc_reserve_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len)
+{
+	int ret;
+
+	ret = btrfs_check_data_free_space(inode, reserved, start, len);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+	if (ret < 0)
+		btrfs_free_reserved_data_space(inode, *reserved, start, len);
+	return ret;
+}
+
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @start: start position of the space already reserved
+ * @len: the length of the space already reserved
+ * @qgroup_free: free the reserved qgroup space or convert it to per-trans
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
+ */
+void btrfs_delalloc_release_space(struct inode *inode,
+				  struct extent_changeset *reserved,
+				  u64 start, u64 len, bool qgroup_free)
+{
+	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+	btrfs_free_reserved_data_space(inode, reserved, start, len);
+}
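+
+/*
+ * End-to-end sketch for a buffered write (illustrative only; do_the_write()
+ * is a hypothetical placeholder for the caller's copy loop):
+ *
+ *	struct extent_changeset *data_reserved = NULL;
+ *	int ret;
+ *
+ *	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, count);
+ *	if (ret < 0)
+ *		return ret;
+ *	ret = do_the_write(inode, pos, count);
+ *	if (ret < 0)
+ *		btrfs_delalloc_release_space(inode, data_reserved, pos, count,
+ *					     true);
+ *	extent_changeset_free(data_reserved);
+ */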
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
new file mode 100644
index 0000000..54466fb
--- /dev/null
+++ b/fs/btrfs/delalloc-space.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DELALLOC_SPACE_H
+#define BTRFS_DELALLOC_SPACE_H
+
+struct extent_changeset;
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+			struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+				  struct extent_changeset *reserved,
+				  u64 start, u64 len, bool qgroup_free);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+					    u64 len);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+				     bool qgroup_free);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len);
+
+#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509..1f7f39b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,6 +6,7 @@
 
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include "misc.h"
 #include "delayed-inode.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -42,8 +43,8 @@
 	delayed_node->root = root;
 	delayed_node->inode_id = inode_id;
 	refcount_set(&delayed_node->refs, 0);
-	delayed_node->ins_root = RB_ROOT;
-	delayed_node->del_root = RB_ROOT;
+	delayed_node->ins_root = RB_ROOT_CACHED;
+	delayed_node->del_root = RB_ROOT_CACHED;
 	mutex_init(&delayed_node->mutex);
 	INIT_LIST_HEAD(&delayed_node->n_list);
 	INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +391,7 @@
 					struct btrfs_delayed_node *delayed_node,
 					struct btrfs_key *key)
 {
-	return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+	return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
 					   NULL, NULL);
 }
 
@@ -400,9 +401,10 @@
 {
 	struct rb_node **p, *node;
 	struct rb_node *parent_node = NULL;
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct btrfs_delayed_item *item;
 	int cmp;
+	bool leftmost = true;
 
 	if (action == BTRFS_DELAYED_INSERTION_ITEM)
 		root = &delayed_node->ins_root;
@@ -410,7 +412,7 @@
 		root = &delayed_node->del_root;
 	else
 		BUG();
-	p = &root->rb_node;
+	p = &root->rb_root.rb_node;
 	node = &ins->rb_node;
 
 	while (*p) {
@@ -419,16 +421,18 @@
 				 rb_node);
 
 		cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
-		if (cmp < 0)
+		if (cmp < 0) {
 			p = &(*p)->rb_right;
-		else if (cmp > 0)
+			leftmost = false;
+		} else if (cmp > 0) {
 			p = &(*p)->rb_left;
-		else
+		} else {
 			return -EEXIST;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	ins->delayed_node = delayed_node;
 	ins->ins_or_del = action;
 
@@ -468,9 +472,12 @@
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct btrfs_delayed_root *delayed_root;
 
+	/* Not associated with any delayed_node */
+	if (!delayed_item->delayed_node)
+		return;
 	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
 
 	BUG_ON(!delayed_root);
@@ -482,7 +489,7 @@
 	else
 		root = &delayed_item->delayed_node->del_root;
 
-	rb_erase(&delayed_item->rb_node, root);
+	rb_erase_cached(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
 
 	finish_one_item(delayed_root);
@@ -503,7 +510,7 @@
 	struct rb_node *p;
 	struct btrfs_delayed_item *item = NULL;
 
-	p = rb_first(&delayed_node->ins_root);
+	p = rb_first_cached(&delayed_node->ins_root);
 	if (p)
 		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
 
@@ -516,7 +523,7 @@
 	struct rb_node *p;
 	struct btrfs_delayed_item *item = NULL;
 
-	p = rb_first(&delayed_node->del_root);
+	p = rb_first_cached(&delayed_node->del_root);
 	if (p)
 		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
 
@@ -552,14 +559,14 @@
 	src_rsv = trans->block_rsv;
 	dst_rsv = &fs_info->delayed_block_rsv;
 
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
 
 	/*
 	 * Here we migrate space rsv from transaction rsv, since have already
 	 * reserved space when starting a transaction.  So no need to reserve
 	 * qgroup space here.
 	 */
-	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
 	if (!ret) {
 		trace_btrfs_space_reservation(fs_info, "delayed_item",
 					      item->key.objectid,
@@ -606,7 +613,7 @@
 	src_rsv = trans->block_rsv;
 	dst_rsv = &fs_info->delayed_block_rsv;
 
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+	num_bytes = btrfs_calc_metadata_size(fs_info, 1);
 
 	/*
 	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
@@ -647,7 +654,7 @@
 		return ret;
 	}
 
-	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
 	if (!ret) {
 		trace_btrfs_space_reservation(fs_info, "delayed_inode",
 					      btrfs_ino(inode), num_bytes, 1);
@@ -688,7 +695,6 @@
 				    struct btrfs_path *path,
 				    struct btrfs_delayed_item *item)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_delayed_item *curr, *next;
 	int free_space;
 	int total_data_size = 0, total_size = 0;
@@ -705,7 +711,7 @@
 	BUG_ON(!path->nodes[0]);
 
 	leaf = path->nodes[0];
-	free_space = btrfs_leaf_free_space(fs_info, leaf);
+	free_space = btrfs_leaf_free_space(leaf);
 	INIT_LIST_HEAD(&head);
 
 	next = item;
@@ -762,9 +768,6 @@
 		i++;
 	}
 
-	/* reset all the locked nodes in the patch to spinning locks. */
-	btrfs_clear_path_blocking(path, NULL, 0);
-
 	/* insert the keys of the items */
 	setup_items_for_insert(root, path, keys, data_size,
 			       total_data_size, total_size, nitems);
@@ -1462,7 +1465,7 @@
 	if (unlikely(ret)) {
 		btrfs_err(trans->fs_info,
 			  "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
-			  name_len, name, delayed_node->root->objectid,
+			  name_len, name, delayed_node->root->root_key.objectid,
 			  delayed_node->inode_id, ret);
 		BUG();
 	}
@@ -1526,15 +1529,22 @@
 	 * we have reserved enough space when we start a new transaction,
 	 * so reserving metadata failure is impossible.
 	 */
-	BUG_ON(ret);
+	if (ret < 0) {
+		btrfs_err(trans->fs_info,
+"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+		btrfs_release_delayed_item(item);
+		goto end;
+	}
 
 	mutex_lock(&node->mutex);
 	ret = __btrfs_add_delayed_deletion_item(node, item);
 	if (unlikely(ret)) {
 		btrfs_err(trans->fs_info,
 			  "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
-			  index, node->root->objectid, node->inode_id, ret);
-		BUG();
+			  index, node->root->root_key.objectid,
+			  node->inode_id, ret);
+		btrfs_delayed_item_release_metadata(dir->root, item);
+		btrfs_release_delayed_item(item);
 	}
 	mutex_unlock(&node->mutex);
 end:
@@ -1691,7 +1701,7 @@
 		name = (char *)(di + 1);
 		name_len = btrfs_stack_dir_name_len(di);
 
-		d_type = btrfs_filetype_table[di->type];
+		d_type = fs_ftype_to_dtype(di->type);
 		btrfs_disk_key_to_cpu(&location, &di->location);
 
 		over = !dir_emit(ctx, name, name_len,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd..74ae226 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@
 	 * is waiting to be dealt with by the async worker.
 	 */
 	struct list_head p_list;
-	struct rb_root ins_root;
-	struct rb_root del_root;
+	struct rb_root_cached ins_root;
+	struct rb_root_cached del_root;
 	struct mutex mutex;
 	struct btrfs_inode_item inode_item;
 	refcount_t refs;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545..df3bd88 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -10,6 +10,7 @@
 #include "delayed-ref.h"
 #include "transaction.h"
 #include "qgroup.h"
+#include "space-info.h"
 
 struct kmem_cache *btrfs_delayed_ref_head_cachep;
 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -24,6 +25,179 @@
  * of hammering updates on the extent allocation tree.
  */
 
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	bool ret = false;
+	u64 reserved;
+
+	spin_lock(&global_rsv->lock);
+	reserved = global_rsv->reserved;
+	spin_unlock(&global_rsv->lock);
+
+	/*
+	 * Since the global reserve is just kind of magic we don't really want
+	 * to rely on it to save our bacon, so if our size is more than the
+	 * delayed_refs_rsv and the global rsv then it's time to think about
+	 * bailing.
+	 */
+	spin_lock(&delayed_refs_rsv->lock);
+	reserved += delayed_refs_rsv->reserved;
+	if (delayed_refs_rsv->size >= reserved)
+		ret = true;
+	spin_unlock(&delayed_refs_rsv->lock);
+	return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
+{
+	u64 num_entries =
+		atomic_read(&trans->transaction->delayed_refs.num_entries);
+	u64 avg_runtime;
+	u64 val;
+
+	smp_mb();
+	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
+	val = num_entries * avg_runtime;
+	if (val >= NSEC_PER_SEC)
+		return 1;
+	if (val >= NSEC_PER_SEC / 2)
+		return 2;
+
+	return btrfs_check_space_for_delayed_refs(trans->fs_info);
+}
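+
+/*
+ * Worked example (values picked purely for illustration): with 10000 queued
+ * entries and an average ref runtime of 100us, val = 10000 * 100000ns = 1s,
+ * which is >= NSEC_PER_SEC, so the caller gets the strong throttle hint (1);
+ * at half that backlog it would get the weaker hint (2).
+ */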
+
+/**
+ * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
+ * @fs_info - the fs_info for our fs.
+ * @nr - the number of items to drop.
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+	u64 released = 0;
+
+	released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
+					     NULL);
+	if (released)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, released, 0);
+}
+
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called any time we may have adjusted trans->delayed_ref_updates;
+ * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	u64 num_bytes;
+
+	if (!trans->delayed_ref_updates)
+		return;
+
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
+						    trans->delayed_ref_updates);
+	spin_lock(&delayed_rsv->lock);
+	delayed_rsv->size += num_bytes;
+	delayed_rsv->full = 0;
+	spin_unlock(&delayed_rsv->lock);
+	trans->delayed_ref_updates = 0;
+}
+
+/**
+ * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
+ * @fs_info - the fs info for our fs.
+ * @src - the source block rsv to transfer from.
+ * @num_bytes - the number of bytes to transfer.
+ *
+ * This transfers up to the num_bytes amount from the src rsv to the
+ * delayed_refs_rsv.  Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+				       struct btrfs_block_rsv *src,
+				       u64 num_bytes)
+{
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	u64 to_free = 0;
+
+	spin_lock(&src->lock);
+	src->reserved -= num_bytes;
+	src->size -= num_bytes;
+	spin_unlock(&src->lock);
+
+	spin_lock(&delayed_refs_rsv->lock);
+	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+		u64 delta = delayed_refs_rsv->size -
+			delayed_refs_rsv->reserved;
+		if (num_bytes > delta) {
+			to_free = num_bytes - delta;
+			num_bytes = delta;
+		}
+	} else {
+		to_free = num_bytes;
+		num_bytes = 0;
+	}
+
+	if (num_bytes)
+		delayed_refs_rsv->reserved += num_bytes;
+	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+		delayed_refs_rsv->full = 1;
+	spin_unlock(&delayed_refs_rsv->lock);
+
+	if (num_bytes)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, num_bytes, 1);
+	if (to_free)
+		btrfs_space_info_free_bytes_may_use(fs_info,
+				delayed_refs_rsv->space_info, to_free);
+}
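+
+/*
+ * Worked example (numbers are illustrative): if the delayed_refs_rsv has
+ * size 10MiB with 8MiB reserved, delta is 2MiB; migrating 3MiB from @src
+ * then moves 2MiB into the rsv, marks it full, and returns the remaining
+ * 1MiB to the space info.
+ */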
+
+/**
+ * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
+ * @fs_info - the fs_info for our fs.
+ * @flush - control how we can flush for this reservation.
+ *
+ * This will refill the delayed block_rsv with up to 1 item's worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+				  enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved < block_rsv->size) {
+		num_bytes = block_rsv->size - block_rsv->reserved;
+		num_bytes = min(num_bytes, limit);
+	}
+	spin_unlock(&block_rsv->lock);
+
+	if (!num_bytes)
+		return 0;
+
+	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+					   num_bytes, flush);
+	if (ret)
+		return ret;
+	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
+	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+				      0, num_bytes, 1);
+	return 0;
+}
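+
+/*
+ * Callers are expected to top the rsv back up before forcing delayed ref
+ * processing, e.g. (sketch; the flush mode depends on the call site):
+ *
+ *	ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_LIMIT);
+ */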
+
 /*
  * compare two delayed tree backrefs with same bytenr and type
  */
@@ -101,14 +275,15 @@
 }
 
 /* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
 						   struct rb_node *node)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_head *entry;
 	struct btrfs_delayed_ref_head *ins;
 	u64 bytenr;
+	bool leftmost = true;
 
 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
 	bytenr = ins->bytenr;
@@ -117,26 +292,29 @@
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
 				 href_node);
 
-		if (bytenr < entry->bytenr)
+		if (bytenr < entry->bytenr) {
 			p = &(*p)->rb_left;
-		else if (bytenr > entry->bytenr)
+		} else if (bytenr > entry->bytenr) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return entry;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	return NULL;
 }
 
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
 		struct btrfs_delayed_ref_node *ins)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *node = &ins->ref_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
+	bool leftmost = true;
 
 	while (*p) {
 		int comp;
@@ -145,32 +323,49 @@
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 ref_node);
 		comp = comp_refs(ins, entry, true);
-		if (comp < 0)
+		if (comp < 0) {
 			p = &(*p)->rb_left;
-		else if (comp > 0)
+		} else if (comp > 0) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return entry;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	return NULL;
 }
 
-/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
- */
-static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
-	      int return_bigger)
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+		struct btrfs_delayed_ref_root *dr)
 {
 	struct rb_node *n;
 	struct btrfs_delayed_ref_head *entry;
 
+	n = rb_first_cached(&dr->href_root);
+	if (!n)
+		return NULL;
+
+	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+	return entry;
+}
+
+/*
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot.  If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
+ */
+static struct btrfs_delayed_ref_head *find_ref_head(
+		struct btrfs_delayed_ref_root *dr, u64 bytenr,
+		bool return_bigger)
+{
+	struct rb_root *root = &dr->href_root.rb_root;
+	struct rb_node *n;
+	struct btrfs_delayed_ref_head *entry;
+
 	n = root->rb_node;
 	entry = NULL;
 	while (n) {
@@ -187,22 +382,18 @@
 		if (bytenr > entry->bytenr) {
 			n = rb_next(&entry->href_node);
 			if (!n)
-				n = rb_first(root);
+				return NULL;
 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
 					 href_node);
-			return entry;
 		}
 		return entry;
 	}
 	return NULL;
 }
 
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
 			   struct btrfs_delayed_ref_head *head)
 {
-	struct btrfs_delayed_ref_root *delayed_refs;
-
-	delayed_refs = &trans->transaction->delayed_refs;
 	lockdep_assert_held(&delayed_refs->lock);
 	if (mutex_trylock(&head->mutex))
 		return 0;
@@ -227,15 +418,13 @@
 				    struct btrfs_delayed_ref_node *ref)
 {
 	lockdep_assert_held(&head->lock);
-	rb_erase(&ref->ref_node, &head->ref_tree);
+	rb_erase_cached(&ref->ref_node, &head->ref_tree);
 	RB_CLEAR_NODE(&ref->ref_node);
 	if (!list_empty(&ref->add_list))
 		list_del(&ref->add_list);
 	ref->in_tree = 0;
 	btrfs_put_delayed_ref(ref);
 	atomic_dec(&delayed_refs->num_entries);
-	if (trans->delayed_ref_updates)
-		trans->delayed_ref_updates--;
 }
 
 static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -296,7 +485,7 @@
 
 	lockdep_assert_held(&head->lock);
 
-	if (RB_EMPTY_ROOT(&head->ref_tree))
+	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		return;
 
 	/* We don't have too many refs to merge for data. */
@@ -314,7 +503,8 @@
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 
 again:
-	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&head->ref_tree); node;
+	     node = rb_next(node)) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		if (seq && ref->seq >= seq)
 			continue;
@@ -345,40 +535,29 @@
 	return ret;
 }
 
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+		struct btrfs_delayed_ref_root *delayed_refs)
 {
-	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_head *head;
-	u64 start;
-	bool loop = false;
-
-	delayed_refs = &trans->transaction->delayed_refs;
 
 again:
-	start = delayed_refs->run_delayed_start;
-	head = find_ref_head(&delayed_refs->href_root, start, 1);
-	if (!head && !loop) {
+	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+			     true);
+	if (!head && delayed_refs->run_delayed_start != 0) {
 		delayed_refs->run_delayed_start = 0;
-		start = 0;
-		loop = true;
-		head = find_ref_head(&delayed_refs->href_root, start, 1);
-		if (!head)
-			return NULL;
-	} else if (!head && loop) {
-		return NULL;
+		head = find_first_ref_head(delayed_refs);
 	}
+	if (!head)
+		return NULL;
 
 	while (head->processing) {
 		struct rb_node *node;
 
 		node = rb_next(&head->href_node);
 		if (!node) {
-			if (loop)
+			if (delayed_refs->run_delayed_start == 0)
 				return NULL;
 			delayed_refs->run_delayed_start = 0;
-			start = 0;
-			loop = true;
 			goto again;
 		}
 		head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -393,6 +572,20 @@
 	return head;
 }
 
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			   struct btrfs_delayed_ref_head *head)
+{
+	lockdep_assert_held(&delayed_refs->lock);
+	lockdep_assert_held(&head->lock);
+
+	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
+	RB_CLEAR_NODE(&head->href_node);
+	atomic_dec(&delayed_refs->num_entries);
+	delayed_refs->num_heads--;
+	if (head->processing == 0)
+		delayed_refs->num_heads_ready--;
+}
+
 /*
  * Helper to insert the ref_node to the tail or merge with tail.
  *
@@ -446,7 +639,6 @@
 	if (ref->action == BTRFS_ADD_DELAYED_REF)
 		list_add_tail(&ref->add_list, &href->ref_add_list);
 	atomic_inc(&root->num_entries);
-	trans->delayed_ref_updates++;
 	spin_unlock(&href->lock);
 	return ret;
 }
@@ -455,12 +647,14 @@
  * helper function to update the accounting in the head ref
  * existing and update must have the same bytenr
  */
-static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_delayed_ref_head *existing,
 			 struct btrfs_delayed_ref_head *update,
 			 int *old_ref_mod_ret)
 {
+	struct btrfs_delayed_ref_root *delayed_refs =
+		&trans->transaction->delayed_refs;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int old_ref_mod;
 
 	BUG_ON(existing->is_data != update->is_data);
@@ -518,10 +712,18 @@
 	 * versa we need to make sure to adjust pending_csums accordingly.
 	 */
 	if (existing->is_data) {
-		if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+		u64 csum_leaves =
+			btrfs_csum_bytes_to_leaves(fs_info,
+						   existing->num_bytes);
+
+		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
 			delayed_refs->pending_csums -= existing->num_bytes;
-		if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+		}
+		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
 			delayed_refs->pending_csums += existing->num_bytes;
+			trans->delayed_ref_updates += csum_leaves;
+		}
 	}
 	spin_unlock(&existing->lock);
 }
@@ -569,22 +771,19 @@
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
 	head_ref->is_system = is_system;
-	head_ref->ref_tree = RB_ROOT;
+	head_ref->ref_tree = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&head_ref->ref_add_list);
 	RB_CLEAR_NODE(&head_ref->href_node);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
-	head_ref->qgroup_reserved = 0;
-	head_ref->qgroup_ref_root = 0;
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
 
 	if (qrecord) {
 		if (ref_root && reserved) {
-			head_ref->qgroup_ref_root = ref_root;
-			head_ref->qgroup_reserved = reserved;
+			qrecord->data_rsv = reserved;
+			qrecord->data_rsv_refroot = ref_root;
 		}
-
 		qrecord->bytenr = bytenr;
 		qrecord->num_bytes = num_bytes;
 		qrecord->old_roots = NULL;
@@ -623,11 +822,7 @@
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
-		WARN_ON(qrecord && head_ref->qgroup_ref_root
-			&& head_ref->qgroup_reserved
-			&& existing->qgroup_ref_root
-			&& existing->qgroup_reserved);
-		update_existing_head_ref(delayed_refs, existing, head_ref,
+		update_existing_head_ref(trans, existing, head_ref,
 					 old_ref_mod);
 		/*
 		 * we've updated the existing ref, free the newly
@@ -638,8 +833,12 @@
 	} else {
 		if (old_ref_mod)
 			*old_ref_mod = 0;
-		if (head_ref->is_data && head_ref->ref_mod < 0)
+		if (head_ref->is_data && head_ref->ref_mod < 0) {
 			delayed_refs->pending_csums += head_ref->num_bytes;
+			trans->delayed_ref_updates +=
+				btrfs_csum_bytes_to_leaves(trans->fs_info,
+							   head_ref->num_bytes);
+		}
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
@@ -710,8 +909,7 @@
  * transaction commits.
  */
 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root,  int level, int action,
+			       struct btrfs_ref *generic_ref,
 			       struct btrfs_delayed_extent_op *extent_op,
 			       int *old_ref_mod, int *new_ref_mod)
 {
@@ -721,10 +919,18 @@
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 	int qrecord_inserted;
-	bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+	bool is_system;
+	int action = generic_ref->action;
+	int level = generic_ref->tree_ref.level;
 	int ret;
+	u64 bytenr = generic_ref->bytenr;
+	u64 num_bytes = generic_ref->len;
+	u64 parent = generic_ref->parent;
 	u8 ref_type;
 
+	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+
+	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -737,8 +943,10 @@
 	}
 
 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    is_fstree(ref_root)) {
-		record = kmalloc(sizeof(*record), GFP_NOFS);
+	    is_fstree(generic_ref->real_root) &&
+	    is_fstree(generic_ref->tree_ref.root) &&
+	    !generic_ref->skip_qgroup) {
+		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
 			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -752,13 +960,14 @@
 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-				ref_root, action, ref_type);
-	ref->root = ref_root;
+				generic_ref->tree_ref.root, action, ref_type);
+	ref->root = generic_ref->tree_ref.root;
 	ref->parent = parent;
 	ref->level = level;
 
 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
-			      ref_root, 0, action, false, is_system);
+			      generic_ref->tree_ref.root, 0, action, false,
+			      is_system);
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -775,6 +984,12 @@
 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
+	/*
+	 * Need to update the delayed_refs_rsv with any changes we may have
+	 * made.
+	 */
+	btrfs_update_delayed_refs_rsv(trans);
+
 	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
 				   action == BTRFS_ADD_DELAYED_EXTENT ?
 				   BTRFS_ADD_DELAYED_REF : action);
@@ -791,10 +1006,9 @@
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, u64 reserved, int action,
-			       int *old_ref_mod, int *new_ref_mod)
+			       struct btrfs_ref *generic_ref,
+			       u64 reserved, int *old_ref_mod,
+			       int *new_ref_mod)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_data_ref *ref;
@@ -802,9 +1016,17 @@
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 	int qrecord_inserted;
+	int action = generic_ref->action;
 	int ret;
+	u64 bytenr = generic_ref->bytenr;
+	u64 num_bytes = generic_ref->len;
+	u64 parent = generic_ref->parent;
+	u64 ref_root = generic_ref->data_ref.ref_root;
+	u64 owner = generic_ref->data_ref.ino;
+	u64 offset = generic_ref->data_ref.offset;
 	u8 ref_type;
 
+	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
@@ -828,8 +1050,10 @@
 	}
 
 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    is_fstree(ref_root)) {
-		record = kmalloc(sizeof(*record), GFP_NOFS);
+	    is_fstree(ref_root) &&
+	    is_fstree(generic_ref->real_root) &&
+	    !generic_ref->skip_qgroup) {
+		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
 			kmem_cache_free(btrfs_delayed_ref_head_cachep,
@@ -856,6 +1080,12 @@
 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
+	/*
+	 * Need to update the delayed_refs_rsv with any changes we may have
+	 * made.
+	 */
+	btrfs_update_delayed_refs_rsv(trans);
+
 	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
 				   action == BTRFS_ADD_DELAYED_EXTENT ?
 				   BTRFS_ADD_DELAYED_REF : action);
@@ -868,8 +1098,7 @@
 	return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
-				struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op)
 {
@@ -892,18 +1121,25 @@
 			     NULL, NULL, NULL);
 
 	spin_unlock(&delayed_refs->lock);
+
+	/*
+	 * Need to update the delayed_refs_rsv with any changes we may have
+	 * made.
+	 */
+	btrfs_update_delayed_refs_rsv(trans);
 	return 0;
 }
 
 /*
- * this does a simple search for the head node for a given extent.
- * It must be called with the delayed ref spinlock held, and it returns
- * the head node if any where found, or NULL if not.
+ * This does a simple search for the head node for a given extent.  Returns the
+ * head node if found, or NULL if not.
  */
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
 {
-	return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+	lockdep_assert_held(&delayed_refs->lock);
+
+	return find_ref_head(delayed_refs, bytenr, false);
 }
 
 void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4e..1c977e6 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@
 	struct mutex mutex;
 
 	spinlock_t lock;
-	struct rb_root ref_tree;
+	struct rb_root_cached ref_tree;
 	/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
 	struct list_head ref_add_list;
 
@@ -103,17 +103,6 @@
 	int ref_mod;
 
 	/*
-	 * For qgroup reserved space freeing.
-	 *
-	 * ref_root and reserved will be recorded after
-	 * BTRFS_ADD_DELAYED_EXTENT is called.
-	 * And will be used to free reserved qgroup space at
-	 * run_delayed_refs() time.
-	 */
-	u64 qgroup_ref_root;
-	u64 qgroup_reserved;
-
-	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
 	 * until the delayed ref is processed.  must_insert_reserved is
@@ -148,7 +137,7 @@
 
 struct btrfs_delayed_ref_root {
 	/* head ref rbtree */
-	struct rb_root href_root;
+	struct rb_root_cached href_root;
 
 	/* dirty extent records */
 	struct rb_root dirty_extent_root;
@@ -187,6 +176,83 @@
 	u64 qgroup_to_skip;
 };
 
+enum btrfs_ref_type {
+	BTRFS_REF_NOT_SET,
+	BTRFS_REF_DATA,
+	BTRFS_REF_METADATA,
+	BTRFS_REF_LAST,
+};
+
+struct btrfs_data_ref {
+	/* For EXTENT_DATA_REF */
+
+	/* Root which refers to this data extent */
+	u64 ref_root;
+
+	/* Inode which refers to this data extent */
+	u64 ino;
+
+	/*
+	 * file_offset - extent_offset
+	 *
+	 * file_offset is the key.offset of the EXTENT_DATA key.
+	 * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+	 */
+	u64 offset;
+};
+
+struct btrfs_tree_ref {
+	/*
+	 * Level of this tree block
+	 *
+	 * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+	 */
+	int level;
+
+	/*
+	 * Root which refers to this tree block.
+	 *
+	 * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
+	 */
+	u64 root;
+
+	/* For non-skinny metadata, no special member needed */
+};
+
+struct btrfs_ref {
+	enum btrfs_ref_type type;
+	int action;
+
+	/*
+	 * Whether qgroup recording should be skipped for this extent.
+	 *
+	 * Normally false, but for certain cases like delayed subtree scan,
+	 * setting this flag can hugely reduce qgroup overhead.
+	 */
+	bool skip_qgroup;
+
+	/*
+	 * Optional. Indicates which root this modification is for.
+	 * Mostly used for qgroup optimization.
+	 *
+	 * When unset, the data/tree ref init code will populate it.
+	 * In certain cases, we're modifying a reference for a different root,
+	 * e.g. COWing fs tree blocks for balance.  In that case tree_ref::root
+	 * will be the fs tree, but since we're doing this for the reloc tree,
+	 * @real_root should be set to the reloc tree.
+	 */
+	u64 real_root;
+	u64 bytenr;
+	u64 len;
+
+	/* Bytenr of the parent tree block */
+	u64 parent;
+	union {
+		struct btrfs_data_ref data_ref;
+		struct btrfs_tree_ref tree_ref;
+	};
+};
+
 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
 extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
 extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
@@ -195,6 +261,38 @@
 int __init btrfs_delayed_ref_init(void);
 void __cold btrfs_delayed_ref_exit(void);
 
+static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
+				int action, u64 bytenr, u64 len, u64 parent)
+{
+	generic_ref->action = action;
+	generic_ref->bytenr = bytenr;
+	generic_ref->len = len;
+	generic_ref->parent = parent;
+}
+
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
+				int level, u64 root)
+{
+	/* If @real_root not set, use @root as fallback */
+	if (!generic_ref->real_root)
+		generic_ref->real_root = root;
+	generic_ref->tree_ref.level = level;
+	generic_ref->tree_ref.root = root;
+	generic_ref->type = BTRFS_REF_METADATA;
+}
+
+static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
+				u64 ref_root, u64 ino, u64 offset)
+{
+	/* If @real_root not set, use @ref_root as fallback */
+	if (!generic_ref->real_root)
+		generic_ref->real_root = ref_root;
+	generic_ref->data_ref.ref_root = ref_root;
+	generic_ref->data_ref.ino = ino;
+	generic_ref->data_ref.offset = offset;
+	generic_ref->type = BTRFS_REF_DATA;
+}
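+
+/*
+ * Typical call sequence (sketch; adds a metadata ref for a level-0 block
+ * owned by the fs tree, with bytenr/num_bytes/parent assumed from the
+ * caller):
+ *
+ *	struct btrfs_ref ref = { 0 };
+ *
+ *	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
+ *			       num_bytes, parent);
+ *	btrfs_init_tree_ref(&ref, 0, BTRFS_FS_TREE_OBJECTID);
+ *	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL, &old_ref_mod,
+ *					 &new_ref_mod);
+ */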
+
 static inline struct btrfs_delayed_extent_op *
 btrfs_alloc_delayed_extent_op(void)
 {
@@ -235,17 +333,14 @@
 }
 
 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 ref_root, int level, int action,
+			       struct btrfs_ref *generic_ref,
 			       struct btrfs_delayed_extent_op *extent_op,
 			       int *old_ref_mod, int *new_ref_mod);
 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
-			       u64 bytenr, u64 num_bytes,
-			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, u64 reserved, int action,
-			       int *old_ref_mod, int *new_ref_mod);
-int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
-				struct btrfs_trans_handle *trans,
+			       struct btrfs_ref *generic_ref,
+			       u64 reserved, int *old_ref_mod,
+			       int *new_ref_mod);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op);
 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -255,19 +350,30 @@
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
 			   struct btrfs_delayed_ref_head *head);
 static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 {
 	mutex_unlock(&head->mutex);
 }
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			   struct btrfs_delayed_ref_head *head);
 
-
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+		struct btrfs_delayed_ref_root *delayed_refs);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+				  enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+				       struct btrfs_block_rsv *src,
+				       u64 num_bytes);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+
 /*
  * helper functions to cast a node into its container
  */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9814347..4889082 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -9,6 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -56,10 +57,9 @@
 no_valid_dev_replace_entry_found:
 		ret = 0;
 		dev_replace->replace_state =
-			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
 		dev_replace->cont_reading_from_srcdev_mode =
 		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
-		dev_replace->replace_state = 0;
 		dev_replace->time_started = 0;
 		dev_replace->time_stopped = 0;
 		atomic64_set(&dev_replace->num_write_errors, 0);
@@ -112,11 +112,11 @@
 		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
-							NULL, NULL);
-		dev_replace->tgtdev = btrfs_find_device(fs_info,
+		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+						src_devid, NULL, NULL, true);
+		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
 							BTRFS_DEV_REPLACE_DEVID,
-							NULL, NULL);
+							NULL, NULL, true);
 		/*
 		 * allow 'btrfs dev replace_cancel' if src/tgt device is
 		 * missing
@@ -202,7 +202,7 @@
 		return PTR_ERR(bdev);
 	}
 
-	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	sync_blockdev(bdev);
 
 	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
@@ -238,7 +238,6 @@
 	}
 	rcu_assign_pointer(device->name, name);
 
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	device->generation = 0;
 	device->io_width = fs_info->sectorsize;
@@ -257,6 +256,8 @@
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_info->fs_devices;
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
 	fs_info->fs_devices->num_devices++;
 	fs_info->fs_devices->open_devices++;
@@ -274,9 +275,9 @@
  * called from commit_transaction. Writes changed device replace state to
  * disk.
  */
-int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
-			  struct btrfs_fs_info *fs_info)
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int ret;
 	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct btrfs_path *path;
@@ -285,13 +286,13 @@
 	struct btrfs_dev_replace_item *ptr;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_read_lock(dev_replace);
+	down_read(&dev_replace->rwsem);
 	if (!dev_replace->is_valid ||
 	    !dev_replace->item_needs_writeback) {
-		btrfs_dev_replace_read_unlock(dev_replace);
+		up_read(&dev_replace->rwsem);
 		return 0;
 	}
-	btrfs_dev_replace_read_unlock(dev_replace);
+	up_read(&dev_replace->rwsem);
 
 	key.objectid = 0;
 	key.type = BTRFS_DEV_REPLACE_KEY;
@@ -349,7 +350,7 @@
 	ptr = btrfs_item_ptr(eb, path->slots[0],
 			     struct btrfs_dev_replace_item);
 
-	btrfs_dev_replace_write_lock(dev_replace);
+	down_write(&dev_replace->rwsem);
 	if (dev_replace->srcdev)
 		btrfs_set_dev_replace_src_devid(eb, ptr,
 			dev_replace->srcdev->devid);
@@ -372,7 +373,7 @@
 	btrfs_set_dev_replace_cursor_right(eb, ptr,
 		dev_replace->cursor_right);
 	dev_replace->item_needs_writeback = 0;
-	btrfs_dev_replace_write_unlock(dev_replace);
+	up_write(&dev_replace->rwsem);
 
 	btrfs_mark_buffer_dirty(eb);
 
@@ -382,14 +383,6 @@
 	return ret;
 }
 
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
-	dev_replace->committed_cursor_left =
-		dev_replace->cursor_left_last_write_of_item;
-}
-
 static char* btrfs_dev_name(struct btrfs_device *device)
 {
 	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
@@ -398,7 +391,7 @@
 		return rcu_str_deref(device->name);
 }
 
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
 		int read_src)
 {
@@ -409,15 +402,17 @@
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
 
-	ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
-					    srcdev_name, &src_device);
-	if (ret)
-		return ret;
+	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+						  srcdev_name);
+	if (IS_ERR(src_device))
+		return PTR_ERR(src_device);
 
-	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
-					    src_device, &tgt_device);
-	if (ret)
-		return ret;
+	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
+		btrfs_warn_in_rcu(fs_info,
+	  "cannot replace device %s (devid %llu) due to active swapfile",
+			btrfs_dev_name(src_device), src_device->devid);
+		return -ETXTBSY;
+	}
 
 	/*
 	 * Here we commit the transaction to make sure commit_total_bytes
@@ -432,7 +427,12 @@
 		return PTR_ERR(trans);
 	}
 
-	btrfs_dev_replace_write_lock(dev_replace);
+	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
+					    src_device, &tgt_device);
+	if (ret)
+		return ret;
+
+	down_write(&dev_replace->rwsem);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -442,11 +442,11 @@
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 		ASSERT(0);
 		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+		up_write(&dev_replace->rwsem);
 		goto leave;
 	}
 
 	dev_replace->cont_reading_from_srcdev_mode = read_src;
-	WARN_ON(!src_device);
 	dev_replace->srcdev = src_device;
 	dev_replace->tgtdev = tgt_device;
 
@@ -470,7 +470,7 @@
 	dev_replace->item_needs_writeback = 1;
 	atomic64_set(&dev_replace->num_write_errors, 0);
 	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
-	btrfs_dev_replace_write_unlock(dev_replace);
+	up_write(&dev_replace->rwsem);
 
 	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 	if (ret)
@@ -478,15 +478,16 @@
 
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
-	/* force writing the updated state information to disk */
-	trans = btrfs_start_transaction(root, 0);
+	/* Commit dev_replace state and reserve 1 item for it. */
+	trans = btrfs_start_transaction(root, 1);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		btrfs_dev_replace_write_lock(dev_replace);
+		down_write(&dev_replace->rwsem);
 		dev_replace->replace_state =
 			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
 		dev_replace->srcdev = NULL;
 		dev_replace->tgtdev = NULL;
+		up_write(&dev_replace->rwsem);
 		goto leave;
 	}
 
@@ -501,14 +502,13 @@
 	ret = btrfs_dev_replace_finishing(fs_info, ret);
 	if (ret == -EINPROGRESS) {
 		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-	} else {
+	} else if (ret != -ECANCELED) {
 		WARN_ON(ret);
 	}
 
 	return ret;
 
 leave:
-	btrfs_dev_replace_write_unlock(dev_replace);
 	btrfs_destroy_dev_replace_tgtdev(tgt_device);
 	return ret;
 }
@@ -536,8 +536,9 @@
 					args->start.cont_reading_from_srcdev_mode);
 	args->result = ret;
 	/* don't warn if EINPROGRESS, someone else might be running scrub */
-	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-		ret = 0;
+	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+		return 0;
 
 	return ret;
 }
@@ -548,8 +549,8 @@
 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 {
 	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	wait_event(fs_info->replace_wait, !percpu_counter_sum(
-		   &fs_info->bio_counter));
+	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+		   &fs_info->dev_replace.bio_counter));
 }
 
 /*
@@ -558,7 +559,7 @@
 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 {
 	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	wake_up(&fs_info->replace_wait);
+	wake_up(&fs_info->dev_replace.replace_wait);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -575,18 +576,18 @@
 	/* don't allow cancel or unmount to disturb the finishing procedure */
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 
-	btrfs_dev_replace_read_lock(dev_replace);
+	down_read(&dev_replace->rwsem);
 	/* was the operation canceled, or is it finished? */
 	if (dev_replace->replace_state !=
 	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
-		btrfs_dev_replace_read_unlock(dev_replace);
+		up_read(&dev_replace->rwsem);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return 0;
 	}
 
 	tgt_device = dev_replace->tgtdev;
 	src_device = dev_replace->srcdev;
-	btrfs_dev_replace_read_unlock(dev_replace);
+	up_read(&dev_replace->rwsem);
 
 	/*
 	 * flush all outstanding I/O and inode extent mappings before the
@@ -599,18 +600,34 @@
 	}
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans)) {
-		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
-		return PTR_ERR(trans);
-	}
-	ret = btrfs_commit_transaction(trans);
-	WARN_ON(ret);
+	/*
+	 * We have to use this loop approach because at this point src_device
+	 * has to be available for transaction commit to complete, yet new
+	 * chunks shouldn't be allocated on the device.
+	 */
+	while (1) {
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans);
+		WARN_ON(ret);
 
-	/* keep away write_all_supers() during the finishing procedure */
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	mutex_lock(&fs_info->chunk_mutex);
-	btrfs_dev_replace_write_lock(dev_replace);
+		/* Prevent write_all_supers() during the finishing procedure */
+		mutex_lock(&fs_info->fs_devices->device_list_mutex);
+		/* Prevent new chunks being allocated on the source device */
+		mutex_lock(&fs_info->chunk_mutex);
+
+		if (!list_empty(&src_device->post_commit_list)) {
+			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+			mutex_unlock(&fs_info->chunk_mutex);
+		} else {
+			break;
+		}
+	}
+
+	down_write(&dev_replace->rwsem);
 	dev_replace->replace_state =
 		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -625,12 +642,13 @@
 								src_device,
 								tgt_device);
 	} else {
-		btrfs_err_in_rcu(fs_info,
+		if (scrub_ret != -ECANCELED)
+			btrfs_err_in_rcu(fs_info,
 				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 				 btrfs_dev_name(src_device),
 				 src_device->devid,
 				 rcu_str_deref(tgt_device->name), scrub_ret);
-		btrfs_dev_replace_write_unlock(dev_replace);
+		up_write(&dev_replace->rwsem);
 		mutex_unlock(&fs_info->chunk_mutex);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		btrfs_rm_dev_replace_blocked(fs_info);
@@ -657,8 +675,6 @@
 	btrfs_device_set_disk_total_bytes(tgt_device,
 					  src_device->disk_total_bytes);
 	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
-	ASSERT(list_empty(&src_device->resized_list));
-	tgt_device->commit_total_bytes = src_device->commit_total_bytes;
 	tgt_device->commit_bytes_used = src_device->bytes_used;
 
 	btrfs_assign_next_active_device(src_device, tgt_device);
@@ -666,8 +682,7 @@
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 	fs_info->fs_devices->rw_devices++;
 
-	btrfs_dev_replace_write_unlock(dev_replace);
-
+	up_write(&dev_replace->rwsem);
 	btrfs_rm_dev_replace_blocked(fs_info);
 
 	btrfs_rm_dev_replace_remove_srcdev(src_device);
@@ -692,7 +707,7 @@
 
 	/* replace the sysfs entry */
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
-	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+	btrfs_rm_dev_replace_free_srcdev(src_device);
 
 	/* write back the superblocks */
 	trans = btrfs_start_transaction(root, 0);
@@ -709,7 +724,7 @@
 						struct btrfs_device *srcdev,
 						struct btrfs_device *tgtdev)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
 	u64 start = 0;
@@ -764,7 +779,7 @@
 {
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_read_lock(dev_replace);
+	down_read(&dev_replace->rwsem);
 	/* even if !dev_replace_is_valid, the values are good enough for
 	 * the replace_status ioctl */
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -776,7 +791,7 @@
 	args->status.num_uncorrectable_read_errors =
 		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
 	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
-	btrfs_dev_replace_read_unlock(dev_replace);
+	up_read(&dev_replace->rwsem);
 }
 
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
@@ -793,46 +808,75 @@
 		return -EROFS;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_write_lock(dev_replace);
+	down_write(&dev_replace->rwsem);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
-		btrfs_dev_replace_write_unlock(dev_replace);
-		goto leave;
+		up_write(&dev_replace->rwsem);
+		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		tgt_device = dev_replace->tgtdev;
+		src_device = dev_replace->srcdev;
+		up_write(&dev_replace->rwsem);
+		ret = btrfs_scrub_cancel(fs_info);
+		if (ret < 0) {
+			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+		} else {
+			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+			/*
+			 * btrfs_dev_replace_finishing() will handle the
+			 * cleanup part
+			 */
+			btrfs_info_in_rcu(fs_info,
+				"dev_replace from %s (devid %llu) to %s canceled",
+				btrfs_dev_name(src_device), src_device->devid,
+				btrfs_dev_name(tgt_device));
+		}
+		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * Scrub doing the replace isn't running so we need to do the
+		 * cleanup step of btrfs_dev_replace_finishing() here
+		 */
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 		tgt_device = dev_replace->tgtdev;
 		src_device = dev_replace->srcdev;
 		dev_replace->tgtdev = NULL;
 		dev_replace->srcdev = NULL;
+		dev_replace->replace_state =
+				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+		dev_replace->time_stopped = ktime_get_real_seconds();
+		dev_replace->item_needs_writeback = 1;
+
+		up_write(&dev_replace->rwsem);
+
+		/* Scrub for replace must not be running in suspended state */
+		ret = btrfs_scrub_cancel(fs_info);
+		ASSERT(ret != -ENOTCONN);
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans);
+		WARN_ON(ret);
+
+		btrfs_info_in_rcu(fs_info,
+		"suspended dev_replace from %s (devid %llu) to %s canceled",
+			btrfs_dev_name(src_device), src_device->devid,
+			btrfs_dev_name(tgt_device));
+
+		if (tgt_device)
+			btrfs_destroy_dev_replace_tgtdev(tgt_device);
 		break;
+	default:
+		up_write(&dev_replace->rwsem);
+		result = -EINVAL;
 	}
-	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-	dev_replace->time_stopped = ktime_get_real_seconds();
-	dev_replace->item_needs_writeback = 1;
-	btrfs_dev_replace_write_unlock(dev_replace);
-	btrfs_scrub_cancel(fs_info);
 
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans)) {
-		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
-		return PTR_ERR(trans);
-	}
-	ret = btrfs_commit_transaction(trans);
-	WARN_ON(ret);
-
-	btrfs_info_in_rcu(fs_info,
-		"dev_replace from %s (devid %llu) to %s canceled",
-		btrfs_dev_name(src_device), src_device->devid,
-		btrfs_dev_name(tgt_device));
-
-	if (tgt_device)
-		btrfs_destroy_dev_replace_tgtdev(tgt_device);
-
-leave:
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 	return result;
 }
@@ -842,7 +886,8 @@
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_write_lock(dev_replace);
+	down_write(&dev_replace->rwsem);
+
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -858,7 +903,7 @@
 		break;
 	}
 
-	btrfs_dev_replace_write_unlock(dev_replace);
+	up_write(&dev_replace->rwsem);
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 }
 
@@ -868,12 +913,13 @@
 	struct task_struct *task;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_write_lock(dev_replace);
+	down_write(&dev_replace->rwsem);
+
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-		btrfs_dev_replace_write_unlock(dev_replace);
+		up_write(&dev_replace->rwsem);
 		return 0;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 		break;
@@ -887,10 +933,12 @@
 			   "cannot continue dev_replace, tgtdev is missing");
 		btrfs_info(fs_info,
 			   "you may cancel the operation after 'mount -o degraded'");
-		btrfs_dev_replace_write_unlock(dev_replace);
+		dev_replace->replace_state =
+					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		up_write(&dev_replace->rwsem);
 		return 0;
 	}
-	btrfs_dev_replace_write_unlock(dev_replace);
+	up_write(&dev_replace->rwsem);
 
 	/*
 	 * This could collide with a paused balance, but the exclusive op logic
@@ -898,6 +946,10 @@
 	 * dev-replace to start anyway.
 	 */
 	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+		down_write(&dev_replace->rwsem);
+		dev_replace->replace_state =
+					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		up_write(&dev_replace->rwsem);
 		btrfs_info(fs_info,
 		"cannot resume dev-replace, other exclusive operation running");
 		return 0;
@@ -928,7 +980,7 @@
 			      btrfs_device_get_total_bytes(dev_replace->srcdev),
 			      &dev_replace->scrub_progress, 0, 1);
 	ret = btrfs_dev_replace_finishing(fs_info, ret);
-	WARN_ON(ret);
+	WARN_ON(ret && ret != -ECANCELED);
 
 	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 	return 0;
@@ -951,7 +1003,7 @@
 		 * something that can happen if the dev_replace
 		 * procedure is suspended by an umount and then
 		 * the tgtdev is missing (or "btrfs dev scan") was
-		 * not called and the the filesystem is remounted
+		 * not called and the filesystem is remounted
 		 * in degraded state. This does not stop the
 		 * dev_replace procedure. It needs to be canceled
 		 * manually if the cancellation is wanted.
@@ -961,81 +1013,27 @@
 	return 1;
 }
 
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
-{
-	read_lock(&dev_replace->lock);
-	atomic_inc(&dev_replace->read_locks);
-}
-
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
-{
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
-	atomic_dec(&dev_replace->read_locks);
-	read_unlock(&dev_replace->lock);
-}
-
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
-{
-again:
-	wait_event(dev_replace->read_lock_wq,
-		   atomic_read(&dev_replace->blocking_readers) == 0);
-	write_lock(&dev_replace->lock);
-	if (atomic_read(&dev_replace->blocking_readers)) {
-		write_unlock(&dev_replace->lock);
-		goto again;
-	}
-}
-
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
-{
-	ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
-	write_unlock(&dev_replace->lock);
-}
-
-/* inc blocking cnt and release read lock */
-void btrfs_dev_replace_set_lock_blocking(
-					struct btrfs_dev_replace *dev_replace)
-{
-	/* only set blocking for read lock */
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
-	atomic_inc(&dev_replace->blocking_readers);
-	read_unlock(&dev_replace->lock);
-}
-
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
-					struct btrfs_dev_replace *dev_replace)
-{
-	/* only set blocking for read lock */
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
-	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
-	read_lock(&dev_replace->lock);
-	/* Barrier implied by atomic_dec_and_test */
-	if (atomic_dec_and_test(&dev_replace->blocking_readers))
-		cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 {
-	percpu_counter_inc(&fs_info->bio_counter);
+	percpu_counter_inc(&fs_info->dev_replace.bio_counter);
 }
 
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-	percpu_counter_sub(&fs_info->bio_counter, amount);
-	cond_wake_up_nomb(&fs_info->replace_wait);
+	percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+	cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
 }
 
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
 {
 	while (1) {
-		percpu_counter_inc(&fs_info->bio_counter);
+		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
 		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 				     &fs_info->fs_state)))
 			break;
 
 		btrfs_bio_counter_dec(fs_info);
-		wait_event(fs_info->replace_wait,
+		wait_event(fs_info->dev_replace.replace_wait,
 			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 				     &fs_info->fs_state));
 	}
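
With dev-replace.c done, two conversions stand out: all replace state is now guarded by dev_replace->rwsem, and the bio counter has moved from fs_info into struct btrfs_dev_replace. The counter itself is a standard kernel idiom, a percpu counter paired with a wait queue; a generic, self-contained sketch of that idiom follows (the io_gate type and names are illustrative, not btrfs code):

#include <linux/percpu_counter.h>
#include <linux/wait.h>

struct io_gate {
	struct percpu_counter in_flight;	/* cheap per-CPU increments */
	wait_queue_head_t drained;		/* woken when count may hit 0 */
};

/*
 * percpu_counter_init() and init_waitqueue_head() are assumed at setup
 * time, mirroring what the disk-io.c hunks below do for dev_replace.
 */
static void io_gate_enter(struct io_gate *g)
{
	percpu_counter_inc(&g->in_flight);
}

static void io_gate_exit(struct io_gate *g)
{
	percpu_counter_dec(&g->in_flight);
	/* btrfs uses cond_wake_up_nomb(), which performs this sleeper check */
	if (wq_has_sleeper(&g->drained))
		wake_up(&g->drained);
}

static void io_gate_drain(struct io_gate *g)
{
	/* percpu_counter_sum() folds every per-CPU delta into one total */
	wait_event(g->drained, percpu_counter_sum(&g->in_flight) == 0);
}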
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206..78c5d8f 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -9,31 +9,14 @@
 struct btrfs_ioctl_dev_replace_args;
 
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
-int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
-			  struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 			    struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
-		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
-		int read_src);
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 			      struct btrfs_ioctl_dev_replace_args *args);
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
-					struct btrfs_dev_replace *dev_replace);
-
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
-	atomic64_inc(stat_value);
-}
 
 #endif
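
The six lock helpers removed from this header emulated an rwlock with blocking readers out of atomics and a wait queue; a stock rw_semaphore provides the same semantics directly, which is why dev-replace.c now takes dev_replace->rwsem with plain down_read()/down_write(). A minimal sketch of the replacement idiom (the replace_state type here is illustrative):

#include <linux/rwsem.h>

struct replace_state {
	struct rw_semaphore rwsem;	/* init_rwsem() at setup time */
	int state;			/* protected by rwsem */
};

static int replace_state_get(struct replace_state *rs)
{
	int state;

	down_read(&rs->rwsem);	/* shared; sleeping while held is allowed */
	state = rs->state;
	up_read(&rs->rwsem);
	return state;
}

static void replace_state_set(struct replace_state *rs, int state)
{
	down_write(&rs->rwsem);	/* exclusive; waits out all readers */
	rs->state = state;
	up_write(&rs->rwsem);
}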
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a678b07..863367c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -36,7 +36,7 @@
 		di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
 		if (di)
 			return ERR_PTR(-EEXIST);
-		btrfs_extend_item(fs_info, path, data_size);
+		btrfs_extend_item(path, data_size);
 	} else if (ret < 0)
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
@@ -105,13 +105,13 @@
  * to use for the second index (if one is created).
  * Will return 0 or -ENOMEM
  */
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len,
-			  struct btrfs_inode *dir, struct btrfs_key *location,
-			  u8 type, u64 index)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
+			  int name_len, struct btrfs_inode *dir,
+			  struct btrfs_key *location, u8 type, u64 index)
 {
 	int ret = 0;
 	int ret2 = 0;
+	struct btrfs_root *root = dir->root;
 	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
 	struct extent_buffer *leaf;
@@ -429,8 +429,7 @@
 		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
-		btrfs_truncate_item(root->fs_info, path,
-				    item_len - sub_item_len, 1);
+		btrfs_truncate_item(path, item_len - sub_item_len, 1);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d4a7f7c..402b61b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,7 +17,9 @@
 #include <linux/semaphore.h>
 #include <linux/error-injection.h>
 #include <linux/crc32c.h>
+#include <linux/sched/mm.h>
 #include <asm/unaligned.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -38,10 +40,7 @@
 #include "compression.h"
 #include "tree-checker.h"
 #include "ref-verify.h"
-
-#ifdef CONFIG_X86
-#include <asm/cpufeature.h>
-#endif
+#include "block-group.h"
 
 #define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
 				 BTRFS_HEADER_FLAG_RELOC |\
@@ -125,8 +124,8 @@
  * Different roots are used for different purposes and may nest inside each
  * other and they require separate keysets.  As lockdep keys should be
  * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid.  This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid.  This ensures that all special purpose
+ * roots have separate keysets.
  *
  * Lock-nesting across peer nodes is always done with the immediate parent
  * node locked thus preventing deadlock.  As lockdep doesn't know this, use
@@ -248,26 +247,15 @@
 	return em;
 }
 
-u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
-{
-	return crc32c(seed, data, len);
-}
-
-void btrfs_csum_final(u32 crc, u8 *result)
-{
-	put_unaligned_le32(~crc, result);
-}
-
 /*
- * compute the csum for a btree block, and either verify it or write it
- * into the csum field of the block.
+ * Compute the csum of a btree block and store the result to provided buffer.
+ *
+ * Returns error if the extent buffer cannot be mapped.
  */
-static int csum_tree_block(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *buf,
-			   int verify)
+static int csum_tree_block(struct extent_buffer *buf, u8 *result)
 {
-	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	char result[BTRFS_CSUM_SIZE];
+	struct btrfs_fs_info *fs_info = buf->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	unsigned long len;
 	unsigned long cur_len;
 	unsigned long offset = BTRFS_CSUM_SIZE;
@@ -275,40 +263,31 @@
 	unsigned long map_start;
 	unsigned long map_len;
 	int err;
-	u32 crc = ~(u32)0;
+
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
 
 	len = buf->len - offset;
+
 	while (len > 0) {
+		/*
+		 * Note: we don't need to check for the err == 1 case here,
+		 * because with 'start = BTRFS_CSUM_SIZE (32)', 'min_len = 32'
+		 * and the currently implemented mapping algorithm we cannot
+		 * cross a page boundary.
+		 */
 		err = map_private_extent_buffer(buf, offset, 32,
 					&kaddr, &map_start, &map_len);
-		if (err)
+		if (WARN_ON(err))
 			return err;
 		cur_len = min(len, map_len - (offset - map_start));
-		crc = btrfs_csum_data(kaddr + offset - map_start,
-				      crc, cur_len);
+		crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
 		len -= cur_len;
 		offset += cur_len;
 	}
 	memset(result, 0, BTRFS_CSUM_SIZE);
 
-	btrfs_csum_final(crc, result);
-
-	if (verify) {
-		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
-			u32 val;
-			u32 found = 0;
-			memcpy(&found, result, csum_size);
-
-			read_extent_buffer(buf, &val, 0, csum_size);
-			btrfs_warn_rl(fs_info,
-				"%s checksum verify failed on %llu wanted %X found %X level %d",
-				fs_info->sb->s_id, buf->start,
-				val, found, btrfs_header_level(buf));
-			return -EUCLEAN;
-		}
-	} else {
-		write_extent_buffer(buf, result, 0, csum_size);
-	}
+	crypto_shash_final(shash, result);
 
 	return 0;
 }
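
csum_tree_block() now funnels everything through the crypto shash interface rather than calling crc32c() directly, which is what lets later hunks make the checksum algorithm pluggable. A self-contained sketch of the shash calls used in this hunk (assuming a crc32c provider is available; the helper names are illustrative):

#include <crypto/hash.h>

/* One-time setup, as btrfs_init_csum_hash() does later in this patch. */
static struct crypto_shash *csum_tfm_alloc(void)
{
	return crypto_alloc_shash("crc32c", 0, 0);	/* may return ERR_PTR */
}

/* Per-call usage: a descriptor on the stack, then init/update/final. */
static void csum_buffer(struct crypto_shash *tfm,
			const u8 *data, size_t len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	crypto_shash_init(shash);
	crypto_shash_update(shash, data, len);
	/* out must hold crypto_shash_digestsize(tfm) bytes (4 for crc32c) */
	crypto_shash_final(shash, out);
}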
@@ -335,7 +314,7 @@
 
 	if (need_lock) {
 		btrfs_tree_read_lock(eb);
-		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_set_lock_blocking_read(eb);
 	}
 
 	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -369,6 +348,16 @@
 	return ret;
 }
 
+static bool btrfs_supported_super_csum(u16 csum_type)
+{
+	switch (csum_type) {
+	case BTRFS_CSUM_TYPE_CRC32:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*
  * Return 0 if the superblock checksum type matches the checksum value of that
  * algorithm. Pass the raw disk superblock data.
@@ -378,51 +367,42 @@
 {
 	struct btrfs_super_block *disk_sb =
 		(struct btrfs_super_block *)raw_disk_sb;
-	u16 csum_type = btrfs_super_csum_type(disk_sb);
-	int ret = 0;
+	char result[BTRFS_CSUM_SIZE];
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 
-	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
-		u32 crc = ~(u32)0;
-		char result[sizeof(crc)];
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
 
-		/*
-		 * The super_block structure does not span the whole
-		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
-		 * is filled with zeros and is included in the checksum.
-		 */
-		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
-				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, result);
+	/*
+	 * The super_block structure does not span the whole
+	 * BTRFS_SUPER_INFO_SIZE range; we expect that the unused space is
+	 * filled with zeros and is included in the checksum.
+	 */
+	crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+	crypto_shash_final(shash, result);
 
-		if (memcmp(raw_disk_sb, result, sizeof(result)))
-			ret = 1;
-	}
+	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+		return 1;
 
-	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
-		btrfs_err(fs_info, "unsupported checksum algorithm %u",
-				csum_type);
-		ret = 1;
-	}
-
-	return ret;
+	return 0;
 }
 
-static int verify_level_key(struct btrfs_fs_info *fs_info,
-			    struct extent_buffer *eb, int level,
-			    struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+			   struct btrfs_key *first_key, u64 parent_transid)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	int found_level;
 	struct btrfs_key found_key;
 	int ret;
 
 	found_level = btrfs_header_level(eb);
 	if (found_level != level) {
-#ifdef CONFIG_BTRFS_DEBUG
-		WARN_ON(1);
+		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+		     KERN_ERR "BTRFS: tree level check failed\n");
 		btrfs_err(fs_info,
 "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
 			  eb->start, level, found_level);
-#endif
 		return -EIO;
 	}
 
@@ -437,15 +417,25 @@
 	 */
 	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
 		return 0;
+
+	/* We have @first_key, so this @eb must have at least one item */
+	if (btrfs_header_nritems(eb) == 0) {
+		btrfs_err(fs_info,
+		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
+			  eb->start);
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		return -EUCLEAN;
+	}
+
 	if (found_level)
 		btrfs_node_key_to_cpu(eb, &found_key, 0);
 	else
 		btrfs_item_key_to_cpu(eb, &found_key, 0);
 	ret = btrfs_comp_cpu_keys(first_key, &found_key);
 
-#ifdef CONFIG_BTRFS_DEBUG
 	if (ret) {
-		WARN_ON(1);
+		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+		     KERN_ERR "BTRFS: tree first key check failed\n");
 		btrfs_err(fs_info,
 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
 			  eb->start, parent_transid, first_key->objectid,
@@ -453,7 +443,6 @@
 			  found_key.objectid, found_key.type,
 			  found_key.offset);
 	}
-#endif
 	return ret;
 }
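
These hunks replace #ifdef CONFIG_BTRFS_DEBUG blocks around WARN_ON(1) with WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), ...), so the btrfs_err() message and the error return now fire on every build while the backtrace stays debug-only. The idiom in isolation (the function below is a contrived example):

#include <linux/bug.h>
#include <linux/kconfig.h>

static int check_level(int found_level, int expected_level)
{
	if (found_level != expected_level) {
		/*
		 * IS_ENABLED() is a compile-time constant, so on kernels
		 * without CONFIG_BTRFS_DEBUG the splat compiles away and
		 * only the error return remains.
		 */
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree level check failed\n");
		return -EIO;
	}
	return 0;
}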
 
@@ -465,11 +454,11 @@
  * @level:		expected level, mandatory check
  * @first_key:		expected key of first slot, skip check if NULL
  */
-static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
-					  struct extent_buffer *eb,
+static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
 					  u64 parent_transid, int level,
 					  struct btrfs_key *first_key)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct extent_io_tree *io_tree;
 	int failed = 0;
 	int ret;
@@ -480,14 +469,13 @@
 	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
 	while (1) {
 		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
-		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
-					       mirror_num);
+		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
 		if (!ret) {
 			if (verify_parent_transid(io_tree, eb,
 						   parent_transid, 0))
 				ret = -EIO;
-			else if (verify_level_key(fs_info, eb, level,
-						  first_key, parent_transid))
+			else if (btrfs_verify_level_key(eb, level,
+						first_key, parent_transid))
 				ret = -EUCLEAN;
 			else
 				break;
@@ -512,7 +500,7 @@
 	}
 
 	if (failed && !ret && failed_mirror)
-		repair_eb_io_failure(fs_info, eb, failed_mirror);
+		btrfs_repair_eb_io_failure(eb, failed_mirror);
 
 	return ret;
 }
@@ -526,7 +514,10 @@
 {
 	u64 start = page_offset(page);
 	u64 found_start;
+	u8 result[BTRFS_CSUM_SIZE];
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	struct extent_buffer *eb;
+	int ret;
 
 	eb = (struct extent_buffer *)page->private;
 	if (page != eb->pages[0])
@@ -542,22 +533,51 @@
 	if (WARN_ON(!PageUptodate(page)))
 		return -EUCLEAN;
 
-	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
 			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
 
-	return csum_tree_block(fs_info, eb, 0);
+	if (csum_tree_block(eb, result))
+		return -EINVAL;
+
+	if (btrfs_header_level(eb))
+		ret = btrfs_check_node(eb);
+	else
+		ret = btrfs_check_leaf_full(eb);
+
+	if (ret < 0) {
+		btrfs_err(fs_info,
+		"block=%llu write time tree block corruption detected",
+			  eb->start);
+		return ret;
+	}
+	write_extent_buffer(eb, result, 0, csum_size);
+
+	return 0;
 }
 
-static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb)
+static int check_tree_block_fsid(struct extent_buffer *eb)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u8 fsid[BTRFS_FSID_SIZE];
 	int ret = 1;
 
 	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	while (fs_devices) {
-		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+		u8 *metadata_uuid;
+
+		/*
+		 * Checking the incompat flag is only valid for the current
+		 * fs. For seed devices it's forbidden to have their uuid
+		 * changed so reading ->fsid in this case is fine
+		 */
+		if (fs_devices == fs_info->fs_devices &&
+		    btrfs_fs_incompat(fs_info, METADATA_UUID))
+			metadata_uuid = fs_devices->metadata_uuid;
+		else
+			metadata_uuid = fs_devices->fsid;
+
+		if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
 			ret = 0;
 			break;
 		}
@@ -575,7 +595,9 @@
 	struct extent_buffer *eb;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	int ret = 0;
+	u8 result[BTRFS_CSUM_SIZE];
 	int reads_done;
 
 	if (!page->private)
@@ -605,7 +627,7 @@
 		ret = -EIO;
 		goto err;
 	}
-	if (check_tree_block_fsid(fs_info, eb)) {
+	if (check_tree_block_fsid(eb)) {
 		btrfs_err_rl(fs_info, "bad fsid on block %llu",
 			     eb->start);
 		ret = -EIO;
@@ -622,25 +644,44 @@
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
 
-	ret = csum_tree_block(fs_info, eb, 1);
+	ret = csum_tree_block(eb, result);
 	if (ret)
 		goto err;
 
+	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
+		u32 val;
+		u32 found = 0;
+
+		memcpy(&found, result, csum_size);
+
+		read_extent_buffer(eb, &val, 0, csum_size);
+		btrfs_warn_rl(fs_info,
+		"%s checksum verify failed on %llu wanted %x found %x level %d",
+			      fs_info->sb->s_id, eb->start,
+			      val, found, btrfs_header_level(eb));
+		ret = -EUCLEAN;
+		goto err;
+	}
+
 	/*
 	 * If this is a leaf block and it is corrupt, set the corrupt bit so
 	 * that we don't try and read the other copies of this block, just
 	 * return -EIO.
 	 */
-	if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
+	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
 		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 		ret = -EIO;
 	}
 
-	if (found_level > 0 && btrfs_check_node(fs_info, eb))
+	if (found_level > 0 && btrfs_check_node(eb))
 		ret = -EIO;
 
 	if (!ret)
 		set_extent_buffer_uptodate(eb);
+	else
+		btrfs_err(fs_info,
+			  "block=%llu read time tree block corruption detected",
+			  eb->start);
 err:
 	if (reads_done &&
 	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -660,19 +701,6 @@
 	return ret;
 }
 
-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
-	struct extent_buffer *eb;
-
-	eb = (struct extent_buffer *)page->private;
-	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
-	eb->read_mirror = failed_mirror;
-	atomic_dec(&eb->io_pages);
-	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, -EIO);
-	return -EIO;	/* we fixed nothing */
-}
-
 static void end_workqueue_bio(struct bio *bio)
 {
 	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
@@ -751,11 +779,22 @@
 		async->status = ret;
 }
 
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time.   All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
 static void run_one_async_done(struct btrfs_work *work)
 {
 	struct async_submit_bio *async;
+	struct inode *inode;
+	blk_status_t ret;
 
 	async = container_of(work, struct  async_submit_bio, work);
+	inode = async->private_data;
 
 	/* If an error occurred we just want to clean up the bio and move on */
 	if (async->status) {
@@ -764,7 +803,12 @@
 		return;
 	}
 
-	btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
+	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
+			async->mirror_num, 1);
+	if (ret) {
+		async->bio->bi_status = ret;
+		bio_endio(async->bio);
+	}
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -809,10 +853,11 @@
 {
 	struct bio_vec *bvec;
 	struct btrfs_root *root;
-	int i, ret = 0;
+	int ret = 0;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
@@ -832,24 +877,22 @@
 	return btree_csum_one_bio(bio);
 }
 
-static int check_async_write(struct btrfs_inode *bi)
+static int check_async_write(struct btrfs_fs_info *fs_info,
+			     struct btrfs_inode *bi)
 {
 	if (atomic_read(&bi->sync_writers))
 		return 0;
-#ifdef CONFIG_X86
-	if (static_cpu_has(X86_FEATURE_XMM4_2))
+	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
 		return 0;
-#endif
 	return 1;
 }
 
-static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
-					  int mirror_num, unsigned long bio_flags,
-					  u64 bio_offset)
+static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
+					  int mirror_num,
+					  unsigned long bio_flags)
 {
-	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int async = check_async_write(BTRFS_I(inode));
+	int async = check_async_write(fs_info, BTRFS_I(inode));
 	blk_status_t ret;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
@@ -873,8 +916,7 @@
 		 * checksumming can happen in parallel across all CPUs
 		 */
 		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
-					  bio_offset, private_data,
-					  btree_submit_bio_start);
+					  0, inode, btree_submit_bio_start);
 	}
 
 	if (ret)
@@ -993,46 +1035,17 @@
 void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = fs_info->btree_inode;
-
-	buf = btrfs_find_create_tree_block(fs_info, bytenr);
-	if (IS_ERR(buf))
-		return;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, WAIT_NONE, 0);
-	free_extent_buffer(buf);
-}
-
-int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
-			 int mirror_num, struct extent_buffer **eb)
-{
-	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = fs_info->btree_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	int ret;
 
 	buf = btrfs_find_create_tree_block(fs_info, bytenr);
 	if (IS_ERR(buf))
-		return 0;
+		return;
 
-	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
-	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
-				       mirror_num);
-	if (ret) {
+	ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
+	if (ret < 0)
+		free_extent_buffer_stale(buf);
+	else
 		free_extent_buffer(buf);
-		return ret;
-	}
-
-	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
-		free_extent_buffer(buf);
-		return -EIO;
-	} else if (extent_buffer_uptodate(buf)) {
-		*eb = buf;
-	} else {
-		free_extent_buffer(buf);
-	}
-	return 0;
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(
@@ -1044,19 +1057,6 @@
 	return alloc_extent_buffer(fs_info, bytenr);
 }
 
-
-int btrfs_write_tree_block(struct extent_buffer *buf)
-{
-	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
-					buf->start + buf->len - 1);
-}
-
-void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-{
-	filemap_fdatawait_range(buf->pages[0]->mapping,
-			        buf->start, buf->start + buf->len - 1);
-}
-
 /*
  * Read tree block at logical address @bytenr and do variant basic but critical
  * verification.
@@ -1076,19 +1076,19 @@
 	if (IS_ERR(buf))
 		return buf;
 
-	ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+	ret = btree_read_extent_buffer_pages(buf, parent_transid,
 					     level, first_key);
 	if (ret) {
-		free_extent_buffer(buf);
+		free_extent_buffer_stale(buf);
 		return ERR_PTR(ret);
 	}
 	return buf;
 
 }
 
-void clean_tree_block(struct btrfs_fs_info *fs_info,
-		      struct extent_buffer *buf)
+void btrfs_clean_tree_block(struct extent_buffer *buf)
 {
+	struct btrfs_fs_info *fs_info = buf->fs_info;
 	if (btrfs_header_generation(buf) ==
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
@@ -1098,7 +1098,7 @@
 						 -buf->len,
 						 fs_info->dirty_metadata_batch);
 			/* ugh, clear_extent_buffer_dirty needs to lock the page */
-			btrfs_set_lock_blocking(buf);
+			btrfs_set_lock_blocking_write(buf);
 			clear_extent_buffer_dirty(buf);
 		}
 	}
@@ -1139,7 +1139,6 @@
 	root->state = 0;
 	root->orphan_cleanup_state = 0;
 
-	root->objectid = objectid;
 	root->last_trans = 0;
 	root->highest_objectid = 0;
 	root->nr_delalloc_inodes = 0;
@@ -1154,6 +1153,7 @@
 	INIT_LIST_HEAD(&root->delalloc_root);
 	INIT_LIST_HEAD(&root->ordered_extents);
 	INIT_LIST_HEAD(&root->ordered_root);
+	INIT_LIST_HEAD(&root->reloc_dirty_list);
 	INIT_LIST_HEAD(&root->logged_list[0]);
 	INIT_LIST_HEAD(&root->logged_list[1]);
 	spin_lock_init(&root->inode_lock);
@@ -1179,11 +1179,13 @@
 	refcount_set(&root->refs, 1);
 	atomic_set(&root->will_be_snapshotted, 0);
 	atomic_set(&root->snapshot_force_cow, 0);
+	atomic_set(&root->nr_swapfiles, 0);
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
 	if (!dummy)
-		extent_io_tree_init(&root->dirty_log_pages, NULL);
+		extent_io_tree_init(fs_info, &root->dirty_log_pages,
+				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
 
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1196,6 +1198,7 @@
 	root->anon_dev = 0;
 
 	spin_lock_init(&root->root_item_lock);
+	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1229,17 +1232,24 @@
 #endif
 
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
 				     u64 objectid)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct extent_buffer *leaf;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root;
 	struct btrfs_key key;
+	unsigned int nofs_flag;
 	int ret = 0;
 	uuid_le uuid = NULL_UUID_LE;
 
+	/*
+	 * We're holding a transaction handle, so use a NOFS memory allocation
+	 * context to avoid deadlock if reclaim happens.
+	 */
+	nofs_flag = memalloc_nofs_save();
 	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
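btrfs_create_tree() allocates while holding a transaction handle, so a GFP_KERNEL allocation could recurse into filesystem reclaim and deadlock; the memalloc_nofs_save()/restore() pair added above scopes the whole region as NOFS instead of threading GFP_NOFS through every callee. The bare pattern (the wrapper below is illustrative):

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_in_fs_context(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	/*
	 * Between save and restore, every allocation behaves as if
	 * GFP_NOFS had been passed, even deep inside library code
	 * that only knows about GFP_KERNEL.
	 */
	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	return p;
}
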
@@ -1660,6 +1670,8 @@
 	while (1) {
 		again = 0;
 
+		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+
 		/* Make the cleaner go to sleep early. */
 		if (btrfs_need_cleaner_sleep(fs_info))
 			goto sleep;
@@ -1683,9 +1695,7 @@
 			goto sleep;
 		}
 
-		mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
 		btrfs_run_delayed_iputs(fs_info);
-		mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
 
 		again = btrfs_clean_one_deleted_snapshot(root);
 		mutex_unlock(&fs_info->cleaner_mutex);
@@ -1706,6 +1716,7 @@
 		 */
 		btrfs_delete_unused_bgs(fs_info);
 sleep:
+		clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
 		if (kthread_should_park())
 			kthread_parkme();
 		if (kthread_should_stop())
@@ -1997,7 +2008,6 @@
 	btrfs_destroy_workqueue(fs_info->readahead_workers);
 	btrfs_destroy_workqueue(fs_info->flush_workers);
 	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
-	btrfs_destroy_workqueue(fs_info->extent_workers);
 	/*
 	 * Now that all other work queues are destroyed, we can safely destroy
 	 * the queues used for metadata I/O, since tasks from those other work
@@ -2076,7 +2086,7 @@
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
-	fs_info->scrub_workers_refcnt = 0;
+	refcount_set(&fs_info->scrub_workers_refcnt, 0);
 }
 
 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2104,8 +2114,9 @@
 	inode->i_mapping->a_ops = &btree_aops;
 
 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
-	BTRFS_I(inode)->io_tree.track_uptodate = 0;
+	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+			    IO_TREE_INODE_IO, inode);
+	BTRFS_I(inode)->io_tree.track_uptodate = false;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
 
 	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2119,11 +2130,8 @@
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 {
 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
-	rwlock_init(&fs_info->dev_replace.lock);
-	atomic_set(&fs_info->dev_replace.read_locks, 0);
-	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
-	init_waitqueue_head(&fs_info->replace_wait);
-	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
+	init_rwsem(&fs_info->dev_replace.rwsem);
+	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
 }
 
 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2131,7 +2139,6 @@
 	spin_lock_init(&fs_info->qgroup_lock);
 	mutex_init(&fs_info->qgroup_ioctl_lock);
 	fs_info->qgroup_tree = RB_ROOT;
-	fs_info->qgroup_op_tree = RB_ROOT;
 	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
 	fs_info->qgroup_seq = 1;
 	fs_info->qgroup_ulist = NULL;
@@ -2206,10 +2213,6 @@
 				      max_active, 2);
 	fs_info->qgroup_rescan_workers =
 		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
-	fs_info->extent_workers =
-		btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
-				      min_t(u64, fs_devices->num_devices,
-					    max_active), 8);
 
 	if (!(fs_info->workers && fs_info->delalloc_workers &&
 	      fs_info->submit_workers && fs_info->flush_workers &&
@@ -2220,7 +2223,6 @@
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
 	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->extent_workers &&
 	      fs_info->qgroup_rescan_workers)) {
 		return -ENOMEM;
 	}
@@ -2228,6 +2230,29 @@
 	return 0;
 }
 
+static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
+{
+	struct crypto_shash *csum_shash;
+	const char *csum_name = btrfs_super_csum_name(csum_type);
+
+	csum_shash = crypto_alloc_shash(csum_name, 0, 0);
+
+	if (IS_ERR(csum_shash)) {
+		btrfs_err(fs_info, "error allocating %s hash for checksum",
+			  csum_name);
+		return PTR_ERR(csum_shash);
+	}
+
+	fs_info->csum_shash = csum_shash;
+
+	return 0;
+}
+
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+	crypto_free_shash(fs_info->csum_shash);
+}
+
 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 			    struct btrfs_fs_devices *fs_devices)
 {
@@ -2444,10 +2469,11 @@
 		ret = -EINVAL;
 	}
 
-	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+		   BTRFS_FSID_SIZE) != 0) {
 		btrfs_err(fs_info,
-			   "dev_item UUID does not match fsid: %pU != %pU",
-			   fs_info->fsid, sb->dev_item.fsid);
+			"dev_item UUID does not match metadata fsid: %pU != %pU",
+			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
 		ret = -EINVAL;
 	}
 
@@ -2542,7 +2568,7 @@
 	ret = validate_super(fs_info, sb, -1);
 	if (ret < 0)
 		goto out;
-	if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
 		ret = -EUCLEAN;
 		btrfs_err(fs_info, "invalid csum type, has %u want %u",
 			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
@@ -2572,6 +2598,7 @@
 	u32 stripesize;
 	u64 generation;
 	u64 features;
+	u16 csum_type;
 	struct btrfs_key location;
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
@@ -2598,11 +2625,17 @@
 		goto fail;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_srcu;
 	}
+
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	if (ret) {
+		err = ret;
+		goto fail_dio_bytes;
+	}
 	fs_info->dirty_metadata_batch = PAGE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
@@ -2612,7 +2645,8 @@
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+			GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
@@ -2625,8 +2659,6 @@
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
 	INIT_LIST_HEAD(&fs_info->delalloc_roots);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
-	INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
-	spin_lock_init(&fs_info->pending_raid_kobjs_lock);
 	spin_lock_init(&fs_info->delalloc_root_lock);
 	spin_lock_init(&fs_info->trans_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2634,7 +2666,6 @@
 	spin_lock_init(&fs_info->defrag_inodes_lock);
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
 	spin_lock_init(&fs_info->super_lock);
-	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
@@ -2642,14 +2673,13 @@
 	mutex_init(&fs_info->delete_unused_bgs_mutex);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
-	mutex_init(&fs_info->cleaner_delayed_iput_mutex);
 	seqlock_init(&fs_info->profiles_lock);
 
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_LIST_HEAD(&fs_info->unused_bgs);
-	btrfs_mapping_init(&fs_info->mapping_tree);
+	extent_map_tree_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2657,10 +2687,13 @@
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
 	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
 			     BTRFS_BLOCK_RSV_DELOPS);
+	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+			     BTRFS_BLOCK_RSV_DELREFS);
+
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->defrag_running, 0);
-	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic_set(&fs_info->reada_works_cnt, 0);
+	atomic_set(&fs_info->nr_delayed_iputs, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
 	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2712,8 +2745,10 @@
 	fs_info->block_group_cache_tree = RB_ROOT;
 	fs_info->first_logical_byte = (u64)-1;
 
-	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
-	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+	extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
+			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
+	extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
+			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
 
@@ -2738,14 +2773,18 @@
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
-
-	INIT_LIST_HEAD(&fs_info->pinned_chunks);
+	init_waitqueue_head(&fs_info->delayed_iputs_wait);
 
 	/* Usable values until the real ones are cached from the superblock */
 	fs_info->nodesize = 4096;
 	fs_info->sectorsize = 4096;
 	fs_info->stripesize = 4096;
 
+	spin_lock_init(&fs_info->swapfile_pins_lock);
+	fs_info->swapfile_pins = RB_ROOT;
+
+	fs_info->send_in_progress = 0;
+
 	ret = btrfs_alloc_stripe_hash_table(fs_info);
 	if (ret) {
 		err = ret;
@@ -2766,6 +2805,25 @@
 	}
 
 	/*
+	 * Verify the checksum type first; if that or the checksum value
+	 * is corrupted, we'll find out
+	 */
+	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+	if (!btrfs_supported_super_csum(csum_type)) {
+		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
+			  csum_type);
+		err = -EINVAL;
+		brelse(bh);
+		goto fail_alloc;
+	}
+
+	ret = btrfs_init_csum_hash(fs_info, csum_type);
+	if (ret) {
+		err = ret;
+		goto fail_alloc;
+	}
+
+	/*
 	 * We want to check superblock checksum, the type is stored inside.
 	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
 	 */
@@ -2773,7 +2831,7 @@
 		btrfs_err(fs_info, "superblock checksum mismatch");
 		err = -EINVAL;
 		brelse(bh);
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
 	/*
@@ -2782,22 +2840,39 @@
 	 * the whole block of INFO_SIZE
 	 */
 	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
-	memcpy(fs_info->super_for_commit, fs_info->super_copy,
-	       sizeof(*fs_info->super_for_commit));
 	brelse(bh);
 
-	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+	disk_super = fs_info->super_copy;
+
+	ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+		       BTRFS_FSID_SIZE));
+
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
+		ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
+				fs_info->super_copy->metadata_uuid,
+				BTRFS_FSID_SIZE));
+	}
+
+	features = btrfs_super_flags(disk_super);
+	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
+		btrfs_set_super_flags(disk_super, features);
+		btrfs_info(fs_info,
+			"found metadata UUID change in progress flag, clearing");
+	}
+
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
 
 	ret = btrfs_validate_mount_super(fs_info);
 	if (ret) {
 		btrfs_err(fs_info, "superblock contains fatal errors");
 		err = -EINVAL;
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
-	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
-		goto fail_alloc;
+		goto fail_csum;
 
 	/* check FS state, whether FS is broken. */
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -2819,7 +2894,7 @@
 	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
 	if (ret) {
 		err = ret;
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
 	features = btrfs_super_incompat_flags(disk_super) &
@@ -2829,7 +2904,7 @@
 		    "cannot mount because of unsupported optional features (%llx)",
 		    features);
 		err = -EINVAL;
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
@@ -2873,7 +2948,7 @@
 		btrfs_err(fs_info,
 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
 			nodesize, sectorsize);
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
 	/*
@@ -2889,7 +2964,7 @@
 	"cannot mount read-write because of unsupported optional features (%llx)",
 		       features);
 		err = -EINVAL;
-		goto fail_alloc;
+		goto fail_csum;
 	}
 
 	ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -2901,13 +2976,13 @@
 	sb->s_bdi->congested_fn = btrfs_congested_fn;
 	sb->s_bdi->congested_data = fs_info;
 	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
-	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
 	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
 
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
-	memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
+	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(fs_info);
@@ -3056,7 +3131,7 @@
 
 	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
-		"writeable mount is not allowed due to too many missing devices");
+		"writable mount is not allowed due to too many missing devices");
 		goto fail_sysfs;
 	}
 
@@ -3267,17 +3342,21 @@
 fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
 	btrfs_free_block_groups(fs_info);
+fail_csum:
+	btrfs_free_csum_hash(fs_info);
 fail_alloc:
 fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	iput(fs_info->btree_inode);
 fail_bio_counter:
-	percpu_counter_destroy(&fs_info->bio_counter);
+	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+fail_dio_bytes:
+	percpu_counter_destroy(&fs_info->dio_bytes);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
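
The new fail_csum and fail_dio_bytes labels slot into open_ctree()'s existing unwind ladder: each label releases exactly what was initialized before the failing step, then falls through to the older labels. The general shape, reduced to a standalone example with percpu counters (the function is illustrative, not open_ctree itself):

#include <linux/percpu_counter.h>
#include <linux/gfp.h>

static int setup_three(struct percpu_counter *a, struct percpu_counter *b,
		       struct percpu_counter *c)
{
	int ret;

	ret = percpu_counter_init(a, 0, GFP_KERNEL);
	if (ret)
		goto fail;
	ret = percpu_counter_init(b, 0, GFP_KERNEL);
	if (ret)
		goto fail_a;
	ret = percpu_counter_init(c, 0, GFP_KERNEL);
	if (ret)
		goto fail_b;
	return 0;

fail_b:			/* unwind in reverse order of setup */
	percpu_counter_destroy(b);
fail_a:
	percpu_counter_destroy(a);
fail:
	return ret;
}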
@@ -3406,17 +3485,20 @@
 static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
+	struct btrfs_fs_info *fs_info = device->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	struct buffer_head *bh;
 	int i;
 	int ret;
 	int errors = 0;
-	u32 crc;
 	u64 bytenr;
 	int op_flags;
 
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
+	shash->tfm = fs_info->csum_shash;
+
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3425,10 +3507,10 @@
 
 		btrfs_set_super_bytenr(sb, bytenr);
 
-		crc = ~(u32)0;
-		crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
-				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, sb->csum);
+		crypto_shash_init(shash);
+		crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		crypto_shash_final(shash, sb->csum);
 
 		/* One reference for us, and we leave it for the caller */
 		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
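
The hunk above moves write_dev_supers() off the crc32c-specific btrfs_csum_data()/btrfs_csum_final() pair and onto the generic crypto shash API, driven by the fs_info->csum_shash transform, so the same path can serve other checksum algorithms later. A kernel-style sketch of the pattern, assuming a transform already allocated with crypto_alloc_shash(); csum_one_block is a hypothetical name and this builds only inside a kernel tree:

#include <crypto/hash.h>

/* Checksum len bytes of data into out using a preallocated transform. */
static int csum_one_block(struct crypto_shash *tfm,
			  const u8 *data, unsigned int len, u8 *out)
{
	/* descriptor on the stack, sized for this transform */
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	/* digest = init + update + final in one call */
	return crypto_shash_digest(shash, data, len, out);
}

write_dev_supers() keeps the separate init/update/final calls because it hashes the superblock in place behind the csum field; the one-shot digest above is the common shorthand.
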
@@ -3643,7 +3725,7 @@
 
 	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
 	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
-		min_tolerated = min(min_tolerated,
+		min_tolerated = min_t(int, min_tolerated,
 				    btrfs_raid_array[BTRFS_RAID_SINGLE].
 				    tolerated_failures);
 
@@ -3652,7 +3734,7 @@
 			continue;
 		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
 			continue;
-		min_tolerated = min(min_tolerated,
+		min_tolerated = min_t(int, min_tolerated,
 				    btrfs_raid_array[raid_type].
 				    tolerated_failures);
 	}
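
min() becomes min_t(int, ...) here because the kernel's min() refuses, at compile time, to compare operands of different types, while min_t() casts both sides to the named type first. A userspace sketch of the distinction, with a simplified macro that skips the kernel's type-checking machinery:

#include <stdio.h>

/* simplified min_t(); the real one lives in linux/kernel.h */
#define min_t(type, a, b) ({ type __a = (a); type __b = (b); \
			     __a < __b ? __a : __b; })

int main(void)
{
	int min_tolerated = 2;
	unsigned char tolerated_failures = 1;	/* e.g. a narrow table field */

	min_tolerated = min_t(int, min_tolerated, tolerated_failures);
	printf("%d\n", min_tolerated);		/* prints 1 */
	return 0;
}
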
@@ -3725,7 +3807,8 @@
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
-		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+		       BTRFS_FSID_SIZE);
 
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
@@ -3947,6 +4030,7 @@
 	kthread_stop(fs_info->transaction_kthread);
 	kthread_stop(fs_info->cleaner_kthread);
 
+	ASSERT(list_empty(&fs_info->delayed_iputs));
 	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
 	btrfs_free_qgroup_config(fs_info);
@@ -3957,6 +4041,10 @@
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
+	if (percpu_counter_sum(&fs_info->dio_bytes))
+		btrfs_info(fs_info, "at unmount dio bytes count %lld",
+			   percpu_counter_sum(&fs_info->dio_bytes));
+
 	btrfs_sysfs_remove_mounted(fs_info);
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
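
The unmount path now warns about a leaked dio_bytes count the same way it already did for delalloc_bytes. A kernel-style sketch of the percpu_counter lifecycle behind both checks (the demo_* names are illustrative): cheap per-CPU adds on the I/O path, one exact cross-CPU sum at teardown to catch accounting leaks, destroy last.

#include <linux/percpu_counter.h>

static struct percpu_counter dio_bytes;

static int demo_mount(void)
{
	/* allocates the per-CPU counters; can fail */
	return percpu_counter_init(&dio_bytes, 0, GFP_KERNEL);
}

static void demo_account_io(s64 bytes)
{
	percpu_counter_add(&dio_bytes, bytes);	/* lock-free fast path */
}

static void demo_unmount(void)
{
	s64 leaked = percpu_counter_sum(&dio_bytes);	/* exact but slow */

	if (leaked)
		pr_info("dio bytes leaked at unmount: %lld\n", leaked);
	percpu_counter_destroy(&dio_bytes);
}
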
 
@@ -3983,25 +4071,18 @@
 		btrfsic_unmount(fs_info->fs_devices);
 #endif
 
-	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_close_devices(fs_info->fs_devices);
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
-	percpu_counter_destroy(&fs_info->bio_counter);
+	percpu_counter_destroy(&fs_info->dio_bytes);
+	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
+	btrfs_free_csum_hash(fs_info);
 	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_free_ref_cache(fs_info);
-
-	while (!list_empty(&fs_info->pinned_chunks)) {
-		struct extent_map *em;
-
-		em = list_first_entry(&fs_info->pinned_chunks,
-				      struct extent_map, list);
-		list_del_init(&em->list);
-		free_extent_map(em);
-	}
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4031,7 +4112,7 @@
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	/*
 	 * This is a fast path so only do this check if we have sanity tests
-	 * enabled.  Normal people shouldn't be using umapped buffers as dirty
+	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
 	 * outside of the sanity tests.
 	 */
 	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
@@ -4055,7 +4136,7 @@
 	 * So here we should only check item pointers, not item data.
 	 */
 	if (btrfs_header_level(buf) == 0 &&
-	    btrfs_check_leaf_relaxed(fs_info, buf)) {
+	    btrfs_check_leaf_relaxed(buf)) {
 		btrfs_print_leaf(buf);
 		ASSERT(0);
 	}
@@ -4098,10 +4179,7 @@
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
 		      struct btrfs_key *first_key)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
-	return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+	return btree_read_extent_buffer_pages(buf, parent_transid,
 					      level, first_key);
 }
 
@@ -4155,6 +4233,14 @@
 		spin_lock(&fs_info->ordered_root_lock);
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
+
+	/*
+	 * We need this here because if we've been flipped read-only we won't
+	 * get sync() from the umount, so we need to make sure any ordered
+	 * extents whose dirty pages haven't started writeout yet actually
+	 * get run and error out properly.
+	 */
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4174,29 +4260,22 @@
 		return ret;
 	}
 
-	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
 		struct btrfs_delayed_ref_head *head;
 		struct rb_node *n;
 		bool pin_bytes = false;
 
 		head = rb_entry(node, struct btrfs_delayed_ref_head,
 				href_node);
-		if (!mutex_trylock(&head->mutex)) {
-			refcount_inc(&head->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			mutex_lock(&head->mutex);
-			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref_head(head);
-			spin_lock(&delayed_refs->lock);
+		if (btrfs_delayed_ref_lock(delayed_refs, head))
 			continue;
-		}
+
 		spin_lock(&head->lock);
-		while ((n = rb_first(&head->ref_tree)) != NULL) {
+		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
 			ref = rb_entry(n, struct btrfs_delayed_ref_node,
 				       ref_node);
 			ref->in_tree = 0;
-			rb_erase(&ref->ref_node, &head->ref_tree);
+			rb_erase_cached(&ref->ref_node, &head->ref_tree);
 			RB_CLEAR_NODE(&ref->ref_node);
 			if (!list_empty(&ref->add_list))
 				list_del(&ref->add_list);
@@ -4206,12 +4285,7 @@
 		if (head->must_insert_reserved)
 			pin_bytes = true;
 		btrfs_free_delayed_extent_op(head->extent_op);
-		delayed_refs->num_heads--;
-		if (head->processing == 0)
-			delayed_refs->num_heads_ready--;
-		atomic_dec(&delayed_refs->num_entries);
-		rb_erase(&head->href_node, &delayed_refs->href_root);
-		RB_CLEAR_NODE(&head->href_node);
+		btrfs_delete_ref_head(delayed_refs, head);
 		spin_unlock(&head->lock);
 		spin_unlock(&delayed_refs->lock);
 		mutex_unlock(&head->mutex);
@@ -4219,6 +4293,7 @@
 		if (pin_bytes)
 			btrfs_pin_extent(fs_info, head->bytenr,
 					 head->num_bytes, 1);
+		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
 		btrfs_put_delayed_ref_head(head);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
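
This hunk is part of the conversion of the delayed-ref trees from rb_root to rb_root_cached, which caches a pointer to the leftmost node so rb_first_cached() is O(1) instead of a logarithmic descent; that pays off for trees drained smallest-first, as here. A kernel-style sketch of the insert/pop-minimum pattern (demo types, locking left out):

#include <linux/rbtree.h>

struct demo_item {
	struct rb_node node;
	u64 key;
};

static void demo_insert(struct rb_root_cached *root, struct demo_item *it)
{
	struct rb_node **p = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*p) {
		struct demo_item *cur = rb_entry(*p, struct demo_item, node);

		parent = *p;
		if (it->key < cur->key) {
			p = &(*p)->rb_left;
		} else {
			p = &(*p)->rb_right;
			leftmost = false;	/* not the new minimum */
		}
	}
	rb_link_node(&it->node, parent, p);
	/* keeps the cached leftmost pointer up to date */
	rb_insert_color_cached(&it->node, root, leftmost);
}

static struct demo_item *demo_pop_first(struct rb_root_cached *root)
{
	struct rb_node *n = rb_first_cached(root);	/* O(1) */

	if (!n)
		return NULL;
	rb_erase_cached(n, root);
	return rb_entry(n, struct demo_item, node);
}
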
@@ -4329,6 +4404,8 @@
 	unpin = pinned_extents;
 again:
 	while (1) {
+		struct extent_state *cached_state = NULL;
+
 		/*
 		 * The btrfs_finish_extent_commit() may get the same range as
 		 * ours between find_first_extent_bit and clear_extent_dirty.
@@ -4337,13 +4414,14 @@
 		 */
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY, NULL);
+					    EXTENT_DIRTY, &cached_state);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			break;
 		}
 
-		clear_extent_dirty(unpin, start, end);
+		clear_extent_dirty(unpin, start, end, &cached_state);
+		free_extent_state(cached_state);
 		btrfs_error_unpin_extent_range(fs_info, start, end);
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 		cond_resched();
@@ -4400,6 +4478,7 @@
 
 		spin_unlock(&cur_trans->dirty_bgs_lock);
 		btrfs_put_block_group(cache);
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
 		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
 	spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4424,10 +4503,17 @@
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_device *dev, *tmp;
+
 	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
 	ASSERT(list_empty(&cur_trans->dirty_bgs));
 	ASSERT(list_empty(&cur_trans->io_bgs));
 
+	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
+				 post_commit_list) {
+		list_del_init(&dev->post_commit_list);
+	}
+
 	btrfs_destroy_delayed_refs(cur_trans, fs_info);
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
@@ -4505,7 +4591,4 @@
 	/* mandatory callbacks */
 	.submit_bio_hook = btree_submit_bio_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
-	.readpage_io_failed_hook = btree_io_failed_hook,
-
-	/* optional callbacks */
 };
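
With readpage_io_failed_hook gone, btree_extent_io_ops is down to mandatory callbacks only, so its callers no longer need a NULL guard before that hook. A tiny sketch of the convention (hypothetical ops struct, not the real extent_io_ops):

struct demo_ops {
	int (*submit)(void *data);	/* mandatory, never NULL */
	void (*on_failed)(void *data);	/* optional, may be NULL */
};

static int demo_run(const struct demo_ops *ops, void *data)
{
	int ret = ops->submit(data);	/* mandatory: called unguarded */

	if (ret && ops->on_failed)	/* optional: checked before use */
		ops->on_failed(data);
	return ret;
}
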
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4cccba2..a695810 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,11 +21,11 @@
 #define BTRFS_BDEV_BLOCKSIZE	(4096)
 
 enum btrfs_wq_endio_type {
-	BTRFS_WQ_ENDIO_DATA = 0,
-	BTRFS_WQ_ENDIO_METADATA = 1,
-	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
-	BTRFS_WQ_ENDIO_RAID56 = 3,
-	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
+	BTRFS_WQ_ENDIO_DATA,
+	BTRFS_WQ_ENDIO_METADATA,
+	BTRFS_WQ_ENDIO_FREE_SPACE,
+	BTRFS_WQ_ENDIO_RAID56,
+	BTRFS_WQ_ENDIO_DIO_REPAIR,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
@@ -39,16 +39,16 @@
 struct btrfs_device;
 struct btrfs_fs_devices;
 
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+			   struct btrfs_key *first_key, u64 parent_transid);
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
 				      u64 parent_transid, int level,
 				      struct btrfs_key *first_key);
 void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
-int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
-			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(
 						struct btrfs_fs_info *fs_info,
 						u64 bytenr);
-void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
+void btrfs_clean_tree_block(struct extent_buffer *buf);
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
@@ -113,8 +113,6 @@
 			  int atomic);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
 		      struct btrfs_key *first_key);
-u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
-void btrfs_csum_final(u32 crc, u8 *result);
 blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			enum btrfs_wq_endio_type metadata);
 blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -123,8 +121,6 @@
 			extent_submit_bio_start_t *submit_bio_start);
 blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
 			  int mirror_num);
-int btrfs_write_tree_block(struct extent_buffer *buf);
-void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
@@ -134,7 +130,6 @@
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
 				  struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
 				     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1f3755b..ddf28ec 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -33,7 +33,7 @@
 	type = FILEID_BTRFS_WITHOUT_PARENT;
 
 	fid->objectid = btrfs_ino(BTRFS_I(inode));
-	fid->root_objectid = BTRFS_I(inode)->root->objectid;
+	fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
 	fid->gen = inode->i_generation;
 
 	if (parent) {
@@ -41,7 +41,7 @@
 
 		fid->parent_objectid = BTRFS_I(parent)->location.objectid;
 		fid->parent_gen = parent->i_generation;
-		parent_root_id = BTRFS_I(parent)->root->objectid;
+		parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
 
 		if (parent_root_id != fid->root_objectid) {
 			fid->parent_root_objectid = parent_root_id;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51e41e5..49cb26f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -16,6 +16,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
 #include <linux/crc32c.h>
+#include "misc.h"
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -24,32 +25,16 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
-#include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "space-info.h"
+#include "block-rsv.h"
+#include "delalloc-space.h"
+#include "block-group.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
-/*
- * control flags for do_chunk_alloc's force field
- * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
- * if we really need one.
- *
- * CHUNK_ALLOC_LIMITED means to only try and allocate one
- * if we have very few chunks already allocated.  This is
- * used as part of the clustering code to help make sure
- * we have a good pool of storage to cluster in, without
- * filling the FS with empty chunks
- *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
- */
-enum {
-	CHUNK_ALLOC_NO_FORCE = 0,
-	CHUNK_ALLOC_LIMITED = 1,
-	CHUNK_ALLOC_FORCE = 2,
-};
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_delayed_ref_node *node, u64 parent,
@@ -66,148 +51,16 @@
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_ref_node *node,
 				     struct btrfs_delayed_extent_op *extent_op);
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
-			  int force);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
-static void dump_space_info(struct btrfs_fs_info *fs_info,
-			    struct btrfs_space_info *info, u64 bytes,
-			    int dump_block_groups);
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
-			       u64 num_bytes);
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
-				     struct btrfs_space_info *space_info,
-				     u64 num_bytes);
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
-				     struct btrfs_space_info *space_info,
-				     u64 num_bytes);
-
-static noinline int
-block_group_cache_done(struct btrfs_block_group_cache *cache)
-{
-	smp_mb();
-	return cache->cached == BTRFS_CACHE_FINISHED ||
-		cache->cached == BTRFS_CACHE_ERROR;
-}
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
 }
 
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
-{
-	atomic_inc(&cache->count);
-}
-
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
-	if (atomic_dec_and_test(&cache->count)) {
-		WARN_ON(cache->pinned > 0);
-		WARN_ON(cache->reserved > 0);
-
-		/*
-		 * If not empty, someone is still holding mutex of
-		 * full_stripe_lock, which can only be released by caller.
-		 * And it will definitely cause use-after-free when caller
-		 * tries to release full stripe lock.
-		 *
-		 * No better way to resolve, but only to warn.
-		 */
-		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
-		kfree(cache->free_space_ctl);
-		kfree(cache);
-	}
-}
-
-/*
- * this adds the block group to the fs_info rb tree for the block group
- * cache
- */
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
-				struct btrfs_block_group_cache *block_group)
-{
-	struct rb_node **p;
-	struct rb_node *parent = NULL;
-	struct btrfs_block_group_cache *cache;
-
-	spin_lock(&info->block_group_cache_lock);
-	p = &info->block_group_cache_tree.rb_node;
-
-	while (*p) {
-		parent = *p;
-		cache = rb_entry(parent, struct btrfs_block_group_cache,
-				 cache_node);
-		if (block_group->key.objectid < cache->key.objectid) {
-			p = &(*p)->rb_left;
-		} else if (block_group->key.objectid > cache->key.objectid) {
-			p = &(*p)->rb_right;
-		} else {
-			spin_unlock(&info->block_group_cache_lock);
-			return -EEXIST;
-		}
-	}
-
-	rb_link_node(&block_group->cache_node, parent, p);
-	rb_insert_color(&block_group->cache_node,
-			&info->block_group_cache_tree);
-
-	if (info->first_logical_byte > block_group->key.objectid)
-		info->first_logical_byte = block_group->key.objectid;
-
-	spin_unlock(&info->block_group_cache_lock);
-
-	return 0;
-}
-
-/*
- * This will return the block group at or after bytenr if contains is 0, else
- * it will return the block group that contains the bytenr
- */
-static struct btrfs_block_group_cache *
-block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
-			      int contains)
-{
-	struct btrfs_block_group_cache *cache, *ret = NULL;
-	struct rb_node *n;
-	u64 end, start;
-
-	spin_lock(&info->block_group_cache_lock);
-	n = info->block_group_cache_tree.rb_node;
-
-	while (n) {
-		cache = rb_entry(n, struct btrfs_block_group_cache,
-				 cache_node);
-		end = cache->key.objectid + cache->key.offset - 1;
-		start = cache->key.objectid;
-
-		if (bytenr < start) {
-			if (!contains && (!ret || start < ret->key.objectid))
-				ret = cache;
-			n = n->rb_left;
-		} else if (bytenr > start) {
-			if (contains && bytenr <= end) {
-				ret = cache;
-				break;
-			}
-			n = n->rb_right;
-		} else {
-			ret = cache;
-			break;
-		}
-	}
-	if (ret) {
-		btrfs_get_block_group(ret);
-		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
-			info->first_logical_byte = ret->key.objectid;
-	}
-	spin_unlock(&info->block_group_cache_lock);
-
-	return ret;
-}
-
-static int add_excluded_extent(struct btrfs_fs_info *fs_info,
-			       u64 start, u64 num_bytes)
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+			      u64 start, u64 num_bytes)
 {
 	u64 end = start + num_bytes - 1;
 	set_extent_bits(&fs_info->freed_extents[0],
@@ -217,7 +70,7 @@
 	return 0;
 }
 
-static void free_excluded_extents(struct btrfs_block_group_cache *cache)
+void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	u64 start, end;
@@ -231,547 +84,39 @@
 			  start, end, EXTENT_UPTODATE);
 }
 
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
+static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
 {
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	u64 bytenr;
-	u64 *logical;
-	int stripe_len;
-	int i, nr, ret;
-
-	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
-		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
-		cache->bytes_super += stripe_len;
-		ret = add_excluded_extent(fs_info, cache->key.objectid,
-					  stripe_len);
-		if (ret)
-			return ret;
-	}
-
-	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-		bytenr = btrfs_sb_offset(i);
-		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
-				       bytenr, &logical, &nr, &stripe_len);
-		if (ret)
-			return ret;
-
-		while (nr--) {
-			u64 start, len;
-
-			if (logical[nr] > cache->key.objectid +
-			    cache->key.offset)
-				continue;
-
-			if (logical[nr] + stripe_len <= cache->key.objectid)
-				continue;
-
-			start = logical[nr];
-			if (start < cache->key.objectid) {
-				start = cache->key.objectid;
-				len = (logical[nr] + stripe_len) - start;
-			} else {
-				len = min_t(u64, stripe_len,
-					    cache->key.objectid +
-					    cache->key.offset - start);
-			}
-
-			cache->bytes_super += len;
-			ret = add_excluded_extent(fs_info, start, len);
-			if (ret) {
-				kfree(logical);
-				return ret;
-			}
-		}
-
-		kfree(logical);
-	}
-	return 0;
-}
-
-static struct btrfs_caching_control *
-get_caching_control(struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_caching_control *ctl;
-
-	spin_lock(&cache->lock);
-	if (!cache->caching_ctl) {
-		spin_unlock(&cache->lock);
-		return NULL;
-	}
-
-	ctl = cache->caching_ctl;
-	refcount_inc(&ctl->count);
-	spin_unlock(&cache->lock);
-	return ctl;
-}
-
-static void put_caching_control(struct btrfs_caching_control *ctl)
-{
-	if (refcount_dec_and_test(&ctl->count))
-		kfree(ctl);
-}
-
-#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
-{
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 start = block_group->key.objectid;
-	u64 len = block_group->key.offset;
-	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
-		fs_info->nodesize : fs_info->sectorsize;
-	u64 step = chunk << 1;
-
-	while (len > chunk) {
-		btrfs_remove_free_space(block_group, start, chunk);
-		start += step;
-		if (len < step)
-			len = 0;
+	if (ref->type == BTRFS_REF_METADATA) {
+		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
+			return BTRFS_BLOCK_GROUP_SYSTEM;
 		else
-			len -= step;
+			return BTRFS_BLOCK_GROUP_METADATA;
 	}
-}
-#endif
-
-/*
- * this is only called by cache_block_group, since we could have freed extents
- * we need to check the pinned_extents for any extents that can't be used yet
- * since their free space will be released as soon as the transaction commits.
- */
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-		       u64 start, u64 end)
-{
-	struct btrfs_fs_info *info = block_group->fs_info;
-	u64 extent_start, extent_end, size, total_added = 0;
-	int ret;
-
-	while (start < end) {
-		ret = find_first_extent_bit(info->pinned_extents, start,
-					    &extent_start, &extent_end,
-					    EXTENT_DIRTY | EXTENT_UPTODATE,
-					    NULL);
-		if (ret)
-			break;
-
-		if (extent_start <= start) {
-			start = extent_end + 1;
-		} else if (extent_start > start && extent_start < end) {
-			size = extent_start - start;
-			total_added += size;
-			ret = btrfs_add_free_space(block_group, start,
-						   size);
-			BUG_ON(ret); /* -ENOMEM or logic error */
-			start = extent_end + 1;
-		} else {
-			break;
-		}
-	}
-
-	if (start < end) {
-		size = end - start;
-		total_added += size;
-		ret = btrfs_add_free_space(block_group, start, size);
-		BUG_ON(ret); /* -ENOMEM or logic error */
-	}
-
-	return total_added;
+	return BTRFS_BLOCK_GROUP_DATA;
 }
 
-static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
-{
-	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	struct btrfs_key key;
-	u64 total_found = 0;
-	u64 last = 0;
-	u32 nritems;
-	int ret;
-	bool wakeup = true;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
-
-#ifdef CONFIG_BTRFS_DEBUG
-	/*
-	 * If we're fragmenting we don't want to make anybody think we can
-	 * allocate from this block group until we've had a chance to fragment
-	 * the free space.
-	 */
-	if (btrfs_should_fragment_free_space(block_group))
-		wakeup = false;
-#endif
-	/*
-	 * We don't want to deadlock with somebody trying to allocate a new
-	 * extent for the extent root while also trying to search the extent
-	 * root to add free space.  So we skip locking and search the commit
-	 * root, since its read-only
-	 */
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
-	path->reada = READA_FORWARD;
-
-	key.objectid = last;
-	key.offset = 0;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-
-next:
-	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	leaf = path->nodes[0];
-	nritems = btrfs_header_nritems(leaf);
-
-	while (1) {
-		if (btrfs_fs_closing(fs_info) > 1) {
-			last = (u64)-1;
-			break;
-		}
-
-		if (path->slots[0] < nritems) {
-			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		} else {
-			ret = find_next_key(path, 0, &key);
-			if (ret)
-				break;
-
-			if (need_resched() ||
-			    rwsem_is_contended(&fs_info->commit_root_sem)) {
-				if (wakeup)
-					caching_ctl->progress = last;
-				btrfs_release_path(path);
-				up_read(&fs_info->commit_root_sem);
-				mutex_unlock(&caching_ctl->mutex);
-				cond_resched();
-				mutex_lock(&caching_ctl->mutex);
-				down_read(&fs_info->commit_root_sem);
-				goto next;
-			}
-
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret < 0)
-				goto out;
-			if (ret)
-				break;
-			leaf = path->nodes[0];
-			nritems = btrfs_header_nritems(leaf);
-			continue;
-		}
-
-		if (key.objectid < last) {
-			key.objectid = last;
-			key.offset = 0;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-
-			if (wakeup)
-				caching_ctl->progress = last;
-			btrfs_release_path(path);
-			goto next;
-		}
-
-		if (key.objectid < block_group->key.objectid) {
-			path->slots[0]++;
-			continue;
-		}
-
-		if (key.objectid >= block_group->key.objectid +
-		    block_group->key.offset)
-			break;
-
-		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
-		    key.type == BTRFS_METADATA_ITEM_KEY) {
-			total_found += add_new_free_space(block_group, last,
-							  key.objectid);
-			if (key.type == BTRFS_METADATA_ITEM_KEY)
-				last = key.objectid +
-					fs_info->nodesize;
-			else
-				last = key.objectid + key.offset;
-
-			if (total_found > CACHING_CTL_WAKE_UP) {
-				total_found = 0;
-				if (wakeup)
-					wake_up(&caching_ctl->wait);
-			}
-		}
-		path->slots[0]++;
-	}
-	ret = 0;
-
-	total_found += add_new_free_space(block_group, last,
-					  block_group->key.objectid +
-					  block_group->key.offset);
-	caching_ctl->progress = (u64)-1;
-
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-static noinline void caching_thread(struct btrfs_work *work)
-{
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_fs_info *fs_info;
-	struct btrfs_caching_control *caching_ctl;
-	int ret;
-
-	caching_ctl = container_of(work, struct btrfs_caching_control, work);
-	block_group = caching_ctl->block_group;
-	fs_info = block_group->fs_info;
-
-	mutex_lock(&caching_ctl->mutex);
-	down_read(&fs_info->commit_root_sem);
-
-	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
-		ret = load_free_space_tree(caching_ctl);
-	else
-		ret = load_extent_tree_free(caching_ctl);
-
-	spin_lock(&block_group->lock);
-	block_group->caching_ctl = NULL;
-	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
-	spin_unlock(&block_group->lock);
-
-#ifdef CONFIG_BTRFS_DEBUG
-	if (btrfs_should_fragment_free_space(block_group)) {
-		u64 bytes_used;
-
-		spin_lock(&block_group->space_info->lock);
-		spin_lock(&block_group->lock);
-		bytes_used = block_group->key.offset -
-			btrfs_block_group_used(&block_group->item);
-		block_group->space_info->bytes_used += bytes_used >> 1;
-		spin_unlock(&block_group->lock);
-		spin_unlock(&block_group->space_info->lock);
-		fragment_free_space(block_group);
-	}
-#endif
-
-	caching_ctl->progress = (u64)-1;
-
-	up_read(&fs_info->commit_root_sem);
-	free_excluded_extents(block_group);
-	mutex_unlock(&caching_ctl->mutex);
-
-	wake_up(&caching_ctl->wait);
-
-	put_caching_control(caching_ctl);
-	btrfs_put_block_group(block_group);
-}
-
-static int cache_block_group(struct btrfs_block_group_cache *cache,
-			     int load_cache_only)
-{
-	DEFINE_WAIT(wait);
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct btrfs_caching_control *caching_ctl;
-	int ret = 0;
-
-	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-	if (!caching_ctl)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&caching_ctl->list);
-	mutex_init(&caching_ctl->mutex);
-	init_waitqueue_head(&caching_ctl->wait);
-	caching_ctl->block_group = cache;
-	caching_ctl->progress = cache->key.objectid;
-	refcount_set(&caching_ctl->count, 1);
-	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
-			caching_thread, NULL, NULL);
-
-	spin_lock(&cache->lock);
-	/*
-	 * This should be a rare occasion, but this could happen I think in the
-	 * case where one thread starts to load the space cache info, and then
-	 * some other thread starts a transaction commit which tries to do an
-	 * allocation while the other thread is still loading the space cache
-	 * info.  The previous loop should have kept us from choosing this block
-	 * group, but if we've moved to the state where we will wait on caching
-	 * block groups we need to first check if we're doing a fast load here,
-	 * so we can wait for it to finish, otherwise we could end up allocating
-	 * from a block group who's cache gets evicted for one reason or
-	 * another.
-	 */
-	while (cache->cached == BTRFS_CACHE_FAST) {
-		struct btrfs_caching_control *ctl;
-
-		ctl = cache->caching_ctl;
-		refcount_inc(&ctl->count);
-		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&cache->lock);
-
-		schedule();
-
-		finish_wait(&ctl->wait, &wait);
-		put_caching_control(ctl);
-		spin_lock(&cache->lock);
-	}
-
-	if (cache->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&cache->lock);
-		kfree(caching_ctl);
-		return 0;
-	}
-	WARN_ON(cache->caching_ctl);
-	cache->caching_ctl = caching_ctl;
-	cache->cached = BTRFS_CACHE_FAST;
-	spin_unlock(&cache->lock);
-
-	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
-		mutex_lock(&caching_ctl->mutex);
-		ret = load_free_space_cache(fs_info, cache);
-
-		spin_lock(&cache->lock);
-		if (ret == 1) {
-			cache->caching_ctl = NULL;
-			cache->cached = BTRFS_CACHE_FINISHED;
-			cache->last_byte_to_unpin = (u64)-1;
-			caching_ctl->progress = (u64)-1;
-		} else {
-			if (load_cache_only) {
-				cache->caching_ctl = NULL;
-				cache->cached = BTRFS_CACHE_NO;
-			} else {
-				cache->cached = BTRFS_CACHE_STARTED;
-				cache->has_caching_ctl = 1;
-			}
-		}
-		spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
-		if (ret == 1 &&
-		    btrfs_should_fragment_free_space(cache)) {
-			u64 bytes_used;
-
-			spin_lock(&cache->space_info->lock);
-			spin_lock(&cache->lock);
-			bytes_used = cache->key.offset -
-				btrfs_block_group_used(&cache->item);
-			cache->space_info->bytes_used += bytes_used >> 1;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-			fragment_free_space(cache);
-		}
-#endif
-		mutex_unlock(&caching_ctl->mutex);
-
-		wake_up(&caching_ctl->wait);
-		if (ret == 1) {
-			put_caching_control(caching_ctl);
-			free_excluded_extents(cache);
-			return 0;
-		}
-	} else {
-		/*
-		 * We're either using the free space tree or no caching at all.
-		 * Set cached to the appropriate value and wakeup any waiters.
-		 */
-		spin_lock(&cache->lock);
-		if (load_cache_only) {
-			cache->caching_ctl = NULL;
-			cache->cached = BTRFS_CACHE_NO;
-		} else {
-			cache->cached = BTRFS_CACHE_STARTED;
-			cache->has_caching_ctl = 1;
-		}
-		spin_unlock(&cache->lock);
-		wake_up(&caching_ctl->wait);
-	}
-
-	if (load_cache_only) {
-		put_caching_control(caching_ctl);
-		return 0;
-	}
-
-	down_write(&fs_info->commit_root_sem);
-	refcount_inc(&caching_ctl->count);
-	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
-	up_write(&fs_info->commit_root_sem);
-
-	btrfs_get_block_group(cache);
-
-	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
-
-	return ret;
-}
-
-/*
- * return the block group that starts at or after bytenr
- */
-static struct btrfs_block_group_cache *
-btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
-{
-	return block_group_cache_tree_search(info, bytenr, 0);
-}
-
-/*
- * return the block group that contains the given bytenr
- */
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
-						 struct btrfs_fs_info *info,
-						 u64 bytenr)
-{
-	return block_group_cache_tree_search(info, bytenr, 1);
-}
-
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
-						  u64 flags)
-{
-	struct list_head *head = &info->space_info;
-	struct btrfs_space_info *found;
-
-	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags & flags) {
-			rcu_read_unlock();
-			return found;
-		}
-	}
-	rcu_read_unlock();
-	return NULL;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
-			     bool metadata, u64 root_objectid)
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ref *ref)
 {
 	struct btrfs_space_info *space_info;
-	u64 flags;
+	u64 flags = generic_ref_to_space_flags(ref);
 
-	if (metadata) {
-		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
-			flags = BTRFS_BLOCK_GROUP_SYSTEM;
-		else
-			flags = BTRFS_BLOCK_GROUP_METADATA;
-	} else {
-		flags = BTRFS_BLOCK_GROUP_DATA;
-	}
-
-	space_info = __find_space_info(fs_info, flags);
+	space_info = btrfs_find_space_info(fs_info, flags);
 	ASSERT(space_info);
-	percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
+	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
 		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
 }
 
-/*
- * after adding space to the filesystem, we need to clear the full flags
- * on all the space infos.
- */
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ref *ref)
 {
-	struct list_head *head = &info->space_info;
-	struct btrfs_space_info *found;
+	struct btrfs_space_info *space_info;
+	u64 flags = generic_ref_to_space_flags(ref);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list)
-		found->full = 0;
-	rcu_read_unlock();
+	space_info = btrfs_find_space_info(fs_info, flags);
+	ASSERT(space_info);
+	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
+		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
 }
 
 /* simple helper to search for an existing data extent at a given offset */
@@ -1037,7 +382,7 @@
 
 /*
  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
  */
 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -1093,18 +438,18 @@
 	return BTRFS_REF_TYPE_INVALID;
 }
 
-static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 {
 	u32 high_crc = ~(u32)0;
 	u32 low_crc = ~(u32)0;
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
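
hash_extent_data_ref() now calls the btrfs_crc32c() wrapper rather than crc32c() directly; the hash itself is unchanged: three little-endian u64 fields folded through CRC32C, the two halves combined into a u64. A self-contained userspace sketch, with a bit-at-a-time CRC32C standing in for the kernel helper and the cpu_to_le64() conversions elided on the assumption of a little-endian host:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* bit-at-a-time CRC32C (reflected poly 0x82F63B78); like the kernel's
 * crc32c()/btrfs_crc32c(), it applies no pre/post inversion itself */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

static uint64_t hash_extent_data_ref(uint64_t root_objectid,
				     uint64_t owner, uint64_t offset)
{
	uint32_t high_crc = ~(uint32_t)0;
	uint32_t low_crc = ~(uint32_t)0;
	uint64_t lenum;

	lenum = root_objectid;	/* cpu_to_le64() elided: little-endian host */
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = owner;
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = offset;
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
	/* hypothetical root/owner/offset values, just to exercise the hash */
	printf("%llx\n", (unsigned long long)hash_extent_data_ref(5, 257, 0));
	return 0;
}
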
@@ -1686,7 +1031,7 @@
 	type = extent_ref_type(parent, owner);
 	size = btrfs_extent_inline_ref_size(type);
 
-	btrfs_extend_item(fs_info, path, size);
+	btrfs_extend_item(path, size);
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
@@ -1761,7 +1106,6 @@
 				  int *last_ref)
 {
 	struct extent_buffer *leaf = path->nodes[0];
-	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_extent_item *ei;
 	struct btrfs_extent_data_ref *dref = NULL;
 	struct btrfs_shared_data_ref *sref = NULL;
@@ -1816,7 +1160,7 @@
 			memmove_extent_buffer(leaf, ptr, ptr + size,
 					      end - ptr - size);
 		item_size -= size;
-		btrfs_truncate_item(fs_info, path, item_size, 1);
+		btrfs_truncate_item(path, item_size, 1);
 	}
 	btrfs_mark_buffer_dirty(leaf);
 }
@@ -1887,7 +1231,6 @@
 	return ret;
 }
 
-#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 			       u64 *discarded_bytes)
 {
@@ -2025,39 +1368,28 @@
 
 /* Can return -ENOMEM */
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset)
+			 struct btrfs_ref *generic_ref)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int old_ref_mod, new_ref_mod;
 	int ret;
 
-	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
-	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
+	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
+	       generic_ref->action);
+	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
+	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
 
-	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
-			   owner, offset, BTRFS_ADD_DELAYED_REF);
-
-	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
-						 num_bytes, parent,
-						 root_objectid, (int)owner,
-						 BTRFS_ADD_DELAYED_REF, NULL,
+	if (generic_ref->type == BTRFS_REF_METADATA)
+		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
+				NULL, &old_ref_mod, &new_ref_mod);
+	else
+		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
 						 &old_ref_mod, &new_ref_mod);
-	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr,
-						 num_bytes, parent,
-						 root_objectid, owner, offset,
-						 0, BTRFS_ADD_DELAYED_REF,
-						 &old_ref_mod, &new_ref_mod);
-	}
 
-	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
-		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+	btrfs_ref_tree_mod(fs_info, generic_ref);
 
-		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
-	}
+	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+		sub_pinned_bytes(fs_info, generic_ref);
 
 	return ret;
 }
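
btrfs_inc_extent_ref() trades its six positional arguments for a single btrfs_ref descriptor, letting the tree and data cases share one entry point and one pinned-bytes accounting path. A minimal sketch of this argument-struct refactor (demo_* mini-struct, not the real btrfs_ref layout):

typedef unsigned long long u64;

enum demo_ref_type { DEMO_REF_NOT_SET, DEMO_REF_TREE, DEMO_REF_DATA };

struct demo_ref {
	enum demo_ref_type type;
	int action;
	u64 bytenr;
	u64 num_bytes;
};

static void demo_init_ref(struct demo_ref *ref, int action,
			  u64 bytenr, u64 num_bytes)
{
	ref->type = DEMO_REF_NOT_SET;	/* filled in by a tree/data init */
	ref->action = action;
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
}

static int demo_mod_ref(struct demo_ref *ref)
{
	/* one entry point branches on the descriptor instead of
	 * growing parallel tree/data argument lists */
	return ref->type == DEMO_REF_TREE ? 0 : 1;
}
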
@@ -2377,7 +1709,7 @@
 {
 	struct btrfs_delayed_ref_node *ref;
 
-	if (RB_EMPTY_ROOT(&head->ref_tree))
+	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		return NULL;
 
 	/*
@@ -2390,7 +1722,7 @@
 		return list_first_entry(&head->ref_add_list,
 				struct btrfs_delayed_ref_node, add_list);
 
-	ref = rb_entry(rb_first(&head->ref_tree),
+	ref = rb_entry(rb_first_cached(&head->ref_tree),
 		       struct btrfs_delayed_ref_node, ref_node);
 	ASSERT(list_empty(&ref->add_list));
 	return ref;
@@ -2406,25 +1738,77 @@
 	btrfs_delayed_ref_unlock(head);
 }
 
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
-			     struct btrfs_delayed_ref_head *head)
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
+				struct btrfs_delayed_ref_head *head)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+
+	if (!extent_op)
+		return NULL;
+
+	if (head->must_insert_reserved) {
+		head->extent_op = NULL;
+		btrfs_free_delayed_extent_op(extent_op);
+		return NULL;
+	}
+	return extent_op;
+}
+
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
+				     struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
 
+	extent_op = cleanup_extent_op(head);
 	if (!extent_op)
 		return 0;
 	head->extent_op = NULL;
-	if (head->must_insert_reserved) {
-		btrfs_free_delayed_extent_op(extent_op);
-		return 0;
-	}
 	spin_unlock(&head->lock);
 	ret = run_delayed_extent_op(trans, head, extent_op);
 	btrfs_free_delayed_extent_op(extent_op);
 	return ret ? ret : 1;
 }
 
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+				  struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_delayed_ref_head *head)
+{
+	int nr_items = 1;	/* Dropping this ref head update. */
+
+	if (head->total_ref_mod < 0) {
+		struct btrfs_space_info *space_info;
+		u64 flags;
+
+		if (head->is_data)
+			flags = BTRFS_BLOCK_GROUP_DATA;
+		else if (head->is_system)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+		space_info = btrfs_find_space_info(fs_info, flags);
+		ASSERT(space_info);
+		percpu_counter_add_batch(&space_info->total_bytes_pinned,
+				   -head->num_bytes,
+				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
+
+		/*
+		 * We had csum deletions accounted for in our delayed refs rsv,
+		 * we need to drop the csum leaves for this update from our
+		 * delayed_refs_rsv.
+		 */
+		if (head->is_data) {
+			spin_lock(&delayed_refs->lock);
+			delayed_refs->pending_csums -= head->num_bytes;
+			spin_unlock(&delayed_refs->lock);
+			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
+				head->num_bytes);
+		}
+	}
+
+	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+}
+
 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 			    struct btrfs_delayed_ref_head *head)
 {
@@ -2435,7 +1819,7 @@
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
-	ret = cleanup_extent_op(trans, head);
+	ret = run_and_cleanup_extent_op(trans, head);
 	if (ret < 0) {
 		unselect_delayed_ref_head(delayed_refs, head);
 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -2451,42 +1835,14 @@
 	spin_unlock(&head->lock);
 	spin_lock(&delayed_refs->lock);
 	spin_lock(&head->lock);
-	if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
 		spin_unlock(&head->lock);
 		spin_unlock(&delayed_refs->lock);
 		return 1;
 	}
-	delayed_refs->num_heads--;
-	rb_erase(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
+	btrfs_delete_ref_head(delayed_refs, head);
 	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
-	atomic_dec(&delayed_refs->num_entries);
-
-	trace_run_delayed_ref_head(fs_info, head, 0);
-
-	if (head->total_ref_mod < 0) {
-		struct btrfs_space_info *space_info;
-		u64 flags;
-
-		if (head->is_data)
-			flags = BTRFS_BLOCK_GROUP_DATA;
-		else if (head->is_system)
-			flags = BTRFS_BLOCK_GROUP_SYSTEM;
-		else
-			flags = BTRFS_BLOCK_GROUP_METADATA;
-		space_info = __find_space_info(fs_info, flags);
-		ASSERT(space_info);
-		percpu_counter_add_batch(&space_info->total_bytes_pinned,
-				   -head->num_bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
-
-		if (head->is_data) {
-			spin_lock(&delayed_refs->lock);
-			delayed_refs->pending_csums -= head->num_bytes;
-			spin_unlock(&delayed_refs->lock);
-		}
-	}
 
 	if (head->must_insert_reserved) {
 		btrfs_pin_extent(fs_info, head->bytenr,
@@ -2497,110 +1853,74 @@
 		}
 	}
 
-	/* Also free its reserved qgroup space */
-	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
-				      head->qgroup_reserved);
+	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+
+	trace_run_delayed_ref_head(fs_info, head, 0);
 	btrfs_delayed_ref_unlock(head);
 	btrfs_put_delayed_ref_head(head);
 	return 0;
 }
 
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-					     unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+					struct btrfs_trans_handle *trans)
+{
+	struct btrfs_delayed_ref_root *delayed_refs =
+		&trans->transaction->delayed_refs;
+	struct btrfs_delayed_ref_head *head = NULL;
+	int ret;
+
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_select_ref_head(delayed_refs);
+	if (!head) {
+		spin_unlock(&delayed_refs->lock);
+		return head;
+	}
+
+	/*
+	 * Grab the lock that says we are going to process all the refs for
+	 * this head
+	 */
+	ret = btrfs_delayed_ref_lock(delayed_refs, head);
+	spin_unlock(&delayed_refs->lock);
+
+	/*
+	 * We may have dropped the spin lock to get the head mutex lock, and
+	 * that might have given someone else time to free the head.  If that's
+	 * true, it has been removed from our list and we can move on.
+	 */
+	if (ret == -EAGAIN)
+		head = ERR_PTR(-EAGAIN);
+
+	return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_head *locked_ref,
+				    unsigned long *run_refs)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
-	ktime_t start = ktime_get();
-	int ret;
-	unsigned long count = 0;
-	unsigned long actual_count = 0;
+	struct btrfs_delayed_ref_node *ref;
 	int must_insert_reserved = 0;
+	int ret;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	while (1) {
-		if (!locked_ref) {
-			if (count >= nr)
-				break;
 
-			spin_lock(&delayed_refs->lock);
-			locked_ref = btrfs_select_ref_head(trans);
-			if (!locked_ref) {
-				spin_unlock(&delayed_refs->lock);
-				break;
-			}
+	lockdep_assert_held(&locked_ref->mutex);
+	lockdep_assert_held(&locked_ref->lock);
 
-			/* grab the lock that says we are going to process
-			 * all the refs for this head */
-			ret = btrfs_delayed_ref_lock(trans, locked_ref);
-			spin_unlock(&delayed_refs->lock);
-			/*
-			 * we may have dropped the spin lock to get the head
-			 * mutex lock, and that might have given someone else
-			 * time to free the head.  If that's true, it has been
-			 * removed from our list and we can move on.
-			 */
-			if (ret == -EAGAIN) {
-				locked_ref = NULL;
-				count++;
-				continue;
-			}
-		}
-
-		/*
-		 * We need to try and merge add/drops of the same ref since we
-		 * can run into issues with relocate dropping the implicit ref
-		 * and then it being added back again before the drop can
-		 * finish.  If we merged anything we need to re-loop so we can
-		 * get a good ref.
-		 * Or we can get node references of the same type that weren't
-		 * merged when created due to bumps in the tree mod seq, and
-		 * we need to merge them to prevent adding an inline extent
-		 * backref before dropping it (triggering a BUG_ON at
-		 * insert_inline_extent_backref()).
-		 */
-		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
-		ref = select_delayed_ref(locked_ref);
-
-		if (ref && ref->seq &&
+	while ((ref = select_delayed_ref(locked_ref))) {
+		if (ref->seq &&
 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
 			spin_unlock(&locked_ref->lock);
 			unselect_delayed_ref_head(delayed_refs, locked_ref);
-			locked_ref = NULL;
-			cond_resched();
-			count++;
-			continue;
+			return -EAGAIN;
 		}
 
-		/*
-		 * We're done processing refs in this ref_head, clean everything
-		 * up and move on to the next ref_head.
-		 */
-		if (!ref) {
-			ret = cleanup_ref_head(trans, locked_ref);
-			if (ret > 0 ) {
-				/* We dropped our lock, we need to loop. */
-				ret = 0;
-				continue;
-			} else if (ret) {
-				return ret;
-			}
-			locked_ref = NULL;
-			count++;
-			continue;
-		}
-
-		actual_count++;
+		(*run_refs)++;
 		ref->in_tree = 0;
-		rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
 		RB_CLEAR_NODE(&ref->ref_node);
 		if (!list_empty(&ref->add_list))
 			list_del(&ref->add_list);
@@ -2622,8 +1942,8 @@
 		atomic_dec(&delayed_refs->num_entries);
 
 		/*
-		 * Record the must-insert_reserved flag before we drop the spin
-		 * lock.
+		 * Record the must_insert_reserved flag before we drop the
+		 * spin lock.
 		 */
 		must_insert_reserved = locked_ref->must_insert_reserved;
 		locked_ref->must_insert_reserved = 0;
@@ -2645,10 +1965,90 @@
 		}
 
 		btrfs_put_delayed_ref(ref);
-		count++;
 		cond_resched();
+
+		spin_lock(&locked_ref->lock);
+		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
 	}
 
+	return 0;
+}
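
btrfs_obtain_ref_head() packs two distinct outcomes into one pointer: NULL for "no heads left" and ERR_PTR(-EAGAIN) for "lost the race, pick another head", which the do/while loop below separates via IS_ERR_OR_NULL() and PTR_ERR(). A userspace sketch of that encoding, with simplified reimplementations of the linux/err.h helpers:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR_OR_NULL(const void *ptr)
{
	return !ptr || (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int head = 42;

static void *obtain(int state)
{
	if (state == 0)
		return NULL;			/* queue empty: stop */
	if (state == 1)
		return ERR_PTR(-EAGAIN);	/* raced: try another */
	return &head;				/* success: locked a head */
}

int main(void)
{
	for (int state = 0; state < 3; state++) {
		void *p = obtain(state);

		if (IS_ERR_OR_NULL(p))
			printf("state %d: %s\n", state,
			       PTR_ERR(p) == -EAGAIN ? "retry" : "stop");
		else
			printf("state %d: got a head\n", state);
	}
	return 0;
}
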
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+					     unsigned long nr)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	ktime_t start = ktime_get();
+	int ret;
+	unsigned long count = 0;
+	unsigned long actual_count = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	do {
+		if (!locked_ref) {
+			locked_ref = btrfs_obtain_ref_head(trans);
+			if (IS_ERR_OR_NULL(locked_ref)) {
+				if (PTR_ERR(locked_ref) == -EAGAIN) {
+					continue;
+				} else {
+					break;
+				}
+			}
+			count++;
+		}
+		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 * Or we can get node references of the same type that weren't
+		 * merged when created due to bumps in the tree mod seq, and
+		 * we need to merge them to prevent adding an inline extent
+		 * backref before dropping it (triggering a BUG_ON at
+		 * insert_inline_extent_backref()).
+		 */
+		spin_lock(&locked_ref->lock);
+		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+						      &actual_count);
+		if (ret < 0 && ret != -EAGAIN) {
+			/*
+			 * Error, btrfs_run_delayed_refs_for_head already
+			 * unlocked everything so just bail out
+			 */
+			return ret;
+		} else if (!ret) {
+			/*
+			 * Success, perform the usual cleanup of a processed
+			 * head
+			 */
+			ret = cleanup_ref_head(trans, locked_ref);
+			if (ret > 0 ) {
+				/* We dropped our lock, we need to loop. */
+				ret = 0;
+				continue;
+			} else if (ret) {
+				return ret;
+			}
+		}
+
+		/*
+		 * Either success case or btrfs_run_delayed_refs_for_head
+		 * returned -EAGAIN, meaning we need to select another head
+		 */
+
+		locked_ref = NULL;
+		cond_resched();
+	} while ((nr != -1 && count < nr) || locked_ref);
+
 	/*
 	 * We don't want to include ref heads since we can have empty ref heads
 	 * and those will drastically skew our runtime down since we just do
@@ -2748,153 +2148,6 @@
 	return num_csums;
 }
 
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_block_rsv *global_rsv;
-	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
-	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
-	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
-	u64 num_bytes, num_dirty_bgs_bytes;
-	int ret = 0;
-
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-	num_heads = heads_to_leaves(fs_info, num_heads);
-	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * fs_info->nodesize;
-	num_bytes <<= 1;
-	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
-							fs_info->nodesize;
-	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
-							     num_dirty_bgs);
-	global_rsv = &fs_info->global_block_rsv;
-
-	/*
-	 * If we can't allocate any more chunks lets make sure we have _lots_ of
-	 * wiggle room since running delayed refs can create more delayed refs.
-	 */
-	if (global_rsv->space_info->full) {
-		num_dirty_bgs_bytes <<= 1;
-		num_bytes <<= 1;
-	}
-
-	spin_lock(&global_rsv->lock);
-	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
-		ret = 1;
-	spin_unlock(&global_rsv->lock);
-	return ret;
-}
-
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_fs_info *fs_info)
-{
-	u64 num_entries =
-		atomic_read(&trans->transaction->delayed_refs.num_entries);
-	u64 avg_runtime;
-	u64 val;
-
-	smp_mb();
-	avg_runtime = fs_info->avg_delayed_ref_runtime;
-	val = num_entries * avg_runtime;
-	if (val >= NSEC_PER_SEC)
-		return 1;
-	if (val >= NSEC_PER_SEC / 2)
-		return 2;
-
-	return btrfs_check_space_for_delayed_refs(trans, fs_info);
-}
-
-struct async_delayed_refs {
-	struct btrfs_root *root;
-	u64 transid;
-	int count;
-	int error;
-	int sync;
-	struct completion wait;
-	struct btrfs_work work;
-};
-
-static inline struct async_delayed_refs *
-to_async_delayed_refs(struct btrfs_work *work)
-{
-	return container_of(work, struct async_delayed_refs, work);
-}
-
-static void delayed_ref_async_start(struct btrfs_work *work)
-{
-	struct async_delayed_refs *async = to_async_delayed_refs(work);
-	struct btrfs_trans_handle *trans;
-	struct btrfs_fs_info *fs_info = async->root->fs_info;
-	int ret;
-
-	/* if the commit is already started, we don't need to wait here */
-	if (btrfs_transaction_blocked(fs_info))
-		goto done;
-
-	trans = btrfs_join_transaction(async->root);
-	if (IS_ERR(trans)) {
-		async->error = PTR_ERR(trans);
-		goto done;
-	}
-
-	/*
-	 * trans->sync means that when we call end_transaction, we won't
-	 * wait on delayed refs
-	 */
-	trans->sync = true;
-
-	/* Don't bother flushing if we got into a different transaction */
-	if (trans->transid > async->transid)
-		goto end;
-
-	ret = btrfs_run_delayed_refs(trans, async->count);
-	if (ret)
-		async->error = ret;
-end:
-	ret = btrfs_end_transaction(trans);
-	if (ret && !async->error)
-		async->error = ret;
-done:
-	if (async->sync)
-		complete(&async->wait);
-	else
-		kfree(async);
-}
-
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
-				 unsigned long count, u64 transid, int wait)
-{
-	struct async_delayed_refs *async;
-	int ret;
-
-	async = kmalloc(sizeof(*async), GFP_NOFS);
-	if (!async)
-		return -ENOMEM;
-
-	async->root = fs_info->tree_root;
-	async->count = count;
-	async->error = 0;
-	async->transid = transid;
-	if (wait)
-		async->sync = 1;
-	else
-		async->sync = 0;
-	init_completion(&async->wait);
-
-	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
-			delayed_ref_async_start, NULL, NULL);
-
-	btrfs_queue_work(fs_info->extent_workers, &async->work);
-
-	if (wait) {
-		wait_for_completion(&async->wait);
-		ret = async->error;
-		kfree(async);
-		return ret;
-	}
-	return 0;
-}
-
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2937,11 +2190,10 @@
 	}
 
 	if (run_all) {
-		if (!list_empty(&trans->new_bgs))
-			btrfs_create_pending_block_groups(trans);
+		btrfs_create_pending_block_groups(trans);
 
 		spin_lock(&delayed_refs->lock);
-		node = rb_first(&delayed_refs->href_root);
+		node = rb_first_cached(&delayed_refs->href_root);
 		if (!node) {
 			spin_unlock(&delayed_refs->lock);
 			goto out;
@@ -2964,7 +2216,6 @@
 }
 
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
 				u64 bytenr, u64 num_bytes, u64 flags,
 				int level, int is_data)
 {
@@ -2981,8 +2232,7 @@
 	extent_op->is_data = is_data ? true : false;
 	extent_op->level = level;
 
-	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
-					  num_bytes, extent_op);
+	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
 	if (ret)
 		btrfs_free_delayed_extent_op(extent_op);
 	return ret;
@@ -3040,7 +2290,8 @@
 	 * XXX: We should replace this with a proper search function in the
 	 * future.
 	 */
-	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&head->ref_tree); node;
+	     node = rb_next(node)) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		/* If it's a shared ref we know a cross reference exists */
 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3106,16 +2357,19 @@
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 
+	/* If extent item has more than 1 inline ref then it's shared */
 	if (item_size != sizeof(*ei) +
 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
 		goto out;
 
+	/* If extent created before last snapshot => it's definitely shared */
 	if (btrfs_extent_generation(leaf, ei) <=
 	    btrfs_root_last_snapshot(&root->root_item))
 		goto out;
 
 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
 
+	/* If this extent has SHARED_DATA_REF then it's shared */
 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
 		goto out;
@@ -3139,7 +2393,6 @@
 {
 	struct btrfs_path *path;
 	int ret;
-	int ret2;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3151,17 +2404,9 @@
 		if (ret && ret != -ENOENT)
 			goto out;
 
-		ret2 = check_delayed_ref(root, path, objectid,
-					 offset, bytenr);
-	} while (ret2 == -EAGAIN);
+		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+	} while (ret == -EAGAIN);
 
-	if (ret2 && ret2 != -ENOENT) {
-		ret = ret2;
-		goto out;
-	}
-
-	if (ret != -ENOENT || ret2 != -ENOENT)
-		ret = 0;
 out:
 	btrfs_free_path(path);
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -3182,13 +2427,12 @@
 	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_ref generic_ref = { 0 };
+	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
 	int i;
+	int action;
 	int level;
 	int ret = 0;
-	int (*process_func)(struct btrfs_trans_handle *,
-			    struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64);
-
 
 	if (btrfs_is_testing(fs_info))
 		return 0;
@@ -3200,15 +2444,14 @@
 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
 		return 0;
 
-	if (inc)
-		process_func = btrfs_inc_extent_ref;
-	else
-		process_func = btrfs_free_extent;
-
 	if (full_backref)
 		parent = buf->start;
 	else
 		parent = 0;
+	if (inc)
+		action = BTRFS_ADD_DELAYED_REF;
+	else
+		action = BTRFS_DROP_DELAYED_REF;
 
 	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
@@ -3226,16 +2469,30 @@
 
 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
 			key.offset -= btrfs_file_extent_offset(buf, fi);
-			ret = process_func(trans, root, bytenr, num_bytes,
-					   parent, ref_root, key.objectid,
-					   key.offset);
+			btrfs_init_generic_ref(&generic_ref, action, bytenr,
+					       num_bytes, parent);
+			generic_ref.real_root = root->root_key.objectid;
+			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
+					    key.offset);
+			generic_ref.skip_qgroup = for_reloc;
+			if (inc)
+				ret = btrfs_inc_extent_ref(trans, &generic_ref);
+			else
+				ret = btrfs_free_extent(trans, &generic_ref);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			num_bytes = fs_info->nodesize;
-			ret = process_func(trans, root, bytenr, num_bytes,
-					   parent, ref_root, level - 1, 0);
+			btrfs_init_generic_ref(&generic_ref, action, bytenr,
+					       num_bytes, parent);
+			generic_ref.real_root = root->root_key.objectid;
+			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
+			generic_ref.skip_qgroup = for_reloc;
+			if (inc)
+				ret = btrfs_inc_extent_ref(trans, &generic_ref);
+			else
+				ret = btrfs_free_extent(trans, &generic_ref);
 			if (ret)
 				goto fail;
 		}
@@ -3257,552 +2514,6 @@
 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 }
 
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
-				 struct btrfs_fs_info *fs_info,
-				 struct btrfs_path *path,
-				 struct btrfs_block_group_cache *cache)
-{
-	int ret;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	unsigned long bi;
-	struct extent_buffer *leaf;
-
-	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto fail;
-	}
-
-	leaf = path->nodes[0];
-	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
-	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
-	btrfs_mark_buffer_dirty(leaf);
-fail:
-	btrfs_release_path(path);
-	return ret;
-}
-
-static struct btrfs_block_group_cache *
-next_block_group(struct btrfs_fs_info *fs_info,
-		 struct btrfs_block_group_cache *cache)
-{
-	struct rb_node *node;
-
-	spin_lock(&fs_info->block_group_cache_lock);
-
-	/* If our block group was removed, we need a full search. */
-	if (RB_EMPTY_NODE(&cache->cache_node)) {
-		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
-
-		spin_unlock(&fs_info->block_group_cache_lock);
-		btrfs_put_block_group(cache);
-		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
-		return cache;
-	}
-	node = rb_next(&cache->cache_node);
-	btrfs_put_block_group(cache);
-	if (node) {
-		cache = rb_entry(node, struct btrfs_block_group_cache,
-				 cache_node);
-		btrfs_get_block_group(cache);
-	} else
-		cache = NULL;
-	spin_unlock(&fs_info->block_group_cache_lock);
-	return cache;
-}
-
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
-			    struct btrfs_trans_handle *trans,
-			    struct btrfs_path *path)
-{
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *root = fs_info->tree_root;
-	struct inode *inode = NULL;
-	struct extent_changeset *data_reserved = NULL;
-	u64 alloc_hint = 0;
-	int dcs = BTRFS_DC_ERROR;
-	u64 num_pages = 0;
-	int retries = 0;
-	int ret = 0;
-
-	/*
-	 * If this block group is smaller than 100 megs don't bother caching the
-	 * block group.
-	 */
-	if (block_group->key.offset < (100 * SZ_1M)) {
-		spin_lock(&block_group->lock);
-		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		return 0;
-	}
-
-	if (trans->aborted)
-		return 0;
-again:
-	inode = lookup_free_space_inode(fs_info, block_group, path);
-	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
-		ret = PTR_ERR(inode);
-		btrfs_release_path(path);
-		goto out;
-	}
-
-	if (IS_ERR(inode)) {
-		BUG_ON(retries);
-		retries++;
-
-		if (block_group->ro)
-			goto out_free;
-
-		ret = create_free_space_inode(fs_info, trans, block_group,
-					      path);
-		if (ret)
-			goto out_free;
-		goto again;
-	}
-
-	/*
-	 * We want to set the generation to 0, that way if anything goes wrong
-	 * from here on out we know not to trust this cache when we load up next
-	 * time.
-	 */
-	BTRFS_I(inode)->generation = 0;
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret) {
-		/*
-		 * So theoretically we could recover from this, simply set the
-		 * super cache generation to 0 so we know to invalidate the
-		 * cache, but then we'd have to keep track of the block groups
-		 * that fail this way so we know we _have_ to reset this cache
-		 * before the next commit or risk reading stale cache.  So to
-		 * limit our exposure to horrible edge cases, let's just abort the
-		 * transaction, this only happens in really bad situations
-		 * anyway.
-		 */
-		btrfs_abort_transaction(trans, ret);
-		goto out_put;
-	}
-	WARN_ON(ret);
-
-	/* We've already set up this transaction, go ahead and exit */
-	if (block_group->cache_generation == trans->transid &&
-	    i_size_read(inode)) {
-		dcs = BTRFS_DC_SETUP;
-		goto out_put;
-	}
-
-	if (i_size_read(inode) > 0) {
-		ret = btrfs_check_trunc_cache_free_space(fs_info,
-					&fs_info->global_block_rsv);
-		if (ret)
-			goto out_put;
-
-		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
-		if (ret)
-			goto out_put;
-	}
-
-	spin_lock(&block_group->lock);
-	if (block_group->cached != BTRFS_CACHE_FINISHED ||
-	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
-		/*
-		 * don't bother trying to write stuff out _if_
-		 * a) we're not cached,
-		 * b) we're using the nospace_cache mount option, or
-		 * c) we're using the v2 space_cache (FREE_SPACE_TREE).
-		 */
-		dcs = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		goto out_put;
-	}
-	spin_unlock(&block_group->lock);
-
-	/*
-	 * We hit an ENOSPC when setting up the cache in this transaction, just
-	 * skip doing the setup, we've already cleared the cache so we're safe.
-	 */
-	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
-		ret = -ENOSPC;
-		goto out_put;
-	}
-
-	/*
-	 * Try to preallocate enough space based on how big the block group is.
-	 * Keep in mind this has to include any pinned space which could end up
-	 * taking up quite a bit since it's not folded into the other space
-	 * cache.
-	 */
-	num_pages = div_u64(block_group->key.offset, SZ_256M);
-	if (!num_pages)
-		num_pages = 1;
-
-	num_pages *= 16;
-	num_pages *= PAGE_SIZE;
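-	/*
-	 * e.g. with 4K pages, a 1GiB block group yields 4 * 16 = 64 pages,
-	 * i.e. a 256KiB preallocation for its free space cache file.
-	 */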
-
-	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
-	if (ret)
-		goto out_put;
-
-	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
-					      num_pages, num_pages,
-					      &alloc_hint);
-	/*
-	 * Our cache requires contiguous chunks so that we don't modify a bunch
-	 * of metadata or split extents when writing the cache out, which means
-	 * we can enospc if we are heavily fragmented in addition to just normal
-	 * out of space conditions.  So if we hit this just skip setting up any
-	 * other block groups for this transaction, maybe we'll unpin enough
-	 * space the next time around.
-	 */
-	if (!ret)
-		dcs = BTRFS_DC_SETUP;
-	else if (ret == -ENOSPC)
-		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-
-out_put:
-	iput(inode);
-out_free:
-	btrfs_release_path(path);
-out:
-	spin_lock(&block_group->lock);
-	if (!ret && dcs == BTRFS_DC_SETUP)
-		block_group->cache_generation = trans->transid;
-	block_group->disk_cache_state = dcs;
-	spin_unlock(&block_group->lock);
-
-	extent_changeset_free(data_reserved);
-	return ret;
-}
-
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_block_group_cache *cache, *tmp;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct btrfs_path *path;
-
-	if (list_empty(&cur_trans->dirty_bgs) ||
-	    !btrfs_test_opt(fs_info, SPACE_CACHE))
-		return 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* Could add new block groups, use _safe just in case */
-	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
-				 dirty_list) {
-		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-			cache_save_setup(cache, trans, path);
-	}
-
-	btrfs_free_path(path);
-	return 0;
-}
-
-/*
- * transaction commit does final block group cache writeback during a
- * critical section where nothing is allowed to change the FS.  This is
- * required in order for the cache to actually match the block group,
- * but can introduce a lot of latency into the commit.
- *
- * So, btrfs_start_dirty_block_groups is here to kick off block group
- * cache IO.  There's a chance we'll have to redo some of it if the
- * block group changes again during the commit, but it greatly reduces
- * the commit latency by getting rid of the easy block groups while
- * we're still allowing others to join the commit.
- */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	int ret = 0;
-	int should_put;
-	struct btrfs_path *path = NULL;
-	LIST_HEAD(dirty);
-	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
-	int loops = 0;
-
-	spin_lock(&cur_trans->dirty_bgs_lock);
-	if (list_empty(&cur_trans->dirty_bgs)) {
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-		return 0;
-	}
-	list_splice_init(&cur_trans->dirty_bgs, &dirty);
-	spin_unlock(&cur_trans->dirty_bgs_lock);
-
-again:
-	/*
-	 * make sure all the block groups on our dirty list actually
-	 * exist
-	 */
-	btrfs_create_pending_block_groups(trans);
-
-	if (!path) {
-		path = btrfs_alloc_path();
-		if (!path)
-			return -ENOMEM;
-	}
-
-	/*
-	 * cache_write_mutex is here only to save us from balance or automatic
-	 * removal of empty block groups deleting this block group while we are
-	 * writing out the cache
-	 */
-	mutex_lock(&trans->transaction->cache_write_mutex);
-	while (!list_empty(&dirty)) {
-		cache = list_first_entry(&dirty,
-					 struct btrfs_block_group_cache,
-					 dirty_list);
-		/*
-		 * this can happen if something re-dirties a block
-		 * group that is already under IO.  Just wait for it to
-		 * finish and then do it all again
-		 */
-		if (!list_empty(&cache->io_list)) {
-			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(trans, cache, path);
-			btrfs_put_block_group(cache);
-		}
-
-		/*
-		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
-		 * if it should update the cache_state.  Don't delete
-		 * until after we wait.
-		 *
-		 * Since we're not running in the commit critical section
-		 * we need the dirty_bgs_lock to protect from update_block_group
-		 */
-		spin_lock(&cur_trans->dirty_bgs_lock);
-		list_del_init(&cache->dirty_list);
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-
-		should_put = 1;
-
-		cache_save_setup(cache, trans, path);
-
-		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
-			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(fs_info, trans,
-						    cache, path);
-			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
-				should_put = 0;
-
-				/*
-				 * The cache_write_mutex is protecting the
-				 * io_list, also refer to the definition of
-				 * btrfs_transaction::io_bgs for more details
-				 */
-				list_add_tail(&cache->io_list, io);
-			} else {
-				/*
-				 * if we failed to write the cache, the
-				 * generation will be bad and life goes on
-				 */
-				ret = 0;
-			}
-		}
-		if (!ret) {
-			ret = write_one_cache_group(trans, fs_info,
-						    path, cache);
-			/*
-			 * Our block group might still be attached to the list
-			 * of new block groups in the transaction handle of some
-			 * other task (struct btrfs_trans_handle->new_bgs). This
-			 * means its block group item isn't yet in the extent
-			 * tree. If this happens ignore the error, as we will
-			 * try again later in the critical section of the
-			 * transaction commit.
-			 */
-			if (ret == -ENOENT) {
-				ret = 0;
-				spin_lock(&cur_trans->dirty_bgs_lock);
-				if (list_empty(&cache->dirty_list)) {
-					list_add_tail(&cache->dirty_list,
-						      &cur_trans->dirty_bgs);
-					btrfs_get_block_group(cache);
-				}
-				spin_unlock(&cur_trans->dirty_bgs_lock);
-			} else if (ret) {
-				btrfs_abort_transaction(trans, ret);
-			}
-		}
-
-		/* if it's not on the io list, we need to put the block group */
-		if (should_put)
-			btrfs_put_block_group(cache);
-
-		if (ret)
-			break;
-
-		/*
-		 * Avoid blocking other tasks for too long. It might even save
-		 * us from writing caches for block groups that are going to be
-		 * removed.
-		 */
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-		mutex_lock(&trans->transaction->cache_write_mutex);
-	}
-	mutex_unlock(&trans->transaction->cache_write_mutex);
-
-	/*
-	 * go through delayed refs for all the stuff we've just kicked off
-	 * and then loop back (just once)
-	 */
-	ret = btrfs_run_delayed_refs(trans, 0);
-	if (!ret && loops == 0) {
-		loops++;
-		spin_lock(&cur_trans->dirty_bgs_lock);
-		list_splice_init(&cur_trans->dirty_bgs, &dirty);
-		/*
-		 * dirty_bgs_lock protects us from concurrent block group
-		 * deletes too (not just cache_write_mutex).
-		 */
-		if (!list_empty(&dirty)) {
-			spin_unlock(&cur_trans->dirty_bgs_lock);
-			goto again;
-		}
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-	} else if (ret < 0) {
-		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
-	}
-
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	int ret = 0;
-	int should_put;
-	struct btrfs_path *path;
-	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/*
-	 * Even though we are in the critical section of the transaction commit,
-	 * we can still have concurrent tasks adding elements to this
-	 * transaction's list of dirty block groups. These tasks correspond to
-	 * endio free space workers started when writeback finishes for a
-	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
-	 * allocate new block groups as a result of COWing nodes of the root
-	 * tree when updating the free space inode. The writeback for the space
-	 * caches is triggered by an earlier call to
-	 * btrfs_start_dirty_block_groups() and iterations of the following
-	 * loop.
-	 * Also we want to do the cache_save_setup first and then run the
-	 * delayed refs to make sure we have the best chance at doing this all
-	 * in one shot.
-	 */
-	spin_lock(&cur_trans->dirty_bgs_lock);
-	while (!list_empty(&cur_trans->dirty_bgs)) {
-		cache = list_first_entry(&cur_trans->dirty_bgs,
-					 struct btrfs_block_group_cache,
-					 dirty_list);
-
-		/*
-		 * this can happen if cache_save_setup re-dirties a block
-		 * group that is already under IO.  Just wait for it to
-		 * finish and then do it all again
-		 */
-		if (!list_empty(&cache->io_list)) {
-			spin_unlock(&cur_trans->dirty_bgs_lock);
-			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(trans, cache, path);
-			btrfs_put_block_group(cache);
-			spin_lock(&cur_trans->dirty_bgs_lock);
-		}
-
-		/*
-		 * don't remove from the dirty list until after we've waited
-		 * on any pending IO
-		 */
-		list_del_init(&cache->dirty_list);
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-		should_put = 1;
-
-		cache_save_setup(cache, trans, path);
-
-		if (!ret)
-			ret = btrfs_run_delayed_refs(trans,
-						     (unsigned long) -1);
-
-		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
-			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(fs_info, trans,
-						    cache, path);
-			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
-				should_put = 0;
-				list_add_tail(&cache->io_list, io);
-			} else {
-				/*
-				 * if we failed to write the cache, the
-				 * generation will be bad and life goes on
-				 */
-				ret = 0;
-			}
-		}
-		if (!ret) {
-			ret = write_one_cache_group(trans, fs_info,
-						    path, cache);
-			/*
-			 * One of the free space endio workers might have
-			 * created a new block group while updating a free space
-			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
-			 * and hasn't released its transaction handle yet, in
-			 * which case the new block group is still attached to
-			 * its transaction handle and its creation has not
-			 * finished yet (no block group item in the extent tree
-			 * yet, etc). If this is the case, wait for all free
-			 * space endio workers to finish and retry. This is a
-			 * very rare case, so no need for a more efficient and
-			 * complex approach.
-			 */
-			if (ret == -ENOENT) {
-				wait_event(cur_trans->writer_wait,
-				   atomic_read(&cur_trans->num_writers) == 1);
-				ret = write_one_cache_group(trans, fs_info,
-							    path, cache);
-			}
-			if (ret)
-				btrfs_abort_transaction(trans, ret);
-		}
-
-		/* if it's not on the io list, we need to put the block group */
-		if (should_put)
-			btrfs_put_block_group(cache);
-		spin_lock(&cur_trans->dirty_bgs_lock);
-	}
-	spin_unlock(&cur_trans->dirty_bgs_lock);
-
-	/*
-	 * Refer to the definition of the io_bgs member for details on why it
-	 * is safe to use it without any locking
-	 */
-	while (!list_empty(io)) {
-		cache = list_first_entry(io, struct btrfs_block_group_cache,
-					 io_list);
-		list_del_init(&cache->io_list);
-		btrfs_wait_cache_io(trans, cache, path);
-		btrfs_put_block_group(cache);
-	}
-
-	btrfs_free_path(path);
-	return ret;
-}
-
 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *block_group;
@@ -3816,254 +2527,6 @@
 	return readonly;
 }
 
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_block_group_cache *bg;
-	bool ret = true;
-
-	bg = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!bg)
-		return false;
-
-	spin_lock(&bg->lock);
-	if (bg->ro)
-		ret = false;
-	else
-		atomic_inc(&bg->nocow_writers);
-	spin_unlock(&bg->lock);
-
-	/* no put on block group, done by btrfs_dec_nocow_writers */
-	if (!ret)
-		btrfs_put_block_group(bg);
-
-	return ret;
-}
-
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_block_group_cache *bg;
-
-	bg = btrfs_lookup_block_group(fs_info, bytenr);
-	ASSERT(bg);
-	if (atomic_dec_and_test(&bg->nocow_writers))
-		wake_up_var(&bg->nocow_writers);
-	/*
-	 * Once for our lookup and once for the lookup done by a previous call
-	 * to btrfs_inc_nocow_writers()
-	 */
-	btrfs_put_block_group(bg);
-	btrfs_put_block_group(bg);
-}
-
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
-{
-	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
-}
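-
-/*
- * Illustrative pairing (a sketch, not code from this patch): a nocow writer
- * takes a reference before doing the write and drops it when done:
- *
- *	if (btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
- *		... do the nocow write ...
- *		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- *	}
- *
- * btrfs_wait_nocow_writers() lets callers such as block group removal wait
- * for all outstanding writers to finish.
- */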
-
-static const char *alloc_name(u64 flags)
-{
-	switch (flags) {
-	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
-		return "mixed";
-	case BTRFS_BLOCK_GROUP_METADATA:
-		return "metadata";
-	case BTRFS_BLOCK_GROUP_DATA:
-		return "data";
-	case BTRFS_BLOCK_GROUP_SYSTEM:
-		return "system";
-	default:
-		WARN_ON(1);
-		return "invalid-combination";
-	}
-}
-
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
-{
-	struct btrfs_space_info *space_info;
-	int i;
-	int ret;
-
-	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
-	if (!space_info)
-		return -ENOMEM;
-
-	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
-				 GFP_KERNEL);
-	if (ret) {
-		kfree(space_info);
-		return ret;
-	}
-
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-		INIT_LIST_HEAD(&space_info->block_groups[i]);
-	init_rwsem(&space_info->groups_sem);
-	spin_lock_init(&space_info->lock);
-	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
-	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
-	init_waitqueue_head(&space_info->wait);
-	INIT_LIST_HEAD(&space_info->ro_bgs);
-	INIT_LIST_HEAD(&space_info->tickets);
-	INIT_LIST_HEAD(&space_info->priority_tickets);
-
-	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
-				    info->space_info_kobj, "%s",
-				    alloc_name(space_info->flags));
-	if (ret) {
-		percpu_counter_destroy(&space_info->total_bytes_pinned);
-		kfree(space_info);
-		return ret;
-	}
-
-	list_add_rcu(&space_info->list, &info->space_info);
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		info->data_sinfo = space_info;
-
-	return ret;
-}
-
-static void update_space_info(struct btrfs_fs_info *info, u64 flags,
-			     u64 total_bytes, u64 bytes_used,
-			     u64 bytes_readonly,
-			     struct btrfs_space_info **space_info)
-{
-	struct btrfs_space_info *found;
-	int factor;
-
-	factor = btrfs_bg_type_to_factor(flags);
-
-	found = __find_space_info(info, flags);
-	ASSERT(found);
-	spin_lock(&found->lock);
-	found->total_bytes += total_bytes;
-	found->disk_total += total_bytes * factor;
-	found->bytes_used += bytes_used;
-	found->disk_used += bytes_used * factor;
-	found->bytes_readonly += bytes_readonly;
-	if (total_bytes > 0)
-		found->full = 0;
-	space_info_add_new_bytes(info, found, total_bytes -
-				 bytes_used - bytes_readonly);
-	spin_unlock(&found->lock);
-	*space_info = found;
-}
-
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 extra_flags = chunk_to_extended(flags) &
-				BTRFS_EXTENDED_PROFILE_MASK;
-
-	write_seqlock(&fs_info->profiles_lock);
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		fs_info->avail_data_alloc_bits |= extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		fs_info->avail_metadata_alloc_bits |= extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		fs_info->avail_system_alloc_bits |= extra_flags;
-	write_sequnlock(&fs_info->profiles_lock);
-}
-
-/*
- * returns target flags in extended format or 0 if restripe for this
- * chunk_type is not in progress
- *
- * should be called with balance_lock held
- */
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
-	u64 target = 0;
-
-	if (!bctl)
-		return 0;
-
-	if (flags & BTRFS_BLOCK_GROUP_DATA &&
-	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
-	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
-		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
-	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
-		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
-	}
-
-	return target;
-}
-
-/*
- * @flags: available profiles in extended format (see ctree.h)
- *
- * Returns reduced profile in chunk format.  If profile changing is in
- * progress (either running or paused) picks the target profile (if it's
- * already available), otherwise falls back to plain reducing.
- */
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 num_devices = fs_info->fs_devices->rw_devices;
-	u64 target;
-	u64 raid_type;
-	u64 allowed = 0;
-
-	/*
-	 * see if restripe for this chunk_type is in progress, if so
-	 * try to reduce to the target profile
-	 */
-	spin_lock(&fs_info->balance_lock);
-	target = get_restripe_target(fs_info, flags);
-	if (target) {
-		/* pick target profile only if it's already available */
-		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
-			spin_unlock(&fs_info->balance_lock);
-			return extended_to_chunk(target);
-		}
-	}
-	spin_unlock(&fs_info->balance_lock);
-
-	/* First, mask out the RAID levels which aren't possible */
-	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
-		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
-			allowed |= btrfs_raid_array[raid_type].bg_flag;
-	}
-	allowed &= flags;
-
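-	/*
-	 * Then reduce to a single profile, preferring raid6 > raid5 > raid10
-	 * > raid1 > raid0 when several remain possible (single/dup fall
-	 * through unchanged).
-	 */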
-	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
-		allowed = BTRFS_BLOCK_GROUP_RAID6;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
-		allowed = BTRFS_BLOCK_GROUP_RAID5;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
-		allowed = BTRFS_BLOCK_GROUP_RAID10;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
-		allowed = BTRFS_BLOCK_GROUP_RAID1;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
-		allowed = BTRFS_BLOCK_GROUP_RAID0;
-
-	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
-	return extended_to_chunk(flags | allowed);
-}
-
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
-	unsigned seq;
-	u64 flags;
-
-	do {
-		flags = orig_flags;
-		seq = read_seqbegin(&fs_info->profiles_lock);
-
-		if (flags & BTRFS_BLOCK_GROUP_DATA)
-			flags |= fs_info->avail_data_alloc_bits;
-		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			flags |= fs_info->avail_system_alloc_bits;
-		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-			flags |= fs_info->avail_metadata_alloc_bits;
-	} while (read_seqretry(&fs_info->profiles_lock, seq));
-
-	return btrfs_reduce_alloc_profile(fs_info, flags);
-}
-
 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4077,2089 +2540,10 @@
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	ret = get_alloc_profile(fs_info, flags);
+	ret = btrfs_get_alloc_profile(fs_info, flags);
 	return ret;
 }
 
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
-}
-
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-}
-
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-}
-
-static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
-				 bool may_use_included)
-{
-	ASSERT(s_info);
-	return s_info->bytes_used + s_info->bytes_reserved +
-		s_info->bytes_pinned + s_info->bytes_readonly +
-		(may_use_included ? s_info->bytes_may_use : 0);
-}
-
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
-{
-	struct btrfs_root *root = inode->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
-	u64 used;
-	int ret = 0;
-	int need_commit = 2;
-	int have_pinned_space;
-
-	/* make sure bytes are sectorsize aligned */
-	bytes = ALIGN(bytes, fs_info->sectorsize);
-
-	if (btrfs_is_free_space_inode(inode)) {
-		need_commit = 0;
-		ASSERT(current->journal_info);
-	}
-
-again:
-	/* make sure we have enough space to handle the data first */
-	spin_lock(&data_sinfo->lock);
-	used = btrfs_space_info_used(data_sinfo, true);
-
-	if (used + bytes > data_sinfo->total_bytes) {
-		struct btrfs_trans_handle *trans;
-
-		/*
-		 * if we don't have enough free bytes in this space then we need
-		 * to alloc a new chunk.
-		 */
-		if (!data_sinfo->full) {
-			u64 alloc_target;
-
-			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
-			spin_unlock(&data_sinfo->lock);
-
-			alloc_target = btrfs_data_alloc_profile(fs_info);
-			/*
-			 * It is ugly that we don't call a nolock join
-			 * transaction for the free space inode case here.
-			 * But it is safe because we only do the data space
-			 * reservation for the free space cache in the
-			 * transaction context; the common join transaction
-			 * just increases the counter of the current
-			 * transaction handle and doesn't try to acquire the
-			 * trans_lock of the fs.
-			 */
-			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
-
-			ret = do_chunk_alloc(trans, alloc_target,
-					     CHUNK_ALLOC_NO_FORCE);
-			btrfs_end_transaction(trans);
-			if (ret < 0) {
-				if (ret != -ENOSPC)
-					return ret;
-				else {
-					have_pinned_space = 1;
-					goto commit_trans;
-				}
-			}
-
-			goto again;
-		}
-
-		/*
-		 * If we don't have enough pinned space to deal with this
-		 * allocation, and no removed chunk in current transaction,
-		 * don't bother committing the transaction.
-		 */
-		have_pinned_space = __percpu_counter_compare(
-			&data_sinfo->total_bytes_pinned,
-			used + bytes - data_sinfo->total_bytes,
-			BTRFS_TOTAL_BYTES_PINNED_BATCH);
-		spin_unlock(&data_sinfo->lock);
-
-		/* commit the current transaction and try again */
-commit_trans:
-		if (need_commit) {
-			need_commit--;
-
-			if (need_commit > 0) {
-				btrfs_start_delalloc_roots(fs_info, -1);
-				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
-							 (u64)-1);
-			}
-
-			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
-			if (have_pinned_space >= 0 ||
-			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
-				     &trans->transaction->flags) ||
-			    need_commit > 0) {
-				ret = btrfs_commit_transaction(trans);
-				if (ret)
-					return ret;
-				/*
-				 * The cleaner kthread might still be doing iput
-				 * operations. Wait for it to finish so that
-				 * more space is released.
-				 */
-				mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
-				mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
-				goto again;
-			} else {
-				btrfs_end_transaction(trans);
-			}
-		}
-
-		trace_btrfs_space_reservation(fs_info,
-					      "space_info:enospc",
-					      data_sinfo->flags, bytes, 1);
-		return -ENOSPC;
-	}
-	data_sinfo->bytes_may_use += bytes;
-	trace_btrfs_space_reservation(fs_info, "space_info",
-				      data_sinfo->flags, bytes, 1);
-	spin_unlock(&data_sinfo->lock);
-
-	return 0;
-}
-
-int btrfs_check_data_free_space(struct inode *inode,
-			struct extent_changeset **reserved, u64 start, u64 len)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int ret;
-
-	/* align the range */
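-	/*
-	 * e.g. with a 4K sectorsize, start=3000 len=2000 expands to the
-	 * aligned range start=0 len=8192 (two full sectors)
-	 */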
-	len = round_up(start + len, fs_info->sectorsize) -
-	      round_down(start, fs_info->sectorsize);
-	start = round_down(start, fs_info->sectorsize);
-
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
-	if (ret < 0)
-		return ret;
-
-	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
-	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
-	if (ret < 0)
-		btrfs_free_reserved_data_space_noquota(inode, start, len);
-	else
-		ret = 0;
-	return ret;
-}
-
-/*
- * Called if we need to clear a data reservation for this inode
- * Normally in an error case.
- *
- * This one will *NOT* use the accurate qgroup reserved space API, just for
- * cases where we can't sleep and are sure it won't affect qgroup reserved
- * space.
- * Like clear_bit_hook().
- */
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
-					    u64 len)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_space_info *data_sinfo;
-
-	/* Make sure the range is aligned to sectorsize */
-	len = round_up(start + len, fs_info->sectorsize) -
-	      round_down(start, fs_info->sectorsize);
-	start = round_down(start, fs_info->sectorsize);
-
-	data_sinfo = fs_info->data_sinfo;
-	spin_lock(&data_sinfo->lock);
-	if (WARN_ON(data_sinfo->bytes_may_use < len))
-		data_sinfo->bytes_may_use = 0;
-	else
-		data_sinfo->bytes_may_use -= len;
-	trace_btrfs_space_reservation(fs_info, "space_info",
-				      data_sinfo->flags, len, 0);
-	spin_unlock(&data_sinfo->lock);
-}
-
-/*
- * Called if we need to clear a data reservation for this inode
- * Normally in an error case.
- *
- * This one will handle the per-inode data rsv map for accurate reserved
- * space framework.
- */
-void btrfs_free_reserved_data_space(struct inode *inode,
-			struct extent_changeset *reserved, u64 start, u64 len)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-
-	/* Make sure the range is aligned to sectorsize */
-	len = round_up(start + len, root->fs_info->sectorsize) -
-	      round_down(start, root->fs_info->sectorsize);
-	start = round_down(start, root->fs_info->sectorsize);
-
-	btrfs_free_reserved_data_space_noquota(inode, start, len);
-	btrfs_qgroup_free_data(inode, reserved, start, len);
-}
-
-static void force_metadata_allocation(struct btrfs_fs_info *info)
-{
-	struct list_head *head = &info->space_info;
-	struct btrfs_space_info *found;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-			found->force_alloc = CHUNK_ALLOC_FORCE;
-	}
-	rcu_read_unlock();
-}
-
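-/*
- * When the global reserve is counted as used space its size is doubled, so
- * some headroom is left for it to grow.
- */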
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
-	return (global->size << 1);
-}
-
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
-			      struct btrfs_space_info *sinfo, int force)
-{
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	u64 bytes_used = btrfs_space_info_used(sinfo, false);
-	u64 thresh;
-
-	if (force == CHUNK_ALLOC_FORCE)
-		return 1;
-
-	/*
-	 * We need to take into account the global rsv because for all intents
-	 * and purposes it's used space.  Don't worry about locking the
-	 * global_rsv, it doesn't change except when the transaction commits.
-	 */
-	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-		bytes_used += calc_global_rsv_need_space(global_rsv);
-
-	/*
-	 * in limited mode, we want to have some free space up to
-	 * about 1% of the FS size.
-	 */
-	if (force == CHUNK_ALLOC_LIMITED) {
-		thresh = btrfs_super_total_bytes(fs_info->super_copy);
-		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
-
-		if (sinfo->total_bytes - bytes_used < thresh)
-			return 1;
-	}
-
-	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
-		return 0;
-	return 1;
-}
-
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
-{
-	u64 num_dev;
-
-	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
-		    BTRFS_BLOCK_GROUP_RAID0 |
-		    BTRFS_BLOCK_GROUP_RAID5 |
-		    BTRFS_BLOCK_GROUP_RAID6))
-		num_dev = fs_info->fs_devices->rw_devices;
-	else if (type & BTRFS_BLOCK_GROUP_RAID1)
-		num_dev = 2;
-	else
-		num_dev = 1;	/* DUP or single */
-
-	return num_dev;
-}
-
-/*
- * Reserve space in the system space_info as needed for allocating or
- * removing a chunk of the given @type.
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_space_info *info;
-	u64 left;
-	u64 thresh;
-	int ret = 0;
-	u64 num_devs;
-
-	/*
-	 * Needed because we can end up allocating a system chunk and we need
-	 * an atomic, race-free space reservation in the chunk block reserve.
-	 */
-	lockdep_assert_held(&fs_info->chunk_mutex);
-
-	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-	spin_lock(&info->lock);
-	left = info->total_bytes - btrfs_space_info_used(info, true);
-	spin_unlock(&info->lock);
-
-	num_devs = get_profile_num_devs(fs_info, type);
-
-	/* num_devs device items to update and 1 chunk item to add or remove */
-	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
-		btrfs_calc_trans_metadata_size(fs_info, 1);
-
-	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
-		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
-			   left, thresh, type);
-		dump_space_info(fs_info, info, 0, 0);
-	}
-
-	if (left < thresh) {
-		u64 flags = btrfs_system_alloc_profile(fs_info);
-
-		/*
-		 * Ignore failure to create system chunk. We might end up not
-		 * needing it, as we might not need to COW all nodes/leafs from
-		 * the paths we visit in the chunk tree (they were already COWed
-		 * or created in the current transaction for example).
-		 */
-		ret = btrfs_alloc_chunk(trans, flags);
-	}
-
-	if (!ret) {
-		ret = btrfs_block_rsv_add(fs_info->chunk_root,
-					  &fs_info->chunk_block_rsv,
-					  thresh, BTRFS_RESERVE_NO_FLUSH);
-		if (!ret)
-			trans->chunk_bytes_reserved += thresh;
-	}
-}
-
-/*
- * If force is CHUNK_ALLOC_FORCE:
- *    - return 1 if it successfully allocates a chunk,
- *    - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
- *    - return 0 if it doesn't need to allocate a new chunk,
- *    - return 1 if it successfully allocates a chunk,
- *    - return errors including -ENOSPC otherwise.
- */
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
-			  int force)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_space_info *space_info;
-	bool wait_for_alloc = false;
-	bool should_alloc = false;
-	int ret = 0;
-
-	/* Don't re-enter if we're already allocating a chunk */
-	if (trans->allocating_chunk)
-		return -ENOSPC;
-
-	space_info = __find_space_info(fs_info, flags);
-	ASSERT(space_info);
-
-	do {
-		spin_lock(&space_info->lock);
-		if (force < space_info->force_alloc)
-			force = space_info->force_alloc;
-		should_alloc = should_alloc_chunk(fs_info, space_info, force);
-		if (space_info->full) {
-			/* No more free physical space */
-			if (should_alloc)
-				ret = -ENOSPC;
-			else
-				ret = 0;
-			spin_unlock(&space_info->lock);
-			return ret;
-		} else if (!should_alloc) {
-			spin_unlock(&space_info->lock);
-			return 0;
-		} else if (space_info->chunk_alloc) {
-			/*
-			 * Someone is already allocating, so we need to block
-			 * until this someone is finished and then loop to
-			 * recheck if we should continue with our allocation
-			 * attempt.
-			 */
-			wait_for_alloc = true;
-			spin_unlock(&space_info->lock);
-			mutex_lock(&fs_info->chunk_mutex);
-			mutex_unlock(&fs_info->chunk_mutex);
-		} else {
-			/* Proceed with allocation */
-			space_info->chunk_alloc = 1;
-			wait_for_alloc = false;
-			spin_unlock(&space_info->lock);
-		}
-
-		cond_resched();
-	} while (wait_for_alloc);
-
-	mutex_lock(&fs_info->chunk_mutex);
-	trans->allocating_chunk = true;
-
-	/*
-	 * If we have mixed data/metadata chunks we want to make sure we keep
-	 * allocating mixed chunks instead of individual chunks.
-	 */
-	if (btrfs_mixed_space_info(space_info))
-		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
-
-	/*
-	 * if we're doing a data chunk, go ahead and make sure that
-	 * we keep a reasonable number of metadata chunks allocated in the
-	 * FS as well.
-	 */
-	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
-		fs_info->data_chunk_allocations++;
-		if (!(fs_info->data_chunk_allocations %
-		      fs_info->metadata_ratio))
-			force_metadata_allocation(fs_info);
-	}
-
-	/*
-	 * Check if we have enough space in SYSTEM chunk because we may need
-	 * to update devices.
-	 */
-	check_system_chunk(trans, flags);
-
-	ret = btrfs_alloc_chunk(trans, flags);
-	trans->allocating_chunk = false;
-
-	spin_lock(&space_info->lock);
-	if (ret < 0) {
-		if (ret == -ENOSPC)
-			space_info->full = 1;
-		else
-			goto out;
-	} else {
-		ret = 1;
-		space_info->max_extent_size = 0;
-	}
-
-	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
-out:
-	space_info->chunk_alloc = 0;
-	spin_unlock(&space_info->lock);
-	mutex_unlock(&fs_info->chunk_mutex);
-	/*
-	 * When we allocate a new chunk we reserve space in the chunk block
-	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
-	 * add new nodes/leafs to it if we end up needing to do it when
-	 * inserting the chunk item and updating device items as part of the
-	 * second phase of chunk allocation, performed by
-	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
-	 * large number of new block groups to create in our transaction
-	 * handle's new_bgs list to avoid exhausting the chunk block reserve
-	 * in extreme cases - like having a single transaction create many new
-	 * block groups when starting to write out the free space caches of all
-	 * the block groups that were made dirty during the lifetime of the
-	 * transaction.
-	 */
-	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
-		btrfs_create_pending_block_groups(trans);
-
-	return ret;
-}
-
-static int can_overcommit(struct btrfs_fs_info *fs_info,
-			  struct btrfs_space_info *space_info, u64 bytes,
-			  enum btrfs_reserve_flush_enum flush,
-			  bool system_chunk)
-{
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	u64 profile;
-	u64 space_size;
-	u64 avail;
-	u64 used;
-	int factor;
-
-	/* Don't overcommit when in mixed mode. */
-	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
-		return 0;
-
-	if (system_chunk)
-		profile = btrfs_system_alloc_profile(fs_info);
-	else
-		profile = btrfs_metadata_alloc_profile(fs_info);
-
-	used = btrfs_space_info_used(space_info, false);
-
-	/*
-	 * We only want to allow overcommitting if we have lots of actual space
-	 * free, but if we don't have enough space to handle the global reserve
-	 * space then we could end up having a real enospc problem when trying
-	 * to allocate a chunk or some other such important allocation.
-	 */
-	spin_lock(&global_rsv->lock);
-	space_size = calc_global_rsv_need_space(global_rsv);
-	spin_unlock(&global_rsv->lock);
-	if (used + space_size >= space_info->total_bytes)
-		return 0;
-
-	used += space_info->bytes_may_use;
-
-	avail = atomic64_read(&fs_info->free_chunk_space);
-
-	/*
-	 * If we have dup, raid1 or raid10 then only half of the free
-	 * space is actually usable.  For raid56, the space info used
-	 * doesn't include the parity drive, so we don't have to
-	 * change the math
-	 */
-	factor = btrfs_bg_type_to_factor(profile);
-	avail = div_u64(avail, factor);
-
-	/*
-	 * If we aren't allowed to flush everything, let us overcommit up to
-	 * half of the space. If we can flush, don't let us overcommit too
-	 * much: allow up to 1/8 of the space.
-	 */
-	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		avail >>= 3;
-	else
-		avail >>= 1;
-
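-	/*
-	 * e.g. a raid1 profile (factor 2) with 16GiB of unallocated device
-	 * space gives avail = 8GiB, of which a FLUSH_ALL reservation may
-	 * overcommit up to 1GiB.
-	 */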
-	if (used + bytes < space_info->total_bytes + avail)
-		return 1;
-	return 0;
-}
-
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
-					 unsigned long nr_pages, int nr_items)
-{
-	struct super_block *sb = fs_info->sb;
-
-	if (down_read_trylock(&sb->s_umount)) {
-		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
-		up_read(&sb->s_umount);
-	} else {
-		/*
-		 * We needn't worry about the filesystem going from r/w to r/o
-		 * even though we don't acquire the ->s_umount mutex, because
-		 * the filesystem should guarantee that the delalloc inode
-		 * list is empty once the filesystem is read-only (all dirty
-		 * pages have been written to disk).
-		 */
-		btrfs_start_delalloc_roots(fs_info, nr_items);
-		if (!current->journal_info)
-			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
-	}
-}
-
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
-					u64 to_reclaim)
-{
-	u64 bytes;
-	u64 nr;
-
-	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-	nr = div64_u64(to_reclaim, bytes);
-	if (!nr)
-		nr = 1;
-	return nr;
-}
-
-#define EXTENT_SIZE_PER_ITEM	SZ_256K
-
-/*
- * shrink metadata reservation for delalloc
- */
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
-			    u64 orig, bool wait_ordered)
-{
-	struct btrfs_space_info *space_info;
-	struct btrfs_trans_handle *trans;
-	u64 delalloc_bytes;
-	u64 max_reclaim;
-	u64 items;
-	long time_left;
-	unsigned long nr_pages;
-	int loops;
-
-	/* Calc the number of items we need to flush for this space reservation */
-	items = calc_reclaim_items_nr(fs_info, to_reclaim);
-	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
-
-	trans = (struct btrfs_trans_handle *)current->journal_info;
-	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
-	delalloc_bytes = percpu_counter_sum_positive(
-						&fs_info->delalloc_bytes);
-	if (delalloc_bytes == 0) {
-		if (trans)
-			return;
-		if (wait_ordered)
-			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
-		return;
-	}
-
-	loops = 0;
-	while (delalloc_bytes && loops < 3) {
-		max_reclaim = min(delalloc_bytes, to_reclaim);
-		nr_pages = max_reclaim >> PAGE_SHIFT;
-		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
-		/*
-		 * We need to wait for the async pages to actually start before
-		 * we do anything.
-		 */
-		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
-		if (!max_reclaim)
-			goto skip_async;
-
-		if (max_reclaim <= nr_pages)
-			max_reclaim = 0;
-		else
-			max_reclaim -= nr_pages;
-
-		wait_event(fs_info->async_submit_wait,
-			   atomic_read(&fs_info->async_delalloc_pages) <=
-			   (int)max_reclaim);
-skip_async:
-		spin_lock(&space_info->lock);
-		if (list_empty(&space_info->tickets) &&
-		    list_empty(&space_info->priority_tickets)) {
-			spin_unlock(&space_info->lock);
-			break;
-		}
-		spin_unlock(&space_info->lock);
-
-		loops++;
-		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
-		} else {
-			time_left = schedule_timeout_killable(1);
-			if (time_left)
-				break;
-		}
-		delalloc_bytes = percpu_counter_sum_positive(
-						&fs_info->delalloc_bytes);
-	}
-}
-
-struct reserve_ticket {
-	u64 bytes;
-	int error;
-	struct list_head list;
-	wait_queue_head_t wait;
-};
-
-/**
- * may_commit_transaction - possibly commit the transaction if it's OK to
- * @fs_info - the fs_info for the filesystem we're reserving in
- * @space_info - the space_info we're trying to make a reservation for
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does.  Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
-				  struct btrfs_space_info *space_info)
-{
-	struct reserve_ticket *ticket = NULL;
-	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
-	struct btrfs_trans_handle *trans;
-	u64 bytes;
-
-	trans = (struct btrfs_trans_handle *)current->journal_info;
-	if (trans)
-		return -EAGAIN;
-
-	spin_lock(&space_info->lock);
-	if (!list_empty(&space_info->priority_tickets))
-		ticket = list_first_entry(&space_info->priority_tickets,
-					  struct reserve_ticket, list);
-	else if (!list_empty(&space_info->tickets))
-		ticket = list_first_entry(&space_info->tickets,
-					  struct reserve_ticket, list);
-	bytes = (ticket) ? ticket->bytes : 0;
-	spin_unlock(&space_info->lock);
-
-	if (!bytes)
-		return 0;
-
-	/* See if there is enough pinned space to make this reservation */
-	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
-		goto commit;
-
-	/*
-	 * See if there is some space in the delayed insertion reservation for
-	 * this reservation.
-	 */
-	if (space_info != delayed_rsv->space_info)
-		return -ENOSPC;
-
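-	/*
-	 * Bytes backed by the delayed rsv will be freed at commit time, so
-	 * only the shortfall beyond its size still has to be covered by
-	 * pinned bytes.
-	 */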
-	spin_lock(&delayed_rsv->lock);
-	if (delayed_rsv->size > bytes)
-		bytes = 0;
-	else
-		bytes -= delayed_rsv->size;
-	spin_unlock(&delayed_rsv->lock);
-
-	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
-		return -ENOSPC;
-	}
-
-commit:
-	trans = btrfs_join_transaction(fs_info->extent_root);
-	if (IS_ERR(trans))
-		return -ENOSPC;
-
-	return btrfs_commit_transaction(trans);
-}
-
-/*
- * Try to flush some data based on policy set by @state. This is only advisory
- * and may fail for various reasons. The caller is supposed to examine the
- * state of @space_info to detect the outcome.
- */
-static void flush_space(struct btrfs_fs_info *fs_info,
-		       struct btrfs_space_info *space_info, u64 num_bytes,
-		       int state)
-{
-	struct btrfs_root *root = fs_info->extent_root;
-	struct btrfs_trans_handle *trans;
-	int nr;
-	int ret = 0;
-
-	switch (state) {
-	case FLUSH_DELAYED_ITEMS_NR:
-	case FLUSH_DELAYED_ITEMS:
-		if (state == FLUSH_DELAYED_ITEMS_NR)
-			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
-		else
-			nr = -1;
-
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			break;
-		}
-		ret = btrfs_run_delayed_items_nr(trans, nr);
-		btrfs_end_transaction(trans);
-		break;
-	case FLUSH_DELALLOC:
-	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
-				state == FLUSH_DELALLOC_WAIT);
-		break;
-	case ALLOC_CHUNK:
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			break;
-		}
-		ret = do_chunk_alloc(trans,
-				     btrfs_metadata_alloc_profile(fs_info),
-				     CHUNK_ALLOC_NO_FORCE);
-		btrfs_end_transaction(trans);
-		if (ret > 0 || ret == -ENOSPC)
-			ret = 0;
-		break;
-	case COMMIT_TRANS:
-		ret = may_commit_transaction(fs_info, space_info);
-		break;
-	default:
-		ret = -ENOSPC;
-		break;
-	}
-
-	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
-				ret);
-}
-
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
-				 struct btrfs_space_info *space_info,
-				 bool system_chunk)
-{
-	struct reserve_ticket *ticket;
-	u64 used;
-	u64 expected;
-	u64 to_reclaim = 0;
-
-	list_for_each_entry(ticket, &space_info->tickets, list)
-		to_reclaim += ticket->bytes;
-	list_for_each_entry(ticket, &space_info->priority_tickets, list)
-		to_reclaim += ticket->bytes;
-	if (to_reclaim)
-		return to_reclaim;
-
-	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-	if (can_overcommit(fs_info, space_info, to_reclaim,
-			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
-		return 0;
-
-	used = btrfs_space_info_used(space_info, true);
-
-	if (can_overcommit(fs_info, space_info, SZ_1M,
-			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
-		expected = div_factor_fine(space_info->total_bytes, 95);
-	else
-		expected = div_factor_fine(space_info->total_bytes, 90);
-
-	if (used > expected)
-		to_reclaim = used - expected;
-	else
-		to_reclaim = 0;
-	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
-				     space_info->bytes_reserved);
-	return to_reclaim;
-}
-
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
-					struct btrfs_space_info *space_info,
-					u64 used, bool system_chunk)
-{
-	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
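-	/* thresh is 98% of this space_info's total bytes */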
-
-	/* If we're just plain full then async reclaim just slows us down. */
-	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
-		return 0;
-
-	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-					      system_chunk))
-		return 0;
-
-	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
-		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
-}
-
-static void wake_all_tickets(struct list_head *head)
-{
-	struct reserve_ticket *ticket;
-
-	while (!list_empty(head)) {
-		ticket = list_first_entry(head, struct reserve_ticket, list);
-		list_del_init(&ticket->list);
-		ticket->error = -ENOSPC;
-		wake_up(&ticket->wait);
-	}
-}
-
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to.  We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
-{
-	struct btrfs_fs_info *fs_info;
-	struct btrfs_space_info *space_info;
-	u64 to_reclaim;
-	int flush_state;
-	int commit_cycles = 0;
-	u64 last_tickets_id;
-
-	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
-	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
-	spin_lock(&space_info->lock);
-	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-						      false);
-	if (!to_reclaim) {
-		space_info->flush = 0;
-		spin_unlock(&space_info->lock);
-		return;
-	}
-	last_tickets_id = space_info->tickets_id;
-	spin_unlock(&space_info->lock);
-
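-	/*
-	 * Escalate through the flush states from cheapest to most expensive,
-	 * restarting from the first state whenever a pass satisfies tickets
-	 * (tickets_id changes).  After three full escalations past
-	 * COMMIT_TRANS without enough progress, fail all remaining tickets
-	 * with -ENOSPC.
-	 */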
-	flush_state = FLUSH_DELAYED_ITEMS_NR;
-	do {
-		flush_space(fs_info, space_info, to_reclaim, flush_state);
-		spin_lock(&space_info->lock);
-		if (list_empty(&space_info->tickets)) {
-			space_info->flush = 0;
-			spin_unlock(&space_info->lock);
-			return;
-		}
-		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
-							      space_info,
-							      false);
-		if (last_tickets_id == space_info->tickets_id) {
-			flush_state++;
-		} else {
-			last_tickets_id = space_info->tickets_id;
-			flush_state = FLUSH_DELAYED_ITEMS_NR;
-			if (commit_cycles)
-				commit_cycles--;
-		}
-
-		if (flush_state > COMMIT_TRANS) {
-			commit_cycles++;
-			if (commit_cycles > 2) {
-				wake_all_tickets(&space_info->tickets);
-				space_info->flush = 0;
-			} else {
-				flush_state = FLUSH_DELAYED_ITEMS_NR;
-			}
-		}
-		spin_unlock(&space_info->lock);
-	} while (flush_state <= COMMIT_TRANS);
-}
-
-void btrfs_init_async_reclaim_work(struct work_struct *work)
-{
-	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
-}
-
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
-					    struct btrfs_space_info *space_info,
-					    struct reserve_ticket *ticket)
-{
-	u64 to_reclaim;
-	int flush_state = FLUSH_DELAYED_ITEMS_NR;
-
-	spin_lock(&space_info->lock);
-	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-						      false);
-	if (!to_reclaim) {
-		spin_unlock(&space_info->lock);
-		return;
-	}
-	spin_unlock(&space_info->lock);
-
-	do {
-		flush_space(fs_info, space_info, to_reclaim, flush_state);
-		flush_state++;
-		spin_lock(&space_info->lock);
-		if (ticket->bytes == 0) {
-			spin_unlock(&space_info->lock);
-			return;
-		}
-		spin_unlock(&space_info->lock);
-
-		/*
-		 * Priority flushers can't wait on delalloc without
-		 * deadlocking.
-		 */
-		if (flush_state == FLUSH_DELALLOC ||
-		    flush_state == FLUSH_DELALLOC_WAIT)
-			flush_state = ALLOC_CHUNK;
-	} while (flush_state < COMMIT_TRANS);
-}
-
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
-			       struct btrfs_space_info *space_info,
-			       struct reserve_ticket *ticket, u64 orig_bytes)
-{
-	DEFINE_WAIT(wait);
-	int ret = 0;
-
-	spin_lock(&space_info->lock);
-	while (ticket->bytes > 0 && ticket->error == 0) {
-		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
-		if (ret) {
-			ret = -EINTR;
-			break;
-		}
-		spin_unlock(&space_info->lock);
-
-		schedule();
-
-		finish_wait(&ticket->wait, &wait);
-		spin_lock(&space_info->lock);
-	}
-	if (!ret)
-		ret = ticket->error;
-	if (!list_empty(&ticket->list))
-		list_del_init(&ticket->list);
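-	/*
-	 * The flusher may have satisfied part of the ticket, in which case
-	 * those bytes were already added to bytes_may_use; since the whole
-	 * reservation failed, give that partial grant back.
-	 */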
-	if (ticket->bytes && ticket->bytes < orig_bytes) {
-		u64 num_bytes = orig_bytes - ticket->bytes;
-		space_info->bytes_may_use -= num_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-	}
-	spin_unlock(&space_info->lock);
-
-	return ret;
-}
-
-/**
- * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
- * @fs_info - the fs_info of the filesystem we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the given space_info.  If
- * there is not enough space it will make an attempt to flush out space to
- * make room.  It will do this by flushing delalloc if possible or committing
- * the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
-				    struct btrfs_space_info *space_info,
-				    u64 orig_bytes,
-				    enum btrfs_reserve_flush_enum flush,
-				    bool system_chunk)
-{
-	struct reserve_ticket ticket;
-	u64 used;
-	int ret = 0;
-
-	ASSERT(orig_bytes);
-	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
-
-	spin_lock(&space_info->lock);
-	ret = -ENOSPC;
-	used = btrfs_space_info_used(space_info, true);
-
-	/*
-	 * If we have enough space then hooray, make our reservation and carry
-	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
-	 * If not things get more complicated.
-	 */
-	if (used + orig_bytes <= space_info->total_bytes) {
-		space_info->bytes_may_use += orig_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, orig_bytes, 1);
-		ret = 0;
-	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
-				  system_chunk)) {
-		space_info->bytes_may_use += orig_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, orig_bytes, 1);
-		ret = 0;
-	}
-
-	/*
-	 * If we couldn't make a reservation then setup our reservation ticket
-	 * and kick the async worker if it's not already running.
-	 *
-	 * If we are a priority flusher then we just need to add our ticket to
-	 * the list and we will do our own flushing further down.
-	 */
-	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
-		ticket.bytes = orig_bytes;
-		ticket.error = 0;
-		init_waitqueue_head(&ticket.wait);
-		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
-			list_add_tail(&ticket.list, &space_info->tickets);
-			if (!space_info->flush) {
-				space_info->flush = 1;
-				trace_btrfs_trigger_flush(fs_info,
-							  space_info->flags,
-							  orig_bytes, flush,
-							  "enospc");
-				queue_work(system_unbound_wq,
-					   &fs_info->async_reclaim_work);
-			}
-		} else {
-			list_add_tail(&ticket.list,
-				      &space_info->priority_tickets);
-		}
-	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
-		used += orig_bytes;
-		/*
-		 * We will do the space reservation dance during log replay,
-		 * which means we won't have fs_info->fs_root set, so don't do
-		 * the async reclaim as we will panic.
-		 */
-		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
-		    need_do_async_reclaim(fs_info, space_info,
-					  used, system_chunk) &&
-		    !work_busy(&fs_info->async_reclaim_work)) {
-			trace_btrfs_trigger_flush(fs_info, space_info->flags,
-						  orig_bytes, flush, "preempt");
-			queue_work(system_unbound_wq,
-				   &fs_info->async_reclaim_work);
-		}
-	}
-	spin_unlock(&space_info->lock);
-	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
-		return ret;
-
-	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		return wait_reserve_ticket(fs_info, space_info, &ticket,
-					   orig_bytes);
-
-	ret = 0;
-	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
-	spin_lock(&space_info->lock);
-	if (ticket.bytes) {
-		if (ticket.bytes < orig_bytes) {
-			u64 num_bytes = orig_bytes - ticket.bytes;
-			space_info->bytes_may_use -= num_bytes;
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 0);
-
-		}
-		list_del_init(&ticket.list);
-		ret = -ENOSPC;
-	}
-	spin_unlock(&space_info->lock);
-	ASSERT(list_empty(&ticket.list));
-	return ret;
-}
-
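/*
 * A minimal userspace model of the decision __reserve_metadata_bytes() makes
 * under space_info->lock (a sketch with assumed, simplified types, not the
 * real btrfs structures): reserve directly if the bytes fit, otherwise try
 * to overcommit, otherwise the caller must queue a reserve ticket.
 */
#include <stdbool.h>
#include <stdint.h>

struct space_info_model {
	uint64_t total_bytes;
	uint64_t used;		/* models btrfs_space_info_used(..., true) */
	uint64_t bytes_may_use;
};

/* Assumption for the sketch: allow 50% of total as overcommit headroom. */
static bool can_overcommit_model(const struct space_info_model *s, uint64_t n)
{
	return s->used + n <= s->total_bytes + s->total_bytes / 2;
}

/* Return 0 on success, -1 when a ticket would have to be queued. */
static int reserve_model(struct space_info_model *s, uint64_t n)
{
	if (s->used + n <= s->total_bytes || can_overcommit_model(s, n)) {
		s->bytes_may_use += n;
		return 0;
	}
	return -1;
}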
-/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the space info associated
- * with the block_rsv.  If there is not enough space it will make an attempt to
- * flush out space to make room.  It will do this by flushing delalloc if
- * possible or committing the transaction.  If flush is 0 then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int reserve_metadata_bytes(struct btrfs_root *root,
-				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes,
-				  enum btrfs_reserve_flush_enum flush)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	int ret;
-	bool system_chunk = (root == fs_info->chunk_root);
-
-	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
-				       orig_bytes, flush, system_chunk);
-	if (ret == -ENOSPC &&
-	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
-		if (block_rsv != global_rsv &&
-		    !block_rsv_use_bytes(global_rsv, orig_bytes))
-			ret = 0;
-	}
-	if (ret == -ENOSPC) {
-		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
-					      block_rsv->space_info->flags,
-					      orig_bytes, 1);
-
-		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
-			dump_space_info(fs_info, block_rsv->space_info,
-					orig_bytes, 0);
-	}
-	return ret;
-}
-
-static struct btrfs_block_rsv *get_block_rsv(
-					const struct btrfs_trans_handle *trans,
-					const struct btrfs_root *root)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *block_rsv = NULL;
-
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-	    (root == fs_info->csum_root && trans->adding_csums) ||
-	    (root == fs_info->uuid_root))
-		block_rsv = trans->block_rsv;
-
-	if (!block_rsv)
-		block_rsv = root->block_rsv;
-
-	if (!block_rsv)
-		block_rsv = &fs_info->empty_block_rsv;
-
-	return block_rsv;
-}
-
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
-			       u64 num_bytes)
-{
-	int ret = -ENOSPC;
-	spin_lock(&block_rsv->lock);
-	if (block_rsv->reserved >= num_bytes) {
-		block_rsv->reserved -= num_bytes;
-		if (block_rsv->reserved < block_rsv->size)
-			block_rsv->full = 0;
-		ret = 0;
-	}
-	spin_unlock(&block_rsv->lock);
-	return ret;
-}
-
-static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes, int update_size)
-{
-	spin_lock(&block_rsv->lock);
-	block_rsv->reserved += num_bytes;
-	if (update_size)
-		block_rsv->size += num_bytes;
-	else if (block_rsv->reserved >= block_rsv->size)
-		block_rsv->full = 1;
-	spin_unlock(&block_rsv->lock);
-}
-
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_rsv *dest, u64 num_bytes,
-			     int min_factor)
-{
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	u64 min_bytes;
-
-	if (global_rsv->space_info != dest->space_info)
-		return -ENOSPC;
-
-	spin_lock(&global_rsv->lock);
-	min_bytes = div_factor(global_rsv->size, min_factor);
-	if (global_rsv->reserved < min_bytes + num_bytes) {
-		spin_unlock(&global_rsv->lock);
-		return -ENOSPC;
-	}
-	global_rsv->reserved -= num_bytes;
-	if (global_rsv->reserved < global_rsv->size)
-		global_rsv->full = 0;
-	spin_unlock(&global_rsv->lock);
-
-	block_rsv_add_bytes(dest, num_bytes, 1);
-	return 0;
-}
-
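/*
 * A worked example for btrfs_cond_migrate_bytes() above, assuming
 * div_factor(num, f) evaluates to num * f / 10 (f expressed in tenths);
 * a sketch only, not kernel code.
 */
#include <stdint.h>

static uint64_t div_factor_model(uint64_t num, int factor)
{
	return num * factor / 10;
}

/*
 * With global_rsv->size = 512 MiB and min_factor = 5, min_bytes comes out
 * to 256 MiB, so migrating num_bytes only succeeds while
 * global_rsv->reserved >= 256 MiB + num_bytes.
 */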
-/*
- * This is for space we already have accounted in space_info->bytes_may_use, so
- * basically when we're returning space from block_rsv's.
- */
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
-				     struct btrfs_space_info *space_info,
-				     u64 num_bytes)
-{
-	struct reserve_ticket *ticket;
-	struct list_head *head;
-	u64 used;
-	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
-	bool check_overcommit = false;
-
-	spin_lock(&space_info->lock);
-	head = &space_info->priority_tickets;
-
-	/*
-	 * If we are over our limit then we need to check and see if we can
-	 * overcommit, and if we can't then we just need to free up our space
-	 * and not satisfy any requests.
-	 */
-	used = btrfs_space_info_used(space_info, true);
-	if (used - num_bytes >= space_info->total_bytes)
-		check_overcommit = true;
-again:
-	while (!list_empty(head) && num_bytes) {
-		ticket = list_first_entry(head, struct reserve_ticket,
-					  list);
-		/*
-		 * We use 0 bytes because this space is already reserved, so
-		 * adding the ticket space would be a double count.
-		 */
-		if (check_overcommit &&
-		    !can_overcommit(fs_info, space_info, 0, flush, false))
-			break;
-		if (num_bytes >= ticket->bytes) {
-			list_del_init(&ticket->list);
-			num_bytes -= ticket->bytes;
-			ticket->bytes = 0;
-			space_info->tickets_id++;
-			wake_up(&ticket->wait);
-		} else {
-			ticket->bytes -= num_bytes;
-			num_bytes = 0;
-		}
-	}
-
-	if (num_bytes && head == &space_info->priority_tickets) {
-		head = &space_info->tickets;
-		flush = BTRFS_RESERVE_FLUSH_ALL;
-		goto again;
-	}
-	space_info->bytes_may_use -= num_bytes;
-	trace_btrfs_space_reservation(fs_info, "space_info",
-				      space_info->flags, num_bytes, 0);
-	spin_unlock(&space_info->lock);
-}
-
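/*
 * A simplified model of the ticket hand-off above (assumed types, not the
 * kernel list_head machinery): returned space satisfies priority tickets
 * first, then ordinary tickets; a partially filled ticket keeps its
 * remainder at the head of the queue.
 */
#include <stddef.h>
#include <stdint.h>

struct ticket_model {
	uint64_t bytes;			/* still owed to the waiter */
	struct ticket_model *next;
};

/* Returns the leftover, which goes back into bytes_may_use. */
static uint64_t hand_out_bytes(struct ticket_model *head, uint64_t avail)
{
	struct ticket_model *t;

	for (t = head; t && avail; t = t->next) {
		if (avail >= t->bytes) {
			avail -= t->bytes;
			t->bytes = 0;	/* fully satisfied: wake the waiter */
		} else {
			t->bytes -= avail;
			avail = 0;
		}
	}
	return avail;
}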
-/*
- * This is for newly allocated space that isn't accounted in
- * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
- * we use this helper.
- */
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
-				     struct btrfs_space_info *space_info,
-				     u64 num_bytes)
-{
-	struct reserve_ticket *ticket;
-	struct list_head *head = &space_info->priority_tickets;
-
-again:
-	while (!list_empty(head) && num_bytes) {
-		ticket = list_first_entry(head, struct reserve_ticket,
-					  list);
-		if (num_bytes >= ticket->bytes) {
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      ticket->bytes, 1);
-			list_del_init(&ticket->list);
-			num_bytes -= ticket->bytes;
-			space_info->bytes_may_use += ticket->bytes;
-			ticket->bytes = 0;
-			space_info->tickets_id++;
-			wake_up(&ticket->wait);
-		} else {
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 1);
-			space_info->bytes_may_use += num_bytes;
-			ticket->bytes -= num_bytes;
-			num_bytes = 0;
-		}
-	}
-
-	if (num_bytes && head == &space_info->priority_tickets) {
-		head = &space_info->tickets;
-		goto again;
-	}
-}
-
-static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
-				    struct btrfs_block_rsv *block_rsv,
-				    struct btrfs_block_rsv *dest, u64 num_bytes,
-				    u64 *qgroup_to_release_ret)
-{
-	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 qgroup_to_release = 0;
-	u64 ret;
-
-	spin_lock(&block_rsv->lock);
-	if (num_bytes == (u64)-1) {
-		num_bytes = block_rsv->size;
-		qgroup_to_release = block_rsv->qgroup_rsv_size;
-	}
-	block_rsv->size -= num_bytes;
-	if (block_rsv->reserved >= block_rsv->size) {
-		num_bytes = block_rsv->reserved - block_rsv->size;
-		block_rsv->reserved = block_rsv->size;
-		block_rsv->full = 1;
-	} else {
-		num_bytes = 0;
-	}
-	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
-		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
-				    block_rsv->qgroup_rsv_size;
-		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
-	} else {
-		qgroup_to_release = 0;
-	}
-	spin_unlock(&block_rsv->lock);
-
-	ret = num_bytes;
-	if (num_bytes > 0) {
-		if (dest) {
-			spin_lock(&dest->lock);
-			if (!dest->full) {
-				u64 bytes_to_add;
-
-				bytes_to_add = dest->size - dest->reserved;
-				bytes_to_add = min(num_bytes, bytes_to_add);
-				dest->reserved += bytes_to_add;
-				if (dest->reserved >= dest->size)
-					dest->full = 1;
-				num_bytes -= bytes_to_add;
-			}
-			spin_unlock(&dest->lock);
-		}
-		if (num_bytes)
-			space_info_add_old_bytes(fs_info, space_info,
-						 num_bytes);
-	}
-	if (qgroup_to_release_ret)
-		*qgroup_to_release_ret = qgroup_to_release;
-	return ret;
-}
-
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
-			    struct btrfs_block_rsv *dst, u64 num_bytes,
-			    int update_size)
-{
-	int ret;
-
-	ret = block_rsv_use_bytes(src, num_bytes);
-	if (ret)
-		return ret;
-
-	block_rsv_add_bytes(dst, num_bytes, update_size);
-	return 0;
-}
-
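/*
 * The migrate above is all-or-nothing: if the source cannot cover
 * num_bytes, the destination is left untouched. A compact model with
 * assumed, simplified types (locks omitted, not the kernel versions):
 */
#include <stdint.h>

struct rsv_model {
	uint64_t reserved;
	uint64_t size;
};

static int migrate_model(struct rsv_model *src, struct rsv_model *dst,
			 uint64_t n, int update_size)
{
	if (src->reserved < n)
		return -1;	/* models -ENOSPC: dst stays unchanged */
	src->reserved -= n;
	dst->reserved += n;
	if (update_size)
		dst->size += n;
	return 0;
}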
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
-{
-	memset(rsv, 0, sizeof(*rsv));
-	spin_lock_init(&rsv->lock);
-	rsv->type = type;
-}
-
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
-				   struct btrfs_block_rsv *rsv,
-				   unsigned short type)
-{
-	btrfs_init_block_rsv(rsv, type);
-	rsv->space_info = __find_space_info(fs_info,
-					    BTRFS_BLOCK_GROUP_METADATA);
-}
-
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
-					      unsigned short type)
-{
-	struct btrfs_block_rsv *block_rsv;
-
-	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
-	if (!block_rsv)
-		return NULL;
-
-	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
-	return block_rsv;
-}
-
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
-			  struct btrfs_block_rsv *rsv)
-{
-	if (!rsv)
-		return;
-	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
-	kfree(rsv);
-}
-
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
-			enum btrfs_reserve_flush_enum flush)
-{
-	int ret;
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 1);
-		return 0;
-	}
-
-	return ret;
-}
-
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
-{
-	u64 num_bytes = 0;
-	int ret = -ENOSPC;
-
-	if (!block_rsv)
-		return 0;
-
-	spin_lock(&block_rsv->lock);
-	num_bytes = div_factor(block_rsv->size, min_factor);
-	if (block_rsv->reserved >= num_bytes)
-		ret = 0;
-	spin_unlock(&block_rsv->lock);
-
-	return ret;
-}
-
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
-			   enum btrfs_reserve_flush_enum flush)
-{
-	u64 num_bytes = 0;
-	int ret = -ENOSPC;
-
-	if (!block_rsv)
-		return 0;
-
-	spin_lock(&block_rsv->lock);
-	num_bytes = min_reserved;
-	if (block_rsv->reserved >= num_bytes)
-		ret = 0;
-	else
-		num_bytes -= block_rsv->reserved;
-	spin_unlock(&block_rsv->lock);
-
-	if (!ret)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 0);
-		return 0;
-	}
-
-	return ret;
-}
-
-/**
- * btrfs_inode_rsv_refill - refill the inode block rsv.
- * @inode - the inode we are refilling.
- * @flush - the flushing restriction.
- *
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
- * block_rsv->size as the minimum size.  We'll either refill the missing amount
- * or return if we already have enough space.  This will also handle the reserve
- * tracepoint for the reserved amount.
- */
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
-				  enum btrfs_reserve_flush_enum flush)
-{
-	struct btrfs_root *root = inode->root;
-	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 num_bytes = 0;
-	u64 qgroup_num_bytes = 0;
-	int ret = -ENOSPC;
-
-	spin_lock(&block_rsv->lock);
-	if (block_rsv->reserved < block_rsv->size)
-		num_bytes = block_rsv->size - block_rsv->reserved;
-	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
-		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
-				   block_rsv->qgroup_rsv_reserved;
-	spin_unlock(&block_rsv->lock);
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
-	if (ret)
-		return ret;
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 0);
-		trace_btrfs_space_reservation(root->fs_info, "delalloc",
-					      btrfs_ino(inode), num_bytes, 1);
-
-		/* Don't forget to increase qgroup_rsv_reserved */
-		spin_lock(&block_rsv->lock);
-		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
-		spin_unlock(&block_rsv->lock);
-	} else
-		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
-	return ret;
-}
-
-/**
- * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
- *   Unlike normal operation, qgroup meta reservation needs to know if we are
- *   freeing qgroup reservation or just converting it into per-trans.  Normally
- *   @qgroup_free is true for error handling, and false for normal release.
- *
- * This is the same as btrfs_block_rsv_release, except that it handles the
- * tracepoint for the reservation.
- */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 released = 0;
-	u64 qgroup_to_release = 0;
-
-	/*
-	 * Since we statically set the block_rsv->size we just want to say we
-	 * are releasing 0 bytes, and then we'll just get any reservation over
-	 * the size freed.
-	 */
-	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
-					   &qgroup_to_release);
-	if (released > 0)
-		trace_btrfs_space_reservation(fs_info, "delalloc",
-					      btrfs_ino(inode), released, 0);
-	if (qgroup_free)
-		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
-	else
-		btrfs_qgroup_convert_reserved_meta(inode->root,
-						   qgroup_to_release);
-}
-
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_rsv *block_rsv,
-			     u64 num_bytes)
-{
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-
-	if (global_rsv == block_rsv ||
-	    block_rsv->space_info != global_rsv->space_info)
-		global_rsv = NULL;
-	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
-}
-
-static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
-	struct btrfs_space_info *sinfo = block_rsv->space_info;
-	u64 num_bytes;
-
-	/*
-	 * The global block rsv is based on the size of the extent tree, the
-	 * checksum tree and the root tree.  If the fs is empty we want to set
-	 * it to a minimal amount for safety.
-	 */
-	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
-		btrfs_root_used(&fs_info->csum_root->root_item) +
-		btrfs_root_used(&fs_info->tree_root->root_item);
-	num_bytes = max_t(u64, num_bytes, SZ_16M);
-
-	spin_lock(&sinfo->lock);
-	spin_lock(&block_rsv->lock);
-
-	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
-
-	if (block_rsv->reserved < block_rsv->size) {
-		num_bytes = btrfs_space_info_used(sinfo, true);
-		if (sinfo->total_bytes > num_bytes) {
-			num_bytes = sinfo->total_bytes - num_bytes;
-			num_bytes = min(num_bytes,
-					block_rsv->size - block_rsv->reserved);
-			block_rsv->reserved += num_bytes;
-			sinfo->bytes_may_use += num_bytes;
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      sinfo->flags, num_bytes,
-						      1);
-		}
-	} else if (block_rsv->reserved > block_rsv->size) {
-		num_bytes = block_rsv->reserved - block_rsv->size;
-		sinfo->bytes_may_use -= num_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-				      sinfo->flags, num_bytes, 0);
-		block_rsv->reserved = block_rsv->size;
-	}
-
-	if (block_rsv->reserved == block_rsv->size)
-		block_rsv->full = 1;
-	else
-		block_rsv->full = 0;
-
-	spin_unlock(&block_rsv->lock);
-	spin_unlock(&sinfo->lock);
-}
-
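/*
 * A worked example of the clamping in update_global_block_rsv(): with,
 * say, 3 GiB of combined extent, csum and root tree usage the global rsv
 * size is capped at 512 MiB (SZ_512M); on a nearly empty filesystem it is
 * floored at 16 MiB (SZ_16M).
 */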
-static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_space_info *space_info;
-
-	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-	fs_info->chunk_block_rsv.space_info = space_info;
-
-	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-	fs_info->global_block_rsv.space_info = space_info;
-	fs_info->trans_block_rsv.space_info = space_info;
-	fs_info->empty_block_rsv.space_info = space_info;
-	fs_info->delayed_block_rsv.space_info = space_info;
-
-	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
-	if (fs_info->quota_root)
-		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
-	update_global_block_rsv(fs_info);
-}
-
-static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
-	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
-				(u64)-1, NULL);
-	WARN_ON(fs_info->trans_block_rsv.size > 0);
-	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
-	WARN_ON(fs_info->chunk_block_rsv.size > 0);
-	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
-	WARN_ON(fs_info->delayed_block_rsv.size > 0);
-	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
-}
-
-/*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
- */
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-
-	if (!trans->chunk_bytes_reserved)
-		return;
-
-	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
-	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
-				trans->chunk_bytes_reserved, NULL);
-	trans->chunk_bytes_reserved = 0;
-}
-
-/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
- * root: the root of the parent directory
- * rsv: block reservation
- * items: the number of items that we need do reservation
- * use_global_rsv: allow fallback to the global block reservation
- *
- * This function is used to reserve the space for snapshot/subvolume
- * creation and deletion. Those operations differ from the common
- * file/directory operations: they change two fs/file trees and the root
- * tree, and the number of items that the qgroup reserves differs from
- * the free space reservation. So we cannot use the space reservation
- * mechanism in start_transaction().
- */
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
-				     struct btrfs_block_rsv *rsv, int items,
-				     bool use_global_rsv)
-{
-	u64 qgroup_num_bytes = 0;
-	u64 num_bytes;
-	int ret;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-		/* One for parent inode, two for dir entries */
-		qgroup_num_bytes = 3 * fs_info->nodesize;
-		ret = btrfs_qgroup_reserve_meta_prealloc(root,
-				qgroup_num_bytes, true);
-		if (ret)
-			return ret;
-	}
-
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
-	rsv->space_info = __find_space_info(fs_info,
-					    BTRFS_BLOCK_GROUP_METADATA);
-	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
-				  BTRFS_RESERVE_FLUSH_ALL);
-
-	if (ret == -ENOSPC && use_global_rsv)
-		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
-
-	if (ret && qgroup_num_bytes)
-		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
-
-	return ret;
-}
-
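/*
 * Worked numbers for btrfs_subvolume_reserve_metadata(), assuming the
 * default 16 KiB nodesize: one parent inode plus two dir entries gives
 * qgroup_num_bytes = 3 * 16 KiB = 48 KiB of qgroup prealloc before the
 * block rsv add itself is attempted.
 */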
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
-				      struct btrfs_block_rsv *rsv)
-{
-	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
-}
-
-static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
-						 struct btrfs_inode *inode)
-{
-	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 reserve_size = 0;
-	u64 qgroup_rsv_size = 0;
-	u64 csum_leaves;
-	unsigned outstanding_extents;
-
-	lockdep_assert_held(&inode->lock);
-	outstanding_extents = inode->outstanding_extents;
-	if (outstanding_extents)
-		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
-						outstanding_extents + 1);
-	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
-						 inode->csum_bytes);
-	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
-						       csum_leaves);
-	/*
-	 * For qgroup rsv, the calculation is very simple:
-	 * account one nodesize for each outstanding extent
-	 *
-	 * This is overestimating in most cases.
-	 */
-	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
-
-	spin_lock(&block_rsv->lock);
-	block_rsv->size = reserve_size;
-	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
-	spin_unlock(&block_rsv->lock);
-}
-
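/*
 * A worked example of the sizing above, assuming the usual
 * btrfs_calc_trans_metadata_size(fs_info, n) = n * nodesize *
 * BTRFS_MAX_LEVEL * 2, a 16 KiB nodesize and BTRFS_MAX_LEVEL = 8
 * (256 KiB per item): with 2 outstanding extents and enough csum_bytes
 * for one csum leaf,
 *
 *	reserve_size    = (2 + 1) * 256 KiB + 1 * 256 KiB = 1 MiB
 *	qgroup_rsv_size = 2 * 16 KiB = 32 KiB
 */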
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	unsigned nr_extents;
-	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
-	int ret = 0;
-	bool delalloc_lock = true;
-
-	/*
-	 * If we are a free space inode we need to not flush since we will be in
-	 * the middle of a transaction commit.  We also don't need the delalloc
-	 * mutex since we won't race with anybody.  We need this mostly to make
-	 * lockdep shut its filthy mouth.
-	 *
-	 * If we have a transaction open (can happen if we call truncate_block
-	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
-	 */
-	if (btrfs_is_free_space_inode(inode)) {
-		flush = BTRFS_RESERVE_NO_FLUSH;
-		delalloc_lock = false;
-	} else {
-		if (current->journal_info)
-			flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
-		if (btrfs_transaction_in_commit(fs_info))
-			schedule_timeout(1);
-	}
-
-	if (delalloc_lock)
-		mutex_lock(&inode->delalloc_mutex);
-
-	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
-
-	/* Add our new extents and calculate the new rsv size. */
-	spin_lock(&inode->lock);
-	nr_extents = count_max_extents(num_bytes);
-	btrfs_mod_outstanding_extents(inode, nr_extents);
-	inode->csum_bytes += num_bytes;
-	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
-	spin_unlock(&inode->lock);
-
-	ret = btrfs_inode_rsv_refill(inode, flush);
-	if (unlikely(ret))
-		goto out_fail;
-
-	if (delalloc_lock)
-		mutex_unlock(&inode->delalloc_mutex);
-	return 0;
-
-out_fail:
-	spin_lock(&inode->lock);
-	nr_extents = count_max_extents(num_bytes);
-	btrfs_mod_outstanding_extents(inode, -nr_extents);
-	inode->csum_bytes -= num_bytes;
-	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
-	spin_unlock(&inode->lock);
-
-	btrfs_inode_rsv_release(inode, true);
-	if (delalloc_lock)
-		mutex_unlock(&inode->delalloc_mutex);
-	return ret;
-}
-
-/**
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
- * @inode: the inode to release the reservation for.
- * @num_bytes: the number of bytes we are releasing.
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
- *
- * This will release the metadata reservation for an inode.  This can be called
- * once we complete IO for a given set of bytes to release their metadata
- * reservations, or on error for the same reason.
- */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-				     bool qgroup_free)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-
-	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
-	spin_lock(&inode->lock);
-	inode->csum_bytes -= num_bytes;
-	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
-	spin_unlock(&inode->lock);
-
-	if (btrfs_is_testing(fs_info))
-		return;
-
-	btrfs_inode_rsv_release(inode, qgroup_free);
-}
-
-/**
- * btrfs_delalloc_release_extents - release our outstanding_extents
- * @inode: the inode to balance the reservation for.
- * @num_bytes: the number of bytes we originally reserved with
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
- *
- * When we reserve space we increase outstanding_extents for the extents we may
- * add.  Once we've set the range as delalloc or created our ordered extents we
- * have outstanding_extents to track the real usage, so we use this to free our
- * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
- * with btrfs_delalloc_reserve_metadata.
- */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-				    bool qgroup_free)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	unsigned num_extents;
-
-	spin_lock(&inode->lock);
-	num_extents = count_max_extents(num_bytes);
-	btrfs_mod_outstanding_extents(inode, -num_extents);
-	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
-	spin_unlock(&inode->lock);
-
-	if (btrfs_is_testing(fs_info))
-		return;
-
-	btrfs_inode_rsv_release(inode, qgroup_free);
-}
-
-/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
- * delalloc
- * @inode: inode we're writing to
- * @start: start range we are writing to
- * @len: the length of the range we are writing
- * @reserved: mandatory parameter, records the qgroup ranges actually
- * 	      reserved by the current reservation.
- *
- * This will do the following things
- *
- * o reserve space in data space info for num bytes
- *   and reserve precious corresponding qgroup space
- *   (Done in check_data_free_space)
- *
- * o reserve space for metadata space, based on the number of outstanding
- *   extents and how much csums will be needed
- *   also reserve metadata space in a per root over-reserve method.
- * o add to the inode's delalloc_bytes
- * o add it to the fs_info's delalloc inodes list.
- *   (Above 3 all done in delalloc_reserve_metadata)
- *
- * Return 0 for success
- * Return <0 for error (-ENOSPC or -EDQUOT)
- */
-int btrfs_delalloc_reserve_space(struct inode *inode,
-			struct extent_changeset **reserved, u64 start, u64 len)
-{
-	int ret;
-
-	ret = btrfs_check_data_free_space(inode, reserved, start, len);
-	if (ret < 0)
-		return ret;
-	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
-	if (ret < 0)
-		btrfs_free_reserved_data_space(inode, *reserved, start, len);
-	return ret;
-}
-
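/*
 * A hypothetical write-path sketch (the local variable names are
 * assumptions) showing how the reserve pairs with the release helpers
 * above: reserve data + metadata before dirtying pages, then drop the
 * temporary outstanding extent count once the range is marked delalloc.
 */
	struct extent_changeset *data_reserved = NULL;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, count);
	if (ret)
		return ret;
	/* ... mark the range delalloc / create the ordered extents ... */
	btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
	extent_changeset_free(data_reserved);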
-/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @start: start position of the space already reserved
- * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
- *
- * This function will release the metadata space that was not used and will
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
- * list if there are no delalloc bytes left.
- * Also it will handle the qgroup reserved space.
- */
-void btrfs_delalloc_release_space(struct inode *inode,
-				  struct extent_changeset *reserved,
-				  u64 start, u64 len, bool qgroup_free)
-{
-	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
-	btrfs_free_reserved_data_space(inode, reserved, start, len);
-}
-
-static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *info, u64 bytenr,
-			      u64 num_bytes, int alloc)
-{
-	struct btrfs_block_group_cache *cache = NULL;
-	u64 total = num_bytes;
-	u64 old_val;
-	u64 byte_in_group;
-	int factor;
-
-	/* block accounting for super block */
-	spin_lock(&info->delalloc_root_lock);
-	old_val = btrfs_super_bytes_used(info->super_copy);
-	if (alloc)
-		old_val += num_bytes;
-	else
-		old_val -= num_bytes;
-	btrfs_set_super_bytes_used(info->super_copy, old_val);
-	spin_unlock(&info->delalloc_root_lock);
-
-	while (total) {
-		cache = btrfs_lookup_block_group(info, bytenr);
-		if (!cache)
-			return -ENOENT;
-		factor = btrfs_bg_type_to_factor(cache->flags);
-
-		/*
-		 * If this block group has free space cache written out, we
-		 * need to make sure to load it if we are removing space.  This
-		 * is because we need the unpinning stage to actually add the
-		 * space back to the block group, otherwise we will leak space.
-		 */
-		if (!alloc && cache->cached == BTRFS_CACHE_NO)
-			cache_block_group(cache, 1);
-
-		byte_in_group = bytenr - cache->key.objectid;
-		WARN_ON(byte_in_group > cache->key.offset);
-
-		spin_lock(&cache->space_info->lock);
-		spin_lock(&cache->lock);
-
-		if (btrfs_test_opt(info, SPACE_CACHE) &&
-		    cache->disk_cache_state < BTRFS_DC_CLEAR)
-			cache->disk_cache_state = BTRFS_DC_CLEAR;
-
-		old_val = btrfs_block_group_used(&cache->item);
-		num_bytes = min(total, cache->key.offset - byte_in_group);
-		if (alloc) {
-			old_val += num_bytes;
-			btrfs_set_block_group_used(&cache->item, old_val);
-			cache->reserved -= num_bytes;
-			cache->space_info->bytes_reserved -= num_bytes;
-			cache->space_info->bytes_used += num_bytes;
-			cache->space_info->disk_used += num_bytes * factor;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-		} else {
-			old_val -= num_bytes;
-			btrfs_set_block_group_used(&cache->item, old_val);
-			cache->pinned += num_bytes;
-			cache->space_info->bytes_pinned += num_bytes;
-			cache->space_info->bytes_used -= num_bytes;
-			cache->space_info->disk_used -= num_bytes * factor;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-
-			trace_btrfs_space_reservation(info, "pinned",
-						      cache->space_info->flags,
-						      num_bytes, 1);
-			percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
-					   num_bytes,
-					   BTRFS_TOTAL_BYTES_PINNED_BATCH);
-			set_extent_dirty(info->pinned_extents,
-					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
-		}
-
-		spin_lock(&trans->transaction->dirty_bgs_lock);
-		if (list_empty(&cache->dirty_list)) {
-			list_add_tail(&cache->dirty_list,
-				      &trans->transaction->dirty_bgs);
-			trans->transaction->num_dirty_bgs++;
-			btrfs_get_block_group(cache);
-		}
-		spin_unlock(&trans->transaction->dirty_bgs_lock);
-
-		/*
-		 * No longer have used bytes in this block group, queue it for
-		 * deletion. We do this after adding the block group to the
-		 * dirty list to avoid races between cleaner kthread and space
-		 * cache writeout.
-		 */
-		if (!alloc && old_val == 0)
-			btrfs_mark_bg_unused(cache);
-
-		btrfs_put_block_group(cache);
-		total -= num_bytes;
-		bytenr += num_bytes;
-	}
-	return 0;
-}
-
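/*
 * A compact model of the counter movement in update_block_group() (assumed
 * simplified types, locks omitted): allocation moves bytes from "reserved"
 * to "used", while freeing moves them from "used" to "pinned" until the
 * transaction commit unpins them.
 */
#include <stdint.h>

struct bg_model {
	uint64_t used, reserved, pinned;
};

static void bg_update_model(struct bg_model *bg, uint64_t bytes, int alloc)
{
	if (alloc) {
		bg->reserved -= bytes;
		bg->used += bytes;
	} else {
		bg->used -= bytes;
		bg->pinned += bytes;	/* returned to free space at commit */
	}
}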
 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
 {
 	struct btrfs_block_group_cache *cache;
@@ -6182,14 +2566,16 @@
 	return bytenr;
 }
 
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
-			   struct btrfs_block_group_cache *cache,
+static int pin_down_extent(struct btrfs_block_group_cache *cache,
 			   u64 bytenr, u64 num_bytes, int reserved)
 {
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+
 	spin_lock(&cache->space_info->lock);
 	spin_lock(&cache->lock);
 	cache->pinned += num_bytes;
-	cache->space_info->bytes_pinned += num_bytes;
+	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
+					     num_bytes);
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
@@ -6197,8 +2583,6 @@
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	trace_btrfs_space_reservation(fs_info, "pinned",
-				      cache->space_info->flags, num_bytes, 1);
 	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
 		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
 	set_extent_dirty(fs_info->pinned_extents, bytenr,
@@ -6217,7 +2601,7 @@
 	cache = btrfs_lookup_block_group(fs_info, bytenr);
 	BUG_ON(!cache); /* Logic error */
 
-	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
+	pin_down_extent(cache, bytenr, num_bytes, reserved);
 
 	btrfs_put_block_group(cache);
 	return 0;
@@ -6242,9 +2626,9 @@
 	 * to one because the slow code to read in the free extents does check
 	 * the pinned extents.
 	 */
-	cache_block_group(cache, 1);
+	btrfs_cache_block_group(cache, 1);
 
-	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
+	pin_down_extent(cache, bytenr, num_bytes, 0);
 
 	/* remove us from the free space cache (if we're there at all) */
 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6263,18 +2647,19 @@
 	if (!block_group)
 		return -EINVAL;
 
-	cache_block_group(block_group, 0);
-	caching_ctl = get_caching_control(block_group);
+	btrfs_cache_block_group(block_group, 0);
+	caching_ctl = btrfs_get_caching_control(block_group);
 
 	if (!caching_ctl) {
 		/* Logic error */
-		BUG_ON(!block_group_cache_done(block_group));
+		BUG_ON(!btrfs_block_group_cache_done(block_group));
 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
 	} else {
 		mutex_lock(&caching_ctl->mutex);
 
 		if (start >= caching_ctl->progress) {
-			ret = add_excluded_extent(fs_info, start, num_bytes);
+			ret = btrfs_add_excluded_extent(fs_info, start,
+							num_bytes);
 		} else if (start + num_bytes <= caching_ctl->progress) {
 			ret = btrfs_remove_free_space(block_group,
 						      start, num_bytes);
@@ -6288,19 +2673,20 @@
 			num_bytes = (start + num_bytes) -
 				caching_ctl->progress;
 			start = caching_ctl->progress;
-			ret = add_excluded_extent(fs_info, start, num_bytes);
+			ret = btrfs_add_excluded_extent(fs_info, start,
+							num_bytes);
 		}
 out_lock:
 		mutex_unlock(&caching_ctl->mutex);
-		put_caching_control(caching_ctl);
+		btrfs_put_caching_control(caching_ctl);
 	}
 	btrfs_put_block_group(block_group);
 	return ret;
 }
 
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb)
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_key key;
 	int found_type;
@@ -6336,113 +2722,6 @@
 	atomic_inc(&bg->reservations);
 }
 
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
-					const u64 start)
-{
-	struct btrfs_block_group_cache *bg;
-
-	bg = btrfs_lookup_block_group(fs_info, start);
-	ASSERT(bg);
-	if (atomic_dec_and_test(&bg->reservations))
-		wake_up_var(&bg->reservations);
-	btrfs_put_block_group(bg);
-}
-
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
-{
-	struct btrfs_space_info *space_info = bg->space_info;
-
-	ASSERT(bg->ro);
-
-	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
-		return;
-
-	/*
-	 * Our block group is read only but before we set it to read only,
-	 * some task might have had allocated an extent from it already, but it
-	 * has not yet created a respective ordered extent (and added it to a
-	 * root's list of ordered extents).
-	 * Therefore wait for any task currently allocating extents, since the
-	 * block group's reservations counter is incremented while a read lock
-	 * on the groups' semaphore is held and decremented after releasing
-	 * the read access on that semaphore and creating the ordered extent.
-	 */
-	down_write(&space_info->groups_sem);
-	up_write(&space_info->groups_sem);
-
-	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
-}
-
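/*
 * The empty down_write()/up_write() pair above is the classic rwsem
 * barrier idiom: acquiring the write side once guarantees that every
 * reader which entered before us has left the critical section. In
 * outline (a sketch, not btrfs code):
 *
 *	allocator:			waiter:
 *	  down_read(&groups_sem);	  down_write(&groups_sem);
 *	  atomic_inc(&reservations);	  up_write(&groups_sem);
 *	  up_read(&groups_sem);		  wait for reservations == 0
 *	  ...atomic_dec + wake...
 */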
-/**
- * btrfs_add_reserved_bytes - update the block_group and space info counters
- * @cache:	The cache we are manipulating
- * @ram_bytes:  The number of bytes of file content, and will be same to
- *              @num_bytes except for the compress path.
- * @num_bytes:	The number of bytes in question
- * @delalloc:   The blocks are allocated for the delalloc write
- *
- * This is called by the allocator when it reserves space. If this is a
- * reservation and the block group has become read only we cannot make the
- * reservation and return -EAGAIN, otherwise this function always succeeds.
- */
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
-				    u64 ram_bytes, u64 num_bytes, int delalloc)
-{
-	struct btrfs_space_info *space_info = cache->space_info;
-	int ret = 0;
-
-	spin_lock(&space_info->lock);
-	spin_lock(&cache->lock);
-	if (cache->ro) {
-		ret = -EAGAIN;
-	} else {
-		cache->reserved += num_bytes;
-		space_info->bytes_reserved += num_bytes;
-
-		trace_btrfs_space_reservation(cache->fs_info,
-				"space_info", space_info->flags,
-				ram_bytes, 0);
-		space_info->bytes_may_use -= ram_bytes;
-		if (delalloc)
-			cache->delalloc_bytes += num_bytes;
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&space_info->lock);
-	return ret;
-}
-
-/**
- * btrfs_free_reserved_bytes - update the block_group and space info counters
- * @cache:      The cache we are manipulating
- * @num_bytes:  The number of bytes in question
- * @delalloc:   The blocks are allocated for the delalloc write
- *
- * This is called by somebody who is freeing space that was never actually used
- * on disk.  For example if you reserve some space for a new leaf in transaction
- * A and before transaction A commits you free that leaf, you call this to
- * clear the reservation.
- */
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
-				     u64 num_bytes, int delalloc)
-{
-	struct btrfs_space_info *space_info = cache->space_info;
-	int ret = 0;
-
-	spin_lock(&space_info->lock);
-	spin_lock(&cache->lock);
-	if (cache->ro)
-		space_info->bytes_readonly += num_bytes;
-	cache->reserved -= num_bytes;
-	space_info->bytes_reserved -= num_bytes;
-	space_info->max_extent_size = 0;
-
-	if (delalloc)
-		cache->delalloc_bytes -= num_bytes;
-	spin_unlock(&cache->lock);
-	spin_unlock(&space_info->lock);
-	return ret;
-}
 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_caching_control *next;
@@ -6454,10 +2733,10 @@
 	list_for_each_entry_safe(caching_ctl, next,
 				 &fs_info->caching_block_groups, list) {
 		cache = caching_ctl->block_group;
-		if (block_group_cache_done(cache)) {
+		if (btrfs_block_group_cache_done(cache)) {
 			cache->last_byte_to_unpin = (u64)-1;
 			list_del_init(&caching_ctl->list);
-			put_caching_control(caching_ctl);
+			btrfs_put_caching_control(caching_ctl);
 		} else {
 			cache->last_byte_to_unpin = caching_ctl->progress;
 		}
@@ -6470,7 +2749,7 @@
 
 	up_write(&fs_info->commit_root_sem);
 
-	update_global_block_rsv(fs_info);
+	btrfs_update_global_block_rsv(fs_info);
 }
 
 /*
@@ -6560,10 +2839,7 @@
 		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
-		space_info->bytes_pinned -= len;
-
-		trace_btrfs_space_reservation(fs_info, "pinned",
-					      space_info->flags, len, 0);
+		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
 		space_info->max_extent_size = 0;
 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
@@ -6581,20 +2857,17 @@
 				to_add = min(len, global_rsv->size -
 					     global_rsv->reserved);
 				global_rsv->reserved += to_add;
-				space_info->bytes_may_use += to_add;
+				btrfs_space_info_update_bytes_may_use(fs_info,
+						space_info, to_add);
 				if (global_rsv->reserved >= global_rsv->size)
 					global_rsv->full = 1;
-				trace_btrfs_space_reservation(fs_info,
-							      "space_info",
-							      space_info->flags,
-							      to_add, 1);
 				len -= to_add;
 			}
 			spin_unlock(&global_rsv->lock);
 			/* Add to any tickets we may have */
 			if (len)
-				space_info_add_new_bytes(fs_info, space_info,
-							 len);
+				btrfs_try_granting_tickets(fs_info,
+							   space_info);
 		}
 		spin_unlock(&space_info->lock);
 	}
@@ -6620,9 +2893,11 @@
 		unpin = &fs_info->freed_extents[0];
 
 	while (!trans->aborted) {
+		struct extent_state *cached_state = NULL;
+
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY, NULL);
+					    EXTENT_DIRTY, &cached_state);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			break;
@@ -6632,9 +2907,10 @@
 			ret = btrfs_discard_extent(fs_info, start,
 						   end + 1 - start, NULL);
 
-		clear_extent_dirty(unpin, start, end);
+		clear_extent_dirty(unpin, start, end, &cached_state);
 		unpin_extent_range(fs_info, start, end, true);
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+		free_extent_state(cached_state);
 		cond_resched();
 	}
 
@@ -6892,7 +3168,7 @@
 			goto out;
 		}
 
-		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
+		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -6925,15 +3201,11 @@
 		goto out_delayed_unlock;
 
 	spin_lock(&head->lock);
-	if (!RB_EMPTY_ROOT(&head->ref_tree))
+	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		goto out;
 
-	if (head->extent_op) {
-		if (!head->must_insert_reserved)
-			goto out;
-		btrfs_free_delayed_extent_op(head->extent_op);
-		head->extent_op = NULL;
-	}
+	if (cleanup_extent_op(head) != NULL)
+		goto out;
 
 	/*
 	 * waiting for the lock here would deadlock.  If someone else has it
@@ -6942,22 +3214,9 @@
 	if (!mutex_trylock(&head->mutex))
 		goto out;
 
-	/*
-	 * at this point we have a head with no other entries.  Go
-	 * ahead and process it.
-	 */
-	rb_erase(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
-	atomic_dec(&delayed_refs->num_entries);
-
-	/*
-	 * we don't take a ref on the node because we're removing it from the
-	 * tree, so we just steal the ref the tree was holding.
-	 */
-	delayed_refs->num_heads--;
-	if (head->processing == 0)
-		delayed_refs->num_heads_ready--;
+	btrfs_delete_ref_head(delayed_refs, head);
 	head->processing = 0;
+
 	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
 
@@ -6965,6 +3224,7 @@
 	if (head->must_insert_reserved)
 		ret = 1;
 
+	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
 	mutex_unlock(&head->mutex);
 	btrfs_put_delayed_ref_head(head);
 	return ret;
@@ -6982,21 +3242,20 @@
 			   u64 parent, int last_ref)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ref generic_ref = { 0 };
 	int pin = 1;
 	int ret;
 
+	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
+			       buf->start, buf->len, parent);
+	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
+			    root->root_key.objectid);
+
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		int old_ref_mod, new_ref_mod;
 
-		btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
-				   root->root_key.objectid,
-				   btrfs_header_level(buf), 0,
-				   BTRFS_DROP_DELAYED_REF);
-		ret = btrfs_add_delayed_tree_ref(trans, buf->start,
-						 buf->len, parent,
-						 root->root_key.objectid,
-						 btrfs_header_level(buf),
-						 BTRFS_DROP_DELAYED_REF, NULL,
+		btrfs_ref_tree_mod(fs_info, &generic_ref);
+		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
 						 &old_ref_mod, &new_ref_mod);
 		BUG_ON(ret); /* -ENOMEM */
 		pin = old_ref_mod >= 0 && new_ref_mod < 0;
@@ -7015,8 +3274,7 @@
 		cache = btrfs_lookup_block_group(fs_info, buf->start);
 
 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-			pin_down_extent(fs_info, cache, buf->start,
-					buf->len, 1);
+			pin_down_extent(cache, buf->start, buf->len, 1);
 			btrfs_put_block_group(cache);
 			goto out;
 		}
@@ -7030,8 +3288,7 @@
 	}
 out:
 	if (pin)
-		add_pinned_bytes(fs_info, buf->len, true,
-				 root->root_key.objectid);
+		add_pinned_bytes(fs_info, &generic_ref);
 
 	if (last_ref) {
 		/*
@@ -7043,108 +3300,52 @@
 }
 
 /* Can return -ENOMEM */
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int old_ref_mod, new_ref_mod;
 	int ret;
 
 	if (btrfs_is_testing(fs_info))
 		return 0;
 
-	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
-		btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
-				   root_objectid, owner, offset,
-				   BTRFS_DROP_DELAYED_REF);
-
 	/*
 	 * tree log blocks never actually go into the extent allocation
 	 * tree, just update pinning info and exit early.
 	 */
-	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
-		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+	if ((ref->type == BTRFS_REF_METADATA &&
+	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+	    (ref->type == BTRFS_REF_DATA &&
+	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
 		/* unlocks the pinned mutex */
-		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+		btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
 		old_ref_mod = new_ref_mod = 0;
 		ret = 0;
-	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
-						 num_bytes, parent,
-						 root_objectid, (int)owner,
-						 BTRFS_DROP_DELAYED_REF, NULL,
+	} else if (ref->type == BTRFS_REF_METADATA) {
+		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
 						 &old_ref_mod, &new_ref_mod);
 	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr,
-						 num_bytes, parent,
-						 root_objectid, owner, offset,
-						 0, BTRFS_DROP_DELAYED_REF,
+		ret = btrfs_add_delayed_data_ref(trans, ref, 0,
 						 &old_ref_mod, &new_ref_mod);
 	}
 
-	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
-		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+	if (!((ref->type == BTRFS_REF_METADATA &&
+	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+	      (ref->type == BTRFS_REF_DATA &&
+	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+		btrfs_ref_tree_mod(fs_info, ref);
 
-		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
-	}
+	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+		add_pinned_bytes(fs_info, ref);
 
 	return ret;
 }
 
-/*
- * when we wait for progress in the block group caching, it's because
- * our allocation attempt failed at least once.  So, we must sleep
- * and let some progress happen before we try again.
- *
- * This function will sleep at least once waiting for new free space to
- * show up, and then it will check the block group free space numbers
- * for our min num_bytes.  Another option is to have it go ahead
- * and look in the rbtree for a free extent of a given size, but this
- * is a good start.
- *
- * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
- * any of the information in this block group.
- */
-static noinline void
-wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
-				u64 num_bytes)
-{
-	struct btrfs_caching_control *caching_ctl;
-
-	caching_ctl = get_caching_control(cache);
-	if (!caching_ctl)
-		return;
-
-	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
-		   (cache->free_space_ctl->free_space >= num_bytes));
-
-	put_caching_control(caching_ctl);
-}
-
-static noinline int
-wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_caching_control *caching_ctl;
-	int ret = 0;
-
-	caching_ctl = get_caching_control(cache);
-	if (!caching_ctl)
-		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
-	wait_event(caching_ctl->wait, block_group_cache_done(cache));
-	if (cache->cached == BTRFS_CACHE_ERROR)
-		ret = -EIO;
-	put_caching_control(caching_ctl);
-	return ret;
-}
-
 enum btrfs_loop_type {
-	LOOP_CACHING_NOWAIT = 0,
-	LOOP_CACHING_WAIT = 1,
-	LOOP_ALLOC_CHUNK = 2,
-	LOOP_NO_EMPTY_SIZE = 3,
+	LOOP_CACHING_NOWAIT,
+	LOOP_CACHING_WAIT,
+	LOOP_ALLOC_CHUNK,
+	LOOP_NO_EMPTY_SIZE,
 };
 
 static inline void
@@ -7212,6 +3413,343 @@
 }
 
 /*
+ * Structure used internally by the find_free_extent() function.  Wraps the
+ * needed parameters.
+ */
+struct find_free_extent_ctl {
+	/* Basic allocation info */
+	u64 ram_bytes;
+	u64 num_bytes;
+	u64 empty_size;
+	u64 flags;
+	int delalloc;
+
+	/* Where to start the search inside the bg */
+	u64 search_start;
+
+	/* For clustered allocation */
+	u64 empty_cluster;
+
+	bool have_caching_bg;
+	bool orig_have_caching_bg;
+
+	/* RAID index, converted from flags */
+	int index;
+
+	/*
+	 * Current loop number, check find_free_extent_update_loop() for details
+	 */
+	int loop;
+
+	/*
+	 * Whether we're refilling a cluster; if true we need to re-search the
+	 * current block group but don't try to refill the cluster again.
+	 */
+	bool retry_clustered;
+
+	/*
+	 * Whether we're updating the free space cache; if true we need to
+	 * re-search the current block group but don't try updating the free
+	 * space cache again.
+	 */
+	bool retry_unclustered;
+
+	/* If current block group is cached */
+	int cached;
+
+	/* Max contiguous hole found */
+	u64 max_extent_size;
+
+	/* Total free space from free space cache, not always contiguous */
+	u64 total_free_space;
+
+	/* Found result */
+	u64 found_offset;
+};
+
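/*
 * A sketch of how find_free_extent() is expected to seed this control
 * structure before entering the block group loop (the field values mirror
 * the function parameters; btrfs_bg_flags_to_raid_index() is assumed to be
 * the flags-to-index helper used elsewhere in this file):
 */
	struct find_free_extent_ctl ffe_ctl = {
		.ram_bytes = ram_bytes,
		.num_bytes = num_bytes,
		.empty_size = empty_size,
		.flags = flags,
		.delalloc = delalloc,
		.index = btrfs_bg_flags_to_raid_index(flags),
		.loop = LOOP_CACHING_NOWAIT,
		.have_caching_bg = false,
		.found_offset = 0,
	};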
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+ * Return -EAGAIN to inform caller that we need to re-search this block group.
+ * Return >0 to inform caller that we found nothing.
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+		struct btrfs_free_cluster *last_ptr,
+		struct find_free_extent_ctl *ffe_ctl,
+		struct btrfs_block_group_cache **cluster_bg_ret)
+{
+	struct btrfs_block_group_cache *cluster_bg;
+	u64 aligned_cluster;
+	u64 offset;
+	int ret;
+
+	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+	if (!cluster_bg)
+		goto refill_cluster;
+	if (cluster_bg != bg && (cluster_bg->ro ||
+	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
+		goto release_cluster;
+
+	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+			ffe_ctl->num_bytes, cluster_bg->key.objectid,
+			&ffe_ctl->max_extent_size);
+	if (offset) {
+		/* We have a block, we're done */
+		spin_unlock(&last_ptr->refill_lock);
+		trace_btrfs_reserve_extent_cluster(cluster_bg,
+				ffe_ctl->search_start, ffe_ctl->num_bytes);
+		*cluster_bg_ret = cluster_bg;
+		ffe_ctl->found_offset = offset;
+		return 0;
+	}
+	WARN_ON(last_ptr->block_group != cluster_bg);
+
+release_cluster:
+	/*
+	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
+	 * let's just skip it and let the allocator find whatever block it can
+	 * find. If we reach this point, we will have tried the cluster
+	 * allocator plenty of times and not have found anything, so we are
+	 * likely way too fragmented for the clustering stuff to find anything.
+	 *
+	 * However, if the cluster is taken from the current block group,
+	 * release the cluster first, so that we stand a better chance of
+	 * succeeding in the unclustered allocation.
+	 */
+	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+		spin_unlock(&last_ptr->refill_lock);
+		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+		return -ENOENT;
+	}
+
+	/* This cluster didn't work out, free it and start over */
+	btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+	if (cluster_bg != bg)
+		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+		spin_unlock(&last_ptr->refill_lock);
+		return -ENOENT;
+	}
+
+	aligned_cluster = max_t(u64,
+			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+			bg->full_stripe_len);
+	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
+			ffe_ctl->num_bytes, aligned_cluster);
+	if (ret == 0) {
+		/* Now pull our allocation out of this cluster */
+		offset = btrfs_alloc_from_cluster(bg, last_ptr,
+				ffe_ctl->num_bytes, ffe_ctl->search_start,
+				&ffe_ctl->max_extent_size);
+		if (offset) {
+			/* We found one, proceed */
+			spin_unlock(&last_ptr->refill_lock);
+			trace_btrfs_reserve_extent_cluster(bg,
+					ffe_ctl->search_start,
+					ffe_ctl->num_bytes);
+			ffe_ctl->found_offset = offset;
+			return 0;
+		}
+	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+		   !ffe_ctl->retry_clustered) {
+		spin_unlock(&last_ptr->refill_lock);
+
+		ffe_ctl->retry_clustered = true;
+		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+		return -EAGAIN;
+	}
+	/*
+	 * At this point we either didn't find a cluster or we weren't able to
+	 * allocate a block from our cluster.  Free the cluster we've been
+	 * trying to use, and go to the next block group.
+	 */
+	btrfs_return_cluster_to_free_space(NULL, last_ptr);
+	spin_unlock(&last_ptr->refill_lock);
+	return 1;
+}
+
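/*
 * How a caller is expected to dispatch on the return contract documented
 * above (a sketch of the call site in find_free_extent(); the label names
 * are assumptions):
 */
	ret = find_free_extent_clustered(block_group, last_ptr,
					 &ffe_ctl, &cluster_bg);
	if (ret == 0) {
		if (cluster_bg && cluster_bg != block_group) {
			btrfs_release_block_group(block_group,
						  ffe_ctl.delalloc);
			block_group = cluster_bg;
		}
		goto checks;		/* ffe_ctl.found_offset is valid */
	} else if (ret == -EAGAIN) {
		goto have_block_group;	/* re-search this block group */
	} else if (ret > 0) {
		goto loop;		/* move on to the next block group */
	}
	/* ret == -ENOENT: fall back to unclustered allocation below */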
+/*
+ * Return >0 to inform caller that we found nothing.
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset.
+ * Return -EAGAIN to inform caller that we need to re-search this block group.
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+		struct btrfs_free_cluster *last_ptr,
+		struct find_free_extent_ctl *ffe_ctl)
+{
+	u64 offset;
+
+	/*
+	 * We are doing an unclustered allocation, set the fragmented flag so
+	 * we don't bother trying to set up a cluster again until we get more
+	 * space.
+	 */
+	if (unlikely(last_ptr)) {
+		spin_lock(&last_ptr->lock);
+		last_ptr->fragmented = 1;
+		spin_unlock(&last_ptr->lock);
+	}
+	if (ffe_ctl->cached) {
+		struct btrfs_free_space_ctl *free_space_ctl;
+
+		free_space_ctl = bg->free_space_ctl;
+		spin_lock(&free_space_ctl->tree_lock);
+		if (free_space_ctl->free_space <
+		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+		    ffe_ctl->empty_size) {
+			ffe_ctl->total_free_space = max_t(u64,
+					ffe_ctl->total_free_space,
+					free_space_ctl->free_space);
+			spin_unlock(&free_space_ctl->tree_lock);
+			return 1;
+		}
+		spin_unlock(&free_space_ctl->tree_lock);
+	}
+
+	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+			ffe_ctl->num_bytes, ffe_ctl->empty_size,
+			&ffe_ctl->max_extent_size);
+
+	/*
+	 * If we didn't find a chunk, and we haven't failed on this block group
+	 * before, and this block group is in the middle of caching and we are
+	 * ok with waiting, then go ahead and wait for progress to be made, and
+	 * set @retry_unclustered to true.
+	 *
+	 * If @retry_unclustered is true then we've already waited on this
+	 * block group once and should move on to the next block group.
+	 */
+	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+						      ffe_ctl->empty_size);
+		ffe_ctl->retry_unclustered = true;
+		return -EAGAIN;
+	} else if (!offset) {
+		return 1;
+	}
+	ffe_ctl->found_offset = offset;
+	return 0;
+}
+
+/*
+ * Return >0 means the caller needs to re-search for a free extent.
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+					struct btrfs_free_cluster *last_ptr,
+					struct btrfs_key *ins,
+					struct find_free_extent_ctl *ffe_ctl,
+					int full_search, bool use_cluster)
+{
+	struct btrfs_root *root = fs_info->extent_root;
+	int ret;
+
+	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+		ffe_ctl->orig_have_caching_bg = true;
+
+	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
+	    ffe_ctl->have_caching_bg)
+		return 1;
+
+	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
+		return 1;
+
+	if (ins->objectid) {
+		if (!use_cluster && last_ptr) {
+			spin_lock(&last_ptr->lock);
+			last_ptr->window_start = ins->objectid;
+			spin_unlock(&last_ptr->lock);
+		}
+		return 0;
+	}
+
+	/*
+	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+	 *			caching kthreads as we move along
+	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+	 *		       again
+	 */
+	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+		ffe_ctl->index = 0;
+		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+			/*
+			 * We want to skip the LOOP_CACHING_WAIT step if we
+			 * don't have any uncached bgs and we've already done a
+			 * full search through.
+			 */
+			if (ffe_ctl->orig_have_caching_bg || !full_search)
+				ffe_ctl->loop = LOOP_CACHING_WAIT;
+			else
+				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+		} else {
+			ffe_ctl->loop++;
+		}
+
+		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+			struct btrfs_trans_handle *trans;
+			int exist = 0;
+
+			trans = current->journal_info;
+			if (trans)
+				exist = 1;
+			else
+				trans = btrfs_join_transaction(root);
+
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				return ret;
+			}
+
+			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+						CHUNK_ALLOC_FORCE);
+
+			/*
+			 * If we can't allocate a new chunk we've already looped
+			 * through at least once, move on to the NO_EMPTY_SIZE
+			 * case.
+			 */
+			if (ret == -ENOSPC)
+				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+
+			/* Do not bail out on ENOSPC since we can do more. */
+			if (ret < 0 && ret != -ENOSPC)
+				btrfs_abort_transaction(trans, ret);
+			else
+				ret = 0;
+			if (!exist)
+				btrfs_end_transaction(trans);
+			if (ret)
+				return ret;
+		}
+
+		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+			/*
+			 * Don't loop again if we already have no empty_size and
+			 * no empty_cluster.
+			 */
+			if (ffe_ctl->empty_size == 0 &&
+			    ffe_ctl->empty_cluster == 0)
+				return -ENOSPC;
+			ffe_ctl->empty_size = 0;
+			ffe_ctl->empty_cluster = 0;
+		}
+		return 1;
+	}
+	return -ENOSPC;
+}
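
The loop field stepped through here follows the allocator's escalation ladder. The LOOP_* constants are the pre-existing enum in extent-tree.c (outside this hunk); reproduced as a hedged reference:

	enum btrfs_loop_type {
		LOOP_CACHING_NOWAIT,	/* only use already-cached block groups */
		LOOP_CACHING_WAIT,	/* also wait on block groups still caching */
		LOOP_ALLOC_CHUNK,	/* force-allocate a new chunk, then retry */
		LOOP_NO_EMPTY_SIZE,	/* last resort: retry without any padding */
	};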
+
+/*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
  * ins->objectid == start position
@@ -7221,6 +3759,20 @@
  *
  * If there is no suitable free space, we will record the max size of
  * the free space extent currently.
+ *
+ * The overall logic and call chain:
+ *
+ * find_free_extent()
+ * |- Iterate through all block groups
+ * |  |- Get a valid block group
+ * |  |- Try to do clustered allocation in that block group
+ * |  |- Try to do unclustered allocation in that block group
+ * |  |- Check if the result is valid
+ * |  |  |- If valid, then exit
+ * |  |- Jump to next block group
+ * |
+ * |- Push harder to find free extents
+ *    |- If not found, re-iterate all block groups
  */
 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 				u64 ram_bytes, u64 num_bytes, u64 empty_size,
@@ -7228,31 +3780,35 @@
 				u64 flags, int delalloc)
 {
 	int ret = 0;
-	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	u64 search_start = 0;
-	u64 max_extent_size = 0;
-	u64 max_free_space = 0;
-	u64 empty_cluster = 0;
+	struct find_free_extent_ctl ffe_ctl = {0};
 	struct btrfs_space_info *space_info;
-	int loop = 0;
-	int index = btrfs_bg_flags_to_raid_index(flags);
-	bool failed_cluster_refill = false;
-	bool failed_alloc = false;
 	bool use_cluster = true;
-	bool have_caching_bg = false;
-	bool orig_have_caching_bg = false;
 	bool full_search = false;
 
 	WARN_ON(num_bytes < fs_info->sectorsize);
+
+	ffe_ctl.ram_bytes = ram_bytes;
+	ffe_ctl.num_bytes = num_bytes;
+	ffe_ctl.empty_size = empty_size;
+	ffe_ctl.flags = flags;
+	ffe_ctl.search_start = 0;
+	ffe_ctl.retry_clustered = false;
+	ffe_ctl.retry_unclustered = false;
+	ffe_ctl.delalloc = delalloc;
+	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+	ffe_ctl.have_caching_bg = false;
+	ffe_ctl.orig_have_caching_bg = false;
+	ffe_ctl.found_offset = 0;
+
 	ins->type = BTRFS_EXTENT_ITEM_KEY;
 	ins->objectid = 0;
 	ins->offset = 0;
 
 	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
 
-	space_info = __find_space_info(fs_info, flags);
+	space_info = btrfs_find_space_info(fs_info, flags);
 	if (!space_info) {
 		btrfs_err(fs_info, "No space info for %llu", flags);
 		return -ENOSPC;
@@ -7281,7 +3837,8 @@
 		spin_unlock(&space_info->lock);
 	}
 
-	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
+	last_ptr = fetch_cluster_info(fs_info, space_info,
+				      &ffe_ctl.empty_cluster);
 	if (last_ptr) {
 		spin_lock(&last_ptr->lock);
 		if (last_ptr->block_group)
@@ -7298,10 +3855,12 @@
 		spin_unlock(&last_ptr->lock);
 	}
 
-	search_start = max(search_start, first_logical_byte(fs_info, 0));
-	search_start = max(search_start, hint_byte);
-	if (search_start == hint_byte) {
-		block_group = btrfs_lookup_block_group(fs_info, search_start);
+	ffe_ctl.search_start = max(ffe_ctl.search_start,
+				   first_logical_byte(fs_info, 0));
+	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
+	if (ffe_ctl.search_start == hint_byte) {
+		block_group = btrfs_lookup_block_group(fs_info,
+						       ffe_ctl.search_start);
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -7323,7 +3882,7 @@
 				btrfs_put_block_group(block_group);
 				up_read(&space_info->groups_sem);
 			} else {
-				index = btrfs_bg_flags_to_raid_index(
+				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
 						block_group->flags);
 				btrfs_lock_block_group(block_group, delalloc);
 				goto have_block_group;
@@ -7333,21 +3892,19 @@
 		}
 	}
 search:
-	have_caching_bg = false;
-	if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
+	ffe_ctl.have_caching_bg = false;
+	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
+	    ffe_ctl.index == 0)
 		full_search = true;
 	down_read(&space_info->groups_sem);
-	list_for_each_entry(block_group, &space_info->block_groups[index],
-			    list) {
-		u64 offset;
-		int cached;
-
+	list_for_each_entry(block_group,
+			    &space_info->block_groups[ffe_ctl.index], list) {
 		/* If the block group is read-only, we can skip it entirely. */
 		if (unlikely(block_group->ro))
 			continue;
 
 		btrfs_grab_block_group(block_group, delalloc);
-		search_start = block_group->key.objectid;
+		ffe_ctl.search_start = block_group->key.objectid;
 
 		/*
 		 * this can happen if we end up cycling through all the
@@ -7356,9 +3913,8 @@
 		 */
 		if (!block_group_bits(block_group, flags)) {
 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
-				BTRFS_BLOCK_GROUP_RAID1 |
-				BTRFS_BLOCK_GROUP_RAID5 |
-				BTRFS_BLOCK_GROUP_RAID6 |
+				BTRFS_BLOCK_GROUP_RAID1_MASK |
+				BTRFS_BLOCK_GROUP_RAID56_MASK |
 				BTRFS_BLOCK_GROUP_RAID10;
 
 			/*
@@ -7368,13 +3924,21 @@
 			 */
 			if ((flags & extra) && !(block_group->flags & extra))
 				goto loop;
+
+			/*
+			 * This block group has different flags than we want.
+			 * It's possible that we have MIXED_GROUP flag but no
+			 * block group is mixed.  Just skip such block group.
+			 */
+			btrfs_release_block_group(block_group, delalloc);
+			continue;
 		}
 
 have_block_group:
-		cached = block_group_cache_done(block_group);
-		if (unlikely(!cached)) {
-			have_caching_bg = true;
-			ret = cache_block_group(block_group, 0);
+		ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
+		if (unlikely(!ffe_ctl.cached)) {
+			ffe_ctl.have_caching_bg = true;
+			ret = btrfs_cache_block_group(block_group, 0);
 			BUG_ON(ret < 0);
 			ret = 0;
 		}
@@ -7387,365 +3951,96 @@
 		 * lets look there
 		 */
 		if (last_ptr && use_cluster) {
-			struct btrfs_block_group_cache *used_block_group;
-			unsigned long aligned_cluster;
-			/*
-			 * the refill lock keeps out other
-			 * people trying to start a new cluster
-			 */
-			used_block_group = btrfs_lock_cluster(block_group,
-							      last_ptr,
-							      delalloc);
-			if (!used_block_group)
-				goto refill_cluster;
+			struct btrfs_block_group_cache *cluster_bg = NULL;
 
-			if (used_block_group != block_group &&
-			    (used_block_group->ro ||
-			     !block_group_bits(used_block_group, flags)))
-				goto release_cluster;
+			ret = find_free_extent_clustered(block_group, last_ptr,
+							 &ffe_ctl, &cluster_bg);
 
-			offset = btrfs_alloc_from_cluster(used_block_group,
-						last_ptr,
-						num_bytes,
-						used_block_group->key.objectid,
-						&max_extent_size);
-			if (offset) {
-				/* we have a block, we're done */
-				spin_unlock(&last_ptr->refill_lock);
-				trace_btrfs_reserve_extent_cluster(
-						used_block_group,
-						search_start, num_bytes);
-				if (used_block_group != block_group) {
+			if (ret == 0) {
+				if (cluster_bg && cluster_bg != block_group) {
 					btrfs_release_block_group(block_group,
 								  delalloc);
-					block_group = used_block_group;
+					block_group = cluster_bg;
 				}
 				goto checks;
-			}
-
-			WARN_ON(last_ptr->block_group != used_block_group);
-release_cluster:
-			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
-			 * set up a new clusters, so lets just skip it
-			 * and let the allocator find whatever block
-			 * it can find.  If we reach this point, we
-			 * will have tried the cluster allocator
-			 * plenty of times and not have found
-			 * anything, so we are likely way too
-			 * fragmented for the clustering stuff to find
-			 * anything.
-			 *
-			 * However, if the cluster is taken from the
-			 * current block group, release the cluster
-			 * first, so that we stand a better chance of
-			 * succeeding in the unclustered
-			 * allocation.  */
-			if (loop >= LOOP_NO_EMPTY_SIZE &&
-			    used_block_group != block_group) {
-				spin_unlock(&last_ptr->refill_lock);
-				btrfs_release_block_group(used_block_group,
-							  delalloc);
-				goto unclustered_alloc;
-			}
-
-			/*
-			 * this cluster didn't work out, free it and
-			 * start over
-			 */
-			btrfs_return_cluster_to_free_space(NULL, last_ptr);
-
-			if (used_block_group != block_group)
-				btrfs_release_block_group(used_block_group,
-							  delalloc);
-refill_cluster:
-			if (loop >= LOOP_NO_EMPTY_SIZE) {
-				spin_unlock(&last_ptr->refill_lock);
-				goto unclustered_alloc;
-			}
-
-			aligned_cluster = max_t(unsigned long,
-						empty_cluster + empty_size,
-					      block_group->full_stripe_len);
-
-			/* allocate a cluster in this block group */
-			ret = btrfs_find_space_cluster(fs_info, block_group,
-						       last_ptr, search_start,
-						       num_bytes,
-						       aligned_cluster);
-			if (ret == 0) {
-				/*
-				 * now pull our allocation out of this
-				 * cluster
-				 */
-				offset = btrfs_alloc_from_cluster(block_group,
-							last_ptr,
-							num_bytes,
-							search_start,
-							&max_extent_size);
-				if (offset) {
-					/* we found one, proceed */
-					spin_unlock(&last_ptr->refill_lock);
-					trace_btrfs_reserve_extent_cluster(
-						block_group, search_start,
-						num_bytes);
-					goto checks;
-				}
-			} else if (!cached && loop > LOOP_CACHING_NOWAIT
-				   && !failed_cluster_refill) {
-				spin_unlock(&last_ptr->refill_lock);
-
-				failed_cluster_refill = true;
-				wait_block_group_cache_progress(block_group,
-				       num_bytes + empty_cluster + empty_size);
+			} else if (ret == -EAGAIN) {
 				goto have_block_group;
-			}
-
-			/*
-			 * at this point we either didn't find a cluster
-			 * or we weren't able to allocate a block from our
-			 * cluster.  Free the cluster we've been trying
-			 * to use, and go to the next block group
-			 */
-			btrfs_return_cluster_to_free_space(NULL, last_ptr);
-			spin_unlock(&last_ptr->refill_lock);
-			goto loop;
-		}
-
-unclustered_alloc:
-		/*
-		 * We are doing an unclustered alloc, set the fragmented flag so
-		 * we don't bother trying to setup a cluster again until we get
-		 * more space.
-		 */
-		if (unlikely(last_ptr)) {
-			spin_lock(&last_ptr->lock);
-			last_ptr->fragmented = 1;
-			spin_unlock(&last_ptr->lock);
-		}
-		if (cached) {
-			struct btrfs_free_space_ctl *ctl =
-				block_group->free_space_ctl;
-
-			spin_lock(&ctl->tree_lock);
-			if (ctl->free_space <
-			    num_bytes + empty_cluster + empty_size) {
-				max_free_space = max(max_free_space,
-						     ctl->free_space);
-				spin_unlock(&ctl->tree_lock);
+			} else if (ret > 0) {
 				goto loop;
 			}
-			spin_unlock(&ctl->tree_lock);
+			/* ret == -ENOENT case falls through */
 		}
 
-		offset = btrfs_find_space_for_alloc(block_group, search_start,
-						    num_bytes, empty_size,
-						    &max_extent_size);
-		/*
-		 * If we didn't find a chunk, and we haven't failed on this
-		 * block group before, and this block group is in the middle of
-		 * caching and we are ok with waiting, then go ahead and wait
-		 * for progress to be made, and set failed_alloc to true.
-		 *
-		 * If failed_alloc is true then we've already waited on this
-		 * block group once and should move on to the next block group.
-		 */
-		if (!offset && !failed_alloc && !cached &&
-		    loop > LOOP_CACHING_NOWAIT) {
-			wait_block_group_cache_progress(block_group,
-						num_bytes + empty_size);
-			failed_alloc = true;
+		ret = find_free_extent_unclustered(block_group, last_ptr,
+						   &ffe_ctl);
+		if (ret == -EAGAIN)
 			goto have_block_group;
-		} else if (!offset) {
+		else if (ret > 0)
 			goto loop;
-		}
+		/* ret == 0 case falls through */
 checks:
-		search_start = round_up(offset, fs_info->stripesize);
+		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
+						fs_info->stripesize);
 
 		/* move on to the next group */
-		if (search_start + num_bytes >
+		if (ffe_ctl.search_start + num_bytes >
 		    block_group->key.objectid + block_group->key.offset) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+					     num_bytes);
 			goto loop;
 		}
 
-		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
-					     search_start - offset);
+		if (ffe_ctl.found_offset < ffe_ctl.search_start)
+			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+				ffe_ctl.search_start - ffe_ctl.found_offset);
 
 		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
 				num_bytes, delalloc);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+					     num_bytes);
 			goto loop;
 		}
 		btrfs_inc_block_group_reservations(block_group);
 
 		/* we are all good, lets return */
-		ins->objectid = search_start;
+		ins->objectid = ffe_ctl.search_start;
 		ins->offset = num_bytes;
 
-		trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
+		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
+					   num_bytes);
 		btrfs_release_block_group(block_group, delalloc);
 		break;
 loop:
-		failed_cluster_refill = false;
-		failed_alloc = false;
+		ffe_ctl.retry_clustered = false;
+		ffe_ctl.retry_unclustered = false;
 		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
-		       index);
+		       ffe_ctl.index);
 		btrfs_release_block_group(block_group, delalloc);
 		cond_resched();
 	}
 	up_read(&space_info->groups_sem);
 
-	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
-		&& !orig_have_caching_bg)
-		orig_have_caching_bg = true;
-
-	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
+					   full_search, use_cluster);
+	if (ret > 0)
 		goto search;
 
-	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
-		goto search;
-
-	/*
-	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
-	 *			caching kthreads as we move along
-	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
-	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
-	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
-	 *			again
-	 */
-	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
-		index = 0;
-		if (loop == LOOP_CACHING_NOWAIT) {
-			/*
-			 * We want to skip the LOOP_CACHING_WAIT step if we
-			 * don't have any uncached bgs and we've already done a
-			 * full search through.
-			 */
-			if (orig_have_caching_bg || !full_search)
-				loop = LOOP_CACHING_WAIT;
-			else
-				loop = LOOP_ALLOC_CHUNK;
-		} else {
-			loop++;
-		}
-
-		if (loop == LOOP_ALLOC_CHUNK) {
-			struct btrfs_trans_handle *trans;
-			int exist = 0;
-
-			trans = current->journal_info;
-			if (trans)
-				exist = 1;
-			else
-				trans = btrfs_join_transaction(root);
-
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				goto out;
-			}
-
-			ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
-
-			/*
-			 * If we can't allocate a new chunk we've already looped
-			 * through at least once, move on to the NO_EMPTY_SIZE
-			 * case.
-			 */
-			if (ret == -ENOSPC)
-				loop = LOOP_NO_EMPTY_SIZE;
-
-			/*
-			 * Do not bail out on ENOSPC since we
-			 * can do more things.
-			 */
-			if (ret < 0 && ret != -ENOSPC)
-				btrfs_abort_transaction(trans, ret);
-			else
-				ret = 0;
-			if (!exist)
-				btrfs_end_transaction(trans);
-			if (ret)
-				goto out;
-		}
-
-		if (loop == LOOP_NO_EMPTY_SIZE) {
-			/*
-			 * Don't loop again if we already have no empty_size and
-			 * no empty_cluster.
-			 */
-			if (empty_size == 0 &&
-			    empty_cluster == 0) {
-				ret = -ENOSPC;
-				goto out;
-			}
-			empty_size = 0;
-			empty_cluster = 0;
-		}
-
-		goto search;
-	} else if (!ins->objectid) {
-		ret = -ENOSPC;
-	} else if (ins->objectid) {
-		if (!use_cluster && last_ptr) {
-			spin_lock(&last_ptr->lock);
-			last_ptr->window_start = ins->objectid;
-			spin_unlock(&last_ptr->lock);
-		}
-		ret = 0;
-	}
-out:
 	if (ret == -ENOSPC) {
-		if (!max_extent_size)
-			max_extent_size = max_free_space;
+		/*
+		 * Use ffe_ctl.total_free_space as a fallback if we can't find
+		 * any contiguous hole.
+		 */
+		if (!ffe_ctl.max_extent_size)
+			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
 		spin_lock(&space_info->lock);
-		space_info->max_extent_size = max_extent_size;
+		space_info->max_extent_size = ffe_ctl.max_extent_size;
 		spin_unlock(&space_info->lock);
-		ins->offset = max_extent_size;
+		ins->offset = ffe_ctl.max_extent_size;
 	}
 	return ret;
 }
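
Note how the -ENOSPC path reports the best candidate back through ins->offset: callers shrink their request around that hint and retry. A hedged sketch of the retry loop, modeled on btrfs_reserve_extent() (whose ENOSPC_DEBUG branch appears in the next hunk); min_alloc_size is that function's parameter:

	while (1) {
		ret = find_free_extent(fs_info, ram_bytes, num_bytes,
				       empty_size, hint_byte, ins, flags,
				       delalloc);
		if (ret != -ENOSPC || num_bytes == min_alloc_size)
			break;
		/* ins->offset carries max_extent_size: ask for less */
		num_bytes = min(num_bytes >> 1, ins->offset);
		num_bytes = round_down(num_bytes, fs_info->sectorsize);
		num_bytes = max(num_bytes, min_alloc_size);
		ram_bytes = num_bytes;
	}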
 
-static void dump_space_info(struct btrfs_fs_info *fs_info,
-			    struct btrfs_space_info *info, u64 bytes,
-			    int dump_block_groups)
-{
-	struct btrfs_block_group_cache *cache;
-	int index = 0;
-
-	spin_lock(&info->lock);
-	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
-		   info->flags,
-		   info->total_bytes - btrfs_space_info_used(info, true),
-		   info->full ? "" : "not ");
-	btrfs_info(fs_info,
-		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
-		info->total_bytes, info->bytes_used, info->bytes_pinned,
-		info->bytes_reserved, info->bytes_may_use,
-		info->bytes_readonly);
-	spin_unlock(&info->lock);
-
-	if (!dump_block_groups)
-		return;
-
-	down_read(&info->groups_sem);
-again:
-	list_for_each_entry(cache, &info->block_groups[index], list) {
-		spin_lock(&cache->lock);
-		btrfs_info(fs_info,
-			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
-			cache->key.objectid, cache->key.offset,
-			btrfs_block_group_used(&cache->item), cache->pinned,
-			cache->reserved, cache->ro ? "[readonly]" : "");
-		btrfs_dump_free_space(cache, bytes);
-		spin_unlock(&cache->lock);
-	}
-	if (++index < BTRFS_NR_RAID_TYPES)
-		goto again;
-	up_read(&info->groups_sem);
-}
-
 /*
  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
  *			  hole that is at least as big as @num_bytes.
@@ -7821,12 +4116,13 @@
 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 			struct btrfs_space_info *sinfo;
 
-			sinfo = __find_space_info(fs_info, flags);
+			sinfo = btrfs_find_space_info(fs_info, flags);
 			btrfs_err(fs_info,
 				  "allocation failed flags %llu, wanted %llu",
 				  flags, num_bytes);
 			if (sinfo)
-				dump_space_info(fs_info, sinfo, num_bytes, 1);
+				btrfs_dump_space_info(fs_info, sinfo,
+						      num_bytes, 1);
 		}
 	}
 
@@ -7848,7 +4144,7 @@
 	}
 
 	if (pin)
-		pin_down_extent(fs_info, cache, start, len, 1);
+		pin_down_extent(cache, start, len, 1);
 	else {
 		if (btrfs_test_opt(fs_info, DISCARD))
 			ret = btrfs_discard_extent(fs_info, start, len, NULL);
@@ -7937,7 +4233,7 @@
 	if (ret)
 		return ret;
 
-	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
+	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
 			ins->objectid, ins->offset);
@@ -8027,8 +4323,8 @@
 	if (ret)
 		return ret;
 
-	ret = update_block_group(trans, fs_info, extent_key.objectid,
-				 fs_info->nodesize, 1);
+	ret = btrfs_update_block_group(trans, extent_key.objectid,
+				       fs_info->nodesize, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
 			extent_key.objectid, extent_key.offset);
@@ -8045,19 +4341,17 @@
 				     u64 offset, u64 ram_bytes,
 				     struct btrfs_key *ins)
 {
+	struct btrfs_ref generic_ref = { 0 };
 	int ret;
 
 	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 
-	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
-			   root->root_key.objectid, owner, offset,
-			   BTRFS_ADD_DELAYED_EXTENT);
-
-	ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
-					 ins->offset, 0,
-					 root->root_key.objectid, owner,
-					 offset, ram_bytes,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
+	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+			       ins->objectid, ins->offset, 0);
+	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
+	ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
+					 ram_bytes, NULL, NULL);
 	return ret;
 }
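
This is part of a wider conversion from long delayed-ref argument lists to a struct btrfs_ref filled in by small helpers from delayed-ref.h. A hedged reconstruction of what the two init helpers used above are assumed to do:

	/* hedged reconstruction; the real definitions live in delayed-ref.h */
	static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
						  int action, u64 bytenr,
						  u64 len, u64 parent)
	{
		generic_ref->action = action;
		generic_ref->bytenr = bytenr;
		generic_ref->len = len;
		generic_ref->parent = parent;
	}

	static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
					       u64 ref_root, u64 ino, u64 offset)
	{
		generic_ref->type = BTRFS_REF_DATA;
		generic_ref->data_ref.ref_root = ref_root;
		generic_ref->data_ref.ino = ino;
		generic_ref->data_ref.offset = offset;
	}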
 
@@ -8130,10 +4424,10 @@
 
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 	btrfs_tree_lock(buf);
-	clean_tree_block(fs_info, buf);
+	btrfs_clean_tree_block(buf);
 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
-	btrfs_set_lock_blocking(buf);
+	btrfs_set_lock_blocking_write(buf);
 	set_extent_buffer_uptodate(buf);
 
 	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8142,13 +4436,13 @@
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(buf, owner);
-	write_extent_buffer_fsid(buf, fs_info->fsid);
+	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		buf->log_index = root->log_transid % 2;
 		/*
 		 * we allow two log transactions at a time, use different
-		 * EXENT bit to differentiate dirty pages.
+		 * EXTENT bit to differentiate dirty pages.
 		 */
 		if (buf->log_index == 0)
 			set_extent_dirty(&root->dirty_log_pages, buf->start,
@@ -8166,68 +4460,6 @@
 	return buf;
 }
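
The write_extent_buffer_fsid() change above reflects the METADATA_UUID incompat feature: fs_devices->metadata_uuid is assumed to always hold the UUID that belongs in on-disk headers, equaling fsid when the feature is off. A hedged assertion of that invariant:

	/* hedged sketch of the invariant assumed by the change above */
	ASSERT(btrfs_fs_incompat(fs_info, METADATA_UUID) ||
	       memcmp(fs_info->fs_devices->metadata_uuid,
		      fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) == 0);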
 
-static struct btrfs_block_rsv *
-use_block_rsv(struct btrfs_trans_handle *trans,
-	      struct btrfs_root *root, u32 blocksize)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	int ret;
-	bool global_updated = false;
-
-	block_rsv = get_block_rsv(trans, root);
-
-	if (unlikely(block_rsv->size == 0))
-		goto try_reserve;
-again:
-	ret = block_rsv_use_bytes(block_rsv, blocksize);
-	if (!ret)
-		return block_rsv;
-
-	if (block_rsv->failfast)
-		return ERR_PTR(ret);
-
-	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
-		global_updated = true;
-		update_global_block_rsv(fs_info);
-		goto again;
-	}
-
-	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
-		static DEFINE_RATELIMIT_STATE(_rs,
-				DEFAULT_RATELIMIT_INTERVAL * 10,
-				/*DEFAULT_RATELIMIT_BURST*/ 1);
-		if (__ratelimit(&_rs))
-			WARN(1, KERN_DEBUG
-				"BTRFS: block rsv returned %d\n", ret);
-	}
-try_reserve:
-	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-				     BTRFS_RESERVE_NO_FLUSH);
-	if (!ret)
-		return block_rsv;
-	/*
-	 * If we couldn't reserve metadata bytes try and use some from
-	 * the global reserve if its space type is the same as the global
-	 * reservation.
-	 */
-	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
-	    block_rsv->space_info == global_rsv->space_info) {
-		ret = block_rsv_use_bytes(global_rsv, blocksize);
-		if (!ret)
-			return global_rsv;
-	}
-	return ERR_PTR(ret);
-}
-
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
-			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
-{
-	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
-}
-
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the tree buffer or an ERR_PTR on error.
@@ -8244,6 +4476,7 @@
 	struct btrfs_block_rsv *block_rsv;
 	struct extent_buffer *buf;
 	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_ref generic_ref = { 0 };
 	u64 flags = 0;
 	int ret;
 	u32 blocksize = fs_info->nodesize;
@@ -8259,7 +4492,7 @@
 	}
 #endif
 
-	block_rsv = use_block_rsv(trans, root, blocksize);
+	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
 	if (IS_ERR(block_rsv))
 		return ERR_CAST(block_rsv);
 
@@ -8298,13 +4531,12 @@
 		extent_op->is_data = false;
 		extent_op->level = level;
 
-		btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
-				   root_objectid, level, 0,
-				   BTRFS_ADD_DELAYED_EXTENT);
-		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
-						 ins.offset, parent,
-						 root_objectid, level,
-						 BTRFS_ADD_DELAYED_EXTENT,
+		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+				       ins.objectid, ins.offset, parent);
+		generic_ref.real_root = root->root_key.objectid;
+		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+		btrfs_ref_tree_mod(fs_info, &generic_ref);
+		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
 						 extent_op, NULL, NULL);
 		if (ret)
 			goto out_free_delayed;
@@ -8318,7 +4550,7 @@
 out_free_reserved:
 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
 out_unuse:
-	unuse_block_rsv(fs_info, block_rsv, blocksize);
+	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
 	return ERR_PTR(ret);
 }
 
@@ -8326,6 +4558,8 @@
 	u64 refs[BTRFS_MAX_LEVEL];
 	u64 flags[BTRFS_MAX_LEVEL];
 	struct btrfs_key update_progress;
+	struct btrfs_key drop_progress;
+	int drop_level;
 	int stage;
 	int level;
 	int shared_level;
@@ -8333,6 +4567,7 @@
 	int keep_locks;
 	int reada_slot;
 	int reada_count;
+	int restarted;
 };
 
 #define DROP_REFERENCE	1
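
The three new walk_control fields let an interrupted snapshot drop resume safely: drop_progress/drop_level mirror what is persisted in the root item, and restarted flags that this drop picked up again after a mount. A hedged sketch of the round-trip, condensed from the hunks below:

	/* store side, run as the walk yields (see the later hunk) */
	btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress);
	root_item->drop_level = wc->drop_level;

	/* resume side, when btrfs_drop_snapshot() starts over */
	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);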
@@ -8477,7 +4712,7 @@
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_dec_ref(trans, root, eb, 0);
 		BUG_ON(ret); /* -ENOMEM */
-		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
+		ret = btrfs_set_disk_extent_flags(trans, eb->start,
 						  eb->len, flag,
 						  btrfs_header_level(eb), 0);
 		BUG_ON(ret); /* -ENOMEM */
@@ -8496,6 +4731,33 @@
 }
 
 /*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr, u64 parent,
+			    int level)
+{
+	struct btrfs_path *path;
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = lookup_extent_backref(trans, path, &iref, bytenr,
+				    root->fs_info->nodesize, parent,
+				    root->root_key.objectid, level, 0);
+	btrfs_free_path(path);
+	if (ret == -ENOENT)
+		return 0;
+	if (ret < 0)
+		return ret;
+	return 1;
+}
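
check_ref_exists() folds lookup_extent_backref()'s -ENOENT into a tri-state result; a short sketch of the contract as consumed by do_walk_down() in the next hunk:

	ret = check_ref_exists(trans, root, bytenr, parent, level - 1);
	if (ret < 0)
		return ret;	/* the backref lookup itself failed */
	if (ret == 0)
		goto no_delete;	/* ref already dropped by an interrupted run */
	wc->restarted = 0;	/* ret == 1: refs are intact from here on */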
+
+/*
  * helper to process tree block pointer.
  *
  * when wc->stage == DROP_REFERENCE, this function checks
@@ -8517,9 +4779,9 @@
 	u64 bytenr;
 	u64 generation;
 	u64 parent;
-	u32 blocksize;
 	struct btrfs_key key;
 	struct btrfs_key first_key;
+	struct btrfs_ref ref = { 0 };
 	struct extent_buffer *next;
 	int level = wc->level;
 	int reada = 0;
@@ -8542,7 +4804,6 @@
 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
 	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
 			      path->slots[level]);
-	blocksize = fs_info->nodesize;
 
 	next = find_extent_buffer(fs_info, bytenr);
 	if (!next) {
@@ -8555,7 +4816,7 @@
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
-	btrfs_set_lock_blocking(next);
+	btrfs_set_lock_blocking_write(next);
 
 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
 				       &wc->refs[level - 1],
@@ -8615,7 +4876,7 @@
 			return -EIO;
 		}
 		btrfs_tree_lock(next);
-		btrfs_set_lock_blocking(next);
+		btrfs_set_lock_blocking_write(next);
 	}
 
 	level--;
@@ -8651,7 +4912,30 @@
 			parent = 0;
 		}
 
-		if (need_account) {
+		/*
+		 * If we had a drop_progress we need to verify the refs are set
+		 * as expected.  If we find our ref then we know that from here
+		 * on out everything should be correct, and we can clear the
+		 * ->restarted flag.
+		 */
+		if (wc->restarted) {
+			ret = check_ref_exists(trans, root, bytenr, parent,
+					       level - 1);
+			if (ret < 0)
+				goto out_unlock;
+			if (ret == 0)
+				goto no_delete;
+			ret = 0;
+			wc->restarted = 0;
+		}
+
+		/*
+		 * The reloc tree doesn't contribute to qgroup numbers, and we
+		 * have already accounted for them at merge time (replace_path),
+		 * so we can skip the expensive subtree trace here.
+		 */
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+		    need_account) {
 			ret = btrfs_qgroup_trace_subtree(trans, next,
 							 generation, level - 1);
 			if (ret) {
@@ -8660,13 +4944,24 @@
 					     ret);
 			}
 		}
-		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					parent, root->root_key.objectid,
-					level - 1, 0);
+
+		/*
+		 * We need to update the next key in our walk control so we can
+		 * update the drop_progress key accordingly.  We don't care if
+		 * find_next_key doesn't find a key because that means we're at
+		 * the end and are going to clean up now.
+		 */
+		wc->drop_level = level;
+		find_next_key(path, level, &wc->drop_progress);
+
+		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+				       fs_info->nodesize, parent);
+		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+		ret = btrfs_free_extent(trans, &ref);
 		if (ret)
 			goto out_unlock;
 	}
-
+no_delete:
 	*lookup_info = 1;
 	ret = 1;
 
@@ -8721,7 +5016,7 @@
 		if (!path->locks[level]) {
 			BUG_ON(level == 0);
 			btrfs_tree_lock(eb);
-			btrfs_set_lock_blocking(eb);
+			btrfs_set_lock_blocking_write(eb);
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -8752,21 +5047,23 @@
 			else
 				ret = btrfs_dec_ref(trans, root, eb, 0);
 			BUG_ON(ret); /* -ENOMEM */
-			ret = btrfs_qgroup_trace_leaf_items(trans, eb);
-			if (ret) {
-				btrfs_err_rl(fs_info,
-					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
+			if (is_fstree(root->root_key.objectid)) {
+				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
+				if (ret) {
+					btrfs_err_rl(fs_info,
+	"error %d accounting leaf items, quota is out of sync, rescan required",
 					     ret);
+				}
 			}
 		}
-		/* make block locked assertion in clean_tree_block happy */
+		/* make block locked assertion in btrfs_clean_tree_block happy */
 		if (!path->locks[level] &&
 		    btrfs_header_generation(eb) == trans->transid) {
 			btrfs_tree_lock(eb);
-			btrfs_set_lock_blocking(eb);
+			btrfs_set_lock_blocking_write(eb);
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		}
-		clean_tree_block(fs_info, eb);
+		btrfs_clean_tree_block(eb);
 	}
 
 	if (eb == root->node) {
@@ -8890,7 +5187,7 @@
 	int level;
 	bool root_dropped = false;
 
-	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -8911,13 +5208,26 @@
 		goto out_free;
 	}
 
+	err = btrfs_run_delayed_items(trans);
+	if (err)
+		goto out_end_trans;
+
 	if (block_rsv)
 		trans->block_rsv = block_rsv;
 
+	/*
+	 * This will help us catch people modifying the fs tree while we're
+	 * dropping it.  It is unsafe to mess with the fs tree while it's being
+	 * dropped as we unlock the root node and parent nodes as we walk down
+	 * the tree, assuming nothing will change.  If something does change
+	 * then we'll have stale information and drop references to blocks we've
+	 * already dropped.
+	 */
+	set_bit(BTRFS_ROOT_DELETING, &root->state);
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		level = btrfs_header_level(root->node);
 		path->nodes[level] = btrfs_lock_root_node(root);
-		btrfs_set_lock_blocking(path->nodes[level]);
+		btrfs_set_lock_blocking_write(path->nodes[level]);
 		path->slots[level] = 0;
 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		memset(&wc->update_progress, 0,
@@ -8947,7 +5257,7 @@
 		level = btrfs_header_level(root->node);
 		while (1) {
 			btrfs_tree_lock(path->nodes[level]);
-			btrfs_set_lock_blocking(path->nodes[level]);
+			btrfs_set_lock_blocking_write(path->nodes[level]);
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -8970,6 +5280,7 @@
 		}
 	}
 
+	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 	wc->level = level;
 	wc->shared_level = -1;
 	wc->stage = DROP_REFERENCE;
@@ -8997,12 +5308,14 @@
 		}
 
 		if (wc->stage == DROP_REFERENCE) {
-			level = wc->level;
-			btrfs_node_key(path->nodes[level],
-				       &root_item->drop_progress,
-				       path->slots[level]);
-			root_item->drop_level = level;
+			wc->drop_level = wc->level;
+			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
+					      &wc->drop_progress,
+					      path->slots[wc->drop_level]);
 		}
+		btrfs_cpu_key_to_disk(&root_item->drop_progress,
+				      &wc->drop_progress);
+		root_item->drop_level = wc->drop_level;
 
 		BUG_ON(wc->level == 0);
 		if (btrfs_should_end_transaction(trans) ||
@@ -9161,177 +5474,6 @@
 	return ret;
 }
 
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 num_devices;
-	u64 stripped;
-
-	/*
-	 * if restripe for this chunk_type is on pick target profile and
-	 * return, otherwise do the usual balance
-	 */
-	stripped = get_restripe_target(fs_info, flags);
-	if (stripped)
-		return extended_to_chunk(stripped);
-
-	num_devices = fs_info->fs_devices->rw_devices;
-
-	stripped = BTRFS_BLOCK_GROUP_RAID0 |
-		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
-		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
-
-	if (num_devices == 1) {
-		stripped |= BTRFS_BLOCK_GROUP_DUP;
-		stripped = flags & ~stripped;
-
-		/* turn raid0 into single device chunks */
-		if (flags & BTRFS_BLOCK_GROUP_RAID0)
-			return stripped;
-
-		/* turn mirroring into duplication */
-		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-			     BTRFS_BLOCK_GROUP_RAID10))
-			return stripped | BTRFS_BLOCK_GROUP_DUP;
-	} else {
-		/* they already had raid on here, just return */
-		if (flags & stripped)
-			return flags;
-
-		stripped |= BTRFS_BLOCK_GROUP_DUP;
-		stripped = flags & ~stripped;
-
-		/* switch duplicated blocks with raid1 */
-		if (flags & BTRFS_BLOCK_GROUP_DUP)
-			return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
-		/* this is drive concat, leave it alone */
-	}
-
-	return flags;
-}
-
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
-{
-	struct btrfs_space_info *sinfo = cache->space_info;
-	u64 num_bytes;
-	u64 min_allocable_bytes;
-	int ret = -ENOSPC;
-
-	/*
-	 * We need some metadata space and system metadata space for
-	 * allocating chunks in some corner cases until we force to set
-	 * it to be readonly.
-	 */
-	if ((sinfo->flags &
-	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
-	    !force)
-		min_allocable_bytes = SZ_1M;
-	else
-		min_allocable_bytes = 0;
-
-	spin_lock(&sinfo->lock);
-	spin_lock(&cache->lock);
-
-	if (cache->ro) {
-		cache->ro++;
-		ret = 0;
-		goto out;
-	}
-
-	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-		    cache->bytes_super - btrfs_block_group_used(&cache->item);
-
-	if (btrfs_space_info_used(sinfo, true) + num_bytes +
-	    min_allocable_bytes <= sinfo->total_bytes) {
-		sinfo->bytes_readonly += num_bytes;
-		cache->ro++;
-		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
-		ret = 0;
-	}
-out:
-	spin_unlock(&cache->lock);
-	spin_unlock(&sinfo->lock);
-	return ret;
-}
-
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
-
-{
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct btrfs_trans_handle *trans;
-	u64 alloc_flags;
-	int ret;
-
-again:
-	trans = btrfs_join_transaction(fs_info->extent_root);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
-
-	/*
-	 * we're not allowed to set block groups readonly after the dirty
-	 * block groups cache has started writing.  If it already started,
-	 * back off and let this transaction commit
-	 */
-	mutex_lock(&fs_info->ro_block_group_mutex);
-	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
-		u64 transid = trans->transid;
-
-		mutex_unlock(&fs_info->ro_block_group_mutex);
-		btrfs_end_transaction(trans);
-
-		ret = btrfs_wait_for_commit(fs_info, transid);
-		if (ret)
-			return ret;
-		goto again;
-	}
-
-	/*
-	 * if we are changing raid levels, try to allocate a corresponding
-	 * block group with the new raid level.
-	 */
-	alloc_flags = update_block_group_flags(fs_info, cache->flags);
-	if (alloc_flags != cache->flags) {
-		ret = do_chunk_alloc(trans, alloc_flags,
-				     CHUNK_ALLOC_FORCE);
-		/*
-		 * ENOSPC is allowed here, we may have enough space
-		 * already allocated at the new raid level to
-		 * carry on
-		 */
-		if (ret == -ENOSPC)
-			ret = 0;
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = inc_block_group_ro(cache, 0);
-	if (!ret)
-		goto out;
-	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
-	ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
-	if (ret < 0)
-		goto out;
-	ret = inc_block_group_ro(cache, 0);
-out:
-	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
-		alloc_flags = update_block_group_flags(fs_info, cache->flags);
-		mutex_lock(&fs_info->chunk_mutex);
-		check_system_chunk(trans, alloc_flags);
-		mutex_unlock(&fs_info->chunk_mutex);
-	}
-	mutex_unlock(&fs_info->ro_block_group_mutex);
-
-	btrfs_end_transaction(trans);
-	return ret;
-}
-
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
-{
-	u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
-
-	return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
-}
-
 /*
  * helper to account the unused space of all the readonly block group in the
  * space_info. takes mirrors into account.
@@ -9367,1397 +5509,6 @@
 	return free_bytes;
 }
 
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_space_info *sinfo = cache->space_info;
-	u64 num_bytes;
-
-	BUG_ON(!cache->ro);
-
-	spin_lock(&sinfo->lock);
-	spin_lock(&cache->lock);
-	if (!--cache->ro) {
-		num_bytes = cache->key.offset - cache->reserved -
-			    cache->pinned - cache->bytes_super -
-			    btrfs_block_group_used(&cache->item);
-		sinfo->bytes_readonly -= num_bytes;
-		list_del_init(&cache->ro_list);
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&sinfo->lock);
-}
-
-/*
- * checks to see if its even possible to relocate this block group.
- *
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
- * ok to go ahead and try.
- */
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_root *root = fs_info->extent_root;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_space_info *space_info;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	struct btrfs_device *device;
-	struct btrfs_trans_handle *trans;
-	u64 min_free;
-	u64 dev_min = 1;
-	u64 dev_nr = 0;
-	u64 target;
-	int debug;
-	int index;
-	int full = 0;
-	int ret = 0;
-
-	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
-
-	block_group = btrfs_lookup_block_group(fs_info, bytenr);
-
-	/* odd, couldn't find the block group, leave it alone */
-	if (!block_group) {
-		if (debug)
-			btrfs_warn(fs_info,
-				   "can't find block group for bytenr %llu",
-				   bytenr);
-		return -1;
-	}
-
-	min_free = btrfs_block_group_used(&block_group->item);
-
-	/* no bytes used, we're good */
-	if (!min_free)
-		goto out;
-
-	space_info = block_group->space_info;
-	spin_lock(&space_info->lock);
-
-	full = space_info->full;
-
-	/*
-	 * if this is the last block group we have in this space, we can't
-	 * relocate it unless we're able to allocate a new chunk below.
-	 *
-	 * Otherwise, we need to make sure we have room in the space to handle
-	 * all of the extents from this block group.  If we can, we're good
-	 */
-	if ((space_info->total_bytes != block_group->key.offset) &&
-	    (btrfs_space_info_used(space_info, false) + min_free <
-	     space_info->total_bytes)) {
-		spin_unlock(&space_info->lock);
-		goto out;
-	}
-	spin_unlock(&space_info->lock);
-
-	/*
-	 * ok we don't have enough space, but maybe we have free space on our
-	 * devices to allocate new chunks for relocation, so loop through our
-	 * alloc devices and guess if we have enough space.  if this block
-	 * group is going to be restriped, run checks against the target
-	 * profile instead of the current one.
-	 */
-	ret = -1;
-
-	/*
-	 * index:
-	 *      0: raid10
-	 *      1: raid1
-	 *      2: dup
-	 *      3: raid0
-	 *      4: single
-	 */
-	target = get_restripe_target(fs_info, block_group->flags);
-	if (target) {
-		index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
-	} else {
-		/*
-		 * this is just a balance, so if we were marked as full
-		 * we know there is no space for a new chunk
-		 */
-		if (full) {
-			if (debug)
-				btrfs_warn(fs_info,
-					   "no space to alloc new chunk for block group %llu",
-					   block_group->key.objectid);
-			goto out;
-		}
-
-		index = btrfs_bg_flags_to_raid_index(block_group->flags);
-	}
-
-	if (index == BTRFS_RAID_RAID10) {
-		dev_min = 4;
-		/* Divide by 2 */
-		min_free >>= 1;
-	} else if (index == BTRFS_RAID_RAID1) {
-		dev_min = 2;
-	} else if (index == BTRFS_RAID_DUP) {
-		/* Multiply by 2 */
-		min_free <<= 1;
-	} else if (index == BTRFS_RAID_RAID0) {
-		dev_min = fs_devices->rw_devices;
-		min_free = div64_u64(min_free, dev_min);
-	}
-
-	/* We need to do this so that we can look at pending chunks */
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		ret = PTR_ERR(trans);
-		goto out;
-	}
-
-	mutex_lock(&fs_info->chunk_mutex);
-	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
-		u64 dev_offset;
-
-		/*
-		 * check to make sure we can actually find a chunk with enough
-		 * space to fit our block group in.
-		 */
-		if (device->total_bytes > device->bytes_used + min_free &&
-		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
-			ret = find_free_dev_extent(trans, device, min_free,
-						   &dev_offset, NULL);
-			if (!ret)
-				dev_nr++;
-
-			if (dev_nr >= dev_min)
-				break;
-
-			ret = -1;
-		}
-	}
-	if (debug && ret == -1)
-		btrfs_warn(fs_info,
-			   "no space to allocate a new chunk for block group %llu",
-			   block_group->key.objectid);
-	mutex_unlock(&fs_info->chunk_mutex);
-	btrfs_end_transaction(trans);
-out:
-	btrfs_put_block_group(block_group);
-	return ret;
-}
-
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
-				  struct btrfs_path *path,
-				  struct btrfs_key *key)
-{
-	struct btrfs_root *root = fs_info->extent_root;
-	int ret = 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	struct btrfs_block_group_item bg;
-	u64 flags;
-	int slot;
-
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	while (1) {
-		slot = path->slots[0];
-		leaf = path->nodes[0];
-		if (slot >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
-				continue;
-			if (ret < 0)
-				goto out;
-			break;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-		if (found_key.objectid >= key->objectid &&
-		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			struct extent_map_tree *em_tree;
-			struct extent_map *em;
-
-			em_tree = &root->fs_info->mapping_tree.map_tree;
-			read_lock(&em_tree->lock);
-			em = lookup_extent_mapping(em_tree, found_key.objectid,
-						   found_key.offset);
-			read_unlock(&em_tree->lock);
-			if (!em) {
-				btrfs_err(fs_info,
-			"logical %llu len %llu found bg but no related chunk",
-					  found_key.objectid, found_key.offset);
-				ret = -ENOENT;
-			} else if (em->start != found_key.objectid ||
-				   em->len != found_key.offset) {
-				btrfs_err(fs_info,
-		"block group %llu len %llu mismatch with chunk %llu len %llu",
-					  found_key.objectid, found_key.offset,
-					  em->start, em->len);
-				ret = -EUCLEAN;
-			} else {
-				read_extent_buffer(leaf, &bg,
-					btrfs_item_ptr_offset(leaf, slot),
-					sizeof(bg));
-				flags = btrfs_block_group_flags(&bg) &
-					BTRFS_BLOCK_GROUP_TYPE_MASK;
-
-				if (flags != (em->map_lookup->type &
-					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
-					btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
-						found_key.objectid,
-						found_key.offset, flags,
-						(BTRFS_BLOCK_GROUP_TYPE_MASK &
-						 em->map_lookup->type));
-					ret = -EUCLEAN;
-				} else {
-					ret = 0;
-				}
-			}
-			free_extent_map(em);
-			goto out;
-		}
-		path->slots[0]++;
-	}
-out:
-	return ret;
-}
-
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
-{
-	struct btrfs_block_group_cache *block_group;
-	u64 last = 0;
-
-	while (1) {
-		struct inode *inode;
-
-		block_group = btrfs_lookup_first_block_group(info, last);
-		while (block_group) {
-			wait_block_group_cache_done(block_group);
-			spin_lock(&block_group->lock);
-			if (block_group->iref)
-				break;
-			spin_unlock(&block_group->lock);
-			block_group = next_block_group(info, block_group);
-		}
-		if (!block_group) {
-			if (last == 0)
-				break;
-			last = 0;
-			continue;
-		}
-
-		inode = block_group->inode;
-		block_group->iref = 0;
-		block_group->inode = NULL;
-		spin_unlock(&block_group->lock);
-		ASSERT(block_group->io_ctl.inode == NULL);
-		iput(inode);
-		last = block_group->key.objectid + block_group->key.offset;
-		btrfs_put_block_group(block_group);
-	}
-}
-
-/*
- * Must be called only after stopping all workers, since we could have block
- * group caching kthreads running, and therefore they could race with us if we
- * freed the block groups before stopping them.
- */
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_space_info *space_info;
-	struct btrfs_caching_control *caching_ctl;
-	struct rb_node *n;
-
-	down_write(&info->commit_root_sem);
-	while (!list_empty(&info->caching_block_groups)) {
-		caching_ctl = list_entry(info->caching_block_groups.next,
-					 struct btrfs_caching_control, list);
-		list_del(&caching_ctl->list);
-		put_caching_control(caching_ctl);
-	}
-	up_write(&info->commit_root_sem);
-
-	spin_lock(&info->unused_bgs_lock);
-	while (!list_empty(&info->unused_bgs)) {
-		block_group = list_first_entry(&info->unused_bgs,
-					       struct btrfs_block_group_cache,
-					       bg_list);
-		list_del_init(&block_group->bg_list);
-		btrfs_put_block_group(block_group);
-	}
-	spin_unlock(&info->unused_bgs_lock);
-
-	spin_lock(&info->block_group_cache_lock);
-	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
-		block_group = rb_entry(n, struct btrfs_block_group_cache,
-				       cache_node);
-		rb_erase(&block_group->cache_node,
-			 &info->block_group_cache_tree);
-		RB_CLEAR_NODE(&block_group->cache_node);
-		spin_unlock(&info->block_group_cache_lock);
-
-		down_write(&block_group->space_info->groups_sem);
-		list_del(&block_group->list);
-		up_write(&block_group->space_info->groups_sem);
-
-		/*
-		 * We haven't cached this block group, which means we could
-		 * possibly have excluded extents on this block group.
-		 */
-		if (block_group->cached == BTRFS_CACHE_NO ||
-		    block_group->cached == BTRFS_CACHE_ERROR)
-			free_excluded_extents(block_group);
-
-		btrfs_remove_free_space_cache(block_group);
-		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
-		ASSERT(list_empty(&block_group->dirty_list));
-		ASSERT(list_empty(&block_group->io_list));
-		ASSERT(list_empty(&block_group->bg_list));
-		ASSERT(atomic_read(&block_group->count) == 1);
-		btrfs_put_block_group(block_group);
-
-		spin_lock(&info->block_group_cache_lock);
-	}
-	spin_unlock(&info->block_group_cache_lock);
-
-	/* now that all the block groups are freed, go through and
-	 * free all the space_info structs.  This is only called during
-	 * the final stages of unmount, and so we know nobody is
-	 * using them.  We call synchronize_rcu() once before we start,
-	 * just to be on the safe side.
-	 */
-	synchronize_rcu();
-
-	release_global_block_rsv(info);
-
-	while (!list_empty(&info->space_info)) {
-		int i;
-
-		space_info = list_entry(info->space_info.next,
-					struct btrfs_space_info,
-					list);
-
-		/*
-		 * Do not hide this behind enospc_debug, this is actually
-		 * important and indicates a real bug if this happens.
-		 */
-		if (WARN_ON(space_info->bytes_pinned > 0 ||
-			    space_info->bytes_reserved > 0 ||
-			    space_info->bytes_may_use > 0))
-			dump_space_info(info, space_info, 0, 0);
-		list_del(&space_info->list);
-		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
-			struct kobject *kobj;
-			kobj = space_info->block_group_kobjs[i];
-			space_info->block_group_kobjs[i] = NULL;
-			if (kobj) {
-				kobject_del(kobj);
-				kobject_put(kobj);
-			}
-		}
-		kobject_del(&space_info->kobj);
-		kobject_put(&space_info->kobj);
-	}
-	return 0;
-}
-
-/* link_block_group will queue up kobjects to add when we're reclaim-safe */
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_space_info *space_info;
-	struct raid_kobject *rkobj;
-	LIST_HEAD(list);
-	int index;
-	int ret = 0;
-
-	spin_lock(&fs_info->pending_raid_kobjs_lock);
-	list_splice_init(&fs_info->pending_raid_kobjs, &list);
-	spin_unlock(&fs_info->pending_raid_kobjs_lock);
-
-	list_for_each_entry(rkobj, &list, list) {
-		space_info = __find_space_info(fs_info, rkobj->flags);
-		index = btrfs_bg_flags_to_raid_index(rkobj->flags);
-
-		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
-				  "%s", get_raid_name(index));
-		if (ret) {
-			kobject_put(&rkobj->kobj);
-			break;
-		}
-	}
-	if (ret)
-		btrfs_warn(fs_info,
-			   "failed to add kobject for block cache, ignoring");
-}
-
-static void link_block_group(struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_space_info *space_info = cache->space_info;
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	int index = btrfs_bg_flags_to_raid_index(cache->flags);
-	bool first = false;
-
-	down_write(&space_info->groups_sem);
-	if (list_empty(&space_info->block_groups[index]))
-		first = true;
-	list_add_tail(&cache->list, &space_info->block_groups[index]);
-	up_write(&space_info->groups_sem);
-
-	if (first) {
-		struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
-		if (!rkobj) {
-			btrfs_warn(cache->fs_info,
-				"couldn't alloc memory for raid level kobject");
-			return;
-		}
-		rkobj->flags = cache->flags;
-		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-
-		spin_lock(&fs_info->pending_raid_kobjs_lock);
-		list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
-		spin_unlock(&fs_info->pending_raid_kobjs_lock);
-		space_info->block_group_kobjs[index] = &rkobj->kobj;
-	}
-}
-
-static struct btrfs_block_group_cache *
-btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
-			       u64 start, u64 size)
-{
-	struct btrfs_block_group_cache *cache;
-
-	cache = kzalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache)
-		return NULL;
-
-	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-					GFP_NOFS);
-	if (!cache->free_space_ctl) {
-		kfree(cache);
-		return NULL;
-	}
-
-	cache->key.objectid = start;
-	cache->key.offset = size;
-	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-
-	cache->fs_info = fs_info;
-	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
-	set_free_space_tree_thresholds(cache);
-
-	atomic_set(&cache->count, 1);
-	spin_lock_init(&cache->lock);
-	init_rwsem(&cache->data_rwsem);
-	INIT_LIST_HEAD(&cache->list);
-	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->bg_list);
-	INIT_LIST_HEAD(&cache->ro_list);
-	INIT_LIST_HEAD(&cache->dirty_list);
-	INIT_LIST_HEAD(&cache->io_list);
-	btrfs_init_free_space_ctl(cache);
-	atomic_set(&cache->trimming, 0);
-	mutex_init(&cache->free_space_lock);
-	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
-
-	return cache;
-}
-
-
-/*
- * Iterate all chunks and verify that each of them has the corresponding block
- * group
- */
-static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct btrfs_block_group_cache *bg;
-	u64 start = 0;
-	int ret = 0;
-
-	while (1) {
-		read_lock(&map_tree->map_tree.lock);
-		/*
-		 * lookup_extent_mapping will return the first extent map
-		 * intersecting the range, so setting @len to 1 is enough to
-		 * get the first chunk.
-		 */
-		em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
-		read_unlock(&map_tree->map_tree.lock);
-		if (!em)
-			break;
-
-		bg = btrfs_lookup_block_group(fs_info, em->start);
-		if (!bg) {
-			btrfs_err(fs_info,
-	"chunk start=%llu len=%llu doesn't have corresponding block group",
-				     em->start, em->len);
-			ret = -EUCLEAN;
-			free_extent_map(em);
-			break;
-		}
-		if (bg->key.objectid != em->start ||
-		    bg->key.offset != em->len ||
-		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
-		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
-			btrfs_err(fs_info,
-"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
-				em->start, em->len,
-				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
-				bg->key.objectid, bg->key.offset,
-				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
-			ret = -EUCLEAN;
-			free_extent_map(em);
-			btrfs_put_block_group(bg);
-			break;
-		}
-		start = em->start + em->len;
-		free_extent_map(em);
-		btrfs_put_block_group(bg);
-	}
-	return ret;
-}
-
-int btrfs_read_block_groups(struct btrfs_fs_info *info)
-{
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_space_info *space_info;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	int need_clear = 0;
-	u64 cache_gen;
-	u64 feature;
-	int mixed;
-
-	feature = btrfs_super_incompat_flags(info->super_copy);
-	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
-
-	key.objectid = 0;
-	key.offset = 0;
-	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = READA_FORWARD;
-
-	cache_gen = btrfs_super_cache_generation(info->super_copy);
-	if (btrfs_test_opt(info, SPACE_CACHE) &&
-	    btrfs_super_generation(info->super_copy) != cache_gen)
-		need_clear = 1;
-	if (btrfs_test_opt(info, CLEAR_CACHE))
-		need_clear = 1;
-
-	while (1) {
-		ret = find_first_block_group(info, path, &key);
-		if (ret > 0)
-			break;
-		if (ret != 0)
-			goto error;
-
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		cache = btrfs_create_block_group_cache(info, found_key.objectid,
-						       found_key.offset);
-		if (!cache) {
-			ret = -ENOMEM;
-			goto error;
-		}
-
-		if (need_clear) {
-			/*
-			 * When we mount with old space cache, we need to
-			 * set BTRFS_DC_CLEAR and set dirty flag.
-			 *
-			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
-			 *    truncate the old free space cache inode and
-			 *    setup a new one.
-			 * b) Setting 'dirty flag' makes sure that we flush
-			 *    the new space cache info onto disk.
-			 */
-			if (btrfs_test_opt(info, SPACE_CACHE))
-				cache->disk_cache_state = BTRFS_DC_CLEAR;
-		}
-
-		read_extent_buffer(leaf, &cache->item,
-				   btrfs_item_ptr_offset(leaf, path->slots[0]),
-				   sizeof(cache->item));
-		cache->flags = btrfs_block_group_flags(&cache->item);
-		if (!mixed &&
-		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
-		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
-			btrfs_err(info,
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
-				  cache->key.objectid);
-			ret = -EINVAL;
-			goto error;
-		}
-
-		key.objectid = found_key.objectid + found_key.offset;
-		btrfs_release_path(path);
-
-		/*
-		 * We need to exclude the super stripes now so that the space
-		 * info has super bytes accounted for, otherwise we'll think
-		 * we have more space than we actually do.
-		 */
-		ret = exclude_super_stripes(cache);
-		if (ret) {
-			/*
-			 * We may have excluded something, so call this just in
-			 * case.
-			 */
-			free_excluded_extents(cache);
-			btrfs_put_block_group(cache);
-			goto error;
-		}
-
-		/*
-		 * check for two cases, either we are full, and therefore
-		 * don't need to bother with the caching work since we won't
-		 * find any space, or we are empty, and we can just add all
-		 * the space in and be done with it.  This saves us _a lot_ of
-		 * time, particularly in the full case.
-		 */
-		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
-			cache->last_byte_to_unpin = (u64)-1;
-			cache->cached = BTRFS_CACHE_FINISHED;
-			free_excluded_extents(cache);
-		} else if (btrfs_block_group_used(&cache->item) == 0) {
-			cache->last_byte_to_unpin = (u64)-1;
-			cache->cached = BTRFS_CACHE_FINISHED;
-			add_new_free_space(cache, found_key.objectid,
-					   found_key.objectid +
-					   found_key.offset);
-			free_excluded_extents(cache);
-		}
-
-		ret = btrfs_add_block_group_cache(info, cache);
-		if (ret) {
-			btrfs_remove_free_space_cache(cache);
-			btrfs_put_block_group(cache);
-			goto error;
-		}
-
-		trace_btrfs_add_block_group(info, cache, 0);
-		update_space_info(info, cache->flags, found_key.offset,
-				  btrfs_block_group_used(&cache->item),
-				  cache->bytes_super, &space_info);
-
-		cache->space_info = space_info;
-
-		link_block_group(cache);
-
-		set_avail_alloc_bits(info, cache->flags);
-		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
-			inc_block_group_ro(cache, 1);
-		} else if (btrfs_block_group_used(&cache->item) == 0) {
-			ASSERT(list_empty(&cache->bg_list));
-			btrfs_mark_bg_unused(cache);
-		}
-	}
-
-	list_for_each_entry_rcu(space_info, &info->space_info, list) {
-		if (!(get_alloc_profile(info, space_info->flags) &
-		      (BTRFS_BLOCK_GROUP_RAID10 |
-		       BTRFS_BLOCK_GROUP_RAID1 |
-		       BTRFS_BLOCK_GROUP_RAID5 |
-		       BTRFS_BLOCK_GROUP_RAID6 |
-		       BTRFS_BLOCK_GROUP_DUP)))
-			continue;
-		/*
-		 * avoid allocating from un-mirrored block group if there are
-		 * mirrored block groups.
-		 */
-		list_for_each_entry(cache,
-				&space_info->block_groups[BTRFS_RAID_RAID0],
-				list)
-			inc_block_group_ro(cache, 1);
-		list_for_each_entry(cache,
-				&space_info->block_groups[BTRFS_RAID_SINGLE],
-				list)
-			inc_block_group_ro(cache, 1);
-	}
-
-	btrfs_add_raid_kobjects(info);
-	init_global_block_rsv(info);
-	ret = check_chunk_block_group_mappings(info);
-error:
-	btrfs_free_path(path);
-	return ret;
-}
-
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_block_group_item item;
-	struct btrfs_key key;
-	int ret = 0;
-
-	if (!trans->can_flush_pending_bgs)
-		return;
-
-	while (!list_empty(&trans->new_bgs)) {
-		block_group = list_first_entry(&trans->new_bgs,
-					       struct btrfs_block_group_cache,
-					       bg_list);
-		if (ret)
-			goto next;
-
-		spin_lock(&block_group->lock);
-		memcpy(&item, &block_group->item, sizeof(item));
-		memcpy(&key, &block_group->key, sizeof(key));
-		spin_unlock(&block_group->lock);
-
-		ret = btrfs_insert_item(trans, extent_root, &key, &item,
-					sizeof(item));
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
-		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
-		add_block_group_free_space(trans, block_group);
-		/* already aborted the transaction if it failed. */
-next:
-		list_del_init(&block_group->bg_list);
-	}
-	btrfs_trans_release_chunk_metadata(trans);
-}
-
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-			   u64 type, u64 chunk_offset, u64 size)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *cache;
-	int ret;
-
-	btrfs_set_log_full_commit(fs_info, trans);
-
-	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
-	if (!cache)
-		return -ENOMEM;
-
-	btrfs_set_block_group_used(&cache->item, bytes_used);
-	btrfs_set_block_group_chunk_objectid(&cache->item,
-					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
-	btrfs_set_block_group_flags(&cache->item, type);
-
-	cache->flags = type;
-	cache->last_byte_to_unpin = (u64)-1;
-	cache->cached = BTRFS_CACHE_FINISHED;
-	cache->needs_free_space = 1;
-	ret = exclude_super_stripes(cache);
-	if (ret) {
-		/*
-		 * We may have excluded something, so call this just in
-		 * case.
-		 */
-		free_excluded_extents(cache);
-		btrfs_put_block_group(cache);
-		return ret;
-	}
-
-	add_new_free_space(cache, chunk_offset, chunk_offset + size);
-
-	free_excluded_extents(cache);
-
-#ifdef CONFIG_BTRFS_DEBUG
-	if (btrfs_should_fragment_free_space(cache)) {
-		u64 new_bytes_used = size - bytes_used;
-
-		bytes_used += new_bytes_used >> 1;
-		fragment_free_space(cache);
-	}
-#endif
-	/*
-	 * Ensure the corresponding space_info object is created and
-	 * assigned to our block group. We want our bg to be added to the rbtree
-	 * with its ->space_info set.
-	 */
-	cache->space_info = __find_space_info(fs_info, cache->flags);
-	ASSERT(cache->space_info);
-
-	ret = btrfs_add_block_group_cache(fs_info, cache);
-	if (ret) {
-		btrfs_remove_free_space_cache(cache);
-		btrfs_put_block_group(cache);
-		return ret;
-	}
-
-	/*
-	 * Now that our block group has its ->space_info set and is inserted in
-	 * the rbtree, update the space info's counters.
-	 */
-	trace_btrfs_add_block_group(fs_info, cache, 1);
-	update_space_info(fs_info, cache->flags, size, bytes_used,
-				cache->bytes_super, &cache->space_info);
-	update_global_block_rsv(fs_info);
-
-	link_block_group(cache);
-
-	list_add_tail(&cache->bg_list, &trans->new_bgs);
-
-	set_avail_alloc_bits(fs_info, type);
-	return 0;
-}
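
Under CONFIG_BTRFS_DEBUG, the fragment path in btrfs_make_block_group() above
deliberately over-reports usage: half of the block group's free space is
counted as used before update_space_info() runs, so fragment_free_space() has
room to punch holes. A worked example with assumed numbers:

	/* Assume a 1 GiB chunk with nothing allocated yet. */
	u64 size = SZ_1G, bytes_used = 0;
	u64 new_bytes_used = size - bytes_used;	/* 1 GiB free */

	bytes_used += new_bytes_used >> 1;	/* report 512 MiB as used */
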
-
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 extra_flags = chunk_to_extended(flags) &
-				BTRFS_EXTENDED_PROFILE_MASK;
-
-	write_seqlock(&fs_info->profiles_lock);
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		fs_info->avail_data_alloc_bits &= ~extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		fs_info->avail_system_alloc_bits &= ~extra_flags;
-	write_sequnlock(&fs_info->profiles_lock);
-}
-
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
-	struct btrfs_path *path;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_free_cluster *cluster;
-	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_key key;
-	struct inode *inode;
-	struct kobject *kobj = NULL;
-	int ret;
-	int index;
-	int factor;
-	struct btrfs_caching_control *caching_ctl = NULL;
-	bool remove_em;
-
-	block_group = btrfs_lookup_block_group(fs_info, group_start);
-	BUG_ON(!block_group);
-	BUG_ON(!block_group->ro);
-
-	trace_btrfs_remove_block_group(block_group);
-	/*
-	 * Free the reserved super bytes from this block group before
-	 * remove it.
-	 */
-	free_excluded_extents(block_group);
-	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
-				  block_group->key.offset);
-
-	memcpy(&key, &block_group->key, sizeof(key));
-	index = btrfs_bg_flags_to_raid_index(block_group->flags);
-	factor = btrfs_bg_type_to_factor(block_group->flags);
-
-	/* make sure this block group isn't part of an allocation cluster */
-	cluster = &fs_info->data_alloc_cluster;
-	spin_lock(&cluster->refill_lock);
-	btrfs_return_cluster_to_free_space(block_group, cluster);
-	spin_unlock(&cluster->refill_lock);
-
-	/*
-	 * make sure this block group isn't part of a metadata
-	 * allocation cluster
-	 */
-	cluster = &fs_info->meta_alloc_cluster;
-	spin_lock(&cluster->refill_lock);
-	btrfs_return_cluster_to_free_space(block_group, cluster);
-	spin_unlock(&cluster->refill_lock);
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * get the inode first so any iput calls done for the io_list
-	 * aren't the final iput (no unlinks allowed now)
-	 */
-	inode = lookup_free_space_inode(fs_info, block_group, path);
-
-	mutex_lock(&trans->transaction->cache_write_mutex);
-	/*
-	 * make sure our free space cache IO is done before removing the
-	 * free space inode
-	 */
-	spin_lock(&trans->transaction->dirty_bgs_lock);
-	if (!list_empty(&block_group->io_list)) {
-		list_del_init(&block_group->io_list);
-
-		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
-
-		spin_unlock(&trans->transaction->dirty_bgs_lock);
-		btrfs_wait_cache_io(trans, block_group, path);
-		btrfs_put_block_group(block_group);
-		spin_lock(&trans->transaction->dirty_bgs_lock);
-	}
-
-	if (!list_empty(&block_group->dirty_list)) {
-		list_del_init(&block_group->dirty_list);
-		btrfs_put_block_group(block_group);
-	}
-	spin_unlock(&trans->transaction->dirty_bgs_lock);
-	mutex_unlock(&trans->transaction->cache_write_mutex);
-
-	if (!IS_ERR(inode)) {
-		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-		if (ret) {
-			btrfs_add_delayed_iput(inode);
-			goto out;
-		}
-		clear_nlink(inode);
-		/* One for the block groups ref */
-		spin_lock(&block_group->lock);
-		if (block_group->iref) {
-			block_group->iref = 0;
-			block_group->inode = NULL;
-			spin_unlock(&block_group->lock);
-			iput(inode);
-		} else {
-			spin_unlock(&block_group->lock);
-		}
-		/* One for our lookup ref */
-		btrfs_add_delayed_iput(inode);
-	}
-
-	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-	key.offset = block_group->key.objectid;
-	key.type = 0;
-
-	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
-	if (ret < 0)
-		goto out;
-	if (ret > 0)
-		btrfs_release_path(path);
-	if (ret == 0) {
-		ret = btrfs_del_item(trans, tree_root, path);
-		if (ret)
-			goto out;
-		btrfs_release_path(path);
-	}
-
-	spin_lock(&fs_info->block_group_cache_lock);
-	rb_erase(&block_group->cache_node,
-		 &fs_info->block_group_cache_tree);
-	RB_CLEAR_NODE(&block_group->cache_node);
-
-	if (fs_info->first_logical_byte == block_group->key.objectid)
-		fs_info->first_logical_byte = (u64)-1;
-	spin_unlock(&fs_info->block_group_cache_lock);
-
-	down_write(&block_group->space_info->groups_sem);
-	/*
-	 * we must use list_del_init so people can check to see if they
-	 * are still on the list after taking the semaphore
-	 */
-	list_del_init(&block_group->list);
-	if (list_empty(&block_group->space_info->block_groups[index])) {
-		kobj = block_group->space_info->block_group_kobjs[index];
-		block_group->space_info->block_group_kobjs[index] = NULL;
-		clear_avail_alloc_bits(fs_info, block_group->flags);
-	}
-	up_write(&block_group->space_info->groups_sem);
-	if (kobj) {
-		kobject_del(kobj);
-		kobject_put(kobj);
-	}
-
-	if (block_group->has_caching_ctl)
-		caching_ctl = get_caching_control(block_group);
-	if (block_group->cached == BTRFS_CACHE_STARTED)
-		wait_block_group_cache_done(block_group);
-	if (block_group->has_caching_ctl) {
-		down_write(&fs_info->commit_root_sem);
-		if (!caching_ctl) {
-			struct btrfs_caching_control *ctl;
-
-			list_for_each_entry(ctl,
-				    &fs_info->caching_block_groups, list)
-				if (ctl->block_group == block_group) {
-					caching_ctl = ctl;
-					refcount_inc(&caching_ctl->count);
-					break;
-				}
-		}
-		if (caching_ctl)
-			list_del_init(&caching_ctl->list);
-		up_write(&fs_info->commit_root_sem);
-		if (caching_ctl) {
-			/* Once for the caching bgs list and once for us. */
-			put_caching_control(caching_ctl);
-			put_caching_control(caching_ctl);
-		}
-	}
-
-	spin_lock(&trans->transaction->dirty_bgs_lock);
-	if (!list_empty(&block_group->dirty_list)) {
-		WARN_ON(1);
-	}
-	if (!list_empty(&block_group->io_list)) {
-		WARN_ON(1);
-	}
-	spin_unlock(&trans->transaction->dirty_bgs_lock);
-	btrfs_remove_free_space_cache(block_group);
-
-	spin_lock(&block_group->space_info->lock);
-	list_del_init(&block_group->ro_list);
-
-	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
-		WARN_ON(block_group->space_info->total_bytes
-			< block_group->key.offset);
-		WARN_ON(block_group->space_info->bytes_readonly
-			< block_group->key.offset);
-		WARN_ON(block_group->space_info->disk_total
-			< block_group->key.offset * factor);
-	}
-	block_group->space_info->total_bytes -= block_group->key.offset;
-	block_group->space_info->bytes_readonly -= block_group->key.offset;
-	block_group->space_info->disk_total -= block_group->key.offset * factor;
-
-	spin_unlock(&block_group->space_info->lock);
-
-	memcpy(&key, &block_group->key, sizeof(key));
-
-	mutex_lock(&fs_info->chunk_mutex);
-	if (!list_empty(&em->list)) {
-		/* We're in the transaction->pending_chunks list. */
-		free_extent_map(em);
-	}
-	spin_lock(&block_group->lock);
-	block_group->removed = 1;
-	/*
-	 * At this point trimming can't start on this block group, because we
-	 * removed the block group from the tree fs_info->block_group_cache_tree
-	 * so no one can find it anymore, and even if someone already got this
-	 * block group before we removed it from the rbtree, they have already
-	 * incremented block_group->trimming - if they didn't, they won't find
-	 * any free space entries because we already removed them all when we
-	 * called btrfs_remove_free_space_cache().
-	 *
-	 * And we must not remove the extent map from the fs_info->mapping_tree
-	 * to prevent the same logical address range and physical device space
-	 * ranges from being reused for a new block group. This is because our
-	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
-	 * completely transactionless, so while it is trimming a range the
-	 * currently running transaction might finish and a new one start,
-	 * allowing for new block groups to be created that can reuse the same
-	 * physical device locations unless we take this special care.
-	 *
-	 * There may also be an implicit trim operation if the file system
-	 * is mounted with -odiscard. The same protections must remain
-	 * in place until the extents have been discarded completely when
-	 * the transaction commit has completed.
-	 */
-	remove_em = (atomic_read(&block_group->trimming) == 0);
-	/*
-	 * Make sure a trimmer task always sees the em in the pinned_chunks list
-	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
-	 * before checking block_group->removed).
-	 */
-	if (!remove_em) {
-		/*
-		 * Our em might be in trans->transaction->pending_chunks which
-		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
-		 * and so is the fs_info->pinned_chunks list.
-		 *
-		 * So at this point we must be holding the chunk_mutex to avoid
-		 * any races with chunk allocation (more specifically at
-		 * volumes.c:contains_pending_extent()), to ensure it always
-		 * sees the em, either in the pending_chunks list or in the
-		 * pinned_chunks list.
-		 */
-		list_move_tail(&em->list, &fs_info->pinned_chunks);
-	}
-	spin_unlock(&block_group->lock);
-
-	if (remove_em) {
-		struct extent_map_tree *em_tree;
-
-		em_tree = &fs_info->mapping_tree.map_tree;
-		write_lock(&em_tree->lock);
-		/*
-		 * The em might be in the pending_chunks list, so make sure the
-		 * chunk mutex is locked, since remove_extent_mapping() will
-		 * delete us from that list.
-		 */
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		/* once for the tree */
-		free_extent_map(em);
-	}
-
-	mutex_unlock(&fs_info->chunk_mutex);
-
-	ret = remove_block_group_free_space(trans, block_group);
-	if (ret)
-		goto out;
-
-	btrfs_put_block_group(block_group);
-	btrfs_put_block_group(block_group);
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret > 0)
-		ret = -EIO;
-	if (ret < 0)
-		goto out;
-
-	ret = btrfs_del_item(trans, root, path);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-struct btrfs_trans_handle *
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
-				     const u64 chunk_offset)
-{
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
-	unsigned int num_items;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	read_unlock(&em_tree->lock);
-	ASSERT(em && em->start == chunk_offset);
-
-	/*
-	 * We need to reserve 3 + N units from the metadata space info in order
-	 * to remove a block group (done at btrfs_remove_chunk() and at
-	 * btrfs_remove_block_group()), which are used for:
-	 *
-	 * 1 unit for adding the free space inode's orphan (located in the tree
-	 * of tree roots).
-	 * 1 unit for deleting the block group item (located in the extent
-	 * tree).
-	 * 1 unit for deleting the free space item (located in tree of tree
-	 * roots).
-	 * N units for deleting N device extent items corresponding to each
-	 * stripe (located in the device tree).
-	 *
-	 * In order to remove a block group we also need to reserve units in the
-	 * system space info in order to update the chunk tree (update one or
-	 * more device items and remove one chunk item), but this is done at
-	 * btrfs_remove_chunk() through a call to check_system_chunk().
-	 */
-	map = em->map_lookup;
-	num_items = 3 + map->num_stripes;
-	free_extent_map(em);
-
-	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
-							   num_items, 1);
-}
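
As the comment in btrfs_start_trans_remove_block_group() explains, the
reservation is 3 + N units, where N is the stripe count of the chunk being
removed. A worked example, assuming a two-stripe (e.g. RAID1) chunk:

	/* 1 orphan item + 1 block group item + 1 free space item
	 * + 2 device extent items = 5 metadata units reserved. */
	unsigned int num_stripes = 2;
	unsigned int num_items = 3 + num_stripes;	/* == 5 */
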
-
-/*
- * Process the unused_bgs list and remove any that don't have any allocated
- * space inside of them.
- */
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_space_info *space_info;
-	struct btrfs_trans_handle *trans;
-	int ret = 0;
-
-	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
-		return;
-
-	spin_lock(&fs_info->unused_bgs_lock);
-	while (!list_empty(&fs_info->unused_bgs)) {
-		u64 start, end;
-		int trimming;
-
-		block_group = list_first_entry(&fs_info->unused_bgs,
-					       struct btrfs_block_group_cache,
-					       bg_list);
-		list_del_init(&block_group->bg_list);
-
-		space_info = block_group->space_info;
-
-		if (ret || btrfs_mixed_space_info(space_info)) {
-			btrfs_put_block_group(block_group);
-			continue;
-		}
-		spin_unlock(&fs_info->unused_bgs_lock);
-
-		mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
-		/* Don't want to race with allocators so take the groups_sem */
-		down_write(&space_info->groups_sem);
-		spin_lock(&block_group->lock);
-		if (block_group->reserved || block_group->pinned ||
-		    btrfs_block_group_used(&block_group->item) ||
-		    block_group->ro ||
-		    list_is_singular(&block_group->list)) {
-			/*
-			 * We want to bail if we made new allocations or have
-			 * outstanding allocations in this block group.  We do
-			 * the ro check in case balance is currently acting on
-			 * this block group.
-			 */
-			trace_btrfs_skip_unused_block_group(block_group);
-			spin_unlock(&block_group->lock);
-			up_write(&space_info->groups_sem);
-			goto next;
-		}
-		spin_unlock(&block_group->lock);
-
-		/* We don't want to force the issue, only flip if it's ok. */
-		ret = inc_block_group_ro(block_group, 0);
-		up_write(&space_info->groups_sem);
-		if (ret < 0) {
-			ret = 0;
-			goto next;
-		}
-
-		/*
-		 * Want to do this before we do anything else so we can recover
-		 * properly if we fail to join the transaction.
-		 */
-		trans = btrfs_start_trans_remove_block_group(fs_info,
-						     block_group->key.objectid);
-		if (IS_ERR(trans)) {
-			btrfs_dec_block_group_ro(block_group);
-			ret = PTR_ERR(trans);
-			goto next;
-		}
-
-		/*
-		 * We could have pending pinned extents for this block group,
-		 * just delete them, we don't care about them anymore.
-		 */
-		start = block_group->key.objectid;
-		end = start + block_group->key.offset - 1;
-		/*
-		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
-		 * btrfs_finish_extent_commit(). If we are at transaction N,
-		 * another task might be running finish_extent_commit() for the
-		 * previous transaction N - 1, and have seen a range belonging
-		 * to the block group in freed_extents[] before we were able to
-		 * clear the whole block group range from freed_extents[]. This
-		 * means that task can lookup for the block group after we
-		 * unpinned it from freed_extents[] and removed it, leading to
-		 * a BUG_ON() at btrfs_unpin_extent_range().
-		 */
-		mutex_lock(&fs_info->unused_bg_unpin_mutex);
-		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
-				  EXTENT_DIRTY);
-		if (ret) {
-			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-			btrfs_dec_block_group_ro(block_group);
-			goto end_trans;
-		}
-		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
-				  EXTENT_DIRTY);
-		if (ret) {
-			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-			btrfs_dec_block_group_ro(block_group);
-			goto end_trans;
-		}
-		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-
-		/* Reset pinned so btrfs_put_block_group doesn't complain */
-		spin_lock(&space_info->lock);
-		spin_lock(&block_group->lock);
-
-		space_info->bytes_pinned -= block_group->pinned;
-		space_info->bytes_readonly += block_group->pinned;
-		percpu_counter_add_batch(&space_info->total_bytes_pinned,
-				   -block_group->pinned,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
-		block_group->pinned = 0;
-
-		spin_unlock(&block_group->lock);
-		spin_unlock(&space_info->lock);
-
-		/* DISCARD can flip during remount */
-		trimming = btrfs_test_opt(fs_info, DISCARD);
-
-		/* Implicit trim during transaction commit. */
-		if (trimming)
-			btrfs_get_block_group_trimming(block_group);
-
-		/*
-		 * Btrfs_remove_chunk will abort the transaction if things go
-		 * horribly wrong.
-		 */
-		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
-
-		if (ret) {
-			if (trimming)
-				btrfs_put_block_group_trimming(block_group);
-			goto end_trans;
-		}
-
-		/*
-		 * If we're not mounted with -odiscard, we can just forget
-		 * about this block group. Otherwise we'll need to wait
-		 * until transaction commit to do the actual discard.
-		 */
-		if (trimming) {
-			spin_lock(&fs_info->unused_bgs_lock);
-			/*
-			 * A concurrent scrub might have added us to the list
-			 * fs_info->unused_bgs, so use a list_move operation
-			 * to add the block group to the deleted_bgs list.
-			 */
-			list_move(&block_group->bg_list,
-				  &trans->transaction->deleted_bgs);
-			spin_unlock(&fs_info->unused_bgs_lock);
-			btrfs_get_block_group(block_group);
-		}
-end_trans:
-		btrfs_end_transaction(trans);
-next:
-		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-		btrfs_put_block_group(block_group);
-		spin_lock(&fs_info->unused_bgs_lock);
-	}
-	spin_unlock(&fs_info->unused_bgs_lock);
-}
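
The "reset pinned" block inside btrfs_delete_unused_bgs() above shifts any
still-pinned bytes of the doomed group from pinned to readonly so the
space_info counters stay balanced once the group is gone. A sketch, assuming
128 MiB is still pinned:

	/* 128 MiB leaves bytes_pinned and enters bytes_readonly; the
	 * group itself then reports zero pinned bytes. */
	space_info->bytes_pinned -= block_group->pinned;	/* -128 MiB */
	space_info->bytes_readonly += block_group->pinned;	/* +128 MiB */
	block_group->pinned = 0;
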
-
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_super_block *disk_super;
-	u64 features;
-	u64 flags;
-	int mixed = 0;
-	int ret;
-
-	disk_super = fs_info->super_copy;
-	if (!btrfs_super_root(disk_super))
-		return -EINVAL;
-
-	features = btrfs_super_incompat_flags(disk_super);
-	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
-		mixed = 1;
-
-	flags = BTRFS_BLOCK_GROUP_SYSTEM;
-	ret = create_space_info(fs_info, flags);
-	if (ret)
-		goto out;
-
-	if (mixed) {
-		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
-		ret = create_space_info(fs_info, flags);
-	} else {
-		flags = BTRFS_BLOCK_GROUP_METADATA;
-		ret = create_space_info(fs_info, flags);
-		if (ret)
-			goto out;
-
-		flags = BTRFS_BLOCK_GROUP_DATA;
-		ret = create_space_info(fs_info, flags);
-	}
-out:
-	return ret;
-}
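
The effect of btrfs_init_space_info() is easiest to see by the space infos it
leaves behind; a summary derived from the branches above:

	/* mixed groups enabled:  SYSTEM, METADATA|DATA   (two space infos)
	 * default:               SYSTEM, METADATA, DATA  (three space infos) */
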
-
 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
 				   u64 start, u64 end)
 {
@@ -10784,10 +5535,9 @@
  * it while performing the free space search since we have already
  * held back allocations.
  */
-static int btrfs_trim_free_extents(struct btrfs_device *device,
-				   u64 minlen, u64 *trimmed)
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
 {
-	u64 start = 0, len = 0;
+	u64 start = SZ_1M, len = 0, end = 0;
 	int ret;
 
 	*trimmed = 0;
@@ -10796,7 +5546,7 @@
 	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
 		return 0;
 
-	/* Not writeable = nothing to do. */
+	/* Not writable = nothing to do. */
 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
 		return 0;
 
@@ -10808,43 +5558,41 @@
 
 	while (1) {
 		struct btrfs_fs_info *fs_info = device->fs_info;
-		struct btrfs_transaction *trans;
 		u64 bytes;
 
 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
 		if (ret)
 			break;
 
-		ret = down_read_killable(&fs_info->commit_root_sem);
-		if (ret) {
+		find_first_clear_extent_bit(&device->alloc_state, start,
+					    &start, &end,
+					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+		/* Ensure we skip the reserved area in the first 1M */
+		start = max_t(u64, start, SZ_1M);
+
+		/*
+		 * If find_first_clear_extent_bit() finds a range that spans the
+		 * end of the device, it sets end to -1; in that case it's up
+		 * to the caller to trim the value to the size of the device.
+		 */
+		end = min(end, device->total_bytes - 1);
+
+		len = end - start + 1;
+
+		/* We didn't find any extents */
+		if (!len) {
 			mutex_unlock(&fs_info->chunk_mutex);
+			ret = 0;
 			break;
 		}
 
-		spin_lock(&fs_info->trans_lock);
-		trans = fs_info->running_transaction;
-		if (trans)
-			refcount_inc(&trans->use_count);
-		spin_unlock(&fs_info->trans_lock);
-
-		if (!trans)
-			up_read(&fs_info->commit_root_sem);
-
-		ret = find_free_dev_extent_start(trans, device, minlen, start,
-						 &start, &len);
-		if (trans) {
-			up_read(&fs_info->commit_root_sem);
-			btrfs_put_transaction(trans);
-		}
-
-		if (ret) {
-			mutex_unlock(&fs_info->chunk_mutex);
-			if (ret == -ENOSPC)
-				ret = 0;
-			break;
-		}
-
-		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+		ret = btrfs_issue_discard(device->bdev, start, len,
+					  &bytes);
+		if (!ret)
+			set_extent_bits(&device->alloc_state, start,
+					start + bytes - 1,
+					CHUNK_TRIMMED);
 		mutex_unlock(&fs_info->chunk_mutex);
 
 		if (ret)
@@ -10879,6 +5627,7 @@
 	struct btrfs_device *device;
 	struct list_head *devices;
 	u64 group_trimmed;
+	u64 range_end = U64_MAX;
 	u64 start;
 	u64 end;
 	u64 trimmed = 0;
@@ -10888,26 +5637,33 @@
 	int dev_ret = 0;
 	int ret = 0;
 
+	/*
+	 * Check range overflow if range->len is set.
+	 * The default range->len is U64_MAX.
+	 */
+	if (range->len != U64_MAX &&
+	    check_add_overflow(range->start, range->len, &range_end))
+		return -EINVAL;
+
 	cache = btrfs_lookup_first_block_group(fs_info, range->start);
-	for (; cache; cache = next_block_group(fs_info, cache)) {
-		if (cache->key.objectid >= (range->start + range->len)) {
+	for (; cache; cache = btrfs_next_block_group(cache)) {
+		if (cache->key.objectid >= range_end) {
 			btrfs_put_block_group(cache);
 			break;
 		}
 
 		start = max(range->start, cache->key.objectid);
-		end = min(range->start + range->len,
-				cache->key.objectid + cache->key.offset);
+		end = min(range_end, cache->key.objectid + cache->key.offset);
 
 		if (end - start >= range->minlen) {
-			if (!block_group_cache_done(cache)) {
-				ret = cache_block_group(cache, 0);
+			if (!btrfs_block_group_cache_done(cache)) {
+				ret = btrfs_cache_block_group(cache, 0);
 				if (ret) {
 					bg_failed++;
 					bg_ret = ret;
 					continue;
 				}
-				ret = wait_block_group_cache_done(cache);
+				ret = btrfs_wait_block_group_cache_done(cache);
 				if (ret) {
 					bg_failed++;
 					bg_ret = ret;
@@ -10936,8 +5692,7 @@
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
-		ret = btrfs_trim_free_extents(device, range->minlen,
-					      &group_trimmed);
+		ret = btrfs_trim_free_extents(device, &group_trimmed);
 		if (ret) {
 			dev_failed++;
 			dev_ret = ret;
@@ -11001,16 +5756,3 @@
 			       !atomic_read(&root->will_be_snapshotted));
 	}
 }
-
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
-{
-	struct btrfs_fs_info *fs_info = bg->fs_info;
-
-	spin_lock(&fs_info->unused_bgs_lock);
-	if (list_empty(&bg->bg_list)) {
-		btrfs_get_block_group(bg);
-		trace_btrfs_add_unused_block_group(bg);
-		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
-	}
-	spin_unlock(&fs_info->unused_bgs_lock);
-}
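
For context, the producer side of this helper appears earlier in this diff in
btrfs_read_block_groups(): a freshly loaded block group with zero used bytes
is queued for deletion. A sketch of that call site:

	/* Sketch of the caller; names as in btrfs_read_block_groups(). */
	if (btrfs_block_group_used(&cache->item) == 0) {
		ASSERT(list_empty(&cache->bg_list));
		/* picked up later by btrfs_delete_unused_bgs() */
		btrfs_mark_bg_unused(cache);
	}
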
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faa..cceaf05 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -89,9 +89,18 @@
 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 		struct extent_io_tree *tree, u64 start, u64 end)
 {
-	if (tree->ops && tree->ops->check_extent_io_range)
-		tree->ops->check_extent_io_range(tree->private_data, caller,
-						 start, end);
+	struct inode *inode = tree->private_data;
+	u64 isize;
+
+	if (!inode || !is_data_inode(inode))
+		return;
+
+	isize = i_size_read(inode);
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
+			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+	}
 }
 #else
 #define btrfs_leak_debug_add(new, head)	do {} while (0)
@@ -100,8 +109,6 @@
 #define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
 #endif
 
-#define BUFFER_LRU_MAX 64
-
 struct tree_entry {
 	u64 start;
 	u64 end;
@@ -138,7 +145,56 @@
 	return ret;
 }
 
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+				       unsigned long bio_flags)
+{
+	blk_status_t ret = 0;
+	struct extent_io_tree *tree = bio->bi_private;
+
+	bio->bi_private = NULL;
+
+	if (tree->ops)
+		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+						 mirror_num, bio_flags);
+	else
+		btrfsic_submit_bio(bio);
+
+	return blk_status_to_errno(ret);
+}
+
+/* Cleanup unsubmitted bios */
+static void end_write_bio(struct extent_page_data *epd, int ret)
+{
+	if (epd->bio) {
+		epd->bio->bi_status = errno_to_blk_status(ret);
+		bio_endio(epd->bio);
+		epd->bio = NULL;
+	}
+}
+
+/*
+ * Submit bio from extent page data via submit_one_bio
+ *
+ * Return 0 if everything is OK.
+ * Return <0 for error.
+ */
+static int __must_check flush_write_bio(struct extent_page_data *epd)
+{
+	int ret = 0;
+
+	if (epd->bio) {
+		ret = submit_one_bio(epd->bio, 0, 0);
+		/*
+		 * Clean up of epd->bio is handled by its endio function.
+		 * And endio is either triggered by successful bio execution
+		 * or by the error handler of the submit bio hook.
+		 * So at this point, no matter what happened, we don't need
+		 * to clean up epd->bio.
+		 */
+		epd->bio = NULL;
+	}
+	return ret;
+}
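
The split between end_write_bio() and flush_write_bio() gives callers two
cleanup paths: fail a bio that was never submitted, or submit it and let the
endio handler clean up. A hypothetical caller illustrating both (the helper
name do_some_writeback is invented for this sketch):

	static int example_writepages(struct extent_page_data *epd)
	{
		int ret = do_some_writeback(epd);	/* hypothetical */

		if (ret) {
			/* Fail the unsubmitted bio with the real errno. */
			end_write_bio(epd, ret);
			return ret;
		}
		/* Submit; epd->bio cleanup is handled by its endio. */
		return flush_write_bio(epd);
	}
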
 
 int __init extent_io_init(void)
 {
@@ -191,14 +247,46 @@
 	bioset_exit(&btrfs_bioset);
 }
 
-void extent_io_tree_init(struct extent_io_tree *tree,
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+			 struct extent_io_tree *tree, unsigned int owner,
 			 void *private_data)
 {
+	tree->fs_info = fs_info;
 	tree->state = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
 	tree->private_data = private_data;
+	tree->owner = owner;
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+	spin_lock(&tree->lock);
+	/*
+	 * Do a single barrier for the waitqueue_active check here, the state
+	 * of the waitqueue should not change once extent_io_tree_release is
+	 * called.
+	 */
+	smp_mb();
+	while (!RB_EMPTY_ROOT(&tree->state)) {
+		struct rb_node *node;
+		struct extent_state *state;
+
+		node = rb_first(&tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		rb_erase(&state->rb_node, &tree->state);
+		RB_CLEAR_NODE(&state->rb_node);
+		/*
+		 * btree io trees aren't supposed to have tasks waiting for
+		 * changes in the flags of extent states ever.
+		 */
+		ASSERT(!waitqueue_active(&state->wq));
+		free_extent_state(state);
+
+		cond_resched_lock(&tree->lock);
+	}
+	spin_unlock(&tree->lock);
 }
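
extent_io_tree_init() and extent_io_tree_release() are expected to pair up; a
minimal sketch, assuming IO_TREE_SELFTEST is one of the tracing owner ids
available in this kernel version:

	struct extent_io_tree tree;

	extent_io_tree_init(fs_info, &tree, IO_TREE_SELFTEST, NULL);
	/* ... set/clear extent bits on the tree ... */
	extent_io_tree_release(&tree);	/* frees leftover extent states */
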
 
 static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -271,9 +359,27 @@
 	return NULL;
 }
 
+/**
+ * __etree_search - search @tree for an entry that contains @offset. Such
+ * an entry would have entry->start <= offset && entry->end >= offset.
+ *
+ * @tree - the tree to search
+ * @offset - offset that should fall within an entry in @tree
+ * @next_ret - pointer to the first entry whose range ends after @offset
+ * @prev_ret - pointer to the first entry whose range begins before @offset
+ * @p_ret - pointer where new node should be anchored (used when inserting an
+ *	    entry in the tree)
+ * @parent_ret - points to the entry which would have been the parent of the
+ *               entry containing @offset
+ *
+ * This function returns a pointer to the entry that contains @offset byte
+ * address. If no such entry exists, then NULL is returned and the other
+ * pointer arguments to the function are filled, otherwise the found entry is
+ * returned and other pointers are left untouched.
+ */
 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
-				      struct rb_node **prev_ret,
 				      struct rb_node **next_ret,
+				      struct rb_node **prev_ret,
 				      struct rb_node ***p_ret,
 				      struct rb_node **parent_ret)
 {
@@ -302,23 +408,23 @@
 	if (parent_ret)
 		*parent_ret = prev;
 
-	if (prev_ret) {
+	if (next_ret) {
 		orig_prev = prev;
 		while (prev && offset > prev_entry->end) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
-		*prev_ret = prev;
+		*next_ret = prev;
 		prev = orig_prev;
 	}
 
-	if (next_ret) {
+	if (prev_ret) {
 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
-		*next_ret = prev;
+		*prev_ret = prev;
 	}
 	return NULL;
 }
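
A worked example of the next/prev semantics, assuming a tree holding the
ranges [0,10] and [20,30]:

	/* __etree_search(tree, 15, &next, &prev, NULL, NULL) returns NULL:
	 *   *next -> [20,30]  (first entry whose range ends after 15)
	 *   *prev -> [0,10]   (first entry whose range begins before 15)
	 * __etree_search(tree, 5, ...) returns the [0,10] node directly. */
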
@@ -329,12 +435,12 @@
 		       struct rb_node ***p_ret,
 		       struct rb_node **parent_ret)
 {
-	struct rb_node *prev = NULL;
+	struct rb_node *next = NULL;
 	struct rb_node *ret;
 
-	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
 	if (!ret)
-		return prev;
+		return next;
 	return ret;
 }
 
@@ -344,13 +450,6 @@
 	return tree_search_for_insert(tree, offset, NULL, NULL);
 }
 
-static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
-		     struct extent_state *other)
-{
-	if (tree->ops && tree->ops->merge_extent_hook)
-		tree->ops->merge_extent_hook(tree->private_data, new, other);
-}
-
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -366,7 +465,7 @@
 	struct extent_state *other;
 	struct rb_node *other_node;
 
-	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 		return;
 
 	other_node = rb_prev(&state->rb_node);
@@ -374,7 +473,10 @@
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->end == state->start - 1 &&
 		    other->state == state->state) {
-			merge_cb(tree, state, other);
+			if (tree->private_data &&
+			    is_data_inode(tree->private_data))
+				btrfs_merge_delalloc_extent(tree->private_data,
+							    state, other);
 			state->start = other->start;
 			rb_erase(&other->rb_node, &tree->state);
 			RB_CLEAR_NODE(&other->rb_node);
@@ -386,7 +488,10 @@
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->start == state->end + 1 &&
 		    other->state == state->state) {
-			merge_cb(tree, state, other);
+			if (tree->private_data &&
+			    is_data_inode(tree->private_data))
+				btrfs_merge_delalloc_extent(tree->private_data,
+							    state, other);
 			state->end = other->end;
 			rb_erase(&other->rb_node, &tree->state);
 			RB_CLEAR_NODE(&other->rb_node);
@@ -395,20 +500,6 @@
 	}
 }
 
-static void set_state_cb(struct extent_io_tree *tree,
-			 struct extent_state *state, unsigned *bits)
-{
-	if (tree->ops && tree->ops->set_bit_hook)
-		tree->ops->set_bit_hook(tree->private_data, state, bits);
-}
-
-static void clear_state_cb(struct extent_io_tree *tree,
-			   struct extent_state *state, unsigned *bits)
-{
-	if (tree->ops && tree->ops->clear_bit_hook)
-		tree->ops->clear_bit_hook(tree->private_data, state, bits);
-}
-
 static void set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state, unsigned *bits,
 			   struct extent_changeset *changeset);
@@ -431,9 +522,11 @@
 {
 	struct rb_node *node;
 
-	if (end < start)
-		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
-		       end, start);
+	if (end < start) {
+		btrfs_err(tree->fs_info,
+			"insert state: end < start %llu %llu", end, start);
+		WARN_ON(1);
+	}
 	state->start = start;
 	state->end = end;
 
@@ -443,7 +536,8 @@
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
+		btrfs_err(tree->fs_info,
+		       "found node %llu %llu on insert of %llu %llu",
 		       found->start, found->end, start, end);
 		return -EEXIST;
 	}
@@ -451,13 +545,6 @@
 	return 0;
 }
 
-static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
-		     u64 split)
-{
-	if (tree->ops && tree->ops->split_extent_hook)
-		tree->ops->split_extent_hook(tree->private_data, orig, split);
-}
-
 /*
  * split a given extent state struct in two, inserting the preallocated
  * struct 'prealloc' as the newly created second half.  'split' indicates an
@@ -477,7 +564,8 @@
 {
 	struct rb_node *node;
 
-	split_cb(tree, orig, split);
+	if (tree->private_data && is_data_inode(tree->private_data))
+		btrfs_split_delalloc_extent(tree->private_data, orig, split);
 
 	prealloc->start = orig->start;
 	prealloc->end = split - 1;
@@ -504,7 +592,7 @@
 
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1).
+ * it will optionally wake up anyone waiting on this state (wake == 1).
  *
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
@@ -523,7 +611,10 @@
 		WARN_ON(range > tree->dirty_bytes);
 		tree->dirty_bytes -= range;
 	}
-	clear_state_cb(tree, state, bits);
+
+	if (tree->private_data && is_data_inode(tree->private_data))
+		btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
 	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 	BUG_ON(ret < 0);
 	state->state &= ~bits_to_clear;
@@ -588,15 +679,15 @@
 	int clear = 0;
 
 	btrfs_debug_check_extent_io_range(tree, start, end);
+	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
 
 	if (bits & EXTENT_DELALLOC)
 		bits |= EXTENT_NORESERVE;
 
 	if (delete)
 		bits |= ~EXTENT_CTLBITS;
-	bits |= EXTENT_FIRST_DELALLOC;
 
-	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 		clear = 1;
 again:
 	if (!prealloc && gfpflags_allow_blocking(mask)) {
@@ -800,7 +891,9 @@
 	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 	int ret;
 
-	set_state_cb(tree, state, bits);
+	if (tree->private_data && is_data_inode(tree->private_data))
+		btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
@@ -826,7 +919,7 @@
 			struct extent_state **cached_ptr)
 {
 	return cache_state_if_flags(state, cached_ptr,
-				    EXTENT_IOBITS | EXTENT_BOUNDARY);
+				    EXTENT_LOCKED | EXTENT_BOUNDARY);
 }
 
 /*
@@ -856,8 +949,8 @@
 	u64 last_end;
 
 	btrfs_debug_check_extent_io_range(tree, start, end);
+	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
 
-	bits |= EXTENT_FIRST_DELALLOC;
 again:
 	if (!prealloc && gfpflags_allow_blocking(mask)) {
 		/*
@@ -1089,6 +1182,8 @@
 	bool first_iteration = true;
 
 	btrfs_debug_check_extent_io_range(tree, start, end);
+	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+				       clear_bits);
 
 again:
 	if (!prealloc) {
@@ -1288,6 +1383,13 @@
 				changeset);
 }
 
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+			   unsigned bits)
+{
+	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+				GFP_NOWAIT, NULL);
+}
+
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
 		     struct extent_state **cached)
@@ -1424,20 +1526,15 @@
 			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
-	struct rb_node *n;
 	int ret = 1;
 
 	spin_lock(&tree->lock);
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->end == start - 1 && extent_state_in_tree(state)) {
-			n = rb_next(&state->rb_node);
-			while (n) {
-				state = rb_entry(n, struct extent_state,
-						 rb_node);
+			while ((state = next_state(state)) != NULL) {
 				if (state->state & bits)
 					goto got_it;
-				n = rb_next(n);
 			}
 			free_extent_state(*cached_state);
 			*cached_state = NULL;
@@ -1460,20 +1557,133 @@
 	return ret;
 }
 
+/**
+ * find_first_clear_extent_bit - find the first range that has @bits not set.
+ * This range could start before @start.
+ *
+ * @tree - the tree to search
+ * @start - the offset at/after which the found extent should start
+ * @start_ret - records the beginning of the range
+ * @end_ret - records the end of the range (inclusive)
+ * @bits - the set of bits which must be unset
+ *
+ * Since an unallocated range is also considered one which doesn't have the
+ * bits set, it's possible that @end_ret contains -1; this happens when the
+ * range spans (last_range_end, end of device]. In this case it's up to the
+ * caller to trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+				 u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+	struct extent_state *state;
+	struct rb_node *node, *prev = NULL, *next;
+
+	spin_lock(&tree->lock);
+
+	/* Find first extent with bits cleared */
+	while (1) {
+		node = __etree_search(tree, start, &next, &prev, NULL, NULL);
+		if (!node) {
+			node = next;
+			if (!node) {
+				/*
+				 * We are past the last allocated chunk,
+				 * set start at the end of the last extent. The
+				 * device alloc tree should never be empty so
+				 * prev is always set.
+				 */
+				ASSERT(prev);
+				state = rb_entry(prev, struct extent_state, rb_node);
+				*start_ret = state->end + 1;
+				*end_ret = -1;
+				goto out;
+			}
+		}
+		/*
+		 * At this point 'node' either contains 'start' or start is
+		 * before 'node'
+		 */
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (in_range(start, state->start, state->end - state->start + 1)) {
+			if (state->state & bits) {
+				/*
+				 * |--range with bits sets--|
+				 *    |
+				 *    start
+				 */
+				start = state->end + 1;
+			} else {
+				/*
+				 * 'start' falls within a range that doesn't
+				 * have the bits set, so take its start as
+				 * the beginning of the desired range
+				 *
+				 * |--range with bits cleared----|
+				 *      |
+				 *      start
+				 */
+				*start_ret = state->start;
+				break;
+			}
+		} else {
+			/*
+			 * |---prev range---|---hole/unset---|---node range---|
+			 *                          |
+			 *                        start
+			 *
+			 *                        or
+			 *
+			 * |---hole/unset--||--first node--|
+			 * 0   |
+			 *    start
+			 */
+			if (prev) {
+				state = rb_entry(prev, struct extent_state,
+						 rb_node);
+				*start_ret = state->end + 1;
+			} else {
+				*start_ret = 0;
+			}
+			break;
+		}
+	}
+
+	/*
+	 * Find the longest stretch from start until an entry which has the
+	 * bits set
+	 */
+	while (1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->end >= start && !(state->state & bits)) {
+			*end_ret = state->end;
+		} else {
+			*end_ret = state->start - 1;
+			break;
+		}
+
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	spin_unlock(&tree->lock);
+}
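
A usage sketch, mirroring the call in btrfs_trim_free_extents() earlier in
this diff: find the next range on the device that is neither allocated nor
already trimmed, then clamp the open-ended (-1) answer to the device size
(names as in that function):

	find_first_clear_extent_bit(&device->alloc_state, start, &start, &end,
				    CHUNK_TRIMMED | CHUNK_ALLOCATED);
	/* Skip the reserved first 1MiB of the device. */
	start = max_t(u64, start, SZ_1M);
	/* end may be -1 (U64_MAX); trim it to the device. */
	end = min(end, device->total_bytes - 1);
	len = end - start + 1;
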
+
 /*
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'.  start and end are used to return the range,
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
  */
-static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+static noinline bool find_delalloc_range(struct extent_io_tree *tree,
 					u64 *start, u64 *end, u64 max_bytes,
 					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
 	u64 cur_start = *start;
-	u64 found = 0;
+	bool found = false;
 	u64 total_bytes = 0;
 
 	spin_lock(&tree->lock);
@@ -1484,8 +1694,7 @@
 	 */
 	node = tree_search(tree, cur_start);
 	if (!node) {
-		if (!found)
-			*end = (u64)-1;
+		*end = (u64)-1;
 		goto out;
 	}
 
@@ -1505,7 +1714,7 @@
 			*cached_state = state;
 			refcount_inc(&state->refs);
 		}
-		found++;
+		found = true;
 		*end = state->end;
 		cur_start = state->end + 1;
 		node = rb_next(node);
@@ -1563,19 +1772,22 @@
 }
 
 /*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'.  start and end are used to return the range,
+ * Find and lock a contiguous range of bytes in the file marked as delalloc, no
+ * more than @max_bytes.  @start and @end are used to return the range.
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * Return: true if we find something
+ *         false if nothing was in the tree
  */
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
-				    struct extent_io_tree *tree,
+EXPORT_FOR_TESTS
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
 				    struct page *locked_page, u64 *start,
-				    u64 *end, u64 max_bytes)
+				    u64 *end)
 {
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
 	u64 delalloc_start;
 	u64 delalloc_end;
-	u64 found;
+	bool found;
 	struct extent_state *cached_state = NULL;
 	int ret;
 	int loops = 0;
@@ -1590,7 +1802,7 @@
 		*start = delalloc_start;
 		*end = delalloc_end;
 		free_extent_state(cached_state);
-		return 0;
+		return false;
 	}
 
 	/*
@@ -1610,6 +1822,7 @@
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
 				  delalloc_start, delalloc_end);
+	ASSERT(!ret || ret == -EAGAIN);
 	if (ret == -EAGAIN) {
 		/* some of the pages are gone, lets avoid looping by
 		 * shortening the size of the delalloc range we're searching
@@ -1621,11 +1834,10 @@
 			loops = 1;
 			goto again;
 		} else {
-			found = 0;
+			found = false;
 			goto out_failed;
 		}
 	}
-	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
 	/* step three, lock the state bits for the whole range */
 	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
@@ -1726,9 +1938,9 @@
 }
 
 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
-				 u64 delalloc_end, struct page *locked_page,
-				 unsigned clear_bits,
-				 unsigned long page_ops)
+				  struct page *locked_page,
+				  unsigned clear_bits,
+				  unsigned long page_ops)
 {
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
 			 NULL);
@@ -2041,9 +2253,9 @@
 	return 0;
 }
 
-int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
-			 struct extent_buffer *eb, int mirror_num)
+int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	u64 start = eb->start;
 	int i, num_pages = num_extent_pages(eb);
 	int ret = 0;
@@ -2343,13 +2555,11 @@
 }
 
 /*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * This is a generic handler for readpage errors. If other copies exist, read
+ * those and write back good data to the failed position. It does not attempt
+ * to remap the failed extent elsewhere, hoping instead that the device will
+ * be smart enough to do this as needed.
  */
-
 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			      struct page *page, u64 start, u64 end,
 			      int failed_mirror)
@@ -2362,7 +2572,7 @@
 	int read_mode = 0;
 	blk_status_t status;
 	int ret;
-	unsigned failed_bio_pages = bio_pages_all(failed_bio);
+	unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
@@ -2391,7 +2601,7 @@
 		read_mode, failrec->this_mirror, failrec->in_validation);
 
 	status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
-					 failrec->bio_flags, 0);
+					 failrec->bio_flags);
 	if (status) {
 		free_io_failure(failure_tree, tree, failrec);
 		bio_put(bio);
@@ -2406,14 +2616,9 @@
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
 	int uptodate = (err == 0);
-	struct extent_io_tree *tree;
 	int ret = 0;
 
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-
-	if (tree->ops && tree->ops->writepage_end_io_hook)
-		tree->ops->writepage_end_io_hook(page, start, end, NULL,
-				uptodate);
+	btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
 
 	if (!uptodate) {
 		ClearPageUptodate(page);
@@ -2438,10 +2643,10 @@
 	struct bio_vec *bvec;
 	u64 start;
 	u64 end;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2509,13 +2714,15 @@
 	u64 extent_len = 0;
 	int mirror;
 	int ret;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+		bool data_inode = btrfs_ino(BTRFS_I(inode))
+			!= BTRFS_BTREE_INODE_OBJECTID;
 
 		btrfs_debug(fs_info,
 			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2545,7 +2752,7 @@
 		len = bvec->bv_len;
 
 		mirror = io_bio->mirror_num;
-		if (likely(uptodate && tree->ops)) {
+		if (likely(uptodate)) {
 			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
 							      page, start, end,
 							      mirror);
@@ -2561,38 +2768,35 @@
 		if (likely(uptodate))
 			goto readpage_ok;
 
-		if (tree->ops) {
-			ret = tree->ops->readpage_io_failed_hook(page, mirror);
-			if (ret == -EAGAIN) {
-				/*
-				 * Data inode's readpage_io_failed_hook() always
-				 * returns -EAGAIN.
-				 *
-				 * The generic bio_readpage_error handles errors
-				 * the following way: If possible, new read
-				 * requests are created and submitted and will
-				 * end up in end_bio_extent_readpage as well (if
-				 * we're lucky, not in the !uptodate case). In
-				 * that case it returns 0 and we just go on with
-				 * the next page in our bio. If it can't handle
-				 * the error it will return -EIO and we remain
-				 * responsible for that page.
-				 */
-				ret = bio_readpage_error(bio, offset, page,
-							 start, end, mirror);
-				if (ret == 0) {
-					uptodate = !bio->bi_status;
-					offset += len;
-					continue;
-				}
-			}
+		if (data_inode) {
 
 			/*
-			 * metadata's readpage_io_failed_hook() always returns
-			 * -EIO and fixes nothing.  -EIO is also returned if
-			 * data inode error could not be fixed.
+			 * The generic bio_readpage_error handles errors the
+			 * following way: If possible, new read requests are
+			 * created and submitted and will end up in
+			 * end_bio_extent_readpage as well (if we're lucky,
+			 * not in the !uptodate case). In that case it returns
+			 * 0 and we just go on with the next page in our bio.
+			 * If it can't handle the error it will return -EIO and
+			 * we remain responsible for that page.
 			 */
-			ASSERT(ret == -EIO);
+			ret = bio_readpage_error(bio, offset, page, start, end,
+						 mirror);
+			if (ret == 0) {
+				uptodate = !bio->bi_status;
+				offset += len;
+				continue;
+			}
+		} else {
+			struct extent_buffer *eb;
+
+			eb = (struct extent_buffer *)page->private;
+			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+			eb->read_mirror = mirror;
+			atomic_dec(&eb->io_pages);
+			if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
+					       &eb->bflags))
+				btree_readahead_hook(eb, -EIO);
 		}
 readpage_ok:
 		if (likely(uptodate)) {
@@ -2601,7 +2805,7 @@
 			unsigned off;
 
 			/* Zero out the end if this page straddles i_size */
-			off = i_size & (PAGE_SIZE-1);
+			off = offset_in_page(i_size);
 			if (page->index == end_index && off)
 				zero_user_segment(page, off, PAGE_SIZE);
 			SetPageUptodate(page);
@@ -2638,8 +2842,7 @@
 	if (extent_len)
 		endio_readpage_release_extent(tree, extent_start, extent_len,
 					      uptodate);
-	if (io_bio->end_io)
-		io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
+	btrfs_io_bio_free_csum(io_bio);
 	bio_put(bio);
 }
 
@@ -2658,12 +2861,11 @@
  * never fail.  We're returning a bio right now but you can call btrfs_io_bio
  * for the appropriate container_of magic
  */
-struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+struct bio *btrfs_bio_alloc(u64 first_byte)
 {
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
-	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = first_byte >> 9;
 	btrfs_io_bio_init(btrfs_io_bio(bio));
 	return bio;
@@ -2709,28 +2911,6 @@
 	return bio;
 }
 
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
-				       unsigned long bio_flags)
-{
-	blk_status_t ret = 0;
-	struct bio_vec *bvec = bio_last_bvec_all(bio);
-	struct page *page = bvec->bv_page;
-	struct extent_io_tree *tree = bio->bi_private;
-	u64 start;
-
-	start = page_offset(page) + bvec->bv_offset;
-
-	bio->bi_private = NULL;
-
-	if (tree->ops)
-		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
-					   mirror_num, bio_flags, start);
-	else
-		btrfsic_submit_bio(bio);
-
-	return blk_status_to_errno(ret);
-}
-
 /*
  * @opf:	bio REQ_OP_* and REQ_* flags as one value
  * @tree:	tree so we can call our merge_bio hook
@@ -2776,8 +2956,8 @@
 		else
 			contig = bio_end_sector(bio) == sector;
 
-		if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
-						      bio, bio_flags))
+		ASSERT(tree->ops);
+		if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
 			can_merge = false;
 
 		if (prev_bio_flags != bio_flags || !contig || !can_merge ||
@@ -2791,12 +2971,13 @@
 			bio = NULL;
 		} else {
 			if (wbc)
-				wbc_account_io(wbc, page, page_size);
+				wbc_account_cgroup_owner(wbc, page, page_size);
 			return 0;
 		}
 	}
 
-	bio = btrfs_bio_alloc(bdev, offset);
+	bio = btrfs_bio_alloc(offset);
+	bio_set_dev(bio, bdev);
 	bio_add_page(bio, page, page_size, pg_offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
@@ -2804,7 +2985,7 @@
 	bio->bi_opf = opf;
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, page, page_size);
+		wbc_account_cgroup_owner(wbc, page, page_size);
 	}
 
 	*bio_ret = bio;
@@ -2905,7 +3086,7 @@
 
 	if (page->index == last_byte >> PAGE_SHIFT) {
 		char *userpage;
-		size_t zero_offset = last_byte & (PAGE_SIZE - 1);
+		size_t zero_offset = offset_in_page(last_byte);
 
 		if (zero_offset) {
 			iosize = PAGE_SIZE - zero_offset;
@@ -3002,11 +3183,11 @@
 		 */
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
 		    prev_em_start && *prev_em_start != (u64)-1 &&
-		    *prev_em_start != em->orig_start)
+		    *prev_em_start != em->start)
 			force_bio_submit = true;
 
 		if (prev_em_start)
-			*prev_em_start = em->orig_start;
+			*prev_em_start = em->start;
 
 		free_extent_map(em);
 		em = NULL;
@@ -3076,7 +3257,7 @@
 	return ret;
 }
 
-static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+static inline void contiguous_readpages(struct extent_io_tree *tree,
 					     struct page *pages[], int nr_pages,
 					     u64 start, u64 end,
 					     struct extent_map **em_cached,
@@ -3084,21 +3265,10 @@
 					     unsigned long *bio_flags,
 					     u64 *prev_em_start)
 {
-	struct inode *inode;
-	struct btrfs_ordered_extent *ordered;
+	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
 	int index;
 
-	inode = pages[0]->mapping->host;
-	while (1) {
-		lock_extent(tree, start, end);
-		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
-						     end - start + 1);
-		if (!ordered)
-			break;
-		unlock_extent(tree, start, end);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
+	btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
 
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
@@ -3107,46 +3277,6 @@
 	}
 }
 
-static void __extent_readpages(struct extent_io_tree *tree,
-			       struct page *pages[],
-			       int nr_pages,
-			       struct extent_map **em_cached,
-			       struct bio **bio, unsigned long *bio_flags,
-			       u64 *prev_em_start)
-{
-	u64 start = 0;
-	u64 end = 0;
-	u64 page_start;
-	int index;
-	int first_index = 0;
-
-	for (index = 0; index < nr_pages; index++) {
-		page_start = page_offset(pages[index]);
-		if (!end) {
-			start = page_start;
-			end = start + PAGE_SIZE - 1;
-			first_index = index;
-		} else if (end + 1 == page_start) {
-			end += PAGE_SIZE;
-		} else {
-			__do_contiguous_readpages(tree, &pages[first_index],
-						  index - first_index, start,
-						  end, em_cached,
-						  bio, bio_flags,
-						  prev_em_start);
-			start = page_start;
-			end = start + PAGE_SIZE - 1;
-			first_index = index;
-		}
-	}
-
-	if (end)
-		__do_contiguous_readpages(tree, &pages[first_index],
-					  index - first_index, start,
-					  end, em_cached, bio,
-					  bio_flags, prev_em_start);
-}
-
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
@@ -3154,22 +3284,12 @@
 				   unsigned long *bio_flags,
 				   unsigned int read_flags)
 {
-	struct inode *inode = page->mapping->host;
-	struct btrfs_ordered_extent *ordered;
+	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_SIZE - 1;
 	int ret;
 
-	while (1) {
-		lock_extent(tree, start, end);
-		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
-						PAGE_SIZE);
-		if (!ordered)
-			break;
-		unlock_extent(tree, start, end);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
+	btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
 
 	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
 			    bio_flags, read_flags, NULL);
@@ -3199,7 +3319,7 @@
 /*
  * helper for __extent_writepage, doing all of the delayed allocation setup.
  *
- * This returns 1 if our fill_delalloc function did all the work required
+ * This returns 1 if the btrfs_run_delalloc_range function did all the work required
  * to write the page (copy into inline extent).  In this case the IO has
  * been started and the page is already unlocked.
  *
@@ -3207,44 +3327,34 @@
  * This returns < 0 if there were errors (page still locked)
  */
 static noinline_for_stack int writepage_delalloc(struct inode *inode,
-			      struct page *page, struct writeback_control *wbc,
-			      struct extent_page_data *epd,
-			      u64 delalloc_start,
-			      unsigned long *nr_written)
+		struct page *page, struct writeback_control *wbc,
+		u64 delalloc_start, unsigned long *nr_written)
 {
-	struct extent_io_tree *tree = epd->tree;
 	u64 page_end = delalloc_start + PAGE_SIZE - 1;
-	u64 nr_delalloc;
+	bool found;
 	u64 delalloc_to_write = 0;
 	u64 delalloc_end = 0;
 	int ret;
 	int page_started = 0;
 
-	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
-		return 0;
 
 	while (delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(inode, tree,
-					       page,
+		found = find_lock_delalloc_range(inode, page,
 					       &delalloc_start,
-					       &delalloc_end,
-					       BTRFS_MAX_EXTENT_SIZE);
-		if (nr_delalloc == 0) {
+					       &delalloc_end);
+		if (!found) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		ret = tree->ops->fill_delalloc(inode, page,
-					       delalloc_start,
-					       delalloc_end,
-					       &page_started,
-					       nr_written, wbc);
-		/* File system has been set read-only */
+		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
+				delalloc_end, &page_started, nr_written, wbc);
 		if (ret) {
 			SetPageError(page);
-			/* fill_delalloc should be return < 0 for error
-			 * but just in case, we use > 0 here meaning the
-			 * IO is started, so we don't want to return > 0
-			 * unless things are going well.
+			/*
+			 * btrfs_run_delalloc_range should return < 0 for error
+			 * but just in case, we use > 0 here meaning the IO is
+			 * started, so we don't want to return > 0 unless
+			 * things are going well.
 			 */
 			ret = ret < 0 ? ret : -EIO;
 			goto done;
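A sketch of how a caller consumes the 1 / 0 / <0 convention documented above; this mirrors the __extent_writepage() hunk further down:

	ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
	if (ret == 1)
		goto done_unlocked;	/* IO started, page already unlocked */
	if (ret)
		goto done;		/* error, page still locked */
	/* ret == 0: fall through and write the page ourselves */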
@@ -3317,20 +3427,17 @@
 	int nr = 0;
 	bool compressed;
 
-	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start,
-						      page_end);
-		if (ret) {
-			/* Fixup worker will requeue */
-			if (ret == -EBUSY)
-				wbc->pages_skipped++;
-			else
-				redirty_page_for_writepage(wbc, page);
+	ret = btrfs_writepage_cow_fixup(page, start, page_end);
+	if (ret) {
+		/* Fixup worker will requeue */
+		if (ret == -EBUSY)
+			wbc->pages_skipped++;
+		else
+			redirty_page_for_writepage(wbc, page);
 
-			update_nr_written(wbc, nr_written);
-			unlock_page(page);
-			return 1;
-		}
+		update_nr_written(wbc, nr_written);
+		unlock_page(page);
+		return 1;
 	}
 
 	/*
@@ -3341,9 +3448,7 @@
 
 	end = page_end;
 	if (i_size <= start) {
-		if (tree->ops && tree->ops->writepage_end_io_hook)
-			tree->ops->writepage_end_io_hook(page, start,
-							 page_end, NULL, 1);
+		btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
 		goto done;
 	}
 
@@ -3354,9 +3459,8 @@
 		u64 offset;
 
 		if (cur >= i_size) {
-			if (tree->ops && tree->ops->writepage_end_io_hook)
-				tree->ops->writepage_end_io_hook(page, cur,
-							 page_end, NULL, 1);
+			btrfs_writepage_endio_finish_ordered(page, cur,
+							     page_end, 1);
 			break;
 		}
 		em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
@@ -3390,11 +3494,10 @@
 			 * end_io notification does not happen here for
 			 * compressed extents
 			 */
-			if (!compressed && tree->ops &&
-			    tree->ops->writepage_end_io_hook)
-				tree->ops->writepage_end_io_hook(page, cur,
-							 cur + iosize - 1,
-							 NULL, 1);
+			if (!compressed)
+				btrfs_writepage_endio_finish_ordered(page, cur,
+							    cur + iosize - 1,
+							    1);
 			else if (compressed) {
 				/* we don't want to end_page_writeback on
 				 * a compressed extent.  this happens
@@ -3440,6 +3543,9 @@
  * records are inserted to lock ranges in the tree, and as dirty areas
  * are found, they are marked writeback.  Then the lock bits are removed
  * and the end_io handler clears the writeback ranges
+ *
+ * Return 0 if everything goes well.
+ * Return <0 for error.
  */
 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			      struct extent_page_data *epd)
@@ -3463,7 +3569,7 @@
 
 	ClearPageError(page);
 
-	pg_offset = i_size & (PAGE_SIZE - 1);
+	pg_offset = offset_in_page(i_size);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
 		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
@@ -3485,11 +3591,13 @@
 
 	set_page_extent_mapped(page);
 
-	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
-	if (ret == 1)
-		goto done_unlocked;
-	if (ret)
-		goto done;
+	if (!epd->extent_locked) {
+		ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+		if (ret == 1)
+			goto done_unlocked;
+		if (ret)
+			goto done;
+	}
 
 	ret = __extent_writepage_io(inode, page, wbc, epd,
 				    i_size, nr_written, write_flags, &nr);
@@ -3507,6 +3615,7 @@
 		end_extent_writepage(page, ret, start, page_end);
 	}
 	unlock_page(page);
+	ASSERT(ret <= 0);
 	return ret;
 
 done_unlocked:
@@ -3519,18 +3628,33 @@
 		       TASK_UNINTERRUPTIBLE);
 }
 
-static noinline_for_stack int
-lock_extent_buffer_for_io(struct extent_buffer *eb,
-			  struct btrfs_fs_info *fs_info,
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+	smp_mb__after_atomic();
+	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+/*
+ * Lock eb pages and flush the bio if we can't get the locks
+ *
+ * Return  0 if nothing went wrong
+ * Return >0 is the same as 0, except the bio is not submitted
+ * Return <0 if something went wrong, no page is locked
+ */
+static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
 			  struct extent_page_data *epd)
 {
-	int i, num_pages;
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	int i, num_pages, failed_page_nr;
 	int flush = 0;
 	int ret = 0;
 
 	if (!btrfs_try_tree_write_lock(eb)) {
+		ret = flush_write_bio(epd);
+		if (ret < 0)
+			return ret;
 		flush = 1;
-		flush_write_bio(epd);
 		btrfs_tree_lock(eb);
 	}
 
@@ -3539,7 +3663,9 @@
 		if (!epd->sync_io)
 			return 0;
 		if (!flush) {
-			flush_write_bio(epd);
+			ret = flush_write_bio(epd);
+			if (ret < 0)
+				return ret;
 			flush = 1;
 		}
 		while (1) {
@@ -3580,7 +3706,14 @@
 
 		if (!trylock_page(p)) {
 			if (!flush) {
-				flush_write_bio(epd);
+				int err;
+
+				err = flush_write_bio(epd);
+				if (err < 0) {
+					ret = err;
+					failed_page_nr = i;
+					goto err_unlock;
+				}
 				flush = 1;
 			}
 			lock_page(p);
@@ -3588,24 +3721,45 @@
 	}
 
 	return ret;
-}
-
-static void end_extent_buffer_writeback(struct extent_buffer *eb)
-{
-	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
-	smp_mb__after_atomic();
-	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+err_unlock:
+	/* Unlock already locked pages */
+	for (i = 0; i < failed_page_nr; i++)
+		unlock_page(eb->pages[i]);
+	/*
+	 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
+	 * Also set EXTENT_BUFFER_DIRTY back so future write attempts on this
+	 * eb can be made, undoing everything done before.
+	 */
+	btrfs_tree_lock(eb);
+	spin_lock(&eb->refs_lock);
+	set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+	end_extent_buffer_writeback(eb);
+	spin_unlock(&eb->refs_lock);
+	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
+				 fs_info->dirty_metadata_batch);
+	btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+	btrfs_tree_unlock(eb);
+	return ret;
 }
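For reference, the tri-state return documented above is consumed by the btree writeback loop roughly as follows (condensed from the btree_write_cache_pages() hunk later in this patch):

	ret = lock_extent_buffer_for_io(eb, &epd);
	if (!ret) {			/* 0: eb no longer dirty, skip it */
		free_extent_buffer(eb);
		continue;
	} else if (ret < 0) {		/* <0: flush failed, abort writeback */
		done = 1;
		free_extent_buffer(eb);
		break;
	}
	ret = write_one_eb(eb, wbc, &epd);	/* >0: pages locked, submit */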
 
 static void set_btree_ioerr(struct page *page)
 {
 	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_fs_info *fs_info;
 
 	SetPageError(page);
 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
 		return;
 
 	/*
+	 * If we error out, we should add back the dirty_metadata_bytes
+	 * to make it consistent.
+	 */
+	fs_info = eb->fs_info;
+	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+				 eb->len, fs_info->dirty_metadata_batch);
+
+	/*
 	 * If writeback for a btree extent that doesn't belong to a log tree
 	 * failed, increment the counter transaction->eb_write_errors.
 	 * We do this because while the transaction is running and before it's
@@ -3662,10 +3816,11 @@
 {
 	struct bio_vec *bvec;
 	struct extent_buffer *eb;
-	int i, done;
+	int done;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 
 		eb = (struct extent_buffer *)page->private;
@@ -3690,10 +3845,10 @@
 }
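The index-based form of bio_for_each_segment_all() is gone in this kernel; the replacement pattern used above, sketched in isolation:

	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/* iter_all lives on the caller's stack and lets the macro walk
	 * multi-page bvecs one page at a time. */
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		/* ... per-page end-io handling ... */
	}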
 
 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
-			struct btrfs_fs_info *fs_info,
 			struct writeback_control *wbc,
 			struct extent_page_data *epd)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
 	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
 	u64 offset = eb->start;
@@ -3719,7 +3874,7 @@
 		 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
 		 */
 		start = btrfs_item_nr_offset(nritems);
-		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
+		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
 		memzero_extent_buffer(eb, start, end - start);
 	}
 
@@ -3762,7 +3917,6 @@
 				   struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
-	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
 	struct extent_buffer *eb, *prev_eb = NULL;
 	struct extent_page_data epd = {
 		.bio = NULL,
@@ -3778,7 +3932,7 @@
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int tag;
+	xa_mark_t tag;
 
 	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
@@ -3837,13 +3991,17 @@
 				continue;
 
 			prev_eb = eb;
-			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+			ret = lock_extent_buffer_for_io(eb, &epd);
 			if (!ret) {
 				free_extent_buffer(eb);
 				continue;
+			} else if (ret < 0) {
+				done = 1;
+				free_extent_buffer(eb);
+				break;
 			}
 
-			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			ret = write_one_eb(eb, wbc, &epd);
 			if (ret) {
 				done = 1;
 				free_extent_buffer(eb);
@@ -3870,7 +4028,12 @@
 		index = 0;
 		goto retry;
 	}
-	flush_write_bio(&epd);
+	ASSERT(ret <= 0);
+	if (ret < 0) {
+		end_write_bio(&epd, ret);
+		return ret;
+	}
+	ret = flush_write_bio(&epd);
 	return ret;
 }
 
@@ -3903,7 +4066,7 @@
 	pgoff_t done_index;
 	int range_whole = 0;
 	int scanned = 0;
-	int tag;
+	xa_mark_t tag;
 
 	/*
 	 * We have to hold onto the inode so that ordered extents can do their
@@ -3928,12 +4091,25 @@
 			range_whole = 1;
 		scanned = 1;
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+
+	/*
+	 * We do the tagged writepage as long as the snapshot flush bit is set
+	 * and we are the first one to do the filemap_flush() on this inode.
+	 *
+	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
+	 * not race in and drop the bit.
+	 */
+	if (range_whole && wbc->nr_to_write == LONG_MAX &&
+	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+			       &BTRFS_I(inode)->runtime_flags))
+		wbc->tagged_writepages = 1;
+
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
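For context, a hedged sketch of the producer side of this handshake; the snapshot path is assumed to set the bit before kicking writeback, and the exact call site is not part of this hunk:

	/* Hypothetical producer: mark the inode, then flush. The
	 * tagged_writepages handling above keeps concurrent flushers from
	 * racing on the same dirty pages. */
	set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &BTRFS_I(inode)->runtime_flags);
	ret = filemap_flush(inode->i_mapping);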
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && !nr_to_write_done && (index <= end) &&
@@ -3954,7 +4130,8 @@
 			 * tmpfs file mapping
 			 */
 			if (!trylock_page(page)) {
-				flush_write_bio(epd);
+				ret = flush_write_bio(epd);
+				BUG_ON(ret < 0);
 				lock_page(page);
 			}
 
@@ -3964,8 +4141,10 @@
 			}
 
 			if (wbc->sync_mode != WB_SYNC_NONE) {
-				if (PageWriteback(page))
-					flush_write_bio(epd);
+				if (PageWriteback(page)) {
+					ret = flush_write_bio(epd);
+					BUG_ON(ret < 0);
+				}
 				wait_on_page_writeback(page);
 			}
 
@@ -3976,11 +4155,6 @@
 			}
 
 			ret = __extent_writepage(page, wbc, epd);
-
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-				unlock_page(page);
-				ret = 0;
-			}
 			if (ret < 0) {
 				/*
 				 * done_index is set past this page,
@@ -4023,17 +4197,6 @@
 	return ret;
 }
 
-static void flush_write_bio(struct extent_page_data *epd)
-{
-	if (epd->bio) {
-		int ret;
-
-		ret = submit_one_bio(epd->bio, 0, 0);
-		BUG_ON(ret < 0); /* -ENOMEM */
-		epd->bio = NULL;
-	}
-}
-
 int extent_write_full_page(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
@@ -4045,8 +4208,14 @@
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
+	ASSERT(ret <= 0);
+	if (ret < 0) {
+		end_write_bio(&epd, ret);
+		return ret;
+	}
 
-	flush_write_bio(&epd);
+	ret = flush_write_bio(&epd);
+	ASSERT(ret <= 0);
 	return ret;
 }
 
@@ -4078,17 +4247,20 @@
 		if (clear_page_dirty_for_io(page))
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
 		else {
-			if (tree->ops && tree->ops->writepage_end_io_hook)
-				tree->ops->writepage_end_io_hook(page, start,
-						 start + PAGE_SIZE - 1,
-						 NULL, 1);
+			btrfs_writepage_endio_finish_ordered(page, start,
+						    start + PAGE_SIZE - 1, 1);
 			unlock_page(page);
 		}
 		put_page(page);
 		start += PAGE_SIZE;
 	}
 
-	flush_write_bio(&epd);
+	ASSERT(ret <= 0);
+	if (ret < 0) {
+		end_write_bio(&epd, ret);
+		return ret;
+	}
+	ret = flush_write_bio(&epd);
 	return ret;
 }
 
@@ -4104,7 +4276,12 @@
 	};
 
 	ret = extent_write_cache_pages(mapping, wbc, &epd);
-	flush_write_bio(&epd);
+	ASSERT(ret <= 0);
+	if (ret < 0) {
+		end_write_bio(&epd, ret);
+		return ret;
+	}
+	ret = flush_write_bio(&epd);
 	return ret;
 }
 
@@ -4112,42 +4289,45 @@
 		     unsigned nr_pages)
 {
 	struct bio *bio = NULL;
-	unsigned page_idx;
 	unsigned long bio_flags = 0;
 	struct page *pagepool[16];
-	struct page *page;
 	struct extent_map *em_cached = NULL;
 	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
 	int nr = 0;
 	u64 prev_em_start = (u64)-1;
 
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		page = list_entry(pages->prev, struct page, lru);
+	while (!list_empty(pages)) {
+		u64 contig_end = 0;
 
-		prefetchw(&page->flags);
-		list_del(&page->lru);
-		if (add_to_page_cache_lru(page, mapping,
-					page->index,
-					readahead_gfp_mask(mapping))) {
-			put_page(page);
-			continue;
+		for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
+			struct page *page = lru_to_page(pages);
+
+			prefetchw(&page->flags);
+			list_del(&page->lru);
+			if (add_to_page_cache_lru(page, mapping, page->index,
+						readahead_gfp_mask(mapping))) {
+				put_page(page);
+				break;
+			}
+
+			pagepool[nr++] = page;
+			contig_end = page_offset(page) + PAGE_SIZE - 1;
 		}
 
-		pagepool[nr++] = page;
-		if (nr < ARRAY_SIZE(pagepool))
-			continue;
-		__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
-				&bio_flags, &prev_em_start);
-		nr = 0;
+		if (nr) {
+			u64 contig_start = page_offset(pagepool[0]);
+
+			ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+
+			contiguous_readpages(tree, pagepool, nr, contig_start,
+				     contig_end, &em_cached, &bio, &bio_flags,
+				     &prev_em_start);
+		}
 	}
-	if (nr)
-		__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
-				&bio_flags, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
 
-	BUG_ON(!list_empty(pages));
 	if (bio)
 		return submit_one_bio(bio, 0, bio_flags);
 	return 0;
@@ -4172,10 +4352,8 @@
 
 	lock_extent_bits(tree, start, end, &cached_state);
 	wait_on_page_writeback(page);
-	clear_extent_bit(tree, start, end,
-			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-			 EXTENT_DO_ACCOUNTING,
-			 1, 1, &cached_state);
+	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
+			 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
 	return 0;
 }
 
@@ -4191,10 +4369,9 @@
 	u64 end = start + PAGE_SIZE - 1;
 	int ret = 1;
 
-	if (test_range_bit(tree, start, end,
-			   EXTENT_IOBITS, 0, NULL))
+	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
 		ret = 0;
-	else {
+	} else {
 		/*
 		 * at this point we can safely clear everything except the
 		 * locked bit and the nodatasum bit
@@ -4247,8 +4424,7 @@
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED | EXTENT_WRITEBACK,
-					    0, NULL)) {
+					    EXTENT_LOCKED, 0, NULL)) {
 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 					&btrfs_inode->runtime_flags);
 				remove_extent_mapping(map, em);
@@ -4284,8 +4460,7 @@
 		if (len == 0)
 			break;
 		len = ALIGN(len, sectorsize);
-		em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
-				len, 0);
+		em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
 		if (IS_ERR_OR_NULL(em))
 			return em;
 
@@ -4336,7 +4511,7 @@
 
 	/*
 	 * Sanity check, extent_fiemap() should have ensured that new
-	 * fiemap extent won't overlap with cahced one.
+	 * fiemap extent won't overlap with the cached one.
 	 * Not recoverable.
 	 *
 	 * NOTE: Physical address can overlap, due to compression
@@ -4398,8 +4573,7 @@
  * In this case, the first extent range will be cached but not emitted.
  * So we must emit it before ending extent_fiemap().
  */
-static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
-				  struct fiemap_extent_info *fieinfo,
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
 				  struct fiemap_cache *cache)
 {
 	int ret;
@@ -4433,6 +4607,8 @@
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct fiemap_cache cache = { 0 };
+	struct ulist *roots;
+	struct ulist *tmp_ulist;
 	int end = 0;
 	u64 em_start = 0;
 	u64 em_len = 0;
@@ -4446,6 +4622,13 @@
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
+	roots = ulist_alloc(GFP_KERNEL);
+	tmp_ulist = ulist_alloc(GFP_KERNEL);
+	if (!roots || !tmp_ulist) {
+		ret = -ENOMEM;
+		goto out_free_ulist;
+	}
+
 	start = round_down(start, btrfs_inode_sectorsize(inode));
 	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
 
@@ -4456,8 +4639,7 @@
 	ret = btrfs_lookup_file_extent(NULL, root, path,
 			btrfs_ino(BTRFS_I(inode)), -1, 0);
 	if (ret < 0) {
-		btrfs_free_path(path);
-		return ret;
+		goto out_free_ulist;
 	} else {
 		WARN_ON(!ret);
 		if (ret == 1)
@@ -4566,7 +4748,7 @@
 			 */
 			ret = btrfs_check_shared(root,
 						 btrfs_ino(BTRFS_I(inode)),
-						 bytenr);
+						 bytenr, roots, tmp_ulist);
 			if (ret < 0)
 				goto out_free;
 			if (ret)
@@ -4606,12 +4788,16 @@
 	}
 out_free:
 	if (!ret)
-		ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
+		ret = emit_last_fiemap_cache(fieinfo, &cache);
 	free_extent_map(em);
 out:
-	btrfs_free_path(path);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state);
+
+out_free_ulist:
+	btrfs_free_path(path);
+	ulist_free(roots);
+	ulist_free(tmp_ulist);
 	return ret;
 }
 
@@ -4698,13 +4884,9 @@
 	eb->fs_info = fs_info;
 	eb->bflags = 0;
 	rwlock_init(&eb->lock);
-	atomic_set(&eb->write_locks, 0);
-	atomic_set(&eb->read_locks, 0);
 	atomic_set(&eb->blocking_readers, 0);
-	atomic_set(&eb->blocking_writers, 0);
-	atomic_set(&eb->spinning_readers, 0);
-	atomic_set(&eb->spinning_writers, 0);
-	eb->lock_nested = 0;
+	eb->blocking_writers = 0;
+	eb->lock_nested = false;
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
 
@@ -4721,6 +4903,13 @@
 		> MAX_INLINE_EXTENT_BUFFER_SIZE);
 	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
+#ifdef CONFIG_BTRFS_DEBUG
+	eb->spinning_writers = 0;
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->read_locks, 0);
+	eb->write_locks = 0;
+#endif
+
 	return eb;
 }
 
@@ -4908,13 +5097,6 @@
 	check_buffer_tree_ref(eb);
 	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
 
-	/*
-	 * We will free dummy extent buffer's if they come into
-	 * free_extent_buffer with a ref count of 2, but if we are using this we
-	 * want the buffers to stay in memory until we're done with them, so
-	 * bump the ref count again.
-	 */
-	atomic_inc(&eb->refs);
 	return eb;
 free_eb:
 	btrfs_release_extent_buffer(eb);
@@ -5096,7 +5278,9 @@
 
 	while (1) {
 		refs = atomic_read(&eb->refs);
-		if (refs <= 3)
+		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
+		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+			refs == 1))
 			break;
 		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
 		if (old == refs)
@@ -5105,10 +5289,6 @@
 
 	spin_lock(&eb->refs_lock);
 	if (atomic_read(&eb->refs) == 2 &&
-	    test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
-		atomic_dec(&eb->refs);
-
-	if (atomic_read(&eb->refs) == 2 &&
 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
 	    !extent_buffer_under_io(eb) &&
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5153,11 +5333,9 @@
 
 		clear_page_dirty_for_io(page);
 		xa_lock_irq(&page->mapping->i_pages);
-		if (!PageDirty(page)) {
-			radix_tree_tag_clear(&page->mapping->i_pages,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-		}
+		if (!PageDirty(page))
+			__xa_clear_mark(&page->mapping->i_pages,
+					page_index(page), PAGECACHE_TAG_DIRTY);
 		xa_unlock_irq(&page->mapping->i_pages);
 		ClearPageError(page);
 		unlock_page(page);
@@ -5165,11 +5343,11 @@
 	WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
 {
 	int i;
 	int num_pages;
-	int was_dirty = 0;
+	bool was_dirty;
 
 	check_buffer_tree_ref(eb);
 
@@ -5179,8 +5357,15 @@
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
 
+	if (!was_dirty)
+		for (i = 0; i < num_pages; i++)
+			set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
 	for (i = 0; i < num_pages; i++)
-		set_page_dirty(eb->pages[i]);
+		ASSERT(PageDirty(eb->pages[i]));
+#endif
+
 	return was_dirty;
 }
 
@@ -5213,8 +5398,7 @@
 	}
 }
 
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait, int mirror_num)
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
 {
 	int i;
 	struct page *page;
@@ -5226,6 +5410,7 @@
 	unsigned long num_reads = 0;
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
+	struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
@@ -5329,7 +5514,7 @@
 	struct page *page;
 	char *kaddr;
 	char *dst = (char *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	if (start + len > eb->len) {
@@ -5339,7 +5524,7 @@
 		return;
 	}
 
-	offset = (start_offset + start) & (PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + start);
 
 	while (len > 0) {
 		page = eb->pages[i];
@@ -5364,14 +5549,14 @@
 	struct page *page;
 	char *kaddr;
 	char __user *dst = (char __user *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	int ret = 0;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + start);
 
 	while (len > 0) {
 		page = eb->pages[i];
@@ -5402,10 +5587,10 @@
 			      char **map, unsigned long *map_start,
 			      unsigned long *map_len)
 {
-	size_t offset = start & (PAGE_SIZE - 1);
+	size_t offset;
 	char *kaddr;
 	struct page *p;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	unsigned long end_i = (start_offset + start + min_len - 1) >>
 		PAGE_SHIFT;
@@ -5442,14 +5627,14 @@
 	struct page *page;
 	char *kaddr;
 	char *ptr = (char *)ptrv;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	int ret = 0;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + start);
 
 	while (len > 0) {
 		page = eb->pages[i];
@@ -5498,13 +5683,13 @@
 	struct page *page;
 	char *kaddr;
 	char *src = (char *)srcv;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + start);
 
 	while (len > 0) {
 		page = eb->pages[i];
@@ -5528,13 +5713,13 @@
 	size_t offset;
 	struct page *page;
 	char *kaddr;
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + start);
 
 	while (len > 0) {
 		page = eb->pages[i];
@@ -5573,13 +5758,12 @@
 	size_t offset;
 	struct page *page;
 	char *kaddr;
-	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(dst->start);
 	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
 
 	WARN_ON(src->len != dst_len);
 
-	offset = (start_offset + dst_offset) &
-		(PAGE_SIZE - 1);
+	offset = offset_in_page(start_offset + dst_offset);
 
 	while (len > 0) {
 		page = dst->pages[i];
@@ -5615,7 +5799,7 @@
 				    unsigned long *page_index,
 				    size_t *page_offset)
 {
-	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(eb->start);
 	size_t byte_offset = BIT_BYTE(nr);
 	size_t offset;
 
@@ -5627,7 +5811,7 @@
 	offset = start_offset + start + byte_offset;
 
 	*page_index = offset >> PAGE_SHIFT;
-	*page_offset = offset & (PAGE_SIZE - 1);
+	*page_offset = offset_in_page(offset);
 }
 
 /**
@@ -5769,7 +5953,7 @@
 	size_t cur;
 	size_t dst_off_in_page;
 	size_t src_off_in_page;
-	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
 
@@ -5777,20 +5961,18 @@
 		btrfs_err(fs_info,
 			"memmove bogus src_offset %lu move len %lu dst len %lu",
 			 src_offset, len, dst->len);
-		BUG_ON(1);
+		BUG();
 	}
 	if (dst_offset + len > dst->len) {
 		btrfs_err(fs_info,
 			"memmove bogus dst_offset %lu move len %lu dst len %lu",
 			 dst_offset, len, dst->len);
-		BUG_ON(1);
+		BUG();
 	}
 
 	while (len > 0) {
-		dst_off_in_page = (start_offset + dst_offset) &
-			(PAGE_SIZE - 1);
-		src_off_in_page = (start_offset + src_offset) &
-			(PAGE_SIZE - 1);
+		dst_off_in_page = offset_in_page(start_offset + dst_offset);
+		src_off_in_page = offset_in_page(start_offset + src_offset);
 
 		dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
 		src_i = (start_offset + src_offset) >> PAGE_SHIFT;
@@ -5818,7 +6000,7 @@
 	size_t src_off_in_page;
 	unsigned long dst_end = dst_offset + len - 1;
 	unsigned long src_end = src_offset + len - 1;
-	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
 
@@ -5826,13 +6008,13 @@
 		btrfs_err(fs_info,
 			  "memmove bogus src_offset %lu move len %lu len %lu",
 			  src_offset, len, dst->len);
-		BUG_ON(1);
+		BUG();
 	}
 	if (dst_offset + len > dst->len) {
 		btrfs_err(fs_info,
 			  "memmove bogus dst_offset %lu move len %lu len %lu",
 			  dst_offset, len, dst->len);
-		BUG_ON(1);
+		BUG();
 	}
 	if (dst_offset < src_offset) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
@@ -5842,10 +6024,8 @@
 		dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
 		src_i = (start_offset + src_end) >> PAGE_SHIFT;
 
-		dst_off_in_page = (start_offset + dst_end) &
-			(PAGE_SIZE - 1);
-		src_off_in_page = (start_offset + src_end) &
-			(PAGE_SIZE - 1);
+		dst_off_in_page = offset_in_page(start_offset + dst_end);
+		src_off_in_page = offset_in_page(start_offset + src_end);
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e6..cf3424d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -9,26 +9,32 @@
 
 /* bits for the extent state */
 #define EXTENT_DIRTY		(1U << 0)
-#define EXTENT_WRITEBACK	(1U << 1)
-#define EXTENT_UPTODATE		(1U << 2)
-#define EXTENT_LOCKED		(1U << 3)
-#define EXTENT_NEW		(1U << 4)
-#define EXTENT_DELALLOC		(1U << 5)
-#define EXTENT_DEFRAG		(1U << 6)
-#define EXTENT_BOUNDARY		(1U << 9)
-#define EXTENT_NODATASUM	(1U << 10)
-#define EXTENT_CLEAR_META_RESV	(1U << 11)
-#define EXTENT_FIRST_DELALLOC	(1U << 12)
-#define EXTENT_NEED_WAIT	(1U << 13)
-#define EXTENT_DAMAGED		(1U << 14)
-#define EXTENT_NORESERVE	(1U << 15)
-#define EXTENT_QGROUP_RESERVED	(1U << 16)
-#define EXTENT_CLEAR_DATA_RESV	(1U << 17)
-#define EXTENT_DELALLOC_NEW	(1U << 18)
-#define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_UPTODATE		(1U << 1)
+#define EXTENT_LOCKED		(1U << 2)
+#define EXTENT_NEW		(1U << 3)
+#define EXTENT_DELALLOC		(1U << 4)
+#define EXTENT_DEFRAG		(1U << 5)
+#define EXTENT_BOUNDARY		(1U << 6)
+#define EXTENT_NODATASUM	(1U << 7)
+#define EXTENT_CLEAR_META_RESV	(1U << 8)
+#define EXTENT_NEED_WAIT	(1U << 9)
+#define EXTENT_DAMAGED		(1U << 10)
+#define EXTENT_NORESERVE	(1U << 11)
+#define EXTENT_QGROUP_RESERVED	(1U << 12)
+#define EXTENT_CLEAR_DATA_RESV	(1U << 13)
+#define EXTENT_DELALLOC_NEW	(1U << 14)
 #define EXTENT_DO_ACCOUNTING    (EXTENT_CLEAR_META_RESV | \
 				 EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING)
+
+/*
+ * Redefined bits above which are used only in the device allocation tree,
+ * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
+ * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
+ * manipulation functions
+ */
+#define CHUNK_ALLOCATED EXTENT_DIRTY
+#define CHUNK_TRIMMED   EXTENT_DEFRAG
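These aliases are applied per device stripe in the extent_map.c hunk below; condensed here for illustration:

	/* Mark the physical range backing one chunk stripe as allocated in
	 * the owning device's alloc_state io tree. */
	set_extent_bits_nowait(&device->alloc_state, stripe->physical,
			       stripe->physical + stripe_size - 1,
			       CHUNK_ALLOCATED);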
 
 /*
  * flags for bio submission. The high bits indicate the compression
@@ -37,18 +43,22 @@
 #define EXTENT_BIO_COMPRESSED 1
 #define EXTENT_BIO_FLAG_SHIFT 16
 
-/* these are bit numbers for test/set bit */
-#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_DIRTY 2
-#define EXTENT_BUFFER_CORRUPT 3
-#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */
-#define EXTENT_BUFFER_TREE_REF 5
-#define EXTENT_BUFFER_STALE 6
-#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_READ_ERR 8        /* read IO error */
-#define EXTENT_BUFFER_UNMAPPED 9
-#define EXTENT_BUFFER_IN_TREE 10
-#define EXTENT_BUFFER_WRITE_ERR 11    /* write IO error */
+enum {
+	EXTENT_BUFFER_UPTODATE,
+	EXTENT_BUFFER_DIRTY,
+	EXTENT_BUFFER_CORRUPT,
+	/* this got triggered by readahead */
+	EXTENT_BUFFER_READAHEAD,
+	EXTENT_BUFFER_TREE_REF,
+	EXTENT_BUFFER_STALE,
+	EXTENT_BUFFER_WRITEBACK,
+	/* read IO error */
+	EXTENT_BUFFER_READ_ERR,
+	EXTENT_BUFFER_UNMAPPED,
+	EXTENT_BUFFER_IN_TREE,
+	/* write IO error */
+	EXTENT_BUFFER_WRITE_ERR,
+};
 
 /* these are flags for __process_pages_contig */
 #define PAGE_UNLOCK		(1 << 0)
@@ -85,54 +95,43 @@
 struct btrfs_io_bio;
 struct io_failure_record;
 
-typedef	blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
-				       int mirror_num, unsigned long bio_flags,
-				       u64 bio_offset);
 
 typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
 		struct bio *bio, u64 bio_offset);
 
 struct extent_io_ops {
 	/*
-	 * The following callbacks must be allways defined, the function
+	 * The following callbacks must always be defined; the function
 	 * pointer will be called unconditionally.
 	 */
-	extent_submit_bio_hook_t *submit_bio_hook;
+	blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
+					int mirror_num, unsigned long bio_flags);
 	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
 				    struct page *page, u64 start, u64 end,
 				    int mirror);
-	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+};
 
-	/*
-	 * Optional hooks, called if the pointer is not NULL
-	 */
-	int (*fill_delalloc)(void *private_data, struct page *locked_page,
-			     u64 start, u64 end, int *page_started,
-			     unsigned long *nr_written,
-			     struct writeback_control *wbc);
-
-	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
-				      struct extent_state *state, int uptodate);
-	void (*set_bit_hook)(void *private_data, struct extent_state *state,
-			     unsigned *bits);
-	void (*clear_bit_hook)(void *private_data,
-			struct extent_state *state,
-			unsigned *bits);
-	void (*merge_extent_hook)(void *private_data,
-				  struct extent_state *new,
-				  struct extent_state *other);
-	void (*split_extent_hook)(void *private_data,
-				  struct extent_state *orig, u64 split);
-	void (*check_extent_io_range)(void *private_data, const char *caller,
-				      u64 start, u64 end);
+enum {
+	IO_TREE_FS_INFO_FREED_EXTENTS0,
+	IO_TREE_FS_INFO_FREED_EXTENTS1,
+	IO_TREE_INODE_IO,
+	IO_TREE_INODE_IO_FAILURE,
+	IO_TREE_RELOC_BLOCKS,
+	IO_TREE_TRANS_DIRTY_PAGES,
+	IO_TREE_ROOT_DIRTY_LOG_PAGES,
+	IO_TREE_SELFTEST,
 };
 
 struct extent_io_tree {
 	struct rb_root state;
+	struct btrfs_fs_info *fs_info;
 	void *private_data;
 	u64 dirty_bytes;
-	int track_uptodate;
+	bool track_uptodate;
+
+	/* Who owns this io tree, should be one of IO_TREE_* */
+	u8 owner;
+
 	spinlock_t lock;
 	const struct extent_io_ops *ops;
 };
@@ -168,14 +167,9 @@
 	struct rcu_head rcu_head;
 	pid_t lock_owner;
 
-	/* count of read lock holders on the extent buffer */
-	atomic_t write_locks;
-	atomic_t read_locks;
-	atomic_t blocking_writers;
+	int blocking_writers;
 	atomic_t blocking_readers;
-	atomic_t spinning_readers;
-	atomic_t spinning_writers;
-	short lock_nested;
+	bool lock_nested;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	short log_index;
 
@@ -193,6 +187,10 @@
 	wait_queue_head_t read_lock_wq;
 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
+	int spinning_writers;
+	atomic_t spinning_readers;
+	atomic_t read_locks;
+	int write_locks;
 	struct list_head leak_list;
 #endif
 };
@@ -261,7 +259,10 @@
 					  u64 start, u64 len,
 					  int create);
 
-void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+			 struct extent_io_tree *tree, unsigned int owner,
+			 void *private_data);
+void extent_io_tree_release(struct extent_io_tree *tree);
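A sketch of the extended init call; the owner value comes from the IO_TREE_* enum above, and the inode example is an assumption mirroring how callers are converted elsewhere in this update:

	/* fs_info and an owner id are now recorded in the tree itself,
	 * primarily so the tree can be identified in debugging output. */
	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
			    IO_TREE_INODE_IO, inode);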
 int try_release_extent_mapping(struct page *page, gfp_t mask);
 int try_release_extent_buffer(struct page *page);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -331,6 +332,8 @@
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   unsigned bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+			   unsigned bits);
 
 static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
 		u64 end, unsigned bits)
@@ -353,11 +356,11 @@
 }
 
 static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
-		u64 end)
+				     u64 end, struct extent_state **cached)
 {
 	return clear_extent_bit(tree, start, end,
 				EXTENT_DIRTY | EXTENT_DELALLOC |
-				EXTENT_DO_ACCOUNTING, 0, 0, NULL);
+				EXTENT_DO_ACCOUNTING, 0, 0, cached);
 }
 
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -398,6 +401,8 @@
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, unsigned bits,
 			  struct extent_state **cached_state);
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+				 u64 *start_ret, u64 *end_ret, unsigned bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct page *page, struct writeback_control *wbc);
@@ -427,8 +432,7 @@
 #define WAIT_NONE	0
 #define WAIT_COMPLETE	1
 #define WAIT_PAGE_LOCK	2
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
 			     int mirror_num);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
@@ -479,7 +483,7 @@
 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 				unsigned long pos, unsigned long len);
 void clear_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_under_io(struct extent_buffer *eb);
@@ -490,10 +494,10 @@
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
-				 u64 delalloc_end, struct page *locked_page,
-				 unsigned bits_to_clear,
-				 unsigned long page_ops);
-struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
+				  struct page *locked_page,
+				  unsigned bits_to_clear,
+				  unsigned long page_ops);
+struct bio *btrfs_bio_alloc(u64 first_byte);
 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
@@ -509,8 +513,7 @@
 		     struct extent_io_tree *io_tree, u64 start,
 		     struct page *page, u64 ino, unsigned int pg_offset);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
-			 struct extent_buffer *eb, int mirror_num);
+int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
 
 /*
  * When IO fails, either with EIO or csum verification fails, we
@@ -546,10 +549,9 @@
 		    struct extent_io_tree *io_tree,
 		    struct io_failure_record *rec);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-noinline u64 find_lock_delalloc_range(struct inode *inode,
-				      struct extent_io_tree *tree,
-				      struct page *locked_page, u64 *start,
-				      u64 *end, u64 max_bytes);
+bool find_lock_delalloc_range(struct inode *inode,
+			     struct page *locked_page, u64 *start,
+			     u64 *end);
 #endif
 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 					       u64 start);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55..9d30acc 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include "ctree.h"
+#include "volumes.h"
 #include "extent_map.h"
 #include "compression.h"
 
@@ -34,7 +35,7 @@
  */
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
-	tree->map = RB_ROOT;
+	tree->map = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
@@ -90,24 +91,27 @@
 	return start + len;
 }
 
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	struct extent_map *entry = NULL;
 	struct rb_node *orig_parent = NULL;
 	u64 end = range_end(em->start, em->len);
+	bool leftmost = true;
 
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
-		if (em->start < entry->start)
+		if (em->start < entry->start) {
 			p = &(*p)->rb_left;
-		else if (em->start >= extent_map_end(entry))
+		} else if (em->start >= extent_map_end(entry)) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return -EEXIST;
+		}
 	}
 
 	orig_parent = parent;
@@ -130,7 +134,7 @@
 			return -EEXIST;
 
 	rb_link_node(&em->rb_node, orig_parent, p);
-	rb_insert_color(&em->rb_node, root);
+	rb_insert_color_cached(&em->rb_node, root, leftmost);
 	return 0;
 }
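The payoff of the rb_root_cached conversion is constant-time access to the lowest-offset extent map; a one-line sketch using the generic rbtree API:

	/* rb_first_cached() returns the cached leftmost node directly
	 * instead of walking down from the root. */
	struct rb_node *first = rb_first_cached(&tree->map);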
 
@@ -207,6 +211,9 @@
 	if (!list_empty(&prev->list) || !list_empty(&next->list))
 		return 0;
 
+	ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+	       prev->block_start != EXTENT_MAP_DELALLOC);
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -214,8 +221,6 @@
 	      prev->block_start == EXTENT_MAP_HOLE) ||
 	     (next->block_start == EXTENT_MAP_INLINE &&
 	      prev->block_start == EXTENT_MAP_INLINE) ||
-	     (next->block_start == EXTENT_MAP_DELALLOC &&
-	      prev->block_start == EXTENT_MAP_DELALLOC) ||
 	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
 	      next->block_start == extent_map_block_end(prev)))) {
 		return 1;
@@ -242,7 +247,7 @@
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
 
-			rb_erase(&merge->rb_node, &tree->map);
+			rb_erase_cached(&merge->rb_node, &tree->map);
 			RB_CLEAR_NODE(&merge->rb_node);
 			free_extent_map(merge);
 		}
@@ -254,7 +259,7 @@
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
 		em->block_len += merge->block_len;
-		rb_erase(&merge->rb_node, &tree->map);
+		rb_erase_cached(&merge->rb_node, &tree->map);
 		RB_CLEAR_NODE(&merge->rb_node);
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
@@ -333,6 +338,37 @@
 		try_merge_map(tree, em);
 }
 
+static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
+{
+	struct map_lookup *map = em->map_lookup;
+	u64 stripe_size = em->orig_block_len;
+	int i;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		set_extent_bits_nowait(&device->alloc_state, stripe->physical,
+				 stripe->physical + stripe_size - 1, bits);
+	}
+}
+
+static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
+{
+	struct map_lookup *map = em->map_lookup;
+	u64 stripe_size = em->orig_block_len;
+	int i;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		__clear_extent_bit(&device->alloc_state, stripe->physical,
+				   stripe->physical + stripe_size - 1, bits,
+				   0, 0, NULL, GFP_NOWAIT, NULL);
+	}
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:	tree to insert new map in
@@ -348,11 +384,17 @@
 {
 	int ret = 0;
 
+	lockdep_assert_held_write(&tree->lock);
+
 	ret = tree_insert(&tree->map, em);
 	if (ret)
 		goto out;
 
 	setup_extent_mapping(tree, em, modified);
+	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
+		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
+		extent_map_device_clear_bits(em, CHUNK_TRIMMED);
+	}
 out:
 	return ret;
 }
@@ -367,7 +409,7 @@
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
+	rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
 	if (!rb_node) {
 		if (prev)
 			rb_node = prev;
@@ -428,16 +470,15 @@
  * Removes @em from @tree.  No reference counts are dropped, and no checks
  * are done to see if the range is in use
  */
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret = 0;
-
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	rb_erase(&em->rb_node, &tree->map);
+	rb_erase_cached(&em->rb_node, &tree->map);
 	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
 		list_del_init(&em->list);
+	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
 	RB_CLEAR_NODE(&em->rb_node);
-	return ret;
 }
 
 void replace_extent_mapping(struct extent_map_tree *tree,
@@ -449,7 +490,7 @@
 	ASSERT(extent_map_in_tree(cur));
 	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
 		list_del_init(&cur->list);
-	rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
 	RB_CLEAR_NODE(&cur->rb_node);
 
 	setup_extent_mapping(tree, new, modified);
@@ -475,7 +516,8 @@
 	return container_of(prev, struct extent_map, rb_node);
 }
 
-/* helper for btfs_get_extent.  Given an existing extent in the tree,
+/*
+ * Helper for btrfs_get_extent.  Given an existing extent in the tree,
  * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
  * the best fitted new extent into the tree.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e..473f039 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,15 +9,23 @@
 #define EXTENT_MAP_LAST_BYTE ((u64)-4)
 #define EXTENT_MAP_HOLE ((u64)-3)
 #define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
 #define EXTENT_MAP_DELALLOC ((u64)-1)
 
-/* bits for the flags field */
-#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
-#define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
-#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
-#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
-#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
+/* bits for the extent_map::flags field */
+enum {
+	/* this entry not yet on disk, don't free it */
+	EXTENT_FLAG_PINNED,
+	EXTENT_FLAG_COMPRESSED,
+	/* pre-allocated extent */
+	EXTENT_FLAG_PREALLOC,
+	/* Logging this extent */
+	EXTENT_FLAG_LOGGING,
+	/* Filling in a preallocated extent */
+	EXTENT_FLAG_FILLING,
+	/* filesystem extent mapping type */
+	EXTENT_FLAG_FS_MAPPING,
+};
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -49,7 +57,7 @@
 };
 
 struct extent_map_tree {
-	struct rb_root map;
+	struct rb_root_cached map;
 	struct list_head modified_extents;
 	rwlock_t lock;
 };
@@ -78,7 +86,7 @@
 					 u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em, int modified);
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
 void replace_extent_mapping(struct extent_map_tree *tree,
 			    struct extent_map *cur,
 			    struct extent_map *new,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ba74827..1a599f5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -7,6 +7,8 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/sched/mm.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
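crypto/hash.h provides the synchronous-hash (shash) interface that the checksumming hunks below switch to; the general pattern, sketched standalone with placeholder tfm/data/out names:

	/* SHASH_DESC_ON_STACK sizes a descriptor for the given tfm on the
	 * stack; init/update/final then computes the digest into 'out'. */
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	crypto_shash_init(shash);
	crypto_shash_update(shash, data, len);
	crypto_shash_final(shash, out);	/* 'out' must fit the digest size */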
@@ -21,9 +23,13 @@
 #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
 				       PAGE_SIZE))
 
-#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \
-				   sizeof(struct btrfs_ordered_sum)) / \
-				   sizeof(u32) * (fs_info)->sectorsize)
+static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
+					u16 csum_size)
+{
+	u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
+
+	return ncsums * fs_info->sectorsize;
+}
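For scale, with illustrative numbers only: assuming 4 KiB pages, crc32c's 4-byte checksums, and a struct btrfs_ordered_sum header of a few dozen bytes, ncsums comes out to roughly a thousand, so a single ordered sum can describe on the order of 4 MiB of data at a 4 KiB sectorsize.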
 
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -142,13 +148,8 @@
 	return ret;
 }
 
-static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
-{
-	kfree(bio->csum_allocated);
-}
-
 static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
-				   u64 logical_offset, u32 *dst, int dio)
+				   u64 logical_offset, u8 *dst, int dio)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct bio_vec bvec;
@@ -175,20 +176,18 @@
 	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
 	if (!dst) {
 		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
-			btrfs_bio->csum_allocated = kmalloc_array(nblocks,
-					csum_size, GFP_NOFS);
-			if (!btrfs_bio->csum_allocated) {
+			btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
+							GFP_NOFS);
+			if (!btrfs_bio->csum) {
 				btrfs_free_path(path);
 				return BLK_STS_RESOURCE;
 			}
-			btrfs_bio->csum = btrfs_bio->csum_allocated;
-			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
 		} else {
 			btrfs_bio->csum = btrfs_bio->csum_inline;
 		}
 		csum = btrfs_bio->csum;
 	} else {
-		csum = (u8 *)dst;
+		csum = dst;
 	}
 
 	if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
@@ -217,7 +216,7 @@
 		if (!dio)
 			offset = page_offset(bvec.bv_page) + bvec.bv_offset;
 		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
-					       (u32 *)csum, nblocks);
+					       csum, nblocks);
 		if (count)
 			goto found;
 
@@ -289,7 +288,8 @@
 	return 0;
 }
 
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+				   u8 *dst)
 {
 	return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
 }
@@ -380,7 +380,7 @@
 				      struct btrfs_csum_item);
 		while (start < csum_end) {
 			size = min_t(size_t, csum_end - start,
-				     MAX_ORDERED_SUM_BYTES(fs_info));
+				     max_ordered_sum_bytes(fs_info, csum_size));
 			sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
 				       GFP_NOFS);
 			if (!sums) {
@@ -419,10 +419,21 @@
 	return ret;
 }
 
+/*
+ * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+ * @inode:	 Owner of the data inside the bio
+ * @bio:	 Contains the data to be checksummed
+ * @file_start:  offset in file this bio begins to describe
+ * @contig:	 Boolean. If true/1, all bio vecs in this bio are
+ *		 contiguous and they begin at @file_start in the file. False/0
+ *		 means this bio can contain potentially discontiguous bio vecs,
+ *		 so the logical offset of each should be calculated separately.
+ */
 blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 		       u64 file_start, int contig)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_ordered_extent *ordered = NULL;
 	char *data;
@@ -434,9 +445,14 @@
 	unsigned long this_sum_bytes = 0;
 	int i;
 	u64 offset;
+	unsigned nofs_flag;
+	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
-	sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
-		       GFP_NOFS);
+	nofs_flag = memalloc_nofs_save();
+	sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
+		       GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
+
 	if (!sums)
 		return BLK_STS_RESOURCE;
 
@@ -451,6 +467,8 @@
 	sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
 	index = 0;
 
+	shash->tfm = fs_info->csum_shash;
+
 	bio_for_each_segment(bvec, bio, iter) {
 		if (!contig)
 			offset = page_offset(bvec.bv_page) + bvec.bv_offset;
@@ -460,8 +478,6 @@
 			BUG_ON(!ordered); /* Logic error */
 		}
 
-		data = kmap_atomic(bvec.bv_page);
-
 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
 						 bvec.bv_len + fs_info->sectorsize
 						 - 1);
@@ -471,16 +487,17 @@
 				offset < ordered->file_offset) {
 				unsigned long bytes_left;
 
-				kunmap_atomic(data);
 				sums->len = this_sum_bytes;
 				this_sum_bytes = 0;
-				btrfs_add_ordered_sum(inode, ordered, sums);
+				btrfs_add_ordered_sum(ordered, sums);
 				btrfs_put_ordered_extent(ordered);
 
 				bytes_left = bio->bi_iter.bi_size - total_bytes;
 
-				sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
-					       GFP_NOFS);
+				nofs_flag = memalloc_nofs_save();
+				sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
+						      bytes_left), GFP_KERNEL);
+				memalloc_nofs_restore(nofs_flag);
 				BUG_ON(!sums); /* -ENOMEM */
 				sums->len = bytes_left;
 				ordered = btrfs_lookup_ordered_extent(inode,
@@ -489,28 +506,24 @@
 				sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
 					+ total_bytes;
 				index = 0;
-
-				data = kmap_atomic(bvec.bv_page);
 			}
 
-			sums->sums[index] = ~(u32)0;
-			sums->sums[index]
-				= btrfs_csum_data(data + bvec.bv_offset
-						+ (i * fs_info->sectorsize),
-						sums->sums[index],
-						fs_info->sectorsize);
-			btrfs_csum_final(sums->sums[index],
-					(char *)(sums->sums + index));
-			index++;
+			crypto_shash_init(shash);
+			data = kmap_atomic(bvec.bv_page);
+			crypto_shash_update(shash, data + bvec.bv_offset
+					    + (i * fs_info->sectorsize),
+					    fs_info->sectorsize);
+			kunmap_atomic(data);
+			crypto_shash_final(shash, (char *)(sums->sums + index));
+			index += csum_size;
 			offset += fs_info->sectorsize;
 			this_sum_bytes += fs_info->sectorsize;
 			total_bytes += fs_info->sectorsize;
 		}
 
-		kunmap_atomic(data);
 	}
 	this_sum_bytes = 0;
-	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_add_ordered_sum(ordered, sums);
 	btrfs_put_ordered_extent(ordered);
 	return 0;
 }
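/*
 * [Editor's aside -- a hedged sketch of the synchronous hash API the new
 * code relies on. SHASH_DESC_ON_STACK() declares a shash descriptor on the
 * stack for the given tfm; init/update/final then produce one digest.
 * csum_one_sector() is a hypothetical helper; fs_info->csum_shash is
 * presumably allocated elsewhere, e.g. with crypto_alloc_shash("crc32c", 0, 0).]
 */
#include <crypto/hash.h>

static int csum_one_sector(struct crypto_shash *tfm, const void *data,
			   unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);
	int ret;

	shash->tfm = tfm;                       /* the macro does not set tfm */
	ret = crypto_shash_init(shash);
	if (ret)
		return ret;
	ret = crypto_shash_update(shash, data, len);
	if (ret)
		return ret;
	return crypto_shash_final(shash, out);  /* digest written to out */
}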
@@ -551,7 +564,7 @@
 		 */
 		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
 		new_size *= csum_size;
-		btrfs_truncate_item(fs_info, path, new_size, 1);
+		btrfs_truncate_item(path, new_size, 1);
 	} else if (key->offset >= bytenr && csum_end > end_byte &&
 		   end_byte > key->offset) {
 		/*
@@ -563,7 +576,7 @@
 		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
 		new_size *= csum_size;
 
-		btrfs_truncate_item(fs_info, path, new_size, 0);
+		btrfs_truncate_item(path, new_size, 0);
 
 		key->offset = end_byte;
 		btrfs_set_item_key_safe(fs_info, path, key);
@@ -832,11 +845,11 @@
 		u32 diff;
 		u32 free_space;
 
-		if (btrfs_leaf_free_space(fs_info, leaf) <
+		if (btrfs_leaf_free_space(leaf) <
 				 sizeof(struct btrfs_item) + csum_size * 2)
 			goto insert;
 
-		free_space = btrfs_leaf_free_space(fs_info, leaf) -
+		free_space = btrfs_leaf_free_space(leaf) -
 					 sizeof(struct btrfs_item) - csum_size;
 		tmp = sums->len - total_bytes;
 		tmp >>= fs_info->sb->s_blocksize_bits;
@@ -852,7 +865,7 @@
 		diff /= csum_size;
 		diff *= csum_size;
 
-		btrfs_extend_item(fs_info, path, diff);
+		btrfs_extend_item(path, diff);
 		ret = 0;
 		goto csum;
 	}
@@ -898,9 +911,9 @@
 	write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
 			    ins_size);
 
+	index += ins_size;
 	ins_size /= csum_size;
 	total_bytes += ins_size * fs_info->sectorsize;
-	index += ins_size;
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca4902c..435a502 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -26,6 +26,7 @@
 #include "volumes.h"
 #include "qgroup.h"
 #include "compression.h"
+#include "delalloc-space.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
@@ -399,7 +400,7 @@
 	size_t copied = 0;
 	size_t total_copied = 0;
 	int pg = 0;
-	int offset = pos & (PAGE_SIZE - 1);
+	int offset = offset_in_page(pos);
 
 	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
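/*
 * [Editor's aside: offset_in_page() is the generic helper for the open-coded
 * mask it replaces above; for any power-of-two PAGE_SIZE the two forms are
 * equivalent:
 *
 *	offset_in_page(p) == ((unsigned long)(p) & ~PAGE_MASK)
 *	                  == ((unsigned long)(p) & (PAGE_SIZE - 1))
 *
 * e.g. with PAGE_SIZE == 0x1000, offset_in_page(0x12345) == 0x345.]
 */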
@@ -536,8 +537,8 @@
 	 * we can set things up properly
 	 */
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
-			 EXTENT_DIRTY | EXTENT_DELALLOC |
-			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, cached);
 
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
 		if (start_pos >= isize &&
@@ -558,7 +559,7 @@
 	}
 
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-					extra_bits, cached, 0);
+					extra_bits, cached);
 	if (err)
 		return err;
 
@@ -754,6 +755,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_ref ref = { 0 };
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 ino = btrfs_ino(BTRFS_I(inode));
@@ -909,11 +911,14 @@
 			btrfs_mark_buffer_dirty(leaf);
 
 			if (update_refs && disk_bytenr > 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						disk_bytenr, num_bytes, 0,
+				btrfs_init_generic_ref(&ref,
+						BTRFS_ADD_DELAYED_REF,
+						disk_bytenr, num_bytes, 0);
+				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						new_key.objectid,
 						start - extent_offset);
+				ret = btrfs_inc_extent_ref(trans, &ref);
 				BUG_ON(ret); /* -ENOMEM */
 			}
 			key.offset = start;
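/*
 * [Editor's aside -- a hedged sketch of the btrfs_ref convention that
 * replaces the long argument lists above: zero-initialize a struct
 * btrfs_ref, fill it with btrfs_init_generic_ref() (action, bytenr, length,
 * parent) and btrfs_init_data_ref() (root, inode, offset), then pass it to
 * btrfs_inc_extent_ref() or btrfs_free_extent(). add_data_ref() is a
 * hypothetical wrapper, for illustration only.]
 */
static int add_data_ref(struct btrfs_trans_handle *trans, u64 bytenr,
			u64 num_bytes, u64 root_objectid, u64 ino, u64 offset)
{
	struct btrfs_ref ref = { 0 };

	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
			       num_bytes, 0 /* parent = 0: not a shared ref */);
	btrfs_init_data_ref(&ref, root_objectid, ino, offset);
	return btrfs_inc_extent_ref(trans, &ref);
}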
@@ -993,11 +998,14 @@
 				extent_end = ALIGN(extent_end,
 						   fs_info->sectorsize);
 			} else if (update_refs && disk_bytenr > 0) {
-				ret = btrfs_free_extent(trans, root,
-						disk_bytenr, num_bytes, 0,
+				btrfs_init_generic_ref(&ref,
+						BTRFS_DROP_DELAYED_REF,
+						disk_bytenr, num_bytes, 0);
+				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
-						key.objectid, key.offset -
-						extent_offset);
+						key.objectid,
+						key.offset - extent_offset);
+				ret = btrfs_free_extent(trans, &ref);
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
@@ -1025,7 +1033,7 @@
 			continue;
 		}
 
-		BUG_ON(1);
+		BUG();
 	}
 
 	if (!ret && del_nr > 0) {
@@ -1050,7 +1058,7 @@
 	if (!ret && replace_extent && leafs_visited == 1 &&
 	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
 	     path->locks[0] == BTRFS_WRITE_LOCK) &&
-	    btrfs_leaf_free_space(fs_info, leaf) >=
+	    btrfs_leaf_free_space(leaf) >=
 	    sizeof(struct btrfs_item) + extent_item_size) {
 
 		key.objectid = ino;
@@ -1142,6 +1150,7 @@
 	struct extent_buffer *leaf;
 	struct btrfs_path *path;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_ref ref = { 0 };
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 bytenr;
@@ -1287,9 +1296,11 @@
 						extent_end - split);
 		btrfs_mark_buffer_dirty(leaf);
 
-		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-					   0, root->root_key.objectid,
-					   ino, orig_offset);
+		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
+				       num_bytes, 0);
+		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
+				    orig_offset);
+		ret = btrfs_inc_extent_ref(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -1311,6 +1322,9 @@
 
 	other_start = end;
 	other_end = 0;
+	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+			       num_bytes, 0);
+	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
 	if (extent_mergeable(leaf, path->slots[0] + 1,
 			     ino, bytenr, orig_offset,
 			     &other_start, &other_end)) {
@@ -1321,9 +1335,7 @@
 		extent_end = other_end;
 		del_slot = path->slots[0] + 1;
 		del_nr++;
-		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+		ret = btrfs_free_extent(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -1341,9 +1353,7 @@
 		key.offset = other_start;
 		del_slot = path->slots[0];
 		del_nr++;
-		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+		ret = btrfs_free_extent(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
@@ -1541,30 +1551,20 @@
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_root *root = inode->root;
-	struct btrfs_ordered_extent *ordered;
 	u64 lockstart, lockend;
 	u64 num_bytes;
 	int ret;
 
 	ret = btrfs_start_write_no_snapshotting(root);
 	if (!ret)
-		return -ENOSPC;
+		return -EAGAIN;
 
 	lockstart = round_down(pos, fs_info->sectorsize);
 	lockend = round_up(pos + *write_bytes,
 			   fs_info->sectorsize) - 1;
 
-	while (1) {
-		lock_extent(&inode->io_tree, lockstart, lockend);
-		ordered = btrfs_lookup_ordered_range(inode, lockstart,
-						     lockend - lockstart + 1);
-		if (!ordered) {
-			break;
-		}
-		unlock_extent(&inode->io_tree, lockstart, lockend);
-		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
+	btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
+					   lockend, NULL);
 
 	num_bytes = lockend - lockstart + 1;
 	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
@@ -1591,7 +1591,6 @@
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
-	struct extent_state *cached_state = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	u64 release_bytes = 0;
 	u64 lockstart;
@@ -1611,7 +1610,8 @@
 		return -ENOMEM;
 
 	while (iov_iter_count(i) > 0) {
-		size_t offset = pos & (PAGE_SIZE - 1);
+		struct extent_state *cached_state = NULL;
+		size_t offset = offset_in_page(pos);
 		size_t sector_offset;
 		size_t write_bytes = min(iov_iter_count(i),
 					 nrptrs * (size_t)PAGE_SIZE -
@@ -1692,7 +1692,7 @@
 				    force_page_uptodate);
 		if (ret) {
 			btrfs_delalloc_release_extents(BTRFS_I(inode),
-						       reserve_bytes, true);
+						       reserve_bytes);
 			break;
 		}
 
@@ -1704,7 +1704,7 @@
 			if (extents_locked == -EAGAIN)
 				goto again;
 			btrfs_delalloc_release_extents(BTRFS_I(inode),
-						       reserve_bytes, true);
+						       reserve_bytes);
 			ret = extents_locked;
 			break;
 		}
@@ -1758,11 +1758,21 @@
 		if (copied > 0)
 			ret = btrfs_dirty_pages(inode, pages, dirty_pages,
 						pos, copied, &cached_state);
+
+		/*
+		 * If we have not locked the extent range, because the range's
+		 * start offset is >= i_size, we might still have a non-NULL
+		 * cached extent state, acquired while marking the extent range
+		 * as delalloc through btrfs_dirty_pages(). Therefore free any
+		 * possible cached extent state to avoid a memory leak.
+		 */
 		if (extents_locked)
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     lockstart, lockend, &cached_state);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
-					       true);
+		else
+			free_extent_state(cached_state);
+
+		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
 		if (ret) {
 			btrfs_drop_pages(pages, num_pages);
 			break;
@@ -1882,10 +1892,10 @@
 	u64 start_pos;
 	u64 end_pos;
 	ssize_t num_written = 0;
-	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+	const bool sync = iocb->ki_flags & IOCB_DSYNC;
 	ssize_t err;
 	loff_t pos;
-	size_t count = iov_iter_count(from);
+	size_t count;
 	loff_t oldsize;
 	int clean_page = 0;
 
@@ -1906,6 +1916,7 @@
 	}
 
 	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
 	if (iocb->ki_flags & IOCB_NOWAIT) {
 		/*
 		 * We will allocate space in case nodatacow is not set,
@@ -2005,7 +2016,7 @@
 	filp->private_data = NULL;
 
 	/*
-	 * ordered_data_close is set by settattr when we are about to truncate
+	 * ordered_data_close is set by setattr when we are about to truncate
 	 * a file from a non-zero size to a zero size.  This tries to
 	 * flush down new bytes that may have been written if the
 	 * application were using truncate to replace a file in place.
@@ -2056,13 +2067,7 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_log_ctx ctx;
 	int ret = 0, err;
-	u64 len;
 
-	/*
-	 * The range length can be represented by u64, we have to do the typecasts
-	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
-	 */
-	len = (u64)end - (u64)start + 1;
 	trace_btrfs_sync_file(file, datasync);
 
 	btrfs_init_log_ctx(&ctx, inode);
@@ -2089,6 +2094,19 @@
 	atomic_inc(&root->log_batch);
 
 	/*
+	 * If the inode needs a full sync, make sure we use a full range to
+	 * avoid log tree corruption, due to hole detection racing with ordered
+	 * extent completion for adjacent ranges, and assertion failures during
+	 * hole detection. Do this while holding the inode lock, to avoid races
+	 * with other tasks.
+	 */
+	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		start = 0;
+		end = LLONG_MAX;
+	}
+
+	/*
 	 * Before we acquired the inode's lock, someone may have dirtied more
 	 * pages in the target range. We need to make sure that writeback for
 	 * any such pages does not start while we are logging the inode, because
@@ -2114,9 +2132,12 @@
 
 	/*
 	 * We have to do this here to avoid the priority inversion of waiting on
-	 * IO of a lower priority task while holding a transaciton open.
+	 * IO of a lower priority task while holding a transaction open.
+	 *
+	 * Also, the range length can be represented only by a u64, so we have
+	 * to do the typecasts to avoid signed overflow if it's [0, LLONG_MAX].
 	 */
-	ret = btrfs_wait_ordered_range(inode, start, len);
+	ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
 	if (ret) {
 		up_write(&BTRFS_I(inode)->dio_sem);
 		inode_unlock(inode);
@@ -2154,7 +2175,7 @@
 	 * here we could get into a situation where we're waiting on IO to
 	 * happen that is blocked on a transaction trying to commit.  With start
 	 * we inc the extwriter counter, so we wait for all extwriters to exit
-	 * before we start blocking join'ers.  This comment is to keep somebody
+	 * before we start blocking joiners.  This comment is to keep somebody
 	 * from thinking they are super smart and changing this to
 	 * btrfs_join_transaction *cough*Josef*cough*.
 	 */
@@ -2165,7 +2186,6 @@
 		inode_unlock(inode);
 		goto out;
 	}
-	trans->sync = true;
 
 	ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
 	if (ret < 0) {
@@ -2186,25 +2206,6 @@
 	up_write(&BTRFS_I(inode)->dio_sem);
 	inode_unlock(inode);
 
-	/*
-	 * If any of the ordered extents had an error, just return it to user
-	 * space, so that the application knows some writes didn't succeed and
-	 * can take proper action (retry for e.g.). Blindly committing the
-	 * transaction in this case, would fool userspace that everything was
-	 * successful. And we also want to make sure our log doesn't contain
-	 * file extent items pointing to extents that weren't fully written to -
-	 * just like in the non fast fsync path, where we check for the ordered
-	 * operation's error flag before writing to the log tree and return -EIO
-	 * if any of them had this flag set (btrfs_wait_ordered_range) -
-	 * therefore we need to check for errors in the ordered operations,
-	 * which are indicated by ctx.io_err.
-	 */
-	if (ctx.io_err) {
-		btrfs_end_transaction(trans);
-		ret = ctx.io_err;
-		goto out;
-	}
-
 	if (ret != BTRFS_NO_LOG_SYNC) {
 		if (!ret) {
 			ret = btrfs_sync_log(trans, root, &ctx);
@@ -2447,27 +2448,286 @@
 	return 0;
 }
 
+static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     struct btrfs_path *path,
+				     struct btrfs_clone_extent_info *clone_info,
+				     const u64 clone_len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int slot;
+	struct btrfs_ref ref = { 0 };
+	u64 ref_offset;
+	int ret;
+
+	if (clone_len == 0)
+		return 0;
+
+	if (clone_info->disk_offset == 0 &&
+	    btrfs_fs_incompat(fs_info, NO_HOLES))
+		return 0;
+
+	key.objectid = btrfs_ino(BTRFS_I(inode));
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = clone_info->file_offset;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      clone_info->item_size);
+	if (ret)
+		return ret;
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	write_extent_buffer(leaf, clone_info->extent_buf,
+			    btrfs_item_ptr_offset(leaf, slot),
+			    clone_info->item_size);
+	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
+	btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	/* If it's a hole, nothing more needs to be done. */
+	if (clone_info->disk_offset == 0)
+		return 0;
+
+	inode_add_bytes(inode, clone_len);
+	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+			       clone_info->disk_offset,
+			       clone_info->disk_len, 0);
+	ref_offset = clone_info->file_offset - clone_info->data_offset;
+	btrfs_init_data_ref(&ref, root->root_key.objectid,
+			    btrfs_ino(BTRFS_I(inode)), ref_offset);
+	ret = btrfs_inc_extent_ref(trans, &ref);
+
+	return ret;
+}
+
+/*
+ * The respective range must have been previously locked, as well as the inode.
+ * The end offset is inclusive (last byte of the range).
+ * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
+ * cloning.
+ * When cloning, we don't want to end up in a state where we dropped extents
+ * without inserting a new one, so we must abort the transaction to avoid
+ * corruption.
+ */
+int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+			   const u64 start, const u64 end,
+			   struct btrfs_clone_extent_info *clone_info,
+			   struct btrfs_trans_handle **trans_out)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_block_rsv *rsv;
+	unsigned int rsv_count;
+	u64 cur_offset;
+	u64 drop_end;
+	u64 len = end - start;
+	int ret = 0;
+
+	if (end <= start)
+		return -EINVAL;
+
+	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
+	rsv->failfast = 1;
+
+	/*
+	 * 1 - update the inode
+	 * 1 - removing the extents in the range
+	 * 1 - adding the hole extent if no_holes isn't set or if we are cloning
+	 *     an extent
+	 */
+	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+		rsv_count = 3;
+	else
+		rsv_count = 2;
+
+	trans = btrfs_start_transaction(root, rsv_count);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out_free;
+	}
+
+	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+				      min_size, false);
+	BUG_ON(ret);
+	trans->block_rsv = rsv;
+
+	cur_offset = start;
+	while (cur_offset < end) {
+		ret = __btrfs_drop_extents(trans, root, inode, path,
+					   cur_offset, end + 1, &drop_end,
+					   1, 0, 0, NULL);
+		if (ret != -ENOSPC) {
+			/*
+			 * When cloning we want to avoid transaction aborts when
+			 * nothing was done and we are attempting to clone parts
+			 * of inline extents, in such cases -EOPNOTSUPP is
+			 * returned by __btrfs_drop_extents() without having
+			 * changed anything in the file.
+			 */
+			if (clone_info && ret && ret != -EOPNOTSUPP)
+				btrfs_abort_transaction(trans, ret);
+			break;
+		}
+
+		trans->block_rsv = &fs_info->trans_block_rsv;
+
+		if (!clone_info && cur_offset < drop_end &&
+		    cur_offset < ino_size) {
+			ret = fill_holes(trans, BTRFS_I(inode), path,
+					cur_offset, drop_end);
+			if (ret) {
+				/*
+				 * If we failed then we didn't insert our hole
+				 * entries for the area we dropped, so now the
+				 * fs is corrupted, so we must abort the
+				 * transaction.
+				 */
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+		}
+
+		if (clone_info) {
+			u64 clone_len = drop_end - cur_offset;
+
+			ret = btrfs_insert_clone_extent(trans, inode, path,
+							clone_info, clone_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+			clone_info->data_len -= clone_len;
+			clone_info->data_offset += clone_len;
+			clone_info->file_offset += clone_len;
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret)
+			break;
+
+		btrfs_end_transaction(trans);
+		btrfs_btree_balance_dirty(fs_info);
+
+		trans = btrfs_start_transaction(root, rsv_count);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+					      rsv, min_size, false);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+
+		if (!clone_info) {
+			ret = find_first_non_hole(inode, &cur_offset, &len);
+			if (unlikely(ret < 0))
+				break;
+			if (ret && !len) {
+				ret = 0;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * If we were cloning, force the next fsync to be a full one since we
+	 * replaced (or just dropped in the case of cloning holes when
+	 * NO_HOLES is enabled) extents and extent maps.
+	 * This is for the sake of simplicity, and cloning into files larger
+	 * than 16MiB would force the full fsync anyway (when
+	 * try_release_extent_mapping() is invoked during page cache truncation).
+	 */
+	if (clone_info)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
+	if (ret)
+		goto out_trans;
+
+	trans->block_rsv = &fs_info->trans_block_rsv;
+	/*
+	 * If we are using the NO_HOLES feature we might already have had a
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= end)
+		drop_end = end + 1;
+	/*
+	 * Don't insert file hole extent item if it's for a range beyond eof
+	 * (because it's useless) or if it represents a 0 bytes range (when
+	 * cur_offset == drop_end).
+	 */
+	if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+		ret = fill_holes(trans, BTRFS_I(inode), path,
+				cur_offset, drop_end);
+		if (ret) {
+			/* Same comment as above. */
+			btrfs_abort_transaction(trans, ret);
+			goto out_trans;
+		}
+	}
+	if (clone_info) {
+		ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
+						clone_info->data_len);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out_trans;
+		}
+	}
+
+out_trans:
+	if (!trans)
+		goto out_free;
+
+	trans->block_rsv = &fs_info->trans_block_rsv;
+	if (ret)
+		btrfs_end_transaction(trans);
+	else
+		*trans_out = trans;
+out_free:
+	btrfs_free_block_rsv(fs_info, rsv);
+out:
+	return ret;
+}
+
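/*
 * [Editor's aside -- a hedged usage sketch of the helper above. The caller
 * must hold the inode lock and the extent range lock; clone_info == NULL
 * selects hole punching, non-NULL selects extent cloning. On success an open
 * transaction is returned through *trans_out and the caller must end it, as
 * btrfs_punch_hole() below does:]
 */
	struct btrfs_trans_handle *trans = NULL;
	int ret;

	ret = btrfs_punch_hole_range(inode, path, lockstart, lockend,
				     NULL /* punch a hole */, &trans);
	if (!ret) {
		/* update timestamps and the inode item, then: */
		btrfs_end_transaction(trans);
	}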
 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
-	struct btrfs_block_rsv *rsv;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	u64 lockstart;
 	u64 lockend;
 	u64 tail_start;
 	u64 tail_len;
 	u64 orig_start = offset;
-	u64 cur_offset;
-	u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
-	u64 drop_end;
 	int ret = 0;
-	int err = 0;
-	unsigned int rsv_count;
 	bool same_block;
-	bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
 	u64 ino_size;
 	bool truncated_block = false;
 	bool updated_inode = false;
@@ -2565,10 +2825,8 @@
 
 	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
 					  &cached_state);
-	if (ret) {
-		inode_unlock(inode);
+	if (ret)
 		goto out_only_mutex;
-	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -2576,145 +2834,24 @@
 		goto out;
 	}
 
-	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
-	if (!rsv) {
-		ret = -ENOMEM;
-		goto out_free;
-	}
-	rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
-	rsv->failfast = 1;
+	ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+				     &trans);
+	btrfs_free_path(path);
+	if (ret)
+		goto out;
 
-	/*
-	 * 1 - update the inode
-	 * 1 - removing the extents in the range
-	 * 1 - adding the hole extent if no_holes isn't set
-	 */
-	rsv_count = no_holes ? 2 : 3;
-	trans = btrfs_start_transaction(root, rsv_count);
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
-		goto out_free;
-	}
-
-	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
-				      min_size, 0);
-	BUG_ON(ret);
-	trans->block_rsv = rsv;
-
-	cur_offset = lockstart;
-	len = lockend - cur_offset;
-	while (cur_offset < lockend) {
-		ret = __btrfs_drop_extents(trans, root, inode, path,
-					   cur_offset, lockend + 1,
-					   &drop_end, 1, 0, 0, NULL);
-		if (ret != -ENOSPC)
-			break;
-
-		trans->block_rsv = &fs_info->trans_block_rsv;
-
-		if (cur_offset < drop_end && cur_offset < ino_size) {
-			ret = fill_holes(trans, BTRFS_I(inode), path,
-					cur_offset, drop_end);
-			if (ret) {
-				/*
-				 * If we failed then we didn't insert our hole
-				 * entries for the area we dropped, so now the
-				 * fs is corrupted, so we must abort the
-				 * transaction.
-				 */
-				btrfs_abort_transaction(trans, ret);
-				err = ret;
-				break;
-			}
-		}
-
-		cur_offset = drop_end;
-
-		ret = btrfs_update_inode(trans, root, inode);
-		if (ret) {
-			err = ret;
-			break;
-		}
-
-		btrfs_end_transaction(trans);
-		btrfs_btree_balance_dirty(fs_info);
-
-		trans = btrfs_start_transaction(root, rsv_count);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			trans = NULL;
-			break;
-		}
-
-		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
-					      rsv, min_size, 0);
-		BUG_ON(ret);	/* shouldn't happen */
-		trans->block_rsv = rsv;
-
-		ret = find_first_non_hole(inode, &cur_offset, &len);
-		if (unlikely(ret < 0))
-			break;
-		if (ret && !len) {
-			ret = 0;
-			break;
-		}
-	}
-
-	if (ret) {
-		err = ret;
-		goto out_trans;
-	}
-
-	trans->block_rsv = &fs_info->trans_block_rsv;
-	/*
-	 * If we are using the NO_HOLES feature we might have had already an
-	 * hole that overlaps a part of the region [lockstart, lockend] and
-	 * ends at (or beyond) lockend. Since we have no file extent items to
-	 * represent holes, drop_end can be less than lockend and so we must
-	 * make sure we have an extent map representing the existing hole (the
-	 * call to __btrfs_drop_extents() might have dropped the existing extent
-	 * map representing the existing hole), otherwise the fast fsync path
-	 * will not record the existence of the hole region
-	 * [existing_hole_start, lockend].
-	 */
-	if (drop_end <= lockend)
-		drop_end = lockend + 1;
-	/*
-	 * Don't insert file hole extent item if it's for a range beyond eof
-	 * (because it's useless) or if it represents a 0 bytes range (when
-	 * cur_offset == drop_end).
-	 */
-	if (cur_offset < ino_size && cur_offset < drop_end) {
-		ret = fill_holes(trans, BTRFS_I(inode), path,
-				cur_offset, drop_end);
-		if (ret) {
-			/* Same comment as above. */
-			btrfs_abort_transaction(trans, ret);
-			err = ret;
-			goto out_trans;
-		}
-	}
-
-out_trans:
-	if (!trans)
-		goto out_free;
-
+	ASSERT(trans != NULL);
 	inode_inc_iversion(inode);
 	inode->i_mtime = inode->i_ctime = current_time(inode);
-
-	trans->block_rsv = &fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
 	updated_inode = true;
 	btrfs_end_transaction(trans);
 	btrfs_btree_balance_dirty(fs_info);
-out_free:
-	btrfs_free_path(path);
-	btrfs_free_block_rsv(fs_info, rsv);
 out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state);
 out_only_mutex:
-	if (!updated_inode && truncated_block && !ret && !err) {
+	if (!updated_inode && truncated_block && !ret) {
 		/*
 		 * If we only end up zeroing part of a page, we still need to
 		 * update the inode item, so that all the time fields are
@@ -2722,18 +2859,25 @@
 		 * for detecting, at fsync time, if the inode isn't yet in the
 		 * log tree or it's there but not up to date.
 		 */
+		struct timespec64 now = current_time(inode);
+
+		inode_inc_iversion(inode);
+		inode->i_mtime = now;
+		inode->i_ctime = now;
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			err = PTR_ERR(trans);
+			ret = PTR_ERR(trans);
 		} else {
-			err = btrfs_update_inode(trans, root, inode);
-			ret = btrfs_end_transaction(trans);
+			int ret2;
+
+			ret = btrfs_update_inode(trans, root, inode);
+			ret2 = btrfs_end_transaction(trans);
+			if (!ret)
+				ret = ret2;
 		}
 	}
 	inode_unlock(inode);
-	if (ret && !err)
-		err = ret;
-	return err;
+	return ret;
 }
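/*
 * [Editor's aside -- the ret/ret2 idiom introduced above preserves the first
 * error: the primary operation's status wins, and the unconditional cleanup
 * step's status is reported only when the primary step succeeded.]
 */
	ret = btrfs_update_inode(trans, root, inode);	/* primary operation */
	ret2 = btrfs_end_transaction(trans);		/* must always run */
	if (!ret)
		ret = ret2;	/* surface the cleanup error only if ret == 0 */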
 
 /* Helper structure to record which range is already reserved */
@@ -2802,9 +2946,9 @@
 }
 
 enum {
-	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
-	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
-	RANGE_BOUNDARY_HOLE = 2,
+	RANGE_BOUNDARY_WRITTEN_EXTENT,
+	RANGE_BOUNDARY_PREALLOC_EXTENT,
+	RANGE_BOUNDARY_HOLE,
 };
 
 static int btrfs_zero_range_check_range_boundary(struct inode *inode,
@@ -3151,6 +3295,7 @@
 			ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
 					cur_offset, last_byte - cur_offset);
 			if (ret < 0) {
+				cur_offset = last_byte;
 				free_extent_map(em);
 				break;
 			}
@@ -3200,7 +3345,7 @@
 	/* Let go of our reservation. */
 	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
-				alloc_start, alloc_end - cur_offset);
+				cur_offset, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
 	return ret;
 }
@@ -3237,8 +3382,7 @@
 			 &cached_state);
 
 	while (start < inode->i_size) {
-		em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
-				start, len, 0);
+		em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			em = NULL;
@@ -3322,8 +3466,7 @@
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
-	.clone_file_range = btrfs_clone_file_range,
-	.dedupe_file_range = btrfs_dedupe_file_range,
+	.remap_file_range = btrfs_remap_file_range,
 };
 
 void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 8ecf8c0..d54dcd0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,9 @@
 #include "extent_io.h"
 #include "inode-map.h"
 #include "volumes.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
 
 #define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
 #define MAX_CACHE_BYTES_PER_GIG	SZ_32K
@@ -75,7 +78,8 @@
 	 * sure NOFS is set to keep us from deadlocking.
 	 */
 	nofs_flag = memalloc_nofs_save();
-	inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+	inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+	btrfs_release_path(path);
 	memalloc_nofs_restore(nofs_flag);
 	if (IS_ERR(inode))
 		return inode;
@@ -87,10 +91,11 @@
 	return inode;
 }
 
-struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
-				      struct btrfs_block_group_cache
-				      *block_group, struct btrfs_path *path)
+struct inode *lookup_free_space_inode(
+		struct btrfs_block_group_cache *block_group,
+		struct btrfs_path *path)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct inode *inode = NULL;
 	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 
@@ -184,20 +189,19 @@
 	return 0;
 }
 
-int create_free_space_inode(struct btrfs_fs_info *fs_info,
-			    struct btrfs_trans_handle *trans,
+int create_free_space_inode(struct btrfs_trans_handle *trans,
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path)
 {
 	int ret;
 	u64 ino;
 
-	ret = btrfs_find_free_objectid(fs_info->tree_root, &ino);
+	ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
 	if (ret < 0)
 		return ret;
 
-	return __create_free_space_inode(fs_info->tree_root, trans, path, ino,
-					 block_group->key.objectid);
+	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
+					 ino, block_group->key.objectid);
 }
 
 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
@@ -207,8 +211,8 @@
 	int ret;
 
 	/* 1 for slack space, 1 for updating the inode */
-	needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
-		btrfs_calc_trans_metadata_size(fs_info, 1);
+	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
+		btrfs_calc_metadata_size(fs_info, 1);
 
 	spin_lock(&rsv->lock);
 	if (rsv->reserved < needed_bytes)
@@ -464,9 +468,8 @@
 	if (index == 0)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
-	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
-			      PAGE_SIZE - offset);
-	btrfs_csum_final(crc, (u8 *)&crc);
+	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+	btrfs_crc32c_final(crc, (u8 *)&crc);
 	io_ctl_unmap_page(io_ctl);
 	tmp = page_address(io_ctl->pages[0]);
 	tmp += index;
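/*
 * [Editor's aside -- a hedged sketch of the crc32c helpers used above, as
 * this tree appears to define them: btrfs_crc32c() wraps the library
 * crc32c() and btrfs_crc32c_final() inverts the accumulator and stores it
 * little-endian. Seeding with ~0 and inverting at the end is the standard
 * CRC-32C presentation; buf/len stand for the data being summed.]
 */
	u32 crc = ~(u32)0;			/* standard seed */
	u8 csum[4];

	crc = btrfs_crc32c(crc, buf, len);	/* accumulate over the buffer */
	btrfs_crc32c_final(crc, csum);		/* ~crc stored as le32 */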
@@ -492,9 +495,8 @@
 	val = *tmp;
 
 	io_ctl_map_page(io_ctl, 0);
-	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
-			      PAGE_SIZE - offset);
-	btrfs_csum_final(crc, (u8 *)&crc);
+	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+	btrfs_crc32c_final(crc, (u8 *)&crc);
 	if (val != crc) {
 		btrfs_err_rl(io_ctl->fs_info,
 			"csum mismatch on free space cache");
@@ -763,7 +765,8 @@
 		} else {
 			ASSERT(num_bitmaps);
 			num_bitmaps--;
-			e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+			e->bitmap = kmem_cache_zalloc(
+					btrfs_free_space_bitmap_cachep, GFP_NOFS);
 			if (!e->bitmap) {
 				kmem_cache_free(
 					btrfs_free_space_cachep, e);
@@ -811,9 +814,9 @@
 	goto out;
 }
 
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
-			  struct btrfs_block_group_cache *block_group)
+int load_free_space_cache(struct btrfs_block_group_cache *block_group)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	struct btrfs_path *path;
@@ -838,7 +841,26 @@
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
-	inode = lookup_free_space_inode(fs_info, block_group, path);
+	/*
+	 * We must pass a path with search_commit_root set to btrfs_iget in
+	 * order to avoid a deadlock when allocating extents for the tree root.
+	 *
+	 * When we are COWing an extent buffer from the tree root, when looking
+	 * for a free extent, at extent-tree.c:find_free_extent(), we can find a
+	 * block group without its free space cache loaded. When we find one,
+	 * we must load its space cache, which requires reading its free space
+	 * cache's inode item from the root tree. If this inode item is located
+	 * in the same leaf that we started COWing before, then we end up in a
+	 * deadlock on the extent buffer (trying to read lock it when we
+	 * previously write locked it).
+	 *
+	 * It's safe to read the inode item using the commit root because
+	 * block groups, once loaded, stay in memory forever (until they are
+	 * removed) as well as their space caches once loaded. New block groups
+	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
+	 * we will never try to read their inode item while the fs is mounted.
+	 */
+	inode = lookup_free_space_inode(block_group, path);
 	if (IS_ERR(inode)) {
 		btrfs_free_path(path);
 		return 0;
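/*
 * [Editor's aside -- the deadlock avoidance described above reduces to doing
 * the inode lookup against the last committed root so that no live extent
 * buffer locks are taken; a sketch of the setup, mirroring the code earlier
 * in this function:]
 */
	path->search_commit_root = 1;	/* read the committed tree ... */
	path->skip_locking = 1;		/* ... which is immutable, so no locks */
	inode = lookup_free_space_inode(block_group, path);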
@@ -984,7 +1006,7 @@
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
+				 EXTENT_DELALLOC, 0, 0, NULL);
 		goto fail;
 	}
 	leaf = path->nodes[0];
@@ -996,9 +1018,8 @@
 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
 		    found_key.offset != offset) {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
-					 inode->i_size - 1,
-					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
-					 NULL);
+					 inode->i_size - 1, EXTENT_DELALLOC, 0,
+					 0, NULL);
 			btrfs_release_path(path);
 			goto fail;
 		}
@@ -1019,8 +1040,7 @@
 	return -1;
 }
 
-static noinline_for_stack int
-write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
+static noinline_for_stack int write_pinned_extent_entries(
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_io_ctl *io_ctl,
 			    int *entries)
@@ -1039,7 +1059,7 @@
 	 * We shouldn't have switched the pinned extents yet so this is the
 	 * right one
 	 */
-	unpin = fs_info->pinned_extents;
+	unpin = block_group->fs_info->pinned_extents;
 
 	start = block_group->key.objectid;
 
@@ -1095,7 +1115,7 @@
 	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	if (ret)
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
+				 EXTENT_DELALLOC, 0, 0, NULL);
 
 	return ret;
 }
@@ -1215,7 +1235,6 @@
 				   struct btrfs_io_ctl *io_ctl,
 				   struct btrfs_trans_handle *trans)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_state *cached_state = NULL;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
@@ -1273,8 +1292,7 @@
 	 * If this changes while we are working we'll get added back to
 	 * the dirty list and redo it.  No locking needed
 	 */
-	ret = write_pinned_extent_entries(fs_info, block_group,
-					  io_ctl, &entries);
+	ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
 	if (ret)
 		goto out_nospc_locked;
 
@@ -1350,11 +1368,11 @@
 	goto out;
 }
 
-int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
-			  struct btrfs_trans_handle *trans,
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
 			  struct btrfs_path *path)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	int ret = 0;
@@ -1366,7 +1384,7 @@
 	}
 	spin_unlock(&block_group->lock);
 
-	inode = lookup_free_space_inode(fs_info, block_group, path);
+	inode = lookup_free_space_inode(block_group, path);
 	if (IS_ERR(inode))
 		return 0;
 
@@ -1864,7 +1882,7 @@
 			struct btrfs_free_space *bitmap_info)
 {
 	unlink_free_space(ctl, bitmap_info);
-	kfree(bitmap_info->bitmap);
+	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
 	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
 	ctl->total_bitmaps--;
 	ctl->op->recalc_thresholds(ctl);
@@ -2118,7 +2136,8 @@
 		}
 
 		/* allocate the bitmap */
-		info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
+						 GFP_NOFS);
 		spin_lock(&ctl->tree_lock);
 		if (!info->bitmap) {
 			ret = -ENOMEM;
@@ -2130,7 +2149,8 @@
 out:
 	if (info) {
 		if (info->bitmap)
-			kfree(info->bitmap);
+			kmem_cache_free(btrfs_free_space_bitmap_cachep,
+					info->bitmap);
 		kmem_cache_free(btrfs_free_space_cachep, info);
 	}
 
@@ -2360,6 +2380,14 @@
 	return ret;
 }
 
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 bytenr, u64 size)
+{
+	return __btrfs_add_free_space(block_group->fs_info,
+				      block_group->free_space_ctl,
+				      bytenr, size);
+}
+
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 offset, u64 bytes)
 {
@@ -2786,7 +2814,8 @@
 	if (entry->bytes == 0) {
 		ctl->free_extents--;
 		if (entry->bitmap) {
-			kfree(entry->bitmap);
+			kmem_cache_free(btrfs_free_space_bitmap_cachep,
+					entry->bitmap);
 			ctl->total_bitmaps--;
 			ctl->op->recalc_thresholds(ctl);
 		}
@@ -3021,11 +3050,11 @@
  * returns zero and sets up cluster if things worked out, otherwise
  * it returns -ENOSPC
  */
-int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry, *tmp;
 	LIST_HEAD(bitmaps);
@@ -3150,8 +3179,8 @@
 			space_info->bytes_readonly += reserved_bytes;
 		block_group->reserved -= reserved_bytes;
 		space_info->bytes_reserved -= reserved_bytes;
-		spin_unlock(&space_info->lock);
 		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
 	}
 
 	return ret;
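/*
 * [Editor's aside -- the reordered unlocks above restore the usual rule of
 * releasing nested spinlocks in the reverse order of acquisition:]
 */
	spin_lock(&space_info->lock);		/* outer lock taken first */
	spin_lock(&block_group->lock);		/* inner lock taken second */
	/* ... adjust reserved/readonly counters under both locks ... */
	spin_unlock(&block_group->lock);	/* inner released first */
	spin_unlock(&space_info->lock);		/* outer released last */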
@@ -3342,15 +3371,11 @@
 
 	if (cleanup) {
 		mutex_lock(&fs_info->chunk_mutex);
-		em_tree = &fs_info->mapping_tree.map_tree;
+		em_tree = &fs_info->mapping_tree;
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, block_group->key.objectid,
 					   1);
 		BUG_ON(!em); /* logic error, can't happen */
-		/*
-		 * remove_extent_mapping() will delete us from the pinned_chunks
-		 * list, which is protected by the chunk mutex.
-		 */
 		remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 		mutex_unlock(&fs_info->chunk_mutex);
@@ -3594,7 +3619,7 @@
 	}
 
 	if (!map) {
-		map = kzalloc(PAGE_SIZE, GFP_NOFS);
+		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
 		if (!map) {
 			kmem_cache_free(btrfs_free_space_cachep, info);
 			return -ENOMEM;
@@ -3624,7 +3649,7 @@
 	if (info)
 		kmem_cache_free(btrfs_free_space_cachep, info);
 	if (map)
-		kfree(map);
+		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
 	return 0;
 }
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 15e30b9..39c32c8 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -36,13 +36,24 @@
 			   struct btrfs_free_space *info);
 };
 
-struct btrfs_io_ctl;
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
+	unsigned long size;
+	int index;
+	int num_pages;
+	int entries;
+	int bitmaps;
+	unsigned check_crcs:1;
+};
 
-struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
-				      struct btrfs_block_group_cache
-				      *block_group, struct btrfs_path *path);
-int create_free_space_inode(struct btrfs_fs_info *fs_info,
-			    struct btrfs_trans_handle *trans,
+struct inode *lookup_free_space_inode(
+		struct btrfs_block_group_cache *block_group,
+		struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_trans_handle *trans,
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path);
 
@@ -51,13 +62,11 @@
 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 				    struct btrfs_block_group_cache *block_group,
 				    struct inode *inode);
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
-			  struct btrfs_block_group_cache *block_group);
+int load_free_space_cache(struct btrfs_block_group_cache *block_group);
 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
 			struct btrfs_block_group_cache *block_group,
 			struct btrfs_path *path);
-int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
-			  struct btrfs_trans_handle *trans,
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
 			  struct btrfs_path *path);
 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
@@ -76,14 +85,8 @@
 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 			   struct btrfs_free_space_ctl *ctl,
 			   u64 bytenr, u64 size);
-static inline int
-btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-		     u64 bytenr, u64 size)
-{
-	return __btrfs_add_free_space(block_group->fs_info,
-				      block_group->free_space_ctl,
-				      bytenr, size);
-}
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 bytenr, u64 size);
 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
@@ -95,8 +98,7 @@
 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
-			     struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size);
 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index d673659..48a03f5 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -10,6 +10,7 @@
 #include "locking.h"
 #include "free-space-tree.h"
 #include "transaction.h"
+#include "block-group.h"
 
 static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 					struct btrfs_block_group_cache *block_group,
@@ -74,12 +75,13 @@
 	return ret;
 }
 
-struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info,
-		       struct btrfs_block_group_cache *block_group,
-		       struct btrfs_path *path, int cow)
+EXPORT_FOR_TESTS
+struct btrfs_free_space_info *search_free_space_info(
+		struct btrfs_trans_handle *trans,
+		struct btrfs_block_group_cache *block_group,
+		struct btrfs_path *path, int cow)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_root *root = fs_info->free_space_root;
 	struct btrfs_key key;
 	int ret;
@@ -176,6 +178,7 @@
 	}
 }
 
+EXPORT_FOR_TESTS
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path)
@@ -252,7 +255,7 @@
 		btrfs_release_path(path);
 	}
 
-	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	info = search_free_space_info(trans, block_group, path, 1);
 	if (IS_ERR(info)) {
 		ret = PTR_ERR(info);
 		goto out;
@@ -315,6 +318,7 @@
 	return ret;
 }
 
+EXPORT_FOR_TESTS
 int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path)
@@ -396,7 +400,7 @@
 		btrfs_release_path(path);
 	}
 
-	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	info = search_free_space_info(trans, block_group, path, 1);
 	if (IS_ERR(info)) {
 		ret = PTR_ERR(info);
 		goto out;
@@ -461,8 +465,7 @@
 	if (new_extents == 0)
 		return 0;
 
-	info = search_free_space_info(trans, trans->fs_info, block_group, path,
-				      1);
+	info = search_free_space_info(trans, block_group, path, 1);
 	if (IS_ERR(info)) {
 		ret = PTR_ERR(info);
 		goto out;
@@ -487,6 +490,7 @@
 	return ret;
 }
 
+EXPORT_FOR_TESTS
 int free_space_test_bit(struct btrfs_block_group_cache *block_group,
 			struct btrfs_path *path, u64 offset)
 {
@@ -775,6 +779,7 @@
 	return ret;
 }
 
+EXPORT_FOR_TESTS
 int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 				  struct btrfs_block_group_cache *block_group,
 				  struct btrfs_path *path, u64 start, u64 size)
@@ -789,8 +794,7 @@
 			return ret;
 	}
 
-	info = search_free_space_info(NULL, trans->fs_info, block_group, path,
-				      0);
+	info = search_free_space_info(NULL, block_group, path, 0);
 	if (IS_ERR(info))
 		return PTR_ERR(info);
 	flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -968,11 +972,11 @@
 	return ret;
 }
 
+EXPORT_FOR_TESTS
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_path *path, u64 start, u64 size)
 {
-	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_free_space_info *info;
 	u32 flags;
 	int ret;
@@ -983,7 +987,7 @@
 			return ret;
 	}
 
-	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	info = search_free_space_info(NULL, block_group, path, 0);
 	if (IS_ERR(info))
 		return PTR_ERR(info);
 	flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1145,7 +1149,7 @@
 		return PTR_ERR(trans);
 
 	set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
-	free_space_root = btrfs_create_tree(trans, fs_info,
+	free_space_root = btrfs_create_tree(trans,
 					    BTRFS_FREE_SPACE_TREE_OBJECTID);
 	if (IS_ERR(free_space_root)) {
 		ret = PTR_ERR(free_space_root);
@@ -1243,7 +1247,7 @@
 	list_del(&free_space_root->dirty_list);
 
 	btrfs_tree_lock(free_space_root->node);
-	clean_tree_block(fs_info, free_space_root->node);
+	btrfs_clean_tree_block(free_space_root->node);
 	btrfs_tree_unlock(free_space_root->node);
 	btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
 			      0, 1);
@@ -1529,14 +1533,12 @@
 int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
 {
 	struct btrfs_block_group_cache *block_group;
-	struct btrfs_fs_info *fs_info;
 	struct btrfs_free_space_info *info;
 	struct btrfs_path *path;
 	u32 extent_count, flags;
 	int ret;
 
 	block_group = caching_ctl->block_group;
-	fs_info = block_group->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1550,7 +1552,7 @@
 	path->search_commit_root = 1;
 	path->reada = READA_FORWARD;
 
-	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	info = search_free_space_info(NULL, block_group, path, 0);
 	if (IS_ERR(info)) {
 		ret = PTR_ERR(info);
 		goto out;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 3133651..360d50e 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -6,6 +6,8 @@
 #ifndef BTRFS_FREE_SPACE_TREE_H
 #define BTRFS_FREE_SPACE_TREE_H
 
+struct btrfs_caching_control;
+
 /*
  * The default size for new free space bitmap items. The last bitmap in a block
  * group may be truncated, and none of the free space tree code assumes that
@@ -30,7 +32,6 @@
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 struct btrfs_free_space_info *
 search_free_space_info(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info,
 		       struct btrfs_block_group_cache *block_group,
 		       struct btrfs_path *path, int cow);
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a8956a3..6687018 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -8,9 +8,9 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
-			       const char *name,
-			       int name_len, struct btrfs_inode_ref **ref_ret)
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+						   int slot, const char *name,
+						   int name_len)
 {
 	struct btrfs_inode_ref *ref;
 	unsigned long ptr;
@@ -28,19 +28,15 @@
 		cur_offset += len + sizeof(*ref);
 		if (len != name_len)
 			continue;
-		if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
-			if (ref_ret)
-				*ref_ret = ref;
-			return 1;
-		}
+		if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+			return ref;
 	}
-	return 0;
+	return NULL;
 }
 
-int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
-				   u64 ref_objectid,
-				   const char *name, int name_len,
-				   struct btrfs_inode_extref **extref_ret)
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+		struct extent_buffer *leaf, int slot, u64 ref_objectid,
+		const char *name, int name_len)
 {
 	struct btrfs_inode_extref *extref;
 	unsigned long ptr;
@@ -65,15 +61,12 @@
 
 		if (ref_name_len == name_len &&
 		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
-		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
-			if (extref_ret)
-				*extref_ret = extref;
-			return 1;
-		}
+		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
+			return extref;
 
 		cur_offset += ref_name_len + sizeof(*extref);
 	}
-	return 0;
+	return NULL;
 }
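/*
 * [Editor's aside -- the refactor above trades an int return plus an
 * out-parameter for a plain pointer return, where NULL means "not found".
 * Callers shrink accordingly:
 *
 *	old:	if (btrfs_find_name_in_backref(leaf, slot, name, len, &ref))
 *			use(ref);
 *	new:	ref = btrfs_find_name_in_backref(leaf, slot, name, len);
 *		if (ref)
 *			use(ref);
 * ]
 */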
 
 /* Returns NULL if no extref found */
@@ -87,7 +80,6 @@
 {
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_inode_extref *extref;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_EXTREF_KEY;
@@ -98,11 +90,9 @@
 		return ERR_PTR(ret);
 	if (ret > 0)
 		return NULL;
-	if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
-					    ref_objectid, name, name_len,
-					    &extref))
-		return NULL;
-	return extref;
+	return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+					      ref_objectid, name, name_len);
 }
 
 static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
@@ -142,9 +132,9 @@
 	 * This should always succeed so error here will make the FS
 	 * readonly.
 	 */
-	if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
-					    ref_objectid,
-					    name, name_len, &extref)) {
+	extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+						ref_objectid, name, name_len);
+	if (!extref) {
 		btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
 		ret = -EROFS;
 		goto out;
@@ -170,7 +160,7 @@
 	memmove_extent_buffer(leaf, ptr, ptr + del_len,
 			      item_size - (ptr + del_len - item_start));
 
-	btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1);
+	btrfs_truncate_item(path, item_size - del_len, 1);
 
 out:
 	btrfs_free_path(path);
@@ -213,8 +203,10 @@
 	} else if (ret < 0) {
 		goto out;
 	}
-	if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
-					name, name_len, &ref)) {
+
+	ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
+					 name_len);
+	if (!ref) {
 		ret = -ENOENT;
 		search_ext_refs = 1;
 		goto out;
@@ -234,7 +226,7 @@
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1);
+	btrfs_truncate_item(path, item_size - sub_item_len, 1);
 out:
 	btrfs_free_path(path);
 
@@ -285,10 +277,10 @@
 		if (btrfs_find_name_in_ext_backref(path->nodes[0],
 						   path->slots[0],
 						   ref_objectid,
-						   name, name_len, NULL))
+						   name, name_len))
 			goto out;
 
-		btrfs_extend_item(root->fs_info, path, ins_len);
+		btrfs_extend_item(path, ins_len);
 		ret = 0;
 	}
 	if (ret < 0)
@@ -341,13 +333,13 @@
 				      ins_len);
 	if (ret == -EEXIST) {
 		u32 old_size;
-
-		if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
-					       name, name_len, &ref))
+		ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+						 name, name_len);
+		if (ref)
 			goto out;
 
 		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
-		btrfs_extend_item(fs_info, path, ins_len);
+		btrfs_extend_item(path, ins_len);
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -359,7 +351,7 @@
 		if (ret == -EOVERFLOW) {
 			if (btrfs_find_name_in_backref(path->nodes[0],
 						       path->slots[0],
-						       name, name_len, &ref))
+						       name, name_len))
 				ret = -EEXIST;
 			else
 				ret = -EMLINK;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ffca2ab..37345fb 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -11,6 +11,20 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "transaction.h"
+#include "delalloc-space.h"
+
+static void fail_caching_thread(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	btrfs_warn(fs_info, "failed to start inode caching task");
+	btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
+				     "disabling inode map caching");
+	spin_lock(&root->ino_cache_lock);
+	root->ino_cache_state = BTRFS_CACHE_ERROR;
+	spin_unlock(&root->ino_cache_lock);
+	wake_up(&root->ino_cache_wait);
+}
 
 static int caching_kthread(void *data)
 {
@@ -28,8 +42,10 @@
 		return 0;
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		fail_caching_thread(root);
 		return -ENOMEM;
+	}
 
 	/* Since the commit root is read-only, we can safely skip locking. */
 	path->skip_locking = 1;
@@ -145,6 +161,7 @@
 		spin_lock(&root->ino_cache_lock);
 		root->ino_cache_state = BTRFS_CACHE_FINISHED;
 		spin_unlock(&root->ino_cache_lock);
+		wake_up(&root->ino_cache_wait);
 		return;
 	}
 
@@ -159,15 +176,13 @@
 	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
 		__btrfs_add_free_space(fs_info, ctl, objectid,
 				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+		wake_up(&root->ino_cache_wait);
 	}
 
 	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
 			  root->root_key.objectid);
-	if (IS_ERR(tsk)) {
-		btrfs_warn(fs_info, "failed to start inode caching task");
-		btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
-					     "disabling inode map caching");
-	}
+	if (IS_ERR(tsk))
+		fail_caching_thread(root);
 }
 
 int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -185,11 +200,14 @@
 
 	wait_event(root->ino_cache_wait,
 		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
+		   root->ino_cache_state == BTRFS_CACHE_ERROR ||
 		   root->free_ino_ctl->free_space > 0);
 
 	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
 	    root->free_ino_ctl->free_space == 0)
 		return -ENOSPC;
+	else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
+		return btrfs_find_free_objectid(root, objectid);
 	else
 		goto again;
 }
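/*
 * [Editor's aside -- the wait/wake protocol being extended above: the waiter
 * sleeps until any listed condition is true, and every path that changes one
 * of those conditions must call wake_up() on the same waitqueue, which is
 * exactly what fail_caching_thread() and the caching thread now do:]
 */
	/* waker side, after updating ino_cache_state or free_space */
	wake_up(&root->ino_cache_wait);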
@@ -418,7 +436,7 @@
 	 * 1 item for free space object
 	 * 3 items for pre-allocation
 	 */
-	trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10);
+	trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
 	ret = btrfs_block_rsv_add(root, trans->block_rsv,
 				  trans->bytes_reserved,
 				  BTRFS_RESERVE_NO_FLUSH);
@@ -483,12 +501,13 @@
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
 					      prealloc, prealloc, &alloc_hint);
 	if (ret) {
-		btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+		btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
 		goto out_put;
 	}
 
 	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
 out_put:
 	iput(inode);
 out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7158b5b..0159100 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -27,7 +27,10 @@
 #include <linux/uio.h>
 #include <linux/magic.h>
 #include <linux/iversion.h>
+#include <linux/swap.h>
+#include <linux/sched/mm.h>
 #include <asm/unaligned.h>
+#include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -44,7 +47,8 @@
 #include "backref.h"
 #include "props.h"
 #include "qgroup.h"
-#include "dedupe.h"
+#include "delalloc-space.h"
+#include "block-group.h"
 
 struct btrfs_iget_args {
 	struct btrfs_key *location;
@@ -64,7 +68,6 @@
 static const struct inode_operations btrfs_special_inode_operations;
 static const struct inode_operations btrfs_file_inode_operations;
 static const struct address_space_operations btrfs_aops;
-static const struct address_space_operations btrfs_symlink_aops;
 static const struct file_operations btrfs_dir_file_operations;
 static const struct extent_io_ops btrfs_extent_io_ops;
 
@@ -72,26 +75,15 @@
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_path_cachep;
 struct kmem_cache *btrfs_free_space_cachep;
-
-#define S_SHIFT 12
-static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
-};
+struct kmem_cache *btrfs_free_space_bitmap_cachep;
 
 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
-				   u64 start, u64 end, u64 delalloc_end,
-				   int *page_started, unsigned long *nr_written,
-				   int unlock, struct btrfs_dedupe_hash *hash);
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written, int unlock);
 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
 				       u64 orig_start, u64 block_start,
 				       u64 block_len, u64 orig_block_len,
@@ -104,23 +96,23 @@
 
 /*
  * Cleanup all submitted ordered extents in specified range to handle errors
- * from the fill_dellaloc() callback.
+ * from the btrfs_run_delalloc_range() callback.
  *
  * NOTE: caller must ensure that when an error happens, it can not call
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
  */
 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
-						 const u64 offset,
-						 const u64 bytes)
+						 struct page *locked_page,
+						 u64 offset, u64 bytes)
 {
 	unsigned long index = offset >> PAGE_SHIFT;
 	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+	u64 page_start = page_offset(locked_page);
+	u64 page_end = page_start + PAGE_SIZE - 1;
+
 	struct page *page;
 
 	while (index <= end_index) {
@@ -131,8 +123,18 @@
 		ClearPagePrivate2(page);
 		put_page(page);
 	}
-	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
-					    bytes - PAGE_SIZE, false);
+
+	/*
+	 * If this page belongs to the delalloc range being instantiated
+	 * then skip it, since the first page of a range is going to be
+	 * properly cleaned up by the caller of run_delalloc_range
+	 */
+	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+		offset += PAGE_SIZE;
+		bytes -= PAGE_SIZE;
+	}
+
+	return __endio_write_update_ordered(inode, offset, bytes, false);
 }
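/*
 * A small userspace model of the adjustment made at the end of
 * btrfs_cleanup_ordered_extents() above: if the locked page lies inside
 * the failed delalloc range, the first PAGE_SIZE worth of the ordered
 * extent is left for the caller to clean up.  DEMO_PAGE_SIZE and the
 * helper are stand-ins for the kernel definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096ULL

static void adjust_range(uint64_t page_start, uint64_t *offset, uint64_t *bytes)
{
	uint64_t page_end = page_start + DEMO_PAGE_SIZE - 1;

	/* Skip the locked page when it belongs to [offset, offset + bytes) */
	if (page_start >= *offset && page_end <= *offset + *bytes - 1) {
		*offset += DEMO_PAGE_SIZE;
		*bytes -= DEMO_PAGE_SIZE;
	}
}

int main(void)
{
	uint64_t offset = 0, bytes = 4 * DEMO_PAGE_SIZE;

	adjust_range(0, &offset, &bytes);	/* locked page is page 0 */
	printf("offset=%llu bytes=%llu\n",
	       (unsigned long long)offset, (unsigned long long)bytes);
	return 0;
}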
 
 static int btrfs_dirty_inode(struct inode *inode);
@@ -177,6 +179,9 @@
 	size_t cur_size = size;
 	unsigned long offset;
 
+	ASSERT((compressed_size > 0 && compressed_pages) ||
+	       (compressed_size == 0 && !compressed_pages));
+
 	if (compressed_size && compressed_pages)
 		cur_size = compressed_size;
 
@@ -230,7 +235,7 @@
 				     start >> PAGE_SHIFT);
 		btrfs_set_file_extent_compression(leaf, ei, 0);
 		kaddr = kmap_atomic(page);
-		offset = start & (PAGE_SIZE - 1);
+		offset = offset_in_page(start);
 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
 		kunmap_atomic(kaddr);
 		put_page(page);
@@ -356,18 +361,24 @@
 	struct list_head list;
 };
 
-struct async_cow {
+struct async_chunk {
 	struct inode *inode;
-	struct btrfs_root *root;
 	struct page *locked_page;
 	u64 start;
 	u64 end;
 	unsigned int write_flags;
 	struct list_head extents;
 	struct btrfs_work work;
+	atomic_t *pending;
 };
 
-static noinline int add_async_extent(struct async_cow *cow,
+struct async_cow {
+	/* Number of chunks in flight; must be first in the structure */
+	atomic_t num_chunks;
+	struct async_chunk chunks[];
+};
+
+static noinline int add_async_extent(struct async_chunk *cow,
 				     u64 start, u64 ram_size,
 				     u64 compressed_size,
 				     struct page **pages,
@@ -388,10 +399,31 @@
 	return 0;
 }
 
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool inode_can_compress(struct inode *inode)
+{
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
+	    BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+		return false;
+	return true;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
+	if (!inode_can_compress(inode)) {
+		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+			btrfs_ino(BTRFS_I(inode)));
+		return 0;
+	}
 	/* force compress */
 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 		return 1;
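/*
 * An illustrative (non-kernel) distillation of the two checks above: a
 * file is only handed to the compression path when its flags allow it at
 * all, and then only if the mount forces compression, defrag requested
 * it, or the per-inode property/heuristic says it is worthwhile.  All
 * demo_* names are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
	bool nodatacow, nodatasum;	/* inode_can_compress() inputs */
	bool force_compress;		/* mount -o compress-force */
	bool defrag_compress;		/* compression requested by defrag */
	bool heuristic_ok;		/* heuristic verdict for the range */
};

static bool demo_should_compress(const struct demo_inode *i)
{
	if (i->nodatacow || i->nodatasum)
		return false;		/* inode_can_compress() == false */
	if (i->force_compress)
		return true;
	return i->defrag_compress || i->heuristic_ok;
}

int main(void)
{
	struct demo_inode i = { .nodatacow = false, .heuristic_ok = true };

	printf("compress: %d\n", demo_should_compress(&i));
	return 0;
}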
@@ -434,16 +466,15 @@
  * are written in the same order that the flusher thread sent them
  * down.
  */
-static noinline void compress_file_range(struct inode *inode,
-					struct page *locked_page,
-					u64 start, u64 end,
-					struct async_cow *async_cow,
-					int *num_added)
+static noinline int compress_file_range(struct async_chunk *async_chunk)
 {
+	struct inode *inode = async_chunk->inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 blocksize = fs_info->sectorsize;
+	u64 start = async_chunk->start;
+	u64 end = async_chunk->end;
 	u64 actual_end;
-	u64 isize = i_size_read(inode);
+	u64 i_size;
 	int ret = 0;
 	struct page **pages = NULL;
 	unsigned long nr_pages;
@@ -452,12 +483,25 @@
 	int i;
 	int will_compress;
 	int compress_type = fs_info->compress_type;
+	int compressed_extents = 0;
 	int redirty = 0;
 
 	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
 			SZ_16K);
 
-	actual_end = min_t(u64, isize, end + 1);
+	/*
+	 * We need to save i_size before now because it could change in between
+	 * us evaluating the size and assigning it.  This is because we lock and
+	 * unlock the page in truncate and fallocate, and then modify the i_size
+	 * later on.
+	 *
+	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
+	 * does that for us.
+	 */
+	barrier();
+	i_size = i_size_read(inode);
+	barrier();
+	actual_end = min_t(u64, i_size, end + 1);
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
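/*
 * The barrier()/i_size_read() dance above emulates READ_ONCE.  A minimal
 * userspace equivalent of the same idea: force a single load that the
 * compiler cannot replay by going through a volatile-qualified access,
 * instead of reading a plain variable twice.  DEMO_READ_ONCE is a sketch,
 * not the kernel macro.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static uint64_t file_size;	/* may be updated concurrently elsewhere */

static uint64_t clamp_end(uint64_t end)
{
	uint64_t isize = DEMO_READ_ONCE(file_size);	/* read exactly once */

	return isize < end + 1 ? isize : end + 1;
}

int main(void)
{
	file_size = 8192;
	printf("%llu\n", (unsigned long long)clamp_end(123456));
	return 0;
}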
@@ -539,8 +583,7 @@
 					   &total_compressed);
 
 		if (!ret) {
-			unsigned long offset = total_compressed &
-				(PAGE_SIZE - 1);
+			unsigned long offset = offset_in_page(total_compressed);
 			struct page *page = pages[nr_pages - 1];
 			char *kaddr;
 
@@ -589,14 +632,21 @@
 			 * our outstanding extent for clearing delalloc for this
 			 * range.
 			 */
-			extent_clear_unlock_delalloc(inode, start, end, end,
-						     NULL, clear_flags,
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+						     clear_flags,
 						     PAGE_UNLOCK |
 						     PAGE_CLEAR_DIRTY |
 						     PAGE_SET_WRITEBACK |
 						     page_error_op |
 						     PAGE_END_WRITEBACK);
-			goto free_pages_out;
+
+			for (i = 0; i < nr_pages; i++) {
+				WARN_ON(pages[i]->mapping);
+				put_page(pages[i]);
+			}
+			kfree(pages);
+
+			return 0;
 		}
 	}
 
@@ -615,14 +665,14 @@
 		 */
 		total_in = ALIGN(total_in, PAGE_SIZE);
 		if (total_compressed + blocksize <= total_in) {
-			*num_added += 1;
+			compressed_extents++;
 
 			/*
 			 * The async work queues will take care of doing actual
 			 * allocation on disk for these compressed pages, and
 			 * will submit them to the elevator.
 			 */
-			add_async_extent(async_cow, start, total_in,
+			add_async_extent(async_chunk, start, total_in,
 					total_compressed, pages, nr_pages,
 					compress_type);
 
@@ -632,7 +682,7 @@
 				cond_resched();
 				goto again;
 			}
-			return;
+			return compressed_extents;
 		}
 	}
 	if (pages) {
@@ -662,25 +712,18 @@
 	 * to our extent and set things up for the async work queue to run
 	 * cow_file_range to do the normal delalloc dance.
 	 */
-	if (page_offset(locked_page) >= start &&
-	    page_offset(locked_page) <= end)
-		__set_page_dirty_nobuffers(locked_page);
+	if (page_offset(async_chunk->locked_page) >= start &&
+	    page_offset(async_chunk->locked_page) <= end)
+		__set_page_dirty_nobuffers(async_chunk->locked_page);
 		/* unlocked later on in the async handlers */
 
 	if (redirty)
 		extent_range_redirty_for_io(inode, start, end);
-	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
+	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 			 BTRFS_COMPRESS_NONE);
-	*num_added += 1;
+	compressed_extents++;
 
-	return;
-
-free_pages_out:
-	for (i = 0; i < nr_pages; i++) {
-		WARN_ON(pages[i]->mapping);
-		put_page(pages[i]);
-	}
-	kfree(pages);
+	return compressed_extents;
 }
 
 static void free_async_extent_pages(struct async_extent *async_extent)
@@ -705,45 +748,38 @@
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline void submit_compressed_extents(struct inode *inode,
-					      struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
 {
+	struct inode *inode = async_chunk->inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct async_extent *async_extent;
 	u64 alloc_hint = 0;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	int ret = 0;
 
 again:
-	while (!list_empty(&async_cow->extents)) {
-		async_extent = list_entry(async_cow->extents.next,
+	while (!list_empty(&async_chunk->extents)) {
+		async_extent = list_entry(async_chunk->extents.next,
 					  struct async_extent, list);
 		list_del(&async_extent->list);
 
-		io_tree = &BTRFS_I(inode)->io_tree;
-
 retry:
+		lock_extent(io_tree, async_extent->start,
+			    async_extent->start + async_extent->ram_size - 1);
 		/* did the compression code fall back to uncompressed IO? */
 		if (!async_extent->pages) {
 			int page_started = 0;
 			unsigned long nr_written = 0;
 
-			lock_extent(io_tree, async_extent->start,
-					 async_extent->start +
-					 async_extent->ram_size - 1);
-
 			/* allocate blocks */
-			ret = cow_file_range(inode, async_cow->locked_page,
+			ret = cow_file_range(inode, async_chunk->locked_page,
 					     async_extent->start,
 					     async_extent->start +
 					     async_extent->ram_size - 1,
-					     async_extent->start +
-					     async_extent->ram_size - 1,
-					     &page_started, &nr_written, 0,
-					     NULL);
+					     &page_started, &nr_written, 0);
 
 			/* JDM XXX */
 
@@ -760,15 +796,12 @@
 						  async_extent->ram_size - 1,
 						  WB_SYNC_ALL);
 			else if (ret)
-				unlock_page(async_cow->locked_page);
+				unlock_page(async_chunk->locked_page);
 			kfree(async_extent);
 			cond_resched();
 			continue;
 		}
 
-		lock_extent(io_tree, async_extent->start,
-			    async_extent->start + async_extent->ram_size - 1);
-
 		ret = btrfs_reserve_extent(root, async_extent->ram_size,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
@@ -836,8 +869,6 @@
 		extent_clear_unlock_delalloc(inode, async_extent->start,
 				async_extent->start +
 				async_extent->ram_size - 1,
-				async_extent->start +
-				async_extent->ram_size - 1,
 				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 				PAGE_SET_WRITEBACK);
@@ -847,17 +878,16 @@
 				    ins.objectid,
 				    ins.offset, async_extent->pages,
 				    async_extent->nr_pages,
-				    async_cow->write_flags)) {
-			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+				    async_chunk->write_flags)) {
 			struct page *p = async_extent->pages[0];
 			const u64 start = async_extent->start;
 			const u64 end = start + async_extent->ram_size - 1;
 
 			p->mapping = inode->i_mapping;
-			tree->ops->writepage_end_io_hook(p, start, end,
-							 NULL, 0);
+			btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+
 			p->mapping = NULL;
-			extent_clear_unlock_delalloc(inode, start, end, end,
+			extent_clear_unlock_delalloc(inode, start, end,
 						     NULL, 0,
 						     PAGE_END_WRITEBACK |
 						     PAGE_SET_ERROR);
@@ -875,8 +905,6 @@
 	extent_clear_unlock_delalloc(inode, async_extent->start,
 				     async_extent->start +
 				     async_extent->ram_size - 1,
-				     async_extent->start +
-				     async_extent->ram_size - 1,
 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
 				     EXTENT_DELALLOC_NEW |
 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -935,9 +963,8 @@
  */
 static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
-				   u64 start, u64 end, u64 delalloc_end,
-				   int *page_started, unsigned long *nr_written,
-				   int unlock, struct btrfs_dedupe_hash *hash)
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written, int unlock)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,8 +1003,7 @@
 			 * our outstanding extent for clearing delalloc for this
 			 * range.
 			 */
-			extent_clear_unlock_delalloc(inode, start, end,
-				     delalloc_end, NULL,
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
 				     EXTENT_LOCKED | EXTENT_DELALLOC |
 				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
 				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1060,7 +1086,7 @@
 
 		extent_clear_unlock_delalloc(inode, start,
 					     start + ram_size - 1,
-					     delalloc_end, locked_page,
+					     locked_page,
 					     EXTENT_LOCKED | EXTENT_DELALLOC,
 					     page_ops);
 		if (num_bytes < cur_alloc_size)
@@ -1105,7 +1131,6 @@
 	if (extent_reserved) {
 		extent_clear_unlock_delalloc(inode, start,
 					     start + cur_alloc_size,
-					     start + cur_alloc_size,
 					     locked_page,
 					     clear_bits,
 					     page_ops);
@@ -1113,8 +1138,7 @@
 		if (start >= end)
 			goto out;
 	}
-	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
-				     locked_page,
+	extent_clear_unlock_delalloc(inode, start, end, locked_page,
 				     clear_bits | EXTENT_CLEAR_DATA_RESV,
 				     page_ops);
 	goto out;
@@ -1125,16 +1149,15 @@
  */
 static noinline void async_cow_start(struct btrfs_work *work)
 {
-	struct async_cow *async_cow;
-	int num_added = 0;
-	async_cow = container_of(work, struct async_cow, work);
+	struct async_chunk *async_chunk;
+	int compressed_extents;
 
-	compress_file_range(async_cow->inode, async_cow->locked_page,
-			    async_cow->start, async_cow->end, async_cow,
-			    &num_added);
-	if (num_added == 0) {
-		btrfs_add_delayed_iput(async_cow->inode);
-		async_cow->inode = NULL;
+	async_chunk = container_of(work, struct async_chunk, work);
+
+	compressed_extents = compress_file_range(async_chunk);
+	if (compressed_extents == 0) {
+		btrfs_add_delayed_iput(async_chunk->inode);
+		async_chunk->inode = NULL;
 	}
 }
 
@@ -1143,16 +1166,12 @@
  */
 static noinline void async_cow_submit(struct btrfs_work *work)
 {
-	struct btrfs_fs_info *fs_info;
-	struct async_cow *async_cow;
-	struct btrfs_root *root;
+	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
+						     work);
+	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
 	unsigned long nr_pages;
 
-	async_cow = container_of(work, struct async_cow, work);
-
-	root = async_cow->root;
-	fs_info = root->fs_info;
-	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 		PAGE_SHIFT;
 
 	/* atomic_sub_return implies a barrier */
@@ -1160,17 +1179,29 @@
 	    5 * SZ_1M)
 		cond_wake_up_nomb(&fs_info->async_submit_wait);
 
-	if (async_cow->inode)
-		submit_compressed_extents(async_cow->inode, async_cow);
+	/*
+	 * ->inode could be NULL if async_chunk_start has failed to compress,
+	 * in which case we don't have anything to submit, yet we need to
+	 * always adjust ->async_delalloc_pages as it's paired with the init
+	 * happening in cow_file_range_async
+	 */
+	if (async_chunk->inode)
+		submit_compressed_extents(async_chunk);
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
 {
-	struct async_cow *async_cow;
-	async_cow = container_of(work, struct async_cow, work);
-	if (async_cow->inode)
-		btrfs_add_delayed_iput(async_cow->inode);
-	kfree(async_cow);
+	struct async_chunk *async_chunk;
+
+	async_chunk = container_of(work, struct async_chunk, work);
+	if (async_chunk->inode)
+		btrfs_add_delayed_iput(async_chunk->inode);
+	/*
+	 * Since the pointer to 'pending' is at the beginning of the allocation
+	 * that holds the array of async_chunks, freeing it frees the whole
+	 * array.
+	 */
+	if (atomic_dec_and_test(async_chunk->pending))
+		kvfree(async_chunk->pending);
 }
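/*
 * A userspace sketch of the lifetime trick used by struct async_cow and
 * async_cow_free() above: the refcount sits at offset zero of a single
 * allocation, each chunk holds a pointer to it, and whoever drops the
 * last reference frees the whole array by freeing that pointer.  The
 * demo_* names are invented; the kernel uses kvmalloc()/kvfree().
 */
#include <stdatomic.h>
#include <stdlib.h>

struct demo_chunk {
	atomic_int *pending;	/* points at demo_cow.num_chunks */
	int id;
};

struct demo_cow {
	atomic_int num_chunks;	/* must stay first in the structure */
	struct demo_chunk chunks[];
};

static void demo_chunk_done(struct demo_chunk *c)
{
	/* 'pending' has the same address as the demo_cow allocation */
	if (atomic_fetch_sub(c->pending, 1) == 1)
		free(c->pending);
}

int main(void)
{
	int n = 3;
	struct demo_cow *ctx = malloc(sizeof(*ctx) + n * sizeof(ctx->chunks[0]));

	atomic_init(&ctx->num_chunks, n);
	for (int i = 0; i < n; i++)
		ctx->chunks[i] = (struct demo_chunk){ &ctx->num_chunks, i };
	for (int i = 0; i < n; i++)
		demo_chunk_done(&ctx->chunks[i]);	/* last call frees ctx */
	return 0;
}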
 
 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
@@ -1179,41 +1210,73 @@
 				unsigned int write_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct async_cow *async_cow;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct async_cow *ctx;
+	struct async_chunk *async_chunk;
 	unsigned long nr_pages;
 	u64 cur_end;
+	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
+	int i;
+	bool should_compress;
+	unsigned nofs_flag;
 
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
-			 1, 0, NULL);
-	while (start < end) {
-		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
-		BUG_ON(!async_cow); /* -ENOMEM */
-		async_cow->inode = igrab(inode);
-		async_cow->root = root;
-		async_cow->locked_page = locked_page;
-		async_cow->start = start;
-		async_cow->write_flags = write_flags;
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 
-		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
-		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
-			cur_end = end;
-		else
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
+		num_chunks = 1;
+		should_compress = false;
+	} else {
+		should_compress = true;
+	}
+
+	nofs_flag = memalloc_nofs_save();
+	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
+
+	if (!ctx) {
+		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
+			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+			EXTENT_DO_ACCOUNTING;
+		unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+			PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+			PAGE_SET_ERROR;
+
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     clear_bits, page_ops);
+		return -ENOMEM;
+	}
+
+	async_chunk = ctx->chunks;
+	atomic_set(&ctx->num_chunks, num_chunks);
+
+	for (i = 0; i < num_chunks; i++) {
+		if (should_compress)
 			cur_end = min(end, start + SZ_512K - 1);
+		else
+			cur_end = end;
 
-		async_cow->end = cur_end;
-		INIT_LIST_HEAD(&async_cow->extents);
+		/*
+		 * igrab is called higher up in the call chain; take only the
+		 * lightweight reference for the callback lifetime
+		 */
+		ihold(inode);
+		async_chunk[i].pending = &ctx->num_chunks;
+		async_chunk[i].inode = inode;
+		async_chunk[i].start = start;
+		async_chunk[i].end = cur_end;
+		async_chunk[i].locked_page = locked_page;
+		async_chunk[i].write_flags = write_flags;
+		INIT_LIST_HEAD(&async_chunk[i].extents);
 
-		btrfs_init_work(&async_cow->work,
+		btrfs_init_work(&async_chunk[i].work,
 				btrfs_delalloc_helper,
 				async_cow_start, async_cow_submit,
 				async_cow_free);
 
-		nr_pages = (cur_end - start + PAGE_SIZE) >>
-			PAGE_SHIFT;
+		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
 
-		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
+		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
 
 		*nr_written += nr_pages;
 		start = cur_end + 1;
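/*
 * Sketch of the chunking math in cow_file_range_async() above: a dirty
 * range is split into pieces of at most 512K when compression may run,
 * or kept as one chunk otherwise.  DEMO_SZ_512K mirrors SZ_1K-based
 * SZ_512K; demo_split() is illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_SZ_512K (512 * 1024ULL)

static void demo_split(uint64_t start, uint64_t end, bool should_compress)
{
	while (start <= end) {
		uint64_t cur_end = end;

		/* cur_end = min(end, start + SZ_512K - 1) when compressing */
		if (should_compress && start + DEMO_SZ_512K - 1 < end)
			cur_end = start + DEMO_SZ_512K - 1;
		printf("chunk [%llu, %llu]\n",
		       (unsigned long long)start, (unsigned long long)cur_end);
		start = cur_end + 1;
	}
}

int main(void)
{
	demo_split(0, 3 * DEMO_SZ_512K / 2 - 1, true);	/* 768K -> two chunks */
	return 0;
}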
@@ -1253,36 +1316,25 @@
  */
 static noinline int run_delalloc_nocow(struct inode *inode,
 				       struct page *locked_page,
-			      u64 start, u64 end, int *page_started, int force,
-			      unsigned long *nr_written)
+				       const u64 start, const u64 end,
+				       int *page_started, int force,
+				       unsigned long *nr_written)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_buffer *leaf;
 	struct btrfs_path *path;
-	struct btrfs_file_extent_item *fi;
-	struct btrfs_key found_key;
-	struct extent_map *em;
-	u64 cow_start;
-	u64 cur_offset;
-	u64 extent_end;
-	u64 extent_offset;
-	u64 disk_bytenr;
-	u64 num_bytes;
-	u64 disk_num_bytes;
-	u64 ram_bytes;
-	int extent_type;
+	u64 cow_start = (u64)-1;
+	u64 cur_offset = start;
 	int ret;
-	int type;
-	int nocow;
-	int check_prev = 1;
-	bool nolock;
+	bool check_prev = true;
+	const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
 	u64 ino = btrfs_ino(BTRFS_I(inode));
+	bool nocow = false;
+	u64 disk_bytenr = 0;
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		extent_clear_unlock_delalloc(inode, start, end, end,
-					     locked_page,
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
 					     EXTENT_LOCKED | EXTENT_DELALLOC |
 					     EXTENT_DO_ACCOUNTING |
 					     EXTENT_DEFRAG, PAGE_UNLOCK |
@@ -1292,15 +1344,29 @@
 		return -ENOMEM;
 	}
 
-	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
-
-	cow_start = (u64)-1;
-	cur_offset = start;
 	while (1) {
+		struct btrfs_key found_key;
+		struct btrfs_file_extent_item *fi;
+		struct extent_buffer *leaf;
+		u64 extent_end;
+		u64 extent_offset;
+		u64 num_bytes = 0;
+		u64 disk_num_bytes;
+		u64 ram_bytes;
+		int extent_type;
+
+		nocow = false;
+
 		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
 					       cur_offset, 0);
 		if (ret < 0)
 			goto error;
+
+		/*
+		 * If there is no extent for our range when doing the initial
+		 * search, then go back to the previous slot as it will be the
+		 * one containing the search offset
+		 */
 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
 			leaf = path->nodes[0];
 			btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1309,8 +1375,9 @@
 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
 				path->slots[0]--;
 		}
-		check_prev = 0;
+		check_prev = false;
 next_slot:
+		/* Go to next leaf if we have exhausted the current one */
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
@@ -1324,28 +1391,40 @@
 			leaf = path->nodes[0];
 		}
 
-		nocow = 0;
-		disk_bytenr = 0;
-		num_bytes = 0;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
+		/* Didn't find anything for our INO */
 		if (found_key.objectid > ino)
 			break;
+		/*
+		 * Keep searching until we find an EXTENT_ITEM or there are no
+		 * more extents for this inode
+		 */
 		if (WARN_ON_ONCE(found_key.objectid < ino) ||
 		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
 			path->slots[0]++;
 			goto next_slot;
 		}
+
+		/* Found key is not EXTENT_DATA_KEY or starts after req range */
 		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
 		    found_key.offset > end)
 			break;
 
+		/*
+		 * If the found extent starts after requested offset, then
+		 * adjust extent_end to be right before this extent begins
+		 */
 		if (found_key.offset > cur_offset) {
 			extent_end = found_key.offset;
 			extent_type = 0;
 			goto out_check;
 		}
 
+		/*
+		 * Found an extent which begins before our range and potentially
+		 * intersects it
+		 */
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 		extent_type = btrfs_file_extent_type(leaf, fi);
@@ -1359,25 +1438,36 @@
 				btrfs_file_extent_num_bytes(leaf, fi);
 			disk_num_bytes =
 				btrfs_file_extent_disk_num_bytes(leaf, fi);
+			/*
+			 * If extent we got ends before our range starts, skip
+			 * to next extent
+			 */
 			if (extent_end <= start) {
 				path->slots[0]++;
 				goto next_slot;
 			}
+			/* Skip holes */
 			if (disk_bytenr == 0)
 				goto out_check;
+			/* Skip compressed/encrypted/encoded extents */
 			if (btrfs_file_extent_compression(leaf, fi) ||
 			    btrfs_file_extent_encryption(leaf, fi) ||
 			    btrfs_file_extent_other_encoding(leaf, fi))
 				goto out_check;
 			/*
-			 * Do the same check as in btrfs_cross_ref_exist but
-			 * without the unnecessary search.
+			 * If the extent was created before the last snapshot of
+			 * this subvolume, it is shared, hence we can't do
+			 * nocow. This is the same check as in
+			 * btrfs_cross_ref_exist but without calling
+			 * btrfs_search_slot.
 			 */
-			if (btrfs_file_extent_generation(leaf, fi) <=
+			if (!freespace_inode &&
+			    btrfs_file_extent_generation(leaf, fi) <=
 			    btrfs_root_last_snapshot(&root->root_item))
 				goto out_check;
 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
 				goto out_check;
+			/* If extent is RO, we must COW it */
 			if (btrfs_extent_readonly(fs_info, disk_bytenr))
 				goto out_check;
 			ret = btrfs_cross_ref_exist(root, ino,
@@ -1394,17 +1484,17 @@
 					goto error;
 				}
 
-				WARN_ON_ONCE(nolock);
+				WARN_ON_ONCE(freespace_inode);
 				goto out_check;
 			}
 			disk_bytenr += extent_offset;
 			disk_bytenr += cur_offset - found_key.offset;
 			num_bytes = min(end + 1, extent_end) - cur_offset;
 			/*
-			 * if there are pending snapshots for this root,
-			 * we fall into common COW way.
+			 * If there are pending snapshots for this root, we
+			 * fall back to the common COW path
 			 */
-			if (!nolock && atomic_read(&root->snapshot_force_cow))
+			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
 				goto out_check;
 			/*
 			 * force cow if csum exists in the range.
@@ -1423,27 +1513,29 @@
 						cur_offset = cow_start;
 					goto error;
 				}
-				WARN_ON_ONCE(nolock);
+				WARN_ON_ONCE(freespace_inode);
 				goto out_check;
 			}
 			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
 				goto out_check;
-			nocow = 1;
+			nocow = true;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			extent_end = found_key.offset +
-				btrfs_file_extent_ram_bytes(leaf, fi);
-			extent_end = ALIGN(extent_end,
-					   fs_info->sectorsize);
+			extent_end = found_key.offset + ram_bytes;
+			extent_end = ALIGN(extent_end, fs_info->sectorsize);
+			/* Skip extents outside of our requested range */
+			if (extent_end <= start) {
+				path->slots[0]++;
+				goto next_slot;
+			}
 		} else {
-			BUG_ON(1);
+			/* If this triggers then we have memory corruption */
+			BUG();
 		}
 out_check:
-		if (extent_end <= start) {
-			path->slots[0]++;
-			if (nocow)
-				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
-			goto next_slot;
-		}
+		/*
+		 * If nocow is false then record the beginning of the range
+		 * that needs to be COWed
+		 */
 		if (!nocow) {
 			if (cow_start == (u64)-1)
 				cow_start = cur_offset;
@@ -1455,11 +1547,16 @@
 		}
 
 		btrfs_release_path(path);
+
+		/*
+		 * COW the range from cow_start to found_key.offset - 1.
+		 * found_key.offset is the start of the first extent that can be
+		 * NOCOW, so everything before it needs to be COWed
+		 */
 		if (cow_start != (u64)-1) {
 			ret = cow_file_range(inode, locked_page,
 					     cow_start, found_key.offset - 1,
-					     end, page_started, nr_written, 1,
-					     NULL);
+					     page_started, nr_written, 1);
 			if (ret) {
 				if (nocow)
 					btrfs_dec_nocow_writers(fs_info,
@@ -1471,6 +1568,7 @@
 
 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			u64 orig_start = found_key.offset - extent_offset;
+			struct extent_map *em;
 
 			em = create_io_em(inode, cur_offset, num_bytes,
 					  orig_start,
@@ -1487,19 +1585,29 @@
 				goto error;
 			}
 			free_extent_map(em);
-		}
-
-		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			type = BTRFS_ORDERED_PREALLOC;
+			ret = btrfs_add_ordered_extent(inode, cur_offset,
+						       disk_bytenr, num_bytes,
+						       num_bytes,
+						       BTRFS_ORDERED_PREALLOC);
+			if (ret) {
+				btrfs_drop_extent_cache(BTRFS_I(inode),
+							cur_offset,
+							cur_offset + num_bytes - 1,
+							0);
+				goto error;
+			}
 		} else {
-			type = BTRFS_ORDERED_NOCOW;
+			ret = btrfs_add_ordered_extent(inode, cur_offset,
+						       disk_bytenr, num_bytes,
+						       num_bytes,
+						       BTRFS_ORDERED_NOCOW);
+			if (ret)
+				goto error;
 		}
 
-		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
-					       num_bytes, num_bytes, type);
 		if (nocow)
 			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
-		BUG_ON(ret); /* -ENOMEM */
+		nocow = false;
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -1512,7 +1620,7 @@
 						      num_bytes);
 
 		extent_clear_unlock_delalloc(inode, cur_offset,
-					     cur_offset + num_bytes - 1, end,
+					     cur_offset + num_bytes - 1,
 					     locked_page, EXTENT_LOCKED |
 					     EXTENT_DELALLOC |
 					     EXTENT_CLEAR_DATA_RESV,
@@ -1537,15 +1645,18 @@
 
 	if (cow_start != (u64)-1) {
 		cur_offset = end;
-		ret = cow_file_range(inode, locked_page, cow_start, end, end,
-				     page_started, nr_written, 1, NULL);
+		ret = cow_file_range(inode, locked_page, cow_start, end,
+				     page_started, nr_written, 1);
 		if (ret)
 			goto error;
 	}
 
 error:
+	if (nocow)
+		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+
 	if (ret && cur_offset < end)
-		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
+		extent_clear_unlock_delalloc(inode, cur_offset, end,
 					     locked_page, EXTENT_LOCKED |
 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1577,14 +1688,13 @@
 }
 
 /*
- * extent_io.c call back to do delayed allocation processing
+ * Function to process delayed allocation (create CoW) for ranges which are
+ * being touched for the first time.
  */
-static int run_delalloc_range(void *private_data, struct page *locked_page,
-			      u64 start, u64 end, int *page_started,
-			      unsigned long *nr_written,
-			      struct writeback_control *wbc)
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+		u64 start, u64 end, int *page_started, unsigned long *nr_written,
+		struct writeback_control *wbc)
 {
-	struct inode *inode = private_data;
 	int ret;
 	int force_cow = need_force_cow(inode, start, end);
 	unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -1595,9 +1705,10 @@
 	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	} else if (!inode_need_compress(inode, start, end)) {
-		ret = cow_file_range(inode, locked_page, start, end, end,
-				      page_started, nr_written, 1, NULL);
+	} else if (!inode_can_compress(inode) ||
+		   !inode_need_compress(inode, start, end)) {
+		ret = cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
 	} else {
 		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 			&BTRFS_I(inode)->runtime_flags);
@@ -1606,14 +1717,14 @@
 					   write_flags);
 	}
 	if (ret)
-		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+		btrfs_cleanup_ordered_extents(inode, locked_page, start,
+					      end - start + 1);
 	return ret;
 }
 
-static void btrfs_split_extent_hook(void *private_data,
-				    struct extent_state *orig, u64 split)
+void btrfs_split_delalloc_extent(struct inode *inode,
+				 struct extent_state *orig, u64 split)
 {
-	struct inode *inode = private_data;
 	u64 size;
 
 	/* not delalloc, ignore it */
@@ -1626,7 +1737,7 @@
 		u64 new_size;
 
 		/*
-		 * See the explanation in btrfs_merge_extent_hook, the same
+		 * See the explanation in btrfs_merge_delalloc_extent, the same
 		 * applies here, just in reverse.
 		 */
 		new_size = orig->end - split + 1;
@@ -1643,16 +1754,13 @@
 }
 
 /*
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
- * extents so we can keep track of new extents that are just merged onto old
- * extents, such as when we are doing sequential writes, so we can properly
- * account for the metadata space we'll need.
+ * Handle merged delayed allocation extents so we can keep track of new extents
+ * that are just merged onto old extents, such as when we are doing sequential
+ * writes, so we can properly account for the metadata space we'll need.
  */
-static void btrfs_merge_extent_hook(void *private_data,
-				    struct extent_state *new,
-				    struct extent_state *other)
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+				 struct extent_state *other)
 {
-	struct inode *inode = private_data;
 	u64 new_size, old_size;
 	u32 num_extents;
 
@@ -1756,15 +1864,12 @@
 }
 
 /*
- * extent_io.c set_bit_hook, used to track delayed allocation
- * bytes in this file, and to maintain the list of inodes that
- * have pending delalloc work to be done.
+ * Properly track delayed allocation bytes in the inode and maintain the
+ * list of inodes that have pending delalloc work to be done.
  */
-static void btrfs_set_bit_hook(void *private_data,
-			       struct extent_state *state, unsigned *bits)
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+			       unsigned *bits)
 {
-	struct inode *inode = private_data;
-
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1810,14 +1915,14 @@
 }
 
 /*
- * extent_io.c clear_bit_hook, see set_bit_hook for why
+ * Once a range is no longer delalloc this function ensures that proper
+ * accounting happens.
  */
-static void btrfs_clear_bit_hook(void *private_data,
-				 struct extent_state *state,
-				 unsigned *bits)
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+				 struct extent_state *state, unsigned *bits)
 {
-	struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
 	u64 len = state->end + 1 - state->start;
 	u32 num_extents = count_max_extents(len);
 
@@ -1842,7 +1947,7 @@
 
 		/*
 		 * We don't reserve metadata space for space cache inodes so we
-		 * don't need to call dellalloc_release_metadata if there is an
+		 * don't need to call delalloc_release_metadata if there is an
 		 * error.
 		 */
 		if (*bits & EXTENT_CLEAR_META_RESV &&
@@ -1881,16 +1986,21 @@
 }
 
 /*
- * Merge bio hook, this must check the chunk tree to make sure we don't create
- * bios that span stripes or chunks
+ * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
+ * in a chunk's stripe. This function ensures that bios do not span a
+ * stripe/chunk
  *
- * return 1 if page cannot be merged to bio
- * return 0 if page can be merged to bio
+ * @page - The page we are about to add to the bio
+ * @size - size we want to add to the bio
+ * @bio - bio we want to ensure is smaller than a stripe
+ * @bio_flags - flags of the bio
+ *
+ * return 1 if page cannot be added to the bio
+ * return 0 if page can be added to the bio
  * return error otherwise
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags)
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+			     unsigned long bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1898,17 +2008,19 @@
 	u64 length = 0;
 	u64 map_length;
 	int ret;
+	struct btrfs_io_geometry geom;
 
 	if (bio_flags & EXTENT_BIO_COMPRESSED)
 		return 0;
 
 	length = bio->bi_iter.bi_size;
 	map_length = length;
-	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
-			      NULL, 0);
+	ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
+				    &geom);
 	if (ret < 0)
 		return ret;
-	if (map_length < length + size)
+
+	if (geom.len < length + size)
 		return 1;
 	return 0;
 }
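/*
 * The geometry check above in miniature: adding 'add' bytes to a bio is
 * refused once the bio would cross the remaining length of its stripe.
 * demo_stripe_fits() is an invented helper; in the patch,
 * btrfs_get_io_geometry() fills the real 'len' from the chunk mapping.
 */
#include <stdint.h>
#include <stdio.h>

/* Return 1 if the page cannot be added, 0 if it can (as above). */
static int demo_stripe_fits(uint64_t geom_len, uint64_t bio_size, uint64_t add)
{
	return geom_len < bio_size + add ? 1 : 0;
}

int main(void)
{
	printf("%d\n", demo_stripe_fits(65536, 61440, 4096));	/* fits: 0 */
	printf("%d\n", demo_stripe_fits(65536, 61440, 8192));	/* no:   1 */
	return 0;
}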
@@ -1933,29 +2045,6 @@
 }
 
 /*
- * in order to insert checksums into the metadata in large chunks,
- * we wait until bio submission time.   All the pages in the bio are
- * checksummed and sums are attached onto the ordered extent record.
- *
- * At IO completion time the cums attached on the ordered extent record
- * are inserted into the btree
- */
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
-			  int mirror_num)
-{
-	struct inode *inode = private_data;
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	blk_status_t ret;
-
-	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
-	if (ret) {
-		bio->bi_status = ret;
-		bio_endio(bio);
-	}
-	return ret;
-}
-
-/*
  * extent_io.c submission hook. This does the right thing for csum calculation
  * on write, or reading the csums from the tree before a read.
  *
@@ -1973,11 +2062,11 @@
  *
  *    c-3) otherwise:			async submit
  */
-static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags,
-				 u64 bio_offset)
+static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
+					  int mirror_num,
+					  unsigned long bio_flags)
+
 {
-	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
@@ -2012,8 +2101,7 @@
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
 		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
-					  bio_offset, inode,
-					  btrfs_submit_bio_start);
+					  0, inode, btrfs_submit_bio_start);
 		goto out;
 	} else if (!skip_sum) {
 		ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -2055,9 +2143,9 @@
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
-			      struct extent_state **cached_state, int dedupe)
+			      struct extent_state **cached_state)
 {
-	WARN_ON((end & (PAGE_SIZE - 1)) == 0);
+	WARN_ON(PAGE_ALIGNED(end));
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   extra_bits, cached_state);
 }
@@ -2121,7 +2209,7 @@
 	 }
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
-					&cached_state, 0);
+					&cached_state);
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		end_extent_writepage(page, ret, page_start, page_end);
@@ -2131,7 +2219,7 @@
 
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			     &cached_state);
@@ -2153,7 +2241,7 @@
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2540,6 +2628,7 @@
 	struct btrfs_file_extent_item *item;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_ref ref = { 0 };
 	struct btrfs_root *root;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -2710,10 +2799,11 @@
 	inode_add_bytes(inode, len);
 	btrfs_release_path(path);
 
-	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
-			new->disk_len, 0,
-			backref->root_id, backref->inum,
-			new->file_pos);	/* start - extent_offset */
+	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
+			       new->disk_len, 0);
+	btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
+			    new->file_pos);  /* start - extent_offset */
+	ret = btrfs_inc_extent_ref(trans, &ref);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out_free_path;
@@ -2750,12 +2840,9 @@
 	struct btrfs_path *path;
 	struct sa_defrag_extent_backref *backref;
 	struct sa_defrag_extent_backref *prev = NULL;
-	struct inode *inode;
 	struct rb_node *node;
 	int ret;
 
-	inode = new->inode;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return;
@@ -3150,9 +3237,6 @@
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	/* Try to release some metadata so we don't get an OOM but don't wait */
-	btrfs_btree_balance_dirty_nodelay(fs_info);
-
 	return ret;
 }
 
@@ -3163,8 +3247,8 @@
 	btrfs_finish_ordered_io(ordered_extent);
 }
 
-static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
-				struct extent_state *state, int uptodate)
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+					  u64 end, int uptodate)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3197,16 +3281,23 @@
 				  int icsum, struct page *page,
 				  int pgoff, u64 start, size_t len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	char *kaddr;
-	u32 csum_expected;
-	u32 csum = ~(u32)0;
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	u8 *csum_expected;
+	u8 csum[BTRFS_CSUM_SIZE];
 
-	csum_expected = *(((u32 *)io_bio->csum) + icsum);
+	csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
 
 	kaddr = kmap_atomic(page);
-	csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
-	btrfs_csum_final(csum, (u8 *)&csum);
-	if (csum != csum_expected)
+	shash->tfm = fs_info->csum_shash;
+
+	crypto_shash_init(shash);
+	crypto_shash_update(shash, kaddr + pgoff, len);
+	crypto_shash_final(shash, csum);
+
+	if (memcmp(csum, csum_expected, csum_size))
 		goto zeroit;
 
 	kunmap_atomic(kaddr);
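/*
 * Shape of the new verification above, in userspace: compute a digest of
 * the block, then memcmp() it against the expected bytes found at
 * icsum * csum_size inside the csum array.  toy_sum() below stands in
 * for the crypto_shash digest (crc32c by default); it is not the real
 * checksum and every demo_* name is invented.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_CSUM_SIZE 4

static void toy_sum(const uint8_t *data, size_t len, uint8_t out[DEMO_CSUM_SIZE])
{
	uint32_t acc = 0;

	for (size_t i = 0; i < len; i++)
		acc = acc * 31 + data[i];
	memcpy(out, &acc, DEMO_CSUM_SIZE);
}

static int demo_check_csum(const uint8_t *block, size_t len,
			   const uint8_t *csums, int icsum)
{
	uint8_t csum[DEMO_CSUM_SIZE];

	toy_sum(block, len, csum);
	/* Expected digest lives at icsum * csum_size, as in the code above */
	return memcmp(csum, csums + icsum * DEMO_CSUM_SIZE, DEMO_CSUM_SIZE);
}

int main(void)
{
	uint8_t block[16] = "btrfs block data";
	uint8_t csums[DEMO_CSUM_SIZE];

	toy_sum(block, sizeof(block), csums);
	printf("match: %d\n", demo_check_csum(block, sizeof(block), csums, 0));
	return 0;
}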
@@ -3271,10 +3362,35 @@
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
+	atomic_inc(&fs_info->nr_delayed_iputs);
 	spin_lock(&fs_info->delayed_iput_lock);
 	ASSERT(list_empty(&binode->delayed_iput));
 	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
 	spin_unlock(&fs_info->delayed_iput_lock);
+	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
+		wake_up_process(fs_info->cleaner_kthread);
+}
+
+static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
+				    struct btrfs_inode *inode)
+{
+	list_del_init(&inode->delayed_iput);
+	spin_unlock(&fs_info->delayed_iput_lock);
+	iput(&inode->vfs_inode);
+	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+		wake_up(&fs_info->delayed_iputs_wait);
+	spin_lock(&fs_info->delayed_iput_lock);
+}
+
+static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
+				   struct btrfs_inode *inode)
+{
+	if (!list_empty(&inode->delayed_iput)) {
+		spin_lock(&fs_info->delayed_iput_lock);
+		if (!list_empty(&inode->delayed_iput))
+			run_delayed_iput_locked(fs_info, inode);
+		spin_unlock(&fs_info->delayed_iput_lock);
+	}
 }
 
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
@@ -3286,14 +3402,30 @@
 
 		inode = list_first_entry(&fs_info->delayed_iputs,
 				struct btrfs_inode, delayed_iput);
-		list_del_init(&inode->delayed_iput);
-		spin_unlock(&fs_info->delayed_iput_lock);
-		iput(&inode->vfs_inode);
-		spin_lock(&fs_info->delayed_iput_lock);
+		run_delayed_iput_locked(fs_info, inode);
 	}
 	spin_unlock(&fs_info->delayed_iput_lock);
 }
 
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set.  Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps user operations like fallocate,
+ * which might otherwise get blocked on the delayed iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+			atomic_read(&fs_info->nr_delayed_iputs) == 0);
+	if (ret)
+		return -EINTR;
+	return 0;
+}
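/*
 * A condensed userspace analogue of the delayed-iput accounting added
 * above: every queued iput bumps a counter, every completed one drops
 * it, and a waiter sleeps until the counter hits zero.  pthread
 * primitives stand in for wait_event_killable()/wake_up(); build with
 * -pthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int nr_delayed_iputs;

static void queue_iput(void)
{
	pthread_mutex_lock(&lock);
	nr_delayed_iputs++;
	pthread_mutex_unlock(&lock);
}

static void run_iput(void)
{
	pthread_mutex_lock(&lock);
	if (--nr_delayed_iputs == 0)
		pthread_cond_broadcast(&done);	/* wake_up(delayed_iputs_wait) */
	pthread_mutex_unlock(&lock);
}

static void wait_on_iputs(void)
{
	pthread_mutex_lock(&lock);
	while (nr_delayed_iputs != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	queue_iput();
	run_iput();
	wait_on_iputs();
	puts("all delayed iputs done");
	return 0;
}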
+
 /*
  * This creates an orphan entry for the given inode in case something goes wrong
  * in the middle of an unlink.
@@ -3479,8 +3611,6 @@
 
 		/* this will do delete_inode and everything for us */
 		iput(inode);
-		if (ret)
-			goto out;
 	}
 	/* release the path since we're done with it */
 	btrfs_release_path(path);
@@ -3575,10 +3705,11 @@
 /*
  * read an inode from the btree into the in-memory inode
  */
-static int btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode,
+				   struct btrfs_path *in_path)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_path *path;
+	struct btrfs_path *path = in_path;
 	struct extent_buffer *leaf;
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3594,15 +3725,18 @@
 	if (!ret)
 		filled = true;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
-		btrfs_free_path(path);
+		if (path != in_path)
+			btrfs_free_path(path);
 		return ret;
 	}
 
@@ -3727,7 +3861,8 @@
 				  btrfs_ino(BTRFS_I(inode)),
 				  root->root_key.objectid, ret);
 	}
-	btrfs_free_path(path);
+	if (path != in_path)
+		btrfs_free_path(path);
 
 	if (!maybe_acls)
 		cache_no_acl(inode);
@@ -3746,7 +3881,7 @@
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		inode->i_mapping->a_ops = &btrfs_aops;
 		break;
 	default:
 		inode->i_op = &btrfs_special_inode_operations;
@@ -3768,7 +3903,7 @@
 {
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
+	btrfs_init_map_token(&token, leaf);
 
 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
@@ -3902,9 +4037,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	int ret = 0;
-	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
-	struct btrfs_key key;
 	u64 index;
 	u64 ino = btrfs_ino(inode);
 	u64 dir_ino = btrfs_ino(dir);
@@ -3918,16 +4051,10 @@
 	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
 				    name, name_len, -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto err;
 	}
-	if (!di) {
-		ret = -ENOENT;
-		goto err;
-	}
-	leaf = path->nodes[0];
-	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	if (ret)
 		goto err;
@@ -3980,6 +4107,17 @@
 		ret = 0;
 	else if (ret)
 		btrfs_abort_transaction(trans, ret);
+
+	/*
+	 * If we have a pending delayed iput we could end up with the final iput
+	 * being run in btrfs-cleaner context.  If we have enough of these built
+	 * up we can end up burning a lot of time in btrfs-cleaner without any
+	 * way to throttle the unlinks.  Since we're currently holding a ref on
+	 * the inode we can run the delayed iput here without any issues as the
+	 * final iput won't be done until after we drop the ref we're currently
+	 * holding.
+	 */
+	btrfs_run_delayed_iput(fs_info, inode);
 err:
 	btrfs_free_path(path);
 	if (ret)
@@ -4083,10 +4221,7 @@
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
 				   name, name_len, -1);
 	if (IS_ERR_OR_NULL(di)) {
-		if (!di)
-			ret = -ENOENT;
-		else
-			ret = PTR_ERR(di);
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -4278,18 +4413,17 @@
 	 * again is not run concurrently.
 	 */
 	spin_lock(&dest->root_item_lock);
-	root_flags = btrfs_root_flags(&dest->root_item);
-	if (dest->send_in_progress == 0) {
-		btrfs_set_root_flags(&dest->root_item,
-				root_flags | BTRFS_ROOT_SUBVOL_DEAD);
-		spin_unlock(&dest->root_item_lock);
-	} else {
+	if (dest->send_in_progress) {
 		spin_unlock(&dest->root_item_lock);
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu during send",
 			   dest->root_key.objectid);
 		return -EPERM;
 	}
+	root_flags = btrfs_root_flags(&dest->root_item);
+	btrfs_set_root_flags(&dest->root_item,
+			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+	spin_unlock(&dest->root_item_lock);
 
 	down_write(&fs_info->subvol_sem);
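/*
 * The reordered locking above in miniature: in one lock-protected
 * section, refuse the delete if a send is in progress, otherwise mark
 * the root dead before dropping the lock, so a send that starts later
 * is guaranteed to see the flag.  A pthread mutex plays the role of
 * root_item_lock; demo_* names are invented.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_root {
	pthread_mutex_t lock;
	int send_in_progress;
	bool subvol_dead;	/* BTRFS_ROOT_SUBVOL_DEAD */
};

static int demo_begin_delete(struct demo_root *r)
{
	pthread_mutex_lock(&r->lock);
	if (r->send_in_progress) {
		pthread_mutex_unlock(&r->lock);
		return -EPERM;	/* attempt to delete subvolume during send */
	}
	r->subvol_dead = true;	/* set while still holding the lock */
	pthread_mutex_unlock(&r->lock);
	return 0;
}

int main(void)
{
	struct demo_root r = { PTHREAD_MUTEX_INITIALIZER, 0, false };

	printf("%d\n", demo_begin_delete(&r));	/* 0: delete may proceed */
	return 0;
}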
 
@@ -4453,31 +4587,6 @@
 	return err;
 }
 
-static int truncate_space_check(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytes_deleted)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	int ret;
-
-	/*
-	 * This is only used to apply pressure to the enospc system, we don't
-	 * intend to use this reservation at all.
-	 */
-	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
-	bytes_deleted *= fs_info->nodesize;
-	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
-	if (!ret) {
-		trace_btrfs_space_reservation(fs_info, "transaction",
-					      trans->transid,
-					      bytes_deleted, 1);
-		trans->bytes_reserved += bytes_deleted;
-	}
-	return ret;
-
-}
-
 /*
  * Return this if we need to call truncate_block for the last bit of the
  * truncate.
@@ -4522,7 +4631,6 @@
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
-	bool should_end = false;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
@@ -4553,7 +4661,7 @@
 	/*
 	 * This function is also used to drop the items in the log tree before
 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
-	 * it is used to drop the loged items. So we shouldn't kill the delayed
+	 * it is used to drop the logged items. So we shouldn't kill the delayed
 	 * items.
 	 */
 	if (min_type == 0 && root == BTRFS_I(inode)->root)
@@ -4686,7 +4794,7 @@
 
 				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 				size = btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(root->fs_info, path, size, 1);
+				btrfs_truncate_item(path, size, 1);
 			} else if (!del_item) {
 				/*
 				 * We have to bail so the last_size is set to
@@ -4725,27 +4833,23 @@
 		if (found_extent &&
 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 		     root == fs_info->tree_root)) {
+			struct btrfs_ref ref = { 0 };
+
 			btrfs_set_path_blocking(path);
 			bytes_deleted += extent_num_bytes;
-			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes, 0,
-						btrfs_header_owner(leaf),
-						ino, extent_offset);
+
+			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+					extent_start, extent_num_bytes, 0);
+			ref.real_root = root->root_key.objectid;
+			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+					ino, extent_offset);
+			ret = btrfs_free_extent(trans, &ref);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
 				break;
 			}
-			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
-				btrfs_async_run_delayed_refs(fs_info,
-					trans->delayed_ref_updates * 2,
-					trans->transid, 0);
 			if (be_nice) {
-				if (truncate_space_check(trans, root,
-							 extent_num_bytes)) {
-					should_end = true;
-				}
-				if (btrfs_should_throttle_delayed_refs(trans,
-								       fs_info))
+				if (btrfs_should_throttle_delayed_refs(trans))
 					should_throttle = true;
 			}
 		}
@@ -4755,7 +4859,7 @@
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    should_throttle || should_end) {
+		    should_throttle) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -4767,23 +4871,24 @@
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
-			if (should_throttle) {
-				unsigned long updates = trans->delayed_ref_updates;
-				if (updates) {
-					trans->delayed_ref_updates = 0;
-					ret = btrfs_run_delayed_refs(trans,
-								   updates * 2);
-					if (ret)
-						break;
-				}
-			}
+
 			/*
-			 * if we failed to refill our space rsv, bail out
-			 * and let the transaction restart
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once in a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating.  Since we hold a transaction here we
+			 * can't flush, and we don't want to FLUSH_LIMIT because
+			 * we could have generated too many delayed refs to
+			 * actually allocate, so just bail if we're short and
+			 * let the normal reservation dance happen higher up.
 			 */
-			if (should_end) {
-				ret = -EAGAIN;
-				break;
+			if (should_throttle) {
+				ret = btrfs_delayed_refs_rsv_refill(fs_info,
+							BTRFS_RESERVE_NO_FLUSH);
+				if (ret) {
+					ret = -EAGAIN;
+					break;
+				}
 			}
 			goto search_again;
 		} else {
@@ -4809,18 +4914,6 @@
 	}
 
 	btrfs_free_path(path);
-
-	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
-		unsigned long updates = trans->delayed_ref_updates;
-		int err;
-
-		if (updates) {
-			trans->delayed_ref_updates = 0;
-			err = btrfs_run_delayed_refs(trans, updates * 2);
-			if (err)
-				ret = err;
-		}
-	}
 	return ret;
 }
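/*
 * Skeleton of the throttling policy adopted above: inside the deletion
 * loop, periodically try to top up the delayed-refs reservation without
 * flushing, and if that fails stop with -EAGAIN so the caller can
 * restart the transaction.  demo_refill() is a stand-in for
 * btrfs_delayed_refs_rsv_refill(..., BTRFS_RESERVE_NO_FLUSH) and the
 * throttle cadence is invented for the example.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int demo_refill(int *budget)
{
	if (*budget <= 0)
		return -ENOSPC;		/* nothing left without flushing */
	(*budget)--;
	return 0;
}

static int demo_truncate_loop(int items, int budget)
{
	for (int i = 0; i < items; i++) {
		bool should_throttle = (i % 8 == 7);	/* illustrative cadence */

		if (should_throttle && demo_refill(&budget))
			return -EAGAIN;	/* bail; caller restarts transaction */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", demo_truncate_loop(100, 2));	/* -EAGAIN expected */
	return 0;
}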
 
@@ -4871,7 +4964,7 @@
 	if (!page) {
 		btrfs_delalloc_release_space(inode, data_reserved,
 					     block_start, blocksize, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -4906,12 +4999,11 @@
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			  0, 0, &cached_state);
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, &cached_state);
 
 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
-					&cached_state, 0);
+					&cached_state);
 	if (ret) {
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state);
@@ -4939,7 +5031,7 @@
 	if (ret)
 		btrfs_delalloc_release_space(inode, data_reserved, block_start,
 					     blocksize, true);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
+	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 	unlock_page(page);
 	put_page(page);
 out:
@@ -5024,21 +5116,8 @@
 	if (size <= hole_start)
 		return 0;
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		lock_extent_bits(io_tree, hole_start, block_end - 1,
-				 &cached_state);
-		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
-						     block_end - hole_start);
-		if (!ordered)
-			break;
-		unlock_extent_cached(io_tree, hole_start, block_end - 1,
-				     &cached_state);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
-
+	btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
+					   block_end - 1, &cached_state);
 	cur_offset = hole_start;
 	while (1) {
 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
@@ -5165,7 +5244,7 @@
 
 		truncate_setsize(inode, newsize);
 
-		/* Disable nonlocked read DIO to avoid the end less truncate */
+		/* Disable nonlocked read DIO to avoid the endless truncate */
 		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 		inode_dio_wait(inode);
 		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
@@ -5243,10 +5322,10 @@
 	truncate_inode_pages_final(&inode->i_data);
 
 	write_lock(&map_tree->lock);
-	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
 		struct extent_map *em;
 
-		node = rb_first(&map_tree->map);
+		node = rb_first_cached(&map_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5305,9 +5384,9 @@
 			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
 
 		clear_extent_bit(io_tree, start, end,
-				 EXTENT_LOCKED | EXTENT_DIRTY |
-				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 1, &cached_state);
+				 EXTENT_LOCKED | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+				 &cached_state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
@@ -5316,43 +5395,54 @@
 }
 
 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
-							struct btrfs_block_rsv *rsv,
-							u64 min_size)
+							struct btrfs_block_rsv *rsv)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	int failures = 0;
+	struct btrfs_trans_handle *trans;
+	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+	int ret;
 
-	for (;;) {
-		struct btrfs_trans_handle *trans;
-		int ret;
-
-		ret = btrfs_block_rsv_refill(root, rsv, min_size,
-					     BTRFS_RESERVE_FLUSH_LIMIT);
-
-		if (ret && ++failures > 2) {
-			btrfs_warn(fs_info,
-				   "could not allocate space for a delete; will truncate on mount");
-			return ERR_PTR(-ENOSPC);
-		}
-
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans) || !ret)
-			return trans;
-
+	/*
+	 * Eviction should be taking place at some place safe because of our
+	 * delayed iputs.  However the normal flushing code will run delayed
+	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+	 *
+	 * We reserve the delayed_refs_extra here again because we can't use
+	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+	 * above.  We reserve our extra bit here because we generate a ton of
+	 * delayed refs activity by truncating.
+	 *
+	 * If we cannot make our reservation we'll attempt to steal from the
+	 * global reserve, because we really want to be able to free up space.
+	 */
+	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+				     BTRFS_RESERVE_FLUSH_EVICT);
+	if (ret) {
 		/*
 		 * Try to steal from the global reserve if there is space for
 		 * it.
 		 */
-		if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
-		    !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
-			return trans;
-
-		/* If not, commit and try again. */
-		ret = btrfs_commit_transaction(trans);
-		if (ret)
-			return ERR_PTR(ret);
+		if (btrfs_check_space_for_delayed_refs(fs_info) ||
+		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+			btrfs_warn(fs_info,
+				   "could not allocate space for delete; will truncate on mount");
+			return ERR_PTR(-ENOSPC);
+		}
+		delayed_refs_extra = 0;
 	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return trans;
+
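+	/*
+	 * Hand the extra reservation over to the transaction so the delayed
+	 * refs generated while truncating are paid for.
+	 */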
+	if (delayed_refs_extra) {
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		trans->bytes_reserved = delayed_refs_extra;
+		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+					delayed_refs_extra, 1);
+	}
+	return trans;
 }
 
 void btrfs_evict_inode(struct inode *inode)
@@ -5361,7 +5451,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	u64 min_size;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -5371,8 +5460,6 @@
 		return;
 	}
 
-	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
-
 	evict_inode_truncate_pages(inode);
 
 	if (inode->i_nlink &&
@@ -5383,9 +5470,6 @@
 
 	if (is_bad_inode(inode))
 		goto no_delete;
-	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-	if (!special_file(inode->i_mode))
-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
 
@@ -5405,13 +5489,13 @@
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		goto no_delete;
-	rsv->size = min_size;
+	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
 	rsv->failfast = 1;
 
 	btrfs_i_size_write(BTRFS_I(inode), 0);
 
 	while (1) {
-		trans = evict_refill_and_join(root, rsv, min_size);
+		trans = evict_refill_and_join(root, rsv);
 		if (IS_ERR(trans))
 			goto free_rsv;
 
@@ -5436,7 +5520,7 @@
 	 * If it turns out that we are dropping too many of these, we might want
 	 * to add a mechanism for retrying these after a commit.
 	 */
-	trans = evict_refill_and_join(root, rsv, min_size);
+	trans = evict_refill_and_join(root, rsv);
 	if (!IS_ERR(trans)) {
 		trans->block_rsv = rsv;
 		btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5461,12 +5545,14 @@
 }
 
 /*
- * this returns the key found in the dir entry in the location pointer.
+ * Return the key found in the dir entry in the location pointer, fill @type
+ * with BTRFS_FT_*, and return 0.
+ *
  * If no dir entries were found, returns -ENOENT.
  * If found a corrupted location in dir entry, returns -EUCLEAN.
  */
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
-			       struct btrfs_key *location)
+			       struct btrfs_key *location, u8 *type)
 {
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
@@ -5481,12 +5567,8 @@
 
 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
 			name, namelen, 0);
-	if (!di) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
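+		/* NULL means the entry is missing, an ERR_PTR is a real lookup error */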
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -5499,6 +5581,8 @@
 			   __func__, name, btrfs_ino(BTRFS_I(dir)),
 			   location->objectid, location->type, location->offset);
 	}
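+	/* On success also report the BTRFS_FT_* type stored in the dir item */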
+	if (!ret)
+		*type = btrfs_dir_type(path->nodes[0], di);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -5669,8 +5753,9 @@
 /* Get an inode object given its location and corresponding root.
  * Returns in *is_new if the inode was read from disk
  */
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *new)
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+			      struct btrfs_root *root, int *new,
+			      struct btrfs_path *path)
 {
 	struct inode *inode;
 
@@ -5681,7 +5766,7 @@
 	if (inode->i_state & I_NEW) {
 		int ret;
 
-		ret = btrfs_read_locked_inode(inode);
+		ret = btrfs_read_locked_inode(inode, path);
 		if (!ret) {
 			inode_tree_add(inode);
 			unlock_new_inode(inode);
@@ -5703,6 +5788,12 @@
 	return inode;
 }
 
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+			 struct btrfs_root *root, int *new)
+{
+	return btrfs_iget_path(s, location, root, new, NULL);
+}
+
 static struct inode *new_simple_dir(struct super_block *s,
 				    struct btrfs_key *key,
 				    struct btrfs_root *root)
@@ -5729,6 +5820,24 @@
 	return inode;
 }
 
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+	/*
+	 * Compile-time asserts that generic FT_* types still match
+	 * BTRFS_FT_* types
+	 */
+	BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+	BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+	BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+	BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+	BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+	BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+	BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+	BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+
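+	/* The mapping is identical, so the generic umode_t helper can be used */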
+	return fs_umode_to_ftype(inode->i_mode);
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -5736,18 +5845,31 @@
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
+	u8 di_type = 0;
 	int index;
 	int ret = 0;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ret = btrfs_inode_by_name(dir, dentry, &location);
+	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+		if (IS_ERR(inode))
+			return inode;
+
+		/* Do extra check against inode mode with di_type */
+		if (btrfs_inode_type(inode) != di_type) {
+			btrfs_crit(fs_info,
+"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
+				  inode->i_mode, btrfs_inode_type(inode),
+				  di_type);
+			iput(inode);
+			return ERR_PTR(-EUCLEAN);
+		}
 		return inode;
 	}
 
@@ -5800,23 +5922,13 @@
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
-	struct inode *inode;
+	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
 
-	inode = btrfs_lookup_dentry(dir, dentry);
-	if (IS_ERR(inode)) {
-		if (PTR_ERR(inode) == -ENOENT)
-			inode = NULL;
-		else
-			return ERR_CAST(inode);
-	}
-
+	if (inode == ERR_PTR(-ENOENT))
+		inode = NULL;
 	return d_splice_alias(inode, dentry);
 }
 
-unsigned char btrfs_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
 /*
  * All this infrastructure exists because dir_emit can fault, and we are holding
  * the tree lock when doing readdir.  For now just allocate a buffer and copy
@@ -5955,7 +6067,7 @@
 		name_ptr = (char *)(entry + 1);
 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
 				   name_len);
-		put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
 				&entry->type);
 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
 		put_unaligned(location.objectid, &entry->ino);
@@ -6206,13 +6318,16 @@
 	u32 sizes[2];
 	int nitems = name ? 2 : 1;
 	unsigned long ptr;
+	unsigned int nofs_flag;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return ERR_PTR(-ENOMEM);
 
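+	/*
+	 * new_inode() allocates with GFP_KERNEL; scope it to NOFS so reclaim
+	 * cannot recurse into the filesystem while we hold a transaction.
+	 */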
+	nofs_flag = memalloc_nofs_save();
 	inode = new_inode(fs_info->sb);
+	memalloc_nofs_restore(nofs_flag);
 	if (!inode) {
 		btrfs_free_path(path);
 		return ERR_PTR(-ENOMEM);
@@ -6358,11 +6473,6 @@
 	return ERR_PTR(ret);
 }
 
-static inline u8 btrfs_inode_type(struct inode *inode)
-{
-	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
-}
-
 /*
  * utility function to add 'inode' into 'parent_inode' with
 * a given name and a given sequence number.
@@ -6400,8 +6510,7 @@
 	if (ret)
 		return ret;
 
-	ret = btrfs_insert_dir_item(trans, root, name, name_len,
-				    parent_inode, &key,
+	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
 				    btrfs_inode_type(&inode->vfs_inode), index);
 	if (ret == -EEXIST || ret == -EOVERFLOW)
 		goto fail_dir_item;
@@ -6413,8 +6522,18 @@
 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 			   name_len * 2);
 	inode_inc_iversion(&parent_inode->vfs_inode);
-	parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
-		current_time(&parent_inode->vfs_inode);
+	/*
+	 * If we are replaying a log tree, we do not want to update the mtime
+	 * and ctime of the parent directory with the current time, since the
+	 * log replay procedure is responsible for setting them to their correct
+	 * values (the ones it had when the fsync was done).
+	 */
+	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+		struct timespec64 now = current_time(&parent_inode->vfs_inode);
+
+		parent_inode->vfs_inode.i_mtime = now;
+		parent_inode->vfs_inode.i_ctime = now;
+	}
 	ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
@@ -6427,14 +6546,19 @@
 		err = btrfs_del_root_ref(trans, key.objectid,
 					 root->root_key.objectid, parent_ino,
 					 &local_index, name, name_len);
-
+		if (err)
+			btrfs_abort_transaction(trans, err);
 	} else if (add_backref) {
 		u64 local_index;
 		int err;
 
 		err = btrfs_del_inode_ref(trans, root, name, name_len,
 					  ino, parent_ino, &local_index);
+		if (err)
+			btrfs_abort_transaction(trans, err);
 	}
+
+	/* Return the original error code */
 	return ret;
 }
 
@@ -6594,7 +6718,7 @@
 	int drop_inode = 0;
 
 	/* do not allow sys_link's with other subvols of the same device */
-	if (root->objectid != BTRFS_I(inode)->root->objectid)
+	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
 		return -EXDEV;
 
 	if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6673,7 +6797,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
-	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
 
@@ -6699,7 +6822,6 @@
 		goto out_fail;
 	}
 
-	drop_on_err = 1;
 	/* these must be set before we unlock the inode */
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
@@ -6720,7 +6842,6 @@
 		goto out_fail;
 
 	d_instantiate_new(dentry, inode);
-	drop_on_err = 0;
 
 out_fail:
 	btrfs_end_transaction(trans);
@@ -6787,9 +6908,9 @@
  * This also copies inline extents directly into the page.
  */
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-		struct page *page,
-	    size_t pg_offset, u64 start, u64 len,
-		int create)
+				    struct page *page,
+				    size_t pg_offset, u64 start, u64 len,
+				    int create)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int ret;
@@ -6797,7 +6918,7 @@
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = btrfs_ino(inode);
-	u32 found_type;
+	int extent_type = -1;
 	struct btrfs_path *path = NULL;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_file_extent_item *item;
@@ -6833,26 +6954,26 @@
 	em->len = (u64)-1;
 	em->block_len = (u64)-1;
 
+	path = btrfs_alloc_path();
 	if (!path) {
-		path = btrfs_alloc_path();
-		if (!path) {
-			err = -ENOMEM;
-			goto out;
-		}
-		/*
-		 * Chances are we'll be called again, so go ahead and do
-		 * readahead
-		 */
-		path->reada = READA_FORWARD;
+		err = -ENOMEM;
+		goto out;
 	}
 
+	/* Chances are we'll be called again, so go ahead and do readahead */
+	path->reada = READA_FORWARD;
+
+	/*
+	 * Unless we're going to uncompress the inline extent, no sleep will
+	 * happen.
+	 */
+	path->leave_spinning = 1;
+
 	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
-	}
-
-	if (ret != 0) {
+	} else if (ret > 0) {
 		if (path->slots[0] == 0)
 			goto not_found;
 		path->slots[0]--;
@@ -6861,11 +6982,9 @@
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	/* are we inside the extent that was found? */
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	found_type = found_key.type;
 	if (found_key.objectid != objectid ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
+	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
 		/*
 		 * If we backup past the first extent we want to move forward
 		 * and see if there is an extent in front of us, otherwise we'll
@@ -6876,16 +6995,24 @@
 		goto next;
 	}
 
-	found_type = btrfs_file_extent_type(leaf, item);
+	extent_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		/* Only a regular file can have regular/prealloc extents */
+		if (!S_ISREG(inode->vfs_inode.i_mode)) {
+			ret = -EUCLEAN;
+			btrfs_crit(fs_info,
+		"regular/prealloc extent found for non-regular inode %llu",
+				   btrfs_ino(inode));
+			goto out;
+		}
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
 
 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
 						       extent_start);
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		size_t size;
 
 		size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6904,9 +7031,9 @@
 			if (ret < 0) {
 				err = ret;
 				goto out;
-			}
-			if (ret > 0)
+			} else if (ret > 0) {
 				goto not_found;
+			}
 			leaf = path->nodes[0];
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6917,19 +7044,22 @@
 			goto not_found;
 		if (start > found_key.offset)
 			goto next;
+
+		/* New extent overlaps with existing one */
 		em->start = start;
 		em->orig_start = start;
 		em->len = found_key.offset - start;
-		goto not_found_em;
+		em->block_start = EXTENT_MAP_HOLE;
+		goto insert;
 	}
 
 	btrfs_extent_item_to_extent_map(inode, path, item,
 			new_inline, em);
 
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		goto insert;
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
 		char *map;
 		size_t size;
@@ -6948,6 +7078,8 @@
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
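+		/*
+		 * Copying or uncompressing the inline data below can sleep, so
+		 * switch any spinning tree locks to blocking first.
+		 */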
+		btrfs_set_path_blocking(path);
 		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
 			    BTRFS_COMPRESS_NONE) {
@@ -6978,7 +7110,6 @@
 	em->start = start;
 	em->orig_start = start;
 	em->len = len;
-not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
 insert:
 	btrfs_release_path(path);
@@ -6995,10 +7126,10 @@
 	err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	write_unlock(&em_tree->lock);
 out:
+	btrfs_free_path(path);
 
 	trace_btrfs_get_extent(root, inode, em);
 
-	btrfs_free_path(path);
 	if (err) {
 		free_extent_map(em);
 		return ERR_PTR(err);
@@ -7008,19 +7139,17 @@
 }
 
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-		struct page *page,
-		size_t pg_offset, u64 start, u64 len,
-		int create)
+					   u64 start, u64 len)
 {
 	struct extent_map *em;
 	struct extent_map *hole_em = NULL;
-	u64 range_start = start;
+	u64 delalloc_start = start;
 	u64 end;
-	u64 found;
-	u64 found_end;
+	u64 delalloc_len;
+	u64 delalloc_end;
 	int err = 0;
 
-	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 	if (IS_ERR(em))
 		return em;
 	/*
@@ -7045,80 +7174,84 @@
 	em = NULL;
 
 	/* ok, we didn't find anything, let's look for delalloc */
-	found = count_range_bits(&inode->io_tree, &range_start,
+	delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
 				 end, len, EXTENT_DELALLOC, 1);
-	found_end = range_start + found;
-	if (found_end < range_start)
-		found_end = (u64)-1;
+	delalloc_end = delalloc_start + delalloc_len;
+	if (delalloc_end < delalloc_start)
+		delalloc_end = (u64)-1;
 
 	/*
-	 * we didn't find anything useful, return
-	 * the original results from get_extent()
+	 * We didn't find anything useful, return the original results from
+	 * get_extent()
 	 */
-	if (range_start > end || found_end <= start) {
+	if (delalloc_start > end || delalloc_end <= start) {
 		em = hole_em;
 		hole_em = NULL;
 		goto out;
 	}
 
-	/* adjust the range_start to make sure it doesn't
-	 * go backwards from the start they passed in
+	/*
+	 * Adjust the delalloc_start to make sure it doesn't go backwards from
+	 * the start they passed in
 	 */
-	range_start = max(start, range_start);
-	found = found_end - range_start;
+	delalloc_start = max(start, delalloc_start);
+	delalloc_len = delalloc_end - delalloc_start;
 
-	if (found > 0) {
-		u64 hole_start = start;
-		u64 hole_len = len;
+	if (delalloc_len > 0) {
+		u64 hole_start;
+		u64 hole_len;
+		const u64 hole_end = extent_map_end(hole_em);
 
 		em = alloc_extent_map();
 		if (!em) {
 			err = -ENOMEM;
 			goto out;
 		}
-		/*
-		 * when btrfs_get_extent can't find anything it
-		 * returns one huge hole
-		 *
-		 * make sure what it found really fits our range, and
-		 * adjust to make sure it is based on the start from
-		 * the caller
-		 */
-		if (hole_em) {
-			u64 calc_end = extent_map_end(hole_em);
-
-			if (calc_end <= start || (hole_em->start > end)) {
-				free_extent_map(hole_em);
-				hole_em = NULL;
-			} else {
-				hole_start = max(hole_em->start, start);
-				hole_len = calc_end - hole_start;
-			}
-		}
 		em->bdev = NULL;
-		if (hole_em && range_start > hole_start) {
-			/* our hole starts before our delalloc, so we
-			 * have to return just the parts of the hole
-			 * that go until  the delalloc starts
+
+		ASSERT(hole_em);
+		/*
+		 * When btrfs_get_extent can't find anything it returns one
+		 * huge hole
+		 *
+		 * Make sure what it found really fits our range, and adjust to
+		 * make sure it is based on the start from the caller
+		 */
+		if (hole_end <= start || hole_em->start > end) {
+			free_extent_map(hole_em);
+			hole_em = NULL;
+		} else {
+			hole_start = max(hole_em->start, start);
+			hole_len = hole_end - hole_start;
+		}
+
+		if (hole_em && delalloc_start > hole_start) {
+			/*
+			 * Our hole starts before our delalloc, so we have to
+			 * return just the parts of the hole that go until the
+			 * delalloc starts
 			 */
-			em->len = min(hole_len,
-				      range_start - hole_start);
+			em->len = min(hole_len, delalloc_start - hole_start);
 			em->start = hole_start;
 			em->orig_start = hole_start;
 			/*
-			 * don't adjust block start at all,
-			 * it is fixed at EXTENT_MAP_HOLE
+			 * Don't adjust block start at all, it is fixed at
+			 * EXTENT_MAP_HOLE
 			 */
 			em->block_start = hole_em->block_start;
 			em->block_len = hole_len;
 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		} else {
-			em->start = range_start;
-			em->len = found;
-			em->orig_start = range_start;
+			/*
+			 * The hole is outside the passed range or it starts
+			 * after the delalloc range
+			 */
+			em->start = delalloc_start;
+			em->len = delalloc_len;
+			em->orig_start = delalloc_start;
 			em->block_start = EXTENT_MAP_DELALLOC;
-			em->block_len = found;
+			em->block_len = delalloc_len;
 		}
 	} else {
 		return hole_em;
@@ -7614,12 +7747,9 @@
 	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
-	int unlock_bits = EXTENT_LOCKED;
 	int ret = 0;
 
-	if (create)
-		unlock_bits |= EXTENT_DIRTY;
-	else
+	if (!create)
 		len = min_t(u64, len, fs_info->sectorsize);
 
 	lockstart = start;
@@ -7678,9 +7808,8 @@
 		if (ret < 0)
 			goto unlock_err;
 
-		/* clear and unlock the entire range */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 unlock_bits, 1, 0, &cached_state);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state);
 	} else {
 		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
 						   start, len);
@@ -7696,9 +7825,8 @@
 		 */
 		lockstart = start + bh_result->b_size;
 		if (lockstart < lockend) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, unlock_bits, 1, 0,
-					 &cached_state);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     lockstart, lockend, &cached_state);
 		} else {
 			free_extent_state(cached_state);
 		}
@@ -7709,8 +7837,8 @@
 	return 0;
 
 unlock_err:
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			 unlock_bits, 1, 0, &cached_state);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state);
 err:
 	if (dio_data)
 		current->journal_info = dio_data;
@@ -7838,7 +7966,7 @@
 	struct inode *inode = done->inode;
 	struct bio_vec *bvec;
 	struct extent_io_tree *io_tree, *failure_tree;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status)
 		goto end;
@@ -7850,7 +7978,7 @@
 
 	done->uptodate = 1;
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, iter_all)
 		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
 				 io_tree, done->start, bvec->bv_page,
 				 btrfs_ino(BTRFS_I(inode)), 0);
@@ -7928,7 +8056,8 @@
 	struct bio_vec *bvec;
 	int uptodate;
 	int ret;
-	int i;
+	int i = 0;
+	struct bvec_iter_all iter_all;
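+	/* i keeps the per-sector csum index for __readpage_endio_check() */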
 
 	if (bio->bi_status)
 		goto end;
@@ -7942,7 +8071,7 @@
 	failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
 					     bvec->bv_offset, done->start,
 					     bvec->bv_len);
@@ -7954,6 +8083,7 @@
 					 bvec->bv_offset);
 		else
 			uptodate = 0;
+		i++;
 	}
 
 	done->uptodate = uptodate;
@@ -8070,9 +8200,7 @@
 
 	dio_bio->bi_status = err;
 	dio_end_io(dio_bio);
-
-	if (io_bio->end_io)
-		io_bio->end_io(io_bio, blk_status_to_errno(err));
+	btrfs_io_bio_free_csum(io_bio);
 	bio_put(bio);
 }
 
@@ -8115,7 +8243,7 @@
 			return;
 		/*
 		 * Our bio might span multiple ordered extents. In this case
-		 * we keep goin until we have accounted the whole dio.
+		 * we keep going until we have accounted the whole dio.
 		 */
 		if (ordered_offset < offset + bytes) {
 			ordered_bytes = offset + bytes - ordered_offset;
@@ -8274,22 +8402,21 @@
 	struct bio *orig_bio = dip->orig_bio;
 	u64 start_sector = orig_bio->bi_iter.bi_sector;
 	u64 file_offset = dip->logical_offset;
-	u64 map_length;
 	int async_submit = 0;
 	u64 submit_len;
 	int clone_offset = 0;
 	int clone_len;
 	int ret;
 	blk_status_t status;
+	struct btrfs_io_geometry geom;
 
-	map_length = orig_bio->bi_iter.bi_size;
-	submit_len = map_length;
-	ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
-			      &map_length, NULL, 0);
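+	/* Ask the mapping layer how far this bio can go before hitting a stripe boundary */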
+	submit_len = orig_bio->bi_iter.bi_size;
+	ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
+				    start_sector << 9, submit_len, &geom);
 	if (ret)
 		return -EIO;
 
-	if (map_length >= submit_len) {
+	if (geom.len >= submit_len) {
 		bio = orig_bio;
 		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
 		goto submit;
@@ -8302,10 +8429,10 @@
 		async_submit = 1;
 
 	/* bio split */
-	ASSERT(map_length <= INT_MAX);
+	ASSERT(geom.len <= INT_MAX);
 	atomic_inc(&dip->pending_bios);
 	do {
-		clone_len = min_t(int, submit_len, map_length);
+		clone_len = min_t(int, submit_len, geom.len);
 
 		/*
 		 * This will never fail as it's passing GFP_NOFS and
@@ -8342,9 +8469,8 @@
 		start_sector += clone_len >> 9;
 		file_offset += clone_len;
 
-		map_length = submit_len;
-		ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
-				      start_sector << 9, &map_length, NULL, 0);
+		ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
+				      start_sector << 9, submit_len, &geom);
 		if (ret)
 			goto out_err;
 	} while (submit_len > 0);
@@ -8425,8 +8551,7 @@
 	if (!ret)
 		return;
 
-	if (io_bio->end_io)
-		io_bio->end_io(io_bio, ret);
+	btrfs_io_bio_free_csum(io_bio);
 
 free_ordered:
 	/*
@@ -8597,7 +8722,7 @@
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, data_reserved,
 					offset, count - (size_t)ret, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
 	}
 out:
 	if (wakeup)
@@ -8728,8 +8853,7 @@
 		 */
 		if (!inode_evicting)
 			clear_extent_bit(tree, start, end,
-					 EXTENT_DIRTY | EXTENT_DELALLOC |
-					 EXTENT_DELALLOC_NEW |
+					 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 					 EXTENT_DEFRAG, 1, 0, &cached_state);
 		/*
@@ -8784,8 +8908,7 @@
 	if (PageDirty(page))
 		btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
 	if (!inode_evicting) {
-		clear_extent_bit(tree, page_start, page_end,
-				 EXTENT_LOCKED | EXTENT_DIRTY |
+		clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
 				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
 				 &cached_state);
@@ -8913,12 +9036,11 @@
 	 * reserve data&meta space before lock_page() (see above comments).
 	 */
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			  0, 0, &cached_state);
+			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+			  EXTENT_DEFRAG, 0, 0, &cached_state);
 
 	ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
-					&cached_state, 0);
+					&cached_state);
 	if (ret2) {
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state);
@@ -8929,7 +9051,7 @@
 
 	/* page is wholly or partially inside EOF */
 	if (page_start + PAGE_SIZE > size)
-		zero_start = size & ~PAGE_MASK;
+		zero_start = offset_in_page(size);
 	else
 		zero_start = PAGE_SIZE;
 
@@ -8950,7 +9072,7 @@
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
 
 	if (!ret2) {
-		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 		sb_end_pagefault(inode->i_sb);
 		extent_changeset_free(data_reserved);
 		return VM_FAULT_LOCKED;
@@ -8959,7 +9081,7 @@
 out_unlock:
 	unlock_page(page);
 out:
-	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 	btrfs_delalloc_release_space(inode, data_reserved, page_start,
 				     reserved_space, (ret != 0));
 out_noreserve:
@@ -8976,7 +9098,7 @@
 	int ret;
 	struct btrfs_trans_handle *trans;
 	u64 mask = fs_info->sectorsize - 1;
-	u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
+	u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
 
 	if (!skip_writeback) {
 		ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -9031,7 +9153,7 @@
 
 	/* Migrate the slack space for the truncate to our reserve */
 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
-				      min_size, 0);
+				      min_size, false);
 	BUG_ON(ret);
 
 	/*
@@ -9068,7 +9190,7 @@
 
 		btrfs_block_rsv_release(fs_info, rsv, -1);
 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
-					      rsv, min_size, 0);
+					      rsv, min_size, false);
 		BUG_ON(ret);	/* shouldn't happen */
 		trans->block_rsv = rsv;
 	}
@@ -9192,10 +9314,11 @@
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree);
-	extent_io_tree_init(&ei->io_tree, inode);
-	extent_io_tree_init(&ei->io_failure_tree, inode);
-	ei->io_tree.track_uptodate = 1;
-	ei->io_failure_tree.track_uptodate = 1;
+	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
+	extent_io_tree_init(fs_info, &ei->io_failure_tree,
+			    IO_TREE_INODE_IO_FAILURE, inode);
+	ei->io_tree.track_uptodate = true;
+	ei->io_failure_tree.track_uptodate = true;
 	atomic_set(&ei->sync_writers, 0);
 	mutex_init(&ei->log_mutex);
 	mutex_init(&ei->delalloc_mutex);
@@ -9216,9 +9339,8 @@
 }
 #endif
 
-static void btrfs_i_callback(struct rcu_head *head)
+void btrfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -9244,7 +9366,7 @@
 	 * created.
 	 */
 	if (!root)
-		goto free;
+		return;
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -9262,8 +9384,6 @@
 	btrfs_qgroup_check_reserved_leak(inode);
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
-free:
-	call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 
 int btrfs_drop_inode(struct inode *inode)
@@ -9298,6 +9418,7 @@
 	kmem_cache_destroy(btrfs_trans_handle_cachep);
 	kmem_cache_destroy(btrfs_path_cachep);
 	kmem_cache_destroy(btrfs_free_space_cachep);
+	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
 }
 
 int __init btrfs_init_cachep(void)
@@ -9327,6 +9448,12 @@
 	if (!btrfs_free_space_cachep)
 		goto fail;
 
+	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
+							PAGE_SIZE, PAGE_SIZE,
+							SLAB_RED_ZONE, NULL);
+	if (!btrfs_free_space_bitmap_cachep)
+		goto fail;
+
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -9440,7 +9567,7 @@
 	/* Reference for the source. */
 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		/* force full log commit if subvolume involved. */
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 	} else {
 		btrfs_pin_log_trans(root);
 		root_log_pinned = true;
@@ -9457,7 +9584,7 @@
 	/* And now for the dest. */
 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		/* force full log commit if subvolume involved. */
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 	} else {
 		btrfs_pin_log_trans(dest);
 		dest_log_pinned = true;
@@ -9593,7 +9720,7 @@
 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
 		    (new_inode &&
 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
-			btrfs_set_log_full_commit(fs_info, trans);
+			btrfs_set_log_full_commit(trans);
 
 		if (root_log_pinned) {
 			btrfs_end_log_trans(root);
@@ -9617,6 +9744,18 @@
 			commit_transaction = true;
 	}
 	if (commit_transaction) {
+		/*
+		 * We may have set commit_transaction when logging the new name
+		 * root context in the list of log contexts. So make sure we
+		 * root context in the list of log contextes. So make sure we
+		 * remove it to avoid invalid memory accesses, since the context
+		 * was allocated in our stack frame.
+		 */
+		if (sync_log_root) {
+			mutex_lock(&root->log_mutex);
+			list_del_init(&ctx_root.list);
+			mutex_unlock(&root->log_mutex);
+		}
 		ret = btrfs_commit_transaction(trans);
 	} else {
 		int ret2;
@@ -9630,6 +9769,9 @@
 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&fs_info->subvol_sem);
 
+	ASSERT(list_empty(&ctx_root.list));
+	ASSERT(list_empty(&ctx_dest.list));
+
 	return ret;
 }
 
@@ -9779,7 +9921,7 @@
 	BTRFS_I(old_inode)->dir_index = 0ULL;
 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		/* force full log commit if subvolume involved. */
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 	} else {
 		btrfs_pin_log_trans(root);
 		log_pinned = true;
@@ -9900,7 +10042,7 @@
 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
 		    (new_inode &&
 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
-			btrfs_set_log_full_commit(fs_info, trans);
+			btrfs_set_log_full_commit(trans);
 
 		btrfs_end_log_trans(root);
 		log_pinned = false;
@@ -9974,7 +10116,6 @@
 	init_completion(&work->completion);
 	INIT_LIST_HEAD(&work->list);
 	work->inode = inode;
-	WARN_ON_ONCE(!inode);
 	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
 			btrfs_run_delalloc_work, NULL, NULL);
 
@@ -9985,7 +10126,7 @@
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -10013,6 +10154,9 @@
 		}
 		spin_unlock(&root->delalloc_lock);
 
+		if (snapshot)
+			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+				&binode->runtime_flags);
 		work = btrfs_alloc_delalloc_work(inode);
 		if (!work) {
 			iput(inode);
@@ -10046,7 +10190,7 @@
 	return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -10054,7 +10198,7 @@
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	ret = start_delalloc_inodes(root, -1);
+	ret = start_delalloc_inodes(root, -1, true);
 	if (ret > 0)
 		ret = 0;
 	return ret;
@@ -10083,7 +10227,7 @@
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, nr);
+		ret = start_delalloc_inodes(root, nr, false);
 		btrfs_put_fs_root(root);
 		if (ret < 0)
 			goto out;
@@ -10201,7 +10345,6 @@
 
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode_nohighmem(inode);
-	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(BTRFS_I(inode), name_len);
 	err = btrfs_update_inode(trans, root, inode);
@@ -10462,26 +10605,6 @@
 	return ret;
 }
 
-__attribute__((const))
-static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
-{
-	return -EAGAIN;
-}
-
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
-					u64 start, u64 end)
-{
-	struct inode *inode = private_data;
-	u64 isize;
-
-	isize = i_size_read(inode);
-	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
-			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
-	}
-}
-
 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	struct inode *inode = tree->private_data;
@@ -10498,6 +10621,343 @@
 	}
 }
 
+#ifdef CONFIG_SWAP
+/*
+ * Add an entry indicating a block group or device which is pinned by a
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
+ * negative errno on failure.
+ */
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
+				  bool is_block_group)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct btrfs_swapfile_pin *sp, *entry;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+
+	sp = kmalloc(sizeof(*sp), GFP_NOFS);
+	if (!sp)
+		return -ENOMEM;
+	sp->ptr = ptr;
+	sp->inode = inode;
+	sp->is_block_group = is_block_group;
+
+	spin_lock(&fs_info->swapfile_pins_lock);
+	p = &fs_info->swapfile_pins.rb_node;
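+	/* Sort by (ptr, inode) so one device/block group can be pinned by many swapfiles */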
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
+		if (sp->ptr < entry->ptr ||
+		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
+			p = &(*p)->rb_left;
+		} else if (sp->ptr > entry->ptr ||
+			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
+			p = &(*p)->rb_right;
+		} else {
+			spin_unlock(&fs_info->swapfile_pins_lock);
+			kfree(sp);
+			return 1;
+		}
+	}
+	rb_link_node(&sp->node, parent, p);
+	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
+	spin_unlock(&fs_info->swapfile_pins_lock);
+	return 0;
+}
+
+/* Free all of the entries pinned by this swapfile. */
+static void btrfs_free_swapfile_pins(struct inode *inode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct btrfs_swapfile_pin *sp;
+	struct rb_node *node, *next;
+
+	spin_lock(&fs_info->swapfile_pins_lock);
+	node = rb_first(&fs_info->swapfile_pins);
+	while (node) {
+		next = rb_next(node);
+		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+		if (sp->inode == inode) {
+			rb_erase(&sp->node, &fs_info->swapfile_pins);
+			if (sp->is_block_group)
+				btrfs_put_block_group(sp->ptr);
+			kfree(sp);
+		}
+		node = next;
+	}
+	spin_unlock(&fs_info->swapfile_pins_lock);
+}
+
+struct btrfs_swap_info {
+	u64 start;
+	u64 block_start;
+	u64 block_len;
+	u64 lowest_ppage;
+	u64 highest_ppage;
+	unsigned long nr_pages;
+	int nr_extents;
+};
+
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
+				 struct btrfs_swap_info *bsi)
+{
+	unsigned long nr_pages;
+	u64 first_ppage, first_ppage_reported, next_ppage;
+	int ret;
+
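+	/* Only whole pages can swap, so round the extent inward to page boundaries */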
+	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+				PAGE_SIZE) >> PAGE_SHIFT;
+
+	if (first_ppage >= next_ppage)
+		return 0;
+	nr_pages = next_ppage - first_ppage;
+
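+	/*
+	 * Page 0 of the swapfile holds the swap header, so it is never handed
+	 * out as a swap page.
+	 */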
+	first_ppage_reported = first_ppage;
+	if (bsi->start == 0)
+		first_ppage_reported++;
+	if (bsi->lowest_ppage > first_ppage_reported)
+		bsi->lowest_ppage = first_ppage_reported;
+	if (bsi->highest_ppage < (next_ppage - 1))
+		bsi->highest_ppage = next_ppage - 1;
+
+	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
+	if (ret < 0)
+		return ret;
+	bsi->nr_extents += ret;
+	bsi->nr_pages += nr_pages;
+	return 0;
+}
+
+static void btrfs_swap_deactivate(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	btrfs_free_swapfile_pins(inode);
+	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+			       sector_t *span)
+{
+	struct inode *inode = file_inode(file);
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
+	struct extent_map *em = NULL;
+	struct btrfs_device *device = NULL;
+	struct btrfs_swap_info bsi = {
+		.lowest_ppage = (sector_t)-1ULL,
+	};
+	int ret = 0;
+	u64 isize;
+	u64 start;
+
+	/*
+	 * If the swap file was just created, make sure delalloc is done. If the
+	 * file changes again after this, the user is doing something stupid and
+	 * we don't really care.
+	 */
+	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (ret)
+		return ret;
+
+	/*
+	 * The inode is locked, so these flags won't change after we check them.
+	 */
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
+		btrfs_warn(fs_info, "swapfile must not be compressed");
+		return -EINVAL;
+	}
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
+		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
+		return -EINVAL;
+	}
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+		btrfs_warn(fs_info, "swapfile must not be checksummed");
+		return -EINVAL;
+	}
+
+	/*
+	 * Balance or device remove/replace/resize can move stuff around from
+	 * under us. The EXCL_OP flag makes sure they aren't running/won't run
+	 * concurrently while we are mapping the swap extents, and
+	 * fs_info->swapfile_pins prevents them from running while the swap file
+	 * is active and moving the extents. Note that this also prevents a
+	 * concurrent device add which isn't actually necessary, but it's not
+	 * really worth the trouble to allow it.
+	 */
+	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+		btrfs_warn(fs_info,
+	   "cannot activate swapfile while exclusive operation is running");
+		return -EBUSY;
+	}
+	/*
+	 * Snapshots can create extents which require COW even if NODATACOW is
+	 * set. We use this counter to prevent snapshots. We must increment it
+	 * before walking the extents because we don't want a concurrent
+	 * snapshot to run after we've already checked the extents.
+	 */
+	atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+
+	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
+
+	lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+	start = 0;
+	while (start < isize) {
+		u64 logical_block_start, physical_block_start;
+		struct btrfs_block_group_cache *bg;
+		u64 len = isize - start;
+
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (em->block_start == EXTENT_MAP_HOLE) {
+			btrfs_warn(fs_info, "swapfile must not have holes");
+			ret = -EINVAL;
+			goto out;
+		}
+		if (em->block_start == EXTENT_MAP_INLINE) {
+			/*
+			 * It's unlikely we'll ever actually find ourselves
+			 * here, as a file small enough to fit inline won't be
+			 * big enough to store more than the swap header, but in
+			 * case something changes in the future, let's catch it
+			 * here rather than later.
+			 */
+			btrfs_warn(fs_info, "swapfile must not be inline");
+			ret = -EINVAL;
+			goto out;
+		}
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			btrfs_warn(fs_info, "swapfile must not be compressed");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		logical_block_start = em->block_start + (start - em->start);
+		len = min(len, em->len - (start - em->start));
+		free_extent_map(em);
+		em = NULL;
+
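+		/*
+		 * A return of 1 means the range can be written without COW;
+		 * anything else disqualifies it as a swapfile extent.
+		 */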
+		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+		if (ret < 0) {
+			goto out;
+		} else if (ret) {
+			ret = 0;
+		} else {
+			btrfs_warn(fs_info,
+				   "swapfile must not be copy-on-write");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+			btrfs_warn(fs_info,
+				   "swapfile must have single data profile");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (device == NULL) {
+			device = em->map_lookup->stripes[0].dev;
+			ret = btrfs_add_swapfile_pin(inode, device, false);
+			if (ret == 1)
+				ret = 0;
+			else if (ret)
+				goto out;
+		} else if (device != em->map_lookup->stripes[0].dev) {
+			btrfs_warn(fs_info, "swapfile must be on one device");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		physical_block_start = (em->map_lookup->stripes[0].physical +
+					(logical_block_start - em->start));
+		len = min(len, em->len - (logical_block_start - em->start));
+		free_extent_map(em);
+		em = NULL;
+
+		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
+		if (!bg) {
+			btrfs_warn(fs_info,
+			   "could not find block group containing swapfile");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		ret = btrfs_add_swapfile_pin(inode, bg, true);
+		if (ret) {
+			btrfs_put_block_group(bg);
+			if (ret == 1)
+				ret = 0;
+			else
+				goto out;
+		}
+
+		if (bsi.block_len &&
+		    bsi.block_start + bsi.block_len == physical_block_start) {
+			bsi.block_len += len;
+		} else {
+			if (bsi.block_len) {
+				ret = btrfs_add_swap_extent(sis, &bsi);
+				if (ret)
+					goto out;
+			}
+			bsi.start = start;
+			bsi.block_start = physical_block_start;
+			bsi.block_len = len;
+		}
+
+		start += len;
+	}
+
+	if (bsi.block_len)
+		ret = btrfs_add_swap_extent(sis, &bsi);
+
+out:
+	if (!IS_ERR_OR_NULL(em))
+		free_extent_map(em);
+
+	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+
+	if (ret)
+		btrfs_swap_deactivate(file);
+
+	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
+	if (ret)
+		return ret;
+
+	if (device)
+		sis->bdev = device->bdev;
+	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
+	sis->max = bsi.nr_pages;
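+	/* Page 0 holds the swap header and is not usable, hence the off-by-one */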
+	sis->pages = bsi.nr_pages - 1;
+	sis->highest_bit = bsi.nr_pages - 1;
+	return bsi.nr_extents;
+}
+#else
+static void btrfs_swap_deactivate(struct file *file)
+{
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+			       sector_t *span)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 static const struct inode_operations btrfs_dir_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
@@ -10540,17 +11000,6 @@
 	/* mandatory callbacks */
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
-	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
-
-	/* optional callbacks */
-	.fill_delalloc = run_delalloc_range,
-	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
-	.writepage_start_hook = btrfs_writepage_start_hook,
-	.set_bit_hook = btrfs_set_bit_hook,
-	.clear_bit_hook = btrfs_clear_bit_hook,
-	.merge_extent_hook = btrfs_merge_extent_hook,
-	.split_extent_hook = btrfs_split_extent_hook,
-	.check_extent_io_range = btrfs_check_extent_io_range,
 };
 
 /*
@@ -10575,13 +11024,8 @@
 	.releasepage	= btrfs_releasepage,
 	.set_page_dirty	= btrfs_set_page_dirty,
 	.error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations btrfs_symlink_aops = {
-	.readpage	= btrfs_readpage,
-	.writepage	= btrfs_writepage,
-	.invalidatepage = btrfs_invalidatepage,
-	.releasepage	= btrfs_releasepage,
+	.swap_activate	= btrfs_swap_activate,
+	.swap_deactivate = btrfs_swap_deactivate,
 };
 
 static const struct inode_operations btrfs_file_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c915215..23272d9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,9 @@
 #include "qgroup.h"
 #include "tree-log.h"
 #include "compression.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -189,9 +192,8 @@
 	struct btrfs_trans_handle *trans;
 	unsigned int fsflags, old_fsflags;
 	int ret;
-	u64 old_flags;
-	unsigned int old_i_flags;
-	umode_t mode;
+	const char *comp = NULL;
+	u32 binode_flags = binode->flags;
 
 	if (!inode_owner_or_capable(inode))
 		return -EPERM;
@@ -212,66 +214,59 @@
 
 	inode_lock(inode);
 
-	old_flags = binode->flags;
-	old_i_flags = inode->i_flags;
-	mode = inode->i_mode;
-
 	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
 	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
-	if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			ret = -EPERM;
-			goto out_unlock;
-		}
-	}
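+	/* The generic helper enforces CAP_LINUX_IMMUTABLE for APPEND/IMMUTABLE changes */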
+	ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+	if (ret)
+		goto out_unlock;
 
 	if (fsflags & FS_SYNC_FL)
-		binode->flags |= BTRFS_INODE_SYNC;
+		binode_flags |= BTRFS_INODE_SYNC;
 	else
-		binode->flags &= ~BTRFS_INODE_SYNC;
+		binode_flags &= ~BTRFS_INODE_SYNC;
 	if (fsflags & FS_IMMUTABLE_FL)
-		binode->flags |= BTRFS_INODE_IMMUTABLE;
+		binode_flags |= BTRFS_INODE_IMMUTABLE;
 	else
-		binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+		binode_flags &= ~BTRFS_INODE_IMMUTABLE;
 	if (fsflags & FS_APPEND_FL)
-		binode->flags |= BTRFS_INODE_APPEND;
+		binode_flags |= BTRFS_INODE_APPEND;
 	else
-		binode->flags &= ~BTRFS_INODE_APPEND;
+		binode_flags &= ~BTRFS_INODE_APPEND;
 	if (fsflags & FS_NODUMP_FL)
-		binode->flags |= BTRFS_INODE_NODUMP;
+		binode_flags |= BTRFS_INODE_NODUMP;
 	else
-		binode->flags &= ~BTRFS_INODE_NODUMP;
+		binode_flags &= ~BTRFS_INODE_NODUMP;
 	if (fsflags & FS_NOATIME_FL)
-		binode->flags |= BTRFS_INODE_NOATIME;
+		binode_flags |= BTRFS_INODE_NOATIME;
 	else
-		binode->flags &= ~BTRFS_INODE_NOATIME;
+		binode_flags &= ~BTRFS_INODE_NOATIME;
 	if (fsflags & FS_DIRSYNC_FL)
-		binode->flags |= BTRFS_INODE_DIRSYNC;
+		binode_flags |= BTRFS_INODE_DIRSYNC;
 	else
-		binode->flags &= ~BTRFS_INODE_DIRSYNC;
+		binode_flags &= ~BTRFS_INODE_DIRSYNC;
 	if (fsflags & FS_NOCOW_FL) {
-		if (S_ISREG(mode)) {
+		if (S_ISREG(inode->i_mode)) {
 			/*
 			 * It's safe to turn csums off here, no extents exist.
 			 * Otherwise we want the flag to reflect the real COW
 			 * status of the file and will not set it.
 			 */
 			if (inode->i_size == 0)
-				binode->flags |= BTRFS_INODE_NODATACOW
-					      | BTRFS_INODE_NODATASUM;
+				binode_flags |= BTRFS_INODE_NODATACOW |
+						BTRFS_INODE_NODATASUM;
 		} else {
-			binode->flags |= BTRFS_INODE_NODATACOW;
+			binode_flags |= BTRFS_INODE_NODATACOW;
 		}
 	} else {
 		/*
 		 * Revert back under same assumptions as above
 		 */
-		if (S_ISREG(mode)) {
+		if (S_ISREG(inode->i_mode)) {
 			if (inode->i_size == 0)
-				binode->flags &= ~(BTRFS_INODE_NODATACOW
-				             | BTRFS_INODE_NODATASUM);
+				binode_flags &= ~(BTRFS_INODE_NODATACOW |
+						  BTRFS_INODE_NODATASUM);
 		} else {
-			binode->flags &= ~BTRFS_INODE_NODATACOW;
+			binode_flags &= ~BTRFS_INODE_NODATACOW;
 		}
 	}
 
@@ -281,52 +276,59 @@
 	 * things smaller.
 	 */
 	if (fsflags & FS_NOCOMP_FL) {
-		binode->flags &= ~BTRFS_INODE_COMPRESS;
-		binode->flags |= BTRFS_INODE_NOCOMPRESS;
-
-		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
-		if (ret && ret != -ENODATA)
-			goto out_drop;
+		binode_flags &= ~BTRFS_INODE_COMPRESS;
+		binode_flags |= BTRFS_INODE_NOCOMPRESS;
 	} else if (fsflags & FS_COMPR_FL) {
-		const char *comp;
 
-		binode->flags |= BTRFS_INODE_COMPRESS;
-		binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+		if (IS_SWAPFILE(inode)) {
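+			/* Extents of an active swapfile are pinned and must not be compressed */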
+			ret = -ETXTBSY;
+			goto out_unlock;
+		}
+
+		binode_flags |= BTRFS_INODE_COMPRESS;
+		binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
 
 		comp = btrfs_compress_type2str(fs_info->compress_type);
 		if (!comp || comp[0] == 0)
 			comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
-
-		ret = btrfs_set_prop(inode, "btrfs.compression",
-				     comp, strlen(comp), 0);
-		if (ret)
-			goto out_drop;
-
 	} else {
-		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
-		if (ret && ret != -ENODATA)
-			goto out_drop;
-		binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+		binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 	}
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 for inode item
+	 * 2 for properties
+	 */
+	trans = btrfs_start_transaction(root, 3);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		goto out_drop;
+		goto out_unlock;
 	}
 
+	if (comp) {
+		ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
+				     strlen(comp), 0);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out_end_trans;
+		}
+	} else {
+		ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
+				     0, 0);
+		if (ret && ret != -ENODATA) {
+			btrfs_abort_transaction(trans, ret);
+			goto out_end_trans;
+		}
+	}
+
+	binode->flags = binode_flags;
 	btrfs_sync_inode_flags_to_i_flags(inode);
 	inode_inc_iversion(inode);
 	inode->i_ctime = current_time(inode);
 	ret = btrfs_update_inode(trans, root, inode);
 
+ out_end_trans:
 	btrfs_end_transaction(trans);
- out_drop:
-	if (ret) {
-		binode->flags = old_flags;
-		inode->i_flags = old_i_flags;
-	}
-
  out_unlock:
 	inode_unlock(inode);
 	mnt_drop_write_file(file);
@@ -374,9 +376,7 @@
 	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
 	struct fsxattr fa;
 
-	memset(&fa, 0, sizeof(fa));
-	fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
-
+	simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
 	if (copy_to_user(arg, &fa, sizeof(fa)))
 		return -EFAULT;
 
@@ -389,7 +389,7 @@
 	struct btrfs_inode *binode = BTRFS_I(inode);
 	struct btrfs_root *root = binode->root;
 	struct btrfs_trans_handle *trans;
-	struct fsxattr fa;
+	struct fsxattr fa, old_fa;
 	unsigned old_flags;
 	unsigned old_i_flags;
 	int ret = 0;
@@ -400,7 +400,6 @@
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	memset(&fa, 0, sizeof(fa));
 	if (copy_from_user(&fa, arg, sizeof(fa)))
 		return -EFAULT;
 
@@ -420,13 +419,11 @@
 	old_flags = binode->flags;
 	old_i_flags = inode->i_flags;
 
-	/* We need the capabilities to change append-only or immutable inode */
-	if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
-	     (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
-	    !capable(CAP_LINUX_IMMUTABLE)) {
-		ret = -EPERM;
+	simple_fill_fsxattr(&old_fa,
+			    btrfs_inode_flags_to_xflags(binode->flags));
+	ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
+	if (ret)
 		goto out_unlock;
-	}
 
 	if (fa.fsx_xflags & FS_XFLAG_SYNC)
 		binode->flags |= BTRFS_INODE_SYNC;
@@ -496,6 +493,16 @@
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	/*
+	 * If the fs is mounted with nologreplay, which requires it to be
+	 * mounted in RO mode as well, we cannot allow discard on free space
+	 * inside block groups, because log trees refer to extents that are not
+	 * pinned in a block group's free space cache (pinning the extents is
+	 * precisely the first phase of replaying a log tree).
+	 */
+	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+		return -EROFS;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
 				dev_list) {
@@ -689,8 +696,7 @@
 		goto fail;
 	}
 
-	ret = btrfs_insert_dir_item(trans, root,
-				    name, namelen, BTRFS_I(dir), &key,
+	ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
 				    BTRFS_FT_DIR, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
@@ -755,6 +761,12 @@
 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 		return -EINVAL;
 
+	if (atomic_read(&root->nr_swapfiles)) {
+		btrfs_warn(fs_info,
+			   "cannot snapshot subvolume with active swapfile");
+		return -ETXTBSY;
+	}
+
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
 	if (!pending_snapshot)
 		return -ENOMEM;
@@ -778,7 +790,7 @@
 	wait_event(root->subv_writers->wait,
 		   percpu_counter_sum(&root->subv_writers->counter) == 0);
 
-	ret = btrfs_start_delalloc_inodes(root);
+	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret)
 		goto dec_and_free;
 
@@ -1321,13 +1333,12 @@
 	lock_extent_bits(&BTRFS_I(inode)->io_tree,
 			 page_start, page_end - 1, &cached_state);
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
-			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
-			  &cached_state);
+			  page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+			  EXTENT_DEFRAG, 0, 0, &cached_state);
 
 	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
-		BTRFS_I(inode)->outstanding_extents++;
+		btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
 		spin_unlock(&BTRFS_I(inode)->lock);
 		btrfs_delalloc_release_space(inode, data_reserved,
 				start_index << PAGE_SHIFT,
@@ -1349,8 +1360,7 @@
 		unlock_page(pages[i]);
 		put_page(pages[i]);
 	}
-	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
-				       false);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
 	extent_changeset_free(data_reserved);
 	return i_done;
 out:
@@ -1361,8 +1371,7 @@
 	btrfs_delalloc_release_space(inode, data_reserved,
 			start_index << PAGE_SHIFT,
 			page_cnt << PAGE_SHIFT, true);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
-				       true);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
 	extent_changeset_free(data_reserved);
 	return ret;
 
@@ -1506,9 +1515,13 @@
 		}
 
 		inode_lock(inode);
-		if (do_compress)
-			BTRFS_I(inode)->defrag_compress = compress_type;
-		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+		if (IS_SWAPFILE(inode)) {
+			ret = -ETXTBSY;
+		} else {
+			if (do_compress)
+				BTRFS_I(inode)->defrag_compress = compress_type;
+			ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+		}
 		if (ret < 0) {
 			inode_unlock(inode);
 			goto out_ra;
@@ -1628,7 +1641,7 @@
 		btrfs_info(fs_info, "resizing devid %llu", devid);
 	}
 
-	device = btrfs_find_device(fs_info, devid, NULL, NULL);
+	device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (!device) {
 		btrfs_info(fs_info, "resizer unable to find device %llu",
 			   devid);
@@ -1825,8 +1838,15 @@
 		goto free_args;
 	}
 
-	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+		struct inode *inode = file_inode(file);
+		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+		btrfs_warn(fs_info,
+"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
+
 		ptr = &transid;
+	}
 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
 		readonly = true;
 	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
@@ -2907,8 +2927,10 @@
 	inode_lock(inode);
 	err = btrfs_delete_subvolume(dir, dentry);
 	inode_unlock(inode);
-	if (!err)
+	if (!err) {
+		fsnotify_rmdir(dir, dentry);
 		d_delete(dentry);
+	}
 
 out_dput:
 	dput(dentry);
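
The added fsnotify_rmdir() brings subvolume deletion in line with vfs_rmdir(), which already notifies watchers when an ordinary directory goes away. A minimal userspace check (illustrative; error handling elided):

	#include <stdio.h>
	#include <sys/inotify.h>
	#include <unistd.h>

	/* Watch a directory; deleting a subvolume inside it now produces
	 * an IN_DELETE | IN_ISDIR event, just like rmdir on a plain dir. */
	int main(int argc, char **argv)
	{
		char buf[4096];
		int fd = inotify_init1(0);

		inotify_add_watch(fd, argv[1], IN_DELETE);
		if (read(fd, buf, sizeof(buf)) > 0) {
			struct inotify_event *ev = (struct inotify_event *)buf;

			printf("deleted: %s%s\n", ev->len ? ev->name : "?",
			       (ev->mask & IN_ISDIR) ? " (dir)" : "");
		}
		return 0;
	}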
@@ -3136,7 +3158,7 @@
 	}
 	rcu_read_unlock();
 
-	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
+	memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
 	fi_args->nodesize = fs_info->nodesize;
 	fi_args->sectorsize = fs_info->sectorsize;
 	fi_args->clone_alignment = fs_info->sectorsize;
@@ -3164,7 +3186,8 @@
 		s_uuid = di_args->uuid;
 
 	rcu_read_lock();
-	dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+	dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+				NULL, true);
 
 	if (!dev) {
 		ret = -ENODEV;
@@ -3192,362 +3215,38 @@
 	return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
-{
-	struct page *page;
-
-	page = grab_cache_page(inode->i_mapping, index);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	if (!PageUptodate(page)) {
-		int ret;
-
-		ret = btrfs_readpage(NULL, page);
-		if (ret)
-			return ERR_PTR(ret);
-		lock_page(page);
-		if (!PageUptodate(page)) {
-			unlock_page(page);
-			put_page(page);
-			return ERR_PTR(-EIO);
-		}
-		if (page->mapping != inode->i_mapping) {
-			unlock_page(page);
-			put_page(page);
-			return ERR_PTR(-EAGAIN);
-		}
-	}
-
-	return page;
-}
-
-static int gather_extent_pages(struct inode *inode, struct page **pages,
-			       int num_pages, u64 off)
-{
-	int i;
-	pgoff_t index = off >> PAGE_SHIFT;
-
-	for (i = 0; i < num_pages; i++) {
-again:
-		pages[i] = extent_same_get_page(inode, index + i);
-		if (IS_ERR(pages[i])) {
-			int err = PTR_ERR(pages[i]);
-
-			if (err == -EAGAIN)
-				goto again;
-			pages[i] = NULL;
-			return err;
-		}
-	}
-	return 0;
-}
-
-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
-			     bool retry_range_locking)
-{
-	/*
-	 * Do any pending delalloc/csum calculations on inode, one way or
-	 * another, and lock file content.
-	 * The locking order is:
-	 *
-	 *   1) pages
-	 *   2) range in the inode's io tree
-	 */
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
-		ordered = btrfs_lookup_first_ordered_extent(inode,
-							    off + len - 1);
-		if ((!ordered ||
-		     ordered->file_offset + ordered->len <= off ||
-		     ordered->file_offset >= off + len) &&
-		    !test_range_bit(&BTRFS_I(inode)->io_tree, off,
-				    off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		if (!retry_range_locking)
-			return -EAGAIN;
-		btrfs_wait_ordered_range(inode, off, len);
-	}
-	return 0;
-}
-
-static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
-{
-	inode_unlock(inode1);
-	inode_unlock(inode2);
-}
-
-static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1 < inode2)
-		swap(inode1, inode2);
-
-	inode_lock_nested(inode1, I_MUTEX_PARENT);
-	inode_lock_nested(inode2, I_MUTEX_CHILD);
-}
-
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
-				      struct inode *inode2, u64 loff2, u64 len)
+				       struct inode *inode2, u64 loff2, u64 len)
 {
 	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
 	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
 
-static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
-				    struct inode *inode2, u64 loff2, u64 len,
-				    bool retry_range_locking)
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+				     struct inode *inode2, u64 loff2, u64 len)
 {
-	int ret;
-
 	if (inode1 < inode2) {
 		swap(inode1, inode2);
 		swap(loff1, loff2);
+	} else if (inode1 == inode2 && loff2 < loff1) {
+		swap(loff1, loff2);
 	}
-	ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
-	if (ret)
-		return ret;
-	ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
-	if (ret)
-		unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
-			      loff1 + len - 1);
-	return ret;
+	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
 
-struct cmp_pages {
-	int		num_pages;
-	struct page	**src_pages;
-	struct page	**dst_pages;
-};
-
-static void btrfs_cmp_data_free(struct cmp_pages *cmp)
-{
-	int i;
-	struct page *pg;
-
-	for (i = 0; i < cmp->num_pages; i++) {
-		pg = cmp->src_pages[i];
-		if (pg) {
-			unlock_page(pg);
-			put_page(pg);
-			cmp->src_pages[i] = NULL;
-		}
-		pg = cmp->dst_pages[i];
-		if (pg) {
-			unlock_page(pg);
-			put_page(pg);
-			cmp->dst_pages[i] = NULL;
-		}
-	}
-}
-
-static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
-				  struct inode *dst, u64 dst_loff,
-				  u64 len, struct cmp_pages *cmp)
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+				   struct inode *dst, u64 dst_loff)
 {
 	int ret;
-	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
 
-	cmp->num_pages = num_pages;
-
-	ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
-	if (ret)
-		goto out;
-
-	ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
-
-out:
-	if (ret)
-		btrfs_cmp_data_free(cmp);
-	return ret;
-}
-
-static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
-{
-	int ret = 0;
-	int i;
-	struct page *src_page, *dst_page;
-	unsigned int cmp_len = PAGE_SIZE;
-	void *addr, *dst_addr;
-
-	i = 0;
-	while (len) {
-		if (len < PAGE_SIZE)
-			cmp_len = len;
-
-		BUG_ON(i >= cmp->num_pages);
-
-		src_page = cmp->src_pages[i];
-		dst_page = cmp->dst_pages[i];
-		ASSERT(PageLocked(src_page));
-		ASSERT(PageLocked(dst_page));
-
-		addr = kmap_atomic(src_page);
-		dst_addr = kmap_atomic(dst_page);
-
-		flush_dcache_page(src_page);
-		flush_dcache_page(dst_page);
-
-		if (memcmp(addr, dst_addr, cmp_len))
-			ret = -EBADE;
-
-		kunmap_atomic(addr);
-		kunmap_atomic(dst_addr);
-
-		if (ret)
-			break;
-
-		len -= cmp_len;
-		i++;
-	}
-
-	return ret;
-}
-
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
-				     u64 olen)
-{
-	u64 len = *plen;
-	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
-
-	if (off + olen > inode->i_size || off + olen < off)
-		return -EINVAL;
-
-	/* if we extend to eof, continue to block boundary */
-	if (off + len == inode->i_size)
-		*plen = len = ALIGN(inode->i_size, bs) - off;
-
-	/* Check that we are block aligned - btrfs_clone() requires this */
-	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
-				   struct inode *dst, u64 dst_loff,
-				   struct cmp_pages *cmp)
-{
-	int ret;
-	u64 len = olen;
-	bool same_inode = (src == dst);
-	u64 same_lock_start = 0;
-	u64 same_lock_len = 0;
-
-	ret = extent_same_check_offsets(src, loff, &len, olen);
-	if (ret)
-		return ret;
-
-	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
-	if (ret)
-		return ret;
-
-	if (same_inode) {
-		/*
-		 * Single inode case wants the same checks, except we
-		 * don't want our length pushed out past i_size as
-		 * comparing that data range makes no sense.
-		 *
-		 * extent_same_check_offsets() will do this for an
-		 * unaligned length at i_size, so catch it here and
-		 * reject the request.
-		 *
-		 * This effectively means we require aligned extents
-		 * for the single-inode case, whereas the other cases
-		 * allow an unaligned length so long as it ends at
-		 * i_size.
-		 */
-		if (len != olen)
-			return -EINVAL;
-
-		/* Check for overlapping ranges */
-		if (dst_loff + len > loff && dst_loff < loff + len)
-			return -EINVAL;
-
-		same_lock_start = min_t(u64, loff, dst_loff);
-		same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
-	} else {
-		/*
-		 * If the source and destination inodes are different, the
-		 * source's range end offset matches the source's i_size, that
-		 * i_size is not a multiple of the sector size, and the
-		 * destination range does not go past the destination's i_size,
-		 * we must round down the length to the nearest sector size
-		 * multiple. If we don't do this adjustment we end replacing
-		 * with zeroes the bytes in the range that starts at the
-		 * deduplication range's end offset and ends at the next sector
-		 * size multiple.
-		 */
-		if (loff + olen == i_size_read(src) &&
-		    dst_loff + len < i_size_read(dst)) {
-			const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
-
-			len = round_down(i_size_read(src), sz) - loff;
-			if (len == 0)
-				return 0;
-			olen = len;
-		}
-	}
-
-again:
-	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
-	if (ret)
-		return ret;
-
-	if (same_inode)
-		ret = lock_extent_range(src, same_lock_start, same_lock_len,
-					false);
-	else
-		ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
-					       false);
 	/*
-	 * If one of the inodes has dirty pages in the respective range or
-	 * ordered extents, we need to flush dellaloc and wait for all ordered
-	 * extents in the range. We must unlock the pages and the ranges in the
-	 * io trees to avoid deadlocks when flushing delalloc (requires locking
-	 * pages) and when waiting for ordered extents to complete (they require
-	 * range locking).
+	 * Lock destination range to serialize with concurrent readpages() and
+	 * source range to serialize with relocation.
 	 */
-	if (ret == -EAGAIN) {
-		/*
-		 * Ranges in the io trees already unlocked. Now unlock all
-		 * pages before waiting for all IO to complete.
-		 */
-		btrfs_cmp_data_free(cmp);
-		if (same_inode) {
-			btrfs_wait_ordered_range(src, same_lock_start,
-						 same_lock_len);
-		} else {
-			btrfs_wait_ordered_range(src, loff, len);
-			btrfs_wait_ordered_range(dst, dst_loff, len);
-		}
-		goto again;
-	}
-	ASSERT(ret == 0);
-	if (WARN_ON(ret)) {
-		/* ranges in the io trees already unlocked */
-		btrfs_cmp_data_free(cmp);
-		return ret;
-	}
-
-	/* pass original length for comparison so we stay within i_size */
-	ret = btrfs_cmp_data(olen, cmp);
-	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
-
-	if (same_inode)
-		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
-			      same_lock_start + same_lock_len - 1);
-	else
-		btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
-
-	btrfs_cmp_data_free(cmp);
+	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+	ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
+	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
 
 	return ret;
 }
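
Note that the byte-by-byte comparison that used to live here is gone on purpose: with the remap_file_range rework later in this file, generic_remap_file_range_prep() compares the data for dedupe requests before ->remap_file_range is called, so this helper only locks and clones. Condensed, the VFS-side check looks roughly like this (illustrative excerpt from fs/read_write.c):

	if (remap_flags & REMAP_FILE_DEDUP) {
		bool is_same = false;

		/* Dedupe must reference identical data; mismatches fail
		 * with -EBADE before the filesystem callback runs. */
		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
						    inode_out, pos_out,
						    *len, &is_same);
		if (ret)
			return ret;
		if (!is_same)
			return -EBADE;
	}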
@@ -3558,58 +3257,29 @@
 			     struct inode *dst, u64 dst_loff)
 {
 	int ret;
-	struct cmp_pages cmp;
-	int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
-	bool same_inode = (src == dst);
 	u64 i, tail_len, chunk_count;
+	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
 
-	if (olen == 0)
-		return 0;
-
-	if (same_inode)
-		inode_lock(src);
-	else
-		btrfs_double_inode_lock(src, dst);
-
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
-		ret = -EINVAL;
-		goto out_unlock;
+	spin_lock(&root_dst->root_item_lock);
+	if (root_dst->send_in_progress) {
+		btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+			      root_dst->root_key.objectid,
+			      root_dst->send_in_progress);
+		spin_unlock(&root_dst->root_item_lock);
+		return -EAGAIN;
 	}
+	root_dst->dedupe_in_progress++;
+	spin_unlock(&root_dst->root_item_lock);
 
 	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
 	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
-	if (chunk_count == 0)
-		num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
-
-	/*
-	 * If deduping ranges in the same inode, locking rules make it
-	 * mandatory to always lock pages in ascending order to avoid deadlocks
-	 * with concurrent tasks (such as starting writeback/delalloc).
-	 */
-	if (same_inode && dst_loff < loff)
-		swap(loff, dst_loff);
-
-	/*
-	 * We must gather up all the pages before we initiate our extent
-	 * locking. We use an array for the page pointers. Size of the array is
-	 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
-	 */
-	cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
-				       GFP_KERNEL | __GFP_ZERO);
-	cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
-				       GFP_KERNEL | __GFP_ZERO);
-	if (!cmp.src_pages || !cmp.dst_pages) {
-		ret = -ENOMEM;
-		goto out_free;
-	}
 
 	for (i = 0; i < chunk_count; i++) {
 		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
-					      dst, dst_loff, &cmp);
+					      dst, dst_loff);
 		if (ret)
-			goto out_free;
+			goto out;
 
 		loff += BTRFS_MAX_DEDUPE_LEN;
 		dst_loff += BTRFS_MAX_DEDUPE_LEN;
@@ -3617,41 +3287,15 @@
 
 	if (tail_len > 0)
 		ret = btrfs_extent_same_range(src, loff, tail_len, dst,
-					      dst_loff, &cmp);
-
-out_free:
-	kvfree(cmp.src_pages);
-	kvfree(cmp.dst_pages);
-
-out_unlock:
-	if (same_inode)
-		inode_unlock(src);
-	else
-		btrfs_double_inode_unlock(src, dst);
+					      dst_loff);
+out:
+	spin_lock(&root_dst->root_item_lock);
+	root_dst->dedupe_in_progress--;
+	spin_unlock(&root_dst->root_item_lock);
 
 	return ret;
 }
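
Two things happen in the rewritten btrfs_extent_same(). First, the chunking is plain arithmetic: with BTRFS_MAX_DEDUPE_LEN at 16 MiB, an olen of 40 MiB yields chunk_count = 2 and tail_len = 8 MiB. Second, dedupe now excludes send per destination root: both sides take root_item_lock, bump their counter, and refuse to start while the other is active. Send's half of the handshake (in fs/btrfs/send.c) looks roughly like this (simplified sketch):

	static int send_begin_sketch(struct btrfs_root *root)
	{
		spin_lock(&root->root_item_lock);
		if (root->dedupe_in_progress) {
			spin_unlock(&root->root_item_lock);
			return -EAGAIN;		/* dedupe active; retry later */
		}
		root->send_in_progress++;	/* dedupe now backs off */
		spin_unlock(&root->root_item_lock);
		return 0;
	}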
 
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-			    struct file *dst_file, loff_t dst_loff,
-			    u64 olen)
-{
-	struct inode *src = file_inode(src_file);
-	struct inode *dst = file_inode(dst_file);
-	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
-	if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-		/*
-		 * Btrfs does not support blocksize < page_size. As a
-		 * result, btrfs_cmp_data() won't correctly handle
-		 * this situation without an update.
-		 */
-		return -EINVAL;
-	}
-
-	return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
-}
-
 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     u64 endoff,
@@ -3685,61 +3329,6 @@
 	return ret;
 }
 
-static void clone_update_extent_map(struct btrfs_inode *inode,
-				    const struct btrfs_trans_handle *trans,
-				    const struct btrfs_path *path,
-				    const u64 hole_offset,
-				    const u64 hole_len)
-{
-	struct extent_map_tree *em_tree = &inode->extent_tree;
-	struct extent_map *em;
-	int ret;
-
-	em = alloc_extent_map();
-	if (!em) {
-		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
-		return;
-	}
-
-	if (path) {
-		struct btrfs_file_extent_item *fi;
-
-		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				    struct btrfs_file_extent_item);
-		btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
-		em->generation = -1;
-		if (btrfs_file_extent_type(path->nodes[0], fi) ==
-		    BTRFS_FILE_EXTENT_INLINE)
-			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-					&inode->runtime_flags);
-	} else {
-		em->start = hole_offset;
-		em->len = hole_len;
-		em->ram_bytes = em->len;
-		em->orig_start = hole_offset;
-		em->block_start = EXTENT_MAP_HOLE;
-		em->block_len = 0;
-		em->orig_block_len = 0;
-		em->compress_type = BTRFS_COMPRESS_NONE;
-		em->generation = trans->transid;
-	}
-
-	while (1) {
-		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em, 1);
-		write_unlock(&em_tree->lock);
-		if (ret != -EEXIST) {
-			free_extent_map(em);
-			break;
-		}
-		btrfs_drop_extent_cache(inode, em->start,
-					em->start + em->len - 1, 0);
-	}
-
-	if (ret)
-		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
-}
-
 /*
  * Make sure we do not end up inserting an inline extent into a file that has
  * already other (non-inline) extents. If a file has an inline extent it can
@@ -3880,6 +3469,7 @@
 						  path->slots[0]),
 			    size);
 	inode_add_bytes(dst, datal);
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
 
 	return 0;
 }
@@ -3931,6 +3521,14 @@
 
 	while (1) {
 		u64 next_key_min_offset = key.offset + 1;
+		struct btrfs_file_extent_item *extent;
+		int type;
+		u32 size;
+		struct btrfs_key new_key;
+		u64 disko = 0, diskl = 0;
+		u64 datao = 0, datal = 0;
+		u8 comp;
+		u64 drop_start;
 
 		/*
 		 * note the key will change type as we walk through the
@@ -3971,75 +3569,115 @@
 		    key.objectid != btrfs_ino(BTRFS_I(src)))
 			break;
 
-		if (key.type == BTRFS_EXTENT_DATA_KEY) {
-			struct btrfs_file_extent_item *extent;
-			int type;
-			u32 size;
-			struct btrfs_key new_key;
-			u64 disko = 0, diskl = 0;
-			u64 datao = 0, datal = 0;
-			u8 comp;
-			u64 drop_start;
+		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
 
-			extent = btrfs_item_ptr(leaf, slot,
-						struct btrfs_file_extent_item);
-			comp = btrfs_file_extent_compression(leaf, extent);
-			type = btrfs_file_extent_type(leaf, extent);
-			if (type == BTRFS_FILE_EXTENT_REG ||
-			    type == BTRFS_FILE_EXTENT_PREALLOC) {
-				disko = btrfs_file_extent_disk_bytenr(leaf,
-								      extent);
-				diskl = btrfs_file_extent_disk_num_bytes(leaf,
-								 extent);
-				datao = btrfs_file_extent_offset(leaf, extent);
-				datal = btrfs_file_extent_num_bytes(leaf,
-								    extent);
-			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
-				/* take upper bound, may be compressed */
-				datal = btrfs_file_extent_ram_bytes(leaf,
-								    extent);
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+			datao = btrfs_file_extent_offset(leaf, extent);
+			datal = btrfs_file_extent_num_bytes(leaf, extent);
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			/* Take upper bound, may be compressed */
+			datal = btrfs_file_extent_ram_bytes(leaf, extent);
+		}
+
+		/*
+		 * The first search might have left us at an extent item that
+		 * ends before our target range's start; this can happen if we
+		 * have holes and the NO_HOLES feature enabled.
+		 */
+		if (key.offset + datal <= off) {
+			path->slots[0]++;
+			goto process_slot;
+		} else if (key.offset >= off + len) {
+			break;
+		}
+		next_key_min_offset = key.offset + datal;
+		size = btrfs_item_size_nr(leaf, slot);
+		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+				   size);
+
+		btrfs_release_path(path);
+		path->leave_spinning = 0;
+
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.objectid = btrfs_ino(BTRFS_I(inode));
+		if (off <= key.offset)
+			new_key.offset = key.offset + destoff - off;
+		else
+			new_key.offset = destoff;
+
+		/*
+		 * Deal with a hole that doesn't have an extent item that
+		 * represents it (NO_HOLES feature enabled).
+		 * This hole is either in the middle of the cloning range or at
+		 * the beginning (fully overlaps it or partially overlaps it).
+		 */
+		if (new_key.offset != last_dest_end)
+			drop_start = last_dest_end;
+		else
+			drop_start = new_key.offset;
+
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct btrfs_clone_extent_info clone_info;
+
+			/*
+			 *    a  | --- range to clone ---|  b
+			 * | ------------- extent ------------- |
+			 */
+
+			/* Subtract range b */
+			if (key.offset + datal > off + len)
+				datal = off + len - key.offset;
+
+			/* Subtract range a */
+			if (off > key.offset) {
+				datao += off - key.offset;
+				datal -= off - key.offset;
 			}
 
-			/*
-			 * The first search might have left us at an extent
-			 * item that ends before our target range's start, can
-			 * happen if we have holes and NO_HOLES feature enabled.
-			 */
-			if (key.offset + datal <= off) {
-				path->slots[0]++;
-				goto process_slot;
-			} else if (key.offset >= off + len) {
-				break;
+			clone_info.disk_offset = disko;
+			clone_info.disk_len = diskl;
+			clone_info.data_offset = datao;
+			clone_info.data_len = datal;
+			clone_info.file_offset = new_key.offset;
+			clone_info.extent_buf = buf;
+			clone_info.item_size = size;
+			ret = btrfs_punch_hole_range(inode, path,
+						     drop_start,
+						     new_key.offset + datal - 1,
+						     &clone_info, &trans);
+			if (ret)
+				goto out;
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			u64 skip = 0;
+			u64 trim = 0;
+
+			if (off > key.offset) {
+				skip = off - key.offset;
+				new_key.offset += skip;
 			}
-			next_key_min_offset = key.offset + datal;
-			size = btrfs_item_size_nr(leaf, slot);
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
 
-			btrfs_release_path(path);
-			path->leave_spinning = 0;
+			if (key.offset + datal > off + len)
+				trim = key.offset + datal - (off + len);
 
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = btrfs_ino(BTRFS_I(inode));
-			if (off <= key.offset)
-				new_key.offset = key.offset + destoff - off;
-			else
-				new_key.offset = destoff;
+			if (comp && (skip || trim)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			size -= skip + trim;
+			datal -= skip + trim;
 
 			/*
-			 * Deal with a hole that doesn't have an extent item
-			 * that represents it (NO_HOLES feature enabled).
-			 * This hole is either in the middle of the cloning
-			 * range or at the beginning (fully overlaps it or
-			 * partially overlaps it).
-			 */
-			if (new_key.offset != last_dest_end)
-				drop_start = last_dest_end;
-			else
-				drop_start = new_key.offset;
-
-			/*
+			 * If our extent is inline, we know we will drop or
+			 * adjust at most 1 extent item in the destination root.
+			 *
 			 * 1 - adjusting old extent (we may have to split it)
 			 * 1 - add new extent
 			 * 1 - inode update
@@ -4050,137 +3688,28 @@
 				goto out;
 			}
 
-			if (type == BTRFS_FILE_EXTENT_REG ||
-			    type == BTRFS_FILE_EXTENT_PREALLOC) {
-				/*
-				 *    a  | --- range to clone ---|  b
-				 * | ------------- extent ------------- |
-				 */
-
-				/* subtract range b */
-				if (key.offset + datal > off + len)
-					datal = off + len - key.offset;
-
-				/* subtract range a */
-				if (off > key.offset) {
-					datao += off - key.offset;
-					datal -= off - key.offset;
-				}
-
-				ret = btrfs_drop_extents(trans, root, inode,
-							 drop_start,
-							 new_key.offset + datal,
-							 1);
-				if (ret) {
-					if (ret != -EOPNOTSUPP)
-						btrfs_abort_transaction(trans,
-									ret);
-					btrfs_end_transaction(trans);
-					goto out;
-				}
-
-				ret = btrfs_insert_empty_item(trans, root, path,
-							      &new_key, size);
-				if (ret) {
+			ret = clone_copy_inline_extent(inode, trans, path,
+						       &new_key, drop_start,
+						       datal, skip, size, buf);
+			if (ret) {
+				if (ret != -EOPNOTSUPP)
 					btrfs_abort_transaction(trans, ret);
-					btrfs_end_transaction(trans);
-					goto out;
-				}
-
-				leaf = path->nodes[0];
-				slot = path->slots[0];
-				write_extent_buffer(leaf, buf,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    size);
-
-				extent = btrfs_item_ptr(leaf, slot,
-						struct btrfs_file_extent_item);
-
-				/* disko == 0 means it's a hole */
-				if (!disko)
-					datao = 0;
-
-				btrfs_set_file_extent_offset(leaf, extent,
-							     datao);
-				btrfs_set_file_extent_num_bytes(leaf, extent,
-								datal);
-
-				if (disko) {
-					inode_add_bytes(inode, datal);
-					ret = btrfs_inc_extent_ref(trans,
-							root,
-							disko, diskl, 0,
-							root->root_key.objectid,
-							btrfs_ino(BTRFS_I(inode)),
-							new_key.offset - datao);
-					if (ret) {
-						btrfs_abort_transaction(trans,
-									ret);
-						btrfs_end_transaction(trans);
-						goto out;
-
-					}
-				}
-			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
-				u64 skip = 0;
-				u64 trim = 0;
-
-				if (off > key.offset) {
-					skip = off - key.offset;
-					new_key.offset += skip;
-				}
-
-				if (key.offset + datal > off + len)
-					trim = key.offset + datal - (off + len);
-
-				if (comp && (skip || trim)) {
-					ret = -EINVAL;
-					btrfs_end_transaction(trans);
-					goto out;
-				}
-				size -= skip + trim;
-				datal -= skip + trim;
-
-				ret = clone_copy_inline_extent(inode,
-							       trans, path,
-							       &new_key,
-							       drop_start,
-							       datal,
-							       skip, size, buf);
-				if (ret) {
-					if (ret != -EOPNOTSUPP)
-						btrfs_abort_transaction(trans,
-									ret);
-					btrfs_end_transaction(trans);
-					goto out;
-				}
-				leaf = path->nodes[0];
-				slot = path->slots[0];
-			}
-
-			/* If we have an implicit hole (NO_HOLES feature). */
-			if (drop_start < new_key.offset)
-				clone_update_extent_map(BTRFS_I(inode), trans,
-						NULL, drop_start,
-						new_key.offset - drop_start);
-
-			clone_update_extent_map(BTRFS_I(inode), trans,
-					path, 0, 0);
-
-			btrfs_mark_buffer_dirty(leaf);
-			btrfs_release_path(path);
-
-			last_dest_end = ALIGN(new_key.offset + datal,
-					      fs_info->sectorsize);
-			ret = clone_finish_inode_update(trans, inode,
-							last_dest_end,
-							destoff, olen,
-							no_time_update);
-			if (ret)
+				btrfs_end_transaction(trans);
 				goto out;
-			if (new_key.offset + datal >= destoff + len)
-				break;
+			}
 		}
+
+		btrfs_release_path(path);
+
+		last_dest_end = ALIGN(new_key.offset + datal,
+				      fs_info->sectorsize);
+		ret = clone_finish_inode_update(trans, inode, last_dest_end,
+						destoff, olen, no_time_update);
+		if (ret)
+			goto out;
+		if (new_key.offset + datal >= destoff + len)
+			break;
+
 		btrfs_release_path(path);
 		key.offset = next_key_min_offset;
 
@@ -4192,32 +3721,27 @@
 	ret = 0;
 
 	if (last_dest_end < destoff + len) {
+		struct btrfs_clone_extent_info clone_info = { 0 };
 		/*
 		 * We have an implicit hole (NO_HOLES feature is enabled) that
 		 * fully or partially overlaps our cloning range at its end.
 		 */
 		btrfs_release_path(path);
+		path->leave_spinning = 0;
 
 		/*
-		 * 1 - remove extent(s)
-		 * 1 - inode update
+		 * We are dealing with a hole and our clone_info already has a
+		 * disk_offset of 0; we only need to fill in the data length
+		 * and file offset.
 		 */
-		trans = btrfs_start_transaction(root, 2);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
+		clone_info.data_len = destoff + len - last_dest_end;
+		clone_info.file_offset = last_dest_end;
+		ret = btrfs_punch_hole_range(inode, path,
+					     last_dest_end, destoff + len - 1,
+					     &clone_info, &trans);
+		if (ret)
 			goto out;
-		}
-		ret = btrfs_drop_extents(trans, root, inode,
-					 last_dest_end, destoff + len, 1);
-		if (ret) {
-			if (ret != -EOPNOTSUPP)
-				btrfs_abort_transaction(trans, ret);
-			btrfs_end_transaction(trans);
-			goto out;
-		}
-		clone_update_extent_map(BTRFS_I(inode), trans, NULL,
-				last_dest_end,
-				destoff + len - last_dest_end);
+
 		ret = clone_finish_inode_update(trans, inode, destoff + len,
 						destoff, olen, no_time_update);
 	}
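
Both call sites above go through btrfs_punch_hole_range(), which drops all extents in the given inclusive range and, when handed a descriptor, inserts the replacement file extent item in the same transaction (managing the transaction start/join itself, which is why it takes &trans). The descriptor as used in this file, where a zero disk_offset denotes a hole (see fs/btrfs/ctree.h for the authoritative definition):

	struct btrfs_clone_extent_info {
		u64 disk_offset;	/* extent disk bytenr; 0 means a hole */
		u64 disk_len;		/* extent size on disk */
		u64 data_offset;	/* offset into the shared extent */
		u64 data_len;		/* bytes the new file extent item covers */
		u64 file_offset;	/* destination file offset */
		char *extent_buf;	/* raw copy of the source extent item */
		u32 item_size;		/* size of that copied item */
	};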
@@ -4234,11 +3758,9 @@
 	struct inode *inode = file_inode(file);
 	struct inode *src = file_inode(file_src);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	u64 len = olen;
 	u64 bs = fs_info->sb->s_blocksize;
-	int same_inode = src == inode;
 
 	/*
 	 * TODO:
@@ -4251,101 +3773,43 @@
 	 *   be either compressed or non-compressed.
 	 */
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
-
-	if (file_src->f_path.mnt != file->f_path.mnt ||
-	    src->i_sb != inode->i_sb)
-		return -EXDEV;
-
-	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
-		return -EISDIR;
-
-	if (!same_inode) {
-		btrfs_double_inode_lock(src, inode);
-	} else {
-		inode_lock(src);
-	}
-
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* determine range to clone */
-	ret = -EINVAL;
-	if (off + len > src->i_size || off + len < off)
-		goto out_unlock;
-	if (len == 0)
-		olen = len = src->i_size - off;
 	/*
-	 * If we extend to eof, continue to block boundary if and only if the
-	 * destination end offset matches the destination file's size, otherwise
-	 * we would be corrupting data by placing the eof block into the middle
-	 * of a file.
+	 * VFS's generic_remap_file_range_prep() protects us from cloning the
+	 * eof block into the middle of a file, which would result in corruption
+	 * if the file size is not blocksize aligned. So we don't need to check
+	 * for that case here.
 	 */
-	if (off + len == src->i_size) {
-		if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
-			goto out_unlock;
+	if (off + len == src->i_size)
 		len = ALIGN(src->i_size, bs) - off;
-	}
-
-	if (len == 0) {
-		ret = 0;
-		goto out_unlock;
-	}
-
-	/* verify the end result is block aligned */
-	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
-	    !IS_ALIGNED(destoff, bs))
-		goto out_unlock;
-
-	/* verify if ranges are overlapped within the same file */
-	if (same_inode) {
-		if (destoff + len > off && destoff < off + len)
-			goto out_unlock;
-	}
 
 	if (destoff > inode->i_size) {
+		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
 		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
 		if (ret)
-			goto out_unlock;
+			return ret;
+		/*
+		 * We may have truncated the last block if the inode's size is
+		 * not sector size aligned, so we need to wait for writeback to
+		 * complete before proceeding further, otherwise we can race
+		 * with cloning and attempt to increment a reference to an
+		 * extent that no longer exists (writeback completed right after
+		 * we found the previous extent covering eof and before we
+		 * attempted to increment its reference count).
+		 */
+		ret = btrfs_wait_ordered_range(inode, wb_start,
+					       destoff - wb_start);
+		if (ret)
+			return ret;
 	}
 
 	/*
-	 * Lock the target range too. Right after we replace the file extent
-	 * items in the fs tree (which now point to the cloned data), we might
-	 * have a worker replace them with extent items relative to a write
-	 * operation that was issued before this clone operation (i.e. confront
-	 * with inode.c:btrfs_finish_ordered_io).
+	 * Lock destination range to serialize with concurrent readpages() and
+	 * source range to serialize with relocation.
 	 */
-	if (same_inode) {
-		u64 lock_start = min_t(u64, off, destoff);
-		u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
-
-		ret = lock_extent_range(src, lock_start, lock_len, true);
-	} else {
-		ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
-					       true);
-	}
-	ASSERT(ret == 0);
-	if (WARN_ON(ret)) {
-		/* ranges in the io trees already unlocked */
-		goto out_unlock;
-	}
-
+	btrfs_double_extent_lock(src, off, inode, destoff, len);
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
-
-	if (same_inode) {
-		u64 lock_start = min_t(u64, off, destoff);
-		u64 lock_end = max_t(u64, off, destoff) + len - 1;
-
-		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
-	} else {
-		btrfs_double_extent_unlock(src, off, inode, destoff, len);
-	}
+	btrfs_double_extent_unlock(src, off, inode, destoff, len);
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
 	 * data immediately and not the previous data.
@@ -4353,18 +3817,134 @@
 	truncate_inode_pages_range(&inode->i_data,
 				round_down(destoff, PAGE_SIZE),
 				round_up(destoff + len, PAGE_SIZE) - 1);
-out_unlock:
-	if (!same_inode)
-		btrfs_double_inode_unlock(src, inode);
-	else
-		inode_unlock(src);
+
 	return ret;
 }
 
-int btrfs_clone_file_range(struct file *src_file, loff_t off,
-		struct file *dst_file, loff_t destoff, u64 len)
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       loff_t *len, unsigned int remap_flags)
 {
-	return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+	bool same_inode = inode_out == inode_in;
+	u64 wb_len;
+	int ret;
+
+	if (!(remap_flags & REMAP_FILE_DEDUP)) {
+		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+		if (btrfs_root_readonly(root_out))
+			return -EROFS;
+
+		if (file_in->f_path.mnt != file_out->f_path.mnt ||
+		    inode_in->i_sb != inode_out->i_sb)
+			return -EXDEV;
+	}
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+		return -EINVAL;
+	}
+
+	/*
+	 * Now that the inodes are locked, we need to start writeback ourselves
+	 * and can not rely on the writeback from the VFS's generic helper
+	 * generic_remap_file_range_prep() because:
+	 *
+	 * 1) For compression we must call filemap_fdatawrite_range() range
+	 * 1) For compression we must call filemap_fdatawrite_range()
+	 *    helper only calls it once;
+	 *
+	 * 2) filemap_fdatawrite_range(), called by the generic helper only
+	 *    waits for the writeback to complete, i.e. for IO to be done, and
+	 *    not for the ordered extents to complete. We need to wait for them
+	 *    to complete so that new file extent items are in the fs tree.
+	 */
+	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+	else
+		wb_len = ALIGN(*len, bs);
+
+	/*
+	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
+	 * any in progress could create its ordered extents after we wait for
+	 * existing ordered extents below).
+	 */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	/*
+	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+	 *
+	 * Btrfs' back references do not have block-level granularity; they
+	 * work at the whole extent level.
+	 * A NOCOW buffered write without reserved data space may not be able
+	 * to fall back to CoW due to lack of data space, and thus could cause
+	 * data loss.
+	 *
+	 * Here we take a shortcut by flushing the whole inode, so that all
+	 * NOCOW writes reach disk as NOCOW before we increase the reference
+	 * count of the extent. We could do better by only flushing NOCOW
+	 * data, but that needs extra accounting.
+	 *
+	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
+	 * CoWed anyway, not affecting nocow part.
+	 */
+	ret = filemap_flush(inode_in->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+				       wb_len);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+				       wb_len);
+	if (ret < 0)
+		return ret;
+
+	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+					    len, remap_flags);
+}
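
A worked example of the writeback window computed above, assuming 4 KiB blocks and a non-zero length (remap offsets must be block aligned except at EOF, so pos_in is aligned here):

	/*
	 *   pos_in = 8192, *len = 3000
	 *   ALIGN_DOWN(pos_in, bs) = 8192        start of the flush range
	 *   wb_len = ALIGN(3000, 4096) = 4096    length, rounded up one block
	 *
	 * btrfs_wait_ordered_range() then covers bytes [8192, 12288), i.e.
	 * every block the remap can touch, since an unaligned length is only
	 * permitted when the range ends at EOF.
	 */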
+
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, loff_t len,
+		unsigned int remap_flags)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *dst_inode = file_inode(dst_file);
+	bool same_inode = dst_inode == src_inode;
+	int ret;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (same_inode)
+		inode_lock(src_inode);
+	else
+		lock_two_nondirectories(src_inode, dst_inode);
+
+	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+					  &len, remap_flags);
+	if (ret < 0 || len == 0)
+		goto out_unlock;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+	else
+		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+out_unlock:
+	if (same_inode)
+		inode_unlock(src_inode);
+	else
+		unlock_two_nondirectories(src_inode, dst_inode);
+
+	return ret < 0 ? ret : len;
 }
 
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
@@ -4406,7 +3986,7 @@
 		ret = PTR_ERR(new_root);
 		goto out;
 	}
-	if (!is_fstree(new_root->objectid)) {
+	if (!is_fstree(new_root->root_key.objectid)) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -4674,7 +4254,7 @@
 			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
 			      0);
 
-	if (copy_to_user(arg, sa, sizeof(*sa)))
+	if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
 		ret = -EFAULT;
 
 	if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4707,7 +4287,7 @@
 
 	ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
 
-	if (copy_to_user(arg, sa, sizeof(*sa)))
+	if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
 		ret = -EFAULT;
 
 	kfree(sa);
@@ -4731,7 +4311,7 @@
 
 	ret = btrfs_get_dev_stats(fs_info, sa);
 
-	if (copy_to_user(arg, sa, sizeof(*sa)))
+	if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
 		ret = -EFAULT;
 
 	kfree(sa);
@@ -4777,7 +4357,7 @@
 		break;
 	}
 
-	if (copy_to_user(arg, p, sizeof(*p)))
+	if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
 		ret = -EFAULT;
 out:
 	kfree(p);
@@ -5083,7 +4663,7 @@
 	ret = btrfs_balance(fs_info, bctl, bargs);
 	bctl = NULL;
 
-	if (arg) {
+	if ((ret == 0 || ret == -ECANCELED) && arg) {
 		if (copy_to_user(arg, bargs, sizeof(*bargs)))
 			ret = -EFAULT;
 	}
@@ -5683,7 +5263,7 @@
 			      u64 change_mask, u64 flags, u64 supported_flags,
 			      u64 safe_set, u64 safe_clear)
 {
-	const char *type = btrfs_feature_set_names[set];
+	const char *type = btrfs_feature_set_name(set);
 	char *names;
 	u64 disallowed, unsupported;
 	u64 set_mask = flags & change_mask;
@@ -5864,6 +5444,10 @@
 		return btrfs_ioctl_setflags(file, argp);
 	case FS_IOC_GETVERSION:
 		return btrfs_ioctl_getversion(file, argp);
+	case FS_IOC_GETFSLABEL:
+		return btrfs_ioctl_get_fslabel(file, argp);
+	case FS_IOC_SETFSLABEL:
+		return btrfs_ioctl_set_fslabel(file, argp);
 	case FITRIM:
 		return btrfs_ioctl_fitrim(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
@@ -5975,10 +5559,6 @@
 		return btrfs_ioctl_quota_rescan_wait(file, argp);
 	case BTRFS_IOC_DEV_REPLACE:
 		return btrfs_ioctl_dev_replace(fs_info, argp);
-	case BTRFS_IOC_GET_FSLABEL:
-		return btrfs_ioctl_get_fslabel(file, argp);
-	case BTRFS_IOC_SET_FSLABEL:
-		return btrfs_ioctl_set_fslabel(file, argp);
 	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
 		return btrfs_ioctl_get_supported_features(argp);
 	case BTRFS_IOC_GET_FEATURES:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e..7f9a578 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,74 +8,115 @@
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
 #include <asm/bug.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_io.h"
 #include "locking.h"
 
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
 {
-	/*
-	 * no lock is required.  The lock owner may change if
-	 * we have a read lock, but it won't change to or away
-	 * from us.  If we have the write lock, we are the owner
-	 * and it'll never change.
-	 */
-	if (eb->lock_nested && current->pid == eb->lock_owner)
-		return;
-	if (rw == BTRFS_WRITE_LOCK) {
-		if (atomic_read(&eb->blocking_writers) == 0) {
-			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
-			atomic_dec(&eb->spinning_writers);
-			btrfs_assert_tree_locked(eb);
-			atomic_inc(&eb->blocking_writers);
-			write_unlock(&eb->lock);
-		}
-	} else if (rw == BTRFS_READ_LOCK) {
-		btrfs_assert_tree_read_locked(eb);
-		atomic_inc(&eb->blocking_readers);
-		WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-		atomic_dec(&eb->spinning_readers);
-		read_unlock(&eb->lock);
-	}
+	WARN_ON(eb->spinning_writers);
+	eb->spinning_writers++;
 }
 
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
 {
+	WARN_ON(eb->spinning_writers != 1);
+	eb->spinning_writers--;
+}
+
+static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
+{
+	WARN_ON(eb->spinning_writers);
+}
+
+static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->spinning_readers);
+}
+
+static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
+{
+	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+	atomic_dec(&eb->spinning_readers);
+}
+
+static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->read_locks);
+}
+
+static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
+{
+	atomic_dec(&eb->read_locks);
+}
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!atomic_read(&eb->read_locks));
+}
+
+static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
+{
+	eb->write_locks++;
+}
+
+static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+{
+	eb->write_locks--;
+}
+
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+	BUG_ON(!eb->write_locks);
+}
+
+#else
+static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
+void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
+#endif
+
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
+{
+	trace_btrfs_set_lock_blocking_read(eb);
 	/*
-	 * no lock is required.  The lock owner may change if
-	 * we have a read lock, but it won't change to or away
-	 * from us.  If we have the write lock, we are the owner
-	 * and it'll never change.
+	 * No lock is required.  The lock owner may change if we have a read
+	 * lock, but it won't change to or away from us.  If we have the write
+	 * lock, we are the owner and it'll never change.
 	 */
 	if (eb->lock_nested && current->pid == eb->lock_owner)
 		return;
+	btrfs_assert_tree_read_locked(eb);
+	atomic_inc(&eb->blocking_readers);
+	btrfs_assert_spinning_readers_put(eb);
+	read_unlock(&eb->lock);
+}
 
-	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
-		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
-		write_lock(&eb->lock);
-		WARN_ON(atomic_read(&eb->spinning_writers));
-		atomic_inc(&eb->spinning_writers);
-		/* atomic_dec_and_test implies a barrier */
-		if (atomic_dec_and_test(&eb->blocking_writers))
-			cond_wake_up_nomb(&eb->write_lock_wq);
-	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
-		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
-		read_lock(&eb->lock);
-		atomic_inc(&eb->spinning_readers);
-		/* atomic_dec_and_test implies a barrier */
-		if (atomic_dec_and_test(&eb->blocking_readers))
-			cond_wake_up_nomb(&eb->read_lock_wq);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+	trace_btrfs_set_lock_blocking_write(eb);
+	/*
+	 * No lock is required.  The lock owner may change if we have a read
+	 * lock, but it won't change to or away from us.  If we have the write
+	 * lock, we are the owner and it'll never change.
+	 */
+	if (eb->lock_nested && current->pid == eb->lock_owner)
+		return;
+	if (eb->blocking_writers == 0) {
+		btrfs_assert_spinning_writers_put(eb);
+		btrfs_assert_tree_locked(eb);
+		eb->blocking_writers++;
+		write_unlock(&eb->lock);
 	}
 }
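
The read/write pair above replaces btrfs_set_lock_blocking_rw() and its rw flag. The caller pattern is unchanged: take the spinning lock, then convert to blocking before any work that may sleep, as in this sketch (illustrative; see the btrfs_search_slot() call chain for real usage):

	static void cow_under_lock_sketch(struct extent_buffer *eb)
	{
		btrfs_tree_lock(eb);		/* spinning write lock */

		/* About to sleep: drop the rwlock but keep logical
		 * ownership by bumping blocking_writers. */
		btrfs_set_lock_blocking_write(eb);

		/* ... allocation, IO, anything that can schedule ... */

		btrfs_tree_unlock(eb);	/* handles blocking and spinning */
	}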
 
@@ -85,13 +126,15 @@
  */
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
-again:
-	BUG_ON(!atomic_read(&eb->blocking_writers) &&
-	       current->pid == eb->lock_owner);
+	u64 start_ns = 0;
 
+	if (trace_btrfs_tree_read_lock_enabled())
+		start_ns = ktime_get_ns();
+again:
 	read_lock(&eb->lock);
-	if (atomic_read(&eb->blocking_writers) &&
-	    current->pid == eb->lock_owner) {
+	BUG_ON(eb->blocking_writers == 0 &&
+	       current->pid == eb->lock_owner);
+	if (eb->blocking_writers && current->pid == eb->lock_owner) {
 		/*
 		 * This extent is already write-locked by our thread. We allow
 		 * an additional read lock to be added because it's for the same
@@ -99,18 +142,20 @@
 		 * called on a partly (write-)locked tree.
 		 */
 		BUG_ON(eb->lock_nested);
-		eb->lock_nested = 1;
+		eb->lock_nested = true;
 		read_unlock(&eb->lock);
+		trace_btrfs_tree_read_lock(eb, start_ns);
 		return;
 	}
-	if (atomic_read(&eb->blocking_writers)) {
+	if (eb->blocking_writers) {
 		read_unlock(&eb->lock);
 		wait_event(eb->write_lock_wq,
-			   atomic_read(&eb->blocking_writers) == 0);
+			   eb->blocking_writers == 0);
 		goto again;
 	}
-	atomic_inc(&eb->read_locks);
-	atomic_inc(&eb->spinning_readers);
+	btrfs_assert_tree_read_locks_get(eb);
+	btrfs_assert_spinning_readers_get(eb);
+	trace_btrfs_tree_read_lock(eb, start_ns);
 }
 
 /*
@@ -120,16 +165,17 @@
  */
 int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
 {
-	if (atomic_read(&eb->blocking_writers))
+	if (eb->blocking_writers)
 		return 0;
 
 	read_lock(&eb->lock);
-	if (atomic_read(&eb->blocking_writers)) {
+	if (eb->blocking_writers) {
 		read_unlock(&eb->lock);
 		return 0;
 	}
-	atomic_inc(&eb->read_locks);
-	atomic_inc(&eb->spinning_readers);
+	btrfs_assert_tree_read_locks_get(eb);
+	btrfs_assert_spinning_readers_get(eb);
+	trace_btrfs_tree_read_lock_atomic(eb);
 	return 1;
 }
 
@@ -139,18 +185,19 @@
  */
 int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
-	if (atomic_read(&eb->blocking_writers))
+	if (eb->blocking_writers)
 		return 0;
 
 	if (!read_trylock(&eb->lock))
 		return 0;
 
-	if (atomic_read(&eb->blocking_writers)) {
+	if (eb->blocking_writers) {
 		read_unlock(&eb->lock);
 		return 0;
 	}
-	atomic_inc(&eb->read_locks);
-	atomic_inc(&eb->spinning_readers);
+	btrfs_assert_tree_read_locks_get(eb);
+	btrfs_assert_spinning_readers_get(eb);
+	trace_btrfs_try_tree_read_lock(eb);
 	return 1;
 }
 
@@ -160,19 +207,18 @@
  */
 int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
-	if (atomic_read(&eb->blocking_writers) ||
-	    atomic_read(&eb->blocking_readers))
+	if (eb->blocking_writers || atomic_read(&eb->blocking_readers))
 		return 0;
 
 	write_lock(&eb->lock);
-	if (atomic_read(&eb->blocking_writers) ||
-	    atomic_read(&eb->blocking_readers)) {
+	if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) {
 		write_unlock(&eb->lock);
 		return 0;
 	}
-	atomic_inc(&eb->write_locks);
-	atomic_inc(&eb->spinning_writers);
+	btrfs_assert_tree_write_locks_get(eb);
+	btrfs_assert_spinning_writers_get(eb);
 	eb->lock_owner = current->pid;
+	trace_btrfs_try_tree_write_lock(eb);
 	return 1;
 }
 
@@ -181,6 +227,7 @@
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+	trace_btrfs_tree_read_unlock(eb);
 	/*
 	 * if we're nested, we have the write lock.  No new locking
 	 * is needed as long as we are the lock owner.
@@ -188,13 +235,12 @@
 	 * field only matters to the lock owner.
 	 */
 	if (eb->lock_nested && current->pid == eb->lock_owner) {
-		eb->lock_nested = 0;
+		eb->lock_nested = false;
 		return;
 	}
 	btrfs_assert_tree_read_locked(eb);
-	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-	atomic_dec(&eb->spinning_readers);
-	atomic_dec(&eb->read_locks);
+	btrfs_assert_spinning_readers_put(eb);
+	btrfs_assert_tree_read_locks_put(eb);
 	read_unlock(&eb->lock);
 }
 
@@ -203,6 +249,7 @@
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+	trace_btrfs_tree_read_unlock_blocking(eb);
 	/*
 	 * if we're nested, we have the write lock.  No new locking
 	 * is needed as long as we are the lock owner.
@@ -210,7 +257,7 @@
 	 * field only matters to the lock owner.
 	 */
 	if (eb->lock_nested && current->pid == eb->lock_owner) {
-		eb->lock_nested = 0;
+		eb->lock_nested = false;
 		return;
 	}
 	btrfs_assert_tree_read_locked(eb);
@@ -218,7 +265,7 @@
 	/* atomic_dec_and_test implies a barrier */
 	if (atomic_dec_and_test(&eb->blocking_readers))
 		cond_wake_up_nomb(&eb->read_lock_wq);
-	atomic_dec(&eb->read_locks);
+	btrfs_assert_tree_read_locks_put(eb);
 }
 
 /*
@@ -227,27 +274,24 @@
  */
 void btrfs_tree_lock(struct extent_buffer *eb)
 {
+	u64 start_ns = 0;
+
+	if (trace_btrfs_tree_lock_enabled())
+		start_ns = ktime_get_ns();
+
 	WARN_ON(eb->lock_owner == current->pid);
 again:
 	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
-	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+	wait_event(eb->write_lock_wq, eb->blocking_writers == 0);
 	write_lock(&eb->lock);
-	if (atomic_read(&eb->blocking_readers)) {
+	if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) {
 		write_unlock(&eb->lock);
-		wait_event(eb->read_lock_wq,
-			   atomic_read(&eb->blocking_readers) == 0);
 		goto again;
 	}
-	if (atomic_read(&eb->blocking_writers)) {
-		write_unlock(&eb->lock);
-		wait_event(eb->write_lock_wq,
-			   atomic_read(&eb->blocking_writers) == 0);
-		goto again;
-	}
-	WARN_ON(atomic_read(&eb->spinning_writers));
-	atomic_inc(&eb->spinning_writers);
-	atomic_inc(&eb->write_locks);
+	btrfs_assert_spinning_writers_get(eb);
+	btrfs_assert_tree_write_locks_get(eb);
 	eb->lock_owner = current->pid;
+	trace_btrfs_tree_lock(eb, start_ns);
 }
 
 /*
@@ -255,33 +299,26 @@
  */
 void btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	int blockers = atomic_read(&eb->blocking_writers);
+	int blockers = eb->blocking_writers;
 
 	BUG_ON(blockers > 1);
 
 	btrfs_assert_tree_locked(eb);
+	trace_btrfs_tree_unlock(eb);
 	eb->lock_owner = 0;
-	atomic_dec(&eb->write_locks);
+	btrfs_assert_tree_write_locks_put(eb);
 
 	if (blockers) {
-		WARN_ON(atomic_read(&eb->spinning_writers));
-		atomic_dec(&eb->blocking_writers);
-		/* Use the lighter barrier after atomic */
-		smp_mb__after_atomic();
-		cond_wake_up_nomb(&eb->write_lock_wq);
+		btrfs_assert_no_spinning_writers(eb);
+		eb->blocking_writers--;
+		/*
+		 * We need to order modifying blocking_writers above with
+		 * actually waking up the sleepers to ensure they see the
+		 * updated value of blocking_writers
+		 * updated value of blocking_writers.
+		cond_wake_up(&eb->write_lock_wq);
 	} else {
-		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
-		atomic_dec(&eb->spinning_writers);
+		btrfs_assert_spinning_writers_put(eb);
 		write_unlock(&eb->lock);
 	}
 }
-
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
-{
-	BUG_ON(!atomic_read(&eb->write_locks));
-}
-
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
-{
-	BUG_ON(!atomic_read(&eb->read_locks));
-}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135de..b775a42 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,8 @@
 void btrfs_tree_read_lock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_read_lock(struct extent_buffer *eb);
 int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +37,4 @@
 		BUG();
 }
 
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
-	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
-	btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
 #endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a4cc1..acad417 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -27,7 +27,7 @@
  *     Records the total size (including the header) of compressed data.
  *
  * 2.  Segment(s)
- *     Variable size. Each segment includes one segment header, followd by data
+ *     Variable size. Each segment includes one segment header, followed by data
  *     payload.
  *     One regular LZO compressed extent can have one or more segments.
  *     For inlined LZO compressed extent, only one segment is allowed.
@@ -61,6 +61,28 @@
 	struct list_head list;
 };
 
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+	btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+	btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+	return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+	btrfs_put_workspace(&wsm, ws);
+}
+
 static void lzo_free_workspace(struct list_head *ws)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@
 	kfree(workspace);
 }
 
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
 {
 	struct workspace *workspace;
 
@@ -485,15 +507,16 @@
 	return ret;
 }
 
-static void lzo_set_level(struct list_head *ws, unsigned int type)
-{
-}
-
 const struct btrfs_compress_op btrfs_lzo_compress = {
+	.init_workspace_manager	= lzo_init_workspace_manager,
+	.cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+	.get_workspace		= lzo_get_workspace,
+	.put_workspace		= lzo_put_workspace,
 	.alloc_workspace	= lzo_alloc_workspace,
 	.free_workspace		= lzo_free_workspace,
 	.compress_pages		= lzo_compress_pages,
 	.decompress_bio		= lzo_decompress_bio,
 	.decompress		= lzo_decompress,
-	.set_level		= lzo_set_level,
+	.max_level		= 1,
+	.default_level		= 1,
 };
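
With the per-algorithm workspace-manager hooks in place, generic compression code can drive LZO entirely through its ops table. A hedged sketch of the resulting flow (the caller context and the level value are assumptions; LZO only has one level):

/* Assumed caller flow, dispatching through the ops table above. */
struct list_head *ws;

ws = btrfs_lzo_compress.get_workspace(1);	/* LZO has a single level */
/* ... run compress_pages()/decompress_bio() against ws ... */
btrfs_lzo_compress.put_workspace(ws);
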
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
deleted file mode 100644
index 75246f2..0000000
--- a/fs/btrfs/math.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Fujitsu.  All rights reserved.
- * Written by Miao Xie <miaox@cn.fujitsu.com>
- */
-
-#ifndef BTRFS_MATH_H
-#define BTRFS_MATH_H
-
-#include <asm/div64.h>
-
-static inline u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	return div_u64(num, 10);
-}
-
-static inline u64 div_factor_fine(u64 num, int factor)
-{
-	if (factor == 100)
-		return num;
-	num *= factor;
-	return div_u64(num, 100);
-}
-
-#endif
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
new file mode 100644
index 0000000..7d56492
--- /dev/null
+++ b/fs/btrfs/misc.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_MISC_H
+#define BTRFS_MISC_H
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <asm/div64.h>
+
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+	/*
+	 * This implies a full smp_mb() barrier; see the comments for
+	 * waitqueue_active() for why.
+	 */
+	if (wq_has_sleeper(wq))
+		wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+	/*
+	 * Special case for conditional wakeup where the barrier required for
+	 * waitqueue_active is implied by some of the preceding code, e.g. an
+	 * atomic operation such as atomic_dec_return(), or an
+	 * unlock/lock sequence.
+	 */
+	if (waitqueue_active(wq))
+		wake_up(wq);
+}
+
+static inline u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	return div_u64(num, 10);
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor == 100)
+		return num;
+	num *= factor;
+	return div_u64(num, 100);
+}
+
+#endif
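
div_factor() interprets the factor in tenths and div_factor_fine() in hundredths. A standalone userspace sketch of the arithmetic, with div_u64() replaced by plain 64-bit division (illustrative only, not kernel code):

#include <stdio.h>

static unsigned long long div_factor(unsigned long long num, int factor)
{
	if (factor == 10)
		return num;	/* factor 10 means 100%, skip the division */
	num *= factor;
	return num / 10;
}

static unsigned long long div_factor_fine(unsigned long long num, int factor)
{
	if (factor == 100)
		return num;	/* factor 100 means 100% */
	num *= factor;
	return num / 100;
}

int main(void)
{
	printf("%llu\n", div_factor(1000, 7));		/* 700: 70% of 1000 */
	printf("%llu\n", div_factor_fine(1000, 75));	/* 750: 75% of 1000 */
	return 0;
}
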
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef20..24b6c72 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -6,12 +6,15 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
+#include <linux/sched/mm.h>
+#include "misc.h"
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "delalloc-space.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -194,8 +197,11 @@
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
-	if (dio)
+	if (dio) {
+		percpu_counter_add_batch(&fs_info->dio_bytes, len,
+					 fs_info->delalloc_batch);
 		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+	}
 
 	/* one ref for the tree */
 	refcount_set(&entry->refs, 1);
@@ -270,13 +276,12 @@
  * when an ordered extent is finished.  If the list covers more than one
  * ordered extent, it is split across multiples.
  */
-void btrfs_add_ordered_sum(struct inode *inode,
-			   struct btrfs_ordered_extent *entry,
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 			   struct btrfs_ordered_sum *sum)
 {
 	struct btrfs_ordered_inode_tree *tree;
 
-	tree = &BTRFS_I(inode)->ordered_tree;
+	tree = &BTRFS_I(entry->inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
 	spin_unlock_irq(&tree->lock);
@@ -442,7 +447,7 @@
 			cur = entry->list.next;
 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
 			list_del(&sum->list);
-			kfree(sum);
+			kvfree(sum);
 		}
 		kmem_cache_free(btrfs_ordered_extent_cache, entry);
 	}
@@ -460,7 +465,6 @@
 	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
 	struct btrfs_root *root = btrfs_inode->root;
 	struct rb_node *node;
-	bool dec_pending_ordered = false;
 
 	/* This is paired with btrfs_add_ordered_extent. */
 	spin_lock(&btrfs_inode->lock);
@@ -469,6 +473,10 @@
 	if (root != fs_info->tree_root)
 		btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
 
+	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
+					 fs_info->delalloc_batch);
+
 	tree = &btrfs_inode->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	node = &entry->rb_node;
@@ -477,37 +485,8 @@
 	if (tree->last == node)
 		tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
-	if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
-		dec_pending_ordered = true;
 	spin_unlock_irq(&tree->lock);
 
-	/*
-	 * The current running transaction is waiting on us, we need to let it
-	 * know that we're complete and wake it up.
-	 */
-	if (dec_pending_ordered) {
-		struct btrfs_transaction *trans;
-
-		/*
-		 * The checks for trans are just a formality, it should be set,
-		 * but if it isn't we don't want to deref/assert under the spin
-		 * lock, so be nice and check if trans is set, but ASSERT() so
-		 * if it isn't set a developer will notice.
-		 */
-		spin_lock(&fs_info->trans_lock);
-		trans = fs_info->running_transaction;
-		if (trans)
-			refcount_inc(&trans->use_count);
-		spin_unlock(&fs_info->trans_lock);
-
-		ASSERT(trans);
-		if (trans) {
-			if (atomic_dec_and_test(&trans->pending_ordered))
-				wake_up(&trans->pending_wait);
-			btrfs_put_transaction(trans);
-		}
-	}
-
 	spin_lock(&root->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 	root->nr_ordered_extents--;
@@ -947,14 +926,16 @@
  * be reclaimed before their checksum is actually put into the btree
  */
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
-			   u32 *sum, int len)
+			   u8 *sum, int len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_ordered_sum *ordered_sum;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = btrfs_inode_sectorsize(inode);
+	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	int index = 0;
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
@@ -970,10 +951,10 @@
 			num_sectors = ordered_sum->len >>
 				      inode->i_sb->s_blocksize_bits;
 			num_sectors = min_t(int, len - index, num_sectors - i);
-			memcpy(sum + index, ordered_sum->sums + i,
-			       num_sectors);
+			memcpy(sum + index, ordered_sum->sums + i * csum_size,
+			       num_sectors * csum_size);
 
-			index += (int)num_sectors;
+			index += (int)num_sectors * csum_size;
 			if (index == len)
 				goto out;
 			disk_bytenr += num_sectors * sectorsize;
@@ -985,6 +966,52 @@
 	return index;
 }
 
+/*
+ * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
+ * pending ordered extents in it are run to completion.
+ *
+ * @tree:         IO tree used for locking out other users of the range
+ * @inode:        Inode whose ordered tree is to be searched
+ * @start:        Beginning of range to flush
+ * @end:          Last byte of range to lock
+ * @cached_state: If passed, will return the extent state responsible for the
+ * locked range. It's the caller's responsibility to free the cached state.
+ *
+ * This function always returns with the given range locked, ensuring that
+ * after it's called no ordered extent can be pending.
+ */
+void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
+					struct btrfs_inode *inode, u64 start,
+					u64 end,
+					struct extent_state **cached_state)
+{
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cache = NULL;
+	struct extent_state **cachedp = &cache;
+
+	if (cached_state)
+		cachedp = cached_state;
+
+	while (1) {
+		lock_extent_bits(tree, start, end, cachedp);
+		ordered = btrfs_lookup_ordered_range(inode, start,
+						     end - start + 1);
+		if (!ordered) {
+			/*
+			 * If no external cached_state has been passed then
+			 * decrement the extra ref taken for cachedp since we
+			 * aren't exposing it outside of this function
+			 */
+			if (!cached_state)
+				refcount_dec(&cache->refs);
+			break;
+		}
+		unlock_extent_cached(tree, start, end, cachedp);
+		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+}
+
 int __init ordered_data_init(void)
 {
 	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
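
A hedged usage sketch for btrfs_lock_and_flush_ordered_range() as added above; the surrounding read-path setup (inode, start, end) is assumed rather than taken from this patch:

struct extent_state *cached = NULL;

/* Returns with [start, end] locked and no ordered extent pending in it. */
btrfs_lock_and_flush_ordered_range(&BTRFS_I(inode)->io_tree,
				   BTRFS_I(inode), start, end, &cached);
/* ... do the work that must not race with ordered extents ... */
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached);
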
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813a..5204171 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -23,7 +23,7 @@
 	int len;
 	struct list_head list;
 	/* last field is a variable length array of csums */
-	u32 sums[];
+	u8 sums[];
 };
 
 /*
@@ -37,28 +37,31 @@
  * rbtree, just before waking any waiters.  It is used to indicate the
  * IO is done and any metadata is inserted into the tree.
  */
-#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
-
-#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
-
-#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
-
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
-
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
-
-#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
-
-#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
-
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
-				       * has done its due diligence in updating
-				       * the isize. */
-#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */
-
-#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
-				  * complete in the current transaction. */
-#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
+enum {
+	/* set when all the pages are written */
+	BTRFS_ORDERED_IO_DONE,
+	/* set when removed from the tree */
+	BTRFS_ORDERED_COMPLETE,
+	/* set when we want to write in place */
+	BTRFS_ORDERED_NOCOW,
+	/* writing a zlib compressed extent */
+	BTRFS_ORDERED_COMPRESSED,
+	/* set when writing to preallocated extent */
+	BTRFS_ORDERED_PREALLOC,
+	/* set when we're doing DIO with this extent */
+	BTRFS_ORDERED_DIRECT,
+	/* We had an io error when writing this out */
+	BTRFS_ORDERED_IOERR,
+	/*
+	 * indicates whether this ordered extent has done its due diligence in
+	 * updating the isize
+	 */
+	BTRFS_ORDERED_UPDATED_ISIZE,
+	/* Set when we have to truncate an extent */
+	BTRFS_ORDERED_TRUNCATED,
+	/* Regular IO for COW */
+	BTRFS_ORDERED_REGULAR,
+};
 
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
@@ -164,8 +167,7 @@
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 				      u64 start, u64 len, u64 disk_len,
 				      int type, int compress_type);
-void btrfs_add_ordered_sum(struct inode *inode,
-			   struct btrfs_ordered_extent *entry,
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 			   struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
@@ -181,11 +183,15 @@
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
-			   u32 *sum, int len);
+			   u8 *sum, int len);
 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
 			       const u64 range_start, const u64 range_len);
 u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
 			      const u64 range_start, const u64 range_len);
+void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
+					struct btrfs_inode *inode, u64 start,
+					u64 end,
+					struct extent_state **cached_state);
 int __init ordered_data_init(void);
 void __cold ordered_data_exit(void);
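
With sums[] now a raw byte array sized by the superblock checksum size instead of a u32 array, per-sector checksums are addressed by byte offset. A short sketch mirroring the memcpy change in btrfs_find_ordered_sum() (the out buffer is an assumption):

const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);

/* Checksum of the i-th sector covered by this ordered_sum. */
u8 *csum = ordered_sum->sums + i * csum_size;
memcpy(out, csum, csum_size);	/* out: caller-provided buffer */
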
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index df49931..9cb5057 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -153,11 +153,11 @@
 #ifdef CONFIG_BTRFS_DEBUG
 	btrfs_info(eb->fs_info,
 "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
-		   atomic_read(&eb->refs), atomic_read(&eb->write_locks),
+		   atomic_read(&eb->refs), eb->write_locks,
 		   atomic_read(&eb->read_locks),
-		   atomic_read(&eb->blocking_writers),
+		   eb->blocking_writers,
 		   atomic_read(&eb->blocking_readers),
-		   atomic_read(&eb->spinning_writers),
+		   eb->spinning_writers,
 		   atomic_read(&eb->spinning_readers),
 		   eb->lock_owner, current->pid);
 #endif
@@ -189,7 +189,7 @@
 	btrfs_info(fs_info,
 		   "leaf %llu gen %llu total ptrs %d free space %d owner %llu",
 		   btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
-		   btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
+		   btrfs_leaf_free_space(l), btrfs_header_owner(l));
 	print_eb_refs_lock(l);
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(i);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dc61400..1e664e0 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -23,36 +23,6 @@
 	int inheritable;
 };
 
-static int prop_compression_validate(const char *value, size_t len);
-static int prop_compression_apply(struct inode *inode,
-				  const char *value,
-				  size_t len);
-static const char *prop_compression_extract(struct inode *inode);
-
-static struct prop_handler prop_handlers[] = {
-	{
-		.xattr_name = XATTR_BTRFS_PREFIX "compression",
-		.validate = prop_compression_validate,
-		.apply = prop_compression_apply,
-		.extract = prop_compression_extract,
-		.inheritable = 1
-	},
-};
-
-void __init btrfs_props_init(void)
-{
-	int i;
-
-	hash_init(prop_handlers_ht);
-
-	for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
-		struct prop_handler *p = &prop_handlers[i];
-		u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
-
-		hash_add(prop_handlers_ht, &p->node, h);
-	}
-}
-
 static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
 {
 	struct hlist_head *h;
@@ -85,15 +55,9 @@
 	return NULL;
 }
 
-static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
-			    struct inode *inode,
-			    const char *name,
-			    const char *value,
-			    size_t value_len,
-			    int flags)
+int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
 {
 	const struct prop_handler *handler;
-	int ret;
 
 	if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
 		return -EINVAL;
@@ -102,9 +66,26 @@
 	if (!handler)
 		return -EINVAL;
 
+	if (value_len == 0)
+		return 0;
+
+	return handler->validate(value, value_len);
+}
+
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+		   const char *name, const char *value, size_t value_len,
+		   int flags)
+{
+	const struct prop_handler *handler;
+	int ret;
+
+	handler = find_prop_handler(name, NULL);
+	if (!handler)
+		return -EINVAL;
+
 	if (value_len == 0) {
 		ret = btrfs_setxattr(trans, inode, handler->xattr_name,
-				       NULL, 0, flags);
+				     NULL, 0, flags);
 		if (ret)
 			return ret;
 
@@ -114,17 +95,14 @@
 		return ret;
 	}
 
-	ret = handler->validate(value, value_len);
-	if (ret)
-		return ret;
-	ret = btrfs_setxattr(trans, inode, handler->xattr_name,
-			       value, value_len, flags);
+	ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+			     value_len, flags);
 	if (ret)
 		return ret;
 	ret = handler->apply(inode, value, value_len);
 	if (ret) {
-		btrfs_setxattr(trans, inode, handler->xattr_name,
-				 NULL, 0, flags);
+		btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+			       0, flags);
 		return ret;
 	}
 
@@ -133,15 +111,6 @@
 	return 0;
 }
 
-int btrfs_set_prop(struct inode *inode,
-		   const char *name,
-		   const char *value,
-		   size_t value_len,
-		   int flags)
-{
-	return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
-}
-
 static int iterate_object_props(struct btrfs_root *root,
 				struct btrfs_path *path,
 				u64 objectid,
@@ -283,6 +252,74 @@
 	return ret;
 }
 
+static int prop_compression_validate(const char *value, size_t len)
+{
+	if (!value)
+		return 0;
+
+	if (btrfs_compress_is_valid_type(value, len))
+		return 0;
+
+	return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode, const char *value,
+				  size_t len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	int type;
+
+	if (len == 0) {
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+		BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+
+		return 0;
+	}
+
+	if (!strncmp("lzo", value, 3)) {
+		type = BTRFS_COMPRESS_LZO;
+		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+	} else if (!strncmp("zlib", value, 4)) {
+		type = BTRFS_COMPRESS_ZLIB;
+	} else if (!strncmp("zstd", value, 4)) {
+		type = BTRFS_COMPRESS_ZSTD;
+		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+	} else {
+		return -EINVAL;
+	}
+
+	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+	BTRFS_I(inode)->prop_compress = type;
+
+	return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+	switch (BTRFS_I(inode)->prop_compress) {
+	case BTRFS_COMPRESS_ZLIB:
+	case BTRFS_COMPRESS_LZO:
+	case BTRFS_COMPRESS_ZSTD:
+		return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
+static struct prop_handler prop_handlers[] = {
+	{
+		.xattr_name = XATTR_BTRFS_PREFIX "compression",
+		.validate = prop_compression_validate,
+		.apply = prop_compression_apply,
+		.extract = prop_compression_extract,
+		.inheritable = 1
+	},
+};
+
 static int inherit_props(struct btrfs_trans_handle *trans,
 			 struct inode *inode,
 			 struct inode *parent)
@@ -291,6 +328,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int i;
+	bool need_reserve = false;
 
 	if (!test_bit(BTRFS_INODE_HAS_PROPS,
 		      &BTRFS_I(parent)->runtime_flags))
@@ -299,7 +337,7 @@
 	for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
 		const struct prop_handler *h = &prop_handlers[i];
 		const char *value;
-		u64 num_bytes;
+		u64 num_bytes = 0;
 
 		if (!h->inheritable)
 			continue;
@@ -308,20 +346,51 @@
 		if (!value)
 			continue;
 
-		num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-		ret = btrfs_block_rsv_add(root, trans->block_rsv,
-					  num_bytes, BTRFS_RESERVE_NO_FLUSH);
+		/*
+		 * This is not strictly necessary as the property should be
+		 * valid, but in case it isn't, don't propagate it further.
+		 */
+		ret = h->validate(value, strlen(value));
 		if (ret)
-			goto out;
-		ret = __btrfs_set_prop(trans, inode, h->xattr_name,
-				       value, strlen(value), 0);
-		btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes);
-		if (ret)
-			goto out;
+			continue;
+
+		/*
+		 * Currently callers should be reserving 1 item for properties,
+		 * since we only have 1 property that we currently support.  If
+		 * we add more in the future we need to try and reserve more
+		 * space for them.  But we should also revisit how we do space
+		 * reservations if we do add more properties in the future.
+		 */
+		if (need_reserve) {
+			num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+			ret = btrfs_block_rsv_add(root, trans->block_rsv,
+					num_bytes, BTRFS_RESERVE_NO_FLUSH);
+			if (ret)
+				return ret;
+		}
+
+		ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+				     strlen(value), 0);
+		if (!ret) {
+			ret = h->apply(inode, value, strlen(value));
+			if (ret)
+				btrfs_setxattr(trans, inode, h->xattr_name,
+					       NULL, 0, 0);
+			else
+				set_bit(BTRFS_INODE_HAS_PROPS,
+					&BTRFS_I(inode)->runtime_flags);
+		}
+
+		if (need_reserve) {
+			btrfs_block_rsv_release(fs_info, trans->block_rsv,
+					num_bytes);
+			if (ret)
+				return ret;
+		}
+		need_reserve = true;
 	}
-	ret = 0;
-out:
-	return ret;
+
+	return 0;
 }
 
 int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
@@ -364,64 +433,17 @@
 	return ret;
 }
 
-static int prop_compression_validate(const char *value, size_t len)
+void __init btrfs_props_init(void)
 {
-	if (!strncmp("lzo", value, len))
-		return 0;
-	else if (!strncmp("zlib", value, len))
-		return 0;
-	else if (!strncmp("zstd", value, len))
-		return 0;
+	int i;
 
-	return -EINVAL;
-}
+	hash_init(prop_handlers_ht);
 
-static int prop_compression_apply(struct inode *inode,
-				  const char *value,
-				  size_t len)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int type;
+	for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+		struct prop_handler *p = &prop_handlers[i];
+		u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
 
-	if (len == 0) {
-		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
-		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
-		BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
-
-		return 0;
+		hash_add(prop_handlers_ht, &p->node, h);
 	}
-
-	if (!strncmp("lzo", value, 3)) {
-		type = BTRFS_COMPRESS_LZO;
-		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
-	} else if (!strncmp("zlib", value, 4)) {
-		type = BTRFS_COMPRESS_ZLIB;
-	} else if (!strncmp("zstd", value, len)) {
-		type = BTRFS_COMPRESS_ZSTD;
-		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
-	} else {
-		return -EINVAL;
-	}
-
-	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
-	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
-	BTRFS_I(inode)->prop_compress = type;
-
-	return 0;
 }
 
-static const char *prop_compression_extract(struct inode *inode)
-{
-	switch (BTRFS_I(inode)->prop_compress) {
-	case BTRFS_COMPRESS_ZLIB:
-	case BTRFS_COMPRESS_LZO:
-	case BTRFS_COMPRESS_ZSTD:
-		return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
-	default:
-		break;
-	}
-
-	return NULL;
-}
-
-
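
The validate/apply split lets callers reject a bad property value before any transaction is started. A hypothetical xattr-set flow wired through the two new entry points (the unit count passed to btrfs_start_transaction() is an assumption):

int ret;
struct btrfs_trans_handle *trans;

ret = btrfs_validate_prop(name, value, value_len);
if (ret)
	return ret;	/* reject bad values before reserving anything */

trans = btrfs_start_transaction(root, 2);	/* item count assumed */
if (IS_ERR(trans))
	return PTR_ERR(trans);

ret = btrfs_set_prop(trans, inode, name, value, value_len, flags);
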
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 618815b..40b2c65 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -10,11 +10,10 @@
 
 void __init btrfs_props_init(void);
 
-int btrfs_set_prop(struct inode *inode,
-		   const char *name,
-		   const char *value,
-		   size_t value_len,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+		   const char *name, const char *value, size_t value_len,
 		   int flags);
+int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
 
 int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ff43466..3ad1516 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -21,7 +21,7 @@
 #include "backref.h"
 #include "extent_io.h"
 #include "qgroup.h"
-
+#include "block-group.h"
 
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -30,7 +30,7 @@
  *  - sync
  *  - copy also limits on subvol creation
  *  - limit
- *  - caches fuer ulists
+ *  - caches for ulists
  *  - performance benchmarks
  *  - check all ioctl parameters
  */
@@ -522,7 +522,7 @@
 		__del_qgroup_rb(qgroup);
 	}
 	/*
-	 * we call btrfs_free_qgroup_config() when umounting
+	 * We call btrfs_free_qgroup_config() when unmounting
 	 * filesystem and disabling quota, so we set qgroup_ulist
 	 * to be null here to avoid double free.
 	 */
@@ -894,6 +894,12 @@
 	if (fs_info->quota_root)
 		goto out;
 
+	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+	if (!fs_info->qgroup_ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	/*
 	 * 1 for quota root item
 	 * 1 for BTRFS_QGROUP_STATUS item
@@ -909,18 +915,10 @@
 		goto out;
 	}
 
-	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-	if (!fs_info->qgroup_ulist) {
-		ret = -ENOMEM;
-		btrfs_abort_transaction(trans, ret);
-		goto out;
-	}
-
 	/*
 	 * initially create the quota tree
 	 */
-	quota_root = btrfs_create_tree(trans, fs_info,
-				       BTRFS_QUOTA_TREE_OBJECTID);
+	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
 	if (IS_ERR(quota_root)) {
 		ret =  PTR_ERR(quota_root);
 		btrfs_abort_transaction(trans, ret);
@@ -1013,16 +1011,22 @@
 		btrfs_abort_transaction(trans, ret);
 		goto out_free_path;
 	}
-	spin_lock(&fs_info->qgroup_lock);
-	fs_info->quota_root = quota_root;
-	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-	spin_unlock(&fs_info->qgroup_lock);
 
 	ret = btrfs_commit_transaction(trans);
 	trans = NULL;
 	if (ret)
 		goto out_free_path;
 
+	/*
+	 * Set quota enabled flag after committing the transaction, to avoid
+	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+	 * creation.
+	 */
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_root = quota_root;
+	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	spin_unlock(&fs_info->qgroup_lock);
+
 	ret = qgroup_rescan_init(fs_info, 0, 1);
 	if (!ret) {
 	        qgroup_rescan_zero_tracking(fs_info);
@@ -1096,7 +1100,7 @@
 	list_del(&quota_root->dirty_list);
 
 	btrfs_tree_lock(quota_root->node);
-	clean_tree_block(fs_info, quota_root->node);
+	btrfs_clean_tree_block(quota_root->node);
 	btrfs_tree_unlock(quota_root->node);
 	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
 
@@ -1122,7 +1126,7 @@
  * The easy accounting, we're updating qgroup relationship whose child qgroup
  * only has exclusive extents.
  *
- * In this case, all exclsuive extents will also be exlusive for parent, so
+ * In this case, all exclusive extents will also be exclusive for parent, so
  * excl/rfer just get added/removed.
  *
  * So is qgroup reservation space, which should also be added/removed to
@@ -1308,8 +1312,9 @@
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
 	struct ulist *tmp;
+	bool found = false;
 	int ret = 0;
-	int err;
+	int ret2;
 
 	tmp = ulist_alloc(GFP_KERNEL);
 	if (!tmp)
@@ -1323,28 +1328,39 @@
 
 	member = find_qgroup_rb(fs_info, src);
 	parent = find_qgroup_rb(fs_info, dst);
-	if (!member || !parent) {
-		ret = -EINVAL;
-		goto out;
-	}
+	/*
+	 * If the parent/member pair doesn't exist, just try to delete any
+	 * dead relation items.
+	 */
+	if (!member || !parent)
+		goto delete_item;
 
 	/* check if such qgroup relation exist firstly */
 	list_for_each_entry(list, &member->groups, next_group) {
-		if (list->group == parent)
-			goto exist;
+		if (list->group == parent) {
+			found = true;
+			break;
+		}
 	}
-	ret = -ENOENT;
-	goto out;
-exist:
-	ret = del_qgroup_relation_item(trans, src, dst);
-	err = del_qgroup_relation_item(trans, dst, src);
-	if (err && !ret)
-		ret = err;
 
-	spin_lock(&fs_info->qgroup_lock);
-	del_relation_rb(fs_info, src, dst);
-	ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
-	spin_unlock(&fs_info->qgroup_lock);
+delete_item:
+	ret = del_qgroup_relation_item(trans, src, dst);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+	ret2 = del_qgroup_relation_item(trans, dst, src);
+	if (ret2 < 0 && ret2 != -ENOENT)
+		goto out;
+
+	/* At least one deletion succeeded, return 0 */
+	if (!ret || !ret2)
+		ret = 0;
+
+	if (found) {
+		spin_lock(&fs_info->qgroup_lock);
+		del_relation_rb(fs_info, src, dst);
+		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+		spin_unlock(&fs_info->qgroup_lock);
+	}
 out:
 	ulist_free(tmp);
 	return ret;
@@ -1416,13 +1432,14 @@
 	if (!qgroup) {
 		ret = -ENOENT;
 		goto out;
-	} else {
-		/* check if there are no children of this qgroup */
-		if (!list_empty(&qgroup->members)) {
-			ret = -EBUSY;
-			goto out;
-		}
 	}
+
+	/* Check if there are no children of this qgroup */
+	if (!list_empty(&qgroup->members)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	ret = del_qgroup_item(trans, qgroupid);
 	if (ret && ret != -ENOENT)
 		goto out;
@@ -1539,12 +1556,18 @@
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
 				 node);
-		if (bytenr < entry->bytenr)
+		if (bytenr < entry->bytenr) {
 			p = &(*p)->rb_left;
-		else if (bytenr > entry->bytenr)
+		} else if (bytenr > entry->bytenr) {
 			p = &(*p)->rb_right;
-		else
+		} else {
+			if (record->data_rsv && !entry->data_rsv) {
+				entry->data_rsv = record->data_rsv;
+				entry->data_rsv_refroot =
+					record->data_rsv_refroot;
+			}
 			return 1;
+		}
 	}
 
 	rb_link_node(&record->node, parent_node, p);
@@ -1590,7 +1613,7 @@
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
 	    || bytenr == 0 || num_bytes == 0)
 		return 0;
-	record = kmalloc(sizeof(*record), gfp_flag);
+	record = kzalloc(sizeof(*record), gfp_flag);
 	if (!record)
 		return -ENOMEM;
 
@@ -1712,6 +1735,357 @@
 	return 0;
 }
 
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen at the highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ *  OO = Old tree blocks
+ *  NN = New tree blocks allocated during balance
+ *
+ *           File tree (257)                  Reloc tree for 257
+ * L2              OO                                NN
+ *               /    \                            /    \
+ * L1          OO      OO (a)                    OO      NN (a)
+ *            / \     / \                       / \     / \
+ * L0       OO   OO OO   OO                   OO   OO NN   NN
+ *                  (b)  (c)                          (b)  (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ *    It should act as a simplified btrfs_search_slot().
+ *    The key for search can be extracted from @dst_path->nodes[dst_level]
+ *    (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ *    They should be marked during previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ *    We don't have a good way to pick out new file extents only.
+ *    So we still follow the old method by scanning all file extents in
+ *    the leaf.
+ *
+ * This function frees us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in the reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+				    struct extent_buffer *src_eb,
+				    struct btrfs_path *dst_path,
+				    int dst_level, int root_level,
+				    bool trace_leaf)
+{
+	struct btrfs_key key;
+	struct btrfs_path *src_path;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	u32 nodesize = fs_info->nodesize;
+	int cur_level = root_level;
+	int ret;
+
+	BUG_ON(dst_level > root_level);
+	/* Level mismatch */
+	if (btrfs_header_level(src_eb) != root_level)
+		return -EINVAL;
+
+	src_path = btrfs_alloc_path();
+	if (!src_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (dst_level)
+		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+	else
+		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+	/* For src_path */
+	extent_buffer_get(src_eb);
+	src_path->nodes[root_level] = src_eb;
+	src_path->slots[root_level] = dst_path->slots[root_level];
+	src_path->locks[root_level] = 0;
+
+	/* A simplified version of btrfs_search_slot() */
+	while (cur_level >= dst_level) {
+		struct btrfs_key src_key;
+		struct btrfs_key dst_key;
+
+		if (src_path->nodes[cur_level] == NULL) {
+			struct btrfs_key first_key;
+			struct extent_buffer *eb;
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			eb = src_path->nodes[cur_level + 1];
+			parent_slot = src_path->slots[cur_level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+			btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+			eb = read_tree_block(fs_info, child_bytenr, child_gen,
+					     cur_level, &first_key);
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+				goto out;
+			} else if (!extent_buffer_uptodate(eb)) {
+				free_extent_buffer(eb);
+				ret = -EIO;
+				goto out;
+			}
+
+			src_path->nodes[cur_level] = eb;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_read(eb);
+			src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+		}
+
+		src_path->slots[cur_level] = dst_path->slots[cur_level];
+		if (cur_level) {
+			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+					&dst_key, dst_path->slots[cur_level]);
+			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+					&src_key, src_path->slots[cur_level]);
+		} else {
+			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+					&dst_key, dst_path->slots[cur_level]);
+			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+					&src_key, src_path->slots[cur_level]);
+		}
+		/* Content mismatch, something went wrong */
+		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		cur_level--;
+	}
+
+	/*
+	 * Now both @dst_path and @src_path have been populated, record the tree
+	 * blocks for qgroup accounting.
+	 */
+	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+			nodesize, GFP_NOFS);
+	if (ret < 0)
+		goto out;
+	ret = btrfs_qgroup_trace_extent(trans,
+			dst_path->nodes[dst_level]->start,
+			nodesize, GFP_NOFS);
+	if (ret < 0)
+		goto out;
+
+	/* Record leaf file extents */
+	if (dst_level == 0 && trace_leaf) {
+		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+		if (ret < 0)
+			goto out;
+		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+	}
+out:
+	btrfs_free_path(src_path);
+	return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of the reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ *         reloc tree
+ * L2         NN (a)
+ *          /    \
+ * L1    OO        NN (b)
+ *      /  \      /  \
+ * L0  OO  OO    OO  NN
+ *               (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
+ * the above tree blocks along with their counterparts in the file tree.
+ * During the search, old tree blocks like OO(c) will be skipped, as tree
+ * block swap won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+					   struct extent_buffer *src_eb,
+					   struct btrfs_path *dst_path,
+					   int cur_level, int root_level,
+					   u64 last_snapshot, bool trace_leaf)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct extent_buffer *eb;
+	bool need_cleanup = false;
+	int ret = 0;
+	int i;
+
+	/* Level sanity check */
+	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+	    root_level < cur_level) {
+		btrfs_err_rl(fs_info,
+			"%s: bad levels, cur_level=%d root_level=%d",
+			__func__, cur_level, root_level);
+		return -EUCLEAN;
+	}
+
+	/* Read the tree block if needed */
+	if (dst_path->nodes[cur_level] == NULL) {
+		struct btrfs_key first_key;
+		int parent_slot;
+		u64 child_gen;
+		u64 child_bytenr;
+
+		/*
+		 * dst_path->nodes[root_level] must be initialized before
+		 * calling this function.
+		 */
+		if (cur_level == root_level) {
+			btrfs_err_rl(fs_info,
+	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+				__func__, root_level, root_level, cur_level);
+			return -EUCLEAN;
+		}
+
+		/*
+		 * We need to get child blockptr/gen from parent before we can
+		 * read it.
+		 */
+		eb = dst_path->nodes[cur_level + 1];
+		parent_slot = dst_path->slots[cur_level + 1];
+		child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+		child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+		btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+		/* This node is old, no need to trace */
+		if (child_gen < last_snapshot)
+			goto out;
+
+		eb = read_tree_block(fs_info, child_bytenr, child_gen,
+				     cur_level, &first_key);
+		if (IS_ERR(eb)) {
+			ret = PTR_ERR(eb);
+			goto out;
+		} else if (!extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			ret = -EIO;
+			goto out;
+		}
+
+		dst_path->nodes[cur_level] = eb;
+		dst_path->slots[cur_level] = 0;
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_read(eb);
+		dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+		need_cleanup = true;
+	}
+
+	/* Now record this tree block and its counterpart for qgroups */
+	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+				       root_level, trace_leaf);
+	if (ret < 0)
+		goto cleanup;
+
+	eb = dst_path->nodes[cur_level];
+
+	if (cur_level > 0) {
+		/* Iterate all child tree blocks */
+		for (i = 0; i < btrfs_header_nritems(eb); i++) {
+			/* Skip old tree blocks as they won't be swapped */
+			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+				continue;
+			dst_path->slots[cur_level] = i;
+
+			/* Recursive call (at most 7 times) */
+			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+					dst_path, cur_level - 1, root_level,
+					last_snapshot, trace_leaf);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+cleanup:
+	if (need_cleanup) {
+		/* Clean up */
+		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+				     dst_path->locks[cur_level]);
+		free_extent_buffer(dst_path->nodes[cur_level]);
+		dst_path->nodes[cur_level] = NULL;
+		dst_path->slots[cur_level] = 0;
+		dst_path->locks[cur_level] = 0;
+	}
+out:
+	return ret;
+}
+
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+				struct extent_buffer *src_eb,
+				struct extent_buffer *dst_eb,
+				u64 last_snapshot, bool trace_leaf)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_path *dst_path = NULL;
+	int level;
+	int ret;
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+
+	/* Wrong parameter order */
+	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+		btrfs_err_rl(fs_info,
+		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+			     btrfs_header_generation(src_eb),
+			     btrfs_header_generation(dst_eb));
+		return -EUCLEAN;
+	}
+
+	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	level = btrfs_header_level(dst_eb);
+	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* For dst_path */
+	extent_buffer_get(dst_eb);
+	dst_path->nodes[level] = dst_eb;
+	dst_path->slots[level] = 0;
+	dst_path->locks[level] = 0;
+
+	/* Do the generation-aware depth-first search */
+	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+					      level, last_snapshot, trace_leaf);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	btrfs_free_path(dst_path);
+	if (ret < 0)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)
@@ -1790,7 +2164,7 @@
 			path->slots[level] = 0;
 
 			btrfs_tree_read_lock(eb);
-			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			btrfs_set_lock_blocking_read(eb);
 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
 			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2132,6 +2506,7 @@
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct ulist *new_roots = NULL;
 	struct rb_node *node;
+	u64 num_dirty_extents = 0;
 	u64 qgroup_to_skip;
 	int ret = 0;
 
@@ -2141,6 +2516,7 @@
 		record = rb_entry(node, struct btrfs_qgroup_extent_record,
 				  node);
 
+		num_dirty_extents++;
 		trace_btrfs_qgroup_account_extents(fs_info, record);
 
 		if (!ret) {
@@ -2157,6 +2533,11 @@
 					goto cleanup;
 			}
 
+			/* Free the reserved data space */
+			btrfs_qgroup_free_refroot(fs_info,
+					record->data_rsv_refroot,
+					record->data_rsv,
+					BTRFS_QGROUP_RSV_DATA);
 			/*
 			 * Use SEQ_LAST as time_seq to do special search, which
 			 * doesn't lock tree or delayed_refs and search current
@@ -2186,6 +2567,8 @@
 		kfree(record);
 
 	}
+	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+				       num_dirty_extents);
 	return ret;
 }
 
@@ -2243,6 +2626,7 @@
 	int ret = 0;
 	int i;
 	u64 *i_qgroups;
+	bool committing = false;
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *srcgroup;
@@ -2250,7 +2634,25 @@
 	u32 level_size = 0;
 	u64 nums;
 
-	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	/*
+	 * There are only two callers of this function.
+	 *
+	 * One in create_subvol() in the ioctl context, which needs to hold
+	 * the qgroup_ioctl_lock.
+	 *
+	 * The other one is in create_pending_snapshot(), where no other
+	 * qgroup code can modify the fs, as they all need to either start a
+	 * new trans or hold a trans handle, thus we don't need to hold
+	 * qgroup_ioctl_lock.
+	 * This avoids a long and complex lock chain and makes lockdep happy.
+	 */
+	spin_lock(&fs_info->trans_lock);
+	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
+		committing = true;
+	spin_unlock(&fs_info->trans_lock);
+
+	if (!committing)
+		mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		goto out;
 
@@ -2414,23 +2816,23 @@
 unlock:
 	spin_unlock(&fs_info->qgroup_lock);
 out:
-	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	if (!committing)
+		mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
 /*
  * Two limits to commit transaction in advance.
  *
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
  * For SIZE, it will be in byte unit as threshold.
  */
-#define QGROUP_PERTRANS_RATIO		32
-#define QGROUP_PERTRANS_SIZE		SZ_32M
+#define QGROUP_FREE_RATIO		32
+#define QGROUP_FREE_SIZE		SZ_32M
 static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
 				const struct btrfs_qgroup *qg, u64 num_bytes)
 {
-	u64 limit;
+	u64 free;
 	u64 threshold;
 
 	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2449,20 +2851,21 @@
 	 */
 	if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
 			      BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
-		if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-			limit = qg->max_excl;
-		else
-			limit = qg->max_rfer;
-		threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
-			    qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
-			    QGROUP_PERTRANS_RATIO;
-		threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+		if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+			free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+			threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+					  QGROUP_FREE_SIZE);
+		} else {
+			free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+			threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+					  QGROUP_FREE_SIZE);
+		}
 
 		/*
 		 * Use transaction_kthread to commit transaction, so we no
 		 * longer need to bother nested transaction nor lock context.
 		 */
-		if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+		if (free < threshold)
 			btrfs_commit_transaction_locksafe(fs_info);
 	}
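
Worked numbers for the new early-commit heuristic, using assumed values (1 GiB exclusive limit, 900 MiB reserved in total, 100 MiB already exclusively owned); a standalone illustration, not kernel code:

#include <stdio.h>

#define QGROUP_FREE_RATIO	32
#define QGROUP_FREE_SIZE	(32ULL << 20)	/* SZ_32M */

int main(void)
{
	unsigned long long max_excl = 1024ULL << 20;	/* 1 GiB limit */
	unsigned long long rsv = 900ULL << 20;		/* qgroup_rsv_total() */
	unsigned long long excl = 100ULL << 20;		/* qg->excl */

	unsigned long long free_bytes = max_excl - rsv - excl;	  /* 24 MiB */
	unsigned long long thresh = max_excl / QGROUP_FREE_RATIO; /* 32 MiB */

	if (thresh > QGROUP_FREE_SIZE)
		thresh = QGROUP_FREE_SIZE;

	/* 24 MiB < 32 MiB: commit early to return per-trans reservations */
	printf("commit early: %d\n", free_bytes < thresh);
	return 0;
}
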
 
@@ -2538,7 +2941,6 @@
 
 		qg = unode_aux_to_qgroup(unode);
 
-		trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
 		qgroup_rsv_add(fs_info, qg, num_bytes, type);
 	}
 
@@ -2605,7 +3007,6 @@
 
 		qg = unode_aux_to_qgroup(unode);
 
-		trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
 		qgroup_rsv_release(fs_info, qg, num_bytes, type);
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
@@ -2689,9 +3090,6 @@
 		mutex_unlock(&fs_info->qgroup_rescan_lock);
 		goto out;
 	}
-	extent_buffer_get(scratch_leaf);
-	btrfs_tree_read_lock(scratch_leaf);
-	btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
 	slot = path->slots[0];
 	btrfs_release_path(path);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2717,10 +3115,8 @@
 			goto out;
 	}
 out:
-	if (scratch_leaf) {
-		btrfs_tree_read_unlock_blocking(scratch_leaf);
+	if (scratch_leaf)
 		free_extent_buffer(scratch_leaf);
-	}
 
 	if (done && !ret) {
 		ret = 1;
@@ -2770,9 +3166,6 @@
 	btrfs_free_path(path);
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
-	if (!btrfs_fs_closing(fs_info))
-		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
 	if (err > 0 &&
 	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2788,16 +3181,30 @@
 	trans = btrfs_start_transaction(fs_info->quota_root, 1);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
+		trans = NULL;
 		btrfs_err(fs_info,
 			  "fail to start transaction for status update: %d",
 			  err);
-		goto done;
 	}
-	ret = update_qgroup_status_item(trans);
-	if (ret < 0) {
-		err = ret;
-		btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	if (!btrfs_fs_closing(fs_info))
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	if (trans) {
+		ret = update_qgroup_status_item(trans);
+		if (ret < 0) {
+			err = ret;
+			btrfs_err(fs_info, "fail to update qgroup status: %d",
+				  err);
+		}
 	}
+	fs_info->qgroup_rescan_running = false;
+	complete_all(&fs_info->qgroup_rescan_completion);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	if (!trans)
+		return;
+
 	btrfs_end_transaction(trans);
 
 	if (btrfs_fs_closing(fs_info)) {
@@ -2808,12 +3215,6 @@
 	} else {
 		btrfs_err(fs_info, "qgroup scan failed with %d", err);
 	}
-
-done:
-	mutex_lock(&fs_info->qgroup_rescan_lock);
-	fs_info->qgroup_rescan_running = false;
-	mutex_unlock(&fs_info->qgroup_rescan_lock);
-	complete_all(&fs_info->qgroup_rescan_completion);
 }
 
 /*
@@ -3006,7 +3407,7 @@
 	int ret;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
-	    !is_fstree(root->objectid) || len == 0)
+	    !is_fstree(root->root_key.objectid) || len == 0)
 		return 0;
 
 	/* @reserved parameter is mandatory for qgroup */
@@ -3041,6 +3442,9 @@
 	while ((unode = ulist_next(&reserved->range_changed, &uiter)))
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
 				 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
+	/* Also free the data bytes that were already reserved */
+	btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+				  orig_reserved, BTRFS_QGROUP_RSV_DATA);
 	extent_changeset_release(reserved);
 	return ret;
 }
@@ -3085,14 +3489,14 @@
 		 * EXTENT_QGROUP_RESERVED, we won't double free.
 		 * So not need to rush.
 		 */
-		ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+		ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
 				free_start, free_start + free_len - 1,
 				EXTENT_QGROUP_RESERVED, &changeset);
 		if (ret < 0)
 			goto out;
 		freed += changeset.bytes_changed;
 	}
-	btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+	btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
 				  BTRFS_QGROUP_RSV_DATA);
 	ret = freed;
 out:
@@ -3128,7 +3532,7 @@
 					changeset.bytes_changed, trace_op);
 	if (free)
 		btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
-				BTRFS_I(inode)->root->objectid,
+				BTRFS_I(inode)->root->root_key.objectid,
 				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
 	ret = changeset.bytes_changed;
 out:
@@ -3221,11 +3625,11 @@
 	int ret;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-	    !is_fstree(root->objectid) || num_bytes == 0)
+	    !is_fstree(root->root_key.objectid) || num_bytes == 0)
 		return 0;
 
 	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
-	trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
 	ret = qgroup_reserve(root, num_bytes, enforce, type);
 	if (ret < 0)
 		return ret;
@@ -3246,13 +3650,13 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-	    !is_fstree(root->objectid))
+	    !is_fstree(root->root_key.objectid))
 		return;
 
 	/* TODO: Update trace point to handle such free */
 	trace_qgroup_meta_free_all_pertrans(root);
 	/* Special value -1 means to free all reserved space */
-	btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
 				  BTRFS_QGROUP_RSV_META_PERTRANS);
 }
 
@@ -3262,7 +3666,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-	    !is_fstree(root->objectid))
+	    !is_fstree(root->root_key.objectid))
 		return;
 
 	/*
@@ -3272,8 +3676,9 @@
 	 */
 	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
 	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
-	trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
-	btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
+	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+				  num_bytes, type);
 }
 
 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -3327,13 +3732,13 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-	    !is_fstree(root->objectid))
+	    !is_fstree(root->root_key.objectid))
 		return;
 	/* Same as btrfs_qgroup_free_meta_prealloc() */
 	num_bytes = sub_root_meta_rsv(root, num_bytes,
 				      BTRFS_QGROUP_RSV_META_PREALLOC);
 	trace_qgroup_meta_convert(root, num_bytes);
-	qgroup_convert_meta(fs_info, root->objectid, num_bytes);
+	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
 }
 
 /*
@@ -3360,9 +3765,253 @@
 				inode->i_ino, unode->val, unode->aux);
 		}
 		btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
-				BTRFS_I(inode)->root->objectid,
+				BTRFS_I(inode)->root->root_key.objectid,
 				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
 
 	}
 	extent_changeset_release(&changeset);
 }
+
+void btrfs_qgroup_init_swapped_blocks(
+	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+	int i;
+
+	spin_lock_init(&swapped_blocks->lock);
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		swapped_blocks->blocks[i] = RB_ROOT;
+	swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+	int i;
+
+	swapped_blocks = &root->swapped_blocks;
+
+	spin_lock(&swapped_blocks->lock);
+	if (!swapped_blocks->swapped)
+		goto out;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		struct rb_root *cur_root = &swapped_blocks->blocks[i];
+		struct btrfs_qgroup_swapped_block *entry;
+		struct btrfs_qgroup_swapped_block *next;
+
+		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+						     node)
+			kfree(entry);
+		swapped_blocks->blocks[i] = RB_ROOT;
+	}
+	swapped_blocks->swapped = false;
+out:
+	spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root:	tree root of the subvolume tree which got swapped
+ * @bg:			block group under balance
+ * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot:	pointer to the subtree root in reloc tree
+ *			BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot:	last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+		struct btrfs_root *subvol_root,
+		struct btrfs_block_group_cache *bg,
+		struct extent_buffer *subvol_parent, int subvol_slot,
+		struct extent_buffer *reloc_parent, int reloc_slot,
+		u64 last_snapshot)
+{
+	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+	struct btrfs_qgroup_swapped_block *block;
+	struct rb_node **cur;
+	struct rb_node *parent = NULL;
+	int level = btrfs_header_level(subvol_parent) - 1;
+	int ret = 0;
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+
+	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+		btrfs_err_rl(fs_info,
+		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+			__func__,
+			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+		return -EUCLEAN;
+	}
+
+	block = kmalloc(sizeof(*block), GFP_NOFS);
+	if (!block) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * @reloc_parent/slot is still before swap, while @block is going to
+	 * record the bytenr after swap, so we do the swap here.
+	 */
+	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+							     reloc_slot);
+	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+							    subvol_slot);
+	block->last_snapshot = last_snapshot;
+	block->level = level;
+
+	/*
+	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
+	 * no one else can modify tree blocks thus we qgroup will not change
+	 * no matter the value of trace_leaf.
+	 */
+	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
+		block->trace_leaf = true;
+	else
+		block->trace_leaf = false;
+	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+	/* Insert @block into @blocks */
+	spin_lock(&blocks->lock);
+	cur = &blocks->blocks[level].rb_node;
+	while (*cur) {
+		struct btrfs_qgroup_swapped_block *entry;
+
+		parent = *cur;
+		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+				 node);
+
+		if (entry->subvol_bytenr < block->subvol_bytenr) {
+			cur = &(*cur)->rb_left;
+		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
+			cur = &(*cur)->rb_right;
+		} else {
+			if (entry->subvol_generation !=
+					block->subvol_generation ||
+			    entry->reloc_bytenr != block->reloc_bytenr ||
+			    entry->reloc_generation !=
+					block->reloc_generation) {
+				/*
+				 * Duplicated but mismatched entry found.
+				 * Shouldn't happen.
+				 *
+				 * Marking qgroup inconsistent should be enough
+				 * for end users.
+				 */
+				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+				ret = -EEXIST;
+			}
+			kfree(block);
+			goto out_unlock;
+		}
+	}
+	rb_link_node(&block->node, parent, cur);
+	rb_insert_color(&block->node, &blocks->blocks[level]);
+	blocks->swapped = true;
+out_unlock:
+	spin_unlock(&blocks->lock);
+out:
+	if (ret < 0)
+		fs_info->qgroup_flags |=
+			BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct extent_buffer *subvol_eb)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+	struct btrfs_qgroup_swapped_block *block;
+	struct extent_buffer *reloc_eb = NULL;
+	struct rb_node *node;
+	bool found = false;
+	bool swapped = false;
+	int level = btrfs_header_level(subvol_eb);
+	int ret = 0;
+	int i;
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+		return 0;
+
+	spin_lock(&blocks->lock);
+	if (!blocks->swapped) {
+		spin_unlock(&blocks->lock);
+		return 0;
+	}
+	node = blocks->blocks[level].rb_node;
+
+	while (node) {
+		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+		if (block->subvol_bytenr < subvol_eb->start) {
+			node = node->rb_left;
+		} else if (block->subvol_bytenr > subvol_eb->start) {
+			node = node->rb_right;
+		} else {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		spin_unlock(&blocks->lock);
+		goto out;
+	}
+	/* Found one, remove it from @blocks first and update blocks->swapped */
+	rb_erase(&block->node, &blocks->blocks[level]);
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+			swapped = true;
+			break;
+		}
+	}
+	blocks->swapped = swapped;
+	spin_unlock(&blocks->lock);
+
+	/* Read out reloc subtree root */
+	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+				   block->reloc_generation, block->level,
+				   &block->first_key);
+	if (IS_ERR(reloc_eb)) {
+		ret = PTR_ERR(reloc_eb);
+		reloc_eb = NULL;
+		goto free_out;
+	}
+	if (!extent_buffer_uptodate(reloc_eb)) {
+		ret = -EIO;
+		goto free_out;
+	}
+
+	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+			block->last_snapshot, block->trace_leaf);
+free_out:
+	kfree(block);
+	free_extent_buffer(reloc_eb);
+out:
+	if (ret < 0) {
+		btrfs_err_rl(fs_info,
+			     "failed to account subtree at bytenr %llu: %d",
+			     subvol_eb->start, ret);
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	}
+	return ret;
+}
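
The function above is the consumer half of the delayed trace: it only fires
when a swapped block is COWed again before the transaction commits. The hook
on the COW path lives in ctree.c and is outside this hunk; a minimal sketch of
how it is presumably wired up (the wrapper name here is illustrative, not a
kernel symbol):

    /*
     * Illustrative wrapper: invoked after btrfs_cow_block() has created
     * the new copy @cow.  The helper already marks qgroups inconsistent
     * on failure (see the out: label above), so a caller only needs to
     * propagate the error.
     */
    static int cow_block_qgroup_hook(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct extent_buffer *cow)
    {
            return btrfs_qgroup_trace_subtree_after_cow(trans, root, cow);
    }
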
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 4bbcc1e..46ba7bd 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
 #ifndef BTRFS_QGROUP_H
 #define BTRFS_QGROUP_H
 
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
 #include "ulist.h"
 #include "delayed-ref.h"
 
@@ -38,6 +40,66 @@
  */
 
 /*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap the subtrees of the subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However, since balance has ensured that both subtrees contain the same
+ * contents and have the same tree structures, such a swap won't cause
+ * qgroup numbers to change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block that gets swapped.
+ *
+ *    During subtree swap:
+ *    O = Old tree blocks
+ *    N = New tree blocks
+ *          reloc tree                     subvolume tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          NA     OB                          OA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      NC  ND     OE  OF                   OC  OD     OE  OF
+ *
+ *   In this case, NA and OA are going to be swapped, record (NA, OA) into
+ *   subvolume tree X.
+ *
+ * 2) After subtree swap.
+ *          reloc tree                     subvolume tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          OA     OB                          NA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      OC  OD     OE  OF                   NC  ND     OE  OF
+ *
+ * 3a) COW happens for OB
+ *     If we are going to COW tree block OB, we check OB's bytenr against
+ *     tree X's swapped_blocks structure.
+ *     If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ *     Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ *     Then we do subtree scan on both subtrees OA and NA.
+ *     This results in 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ *     Then no matter what we do to subvolume tree X, qgroup numbers will
+ *     still be correct.
+ *     Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4)  Transaction commit
+ *     Any record in X's swapped_blocks gets removed, since there is no
+ *     modification to the swapped subtrees; no need to trigger heavy qgroup
+ *     subtree rescan for them.
+ */
+
+/*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
  */
@@ -45,9 +107,38 @@
 	struct rb_node node;
 	u64 bytenr;
 	u64 num_bytes;
+
+	/*
+	 * For qgroup reserved data space freeing.
+	 *
+	 * @data_rsv_refroot and @data_rsv will be recorded after
+	 * BTRFS_ADD_DELAYED_EXTENT is called.
+	 * They will be used to free reserved qgroup space at
+	 * transaction commit time.
+	 */
+	u32 data_rsv;		/* reserved data space needs to be freed */
+	u64 data_rsv_refroot;	/* which root the reserved data belongs to */
 	struct ulist *old_roots;
 };
 
+struct btrfs_qgroup_swapped_block {
+	struct rb_node node;
+
+	int level;
+	bool trace_leaf;
+
+	/* bytenr/generation of the tree block in subvolume tree after swap */
+	u64 subvol_bytenr;
+	u64 subvol_generation;
+
+	/* bytenr/generation of the tree block in reloc tree after swap */
+	u64 reloc_bytenr;
+	u64 reloc_generation;
+
+	u64 last_snapshot;
+	struct btrfs_key first_key;
+};
+
 /*
  * Qgroup reservation types:
  *
@@ -70,7 +161,7 @@
  *	be converted into META_PERTRANS.
  */
 enum btrfs_qgroup_rsv_type {
-	BTRFS_QGROUP_RSV_DATA = 0,
+	BTRFS_QGROUP_RSV_DATA,
 	BTRFS_QGROUP_RSV_META_PERTRANS,
 	BTRFS_QGROUP_RSV_META_PREALLOC,
 	BTRFS_QGROUP_RSV_LAST,
@@ -81,10 +172,10 @@
  *
  * Each type should have different reservation behavior.
  * E.g, data follows its io_tree flag modification, while
- * *currently* meta is just reserve-and-clear during transcation.
+ * *currently* meta is just reserve-and-clear during transaction.
  *
  * TODO: Add new type for reservation which can survive transaction commit.
- * Currect metadata reservation behavior is not suitable for such case.
+ * Current metadata reservation behavior is not suitable for such case.
  */
 struct btrfs_qgroup_rsv {
 	u64 values[BTRFS_QGROUP_RSV_LAST];
@@ -246,15 +337,6 @@
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 			       u64 ref_root, u64 num_bytes,
 			       enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
-						 u64 ref_root, u64 num_bytes)
-{
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
-		return;
-	trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
-	btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
-				  BTRFS_QGROUP_RSV_DATA);
-}
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -319,4 +401,18 @@
 
 void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+	struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+		struct btrfs_root *subvol_root,
+		struct btrfs_block_group_cache *bg,
+		struct extent_buffer *subvol_parent, int subvol_slot,
+		struct extent_buffer *reloc_parent, int reloc_slot,
+		u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+		struct btrfs_root *root, struct extent_buffer *eb);
+
 #endif
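
The declarations above lean on struct btrfs_qgroup_swapped_blocks, which is
added to ctree.h elsewhere in this update. A sketch of the shape the helpers
imply (one rb-tree per tree level, a spinlock, and the cached swapped flag),
together with what btrfs_qgroup_init_swapped_blocks() presumably boils down
to:

    struct btrfs_qgroup_swapped_blocks {
            spinlock_t lock;
            /* Cached emptiness state of the trees below */
            bool swapped;
            struct rb_root blocks[BTRFS_MAX_LEVEL];
    };

    void btrfs_qgroup_init_swapped_blocks(
            struct btrfs_qgroup_swapped_blocks *swapped_blocks)
    {
            int i;

            spin_lock_init(&swapped_blocks->lock);
            for (i = 0; i < BTRFS_MAX_LEVEL; i++)
                    swapped_blocks->blocks[i] = RB_ROOT;
            swapped_blocks->swapped = false;
    }
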
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index df41d70..57a2ac7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -35,6 +35,22 @@
 
 #define RBIO_CACHE_SIZE 1024
 
+#define BTRFS_STRIPE_HASH_TABLE_BITS				11
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	spinlock_t lock;
+};
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
 enum btrfs_rbio_ops {
 	BTRFS_RBIO_WRITE,
 	BTRFS_RBIO_READ_REBUILD,
@@ -1442,11 +1458,11 @@
 static void set_bio_pages_uptodate(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, iter_all)
 		SetPageUptodate(bvec->bv_page);
 }
 
@@ -1980,7 +1996,7 @@
 		 * - In case of single failure, where rbio->failb == -1:
 		 *
 		 *   Cache this rbio iff the above read reconstruction is
-		 *   excuted without problems.
+		 *   executed without problems.
 		 */
 		if (err == BLK_STS_OK && rbio->failb < 0)
 			cache_rbio_pages(rbio);
@@ -2429,8 +2445,9 @@
 			bitmap_clear(rbio->dbitmap, pagenr, 1);
 		kunmap(p);
 
-		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+		for (stripe = 0; stripe < nr_data; stripe++)
 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+		kunmap(p_page);
 	}
 
 	__free_page(p_page);
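
struct btrfs_stripe_hash_table ends in a flexible array member, so the header
and all hash heads come from a single allocation. A hedged sketch of the
alloc/init pattern this layout implies (the file's real allocator,
alloc_stripe_hash_table(), is outside this hunk and may differ in detail):

    struct btrfs_stripe_hash_table *table;
    int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
    int i;

    /* struct_size() accounts for the num_entries trailing hash heads */
    table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
    if (!table)
            return -ENOMEM;

    spin_lock_init(&table->cache_lock);
    INIT_LIST_HEAD(&table->stripe_cache);
    for (i = 0; i < num_entries; i++) {
            spin_lock_init(&table->table[i].lock);
            INIT_LIST_HEAD(&table->table[i].hash_list);
    }
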
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index f5d4c13..2503485 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,7 +7,7 @@
 #ifndef BTRFS_RAID56_H
 #define BTRFS_RAID56_H
 
-static inline int nr_parity_stripes(struct map_lookup *map)
+static inline int nr_parity_stripes(const struct map_lookup *map)
 {
 	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
 		return 1;
@@ -17,7 +17,7 @@
 		return 0;
 }
 
-static inline int nr_data_stripes(struct map_lookup *map)
+static inline int nr_data_stripes(const struct map_lookup *map)
 {
 	return map->num_stripes - nr_parity_stripes(map);
 }
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dec14b7..ee6f605 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -14,6 +14,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "dev-replace.h"
+#include "block-group.h"
 
 #undef DEBUG
 
@@ -376,26 +377,28 @@
 		goto error;
 	}
 
+	/* Insert extent in reada tree + all per-device trees, all or nothing */
+	down_read(&fs_info->dev_replace.rwsem);
 	ret = radix_tree_preload(GFP_KERNEL);
-	if (ret)
+	if (ret) {
+		up_read(&fs_info->dev_replace.rwsem);
 		goto error;
+	}
 
-	/* insert extent in reada_tree + all per-device trees, all or nothing */
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
 	if (ret == -EEXIST) {
 		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
 		re_exist->refcnt++;
 		spin_unlock(&fs_info->reada_lock);
-		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
 		radix_tree_preload_end();
+		up_read(&fs_info->dev_replace.rwsem);
 		goto error;
 	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
 		radix_tree_preload_end();
+		up_read(&fs_info->dev_replace.rwsem);
 		goto error;
 	}
 	radix_tree_preload_end();
@@ -437,13 +440,13 @@
 			}
 			radix_tree_delete(&fs_info->reada_tree, index);
 			spin_unlock(&fs_info->reada_lock);
-			btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+			up_read(&fs_info->dev_replace.rwsem);
 			goto error;
 		}
 		have_zone = 1;
 	}
 	spin_unlock(&fs_info->reada_lock);
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	up_read(&fs_info->dev_replace.rwsem);
 
 	if (!have_zone)
 		goto error;
@@ -636,6 +639,35 @@
 	return 1;
 }
 
+static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
+				    int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(fs_info, bytenr);
+	if (IS_ERR(buf))
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
+	if (ret) {
+		free_extent_buffer_stale(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer_stale(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(buf)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
 static int reada_start_machine_dev(struct btrfs_device *dev)
 {
 	struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -745,6 +777,7 @@
 	u64 total = 0;
 	int i;
 
+again:
 	do {
 		enqueued = 0;
 		mutex_lock(&fs_devices->device_list_mutex);
@@ -756,6 +789,10 @@
 		mutex_unlock(&fs_devices->device_list_mutex);
 		total += enqueued;
 	} while (enqueued && total < 10000);
+	if (fs_devices->seed) {
+		fs_devices = fs_devices->seed;
+		goto again;
+	}
 
 	if (enqueued == 0)
 		return;
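
The reordering above pins down a lock-ordering rule for the readahead insert
path: take the dev_replace rwsem first, preload the radix tree while sleeping
is still allowed, and only then take the reada spinlock. The skeleton, reduced
from the hunk above:

    down_read(&fs_info->dev_replace.rwsem);
    /* May sleep, so preload before taking any spinlock */
    ret = radix_tree_preload(GFP_KERNEL);
    if (ret) {
            up_read(&fs_info->dev_replace.rwsem);
            return ret;
    }

    spin_lock(&fs_info->reada_lock);
    /* Cannot sleep here; the preload guarantees the node allocation */
    ret = radix_tree_insert(&fs_info->reada_tree, index, re);
    spin_unlock(&fs_info->reada_lock);

    radix_tree_preload_end();
    up_read(&fs_info->dev_replace.rwsem);
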
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e5b9e59..b57f361 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -43,7 +43,7 @@
  * back to the delayed ref action.  We hold the ref we are changing in the
  * action so we can account for the history properly, and we record the root we
  * were called with since it could be different from ref_root.  We also store
- * stack traces because thats how I roll.
+ * stack traces because that's how I roll.
  */
 struct ref_action {
 	int action;
@@ -56,7 +56,7 @@
 
 /*
  * One of these for every block we reference, it holds the roots and references
- * to it as well as all of the ref actions that have occured to it.  We never
+ * to it as well as all of the ref actions that have occurred to it.  We never
  * free it until we unmount the file system in order to make sure re-allocations
  * are happening properly.
  */
@@ -205,28 +205,17 @@
 #ifdef CONFIG_STACKTRACE
 static void __save_stack_trace(struct ref_action *ra)
 {
-	struct stack_trace stack_trace;
-
-	stack_trace.max_entries = MAX_TRACE;
-	stack_trace.nr_entries = 0;
-	stack_trace.entries = ra->trace;
-	stack_trace.skip = 2;
-	save_stack_trace(&stack_trace);
-	ra->trace_len = stack_trace.nr_entries;
+	ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
 }
 
 static void __print_stack_trace(struct btrfs_fs_info *fs_info,
 				struct ref_action *ra)
 {
-	struct stack_trace trace;
-
 	if (ra->trace_len == 0) {
 		btrfs_err(fs_info, "  ref-verify: no stacktrace");
 		return;
 	}
-	trace.nr_entries = ra->trace_len;
-	trace.entries = ra->trace;
-	print_stack_trace(&trace, 2);
+	stack_trace_print(ra->trace, ra->trace_len, 2);
 }
 #else
 static void inline __save_stack_trace(struct ref_action *ra)
@@ -511,7 +500,7 @@
 	struct btrfs_extent_data_ref *dref;
 	struct btrfs_shared_data_ref *sref;
 	u32 count;
-	int i = 0, tree_block_level = 0, ret;
+	int i = 0, tree_block_level = 0, ret = 0;
 	struct btrfs_key key;
 	int nritems = btrfs_header_nritems(leaf);
 
@@ -520,6 +509,7 @@
 		switch (key.type) {
 		case BTRFS_EXTENT_ITEM_KEY:
 			*num_bytes = key.offset;
+			/* fall through */
 		case BTRFS_METADATA_ITEM_KEY:
 			*bytenr = key.objectid;
 			ret = process_extent_item(fs_info, path, &key, i,
@@ -583,7 +573,7 @@
 				return -EIO;
 			}
 			btrfs_tree_read_lock(eb);
-			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			btrfs_set_lock_blocking_read(eb);
 			path->nodes[level-1] = eb;
 			path->slots[level-1] = 0;
 			path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -670,36 +660,43 @@
 
 /*
  * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
- * @root: the root we are making this modification from.
- * @bytenr: the bytenr we are modifying.
- * @num_bytes: number of bytes.
- * @parent: the parent bytenr.
- * @ref_root: the original root owner of the bytenr.
- * @owner: level in the case of metadata, inode in the case of data.
- * @offset: 0 for metadata, file offset for data.
- * @action: the action that we are doing, this is the same as the delayed ref
- *	action.
  *
  * This will add an action item to the given bytenr and do sanity checks to make
  * sure we haven't messed something up.  If we are making a new allocation and
  * this block entry has history we will delete all previous actions as long as
  * our sanity checks pass as they are no longer needed.
  */
-int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
-		       u64 parent, u64 ref_root, u64 owner, u64 offset,
-		       int action)
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+		       struct btrfs_ref *generic_ref)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct ref_entry *ref = NULL, *exist;
 	struct ref_action *ra = NULL;
 	struct block_entry *be = NULL;
 	struct root_entry *re = NULL;
+	int action = generic_ref->action;
 	int ret = 0;
-	bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+	bool metadata;
+	u64 bytenr = generic_ref->bytenr;
+	u64 num_bytes = generic_ref->len;
+	u64 parent = generic_ref->parent;
+	u64 ref_root;
+	u64 owner;
+	u64 offset;
 
-	if (!btrfs_test_opt(root->fs_info, REF_VERIFY))
+	if (!btrfs_test_opt(fs_info, REF_VERIFY))
 		return 0;
 
+	if (generic_ref->type == BTRFS_REF_METADATA) {
+		ref_root = generic_ref->tree_ref.root;
+		owner = generic_ref->tree_ref.level;
+		offset = 0;
+	} else {
+		ref_root = generic_ref->data_ref.ref_root;
+		owner = generic_ref->data_ref.ino;
+		offset = generic_ref->data_ref.offset;
+	}
+	metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
 	ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
 	ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
 	if (!ra || !ref) {
@@ -732,7 +729,7 @@
 
 	INIT_LIST_HEAD(&ra->list);
 	ra->action = action;
-	ra->root = root->objectid;
+	ra->root = generic_ref->real_root;
 
 	/*
 	 * This is an allocation, preallocate the block_entry in case we haven't
@@ -745,7 +742,7 @@
 		 * is and the new root objectid, so let's not treat the passed
 		 * in root as if it really has a ref for this bytenr.
 		 */
-		be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root);
+		be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
 		if (IS_ERR(be)) {
 			kfree(ra);
 			ret = PTR_ERR(be);
@@ -787,13 +784,13 @@
 			 * one we want to lookup below when we modify the
 			 * re->num_refs.
 			 */
-			ref_root = root->objectid;
-			re->root_objectid = root->objectid;
+			ref_root = generic_ref->real_root;
+			re->root_objectid = generic_ref->real_root;
 			re->num_refs = 0;
 		}
 
-		spin_lock(&root->fs_info->ref_verify_lock);
-		be = lookup_block_entry(&root->fs_info->block_tree, bytenr);
+		spin_lock(&fs_info->ref_verify_lock);
+		be = lookup_block_entry(&fs_info->block_tree, bytenr);
 		if (!be) {
 			btrfs_err(fs_info,
 "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
@@ -859,10 +856,10 @@
 			 * This shouldn't happen because we will add our re
 			 * above when we lookup the be with !parent, but just in
 			 * case catch this case so we don't panic because I
-			 * didn't thik of some other corner case.
+			 * didn't think of some other corner case.
 			 */
 			btrfs_err(fs_info, "failed to find root %llu for %llu",
-				  root->objectid, be->bytenr);
+				  generic_ref->real_root, be->bytenr);
 			dump_block_entry(fs_info, be);
 			dump_ref_action(fs_info, ra);
 			kfree(ra);
@@ -881,7 +878,7 @@
 	list_add_tail(&ra->list, &be->actions);
 	ret = 0;
 out_unlock:
-	spin_unlock(&root->fs_info->ref_verify_lock);
+	spin_unlock(&fs_info->ref_verify_lock);
 out:
 	if (ret)
 		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
@@ -987,7 +984,7 @@
 		return -ENOMEM;
 
 	eb = btrfs_read_lock_root_node(fs_info->extent_root);
-	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+	btrfs_set_lock_blocking_read(eb);
 	level = btrfs_header_level(eb);
 	path->nodes[level] = eb;
 	path->slots[level] = 0;
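
The stacktrace conversion above is mechanical: the struct stack_trace cursor
object is gone and the entry array is passed to the API directly. Reduced to
its essentials, assuming the MAX_TRACE-sized array kept in struct ref_action:

    unsigned long entries[MAX_TRACE];
    unsigned int trace_len;

    /* Capture up to MAX_TRACE return addresses, skipping 2 frames */
    trace_len = stack_trace_save(entries, MAX_TRACE, 2);

    /* Later, print what was captured, indented by 2 spaces */
    stack_trace_print(entries, trace_len, 2);
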
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index b7d2a4e..855de37 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -9,9 +9,8 @@
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
 void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
-int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
-		       u64 parent, u64 ref_root, u64 owner, u64 offset,
-		       int action);
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+		       struct btrfs_ref *generic_ref);
 void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
 			       u64 len);
 
@@ -30,9 +29,8 @@
 {
 }
 
-static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
-				     u64 num_bytes, u64 parent, u64 ref_root,
-				     u64 owner, u64 offset, int action)
+static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+		       struct btrfs_ref *generic_ref)
 {
 	return 0;
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0526b6c..5cd42b6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -20,6 +20,8 @@
 #include "inode-map.h"
 #include "qgroup.h"
 #include "print-tree.h"
+#include "delalloc-space.h"
+#include "block-group.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -162,6 +164,8 @@
 	struct mapping_tree reloc_root_tree;
 	/* list of reloc trees */
 	struct list_head reloc_roots;
+	/* list of subvolume trees that get relocated */
+	struct list_head dirty_subvol_roots;
 	/* size of metadata reservation for merging reloc trees */
 	u64 merging_rsv_size;
 	/* size of relocated tree nodes */
@@ -648,8 +652,8 @@
 					int level, u64 bytenr)
 {
 	struct backref_cache *cache = &rc->backref_cache;
-	struct btrfs_path *path1;
-	struct btrfs_path *path2;
+	struct btrfs_path *path1; /* For searching extent root */
+	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
 	struct extent_buffer *eb;
 	struct btrfs_root *root;
 	struct backref_node *cur;
@@ -662,7 +666,7 @@
 	struct btrfs_key key;
 	unsigned long end;
 	unsigned long ptr;
-	LIST_HEAD(list);
+	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
 	LIST_HEAD(useless);
 	int cowonly;
 	int ret;
@@ -778,6 +782,10 @@
 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 
+		/*
+		 * Parent node found and matches current inline ref, no need to
+		 * rebuild this node for this inline ref.
+		 */
 		if (exist &&
 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 		      exist->owner == key.offset) ||
@@ -787,11 +795,12 @@
 			goto next;
 		}
 
+		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 			if (key.objectid == key.offset) {
 				/*
-				 * only root blocks of reloc trees use
-				 * backref of this type.
+				 * Only root blocks of reloc trees use a
+				 * backref pointing to themselves.
 				 */
 				root = find_reloc_root(rc, cur->bytenr);
 				ASSERT(root);
@@ -840,7 +849,11 @@
 			goto next;
 		}
 
-		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+		/*
+		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+		 * means the root objectid. We need to search the tree to get
+		 * its parent bytenr.
+		 */
 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
@@ -863,10 +876,7 @@
 
 		level = cur->level + 1;
 
-		/*
-		 * searching the tree to find upper level blocks
-		 * reference the block.
-		 */
+	/* Search the tree to find parent blocks referring to the block. */
 		path2->search_commit_root = 1;
 		path2->skip_locking = 1;
 		path2->lowest_level = level;
@@ -884,7 +894,8 @@
 		    cur->bytenr) {
 			btrfs_err(root->fs_info,
 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
-				  cur->bytenr, level - 1, root->objectid,
+				  cur->bytenr, level - 1,
+				  root->root_key.objectid,
 				  node_key->objectid, node_key->type,
 				  node_key->offset);
 			err = -ENOENT;
@@ -892,6 +903,8 @@
 		}
 		lower = cur;
 		need_check = true;
+
+		/* Add all nodes and edges in the path */
 		for (; level < BTRFS_MAX_LEVEL; level++) {
 			if (!path2->nodes[level]) {
 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
@@ -1422,6 +1435,13 @@
 	int clear_rsv = 0;
 	int ret;
 
+	/*
+	 * The subvolume has reloc tree but the swap is finished, no need to
+	 * create/update the dead reloc tree
+	 */
+	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+		return 0;
+
 	if (root->reloc_root) {
 		reloc_root = root->reloc_root;
 		reloc_root->last_trans = trans->transid;
@@ -1458,15 +1478,17 @@
 	struct btrfs_root_item *root_item;
 	int ret;
 
-	if (!root->reloc_root)
+	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+	    !root->reloc_root)
 		goto out;
 
 	reloc_root = root->reloc_root;
 	root_item = &reloc_root->root_item;
 
+	/* root->reloc_root will stay until current relocation finished */
 	if (fs_info->reloc_ctl->merge_reloc_tree &&
 	    btrfs_root_refs(root_item) == 0) {
-		root->reloc_root = NULL;
+		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
 		__del_reloc_root(reloc_root);
 	}
 
@@ -1630,6 +1652,8 @@
 
 	nritems = btrfs_header_nritems(leaf);
 	for (i = 0; i < nritems; i++) {
+		struct btrfs_ref ref = { 0 };
+
 		cond_resched();
 		btrfs_item_key_to_cpu(leaf, &key, i);
 		if (key.type != BTRFS_EXTENT_DATA_KEY)
@@ -1690,18 +1714,23 @@
 		dirty = 1;
 
 		key.offset -= btrfs_file_extent_offset(leaf, fi);
-		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
-					   num_bytes, parent,
-					   btrfs_header_owner(leaf),
-					   key.objectid, key.offset);
+		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+				       num_bytes, parent);
+		ref.real_root = root->root_key.objectid;
+		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+				    key.objectid, key.offset);
+		ret = btrfs_inc_extent_ref(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			break;
 		}
 
-		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-					parent, btrfs_header_owner(leaf),
-					key.objectid, key.offset);
+		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+				       num_bytes, parent);
+		ref.real_root = root->root_key.objectid;
+		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+				    key.objectid, key.offset);
+		ret = btrfs_free_extent(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			break;
@@ -1735,7 +1764,7 @@
  * errors, a negative error number is returned.
  */
 static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
 		 struct btrfs_root *dest, struct btrfs_root *src,
 		 struct btrfs_path *path, struct btrfs_key *next_key,
 		 int lowest_level, int max_level)
@@ -1743,6 +1772,7 @@
 	struct btrfs_fs_info *fs_info = dest->fs_info;
 	struct extent_buffer *eb;
 	struct extent_buffer *parent;
+	struct btrfs_ref ref = { 0 };
 	struct btrfs_key key;
 	u64 old_bytenr;
 	u64 new_bytenr;
@@ -1764,7 +1794,7 @@
 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
 
 	eb = btrfs_lock_root_node(dest);
-	btrfs_set_lock_blocking(eb);
+	btrfs_set_lock_blocking_write(eb);
 	level = btrfs_header_level(eb);
 
 	if (level < lowest_level) {
@@ -1777,7 +1807,7 @@
 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
 		BUG_ON(ret);
 	}
-	btrfs_set_lock_blocking(eb);
+	btrfs_set_lock_blocking_write(eb);
 
 	if (next_key) {
 		next_key->objectid = (u64)-1;
@@ -1793,6 +1823,8 @@
 		BUG_ON(level < lowest_level);
 
 		ret = btrfs_bin_search(parent, &key, level, &slot);
+		if (ret < 0)
+			break;
 		if (ret && slot > 0)
 			slot--;
 
@@ -1843,7 +1875,7 @@
 						      slot, &eb);
 				BUG_ON(ret);
 			}
-			btrfs_set_lock_blocking(eb);
+			btrfs_set_lock_blocking_write(eb);
 
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
@@ -1876,20 +1908,18 @@
 		 *    If not traced, we will leak data numbers
 		 * 2) Fs subtree
 		 *    If not traced, we will double count old data
-		 *    and tree block numbers, if current trans doesn't free
-		 *    data reloc tree inode.
+		 *
+		 * We don't scan the subtree right now, but only record
+		 * the swapped tree blocks.
+		 * The real subtree rescan is delayed until we have new
+		 * CoW on the subtree root node before transaction commit.
 		 */
-		ret = btrfs_qgroup_trace_subtree(trans, parent,
-				btrfs_header_generation(parent),
-				btrfs_header_level(parent));
+		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+				rc->block_group, parent, slot,
+				path->nodes[level], path->slots[level],
+				last_snapshot);
 		if (ret < 0)
 			break;
-		ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
-				btrfs_header_generation(path->nodes[level]),
-				btrfs_header_level(path->nodes[level]));
-		if (ret < 0)
-			break;
-
 		/*
 		 * swap blocks in fs tree and reloc tree.
 		 */
@@ -1903,23 +1933,31 @@
 					      path->slots[level], old_ptr_gen);
 		btrfs_mark_buffer_dirty(path->nodes[level]);
 
-		ret = btrfs_inc_extent_ref(trans, src, old_bytenr,
-					blocksize, path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
+				       blocksize, path->nodes[level]->start);
+		ref.skip_qgroup = true;
+		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+		ret = btrfs_inc_extent_ref(trans, &ref);
 		BUG_ON(ret);
-		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr,
-					blocksize, 0, dest->root_key.objectid,
-					level - 1, 0);
+		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+				       blocksize, 0);
+		ref.skip_qgroup = true;
+		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+		ret = btrfs_inc_extent_ref(trans, &ref);
 		BUG_ON(ret);
 
-		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
-					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
+				       blocksize, path->nodes[level]->start);
+		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+		ref.skip_qgroup = true;
+		ret = btrfs_free_extent(trans, &ref);
 		BUG_ON(ret);
 
-		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
-					0, dest->root_key.objectid, level - 1,
-					0);
+		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
+				       blocksize, 0);
+		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+		ref.skip_qgroup = true;
+		ret = btrfs_free_extent(trans, &ref);
 		BUG_ON(ret);
 
 		btrfs_unlock_up_safe(path, 0);
@@ -2117,6 +2155,66 @@
 }
 
 /*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root = root->reloc_root;
+	struct btrfs_root_item *reloc_root_item;
+
+	/* @root must be a subvolume tree root with a valid reloc tree */
+	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+	ASSERT(reloc_root);
+
+	reloc_root_item = &reloc_root->root_item;
+	memset(&reloc_root_item->drop_progress, 0,
+		sizeof(reloc_root_item->drop_progress));
+	reloc_root_item->drop_level = 0;
+	btrfs_set_root_refs(reloc_root_item, 0);
+	btrfs_update_reloc_root(trans, root);
+
+	if (list_empty(&root->reloc_dirty_list)) {
+		btrfs_grab_fs_root(root);
+		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+	}
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+	struct btrfs_root *root;
+	struct btrfs_root *next;
+	int ret = 0;
+	int ret2;
+
+	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+				 reloc_dirty_list) {
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+			/* Merged subvolume, cleanup its reloc root */
+			struct btrfs_root *reloc_root = root->reloc_root;
+
+			list_del_init(&root->reloc_dirty_list);
+			root->reloc_root = NULL;
+			if (reloc_root) {
+
+				ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+				if (ret2 < 0 && !ret)
+					ret = ret2;
+			}
+			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+			btrfs_put_fs_root(root);
+		} else {
+			/* Orphan reloc tree, just clean it up */
+			ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
+			if (ret2 < 0 && !ret)
+				ret = ret2;
+		}
+	}
+	return ret;
+}
+
+/*
  * merge the relocated tree blocks in reloc tree with corresponding
  * fs tree.
  */
@@ -2124,7 +2222,6 @@
 					       struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-	LIST_HEAD(inode_list);
 	struct btrfs_key key;
 	struct btrfs_key next_key;
 	struct btrfs_trans_handle *trans = NULL;
@@ -2205,7 +2302,7 @@
 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
 			ret = 0;
 		} else {
-			ret = replace_path(trans, root, reloc_root, path,
+			ret = replace_path(trans, rc, root, reloc_root, path,
 					   &next_key, level, max_level);
 		}
 		if (ret < 0) {
@@ -2255,13 +2352,8 @@
 out:
 	btrfs_free_path(path);
 
-	if (err == 0) {
-		memset(&root_item->drop_progress, 0,
-		       sizeof(root_item->drop_progress));
-		root_item->drop_level = 0;
-		btrfs_set_root_refs(root_item, 0);
-		btrfs_update_reloc_root(trans, root);
-	}
+	if (err == 0)
+		insert_dirty_subvol(trans, rc, root);
 
 	if (trans)
 		btrfs_end_transaction_throttle(trans);
@@ -2405,14 +2497,9 @@
 			}
 		} else {
 			list_del_init(&reloc_root->root_list);
-		}
-
-		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
-		if (ret < 0) {
-			if (list_empty(&reloc_root->root_list))
-				list_add_tail(&reloc_root->root_list,
-					      &reloc_roots);
-			goto out;
+			/* Don't forget to queue this reloc root for cleanup */
+			list_add_tail(&reloc_root->reloc_dirty_list,
+				      &rc->dirty_subvol_roots);
 		}
 	}
 
@@ -2627,7 +2714,7 @@
 		 * only one thread can access block_rsv at this point,
 		 * so we don't need hold lock to protect block_rsv.
 		 * we expand more reservation size here to allow enough
-		 * space for relocation and we will return eailer in
+		 * space for relocation and we will return earlier in
 		 * enospc case.
 		 */
 		rc->block_rsv->size = tmp + fs_info->nodesize *
@@ -2670,6 +2757,7 @@
 	rc->backref_cache.path[node->level] = node;
 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
 		struct btrfs_key first_key;
+		struct btrfs_ref ref = { 0 };
 
 		cond_resched();
 
@@ -2681,6 +2769,10 @@
 			if (!lowest) {
 				ret = btrfs_bin_search(upper->eb, key,
 						       upper->level, &slot);
+				if (ret < 0) {
+					err = ret;
+					goto next;
+				}
 				BUG_ON(ret);
 				bytenr = btrfs_node_blockptr(upper->eb, slot);
 				if (node->eb->start == bytenr)
@@ -2716,6 +2808,10 @@
 		} else {
 			ret = btrfs_bin_search(upper->eb, key, upper->level,
 					       &slot);
+			if (ret < 0) {
+				err = ret;
+				goto next;
+			}
 			BUG_ON(ret);
 		}
 
@@ -2748,7 +2844,7 @@
 			goto next;
 		}
 		btrfs_tree_lock(eb);
-		btrfs_set_lock_blocking(eb);
+		btrfs_set_lock_blocking_write(eb);
 
 		if (!node->eb) {
 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -2767,11 +2863,13 @@
 						      trans->transid);
 			btrfs_mark_buffer_dirty(upper->eb);
 
-			ret = btrfs_inc_extent_ref(trans, root,
-						node->eb->start, blocksize,
-						upper->eb->start,
-						btrfs_header_owner(upper->eb),
-						node->level, 0);
+			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+					       node->eb->start, blocksize,
+					       upper->eb->start);
+			ref.real_root = root->root_key.objectid;
+			btrfs_init_tree_ref(&ref, node->level,
+					    btrfs_header_owner(upper->eb));
+			ret = btrfs_inc_extent_ref(trans, &ref);
 			BUG_ON(ret);
 
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2911,7 +3009,6 @@
 		free_extent_buffer(eb);
 		return -EIO;
 	}
-	WARN_ON(btrfs_header_level(eb) != block->level);
 	if (block->level == 0)
 		btrfs_item_key_to_cpu(eb, &block->key, 0);
 	else
@@ -2987,7 +3084,7 @@
 	struct backref_node *node;
 	struct btrfs_path *path;
 	struct tree_block *block;
-	struct rb_node *rb_node;
+	struct tree_block *next;
 	int ret;
 	int err = 0;
 
@@ -2997,29 +3094,23 @@
 		goto out_free_blocks;
 	}
 
-	rb_node = rb_first(blocks);
-	while (rb_node) {
-		block = rb_entry(rb_node, struct tree_block, rb_node);
+	/* Kick in readahead for tree blocks with missing keys */
+	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
 		if (!block->key_ready)
 			readahead_tree_block(fs_info, block->bytenr);
-		rb_node = rb_next(rb_node);
 	}
 
-	rb_node = rb_first(blocks);
-	while (rb_node) {
-		block = rb_entry(rb_node, struct tree_block, rb_node);
+	/* Get first keys */
+	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
 		if (!block->key_ready) {
 			err = get_tree_block_key(fs_info, block);
 			if (err)
 				goto out_free_path;
 		}
-		rb_node = rb_next(rb_node);
 	}
 
-	rb_node = rb_first(blocks);
-	while (rb_node) {
-		block = rb_entry(rb_node, struct tree_block, rb_node);
-
+	/* Do tree relocation */
+	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
 		node = build_backref_tree(rc, &block->key,
 					  block->level, block->bytenr);
 		if (IS_ERR(node)) {
@@ -3030,11 +3121,10 @@
 		ret = relocate_tree_block(trans, rc, node, &block->key,
 					  path);
 		if (ret < 0) {
-			if (ret != -EAGAIN || rb_node == rb_first(blocks))
+			if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
 				err = ret;
 			goto out;
 		}
-		rb_node = rb_next(rb_node);
 	}
 out:
 	err = finish_pending_nodes(trans, rc, path, err);
@@ -3187,6 +3277,8 @@
 			if (!page) {
 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
 							PAGE_SIZE, true);
+				btrfs_delalloc_release_extents(BTRFS_I(inode),
+							PAGE_SIZE);
 				ret = -ENOMEM;
 				goto out;
 			}
@@ -3207,7 +3299,7 @@
 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
 							PAGE_SIZE, true);
 				btrfs_delalloc_release_extents(BTRFS_I(inode),
-							       PAGE_SIZE, true);
+							       PAGE_SIZE);
 				ret = -EIO;
 				goto out;
 			}
@@ -3229,14 +3321,14 @@
 		}
 
 		ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
-						NULL, 0);
+						NULL);
 		if (ret) {
 			unlock_page(page);
 			put_page(page);
 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
 							 PAGE_SIZE, true);
 			btrfs_delalloc_release_extents(BTRFS_I(inode),
-			                               PAGE_SIZE, true);
+			                               PAGE_SIZE);
 
 			clear_extent_bits(&BTRFS_I(inode)->io_tree,
 					  page_start, page_end,
@@ -3252,8 +3344,7 @@
 		put_page(page);
 
 		index++;
-		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
-					       false);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 		btrfs_throttle(fs_info);
 	}
@@ -4083,6 +4174,9 @@
 		goto out_free;
 	}
 	btrfs_commit_transaction(trans);
+	ret = clean_dirty_subvols(rc);
+	if (ret < 0 && !err)
+		err = ret;
 out_free:
 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
 	btrfs_free_path(path);
@@ -4168,7 +4262,7 @@
 	return inode;
 }
 
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 {
 	struct reloc_control *rc;
 
@@ -4177,9 +4271,11 @@
 		return NULL;
 
 	INIT_LIST_HEAD(&rc->reloc_roots);
+	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks, NULL);
+	extent_io_tree_init(fs_info, &rc->processed_blocks,
+			    IO_TREE_RELOC_BLOCKS, NULL);
 	return rc;
 }
 
@@ -4189,37 +4285,13 @@
 static void describe_relocation(struct btrfs_fs_info *fs_info,
 				struct btrfs_block_group_cache *block_group)
 {
-	char buf[128];		/* prefixed by a '|' that'll be dropped */
-	u64 flags = block_group->flags;
+	char buf[128] = {'\0'};
 
-	/* Shouldn't happen */
-	if (!flags) {
-		strcpy(buf, "|NONE");
-	} else {
-		char *bp = buf;
-
-#define DESCRIBE_FLAG(f, d) \
-		if (flags & BTRFS_BLOCK_GROUP_##f) { \
-			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
-			flags &= ~BTRFS_BLOCK_GROUP_##f; \
-		}
-		DESCRIBE_FLAG(DATA,     "data");
-		DESCRIBE_FLAG(SYSTEM,   "system");
-		DESCRIBE_FLAG(METADATA, "metadata");
-		DESCRIBE_FLAG(RAID0,    "raid0");
-		DESCRIBE_FLAG(RAID1,    "raid1");
-		DESCRIBE_FLAG(DUP,      "dup");
-		DESCRIBE_FLAG(RAID10,   "raid10");
-		DESCRIBE_FLAG(RAID5,    "raid5");
-		DESCRIBE_FLAG(RAID6,    "raid6");
-		if (flags)
-			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
-	}
+	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
 
 	btrfs_info(fs_info,
 		   "relocating block group %llu flags %s",
-		   block_group->key.objectid, buf + 1);
+		   block_group->key.objectid, buf);
 }
 
 /*
@@ -4227,6 +4299,7 @@
  */
 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 {
+	struct btrfs_block_group_cache *bg;
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct reloc_control *rc;
 	struct inode *inode;
@@ -4235,14 +4308,23 @@
 	int rw = 0;
 	int err = 0;
 
-	rc = alloc_reloc_control();
-	if (!rc)
+	bg = btrfs_lookup_block_group(fs_info, group_start);
+	if (!bg)
+		return -ENOENT;
+
+	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
+		btrfs_put_block_group(bg);
+		return -ETXTBSY;
+	}
+
+	rc = alloc_reloc_control(fs_info);
+	if (!rc) {
+		btrfs_put_block_group(bg);
 		return -ENOMEM;
+	}
 
 	rc->extent_root = extent_root;
-
-	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
-	BUG_ON(!rc->block_group);
+	rc->block_group = bg;
 
 	ret = btrfs_inc_block_group_ro(rc->block_group);
 	if (ret) {
@@ -4257,7 +4339,7 @@
 		goto out;
 	}
 
-	inode = lookup_free_space_inode(fs_info, rc->block_group, path);
+	inode = lookup_free_space_inode(rc->block_group, path);
 	btrfs_free_path(path);
 
 	if (!IS_ERR(inode))
@@ -4289,27 +4371,36 @@
 		mutex_lock(&fs_info->cleaner_mutex);
 		ret = relocate_block_group(rc);
 		mutex_unlock(&fs_info->cleaner_mutex);
-		if (ret < 0) {
+		if (ret < 0)
 			err = ret;
-			goto out;
+
+		/*
+		 * We may have gotten ENOSPC after we already dirtied some
+		 * extents.  If writeout happens while we're relocating a
+		 * different block group we could end up hitting the
+		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
+		 * btrfs_reloc_cow_block.  Make sure we write everything out
+		 * properly so we don't trip over this problem, and then break
+		 * out of the loop if we hit an error.
+		 */
+		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+						       (u64)-1);
+			if (ret)
+				err = ret;
+			invalidate_mapping_pages(rc->data_inode->i_mapping,
+						 0, -1);
+			rc->stage = UPDATE_DATA_PTRS;
 		}
 
+		if (err < 0)
+			goto out;
+
 		if (rc->extents_found == 0)
 			break;
 
 		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
 
-		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
-			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
-						       (u64)-1);
-			if (ret) {
-				err = ret;
-				goto out;
-			}
-			invalidate_mapping_pages(rc->data_inode->i_mapping,
-						 0, -1);
-			rc->stage = UPDATE_DATA_PTRS;
-		}
 	}
 
 	WARN_ON(rc->block_group->pinned > 0);
@@ -4431,7 +4522,7 @@
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(fs_info);
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
@@ -4486,6 +4577,10 @@
 		goto out_free;
 	}
 	err = btrfs_commit_transaction(trans);
+
+	ret = clean_dirty_subvols(rc);
+	if (ret < 0 && !err)
+		err = ret;
 out_free:
 	kfree(rc);
 out:
@@ -4549,7 +4644,7 @@
 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
 		sums->bytenr = new_bytenr;
 
-		btrfs_add_ordered_sum(inode, ordered, sums);
+		btrfs_add_ordered_sum(ordered, sums);
 	}
 out:
 	btrfs_put_ordered_extent(ordered);
@@ -4622,14 +4717,12 @@
 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve)
 {
-	struct btrfs_root *root;
-	struct reloc_control *rc;
+	struct btrfs_root *root = pending->root;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
 
-	root = pending->root;
-	if (!root->reloc_root)
+	if (!root->reloc_root || !rc)
 		return;
 
-	rc = root->fs_info->reloc_ctl;
 	if (!rc->merge_reloc_tree)
 		return;
 
@@ -4658,10 +4751,10 @@
 	struct btrfs_root *root = pending->root;
 	struct btrfs_root *reloc_root;
 	struct btrfs_root *new_root;
-	struct reloc_control *rc;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
 	int ret;
 
-	if (!root->reloc_root)
+	if (!root->reloc_root || !rc)
 		return 0;
 
-	rc = root->fs_info->reloc_ctl;
@@ -4670,7 +4763,7 @@
 	if (rc->merge_reloc_tree) {
 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
 					      rc->block_rsv,
-					      rc->nodes_relocated, 1);
+					      rc->nodes_relocated, true);
 		if (ret)
 			return ret;
 	}
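
Most of the churn in this file is the conversion from the long
btrfs_inc_extent_ref()/btrfs_free_extent() argument lists to the struct
btrfs_ref descriptor. The converted call sites all follow one shape; the
pattern below is pulled out of the hunks above (values such as bytenr,
num_bytes, parent, level and root_objectid stand in for the per-call-site
arguments):

    struct btrfs_ref ref = { 0 };
    int ret;

    /* 1) What is referenced: action, bytenr, length, parent node */
    btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
                           num_bytes, parent);
    /*
     * 2) Who references it: a tree block (level + owning root), or for
     *    file extents btrfs_init_data_ref(&ref, owner, ino, offset)
     */
    btrfs_init_tree_ref(&ref, level, root_objectid);

    /* 3) Relocation-specific tweaks seen above */
    ref.real_root = root->root_key.objectid; /* qgroup accounting root */
    ref.skip_qgroup = true;  /* the delayed subtree trace covers these */

    ret = btrfs_inc_extent_ref(trans, &ref);
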
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda06..3b17b64 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -9,6 +9,8 @@
 #include "transaction.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "qgroup.h"
+#include "space-info.h"
 
 /*
  * Read a root item from the tree. In case we detect a root item smaller then
@@ -21,12 +23,12 @@
 				struct btrfs_root_item *item)
 {
 	uuid_le uuid;
-	int len;
+	u32 len;
 	int need_reset = 0;
 
 	len = btrfs_item_size_nr(eb, slot);
 	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
-			min_t(int, len, (int)sizeof(*item)));
+			   min_t(u32, len, sizeof(*item)));
 	if (len < sizeof(*item))
 		need_reset = 1;
 	if (!need_reset && btrfs_root_generation(item)
@@ -132,18 +134,19 @@
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0) {
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		btrfs_crit(fs_info,
+			"unable to find root key (%llu %u %llu) in tree %llu",
+			key->objectid, key->type, key->offset,
+			root->root_key.objectid);
+		ret = -EUCLEAN;
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
-	if (ret != 0) {
-		btrfs_print_leaf(path->nodes[0]);
-		btrfs_crit(fs_info, "unable to update root key %llu %u %llu",
-			   key->objectid, key->type, key->offset);
-		BUG_ON(1);
-	}
-
 	l = path->nodes[0];
 	slot = path->slots[0];
 	ptr = btrfs_item_ptr_offset(l, slot);
@@ -263,8 +266,10 @@
 		if (root) {
 			WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
 					  &root->state));
-			if (btrfs_root_refs(&root->root_item) == 0)
+			if (btrfs_root_refs(&root->root_item) == 0) {
+				set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 				btrfs_add_dead_root(root);
+			}
 			continue;
 		}
 
@@ -310,8 +315,10 @@
 			break;
 		}
 
-		if (btrfs_root_refs(&root->root_item) == 0)
+		if (btrfs_root_refs(&root->root_item) == 0) {
+			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 			btrfs_add_dead_root(root);
+		}
 	}
 
 	btrfs_free_path(path);
@@ -492,3 +499,57 @@
 	btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
 	spin_unlock(&root->root_item_lock);
 }
+
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need to reserve space for
+ * use_global_rsv: allow fallback to the global block reservation
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations differ from the common
+ * file/directory operations: they change two fs/file trees and the
+ * root tree, and the number of items that the qgroup reserves is
+ * different from the free space reservation. So we cannot use
+ * the space reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv, int items,
+				     bool use_global_rsv)
+{
+	u64 qgroup_num_bytes = 0;
+	u64 num_bytes;
+	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+		/* One for parent inode, two for dir entries */
+		qgroup_num_bytes = 3 * fs_info->nodesize;
+		ret = btrfs_qgroup_reserve_meta_prealloc(root,
+				qgroup_num_bytes, true);
+		if (ret)
+			return ret;
+	}
+
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
+	rsv->space_info = btrfs_find_space_info(fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
+
+	if (ret == -ENOSPC && use_global_rsv)
+		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
+
+	if (ret && qgroup_num_bytes)
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+
+	return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+				      struct btrfs_block_rsv *rsv)
+{
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+}
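
The reserve/release pair above is meant to bracket the snapshot or subvolume
work. A hedged usage sketch (the item count of 8 is illustrative; callers
derive theirs from the trees and items they will actually touch):

    struct btrfs_block_rsv rsv;
    int ret;

    btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);

    /* Falls back to the global rsv on ENOSPC when the last arg is true */
    ret = btrfs_subvolume_reserve_metadata(root, &rsv, 8, true);
    if (ret)
            return ret;

    /* ... create or delete the snapshot/subvolume ... */

    btrfs_subvolume_release_metadata(fs_info, &rsv);
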
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456..f7d4e03 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -6,6 +6,7 @@
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
@@ -17,6 +18,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "raid56.h"
+#include "block-group.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -339,7 +341,9 @@
 		}
 	}
 
 	/* Insert new lock */
 	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return ERR_PTR(-ENOMEM);
@@ -568,12 +572,11 @@
 		scrub_free_ctx(sctx);
 }
 
-static noinline_for_stack
-struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
+		struct btrfs_fs_info *fs_info, int is_dev_replace)
 {
 	struct scrub_ctx *sctx;
 	int		i;
-	struct btrfs_fs_info *fs_info = dev->fs_info;
 
 	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 	if (!sctx)
@@ -582,7 +585,8 @@
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx->curr = -1;
-	sctx->fs_info = dev->fs_info;
+	sctx->fs_info = fs_info;
+	INIT_LIST_HEAD(&sctx->csum_list);
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
@@ -607,7 +611,6 @@
 	atomic_set(&sctx->workers_pending, 0);
 	atomic_set(&sctx->cancel_req, 0);
 	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sctx->csum_list);
 
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
@@ -832,6 +835,7 @@
 	int page_num;
 	int success;
 	bool full_stripe_locked;
+	unsigned int nofs_flag;
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
@@ -857,6 +861,16 @@
 	dev = sblock_to_check->pagev[0]->dev;
 
 	/*
+	 * We must use GFP_NOFS because the scrub task might be waiting for a
+	 * worker task executing this function and in turn a transaction commit
+	 * might be waiting for the scrub task to pause (which needs to wait
+	 * for all the worker tasks to complete before pausing).
+	 * We do allocations in the workers through insert_full_stripe_lock()
+	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+	 * this function.
+	 */
+	nofs_flag = memalloc_nofs_save();
+	/*
 	 * For RAID5/6, race can happen for a different device scrub thread.
 	 * For data corruption, Parity and Data threads will both try
 	 * to recovery the data.
@@ -865,6 +879,7 @@
 	 */
 	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
 	if (ret < 0) {
+		memalloc_nofs_restore(nofs_flag);
 		spin_lock(&sctx->stat_lock);
 		if (ret == -ENOMEM)
 			sctx->stat.malloc_errors++;
@@ -904,7 +919,7 @@
 	 */
 
 	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
-				      sizeof(*sblocks_for_recheck), GFP_NOFS);
+				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
 	if (!sblocks_for_recheck) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
@@ -1124,7 +1139,7 @@
 
 			if (scrub_write_page_to_dev_replace(sblock_other,
 							    page_num) != 0) {
-				btrfs_dev_replace_stats_inc(
+				atomic64_inc(
 					&fs_info->dev_replace.num_write_errors);
 				success = 0;
 			}
@@ -1202,6 +1217,7 @@
 	}
 
 	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		return ret;
 	return 0;
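
The memalloc_nofs_save()/memalloc_nofs_restore() pair above scopes GFP_NOFS
implicitly: any allocation between the two calls has __GFP_FS stripped, which
is why the kcalloc() in this function can switch to GFP_KERNEL. The pattern
in isolation:

    unsigned int nofs_flag;
    void *p;

    nofs_flag = memalloc_nofs_save();
    /* Behaves as GFP_NOFS while the saved flag is in effect */
    p = kmalloc(4096, GFP_KERNEL);
    memalloc_nofs_restore(nofs_flag);
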
@@ -1564,8 +1580,7 @@
 		if (btrfsic_submit_bio_wait(bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1592,8 +1607,7 @@
 
 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
 		if (ret)
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 	}
 }
 
@@ -1726,8 +1740,7 @@
 			struct scrub_page *spage = sbio->pagev[i];
 
 			spage->io_error = 1;
-			btrfs_dev_replace_stats_inc(&dev_replace->
-						    num_write_errors);
+			atomic64_inc(&dev_replace->num_write_errors);
 		}
 	}
 
@@ -1776,11 +1789,12 @@
 static int scrub_checksum_data(struct scrub_block *sblock)
 {
 	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 *on_disk_csum;
 	struct page *page;
 	void *buffer;
-	u32 crc = ~(u32)0;
 	u64 len;
 	int index;
 
@@ -1788,6 +1802,9 @@
 	if (!sblock->pagev[0]->have_csum)
 		return 0;
 
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+
 	on_disk_csum = sblock->pagev[0]->csum;
 	page = sblock->pagev[0]->page;
 	buffer = kmap_atomic(page);
@@ -1797,7 +1814,7 @@
 	for (;;) {
 		u64 l = min_t(u64, len, PAGE_SIZE);
 
-		crc = btrfs_csum_data(buffer, crc, l);
+		crypto_shash_update(shash, buffer, l);
 		kunmap_atomic(buffer);
 		len -= l;
 		if (len == 0)
@@ -1809,7 +1826,7 @@
 		buffer = kmap_atomic(page);
 	}
 
-	btrfs_csum_final(crc, csum);
+	crypto_shash_final(shash, csum);
 	if (memcmp(csum, on_disk_csum, sctx->csum_size))
 		sblock->checksum_error = 1;
 
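
All three scrub checksum helpers now run the same crypto_shash sequence
against fs_info->csum_shash instead of the open-coded crc32c calls. The
sequence, reduced from the hunk above (one update per kmapped page, a single
final digest):

    SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    u8 csum[BTRFS_CSUM_SIZE];

    shash->tfm = fs_info->csum_shash;
    crypto_shash_init(shash);
    /* Called once per mapped page in the loops above */
    crypto_shash_update(shash, buffer, l);
    crypto_shash_final(shash, csum);
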
@@ -1821,16 +1838,19 @@
 	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_header *h;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	struct page *page;
 	void *mapped_buffer;
 	u64 mapped_size;
 	void *p;
-	u32 crc = ~(u32)0;
 	u64 len;
 	int index;
 
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+
 	BUG_ON(sblock->page_count < 1);
 	page = sblock->pagev[0]->page;
 	mapped_buffer = kmap_atomic(page);
@@ -1864,7 +1884,7 @@
 	for (;;) {
 		u64 l = min_t(u64, len, mapped_size);
 
-		crc = btrfs_csum_data(p, crc, l);
+		crypto_shash_update(shash, p, l);
 		kunmap_atomic(mapped_buffer);
 		len -= l;
 		if (len == 0)
@@ -1878,7 +1898,7 @@
 		p = mapped_buffer;
 	}
 
-	btrfs_csum_final(crc, calculated_csum);
+	crypto_shash_final(shash, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		sblock->checksum_error = 1;
 
@@ -1889,18 +1909,22 @@
 {
 	struct btrfs_super_block *s;
 	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	struct page *page;
 	void *mapped_buffer;
 	u64 mapped_size;
 	void *p;
-	u32 crc = ~(u32)0;
 	int fail_gen = 0;
 	int fail_cor = 0;
 	u64 len;
 	int index;
 
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+
 	BUG_ON(sblock->page_count < 1);
 	page = sblock->pagev[0]->page;
 	mapped_buffer = kmap_atomic(page);
@@ -1923,7 +1947,7 @@
 	for (;;) {
 		u64 l = min_t(u64, len, mapped_size);
 
-		crc = btrfs_csum_data(p, crc, l);
+		crypto_shash_update(shash, p, l);
 		kunmap_atomic(mapped_buffer);
 		len -= l;
 		if (len == 0)
@@ -1937,7 +1961,7 @@
 		p = mapped_buffer;
 	}
 
-	btrfs_csum_final(crc, calculated_csum);
+	crypto_shash_final(shash, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		++fail_cor;
 
@@ -2437,7 +2461,7 @@
 	ASSERT(index < UINT_MAX);
 
 	num_sectors = sum->len / sctx->fs_info->sectorsize;
-	memcpy(csum, sum->sums + index, sctx->csum_size);
+	memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
 	if (index == num_sectors - 1) {
 		list_del(&sum->list);
 		kfree(sum);
@@ -2649,18 +2673,18 @@
 	u64 last_offset;
 	u32 stripe_index;
 	u32 rot;
+	const int data_stripes = nr_data_stripes(map);
 
-	last_offset = (physical - map->stripes[num].physical) *
-		      nr_data_stripes(map);
+	last_offset = (physical - map->stripes[num].physical) * data_stripes;
 	if (stripe_start)
 		*stripe_start = last_offset;
 
 	*offset = last_offset;
-	for (i = 0; i < nr_data_stripes(map); i++) {
+	for (i = 0; i < data_stripes; i++) {
 		*offset = last_offset + i * map->stripe_len;
 
 		stripe_nr = div64_u64(*offset, map->stripe_len);
-		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
+		stripe_nr = div_u64(stripe_nr, data_stripes);
 
 		/* Work out the disk rotation on this stripe-set */
 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
@@ -3022,8 +3046,7 @@
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length,
-					   int is_dev_replace)
+					   int num, u64 base, u64 length)
 {
 	struct btrfs_path *path, *ppath;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3069,7 +3092,7 @@
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
 		mirror_num = num % map->sub_stripes + 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
@@ -3299,7 +3322,7 @@
 			extent_physical = extent_logical - logical + physical;
 			extent_dev = scrub_dev;
 			extent_mirror_num = mirror_num;
-			if (is_dev_replace)
+			if (sctx->is_dev_replace)
 				scrub_remap_extent(fs_info, extent_logical,
 						   extent_len, &extent_physical,
 						   &extent_dev,
@@ -3397,19 +3420,18 @@
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_offset, u64 length,
 					  u64 dev_offset,
-					  struct btrfs_block_group_cache *cache,
-					  int is_dev_replace)
+					  struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
 	int ret = 0;
 
-	read_lock(&map_tree->map_tree.lock);
-	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-	read_unlock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->lock);
+	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->lock);
 
 	if (!em) {
 		/*
@@ -3435,8 +3457,7 @@
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length,
-					   is_dev_replace);
+					   chunk_offset, length);
 			if (ret)
 				goto out;
 		}
@@ -3449,8 +3470,7 @@
 
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
-			   struct btrfs_device *scrub_dev, u64 start, u64 end,
-			   int is_dev_replace)
+			   struct btrfs_device *scrub_dev, u64 start, u64 end)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
@@ -3544,10 +3564,10 @@
 		 */
 		scrub_pause_on(fs_info);
 		ret = btrfs_inc_block_group_ro(cache);
-		if (!ret && is_dev_replace) {
+		if (!ret && sctx->is_dev_replace) {
 			/*
 			 * If we are doing a device replace wait for any tasks
-			 * that started dellaloc right before we set the block
+			 * that started delalloc right before we set the block
 			 * group to RO mode, as they might have just allocated
 			 * an extent from it or decided they could do a nocow
 			 * write. And if any such tasks did that, wait for their
@@ -3603,13 +3623,14 @@
 			break;
 		}
 
-		btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+		down_write(&fs_info->dev_replace.rwsem);
 		dev_replace->cursor_right = found_key.offset + length;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
-		btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+		up_write(&dev_replace->rwsem);
+
 		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
-				  found_key.offset, cache, is_dev_replace);
+				  found_key.offset, cache);
 
 		/*
 		 * flush, submit all pending read and write bios, afterwards
@@ -3643,10 +3664,10 @@
 
 		scrub_pause_off(fs_info);
 
-		btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+		down_write(&fs_info->dev_replace.rwsem);
 		dev_replace->cursor_left = dev_replace->cursor_right;
 		dev_replace->item_needs_writeback = 1;
-		btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+		up_write(&fs_info->dev_replace.rwsem);
 
 		if (ro_set)
 			btrfs_dec_block_group_ro(cache);
@@ -3670,7 +3691,7 @@
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
-		if (is_dev_replace &&
+		if (sctx->is_dev_replace &&
 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
 			ret = -EIO;
 			break;
@@ -3733,25 +3754,33 @@
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 
-	if (fs_info->scrub_workers_refcnt == 0) {
+	lockdep_assert_held(&fs_info->scrub_lock);
+
+	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+		ASSERT(fs_info->scrub_workers == NULL);
 		fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
 				flags, is_dev_replace ? 1 : max_active, 4);
 		if (!fs_info->scrub_workers)
 			goto fail_scrub_workers;
 
+		ASSERT(fs_info->scrub_wr_completion_workers == NULL);
 		fs_info->scrub_wr_completion_workers =
 			btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
 					      max_active, 2);
 		if (!fs_info->scrub_wr_completion_workers)
 			goto fail_scrub_wr_completion_workers;
 
+		ASSERT(fs_info->scrub_parity_workers == NULL);
 		fs_info->scrub_parity_workers =
 			btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
 					      max_active, 2);
 		if (!fs_info->scrub_parity_workers)
 			goto fail_scrub_parity_workers;
+
+		refcount_set(&fs_info->scrub_workers_refcnt, 1);
+	} else {
+		refcount_inc(&fs_info->scrub_workers_refcnt);
 	}
-	++fs_info->scrub_workers_refcnt;
 	return 0;
 
 fail_scrub_parity_workers:
@@ -3762,16 +3791,6 @@
 	return -ENOMEM;
 }
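scrub_workers_refcnt changes from a bare integer to a refcount_t: the first getter, observed as refcount_read() == 0 while scrub_lock is held, allocates all three workqueues and only then publishes the count with refcount_set(1); later getters simply refcount_inc(). A kernel-context sketch of this first-user-allocates pattern; the struct and names are illustrative:

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct mutex lock;
	refcount_t users;
	struct workqueue_struct *wq;
};

/* Caller must hold ctx->lock, mirroring the lockdep_assert_held() above. */
static int ctx_get(struct my_ctx *ctx)
{
	lockdep_assert_held(&ctx->lock);

	if (refcount_read(&ctx->users) == 0) {
		ctx->wq = alloc_workqueue("my-wq", WQ_UNBOUND, 0);
		if (!ctx->wq)
			return -ENOMEM;
		refcount_set(&ctx->users, 1);	/* publish the first reference */
	} else {
		refcount_inc(&ctx->users);
	}
	return 0;
}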
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
-	if (--fs_info->scrub_workers_refcnt == 0) {
-		btrfs_destroy_workqueue(fs_info->scrub_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
-		btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
-	}
-	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace)
@@ -3779,9 +3798,13 @@
 	struct scrub_ctx *sctx;
 	int ret;
 	struct btrfs_device *dev;
+	unsigned int nofs_flag;
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 
 	if (btrfs_fs_closing(fs_info))
-		return -EINVAL;
+		return -EAGAIN;
 
 	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
 		/*
@@ -3820,13 +3843,18 @@
 		return -EINVAL;
 	}
 
+	/* Allocate outside of device_list_mutex */
+	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
+	if (IS_ERR(sctx))
+		return PTR_ERR(sctx);
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
 		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto out_free_ctx;
 	}
 
 	if (!is_dev_replace && !readonly &&
@@ -3834,7 +3862,8 @@
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
 				rcu_str_deref(dev->name));
-		return -EROFS;
+		ret = -EROFS;
+		goto out_free_ctx;
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3842,34 +3871,29 @@
 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -EIO;
+		ret = -EIO;
+		goto out_free_ctx;
 	}
 
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+	down_read(&fs_info->dev_replace.rwsem);
 	if (dev->scrub_ctx ||
 	    (!is_dev_replace &&
 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
-		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+		up_read(&fs_info->dev_replace.rwsem);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -EINPROGRESS;
+		ret = -EINPROGRESS;
+		goto out_free_ctx;
 	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	up_read(&fs_info->dev_replace.rwsem);
 
 	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return ret;
+		goto out_free_ctx;
 	}
 
-	sctx = scrub_setup_ctx(dev, is_dev_replace);
-	if (IS_ERR(sctx)) {
-		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
-		return PTR_ERR(sctx);
-	}
 	sctx->readonly = readonly;
 	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3882,7 +3906,18 @@
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
 
+	/*
+	 * In order to avoid deadlock with reclaim when there is a transaction
+	 * trying to pause scrub, make sure we use GFP_NOFS for all the
+	 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+	 * invoked by our callees. The pausing request is done when the
+	 * transaction commit starts, and it blocks the transaction until scrub
+	 * is paused (done at specific points at scrub_stripe() or right above
+	 * before incrementing fs_info->scrubs_running).
+	 */
+	nofs_flag = memalloc_nofs_save();
 	if (!is_dev_replace) {
+		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
 		/*
 		 * by holding device list mutex, we can
 		 * kick off writing super in log tree sync.
@@ -3893,8 +3928,8 @@
 	}
 
 	if (!ret)
-		ret = scrub_enumerate_chunks(sctx, dev, start, end,
-					     is_dev_replace);
+		ret = scrub_enumerate_chunks(sctx, dev, start, end);
+	memalloc_nofs_restore(nofs_flag);
 
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
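The comment block above is the heart of this hunk: a transaction commit may block waiting for scrub to pause, so if an allocation inside scrub recursed into filesystem reclaim the two could deadlock. memalloc_nofs_save() applies the GFP_NOFS constraint to the whole task for the duration of the scrub instead of annotating every allocation site. A minimal kernel-context sketch of the scoping pattern:

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_without_fs_reclaim(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	/* Every allocation in this scope implicitly behaves as GFP_NOFS. */
	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	return p;
}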
@@ -3905,14 +3940,34 @@
 	if (progress)
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 
+	if (!is_dev_replace)
+		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+			ret ? "not finished" : "finished", devid, ret);
+
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	scrub_workers_put(fs_info);
+	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+
+		fs_info->scrub_workers = NULL;
+		fs_info->scrub_wr_completion_workers = NULL;
+		fs_info->scrub_parity_workers = NULL;
+	}
 	mutex_unlock(&fs_info->scrub_lock);
 
+	btrfs_destroy_workqueue(scrub_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
+	btrfs_destroy_workqueue(scrub_parity);
 	scrub_put_ctx(sctx);
 
 	return ret;
+
+out_free_ctx:
+	scrub_free_ctx(sctx);
+
+	return ret;
 }
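Together with the removal of scrub_workers_put(), the block above moves workqueue destruction out from under scrub_lock: the last reference holder detaches the three pointers while locked and destroys the queues only after unlocking, so btrfs_destroy_workqueue() can never end up flushing work that itself needs scrub_lock. A generic, standalone sketch of the same detach-then-free pattern:

#include <pthread.h>
#include <stdlib.h>

struct shared {
	pthread_mutex_t lock;
	int refs;
	void *resource;
};

/* Drop a reference; the last holder frees the resource outside the lock. */
static void put_resource(struct shared *s)
{
	void *to_free = NULL;

	pthread_mutex_lock(&s->lock);
	if (--s->refs == 0) {
		to_free = s->resource;	/* detach under the lock */
		s->resource = NULL;
	}
	pthread_mutex_unlock(&s->lock);

	free(to_free);	/* no lock held; free(NULL) is a no-op */
}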
 
 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
@@ -3957,9 +4012,9 @@
 	return 0;
 }
 
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
-			   struct btrfs_device *dev)
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
 {
+	struct btrfs_fs_info *fs_info = dev->fs_info;
 	struct scrub_ctx *sctx;
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3987,7 +4042,7 @@
 	struct scrub_ctx *sctx = NULL;
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (dev)
 		sctx = dev->scrub_ctx;
 	if (sctx)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 84cb6e5..123ac54 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -260,6 +260,21 @@
 	char name[];
 };
 
+#define ADVANCE							1
+#define ADVANCE_ONLY_NEXT					-1
+
+enum btrfs_compare_tree_result {
+	BTRFS_COMPARE_TREE_NEW,
+	BTRFS_COMPARE_TREE_DELETED,
+	BTRFS_COMPARE_TREE_CHANGED,
+	BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
+				  struct btrfs_path *right_path,
+				  struct btrfs_key *key,
+				  enum btrfs_compare_tree_result result,
+				  void *ctx);
+
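These definitions are added here because send.c is now the only user of the tree-compare machinery (btrfs_compare_trees itself appears later in this patch). The typedef is the whole contract: the callback fires once per new, deleted, changed, or identical item. A sketch of a minimal callback matching the typedef; the counting logic is illustrative:

/* Sketch: count how many items differ between two snapshots. */
static int count_changes_cb(struct btrfs_path *left_path,
			    struct btrfs_path *right_path,
			    struct btrfs_key *key,
			    enum btrfs_compare_tree_result result,
			    void *ctx)
{
	unsigned long *nr_changes = ctx;

	if (result != BTRFS_COMPARE_TREE_SAME)
		(*nr_changes)++;
	return 0;	/* a negative return aborts the walk */
}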
 __cold
 static void inconsistent_snapshot_error(struct send_ctx *sctx,
 					enum btrfs_compare_tree_result result,
@@ -686,7 +701,7 @@
 	hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
 	hdr->crc = 0;
 
-	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+	crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 	hdr->crc = cpu_to_le32(crc);
 
 	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -1160,7 +1175,6 @@
 struct backref_ctx {
 	struct send_ctx *sctx;
 
-	struct btrfs_path *path;
 	/* number of total found references */
 	u64 found;
 
@@ -1186,9 +1200,9 @@
 	u64 root = (u64)(uintptr_t)key;
 	struct clone_root *cr = (struct clone_root *)elt;
 
-	if (root < cr->root->objectid)
+	if (root < cr->root->root_key.objectid)
 		return -1;
-	if (root > cr->root->objectid)
+	if (root > cr->root->root_key.objectid)
 		return 1;
 	return 0;
 }
@@ -1198,9 +1212,9 @@
 	struct clone_root *cr1 = (struct clone_root *)e1;
 	struct clone_root *cr2 = (struct clone_root *)e2;
 
-	if (cr1->root->objectid < cr2->root->objectid)
+	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
 		return -1;
-	if (cr1->root->objectid > cr2->root->objectid)
+	if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
 		return 1;
 	return 0;
 }
@@ -1213,8 +1227,6 @@
 {
 	struct backref_ctx *bctx = ctx_;
 	struct clone_root *found;
-	int ret;
-	u64 i_size;
 
 	/* First check if the root is in the list of accepted clone sources */
 	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
@@ -1231,19 +1243,6 @@
 	}
 
 	/*
-	 * There are inodes that have extents that lie behind its i_size. Don't
-	 * accept clones from these extents.
-	 */
-	ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
-			       NULL, NULL, NULL);
-	btrfs_release_path(bctx->path);
-	if (ret < 0)
-		return ret;
-
-	if (offset + bctx->data_offset + bctx->extent_len > i_size)
-		return 0;
-
-	/*
 	 * Make sure we don't consider clones from send_root that are
 	 * behind the current inode/offset.
 	 */
@@ -1319,8 +1318,6 @@
 		goto out;
 	}
 
-	backref_ctx->path = tmp_path;
-
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1693,12 +1690,8 @@
 
 	di = btrfs_lookup_dir_item(NULL, root, path,
 			dir, name, name_len, 0);
-	if (!di) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2242,7 +2235,7 @@
  * inodes "orphan" name instead of the real name and stop. Same with new inodes
  * that were not created yet and overwritten inodes/refs.
  *
- * When do we have have orphan inodes:
+ * When do we have orphan inodes:
  * 1. When an inode is freshly created and thus no valid refs are available yet
  * 2. When a directory lost all it's refs (deleted) but still has dir items
  *    inside which were not processed yet (pending for move/delete). If anyone
@@ -2346,7 +2339,7 @@
 		return -ENOMEM;
 	}
 
-	key.objectid = send_root->objectid;
+	key.objectid = send_root->root_key.objectid;
 	key.type = BTRFS_ROOT_BACKREF_KEY;
 	key.offset = 0;
 
@@ -2362,7 +2355,7 @@
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
-	    key.objectid != send_root->objectid) {
+	    key.objectid != send_root->root_key.objectid) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -3858,7 +3851,7 @@
 		/*
 		 * We may have refs where the parent directory does not exist
 		 * yet. This happens if the parent directories inum is higher
-		 * the the current inum. To handle this case, we create the
+		 * than the current inum. To handle this case, we create the
 		 * parent directory out of order. But we need to check if this
 		 * did already happen before due to other refs in the same dir.
 		 */
@@ -4779,7 +4772,7 @@
 	struct btrfs_key key;
 	pgoff_t index = offset >> PAGE_SHIFT;
 	pgoff_t last_index;
-	unsigned pg_offset = offset & ~PAGE_MASK;
+	unsigned pg_offset = offset_in_page(offset);
 	ssize_t ret = 0;
 
 	key.objectid = sctx->cur_ino;
@@ -4912,8 +4905,8 @@
 
 	btrfs_debug(sctx->send_root->fs_info,
 		    "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
-		    offset, len, clone_root->root->objectid, clone_root->ino,
-		    clone_root->offset);
+		    offset, len, clone_root->root->root_key.objectid,
+		    clone_root->ino, clone_root->offset);
 
 	p = fs_path_alloc();
 	if (!p)
@@ -5021,6 +5014,12 @@
 	if (offset >= sctx->cur_inode_size)
 		return 0;
 
+	/*
+	 * Don't go beyond the inode's i_size due to prealloc extents that start
+	 * after the i_size.
+	 */
+	end = min_t(u64, end, sctx->cur_inode_size);
+
 	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
 		return send_update_extent(sctx, offset, end - offset);
 
@@ -5086,6 +5085,7 @@
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
+	u64 clone_src_i_size = 0;
 
 	/*
 	 * Prevent cloning from a zero offset with a length matching the sector
@@ -5111,6 +5111,16 @@
 		return -ENOMEM;
 
 	/*
+	 * There are inodes that have extents that lie behind its i_size. Don't
+	 * accept clones from these extents.
+	 */
+	ret = __get_inode_info(clone_root->root, path, clone_root->ino,
+			       &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+	btrfs_release_path(path);
+	if (ret < 0)
+		goto out;
+
+	/*
 	 * We can't send a clone operation for the entire range if we find
 	 * extent items in the respective range in the source file that
 	 * refer to different extents or if we find holes.
@@ -5152,6 +5162,7 @@
 		u8 type;
 		u64 ext_len;
 		u64 clone_len;
+		u64 clone_data_offset;
 
 		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(clone_root->root, path);
@@ -5205,13 +5216,73 @@
 		if (key.offset >= clone_root->offset + len)
 			break;
 
+		if (key.offset >= clone_src_i_size)
+			break;
+
+		if (key.offset + ext_len > clone_src_i_size)
+			ext_len = clone_src_i_size - key.offset;
+
+		clone_data_offset = btrfs_file_extent_offset(leaf, ei);
+		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
+			clone_root->offset = key.offset;
+			if (clone_data_offset < data_offset &&
+				clone_data_offset + ext_len > data_offset) {
+				u64 extent_offset;
+
+				extent_offset = data_offset - clone_data_offset;
+				ext_len -= extent_offset;
+				clone_data_offset += extent_offset;
+				clone_root->offset += extent_offset;
+			}
+		}
+
 		clone_len = min_t(u64, ext_len, len);
 
 		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
-		    btrfs_file_extent_offset(leaf, ei) == data_offset)
-			ret = send_clone(sctx, offset, clone_len, clone_root);
-		else
+		    clone_data_offset == data_offset) {
+			const u64 src_end = clone_root->offset + clone_len;
+			const u64 sectorsize = SZ_64K;
+
+			/*
+			 * We can't clone the last block, when its size is not
+			 * sector size aligned, into the middle of a file. If we
+			 * do so, the receiver will get a failure (-EINVAL) when
+			 * trying to clone or will silently corrupt the data in
+			 * the destination file if it's on a kernel without the
+			 * fix introduced by commit ac765f83f1397646
+			 * ("Btrfs: fix data corruption due to cloning of eof
+			 * block").
+			 *
+			 * So issue a clone of the aligned down range plus a
+			 * regular write for the eof block, if we hit that case.
+			 *
+			 * Also, we use the maximum possible sector size, 64K,
+			 * because we don't know the sector size of the
+			 * filesystem that receives the stream, so we have to
+			 * assume the largest possible sector size.
+			 */
+			if (src_end == clone_src_i_size &&
+			    !IS_ALIGNED(src_end, sectorsize) &&
+			    offset + clone_len < sctx->cur_inode_size) {
+				u64 slen;
+
+				slen = ALIGN_DOWN(src_end - clone_root->offset,
+						  sectorsize);
+				if (slen > 0) {
+					ret = send_clone(sctx, offset, slen,
+							 clone_root);
+					if (ret < 0)
+						goto out;
+				}
+				ret = send_extent_data(sctx, offset + slen,
+						       clone_len - slen);
+			} else {
+				ret = send_clone(sctx, offset, clone_len,
+						 clone_root);
+			}
+		} else {
 			ret = send_extent_data(sctx, offset, clone_len);
+		}
 
 		if (ret < 0)
 			goto out;
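The unaligned-eof handling above boils down to simple arithmetic: clone the largest 64K-aligned prefix of the source range, then issue a plain write for the tail. A standalone demonstration; ALIGN_DOWN here is a simplified stand-in for the kernel macro, and the sizes are made up:

#include <stdio.h>
#include <stdint.h>

#define ALIGN_DOWN(x, a)	((x) - ((x) % (a)))

int main(void)
{
	const uint64_t sectorsize = 64 * 1024;	/* worst-case receiver */
	uint64_t clone_offset = 0;
	uint64_t src_end = 200 * 1024;		/* i_size, not 64K aligned */

	uint64_t slen = ALIGN_DOWN(src_end - clone_offset, sectorsize);
	printf("clone %llu bytes, then write %llu bytes\n",
	       (unsigned long long)slen,
	       (unsigned long long)(src_end - clone_offset - slen));
	return 0;	/* clone 196608 bytes, then write 8192 bytes */
}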
@@ -6266,68 +6337,21 @@
 {
 	int ret = 0;
 
-	if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
-		if (result == BTRFS_COMPARE_TREE_CHANGED) {
-			struct extent_buffer *leaf_l;
-			struct extent_buffer *leaf_r;
-			struct btrfs_file_extent_item *ei_l;
-			struct btrfs_file_extent_item *ei_r;
-
-			leaf_l = sctx->left_path->nodes[0];
-			leaf_r = sctx->right_path->nodes[0];
-			ei_l = btrfs_item_ptr(leaf_l,
-					      sctx->left_path->slots[0],
-					      struct btrfs_file_extent_item);
-			ei_r = btrfs_item_ptr(leaf_r,
-					      sctx->right_path->slots[0],
-					      struct btrfs_file_extent_item);
-
-			/*
-			 * We may have found an extent item that has changed
-			 * only its disk_bytenr field and the corresponding
-			 * inode item was not updated. This case happens due to
-			 * very specific timings during relocation when a leaf
-			 * that contains file extent items is COWed while
-			 * relocation is ongoing and its in the stage where it
-			 * updates data pointers. So when this happens we can
-			 * safely ignore it since we know it's the same extent,
-			 * but just at different logical and physical locations
-			 * (when an extent is fully replaced with a new one, we
-			 * know the generation number must have changed too,
-			 * since snapshot creation implies committing the current
-			 * transaction, and the inode item must have been updated
-			 * as well).
-			 * This replacement of the disk_bytenr happens at
-			 * relocation.c:replace_file_extents() through
-			 * relocation.c:btrfs_reloc_cow_block().
-			 */
-			if (btrfs_file_extent_generation(leaf_l, ei_l) ==
-			    btrfs_file_extent_generation(leaf_r, ei_r) &&
-			    btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
-			    btrfs_file_extent_compression(leaf_l, ei_l) ==
-			    btrfs_file_extent_compression(leaf_r, ei_r) &&
-			    btrfs_file_extent_encryption(leaf_l, ei_l) ==
-			    btrfs_file_extent_encryption(leaf_r, ei_r) &&
-			    btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
-			    btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
-			    btrfs_file_extent_type(leaf_l, ei_l) ==
-			    btrfs_file_extent_type(leaf_r, ei_r) &&
-			    btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
-			    btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
-			    btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
-			    btrfs_file_extent_offset(leaf_l, ei_l) ==
-			    btrfs_file_extent_offset(leaf_r, ei_r) &&
-			    btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
-			    btrfs_file_extent_num_bytes(leaf_r, ei_r))
-				return 0;
-		}
-
-		inconsistent_snapshot_error(sctx, result, "extent");
-		return -EIO;
-	}
+	/*
+	 * We have found an extent item that changed without the inode item
+	 * having changed. This can happen either after relocation (where the
+	 * disk_bytenr of an extent item is replaced at
+	 * relocation.c:replace_file_extents()) or after deduplication into a
+	 * file in both the parent and send snapshots (where an extent item can
+	 * get modified or replaced with a new one). Note that deduplication
+	 * updates the inode item, but it only changes the iversion (sequence
+	 * field in the inode item) of the inode, so if a file is deduplicated
+	 * the same number of times in both the parent and send snapshots, its
+	 * iversion becomes the same in both snapshots, hence the inode item is
+	 * the same on both snapshots.
+	 */
+	if (sctx->cur_ino != sctx->cmp_key->objectid)
+		return 0;
 
 	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
 		if (result != BTRFS_COMPARE_TREE_DELETED)
@@ -6505,6 +6529,366 @@
 	return ret;
 }
 
+static int tree_move_down(struct btrfs_path *path, int *level)
+{
+	struct extent_buffer *eb;
+
+	BUG_ON(*level == 0);
+	eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+	if (IS_ERR(eb))
+		return PTR_ERR(eb);
+
+	path->nodes[*level - 1] = eb;
+	path->slots[*level - 1] = 0;
+	(*level)--;
+	return 0;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_path *path,
+				    int *level, int root_level)
+{
+	int ret = 0;
+	int nritems;
+	nritems = btrfs_header_nritems(path->nodes[*level]);
+
+	path->slots[*level]++;
+
+	while (path->slots[*level] >= nritems) {
+		if (*level == root_level)
+			return -1;
+
+		/* move upnext */
+		path->slots[*level] = 0;
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		(*level)++;
+		path->slots[*level]++;
+
+		nritems = btrfs_header_nritems(path->nodes[*level]);
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_path *path,
+			int *level, int root_level,
+			int allow_down,
+			struct btrfs_key *key)
+{
+	int ret;
+
+	if (*level == 0 || !allow_down) {
+		ret = tree_move_next_or_upnext(path, level, root_level);
+	} else {
+		ret = tree_move_down(path, level);
+	}
+	if (ret >= 0) {
+		if (*level == 0)
+			btrfs_item_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+		else
+			btrfs_node_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+	}
+	return ret;
+}
+
+static int tree_compare_item(struct btrfs_path *left_path,
+			     struct btrfs_path *right_path,
+			     char *tmp_buf)
+{
+	int cmp;
+	int len1, len2;
+	unsigned long off1, off2;
+
+	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+	if (len1 != len2)
+		return 1;
+
+	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+				right_path->slots[0]);
+
+	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+	if (cmp)
+		return 1;
+	return 0;
+}
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+static int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t changed_cb, void *ctx)
+{
+	struct btrfs_fs_info *fs_info = left_root->fs_info;
+	int ret;
+	int cmp;
+	struct btrfs_path *left_path = NULL;
+	struct btrfs_path *right_path = NULL;
+	struct btrfs_key left_key;
+	struct btrfs_key right_key;
+	char *tmp_buf = NULL;
+	int left_root_level;
+	int right_root_level;
+	int left_level;
+	int right_level;
+	int left_end_reached;
+	int right_end_reached;
+	int advance_left;
+	int advance_right;
+	u64 left_blockptr;
+	u64 right_blockptr;
+	u64 left_gen;
+	u64 right_gen;
+
+	left_path = btrfs_alloc_path();
+	if (!left_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	right_path = btrfs_alloc_path();
+	if (!right_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+	if (!tmp_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	left_path->search_commit_root = 1;
+	left_path->skip_locking = 1;
+	right_path->search_commit_root = 1;
+	right_path->skip_locking = 1;
+
+	/*
+	 * Strategy: Go to the first items of both trees. Then do
+	 *
+	 * If both trees are at level 0
+	 *   Compare keys of current items
+	 *     If left < right treat left item as new, advance left tree
+	 *       and repeat
+	 *     If left > right treat right item as deleted, advance right tree
+	 *       and repeat
+	 *     If left == right do deep compare of items, treat as changed if
+	 *       needed, advance both trees and repeat
+	 * If both trees are at the same level but not at level 0
+	 *   Compare keys of current nodes/leafs
+	 *     If left < right advance left tree and repeat
+	 *     If left > right advance right tree and repeat
+	 *     If left == right compare blockptrs of the next nodes/leafs
+	 *       If they match advance both trees but stay at the same level
+	 *         and repeat
+	 *       If they don't match advance both trees while allowing to go
+	 *         deeper and repeat
+	 * If tree levels are different
+	 *   Advance the tree that needs it and repeat
+	 *
+	 * Advancing a tree means:
+	 *   If we are at level 0, try to go to the next slot. If that's not
+	 *   possible, go one level up and repeat. Stop when we found a level
+	 *   where we could go to the next slot. We may at this point be on a
+	 *   node or a leaf.
+	 *
+	 *   If we are not at level 0 and not on shared tree blocks, go one
+	 *   level deeper.
+	 *
+	 *   If we are not at level 0 and on shared tree blocks, go one slot to
+	 *   the right if possible or go up and right.
+	 */
+
+	down_read(&fs_info->commit_root_sem);
+	left_level = btrfs_header_level(left_root->commit_root);
+	left_root_level = left_level;
+	left_path->nodes[left_level] =
+			btrfs_clone_extent_buffer(left_root->commit_root);
+	if (!left_path->nodes[left_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	right_level = btrfs_header_level(right_root->commit_root);
+	right_root_level = right_level;
+	right_path->nodes[right_level] =
+			btrfs_clone_extent_buffer(right_root->commit_root);
+	if (!right_path->nodes[right_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
+	up_read(&fs_info->commit_root_sem);
+
+	if (left_level == 0)
+		btrfs_item_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	else
+		btrfs_node_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	if (right_level == 0)
+		btrfs_item_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+	else
+		btrfs_node_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+
+	left_end_reached = right_end_reached = 0;
+	advance_left = advance_right = 0;
+
+	while (1) {
+		cond_resched();
+		if (advance_left && !left_end_reached) {
+			ret = tree_advance(left_path, &left_level,
+					left_root_level,
+					advance_left != ADVANCE_ONLY_NEXT,
+					&left_key);
+			if (ret == -1)
+				left_end_reached = ADVANCE;
+			else if (ret < 0)
+				goto out;
+			advance_left = 0;
+		}
+		if (advance_right && !right_end_reached) {
+			ret = tree_advance(right_path, &right_level,
+					right_root_level,
+					advance_right != ADVANCE_ONLY_NEXT,
+					&right_key);
+			if (ret == -1)
+				right_end_reached = ADVANCE;
+			else if (ret < 0)
+				goto out;
+			advance_right = 0;
+		}
+
+		if (left_end_reached && right_end_reached) {
+			ret = 0;
+			goto out;
+		} else if (left_end_reached) {
+			if (right_level == 0) {
+				ret = changed_cb(left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_right = ADVANCE;
+			continue;
+		} else if (right_end_reached) {
+			if (left_level == 0) {
+				ret = changed_cb(left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_left = ADVANCE;
+			continue;
+		}
+
+		if (left_level == 0 && right_level == 0) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				ret = changed_cb(left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				ret = changed_cb(left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_right = ADVANCE;
+			} else {
+				enum btrfs_compare_tree_result result;
+
+				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+				ret = tree_compare_item(left_path, right_path,
+							tmp_buf);
+				if (ret)
+					result = BTRFS_COMPARE_TREE_CHANGED;
+				else
+					result = BTRFS_COMPARE_TREE_SAME;
+				ret = changed_cb(left_path, right_path,
+						 &left_key, result, ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+				advance_right = ADVANCE;
+			}
+		} else if (left_level == right_level) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				advance_right = ADVANCE;
+			} else {
+				left_blockptr = btrfs_node_blockptr(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_blockptr = btrfs_node_blockptr(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				left_gen = btrfs_node_ptr_generation(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_gen = btrfs_node_ptr_generation(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr &&
+				    left_gen == right_gen) {
+					/*
+					 * As we're on a shared block, don't
+					 * allow to go deeper.
+					 */
+					advance_left = ADVANCE_ONLY_NEXT;
+					advance_right = ADVANCE_ONLY_NEXT;
+				} else {
+					advance_left = ADVANCE;
+					advance_right = ADVANCE;
+				}
+			}
+		} else if (left_level < right_level) {
+			advance_right = ADVANCE;
+		} else {
+			advance_left = ADVANCE;
+		}
+	}
+
+out:
+	btrfs_free_path(left_path);
+	btrfs_free_path(right_path);
+	kvfree(tmp_buf);
+	return ret;
+}
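Putting the pieces together, a caller hands btrfs_compare_trees() two snapshot roots plus a callback such as the count_changes_cb sketch shown after the typedef above (kernel-context sketch; the wrapper is illustrative):

static int count_snapshot_changes(struct btrfs_root *send_root,
				  struct btrfs_root *parent_root,
				  unsigned long *nr_changes)
{
	*nr_changes = 0;
	return btrfs_compare_trees(send_root, parent_root,
				   count_changes_cb, nr_changes);
}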
+
 static int send_subvol(struct send_ctx *sctx)
 {
 	int ret;
@@ -6583,6 +6967,38 @@
 	return btrfs_commit_transaction(trans);
 }
 
+/*
+ * Make sure any existing delalloc is flushed for any root used by a send
+ * operation so that we do not miss any data and we do not race with writeback
+ * finishing and changing a tree while send is using the tree. This could
+ * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
+ * a send operation then uses the subvolume.
+ * After flushing delalloc ensure_commit_roots_uptodate() must be called.
+ */
+static int flush_delalloc_roots(struct send_ctx *sctx)
+{
+	struct btrfs_root *root = sctx->parent_root;
+	int ret;
+	int i;
+
+	if (root) {
+		ret = btrfs_start_delalloc_snapshot(root);
+		if (ret)
+			return ret;
+		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+	}
+
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		root = sctx->clone_roots[i].root;
+		ret = btrfs_start_delalloc_snapshot(root);
+		if (ret)
+			return ret;
+		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+	}
+
+	return 0;
+}
+
 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
 {
 	spin_lock(&root->root_item_lock);
@@ -6598,6 +7014,13 @@
 	spin_unlock(&root->root_item_lock);
 }
 
+static void dedupe_in_progress_warn(const struct btrfs_root *root)
+{
+	btrfs_warn_rl(root->fs_info,
+"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
+		      root->root_key.objectid, root->dedupe_in_progress);
+}
+
 long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 {
 	int ret = 0;
@@ -6621,6 +7044,11 @@
 	 * making it RW. This also protects against deletion.
 	 */
 	spin_lock(&send_root->root_item_lock);
+	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+		dedupe_in_progress_warn(send_root);
+		spin_unlock(&send_root->root_item_lock);
+		return -EAGAIN;
+	}
 	send_root->send_in_progress++;
 	spin_unlock(&send_root->root_item_lock);
 
@@ -6650,7 +7078,7 @@
 		goto out;
 	}
 
-	if (!access_ok(VERIFY_READ, arg->clone_sources,
+	if (!access_ok(arg->clone_sources,
 			sizeof(*arg->clone_sources) *
 			arg->clone_sources_count)) {
 		ret = -EFAULT;
@@ -6755,6 +7183,13 @@
 				ret = -EPERM;
 				goto out;
 			}
+			if (clone_root->dedupe_in_progress) {
+				dedupe_in_progress_warn(clone_root);
+				spin_unlock(&clone_root->root_item_lock);
+				srcu_read_unlock(&fs_info->subvol_srcu, index);
+				ret = -EAGAIN;
+				goto out;
+			}
 			clone_root->send_in_progress++;
 			spin_unlock(&clone_root->root_item_lock);
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6789,6 +7224,13 @@
 			ret = -EPERM;
 			goto out;
 		}
+		if (sctx->parent_root->dedupe_in_progress) {
+			dedupe_in_progress_warn(sctx->parent_root);
+			spin_unlock(&sctx->parent_root->root_item_lock);
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
+			ret = -EAGAIN;
+			goto out;
+		}
 		spin_unlock(&sctx->parent_root->root_item_lock);
 
 		srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6807,13 +7249,31 @@
 			NULL);
 	sort_clone_roots = 1;
 
+	ret = flush_delalloc_roots(sctx);
+	if (ret)
+		goto out;
+
 	ret = ensure_commit_roots_uptodate(sctx);
 	if (ret)
 		goto out;
 
+	mutex_lock(&fs_info->balance_mutex);
+	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+		mutex_unlock(&fs_info->balance_mutex);
+		btrfs_warn_rl(fs_info,
+		"cannot run send because a balance operation is in progress");
+		ret = -EAGAIN;
+		goto out;
+	}
+	fs_info->send_in_progress++;
+	mutex_unlock(&fs_info->balance_mutex);
+
 	current->journal_info = BTRFS_SEND_TRANS_STUB;
 	ret = send_subvol(sctx);
 	current->journal_info = NULL;
+	mutex_lock(&fs_info->balance_mutex);
+	fs_info->send_in_progress--;
+	mutex_unlock(&fs_info->balance_mutex);
 	if (ret < 0)
 		goto out;
 
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
new file mode 100644
index 0000000..e8a4b0e
--- /dev/null
+++ b/fs/btrfs/space-info.c
@@ -0,0 +1,1117 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "space-info.h"
+#include "sysfs.h"
+#include "volumes.h"
+#include "free-space-cache.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "block-group.h"
+
+u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+			  bool may_use_included)
+{
+	ASSERT(s_info);
+	return s_info->bytes_used + s_info->bytes_reserved +
+		s_info->bytes_pinned + s_info->bytes_readonly +
+		(may_use_included ? s_info->bytes_may_use : 0);
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+	struct btrfs_space_info *space_info;
+	int i;
+	int ret;
+
+	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+	if (!space_info)
+		return -ENOMEM;
+
+	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
+				 GFP_KERNEL);
+	if (ret) {
+		kfree(space_info);
+		return ret;
+	}
+
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		INIT_LIST_HEAD(&space_info->block_groups[i]);
+	init_rwsem(&space_info->groups_sem);
+	spin_lock_init(&space_info->lock);
+	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+	init_waitqueue_head(&space_info->wait);
+	INIT_LIST_HEAD(&space_info->ro_bgs);
+	INIT_LIST_HEAD(&space_info->tickets);
+	INIT_LIST_HEAD(&space_info->priority_tickets);
+
+	ret = btrfs_sysfs_add_space_info_type(info, space_info);
+	if (ret)
+		return ret;
+
+	list_add_rcu(&space_info->list, &info->space_info);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		info->data_sinfo = space_info;
+
+	return ret;
+}
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_super_block *disk_super;
+	u64 features;
+	u64 flags;
+	int mixed = 0;
+	int ret;
+
+	disk_super = fs_info->super_copy;
+	if (!btrfs_super_root(disk_super))
+		return -EINVAL;
+
+	features = btrfs_super_incompat_flags(disk_super);
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = 1;
+
+	flags = BTRFS_BLOCK_GROUP_SYSTEM;
+	ret = create_space_info(fs_info, flags);
+	if (ret)
+		goto out;
+
+	if (mixed) {
+		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+		ret = create_space_info(fs_info, flags);
+	} else {
+		flags = BTRFS_BLOCK_GROUP_METADATA;
+		ret = create_space_info(fs_info, flags);
+		if (ret)
+			goto out;
+
+		flags = BTRFS_BLOCK_GROUP_DATA;
+		ret = create_space_info(fs_info, flags);
+	}
+out:
+	return ret;
+}
+
+void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     u64 bytes_readonly,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *found;
+	int factor;
+
+	factor = btrfs_bg_type_to_factor(flags);
+
+	found = btrfs_find_space_info(info, flags);
+	ASSERT(found);
+	spin_lock(&found->lock);
+	found->total_bytes += total_bytes;
+	found->disk_total += total_bytes * factor;
+	found->bytes_used += bytes_used;
+	found->disk_used += bytes_used * factor;
+	found->bytes_readonly += bytes_readonly;
+	if (total_bytes > 0)
+		found->full = 0;
+	btrfs_try_granting_tickets(info, found);
+	spin_unlock(&found->lock);
+	*space_info = found;
+}
+
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+					       u64 flags)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & flags) {
+			rcu_read_unlock();
+			return found;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+	return (global->size << 1);
+}
+
+static int can_overcommit(struct btrfs_fs_info *fs_info,
+			  struct btrfs_space_info *space_info, u64 bytes,
+			  enum btrfs_reserve_flush_enum flush,
+			  bool system_chunk)
+{
+	u64 profile;
+	u64 avail;
+	u64 used;
+	int factor;
+
+	/* Don't overcommit when in mixed mode. */
+	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+		return 0;
+
+	if (system_chunk)
+		profile = btrfs_system_alloc_profile(fs_info);
+	else
+		profile = btrfs_metadata_alloc_profile(fs_info);
+
+	used = btrfs_space_info_used(space_info, true);
+	avail = atomic64_read(&fs_info->free_chunk_space);
+
+	/*
+	 * If we have dup, raid1 or raid10 then only half of the free
+	 * space is actually usable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
+	 */
+	factor = btrfs_bg_type_to_factor(profile);
+	avail = div_u64(avail, factor);
+
+	/*
+	 * If we aren't flushing all things, let us overcommit up to
+	 * half of the space. If we can flush, don't let us overcommit
+	 * too much, let it overcommit up to 1/8 of the space.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
+		avail >>= 3;
+	else
+		avail >>= 1;
+
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;
+	return 0;
+}
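In numbers: for a two-copy metadata profile, can_overcommit() halves the unallocated device space, then allows dipping into half of the result when the caller cannot flush, or only an eighth when a full flush is still available as a fallback. A standalone sketch with made-up sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t free_chunk_space = 8ULL << 30;	/* 8 GiB unallocated */
	int factor = 2;				/* RAID1 keeps two copies */
	uint64_t avail = free_chunk_space / factor;

	uint64_t no_flush = avail >> 1;		/* may overcommit up to 1/2 */
	uint64_t flush_all = avail >> 3;	/* may overcommit up to 1/8 */

	printf("overcommit window: %llu MiB (no flush), %llu MiB (flush all)\n",
	       (unsigned long long)(no_flush >> 20),
	       (unsigned long long)(flush_all >> 20));
	return 0;	/* 2048 MiB and 512 MiB */
}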
+
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+				struct btrfs_space_info *space_info)
+{
+	struct list_head *head;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+
+	lockdep_assert_held(&space_info->lock);
+
+	head = &space_info->priority_tickets;
+again:
+	while (!list_empty(head)) {
+		struct reserve_ticket *ticket;
+		u64 used = btrfs_space_info_used(space_info, true);
+
+		ticket = list_first_entry(head, struct reserve_ticket, list);
+
+		/* Check and see if our ticket can be satisfied now. */
+		if ((used + ticket->bytes <= space_info->total_bytes) ||
+		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
+				   false)) {
+			btrfs_space_info_update_bytes_may_use(fs_info,
+							      space_info,
+							      ticket->bytes);
+			list_del_init(&ticket->list);
+			ticket->bytes = 0;
+			space_info->tickets_id++;
+			wake_up(&ticket->wait);
+		} else {
+			break;
+		}
+	}
+
+	if (head == &space_info->priority_tickets) {
+		head = &space_info->tickets;
+		flush = BTRFS_RESERVE_FLUSH_ALL;
+		goto again;
+	}
+}
+
+#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
+do {									\
+	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
+	spin_lock(&__rsv->lock);					\
+	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
+		   __rsv->size, __rsv->reserved);			\
+	spin_unlock(&__rsv->lock);					\
+} while (0)
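DUMP_BLOCK_RSV leans on two preprocessor idioms: #rsv_name stringifies the member name into the format string, and the do/while (0) wrapper makes the multi-statement body safe in any single-statement context such as an unbraced if. A standalone demonstration of both:

#include <stdio.h>

struct rsv { unsigned long long size, reserved; };
struct fs { struct rsv trans_rsv; };

/* Same idiom: take a member name and print it by name. */
#define DUMP_RSV(fs, name)						\
do {									\
	struct rsv *__r = &(fs)->name;					\
	printf(#name ": size %llu reserved %llu\n",			\
	       __r->size, __r->reserved);				\
} while (0)

int main(void)
{
	struct fs f = { .trans_rsv = { 1024, 512 } };

	DUMP_RSV(&f, trans_rsv);	/* prints "trans_rsv: size 1024 reserved 512" */
	return 0;
}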
+
+static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+				    struct btrfs_space_info *info)
+{
+	lockdep_assert_held(&info->lock);
+
+	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+		   info->flags,
+		   info->total_bytes - btrfs_space_info_used(info, true),
+		   info->full ? "" : "not ");
+	btrfs_info(fs_info,
+		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
+		info->total_bytes, info->bytes_used, info->bytes_pinned,
+		info->bytes_reserved, info->bytes_may_use,
+		info->bytes_readonly);
+
+	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
+}
+
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+			   struct btrfs_space_info *info, u64 bytes,
+			   int dump_block_groups)
+{
+	struct btrfs_block_group_cache *cache;
+	int index = 0;
+
+	spin_lock(&info->lock);
+	__btrfs_dump_space_info(fs_info, info);
+	spin_unlock(&info->lock);
+
+	if (!dump_block_groups)
+		return;
+
+	down_read(&info->groups_sem);
+again:
+	list_for_each_entry(cache, &info->block_groups[index], list) {
+		spin_lock(&cache->lock);
+		btrfs_info(fs_info,
+			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
+			cache->key.objectid, cache->key.offset,
+			btrfs_block_group_used(&cache->item), cache->pinned,
+			cache->reserved, cache->ro ? "[readonly]" : "");
+		btrfs_dump_free_space(cache, bytes);
+		spin_unlock(&cache->lock);
+	}
+	if (++index < BTRFS_NR_RAID_TYPES)
+		goto again;
+	up_read(&info->groups_sem);
+}
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
+					 unsigned long nr_pages, int nr_items)
+{
+	struct super_block *sb = fs_info->sb;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+		up_read(&sb->s_umount);
+	} else {
+		/*
+		 * We needn't worry about the filesystem going from r/w to r/o
+		 * even though we don't acquire the ->s_umount mutex, because
+		 * the filesystem should guarantee that the delalloc inode list
+		 * is empty after the filesystem becomes read-only (all dirty
+		 * pages are written to disk).
+		 */
+		btrfs_start_delalloc_roots(fs_info, nr_items);
+		if (!current->journal_info)
+			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
+	}
+}
+
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+					u64 to_reclaim)
+{
+	u64 bytes;
+	u64 nr;
+
+	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+	nr = div64_u64(to_reclaim, bytes);
+	if (!nr)
+		nr = 1;
+	return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM	SZ_256K
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
+			    u64 orig, bool wait_ordered)
+{
+	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
+	u64 delalloc_bytes;
+	u64 dio_bytes;
+	u64 async_pages;
+	u64 items;
+	long time_left;
+	unsigned long nr_pages;
+	int loops;
+
+	/* Calculate the number of pages to flush for the space reservation */
+	items = calc_reclaim_items_nr(fs_info, to_reclaim);
+	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+	delalloc_bytes = percpu_counter_sum_positive(
+						&fs_info->delalloc_bytes);
+	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+	if (delalloc_bytes == 0 && dio_bytes == 0) {
+		if (trans)
+			return;
+		if (wait_ordered)
+			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+		return;
+	}
+
+	/*
+	 * If we are doing more ordered than delalloc we need to just wait on
+	 * ordered extents, otherwise we'll waste time trying to flush delalloc
+	 * that likely won't give us the space back we need.
+	 */
+	if (dio_bytes > delalloc_bytes)
+		wait_ordered = true;
+
+	loops = 0;
+	while ((delalloc_bytes || dio_bytes) && loops < 3) {
+		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+		/*
+		 * Triggers inode writeback for up to nr_pages. This will invoke
+		 * the ->writepages callback and trigger delalloc filling
+		 * (btrfs_run_delalloc_range()).
+		 */
+		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
+		/*
+		 * We need to wait for the compressed pages to start before
+		 * we continue.
+		 */
+		async_pages = atomic_read(&fs_info->async_delalloc_pages);
+		if (!async_pages)
+			goto skip_async;
+
+		/*
+		 * Calculate how many compressed pages we want to be written
+		 * before we continue. I.e if there are more async pages than we
+		 * require wait_event will wait until nr_pages are written.
+		 */
+		if (async_pages <= nr_pages)
+			async_pages = 0;
+		else
+			async_pages -= nr_pages;
+
+		wait_event(fs_info->async_submit_wait,
+			   atomic_read(&fs_info->async_delalloc_pages) <=
+			   (int)async_pages);
+skip_async:
+		spin_lock(&space_info->lock);
+		if (list_empty(&space_info->tickets) &&
+		    list_empty(&space_info->priority_tickets)) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
+		spin_unlock(&space_info->lock);
+
+		loops++;
+		if (wait_ordered && !trans) {
+			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+		} else {
+			time_left = schedule_timeout_killable(1);
+			if (time_left)
+				break;
+		}
+		delalloc_bytes = percpu_counter_sum_positive(
+						&fs_info->delalloc_bytes);
+		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+	}
+}
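The async_pages arithmetic in shrink_delalloc() waits for only nr_pages of the compressed-page backlog to be written rather than for the whole queue to drain, so pre-existing work queued by other writers is not waited on. In numbers (made up):

#include <stdio.h>

int main(void)
{
	long async_pages = 10000;	/* pages already queued for compression */
	long nr_pages = 3000;		/* our writeback batch */

	/* Wait until nr_pages of the backlog have been written. */
	long target = async_pages > nr_pages ? async_pages - nr_pages : 0;

	printf("wait until async_delalloc_pages <= %ld\n", target);
	return 0;
}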
+
+/**
+ * may_commit_transaction - possibly commit the transaction if it's worth it
+ * @fs_info - fs_info for this fs
+ * @space_info - the space info we need to reserve space in
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_fs_info *fs_info,
+				  struct btrfs_space_info *space_info)
+{
+	struct reserve_ticket *ticket = NULL;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_trans_handle *trans;
+	u64 bytes_needed;
+	u64 reclaim_bytes = 0;
+	u64 cur_free_bytes = 0;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	if (trans)
+		return -EAGAIN;
+
+	spin_lock(&space_info->lock);
+	cur_free_bytes = btrfs_space_info_used(space_info, true);
+	if (cur_free_bytes < space_info->total_bytes)
+		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
+	else
+		cur_free_bytes = 0;
+
+	if (!list_empty(&space_info->priority_tickets))
+		ticket = list_first_entry(&space_info->priority_tickets,
+					  struct reserve_ticket, list);
+	else if (!list_empty(&space_info->tickets))
+		ticket = list_first_entry(&space_info->tickets,
+					  struct reserve_ticket, list);
+	bytes_needed = (ticket) ? ticket->bytes : 0;
+
+	if (bytes_needed > cur_free_bytes)
+		bytes_needed -= cur_free_bytes;
+	else
+		bytes_needed = 0;
+	spin_unlock(&space_info->lock);
+
+	if (!bytes_needed)
+		return 0;
+
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	/*
+	 * See if there is enough pinned space to make this reservation, or if
+	 * we have block groups that are going to be freed, allowing us to
+	 * possibly do a chunk allocation the next loop through.
+	 */
+	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+	    __percpu_counter_compare(&space_info->total_bytes_pinned,
+				     bytes_needed,
+				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+		goto commit;
+
+	/*
+	 * See if there is some space in the delayed insertion reservation for
+	 * this reservation.
+	 */
+	if (space_info != delayed_rsv->space_info)
+		goto enospc;
+
+	spin_lock(&delayed_rsv->lock);
+	reclaim_bytes += delayed_rsv->reserved;
+	spin_unlock(&delayed_rsv->lock);
+
+	spin_lock(&delayed_refs_rsv->lock);
+	reclaim_bytes += delayed_refs_rsv->reserved;
+	spin_unlock(&delayed_refs_rsv->lock);
+	if (reclaim_bytes >= bytes_needed)
+		goto commit;
+	bytes_needed -= reclaim_bytes;
+
+	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes_needed,
+				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+		goto enospc;
+
+commit:
+	return btrfs_commit_transaction(trans);
+enospc:
+	btrfs_end_transaction(trans);
+	return -ENOSPC;
+}
+
+/*
+ * Try to flush some data based on policy set by @state. This is only advisory
+ * and may fail for various reasons. The caller is supposed to examine the
+ * state of @space_info to detect the outcome.
+ */
+static void flush_space(struct btrfs_fs_info *fs_info,
+		       struct btrfs_space_info *space_info, u64 num_bytes,
+		       int state)
+{
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_trans_handle *trans;
+	int nr;
+	int ret = 0;
+
+	switch (state) {
+	case FLUSH_DELAYED_ITEMS_NR:
+	case FLUSH_DELAYED_ITEMS:
+		if (state == FLUSH_DELAYED_ITEMS_NR)
+			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
+		else
+			nr = -1;
+
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = btrfs_run_delayed_items_nr(trans, nr);
+		btrfs_end_transaction(trans);
+		break;
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
+	case FLUSH_DELAYED_REFS_NR:
+	case FLUSH_DELAYED_REFS:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		if (state == FLUSH_DELAYED_REFS_NR)
+			nr = calc_reclaim_items_nr(fs_info, num_bytes);
+		else
+			nr = 0;
+		btrfs_run_delayed_refs(trans, nr);
+		btrfs_end_transaction(trans);
+		break;
+	case ALLOC_CHUNK:
+	case ALLOC_CHUNK_FORCE:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = btrfs_chunk_alloc(trans,
+				btrfs_metadata_alloc_profile(fs_info),
+				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
+					CHUNK_ALLOC_FORCE);
+		btrfs_end_transaction(trans);
+		if (ret > 0 || ret == -ENOSPC)
+			ret = 0;
+		break;
+	case RUN_DELAYED_IPUTS:
+		/*
+		 * If we have pending delayed iputs then we could free up a
+		 * bunch of pinned space, so make sure we run the iputs before
+		 * we do our pinned bytes check below.
+		 */
+		btrfs_run_delayed_iputs(fs_info);
+		btrfs_wait_on_delayed_iputs(fs_info);
+		break;
+	case COMMIT_TRANS:
+		ret = may_commit_transaction(fs_info, space_info);
+		break;
+	default:
+		ret = -ENOSPC;
+		break;
+	}
+
+	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
+				ret);
+	return;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+				 struct btrfs_space_info *space_info,
+				 bool system_chunk)
+{
+	struct reserve_ticket *ticket;
+	u64 used;
+	u64 expected;
+	u64 to_reclaim = 0;
+
+	list_for_each_entry(ticket, &space_info->tickets, list)
+		to_reclaim += ticket->bytes;
+	list_for_each_entry(ticket, &space_info->priority_tickets, list)
+		to_reclaim += ticket->bytes;
+	if (to_reclaim)
+		return to_reclaim;
+
+	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+	if (can_overcommit(fs_info, space_info, to_reclaim,
+			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+		return 0;
+
+	used = btrfs_space_info_used(space_info, true);
+
+	if (can_overcommit(fs_info, space_info, SZ_1M,
+			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+		expected = div_factor_fine(space_info->total_bytes, 95);
+	else
+		expected = div_factor_fine(space_info->total_bytes, 90);
+
+	if (used > expected)
+		to_reclaim = used - expected;
+	else
+		to_reclaim = 0;
+	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+				     space_info->bytes_reserved);
+	return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
+					struct btrfs_space_info *space_info,
+					u64 used, bool system_chunk)
+{
+	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+	/* If we're just plain full then async reclaim just slows us down. */
+	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+		return 0;
+
+	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+					      system_chunk))
+		return 0;
+
+	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+/*
+ * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * @fs_info - fs_info for this fs
+ * @space_info - the space info we were flushing
+ *
+ * We call this when we've exhausted our flushing ability and haven't made
+ * progress in satisfying tickets.  The reservation code handles tickets in
+ * order, so if there is a large ticket first and then smaller ones we could
+ * very well satisfy the smaller tickets.  This will attempt to wake up any
+ * tickets in the list to catch this case.
+ *
+ * This function returns true if it was able to make progress by clearing out
+ * other tickets, or if it stumbles across a ticket that was smaller than the
+ * first ticket.
+ */
+static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
+				   struct btrfs_space_info *space_info)
+{
+	struct reserve_ticket *ticket;
+	u64 tickets_id = space_info->tickets_id;
+	u64 first_ticket_bytes = 0;
+
+	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
+		__btrfs_dump_space_info(fs_info, space_info);
+	}
+
+	while (!list_empty(&space_info->tickets) &&
+	       tickets_id == space_info->tickets_id) {
+		ticket = list_first_entry(&space_info->tickets,
+					  struct reserve_ticket, list);
+
+		/*
+		 * may_commit_transaction will avoid committing the transaction
+		 * if it doesn't feel like the space reclaimed by the commit
+		 * would result in the ticket succeeding.  However if we have a
+		 * smaller ticket in the queue it may be small enough to be
+		 * satisfied by committing the transaction, so if any
+		 * subsequent ticket is smaller than the first ticket go ahead
+		 * and send us back for another loop through the enospc flushing
+		 * code.
+		 */
+		if (first_ticket_bytes == 0)
+			first_ticket_bytes = ticket->bytes;
+		else if (first_ticket_bytes > ticket->bytes)
+			return true;
+
+		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+			btrfs_info(fs_info, "failing ticket with %llu bytes",
+				   ticket->bytes);
+
+		list_del_init(&ticket->list);
+		ticket->error = -ENOSPC;
+		wake_up(&ticket->wait);
+
+		/*
+		 * We're just throwing tickets away, so more flushing may not
+		 * trip over btrfs_try_granting_tickets, so we need to call it
+		 * here to see if we can make progress with the next ticket in
+		 * the list.
+		 */
+		btrfs_try_granting_tickets(fs_info, space_info);
+	}
+	return (tickets_id != space_info->tickets_id);
+}
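+
+/*
+ * A worked example of the ordering logic above: with queued tickets of 16M
+ * and then 1M, the 16M ticket may be unsatisfiable while the 1M one is not.
+ * After failing the 16M ticket, seeing ticket->bytes (1M) smaller than
+ * first_ticket_bytes (16M) makes this return true, which sends the caller
+ * back through the flushing states for the smaller ticket instead of
+ * failing it outright.
+ */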
+
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to.  We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_space_info *space_info;
+	u64 to_reclaim;
+	int flush_state;
+	int commit_cycles = 0;
+	u64 last_tickets_id;
+
+	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+	spin_lock(&space_info->lock);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+						      false);
+	if (!to_reclaim) {
+		space_info->flush = 0;
+		spin_unlock(&space_info->lock);
+		return;
+	}
+	last_tickets_id = space_info->tickets_id;
+	spin_unlock(&space_info->lock);
+
+	flush_state = FLUSH_DELAYED_ITEMS_NR;
+	do {
+		flush_space(fs_info, space_info, to_reclaim, flush_state);
+		spin_lock(&space_info->lock);
+		if (list_empty(&space_info->tickets)) {
+			space_info->flush = 0;
+			spin_unlock(&space_info->lock);
+			return;
+		}
+		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+							      space_info,
+							      false);
+		if (last_tickets_id == space_info->tickets_id) {
+			flush_state++;
+		} else {
+			last_tickets_id = space_info->tickets_id;
+			flush_state = FLUSH_DELAYED_ITEMS_NR;
+			if (commit_cycles)
+				commit_cycles--;
+		}
+
+		/*
+		 * We don't want to force a chunk allocation until we've tried
+		 * pretty hard to reclaim space.  Think of the case where we
+		 * freed up a bunch of space and so have a lot of pinned space
+		 * to reclaim.  We would rather use that than possibly create an
+		 * underutilized metadata chunk.  So if this is our first run
+		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+		 * commit the transaction.  If nothing has changed the next go
+		 * around then we can force a chunk allocation.
+		 */
+		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+			flush_state++;
+
+		if (flush_state > COMMIT_TRANS) {
+			commit_cycles++;
+			if (commit_cycles > 2) {
+				if (maybe_fail_all_tickets(fs_info, space_info)) {
+					flush_state = FLUSH_DELAYED_ITEMS_NR;
+					commit_cycles--;
+				} else {
+					space_info->flush = 0;
+				}
+			} else {
+				flush_state = FLUSH_DELAYED_ITEMS_NR;
+			}
+		}
+		spin_unlock(&space_info->lock);
+	} while (flush_state <= COMMIT_TRANS);
+}
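+
+/*
+ * For reference, the worker above steps through btrfs_flush_state in enum
+ * order, which (as declared in ctree.h at this point in the series) runs:
+ * FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, FLUSH_DELAYED_REFS_NR,
+ * FLUSH_DELAYED_REFS, FLUSH_DELALLOC, FLUSH_DELALLOC_WAIT, ALLOC_CHUNK,
+ * ALLOC_CHUNK_FORCE, RUN_DELAYED_IPUTS, COMMIT_TRANS.  The state machine
+ * restarts from FLUSH_DELAYED_ITEMS_NR whenever the ticket queue makes
+ * progress (tickets_id changes).
+ */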
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static const enum btrfs_flush_state priority_flush_states[] = {
+	FLUSH_DELAYED_ITEMS_NR,
+	FLUSH_DELAYED_ITEMS,
+	ALLOC_CHUNK,
+};
+
+static const enum btrfs_flush_state evict_flush_states[] = {
+	FLUSH_DELAYED_ITEMS_NR,
+	FLUSH_DELAYED_ITEMS,
+	FLUSH_DELAYED_REFS_NR,
+	FLUSH_DELAYED_REFS,
+	FLUSH_DELALLOC,
+	FLUSH_DELALLOC_WAIT,
+	ALLOC_CHUNK,
+	COMMIT_TRANS,
+};
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+				struct btrfs_space_info *space_info,
+				struct reserve_ticket *ticket,
+				const enum btrfs_flush_state *states,
+				int states_nr)
+{
+	u64 to_reclaim;
+	int flush_state;
+
+	spin_lock(&space_info->lock);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+						      false);
+	if (!to_reclaim) {
+		spin_unlock(&space_info->lock);
+		return;
+	}
+	spin_unlock(&space_info->lock);
+
+	flush_state = 0;
+	do {
+		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
+		flush_state++;
+		spin_lock(&space_info->lock);
+		if (ticket->bytes == 0) {
+			spin_unlock(&space_info->lock);
+			return;
+		}
+		spin_unlock(&space_info->lock);
+	} while (flush_state < states_nr);
+}
+
+static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+				struct btrfs_space_info *space_info,
+				struct reserve_ticket *ticket)
+{
+	DEFINE_WAIT(wait);
+	int ret = 0;
+
+	spin_lock(&space_info->lock);
+	while (ticket->bytes > 0 && ticket->error == 0) {
+		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+		if (ret) {
+			/*
+			 * Delete us from the list. After we unlock the space
+			 * info, we don't want the async reclaim job to reserve
+			 * space for this ticket. If that would happen, then the
+			 * ticket's task would not know that space was reserved
+			 * despite getting an error, resulting in a space leak
+			 * (bytes_may_use counter of our space_info).
+			 */
+			list_del_init(&ticket->list);
+			ticket->error = -EINTR;
+			break;
+		}
+		spin_unlock(&space_info->lock);
+
+		schedule();
+
+		finish_wait(&ticket->wait, &wait);
+		spin_lock(&space_info->lock);
+	}
+	spin_unlock(&space_info->lock);
+}
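+
+/*
+ * Note on the wait loop above: with TASK_KILLABLE, prepare_to_wait_event()
+ * returns -ERESTARTSYS once a fatal signal is pending, which is the only
+ * way 'ret' becomes nonzero here and the ticket gets failed with -EINTR.
+ */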
+
+/**
+ * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
+ * @fs_info - the fs
+ * @space_info - the space_info for the reservation
+ * @ticket - the ticket for the reservation
+ * @flush - how much we can flush
+ *
+ * This does the work of figuring out how to flush for the ticket, waiting for
+ * the reservation, and returning the appropriate error if there is one.
+ */
+static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
+				 struct btrfs_space_info *space_info,
+				 struct reserve_ticket *ticket,
+				 enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	switch (flush) {
+	case BTRFS_RESERVE_FLUSH_ALL:
+		wait_reserve_ticket(fs_info, space_info, ticket);
+		break;
+	case BTRFS_RESERVE_FLUSH_LIMIT:
+		priority_reclaim_metadata_space(fs_info, space_info, ticket,
+						priority_flush_states,
+						ARRAY_SIZE(priority_flush_states));
+		break;
+	case BTRFS_RESERVE_FLUSH_EVICT:
+		priority_reclaim_metadata_space(fs_info, space_info, ticket,
+						evict_flush_states,
+						ARRAY_SIZE(evict_flush_states));
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	spin_lock(&space_info->lock);
+	ret = ticket->error;
+	if (ticket->bytes || ticket->error) {
+		/*
+		 * Need to delete here for priority tickets. For regular tickets
+		 * either the async reclaim job deletes the ticket from the list
+		 * or we delete it ourselves at wait_reserve_ticket().
+		 */
+		list_del_init(&ticket->list);
+		if (!ret)
+			ret = -ENOSPC;
+	}
+	spin_unlock(&space_info->lock);
+	ASSERT(list_empty(&ticket->list));
+	/*
+	 * Check that we can't have an error set if the reservation succeeded,
+	 * as that would confuse tasks and lead them to error out without
+	 * releasing reserved space (if an error happens the expectation is that
+	 * space wasn't reserved at all).
+	 */
+	ASSERT(!(ticket->bytes == 0 && ticket->error));
+	return ret;
+}
+
+/**
+ * __reserve_metadata_bytes - try to reserve bytes from the space_info
+ * @fs_info - the filesystem
+ * @space_info - the space info we want to allocate from
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ * @system_chunk - whether this reservation is on behalf of the chunk root
+ *
+ * This will reserve orig_bytes number of bytes from the space info.  If there
+ * is not enough space it will make an attempt to flush out space to make room.
+ * It will do this by flushing delalloc if possible or committing the
+ * transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain
+ * reservations will be made and this will fail if there is not enough space
+ * already.
+ */
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
+				    struct btrfs_space_info *space_info,
+				    u64 orig_bytes,
+				    enum btrfs_reserve_flush_enum flush,
+				    bool system_chunk)
+{
+	struct reserve_ticket ticket;
+	u64 used;
+	int ret = 0;
+	bool pending_tickets;
+
+	ASSERT(orig_bytes);
+	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+
+	spin_lock(&space_info->lock);
+	ret = -ENOSPC;
+	used = btrfs_space_info_used(space_info, true);
+	pending_tickets = !list_empty(&space_info->tickets) ||
+		!list_empty(&space_info->priority_tickets);
+
+	/*
+	 * Carry on if we have enough space (short-circuit) OR call
+	 * can_overcommit() to ensure we can overcommit to continue.
+	 */
+	if (!pending_tickets &&
+	    ((used + orig_bytes <= space_info->total_bytes) ||
+	     can_overcommit(fs_info, space_info, orig_bytes, flush,
+			   system_chunk))) {
+		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+						      orig_bytes);
+		ret = 0;
+	}
+
+	/*
+	 * If we couldn't make a reservation then setup our reservation ticket
+	 * and kick the async worker if it's not already running.
+	 *
+	 * If we are a priority flusher then we just need to add our ticket to
+	 * the list and we will do our own flushing further down.
+	 */
+	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		ticket.bytes = orig_bytes;
+		ticket.error = 0;
+		init_waitqueue_head(&ticket.wait);
+		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+			list_add_tail(&ticket.list, &space_info->tickets);
+			if (!space_info->flush) {
+				space_info->flush = 1;
+				trace_btrfs_trigger_flush(fs_info,
+							  space_info->flags,
+							  orig_bytes, flush,
+							  "enospc");
+				queue_work(system_unbound_wq,
+					   &fs_info->async_reclaim_work);
+			}
+		} else {
+			list_add_tail(&ticket.list,
+				      &space_info->priority_tickets);
+		}
+	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		used += orig_bytes;
+		/*
+		 * We will do the space reservation dance during log replay,
+		 * which means we won't have fs_info->fs_root set, so don't do
+		 * the async reclaim as we will panic.
+		 */
+		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
+		    need_do_async_reclaim(fs_info, space_info,
+					  used, system_chunk) &&
+		    !work_busy(&fs_info->async_reclaim_work)) {
+			trace_btrfs_trigger_flush(fs_info, space_info->flags,
+						  orig_bytes, flush, "preempt");
+			queue_work(system_unbound_wq,
+				   &fs_info->async_reclaim_work);
+		}
+	}
+	spin_unlock(&space_info->lock);
+	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+		return ret;
+
+	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
+}
+
+/**
+ * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is
+ * BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be
+ * made and this will fail if there is not enough space already.
+ */
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+				 struct btrfs_block_rsv *block_rsv,
+				 u64 orig_bytes,
+				 enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	int ret;
+	bool system_chunk = (root == fs_info->chunk_root);
+
+	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
+				       orig_bytes, flush, system_chunk);
+	if (ret == -ENOSPC &&
+	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+		if (block_rsv != global_rsv &&
+		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
+			ret = 0;
+	}
+	if (ret == -ENOSPC) {
+		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+					      block_rsv->space_info->flags,
+					      orig_bytes, 1);
+
+		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+			btrfs_dump_space_info(fs_info, block_rsv->space_info,
+					      orig_bytes, 0);
+	}
+	return ret;
+}
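+
+/*
+ * A minimal usage sketch (hypothetical caller, mirroring what helpers such
+ * as btrfs_block_rsv_add() do):
+ *
+ *	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes,
+ *					   BTRFS_RESERVE_FLUSH_ALL);
+ *	if (!ret)
+ *		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
+ *	return ret;
+ */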
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
new file mode 100644
index 0000000..8867e84
--- /dev/null
+++ b/fs/btrfs/space-info.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SPACE_INFO_H
+#define BTRFS_SPACE_INFO_H
+
+struct btrfs_space_info {
+	spinlock_t lock;
+
+	u64 total_bytes;	/* total bytes in the space,
+				   this doesn't take mirrors into account */
+	u64 bytes_used;		/* total bytes used,
+				   this doesn't take mirrors into account */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc/allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	u64 max_extent_size;	/* This will hold the maximum extent size of
+				   the space info if we had an ENOSPC in the
+				   allocator. */
+
+	unsigned int full:1;	/* indicates that we cannot allocate any more
+				   chunks for this space */
+	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
+
+	unsigned int flush:1;		/* set if we are trying to make space */
+
+	unsigned int force_alloc;	/* set if we need to force a chunk
+					   alloc for this space */
+
+	u64 disk_used;		/* total bytes used on disk */
+	u64 disk_total;		/* total bytes on disk, takes mirrors into
+				   account */
+
+	u64 flags;
+
+	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter.  However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed every time we
+	 * call btrfs_free_extent so it is a realtime count of what will be
+	 * freed once the transaction is committed.  It will be zeroed every
+	 * time the transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	struct list_head list;
+	/* Protected by the spinlock 'lock'. */
+	struct list_head ro_bgs;
+	struct list_head priority_tickets;
+	struct list_head tickets;
+	/*
+	 * tickets_id just indicates the next ticket to be handled; note that
+	 * it's not stored per ticket.
+	 */
+	u64 tickets_id;
+
+	struct rw_semaphore groups_sem;
+	/* for block groups in our same type */
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+	wait_queue_head_t wait;
+
+	struct kobject kobj;
+	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+};
+
+struct reserve_ticket {
+	u64 bytes;
+	int error;
+	struct list_head list;
+	wait_queue_head_t wait;
+};
+
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
+/*
+ * Declare a helper function that updates the given space info member and
+ * detects (warns on) underflow.
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name, trace_name)			\
+static inline void							\
+btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info,		\
+			       struct btrfs_space_info *sinfo,		\
+			       s64 bytes)				\
+{									\
+	const u64 abs_bytes = (bytes < 0) ? -bytes : bytes;		\
+	lockdep_assert_held(&sinfo->lock);				\
+	trace_update_##name(fs_info, sinfo, sinfo->name, bytes);	\
+	trace_btrfs_space_reservation(fs_info, trace_name,		\
+				      sinfo->flags, abs_bytes,		\
+				      bytes > 0);			\
+	if (bytes < 0 && sinfo->name < -bytes) {			\
+		WARN_ON(1);						\
+		sinfo->name = 0;					\
+		return;							\
+	}								\
+	sinfo->name += bytes;						\
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
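+
+/*
+ * For illustration, the first declaration above expands to roughly:
+ *
+ *	static inline void btrfs_space_info_update_bytes_may_use(
+ *			struct btrfs_fs_info *fs_info,
+ *			struct btrfs_space_info *sinfo, s64 bytes)
+ *	{
+ *		lockdep_assert_held(&sinfo->lock);
+ *		(emit the update and reservation tracepoints)
+ *		if (bytes < 0 && sinfo->bytes_may_use < -bytes) {
+ *			WARN_ON(1);
+ *			sinfo->bytes_may_use = 0;
+ *			return;
+ *		}
+ *		sinfo->bytes_may_use += bytes;
+ *	}
+ */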
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     u64 bytes_readonly,
+			     struct btrfs_space_info **space_info);
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+					       u64 flags);
+u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+			  bool may_use_included);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+			   struct btrfs_space_info *info, u64 bytes,
+			   int dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+				 struct btrfs_block_rsv *block_rsv,
+				 u64 orig_bytes,
+				 enum btrfs_reserve_flush_enum flush);
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+				struct btrfs_space_info *space_info);
+
+static inline void btrfs_space_info_free_bytes_may_use(
+				struct btrfs_fs_info *fs_info,
+				struct btrfs_space_info *space_info,
+				u64 num_bytes)
+{
+	spin_lock(&space_info->lock);
+	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
+	btrfs_try_granting_tickets(fs_info, space_info);
+	spin_unlock(&space_info->lock);
+}
+
+#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 4c13b73..73f7987 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -33,6 +33,8 @@
  *
  * The extent buffer api is used to do the page spanning work required to
  * have a metadata blocksize different from the page size.
+ *
+ * There are 2 variants defined, one with a token pointer and one without.
  */
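+/*
+ * A rough usage sketch of the token variant (the accessors themselves are
+ * generated in ctree.h, so treat the names and signatures here as
+ * illustrative): a token caches the last mapped page, letting repeated
+ * accesses to the same extent buffer page skip the mapping step:
+ *
+ *	struct btrfs_map_token token;
+ *
+ *	btrfs_init_map_token(&token, eb);
+ *	offset = btrfs_token_item_offset(eb, item, &token);
+ *	size = btrfs_token_item_size(eb, item, &token);
+ */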
 
 #define DEFINE_BTRFS_SETGET_BITS(bits)					\
@@ -50,8 +52,10 @@
 	int size = sizeof(u##bits);					\
 	u##bits res;							\
 									\
-	if (token && token->kaddr && token->offset <= offset &&		\
-	    token->eb == eb &&						\
+	ASSERT(token);							\
+	ASSERT(token->eb == eb);					\
+									\
+	if (token->kaddr && token->offset <= offset &&			\
 	   (token->offset + PAGE_SIZE >= offset + size)) {	\
 		kaddr = token->kaddr;					\
 		p = kaddr + part_offset - token->offset;		\
@@ -68,11 +72,33 @@
 	}								\
 	p = kaddr + part_offset - map_start;				\
 	res = get_unaligned_le##bits(p + off);				\
-	if (token) {							\
-		token->kaddr = kaddr;					\
-		token->offset = map_start;				\
-		token->eb = eb;						\
+	token->kaddr = kaddr;						\
+	token->offset = map_start;					\
+	return res;							\
+}									\
+u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
+			 const void *ptr, unsigned long off)		\
+{									\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+	u##bits res;							\
+									\
+	err = map_private_extent_buffer(eb, offset, size,		\
+					&kaddr, &map_start, &map_len);	\
+	if (err) {							\
+		__le##bits leres;					\
+									\
+		read_extent_buffer(eb, &leres, offset, size);		\
+		return le##bits##_to_cpu(leres);			\
 	}								\
+	p = kaddr + part_offset - map_start;				\
+	res = get_unaligned_le##bits(p + off);				\
 	return res;							\
 }									\
 void btrfs_set_token_##bits(struct extent_buffer *eb,			\
@@ -89,8 +115,10 @@
 	unsigned long map_len;						\
 	int size = sizeof(u##bits);					\
 									\
-	if (token && token->kaddr && token->offset <= offset &&		\
-	    token->eb == eb &&						\
+	ASSERT(token);							\
+	ASSERT(token->eb == eb);					\
+									\
+	if (token->kaddr && token->offset <= offset &&			\
 	   (token->offset + PAGE_SIZE >= offset + size)) {	\
 		kaddr = token->kaddr;					\
 		p = kaddr + part_offset - token->offset;		\
@@ -108,11 +136,32 @@
 	}								\
 	p = kaddr + part_offset - map_start;				\
 	put_unaligned_le##bits(val, p + off);				\
-	if (token) {							\
-		token->kaddr = kaddr;					\
-		token->offset = map_start;				\
-		token->eb = eb;						\
+	token->kaddr = kaddr;						\
+	token->offset = map_start;					\
+}									\
+void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,		\
+		      unsigned long off, u##bits val)			\
+{									\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+									\
+	err = map_private_extent_buffer(eb, offset, size,		\
+			&kaddr, &map_start, &map_len);			\
+	if (err) {							\
+		__le##bits val2;					\
+									\
+		val2 = cpu_to_le##bits(val);				\
+		write_extent_buffer(eb, &val2, offset, size);		\
+		return;							\
 	}								\
+	p = kaddr + part_offset - map_start;				\
+	put_unaligned_le##bits(val, p + off);				\
 }
 
 DEFINE_BTRFS_SETGET_BITS(8)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8ad1458..1b151af 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,7 +42,10 @@
 #include "dev-replace.h"
 #include "free-space-cache.h"
 #include "backref.h"
+#include "space-info.h"
+#include "sysfs.h"
 #include "tests/btrfs-tests.h"
+#include "block-group.h"
 
 #include "qgroup.h"
 #define CREATE_TRACE_POINTS
@@ -93,7 +96,7 @@
 
 /*
  * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the approciate error response.
+ * invokes the appropriate error response.
  */
 __cold
 void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
@@ -151,7 +154,7 @@
 	 * although there is no way to update the progress. It would add the
 	 * risk of a deadlock, therefore the canceling is omitted. The only
 	 * penalty is that some I/O remains active until the procedure
-	 * completes. The next time when the filesystem is mounted writeable
+	 * completes. The next time when the filesystem is mounted writable
 	 * again, the device replace operation continues.
 	 */
 }
@@ -529,7 +532,9 @@
 				if (token != Opt_compress &&
 				    token != Opt_compress_force)
 					info->compress_level =
-					  btrfs_compress_str2level(args[0].from);
+					  btrfs_compress_str2level(
+							BTRFS_COMPRESS_ZLIB,
+							args[0].from + 4);
 				btrfs_set_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, NODATACOW);
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +547,13 @@
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 				no_compress = 0;
-			} else if (strcmp(args[0].from, "zstd") == 0) {
+			} else if (strncmp(args[0].from, "zstd", 4) == 0) {
 				compress_type = "zstd";
 				info->compress_type = BTRFS_COMPRESS_ZSTD;
+				info->compress_level =
+					btrfs_compress_str2level(
+							 BTRFS_COMPRESS_ZSTD,
+							 args[0].from + 4);
 				btrfs_set_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, NODATACOW);
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -1394,7 +1403,7 @@
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
-				   const char *device_name, struct vfsmount *mnt)
+				   struct vfsmount *mnt)
 {
 	struct dentry *root;
 	int ret;
@@ -1458,56 +1467,6 @@
 	return root;
 }
 
-static int parse_security_options(char *orig_opts,
-				  struct security_mnt_opts *sec_opts)
-{
-	char *secdata = NULL;
-	int ret = 0;
-
-	secdata = alloc_secdata();
-	if (!secdata)
-		return -ENOMEM;
-	ret = security_sb_copy_data(orig_opts, secdata);
-	if (ret) {
-		free_secdata(secdata);
-		return ret;
-	}
-	ret = security_sb_parse_opts_str(secdata, sec_opts);
-	free_secdata(secdata);
-	return ret;
-}
-
-static int setup_security_options(struct btrfs_fs_info *fs_info,
-				  struct super_block *sb,
-				  struct security_mnt_opts *sec_opts)
-{
-	int ret = 0;
-
-	/*
-	 * Call security_sb_set_mnt_opts() to check whether new sec_opts
-	 * is valid.
-	 */
-	ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
-	if (ret)
-		return ret;
-
-#ifdef CONFIG_SECURITY
-	if (!fs_info->security_opts.num_mnt_opts) {
-		/* first time security setup, copy sec_opts to fs_info */
-		memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
-	} else {
-		/*
-		 * Since SELinux (the only one supporting security_mnt_opts)
-		 * does NOT support changing context during remount/mount of
-		 * the same sb, this must be the same or part of the same
-		 * security options, just free it.
-		 */
-		security_free_mnt_opts(sec_opts);
-	}
-#endif
-	return ret;
-}
-
 /*
  * Find a superblock for the given device / mount point.
  *
@@ -1522,16 +1481,15 @@
 	struct btrfs_device *device = NULL;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	struct btrfs_fs_info *fs_info = NULL;
-	struct security_mnt_opts new_sec_opts;
+	void *new_sec_opts = NULL;
 	fmode_t mode = FMODE_READ;
 	int error = 0;
 
 	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
-	security_init_mnt_opts(&new_sec_opts);
 	if (data) {
-		error = parse_security_options(data, &new_sec_opts);
+		error = security_sb_eat_lsm_opts(data, &new_sec_opts);
 		if (error)
 			return ERR_PTR(error);
 	}
@@ -1550,7 +1508,6 @@
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
 	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
-	security_init_mnt_opts(&fs_info->security_opts);
 	if (!fs_info->super_copy || !fs_info->super_for_commit) {
 		error = -ENOMEM;
 		goto error_fs_info;
@@ -1599,18 +1556,16 @@
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 		btrfs_sb(s)->bdev_holder = fs_type;
+		if (!strstr(crc32c_impl(), "generic"))
+			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
 		error = btrfs_fill_super(s, fs_devices, data);
 	}
+	if (!error)
+		error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
+	security_free_mnt_opts(&new_sec_opts);
 	if (error) {
 		deactivate_locked_super(s);
-		goto error_sec_opts;
-	}
-
-	fs_info = btrfs_sb(s);
-	error = setup_security_options(fs_info, s, &new_sec_opts);
-	if (error) {
-		deactivate_locked_super(s);
-		goto error_sec_opts;
+		return ERR_PTR(error);
 	}
 
 	return dget(s->s_root);
@@ -1651,14 +1606,10 @@
 {
 	struct vfsmount *mnt_root;
 	struct dentry *root;
-	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
 	int error = 0;
 
-	if (!(flags & SB_RDONLY))
-		mode |= FMODE_WRITE;
-
 	error = btrfs_parse_subvol_options(data, &subvol_name,
 					&subvol_objectid);
 	if (error) {
@@ -1677,6 +1628,7 @@
 				flags | SB_RDONLY, device_name, data);
 			if (IS_ERR(mnt_root)) {
 				root = ERR_CAST(mnt_root);
+				kfree(subvol_name);
 				goto out;
 			}
 
@@ -1686,17 +1638,19 @@
 			if (error < 0) {
 				root = ERR_PTR(error);
 				mntput(mnt_root);
+				kfree(subvol_name);
 				goto out;
 			}
 		}
 	}
 	if (IS_ERR(mnt_root)) {
 		root = ERR_CAST(mnt_root);
+		kfree(subvol_name);
 		goto out;
 	}
 
 	/* mount_subvol() will free subvol_name and mnt_root */
-	root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root);
+	root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
 
 out:
 	return root;
@@ -1779,18 +1733,14 @@
 	btrfs_remount_prepare(fs_info);
 
 	if (data) {
-		struct security_mnt_opts new_sec_opts;
+		void *new_sec_opts = NULL;
 
-		security_init_mnt_opts(&new_sec_opts);
-		ret = parse_security_options(data, &new_sec_opts);
+		ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
+		if (!ret)
+			ret = security_sb_remount(sb, new_sec_opts);
+		security_free_mnt_opts(&new_sec_opts);
 		if (ret)
 			goto restore;
-		ret = setup_security_options(fs_info, sb,
-					     &new_sec_opts);
-		if (ret) {
-			security_free_mnt_opts(&new_sec_opts);
-			goto restore;
-		}
 	}
 
 	ret = btrfs_parse_options(fs_info, data, *flags);
@@ -1848,7 +1798,7 @@
 
 		if (!btrfs_check_rw_degradable(fs_info, NULL)) {
 			btrfs_warn(fs_info,
-				"too many missing devices, writeable remount is not allowed");
+		"too many missing devices, writable remount is not allowed");
 			ret = -EACCES;
 			goto restore;
 		}
@@ -1916,7 +1866,7 @@
 }
 
 /* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
 				       const void *dev_info2)
 {
 	if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1945,18 +1895,18 @@
  * The helper to calc the free space on the devices that can be used to store
  * file data.
  */
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
-				       u64 *free_bytes)
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+					      u64 *free_bytes)
 {
 	struct btrfs_device_info *devices_info;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
-	u64 skip_space;
 	u64 type;
 	u64 avail_space;
 	u64 min_stripe_size;
-	int min_stripes = 1, num_stripes = 1;
+	int num_stripes = 1;
 	int i = 0, nr_devices;
+	const struct btrfs_raid_attr *rattr;
 
 	/*
 	 * We aren't under the device list lock, so this is racy-ish, but good
@@ -1980,21 +1930,17 @@
 
 	/* calc min stripe number for data space allocation */
 	type = btrfs_data_alloc_profile(fs_info);
-	if (type & BTRFS_BLOCK_GROUP_RAID0) {
-		min_stripes = 2;
-		num_stripes = nr_devices;
-	} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
-		min_stripes = 2;
-		num_stripes = 2;
-	} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
-		min_stripes = 4;
-		num_stripes = 4;
-	}
+	rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
 
-	if (type & BTRFS_BLOCK_GROUP_DUP)
-		min_stripe_size = 2 * BTRFS_STRIPE_LEN;
-	else
-		min_stripe_size = BTRFS_STRIPE_LEN;
+	if (type & BTRFS_BLOCK_GROUP_RAID0)
+		num_stripes = nr_devices;
+	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		num_stripes = 2;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		num_stripes = 4;
+
+	/* Adjust for more than 1 stripe per device */
+	min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
@@ -2010,28 +1956,21 @@
 		avail_space = device->total_bytes - device->bytes_used;
 
 		/* align with stripe_len */
-		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
-		avail_space *= BTRFS_STRIPE_LEN;
+		avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
 
 		/*
 		 * In order to avoid overwriting the superblock on the drive,
 		 * btrfs starts at an offset of at least 1MB when doing chunk
 		 * allocation.
+		 *
+		 * This ensures we have at least min_stripe_size free space
+		 * after excluding 1MB.
 		 */
-		skip_space = SZ_1M;
-
-		/*
-		 * we can use the free space in [0, skip_space - 1], subtract
-		 * it from the total.
-		 */
-		if (avail_space && avail_space >= skip_space)
-			avail_space -= skip_space;
-		else
-			avail_space = 0;
-
-		if (avail_space < min_stripe_size)
+		if (avail_space <= SZ_1M + min_stripe_size)
 			continue;
 
+		avail_space -= SZ_1M;
+
 		devices_info[i].dev = device;
 		devices_info[i].max_avail = avail_space;
 
@@ -2045,9 +1984,8 @@
 
 	i = nr_devices - 1;
 	avail_space = 0;
-	while (nr_devices >= min_stripes) {
-		if (num_stripes > nr_devices)
-			num_stripes = nr_devices;
+	while (nr_devices >= rattr->devs_min) {
+		num_stripes = min(num_stripes, nr_devices);
 
 		if (devices_info[i].max_avail >= min_stripe_size) {
 			int j;
@@ -2090,7 +2028,7 @@
 	u64 total_free_data = 0;
 	u64 total_free_meta = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
-	__be32 *fsid = (__be32 *)fs_info->fsid;
+	__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
 	unsigned factor = 1;
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	int ret;
@@ -2177,8 +2115,10 @@
 	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
 	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
 	/* Mask in the root object ID too, to disambiguate subvols */
-	buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
-	buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+	buf->f_fsid.val[0] ^=
+		BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+	buf->f_fsid.val[1] ^=
+		BTRFS_I(d_inode(dentry))->root->root_key.objectid;
 
 	return 0;
 }
@@ -2245,6 +2185,9 @@
 		ret = PTR_ERR_OR_ZERO(device);
 		mutex_unlock(&uuid_mutex);
 		break;
+	case BTRFS_IOC_FORGET_DEV:
+		ret = btrfs_forget_devices(vol->name);
+		break;
 	case BTRFS_IOC_DEVICES_READY:
 		mutex_lock(&uuid_mutex);
 		device = btrfs_scan_one_device(vol->name, FMODE_READ,
@@ -2310,7 +2253,7 @@
 	 * device_list_mutex here as we only read the device data and the list
 	 * is protected by RCU.  Even if a device is deleted during the list
 	 * traversals, we'll get valid data, the freeing callback will wait at
-	 * least until until the rcu_read_unlock.
+	 * least until the rcu_read_unlock.
 	 */
 	rcu_read_lock();
 	cur_devices = fs_info->fs_devices;
@@ -2344,6 +2287,7 @@
 	.show_devname	= btrfs_show_devname,
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
+	.free_inode	= btrfs_free_inode,
 	.statfs		= btrfs_statfs,
 	.remount_fs	= btrfs_remount,
 	.freeze_fs	= btrfs_freeze,
@@ -2511,3 +2455,4 @@
 module_exit(exit_btrfs_fs)
 
 MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32c");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3717c86..f6d3c80 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -4,22 +4,87 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
-#include <linux/kobject.h>
 #include <linux/bug.h>
-#include <linux/debugfs.h>
 
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "sysfs.h"
 #include "volumes.h"
+#include "space-info.h"
+#include "block-group.h"
+
+struct btrfs_feature_attr {
+	struct kobj_attribute kobj_attr;
+	enum btrfs_feature_set feature_set;
+	u64 feature_bit;
+};
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+	u64 flags;
+	struct kobject kobj;
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
+{									\
+	.attr	= { .name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,						\
+	.store	= _store,						\
+}
+
+#define BTRFS_ATTR_RW(_prefix, _name, _show, _store)			\
+	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
+			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_prefix, _name, _show)				\
+	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
+			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_ATTR_PTR(_prefix, _name)					\
+	(&btrfs_attr_##_prefix##_##_name.attr)
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit)  \
+static struct btrfs_feature_attr btrfs_attr_features_##_name = {	     \
+	.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO,			     \
+				      btrfs_feature_attr_show,		     \
+				      btrfs_feature_attr_store),	     \
+	.feature_set	= _feature_set,					     \
+	.feature_bit	= _feature_prefix ##_## _feature_bit,		     \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name)					     \
+	(&btrfs_attr_features_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
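+
+/*
+ * Example: BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID) further
+ * down expands to a btrfs_feature_attr named btrfs_attr_features_metadata_uuid
+ * with feature_bit BTRFS_FEATURE_INCOMPAT_METADATA_UUID, which surfaces as
+ * /sys/fs/btrfs/features/metadata_uuid once added to the feature group.
+ */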
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
 static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
 
+static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+	return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+	return container_of(attr, struct kobj_attribute, attr);
+}
+
+static struct btrfs_feature_attr *attr_to_btrfs_feature_attr(
+		struct attribute *attr)
+{
+	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
 static u64 get_features(struct btrfs_fs_info *fs_info,
 			enum btrfs_feature_set set)
 {
@@ -191,6 +256,7 @@
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
 BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
 BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
 
 static struct attribute *btrfs_supported_feature_attrs[] = {
@@ -204,6 +270,7 @@
 	BTRFS_FEAT_ATTR_PTR(raid56),
 	BTRFS_FEAT_ATTR_PTR(skinny_metadata),
 	BTRFS_FEAT_ATTR_PTR(no_holes),
+	BTRFS_FEAT_ATTR_PTR(metadata_uuid),
 	BTRFS_FEAT_ATTR_PTR(free_space_tree),
 	NULL
 };
@@ -244,6 +311,25 @@
 	.attrs = btrfs_supported_static_feature_attrs,
 };
 
+#ifdef CONFIG_BTRFS_DEBUG
+
+/*
+ * Runtime debugging exported via sysfs
+ *
+ * /sys/fs/btrfs/debug - applies to module or all filesystems
+ * /sys/fs/btrfs/UUID  - applies only to the given filesystem
+ */
+static struct attribute *btrfs_debug_feature_attrs[] = {
+	NULL
+};
+
+static const struct attribute_group btrfs_debug_feature_attr_group = {
+	.name = "debug",
+	.attrs = btrfs_debug_feature_attrs,
+};
+
+#endif
+
 static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
 {
 	u64 val;
@@ -301,21 +387,22 @@
 	return snprintf(buf, PAGE_SIZE, "%llu\n", val);
 }
 
-static struct attribute *raid_attributes[] = {
+static struct attribute *raid_attrs[] = {
 	BTRFS_ATTR_PTR(raid, total_bytes),
 	BTRFS_ATTR_PTR(raid, used_bytes),
 	NULL
 };
+ATTRIBUTE_GROUPS(raid);
 
 static void release_raid_kobj(struct kobject *kobj)
 {
 	kfree(to_raid_kobj(kobj));
 }
 
-struct kobj_type btrfs_raid_ktype = {
+static struct kobj_type btrfs_raid_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 	.release = release_raid_kobj,
-	.default_attrs = raid_attributes,
+	.default_groups = raid_groups,
 };
 
 #define SPACE_INFO_ATTR(field)						\
@@ -362,6 +449,7 @@
 	BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
 	NULL,
 };
+ATTRIBUTE_GROUPS(space_info);
 
 static void space_info_release(struct kobject *kobj)
 {
@@ -370,10 +458,10 @@
 	kfree(sinfo);
 }
 
-struct kobj_type space_info_ktype = {
+static struct kobj_type space_info_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 	.release = space_info_release,
-	.default_attrs = space_info_attrs,
+	.default_groups = space_info_groups,
 };
 
 static const struct attribute *allocation_attrs[] = {
@@ -505,12 +593,24 @@
 
 BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
 
+static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%pU\n",
+			fs_info->fs_devices->metadata_uuid);
+}
+
+BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
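+
+/*
+ * With the attribute wired into btrfs_attrs below, the metadata uuid of a
+ * mounted filesystem can be read from /sys/fs/btrfs/<fsid>/metadata_uuid.
+ */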
+
 static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, label),
 	BTRFS_ATTR_PTR(, nodesize),
 	BTRFS_ATTR_PTR(, sectorsize),
 	BTRFS_ATTR_PTR(, clone_alignment),
 	BTRFS_ATTR_PTR(, quota_override),
+	BTRFS_ATTR_PTR(, metadata_uuid),
 	NULL,
 };
 
@@ -638,12 +738,17 @@
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
 }
 
-const char * const btrfs_feature_set_names[FEAT_MAX] = {
+static const char * const btrfs_feature_set_names[FEAT_MAX] = {
 	[FEAT_COMPAT]	 = "compat",
 	[FEAT_COMPAT_RO] = "compat_ro",
 	[FEAT_INCOMPAT]	 = "incompat",
 };
 
+const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+{
+	return btrfs_feature_set_names[set];
+}
+
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
 {
 	size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
@@ -713,6 +818,110 @@
 	}
 }
 
+/*
+ * Create a sysfs entry for a given block group type at path
+ * /sys/fs/btrfs/UUID/allocation/data/TYPE
+ */
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_space_info *space_info = cache->space_info;
+	struct raid_kobject *rkobj;
+	const int index = btrfs_bg_flags_to_raid_index(cache->flags);
+	unsigned int nofs_flag;
+	int ret;
+
+	/*
+	 * Setup a NOFS context because kobject_add(), deep in its call chain,
+	 * does GFP_KERNEL allocations, and we are often called in a context
+	 * where if reclaim is triggered we can deadlock (we are either holding
+	 * a transaction handle or some lock required for a transaction
+	 * commit).
+	 */
+	nofs_flag = memalloc_nofs_save();
+
+	rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+	if (!rkobj) {
+		memalloc_nofs_restore(nofs_flag);
+		btrfs_warn(cache->fs_info,
+				"couldn't alloc memory for raid level kobject");
+		return;
+	}
+
+	rkobj->flags = cache->flags;
+	kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+	ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
+			  btrfs_bg_type_to_raid_name(rkobj->flags));
+	memalloc_nofs_restore(nofs_flag);
+	if (ret) {
+		kobject_put(&rkobj->kobj);
+		btrfs_warn(fs_info,
+			"failed to add kobject for block cache, ignoring");
+		return;
+	}
+
+	space_info->block_group_kobjs[index] = &rkobj->kobj;
+}
+
+/*
+ * Remove sysfs directories for all block group types of a given space info and
+ * the space info as well
+ */
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+		struct kobject *kobj;
+
+		kobj = space_info->block_group_kobjs[i];
+		space_info->block_group_kobjs[i] = NULL;
+		if (kobj) {
+			kobject_del(kobj);
+			kobject_put(kobj);
+		}
+	}
+	kobject_del(&space_info->kobj);
+	kobject_put(&space_info->kobj);
+}
+
+static const char *alloc_name(u64 flags)
+{
+	switch (flags) {
+	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+		return "mixed";
+	case BTRFS_BLOCK_GROUP_METADATA:
+		return "metadata";
+	case BTRFS_BLOCK_GROUP_DATA:
+		return "data";
+	case BTRFS_BLOCK_GROUP_SYSTEM:
+		return "system";
+	default:
+		WARN_ON(1);
+		return "invalid-combination";
+	}
+}
+
+/*
+ * Create a sysfs entry for a space info type at path
+ * /sys/fs/btrfs/UUID/allocation/TYPE
+ */
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+				    struct btrfs_space_info *space_info)
+{
+	int ret;
+
+	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
+				   fs_info->space_info_kobj, "%s",
+				   alloc_name(space_info->flags));
+	if (ret) {
+		kobject_put(&space_info->kobj);
+		return ret;
+	}
+
+	return 0;
+}
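+
+/*
+ * E.g. for a metadata space info this creates
+ * /sys/fs/btrfs/<fsid>/allocation/metadata, under which per-RAID
+ * directories (single, dup, raid0, ...) are added later by
+ * btrfs_sysfs_add_block_group_type().
+ */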
+
 /* when one_device is NULL, it removes all device links */
 
 int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
@@ -789,15 +998,35 @@
 	return error;
 }
 
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
+{
+	int ret;
+
+	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+	if (ret)
+		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+			action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+			&disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
+				    const u8 *fsid)
+{
+	char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
+	/*
+	 * Sprouting changes fsid of the mounted filesystem, rename the fsid
+	 * directory
+	 */
+	snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+	if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
+		btrfs_warn(fs_devices->fs_info,
+				"sysfs: failed to create fsid for sprout");
+}
+
 /* /sys/fs/btrfs/ entry */
 static struct kset *btrfs_kset;
 
-/* /sys/kernel/debug/btrfs */
-static struct dentry *btrfs_debugfs_root_dentry;
-
-/* Debugging tunables and exported data */
-u64 btrfs_debugfs_test;
-
 /*
  * Can be called by the device discovery thread.
  * And parent can be specified for seed device
@@ -811,7 +1040,12 @@
 	fs_devs->fsid_kobj.kset = btrfs_kset;
 	error = kobject_init_and_add(&fs_devs->fsid_kobj,
 				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
-	return error;
+	if (error) {
+		kobject_put(&fs_devs->fsid_kobj);
+		return error;
+	}
+
+	return 0;
 }
 
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
@@ -837,6 +1071,13 @@
 	if (error)
 		goto failure;
 
+#ifdef CONFIG_BTRFS_DEBUG
+	error = sysfs_create_group(fsid_kobj,
+				   &btrfs_debug_feature_attr_group);
+	if (error)
+		goto failure;
+#endif
+
 	error = addrm_unknown_feature_attrs(fs_info, true);
 	if (error)
 		goto failure;
@@ -891,28 +1132,6 @@
 	ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
 }
 
-static int btrfs_init_debugfs(void)
-{
-#ifdef CONFIG_DEBUG_FS
-	btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
-	if (!btrfs_debugfs_root_dentry)
-		return -ENOMEM;
-
-	/*
-	 * Example code, how to export data through debugfs.
-	 *
-	 * file:        /sys/kernel/debug/btrfs/test
-	 * contents of: btrfs_debugfs_test
-	 */
-#ifdef CONFIG_BTRFS_DEBUG
-	debugfs_create_u64("test", S_IRUGO | S_IWUSR, btrfs_debugfs_root_dentry,
-			&btrfs_debugfs_test);
-#endif
-
-#endif
-	return 0;
-}
-
 int __init btrfs_init_sysfs(void)
 {
 	int ret;
@@ -921,10 +1140,6 @@
 	if (!btrfs_kset)
 		return -ENOMEM;
 
-	ret = btrfs_init_debugfs();
-	if (ret)
-		goto out1;
-
 	init_feature_attrs();
 	ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 	if (ret)
@@ -934,13 +1149,17 @@
 	if (ret)
 		goto out_remove_group;
 
+#ifdef CONFIG_BTRFS_DEBUG
+	ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+	if (ret)
+		goto out2;
+#endif
+
 	return 0;
 
 out_remove_group:
 	sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 out2:
-	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
-out1:
 	kset_unregister(btrfs_kset);
 
 	return ret;
@@ -952,6 +1171,5 @@
 			    &btrfs_static_feature_attr_group);
 	sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
 	kset_unregister(btrfs_kset);
-	debugfs_remove_recursive(btrfs_debugfs_root_dentry);
 }
 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index c6ee600..610e9c3 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -3,83 +3,17 @@
 #ifndef BTRFS_SYSFS_H
 #define BTRFS_SYSFS_H
 
-/*
- * Data exported through sysfs
- */
-extern u64 btrfs_debugfs_test;
+#include <linux/kobject.h>
 
 enum btrfs_feature_set {
-	FEAT_COMPAT = 0,
+	FEAT_COMPAT,
 	FEAT_COMPAT_RO,
 	FEAT_INCOMPAT,
 	FEAT_MAX
 };
 
-#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
-{									\
-	.attr	= { .name = __stringify(_name), .mode = _mode },	\
-	.show	= _show,						\
-	.store	= _store,						\
-}
-
-#define BTRFS_ATTR_RW(_prefix, _name, _show, _store)			\
-	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
-			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
-
-#define BTRFS_ATTR(_prefix, _name, _show)				\
-	static struct kobj_attribute btrfs_attr_##_prefix##_##_name =	\
-			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
-
-#define BTRFS_ATTR_PTR(_prefix, _name)					\
-	(&btrfs_attr_##_prefix##_##_name.attr)
-
-
-struct btrfs_feature_attr {
-	struct kobj_attribute kobj_attr;
-	enum btrfs_feature_set feature_set;
-	u64 feature_bit;
-};
-
-#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit)  \
-static struct btrfs_feature_attr btrfs_attr_features_##_name = {	     \
-	.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO,			     \
-				      btrfs_feature_attr_show,		     \
-				      btrfs_feature_attr_store),	     \
-	.feature_set	= _feature_set,					     \
-	.feature_bit	= _feature_prefix ##_## _feature_bit,		     \
-}
-#define BTRFS_FEAT_ATTR_PTR(_name)					     \
-	(&btrfs_attr_features_##_name.kobj_attr.attr)
-
-#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
-	BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
-#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
-	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
-#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
-	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
-
-/* convert from attribute */
-static inline struct btrfs_feature_attr *
-to_btrfs_feature_attr(struct kobj_attribute *a)
-{
-	return container_of(a, struct btrfs_feature_attr, kobj_attr);
-}
-
-static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
-{
-	return container_of(attr, struct kobj_attribute, attr);
-}
-
-static inline struct btrfs_feature_attr *
-attr_to_btrfs_feature_attr(struct attribute *attr)
-{
-	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
-}
-
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-extern const char * const btrfs_feature_set_names[FEAT_MAX];
-extern struct kobj_type space_info_ktype;
-extern struct kobj_type btrfs_raid_ktype;
+const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
 int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
 int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
@@ -88,7 +22,19 @@
 				struct kobject *parent);
 int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
 void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
+				    const u8 *fsid);
 void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
 		u64 bit, enum btrfs_feature_set set);
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
+
+int __init btrfs_init_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache);
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+				    struct btrfs_space_info *space_info);
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
 
 #endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index db72b3b..99fe9bf 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -5,6 +5,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/magic.h>
 #include "btrfs-tests.h"
 #include "../ctree.h"
@@ -14,31 +15,50 @@
 #include "../volumes.h"
 #include "../disk-io.h"
 #include "../qgroup.h"
+#include "../block-group.h"
 
 static struct vfsmount *test_mnt = NULL;
 
+const char *test_error[] = {
+	[TEST_ALLOC_FS_INFO]	     = "cannot allocate fs_info",
+	[TEST_ALLOC_ROOT]	     = "cannot allocate root",
+	[TEST_ALLOC_EXTENT_BUFFER]   = "cannot allocate extent buffer",
+	[TEST_ALLOC_PATH]	     = "cannot allocate path",
+	[TEST_ALLOC_INODE]	     = "cannot allocate inode",
+	[TEST_ALLOC_BLOCK_GROUP]     = "cannot allocate block group",
+	[TEST_ALLOC_EXTENT_MAP]      = "cannot allocate extent map",
+};
+
 static const struct super_operations btrfs_test_super_ops = {
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_test_destroy_inode,
 };
 
-static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *data)
+static int btrfs_test_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
-			    NULL, BTRFS_TEST_MAGIC);
+	struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->ops = &btrfs_test_super_ops;
+	return 0;
 }
 
 static struct file_system_type test_type = {
 	.name		= "btrfs_test_fs",
-	.mount		= btrfs_test_mount,
+	.init_fs_context = btrfs_test_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 
 struct inode *btrfs_new_test_inode(void)
 {
-	return new_inode(test_mnt->mnt_sb);
+	struct inode *inode;
+
+	inode = new_inode(test_mnt->mnt_sb);
+	if (inode)
+		inode_init_owner(inode, NULL, S_IFREG);
+
+	return inode;
 }
 
 static int btrfs_init_test_fs(void)
@@ -99,7 +119,6 @@
 
 	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->qgroup_lock);
-	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
@@ -115,8 +134,10 @@
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
-	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+	extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
+			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
+	extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
+			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
 
@@ -174,8 +195,10 @@
 	/* Will be freed by btrfs_free_fs_roots */
 	if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
 		return;
-	if (root->node)
+	if (root->node) {
+		/* One for allocate_extent_buffer */
 		free_extent_buffer(root->node);
+	}
 	kfree(root);
 }
 
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 70ff9f9..ee277bb 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -10,7 +10,22 @@
 int btrfs_run_sanity_tests(void);
 
 #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
-#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n",	\
+		__FILE__, __LINE__, ##__VA_ARGS__)
+
+#define test_std_err(index)	test_err("%s", test_error[index])
+
+enum {
+	TEST_ALLOC_FS_INFO,
+	TEST_ALLOC_ROOT,
+	TEST_ALLOC_EXTENT_BUFFER,
+	TEST_ALLOC_PATH,
+	TEST_ALLOC_INODE,
+	TEST_ALLOC_BLOCK_GROUP,
+	TEST_ALLOC_EXTENT_MAP,
+};
+
+extern const char *test_error[];
 
 struct btrfs_root;
 struct btrfs_trans_handle;
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 7d72eab..a1b9f9b 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -30,27 +30,27 @@
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("could not allocate fs_info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		return -ENOMEM;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("could not allocate root");
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(root);
 		goto out;
 	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("could not allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!eb) {
-		test_err("could not allocate dummy buffer");
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d9269a5..123d9a6 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -10,6 +10,7 @@
 #include "btrfs-tests.h"
 #include "../ctree.h"
 #include "../extent_io.h"
+#include "../btrfs_inode.h"
 
 #define PROCESS_UNLOCK		(1 << 0)
 #define PROCESS_RELEASE		(1 << 1)
@@ -58,25 +59,31 @@
 static int test_find_delalloc(u32 sectorsize)
 {
 	struct inode *inode;
-	struct extent_io_tree tmp;
+	struct extent_io_tree *tmp;
 	struct page *page;
 	struct page *locked_page = NULL;
 	unsigned long index = 0;
-	u64 total_dirty = SZ_256M;
-	u64 max_bytes = SZ_128M;
+	/* In this test we need at least 2 file extents, each at its maximum size */
+	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+	u64 total_dirty = 2 * max_bytes;
 	u64 start, end, test_start;
-	u64 found;
+	bool found;
 	int ret = -EINVAL;
 
 	test_msg("running find delalloc tests");
 
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_err("failed to allocate test inode");
+		test_std_err(TEST_ALLOC_INODE);
 		return -ENOMEM;
 	}
+	tmp = &BTRFS_I(inode)->io_tree;
 
-	extent_io_tree_init(&tmp, inode);
+	/*
+	 * Passing NULL since we don't have an fs_info here; tracepoints are
+	 * not used at this point anyway
+	 */
+	extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
 
 	/*
 	 * First go through and create and mark all of our pages dirty, we pin
@@ -103,11 +110,11 @@
 	 * |--- delalloc ---|
 	 * |---  search  ---|
 	 */
-	set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
+	set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
 	start = 0;
 	end = 0;
-	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
-					 &end, max_bytes);
+	found = find_lock_delalloc_range(inode, locked_page, &start,
+					 &end);
 	if (!found) {
 		test_err("should have found at least one delalloc");
 		goto out_bits;
@@ -117,7 +124,7 @@
 			sectorsize - 1, start, end);
 		goto out_bits;
 	}
-	unlock_extent(&tmp, start, end);
+	unlock_extent(tmp, start, end);
 	unlock_page(locked_page);
 	put_page(locked_page);
 
@@ -134,11 +141,11 @@
 		test_err("couldn't find the locked page");
 		goto out_bits;
 	}
-	set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
+	set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
 	start = test_start;
 	end = 0;
-	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
-					 &end, max_bytes);
+	found = find_lock_delalloc_range(inode, locked_page, &start,
+					 &end);
 	if (!found) {
 		test_err("couldn't find delalloc in our range");
 		goto out_bits;
@@ -153,7 +160,7 @@
 		test_err("there were unlocked pages in the range");
 		goto out_bits;
 	}
-	unlock_extent(&tmp, start, end);
+	unlock_extent(tmp, start, end);
 	/* locked_page was unlocked above */
 	put_page(locked_page);
 
@@ -171,8 +178,8 @@
 	}
 	start = test_start;
 	end = 0;
-	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
-					 &end, max_bytes);
+	found = find_lock_delalloc_range(inode, locked_page, &start,
+					 &end);
 	if (found) {
 		test_err("found range when we shouldn't have");
 		goto out_bits;
@@ -189,11 +196,11 @@
 	 *
 	 * We are re-using our test_start from above since it works out well.
 	 */
-	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
+	set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
 	start = test_start;
 	end = 0;
-	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
-					 &end, max_bytes);
+	found = find_lock_delalloc_range(inode, locked_page, &start,
+					 &end);
 	if (!found) {
 		test_err("didn't find our range");
 		goto out_bits;
@@ -208,7 +215,7 @@
 		test_err("pages in range were not all locked");
 		goto out_bits;
 	}
-	unlock_extent(&tmp, start, end);
+	unlock_extent(tmp, start, end);
 
 	/*
 	 * Now to test where we run into a page that is no longer dirty in the
@@ -233,8 +240,8 @@
 	 * this changes at any point in the future we will need to fix this
 	 * test's expected behavior.
 	 */
-	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
-					 &end, max_bytes);
+	found = find_lock_delalloc_range(inode, locked_page, &start,
+					 &end);
 	if (!found) {
 		test_err("didn't find our range");
 		goto out_bits;
@@ -251,7 +258,7 @@
 	}
 	ret = 0;
 out_bits:
-	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
+	clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
 out:
 	if (locked_page)
 		put_page(locked_page);
@@ -373,8 +380,8 @@
 {
 	struct btrfs_fs_info *fs_info;
 	unsigned long len;
-	unsigned long *bitmap;
-	struct extent_buffer *eb;
+	unsigned long *bitmap = NULL;
+	struct extent_buffer *eb = NULL;
 	int ret;
 
 	test_msg("running extent buffer bitmap tests");
@@ -387,18 +394,23 @@
 		? sectorsize * 4 : sectorsize;
 
 	fs_info = btrfs_alloc_dummy_fs_info(len, len);
+	if (!fs_info) {
+		test_std_err(TEST_ALLOC_FS_INFO);
+		return -ENOMEM;
+	}
 
 	bitmap = kmalloc(len, GFP_KERNEL);
 	if (!bitmap) {
 		test_err("couldn't allocate test bitmap");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
 	if (!eb) {
-		test_err("couldn't allocate test extent buffer");
-		kfree(bitmap);
-		return -ENOMEM;
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	ret = __test_eb_bitmaps(bitmap, eb, len);
@@ -407,17 +419,118 @@
 
 	/* Do it over again with an extent buffer which isn't page-aligned. */
 	free_extent_buffer(eb);
-	eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
+	eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len);
 	if (!eb) {
-		test_err("couldn't allocate test extent buffer");
-		kfree(bitmap);
-		return -ENOMEM;
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	ret = __test_eb_bitmaps(bitmap, eb, len);
 out:
 	free_extent_buffer(eb);
 	kfree(bitmap);
+	btrfs_free_dummy_fs_info(fs_info);
+	return ret;
+}
+
+static int test_find_first_clear_extent_bit(void)
+{
+	struct extent_io_tree tree;
+	u64 start, end;
+	int ret = -EINVAL;
+
+	test_msg("running find_first_clear_extent_bit test");
+	extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+
+	/*
+	 * Set 1M-4M alloc/discard; 32M-64M is added below, leaving a hole
+	 * between 4M-32M
+	 */
+	set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
+			CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+	find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+				    CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+	if (start != 0 || end != SZ_1M - 1) {
+		test_err("error finding beginning range: start %llu end %llu",
+			 start, end);
+		goto out;
+	}
+
+	/* Now add 32M-64M so that we have a hole between 4M-32M */
+	set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
+			CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+	/*
+	 * Request the first hole starting at 12M; we should get 4M-32M
+	 */
+	find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+				    CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+	if (start != SZ_4M || end != SZ_32M - 1) {
+		test_err("error finding trimmed range: start %llu end %llu",
+			 start, end);
+		goto out;
+	}
+
+	/*
+	 * Search in the middle of allocated range, should get the next one
+	 * available, which happens to be unallocated -> 4M-32M
+	 */
+	find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+				    CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+	if (start != SZ_4M || end != SZ_32M - 1) {
+		test_err("error finding next unalloc range: start %llu end %llu",
+			 start, end);
+		goto out;
+	}
+
+	/*
+	 * Set 64M-72M with the CHUNK_ALLOCATED flag, then search for the
+	 * CHUNK_TRIMMED flag being unset in this range; we should get the
+	 * entry in range 64M-72M
+	 */
+	set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
+	find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+				    CHUNK_TRIMMED);
+
+	if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+		test_err("error finding exact range: start %llu end %llu",
+			 start, end);
+		goto out;
+	}
+
+	find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+				    CHUNK_TRIMMED);
+
+	/*
+	 * Search in the middle of a set range whose immediate neighbour
+	 * doesn't have the bits set, so the neighbour must be returned
+	 */
+	if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+		test_err("error finding next alloc range: start %llu end %llu",
+			 start, end);
+		goto out;
+	}
+
+	/*
+	 * Search beyond any known range; the result shall start right after
+	 * the last known range and end should be -1
+	 */
+	find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+	if (start != SZ_64M + SZ_8M || end != -1) {
+		test_err(
+		"error handling beyond end of range search: start %llu end %llu",
+			start, end);
+		goto out;
+	}
+
+	ret = 0;
+out:
+	clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
 	return ret;
 }
 
@@ -431,8 +544,11 @@
 	if (ret)
 		goto out;
 
+	ret = test_find_first_clear_extent_bit();
+	if (ret)
+		goto out;
+
 	ret = test_eb_bitmaps(sectorsize, nodesize);
 out:
-	test_msg("extent I/O tests finished");
 	return ret;
 }
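
A condensed sketch of the semantics the new test exercises, using only calls that appear in the hunk above; the helper name is illustrative:

static void hole_lookup_sketch(void)
{
	struct extent_io_tree tree;
	u64 start, end;

	extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);

	/* Mark 1M-4M and 32M-64M, leaving a hole at 4M-32M. */
	set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
			CHUNK_TRIMMED | CHUNK_ALLOCATED);
	set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
			CHUNK_TRIMMED | CHUNK_ALLOCATED);

	/*
	 * A search point inside the hole, or inside a set range preceding
	 * it, yields start == SZ_4M and end == SZ_32M - 1.
	 */
	find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
				    CHUNK_TRIMMED | CHUNK_ALLOCATED);

	clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
}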
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a531..4a7f796 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@
 	struct extent_map *em;
 	struct rb_node *node;
 
-	while (!RB_EMPTY_ROOT(&em_tree->map)) {
-		node = rb_first(&em_tree->map);
+	while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+		node = rb_first_cached(&em_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		remove_extent_mapping(em_tree, em);
 
@@ -47,7 +47,7 @@
  *                                    ->add_extent_mapping(0, 16K)
  *                                    -> #handle -EEXIST
  */
-static void test_case_1(struct btrfs_fs_info *fs_info,
+static int test_case_1(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree)
 {
 	struct extent_map *em;
@@ -56,55 +56,79 @@
 	int ret;
 
 	em = alloc_extent_map();
-	if (!em)
-		/* Skip the test on error. */
-		return;
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		return -ENOMEM;
+	}
 
 	/* Add [0, 16K) */
 	em->start = 0;
 	em->len = SZ_16K;
 	em->block_start = 0;
 	em->block_len = SZ_16K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [0, 16K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	/* Add [16K, 20K) following [0, 16K)  */
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	em->start = SZ_16K;
 	em->len = SZ_4K;
 	em->block_start = SZ_32K; /* avoid merging */
 	em->block_len = SZ_4K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [16K, 20K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	/* Add [0, 8K), should return [0, 16K) instead. */
 	em->start = start;
 	em->len = len;
 	em->block_start = start;
 	em->block_len = len;
+	write_lock(&em_tree->lock);
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
-	if (ret)
+	write_unlock(&em_tree->lock);
+	if (ret) {
 		test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
+		goto out;
+	}
 	if (em &&
 	    (em->start != 0 || extent_map_end(em) != SZ_16K ||
-	     em->block_start != 0 || em->block_len != SZ_16K))
+	     em->block_start != 0 || em->block_len != SZ_16K)) {
 		test_err(
 "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 start, start + len, ret, em->start, em->len,
 			 em->block_start, em->block_len);
+		ret = -EINVAL;
+	}
 	free_extent_map(em);
 out:
-	/* free memory */
 	free_extent_map_tree(em_tree);
+
+	return ret;
 }
 
 /*
@@ -113,65 +137,89 @@
  * Reading the inline extent ends up with EEXIST, i.e. read an inline
  * extent, discard the page cache, and read it again.
  */
-static void test_case_2(struct btrfs_fs_info *fs_info,
+static int test_case_2(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree)
 {
 	struct extent_map *em;
 	int ret;
 
 	em = alloc_extent_map();
-	if (!em)
-		/* Skip the test on error. */
-		return;
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		return -ENOMEM;
+	}
 
 	/* Add [0, 1K) */
 	em->start = 0;
 	em->len = SZ_1K;
 	em->block_start = EXTENT_MAP_INLINE;
 	em->block_len = (u64)-1;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [0, 1K)");
+		goto out;
+	}
 	free_extent_map(em);
 
-	/* Add [4K, 4K) following [0, 1K)  */
+	/* Add [4K, 8K) following [0, 1K)  */
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	em->start = SZ_4K;
 	em->len = SZ_4K;
 	em->block_start = SZ_4K;
 	em->block_len = SZ_4K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [4K, 8K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	/* Add [0, 1K) */
 	em->start = 0;
 	em->len = SZ_1K;
 	em->block_start = EXTENT_MAP_INLINE;
 	em->block_len = (u64)-1;
+	write_lock(&em_tree->lock);
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
-	if (ret)
+	write_unlock(&em_tree->lock);
+	if (ret) {
 		test_err("case2 [0 1K]: ret %d", ret);
+		goto out;
+	}
 	if (em &&
 	    (em->start != 0 || extent_map_end(em) != SZ_1K ||
-	     em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
+	     em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
 		test_err(
 "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 ret, em->start, em->len, em->block_start,
 			 em->block_len);
+		ret = -EINVAL;
+	}
 	free_extent_map(em);
 out:
-	/* free memory */
 	free_extent_map_tree(em_tree);
+
+	return ret;
 }
 
-static void __test_case_3(struct btrfs_fs_info *fs_info,
+static int __test_case_3(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree, u64 start)
 {
 	struct extent_map *em;
@@ -179,47 +227,63 @@
 	int ret;
 
 	em = alloc_extent_map();
-	if (!em)
-		/* Skip this test on error. */
-		return;
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		return -ENOMEM;
+	}
 
 	/* Add [4K, 8K) */
 	em->start = SZ_4K;
 	em->len = SZ_4K;
 	em->block_start = SZ_4K;
 	em->block_len = SZ_4K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [4K, 8K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	/* Add [0, 16K) */
 	em->start = 0;
 	em->len = SZ_16K;
 	em->block_start = 0;
 	em->block_len = SZ_16K;
+	write_lock(&em_tree->lock);
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
-	if (ret)
+	write_unlock(&em_tree->lock);
+	if (ret) {
 		test_err("case3 [0x%llx 0x%llx): ret %d",
 			 start, start + len, ret);
+		goto out;
+	}
 	/*
 	 * Since bytes within em are contiguous, em->block_start is identical to
 	 * em->start.
 	 */
 	if (em &&
 	    (start < em->start || start + len > extent_map_end(em) ||
-	     em->start != em->block_start || em->len != em->block_len))
+	     em->start != em->block_start || em->len != em->block_len)) {
 		test_err(
 "case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
 			 start, start + len, ret, em->start, em->len,
 			 em->block_start, em->block_len);
+		ret = -EINVAL;
+	}
 	free_extent_map(em);
 out:
-	/* free memory */
 	free_extent_map_tree(em_tree);
+
+	return ret;
 }
 
 /*
@@ -238,15 +302,23 @@
  *   -> add_extent_mapping()
  *                            -> add_extent_mapping()
  */
-static void test_case_3(struct btrfs_fs_info *fs_info,
+static int test_case_3(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree)
 {
-	__test_case_3(fs_info, em_tree, 0);
-	__test_case_3(fs_info, em_tree, SZ_8K);
-	__test_case_3(fs_info, em_tree, (12 * 1024ULL));
+	int ret;
+
+	ret = __test_case_3(fs_info, em_tree, 0);
+	if (ret)
+		return ret;
+	ret = __test_case_3(fs_info, em_tree, SZ_8K);
+	if (ret)
+		return ret;
+	ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+
+	return ret;
 }
 
-static void __test_case_4(struct btrfs_fs_info *fs_info,
+static int __test_case_4(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree, u64 start)
 {
 	struct extent_map *em;
@@ -254,54 +326,77 @@
 	int ret;
 
 	em = alloc_extent_map();
-	if (!em)
-		/* Skip this test on error. */
-		return;
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		return -ENOMEM;
+	}
 
 	/* Add [0K, 8K) */
 	em->start = 0;
 	em->len = SZ_8K;
 	em->block_start = 0;
 	em->block_len = SZ_8K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [0, 8K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 
-	/* Add [8K, 24K) */
+	/* Add [8K, 32K) */
 	em->start = SZ_8K;
-	em->len = 24 * 1024ULL;
+	em->len = 24 * SZ_1K;
 	em->block_start = SZ_16K; /* avoid merging */
-	em->block_len = 24 * 1024ULL;
+	em->block_len = 24 * SZ_1K;
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
-	ASSERT(ret == 0);
+	write_unlock(&em_tree->lock);
+	if (ret < 0) {
+		test_err("cannot add extent range [8K, 32K)");
+		goto out;
+	}
 	free_extent_map(em);
 
 	em = alloc_extent_map();
-	if (!em)
+	if (!em) {
+		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		ret = -ENOMEM;
 		goto out;
+	}
 	/* Add [0K, 32K) */
 	em->start = 0;
 	em->len = SZ_32K;
 	em->block_start = 0;
 	em->block_len = SZ_32K;
+	write_lock(&em_tree->lock);
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
-	if (ret)
+	write_unlock(&em_tree->lock);
+	if (ret) {
 		test_err("case4 [0x%llx 0x%llx): ret %d",
 			 start, len, ret);
-	if (em &&
-	    (start < em->start || start + len > extent_map_end(em)))
+		goto out;
+	}
+	if (em && (start < em->start || start + len > extent_map_end(em))) {
 		test_err(
 "case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
 			 start, len, ret, em->start, em->len, em->block_start,
 			 em->block_len);
+		ret = -EINVAL;
+	}
 	free_extent_map(em);
 out:
-	/* free memory */
 	free_extent_map_tree(em_tree);
+
+	return ret;
 }
 
 /*
@@ -329,17 +424,24 @@
  *                                             # handle -EEXIST when adding
  *                                             # [0, 32K)
  */
-static void test_case_4(struct btrfs_fs_info *fs_info,
+static int test_case_4(struct btrfs_fs_info *fs_info,
 		struct extent_map_tree *em_tree)
 {
-	__test_case_4(fs_info, em_tree, 0);
-	__test_case_4(fs_info, em_tree, SZ_4K);
+	int ret;
+
+	ret = __test_case_4(fs_info, em_tree, 0);
+	if (ret)
+		return ret;
+	ret = __test_case_4(fs_info, em_tree, SZ_4K);
+
+	return ret;
 }
 
 int btrfs_test_extent_map(void)
 {
 	struct btrfs_fs_info *fs_info = NULL;
 	struct extent_map_tree *em_tree;
+	int ret = 0;
 
 	test_msg("running extent_map tests");
 
@@ -349,25 +451,32 @@
 	 */
 	fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
 	if (!fs_info) {
-		test_msg("Couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		return -ENOMEM;
 	}
 
 	em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
-	if (!em_tree)
-		/* Skip the test on error. */
+	if (!em_tree) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	extent_map_tree_init(em_tree);
 
-	test_case_1(fs_info, em_tree);
-	test_case_2(fs_info, em_tree);
-	test_case_3(fs_info, em_tree);
-	test_case_4(fs_info, em_tree);
+	ret = test_case_1(fs_info, em_tree);
+	if (ret)
+		goto out;
+	ret = test_case_2(fs_info, em_tree);
+	if (ret)
+		goto out;
+	ret = test_case_3(fs_info, em_tree);
+	if (ret)
+		goto out;
+	ret = test_case_4(fs_info, em_tree);
 
-	kfree(em_tree);
 out:
+	kfree(em_tree);
 	btrfs_free_dummy_fs_info(fs_info);
 
-	return 0;
+	return ret;
 }
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 5c2f77e..43ec706 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -8,6 +8,7 @@
 #include "../ctree.h"
 #include "../disk-io.h"
 #include "../free-space-cache.h"
+#include "../block-group.h"
 
 #define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
 
@@ -404,7 +405,7 @@
 	};
 	const struct btrfs_free_space_op *orig_free_space_ops;
 
-	test_msg("running space stealing from bitmap to extent");
+	test_msg("running space stealing from bitmap to extent tests");
 
 	/*
 	 * For this test, we want to ensure we end up with an extent entry
@@ -834,9 +835,10 @@
 
 	test_msg("running btrfs free space cache tests");
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
-	if (!fs_info)
+	if (!fs_info) {
+		test_std_err(TEST_ALLOC_FS_INFO);
 		return -ENOMEM;
-
+	}
 
 	/*
 	 * For ppc64 (with 64k page size), bytes per bitmap might be
@@ -846,13 +848,14 @@
 	cache = btrfs_alloc_dummy_block_group(fs_info,
 				      BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
 	if (!cache) {
-		test_err("couldn't run the tests");
+		test_std_err(TEST_ALLOC_BLOCK_GROUP);
 		btrfs_free_dummy_fs_info(fs_info);
 		return 0;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(root);
 		goto out;
 	}
@@ -874,6 +877,5 @@
 	btrfs_free_dummy_block_group(cache);
 	btrfs_free_dummy_root(root);
 	btrfs_free_dummy_fs_info(fs_info);
-	test_msg("free space cache tests finished");
 	return ret;
 }
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 89346da..bc92df9 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -9,6 +9,7 @@
 #include "../disk-io.h"
 #include "../free-space-tree.h"
 #include "../transaction.h"
+#include "../block-group.h"
 
 struct free_space_extent {
 	u64 start;
@@ -30,7 +31,7 @@
 	unsigned int i;
 	int ret;
 
-	info = search_free_space_info(trans, fs_info, cache, path, 0);
+	info = search_free_space_info(trans, cache, path, 0);
 	if (IS_ERR(info)) {
 		test_err("could not find free space info");
 		ret = PTR_ERR(info);
@@ -115,7 +116,7 @@
 	u32 flags;
 	int ret;
 
-	info = search_free_space_info(trans, fs_info, cache, path, 0);
+	info = search_free_space_info(trans, cache, path, 0);
 	if (IS_ERR(info)) {
 		test_err("could not find free space info");
 		btrfs_release_path(path);
@@ -444,14 +445,14 @@
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("couldn't allocate dummy root");
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(root);
 		goto out;
 	}
@@ -463,7 +464,7 @@
 
 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
 	if (!root->node) {
-		test_err("couldn't allocate dummy buffer");
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -473,7 +474,7 @@
 
 	cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
 	if (!cache) {
-		test_err("couldn't allocate dummy block group cache");
+		test_std_err(TEST_ALLOC_BLOCK_GROUP);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -486,7 +487,7 @@
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("couldn't allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -539,7 +540,7 @@
 	ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
 	if (ret) {
 		test_err(
-	"%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
+	"%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
 			 test_func, sectorsize, nodesize, alignment);
 		test_ret = ret;
 	}
@@ -547,7 +548,7 @@
 	ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
 	if (ret) {
 		test_err(
-	"%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
+	"%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
 			 test_func, sectorsize, nodesize, alignment);
 		test_ret = ret;
 	}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 64043f0..09ecf7d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -226,39 +226,37 @@
 	u64 offset;
 	int ret = -ENOMEM;
 
+	test_msg("running btrfs_get_extent tests");
+
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_err("couldn't allocate inode");
+		test_std_err(TEST_ALLOC_INODE);
 		return ret;
 	}
 
+	inode->i_mode = S_IFREG;
 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
 	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	BTRFS_I(inode)->location.offset = 0;
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("couldn't allocate root");
+		test_std_err(TEST_ALLOC_ROOT);
 		goto out;
 	}
 
 	root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!root->node) {
-		test_err("couldn't allocate dummy buffer");
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
 		goto out;
 	}
 
-	/*
-	 * We will just free a dummy node if it's ref count is 2 so we need an
-	 * extra ref so our searches don't accidentally release our page.
-	 */
-	extent_buffer_get(root->node);
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
 	ret = -EINVAL;
@@ -832,9 +830,11 @@
 	struct extent_map *em = NULL;
 	int ret = -ENOMEM;
 
+	test_msg("running hole first btrfs_get_extent test");
+
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_err("couldn't allocate inode");
+		test_std_err(TEST_ALLOC_INODE);
 		return ret;
 	}
 
@@ -844,23 +844,22 @@
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("couldn't allocate root");
+		test_std_err(TEST_ALLOC_ROOT);
 		goto out;
 	}
 
 	root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
 	if (!root->node) {
-		test_err("couldn't allocate dummy buffer");
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
 		goto out;
 	}
 
-	extent_buffer_get(root->node);
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
 	BTRFS_I(inode)->root = root;
@@ -933,21 +932,23 @@
 	struct btrfs_root *root = NULL;
 	int ret = -ENOMEM;
 
+	test_msg("running outstanding_extents tests");
+
 	inode = btrfs_new_test_inode();
 	if (!inode) {
-		test_err("couldn't allocate inode");
+		test_std_err(TEST_ALLOC_INODE);
 		return ret;
 	}
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		goto out;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("couldn't allocate root");
+		test_std_err(TEST_ALLOC_ROOT);
 		goto out;
 	}
 
@@ -956,7 +957,7 @@
 
 	/* [BTRFS_MAX_EXTENT_SIZE] */
 	ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
-					NULL, 0);
+					NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -971,7 +972,7 @@
 	/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
 	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
 					BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
-					0, NULL, 0);
+					0, NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -987,8 +988,7 @@
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
 			       BTRFS_MAX_EXTENT_SIZE >> 1,
 			       (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
-			       EXTENT_DELALLOC | EXTENT_DIRTY |
-			       EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1004,7 +1004,7 @@
 	ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
 					(BTRFS_MAX_EXTENT_SIZE >> 1)
 					+ sectorsize - 1,
-					0, NULL, 0);
+					0, NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -1022,7 +1022,7 @@
 	ret = btrfs_set_extent_delalloc(inode,
 			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
 			(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
-			0, NULL, 0);
+			0, NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -1039,7 +1039,7 @@
 	*/
 	ret = btrfs_set_extent_delalloc(inode,
 			BTRFS_MAX_EXTENT_SIZE + sectorsize,
-			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
+			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -1055,8 +1055,7 @@
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
 			       BTRFS_MAX_EXTENT_SIZE + sectorsize,
 			       BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
-			       EXTENT_DIRTY | EXTENT_DELALLOC |
-			       EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1074,7 +1073,7 @@
 	 */
 	ret = btrfs_set_extent_delalloc(inode,
 			BTRFS_MAX_EXTENT_SIZE + sectorsize,
-			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
+			BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
 	if (ret) {
 		test_err("btrfs_set_extent_delalloc returned %d", ret);
 		goto out;
@@ -1088,8 +1087,7 @@
 
 	/* Empty */
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-			       EXTENT_DIRTY | EXTENT_DELALLOC |
-			       EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1104,8 +1102,7 @@
 out:
 	if (ret)
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_UPTODATE, 0, 0, NULL);
+				 EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
 	iput(inode);
 	btrfs_free_dummy_root(root);
 	btrfs_free_dummy_fs_info(fs_info);
@@ -1116,17 +1113,16 @@
 {
 	int ret;
 
+	test_msg("running inode tests");
+
 	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
 	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
 
-	test_msg("running btrfs_get_extent tests");
 	ret = test_btrfs_get_extent(sectorsize, nodesize);
 	if (ret)
 		return ret;
-	test_msg("running hole first btrfs_get_extent test");
 	ret = test_hole_first(sectorsize, nodesize);
 	if (ret)
 		return ret;
-	test_msg("running outstanding_extents tests");
 	return test_extent_accounting(sectorsize, nodesize);
 }
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 412b910..09aaca1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -32,7 +32,7 @@
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("couldn't allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		return -ENOMEM;
 	}
 
@@ -82,7 +82,7 @@
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("couldn't allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		return -ENOMEM;
 	}
 
@@ -132,7 +132,7 @@
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("couldn't allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		return -ENOMEM;
 	}
 	path->leave_spinning = 1;
@@ -166,7 +166,7 @@
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		test_err("couldn't allocate path");
+		test_std_err(TEST_ALLOC_PATH);
 		return -ENOMEM;
 	}
 
@@ -215,7 +215,7 @@
 
 	btrfs_init_dummy_trans(&trans, fs_info);
 
-	test_msg("qgroup basic add");
+	test_msg("running qgroup add/remove tests");
 	ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID);
 	if (ret) {
 		test_err("couldn't create a qgroup %d", ret);
@@ -316,7 +316,7 @@
 
 	btrfs_init_dummy_trans(&trans, fs_info);
 
-	test_msg("qgroup multiple refs test");
+	test_msg("running qgroup multiple refs test");
 
 	/*
 	 * We have BTRFS_FS_TREE_OBJECTID created already from the
@@ -457,13 +457,13 @@
 
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
-		test_err("couldn't allocate dummy fs info");
+		test_std_err(TEST_ALLOC_FS_INFO);
 		return -ENOMEM;
 	}
 
 	root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(root)) {
-		test_err("couldn't allocate root");
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(root);
 		goto out;
 	}
@@ -495,7 +495,7 @@
 
 	tmp_root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(tmp_root)) {
-		test_err("couldn't allocate a fs root");
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(tmp_root);
 		goto out;
 	}
@@ -510,7 +510,7 @@
 
 	tmp_root = btrfs_alloc_dummy_root(fs_info);
 	if (IS_ERR(tmp_root)) {
-		test_err("couldn't allocate a fs root");
+		test_std_err(TEST_ALLOC_ROOT);
 		ret = PTR_ERR(tmp_root);
 		goto out;
 	}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bb8f6c0..8624bde 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/uuid.h>
+#include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -19,6 +20,7 @@
 #include "volumes.h"
 #include "dev-replace.h"
 #include "qgroup.h"
+#include "block-group.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -28,15 +30,18 @@
 	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
 	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
 					   __TRANS_ATTACH |
-					   __TRANS_JOIN),
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOSTART),
 	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
 					   __TRANS_ATTACH |
 					   __TRANS_JOIN |
-					   __TRANS_JOIN_NOLOCK),
+					   __TRANS_JOIN_NOLOCK |
+					   __TRANS_JOIN_NOSTART),
 	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
 					   __TRANS_ATTACH |
 					   __TRANS_JOIN |
-					   __TRANS_JOIN_NOLOCK),
+					   __TRANS_JOIN_NOLOCK |
+					   __TRANS_JOIN_NOSTART),
 };
 
 void btrfs_put_transaction(struct btrfs_transaction *transaction)
@@ -44,19 +49,12 @@
 	WARN_ON(refcount_read(&transaction->use_count) == 0);
 	if (refcount_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
-		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		WARN_ON(!RB_EMPTY_ROOT(
+				&transaction->delayed_refs.href_root.rb_root));
 		if (transaction->delayed_refs.pending_csums)
 			btrfs_err(transaction->fs_info,
 				  "pending csums is %llu",
 				  transaction->delayed_refs.pending_csums);
-		while (!list_empty(&transaction->pending_chunks)) {
-			struct extent_map *em;
-
-			em = list_first_entry(&transaction->pending_chunks,
-					      struct extent_map, list);
-			list_del_init(&em->list);
-			free_extent_map(em);
-		}
 		/*
 		 * If any block groups are found in ->deleted_bgs then it's
 		 * because the transaction was aborted and a commit did not
@@ -74,39 +72,11 @@
 			btrfs_put_block_group_trimming(cache);
 			btrfs_put_block_group(cache);
 		}
+		WARN_ON(!list_empty(&transaction->dev_update_list));
 		kfree(transaction);
 	}
 }
 
-static void clear_btree_io_tree(struct extent_io_tree *tree)
-{
-	spin_lock(&tree->lock);
-	/*
-	 * Do a single barrier for the waitqueue_active check here, the state
-	 * of the waitqueue should not change once clear_btree_io_tree is
-	 * called.
-	 */
-	smp_mb();
-	while (!RB_EMPTY_ROOT(&tree->state)) {
-		struct rb_node *node;
-		struct extent_state *state;
-
-		node = rb_first(&tree->state);
-		state = rb_entry(node, struct extent_state, rb_node);
-		rb_erase(&state->rb_node, &tree->state);
-		RB_CLEAR_NODE(&state->rb_node);
-		/*
-		 * btree io trees aren't supposed to have tasks waiting for
-		 * changes in the flags of extent states ever.
-		 */
-		ASSERT(!waitqueue_active(&state->wq));
-		free_extent_state(state);
-
-		cond_resched_lock(&tree->lock);
-	}
-	spin_unlock(&tree->lock);
-}
-
 static noinline void switch_commit_roots(struct btrfs_transaction *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -118,9 +88,10 @@
 		list_del_init(&root->dirty_list);
 		free_extent_buffer(root->commit_root);
 		root->commit_root = btrfs_root_node(root);
-		if (is_fstree(root->objectid))
+		if (is_fstree(root->root_key.objectid))
 			btrfs_unpin_free_ino(root);
-		clear_btree_io_tree(&root->dirty_log_pages);
+		extent_io_tree_release(&root->dirty_log_pages);
+		btrfs_qgroup_clean_swapped_blocks(root);
 	}
 
 	/* We can free old roots now. */
@@ -163,6 +134,24 @@
 }
 
 /*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+
+	if (!trans->chunk_bytes_reserved)
+		return;
+
+	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+				trans->chunk_bytes_reserved);
+	trans->chunk_bytes_reserved = 0;
+}
+
+/*
  * either allocate a new transaction or hop into the existing one
  */
 static noinline int join_transaction(struct btrfs_fs_info *fs_info,
@@ -232,20 +221,18 @@
 	extwriter_counter_init(cur_trans, type);
 	init_waitqueue_head(&cur_trans->writer_wait);
 	init_waitqueue_head(&cur_trans->commit_wait);
-	init_waitqueue_head(&cur_trans->pending_wait);
 	cur_trans->state = TRANS_STATE_RUNNING;
 	/*
 	 * One for this trans handle, one so it will live on until we
 	 * commit the transaction.
 	 */
 	refcount_set(&cur_trans->use_count, 2);
-	atomic_set(&cur_trans->pending_ordered, 0);
 	cur_trans->flags = 0;
 	cur_trans->start_time = ktime_get_seconds();
 
 	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
-	cur_trans->delayed_refs.href_root = RB_ROOT;
+	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
 	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 
@@ -263,19 +250,18 @@
 	spin_lock_init(&cur_trans->delayed_refs.lock);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-	INIT_LIST_HEAD(&cur_trans->pending_chunks);
+	INIT_LIST_HEAD(&cur_trans->dev_update_list);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
 	INIT_LIST_HEAD(&cur_trans->io_bgs);
 	INIT_LIST_HEAD(&cur_trans->dropped_roots);
 	mutex_init(&cur_trans->cache_write_mutex);
-	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
-	extent_io_tree_init(&cur_trans->dirty_pages,
-			     fs_info->btree_inode);
+	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+			IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
 	fs_info->generation++;
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
@@ -455,7 +441,7 @@
 		  bool enforce_qgroups)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
@@ -484,13 +470,28 @@
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items && root != fs_info->chunk_root) {
+		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
+		u64 delayed_refs_bytes = 0;
+
 		qgroup_reserved = num_items * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
 				enforce_qgroups);
 		if (ret)
 			return ERR_PTR(ret);
 
-		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+		/*
+		 * We want to reserve all the bytes we may need all at once, so
+		 * we only do 1 enospc flushing cycle per transaction start.  We
+		 * accomplish this by simply assuming we'll do 2 x num_items
+		 * worth of delayed refs updates in this trans handle, and
+		 * refill that amount for whatever is missing in the reserve.
+		 */
+		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
+		if (delayed_refs_rsv->full == 0) {
+			delayed_refs_bytes = num_bytes;
+			num_bytes <<= 1;
+		}
+
 		/*
 		 * Do the reservation for the relocation root creation
 		 */
@@ -499,8 +500,24 @@
 			reloc_reserved = true;
 		}
 
-		ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-					  num_bytes, flush);
+		ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+		if (ret)
+			goto reserve_fail;
+		if (delayed_refs_bytes) {
+			btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
+							  delayed_refs_bytes);
+			num_bytes -= delayed_refs_bytes;
+		}
+	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
+		   !delayed_refs_rsv->full) {
+		/*
+		 * Some callers use btrfs_start_transaction(root, 0)
+		 * because they can be throttled, but have some other mechanism
+		 * for reserving space.  We still want these callers to refill
+		 * the delayed block_rsv, so just add one item's worth of
+		 * reservation here.
+		 */
+		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
 		if (ret)
 			goto reserve_fail;
 	}
@@ -531,7 +548,8 @@
 		ret = join_transaction(fs_info, type);
 		if (ret == -EBUSY) {
 			wait_current_trans(fs_info);
-			if (unlikely(type == TRANS_ATTACH))
+			if (unlikely(type == TRANS_ATTACH ||
+				     type == TRANS_JOIN_NOSTART))
 				ret = -ENOENT;
 		}
 	} while (ret == -EBUSY);
@@ -619,7 +637,7 @@
 	if (IS_ERR(trans))
 		return trans;
 
-	num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
 	ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
 				       num_bytes, min_factor);
 	if (ret) {
@@ -648,6 +666,16 @@
 }
 
 /*
+ * Similar to regular join but it never starts a transaction when none is
+ * running or after waiting for the current one to finish.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
+				 BTRFS_RESERVE_NO_FLUSH, true);
+}
+
+/*
  * btrfs_attach_transaction() - catch the running transaction
  *
  * It is used when we want to commit the current the transaction, but
@@ -669,7 +697,7 @@
 /*
  * btrfs_attach_transaction_barrier() - catch the running transaction
  *
- * It is similar to the above function, the differentia is this one
+ * It is similar to the above function, the difference is this one
  * will wait for all the inactive transactions until they fully
  * complete.
  */
@@ -759,7 +787,7 @@
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 
-	if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+	if (btrfs_check_space_for_delayed_refs(fs_info))
 		return 1;
 
 	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -768,22 +796,12 @@
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	int updates;
-	int err;
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
 	    cur_trans->delayed_refs.flushing)
 		return 1;
 
-	updates = trans->delayed_ref_updates;
-	trans->delayed_ref_updates = 0;
-	if (updates) {
-		err = btrfs_run_delayed_refs(trans, updates * 2);
-		if (err) /* Error code will also eval true */
-			return err;
-	}
-
 	return should_end_transaction(trans);
 }
 
@@ -813,11 +831,8 @@
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	u64 transid = trans->transid;
-	unsigned long cur = trans->delayed_ref_updates;
 	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
-	int must_run_delayed_refs = 0;
 
 	if (refcount_read(&trans->use_count) > 1) {
 		refcount_dec(&trans->use_count);
@@ -828,40 +843,10 @@
 	btrfs_trans_release_metadata(trans);
 	trans->block_rsv = NULL;
 
-	if (!list_empty(&trans->new_bgs))
-		btrfs_create_pending_block_groups(trans);
-
-	trans->delayed_ref_updates = 0;
-	if (!trans->sync) {
-		must_run_delayed_refs =
-			btrfs_should_throttle_delayed_refs(trans, info);
-		cur = max_t(unsigned long, cur, 32);
-
-		/*
-		 * don't make the caller wait if they are from a NOLOCK
-		 * or ATTACH transaction, it will deadlock with commit
-		 */
-		if (must_run_delayed_refs == 1 &&
-		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
-			must_run_delayed_refs = 2;
-	}
-
-	btrfs_trans_release_metadata(trans);
-	trans->block_rsv = NULL;
-
-	if (!list_empty(&trans->new_bgs))
-		btrfs_create_pending_block_groups(trans);
+	btrfs_create_pending_block_groups(trans);
 
 	btrfs_trans_release_chunk_metadata(trans);
 
-	if (lock && should_end_transaction(trans) &&
-	    READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
-		spin_lock(&info->trans_lock);
-		if (cur_trans->state == TRANS_STATE_RUNNING)
-			cur_trans->state = TRANS_STATE_BLOCKED;
-		spin_unlock(&info->trans_lock);
-	}
-
 	if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
 		if (throttle)
 			return btrfs_commit_transaction(trans);
@@ -893,10 +878,6 @@
 	}
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-	if (must_run_delayed_refs) {
-		btrfs_async_run_delayed_refs(info, cur, transid,
-					     must_run_delayed_refs == 1);
-	}
 	return err;
 }
 
@@ -944,7 +925,7 @@
 		 * superblock that points to btree nodes/leafs for which
 		 * writeback hasn't finished yet (and without errors).
 		 * We cleanup any entries left in the io tree when committing
-		 * the transaction (through clear_btree_io_tree()).
+		 * the transaction (through extent_io_tree_release()).
 		 */
 		if (err == -ENOMEM) {
 			err = 0;
@@ -989,7 +970,7 @@
 		 * left in the io tree. For a log commit, we don't remove them
 		 * after committing the log because the tree can be accessed
 		 * concurrently - we do it only at transaction commit time when
-		 * it's safe to do it (through clear_btree_io_tree()).
+		 * it's safe to do it (through extent_io_tree_release()).
 		 */
 		err = clear_extent_bit(dirty_pages, start, end,
 				       EXTENT_NEED_WAIT, 0, 0, &cached_state);
@@ -1067,7 +1048,7 @@
 	blk_finish_plug(&plug);
 	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
 
-	clear_btree_io_tree(&trans->transaction->dirty_pages);
+	extent_io_tree_release(&trans->transaction->dirty_pages);
 
 	if (ret)
 		return ret;
@@ -1146,17 +1127,17 @@
 	if (ret)
 		return ret;
 
-	ret = btrfs_run_dev_stats(trans, fs_info);
+	ret = btrfs_run_dev_stats(trans);
 	if (ret)
 		return ret;
-	ret = btrfs_run_dev_replace(trans, fs_info);
+	ret = btrfs_run_dev_replace(trans);
 	if (ret)
 		return ret;
 	ret = btrfs_run_qgroups(trans);
 	if (ret)
 		return ret;
 
-	ret = btrfs_setup_space_cache(trans, fs_info);
+	ret = btrfs_setup_space_cache(trans);
 	if (ret)
 		return ret;
 
@@ -1184,7 +1165,7 @@
 	}
 
 	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
-		ret = btrfs_write_dirty_block_groups(trans, fs_info);
+		ret = btrfs_write_dirty_block_groups(trans);
 		if (ret)
 			return ret;
 		ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
@@ -1197,7 +1178,10 @@
 
 	list_add_tail(&fs_info->extent_root->dirty_list,
 		      &trans->transaction->switch_commits);
-	btrfs_after_dev_replace_commit(fs_info);
+
+	/* Update dev-replace pointer once everything is committed */
+	fs_info->dev_replace.committed_cursor_left =
+		fs_info->dev_replace.cursor_left_last_write_of_item;
 
 	return 0;
 }
@@ -1334,7 +1318,7 @@
 		return 0;
 
 	/*
-	 * Ensure dirty @src will be commited.  Or, after comming
+	 * Ensure dirty @src will be committed.  Otherwise, after the coming
 	 * commit_fs_roots() and switch_commit_roots(), any dirty but not
 	 * recorded root will never be updated again, causing an outdated root
 	 * item.
@@ -1545,7 +1529,7 @@
 		goto fail;
 	}
 
-	btrfs_set_lock_blocking(old);
+	btrfs_set_lock_blocking_write(old);
 
 	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
 	/* clean up in any case */
@@ -1613,10 +1597,9 @@
 	if (ret < 0)
 		goto fail;
 
-	ret = btrfs_insert_dir_item(trans, parent_root,
-				    dentry->d_name.name, dentry->d_name.len,
-				    BTRFS_I(parent_inode), &key,
-				    BTRFS_FT_DIR, index);
+	ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
+				    dentry->d_name.len, BTRFS_I(parent_inode),
+				    &key, BTRFS_FT_DIR, index);
 	/* We have checked the name at the beginning, so it is impossible. */
 	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
 	if (ret) {
@@ -1839,7 +1822,6 @@
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	DEFINE_WAIT(wait);
 
 	WARN_ON(refcount_read(&trans->use_count) > 1);
 
@@ -1886,8 +1868,25 @@
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
 
-static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+/*
+ * Release reserved delayed ref space of all pending block groups of the
+ * transaction and remove them from the list
+ */
+static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_block_group_cache *block_group, *tmp;
+
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		list_del_init(&block_group->bg_list);
+	}
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+
 	/*
 	 * We use writeback_inodes_sb here because if we used
 	 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,22 +1896,50 @@
 	 * from already being in a transaction and our join_transaction doesn't
 	 * have to re-take the fs freeze lock.
 	 */
-	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
 		writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+	} else {
+		struct btrfs_pending_snapshot *pending;
+		struct list_head *head = &trans->transaction->pending_snapshots;
+
+		/*
+		 * Flush delalloc for any root that is going to be snapshotted.
+		 * This is done to avoid a corrupted version of files, in the
+		 * snapshots, that had both buffered and direct IO writes (even
+		 * if they were done sequentially) due to an unordered update of
+		 * the inode's size on disk.
+		 */
+		list_for_each_entry(pending, head, list) {
+			int ret;
+
+			ret = btrfs_start_delalloc_snapshot(pending->root);
+			if (ret)
+				return ret;
+		}
+	}
 	return 0;
 }
 
-static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
 {
-	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
-		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
-}
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 
-static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
-{
-	wait_event(cur_trans->pending_wait,
-		   atomic_read(&cur_trans->pending_ordered) == 0);
+	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+	} else {
+		struct btrfs_pending_snapshot *pending;
+		struct list_head *head = &trans->transaction->pending_snapshots;
+
+		/*
+		 * Wait for any delalloc that we started previously for the roots
+		 * that are going to be snapshotted. This is to avoid a corrupted
+		 * version of files in the snapshots that had both buffered and
+		 * direct IO writes (even if they were done sequentially).
+		 */
+		list_for_each_entry(pending, head, list)
+			btrfs_wait_ordered_extents(pending->root,
+						   U64_MAX, 0, U64_MAX);
+	}
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -1950,8 +1977,7 @@
 	cur_trans->delayed_refs.flushing = 1;
 	smp_wmb();
 
-	if (!list_empty(&trans->new_bgs))
-		btrfs_create_pending_block_groups(trans);
+	btrfs_create_pending_block_groups(trans);
 
 	ret = btrfs_run_delayed_refs(trans, 0);
 	if (ret) {
@@ -2027,11 +2053,21 @@
 		}
 	} else {
 		spin_unlock(&fs_info->trans_lock);
+		/*
+		 * The previous transaction was aborted and was already removed
+		 * from the list of transactions at fs_info->trans_list. So we
+		 * abort to prevent writing a new superblock that reflects a
+		 * corrupt state (pointing to trees with unwritten nodes/leafs).
+		 */
+		if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+			ret = -EROFS;
+			goto cleanup_transaction;
+		}
 	}
 
 	extwriter_counter_dec(cur_trans, trans->type);
 
-	ret = btrfs_start_delalloc_flush(fs_info);
+	ret = btrfs_start_delalloc_flush(trans);
 	if (ret)
 		goto cleanup_transaction;
 
@@ -2047,9 +2083,7 @@
 	if (ret)
 		goto cleanup_transaction;
 
-	btrfs_wait_delalloc_flush(fs_info);
-
-	btrfs_wait_pending_ordered(cur_trans);
+	btrfs_wait_delalloc_flush(trans);
 
 	btrfs_scrub_pause(fs_info);
 	/*
@@ -2214,8 +2248,7 @@
 	memcpy(fs_info->super_for_commit, fs_info->super_copy,
 	       sizeof(*fs_info->super_copy));
 
-	btrfs_update_commit_device_size(fs_info);
-	btrfs_update_commit_device_bytes_used(cur_trans);
+	btrfs_commit_device_sizes(cur_trans);
 
 	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
 	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
@@ -2286,6 +2319,7 @@
 	btrfs_scrub_continue(fs_info);
 cleanup_transaction:
 	btrfs_trans_release_metadata(trans);
+	btrfs_cleanup_pending_block_groups(trans);
 	btrfs_trans_release_chunk_metadata(trans);
 	trans->block_rsv = NULL;
 	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
@@ -2321,7 +2355,7 @@
 	list_del_init(&root->root_list);
 	spin_unlock(&fs_info->trans_lock);
 
-	btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
+	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
 
 	btrfs_kill_all_delayed_nodes(root);
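
The reservation flow added to start_transaction() earlier in this file, restated as a compact sketch (names as in the hunk; error handling elided):

u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
u64 delayed_refs_bytes = 0;

if (delayed_refs_rsv->full == 0) {
	/* Assume 2 x num_items worth of delayed ref updates as well. */
	delayed_refs_bytes = num_bytes;
	num_bytes <<= 1;	/* one ENOSPC flushing cycle covers both */
}

ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
if (ret == 0 && delayed_refs_bytes) {
	btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, delayed_refs_bytes);
	num_bytes -= delayed_refs_bytes;
}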
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cbb1b5..2c5a6f6 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,13 +12,13 @@
 #include "ctree.h"
 
 enum btrfs_trans_state {
-	TRANS_STATE_RUNNING		= 0,
-	TRANS_STATE_BLOCKED		= 1,
-	TRANS_STATE_COMMIT_START	= 2,
-	TRANS_STATE_COMMIT_DOING	= 3,
-	TRANS_STATE_UNBLOCKED		= 4,
-	TRANS_STATE_COMPLETED		= 5,
-	TRANS_STATE_MAX			= 6,
+	TRANS_STATE_RUNNING,
+	TRANS_STATE_BLOCKED,
+	TRANS_STATE_COMMIT_START,
+	TRANS_STATE_COMMIT_DOING,
+	TRANS_STATE_UNBLOCKED,
+	TRANS_STATE_COMPLETED,
+	TRANS_STATE_MAX,
 };
 
 #define BTRFS_TRANS_HAVE_FREE_BGS	0
@@ -39,7 +39,6 @@
 	 */
 	atomic_t num_writers;
 	refcount_t use_count;
-	atomic_t pending_ordered;
 
 	unsigned long flags;
 
@@ -51,9 +50,8 @@
 	time64_t start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
-	wait_queue_head_t pending_wait;
 	struct list_head pending_snapshots;
-	struct list_head pending_chunks;
+	struct list_head dev_update_list;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
 
@@ -82,7 +80,6 @@
 	 */
 	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
-	unsigned int num_dirty_bgs;
 	/* Protected by spin lock fs_info->unused_bgs_lock. */
 	struct list_head deleted_bgs;
 	spinlock_t dropped_roots_lock;
@@ -97,11 +94,13 @@
 #define __TRANS_JOIN		(1U << 11)
 #define __TRANS_JOIN_NOLOCK	(1U << 12)
 #define __TRANS_DUMMY		(1U << 13)
+#define __TRANS_JOIN_NOSTART	(1U << 14)
 
 #define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
 #define TRANS_ATTACH		(__TRANS_ATTACH)
 #define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
 #define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
+#define TRANS_JOIN_NOSTART	(__TRANS_JOIN_NOSTART)
 
 #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
 
@@ -122,7 +121,6 @@
 	bool allocating_chunk;
 	bool can_flush_pending_bgs;
 	bool reloc_reserved;
-	bool sync;
 	bool dirty;
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info;
@@ -187,6 +185,7 @@
 					int min_factor);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
 					struct btrfs_root *root);
@@ -228,5 +227,6 @@
 void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
 void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 
 #endif
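
A hedged usage sketch for the new TRANS_JOIN_NOSTART variant (function and flag per the hunks above; the caller is illustrative): it joins a running transaction but never starts one, and per the start_transaction() change it fails with -ENOENT when none is running.

static void join_only_if_running(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = btrfs_join_transaction_nostart(root);
	if (IS_ERR(trans))
		return;	/* -ENOENT: no transaction running, none started */

	/* ... piggy-back work onto the already-running transaction ... */
	btrfs_end_transaction(trans);
}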
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 3ec712c..076d5b8 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -15,6 +15,9 @@
  * carefully reviewed otherwise so it does not prevent mount of valid images.
  */
 
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/error-injection.h>
 #include "ctree.h"
 #include "tree-checker.h"
 #include "disk-io.h"
@@ -27,10 +30,10 @@
  *
  * @type:	leaf or node
  * @identifier:	the necessary info to locate the leaf/node.
- * 		It's recommened to decode key.objecitd/offset if it's
+ * 		It's recommended to decode key.objectid/offset if it's
  * 		meaningful.
  * @reason:	describe the error
- * @bad_value:	optional, it's recommened to output bad value and its
+ * @bad_value:	optional, it's recommended to output bad value and its
  *		expected value (range).
  *
  * Since comma is used to separate the components, only space is allowed
@@ -41,12 +44,12 @@
  * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
  * Allows callers to customize the output.
  */
-__printf(4, 5)
+__printf(3, 4)
 __cold
-static void generic_err(const struct btrfs_fs_info *fs_info,
-			const struct extent_buffer *eb, int slot,
+static void generic_err(const struct extent_buffer *eb, int slot,
 			const char *fmt, ...)
 {
+	const struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct va_format vaf;
 	va_list args;
 
@@ -66,12 +69,12 @@
  * Customized reporter for extent data item, since its key objectid and
  * offset has its own meaning.
  */
-__printf(4, 5)
+__printf(3, 4)
 __cold
-static void file_extent_err(const struct btrfs_fs_info *fs_info,
-			    const struct extent_buffer *eb, int slot,
+static void file_extent_err(const struct extent_buffer *eb, int slot,
 			    const char *fmt, ...)
 {
+	const struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct btrfs_key key;
 	struct va_format vaf;
 	va_list args;
@@ -94,26 +97,45 @@
  * Return 0 if the btrfs_file_extent_##name is aligned to @alignment
  * Else return 1
  */
-#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment)	      \
+#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment)		      \
 ({									      \
 	if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
-		file_extent_err((fs_info), (leaf), (slot),		      \
+		file_extent_err((leaf), (slot),				      \
 	"invalid %s for file extent, have %llu, should be aligned to %u",     \
 			(#name), btrfs_file_extent_##name((leaf), (fi)),      \
 			(alignment));					      \
 	(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment)));   \
 })
 
-static int check_extent_data_item(struct btrfs_fs_info *fs_info,
-				  struct extent_buffer *leaf,
-				  struct btrfs_key *key, int slot)
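+/*
+ * Compute the exclusive end offset of a file extent.  Inline extents only
+ * record ram_bytes, so their end is rounded up to the sector size, while
+ * regular/prealloc extents end at key offset + num_bytes.
+ */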
+static u64 file_extent_end(struct extent_buffer *leaf,
+			   struct btrfs_key *key,
+			   struct btrfs_file_extent_item *extent)
 {
+	u64 end;
+	u64 len;
+
+	if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) {
+		len = btrfs_file_extent_ram_bytes(leaf, extent);
+		end = ALIGN(key->offset + len, leaf->fs_info->sectorsize);
+	} else {
+		len = btrfs_file_extent_num_bytes(leaf, extent);
+		end = key->offset + len;
+	}
+	return end;
+}
+
+static int check_extent_data_item(struct extent_buffer *leaf,
+				  struct btrfs_key *key, int slot,
+				  struct btrfs_key *prev_key)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_file_extent_item *fi;
 	u32 sectorsize = fs_info->sectorsize;
 	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u64 extent_end;
 
 	if (!IS_ALIGNED(key->offset, sectorsize)) {
-		file_extent_err(fs_info, leaf, slot,
+		file_extent_err(leaf, slot,
 "unaligned file_offset for file extent, have %llu should be aligned to %u",
 			key->offset, sectorsize);
 		return -EUCLEAN;
@@ -122,7 +144,7 @@
 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
 	if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
-		file_extent_err(fs_info, leaf, slot,
+		file_extent_err(leaf, slot,
 		"invalid type for file extent, have %u expect range [0, %u]",
 			btrfs_file_extent_type(leaf, fi),
 			BTRFS_FILE_EXTENT_TYPES);
@@ -130,18 +152,18 @@
 	}
 
 	/*
-	 * Support for new compression/encrption must introduce incompat flag,
+	 * Support for new compression/encryption must introduce incompat flag,
 	 * and must be caught in open_ctree().
 	 */
 	if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
-		file_extent_err(fs_info, leaf, slot,
+		file_extent_err(leaf, slot,
 	"invalid compression for file extent, have %u expect range [0, %u]",
 			btrfs_file_extent_compression(leaf, fi),
 			BTRFS_COMPRESS_TYPES);
 		return -EUCLEAN;
 	}
 	if (btrfs_file_extent_encryption(leaf, fi)) {
-		file_extent_err(fs_info, leaf, slot,
+		file_extent_err(leaf, slot,
 			"invalid encryption for file extent, have %u expect 0",
 			btrfs_file_extent_encryption(leaf, fi));
 		return -EUCLEAN;
@@ -149,7 +171,7 @@
 	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
 		/* Inline extent must have 0 as key offset */
 		if (key->offset) {
-			file_extent_err(fs_info, leaf, slot,
+			file_extent_err(leaf, slot,
 		"invalid file_offset for inline file extent, have %llu expect 0",
 				key->offset);
 			return -EUCLEAN;
@@ -163,7 +185,7 @@
 		/* Uncompressed inline extent size must match item size */
 		if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
 		    btrfs_file_extent_ram_bytes(leaf, fi)) {
-			file_extent_err(fs_info, leaf, slot,
+			file_extent_err(leaf, slot,
 	"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
 				item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
 				btrfs_file_extent_ram_bytes(leaf, fi));
@@ -174,41 +196,73 @@
 
 	/* Regular or preallocated extent has fixed item size */
 	if (item_size != sizeof(*fi)) {
-		file_extent_err(fs_info, leaf, slot,
+		file_extent_err(leaf, slot,
 	"invalid item size for reg/prealloc file extent, have %u expect %zu",
 			item_size, sizeof(*fi));
 		return -EUCLEAN;
 	}
-	if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) ||
-	    CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) ||
-	    CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) ||
-	    CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) ||
-	    CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize))
+	if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+	    CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+	    CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+	    CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+	    CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
 		return -EUCLEAN;
+
+	/* Catch extent end overflow */
+	if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+			       key->offset, &extent_end)) {
+		file_extent_err(leaf, slot,
+	"extent end overflow, have file offset %llu extent num bytes %llu",
+				key->offset,
+				btrfs_file_extent_num_bytes(leaf, fi));
+		return -EUCLEAN;
+	}
+
+	/*
+	 * Check that no two consecutive file extent items, in the same leaf,
+	 * present ranges that overlap each other.
+	 */
+	if (slot > 0 &&
+	    prev_key->objectid == key->objectid &&
+	    prev_key->type == BTRFS_EXTENT_DATA_KEY) {
+		struct btrfs_file_extent_item *prev_fi;
+		u64 prev_end;
+
+		prev_fi = btrfs_item_ptr(leaf, slot - 1,
+					 struct btrfs_file_extent_item);
+		prev_end = file_extent_end(leaf, prev_key, prev_fi);
+		if (prev_end > key->offset) {
+			file_extent_err(leaf, slot - 1,
+"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
+					prev_end, key->offset);
+			return -EUCLEAN;
+		}
+	}
+
 	return 0;
 }
 
-static int check_csum_item(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *leaf, struct btrfs_key *key,
+static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
 			   int slot)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	u32 sectorsize = fs_info->sectorsize;
 	u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
 
 	if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
-		generic_err(fs_info, leaf, slot,
+		generic_err(leaf, slot,
 		"invalid key objectid for csum item, have %llu expect %llu",
 			key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
 		return -EUCLEAN;
 	}
 	if (!IS_ALIGNED(key->offset, sectorsize)) {
-		generic_err(fs_info, leaf, slot,
+		generic_err(leaf, slot,
 	"unaligned key offset for csum item, have %llu should be aligned to %u",
 			key->offset, sectorsize);
 		return -EUCLEAN;
 	}
 	if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
-		generic_err(fs_info, leaf, slot,
+		generic_err(leaf, slot,
 	"unaligned item size for csum item, have %u should be aligned to %u",
 			btrfs_item_size_nr(leaf, slot), csumsize);
 		return -EUCLEAN;
@@ -220,12 +274,12 @@
  * Customized reported for dir_item, only important new info is key->objectid,
  * which represents inode number
  */
-__printf(4, 5)
+__printf(3, 4)
 __cold
-static void dir_item_err(const struct btrfs_fs_info *fs_info,
-			 const struct extent_buffer *eb, int slot,
+static void dir_item_err(const struct extent_buffer *eb, int slot,
 			 const char *fmt, ...)
 {
+	const struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct btrfs_key key;
 	struct va_format vaf;
 	va_list args;
@@ -244,10 +298,10 @@
 	va_end(args);
 }
 
-static int check_dir_item(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *leaf,
+static int check_dir_item(struct extent_buffer *leaf,
 			  struct btrfs_key *key, int slot)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_dir_item *di;
 	u32 item_size = btrfs_item_size_nr(leaf, slot);
 	u32 cur = 0;
@@ -263,7 +317,7 @@
 
 		/* header itself should not cross item boundary */
 		if (cur + sizeof(*di) > item_size) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 		"dir item header crosses item boundary, have %zu boundary %u",
 				cur + sizeof(*di), item_size);
 			return -EUCLEAN;
@@ -272,7 +326,7 @@
 		/* dir type check */
 		dir_type = btrfs_dir_type(leaf, di);
 		if (dir_type >= BTRFS_FT_MAX) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 			"invalid dir item type, have %u expect [0, %u)",
 				dir_type, BTRFS_FT_MAX);
 			return -EUCLEAN;
@@ -280,14 +334,14 @@
 
 		if (key->type == BTRFS_XATTR_ITEM_KEY &&
 		    dir_type != BTRFS_FT_XATTR) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 		"invalid dir item type for XATTR key, have %u expect %u",
 				dir_type, BTRFS_FT_XATTR);
 			return -EUCLEAN;
 		}
 		if (dir_type == BTRFS_FT_XATTR &&
 		    key->type != BTRFS_XATTR_ITEM_KEY) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 			"xattr dir type found for non-XATTR key");
 			return -EUCLEAN;
 		}
@@ -300,13 +354,13 @@
 		name_len = btrfs_dir_name_len(leaf, di);
 		data_len = btrfs_dir_data_len(leaf, di);
 		if (name_len > max_name_len) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 			"dir item name len too long, have %u max %u",
 				name_len, max_name_len);
 			return -EUCLEAN;
 		}
 		if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 			"dir item name and data len too long, have %u max %u",
 				name_len + data_len,
 				BTRFS_MAX_XATTR_SIZE(fs_info));
@@ -314,7 +368,7 @@
 		}
 
 		if (data_len && dir_type != BTRFS_FT_XATTR) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 			"dir item with invalid data len, have %u expect 0",
 				data_len);
 			return -EUCLEAN;
@@ -324,7 +378,7 @@
 
 		/* header and name/data should not cross item boundary */
 		if (cur + total_size > item_size) {
-			dir_item_err(fs_info, leaf, slot,
+			dir_item_err(leaf, slot,
 		"dir item data crosses item boundary, have %u boundary %u",
 				cur + total_size, item_size);
 			return -EUCLEAN;
@@ -342,7 +396,7 @@
 					(unsigned long)(di + 1), name_len);
 			name_hash = btrfs_name_hash(namebuf, name_len);
 			if (key->offset != name_hash) {
-				dir_item_err(fs_info, leaf, slot,
+				dir_item_err(leaf, slot,
 		"name hash mismatch with key, have 0x%016x expect 0x%016llx",
 					name_hash, key->offset);
 				return -EUCLEAN;
@@ -354,12 +408,12 @@
 	return 0;
 }
 
-__printf(4, 5)
+__printf(3, 4)
 __cold
-static void block_group_err(const struct btrfs_fs_info *fs_info,
-			    const struct extent_buffer *eb, int slot,
+static void block_group_err(const struct extent_buffer *eb, int slot,
 			    const char *fmt, ...)
 {
+	const struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct btrfs_key key;
 	struct va_format vaf;
 	va_list args;
@@ -378,8 +432,7 @@
 	va_end(args);
 }
 
-static int check_block_group_item(struct btrfs_fs_info *fs_info,
-				  struct extent_buffer *leaf,
+static int check_block_group_item(struct extent_buffer *leaf,
 				  struct btrfs_key *key, int slot)
 {
 	struct btrfs_block_group_item bgi;
@@ -392,13 +445,13 @@
 	 * handle it.  We care more about the size.
 	 */
 	if (key->offset == 0) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 				"invalid block group size 0");
 		return -EUCLEAN;
 	}
 
 	if (item_size != sizeof(bgi)) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 			"invalid item size, have %u expect %zu",
 				item_size, sizeof(bgi));
 		return -EUCLEAN;
@@ -408,7 +461,7 @@
 			   sizeof(bgi));
 	if (btrfs_block_group_chunk_objectid(&bgi) !=
 	    BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 		"invalid block group chunk objectid, have %llu expect %llu",
 				btrfs_block_group_chunk_objectid(&bgi),
 				BTRFS_FIRST_CHUNK_TREE_OBJECTID);
@@ -416,7 +469,7 @@
 	}
 
 	if (btrfs_block_group_used(&bgi) > key->offset) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 			"invalid block group used, have %llu expect [0, %llu)",
 				btrfs_block_group_used(&bgi), key->offset);
 		return -EUCLEAN;
@@ -424,7 +477,7 @@
 
 	flags = btrfs_block_group_flags(&bgi);
 	if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
 			flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
 			hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
@@ -437,7 +490,7 @@
 	    type != BTRFS_BLOCK_GROUP_SYSTEM &&
 	    type != (BTRFS_BLOCK_GROUP_METADATA |
 			   BTRFS_BLOCK_GROUP_DATA)) {
-		block_group_err(fs_info, leaf, slot,
+		block_group_err(leaf, slot,
 "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
 			type, hweight64(type),
 			BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
@@ -448,43 +501,799 @@
 	return 0;
 }
 
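+/*
+ * Error reporter for chunk items.  A chunk can live either in the chunk
+ * tree or in the superblock sys chunk array, so the slot has to be looked
+ * up by scanning the leaf (and is omitted for the superblock copy).
+ */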
+__printf(4, 5)
+__cold
+static void chunk_err(const struct extent_buffer *leaf,
+		      const struct btrfs_chunk *chunk, u64 logical,
+		      const char *fmt, ...)
+{
+	const struct btrfs_fs_info *fs_info = leaf->fs_info;
+	bool is_sb;
+	struct va_format vaf;
+	va_list args;
+	int i;
+	int slot = -1;
+
+	/* Only the superblock eb can have such a small offset */
+	is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
+
+	if (!is_sb) {
+		/*
+		 * Look up the slot number by iterating through all slots, so
+		 * the error message can include it.
+		 */
+		for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+			if (btrfs_item_ptr_offset(leaf, i) ==
+					(unsigned long)chunk) {
+				slot = i;
+				break;
+			}
+		}
+	}
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (is_sb)
+		btrfs_crit(fs_info,
+		"corrupt superblock syschunk array: chunk_start=%llu, %pV",
+			   logical, &vaf);
+	else
+		btrfs_crit(fs_info,
+	"corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV",
+			   BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot,
+			   logical, &vaf);
+	va_end(args);
+}
+
+/*
+ * The common chunk check, which also works on the superblock sys chunk array.
+ *
+ * Return -EUCLEAN if anything is corrupted.
+ * Return 0 if everything is OK.
+ */
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+			    struct btrfs_chunk *chunk, u64 logical)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	u64 length;
+	u64 stripe_len;
+	u16 num_stripes;
+	u16 sub_stripes;
+	u64 type;
+	u64 features;
+	bool mixed = false;
+
+	length = btrfs_chunk_length(leaf, chunk);
+	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+	type = btrfs_chunk_type(leaf, chunk);
+
+	if (!num_stripes) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk num_stripes, have %u", num_stripes);
+		return -EUCLEAN;
+	}
+	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+		chunk_err(leaf, chunk, logical,
+		"invalid chunk logical, have %llu should be aligned to %u",
+			  logical, fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk sectorsize, have %u expect %u",
+			  btrfs_chunk_sector_size(leaf, chunk),
+			  fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk length, have %llu", length);
+		return -EUCLEAN;
+	}
+	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk stripe length: %llu",
+			  stripe_len);
+		return -EUCLEAN;
+	}
+	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+	    type) {
+		chunk_err(leaf, chunk, logical,
+			  "unrecognized chunk type: 0x%llx",
+			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+			  btrfs_chunk_type(leaf, chunk));
+		return -EUCLEAN;
+	}
+
+	if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+	    (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
+		chunk_err(leaf, chunk, logical,
+		"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
+			  type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		return -EUCLEAN;
+	}
+	if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+		chunk_err(leaf, chunk, logical,
+	"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
+			  type, BTRFS_BLOCK_GROUP_TYPE_MASK);
+		return -EUCLEAN;
+	}
+
+	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+	    (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+		chunk_err(leaf, chunk, logical,
+			  "system chunk with data or metadata type: 0x%llx",
+			  type);
+		return -EUCLEAN;
+	}
+
+	features = btrfs_super_incompat_flags(fs_info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = true;
+
+	if (!mixed) {
+		if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+		    (type & BTRFS_BLOCK_GROUP_DATA)) {
+			chunk_err(leaf, chunk, logical,
+			"mixed chunk type in non-mixed mode: 0x%llx", type);
+			return -EUCLEAN;
+		}
+	}
+
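+	/*
+	 * Check num_stripes/sub_stripes against what each profile requires,
+	 * e.g. a RAID10 chunk must have sub_stripes == 2, RAID1/DUP exactly
+	 * 2 stripes, RAID5 at least 2, RAID6 at least 3 and a SINGLE chunk
+	 * (no profile bit set) exactly 1 stripe.
+	 */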
+	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+		chunk_err(leaf, chunk, logical,
+			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
+			num_stripes, sub_stripes,
+			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		return -EUCLEAN;
+	}
+
+	return 0;
+}
+
+__printf(3, 4)
+__cold
+static void dev_item_err(const struct extent_buffer *eb, int slot,
+			 const char *fmt, ...)
+{
+	struct btrfs_key key;
+	struct va_format vaf;
+	va_list args;
+
+	btrfs_item_key_to_cpu(eb, &key, slot);
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	btrfs_crit(eb->fs_info,
+	"corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
+		btrfs_header_level(eb) == 0 ? "leaf" : "node",
+		btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+		key.objectid, &vaf);
+	va_end(args);
+}
+
+static int check_dev_item(struct extent_buffer *leaf,
+			  struct btrfs_key *key, int slot)
+{
+	struct btrfs_dev_item *ditem;
+
+	if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+		dev_item_err(leaf, slot,
+			     "invalid objectid: has=%llu expect=%llu",
+			     key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
+		return -EUCLEAN;
+	}
+	ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
+	if (btrfs_device_id(leaf, ditem) != key->offset) {
+		dev_item_err(leaf, slot,
+			     "devid mismatch: key has=%llu item has=%llu",
+			     key->offset, btrfs_device_id(leaf, ditem));
+		return -EUCLEAN;
+	}
+
+	/*
+	 * For device total_bytes, we don't have a reliable way to check it, as
+	 * it can be 0 for device removal. Device size can only be checked
+	 * against the dev extents.
+	 */
+	if (btrfs_device_bytes_used(leaf, ditem) >
+	    btrfs_device_total_bytes(leaf, ditem)) {
+		dev_item_err(leaf, slot,
+			     "invalid bytes used: have %llu expect [0, %llu]",
+			     btrfs_device_bytes_used(leaf, ditem),
+			     btrfs_device_total_bytes(leaf, ditem));
+		return -EUCLEAN;
+	}
+	/*
+	 * Remaining members like io_align/type/gen/dev_group aren't really
+	 * utilized.  Skip them to make later usage of them easier.
+	 */
+	return 0;
+}
+
+/* Inode item error output has the same format as dir_item_err() */
+#define inode_item_err(fs_info, eb, slot, fmt, ...)			\
+	dir_item_err(eb, slot, fmt, __VA_ARGS__)
+
+static int check_inode_item(struct extent_buffer *leaf,
+			    struct btrfs_key *key, int slot)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	struct btrfs_inode_item *iitem;
+	u64 super_gen = btrfs_super_generation(fs_info->super_copy);
+	u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+	u32 mode;
+
+	if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	     key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+	    key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+	    key->objectid != BTRFS_FREE_INO_OBJECTID) {
+		generic_err(leaf, slot,
+	"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+			    key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+			    BTRFS_FIRST_FREE_OBJECTID,
+			    BTRFS_LAST_FREE_OBJECTID,
+			    BTRFS_FREE_INO_OBJECTID);
+		return -EUCLEAN;
+	}
+	if (key->offset != 0) {
+		inode_item_err(fs_info, leaf, slot,
+			"invalid key offset: has %llu expect 0",
+			key->offset);
+		return -EUCLEAN;
+	}
+	iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
+
+	/* Here we use super block generation + 1 to handle log tree */
+	if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
+		inode_item_err(fs_info, leaf, slot,
+			"invalid inode generation: has %llu expect (0, %llu]",
+			       btrfs_inode_generation(leaf, iitem),
+			       super_gen + 1);
+		return -EUCLEAN;
+	}
+	/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
+	if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+		inode_item_err(fs_info, leaf, slot,
+			"invalid inode transid: has %llu expect [0, %llu]",
+			       btrfs_inode_transid(leaf, iitem), super_gen + 1);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * For size and nbytes it's better not to be too strict, as for a dir
+	 * item its size/nbytes can easily be wrong without affecting anything
+	 * in the fs, so we skip the check here.
+	 */
+	mode = btrfs_inode_mode(leaf, iitem);
+	if (mode & ~valid_mask) {
+		inode_item_err(fs_info, leaf, slot,
+			       "unknown mode bit detected: 0x%x",
+			       mode & ~valid_mask);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * S_IFMT is not bit mapped, so we can't completely rely on
+	 * is_power_of_2(), but it saves us from checking FIFO/CHR/DIR/REG;
+	 * only BLK, LNK and SOCK need an explicit check.
+	 */
+	if (!is_power_of_2(mode & S_IFMT)) {
+		if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
+			inode_item_err(fs_info, leaf, slot,
+			"invalid mode: has 0%o expect valid S_IF* bit(s)",
+				       mode & S_IFMT);
+			return -EUCLEAN;
+		}
+	}
+	if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
+		inode_item_err(fs_info, leaf, slot,
+		       "invalid nlink: has %u expect no more than 1 for dir",
+			btrfs_inode_nlink(leaf, iitem));
+		return -EUCLEAN;
+	}
+	if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
+		inode_item_err(fs_info, leaf, slot,
+			       "unknown flags detected: 0x%llx",
+			       btrfs_inode_flags(leaf, iitem) &
+			       ~BTRFS_INODE_FLAG_MASK);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
+static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
+			   int slot)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	struct btrfs_root_item ri;
+	const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
+				     BTRFS_ROOT_SUBVOL_DEAD;
+
+	/* No such tree id */
+	if (key->objectid == 0) {
+		generic_err(leaf, slot, "invalid root id 0");
+		return -EUCLEAN;
+	}
+
+	/*
+	 * Some older kernels may create ROOT_ITEM with non-zero offset, so here
+	 * we only check offset for reloc tree whose key->offset must be a
+	 * valid tree.
+	 */
+	if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+		generic_err(leaf, slot, "invalid root id 0 for reloc tree");
+		return -EUCLEAN;
+	}
+
+	if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+		generic_err(leaf, slot,
+			    "invalid root item size, have %u expect %zu",
+			    btrfs_item_size_nr(leaf, slot), sizeof(ri));
+		return -EUCLEAN;
+	}
+
+	read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+			   sizeof(ri));
+
+	/* Generation related */
+	if (btrfs_root_generation(&ri) >
+	    btrfs_super_generation(fs_info->super_copy) + 1) {
+		generic_err(leaf, slot,
+			"invalid root generation, have %llu expect (0, %llu]",
+			    btrfs_root_generation(&ri),
+			    btrfs_super_generation(fs_info->super_copy) + 1);
+		return -EUCLEAN;
+	}
+	if (btrfs_root_generation_v2(&ri) >
+	    btrfs_super_generation(fs_info->super_copy) + 1) {
+		generic_err(leaf, slot,
+		"invalid root v2 generation, have %llu expect (0, %llu]",
+			    btrfs_root_generation_v2(&ri),
+			    btrfs_super_generation(fs_info->super_copy) + 1);
+		return -EUCLEAN;
+	}
+	if (btrfs_root_last_snapshot(&ri) >
+	    btrfs_super_generation(fs_info->super_copy) + 1) {
+		generic_err(leaf, slot,
+		"invalid root last_snapshot, have %llu expect (0, %llu]",
+			    btrfs_root_last_snapshot(&ri),
+			    btrfs_super_generation(fs_info->super_copy) + 1);
+		return -EUCLEAN;
+	}
+
+	/* Alignment and level check */
+	if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) {
+		generic_err(leaf, slot,
+		"invalid root bytenr, have %llu expect to be aligned to %u",
+			    btrfs_root_bytenr(&ri), fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) {
+		generic_err(leaf, slot,
+			    "invalid root level, have %u expect [0, %u]",
+			    btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
+	if (ri.drop_level >= BTRFS_MAX_LEVEL) {
+		generic_err(leaf, slot,
+			    "invalid root drop_level, have %u expect [0, %u]",
+			    ri.drop_level, BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
+
+	/* Flags check */
+	if (btrfs_root_flags(&ri) & ~valid_root_flags) {
+		generic_err(leaf, slot,
+			    "invalid root flags, have 0x%llx expect mask 0x%llx",
+			    btrfs_root_flags(&ri), valid_root_flags);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
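+/*
+ * Error reporter for extent items.  Metadata backref keys do not encode a
+ * length, so nodesize is reported for them; for everything else key.offset
+ * is the extent length.
+ */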
+__printf(3, 4)
+__cold
+static void extent_err(const struct extent_buffer *eb, int slot,
+		       const char *fmt, ...)
+{
+	struct btrfs_key key;
+	struct va_format vaf;
+	va_list args;
+	u64 bytenr;
+	u64 len;
+
+	btrfs_item_key_to_cpu(eb, &key, slot);
+	bytenr = key.objectid;
+	if (key.type == BTRFS_METADATA_ITEM_KEY ||
+	    key.type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    key.type == BTRFS_SHARED_BLOCK_REF_KEY)
+		len = eb->fs_info->nodesize;
+	else
+		len = key.offset;
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	btrfs_crit(eb->fs_info,
+	"corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
+		btrfs_header_level(eb) == 0 ? "leaf" : "node",
+		eb->start, slot, bytenr, len, &vaf);
+	va_end(args);
+}
+
+static int check_extent_item(struct extent_buffer *leaf,
+			     struct btrfs_key *key, int slot)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	struct btrfs_extent_item *ei;
+	bool is_tree_block = false;
+	unsigned long ptr;	/* Current pointer inside inline refs */
+	unsigned long end;	/* Extent item end */
+	const u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u64 flags;
+	u64 generation;
+	u64 total_refs;		/* Total refs in btrfs_extent_item */
+	u64 inline_refs = 0;	/* found total inline refs */
+
+	if (key->type == BTRFS_METADATA_ITEM_KEY &&
+	    !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+		generic_err(leaf, slot,
+"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
+		return -EUCLEAN;
+	}
+	/* key->objectid is the bytenr for both key types */
+	if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) {
+		generic_err(leaf, slot,
+		"invalid key objectid, have %llu expect to be aligned to %u",
+			   key->objectid, fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+
+	/* key->offset is tree level for METADATA_ITEM_KEY */
+	if (key->type == BTRFS_METADATA_ITEM_KEY &&
+	    key->offset >= BTRFS_MAX_LEVEL) {
+		extent_err(leaf, slot,
+			   "invalid tree level, have %llu expect [0, %u]",
+			   key->offset, BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * EXTENT/METADATA_ITEM consists of:
+	 * 1) One btrfs_extent_item
+	 *    Records the total refs, type and generation of the extent.
+	 *
+	 * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only)
+	 *    Records the first key and level of the tree block.
+	 *
+	 * 3) Zero or more btrfs_extent_inline_ref(s)
+	 *    Each inline ref starts with one btrfs_extent_inline_ref, which
+	 *    shows:
+	 *    3.1) The ref type, one of the 4:
+	 *         TREE_BLOCK_REF	Tree block only
+	 *         SHARED_BLOCK_REF	Tree block only
+	 *         EXTENT_DATA_REF	Data only
+	 *         SHARED_DATA_REF	Data only
+	 *    3.2) Ref type specific data
+	 *         Either using btrfs_extent_inline_ref::offset, or a specific
+	 *         data structure.
+	 */
+	if (item_size < sizeof(*ei)) {
+		extent_err(leaf, slot,
+			   "invalid item size, have %u expect [%zu, %u)",
+			   item_size, sizeof(*ei),
+			   BTRFS_LEAF_DATA_SIZE(fs_info));
+		return -EUCLEAN;
+	}
+	end = item_size + btrfs_item_ptr_offset(leaf, slot);
+
+	/* Checks against extent_item */
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+	total_refs = btrfs_extent_refs(leaf, ei);
+	generation = btrfs_extent_generation(leaf, ei);
+	if (generation > btrfs_super_generation(fs_info->super_copy) + 1) {
+		extent_err(leaf, slot,
+			   "invalid generation, have %llu expect (0, %llu]",
+			   generation,
+			   btrfs_super_generation(fs_info->super_copy) + 1);
+		return -EUCLEAN;
+	}
+	if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA |
+				    BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+		extent_err(leaf, slot,
+		"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
+			flags, BTRFS_EXTENT_FLAG_DATA |
+			BTRFS_EXTENT_FLAG_TREE_BLOCK);
+		return -EUCLEAN;
+	}
+	is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
+	if (is_tree_block) {
+		if (key->type == BTRFS_EXTENT_ITEM_KEY &&
+		    key->offset != fs_info->nodesize) {
+			extent_err(leaf, slot,
+				   "invalid extent length, have %llu expect %u",
+				   key->offset, fs_info->nodesize);
+			return -EUCLEAN;
+		}
+	} else {
+		if (key->type != BTRFS_EXTENT_ITEM_KEY) {
+			extent_err(leaf, slot,
+			"invalid key type, have %u expect %u for data backref",
+				   key->type, BTRFS_EXTENT_ITEM_KEY);
+			return -EUCLEAN;
+		}
+		if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) {
+			extent_err(leaf, slot,
+			"invalid extent length, have %llu expect aligned to %u",
+				   key->offset, fs_info->sectorsize);
+			return -EUCLEAN;
+		}
+	}
+	ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
+
+	/* Check the special case of btrfs_tree_block_info */
+	if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) {
+		struct btrfs_tree_block_info *info;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) {
+			extent_err(leaf, slot,
+			"invalid tree block info level, have %u expect [0, %u]",
+				   btrfs_tree_block_level(leaf, info),
+				   BTRFS_MAX_LEVEL - 1);
+			return -EUCLEAN;
+		}
+		ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1);
+	}
+
+	/* Check inline refs */
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		struct btrfs_extent_data_ref *dref;
+		struct btrfs_shared_data_ref *sref;
+		u64 dref_offset;
+		u64 inline_offset;
+		u8 inline_type;
+
+		if (ptr + sizeof(*iref) > end) {
+			extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
+				   ptr, sizeof(*iref), end);
+			return -EUCLEAN;
+		}
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		inline_type = btrfs_extent_inline_ref_type(leaf, iref);
+		inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+		if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
+			extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
+				   ptr, btrfs_extent_inline_ref_size(inline_type),
+				   end);
+			return -EUCLEAN;
+		}
+
+		switch (inline_type) {
+		/* inline_offset is the subvolid of the owner, no need to check */
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			inline_refs++;
+			break;
+		/* Contains parent bytenr */
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+				extent_err(leaf, slot,
+		"invalid tree parent bytenr, have %llu expect aligned to %u",
+					   inline_offset, fs_info->sectorsize);
+				return -EUCLEAN;
+			}
+			inline_refs++;
+			break;
+		/*
+		 * Contains owner subvolid, owner key objectid, adjusted offset.
+		 * The only obvious corruption we can detect is in that offset.
+		 */
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+			if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) {
+				extent_err(leaf, slot,
+		"invalid data ref offset, have %llu expect aligned to %u",
+					   dref_offset, fs_info->sectorsize);
+				return -EUCLEAN;
+			}
+			inline_refs += btrfs_extent_data_ref_count(leaf, dref);
+			break;
+		/* Contains parent bytenr and ref count */
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = (struct btrfs_shared_data_ref *)(iref + 1);
+			if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+				extent_err(leaf, slot,
+		"invalid data parent bytenr, have %llu expect aligned to %u",
+					   inline_offset, fs_info->sectorsize);
+				return -EUCLEAN;
+			}
+			inline_refs += btrfs_shared_data_ref_count(leaf, sref);
+			break;
+		default:
+			extent_err(leaf, slot, "unknown inline ref type: %u",
+				   inline_type);
+			return -EUCLEAN;
+		}
+		ptr += btrfs_extent_inline_ref_size(inline_type);
+	}
+	/* No padding is allowed */
+	if (ptr != end) {
+		extent_err(leaf, slot,
+			   "invalid extent item size, padding bytes found");
+		return -EUCLEAN;
+	}
+
+	/* Finally, check the inline refs against total refs */
+	if (inline_refs > total_refs) {
+		extent_err(leaf, slot,
+			"invalid extent refs, have %llu expect >= inline %llu",
+			   total_refs, inline_refs);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
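+/*
+ * TREE_BLOCK_REF, SHARED_BLOCK_REF and SHARED_DATA_REF carry all of their
+ * reference information in the key itself, so only the item size and the
+ * key alignment are checked here (SHARED_DATA_REF is the only one with a
+ * payload, a single ref count).
+ */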
+static int check_simple_keyed_refs(struct extent_buffer *leaf,
+				   struct btrfs_key *key, int slot)
+{
+	u32 expect_item_size = 0;
+
+	if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+		expect_item_size = sizeof(struct btrfs_shared_data_ref);
+
+	if (btrfs_item_size_nr(leaf, slot) != expect_item_size) {
+		generic_err(leaf, slot,
+		"invalid item size, have %u expect %u for key type %u",
+			    btrfs_item_size_nr(leaf, slot),
+			    expect_item_size, key->type);
+		return -EUCLEAN;
+	}
+	if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+		generic_err(leaf, slot,
+"invalid key objectid for shared block ref, have %llu expect aligned to %u",
+			    key->objectid, leaf->fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	if (key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+	    !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) {
+		extent_err(leaf, slot,
+		"invalid tree parent bytenr, have %llu expect aligned to %u",
+			   key->offset, leaf->fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
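+/*
+ * A keyed EXTENT_DATA_REF item packs one or more btrfs_extent_data_ref
+ * entries, and key->offset is the hash of (root, objectid, offset), so
+ * every entry in the item must hash back to key->offset.
+ */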
+static int check_extent_data_ref(struct extent_buffer *leaf,
+				 struct btrfs_key *key, int slot)
+{
+	struct btrfs_extent_data_ref *dref;
+	unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+	const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+
+	if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) {
+		generic_err(leaf, slot,
+	"invalid item size, have %u expect aligned to %zu for key type %u",
+			    btrfs_item_size_nr(leaf, slot),
+			    sizeof(*dref), key->type);
+		return -EUCLEAN;
+	}
+	if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+		generic_err(leaf, slot,
+"invalid key objectid for extent data ref, have %llu expect aligned to %u",
+			    key->objectid, leaf->fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+	for (; ptr < end; ptr += sizeof(*dref)) {
+		u64 root_objectid;
+		u64 owner;
+		u64 offset;
+		u64 hash;
+
+		dref = (struct btrfs_extent_data_ref *)ptr;
+		root_objectid = btrfs_extent_data_ref_root(leaf, dref);
+		owner = btrfs_extent_data_ref_objectid(leaf, dref);
+		offset = btrfs_extent_data_ref_offset(leaf, dref);
+		hash = hash_extent_data_ref(root_objectid, owner, offset);
+		if (hash != key->offset) {
+			extent_err(leaf, slot,
+	"invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
+				   hash, key->offset);
+			return -EUCLEAN;
+		}
+		if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
+			extent_err(leaf, slot,
+	"invalid extent data backref offset, have %llu expect aligned to %u",
+				   offset, leaf->fs_info->sectorsize);
+			return -EUCLEAN;
+		}
+	}
+	return 0;
+}
+
 /*
  * Common point to switch the item-specific validation.
  */
-static int check_leaf_item(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *leaf,
-			   struct btrfs_key *key, int slot)
+static int check_leaf_item(struct extent_buffer *leaf,
+			   struct btrfs_key *key, int slot,
+			   struct btrfs_key *prev_key)
 {
 	int ret = 0;
+	struct btrfs_chunk *chunk;
 
 	switch (key->type) {
 	case BTRFS_EXTENT_DATA_KEY:
-		ret = check_extent_data_item(fs_info, leaf, key, slot);
+		ret = check_extent_data_item(leaf, key, slot, prev_key);
 		break;
 	case BTRFS_EXTENT_CSUM_KEY:
-		ret = check_csum_item(fs_info, leaf, key, slot);
+		ret = check_csum_item(leaf, key, slot);
 		break;
 	case BTRFS_DIR_ITEM_KEY:
 	case BTRFS_DIR_INDEX_KEY:
 	case BTRFS_XATTR_ITEM_KEY:
-		ret = check_dir_item(fs_info, leaf, key, slot);
+		ret = check_dir_item(leaf, key, slot);
 		break;
 	case BTRFS_BLOCK_GROUP_ITEM_KEY:
-		ret = check_block_group_item(fs_info, leaf, key, slot);
+		ret = check_block_group_item(leaf, key, slot);
+		break;
+	case BTRFS_CHUNK_ITEM_KEY:
+		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+		ret = btrfs_check_chunk_valid(leaf, chunk, key->offset);
+		break;
+	case BTRFS_DEV_ITEM_KEY:
+		ret = check_dev_item(leaf, key, slot);
+		break;
+	case BTRFS_INODE_ITEM_KEY:
+		ret = check_inode_item(leaf, key, slot);
+		break;
+	case BTRFS_ROOT_ITEM_KEY:
+		ret = check_root_item(leaf, key, slot);
+		break;
+	case BTRFS_EXTENT_ITEM_KEY:
+	case BTRFS_METADATA_ITEM_KEY:
+		ret = check_extent_item(leaf, key, slot);
+		break;
+	case BTRFS_TREE_BLOCK_REF_KEY:
+	case BTRFS_SHARED_DATA_REF_KEY:
+	case BTRFS_SHARED_BLOCK_REF_KEY:
+		ret = check_simple_keyed_refs(leaf, key, slot);
+		break;
+	case BTRFS_EXTENT_DATA_REF_KEY:
+		ret = check_extent_data_ref(leaf, key, slot);
 		break;
 	}
 	return ret;
 }
 
-static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
-		      bool check_item_data)
+static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	/* No valid key type is 0, so all key should be larger than this key */
 	struct btrfs_key prev_key = {0, 0, 0};
 	struct btrfs_key key;
 	u32 nritems = btrfs_header_nritems(leaf);
 	int slot;
 
+	if (btrfs_header_level(leaf) != 0) {
+		generic_err(leaf, 0,
+			"invalid level for leaf, have %d expect 0",
+			btrfs_header_level(leaf));
+		return -EUCLEAN;
+	}
+
 	/*
 	 * Extent buffers from a relocation tree have a owner field that
 	 * corresponds to the subvolume tree they are based on. So just from an
@@ -495,7 +1304,6 @@
 	 */
 	if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
 		u64 owner = btrfs_header_owner(leaf);
-		struct btrfs_root *check_root;
 
 		/* These trees must never be empty */
 		if (owner == BTRFS_ROOT_TREE_OBJECTID ||
@@ -504,33 +1312,16 @@
 		    owner == BTRFS_DEV_TREE_OBJECTID ||
 		    owner == BTRFS_FS_TREE_OBJECTID ||
 		    owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
-			generic_err(fs_info, leaf, 0,
+			generic_err(leaf, 0,
 			"invalid root, root %llu must never be empty",
 				    owner);
 			return -EUCLEAN;
 		}
-		key.objectid = owner;
-		key.type = BTRFS_ROOT_ITEM_KEY;
-		key.offset = (u64)-1;
-
-		check_root = btrfs_get_fs_root(fs_info, &key, false);
-		/*
-		 * The only reason we also check NULL here is that during
-		 * open_ctree() some roots has not yet been set up.
-		 */
-		if (!IS_ERR_OR_NULL(check_root)) {
-			struct extent_buffer *eb;
-
-			eb = btrfs_root_node(check_root);
-			/* if leaf is the root, then it's fine */
-			if (leaf != eb) {
-				generic_err(fs_info, leaf, 0,
-		"invalid nritems, have %u should not be 0 for non-root leaf",
-					nritems);
-				free_extent_buffer(eb);
-				return -EUCLEAN;
-			}
-			free_extent_buffer(eb);
+		/* Unknown tree */
+		if (owner == 0) {
+			generic_err(leaf, 0,
+				"invalid owner, root 0 is not defined");
+			return -EUCLEAN;
 		}
 		return 0;
 	}
@@ -557,7 +1348,7 @@
 
 		/* Make sure the keys are in the right order */
 		if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
-			generic_err(fs_info, leaf, slot,
+			generic_err(leaf, slot,
 	"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
 				prev_key.objectid, prev_key.type,
 				prev_key.offset, key.objectid, key.type,
@@ -576,7 +1367,7 @@
 			item_end_expected = btrfs_item_offset_nr(leaf,
 								 slot - 1);
 		if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
-			generic_err(fs_info, leaf, slot,
+			generic_err(leaf, slot,
 				"unexpected item end, have %u expect %u",
 				btrfs_item_end_nr(leaf, slot),
 				item_end_expected);
@@ -590,7 +1381,7 @@
 		 */
 		if (btrfs_item_end_nr(leaf, slot) >
 		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
-			generic_err(fs_info, leaf, slot,
+			generic_err(leaf, slot,
 			"slot end outside of leaf, have %u expect range [0, %u]",
 				btrfs_item_end_nr(leaf, slot),
 				BTRFS_LEAF_DATA_SIZE(fs_info));
@@ -600,7 +1391,7 @@
 		/* Also check if the item pointer overlaps with btrfs item. */
 		if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
 		    btrfs_item_ptr_offset(leaf, slot)) {
-			generic_err(fs_info, leaf, slot,
+			generic_err(leaf, slot,
 		"slot overlaps with its data, item end %lu data start %lu",
 				btrfs_item_nr_offset(slot) +
 				sizeof(struct btrfs_item),
@@ -613,7 +1404,7 @@
 			 * Check if the item size and content meet other
 			 * criteria
 			 */
-			ret = check_leaf_item(fs_info, leaf, &key, slot);
+			ret = check_leaf_item(leaf, &key, slot, &prev_key);
 			if (ret < 0)
 				return ret;
 		}
@@ -626,26 +1417,33 @@
 	return 0;
 }
 
-int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *leaf)
+int btrfs_check_leaf_full(struct extent_buffer *leaf)
 {
-	return check_leaf(fs_info, leaf, true);
+	return check_leaf(leaf, true);
+}
+ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO);
+
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf)
+{
+	return check_leaf(leaf, false);
 }
 
-int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
-			     struct extent_buffer *leaf)
+int btrfs_check_node(struct extent_buffer *node)
 {
-	return check_leaf(fs_info, leaf, false);
-}
-
-int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
-{
+	struct btrfs_fs_info *fs_info = node->fs_info;
 	unsigned long nr = btrfs_header_nritems(node);
 	struct btrfs_key key, next_key;
 	int slot;
+	int level = btrfs_header_level(node);
 	u64 bytenr;
 	int ret = 0;
 
+	if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+		generic_err(node, 0,
+			"invalid level for node, have %d expect [1, %d]",
+			level, BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
 	if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
 		btrfs_crit(fs_info,
 "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
@@ -661,13 +1459,13 @@
 		btrfs_node_key_to_cpu(node, &next_key, slot + 1);
 
 		if (!bytenr) {
-			generic_err(fs_info, node, slot,
+			generic_err(node, slot,
 				"invalid NULL node pointer");
 			ret = -EUCLEAN;
 			goto out;
 		}
 		if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
-			generic_err(fs_info, node, slot,
+			generic_err(node, slot,
 			"unaligned pointer, have %llu should be aligned to %u",
 				bytenr, fs_info->sectorsize);
 			ret = -EUCLEAN;
@@ -675,7 +1473,7 @@
 		}
 
 		if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
-			generic_err(fs_info, node, slot,
+			generic_err(node, slot,
 	"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
 				key.objectid, key.type, key.offset,
 				next_key.objectid, next_key.type,
@@ -687,3 +1485,4 @@
 out:
 	return ret;
 }
+ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ff04327..32fecc9 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -14,15 +14,16 @@
  * Will check not only the item pointers, but also every possible member
  * in item data.
  */
-int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *leaf);
+int btrfs_check_leaf_full(struct extent_buffer *leaf);
 
 /*
  * Less strict leaf checker.
  * Will only check item pointers, not reading item data.
  */
-int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
-			     struct extent_buffer *leaf);
-int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf);
+int btrfs_check_node(struct extent_buffer *node);
+
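+/*
+ * Sanity check a single chunk item.  Also usable on superblock sys chunk
+ * array entries, hence the explicit logical offset parameter.
+ */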
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+			    struct btrfs_chunk *chunk, u64 logical);
 
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987a..5f9e2dd 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@
 		u32 nritems;
 
 		root_node = btrfs_lock_root_node(root);
-		btrfs_set_lock_blocking(root_node);
+		btrfs_set_lock_blocking_write(root_node);
 		nritems = btrfs_header_nritems(root_node);
 		root->defrag_max.objectid = 0;
 		/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 16ecb76..8a6cc60 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -8,6 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/list_sort.h>
 #include <linux/iversion.h>
+#include "misc.h"
 #include "ctree.h"
 #include "tree-log.h"
 #include "disk-io.h"
@@ -24,9 +25,12 @@
  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  * during log replay
  */
-#define LOG_INODE_ALL 0
-#define LOG_INODE_EXISTS 1
-#define LOG_OTHER_INODE 2
+enum {
+	LOG_INODE_ALL,
+	LOG_INODE_EXISTS,
+	LOG_OTHER_INODE,
+	LOG_OTHER_INODE_ALL,
+};
 
 /*
  * directory trouble cases
@@ -80,10 +84,12 @@
  * The last stage is to deal with directories and links and extents
  * and all the other fun semantics
  */
-#define LOG_WALK_PIN_ONLY 0
-#define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_DIR_INDEX 2
-#define LOG_WALK_REPLAY_ALL 3
+enum {
+	LOG_WALK_PIN_ONLY,
+	LOG_WALK_REPLAY_INODES,
+	LOG_WALK_REPLAY_DIR_INDEX,
+	LOG_WALK_REPLAY_ALL,
+};
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_inode *inode,
@@ -138,7 +144,7 @@
 	mutex_lock(&root->log_mutex);
 
 	if (root->log_root) {
-		if (btrfs_need_log_full_commit(fs_info, trans)) {
+		if (btrfs_need_log_full_commit(trans)) {
 			ret = -EAGAIN;
 			goto out;
 		}
@@ -187,10 +193,6 @@
 {
 	int ret = -ENOENT;
 
-	smp_mb();
-	if (!root->log_root)
-		return -ENOENT;
-
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
 		ret = 0;
@@ -205,14 +207,11 @@
  * until you call btrfs_end_log_trans() or it makes any future
  * log transactions wait until you call btrfs_end_log_trans()
  */
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
 {
-	int ret = -ENOENT;
-
 	mutex_lock(&root->log_mutex);
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
-	return ret;
 }
 
 /*
@@ -227,6 +226,17 @@
 	}
 }
 
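+/*
+ * Helpers to start and wait for writeback of a single tree block, going
+ * through the extent buffer's backing page mapping.
+ */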
+static int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
+					buf->start + buf->len - 1);
+}
+
+static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+	filemap_fdatawait_range(buf->pages[0]->mapping,
+			        buf->start, buf->start + buf->len - 1);
+}
 
 /*
  * the walk control struct is used to pass state down the chain when
@@ -306,7 +316,7 @@
 
 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 		if (wc->pin && btrfs_header_level(eb) == 0)
-			ret = btrfs_exclude_logged_extents(fs_info, eb);
+			ret = btrfs_exclude_logged_extents(eb);
 		if (wc->write)
 			btrfs_write_tree_block(eb);
 		if (wc->wait)
@@ -335,7 +345,6 @@
 				   struct extent_buffer *eb, int slot,
 				   struct btrfs_key *key)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	u32 item_size;
 	u64 saved_i_size = 0;
@@ -456,10 +465,9 @@
 		found_size = btrfs_item_size_nr(path->nodes[0],
 						path->slots[0]);
 		if (found_size > item_size)
-			btrfs_truncate_item(fs_info, path, item_size, 1);
+			btrfs_truncate_item(path, item_size, 1);
 		else if (found_size < item_size)
-			btrfs_extend_item(fs_info, path,
-					  item_size - found_size);
+			btrfs_extend_item(path, item_size - found_size);
 	} else if (ret) {
 		return ret;
 	}
@@ -498,7 +506,7 @@
 			    ino_size != 0) {
 				struct btrfs_map_token token;
 
-				btrfs_init_map_token(&token);
+				btrfs_init_map_token(&token, dst_eb);
 				btrfs_set_token_inode_size(dst_eb, dst_item,
 							   ino_size, &token);
 			}
@@ -696,9 +704,11 @@
 			goto out;
 
 		if (ins.objectid > 0) {
+			struct btrfs_ref ref = { 0 };
 			u64 csum_start;
 			u64 csum_end;
 			LIST_HEAD(ordered_sums);
+
 			/*
 			 * is this extent already allocated in the extent
 			 * allocation tree?  If so, just add a reference
@@ -706,10 +716,13 @@
 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 						ins.offset);
 			if (ret == 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						ins.objectid, ins.offset,
-						0, root->root_key.objectid,
+				btrfs_init_generic_ref(&ref,
+						BTRFS_ADD_DELAYED_REF,
+						ins.objectid, ins.offset, 0);
+				btrfs_init_data_ref(&ref,
+						root->root_key.objectid,
 						key->objectid, offset);
+				ret = btrfs_inc_extent_ref(trans, &ref);
 				if (ret)
 					goto out;
 			} else {
@@ -955,7 +968,7 @@
 		if (btrfs_find_name_in_ext_backref(path->nodes[0],
 						   path->slots[0],
 						   ref_objectid,
-						   name, namelen, NULL))
+						   name, namelen))
 			match = 1;
 
 		goto out;
@@ -1147,7 +1160,7 @@
 	}
 	btrfs_release_path(path);
 
-	/* look for a conflicing name */
+	/* look for a conflicting name */
 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
 				   name, namelen, 0);
 	if (di && !IS_ERR(di)) {
@@ -1254,12 +1267,12 @@
 			goto out;
 
 		if (key->type == BTRFS_INODE_EXTREF_KEY)
-			ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
-							     parent_id, name,
-							     namelen, NULL);
+			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+							       parent_id, name,
+							       namelen);
 		else
-			ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
-							 namelen, NULL);
+			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
+							   name, namelen);
 
 		if (!ret) {
 			struct inode *dir;
@@ -1321,18 +1334,78 @@
 		goto out;
 	}
 	if (key.type == BTRFS_INODE_EXTREF_KEY)
-		ret = btrfs_find_name_in_ext_backref(path->nodes[0],
-						     path->slots[0], parent_id,
-						     name, namelen, NULL);
+		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+				path->slots[0], parent_id, name, namelen);
 	else
-		ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
-						 name, namelen, NULL);
+		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+						   name, namelen);
 
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
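+/*
+ * Link @inode into @dir under @name during log replay.  If the name already
+ * points at a different inode, one from the log that has not been processed
+ * yet because it has a higher inode number, that stale dentry is deleted
+ * first so the link can be added.
+ */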
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct inode *dir, struct inode *inode, const char *name,
+		    int namelen, u64 ref_index)
+{
+	struct btrfs_dir_item *dir_item;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct inode *other_inode = NULL;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_item = btrfs_lookup_dir_item(NULL, root, path,
+					 btrfs_ino(BTRFS_I(dir)),
+					 name, namelen, 0);
+	if (!dir_item) {
+		btrfs_release_path(path);
+		goto add_link;
+	} else if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		goto out;
+	}
+
+	/*
+	 * Our inode's dentry collides with the dentry of another inode which is
+	 * in the log but not yet processed since it has a higher inode number.
+	 * So delete that other dentry.
+	 */
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+	btrfs_release_path(path);
+	other_inode = read_one_inode(root, key.objectid);
+	if (!other_inode) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+				 name, namelen);
+	if (ret)
+		goto out;
+	/*
+	 * If we dropped the link count to 0, bump it so that later the iput()
+	 * on the inode will not free it. We will fixup the link count later.
+	 */
+	if (other_inode->i_nlink == 0)
+		inc_nlink(other_inode);
+
+	ret = btrfs_run_delayed_items(trans);
+	if (ret)
+		goto out;
+add_link:
+	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+			     name, namelen, 0, ref_index);
+out:
+	iput(other_inode);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1469,9 +1542,8 @@
 				goto out;
 
 			/* insert our name */
-			ret = btrfs_add_link(trans, BTRFS_I(dir),
-					BTRFS_I(inode),
-					name, namelen, 0, ref_index);
+			ret = add_link(trans, root, dir, inode, name, namelen,
+				       ref_index);
 			if (ret)
 				goto out;
 
@@ -2666,8 +2738,8 @@
 
 				if (trans) {
 					btrfs_tree_lock(next);
-					btrfs_set_lock_blocking(next);
-					clean_tree_block(fs_info, next);
+					btrfs_set_lock_blocking_write(next);
+					btrfs_clean_tree_block(next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
 				} else {
@@ -2750,8 +2822,8 @@
 
 				if (trans) {
 					btrfs_tree_lock(next);
-					btrfs_set_lock_blocking(next);
-					clean_tree_block(fs_info, next);
+					btrfs_set_lock_blocking_write(next);
+					btrfs_clean_tree_block(next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
 				} else {
@@ -2832,8 +2904,8 @@
 
 			if (trans) {
 				btrfs_tree_lock(next);
-				btrfs_set_lock_blocking(next);
-				clean_tree_block(fs_info, next);
+				btrfs_set_lock_blocking_write(next);
+				btrfs_clean_tree_block(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 			} else {
@@ -2860,7 +2932,8 @@
  * in the tree of log roots
  */
 static int update_log_root(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *log)
+			   struct btrfs_root *log,
+			   struct btrfs_root_item *root_item)
 {
 	struct btrfs_fs_info *fs_info = log->fs_info;
 	int ret;
@@ -2868,10 +2941,10 @@
 	if (log->log_transid == 1) {
 		/* insert root item on the first sync */
 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
-				&log->root_key, &log->root_item);
+				&log->root_key, root_item);
 	} else {
 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
-				&log->root_key, &log->root_item);
+				&log->root_key, root_item);
 	}
 	return ret;
 }
@@ -2969,6 +3042,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+	struct btrfs_root_item new_root_item;
 	int log_transid = 0;
 	struct btrfs_log_ctx root_log_ctx;
 	struct blk_plug plug;
@@ -3008,7 +3082,7 @@
 	}
 
 	/* bail out if we need to do a full commit */
-	if (btrfs_need_log_full_commit(fs_info, trans)) {
+	if (btrfs_need_log_full_commit(trans)) {
 		ret = -EAGAIN;
 		mutex_unlock(&root->log_mutex);
 		goto out;
@@ -3027,12 +3101,26 @@
 	if (ret) {
 		blk_finish_plug(&plug);
 		btrfs_abort_transaction(trans, ret);
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
 
+	/*
+	 * We _must_ update under the root->log_mutex in order to make sure we
+	 * have a consistent view of the log root we are trying to commit at
+	 * this moment.
+	 *
+	 * We _must_ make a local copy, because we are not holding the
+	 * log_root_tree->log_mutex yet.  This is important because when we
+	 * commit the log_root_tree we must have a consistent view of the
+	 * log_root_tree when we update the super block to point at the
+	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
+	 * with the commit and possibly point at the new block which we may not
+	 * have written out.
+	 */
 	btrfs_set_root_node(&log->root_item, log->node);
+	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
 
 	root->log_transid++;
 	log->log_transid = root->log_transid;
@@ -3056,9 +3144,15 @@
 
 	mutex_unlock(&log_root_tree->log_mutex);
 
-	ret = update_log_root(trans, log);
-
 	mutex_lock(&log_root_tree->log_mutex);
+
+	/*
+	 * Now we are safe to update the log_root_tree because we're under the
+	 * log_mutex, and we're a current writer so we're holding the commit
+	 * open until we drop the log_mutex.
+	 */
+	ret = update_log_root(trans, log, &new_root_item);
+
 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
 		/* atomic_dec_and_test implies a barrier */
 		cond_wake_up_nomb(&log_root_tree->log_writer_wait);
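
update_log_root() now receives the root item from its caller because
btrfs_sync_log() snapshots it while holding root->log_mutex and only publishes
it later under log_root_tree->log_mutex. A minimal pthread sketch of this
copy-then-publish pattern; every name in it (state_lock, publish_lock,
live_item, committed_item) is an illustrative stand-in:

#include <pthread.h>
#include <string.h>

struct root_item_copy { unsigned long bytenr; unsigned long generation; };

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;   /* ~root->log_mutex */
static pthread_mutex_t publish_lock = PTHREAD_MUTEX_INITIALIZER; /* ~log_root_tree->log_mutex */
static struct root_item_copy live_item;      /* keeps changing under state_lock */
static struct root_item_copy committed_item; /* what the superblock will point at */

static void commit_one_log(void)
{
	struct root_item_copy snapshot;

	pthread_mutex_lock(&state_lock);
	/* copy while nobody can modify live_item: a consistent view */
	memcpy(&snapshot, &live_item, sizeof(snapshot));
	pthread_mutex_unlock(&state_lock);

	pthread_mutex_lock(&publish_lock);
	/* publish the snapshot, never the (possibly newer) live item */
	committed_item = snapshot;
	pthread_mutex_unlock(&publish_lock);
}

int main(void)
{
	commit_one_log();
	return (int)committed_item.generation;
}
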
@@ -3069,7 +3163,7 @@
 			list_del_init(&root_log_ctx.list);
 
 		blk_finish_plug(&plug);
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 
 		if (ret != -ENOSPC) {
 			btrfs_abort_transaction(trans, ret);
@@ -3115,7 +3209,7 @@
 	 * now that we've moved on to the tree of log tree roots,
 	 * check the full commit flag again
 	 */
-	if (btrfs_need_log_full_commit(fs_info, trans)) {
+	if (btrfs_need_log_full_commit(trans)) {
 		blk_finish_plug(&plug);
 		btrfs_wait_tree_log_extents(log, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -3128,7 +3222,7 @@
 					 EXTENT_DIRTY | EXTENT_NEW);
 	blk_finish_plug(&plug);
 	if (ret) {
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		btrfs_abort_transaction(trans, ret);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
@@ -3138,7 +3232,7 @@
 		ret = btrfs_wait_tree_log_extents(log_root_tree,
 						  EXTENT_NEW | EXTENT_DIRTY);
 	if (ret) {
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
 	}
@@ -3152,7 +3246,7 @@
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	/*
-	 * nobody else is going to jump in and write the the ctree
+	 * Nobody else is going to jump in and write the ctree
 	 * super here because the log_commit atomic below is protecting
 	 * us.  We must be called with a transaction handle pinning
 	 * the running transaction open, so a full commit can't hop
@@ -3160,7 +3254,7 @@
 	 */
 	ret = write_all_supers(fs_info, 1);
 	if (ret) {
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		btrfs_abort_transaction(trans, ret);
 		goto out_wake_log_root;
 	}
@@ -3204,8 +3298,6 @@
 			  struct btrfs_root *log)
 {
 	int ret;
-	u64 start;
-	u64 end;
 	struct walk_control wc = {
 		.free = 1,
 		.process_func = process_one_buffer
@@ -3219,18 +3311,8 @@
 			btrfs_handle_fs_error(log->fs_info, ret, NULL);
 	}
 
-	while (1) {
-		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end,
-				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
-				NULL);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
-	}
-
+	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
 	free_extent_buffer(log->node);
 	kfree(log);
 }
@@ -3259,6 +3341,30 @@
 }
 
 /*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller than any possible transaction ID). When that
+ * happens, the full_sync flag is set in the inode's runtime flags, so in that
+ * case we assume eviction happened and ignore the logged_trans value, assuming
+ * the worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+			 struct btrfs_inode *inode)
+{
+	if (inode->logged_trans == trans->transid)
+		return true;
+
+	if (inode->last_trans == trans->transid &&
+	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+		return true;
+
+	return false;
+}
+
+/*
  * If both a file and directory are logged, and unlinks or renames are
  * mixed in, we have a few interesting corners:
  *
@@ -3292,7 +3398,7 @@
 	int bytes_del = 0;
 	u64 dir_ino = btrfs_ino(dir);
 
-	if (dir->logged_trans < trans->transid)
+	if (!inode_logged(trans, dir))
 		return 0;
 
 	ret = join_running_log_trans(root);
@@ -3376,7 +3482,7 @@
 out_unlock:
 	mutex_unlock(&dir->log_mutex);
 	if (ret == -ENOSPC) {
-		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		ret = 0;
 	} else if (ret < 0)
 		btrfs_abort_transaction(trans, ret);
@@ -3392,12 +3498,11 @@
 			       const char *name, int name_len,
 			       struct btrfs_inode *inode, u64 dirid)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *log;
 	u64 index;
 	int ret;
 
-	if (inode->logged_trans < trans->transid)
+	if (!inode_logged(trans, inode))
 		return 0;
 
 	ret = join_running_log_trans(root);
@@ -3410,7 +3515,7 @@
 				  dirid, &index);
 	mutex_unlock(&inode->log_mutex);
 	if (ret == -ENOSPC) {
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		ret = 0;
 	} else if (ret < 0 && ret != -ENOENT)
 		btrfs_abort_transaction(trans, ret);
@@ -3532,9 +3637,16 @@
 	}
 	btrfs_release_path(path);
 
-	/* find the first key from this transaction again */
+	/*
+	 * Find the first key from this transaction again.  See the note for
+	 * log_new_dir_dentries: if we're logging a directory recursively, we
+	 * won't be holding its i_mutex, which means the directory can be
+	 * modified while we're logging it.  If an entry is removed between our
+	 * first search and this search, we'll not find the key again and can
+	 * just bail.
+	 */
 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-	if (WARN_ON(ret != 0))
+	if (ret != 0)
 		goto done;
 
 	/*
@@ -3721,6 +3833,8 @@
 		found_key.type = 0;
 		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
 				       &start_slot);
+		if (ret < 0)
+			break;
 
 		ret = btrfs_del_items(trans, log, path, start_slot,
 				      path->slots[0] - start_slot + 1);
@@ -3746,7 +3860,7 @@
 {
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
+	btrfs_init_map_token(&token, leaf);
 
 	if (log_inode_only) {
 		/* set the generation to zero so the recover code
@@ -4114,6 +4228,7 @@
 							       *last_extent, 0,
 							       0, len, 0, len,
 							       0, 0, 0);
+				*last_extent += len;
 			}
 		}
 	}
@@ -4205,8 +4320,6 @@
 	if (ret)
 		return ret;
 
-	btrfs_init_map_token(&token);
-
 	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
 				   em->start + em->len, NULL, 0, 1,
 				   sizeof(*fi), &extent_inserted);
@@ -4224,6 +4337,7 @@
 			return ret;
 	}
 	leaf = path->nodes[0];
+	btrfs_init_map_token(&token, leaf);
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 
@@ -4386,7 +4500,6 @@
 	struct extent_map *em, *n;
 	struct list_head extents;
 	struct extent_map_tree *tree = &inode->extent_tree;
-	u64 logged_start, logged_end;
 	u64 test_gen;
 	int ret = 0;
 	int num = 0;
@@ -4395,8 +4508,6 @@
 
 	write_lock(&tree->lock);
 	test_gen = root->fs_info->last_trans_committed;
-	logged_start = start;
-	logged_end = end;
 
 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
 		/*
@@ -4437,11 +4548,6 @@
 		    em->start >= i_size_read(&inode->vfs_inode))
 			continue;
 
-		if (em->start < logged_start)
-			logged_start = em->start;
-		if ((em->start + em->len - 1) > logged_end)
-			logged_end = em->start + em->len - 1;
-
 		/* Need a ref to keep it from getting evicted from cache */
 		refcount_inc(&em->refs);
 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -4504,6 +4610,19 @@
 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				      struct btrfs_inode_item);
 		*size_ret = btrfs_inode_size(path->nodes[0], item);
+		/*
+		 * If the in-memory inode's i_size is smaller than the inode
+		 * size stored in the btree, return the inode's i_size, so
+		 * that we get a correct inode size after replaying the log
+		 * when before a power failure we had a shrinking truncate
+		 * followed by addition of a new name (rename / new hard link).
+		 * Otherwise return the inode size from the btree, to avoid
+		 * data loss when replaying a log due to previously doing a
+		 * write that expands the inode's size and logging a new name
+		 * immediately after.
+		 */
+		if (*size_ret > inode->vfs_inode.i_size)
+			*size_ret = inode->vfs_inode.i_size;
 	}
 
 	btrfs_release_path(path);
@@ -4665,15 +4784,8 @@
 					struct btrfs_file_extent_item);
 
 		if (btrfs_file_extent_type(leaf, extent) ==
-		    BTRFS_FILE_EXTENT_INLINE) {
-			len = btrfs_file_extent_ram_bytes(leaf, extent);
-			ASSERT(len == i_size ||
-			       (len == fs_info->sectorsize &&
-				btrfs_file_extent_compression(leaf, extent) !=
-				BTRFS_COMPRESS_NONE) ||
-			       (len < i_size && i_size < fs_info->sectorsize));
+		    BTRFS_FILE_EXTENT_INLINE)
 			return 0;
-		}
 
 		len = btrfs_file_extent_num_bytes(leaf, extent);
 		/* Last extent goes beyond i_size, no need to log a hole. */
@@ -4740,7 +4852,7 @@
 					 const int slot,
 					 const struct btrfs_key *key,
 					 struct btrfs_inode *inode,
-					 u64 *other_ino)
+					 u64 *other_ino, u64 *other_parent)
 {
 	int ret;
 	struct btrfs_path *search_path;
@@ -4803,8 +4915,13 @@
 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
 						  di, &di_key);
 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
-				ret = 1;
-				*other_ino = di_key.objectid;
+				if (di_key.objectid != key->objectid) {
+					ret = 1;
+					*other_ino = di_key.objectid;
+					*other_parent = parent;
+				} else {
+					ret = 0;
+				}
 			} else {
 				ret = -EAGAIN;
 			}
@@ -4824,6 +4941,144 @@
 	return ret;
 }
 
+struct btrfs_ino_list {
+	u64 ino;
+	u64 parent;
+	struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  struct btrfs_log_ctx *ctx,
+				  u64 ino, u64 parent)
+{
+	struct btrfs_ino_list *ino_elem;
+	LIST_HEAD(inode_list);
+	int ret = 0;
+
+	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+	if (!ino_elem)
+		return -ENOMEM;
+	ino_elem->ino = ino;
+	ino_elem->parent = parent;
+	list_add_tail(&ino_elem->list, &inode_list);
+
+	while (!list_empty(&inode_list)) {
+		struct btrfs_fs_info *fs_info = root->fs_info;
+		struct btrfs_key key;
+		struct inode *inode;
+
+		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+					    list);
+		ino = ino_elem->ino;
+		parent = ino_elem->parent;
+		list_del(&ino_elem->list);
+		kfree(ino_elem);
+		if (ret)
+			continue;
+
+		btrfs_release_path(path);
+
+		key.objectid = ino;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		key.offset = 0;
+		inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+		/*
+		 * If the other inode that had a conflicting dir entry was
+		 * deleted in the current transaction, we need to log its parent
+		 * directory.
+		 */
+		if (IS_ERR(inode)) {
+			ret = PTR_ERR(inode);
+			if (ret == -ENOENT) {
+				key.objectid = parent;
+				inode = btrfs_iget(fs_info->sb, &key, root,
+						   NULL);
+				if (IS_ERR(inode)) {
+					ret = PTR_ERR(inode);
+				} else {
+					ret = btrfs_log_inode(trans, root,
+						      BTRFS_I(inode),
+						      LOG_OTHER_INODE_ALL,
+						      0, LLONG_MAX, ctx);
+					btrfs_add_delayed_iput(inode);
+				}
+			}
+			continue;
+		}
+		/*
+		 * We are safe logging the other inode without acquiring its
+		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
+		 * are safe against concurrent renames of the other inode as
+		 * well because during a rename we pin the log and update the
+		 * log with the new name before we unpin it.
+		 */
+		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+				      LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+		if (ret) {
+			btrfs_add_delayed_iput(inode);
+			continue;
+		}
+
+		key.objectid = ino;
+		key.type = BTRFS_INODE_REF_KEY;
+		key.offset = 0;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_add_delayed_iput(inode);
+			continue;
+		}
+
+		while (true) {
+			struct extent_buffer *leaf = path->nodes[0];
+			int slot = path->slots[0];
+			u64 other_ino = 0;
+			u64 other_parent = 0;
+
+			if (slot >= btrfs_header_nritems(leaf)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0) {
+					break;
+				} else if (ret > 0) {
+					ret = 0;
+					break;
+				}
+				continue;
+			}
+
+			btrfs_item_key_to_cpu(leaf, &key, slot);
+			if (key.objectid != ino ||
+			    (key.type != BTRFS_INODE_REF_KEY &&
+			     key.type != BTRFS_INODE_EXTREF_KEY)) {
+				ret = 0;
+				break;
+			}
+
+			ret = btrfs_check_ref_name_override(leaf, slot, &key,
+					BTRFS_I(inode), &other_ino,
+					&other_parent);
+			if (ret < 0)
+				break;
+			if (ret > 0) {
+				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+				if (!ino_elem) {
+					ret = -ENOMEM;
+					break;
+				}
+				ino_elem->ino = other_ino;
+				ino_elem->parent = other_parent;
+				list_add_tail(&ino_elem->list, &inode_list);
+				ret = 0;
+			}
+			path->slots[0]++;
+		}
+		btrfs_add_delayed_iput(inode);
+	}
+
+	return ret;
+}
+
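
log_conflicting_inodes() avoids recursion by queueing each newly found conflict
on a FIFO list; note how the loop keeps draining after an error so every queued
element is still freed. A minimal userspace sketch of that worklist shape, with
a hypothetical log_one() handler:

#include <stdlib.h>

struct work_item {
	unsigned long ino;
	struct work_item *next;
};

/* Hypothetical per-inode step; may hand back one follow-up item. */
static struct work_item *log_one(unsigned long ino, int *ret)
{
	(void)ino;
	*ret = 0;
	return NULL;
}

int drain_worklist(unsigned long first_ino)
{
	struct work_item *head, *tail;
	int ret = 0;

	head = tail = malloc(sizeof(*head));
	if (!head)
		return -1;
	head->ino = first_ino;
	head->next = NULL;

	while (head) {
		struct work_item *item = head;
		struct work_item *more = NULL;

		if (!ret)	/* after an error, only drain and free */
			more = log_one(item->ino, &ret);
		if (more) {	/* append follow-up work at the tail */
			more->next = NULL;
			tail->next = more;
			tail = more;
		}
		head = item->next;
		free(item);
	}
	return ret;
}

int main(void)
{
	return drain_worklist(256);
}
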
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -4863,6 +5118,7 @@
 	u64 logged_isize = 0;
 	bool need_log_inode_item = true;
 	bool xattrs_logged = false;
+	bool recursive_logging = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4908,8 +5164,12 @@
 		return ret;
 	}
 
-	if (inode_only == LOG_OTHER_INODE) {
-		inode_only = LOG_INODE_EXISTS;
+	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+		recursive_logging = true;
+		if (inode_only == LOG_OTHER_INODE)
+			inode_only = LOG_INODE_EXISTS;
+		else
+			inode_only = LOG_INODE_ALL;
 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
 	} else {
 		mutex_lock(&inode->log_mutex);
@@ -5004,20 +5264,19 @@
 
 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
-		    inode->generation == trans->transid) {
+		    inode->generation == trans->transid &&
+		    !recursive_logging) {
 			u64 other_ino = 0;
+			u64 other_parent = 0;
 
 			ret = btrfs_check_ref_name_override(path->nodes[0],
 					path->slots[0], &min_key, inode,
-					&other_ino);
+					&other_ino, &other_parent);
 			if (ret < 0) {
 				err = ret;
 				goto out_unlock;
 			} else if (ret > 0 && ctx &&
 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
-				struct btrfs_key inode_key;
-				struct inode *other_inode;
-
 				if (ins_nr > 0) {
 					ins_nr++;
 				} else {
@@ -5033,43 +5292,13 @@
 					goto out_unlock;
 				}
 				ins_nr = 0;
-				btrfs_release_path(path);
-				inode_key.objectid = other_ino;
-				inode_key.type = BTRFS_INODE_ITEM_KEY;
-				inode_key.offset = 0;
-				other_inode = btrfs_iget(fs_info->sb,
-							 &inode_key, root,
-							 NULL);
-				/*
-				 * If the other inode that had a conflicting dir
-				 * entry was deleted in the current transaction,
-				 * we don't need to do more work nor fallback to
-				 * a transaction commit.
-				 */
-				if (other_inode == ERR_PTR(-ENOENT)) {
-					goto next_key;
-				} else if (IS_ERR(other_inode)) {
-					err = PTR_ERR(other_inode);
-					goto out_unlock;
-				}
-				/*
-				 * We are safe logging the other inode without
-				 * acquiring its i_mutex as long as we log with
-				 * the LOG_INODE_EXISTS mode. We're safe against
-				 * concurrent renames of the other inode as well
-				 * because during a rename we pin the log and
-				 * update the log with the new name before we
-				 * unpin it.
-				 */
-				err = btrfs_log_inode(trans, root,
-						BTRFS_I(other_inode),
-						LOG_OTHER_INODE, 0, LLONG_MAX,
-						ctx);
-				iput(other_inode);
+
+				err = log_conflicting_inodes(trans, root, path,
+						ctx, other_ino, other_parent);
 				if (err)
 					goto out_unlock;
-				else
-					goto next_key;
+				btrfs_release_path(path);
+				goto next_key;
 			}
 		}
 
@@ -5232,9 +5461,19 @@
 		}
 	}
 
+	/*
+	 * Don't update last_log_commit if we logged that an inode exists after
+	 * it was loaded into memory (full_sync bit set).
+	 * This is to prevent data loss when we do a write to the inode, then
+	 * the inode gets evicted after all delalloc was flushed, then we log
+	 * it exists (due to a rename for example) and then fsync it. This last
+	 * fsync would do nothing (not logging the extents previously written).
+	 */
 	spin_lock(&inode->lock);
 	inode->logged_trans = trans->transid;
-	inode->last_log_commit = inode->last_sub_trans;
+	if (inode_only != LOG_INODE_EXISTS ||
+	    !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+		inode->last_log_commit = inode->last_sub_trans;
 	spin_unlock(&inode->lock);
 out_unlock:
 	mutex_unlock(&inode->log_mutex);
@@ -5272,7 +5511,7 @@
 		 * Make sure any commits to the log are forced to be full
 		 * commits.
 		 */
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		ret = true;
 	}
 	mutex_unlock(&inode->log_mutex);
@@ -5294,7 +5533,6 @@
 {
 	int ret = 0;
 	struct dentry *old_parent = NULL;
-	struct btrfs_inode *orig_inode = inode;
 
 	/*
 	 * for regular files, if its inode is already on disk, we don't
@@ -5314,16 +5552,6 @@
 	}
 
 	while (1) {
-		/*
-		 * If we are logging a directory then we start with our inode,
-		 * not our parent's inode, so we need to skip setting the
-		 * logged_trans so that further down in the log code we don't
-		 * think this inode has already been logged.
-		 */
-		if (inode != orig_inode)
-			inode->logged_trans = trans->transid;
-		smp_mb();
-
 		if (btrfs_must_commit_transaction(trans, inode)) {
 			ret = 1;
 			break;
@@ -5478,7 +5706,7 @@
 			}
 
 			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
-				iput(di_inode);
+				btrfs_add_delayed_iput(di_inode);
 				break;
 			}
 
@@ -5490,7 +5718,7 @@
 			if (!ret &&
 			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
 				ret = 1;
-			iput(di_inode);
+			btrfs_add_delayed_iput(di_inode);
 			if (ret)
 				goto next_dir_inode;
 			if (ctx->log_new_dentries) {
@@ -5637,7 +5865,7 @@
 			if (!ret && ctx && ctx->log_new_dentries)
 				ret = log_new_dir_dentries(trans, root,
 						   BTRFS_I(dir_inode), ctx);
-			iput(dir_inode);
+			btrfs_add_delayed_iput(dir_inode);
 			if (ret)
 				goto out;
 		}
@@ -5649,6 +5877,190 @@
 	return ret;
 }
 
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_key found_key;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+	while (true) {
+		struct btrfs_fs_info *fs_info = root->fs_info;
+		const u64 last_committed = fs_info->last_trans_committed;
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+		struct btrfs_key search_key;
+		struct inode *inode;
+		int ret = 0;
+
+		btrfs_release_path(path);
+
+		search_key.objectid = found_key.offset;
+		search_key.type = BTRFS_INODE_ITEM_KEY;
+		search_key.offset = 0;
+		inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+
+		if (BTRFS_I(inode)->generation > last_committed)
+			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+					      LOG_INODE_EXISTS,
+					      0, LLONG_MAX, ctx);
+		btrfs_add_delayed_iput(inode);
+		if (ret)
+			return ret;
+
+		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
+			break;
+
+		search_key.type = BTRFS_INODE_REF_KEY;
+		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+		if (ret < 0)
+			return ret;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				return -ENOENT;
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.objectid != search_key.objectid ||
+		    found_key.type != BTRFS_INODE_REF_KEY)
+			return -ENOENT;
+	}
+	return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+				  struct btrfs_inode *inode,
+				  struct dentry *parent,
+				  struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct dentry *old_parent = NULL;
+	struct super_block *sb = inode->vfs_inode.i_sb;
+	int ret = 0;
+
+	while (true) {
+		if (!parent || d_really_is_negative(parent) ||
+		    sb != parent->d_sb)
+			break;
+
+		inode = BTRFS_I(d_inode(parent));
+		if (root != inode->root)
+			break;
+
+		if (inode->generation > fs_info->last_trans_committed) {
+			ret = btrfs_log_inode(trans, root, inode,
+					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+			if (ret)
+				break;
+		}
+		if (IS_ROOT(parent))
+			break;
+
+		parent = dget_parent(parent);
+		dput(old_parent);
+		old_parent = parent;
+	}
+	dput(old_parent);
+
+	return ret;
+}
+
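
The fast path walks dentries upward using the take-the-new-reference-before-
dropping-the-old idiom (dget_parent() before dput()). A toy refcount sketch of
the same walk, assuming a tree whose root is its own parent:

struct node {
	struct node *parent;
	int refs;
};

static struct node *get_parent(struct node *n)
{
	n->parent->refs++;		/* like dget_parent(): grab first */
	return n->parent;
}

static void put(struct node *n)
{
	if (n)
		n->refs--;		/* like dput(); NULL is a no-op */
}

void walk_up(struct node *cur)
{
	struct node *old = NULL;	/* nothing to drop on the first lap */

	while (cur->parent != cur) {	/* IS_ROOT(): root is its own parent */
		cur = get_parent(cur);
		put(old);	/* old ref dropped only after the new one is held */
		old = cur;
	}
	put(old);			/* drop the last reference we took */
}

int main(void)
{
	struct node root = { .parent = &root, .refs = 1 };
	struct node child = { .parent = &root, .refs = 1 };

	walk_up(&child);
	return 0;
}
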
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+				 struct btrfs_inode *inode,
+				 struct dentry *parent,
+				 struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *root = inode->root;
+	const u64 ino = btrfs_ino(inode);
+	struct btrfs_path *path;
+	struct btrfs_key search_key;
+	int ret;
+
+	/*
+	 * For a single hard link case, go through a fast path that does not
+	 * need to iterate the fs/subvolume tree.
+	 */
+	if (inode->vfs_inode.i_nlink < 2)
+		return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret == 0)
+		path->slots[0]++;
+
+	while (true) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+		struct btrfs_key found_key;
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.objectid != ino ||
+		    found_key.type > BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		/*
+		 * Don't deal with extended references because they are rare
+		 * cases and too complex to deal with (we would need to keep
+		 * track of which subitem we are processing for each item in
+		 * this loop, etc). So just return some error to fall back to
+		 * a transaction commit.
+		 */
+		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+			ret = -EMLINK;
+			goto out;
+		}
+
+		/*
+		 * Logging ancestors needs to do more searches on the fs/subvol
+		 * tree, so it releases the path as needed to avoid deadlocks.
+		 * Keep track of the last inode ref key and resume from that key
+		 * after logging all new ancestors for the current hard link.
+		 */
+		memcpy(&search_key, &found_key, sizeof(search_key));
+
+		ret = log_new_ancestors(trans, root, path, ctx);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+		goto again;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
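
Since log_new_ancestors() releases the path, the caller cannot trust its tree
position afterwards; it remembers the last INODE_REF key seen and re-searches
from just past it. A minimal sketch of that release-and-resume iteration over a
sorted (objectid, type, offset) key space, with an in-memory array standing in
for the b-tree:

#include <stddef.h>

struct key {
	unsigned long objectid;
	unsigned char type;
	unsigned long offset;
};

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* First index whose key is strictly greater than *after ("slot + 1"). */
static size_t first_after(const struct key *keys, size_t n,
			  const struct key *after)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (key_cmp(&keys[i], after) > 0)
			break;
	return i;
}

/* Visit every ref of @ino even if long-running work invalidates our spot. */
static void walk_refs(const struct key *keys, size_t n, unsigned long ino,
		      void (*process)(const struct key *))
{
	struct key search = { .objectid = ino, .type = 0, .offset = 0 };

	for (;;) {
		size_t i = first_after(keys, n, &search);

		if (i == n || keys[i].objectid != ino)
			break;
		process(&keys[i]);	/* may "release the path" */
		search = keys[i];	/* resume from the last key seen */
	}
}

static void show(const struct key *k) { (void)k; }

int main(void)
{
	const struct key refs[] = {	/* 12 == BTRFS_INODE_REF_KEY */
		{ 257, 12, 256 },
		{ 257, 12, 300 },
		{ 258, 12, 256 },
	};

	walk_refs(refs, 3, 257, show);	/* visits the two refs of ino 257 */
	return 0;
}
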
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -5666,11 +6078,9 @@
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct super_block *sb;
-	struct dentry *old_parent = NULL;
 	int ret = 0;
 	u64 last_committed = fs_info->last_trans_committed;
 	bool log_dentries = false;
-	struct btrfs_inode *orig_inode = inode;
 
 	sb = inode->vfs_inode.i_sb;
 
@@ -5776,40 +6186,22 @@
 	 * and has a link count of 2.
 	 */
 	if (inode->last_unlink_trans > last_committed) {
-		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+		ret = btrfs_log_all_parents(trans, inode, ctx);
 		if (ret)
 			goto end_trans;
 	}
 
-	while (1) {
-		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
-			break;
+	ret = log_all_new_ancestors(trans, inode, parent, ctx);
+	if (ret)
+		goto end_trans;
 
-		inode = BTRFS_I(d_inode(parent));
-		if (root != inode->root)
-			break;
-
-		if (inode->generation > last_committed) {
-			ret = btrfs_log_inode(trans, root, inode,
-					LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
-			if (ret)
-				goto end_trans;
-		}
-		if (IS_ROOT(parent))
-			break;
-
-		parent = dget_parent(parent);
-		dput(old_parent);
-		old_parent = parent;
-	}
 	if (log_dentries)
-		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+		ret = log_new_dir_dentries(trans, root, inode, ctx);
 	else
 		ret = 0;
 end_trans:
-	dput(old_parent);
 	if (ret < 0) {
-		btrfs_set_log_full_commit(fs_info, trans);
+		btrfs_set_log_full_commit(trans);
 		ret = 1;
 	}
 
@@ -5858,7 +6250,7 @@
 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
 	struct walk_control wc = {
 		.process_func = process_one_buffer,
-		.stage = 0,
+		.stage = LOG_WALK_PIN_ONLY,
 	};
 
 	path = btrfs_alloc_path();
@@ -6036,7 +6428,6 @@
 	 * if this directory was already logged any new
 	 * names for this file/dir will get recorded
 	 */
-	smp_mb();
 	if (dir->logged_trans == trans->transid)
 		return;
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7ab9bb8..132e43d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -15,7 +15,6 @@
 struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
-	int io_err;
 	bool log_new_dentries;
 	struct inode *inode;
 	struct list_head list;
@@ -26,22 +25,19 @@
 {
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
-	ctx->io_err = 0;
 	ctx->log_new_dentries = false;
 	ctx->inode = inode;
 	INIT_LIST_HEAD(&ctx->list);
 }
 
-static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
-					     struct btrfs_trans_handle *trans)
+static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
 {
-	WRITE_ONCE(fs_info->last_trans_log_full_commit, trans->transid);
+	WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid);
 }
 
-static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
-					     struct btrfs_trans_handle *trans)
+static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans)
 {
-	return READ_ONCE(fs_info->last_trans_log_full_commit) ==
+	return READ_ONCE(trans->fs_info->last_trans_log_full_commit) ==
 		trans->transid;
 }
 
@@ -65,7 +61,7 @@
 			       const char *name, int name_len,
 			       struct btrfs_inode *inode, u64 dirid);
 void btrfs_end_log_trans(struct btrfs_root *root);
-int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
 			     int for_rename);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 3b2ae34..91caab6 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -121,12 +121,12 @@
 		 * An item with that type already exists.
 		 * Extend the item and store the new subid at the end.
 		 */
-		btrfs_extend_item(fs_info, path, sizeof(subid_le));
+		btrfs_extend_item(path, sizeof(subid_le));
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		offset = btrfs_item_ptr_offset(eb, slot);
 		offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
-	} else if (ret < 0) {
+	} else {
 		btrfs_warn(fs_info,
 			   "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
 			   ret, (unsigned long long)key.objectid,
@@ -219,7 +219,7 @@
 	move_src = offset + sizeof(subid);
 	move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
 	memmove_extent_buffer(eb, move_dst, move_src, move_len);
-	btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1);
+	btrfs_truncate_item(path, item_size - sizeof(subid), 1);
 
 out:
 	btrfs_free_path(path);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4405e4..e04409f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -24,9 +25,11 @@
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
-#include "math.h"
 #include "dev-replace.h"
 #include "sysfs.h"
+#include "tree-checker.h"
+#include "space-info.h"
+#include "block-group.h"
 
 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
@@ -37,6 +40,7 @@
 		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
+		.nparity        = 0,
 		.raid_name	= "raid10",
 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
 		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -49,6 +53,7 @@
 		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
+		.nparity        = 0,
 		.raid_name	= "raid1",
 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
 		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
@@ -61,6 +66,7 @@
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 2,
+		.nparity        = 0,
 		.raid_name	= "dup",
 		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
 		.mindev_error	= 0,
@@ -73,6 +79,7 @@
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
+		.nparity        = 0,
 		.raid_name	= "raid0",
 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
 		.mindev_error	= 0,
@@ -85,6 +92,7 @@
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
+		.nparity        = 0,
 		.raid_name	= "single",
 		.bg_flag	= 0,
 		.mindev_error	= 0,
@@ -96,7 +104,8 @@
 		.devs_min	= 2,
 		.tolerated_failures = 1,
 		.devs_increment	= 1,
-		.ncopies	= 2,
+		.ncopies	= 1,
+		.nparity        = 1,
 		.raid_name	= "raid5",
 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
 		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -108,25 +117,80 @@
 		.devs_min	= 3,
 		.tolerated_failures = 2,
 		.devs_increment	= 1,
-		.ncopies	= 3,
+		.ncopies	= 1,
+		.nparity        = 2,
 		.raid_name	= "raid6",
 		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
 		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 	},
 };
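
With nparity added, raid5/raid6 are described honestly: one copy of the data
(ncopies = 1) plus one or two parity stripes, instead of being modeled as 2-
and 3-copy profiles. A standalone sketch of how the two fields combine, with
illustrative device counts:

#include <stdio.h>

struct raid_attr {
	int devs;	/* devices in one chunk */
	int ncopies;	/* copies of each data stripe */
	int nparity;	/* parity stripes per full stripe */
};

/* How many distinct data stripes a chunk of @devs devices holds. */
static int nr_data_stripes(const struct raid_attr *a)
{
	return (a->devs - a->nparity) / a->ncopies;
}

int main(void)
{
	const struct raid_attr raid5 = { .devs = 4, .ncopies = 1, .nparity = 1 };
	const struct raid_attr raid6 = { .devs = 5, .ncopies = 1, .nparity = 2 };
	const struct raid_attr raid1 = { .devs = 2, .ncopies = 2, .nparity = 0 };

	printf("raid5/4dev: %d data stripes\n", nr_data_stripes(&raid5)); /* 3 */
	printf("raid6/5dev: %d data stripes\n", nr_data_stripes(&raid6)); /* 3 */
	printf("raid1/2dev: %d logical copy\n", nr_data_stripes(&raid1)); /* 1 */
	return 0;
}
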
 
-const char *get_raid_name(enum btrfs_raid_types type)
+const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
-	if (type >= BTRFS_NR_RAID_TYPES)
+	const int index = btrfs_bg_flags_to_raid_index(flags);
+
+	if (index >= BTRFS_NR_RAID_TYPES)
 		return NULL;
 
-	return btrfs_raid_array[type].raid_name;
+	return btrfs_raid_array[index].raid_name;
 }
 
-static int init_first_rw_device(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info);
+/*
+ * Fill @buf with a textual description of @bg_flags, no more than @size_buf
+ * bytes including the terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+	int i;
+	int ret;
+	char *bp = buf;
+	u64 flags = bg_flags;
+	u32 size_bp = size_buf;
+
+	if (!flags) {
+		strcpy(bp, "NONE");
+		return;
+	}
+
+#define DESCRIBE_FLAG(flag, desc)						\
+	do {								\
+		if (flags & (flag)) {					\
+			ret = snprintf(bp, size_bp, "%s|", (desc));	\
+			if (ret < 0 || ret >= size_bp)			\
+				goto out_overflow;			\
+			size_bp -= ret;					\
+			bp += ret;					\
+			flags &= ~(flag);				\
+		}							\
+	} while (0)
+
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+			      btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+	if (flags) {
+		ret = snprintf(bp, size_bp, "0x%llx|", flags);
+		size_bp -= ret;
+	}
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+	/*
+	 * The text is trimmed; it's up to the caller to provide a
+	 * sufficiently large buffer.
+	 */
+out_overflow:;
+}
+
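
DESCRIBE_FLAG() is an append-with-budget snprintf loop: each matched flag
consumes part of the remaining buffer and the trailing '|' is chopped
afterwards. A standalone userspace sketch of the same pattern (append() is a
hypothetical helper):

#include <stdio.h>

/* Append formatted text to a fixed budget, as DESCRIBE_FLAG does above. */
static int append(char **bp, unsigned int *size_bp, const char *desc)
{
	int ret = snprintf(*bp, *size_bp, "%s|", desc);

	if (ret < 0 || (unsigned int)ret >= *size_bp)
		return -1;	/* out of room: caller stops describing */
	*bp += ret;
	*size_bp -= ret;
	return 0;
}

int main(void)
{
	char buf[16];
	char *bp = buf;
	unsigned int left = sizeof(buf);

	if (!append(&bp, &left, "data") && !append(&bp, &left, "raid1"))
		buf[sizeof(buf) - left - 1] = '\0';	/* strip last '|' */
	printf("%s\n", buf);	/* prints "data|raid1" */
	return 0;
}
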
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -151,7 +215,7 @@
  * the mutex can be very coarse and can cover long-running operations
  *
  * protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * seeding, structure cloning, opening/closing devices at mount/umount time
  *
  * global::fs_devs - add, remove, updates to the global list
  *
@@ -176,7 +240,9 @@
  * chunk_mutex
  * -----------
  * protects chunks, adding or removing during allocation, trim or when a new
- * device is added/removed
+ * device is added/removed. Additionally, it protects the post_commit_list of
+ * individual devices, since they can be added to the transaction's
+ * post_commit_list only with chunk_mutex held.
  *
  * cleaner_mutex
  * -------------
@@ -238,13 +304,15 @@
 
 /*
  * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
  *
  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
  * The returned struct is not linked onto any lists and can be destroyed with
  * kfree() right away.
  */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+						 const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devs;
 
@@ -255,18 +323,24 @@
 	mutex_init(&fs_devs->device_list_mutex);
 
 	INIT_LIST_HEAD(&fs_devs->devices);
-	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->fs_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 
+	if (metadata_fsid)
+		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+	else if (fsid)
+		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+
 	return fs_devs;
 }
 
 void btrfs_free_device(struct btrfs_device *device)
 {
+	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
+	extent_io_tree_release(&device->alloc_state);
 	bio_put(device->flush_bio);
 	kfree(device);
 }
@@ -284,19 +358,6 @@
 	kfree(fs_devices);
 }
 
-static void btrfs_kobject_uevent(struct block_device *bdev,
-				 enum kobject_action action)
-{
-	int ret;
-
-	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
-	if (ret)
-		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
-			action,
-			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
-			&disk_to_dev(bdev->bd_disk)->kobj);
-}
-
 void __exit btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -334,7 +395,7 @@
 
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
-	INIT_LIST_HEAD(&dev->resized_list);
+	INIT_LIST_HEAD(&dev->post_commit_list);
 
 	spin_lock_init(&dev->io_lock);
 
@@ -343,38 +404,62 @@
 	btrfs_device_data_ordered_init(dev);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
 
 	return dev;
 }
 
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
-		u64 devid, const u8 *uuid)
-{
-	struct btrfs_device *dev;
-
-	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
-		if (dev->devid == devid &&
-		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
-			return dev;
-		}
-	}
-	return NULL;
-}
-
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(
+		const u8 *fsid, const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devices;
 
+	ASSERT(fsid);
+
+	if (metadata_fsid) {
+		/*
+		 * Handle scanned device having completed its fsid change but
+		 * belonging to a fs_devices that was created by first scanning
+		 * a device which didn't have its fsid/metadata_uuid changed
+		 * at all and the CHANGING_FSID_V2 flag set.
+		 */
+		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+			if (fs_devices->fsid_change &&
+			    memcmp(metadata_fsid, fs_devices->fsid,
+				   BTRFS_FSID_SIZE) == 0 &&
+			    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+				   BTRFS_FSID_SIZE) == 0) {
+				return fs_devices;
+			}
+		}
+		/*
+		 * Handle scanned device having completed its fsid change but
+		 * belonging to a fs_devices that was created by a device that
+		 * has an outdated pair of fsid/metadata_uuid and
+		 * CHANGING_FSID_V2 flag set.
+		 */
+		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+			if (fs_devices->fsid_change &&
+			    memcmp(fs_devices->metadata_uuid,
+				   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+			    memcmp(metadata_fsid, fs_devices->metadata_uuid,
+				   BTRFS_FSID_SIZE) == 0) {
+				return fs_devices;
+			}
+		}
+	}
+
+	/* Handle non-split brain cases */
 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
-			return fs_devices;
+		if (metadata_fsid) {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+				      BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		} else {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		}
 	}
 	return NULL;
 }
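
The acceptance rules above are easier to audit as one predicate. A hedged
userspace restatement of find_fsid()'s three metadata_fsid cases plus the plain
case; FSID_SIZE, struct fs_ids and eq() are stand-ins for the kernel types:

#include <stdbool.h>
#include <string.h>

#define FSID_SIZE 16

struct fs_ids {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
	bool fsid_change;
};

static bool eq(const unsigned char *a, const unsigned char *b)
{
	return memcmp(a, b, FSID_SIZE) == 0;
}

/* Mirror of the accepting cases in find_fsid() above. */
static bool fsid_matches(const struct fs_ids *fs, const unsigned char *fsid,
			 const unsigned char *metadata_fsid)
{
	if (!metadata_fsid)
		return eq(fsid, fs->fsid);

	/* fsid change completed; fs_devices created from an unchanged disk */
	if (fs->fsid_change && eq(metadata_fsid, fs->fsid) &&
	    eq(fs->fsid, fs->metadata_uuid))
		return true;
	/* fsid change completed; fs_devices holds an outdated fsid pair */
	if (fs->fsid_change && !eq(fs->metadata_uuid, fs->fsid) &&
	    eq(metadata_fsid, fs->metadata_uuid))
		return true;
	/* no split brain: both ids must match directly */
	return eq(fsid, fs->fsid) && eq(metadata_fsid, fs->metadata_uuid);
}

int main(void)
{
	struct fs_ids fs = { .fsid_change = false };
	unsigned char id[FSID_SIZE] = { 0 };

	memcpy(fs.fsid, id, FSID_SIZE);
	memcpy(fs.metadata_uuid, id, FSID_SIZE);
	return fsid_matches(&fs, id, NULL) ? 0 : 1;
}
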
@@ -622,6 +707,17 @@
 	run_scheduled_bios(device);
 }
 
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+	int found;
+
+	rcu_read_lock();
+	found = strcmp(rcu_str_deref(device->name), path);
+	rcu_read_unlock();
+
+	return found == 0;
+}
+
 /*
  *  Search and remove all stale (devices which are not mounted) devices.
  *  When both inputs are NULL, it will search and release all stale devices.
@@ -629,52 +725,57 @@
  *		matching this path only.
  *  skip_dev:	Optional. Will skip this device when searching for the stale
  *		devices.
+ *  Return:	0 for success or if @path is NULL.
+ * 		-EBUSY if @path is a mounted device.
+ * 		-ENOENT if @path does not match any device in the list.
  */
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
 				     struct btrfs_device *skip_device)
 {
 	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 	struct btrfs_device *device, *tmp_device;
+	int ret = 0;
+
+	if (path)
+		ret = -ENOENT;
 
 	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
-		mutex_lock(&fs_devices->device_list_mutex);
-		if (fs_devices->opened) {
-			mutex_unlock(&fs_devices->device_list_mutex);
-			continue;
-		}
 
+		mutex_lock(&fs_devices->device_list_mutex);
 		list_for_each_entry_safe(device, tmp_device,
 					 &fs_devices->devices, dev_list) {
-			int not_found = 0;
-
 			if (skip_device && skip_device == device)
 				continue;
 			if (path && !device->name)
 				continue;
-
-			rcu_read_lock();
-			if (path)
-				not_found = strcmp(rcu_str_deref(device->name),
-						   path);
-			rcu_read_unlock();
-			if (not_found)
+			if (path && !device_path_matched(path, device))
 				continue;
+			if (fs_devices->opened) {
+				/* for an already deleted device return 0 */
+				if (path && ret != 0)
+					ret = -EBUSY;
+				break;
+			}
 
 			/* delete the stale device */
 			fs_devices->num_devices--;
 			list_del(&device->dev_list);
 			btrfs_free_device(device);
 
+			ret = 0;
 			if (fs_devices->num_devices == 0)
 				break;
 		}
 		mutex_unlock(&fs_devices->device_list_mutex);
+
 		if (fs_devices->num_devices == 0) {
 			btrfs_sysfs_remove_fsid(fs_devices);
 			list_del(&fs_devices->fs_list);
 			free_fs_devices(fs_devices);
 		}
 	}
+
+	return ret;
 }
 
 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -709,6 +810,13 @@
 	device->generation = btrfs_super_generation(disk_super);
 
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+		if (btrfs_super_incompat_flags(disk_super) &
+		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+			pr_err(
+		"BTRFS: Invalid seeding and uuid-changed device detected\n");
+			goto error_brelse;
+		}
+
 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		fs_devices->seeding = 1;
 	} else {
@@ -744,6 +852,51 @@
 }
 
 /*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+			return fs_devices;
+		}
+	}
+
+	return NULL;
+}
+
+
+static struct btrfs_fs_devices *find_fsid_changed(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handles the case where a scanned device is part of an fs that had
+	 * multiple successful changes of FSID but the currently scanned device
+	 * didn't observe them, meaning our fsid will be different from theirs.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) != 0) {
+			return fs_devices;
+		}
+	}
+
+	return NULL;
+}
+/*
  * Add new device to list of registered devices
  *
  * Returns:
@@ -755,25 +908,72 @@
 			   bool *new_device_added)
 {
 	struct btrfs_device *device;
-	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_fs_devices *fs_devices = NULL;
 	struct rcu_string *name;
 	u64 found_transid = btrfs_super_generation(disk_super);
 	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 
-	fs_devices = find_fsid(disk_super->fsid);
+	if (fsid_change_in_progress) {
+		if (!has_metadata_uuid) {
+			/*
+			 * When we have an image which has CHANGING_FSID_V2 set
+			 * it might belong to either a filesystem which has
+			 * disks with completed fsid change or it might belong
+			 * to an fs with no UUID changes in effect, handle both.
+			 */
+			fs_devices = find_fsid_inprogress(disk_super);
+			if (!fs_devices)
+				fs_devices = find_fsid(disk_super->fsid, NULL);
+		} else {
+			fs_devices = find_fsid_changed(disk_super);
+		}
+	} else if (has_metadata_uuid) {
+		fs_devices = find_fsid(disk_super->fsid,
+				       disk_super->metadata_uuid);
+	} else {
+		fs_devices = find_fsid(disk_super->fsid, NULL);
+	}
+
+
 	if (!fs_devices) {
-		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (has_metadata_uuid)
+			fs_devices = alloc_fs_devices(disk_super->fsid,
+						      disk_super->metadata_uuid);
+		else
+			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
 		if (IS_ERR(fs_devices))
 			return ERR_CAST(fs_devices);
 
+		fs_devices->fsid_change = fsid_change_in_progress;
+
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add(&fs_devices->fs_list, &fs_uuids);
 
 		device = NULL;
 	} else {
 		mutex_lock(&fs_devices->device_list_mutex);
-		device = find_device(fs_devices, devid,
-				disk_super->dev_item.uuid);
+		device = btrfs_find_device(fs_devices, devid,
+				disk_super->dev_item.uuid, NULL, false);
+
+		/*
+		 * If this disk has been pulled into an fs_devices created by
+		 * a device which had the CHANGING_FSID_V2 flag then replace the
+		 * metadata_uuid/fsid values of the fs_devices.
+		 */
+		if (has_metadata_uuid && fs_devices->fsid_change &&
+		    found_transid > fs_devices->latest_generation) {
+			memcpy(fs_devices->fsid, disk_super->fsid,
+					BTRFS_FSID_SIZE);
+			memcpy(fs_devices->metadata_uuid,
+					disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+			fs_devices->fsid_change = false;
+		}
 	}
 
 	if (!device) {
@@ -850,6 +1050,35 @@
 			return ERR_PTR(-EEXIST);
 		}
 
+		/*
+		 * We are going to replace the device path for a given devid,
+		 * make sure it's the same device if the device is mounted
+		 */
+		if (device->bdev) {
+			struct block_device *path_bdev;
+
+			path_bdev = lookup_bdev(path);
+			if (IS_ERR(path_bdev)) {
+				mutex_unlock(&fs_devices->device_list_mutex);
+				return ERR_CAST(path_bdev);
+			}
+
+			if (device->bdev != path_bdev) {
+				bdput(path_bdev);
+				mutex_unlock(&fs_devices->device_list_mutex);
+				btrfs_warn_in_rcu(device->fs_info,
+			"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+					disk_super->fsid, devid,
+					rcu_str_deref(device->name), path);
+				return ERR_PTR(-EEXIST);
+			}
+			bdput(path_bdev);
+			btrfs_info_in_rcu(device->fs_info,
+				"device fsid %pU devid %llu moved old:%s new:%s",
+				disk_super->fsid, devid,
+				rcu_str_deref(device->name), path);
+		}
+
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
 			mutex_unlock(&fs_devices->device_list_mutex);
@@ -869,8 +1098,11 @@
 	 * it back. We need it to pick the disk with largest generation
 	 * (as above).
 	 */
-	if (!fs_devices->opened)
+	if (!fs_devices->opened) {
 		device->generation = found_transid;
+		fs_devices->latest_generation = max_t(u64, found_transid,
+						fs_devices->latest_generation);
+	}
 
 	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 
@@ -883,22 +1115,24 @@
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_device *device;
 	struct btrfs_device *orig_dev;
+	int ret = 0;
 
-	fs_devices = alloc_fs_devices(orig->fsid);
+	fs_devices = alloc_fs_devices(orig->fsid, NULL);
 	if (IS_ERR(fs_devices))
 		return fs_devices;
 
 	mutex_lock(&orig->device_list_mutex);
 	fs_devices->total_devices = orig->total_devices;
 
-	/* We have held the volume lock, it is safe to get the devices. */
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 		struct rcu_string *name;
 
 		device = btrfs_alloc_device(NULL, &orig_dev->devid,
 					    orig_dev->uuid);
-		if (IS_ERR(device))
+		if (IS_ERR(device)) {
+			ret = PTR_ERR(device);
 			goto error;
+		}
 
 		/*
 		 * This is ok to do without rcu read locked because we hold the
@@ -909,6 +1143,7 @@
 					GFP_KERNEL);
 			if (!name) {
 				btrfs_free_device(device);
+				ret = -ENOMEM;
 				goto error;
 			}
 			rcu_assign_pointer(device->name, name);
@@ -923,7 +1158,7 @@
 error:
 	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -993,14 +1228,6 @@
 	mutex_unlock(&uuid_mutex);
 }
 
-static void free_device_rcu(struct rcu_head *head)
-{
-	struct btrfs_device *device;
-
-	device = container_of(head, struct btrfs_device, rcu);
-	btrfs_free_device(device);
-}
-
 static void btrfs_close_bdev(struct btrfs_device *device)
 {
 	if (!device->bdev)
@@ -1048,7 +1275,8 @@
 	list_replace_rcu(&device->dev_list, &new_device->dev_list);
 	new_device->fs_devices = device->fs_devices;
 
-	call_rcu(&device->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(device);
 }
 
 static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -1193,7 +1421,7 @@
 	p = kmap(*page);
 
 	/* align our pointer to the offset of the super block */
-	*disk_super = p + (bytenr & ~PAGE_MASK);
+	*disk_super = p + offset_in_page(bytenr);
 
 	if (btrfs_super_bytenr(*disk_super) != bytenr ||
 	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
@@ -1208,6 +1436,17 @@
 	return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+	mutex_unlock(&uuid_mutex);
+
+	return ret;
+}
+
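
btrfs_forget_devices() is the kernel side of unregistering scanned devices. A
hedged usage sketch from user space, assuming the BTRFS_IOC_FORGET_DEV ioctl on
/dev/btrfs-control (how `btrfs device scan --forget` reaches this path); an
empty name asks to forget all stale devices:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* struct btrfs_ioctl_vol_args */

int forget_device(const char *path)
{
	struct btrfs_ioctl_vol_args args;
	int fd, ret;

	memset(&args, 0, sizeof(args));
	if (path)	/* NULL/empty name: drop every stale device */
		strncpy(args.name, path, sizeof(args.name) - 1);

	fd = open("/dev/btrfs-control", O_RDWR);
	if (fd < 0)
		return -1;
	ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, &args);
	close(fd);
	return ret;
}
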
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1257,58 +1496,29 @@
 	return device;
 }
 
-static int contains_pending_extent(struct btrfs_transaction *transaction,
-				   struct btrfs_device *device,
-				   u64 *start, u64 len)
+/*
+ * Try to find a chunk that intersects the [start, start + len] range and when
+ * one such is found, record the end of it in *start
+ */
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+				    u64 len)
 {
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct extent_map *em;
-	struct list_head *search_list = &fs_info->pinned_chunks;
-	int ret = 0;
-	u64 physical_start = *start;
+	u64 physical_start, physical_end;
 
-	if (transaction)
-		search_list = &transaction->pending_chunks;
-again:
-	list_for_each_entry(em, search_list, list) {
-		struct map_lookup *map;
-		int i;
+	lockdep_assert_held(&device->fs_info->chunk_mutex);
 
-		map = em->map_lookup;
-		for (i = 0; i < map->num_stripes; i++) {
-			u64 end;
+	if (!find_first_extent_bit(&device->alloc_state, *start,
+				   &physical_start, &physical_end,
+				   CHUNK_ALLOCATED, NULL)) {
 
-			if (map->stripes[i].dev != device)
-				continue;
-			if (map->stripes[i].physical >= physical_start + len ||
-			    map->stripes[i].physical + em->orig_block_len <=
-			    physical_start)
-				continue;
-			/*
-			 * Make sure that while processing the pinned list we do
-			 * not override our *start with a lower value, because
-			 * we can have pinned chunks that fall within this
-			 * device hole and that have lower physical addresses
-			 * than the pending chunks we processed before. If we
-			 * do not take this special care we can end up getting
-			 * 2 pending chunks that start at the same physical
-			 * device offsets because the end offset of a pinned
-			 * chunk can be equal to the start offset of some
-			 * pending chunk.
-			 */
-			end = map->stripes[i].physical + em->orig_block_len;
-			if (end > *start) {
-				*start = end;
-				ret = 1;
-			}
+		if (in_range(physical_start, *start, len) ||
+		    in_range(*start, physical_start,
+			     physical_end - physical_start)) {
+			*start = physical_end + 1;
+			return true;
 		}
 	}
-	if (search_list != &fs_info->pinned_chunks) {
-		search_list = &fs_info->pinned_chunks;
-		goto again;
-	}
-
-	return ret;
+	return false;
 }
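
The new overlap test relies on a half-open in_range(x, start, len), i.e.
start <= x < start + len: two ranges intersect iff either one's start lies
inside the other. A self-contained sketch, assuming that usual half-open
definition, with the macro spelled out:

#include <assert.h>
#include <stdbool.h>

#define in_range(x, start, len) ((x) >= (start) && (x) < (start) + (len))

/* [a, a + alen) and [b, b + blen) intersect iff either start is inside
 * the other range, which is exactly the pair of checks used above. */
static bool ranges_overlap(unsigned long a, unsigned long alen,
			   unsigned long b, unsigned long blen)
{
	return in_range(a, b, blen) || in_range(b, a, alen);
}

int main(void)
{
	assert(ranges_overlap(10, 5, 12, 5));	/* [10,15) vs [12,17) */
	assert(!ranges_overlap(10, 5, 15, 5));	/* touching, no overlap */
	return 0;
}
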
 
 
@@ -1332,10 +1542,16 @@
  * @len is used to store the size of the free space that we find.
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
+ *
+ * NOTE: This function will search *commit* root of device tree, and does extra
+ * check to ensure dev extents are not double allocated.
+ * This makes the function safe to allocate dev extents but may not report
+ * correct usable device space, as a device extent freed in the current
+ * transaction is not reported as available.
  */
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
-			       struct btrfs_device *device, u64 num_bytes,
-			       u64 search_start, u64 *start, u64 *len)
+static int find_free_dev_extent_start(struct btrfs_device *device,
+				u64 num_bytes, u64 search_start, u64 *start,
+				u64 *len)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_root *root = fs_info->dev_root;
@@ -1419,15 +1635,12 @@
 			 * Have to check before we set max_hole_start, otherwise
 			 * we could end up sending back this offset anyway.
 			 */
-			if (contains_pending_extent(transaction, device,
-						    &search_start,
+			if (contains_pending_extent(device, &search_start,
 						    hole_size)) {
-				if (key.offset >= search_start) {
+				if (key.offset >= search_start)
 					hole_size = key.offset - search_start;
-				} else {
-					WARN_ON_ONCE(1);
+				else
 					hole_size = 0;
-				}
 			}
 
 			if (hole_size > max_hole_size) {
@@ -1468,8 +1681,7 @@
 	if (search_end > search_start) {
 		hole_size = search_end - search_start;
 
-		if (contains_pending_extent(transaction, device, &search_start,
-					    hole_size)) {
+		if (contains_pending_extent(device, &search_start, hole_size)) {
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -1494,13 +1706,11 @@
 	return ret;
 }
 
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *len)
 {
 	/* FIXME use last free of some kind */
-	return find_free_dev_extent_start(trans->transaction, device,
-					  num_bytes, 0, start, len);
+	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
 }
 
 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1611,9 +1821,9 @@
 	struct rb_node *n;
 	u64 ret = 0;
 
-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
-	n = rb_last(&em_tree->map);
+	n = rb_last(&em_tree->map.rb_root);
 	if (n) {
 		em = rb_entry(n, struct extent_map, rb_node);
 		ret = em->start + em->len;
@@ -1643,7 +1853,12 @@
 	if (ret < 0)
 		goto error;
 
-	BUG_ON(ret == 0); /* Corruption */
+	if (ret == 0) {
+		/* Corruption */
+		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
+		ret = -EUCLEAN;
+		goto error;
+	}
 
 	ret = btrfs_previous_item(fs_info->chunk_root, path,
 				  BTRFS_DEV_ITEMS_OBJECTID,
@@ -1709,7 +1924,8 @@
 	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	ptr = btrfs_device_fsid(dev_item);
-	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+			    ptr, BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
@@ -1733,10 +1949,9 @@
 	filp_close(filp, NULL);
 }
 
-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
-			     struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_device *device)
 {
-	struct btrfs_root *root = fs_info->chunk_root;
+	struct btrfs_root *root = device->fs_info->chunk_root;
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1854,6 +2069,24 @@
 		fs_info->fs_devices->latest_bdev = next_device->bdev;
 }
 
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's currently
+ * being replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+	u64 num_devices = fs_info->fs_devices->num_devices;
+
+	down_read(&fs_info->dev_replace.rwsem);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+		ASSERT(num_devices > 1);
+		num_devices--;
+	}
+	up_read(&fs_info->dev_replace.rwsem);
+
+	return num_devices;
+}
+
 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		u64 devid)
 {
@@ -1865,22 +2098,30 @@
 
 	mutex_lock(&uuid_mutex);
 
-	num_devices = fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		WARN_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	num_devices = btrfs_num_devices(fs_info);
 
 	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
 	if (ret)
 		goto out;
 
-	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
-					   &device);
-	if (ret)
+	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+	if (IS_ERR(device)) {
+		if (PTR_ERR(device) == -ENOENT &&
+		    strcmp(device_path, "missing") == 0)
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+		else
+			ret = PTR_ERR(device);
 		goto out;
+	}
+
+	if (btrfs_pinned_by_swapfile(fs_info, device)) {
+		btrfs_warn_in_rcu(fs_info,
+		  "cannot remove device %s (devid %llu) due to active swapfile",
+				  rcu_str_deref(device->name), device->devid);
+		ret = -ETXTBSY;
+		goto out;
+	}
 
 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -1911,12 +2152,12 @@
 	 * counter although write_all_supers() is not locked out. This
 	 * could give a filesystem state which requires a degraded mount.
 	 */
-	ret = btrfs_rm_dev_item(fs_info, device);
+	ret = btrfs_rm_dev_item(device);
 	if (ret)
 		goto error_undo;
 
 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	btrfs_scrub_cancel_dev(fs_info, device);
+	btrfs_scrub_cancel_dev(device);
 
 	/*
 	 * the device list mutex makes sure that we don't change
@@ -1967,7 +2208,8 @@
 		btrfs_scratch_superblocks(device->bdev, device->name->str);
 
 	btrfs_close_bdev(device);
-	call_rcu(&device->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(device);
 
 	if (cur_devices->open_devices == 0) {
 		while (fs_devices) {
@@ -2024,9 +2266,9 @@
 		fs_devices->open_devices--;
 }
 
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
-				      struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
 {
+	struct btrfs_fs_info *fs_info = srcdev->fs_info;
 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
 
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
@@ -2035,7 +2277,8 @@
 	}
 
 	btrfs_close_bdev(srcdev);
-	call_rcu(&srcdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(srcdev);
 
 	/* if there are no devices left we'd rather delete the fs_devices */
 	if (!fs_devices->num_devices) {
@@ -2093,12 +2336,12 @@
 	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
 
 	btrfs_close_bdev(tgtdev);
-	call_rcu(&tgtdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(tgtdev);
 }
 
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
-				     const char *device_path,
-				     struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+		struct btrfs_fs_info *fs_info, const char *device_path)
 {
 	int ret = 0;
 	struct btrfs_super_block *disk_super;
@@ -2106,72 +2349,61 @@
 	u8 *dev_uuid;
 	struct block_device *bdev;
 	struct buffer_head *bh;
+	struct btrfs_device *device;
 
-	*device = NULL;
 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
 				    fs_info->bdev_holder, 0, &bdev, &bh);
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_uuid = disk_super->dev_item.uuid;
-	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+					   disk_super->metadata_uuid, true);
+	else
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+					   disk_super->fsid, true);
+
 	brelse(bh);
-	if (!*device)
-		ret = -ENOENT;
+	if (!device)
+		device = ERR_PTR(-ENOENT);
 	blkdev_put(bdev, FMODE_READ);
-	return ret;
-}
-
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
-					 const char *device_path,
-					 struct btrfs_device **device)
-{
-	*device = NULL;
-	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *devices;
-		struct btrfs_device *tmp;
-
-		devices = &fs_info->fs_devices->devices;
-		list_for_each_entry(tmp, devices, dev_list) {
-			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-					&tmp->dev_state) && !tmp->bdev) {
-				*device = tmp;
-				break;
-			}
-		}
-
-		if (!*device)
-			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
-		return 0;
-	} else {
-		return btrfs_find_device_by_path(fs_info, device_path, device);
-	}
+	return device;
 }
 
 /*
  * Lookup a device given by device id, or the path if the id is 0.
  */
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
-				 const char *devpath,
-				 struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+		struct btrfs_fs_info *fs_info, u64 devid,
+		const char *device_path)
 {
-	int ret;
+	struct btrfs_device *device;
 
 	if (devid) {
-		ret = 0;
-		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
-		if (!*device)
-			ret = -ENOENT;
-	} else {
-		if (!devpath || !devpath[0])
-			return -EINVAL;
-
-		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
-							   device);
+		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+					   NULL, true);
+		if (!device)
+			return ERR_PTR(-ENOENT);
+		return device;
 	}
-	return ret;
+
+	if (!device_path || !device_path[0])
+		return ERR_PTR(-EINVAL);
+
+	if (strcmp(device_path, "missing") == 0) {
+		/* Find first missing device */
+		list_for_each_entry(device, &fs_info->fs_devices->devices,
+				    dev_list) {
+			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+				     &device->dev_state) && !device->bdev)
+				return device;
+		}
+		return ERR_PTR(-ENOENT);
+	}
+
+	return btrfs_find_device_by_path(fs_info, device_path);
 }
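
The hunks above convert the device lookup helpers from an int return code plus out-parameter to the kernel's ERR_PTR convention. A minimal userspace sketch, with simplified stand-ins for the ERR_PTR/IS_ERR/PTR_ERR helpers and a hypothetical lookup, shows how a caller such as btrfs_rm_device() distinguishes a valid pointer from an encoded errno:

    #include <stdio.h>
    #include <errno.h>

    /* Simplified stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR helpers. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct device { unsigned long long devid; };

    /* Hypothetical lookup mirroring btrfs_find_device_by_devspec's contract. */
    static struct device *find_by_devid(struct device *table, int n,
                                        unsigned long long devid)
    {
        for (int i = 0; i < n; i++)
            if (table[i].devid == devid)
                return &table[i];
        return ERR_PTR(-ENOENT); /* not found: encode errno in the pointer */
    }

    int main(void)
    {
        struct device devs[] = { { 1 }, { 2 } };
        struct device *dev = find_by_devid(devs, 2, 3);

        if (IS_ERR(dev))
            printf("lookup failed: %ld\n", PTR_ERR(dev)); /* -2 (ENOENT) */
        else
            printf("found devid %llu\n", dev->devid);
        return 0;
    }
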
 
 /*
@@ -2190,7 +2422,7 @@
 	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	seed_devices = alloc_fs_devices(NULL);
+	seed_devices = alloc_fs_devices(NULL, NULL);
 	if (IS_ERR(seed_devices))
 		return PTR_ERR(seed_devices);
 
@@ -2226,7 +2458,7 @@
 	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
-	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
@@ -2240,9 +2472,9 @@
 /*
  * Store the expected generation for seed devices in device items.
  */
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info)
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -2293,7 +2525,8 @@
 				   BTRFS_UUID_SIZE);
 		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 				   BTRFS_FSID_SIZE);
-		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+					   fs_uuid, true);
 		BUG_ON(!device); /* Logic error */
 
 		if (device->fs_devices->seeding) {
@@ -2441,7 +2674,7 @@
 
 	if (seeding_dev) {
 		mutex_lock(&fs_info->chunk_mutex);
-		ret = init_first_rw_device(trans, fs_info);
+		ret = init_first_rw_device(trans);
 		mutex_unlock(&fs_info->chunk_mutex);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -2456,22 +2689,14 @@
 	}
 
 	if (seeding_dev) {
-		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
-
-		ret = btrfs_finish_sprout(trans, fs_info);
+		ret = btrfs_finish_sprout(trans);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_sysfs;
 		}
 
-		/* Sprouting would change fsid of the mounted root,
-		 * so rename the fsid on the sysfs
-		 */
-		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
-						fs_info->fsid);
-		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
-			btrfs_warn(fs_info,
-				   "sysfs: failed to create fsid for sprout");
+		btrfs_sysfs_update_sprout_fsid(fs_devices,
+				fs_info->fs_devices->fsid);
 	}
 
 	ret = btrfs_commit_transaction(trans);
@@ -2588,7 +2813,6 @@
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
-	struct btrfs_fs_devices *fs_devices;
 	u64 old_total;
 	u64 diff;
 
@@ -2607,8 +2831,6 @@
 		return -EINVAL;
 	}
 
-	fs_devices = fs_info->fs_devices;
-
 	btrfs_set_super_total_bytes(super_copy,
 			round_down(old_total + diff, fs_info->sectorsize));
 	device->fs_devices->total_rw_bytes += diff;
@@ -2616,9 +2838,9 @@
 	btrfs_device_set_total_bytes(device, new_size);
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	btrfs_clear_space_info_full(device->fs_info);
-	if (list_empty(&device->resized_list))
-		list_add_tail(&device->resized_list,
-			      &fs_devices->resized_devices);
+	if (list_empty(&device->post_commit_list))
+		list_add_tail(&device->post_commit_list,
+			      &trans->transaction->dev_update_list);
 	mutex_unlock(&fs_info->chunk_mutex);
 
 	return btrfs_update_device(trans, device);
@@ -2706,13 +2928,20 @@
 	return ret;
 }
 
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-					u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @fs_info: Filesystem to search the mapping tree of.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+				       u64 logical, u64 length)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
 
-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, length);
 	read_unlock(&em_tree->lock);
@@ -2744,7 +2973,7 @@
 	int i, ret = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-	em = get_chunk_map(fs_info, chunk_offset, 1);
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 	if (IS_ERR(em)) {
 		/*
 		 * This is a logic error, but we don't want to just rely on the
@@ -2785,13 +3014,11 @@
 			mutex_unlock(&fs_info->chunk_mutex);
 		}
 
-		if (map->stripes[i].dev) {
-			ret = btrfs_update_device(trans, map->stripes[i].dev);
-			if (ret) {
-				mutex_unlock(&fs_devices->device_list_mutex);
-				btrfs_abort_transaction(trans, ret);
-				goto out;
-			}
+		ret = btrfs_update_device(trans, device);
+		if (ret) {
+			mutex_unlock(&fs_devices->device_list_mutex);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
 		}
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
@@ -2844,10 +3071,6 @@
 	 */
 	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
 
-	ret = btrfs_can_relocate(fs_info, chunk_offset);
-	if (ret)
-		return -ENOSPC;
-
 	/* step one, relocate all the extents inside this chunk */
 	btrfs_scrub_pause(fs_info);
 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
@@ -2855,16 +3078,6 @@
 	if (ret)
 		return ret;
 
-	/*
-	 * We add the kobjects here (and after forcing data chunk creation)
-	 * since relocation is the only place we'll create chunks of a new
-	 * type at runtime.  The only place where we'll remove the last
-	 * chunk of a type is the call immediately below this one.  Even
-	 * so, we're protected against races with the cleaner thread since
-	 * we're covered by the delete_unused_bgs_mutex.
-	 */
-	btrfs_add_raid_kobjects(fs_info);
-
 	trans = btrfs_start_trans_remove_block_group(root->fs_info,
 						     chunk_offset);
 	if (IS_ERR(trans)) {
@@ -2991,9 +3204,6 @@
 			btrfs_end_transaction(trans);
 			if (ret < 0)
 				return ret;
-
-			btrfs_add_raid_kobjects(fs_info);
-
 			return 1;
 		}
 	}
@@ -3247,6 +3457,18 @@
 	return 1;
 }
 
+static u64 calc_data_stripes(u64 type, int num_stripes)
+{
+	const int index = btrfs_bg_flags_to_raid_index(type);
+	const int ncopies = btrfs_raid_array[index].ncopies;
+	const int nparity = btrfs_raid_array[index].nparity;
+
+	if (nparity)
+		return num_stripes - nparity;
+	else
+		return num_stripes / ncopies;
+}
+
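
A standalone sketch of the same arithmetic, with the (ncopies, nparity) pairs hard-coded rather than read from btrfs_raid_array, reproduces the expected data-stripe counts:

    #include <stdio.h>

    /* Same arithmetic as calc_data_stripes(); the (ncopies, nparity) values
     * normally come from btrfs_raid_array and are hard-coded here. */
    static int data_stripes(int num_stripes, int ncopies, int nparity)
    {
        if (nparity)
            return num_stripes - nparity; /* RAID5/6: stripes minus parity */
        return num_stripes / ncopies;     /* striped/mirrored profiles */
    }

    int main(void)
    {
        /* RAID10 over 4 devices, 2 copies -> 2 data stripes */
        printf("RAID10: %d\n", data_stripes(4, 2, 0));
        /* RAID6 over 6 devices, 2 parity stripes -> 4 data stripes */
        printf("RAID6:  %d\n", data_stripes(6, 1, 2));
        return 0;
    }
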
 /* [pstart, pend) */
 static int chunk_drange_filter(struct extent_buffer *leaf,
 			       struct btrfs_chunk *chunk,
@@ -3256,22 +3478,15 @@
 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 	u64 stripe_offset;
 	u64 stripe_length;
+	u64 type;
 	int factor;
 	int i;
 
 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
 		return 0;
 
-	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
-		factor = num_stripes / 2;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
-		factor = num_stripes - 1;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
-		factor = num_stripes - 2;
-	} else {
-		factor = num_stripes;
-	}
+	type = btrfs_chunk_type(leaf, chunk);
+	factor = calc_data_stripes(type, num_stripes);
 
 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
@@ -3332,10 +3547,10 @@
 	return 0;
 }
 
-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
-				struct extent_buffer *leaf,
+static int should_balance_chunk(struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_balance_args *bargs = NULL;
 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
@@ -3425,17 +3640,11 @@
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
-	struct btrfs_root *dev_root = fs_info->dev_root;
-	struct list_head *devices;
-	struct btrfs_device *device;
-	u64 old_size;
-	u64 size_to_free;
 	u64 chunk_type;
 	struct btrfs_chunk *chunk;
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
 	int slot;
 	int ret;
@@ -3450,53 +3659,6 @@
 	u32 count_sys = 0;
 	int chunk_reserved = 0;
 
-	/* step one make some room on all the devices */
-	devices = &fs_info->fs_devices->devices;
-	list_for_each_entry(device, devices, dev_list) {
-		old_size = btrfs_device_get_total_bytes(device);
-		size_to_free = div_factor(old_size, 1);
-		size_to_free = min_t(u64, size_to_free, SZ_1M);
-		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
-		    btrfs_device_get_total_bytes(device) -
-		    btrfs_device_get_bytes_used(device) > size_to_free ||
-		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
-			continue;
-
-		ret = btrfs_shrink_device(device, old_size - size_to_free);
-		if (ret == -ENOSPC)
-			break;
-		if (ret) {
-			/* btrfs_shrink_device never returns ret > 0 */
-			WARN_ON(ret > 0);
-			goto error;
-		}
-
-		trans = btrfs_start_transaction(dev_root, 0);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			btrfs_info_in_rcu(fs_info,
-		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
-					  rcu_str_deref(device->name), ret,
-					  old_size, old_size - size_to_free);
-			goto error;
-		}
-
-		ret = btrfs_grow_device(trans, device, old_size);
-		if (ret) {
-			btrfs_end_transaction(trans);
-			/* btrfs_grow_device never returns ret > 0 */
-			WARN_ON(ret > 0);
-			btrfs_info_in_rcu(fs_info,
-		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
-					  rcu_str_deref(device->name), ret,
-					  old_size, old_size - size_to_free);
-			goto error;
-		}
-
-		btrfs_end_transaction(trans);
-	}
-
-	/* step two, relocate all the chunks */
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -3568,8 +3730,7 @@
 			spin_unlock(&fs_info->balance_lock);
 		}
 
-		ret = should_balance_chunk(fs_info, leaf, chunk,
-					   found_key.offset);
+		ret = should_balance_chunk(leaf, chunk, found_key.offset);
 
 		btrfs_release_path(path);
 		if (!ret) {
@@ -3626,10 +3787,15 @@
 
 		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-		if (ret && ret != -ENOSPC)
-			goto error;
 		if (ret == -ENOSPC) {
 			enospc_errors++;
+		} else if (ret == -ETXTBSY) {
+			btrfs_info(fs_info,
+	   "skipping relocation of block group %llu due to active swapfile",
+				   found_key.offset);
+			ret = 0;
+		} else if (ret) {
+			goto error;
 		} else {
 			spin_lock(&fs_info->balance_lock);
 			bctl->stat.completed++;
@@ -3679,7 +3845,11 @@
 		return !extended; /* "0" is valid for usual profiles */
 
 	/* true if exactly one bit set */
-	return (flags & (flags - 1)) == 0;
+	/*
+	 * Don't use is_power_of_2(unsigned long) because it won't work
+	 * for the single profile (1ULL << 48) on 32-bit CPUs.
+	 */
+	return flags != 0 && (flags & (flags - 1)) == 0;
 }
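
A small userspace check, assuming the same 64-bit flags layout, illustrates why the open-coded test is preferred over is_power_of_2(unsigned long) here:

    #include <stdio.h>
    #include <stdint.h>

    /* The open-coded u64 single-bit test from the hunk above. */
    static int exactly_one_bit_set(uint64_t flags)
    {
        return flags != 0 && (flags & (flags - 1)) == 0;
    }

    int main(void)
    {
        uint64_t single = 1ULL << 48; /* the in-memory SINGLE profile bit */

        /* On 32-bit, is_power_of_2() takes unsigned long, so 1ULL << 48
         * would be truncated to 0 and misreported; the u64 test is safe. */
        printf("%d\n", exactly_one_bit_set(single));     /* 1 */
        printf("%d\n", exactly_one_bit_set(0));          /* 0 */
        printf("%d\n", exactly_one_bit_set(single | 1)); /* 0 */
        return 0;
    }
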
 
 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3700,6 +3870,160 @@
 }
 
 /*
+ * Fill @buf with textual description of balance filter flags @bargs, up to
+ * @size_buf including the terminating null. The output may be trimmed if it
+ * does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+				 u32 size_buf)
+{
+	int ret;
+	u32 size_bp = size_buf;
+	char *bp = buf;
+	u64 flags = bargs->flags;
+	char tmp_buf[128] = {'\0'};
+
+	if (!flags)
+		return;
+
+#define CHECK_APPEND_NOARG(a)						\
+	do {								\
+		ret = snprintf(bp, size_bp, (a));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
+		CHECK_APPEND_1ARG("convert=%s,",
+				  btrfs_bg_type_to_raid_name(bargs->target));
+
+	if (flags & BTRFS_BALANCE_ARGS_SOFT)
+		CHECK_APPEND_NOARG("soft,");
+
+	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+					    sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+	}
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE)
+		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+		CHECK_APPEND_2ARG("usage=%u..%u,",
+				  bargs->usage_min, bargs->usage_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_DEVID)
+		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+		CHECK_APPEND_2ARG("drange=%llu..%llu,",
+				  bargs->pstart, bargs->pend);
+
+	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+				  bargs->vstart, bargs->vend);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+		CHECK_APPEND_2ARG("limit=%u..%u,",
+				bargs->limit_min, bargs->limit_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+		CHECK_APPEND_2ARG("stripes=%u..%u,",
+				  bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+	else
+		buf[0] = '\0';
+}
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+	u32 size_buf = 1024;
+	char tmp_buf[192] = {'\0'};
+	char *buf;
+	char *bp;
+	u32 size_bp = size_buf;
+	int ret;
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	buf = kzalloc(size_buf, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (bctl->flags & BTRFS_BALANCE_FORCE)
+		CHECK_APPEND_1ARG("%s", "-f ");
+
+	if (bctl->flags & BTRFS_BALANCE_DATA) {
+		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_METADATA) {
+		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+	}
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+	btrfs_info(fs_info, "balance: %s %s",
+		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
+		   "resume" : "start", buf);
+
+	kfree(buf);
+}
+
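
The CHECK_APPEND_* macros implement a common append-to-fixed-buffer pattern: format into the remaining space, bail out on truncation, then trim the trailing separator. A condensed userspace sketch of the same pattern, with an illustrative APPEND macro and made-up filter values:

    #include <stdio.h>

    /* Userspace sketch of the CHECK_APPEND_* pattern: format into the
     * remaining space, stop on truncation, trim the trailing separator. */
    int main(void)
    {
        char buf[32];
        unsigned int size_bp = sizeof(buf);
        char *bp = buf;
        int ret;

    #define APPEND(...)                                  \
        do {                                             \
            ret = snprintf(bp, size_bp, __VA_ARGS__);    \
            if (ret < 0 || (unsigned int)ret >= size_bp) \
                goto out_overflow;                       \
            size_bp -= ret;                              \
            bp += ret;                                   \
        } while (0)

        APPEND("convert=%s,", "raid1");
        APPEND("usage=%u..%u,", 10, 90);

    out_overflow:
        if (size_bp < sizeof(buf))
            buf[sizeof(buf) - size_bp - 1] = '\0'; /* remove last ',' */
        else
            buf[0] = '\0';
        printf("%s\n", buf); /* convert=raid1,usage=10..90 */
        return 0;
    }
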
+/*
  * Should be called with the balance mutex held
  */
 int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -3712,6 +4036,8 @@
 	int ret;
 	u64 num_devices;
 	unsigned seq;
+	bool reducing_integrity;
+	int i;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3740,54 +4066,50 @@
 		}
 	}
 
-	num_devices = fs_info->fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		BUG_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
-	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
-	if (num_devices > 1)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
-	if (num_devices > 2)
-		allowed |= BTRFS_BLOCK_GROUP_RAID5;
-	if (num_devices > 3)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
-			    BTRFS_BLOCK_GROUP_RAID6);
-	if (validate_convert_profile(&bctl->data, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+	num_devices = btrfs_num_devices(fs_info);
 
+	/*
+	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
+	 * special bit for it, to make it easier to distinguish.  Thus we need
+	 * to set it manually, or balance would refuse the profile.
+	 */
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
+		if (num_devices >= btrfs_raid_array[i].devs_min)
+			allowed |= btrfs_raid_array[i].bg_flag;
+
+	if (validate_convert_profile(&bctl->data, allowed)) {
 		btrfs_err(fs_info,
 			  "balance: invalid convert data profile %s",
-			  get_raid_name(index));
+			  btrfs_bg_type_to_raid_name(bctl->data.target));
 		ret = -EINVAL;
 		goto out;
 	}
 	if (validate_convert_profile(&bctl->meta, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
-
 		btrfs_err(fs_info,
 			  "balance: invalid convert metadata profile %s",
-			  get_raid_name(index));
+			  btrfs_bg_type_to_raid_name(bctl->meta.target));
 		ret = -EINVAL;
 		goto out;
 	}
 	if (validate_convert_profile(&bctl->sys, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
-
 		btrfs_err(fs_info,
 			  "balance: invalid convert system profile %s",
-			  get_raid_name(index));
+			  btrfs_bg_type_to_raid_name(bctl->sys.target));
 		ret = -EINVAL;
 		goto out;
 	}
 
-	/* allow to reduce meta or sys integrity only if force set */
-	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10 |
-			BTRFS_BLOCK_GROUP_RAID5 |
-			BTRFS_BLOCK_GROUP_RAID6;
+	/*
+	 * Allow to reduce metadata or system integrity only if force set for
+	 * profiles with redundancy (copies, parity)
+	 */
+	allowed = 0;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
+		if (btrfs_raid_array[i].ncopies >= 2 ||
+		    btrfs_raid_array[i].tolerated_failures >= 1)
+			allowed |= btrfs_raid_array[i].bg_flag;
+	}
 	do {
 		seq = read_seqbegin(&fs_info->profiles_lock);
 
@@ -3796,32 +4118,44 @@
 		     !(bctl->sys.target & allowed)) ||
 		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 		     (fs_info->avail_metadata_alloc_bits & allowed) &&
-		     !(bctl->meta.target & allowed))) {
-			if (bctl->flags & BTRFS_BALANCE_FORCE) {
-				btrfs_info(fs_info,
-				"balance: force reducing metadata integrity");
-			} else {
-				btrfs_err(fs_info,
-	"balance: reduces metadata integrity, use --force if you want this");
-				ret = -EINVAL;
-				goto out;
-			}
-		}
+		     !(bctl->meta.target & allowed)))
+			reducing_integrity = true;
+		else
+			reducing_integrity = false;
+
+		/* if we're not converting, the target field is uninitialized */
+		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+			bctl->data.target : fs_info->avail_data_alloc_bits;
 	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
-	/* if we're not converting, the target field is uninitialized */
-	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
-	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-		bctl->data.target : fs_info->avail_data_alloc_bits;
+	if (reducing_integrity) {
+		if (bctl->flags & BTRFS_BALANCE_FORCE) {
+			btrfs_info(fs_info,
+				   "balance: force reducing metadata integrity");
+		} else {
+			btrfs_err(fs_info,
+	  "balance: reduces metadata integrity, use --force if you want this");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
-		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
-		int data_index = btrfs_bg_flags_to_raid_index(data_target);
-
 		btrfs_warn(fs_info,
 	"balance: metadata profile %s has lower redundancy than data profile %s",
-			   get_raid_name(meta_index), get_raid_name(data_index));
+				btrfs_bg_type_to_raid_name(meta_target),
+				btrfs_bg_type_to_raid_name(data_target));
+	}
+
+	if (fs_info->send_in_progress) {
+		btrfs_warn_rl(fs_info,
+"cannot run balance while send operations are in progress (%d in progress)",
+			      fs_info->send_in_progress);
+		ret = -EAGAIN;
+		goto out;
 	}
 
 	ret = insert_balance_item(fs_info, bctl);
@@ -3843,11 +4177,19 @@
 
 	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+	describe_balance_start_or_resume(fs_info);
 	mutex_unlock(&fs_info->balance_mutex);
 
 	ret = __btrfs_balance(fs_info);
 
 	mutex_lock(&fs_info->balance_mutex);
+	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+		btrfs_info(fs_info, "balance: paused");
+	else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+		btrfs_info(fs_info, "balance: canceled");
+	else
+		btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
 	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
 	if (bargs) {
@@ -3880,10 +4222,8 @@
 	int ret = 0;
 
 	mutex_lock(&fs_info->balance_mutex);
-	if (fs_info->balance_ctl) {
-		btrfs_info(fs_info, "balance: resuming");
+	if (fs_info->balance_ctl)
 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
-	}
 	mutex_unlock(&fs_info->balance_mutex);
 
 	return ret;
@@ -4276,8 +4616,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	uuid_root = btrfs_create_tree(trans, fs_info,
-				      BTRFS_UUID_TREE_OBJECTID);
+	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
 		ret = PTR_ERR(uuid_root);
 		btrfs_abort_transaction(trans, ret);
@@ -4337,15 +4676,16 @@
 	int slot;
 	int failed = 0;
 	bool retried = false;
-	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = btrfs_device_get_total_bytes(device);
 	u64 diff;
+	u64 start;
 
 	new_size = round_down(new_size, fs_info->sectorsize);
+	start = new_size;
 	diff = round_down(old_size - new_size, fs_info->sectorsize);
 
 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
@@ -4357,6 +4697,12 @@
 
 	path->reada = READA_BACK;
 
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
 	mutex_lock(&fs_info->chunk_mutex);
 
 	btrfs_device_set_total_bytes(device, new_size);
@@ -4364,7 +4710,21 @@
 		device->fs_devices->total_rw_bytes -= diff;
 		atomic64_sub(diff, &fs_info->free_chunk_space);
 	}
-	mutex_unlock(&fs_info->chunk_mutex);
+
+	/*
+	 * Once the device's size has been set to the new size, ensure all
+	 * in-memory chunks are synced to disk so that the loop below sees them
+	 * and relocates them accordingly.
+	 */
+	if (contains_pending_extent(device, &start, diff)) {
+		mutex_unlock(&fs_info->chunk_mutex);
+		ret = btrfs_commit_transaction(trans);
+		if (ret)
+			goto done;
+	} else {
+		mutex_unlock(&fs_info->chunk_mutex);
+		btrfs_end_transaction(trans);
+	}
 
 again:
 	key.objectid = device->devid;
@@ -4426,10 +4786,16 @@
 
 		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-		if (ret && ret != -ENOSPC)
-			goto done;
-		if (ret == -ENOSPC)
+		if (ret == -ENOSPC) {
 			failed++;
+		} else if (ret) {
+			if (ret == -ETXTBSY) {
+				btrfs_warn(fs_info,
+		   "could not shrink block group %llu due to active swapfile",
+					   chunk_offset);
+			}
+			goto done;
+		}
 	} while (key.offset-- > 0);
 
 	if (failed && !retried) {
@@ -4449,40 +4815,10 @@
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-
-	/*
-	 * We checked in the above loop all device extents that were already in
-	 * the device tree. However before we have updated the device's
-	 * total_bytes to the new size, we might have had chunk allocations that
-	 * have not complete yet (new block groups attached to transaction
-	 * handles), and therefore their device extents were not yet in the
-	 * device tree and we missed them in the loop above. So if we have any
-	 * pending chunk using a device extent that overlaps the device range
-	 * that we can not use anymore, commit the current transaction and
-	 * repeat the search on the device tree - this way we guarantee we will
-	 * not have chunks using device extents that end beyond 'new_size'.
-	 */
-	if (!checked_pending_chunks) {
-		u64 start = new_size;
-		u64 len = old_size - new_size;
-
-		if (contains_pending_extent(trans->transaction, device,
-					    &start, len)) {
-			mutex_unlock(&fs_info->chunk_mutex);
-			checked_pending_chunks = true;
-			failed = 0;
-			retried = false;
-			ret = btrfs_commit_transaction(trans);
-			if (ret)
-				goto done;
-			goto again;
-		}
-	}
-
 	btrfs_device_set_disk_total_bytes(device, new_size);
-	if (list_empty(&device->resized_list))
-		list_add_tail(&device->resized_list,
-			      &fs_info->fs_devices->resized_devices);
+	if (list_empty(&device->post_commit_list))
+		list_add_tail(&device->post_commit_list,
+			      &trans->transaction->dev_update_list);
 
 	WARN_ON(diff > old_total);
 	btrfs_set_super_total_bytes(super_copy,
@@ -4566,15 +4902,6 @@
 	btrfs_set_fs_incompat(info, RAID56);
 }
 
-#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
-			- sizeof(struct btrfs_chunk))		\
-			/ sizeof(struct btrfs_stripe) + 1)
-
-#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
-				- 2 * sizeof(struct btrfs_disk_key)	\
-				- 2 * sizeof(struct btrfs_chunk))	\
-				/ sizeof(struct btrfs_stripe) + 1)
-
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       u64 start, u64 type)
 {
@@ -4595,11 +4922,13 @@
 	int devs_min;		/* min devs needed */
 	int devs_increment;	/* ndevs has to be a multiple of this */
 	int ncopies;		/* how many copies the data has */
+	int nparity;		/* number of stripes worth of bytes to
+				   store parity information */
 	int ret;
 	u64 max_stripe_size;
 	u64 max_chunk_size;
 	u64 stripe_size;
-	u64 num_bytes;
+	u64 chunk_size;
 	int ndevs;
 	int i;
 	int j;
@@ -4618,15 +4947,16 @@
 	sub_stripes = btrfs_raid_array[index].sub_stripes;
 	dev_stripes = btrfs_raid_array[index].dev_stripes;
 	devs_max = btrfs_raid_array[index].devs_max;
+	if (!devs_max)
+		devs_max = BTRFS_MAX_DEVS(info);
 	devs_min = btrfs_raid_array[index].devs_min;
 	devs_increment = btrfs_raid_array[index].devs_increment;
 	ncopies = btrfs_raid_array[index].ncopies;
+	nparity = btrfs_raid_array[index].nparity;
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_stripe_size = SZ_1G;
 		max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
-		if (!devs_max)
-			devs_max = BTRFS_MAX_DEVS(info);
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		/* for larger filesystems, use larger metadata chunks */
 		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4634,20 +4964,17 @@
 		else
 			max_stripe_size = SZ_256M;
 		max_chunk_size = max_stripe_size;
-		if (!devs_max)
-			devs_max = BTRFS_MAX_DEVS(info);
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		max_stripe_size = SZ_32M;
 		max_chunk_size = 2 * max_stripe_size;
-		if (!devs_max)
-			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
+		devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
 	} else {
 		btrfs_err(info, "invalid chunk type 0x%llx requested",
 		       type);
-		BUG_ON(1);
+		BUG();
 	}
 
-	/* we don't want a chunk larger than 10% of writeable space */
+	/* We don't want a chunk larger than 10% of writable space */
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
@@ -4685,7 +5012,7 @@
 		if (total_avail == 0)
 			continue;
 
-		ret = find_free_dev_extent(trans, device,
+		ret = find_free_dev_extent(device,
 					   max_stripe_size * dev_stripes,
 					   &dev_offset, &max_avail);
 		if (ret && ret != -ENOSPC)
@@ -4750,30 +5077,22 @@
 	 * this will have to be fixed for RAID1 and RAID10 over
 	 * more drives
 	 */
-	data_stripes = num_stripes / ncopies;
-
-	if (type & BTRFS_BLOCK_GROUP_RAID5)
-		data_stripes = num_stripes - 1;
-
-	if (type & BTRFS_BLOCK_GROUP_RAID6)
-		data_stripes = num_stripes - 2;
+	data_stripes = (num_stripes - nparity) / ncopies;
 
 	/*
 	 * Use the number of data stripes to figure out how big this chunk
 	 * is really going to be in terms of logical address space,
-	 * and compare that answer with the max chunk size
+	 * and compare that answer with the max chunk size. If it's higher,
+	 * we try to reduce stripe_size.
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
-		stripe_size = div_u64(max_chunk_size, data_stripes);
-
-		/* bump the answer up to a 16MB boundary */
-		stripe_size = round_up(stripe_size, SZ_16M);
-
 		/*
-		 * But don't go higher than the limits we found while searching
-		 * for free extents
+		 * Reduce stripe_size, round it up to a 16MB boundary again and
+		 * then use it, unless it ends up being even bigger than the
+		 * previous value we had already.
 		 */
-		stripe_size = min(devices_info[ndevs - 1].max_avail,
+		stripe_size = min(round_up(div_u64(max_chunk_size,
+						   data_stripes), SZ_16M),
 				  stripe_size);
 	}
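
A standalone arithmetic sketch with made-up sizes shows the effect of the clamp: divide the chunk-size cap by the data stripe count, round up to 16 MiB, and never grow past the stripe size already found:

    #include <stdio.h>
    #include <stdint.h>

    #define SZ_16M (16ULL * 1024 * 1024)
    #define SZ_1G  (1024ULL * 1024 * 1024)

    static uint64_t round_up_u64(uint64_t x, uint64_t align)
    {
        return (x + align - 1) / align * align;
    }

    int main(void)
    {
        uint64_t max_chunk_size = 2 * SZ_1G; /* cap, e.g. 10% of rw space */
        uint64_t stripe_size = SZ_1G;        /* found while scanning devices */
        int data_stripes = 3;

        /* 1G * 3 = 3G > 2G, so reduce: 2G / 3 rounded up to 16M is 688M,
         * which is also smaller than the previous 1G stripe size. */
        if (stripe_size * data_stripes > max_chunk_size) {
            uint64_t reduced = round_up_u64(max_chunk_size / data_stripes,
                                            SZ_16M);
            if (reduced < stripe_size)
                stripe_size = reduced;
        }

        printf("stripe_size = %llu MiB\n",
               (unsigned long long)(stripe_size >> 20));
        return 0;
    }
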
 
@@ -4801,9 +5120,9 @@
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
-	num_bytes = stripe_size * data_stripes;
+	chunk_size = stripe_size * data_stripes;
 
-	trace_btrfs_chunk_alloc(info, map, start, num_bytes);
+	trace_btrfs_chunk_alloc(info, map, start, chunk_size);
 
 	em = alloc_extent_map();
 	if (!em) {
@@ -4814,12 +5133,12 @@
 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
 	em->map_lookup = map;
 	em->start = start;
-	em->len = num_bytes;
+	em->len = chunk_size;
 	em->block_start = 0;
 	em->block_len = em->len;
 	em->orig_block_len = stripe_size;
 
-	em_tree = &info->mapping_tree.map_tree;
+	em_tree = &info->mapping_tree;
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
 	if (ret) {
@@ -4827,18 +5146,19 @@
 		free_extent_map(em);
 		goto error;
 	}
-
-	list_add_tail(&em->list, &trans->transaction->pending_chunks);
-	refcount_inc(&em->refs);
 	write_unlock(&em_tree->lock);
 
-	ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
+	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
 	if (ret)
 		goto error_del_extent;
 
 	for (i = 0; i < map->num_stripes; i++) {
-		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
-		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+		struct btrfs_device *dev = map->stripes[i].dev;
+
+		btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
+		if (list_empty(&dev->post_commit_list))
+			list_add_tail(&dev->post_commit_list,
+				      &trans->transaction->dev_update_list);
 	}
 
 	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
@@ -4858,8 +5178,6 @@
 	free_extent_map(em);
 	/* One for the tree reference */
 	free_extent_map(em);
-	/* One for the pending_chunks list reference */
-	free_extent_map(em);
 error:
 	kfree(devices_info);
 	return ret;
@@ -4883,7 +5201,7 @@
 	int i = 0;
 	int ret = 0;
 
-	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
 	if (IS_ERR(em))
 		return PTR_ERR(em);
 
@@ -4964,10 +5282,10 @@
 }
 
 /*
- * Chunk allocation falls into two parts. The first part does works
- * that make the new allocated chunk useable, but not do any operation
- * that modifies the chunk tree. The second part does the works that
- * require modifying the chunk tree. This division is important for the
+ * Chunk allocation falls into two parts. The first part does work
+ * that makes the new allocated chunk usable, but does not do any operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
  * bootstrap process of adding storage to a seed btrfs.
  */
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
@@ -4979,9 +5297,9 @@
 	return __btrfs_alloc_chunk(trans, chunk_offset, type);
 }
 
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
-					 struct btrfs_fs_info *fs_info)
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	u64 chunk_offset;
 	u64 sys_chunk_offset;
 	u64 alloc_profile;
@@ -5001,20 +5319,9 @@
 
 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
 {
-	int max_errors;
+	const int index = btrfs_bg_flags_to_raid_index(map->type);
 
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
-			 BTRFS_BLOCK_GROUP_RAID10 |
-			 BTRFS_BLOCK_GROUP_RAID5 |
-			 BTRFS_BLOCK_GROUP_DUP)) {
-		max_errors = 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
-		max_errors = 2;
-	} else {
-		max_errors = 0;
-	}
-
-	return max_errors;
+	return btrfs_raid_array[index].tolerated_failures;
 }
 
 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
@@ -5025,7 +5332,7 @@
 	int miss_ndevs = 0;
 	int i;
 
-	em = get_chunk_map(fs_info, chunk_offset, 1);
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 	if (IS_ERR(em))
 		return 1;
 
@@ -5055,21 +5362,16 @@
 	return readonly;
 }
 
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
-{
-	extent_map_tree_init(&tree->map_tree);
-}
-
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
 {
 	struct extent_map *em;
 
 	while (1) {
-		write_lock(&tree->map_tree.lock);
-		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+		write_lock(&tree->lock);
+		em = lookup_extent_mapping(tree, 0, (u64)-1);
 		if (em)
-			remove_extent_mapping(&tree->map_tree, em);
-		write_unlock(&tree->map_tree.lock);
+			remove_extent_mapping(tree, em);
+		write_unlock(&tree->lock);
 		if (!em)
 			break;
 		/* once for us */
@@ -5085,7 +5387,7 @@
 	struct map_lookup *map;
 	int ret;
 
-	em = get_chunk_map(fs_info, logical, len);
+	em = btrfs_get_chunk_map(fs_info, logical, len);
 	if (IS_ERR(em))
 		/*
 		 * We could return errors for these cases, but that could get
@@ -5096,7 +5398,7 @@
 		return 1;
 
 	map = em->map_lookup;
-	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
@@ -5115,11 +5417,11 @@
 		ret = 1;
 	free_extent_map(em);
 
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+	down_read(&fs_info->dev_replace.rwsem);
 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
 	    fs_info->dev_replace.tgtdev)
 		ret++;
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	up_read(&fs_info->dev_replace.rwsem);
 
 	return ret;
 }
@@ -5131,7 +5433,7 @@
 	struct map_lookup *map;
 	unsigned long len = fs_info->sectorsize;
 
-	em = get_chunk_map(fs_info, logical, len);
+	em = btrfs_get_chunk_map(fs_info, logical, len);
 
 	if (!WARN_ON(IS_ERR(em))) {
 		map = em->map_lookup;
@@ -5148,7 +5450,7 @@
 	struct map_lookup *map;
 	int ret = 0;
 
-	em = get_chunk_map(fs_info, logical, len);
+	em = btrfs_get_chunk_map(fs_info, logical, len);
 
 	if (!WARN_ON(IS_ERR(em))) {
 		map = em->map_lookup;
@@ -5170,7 +5472,7 @@
 	struct btrfs_device *srcdev;
 
 	ASSERT((map->type &
-		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		num_stripes = map->sub_stripes;
@@ -5307,7 +5609,7 @@
 	/* discard always return a bbio */
 	ASSERT(bbio_ret);
 
-	em = get_chunk_map(fs_info, logical, length);
+	em = btrfs_get_chunk_map(fs_info, logical, length);
 	if (IS_ERR(em))
 		return PTR_ERR(em);
 
@@ -5359,7 +5661,7 @@
 					      &remaining_stripes);
 		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
 		last_stripe *= sub_stripes;
-	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 				BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = map->num_stripes;
 	} else {
@@ -5603,6 +5905,106 @@
 	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
 }
 
+/*
+ * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
+ *		       tuple. This information is used to calculate how big a
+ *		       particular bio can get before it straddles a stripe.
+ *
+ * @fs_info - the filesystem
+ * @logical - address that we want to figure out the geometry of
+ * @len	    - the length of IO we are going to perform, starting at @logical
+ * @op      - type of operation - write or read
+ * @io_geom - pointer used to return values
+ *
+ * Returns < 0 if no chunk can be found for the given logical address (which
+ * usually shouldn't happen unless @logical is corrupted), 0 otherwise.
+ */
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+			u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 offset;
+	u64 stripe_offset;
+	u64 stripe_nr;
+	u64 stripe_len;
+	u64 raid56_full_stripe_start = (u64)-1;
+	int data_stripes;
+	int ret = 0;
+
+	ASSERT(op != BTRFS_MAP_DISCARD);
+
+	em = btrfs_get_chunk_map(fs_info, logical, len);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	map = em->map_lookup;
+	/* Offset of this logical address in the chunk */
+	offset = logical - em->start;
+	/* Len of a stripe in a chunk */
+	stripe_len = map->stripe_len;
+	/* Stripe where this block falls in */
+	stripe_nr = div64_u64(offset, stripe_len);
+	/* Offset of stripe in the chunk */
+	stripe_offset = stripe_nr * stripe_len;
+	if (offset < stripe_offset) {
+		btrfs_crit(fs_info,
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
+			stripe_offset, offset, em->start, logical, stripe_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* stripe_offset is the offset of this block in its stripe */
+	stripe_offset = offset - stripe_offset;
+	data_stripes = nr_data_stripes(map);
+
+	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len = stripe_len - stripe_offset;
+
+		/*
+		 * In case of raid56, we need to know the stripe aligned start
+		 */
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+			unsigned long full_stripe_len = stripe_len * data_stripes;
+			raid56_full_stripe_start = offset;
+
+			/*
+			 * Allow a write of a full stripe, but make sure we
+			 * don't allow straddling of stripes
+			 */
+			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+					full_stripe_len);
+			raid56_full_stripe_start *= full_stripe_len;
+
+			/*
+			 * For writes to RAID[56], allow a full stripeset across
+			 * all disks. For other RAID types and for RAID[56]
+			 * reads, just allow a single stripe (on a single disk).
+			 */
+			if (op == BTRFS_MAP_WRITE) {
+				max_len = stripe_len * data_stripes -
+					  (offset - raid56_full_stripe_start);
+			}
+		}
+		len = min_t(u64, em->len - offset, max_len);
+	} else {
+		len = em->len - offset;
+	}
+
+	io_geom->len = len;
+	io_geom->offset = offset;
+	io_geom->stripe_len = stripe_len;
+	io_geom->stripe_nr = stripe_nr;
+	io_geom->stripe_offset = stripe_offset;
+	io_geom->raid56_stripe_offset = raid56_full_stripe_start;
+
+out:
+	/* once for us */
+	free_extent_map(em);
+	return ret;
+}
+
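
The stripe math in btrfs_get_io_geometry() reduces to integer division on the offset within the chunk. A worked example with made-up numbers (64 KiB stripes, an I/O starting 150 KiB into the chunk):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Made-up numbers: a chunk starting at 1 MiB with 64 KiB stripes. */
        uint64_t em_start   = 1024 * 1024;
        uint64_t stripe_len = 64 * 1024;
        uint64_t logical    = em_start + 150 * 1024; /* 150 KiB into chunk */

        uint64_t offset        = logical - em_start;              /* 153600 */
        uint64_t stripe_nr     = offset / stripe_len;             /* 2 */
        uint64_t stripe_offset = offset - stripe_nr * stripe_len; /* 22 KiB */

        printf("stripe_nr=%llu stripe_offset=%llu\n",
               (unsigned long long)stripe_nr,
               (unsigned long long)stripe_offset);
        return 0;
    }
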
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 			     enum btrfs_map_op op,
 			     u64 logical, u64 *length,
@@ -5611,11 +6013,11 @@
 {
 	struct extent_map *em;
 	struct map_lookup *map;
-	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
 	u64 stripe_len;
 	u32 stripe_index;
+	int data_stripes;
 	int i;
 	int ret = 0;
 	int num_stripes;
@@ -5628,81 +6030,37 @@
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
 	u64 raid56_full_stripe_start = (u64)-1;
+	struct btrfs_io_geometry geom;
+
+	ASSERT(bbio_ret);
 
 	if (op == BTRFS_MAP_DISCARD)
 		return __btrfs_map_block_for_discard(fs_info, logical,
 						     *length, bbio_ret);
 
-	em = get_chunk_map(fs_info, logical, *length);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+	if (ret < 0)
+		return ret;
 
+	em = btrfs_get_chunk_map(fs_info, logical, *length);
+	ASSERT(!IS_ERR(em));
 	map = em->map_lookup;
-	offset = logical - em->start;
 
-	stripe_len = map->stripe_len;
-	stripe_nr = offset;
-	/*
-	 * stripe_nr counts the total number of stripes we have to stride
-	 * to get to this block
-	 */
-	stripe_nr = div64_u64(stripe_nr, stripe_len);
+	*length = geom.len;
+	stripe_len = geom.stripe_len;
+	stripe_nr = geom.stripe_nr;
+	stripe_offset = geom.stripe_offset;
+	raid56_full_stripe_start = geom.raid56_stripe_offset;
+	data_stripes = nr_data_stripes(map);
 
-	stripe_offset = stripe_nr * stripe_len;
-	if (offset < stripe_offset) {
-		btrfs_crit(fs_info,
-			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
-			   stripe_offset, offset, em->start, logical,
-			   stripe_len);
-		free_extent_map(em);
-		return -EINVAL;
-	}
-
-	/* stripe_offset is the offset of this block in its stripe*/
-	stripe_offset = offset - stripe_offset;
-
-	/* if we're here for raid56, we need to know the stripe aligned start */
-	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
-		raid56_full_stripe_start = offset;
-
-		/* allow a write of a full stripe, but make sure we don't
-		 * allow straddling of stripes
-		 */
-		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
-				full_stripe_len);
-		raid56_full_stripe_start *= full_stripe_len;
-	}
-
-	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		u64 max_len;
-		/* For writes to RAID[56], allow a full stripeset across all disks.
-		   For other RAID types and for RAID[56] reads, just allow a single
-		   stripe (on a single disk). */
-		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-		    (op == BTRFS_MAP_WRITE)) {
-			max_len = stripe_len * nr_data_stripes(map) -
-				(offset - raid56_full_stripe_start);
-		} else {
-			/* we limit the length of each bio to what fits in a stripe */
-			max_len = stripe_len - stripe_offset;
-		}
-		*length = min_t(u64, em->len - offset, max_len);
-	} else {
-		*length = em->len - offset;
-	}
-
-	/* This is for when we're called from btrfs_merge_bio_hook() and all
-	   it cares about is the length */
-	if (!bbio_ret)
-		goto out;
-
-	btrfs_dev_replace_read_lock(dev_replace);
+	down_read(&dev_replace->rwsem);
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+	/*
+	 * Hold the semaphore for read during the whole operation; the write
+	 * lock is requested at commit time and must wait until we are done.
+	 */
 	if (!dev_replace_is_ongoing)
-		btrfs_dev_replace_read_unlock(dev_replace);
-	else
-		btrfs_dev_replace_set_lock_blocking(dev_replace);
+		up_read(&dev_replace->rwsem);
 
 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
 	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
@@ -5725,7 +6083,7 @@
 				&stripe_index);
 		if (!need_full_stripe(op))
 			mirror_num = 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		if (need_full_stripe(op))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
@@ -5767,7 +6125,7 @@
 		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
 			stripe_nr = div64_u64(raid56_full_stripe_start,
-					stripe_len * nr_data_stripes(map));
+					stripe_len * data_stripes);
 
 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5783,10 +6141,9 @@
 			 * Mirror #3 is RAID6 Q block.
 			 */
 			stripe_nr = div_u64_rem(stripe_nr,
-					nr_data_stripes(map), &stripe_index);
+					data_stripes, &stripe_index);
 			if (mirror_num > 1)
-				stripe_index = nr_data_stripes(map) +
-						mirror_num - 2;
+				stripe_index = data_stripes + mirror_num - 2;
 
 			/* We distribute the parity blocks across stripes */
 			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
@@ -5844,8 +6201,8 @@
 		div_u64_rem(stripe_nr, num_stripes, &rot);
 
 		/* Fill in the logical address of each stripe */
-		tmp = stripe_nr * nr_data_stripes(map);
-		for (i = 0; i < nr_data_stripes(map); i++)
+		tmp = stripe_nr * data_stripes;
+		for (i = 0; i < data_stripes; i++)
 			bbio->raid_map[(i+rot) % num_stripes] =
 				em->start + (tmp + i) * map->stripe_len;
 
@@ -5897,8 +6254,9 @@
 	}
 out:
 	if (dev_replace_is_ongoing) {
-		btrfs_dev_replace_clear_lock_blocking(dev_replace);
-		btrfs_dev_replace_read_unlock(dev_replace);
+		lockdep_assert_held(&dev_replace->rwsem);
+		/* Unlock and let waiting writers proceed */
+		up_read(&dev_replace->rwsem);
 	}
 	free_extent_map(em);
 	return ret;
@@ -5932,7 +6290,7 @@
 	u64 rmap_len;
 	int i, j, nr = 0;
 
-	em = get_chunk_map(fs_info, chunk_start, 1);
+	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
 	if (IS_ERR(em))
 		return -EIO;
 
@@ -6017,7 +6375,7 @@
 				if (bio_op(bio) == REQ_OP_WRITE)
 					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_WRITE_ERRS);
-				else
+				else if (!(bio->bi_opf & REQ_RAHEAD))
 					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
 				if (bio->bi_opf & REQ_PREFLUSH)
@@ -6072,12 +6430,6 @@
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
-	    !device->bdev) {
-		bio_io_error(bio);
-		return;
-	}
-
 	/* don't bother with additional async steps for reads, right now */
 	if (bio_op(bio) == REQ_OP_READ) {
 		btrfsic_submit_bio(bio);
@@ -6206,7 +6558,8 @@
 
 	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
 		dev = bbio->stripes[dev_nr].dev;
-		if (!dev || !dev->bdev ||
+		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+						   &dev->dev_state) ||
 		    (bio_op(first_bio) == REQ_OP_WRITE &&
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
 			bbio_error(bbio, first_bio, logical);
@@ -6225,21 +6578,36 @@
 	return BLK_STS_OK;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
-				       u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, traverse through the seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+				       u64 devid, u8 *uuid, u8 *fsid,
+				       bool seed)
 {
 	struct btrfs_device *device;
-	struct btrfs_fs_devices *cur_devices;
 
-	cur_devices = fs_info->fs_devices;
-	while (cur_devices) {
+	while (fs_devices) {
 		if (!fsid ||
-		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
-			device = find_device(cur_devices, devid, uuid);
-			if (device)
-				return device;
+		    !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+			list_for_each_entry(device, &fs_devices->devices,
+					    dev_list) {
+				if (device->devid == devid &&
+				    (!uuid || memcmp(device->uuid, uuid,
+						     BTRFS_UUID_SIZE) == 0))
+					return device;
+			}
 		}
-		cur_devices = cur_devices->seed;
+		if (seed)
+			fs_devices = fs_devices->seed;
+		else
+			return NULL;
 	}
 	return NULL;
 }
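
Each btrfs_fs_devices may chain to a seed filesystem's device list; the lookup walks that chain only when @seed is true. A simplified userspace model of the traversal (the structures are illustrative stand-ins, not the kernel's layout):

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative stand-ins for the kernel structures. */
    struct device {
        unsigned long long devid;
        struct device *next;
    };

    struct fs_devices {
        struct device *devices;
        struct fs_devices *seed; /* chained seed filesystem, if any */
    };

    static struct device *find_device(struct fs_devices *fs_devices,
                                      unsigned long long devid, int seed)
    {
        while (fs_devices) {
            for (struct device *d = fs_devices->devices; d; d = d->next)
                if (d->devid == devid)
                    return d;
            if (!seed)
                return NULL; /* don't descend into seed filesystems */
            fs_devices = fs_devices->seed;
        }
        return NULL;
    }

    int main(void)
    {
        struct device seed_dev = { 7, NULL };
        struct fs_devices seed_fs = { &seed_dev, NULL };
        struct device sprout_dev = { 1, NULL };
        struct fs_devices sprout = { &sprout_dev, &seed_fs };

        printf("with seed:    %s\n", find_device(&sprout, 7, 1) ? "found" : "NULL");
        printf("without seed: %s\n", find_device(&sprout, 7, 0) ? "found" : "NULL");
        return 0;
    }
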
@@ -6314,99 +6682,6 @@
 	return dev;
 }
 
-/* Return -EIO if any error, otherwise return 0. */
-static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
-				   struct extent_buffer *leaf,
-				   struct btrfs_chunk *chunk, u64 logical)
-{
-	u64 length;
-	u64 stripe_len;
-	u16 num_stripes;
-	u16 sub_stripes;
-	u64 type;
-	u64 features;
-	bool mixed = false;
-
-	length = btrfs_chunk_length(leaf, chunk);
-	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
-	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
-	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
-	type = btrfs_chunk_type(leaf, chunk);
-
-	if (!num_stripes) {
-		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
-			  num_stripes);
-		return -EIO;
-	}
-	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
-		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
-		return -EIO;
-	}
-	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
-		btrfs_err(fs_info, "invalid chunk sectorsize %u",
-			  btrfs_chunk_sector_size(leaf, chunk));
-		return -EIO;
-	}
-	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
-		btrfs_err(fs_info, "invalid chunk length %llu", length);
-		return -EIO;
-	}
-	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
-		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
-			  stripe_len);
-		return -EIO;
-	}
-	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
-	    type) {
-		btrfs_err(fs_info, "unrecognized chunk type: %llu",
-			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
-			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
-			  btrfs_chunk_type(leaf, chunk));
-		return -EIO;
-	}
-
-	if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
-		btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
-		return -EIO;
-	}
-
-	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
-	    (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
-		btrfs_err(fs_info,
-			"system chunk with data or metadata type: 0x%llx", type);
-		return -EIO;
-	}
-
-	features = btrfs_super_incompat_flags(fs_info->super_copy);
-	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
-		mixed = true;
-
-	if (!mixed) {
-		if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
-		    (type & BTRFS_BLOCK_GROUP_DATA)) {
-			btrfs_err(fs_info,
-			"mixed chunk type in non-mixed mode: 0x%llx", type);
-			return -EIO;
-		}
-	}
-
-	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
-	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
-	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
-	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
-	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
-	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
-	     num_stripes != 1)) {
-		btrfs_err(fs_info,
-			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
-			num_stripes, sub_stripes,
-			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
 					u64 devid, u8 *uuid, bool error)
 {
@@ -6418,11 +6693,31 @@
 			      devid, uuid);
 }
 
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
-			  struct extent_buffer *leaf,
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+	int index = btrfs_bg_flags_to_raid_index(type);
+	int ncopies = btrfs_raid_array[index].ncopies;
+	int data_stripes;
+
+	switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case BTRFS_BLOCK_GROUP_RAID5:
+		data_stripes = num_stripes - 1;
+		break;
+	case BTRFS_BLOCK_GROUP_RAID6:
+		data_stripes = num_stripes - 2;
+		break;
+	default:
+		data_stripes = num_stripes / ncopies;
+		break;
+	}
+	return div_u64(chunk_len, data_stripes);
+}
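
calc_stripe_length() converts a chunk's logical length into the per-device stripe length by dividing by the number of data stripes. A quick standalone sanity check of that arithmetic (the profile constants and ncopies values are simplified stand-ins for the kernel's raid table):

#include <stdio.h>
#include <stdint.h>

enum profile { SINGLE, RAID1, RAID5, RAID6 };

static uint64_t stripe_length(enum profile p, uint64_t chunk_len,
			      int num_stripes, int ncopies)
{
	int data_stripes;

	switch (p) {
	case RAID5: data_stripes = num_stripes - 1; break; /* 1 parity stripe */
	case RAID6: data_stripes = num_stripes - 2; break; /* 2 parity stripes */
	default:    data_stripes = num_stripes / ncopies; break;
	}
	return chunk_len / data_stripes;
}

int main(void)
{
	/* 3 GiB RAID5 chunk across 4 devices: 3 data stripes, 1 GiB each */
	printf("%llu GiB\n", (unsigned long long)
	       (stripe_length(RAID5, 3ULL << 30, 4, 1) >> 30));
	/* 1 GiB RAID1 chunk on 2 devices (ncopies = 2): one 1 GiB stripe */
	printf("%llu GiB\n", (unsigned long long)
	       (stripe_length(RAID1, 1ULL << 30, 2, 2) >> 30));
	return 0;
}
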
+
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
 {
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	u64 logical;
@@ -6437,13 +6732,19 @@
 	length = btrfs_chunk_length(leaf, chunk);
 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 
-	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
-	if (ret)
-		return ret;
+	/*
+	 * We only need to verify the chunk item if we're reading from the sys
+	 * chunk array, as chunk items in tree blocks are already verified by
+	 * the tree-checker.
+	 */
+	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
+		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
+		if (ret)
+			return ret;
+	}
 
-	read_lock(&map_tree->map_tree.lock);
-	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-	read_unlock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->lock);
+	em = lookup_extent_mapping(map_tree, logical, 1);
+	read_unlock(&map_tree->lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
@@ -6477,6 +6778,8 @@
 	map->type = btrfs_chunk_type(leaf, chunk);
 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 	map->verified_stripes = 0;
+	em->orig_block_len = calc_stripe_length(map->type, em->len,
+						map->num_stripes);
 	for (i = 0; i < num_stripes; i++) {
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6484,8 +6787,8 @@
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
-							uuid, NULL);
+		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+							devid, uuid, NULL, true);
 		if (!map->stripes[i].dev &&
 		    !btrfs_test_opt(fs_info, DEGRADED)) {
 			free_extent_map(em);
@@ -6510,9 +6813,9 @@
 
 	}
 
-	write_lock(&map_tree->map_tree.lock);
-	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
-	write_unlock(&map_tree->map_tree.lock);
+	write_lock(&map_tree->lock);
+	ret = add_extent_mapping(map_tree, em, 0);
+	write_unlock(&map_tree->lock);
 	if (ret < 0) {
 		btrfs_err(fs_info,
 			  "failed to add chunk map, start=%llu len=%llu: %d",
@@ -6563,12 +6866,12 @@
 		fs_devices = fs_devices->seed;
 	}
 
-	fs_devices = find_fsid(fsid);
+	fs_devices = find_fsid(fsid, NULL);
 	if (!fs_devices) {
 		if (!btrfs_test_opt(fs_info, DEGRADED))
 			return ERR_PTR(-ENOENT);
 
-		fs_devices = alloc_fs_devices(fsid);
+		fs_devices = alloc_fs_devices(fsid, NULL);
 		if (IS_ERR(fs_devices))
 			return fs_devices;
 
@@ -6601,10 +6904,10 @@
 	return fs_devices;
 }
 
-static int read_one_dev(struct btrfs_fs_info *fs_info,
-			struct extent_buffer *leaf,
+static int read_one_dev(struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
 	u64 devid;
@@ -6618,13 +6921,14 @@
 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 			   BTRFS_FSID_SIZE);
 
-	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
+	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
 		fs_devices = open_seed_devices(fs_info, fs_uuid);
 		if (IS_ERR(fs_devices))
 			return PTR_ERR(fs_devices);
 	}
 
-	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+				   fs_uuid, true);
 	if (!device) {
 		if (!btrfs_test_opt(fs_info, DEGRADED)) {
 			btrfs_report_missing_device(fs_info, devid,
@@ -6792,7 +7096,7 @@
 			if (cur_offset + len > array_size)
 				goto out_short_read;
 
-			ret = read_one_chunk(fs_info, &key, sb, chunk);
+			ret = read_one_chunk(&key, sb, chunk);
 			if (ret)
 				break;
 		} else {
@@ -6829,14 +7133,14 @@
 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
 					struct btrfs_device *failing_dev)
 {
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	u64 next_start = 0;
 	bool ret = true;
 
-	read_lock(&map_tree->map_tree.lock);
-	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
-	read_unlock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->lock);
+	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
+	read_unlock(&map_tree->lock);
 	/* No chunk at all? Return false anyway */
 	if (!em) {
 		ret = false;
@@ -6865,7 +7169,7 @@
 		if (missing > max_tolerated) {
 			if (!failing_dev)
 				btrfs_warn(fs_info,
-	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
+	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
 				   em->start, missing, max_tolerated);
 			free_extent_map(em);
 			ret = false;
@@ -6874,10 +7178,10 @@
 		next_start = extent_map_end(em);
 		free_extent_map(em);
 
-		read_lock(&map_tree->map_tree.lock);
-		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
+		read_lock(&map_tree->lock);
+		em = lookup_extent_mapping(map_tree, next_start,
 					   (u64)(-1) - next_start);
-		read_unlock(&map_tree->map_tree.lock);
+		read_unlock(&map_tree->lock);
 	}
 out:
 	return ret;
@@ -6933,14 +7237,14 @@
 			struct btrfs_dev_item *dev_item;
 			dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-			ret = read_one_dev(fs_info, leaf, dev_item);
+			ret = read_one_dev(leaf, dev_item);
 			if (ret)
 				goto error;
 			total_dev++;
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
-			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
+			ret = read_one_chunk(&found_key, leaf, chunk);
 			if (ret)
 				goto error;
 		}
@@ -6992,18 +7296,32 @@
 	}
 }
 
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+				 const struct btrfs_dev_stats_item *ptr,
+				 int index)
 {
-	int i;
+	u64 val;
 
-	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
-		btrfs_dev_stat_reset(dev, i);
+	read_extent_buffer(eb, &val,
+			   offsetof(struct btrfs_dev_stats_item, values) +
+			    ((unsigned long)ptr) + (index * sizeof(u64)),
+			   sizeof(val));
+	return val;
+}
+
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+				      struct btrfs_dev_stats_item *ptr,
+				      int index, u64 val)
+{
+	write_extent_buffer(eb, &val,
+			    offsetof(struct btrfs_dev_stats_item, values) +
+			     ((unsigned long)ptr) + (index * sizeof(u64)),
+			    sizeof(val));
 }
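
Both helpers address a u64 at offsetof(struct btrfs_dev_stats_item, values) plus index * sizeof(u64) within the item. The read side can be modelled against flat memory (read_extent_buffer() is assumed here to behave like a memcpy from the buffer at the given offset; the struct is a simplified stand-in):

#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdint.h>

struct dev_stats_item { uint64_t values[5]; };	/* simplified stand-in */

static uint64_t stats_value(const unsigned char *eb, size_t item_off, int index)
{
	uint64_t val;

	memcpy(&val, eb + item_off +
		     offsetof(struct dev_stats_item, values) +
		     index * sizeof(uint64_t), sizeof(val));
	return val;
}

int main(void)
{
	unsigned char eb[256] = { 0 };
	struct dev_stats_item item = { .values = { 10, 20, 30, 40, 50 } };

	memcpy(eb + 64, &item, sizeof(item));	/* item stored at offset 64 */
	printf("%llu\n", (unsigned long long)stats_value(eb, 64, 2)); /* 30 */
	return 0;
}
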
 
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_key key;
-	struct btrfs_key found_key;
 	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct extent_buffer *eb;
@@ -7014,10 +7332,8 @@
 	int i;
 
 	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!path)
+		return -ENOMEM;
 
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
@@ -7029,14 +7345,14 @@
 		key.offset = device->devid;
 		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
 		if (ret) {
-			__btrfs_reset_dev_stats(device);
+			for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+				btrfs_dev_stat_set(device, i, 0);
 			device->dev_stats_valid = 1;
 			btrfs_release_path(path);
 			continue;
 		}
 		slot = path->slots[0];
 		eb = path->nodes[0];
-		btrfs_item_key_to_cpu(eb, &found_key, slot);
 		item_size = btrfs_item_size_nr(eb, slot);
 
 		ptr = btrfs_item_ptr(eb, slot,
@@ -7047,7 +7363,7 @@
 				btrfs_dev_stat_set(device, i,
 					btrfs_dev_stats_value(eb, ptr, i));
 			else
-				btrfs_dev_stat_reset(device, i);
+				btrfs_dev_stat_set(device, i, 0);
 		}
 
 		device->dev_stats_valid = 1;
@@ -7056,7 +7372,6 @@
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 
-out:
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
@@ -7129,9 +7444,9 @@
 /*
  * called from commit_transaction. Writes all changed device stats to disk.
  */
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info)
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
 	int stats_cnt;
@@ -7214,7 +7529,8 @@
 	int i;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+				true);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (!dev) {
@@ -7229,7 +7545,7 @@
 				stats->values[i] =
 					btrfs_dev_stat_read_and_reset(dev, i);
 			else
-				btrfs_dev_stat_reset(dev, i);
+				btrfs_dev_stat_set(dev, i, 0);
 		}
 	} else {
 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
@@ -7272,51 +7588,34 @@
 }
 
 /*
- * Update the size of all devices, which is used for writing out the
- * super blocks.
+ * Update the size and bytes used for each device where it changed.  This is
+ * delayed since we would otherwise get errors while writing out the
+ * superblocks.
+ *
+ * Must be invoked during transaction commit.
  */
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
 {
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *curr, *next;
 
-	if (list_empty(&fs_devices->resized_devices))
+	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+
+	if (list_empty(&trans->dev_update_list))
 		return;
 
-	mutex_lock(&fs_devices->device_list_mutex);
-	mutex_lock(&fs_info->chunk_mutex);
-	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
-				 resized_list) {
-		list_del_init(&curr->resized_list);
+	/*
+	 * We don't need the device_list_mutex here.  This list is owned by the
+	 * transaction and the transaction must complete before the device is
+	 * released.
+	 */
+	mutex_lock(&trans->fs_info->chunk_mutex);
+	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
+				 post_commit_list) {
+		list_del_init(&curr->post_commit_list);
 		curr->commit_total_bytes = curr->disk_total_bytes;
+		curr->commit_bytes_used = curr->bytes_used;
 	}
-	mutex_unlock(&fs_info->chunk_mutex);
-	mutex_unlock(&fs_devices->device_list_mutex);
-}
-
-/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct extent_map *em;
-	struct map_lookup *map;
-	struct btrfs_device *dev;
-	int i;
-
-	if (list_empty(&trans->pending_chunks))
-		return;
-
-	/* In order to kick the device replace finish process */
-	mutex_lock(&fs_info->chunk_mutex);
-	list_for_each_entry(em, &trans->pending_chunks, list) {
-		map = em->map_lookup;
-
-		for (i = 0; i < map->num_stripes; i++) {
-			dev = map->stripes[i].dev;
-			dev->commit_bytes_used = dev->bytes_used;
-		}
-	}
-	mutex_unlock(&fs_info->chunk_mutex);
+	mutex_unlock(&trans->fs_info->chunk_mutex);
 }
 
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
@@ -7342,40 +7641,21 @@
  */
 int btrfs_bg_type_to_factor(u64 flags)
 {
-	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-		     BTRFS_BLOCK_GROUP_RAID10))
-		return 2;
-	return 1;
+	const int index = btrfs_bg_flags_to_raid_index(flags);
+
+	return btrfs_raid_array[index].ncopies;
 }
 
 
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
-{
-	int index = btrfs_bg_flags_to_raid_index(type);
-	int ncopies = btrfs_raid_array[index].ncopies;
-	int data_stripes;
-
-	switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-	case BTRFS_BLOCK_GROUP_RAID5:
-		data_stripes = num_stripes - 1;
-		break;
-	case BTRFS_BLOCK_GROUP_RAID6:
-		data_stripes = num_stripes - 2;
-		break;
-	default:
-		data_stripes = num_stripes / ncopies;
-		break;
-	}
-	return div_u64(chunk_len, data_stripes);
-}
 
 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 				 u64 chunk_offset, u64 devid,
 				 u64 physical_offset, u64 physical_len)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
+	struct btrfs_device *dev;
 	u64 stripe_len;
 	bool found = false;
 	int ret = 0;
@@ -7425,6 +7705,35 @@
 			physical_offset, devid);
 		ret = -EUCLEAN;
 	}
+
+	/* Make sure no dev extent is beyond device boundary */
+	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+	if (!dev) {
+		btrfs_err(fs_info, "failed to find devid %llu", devid);
+		ret = -EUCLEAN;
+		goto out;
+	}
+
+	/* It's possible this device is a dummy for seed device */
+	if (dev->disk_total_bytes == 0) {
+		dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+					NULL, false);
+		if (!dev) {
+			btrfs_err(fs_info, "failed to find seed devid %llu",
+				  devid);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
+	if (physical_offset + physical_len > dev->disk_total_bytes) {
+		btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+			  devid, physical_offset, physical_len,
+			  dev->disk_total_bytes);
+		ret = -EUCLEAN;
+		goto out;
+	}
 out:
 	free_extent_map(em);
 	return ret;
@@ -7432,13 +7741,13 @@
 
 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct rb_node *node;
 	int ret = 0;
 
 	read_lock(&em_tree->lock);
-	for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
 		em = rb_entry(node, struct extent_map, rb_node);
 		if (em->map_lookup->num_stripes !=
 		    em->map_lookup->verified_stripes) {
@@ -7467,6 +7776,8 @@
 	struct btrfs_path *path;
 	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_key key;
+	u64 prev_devid = 0;
+	u64 prev_dev_ext_end = 0;
 	int ret = 0;
 
 	key.objectid = 1;
@@ -7511,10 +7822,22 @@
 		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
 		physical_len = btrfs_dev_extent_length(leaf, dext);
 
+		/* Check if this dev extent overlaps with the previous one */
+		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+			btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+				  devid, physical_offset, prev_dev_ext_end);
+			ret = -EUCLEAN;
+			goto out;
+		}
+
 		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
 					    physical_offset, physical_len);
 		if (ret < 0)
 			goto out;
+		prev_devid = devid;
+		prev_dev_ext_end = physical_offset + physical_len;
+
 		ret = btrfs_next_item(root, path);
 		if (ret < 0)
 			goto out;
@@ -7530,3 +7853,27 @@
 	btrfs_free_path(path);
 	return ret;
 }
+
+/*
+ * Check whether the given block group or device is pinned by any inode being
+ * used as a swapfile.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+	struct btrfs_swapfile_pin *sp;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->swapfile_pins_lock);
+	node = fs_info->swapfile_pins.rb_node;
+	while (node) {
+		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+		if (ptr < sp->ptr)
+			node = node->rb_left;
+		else if (ptr > sp->ptr)
+			node = node->rb_right;
+		else
+			break;
+	}
+	spin_unlock(&fs_info->swapfile_pins_lock);
+	return node != NULL;
+}
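
The pin lookup is a plain binary-tree search keyed on raw pointer value, which suffices because callers only ever ask whether this exact block group or device is pinned. A self-contained model, with integer keys standing in for the pointers:

#include <stdio.h>
#include <stdint.h>

struct node { uintptr_t key; struct node *left, *right; };

static int pinned(const struct node *n, uintptr_t ptr)
{
	while (n) {
		if (ptr < n->key)
			n = n->left;
		else if (ptr > n->key)
			n = n->right;
		else
			return 1;	/* exact match: pinned */
	}
	return 0;
}

int main(void)
{
	struct node right = { 30, NULL, NULL };
	struct node root  = { 20, NULL, &right };

	printf("%d %d\n", pinned(&root, 30), pinned(&root, 25));	/* 1 0 */
	return 0;
}
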
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285..a7da1f3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,6 +23,21 @@
 	struct bio *tail;
 };
 
+struct btrfs_io_geometry {
+	/* remaining bytes before crossing a stripe */
+	u64 len;
+	/* offset of logical address in chunk */
+	u64 offset;
+	/* length of single IO stripe */
+	u64 stripe_len;
+	/* number of the stripe where the address falls */
+	u64 stripe_nr;
+	/* offset of address in stripe */
+	u64 stripe_offset;
+	/* offset of raid56 stripe into the chunk */
+	u64 raid56_stripe_offset;
+};
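
How these fields relate for a plain striped profile can be checked in isolation; the 64 KiB stripe size and the addresses below are invented, and the snippet mirrors the arithmetic btrfs_get_io_geometry() performs rather than its actual code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stripe_len  = 64 * 1024;		/* assumed stripe size */
	uint64_t chunk_start = 1ULL << 20;		/* chunk logical start */
	uint64_t logical     = chunk_start + 200 * 1024;

	uint64_t offset        = logical - chunk_start;	/* offset in chunk */
	uint64_t stripe_nr     = offset / stripe_len;	/* which stripe */
	uint64_t stripe_offset = offset % stripe_len;	/* offset in stripe */
	uint64_t len           = stripe_len - stripe_offset; /* to stripe end */

	printf("stripe_nr=%llu stripe_offset=%llu len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)len);	/* 3, 8192, 57344 */
	return 0;
}
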
+
 /*
  * Use sequence counter to get consistent device stat data on
  * 32-bit processors.
@@ -43,8 +58,9 @@
 #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
 
 struct btrfs_device {
-	struct list_head dev_list;
-	struct list_head dev_alloc_list;
+	struct list_head dev_list; /* device_list_mutex */
+	struct list_head dev_alloc_list; /* chunk mutex */
+	struct list_head post_commit_list; /* chunk mutex */
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_fs_info *fs_info;
 
@@ -66,7 +82,6 @@
 
 	unsigned long dev_state;
 	blk_status_t last_flush_error;
-	int flush_bio_sent;
 
 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
 	seqcount_t data_seqcount;
@@ -102,18 +117,12 @@
 	 * size of the device on the current transaction
 	 *
 	 * This variant is update when committing the transaction,
-	 * and protected by device_list_mutex
+	 * and protected by chunk mutex
 	 */
 	u64 commit_total_bytes;
 
 	/* bytes used on the current transaction */
 	u64 commit_bytes_used;
-	/*
-	 * used to manage the device which is resized
-	 *
-	 * It is protected by chunk_lock.
-	 */
-	struct list_head resized_list;
 
 	/* for sending down flush barriers */
 	struct bio *flush_bio;
@@ -123,7 +132,6 @@
 	struct scrub_ctx *scrub_ctx;
 
 	struct btrfs_work work;
-	struct rcu_head rcu;
 
 	/* readahead state */
 	atomic_t reada_in_flight;
@@ -139,6 +147,8 @@
 	/* Counter to record the change of device stats */
 	atomic_t dev_stats_ccnt;
 	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+
+	struct extent_io_tree alloc_state;
 };
 
 /*
@@ -210,6 +220,8 @@
 
 struct btrfs_fs_devices {
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+	u8 metadata_uuid[BTRFS_FSID_SIZE];
+	bool fsid_change;
 	struct list_head fs_list;
 
 	u64 num_devices;
@@ -218,6 +230,10 @@
 	u64 missing_devices;
 	u64 total_rw_bytes;
 	u64 total_devices;
+
+	/* Highest generation number among all seen devices */
+	u64 latest_generation;
+
 	struct block_device *latest_bdev;
 
 	/* all of the devices in the FS, protected by a mutex
@@ -227,10 +243,14 @@
 	 * this mutex lock.
 	 */
 	struct mutex device_list_mutex;
+
+	/* List of all devices, protected by device_list_mutex */
 	struct list_head devices;
 
-	struct list_head resized_devices;
-	/* devices not currently being allocated */
+	/*
+	 * Devices which can satisfy space allocation. Protected by
+	 * chunk_mutex
+	 */
 	struct list_head alloc_list;
 
 	struct btrfs_fs_devices *seed;
@@ -252,6 +272,15 @@
 
 #define BTRFS_BIO_INLINE_CSUM_SIZE	64
 
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
+			- sizeof(struct btrfs_chunk))		\
+			/ sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
+				- 2 * sizeof(struct btrfs_disk_key)	\
+				- 2 * sizeof(struct btrfs_chunk))	\
+				/ sizeof(struct btrfs_stripe) + 1)
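
Back-of-envelope check of BTRFS_MAX_DEVS_SYS_CHUNK, assuming the usual on-disk sizes (sizeof(struct btrfs_chunk) = 80 bytes including one embedded stripe, sizeof(struct btrfs_stripe) = 32, sizeof(struct btrfs_disk_key) = 17, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE = 2048; verify against the headers before relying on these numbers):

	(2048 - 2*17 - 2*80) / 32 + 1 = 1854 / 32 + 1 = 57 + 1 = 58

The trailing "+ 1" in both macros accounts for the one stripe already embedded in struct btrfs_chunk itself.
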
+
 /*
  * we need the mirror number and stripe index to be passed around
  * the call chain while we are processing end_io (especially errors).
@@ -261,15 +290,12 @@
  * we allocate are actually btrfs_io_bios.  We'll cram as much of
  * struct btrfs_bio as we can into this over time.
  */
-typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
 struct btrfs_io_bio {
 	unsigned int mirror_num;
 	unsigned int stripe_index;
 	u64 logical;
 	u8 *csum;
 	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
-	u8 *csum_allocated;
-	btrfs_io_bio_end_io_t *end_io;
 	struct bvec_iter iter;
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
@@ -283,15 +309,20 @@
 	return container_of(bio, struct btrfs_io_bio, bio);
 }
 
+static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+{
+	if (io_bio->csum != io_bio->csum_inline) {
+		kfree(io_bio->csum);
+		io_bio->csum = NULL;
+	}
+}
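
The helper centralizes the "free only if not inline" rule, so call sites no longer need the old csum_allocated bookkeeping: checksums of up to BTRFS_BIO_INLINE_CSUM_SIZE bytes live in csum_inline and need no kfree(). An illustrative, not verbatim, caller pattern:

	/* sketch of an end-io path; the verification step is elided */
	static void end_io_sketch(struct btrfs_io_bio *io_bio)
	{
		/* ... verify io_bio->csum against the data just read ... */
		btrfs_io_bio_free_csum(io_bio);	/* safe in both cases */
	}
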
+
 struct btrfs_bio_stripe {
 	struct btrfs_device *dev;
 	u64 physical;
 	u64 length; /* only used for discard mappings */
 };
 
-struct btrfs_bio;
-typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-
 struct btrfs_bio {
 	refcount_t refs;
 	atomic_t stripes_pending;
@@ -324,14 +355,16 @@
 };
 
 struct btrfs_raid_attr {
-	int sub_stripes;	/* sub_stripes info for map */
-	int dev_stripes;	/* stripes per dev */
-	int devs_max;		/* max devs to use */
-	int devs_min;		/* min devs needed */
-	int tolerated_failures; /* max tolerated fail devs */
-	int devs_increment;	/* ndevs has to be a multiple of this */
-	int ncopies;		/* how many copies to data has */
-	int mindev_error;	/* error code if min devs requisite is unmet */
+	u8 sub_stripes;		/* sub_stripes info for map */
+	u8 dev_stripes;		/* stripes per dev */
+	u8 devs_max;		/* max devs to use */
+	u8 devs_min;		/* min devs needed */
+	u8 tolerated_failures;	/* max tolerated fail devs */
+	u8 devs_increment;	/* ndevs has to be a multiple of this */
+	u8 ncopies;		/* how many copies of the data we have */
+	u8 nparity;		/* number of stripes worth of bytes to store
+				 * parity information */
+	u8 mindev_error;	/* error code if min devs requisite is unmet */
 	const char raid_name[8]; /* name of the raid */
 	u64 bg_flag;		/* block group flag of the raid */
 };
@@ -380,6 +413,7 @@
 		return BTRFS_MAP_WRITE;
 	default:
 		WARN_ON_ONCE(1);
+		/* fall through */
 	case REQ_OP_READ:
 		return BTRFS_MAP_READ;
 	}
@@ -393,29 +427,28 @@
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		     u64 logical, u64 *length,
 		     struct btrfs_bio **bbio_ret);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+		u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 		     u64 physical, u64 **logical, int *naddrs, int *stripe_len);
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num, int async_submit);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
 					   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
 				     struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
-					 const char *device_path,
-					 struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
-					 const char *devpath,
-					 struct btrfs_device **device);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+						  u64 devid,
+						  const char *devpath);
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 					const u64 *devid,
 					const u8 *uuid);
@@ -426,13 +459,14 @@
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
-				       u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+				       u64 devid, u8 *uuid, u8 *fsid, bool seed);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
 int btrfs_balance(struct btrfs_fs_info *fs_info,
 		  struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
+void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
@@ -440,22 +474,16 @@
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
-			 struct btrfs_device *device, u64 num_bytes,
-			 u64 search_start, u64 *start, u64 *max_avail);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
 			struct btrfs_ioctl_get_dev_stats *stats);
 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
-				      struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
@@ -465,6 +493,8 @@
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 			     u64 chunk_offset, u64 chunk_size);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+				       u64 logical, u64 length);
 
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
@@ -517,12 +547,6 @@
 	atomic_inc(&dev->dev_stats_ccnt);
 }
 
-static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
-					int index)
-{
-	btrfs_dev_stat_set(dev, index, 0);
-}
-
 /*
  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
  * can be used as index to access btrfs_raid_array[].
@@ -545,10 +569,7 @@
 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
-const char *get_raid_name(enum btrfs_raid_types type);
-
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
 
 struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
@@ -557,6 +578,7 @@
 					struct btrfs_device *failing_dev);
 
 int btrfs_bg_type_to_factor(u64 flags);
+const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
 
 #endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ea78c3d..95d9aeb 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -11,6 +11,7 @@
 #include <linux/security.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/iversion.h>
+#include <linux/sched/mm.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
@@ -75,9 +76,8 @@
 	return ret;
 }
 
-static int do_setxattr(struct btrfs_trans_handle *trans,
-		       struct inode *inode, const char *name,
-		       const void *value, size_t size, int flags)
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+		   const char *name, const void *value, size_t size, int flags)
 {
 	struct btrfs_dir_item *di = NULL;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -86,6 +86,8 @@
 	size_t name_len = strlen(name);
 	int ret = 0;
 
+	ASSERT(trans);
+
 	if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
 		return -ENOSPC;
 
@@ -173,7 +175,7 @@
 		char *ptr;
 
 		if (size > old_data_len) {
-			if (btrfs_leaf_free_space(fs_info, leaf) <
+			if (btrfs_leaf_free_space(leaf) <
 			    (size - old_data_len)) {
 				ret = -ENOSPC;
 				goto out;
@@ -183,17 +185,15 @@
 		if (old_data_len + name_len + sizeof(*di) == item_size) {
 			/* No other xattrs packed in the same leaf item. */
 			if (size > old_data_len)
-				btrfs_extend_item(fs_info, path,
-						  size - old_data_len);
+				btrfs_extend_item(path, size - old_data_len);
 			else if (size < old_data_len)
-				btrfs_truncate_item(fs_info, path,
-						    data_size, 1);
+				btrfs_truncate_item(path, data_size, 1);
 		} else {
 			/* There are other xattrs packed in the same item. */
 			ret = btrfs_delete_one_dir_name(trans, root, path, di);
 			if (ret)
 				goto out;
-			btrfs_extend_item(fs_info, path, data_size);
+			btrfs_extend_item(path, data_size);
 		}
 
 		item = btrfs_item_nr(slot);
@@ -213,36 +213,32 @@
 	}
 out:
 	btrfs_free_path(path);
+	if (!ret)
+		set_bit(BTRFS_INODE_COPY_EVERYTHING,
+			&BTRFS_I(inode)->runtime_flags);
 	return ret;
 }
 
 /*
  * @value: "" makes the attribute to empty, NULL removes it
  */
-int btrfs_setxattr(struct btrfs_trans_handle *trans,
-		     struct inode *inode, const char *name,
-		     const void *value, size_t size, int flags)
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+			 const void *value, size_t size, int flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
-
-	if (trans)
-		return do_setxattr(trans, inode, name, value, size, flags);
-
 	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	ret = do_setxattr(trans, inode, name, value, size, flags);
+	ret = btrfs_setxattr(trans, inode, name, value, size, flags);
 	if (ret)
 		goto out;
 
 	inode_inc_iversion(inode);
 	inode->i_ctime = current_time(inode);
-	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
@@ -369,7 +365,7 @@
 				   size_t size, int flags)
 {
 	name = xattr_full_name(handler, name);
-	return btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+	return btrfs_setxattr_trans(inode, name, buffer, size, flags);
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
@@ -377,8 +373,30 @@
 					const char *name, const void *value,
 					size_t size, int flags)
 {
+	int ret;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
 	name = xattr_full_name(handler, name);
-	return btrfs_set_prop(inode, name, value, size, flags);
+	ret = btrfs_validate_prop(name, value, size);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+	if (!ret) {
+		inode_inc_iversion(inode);
+		inode->i_ctime = current_time(inode);
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+	}
+
+	btrfs_end_transaction(trans);
+
+	return ret;
 }
 
 static const struct xattr_handler btrfs_security_xattr_handler = {
@@ -418,13 +436,19 @@
 };
 
 static int btrfs_initxattrs(struct inode *inode,
-			    const struct xattr *xattr_array, void *fs_info)
+			    const struct xattr *xattr_array, void *fs_private)
 {
+	struct btrfs_trans_handle *trans = fs_private;
 	const struct xattr *xattr;
-	struct btrfs_trans_handle *trans = fs_info;
+	unsigned int nofs_flag;
 	char *name;
 	int err = 0;
 
+	/*
+	 * We're holding a transaction handle, so use a NOFS memory allocation
+	 * context to avoid deadlock if reclaim happens.
+	 */
+	nofs_flag = memalloc_nofs_save();
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
 		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
 			       strlen(xattr->name) + 1, GFP_KERNEL);
@@ -435,11 +459,12 @@
 		strcpy(name, XATTR_SECURITY_PREFIX);
 		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
 		err = btrfs_setxattr(trans, inode, name, xattr->value,
-				xattr->value_len, 0);
+				     xattr->value_len, 0);
 		kfree(name);
 		if (err < 0)
 			break;
 	}
+	memalloc_nofs_restore(nofs_flag);
 	return err;
 }
 
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 471fcac..1cd3fc0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -12,9 +12,10 @@
 
 int btrfs_getxattr(struct inode *inode, const char *name,
 		void *buffer, size_t size);
-int btrfs_setxattr(struct btrfs_trans_handle *trans,
-			    struct inode *inode, const char *name,
-			    const void *value, size_t size, int flags);
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+		   const char *name, const void *value, size_t size, int flags);
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+			 const void *value, size_t size, int flags);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e..df1aace 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@
 	int level;
 };
 
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+	btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+	btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+	struct list_head *ws = btrfs_get_workspace(&wsm, level);
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	workspace->level = level;
+
+	return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+	btrfs_put_workspace(&wsm, ws);
+}
+
 static void zlib_free_workspace(struct list_head *ws)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@
 	kfree(workspace);
 }
 
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
 {
 	struct workspace *workspace;
 	int workspacesize;
@@ -48,6 +75,7 @@
 	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
 			zlib_inflate_workspacesize());
 	workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+	workspace->level = level;
 	workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!workspace->strm.workspace || !workspace->buf)
 		goto fail;
@@ -390,22 +418,16 @@
 	return ret;
 }
 
-static void zlib_set_level(struct list_head *ws, unsigned int type)
-{
-	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	unsigned level = (type & 0xF0) >> 4;
-
-	if (level > 9)
-		level = 9;
-
-	workspace->level = level > 0 ? level : 3;
-}
-
 const struct btrfs_compress_op btrfs_zlib_compress = {
+	.init_workspace_manager	= zlib_init_workspace_manager,
+	.cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+	.get_workspace		= zlib_get_workspace,
+	.put_workspace		= zlib_put_workspace,
 	.alloc_workspace	= zlib_alloc_workspace,
 	.free_workspace		= zlib_free_workspace,
 	.compress_pages		= zlib_compress_pages,
 	.decompress_bio		= zlib_decompress_bio,
 	.decompress		= zlib_decompress,
-	.set_level              = zlib_set_level,
+	.max_level		= 9,
+	.default_level		= BTRFS_ZLIB_DEFAULT_LEVEL,
 };
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59..764d47b 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,32 @@
  */
 
 #include <linux/bio.h>
+#include <linux/bitmap.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/pagemap.h>
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/zstd.h>
+#include "misc.h"
 #include "compression.h"
+#include "ctree.h"
 
 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
 #define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
 
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+						 size_t src_len)
 {
-	ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
-						src_len, 0);
+	ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
 
 	if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
 		params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +43,295 @@
 	void *mem;
 	size_t size;
 	char *buf;
+	unsigned int level;
+	unsigned int req_level;
+	unsigned long last_used; /* jiffies */
 	struct list_head list;
+	struct list_head lru_list;
 	ZSTD_inBuffer in_buf;
 	ZSTD_outBuffer out_buf;
 };
 
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru.  Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scanning up.  This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee.  A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level.  Putting a workspace back involves returning it to its level list
+ * and, if necessary, to the lru.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES.  This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
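
The "scan upward from the requested level" step can be modelled in a few lines of userspace C; a plain loop stands in for for_each_set_bit_from(), with 1-based levels mapped to 0-based bits:

#include <stdio.h>

#define MAX_LEVEL 15

/* Return the first level >= want whose idle list is non-empty, else 0. */
static int first_usable_level(unsigned long active_map, int want)
{
	for (int i = want - 1; i < MAX_LEVEL; i++)
		if (active_map & (1UL << i))
			return i + 1;
	return 0;
}

int main(void)
{
	unsigned long map = (1UL << 2) | (1UL << 14);	/* levels 3, 15 idle */

	printf("%d\n", first_usable_level(map, 2));	/* 3 */
	printf("%d\n", first_usable_level(map, 4));	/* 15: larger ws reused */
	return 0;
}
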
+
+struct zstd_workspace_manager {
+	const struct btrfs_compress_op *ops;
+	spinlock_t lock;
+	struct list_head lru_list;
+	struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+	unsigned long active_map;
+	wait_queue_head_t wait;
+	struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+	return container_of(list, struct workspace, list);
+}
+
+static void zstd_free_workspace(struct list_head *ws);
+static struct list_head *zstd_alloc_workspace(unsigned int level);
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @timer: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+	unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+	struct list_head *pos, *next;
+
+	spin_lock_bh(&wsm.lock);
+
+	if (list_empty(&wsm.lru_list)) {
+		spin_unlock_bh(&wsm.lock);
+		return;
+	}
+
+	list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+		struct workspace *victim = container_of(pos, struct workspace,
+							lru_list);
+		unsigned int level;
+
+		if (time_after(victim->last_used, reclaim_threshold))
+			break;
+
+		/* workspace is in use */
+		if (victim->req_level)
+			continue;
+
+		level = victim->level;
+		list_del(&victim->lru_list);
+		list_del(&victim->list);
+		zstd_free_workspace(&victim->list);
+
+		if (list_empty(&wsm.idle_ws[level - 1]))
+			clear_bit(level - 1, &wsm.active_map);
+
+	}
+
+	if (!list_empty(&wsm.lru_list))
+		mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+	spin_unlock_bh(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace.  In order to reuse
+ * workspaces, this must be made a monotonic relationship.  This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+	size_t max_size = 0;
+	unsigned int level;
+
+	for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+		ZSTD_parameters params =
+			zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+		size_t level_size =
+			max_t(size_t,
+			      ZSTD_CStreamWorkspaceBound(params.cParams),
+			      ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+		max_size = max_t(size_t, max_size, level_size);
+		zstd_ws_mem_sizes[level - 1] = max_size;
+	}
+}
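
Worked example of the monotonic clamp (the per-level bounds here are invented for illustration; real values come from ZSTD_CStreamWorkspaceBound()):

	raw bound per level:	L1=1M	L2=3M	L3=2M	L4=4M
	stored (monotonic):	L1=1M	L2=3M	L3=3M	L4=4M

Because the stored sizes never decrease, a level-2 request can safely be satisfied by an idle level-3 workspace.
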
+
+static void zstd_init_workspace_manager(void)
+{
+	struct list_head *ws;
+	int i;
+
+	zstd_calc_ws_mem_sizes();
+
+	wsm.ops = &btrfs_zstd_compress;
+	spin_lock_init(&wsm.lock);
+	init_waitqueue_head(&wsm.wait);
+	timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+	INIT_LIST_HEAD(&wsm.lru_list);
+	for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+		INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+	ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+	if (IS_ERR(ws)) {
+		pr_warn(
+		"BTRFS: cannot preallocate zstd compression workspace\n");
+	} else {
+		set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+		list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+	}
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+	struct workspace *workspace;
+	int i;
+
+	spin_lock_bh(&wsm.lock);
+	for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+		while (!list_empty(&wsm.idle_ws[i])) {
+			workspace = container_of(wsm.idle_ws[i].next,
+						 struct workspace, list);
+			list_del(&workspace->list);
+			list_del(&workspace->lru_list);
+			zstd_free_workspace(&workspace->list);
+		}
+	}
+	spin_unlock_bh(&wsm.lock);
+
+	del_timer_sync(&wsm.timer);
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level.  This lets us utilize already allocated workspaces before
+ * allocating a new one.  If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated.  This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+	struct list_head *ws;
+	struct workspace *workspace;
+	int i = level - 1;
+
+	spin_lock_bh(&wsm.lock);
+	for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+		if (!list_empty(&wsm.idle_ws[i])) {
+			ws = wsm.idle_ws[i].next;
+			workspace = list_to_workspace(ws);
+			list_del_init(ws);
+			/* keep its place if it's a lower level using this */
+			workspace->req_level = level;
+			if (level == workspace->level)
+				list_del(&workspace->lru_list);
+			if (list_empty(&wsm.idle_ws[i]))
+				clear_bit(i, &wsm.active_map);
+			spin_unlock_bh(&wsm.lock);
+			return ws;
+		}
+	}
+	spin_unlock_bh(&wsm.lock);
+
+	return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used.  Therefore, we begin
+ * scanning from 1.  We first scan through possible workspaces and then after
+ * attempt to allocate a new workspace.  If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+	struct list_head *ws;
+	unsigned int nofs_flag;
+
+	/* level == 0 means we can use any workspace */
+	if (!level)
+		level = 1;
+
+again:
+	ws = zstd_find_workspace(level);
+	if (ws)
+		return ws;
+
+	nofs_flag = memalloc_nofs_save();
+	ws = zstd_alloc_workspace(level);
+	memalloc_nofs_restore(nofs_flag);
+
+	if (IS_ERR(ws)) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+		schedule();
+		finish_wait(&wsm.wait, &wait);
+
+		goto again;
+	}
+
+	return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level.  Here is where we continue to protect the
+ * max level workspace or update last_used accordingly.  If the reclaim timer
+ * isn't set, it is also set here.  Only putting back a max level workspace
+ * wakes up any waiters.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_to_workspace(ws);
+
+	spin_lock_bh(&wsm.lock);
+
+	/* A node is only taken off the lru if we are the corresponding level */
+	if (workspace->req_level == workspace->level) {
+		/* Hide a max level workspace from reclaim */
+		if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+			INIT_LIST_HEAD(&workspace->lru_list);
+		} else {
+			workspace->last_used = jiffies;
+			list_add(&workspace->lru_list, &wsm.lru_list);
+			if (!timer_pending(&wsm.timer))
+				mod_timer(&wsm.timer,
+					  jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+		}
+	}
+
+	set_bit(workspace->level - 1, &wsm.active_map);
+	list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+	workspace->req_level = 0;
+
+	spin_unlock_bh(&wsm.lock);
+
+	if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+		cond_wake_up(&wsm.wait);
+}
+
 static void zstd_free_workspace(struct list_head *ws)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +341,25 @@
 	kfree(workspace);
 }
 
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
 {
-	ZSTD_parameters params =
-			zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
 	struct workspace *workspace;
 
 	workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
 	if (!workspace)
 		return ERR_PTR(-ENOMEM);
 
-	workspace->size = max_t(size_t,
-			ZSTD_CStreamWorkspaceBound(params.cParams),
-			ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+	workspace->size = zstd_ws_mem_sizes[level - 1];
+	workspace->level = level;
+	workspace->req_level = level;
+	workspace->last_used = jiffies;
 	workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
 	workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!workspace->mem || !workspace->buf)
 		goto fail;
 
 	INIT_LIST_HEAD(&workspace->list);
+	INIT_LIST_HEAD(&workspace->lru_list);
 
 	return &workspace->list;
 fail:
@@ -95,7 +386,8 @@
 	unsigned long len = *total_out;
 	const unsigned long nr_dest_pages = *out_pages;
 	unsigned long max_out = nr_dest_pages * PAGE_SIZE;
-	ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+	ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+							   len);
 
 	*out_pages = 0;
 	*total_out = 0;
@@ -419,15 +711,16 @@
 	return ret;
 }
 
-static void zstd_set_level(struct list_head *ws, unsigned int type)
-{
-}
-
 const struct btrfs_compress_op btrfs_zstd_compress = {
+	.init_workspace_manager = zstd_init_workspace_manager,
+	.cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+	.get_workspace = zstd_get_workspace,
+	.put_workspace = zstd_put_workspace,
 	.alloc_workspace = zstd_alloc_workspace,
 	.free_workspace = zstd_free_workspace,
 	.compress_pages = zstd_compress_pages,
 	.decompress_bio = zstd_decompress_bio,
 	.decompress = zstd_decompress,
-	.set_level = zstd_set_level,
+	.max_level	= ZSTD_BTRFS_MAX_LEVEL,
+	.default_level	= ZSTD_BTRFS_DEFAULT_LEVEL,
 };
diff --git a/fs/buffer.c b/fs/buffer.c
index 6f1ae3a..86a38b9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/buffer.c
  *
@@ -200,6 +201,7 @@
 	struct buffer_head *head;
 	struct page *page;
 	int all_mapped = 1;
+	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 
 	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
@@ -227,15 +229,15 @@
 	 * file io on the block device and getblk.  It gets dealt with
 	 * elsewhere, don't buffer_error if we had some unmapped buffers
 	 */
-	if (all_mapped) {
-		printk("__find_get_block_slow() failed. "
-			"block=%llu, b_blocknr=%llu\n",
-			(unsigned long long)block,
-			(unsigned long long)bh->b_blocknr);
-		printk("b_state=0x%08lx, b_size=%zu\n",
-			bh->b_state, bh->b_size);
-		printk("device %pg blocksize: %d\n", bdev,
-			1 << bd_inode->i_blkbits);
+	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
+	if (all_mapped && __ratelimit(&last_warned)) {
+		printk("__find_get_block_slow() failed. block=%llu, "
+		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
+		       "device %pg blocksize: %d\n",
+		       (unsigned long long)block,
+		       (unsigned long long)bh->b_blocknr,
+		       bh->b_state, bh->b_size, bdev,
+		       1 << bd_inode->i_blkbits);
 	}
 out_unlock:
 	spin_unlock(&bd_mapping->private_lock);
@@ -562,7 +564,7 @@
 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 
 /*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
  * dirty.
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +581,8 @@
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
-		radix_tree_tag_set(&mapping->i_pages,
-				page_index(page), PAGECACHE_TAG_DIRTY);
+		__xa_set_mark(&mapping->i_pages, page_index(page),
+				PAGECACHE_TAG_DIRTY);
 	}
 	xa_unlock_irqrestore(&mapping->i_pages, flags);
 }
@@ -1050,7 +1052,7 @@
  * The relationship between dirty buffers and dirty pages:
  *
  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
  *
  * At all times, the dirtiness of the buffers represents the dirtiness of
  * subsections of the page.  If the page has buffers, the page dirty bit is
@@ -1073,9 +1075,9 @@
  * mark_buffer_dirty - mark a buffer_head as needing writeout
  * @bh: the buffer_head to mark dirty
  *
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
  * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
@@ -2084,40 +2086,6 @@
 }
 EXPORT_SYMBOL(block_write_begin);
 
-int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
-		struct page *page)
-{
-	loff_t old_size = inode->i_size;
-	bool i_size_changed = false;
-
-	/*
-	 * No need to use i_size_read() here, the i_size cannot change under us
-	 * because we hold i_rwsem.
-	 *
-	 * But it's important to update i_size while still holding page lock:
-	 * page writeout could otherwise come in and zero beyond i_size.
-	 */
-	if (pos + copied > inode->i_size) {
-		i_size_write(inode, pos + copied);
-		i_size_changed = true;
-	}
-
-	unlock_page(page);
-	put_page(page);
-
-	if (old_size < pos)
-		pagecache_isize_extended(inode, old_size, pos);
-	/*
-	 * Don't mark the inode dirty under page lock. First, it unnecessarily
-	 * makes the holding time of page lock longer. Second, it forces lock
-	 * ordering of page lock and transaction start for journaling
-	 * filesystems.
-	 */
-	if (i_size_changed)
-		mark_inode_dirty(inode);
-	return copied;
-}
-
 int block_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
@@ -2158,8 +2126,38 @@
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	bool i_size_changed = false;
+
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-	return __generic_write_end(mapping->host, pos, copied, page);
+
+	/*
+	 * No need to use i_size_read() here, the i_size cannot change under us
+	 * because we hold i_rwsem.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		i_size_changed = true;
+	}
+
+	unlock_page(page);
+	put_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+	return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
 
@@ -2366,7 +2364,7 @@
 
 		balance_dirty_pages_ratelimited(mapping);
 
-		if (unlikely(fatal_signal_pending(current))) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			goto out;
 		}
@@ -3026,13 +3024,23 @@
 	/* Uhhuh. We've got a bio that straddles the device size! */
 	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
 
+	/*
+	 * If more than one segment spans EOD, just return and let the I/O
+	 * layer turn it into an EIO.
+	 */
+	if (truncated_bytes > bvec->bv_len)
+		return;
+
 	/* Truncate the bio.. */
 	bio->bi_iter.bi_size -= truncated_bytes;
 	bvec->bv_len -= truncated_bytes;
 
 	/* ..and clear the end of the buffer for reads */
 	if (op == REQ_OP_READ) {
-		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+		struct bio_vec bv;
+
+		mp_bvec_last_segment(bvec, &bv);
+		zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
 				truncated_bytes);
 	}
 }
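
Worked example with invented numbers: on a 1000-sector (512 B) device, a bio starting at sector 998 with bi_size = 4096 has room for only 2 sectors, so truncated_bytes = 4096 - (2 << 9) = 3072. If the final segment is at least that long, the bio is shrunk in place (and for reads the tail of the last segment is zeroed); otherwise the new early return lets the block layer fail the bio with EIO instead of truncating across segments.
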
@@ -3060,11 +3068,6 @@
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (wbc) {
-		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, bh->b_page, bh->b_size);
-	}
-
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
 	bio->bi_write_hint = write_hint;
@@ -3084,6 +3087,11 @@
 		op_flags |= REQ_PRIO;
 	bio_set_op_attrs(bio, op, op_flags);
 
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+	}
+
 	submit_bio(bio);
 	return 0;
 }
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 80e9c61..ae559ed 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 
 config CACHEFILES
 	tristate "Filesystem caching on files"
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 4a717d4..dfb14db 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Bind and unbind a cache from the filesystem backing it
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3fdee21..752c1e4 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Daemon interface
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 222bc5d..4cea5fb 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache interface to CacheFiles
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index d2f6f99..cf9bd64 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* General netfs cache on cache files internal defs
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #ifdef pr_fmt
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 33b58c6..be96f5f 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Key to pathname encoder
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/slab.h>
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index f54d3f5..ddf0cd5 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Network filesystem caching backend to use cache files on a premounted
  * filesystem
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5ab411d..ecc8ecb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* CacheFiles path walking and related routines
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
@@ -20,7 +16,6 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/slab.h>
-#include <linux/xattr.h>
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
@@ -338,7 +333,7 @@
 try_again:
 	/* first step is to make up a grave dentry in the graveyard */
 	sprintf(nbuffer, "%08x%08x",
-		(uint32_t) get_seconds(),
+		(uint32_t) ktime_get_real_seconds(),
 		(uint32_t) atomic_inc_return(&cache->gravecounter));
 
 	/* do the multiway lock magic */
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
index 0ce1aa5..6e67aea 100644
--- a/fs/cachefiles/proc.c
+++ b/fs/cachefiles/proc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* CacheFiles statistics
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8a57740..44a3ce1 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Storage object read/write
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/mount.h>
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index 31bbc05..aec13fd 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* CacheFiles security management
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/fs.h>
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 511e6c6..72e4243 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* CacheFiles extended attribute management
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 52095f4..cf235f6 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CEPH_FS
 	tristate "Ceph distributed file system"
 	depends on INET
@@ -35,3 +36,15 @@
 	  groups beyond the owner/group/world scheme.
 
 	  If you don't know what Access Control Lists are, say N
+
+config CEPH_FS_SECURITY_LABEL
+	bool "CephFS Security Labels"
+	depends on CEPH_FS && SECURITY
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the Ceph filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
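
Assuming a v5.4 tree with this patch applied, the new option would be enabled next to an LSM with a config fragment along these lines (illustrative, not part of the patch):

CONFIG_SECURITY=y
CONFIG_SECURITY_SELINUX=y
CONFIG_CEPH_FS=m
CONFIG_CEPH_FS_SECURITY_LABEL=y
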
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e32..c1da294 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
 obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
-	export.o caps.o snap.o xattr.o quota.o \
+	export.o caps.o snap.o xattr.o quota.o io.o \
 	mds_client.o mdsmap.o strings.o ceph_frag.o \
 	debugfs.o
 
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 027408d..aa55f41 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/ceph/acl.c
  *
  * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/ceph/ceph_debug.h>
@@ -104,6 +91,11 @@
 	struct timespec64 old_ctime = inode->i_ctime;
 	umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
 
+	if (ceph_snap(inode) != CEPH_NOSNAP) {
+		ret = -EROFS;
+		goto out;
+	}
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -138,11 +130,6 @@
 			goto out_free;
 	}
 
-	if (ceph_snap(inode) != CEPH_NOSNAP) {
-		ret = -EROFS;
-		goto out_free;
-	}
-
 	if (new_mode != old_mode) {
 		newattrs.ia_ctime = current_time(inode);
 		newattrs.ia_mode = new_mode;
@@ -172,7 +159,7 @@
 }
 
 int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
-		       struct ceph_acls_info *info)
+		       struct ceph_acl_sec_ctx *as_ctx)
 {
 	struct posix_acl *acl, *default_acl;
 	size_t val_size1 = 0, val_size2 = 0;
@@ -206,10 +193,9 @@
 	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
 	if (!tmp_buf)
 		goto out_err;
-	pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
 	if (!pagelist)
 		goto out_err;
-	ceph_pagelist_init(pagelist);
 
 	err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
 	if (err)
@@ -248,9 +234,9 @@
 
 	kfree(tmp_buf);
 
-	info->acl = acl;
-	info->default_acl = default_acl;
-	info->pagelist = pagelist;
+	as_ctx->acl = acl;
+	as_ctx->default_acl = default_acl;
+	as_ctx->pagelist = pagelist;
 	return 0;
 
 out_err:
@@ -262,18 +248,10 @@
 	return err;
 }
 
-void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx)
 {
 	if (!inode)
 		return;
-	ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
-	ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
-}
-
-void ceph_release_acls_info(struct ceph_acls_info *info)
-{
-	posix_acl_release(info->acl);
-	posix_acl_release(info->default_acl);
-	if (info->pagelist)
-		ceph_pagelist_release(info->pagelist);
+	ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl);
+	ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl);
 }
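
One detail in the acl.c hunks above: the open-coded kmalloc() plus ceph_pagelist_init() pair becomes a single ceph_pagelist_alloc() call. A tiny userspace model of that constructor pattern (struct and names are illustrative):

#include <stdlib.h>

struct pagelist {
	size_t length;
	int refcnt;
};

/* Allocate and initialize in one step, so a caller can never observe a
 * half-initialized object or forget the separate init call. */
static struct pagelist *pagelist_alloc(void)
{
	struct pagelist *pl = calloc(1, sizeof(*pl));

	if (!pl)
		return NULL;
	pl->refcnt = 1;		/* caller owns the initial reference */
	return pl;
}
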
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9c332a6..7ab6166 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -10,6 +10,7 @@
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/signal.h>
+#include <linux/iversion.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -188,8 +189,7 @@
 {
 	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc =
-		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	int err = 0;
 	u64 off = page_offset(page);
 	u64 len = PAGE_SIZE;
@@ -218,8 +218,8 @@
 
 	dout("readpage inode %p file %p page %p index %lu\n",
 	     inode, filp, page, page->index);
-	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				  off, &len,
+	err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+				  &ci->i_layout, off, &len,
 				  ci->i_truncate_seq, ci->i_truncate_size,
 				  &page, 1, 0);
 	if (err == -ENOENT)
@@ -227,6 +227,8 @@
 	if (err < 0) {
 		SetPageError(page);
 		ceph_fscache_readpage_cancel(inode, page);
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		goto out;
 	}
 	if (err < PAGE_SIZE)
@@ -265,6 +267,8 @@
 	int i;
 
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+	if (rc == -EBLACKLISTED)
+		ceph_inode_to_client(inode)->blacklisted = true;
 
 	/* unlock all pages, zeroing any data we didn't read */
 	osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -306,7 +310,7 @@
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct page *page = list_entry(page_list->prev, struct page, lru);
+	struct page *page = lru_to_page(page_list);
 	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	u64 off;
@@ -322,7 +326,8 @@
 		/* caller of readpages does not hold buffer and read caps
 		 * (fadvise, madvise and readahead cases) */
 		int want = CEPH_CAP_FILE_CACHE;
-		ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+		ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
+					true, &got);
 		if (ret < 0) {
 			dout("start_read %p, error getting cap\n", inode);
 		} else if (!(got & want)) {
@@ -333,8 +338,7 @@
 			if (got)
 				ceph_put_cap_refs(ci, got);
 			while (!list_empty(page_list)) {
-				page = list_entry(page_list->prev,
-						  struct page, lru);
+				page = lru_to_page(page_list);
 				list_del(&page->lru);
 				put_page(page);
 			}
@@ -569,7 +573,7 @@
 /*
  * Write a single page, but leave the page locked.
  *
- * If we get a write error, set the page error bit, but still adjust the
+ * If we get a write error, mark the mapping for error, but still adjust the
  * dirty page accounting (i.e., page is no longer dirty).
  */
 static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
@@ -640,9 +644,10 @@
 			end_page_writeback(page);
 			return err;
 		}
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		dout("writepage setting page/mapping error %d %p\n",
 		     err, page);
-		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
 		wbc->pages_skipped++;
 	} else {
@@ -680,23 +685,6 @@
 }
 
 /*
- * lame release_pages helper.  release_pages() isn't exported to
- * modules.
- */
-static void ceph_release_pages(struct page **pages, int num)
-{
-	struct pagevec pvec;
-	int i;
-
-	pagevec_init(&pvec);
-	for (i = 0; i < num; i++) {
-		if (pagevec_add(&pvec, pages[i]) == 0)
-			pagevec_release(&pvec);
-	}
-	pagevec_release(&pvec);
-}
-
-/*
  * async writeback completion handler.
  *
  * If we get an error, set the mapping error bit, but not the individual
@@ -720,6 +708,8 @@
 	if (rc < 0) {
 		mapping_set_error(mapping, rc);
 		ceph_set_error_write(ci);
+		if (rc == -EBLACKLISTED)
+			fsc->blacklisted = true;
 	} else {
 		ceph_clear_error_write(ci);
 	}
@@ -769,7 +759,7 @@
 		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
 		     inode, osd_data->length, rc >= 0 ? num_pages : 0);
 
-		ceph_release_pages(osd_data->pages, num_pages);
+		release_pages(osd_data->pages, num_pages);
 	}
 
 	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
@@ -913,8 +903,9 @@
 			if (page_offset(page) >= ceph_wbc.i_size) {
 				dout("%p page eof %llu\n",
 				     page, ceph_wbc.i_size);
-				if (ceph_wbc.size_stable ||
-				    page_offset(page) >= i_size_read(inode))
+				if ((ceph_wbc.size_stable ||
+				    page_offset(page) >= i_size_read(inode)) &&
+				    clear_page_dirty_for_io(page))
 					mapping->a_ops->invalidatepage(page,
 								0, PAGE_SIZE);
 				unlock_page(page);
@@ -1451,7 +1442,8 @@
 		want = CEPH_CAP_FILE_CACHE;
 
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
+			    &got, &pinned_page);
 	if (err < 0)
 		goto out_restore;
 
@@ -1495,10 +1487,7 @@
 		if (err < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
 			put_page(page);
-			if (err == -ENOMEM)
-				ret = VM_FAULT_OOM;
-			else
-				ret = VM_FAULT_SIGBUS;
+			ret = vmf_error(err);
 			goto out_inline;
 		}
 		if (err < PAGE_SIZE)
@@ -1542,6 +1531,7 @@
 	if (!prealloc_cf)
 		return VM_FAULT_OOM;
 
+	sb_start_pagefault(inode->i_sb);
 	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -1570,7 +1560,7 @@
 		want = CEPH_CAP_FILE_BUFFER;
 
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
 			    &got, NULL);
 	if (err < 0)
 		goto out_free;
@@ -1580,6 +1570,7 @@
 
 	/* Update time before taking page lock */
 	file_update_time(vma->vm_file);
+	inode_inc_iversion_raw(inode);
 
 	do {
 		lock_page(page);
@@ -1615,6 +1606,7 @@
 	ceph_put_cap_refs(ci, got);
 out_free:
 	ceph_restore_sigs(&oldset);
+	sb_end_pagefault(inode->i_sb);
 	ceph_free_cap_flush(prealloc_cf);
 	if (err < 0)
 		ret = vmf_error(err);
@@ -1947,12 +1939,17 @@
 
 	if (err >= 0 || err == -ENOENT)
 		have |= POOL_READ;
-	else if (err != -EPERM)
+	else if (err != -EPERM) {
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		goto out_unlock;
+	}
 
 	if (err2 == 0 || err2 == -EEXIST)
 		have |= POOL_WRITE;
 	else if (err2 != -EPERM) {
+		if (err2 == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		err = err2;
 		goto out_unlock;
 	}
@@ -1990,10 +1987,11 @@
 	return err;
 }
 
-int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+int ceph_pool_perm_check(struct inode *inode, int need)
 {
-	s64 pool;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_string *pool_ns;
+	s64 pool;
 	int ret, flags;
 
 	if (ci->i_vino.snap != CEPH_NOSNAP) {
@@ -2005,7 +2003,7 @@
 		return 0;
 	}
 
-	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+	if (ceph_test_mount_opt(ceph_inode_to_client(inode),
 				NOPOOLPERM))
 		return 0;
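
A pattern that recurs throughout the addr.c hunks above: every -EBLACKLISTED return from the OSD layer also sets fsc->blacklisted. A hedged sketch of funneling that check through one helper; the ceph client open-codes it per call site, and the EBLACKLISTED-is-ESHUTDOWN alias here mirrors the ceph headers (an assumption of this sketch):

#include <errno.h>
#include <stdbool.h>

#define EBLACKLISTED ESHUTDOWN	/* assumed alias, as in the ceph client */

struct fs_client {
	bool blacklisted;
};

/* Pass every OSD error through this to keep the flag consistent. */
static int note_osd_error(struct fs_client *fsc, int err)
{
	if (err == -EBLACKLISTED)
		fsc->blacklisted = true;
	return err;
}
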
 
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 1bf3502..b2ec29e 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -1,26 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Ceph cache definitions.
  *
  *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
  *  Written by Milosz Tanski (milosz@adfin.com)
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
+#include <linux/ceph/ceph_debug.h>
+
 #include "super.h"
 #include "cache.h"
 
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 7e72c75..e486fac 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -1,24 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Ceph cache definitions.
  *
  *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
  *  Written by Milosz Tanski (milosz@adfin.com)
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2
- *  as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to:
- *  Free Software Foundation
- *  51 Franklin Street, Fifth Floor
- *  Boston, MA  02111-1301  USA
- *
  */
 
 #ifndef _CEPH_CACHE_H
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dd7dfdd..f5a3891 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
+#include <linux/iversion.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -148,11 +149,17 @@
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+			      struct ceph_mount_options *fsopt)
 {
 	spin_lock(&mdsc->caps_list_lock);
-	mdsc->caps_min_count += delta;
-	BUG_ON(mdsc->caps_min_count < 0);
+	mdsc->caps_min_count = fsopt->max_readdir;
+	if (mdsc->caps_min_count < 1024)
+		mdsc->caps_min_count = 1024;
+	mdsc->caps_use_max = fsopt->caps_max;
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_max < mdsc->caps_min_count)
+		mdsc->caps_use_max = mdsc->caps_min_count;
 	spin_unlock(&mdsc->caps_list_lock);
 }
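
The ceph_adjust_caps_max_min() replacement above turns a relative delta adjustment into absolute clamping. The same rules, pulled out into a standalone sketch (illustrative types):

struct caps_limits {
	int min_count;	/* floor on cached caps */
	int use_max;	/* 0 means unlimited */
};

static int max_int(int a, int b) { return a > b ? a : b; }

static void adjust_caps_max_min(struct caps_limits *l,
				int max_readdir, int caps_max)
{
	/* keep at least 1024 caps, or enough for one readdir batch */
	l->min_count = max_int(max_readdir, 1024);
	/* a non-zero ceiling must not undercut the floor */
	l->use_max = caps_max;
	if (l->use_max > 0 && l->use_max < l->min_count)
		l->use_max = l->min_count;
}
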
 
@@ -272,6 +279,7 @@
 	if (!err) {
 		BUG_ON(have + alloc != need);
 		ctx->count = need;
+		ctx->used = 0;
 	}
 
 	spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +303,24 @@
 }
 
 void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
-			struct ceph_cap_reservation *ctx)
+			 struct ceph_cap_reservation *ctx)
 {
+	bool reclaim = false;
+
+	if (!ctx->count)
+		return;
+
 	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
 	spin_lock(&mdsc->caps_list_lock);
 	__ceph_unreserve_caps(mdsc, ctx->count);
 	ctx->count = 0;
+
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_count > mdsc->caps_use_max)
+		reclaim = true;
 	spin_unlock(&mdsc->caps_list_lock);
+
+	if (reclaim)
+		ceph_reclaim_caps_nr(mdsc, ctx->used);
 }
 
 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
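
ceph_unreserve_caps() above also shows a common locking shape: the over-threshold test happens under caps_list_lock, but the reclaim itself runs only after the unlock. A compact pthread model of the shape (not the exact cap accounting):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t caps_lock = PTHREAD_MUTEX_INITIALIZER;
static int caps_use_count, caps_use_max;

static void reclaim_caps(int nr)
{
	(void)nr;	/* slow path: must not run with caps_lock held */
}

static void unreserve_caps(int count, int used)
{
	bool reclaim = false;

	pthread_mutex_lock(&caps_lock);
	caps_use_count -= count;
	if (caps_use_max > 0 && caps_use_count > caps_use_max)
		reclaim = true;			/* decide under the lock... */
	pthread_mutex_unlock(&caps_lock);

	if (reclaim)
		reclaim_caps(used);		/* ...act outside it */
}
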
@@ -346,6 +365,7 @@
 	BUG_ON(list_empty(&mdsc->caps_list));
 
 	ctx->count--;
+	ctx->used++;
 	mdsc->caps_reserve_count--;
 	mdsc->caps_use_count++;
 
@@ -438,37 +458,6 @@
 }
 
 /*
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
- */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
-{
-	struct ceph_cap *cap;
-	int mds = -1;
-	struct rb_node *p;
-
-	/* prefer mds with WR|BUFFER|EXCL caps */
-	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
-		cap = rb_entry(p, struct ceph_cap, ci_node);
-		mds = cap->mds;
-		if (cap->issued & (CEPH_CAP_FILE_WR |
-				   CEPH_CAP_FILE_BUFFER |
-				   CEPH_CAP_FILE_EXCL))
-			break;
-	}
-	return mds;
-}
-
-int ceph_get_cap_mds(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mds;
-	spin_lock(&ci->i_ceph_lock);
-	mds = __ceph_get_cap_mds(ceph_inode(inode));
-	spin_unlock(&ci->i_ceph_lock);
-	return mds;
-}
-
-/*
  * Called under i_ceph_lock.
  */
 static void __insert_cap_node(struct ceph_inode_info *ci,
@@ -500,12 +489,12 @@
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
-					    ma->caps_wanted_delay_min * HZ);
+					    opt->caps_wanted_delay_min * HZ);
 	ci->i_hold_caps_max = round_jiffies(jiffies +
-					    ma->caps_wanted_delay_max * HZ);
+					    opt->caps_wanted_delay_max * HZ);
 	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
 	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
 }
@@ -519,9 +508,9 @@
  *    -> we take mdsc->cap_delay_lock
  */
 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-				struct ceph_inode_info *ci)
+				struct ceph_inode_info *ci,
+				bool set_timeout)
 {
-	__cap_set_timeouts(mdsc, ci);
 	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
 	     ci->i_ceph_flags, ci->i_hold_caps_max);
 	if (!mdsc->stopping) {
@@ -531,6 +520,8 @@
 				goto no_change;
 			list_del_init(&ci->i_cap_delay_list);
 		}
+		if (set_timeout)
+			__cap_set_timeouts(mdsc, ci);
 		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 no_change:
 		spin_unlock(&mdsc->cap_delay_lock);
@@ -606,7 +597,7 @@
 /*
  * Add a capability under the given MDS session.
  *
- * Caller should hold session snap_rwsem (read) and s_mutex.
+ * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
  *
  * @fmode is the open file mode, if we are opening a file, otherwise
  * it is < 0.  (This is so we can atomically add the cap and add an
@@ -623,6 +614,9 @@
 	struct ceph_cap *cap;
 	int mds = session->s_mds;
 	int actual_wanted;
+	u32 gen;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
 
 	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
@@ -634,6 +628,10 @@
 	if (fmode >= 0)
 		wanted |= ceph_caps_for_mode(fmode);
 
+	spin_lock(&session->s_gen_ttl_lock);
+	gen = session->s_cap_gen;
+	spin_unlock(&session->s_gen_ttl_lock);
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (!cap) {
 		cap = *new_cap;
@@ -655,6 +653,13 @@
 		session->s_nr_caps++;
 		spin_unlock(&session->s_cap_lock);
 	} else {
+		spin_lock(&session->s_cap_lock);
+		list_move_tail(&cap->session_caps, &session->s_caps);
+		spin_unlock(&session->s_cap_lock);
+
+		if (cap->cap_gen < gen)
+			cap->issued = cap->implemented = CEPH_CAP_PIN;
+
 		/*
 		 * auth mds of the inode changed. we received the cap export
 		 * message, but still haven't received the cap import message.
@@ -720,7 +725,7 @@
 		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
 		     ceph_cap_string(issued), ceph_cap_string(wanted),
 		     ceph_cap_string(actual_wanted));
-		__cap_delay_requeue(mdsc, ci);
+		__cap_delay_requeue(mdsc, ci, true);
 	}
 
 	if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -746,7 +751,7 @@
 	cap->seq = seq;
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
-	cap->cap_gen = session->s_cap_gen;
+	cap->cap_gen = gen;
 
 	if (fmode >= 0)
 		__ceph_get_fmode(ci, fmode);
@@ -864,8 +869,8 @@
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask %p snap issued %s"
-		     " (mask %s)\n", &ci->vfs_inode,
+		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
+		     " (mask %s)\n", ci->vfs_inode.i_ino,
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -876,8 +881,8 @@
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask %p cap %p issued %s"
-			     " (mask %s)\n", &ci->vfs_inode, cap,
+			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
+			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -888,8 +893,8 @@
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask %p combo issued %s"
-			     " (mask %s)\n", &ci->vfs_inode,
+			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
+			     " (mask %s)\n", ci->vfs_inode.i_ino,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -1030,6 +1035,8 @@
 	list_del_init(&ci->i_snap_realm_item);
 	ci->i_snap_realm_counter++;
 	ci->i_snap_realm = NULL;
+	if (realm->ino == ci->i_vino.ino)
+		realm->inode = NULL;
 	spin_unlock(&realm->inodes_with_caps_lock);
 	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
 			    realm);
@@ -1051,6 +1058,11 @@
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
+	/* remove from inode's cap rbtree, and clear auth cap */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -1074,9 +1086,7 @@
 	    (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
 		cap->queue_release = 1;
 		if (removed) {
-			list_add_tail(&cap->session_caps,
-				      &session->s_cap_releases);
-			session->s_num_cap_releases++;
+			__ceph_queue_cap_release(session, cap);
 			removed = 0;
 		}
 	} else {
@@ -1086,11 +1096,6 @@
 
 	spin_unlock(&session->s_cap_lock);
 
-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	if (removed)
 		ceph_put_cap(mdsc, cap);
 
@@ -1110,8 +1115,9 @@
 	u64			ino, cid, follows;
 	u64			flush_tid, oldest_flush_tid, size, max_size;
 	u64			xattr_version;
+	u64			change_attr;
 	struct ceph_buffer	*xattr_buf;
-	struct timespec64	atime, mtime, ctime;
+	struct timespec64	atime, mtime, ctime, btime;
 	int			op, caps, wanted, dirty;
 	u32			seq, issue_seq, mseq, time_warp_seq;
 	u32			flags;
@@ -1132,7 +1138,6 @@
 	struct ceph_msg *msg;
 	void *p;
 	size_t extra_len;
-	struct timespec64 zerotime = {0};
 	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
 
 	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
@@ -1217,15 +1222,10 @@
 	/* pool namespace (version 8) (mds always ignores this) */
 	ceph_encode_32(&p, 0);
 
-	/*
-	 * btime and change_attr (version 9)
-	 *
-	 * We just zero these out for now, as the MDS ignores them unless
-	 * the requisite feature flags are set (which we don't do yet).
-	 */
-	ceph_encode_timespec64(p, &zerotime);
+	/* btime and change_attr (version 9) */
+	ceph_encode_timespec64(p, &arg->btime);
 	p += sizeof(struct ceph_timespec);
-	ceph_encode_64(&p, 0);
+	ceph_encode_64(&p, arg->change_attr);
 
 	/* Advisory flags (version 10) */
 	ceph_encode_32(&p, arg->flags);
@@ -1235,20 +1235,22 @@
 }
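
The cap message above now carries a real btime and change_attr instead of zeros; change_attr is the inode's raw i_version, bumped on every change, which lets a client validate cached attributes with a single comparison. A minimal model of that validation (illustrative, not the kernel iversion API):

#include <stdint.h>
#include <stdbool.h>

struct cached_attr {
	uint64_t change_attr;	/* counter value when the cache was filled */
};

/* The authoritative counter increases on every change, so equality
 * means the cached attributes are still current. */
static bool cache_still_valid(const struct cached_attr *c,
			      uint64_t server_change_attr)
{
	return c->change_attr == server_change_attr;
}
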
 
 /*
- * Queue cap releases when an inode is dropped from our cache.  Since
- * inode is about to be destroyed, there is no need for i_ceph_lock.
+ * Queue cap releases when an inode is dropped from our cache.
  */
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct ceph_inode_info *ci)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct rb_node *p;
 
+	/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
+	 * may call __ceph_caps_issued_mask() on an inode being freed. */
+	spin_lock(&ci->i_ceph_lock);
 	p = rb_first(&ci->i_caps);
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
 		p = rb_next(p);
 		__ceph_remove_cap(cap, true);
 	}
+	spin_unlock(&ci->i_ceph_lock);
 }
 
 /*
@@ -1258,10 +1260,6 @@
  * Make note of max_size reported/requested from mds, revoked caps
  * that have now been implemented.
  *
- * Make half-hearted attempt ot to invalidate page cache if we are
- * dropping RDCACHE.  Note that this will leave behind locked pages
- * that we'll then need to deal with elsewhere.
- *
  * Return non-zero if delayed release, or we experienced an error
  * such that the caller should requeue + retry later.
  *
@@ -1269,12 +1267,13 @@
  * caller should hold snap_rwsem (read), s_mutex.
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
-		      int op, bool sync, int used, int want, int retain,
+		      int op, int flags, int used, int want, int retain,
 		      int flushing, u64 flush_tid, u64 oldest_flush_tid)
 	__releases(cap->ci->i_ceph_lock)
 {
 	struct ceph_inode_info *ci = cap->ci;
 	struct inode *inode = &ci->vfs_inode;
+	struct ceph_buffer *old_blob = NULL;
 	struct cap_msg_args arg;
 	int held, revoking;
 	int wake = 0;
@@ -1339,7 +1338,7 @@
 	ci->i_requested_max_size = arg.max_size;
 
 	if (flushing & CEPH_CAP_XATTR_EXCL) {
-		__ceph_build_xattrs_blob(ci);
+		old_blob = __ceph_build_xattrs_blob(ci);
 		arg.xattr_version = ci->i_xattrs.version;
 		arg.xattr_buf = ci->i_xattrs.blob;
 	} else {
@@ -1349,6 +1348,8 @@
 	arg.mtime = inode->i_mtime;
 	arg.atime = inode->i_atime;
 	arg.ctime = inode->i_ctime;
+	arg.btime = ci->i_btime;
+	arg.change_attr = inode_peek_iversion_raw(inode);
 
 	arg.op = op;
 	arg.caps = cap->implemented;
@@ -1365,15 +1366,24 @@
 	arg.mode = inode->i_mode;
 
 	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-	if (list_empty(&ci->i_cap_snaps))
-		arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
-	else
-		arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
-	if (sync)
-		arg.flags |= CEPH_CLIENT_CAPS_SYNC;
+	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
+	    !list_empty(&ci->i_cap_snaps)) {
+		struct ceph_cap_snap *capsnap;
+		list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->cap_flush.tid)
+				break;
+			if (capsnap->need_flush) {
+				flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+				break;
+			}
+		}
+	}
+	arg.flags = flags;
 
 	spin_unlock(&ci->i_ceph_lock);
 
+	ceph_buffer_put(old_blob);
+
 	ret = send_cap_msg(&arg);
 	if (ret < 0) {
 		dout("error sending cap msg, must requeue %p\n", inode);
@@ -1408,6 +1418,8 @@
 	arg.atime = capsnap->atime;
 	arg.mtime = capsnap->mtime;
 	arg.ctime = capsnap->ctime;
+	arg.btime = capsnap->btime;
+	arg.change_attr = capsnap->change_attr;
 
 	arg.op = CEPH_CAP_OP_FLUSHSNAP;
 	arg.caps = capsnap->issued;
@@ -1575,10 +1587,8 @@
 	}
 
 	// make sure flushsnap messages are sent in proper order.
-	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
 		__kick_flushing_caps(mdsc, session, ci, 0);
-		ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
-	}
 
 	__ceph_flush_snaps(ci, session);
 out:
@@ -1647,7 +1657,7 @@
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
 	    (mask & CEPH_CAP_FILE_BUFFER))
 		dirty |= I_DIRTY_DATASYNC;
-	__cap_delay_requeue(mdsc, ci);
+	__cap_delay_requeue(mdsc, ci, true);
 	return dirty;
 }
 
@@ -1708,11 +1718,11 @@
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
  *
- * Called under i_ceph_lock.
+ * Called under i_ceph_lock. Returns the flush tid.
  */
-static int __mark_caps_flushing(struct inode *inode,
+static u64 __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session, bool wake,
-				u64 *flush_tid, u64 *oldest_flush_tid)
+				u64 *oldest_flush_tid)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1751,8 +1761,7 @@
 
 	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
 
-	*flush_tid = cf->tid;
-	return flushing;
+	return cf->tid;
 }
 
 /*
@@ -1853,14 +1862,17 @@
 			retain |= CEPH_CAP_ANY;       /* be greedy */
 		} else if (S_ISDIR(inode->i_mode) &&
 			   (issued & CEPH_CAP_FILE_SHARED) &&
-			    __ceph_dir_is_complete(ci)) {
+			   __ceph_dir_is_complete(ci)) {
 			/*
 			 * If a directory is complete, we want to keep
 			 * the exclusive cap. So that MDS does not end up
 			 * revoking the shared cap on every create/unlink
 			 * operation.
 			 */
-			want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+			if (IS_RDONLY(inode))
+				want = CEPH_CAP_ANY_SHARED;
+			else
+				want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
 			retain |= want;
 		} else {
 
@@ -1968,8 +1980,7 @@
 			goto ack;
 
 		/* things we might delay */
-		if ((cap->issued & ~retain) == 0 &&
-		    cap->mds_wanted == want)
+		if ((cap->issued & ~retain) == 0)
 			continue;     /* nope, all good */
 
 		if (no_delay)
@@ -1988,11 +1999,6 @@
 		}
 
 ack:
-		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
-			dout(" skipping %p I_NOFLUSH set\n", inode);
-			continue;
-		}
-
 		if (session && session != cap->session) {
 			dout("oops, wrong session %p mutex\n", session);
 			mutex_unlock(&session->s_mutex);
@@ -2018,10 +2024,8 @@
 		if (cap == ci->i_auth_cap &&
 		    (ci->i_ceph_flags &
 		     (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
-			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
 				__kick_flushing_caps(mdsc, session, ci, 0);
-				ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
-			}
 			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
 				__ceph_flush_snaps(ci, session);
 
@@ -2042,9 +2046,9 @@
 		}
 
 		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
-			flushing = __mark_caps_flushing(inode, session, false,
-							&flush_tid,
-							&oldest_flush_tid);
+			flushing = ci->i_dirty_caps;
+			flush_tid = __mark_caps_flushing(inode, session, false,
+							 &oldest_flush_tid);
 		} else {
 			flushing = 0;
 			flush_tid = 0;
@@ -2057,7 +2061,7 @@
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
 				cap_used, want, retain, flushing,
 				flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
@@ -2065,7 +2069,7 @@
 
 	/* Reschedule delayed caps release if we delayed anything */
 	if (delayed)
-		__cap_delay_requeue(mdsc, ci);
+		__cap_delay_requeue(mdsc, ci, false);
 
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -2091,18 +2095,12 @@
 
 retry:
 	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
-		spin_unlock(&ci->i_ceph_lock);
-		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
-		goto out;
-	}
+retry_locked:
 	if (ci->i_dirty_caps && ci->i_auth_cap) {
 		struct ceph_cap *cap = ci->i_auth_cap;
-		int used = __ceph_caps_used(ci);
-		int want = __ceph_caps_wanted(ci);
 		int delayed;
 
-		if (!session || session != cap->session) {
+		if (session != cap->session) {
 			spin_unlock(&ci->i_ceph_lock);
 			if (session)
 				mutex_unlock(&session->s_mutex);
@@ -2115,17 +2113,30 @@
 			goto out;
 		}
 
-		flushing = __mark_caps_flushing(inode, session, true,
-						&flush_tid, &oldest_flush_tid);
+		if (ci->i_ceph_flags &
+		    (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
+			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
+				__kick_flushing_caps(mdsc, session, ci, 0);
+			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+				__ceph_flush_snaps(ci, session);
+			goto retry_locked;
+		}
+
+		flushing = ci->i_dirty_caps;
+		flush_tid = __mark_caps_flushing(inode, session, true,
+						 &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
-				used, want, (cap->issued | cap->implemented),
-				flushing, flush_tid, oldest_flush_tid);
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     CEPH_CLIENT_CAPS_SYNC,
+				     __ceph_caps_used(ci),
+				     __ceph_caps_wanted(ci),
+				     (cap->issued | cap->implemented),
+				     flushing, flush_tid, oldest_flush_tid);
 
 		if (delayed) {
 			spin_lock(&ci->i_ceph_lock);
-			__cap_delay_requeue(mdsc, ci);
+			__cap_delay_requeue(mdsc, ci, true);
 			spin_unlock(&ci->i_ceph_lock);
 		}
 	} else {
@@ -2212,38 +2223,45 @@
 
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 flush_tid;
-	int ret;
+	int ret, err;
 	int dirty;
 
 	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 
 	ret = file_write_and_wait_range(file, start, end);
-	if (ret < 0)
-		goto out;
-
 	if (datasync)
 		goto out;
 
-	inode_lock(inode);
-
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
-	ret = unsafe_request_wait(inode);
+	err = unsafe_request_wait(inode);
 
 	/*
 	 * only wait on non-file metadata writeback (the mds
 	 * can recover size and mtime, so we don't need to
 	 * wait for that)
 	 */
-	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
-		ret = wait_event_interruptible(ci->i_cap_wq,
+	if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		err = wait_event_interruptible(ci->i_cap_wq,
 					caps_are_flushed(inode, flush_tid));
 	}
-	inode_unlock(inode);
+
+	if (err < 0)
+		ret = err;
+
+	if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
+		spin_lock(&file->f_lock);
+		err = errseq_check_and_advance(&ci->i_meta_err,
+					       &fi->meta_err);
+		spin_unlock(&file->f_lock);
+		if (err < 0)
+			ret = err;
+	}
 out:
 	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
 	return ret;
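
The fsync hunk above adds an errseq-based check for metadata write errors. errseq_t packs the latest error with a sequence counter so each observer reports a given error exactly once; a simplified userspace model of the two calls used here (the real encoding in lib/errseq.c packs everything into one integer, so this split form is for illustration only):

struct errseq {
	int err;		/* most recent error */
	unsigned long seq;	/* bumped whenever a new error is recorded */
};

/* Has an error been recorded since this observer's cursor? */
static int errseq_check_model(const struct errseq *es, unsigned long since)
{
	return es->seq != since ? es->err : 0;
}

/* Same check, but advance the cursor so the error is reported once. */
static int errseq_check_and_advance_model(struct errseq *es,
					  unsigned long *since)
{
	int err = 0;

	if (es->seq != *since) {
		err = es->err;
		*since = es->seq;
	}
	return err;
}

In the hunk, the lockless errseq_check() peek avoids taking f_lock unless there is actually a new error to consume.
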
@@ -2293,6 +2311,16 @@
 	struct ceph_cap_flush *cf;
 	int ret;
 	u64 first_tid = 0;
+	u64 last_snap_flush = 0;
+
+	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+
+	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
+		if (!cf->caps) {
+			last_snap_flush = cf->tid;
+			break;
+		}
+	}
 
 	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
 		if (cf->tid < first_tid)
@@ -2311,10 +2339,13 @@
 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
 			ci->i_ceph_flags |= CEPH_I_NODELAY;
+
 			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-					  false, __ceph_caps_used(ci),
+					 (cf->tid < last_snap_flush ?
+					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
+					  __ceph_caps_used(ci),
 					  __ceph_caps_wanted(ci),
-					  cap->issued | cap->implemented,
+					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
 			if (ret) {
 				pr_err("kick_flushing_caps: error sending "
@@ -2383,7 +2414,12 @@
 		 */
 		if ((cap->issued & ci->i_flushing_caps) !=
 		    ci->i_flushing_caps) {
-			ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+			/* encode_caps_cb() will also reset these sequence
+			 * numbers. Make sure sequence numbers in the cap flush
+			 * message match those in the later reconnect message */
+			cap->seq = 0;
+			cap->issue_seq = 0;
+			cap->mseq = 0;
 			__kick_flushing_caps(mdsc, session, ci,
 					     oldest_flush_tid);
 		} else {
@@ -2417,7 +2453,6 @@
 			continue;
 		}
 		if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
-			ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
 			__kick_flushing_caps(mdsc, session, ci,
 					     oldest_flush_tid);
 		}
@@ -2445,7 +2480,6 @@
 		oldest_flush_tid = __get_oldest_flush_tid(mdsc);
 		spin_unlock(&mdsc->cap_dirty_lock);
 
-		ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
 		__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
 		spin_unlock(&ci->i_ceph_lock);
 	} else {
@@ -2492,11 +2526,21 @@
  * to (when applicable), and check against max_size here as well.
  * Note that caller is responsible for ensuring max_size increases are
  * requested from the MDS.
+ *
+ * Returns 0 if the caps were not (yet) able to be acquired, 1 if they
+ * were, or a negative error code.
+ *
+ * FIXME: how does a 0 return differ from -EAGAIN?
  */
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-			    loff_t endoff, bool nonblock, int *got, int *err)
+enum {
+	NON_BLOCKING	= 1,
+	CHECK_FILELOCK	= 2,
+};
+
+static int try_get_cap_refs(struct inode *inode, int need, int want,
+			    loff_t endoff, int flags, int *got)
 {
-	struct inode *inode = &ci->vfs_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	int ret = 0;
 	int have, implemented;
@@ -2509,13 +2553,19 @@
 again:
 	spin_lock(&ci->i_ceph_lock);
 
+	if ((flags & CHECK_FILELOCK) &&
+	    (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
+		dout("try_get_cap_refs %p error filelock\n", inode);
+		ret = -EIO;
+		goto out_unlock;
+	}
+
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
 	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
-		*err = -EBADF;
-		ret = 1;
+		ret = -EBADF;
 		goto out_unlock;
 	}
 
@@ -2536,10 +2586,8 @@
 		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
 			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
 			     inode, endoff, ci->i_max_size);
-			if (endoff > ci->i_requested_max_size) {
-				*err = -EAGAIN;
-				ret = 1;
-			}
+			if (endoff > ci->i_requested_max_size)
+				ret = -EAGAIN;
 			goto out_unlock;
 		}
 		/*
@@ -2573,9 +2621,8 @@
 					 * we can not call down_read() when
 					 * task isn't in TASK_RUNNING state
 					 */
-					if (nonblock) {
-						*err = -EAGAIN;
-						ret = 1;
+					if (flags & NON_BLOCKING) {
+						ret = -EAGAIN;
 						goto out_unlock;
 					}
 
@@ -2604,8 +2651,7 @@
 		if (session_readonly) {
 			dout("get_cap_refs %p needed %s but mds%d readonly\n",
 			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
-			*err = -EROFS;
-			ret = 1;
+			ret = -EROFS;
 			goto out_unlock;
 		}
 
@@ -2614,16 +2660,14 @@
 			if (READ_ONCE(mdsc->fsc->mount_state) ==
 			    CEPH_MOUNT_SHUTDOWN) {
 				dout("get_cap_refs %p forced umount\n", inode);
-				*err = -EIO;
-				ret = 1;
+				ret = -EIO;
 				goto out_unlock;
 			}
 			mds_wanted = __ceph_caps_mds_wanted(ci, false);
 			if (need & ~(mds_wanted & need)) {
 				dout("get_cap_refs %p caps were dropped"
 				     " (session killed?)\n", inode);
-				*err = -ESTALE;
-				ret = 1;
+				ret = -ESTALE;
 				goto out_unlock;
 			}
 			if (!(file_wanted & ~mds_wanted))
@@ -2671,25 +2715,20 @@
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 }
 
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+int ceph_try_get_caps(struct inode *inode, int need, int want,
+		      bool nonblock, int *got)
 {
-	int ret, err = 0;
+	int ret;
 
 	BUG_ON(need & ~CEPH_CAP_FILE_RD);
-	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
-	ret = ceph_pool_perm_check(ci, need);
+	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
+	ret = ceph_pool_perm_check(inode, need);
 	if (ret < 0)
 		return ret;
 
-	ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
-	if (ret) {
-		if (err == -EAGAIN) {
-			ret = 0;
-		} else if (err < 0) {
-			ret = err;
-		}
-	}
-	return ret;
+	ret = try_get_cap_refs(inode, need, want, 0,
+			       (nonblock ? NON_BLOCKING : 0), got);
+	return ret == -EAGAIN ? 0 : ret;
 }
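
The try_get_cap_refs() rework above collapses the old "return 1 and report through *err" convention into a single return value: negative errno on failure, 0 when caps are not yet available, 1 on success. A minimal sketch of the new convention (simplified conditions, illustrative names):

#include <errno.h>

static int try_get_caps_sketch(int file_wanted, int need,
			       int available, int *got)
{
	if ((file_wanted & need) != need)
		return -EBADF;	/* hard failure: file not open for this */
	if ((available & need) != need)
		return 0;	/* not yet: caller may block and retry */
	*got = need;
	return 1;		/* success */
}

Callers such as ceph_try_get_caps() then only translate -EAGAIN into 0 instead of juggling two result channels.
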
 
 /*
@@ -2697,34 +2736,40 @@
  * due to a small max_size, make sure we check_max_size (and possibly
  * ask the mds) so we don't get hung up indefinitely.
  */
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+int ceph_get_caps(struct file *filp, int need, int want,
 		  loff_t endoff, int *got, struct page **pinned_page)
 {
-	int _got, ret, err = 0;
+	struct ceph_file_info *fi = filp->private_data;
+	struct inode *inode = file_inode(filp);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	int ret, _got, flags;
 
-	ret = ceph_pool_perm_check(ci, need);
+	ret = ceph_pool_perm_check(inode, need);
 	if (ret < 0)
 		return ret;
 
+	if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
+		return -EBADF;
+
 	while (true) {
 		if (endoff > 0)
-			check_max_size(&ci->vfs_inode, endoff);
+			check_max_size(inode, endoff);
 
-		err = 0;
+		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
 		_got = 0;
-		ret = try_get_cap_refs(ci, need, want, endoff,
-				       false, &_got, &err);
-		if (ret) {
-			if (err == -EAGAIN)
-				continue;
-			if (err < 0)
-				ret = err;
-		} else {
+		ret = try_get_cap_refs(inode, need, want, endoff,
+				       flags, &_got);
+		if (ret == -EAGAIN)
+			continue;
+		if (!ret) {
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
-			while (!try_get_cap_refs(ci, need, want, endoff,
-						 true, &_got, &err)) {
+			flags |= NON_BLOCKING;
+			while (!(ret = try_get_cap_refs(inode, need, want,
+							endoff, flags, &_got))) {
 				if (signal_pending(current)) {
 					ret = -ERESTARTSYS;
 					break;
@@ -2733,16 +2778,21 @@
 			}
 
 			remove_wait_queue(&ci->i_cap_wq, &wait);
-
-			if (err == -EAGAIN)
+			if (ret == -EAGAIN)
 				continue;
-			if (err < 0)
-				ret = err;
 		}
+
+		if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+		    fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
+			if (ret >= 0 && _got)
+				ceph_put_cap_refs(ci, _got);
+			return -EBADF;
+		}
+
 		if (ret < 0) {
-			if (err == -ESTALE) {
+			if (ret == -ESTALE) {
 				/* session was killed, try renew caps */
-				ret = ceph_renew_caps(&ci->vfs_inode);
+				ret = ceph_renew_caps(inode);
 				if (ret == 0)
 					continue;
 			}
@@ -2751,9 +2801,9 @@
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
 		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-		    i_size_read(&ci->vfs_inode) > 0) {
+		    i_size_read(inode) > 0) {
 			struct page *page =
-				find_get_page(ci->vfs_inode.i_mapping, 0);
+				find_get_page(inode->i_mapping, 0);
 			if (page) {
 				if (PageUptodate(page)) {
 					*pinned_page = page;
@@ -2772,7 +2822,7 @@
 			 * getattr request will bring inline data into
 			 * page cache
 			 */
-			ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+			ret = __ceph_do_getattr(inode, NULL,
 						CEPH_STAT_CAP_INLINE_DATA,
 						true);
 			if (ret < 0)
@@ -2974,8 +3024,10 @@
 	}
 	if (complete_capsnap)
 		wake_up_all(&ci->i_cap_wq);
-	while (put-- > 0)
-		iput(inode);
+	while (put-- > 0) {
+		/* avoid calling iput_final() in osd dispatch threads */
+		ceph_async_iput(inode);
+	}
 }
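
Several hunks here swap iput() for ceph_async_iput() so the final inode teardown never runs in an OSD/MDS dispatch or tick thread. Roughly, the helper drops the reference inline when it is not the last one and otherwise defers to a workqueue; a hedged kernel-style sketch of the idea (inode_wq and i_work are assumptions based on the v5.4 ceph client, and the real helper retries when the work is already queued, since the pending work's drop guarantees the inline path will succeed):

/* sketch only: drop a reference without calling iput_final() here */
static void async_iput_sketch(struct inode *inode)
{
	if (!inode)
		return;
	/* fast path: not the last reference, safe to drop inline */
	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;
	/* last reference: punt the final iput() to process context */
	queue_work(ceph_inode_to_client(inode)->inode_wq,
		   &ceph_inode(inode)->i_work);
}
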
 
 /*
@@ -3020,8 +3072,10 @@
 	bool dirstat_valid;
 	u64 nfiles;
 	u64 nsubdirs;
+	u64 change_attr;
 	/* currently issued */
 	int issued;
+	struct timespec64 btime;
 };
 
 /*
@@ -3045,7 +3099,8 @@
 	int used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
-	int check_caps = 0;
+	unsigned char check_caps = 0;
+	bool was_stale = cap->cap_gen < session->s_cap_gen;
 	bool wake = false;
 	bool writeback = false;
 	bool queue_trunc = false;
@@ -3060,21 +3115,6 @@
 
 
 	/*
-	 * auth mds of the inode changed. we received the cap export message,
-	 * but still haven't received the cap import message. handle_cap_export
-	 * updated the new auth MDS' cap.
-	 *
-	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
-	 * that was sent before the cap import message. So don't remove caps.
-	 */
-	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
-		WARN_ON(cap != ci->i_auth_cap);
-		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
-		seq = cap->seq;
-		newcaps |= cap->issued;
-	}
-
-	/*
 	 * If CACHE is being revoked, and we have no dirty buffers,
 	 * try to invalidate (once).  (If there are dirty buffers, we
 	 * will invalidate _after_ writeback.)
@@ -3093,17 +3133,38 @@
 		}
 	}
 
+	if (was_stale)
+		cap->issued = cap->implemented = CEPH_CAP_PIN;
+
+	/*
+	 * auth mds of the inode changed. we received the cap export message,
+	 * but still haven't received the cap import message. handle_cap_export
+	 * updated the new auth MDS' cap.
+	 *
+	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+	 * that was sent before the cap import message. So don't remove caps.
+	 */
+	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+		WARN_ON(cap != ci->i_auth_cap);
+		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+		seq = cap->seq;
+		newcaps |= cap->issued;
+	}
+
 	/* side effects now are allowed */
 	cap->cap_gen = session->s_cap_gen;
 	cap->seq = seq;
 
 	__check_cap_issue(ci, cap, newcaps);
 
+	inode_set_max_iversion_raw(inode, extra_info->change_attr);
+
 	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
 	    (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(grant->mode);
 		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
 		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
+		ci->i_btime = extra_info->btime;
 		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
 		     from_kuid(&init_user_ns, inode->i_uid),
 		     from_kgid(&init_user_ns, inode->i_gid));
@@ -3130,6 +3191,7 @@
 			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
 			ci->i_xattrs.version = version;
 			ceph_forget_all_cached_acls(inode);
+			ceph_security_invalidate_secctx(inode);
 		}
 	}
 
@@ -3197,13 +3259,20 @@
 	     ceph_cap_string(wanted),
 	     ceph_cap_string(used),
 	     ceph_cap_string(dirty));
-	if (wanted != le32_to_cpu(grant->wanted)) {
-		dout("mds wanted %s -> %s\n",
-		     ceph_cap_string(le32_to_cpu(grant->wanted)),
-		     ceph_cap_string(wanted));
-		/* imported cap may not have correct mds_wanted */
-		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
-			check_caps = 1;
+
+	if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
+	    (wanted & ~(cap->mds_wanted | newcaps))) {
+		/*
+		 * If the MDS is importing the cap, earlier cap messages
+		 * that update 'wanted' may be dropped (migrate seq mismatch).
+		 *
+		 * We don't send a cap message to update 'wanted' if what we
+		 * want is already issued. If the MDS revokes caps, the cap
+		 * message releasing them also tells the MDS what we want. But
+		 * if caps were forcibly revoked (stale session), we may never
+		 * have told the MDS what we want.
+		 */
+		check_caps = 1;
 	}
 
 	/* revocation, grant, or no-op? */
@@ -3536,9 +3605,9 @@
 		goto out_unlock;
 
 	if (target < 0) {
-		__ceph_remove_cap(cap, false);
-		if (!ci->i_auth_cap)
+		if (cap->mds_wanted | cap->issued)
 			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+		__ceph_remove_cap(cap, false);
 		goto out_unlock;
 	}
 
@@ -3566,7 +3635,6 @@
 			tcap->cap_id = t_cap_id;
 			tcap->seq = t_seq - 1;
 			tcap->issue_seq = t_seq - 1;
-			tcap->mseq = t_mseq;
 			tcap->issued |= issued;
 			tcap->implemented |= issued;
 			if (cap == ci->i_auth_cap)
@@ -3818,17 +3886,19 @@
 		}
 	}
 
-	if (msg_version >= 11) {
+	if (msg_version >= 9) {
 		struct ceph_timespec *btime;
-		u64 change_attr;
-		u32 flags;
 
-		/* version >= 9 */
 		if (p + sizeof(*btime) > end)
 			goto bad;
 		btime = p;
+		ceph_decode_timespec64(&extra_info.btime, btime);
 		p += sizeof(*btime);
-		ceph_decode_64_safe(&p, end, change_attr, bad);
+		ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
+	}
+
+	if (msg_version >= 11) {
+		u32 flags;
 		/* version >= 10 */
 		ceph_decode_32_safe(&p, end, flags, bad);
 		/* version >= 11 */
@@ -3860,12 +3930,10 @@
 			cap->seq = seq;
 			cap->issue_seq = seq;
 			spin_lock(&session->s_cap_lock);
-			list_add_tail(&cap->session_caps,
-					&session->s_cap_releases);
-			session->s_num_cap_releases++;
+			__ceph_queue_cap_release(session, cap);
 			spin_unlock(&session->s_cap_lock);
 		}
-		goto flush_cap_releases;
+		goto done;
 	}
 
 	/* these will work even if we don't have a cap yet */
@@ -3935,7 +4003,13 @@
 		       ceph_cap_op_name(op));
 	}
 
-	goto done;
+done:
+	mutex_unlock(&session->s_mutex);
+done_unlocked:
+	ceph_put_string(extra_info.pool_ns);
+	/* avoid calling iput_final() in mds dispatch threads */
+	ceph_async_iput(inode);
+	return;
 
 flush_cap_releases:
 	/*
@@ -3943,14 +4017,8 @@
 	 * along for the mds (who clearly thinks we still have this
 	 * cap).
 	 */
-	ceph_send_cap_releases(mdsc, session);
-
-done:
-	mutex_unlock(&session->s_mutex);
-done_unlocked:
-	iput(inode);
-	ceph_put_string(extra_info.pool_ns);
-	return;
+	ceph_flush_cap_releases(mdsc, session);
+	goto done;
 
 bad:
 	pr_err("ceph_handle_caps: corrupt message\n");
@@ -3986,7 +4054,8 @@
 		if (inode) {
 			dout("check_delayed_caps on %p\n", inode);
 			ceph_check_caps(ci, flags, NULL);
-			iput(inode);
+			/* avoid calling iput_final() in tick thread */
+			ceph_async_iput(inode);
 		}
 	}
 	spin_unlock(&mdsc->cap_delay_lock);
@@ -4055,7 +4124,7 @@
 }
 
 /*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
  * looks like the link count will hit 0, drop any other caps (other
  * than PIN) we don't specifically want (due to the file still being
  * open).
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index abdf98d..facb387 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -37,7 +37,7 @@
 		struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
 		int state = mdsmap->m_info[i].state;
 		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
-			       ceph_pr_addr(&addr->in_addr),
+			       ceph_pr_addr(addr),
 			       ceph_mds_state_name(state));
 	}
 	return 0;
@@ -52,7 +52,7 @@
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct rb_node *rp;
-	int pathlen;
+	int pathlen = 0;
 	u64 pathbase;
 	char *path;
 
@@ -88,7 +88,7 @@
 				   req->r_dentry,
 				   path ? path : "");
 			spin_unlock(&req->r_dentry->d_lock);
-			kfree(path);
+			ceph_mdsc_free_path(path, pathlen);
 		} else if (req->r_path1) {
 			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
 				   req->r_path1);
@@ -108,7 +108,7 @@
 				   req->r_old_dentry,
 				   path ? path : "");
 			spin_unlock(&req->r_old_dentry->d_lock);
-			kfree(path);
+			ceph_mdsc_free_path(path, pathlen);
 		} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 			if (req->r_ino2.ino)
 				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
@@ -124,34 +124,47 @@
 	return 0;
 }
 
+static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+{
+	struct seq_file *s = p;
+
+	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+		   ceph_cap_string(cap->issued),
+		   ceph_cap_string(cap->implemented));
+	return 0;
+}
+
 static int caps_show(struct seq_file *s, void *p)
 {
 	struct ceph_fs_client *fsc = s->private;
-	int total, avail, used, reserved, min;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	int total, avail, used, reserved, min, i;
 
 	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
 		   "avail\t\t%d\n"
 		   "used\t\t%d\n"
 		   "reserved\t%d\n"
-		   "min\t%d\n",
+		   "min\t\t%d\n\n",
 		   total, avail, used, reserved, min);
-	return 0;
-}
+	seq_printf(s, "ino                issued           implemented\n");
+	seq_printf(s, "-----------------------------------------------\n");
 
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
-	struct ceph_fs_client *fsc = s->private;
-	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct ceph_dentry_info *di;
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *session;
 
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_for_each_entry(di, &mdsc->dentry_lru, lru) {
-		struct dentry *dentry = di->dentry;
-		seq_printf(s, "%p %p\t%pd\n",
-			   di, dentry, dentry);
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&session->s_mutex);
+		ceph_iterate_session_caps(session, caps_show_cb, s);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
 	}
-	spin_unlock(&mdsc->dentry_lru_lock);
+	mutex_unlock(&mdsc->mutex);
 
 	return 0;
 }
@@ -195,7 +208,6 @@
 CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
 CEPH_DEFINE_SHOW_FUNC(mdsc_show)
 CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
 CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
 
 
@@ -231,24 +243,19 @@
 	debugfs_remove(fsc->debugfs_mds_sessions);
 	debugfs_remove(fsc->debugfs_caps);
 	debugfs_remove(fsc->debugfs_mdsc);
-	debugfs_remove(fsc->debugfs_dentry_lru);
 }
 
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	char name[100];
-	int err = -ENOMEM;
 
 	dout("ceph_fs_debugfs_init\n");
-	BUG_ON(!fsc->client->debugfs_dir);
 	fsc->debugfs_congestion_kb =
 		debugfs_create_file("writeback_congestion_kb",
 				    0600,
 				    fsc->client->debugfs_dir,
 				    fsc,
 				    &congestion_kb_fops);
-	if (!fsc->debugfs_congestion_kb)
-		goto out;
 
 	snprintf(name, sizeof(name), "../../bdi/%s",
 		 dev_name(fsc->sb->s_bdi->dev));
@@ -256,62 +263,37 @@
 		debugfs_create_symlink("bdi",
 				       fsc->client->debugfs_dir,
 				       name);
-	if (!fsc->debugfs_bdi)
-		goto out;
 
 	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0400,
 					fsc->client->debugfs_dir,
 					fsc,
 					&mdsmap_show_fops);
-	if (!fsc->debugfs_mdsmap)
-		goto out;
 
 	fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
 					0400,
 					fsc->client->debugfs_dir,
 					fsc,
 					&mds_sessions_show_fops);
-	if (!fsc->debugfs_mds_sessions)
-		goto out;
 
 	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
 						0400,
 						fsc->client->debugfs_dir,
 						fsc,
 						&mdsc_show_fops);
-	if (!fsc->debugfs_mdsc)
-		goto out;
 
 	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
 						   fsc->client->debugfs_dir,
 						   fsc,
 						   &caps_show_fops);
-	if (!fsc->debugfs_caps)
-		goto out;
-
-	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0400,
-					fsc->client->debugfs_dir,
-					fsc,
-					&dentry_lru_show_fops);
-	if (!fsc->debugfs_dentry_lru)
-		goto out;
-
-	return 0;
-
-out:
-	ceph_fs_debugfs_cleanup(fsc);
-	return err;
 }
 
 
 #else  /* CONFIG_DEBUG_FS */
 
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-	return 0;
 }
 
 void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928ce..d17a789 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -29,6 +29,9 @@
 
 const struct dentry_operations ceph_dentry_ops;
 
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
 /*
  * Initialize ceph dentry state.
  */
@@ -44,7 +47,7 @@
 	di->lease_session = NULL;
 	di->time = jiffies;
 	dentry->d_fsdata = di;
-	ceph_dentry_lru_add(dentry);
+	INIT_LIST_HEAD(&di->lease_list);
 	return 0;
 }
 
@@ -241,6 +244,7 @@
 			goto out;
 		}
 		if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+			__ceph_dentry_dir_lease_touch(di);
 			emit_dentry = true;
 		}
 		spin_unlock(&dentry->d_lock);
@@ -821,7 +825,7 @@
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
-	struct ceph_acls_info acls = {};
+	struct ceph_acl_sec_ctx as_ctx = {};
 	int err;
 
 	if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -832,7 +836,10 @@
 		goto out;
 	}
 
-	err = ceph_pre_init_acls(dir, &mode, &acls);
+	err = ceph_pre_init_acls(dir, &mode, &as_ctx);
+	if (err < 0)
+		goto out;
+	err = ceph_security_init_secctx(dentry, mode, &as_ctx);
 	if (err < 0)
 		goto out;
 
@@ -851,9 +858,9 @@
 	req->r_args.mknod.rdev = cpu_to_le32(rdev);
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	if (acls.pagelist) {
-		req->r_pagelist = acls.pagelist;
-		acls.pagelist = NULL;
+	if (as_ctx.pagelist) {
+		req->r_pagelist = as_ctx.pagelist;
+		as_ctx.pagelist = NULL;
 	}
 	err = ceph_mdsc_do_request(mdsc, dir, req);
 	if (!err && !req->r_reply_info.head->is_dentry)
@@ -861,10 +868,10 @@
 	ceph_mdsc_put_request(req);
 out:
 	if (!err)
-		ceph_init_inode_acls(d_inode(dentry), &acls);
+		ceph_init_inode_acls(d_inode(dentry), &as_ctx);
 	else
 		d_drop(dentry);
-	ceph_release_acls_info(&acls);
+	ceph_release_acl_sec_ctx(&as_ctx);
 	return err;
 }
 
@@ -880,6 +887,7 @@
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
+	struct ceph_acl_sec_ctx as_ctx = {};
 	int err;
 
 	if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -890,6 +898,10 @@
 		goto out;
 	}
 
+	err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx);
+	if (err < 0)
+		goto out;
+
 	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
 	if (IS_ERR(req)) {
@@ -915,6 +927,7 @@
 out:
 	if (err)
 		d_drop(dentry);
+	ceph_release_acl_sec_ctx(&as_ctx);
 	return err;
 }
 
@@ -923,7 +936,7 @@
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
-	struct ceph_acls_info acls = {};
+	struct ceph_acl_sec_ctx as_ctx = {};
 	int err = -EROFS;
 	int op;
 
@@ -946,7 +959,10 @@
 	}
 
 	mode |= S_IFDIR;
-	err = ceph_pre_init_acls(dir, &mode, &acls);
+	err = ceph_pre_init_acls(dir, &mode, &as_ctx);
+	if (err < 0)
+		goto out;
+	err = ceph_security_init_secctx(dentry, mode, &as_ctx);
 	if (err < 0)
 		goto out;
 
@@ -963,9 +979,9 @@
 	req->r_args.mkdir.mode = cpu_to_le32(mode);
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	if (acls.pagelist) {
-		req->r_pagelist = acls.pagelist;
-		acls.pagelist = NULL;
+	if (as_ctx.pagelist) {
+		req->r_pagelist = as_ctx.pagelist;
+		as_ctx.pagelist = NULL;
 	}
 	err = ceph_mdsc_do_request(mdsc, dir, req);
 	if (!err &&
@@ -975,10 +991,10 @@
 	ceph_mdsc_put_request(req);
 out:
 	if (!err)
-		ceph_init_inode_acls(d_inode(dentry), &acls);
+		ceph_init_inode_acls(d_inode(dentry), &as_ctx);
 	else
 		d_drop(dentry);
-	ceph_release_acls_info(&acls);
+	ceph_release_acl_sec_ctx(&as_ctx);
 	return err;
 }
 
@@ -1125,13 +1141,277 @@
 }
 
 /*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at the front of the list will expire first (assuming all leases
+ * have similar durations).
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+	struct dentry *dn = di->dentry;
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+	di->flags |= CEPH_DENTRY_LEASE_LIST;
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+		di->flags |= CEPH_DENTRY_REFERENCED;
+		return;
+	}
+
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
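The lease lists touched above are plain list_head LRUs ordered by update time: touching a lease moves it to the tail, so the head always holds the oldest entry and a walker can stop early. A minimal sketch of that discipline, using a hypothetical lease struct rather than the ceph types:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct lease {
		struct list_head node;		/* linked into lease_lru below */
		unsigned long expires;		/* jiffies expiry timestamp */
	};

	static LIST_HEAD(lease_lru);		/* head = oldest, tail = newest */
	static DEFINE_SPINLOCK(lease_lru_lock);

	static void lease_touch(struct lease *l)
	{
		spin_lock(&lease_lru_lock);
		list_move_tail(&l->node, &lease_lru);	/* renewed: back of the queue */
		spin_unlock(&lease_lru_lock);
	}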
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+				     struct ceph_dentry_info *di)
+{
+	di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+	di->lease_gen = 0;
+	di->time = jiffies;
+	list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+	struct dentry *dn = di->dentry;
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+	     di, dn, dn, di->offset);
+
+	if (!list_empty(&di->lease_list)) {
+		if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+			/* don't remove dentry from dentry lease list
+			 * if its lease is valid */
+			if (__dentry_lease_is_valid(di))
+				return;
+		} else {
+			di->flags |= CEPH_DENTRY_REFERENCED;
+			return;
+		}
+	}
+
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+		di->flags |= CEPH_DENTRY_REFERENCED;
+		di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+		return;
+	}
+
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	__dentry_dir_lease_touch(mdsc, di);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+	struct ceph_mds_client *mdsc;
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+		return;
+	if (list_empty(&di->lease_list))
+		return;
+
+	mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	list_del_init(&di->lease_list);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+	KEEP	= 0,
+	DELETE	= 1,
+	TOUCH	= 2,
+	STOP	= 4,
+};
+
+struct ceph_lease_walk_control {
+	bool dir_lease;
+	bool expire_dir_lease;
+	unsigned long nr_to_scan;
+	unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+		     struct ceph_lease_walk_control *lwc,
+		     int (*check)(struct dentry*, void*))
+{
+	struct ceph_dentry_info *di, *tmp;
+	struct dentry *dentry, *last = NULL;
+	struct list_head* list;
+	LIST_HEAD(dispose);
+	unsigned long freed = 0;
+	int ret = 0;
+
+	list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+	spin_lock(&mdsc->dentry_list_lock);
+	list_for_each_entry_safe(di, tmp, list, lease_list) {
+		if (!lwc->nr_to_scan)
+			break;
+		--lwc->nr_to_scan;
+
+		dentry = di->dentry;
+		if (last == dentry)
+			break;
+
+		if (!spin_trylock(&dentry->d_lock))
+			continue;
+
+		if (__lockref_is_dead(&dentry->d_lockref)) {
+			list_del_init(&di->lease_list);
+			goto next;
+		}
+
+		ret = check(dentry, lwc);
+		if (ret & TOUCH) {
+			/* move it into tail of dir lease list */
+			__dentry_dir_lease_touch(mdsc, di);
+			if (!last)
+				last = dentry;
+		}
+		if (ret & DELETE) {
+			/* stale lease */
+			di->flags &= ~CEPH_DENTRY_REFERENCED;
+			if (dentry->d_lockref.count > 0) {
+				/* update_dentry_lease() will re-add
+				 * it to lease list, or
+				 * ceph_d_delete() will return 1 when
+				 * last reference is dropped */
+				list_del_init(&di->lease_list);
+			} else {
+				di->flags |= CEPH_DENTRY_SHRINK_LIST;
+				list_move_tail(&di->lease_list, &dispose);
+				dget_dlock(dentry);
+			}
+		}
+next:
+		spin_unlock(&dentry->d_lock);
+		if (ret & STOP)
+			break;
+	}
+	spin_unlock(&mdsc->dentry_list_lock);
+
+	while (!list_empty(&dispose)) {
+		di = list_first_entry(&dispose, struct ceph_dentry_info,
+				      lease_list);
+		dentry = di->dentry;
+		spin_lock(&dentry->d_lock);
+
+		list_del_init(&di->lease_list);
+		di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+		if (di->flags & CEPH_DENTRY_REFERENCED) {
+			spin_lock(&mdsc->dentry_list_lock);
+			if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+				list_add_tail(&di->lease_list,
+					      &mdsc->dentry_leases);
+			} else {
+				__dentry_dir_lease_touch(mdsc, di);
+			}
+			spin_unlock(&mdsc->dentry_list_lock);
+		} else {
+			freed++;
+		}
+
+		spin_unlock(&dentry->d_lock);
+		/* ceph_d_delete() does the trick */
+		dput(dentry);
+	}
+	return freed;
+}
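__dentry_leases_walk() follows a common two-phase shrink pattern: candidates are collected onto a private dispose list under the list spinlock (taking only trylocks on individual dentries), and the potentially blocking dput() happens only after the spinlock is dropped. A sketch of the shape, reusing the hypothetical lease type from the sketch above:

	#include <linux/jiffies.h>
	#include <linux/slab.h>

	static unsigned long lease_walk_and_dispose(void)
	{
		struct lease *l, *tmp;
		unsigned long freed = 0;
		LIST_HEAD(dispose);

		/* phase 1: pick expired victims under the lock */
		spin_lock(&lease_lru_lock);
		list_for_each_entry_safe(l, tmp, &lease_lru, node) {
			if (time_before(jiffies, l->expires))
				break;		/* list is time-ordered; rest is newer */
			list_move_tail(&l->node, &dispose);
		}
		spin_unlock(&lease_lru_lock);

		/* phase 2: free them with no spinlock held */
		while (!list_empty(&dispose)) {
			l = list_first_entry(&dispose, struct lease, node);
			list_del_init(&l->node);
			kfree(l);		/* stand-in for the blocking dput() */
			freed++;
		}
		return freed;
	}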
+
+static int __dentry_lease_check(struct dentry *dentry, void *arg)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int ret;
+
+	if (__dentry_lease_is_valid(di))
+		return STOP;
+	ret = __dir_lease_try_check(dentry);
+	if (ret == -EBUSY)
+		return KEEP;
+	if (ret > 0)
+		return TOUCH;
+	return DELETE;
+}
+
+static int __dir_lease_check(struct dentry *dentry, void *arg)
+{
+	struct ceph_lease_walk_control *lwc = arg;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+	int ret = __dir_lease_try_check(dentry);
+	if (ret == -EBUSY)
+		return KEEP;
+	if (ret > 0) {
+		if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+			return STOP;
+		/* Move dentry to tail of dir lease list if we don't want
+		 * to delete it, so dentries in the list are checked in a
+		 * round-robin manner */
+		if (!lwc->expire_dir_lease)
+			return TOUCH;
+		if (dentry->d_lockref.count > 0 ||
+		    (di->flags & CEPH_DENTRY_REFERENCED))
+			return TOUCH;
+		/* invalidate dir lease */
+		di->lease_shared_gen = 0;
+	}
+	return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+	struct ceph_lease_walk_control lwc;
+	unsigned long count;
+	unsigned long freed;
+
+	spin_lock(&mdsc->caps_list_lock);
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_count > mdsc->caps_use_max)
+		count = mdsc->caps_use_count - mdsc->caps_use_max;
+	else
+		count = 0;
+	spin_unlock(&mdsc->caps_list_lock);
+
+	lwc.dir_lease = false;
+	lwc.nr_to_scan  = CEPH_CAPS_PER_RELEASE * 2;
+	freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+	if (!lwc.nr_to_scan) /* more invalid leases */
+		return -EAGAIN;
+
+	if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+		lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+	lwc.dir_lease = true;
+	lwc.expire_dir_lease = freed < count;
+	lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+	freed += __dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+	if (!lwc.nr_to_scan) /* more to check */
+		return -EAGAIN;
+
+	return freed > 0 ? 1 : 0;
+}
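ceph_trim_dentries() encodes its progress in the return value: -EAGAIN when the scan budget (nr_to_scan) ran out with stale leases remaining, 1 when at least one dentry was freed, and 0 otherwise. A hypothetical caller (not the actual delayed-work path in mds_client.c) would simply retry on -EAGAIN:

	static void trim_all_stale_leases(struct ceph_mds_client *mdsc)
	{
		int ret;

		do {
			ret = ceph_trim_dentries(mdsc);	/* -EAGAIN: budget ran out */
		} while (ret == -EAGAIN);
		/* ret == 1: something was freed; ret == 0: nothing stale */
	}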
+
+/*
  * Ensure a dentry lease will no longer revalidate.
  */
 void ceph_invalidate_dentry_lease(struct dentry *dentry)
 {
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
 	spin_lock(&dentry->d_lock);
-	ceph_dentry(dentry)->time = jiffies;
-	ceph_dentry(dentry)->lease_shared_gen = 0;
+	di->time = jiffies;
+	di->lease_shared_gen = 0;
+	__dentry_lease_unlist(di);
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1139,52 +1419,65 @@
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
  */
-static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
-				 struct inode *dir)
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+	struct ceph_mds_session *session;
+
+	if (!di->lease_gen)
+		return false;
+
+	session = di->lease_session;
+	if (session) {
+		u32 gen;
+		unsigned long ttl;
+
+		spin_lock(&session->s_gen_ttl_lock);
+		gen = session->s_cap_gen;
+		ttl = session->s_cap_ttl;
+		spin_unlock(&session->s_gen_ttl_lock);
+
+		if (di->lease_gen == gen &&
+		    time_before(jiffies, ttl) &&
+		    time_before(jiffies, di->time))
+			return true;
+	}
+	di->lease_gen = 0;
+	return false;
+}
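__dentry_lease_is_valid() requires the lease to belong to the current session cap generation and both the session cap TTL and the per-dentry expiry to lie in the future; time_before() is the wraparound-safe way to compare jiffies values. The predicate, reduced to a sketch:

	#include <linux/jiffies.h>

	static bool lease_live(u32 lease_gen, u32 session_gen,
			       unsigned long session_ttl, unsigned long expires)
	{
		/* all three conditions must hold; jiffies comparisons wrap safely */
		return lease_gen == session_gen &&
		       time_before(jiffies, session_ttl) &&
		       time_before(jiffies, expires);
	}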
+
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
 {
 	struct ceph_dentry_info *di;
-	struct ceph_mds_session *s;
-	int valid = 0;
-	u32 gen;
-	unsigned long ttl;
 	struct ceph_mds_session *session = NULL;
 	u32 seq = 0;
+	int valid = 0;
 
 	spin_lock(&dentry->d_lock);
 	di = ceph_dentry(dentry);
-	if (di && di->lease_session) {
-		s = di->lease_session;
-		spin_lock(&s->s_gen_ttl_lock);
-		gen = s->s_cap_gen;
-		ttl = s->s_cap_ttl;
-		spin_unlock(&s->s_gen_ttl_lock);
+	if (di && __dentry_lease_is_valid(di)) {
+		valid = 1;
 
-		if (di->lease_gen == gen &&
-		    time_before(jiffies, di->time) &&
-		    time_before(jiffies, ttl)) {
-			valid = 1;
-			if (di->lease_renew_after &&
-			    time_after(jiffies, di->lease_renew_after)) {
-				/*
-				 * We should renew. If we're in RCU walk mode
-				 * though, we can't do that so just return
-				 * -ECHILD.
-				 */
-				if (flags & LOOKUP_RCU) {
-					valid = -ECHILD;
-				} else {
-					session = ceph_get_mds_session(s);
-					seq = di->lease_seq;
-					di->lease_renew_after = 0;
-					di->lease_renew_from = jiffies;
-				}
+		if (di->lease_renew_after &&
+		    time_after(jiffies, di->lease_renew_after)) {
+			/*
+			 * We should renew. If we're in RCU walk mode
+			 * though, we can't do that so just return
+			 * -ECHILD.
+			 */
+			if (flags & LOOKUP_RCU) {
+				valid = -ECHILD;
+			} else {
+				session = ceph_get_mds_session(di->lease_session);
+				seq = di->lease_seq;
+				di->lease_renew_after = 0;
+				di->lease_renew_from = jiffies;
 			}
 		}
 	}
 	spin_unlock(&dentry->d_lock);
 
 	if (session) {
-		ceph_mdsc_lease_send_msg(session, dir, dentry,
+		ceph_mdsc_lease_send_msg(session, dentry,
 					 CEPH_MDS_LEASE_RENEW, seq);
 		ceph_put_mds_session(session);
 	}
@@ -1193,21 +1486,63 @@
 }
 
 /*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	struct inode *dir;
+	struct ceph_inode_info *ci;
+	int valid = 0;
+
+	if (!di->lease_shared_gen)
+		return 0;
+	if (IS_ROOT(dentry))
+		return 0;
+
+	dir = d_inode(dentry->d_parent);
+	ci = ceph_inode(dir);
+
+	if (spin_trylock(&ci->i_ceph_lock)) {
+		if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+		    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+			valid = 1;
+		spin_unlock(&ci->i_ceph_lock);
+	} else {
+		valid = -EBUSY;
+	}
+
+	if (!valid)
+		di->lease_shared_gen = 0;
+	return valid;
+}
+
+/*
  * Check if directory-wide content lease/cap is valid.
  */
 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 {
 	struct ceph_inode_info *ci = ceph_inode(dir);
-	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	int valid = 0;
+	int valid;
+	int shared_gen;
 
 	spin_lock(&ci->i_ceph_lock);
-	if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
-		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+	valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+	shared_gen = atomic_read(&ci->i_shared_gen);
 	spin_unlock(&ci->i_ceph_lock);
-	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
-	     dir, (unsigned)atomic_read(&ci->i_shared_gen),
-	     dentry, (unsigned)di->lease_shared_gen, valid);
+	if (valid) {
+		struct ceph_dentry_info *di;
+		spin_lock(&dentry->d_lock);
+		di = ceph_dentry(dentry);
+		if (dir == d_inode(dentry->d_parent) &&
+		    di && di->lease_shared_gen == shared_gen)
+			__ceph_dentry_dir_lease_touch(di);
+		else
+			valid = 0;
+		spin_unlock(&dentry->d_lock);
+	}
+	dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
+	     dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
 	return valid;
 }
 
@@ -1218,36 +1553,37 @@
 {
 	int valid = 0;
 	struct dentry *parent;
-	struct inode *dir;
+	struct inode *dir, *inode;
 
 	if (flags & LOOKUP_RCU) {
 		parent = READ_ONCE(dentry->d_parent);
 		dir = d_inode_rcu(parent);
 		if (!dir)
 			return -ECHILD;
+		inode = d_inode_rcu(dentry);
 	} else {
 		parent = dget_parent(dentry);
 		dir = d_inode(parent);
+		inode = d_inode(dentry);
 	}
 
 	dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
-	     dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+	     dentry, inode, ceph_dentry(dentry)->offset);
 
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
 		dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
-		     dentry, d_inode(dentry));
+		     dentry, inode);
 		valid = 1;
-	} else if (d_really_is_positive(dentry) &&
-		   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+	} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
 		valid = 1;
 	} else {
-		valid = dentry_lease_is_valid(dentry, flags, dir);
+		valid = dentry_lease_is_valid(dentry, flags);
 		if (valid == -ECHILD)
 			return valid;
 		if (valid || dir_lease_is_valid(dir, dentry)) {
-			if (d_really_is_positive(dentry))
-				valid = ceph_is_any_caps(d_inode(dentry));
+			if (inode)
+				valid = ceph_is_any_caps(inode);
 			else
 				valid = 1;
 		}
@@ -1297,11 +1633,8 @@
 	}
 
 	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
-	if (valid) {
-		ceph_dentry_lru_touch(dentry);
-	} else {
+	if (!valid)
 		ceph_dir_clear_complete(dir);
-	}
 
 	if (!(flags & LOOKUP_RCU))
 		dput(parent);
@@ -1309,6 +1642,31 @@
 }
 
 /*
+ * Delete an unused dentry that doesn't have a valid lease.
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+
+	/* won't release caps */
+	if (d_really_is_negative(dentry))
+		return 0;
+	if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+		return 0;
+	/* valid lease? */
+	di = ceph_dentry(dentry);
+	if (di) {
+		if (__dentry_lease_is_valid(di))
+			return 0;
+		if (__dir_lease_try_check(dentry))
+			return 0;
+	}
+	return 1;
+}
+
+/*
  * Release our ceph_dentry_info.
  */
 static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1674,9 @@
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 
 	dout("d_release %p\n", dentry);
-	ceph_dentry_lru_del(dentry);
 
 	spin_lock(&dentry->d_lock);
+	__dentry_lease_unlist(di);
 	dentry->d_fsdata = NULL;
 	spin_unlock(&dentry->d_lock);
 
@@ -1419,49 +1777,7 @@
 	return size - left;
 }
 
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
 
-	dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
-	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_add_tail(&di->lru, &mdsc->dentry_lru);
-	mdsc->num_dentry++;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
-
-	dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
-	     di->offset);
-	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_move_tail(&di->lru, &mdsc->dentry_lru);
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_del(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
-
-	dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
-	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_del_init(&di->lru);
-	mdsc->num_dentry--;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
 
 /*
  * Return name hash for a given dentry.  This is dependent on
@@ -1470,6 +1786,7 @@
 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 {
 	struct ceph_inode_info *dci = ceph_inode(dir);
+	unsigned hash;
 
 	switch (dci->i_dir_layout.dl_dir_hash) {
 	case 0:	/* for backward compat */
@@ -1477,8 +1794,11 @@
 		return dn->d_name.hash;
 
 	default:
-		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+		spin_lock(&dn->d_lock);
+		hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
 				     dn->d_name.name, dn->d_name.len);
+		spin_unlock(&dn->d_lock);
+		return hash;
 	}
 }
 
@@ -1531,6 +1851,7 @@
 
 const struct dentry_operations ceph_dentry_ops = {
 	.d_revalidate = ceph_d_revalidate,
+	.d_delete = ceph_d_delete,
 	.d_release = ceph_d_release,
 	.d_prune = ceph_d_prune,
 	.d_init = ceph_d_init,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 3c59ad1..b6bfa94 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -22,18 +22,77 @@
 	u64 ino, parent_ino;
 } __attribute__ ((packed));
 
+/*
+ * fh for snapped inode
+ */
+struct ceph_nfs_snapfh {
+	u64 ino;
+	u64 snapid;
+	u64 parent_ino;
+	u32 hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
+			      struct inode *parent_inode)
+{
+	static const int snap_handle_length =
+		sizeof(struct ceph_nfs_snapfh) >> 2;
+	struct ceph_nfs_snapfh *sfh = (void *)rawfh;
+	u64 snapid = ceph_snap(inode);
+	int ret;
+	bool no_parent = true;
+
+	if (*max_len < snap_handle_length) {
+		*max_len = snap_handle_length;
+		ret = FILEID_INVALID;
+		goto out;
+	}
+
+	ret =  -EINVAL;
+	if (snapid != CEPH_SNAPDIR) {
+		struct inode *dir;
+		struct dentry *dentry = d_find_alias(inode);
+		if (!dentry)
+			goto out;
+
+		rcu_read_lock();
+		dir = d_inode_rcu(dentry->d_parent);
+		if (ceph_snap(dir) != CEPH_SNAPDIR) {
+			sfh->parent_ino = ceph_ino(dir);
+			sfh->hash = ceph_dentry_hash(dir, dentry);
+			no_parent = false;
+		}
+		rcu_read_unlock();
+		dput(dentry);
+	}
+
+	if (no_parent) {
+		if (!S_ISDIR(inode->i_mode))
+			goto out;
+		sfh->parent_ino = sfh->ino;
+		sfh->hash = 0;
+	}
+	sfh->ino = ceph_ino(inode);
+	sfh->snapid = snapid;
+
+	*max_len = snap_handle_length;
+	ret = FILEID_BTRFS_WITH_PARENT;
+out:
+	dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+	return ret;
+}
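The sizeof(...) >> 2 above reflects the exportfs convention that file-handle lengths are counted in 4-byte words, not bytes; an encoder given a too-small buffer writes back the required length and returns FILEID_INVALID. A sketch of that convention (check_fh_len is a hypothetical helper, not a kernel API):

	#include <linux/exportfs.h>

	static int check_fh_len(int *max_len, size_t handle_bytes)
	{
		int words = handle_bytes >> 2;	/* bytes -> 32-bit words */

		if (*max_len < words) {
			*max_len = words;	/* report the required size */
			return FILEID_INVALID;
		}
		return 0;
	}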
+
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 			  struct inode *parent_inode)
 {
+	static const int handle_length =
+		sizeof(struct ceph_nfs_fh) >> 2;
+	static const int connected_handle_length =
+		sizeof(struct ceph_nfs_confh) >> 2;
 	int type;
-	struct ceph_nfs_fh *fh = (void *)rawfh;
-	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	int connected_handle_length = sizeof(*cfh)/4;
-	int handle_length = sizeof(*fh)/4;
 
-	/* don't re-export snaps */
 	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EINVAL;
+		return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode);
 
 	if (parent_inode && (*max_len < connected_handle_length)) {
 		*max_len = connected_handle_length;
@@ -44,6 +103,7 @@
 	}
 
 	if (parent_inode) {
+		struct ceph_nfs_confh *cfh = (void *)rawfh;
 		dout("encode_fh %llx with parent %llx\n",
 		     ceph_ino(inode), ceph_ino(parent_inode));
 		cfh->ino = ceph_ino(inode);
@@ -51,6 +111,7 @@
 		*max_len = connected_handle_length;
 		type = FILEID_INO32_GEN_PARENT;
 	} else {
+		struct ceph_nfs_fh *fh = (void *)rawfh;
 		dout("encode_fh %llx\n", ceph_ino(inode));
 		fh->ino = ceph_ino(inode);
 		*max_len = handle_length;
@@ -59,7 +120,7 @@
 	return type;
 }
 
-static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
@@ -81,7 +142,7 @@
 		mask = CEPH_STAT_CAP_INODE;
 		if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
 			mask |= CEPH_CAP_XATTR_SHARED;
-		req->r_args.getattr.mask = cpu_to_le32(mask);
+		req->r_args.lookupino.mask = cpu_to_le32(mask);
 
 		req->r_ino1 = vino;
 		req->r_num_caps = 1;
@@ -91,14 +152,112 @@
 			ihold(inode);
 		ceph_mdsc_put_request(req);
 		if (!inode)
-			return ERR_PTR(-ESTALE);
-		if (inode->i_nlink == 0) {
-			iput(inode);
-			return ERR_PTR(-ESTALE);
+			return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode = __lookup_inode(sb, ino);
+	if (IS_ERR(inode))
+		return inode;
+	if (inode->i_nlink == 0) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+	struct inode *inode = __lookup_inode(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (inode->i_nlink == 0) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *__snapfh_to_dentry(struct super_block *sb,
+					  struct ceph_nfs_snapfh *sfh,
+					  bool want_parent)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct inode *inode;
+	struct ceph_vino vino;
+	int mask;
+	int err;
+	bool unlinked = false;
+
+	if (want_parent) {
+		vino.ino = sfh->parent_ino;
+		if (sfh->snapid == CEPH_SNAPDIR)
+			vino.snap = CEPH_NOSNAP;
+		else if (sfh->ino == sfh->parent_ino)
+			vino.snap = CEPH_SNAPDIR;
+		else
+			vino.snap = sfh->snapid;
+	} else {
+		vino.ino = sfh->ino;
+		vino.snap = sfh->snapid;
+	}
+	inode = ceph_find_inode(sb, vino);
+	if (inode)
+		return d_obtain_alias(inode);
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+				       USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+
+	mask = CEPH_STAT_CAP_INODE;
+	if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+		mask |= CEPH_CAP_XATTR_SHARED;
+	req->r_args.lookupino.mask = cpu_to_le32(mask);
+	if (vino.snap < CEPH_NOSNAP) {
+		req->r_args.lookupino.snapid = cpu_to_le64(vino.snap);
+		if (!want_parent && sfh->ino != sfh->parent_ino) {
+			req->r_args.lookupino.parent =
+					cpu_to_le64(sfh->parent_ino);
+			req->r_args.lookupino.hash =
+					cpu_to_le32(sfh->hash);
 		}
 	}
 
-	return d_obtain_alias(inode);
+	req->r_ino1 = vino;
+	req->r_num_caps = 1;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	inode = req->r_target_inode;
+	if (inode) {
+		if (vino.snap == CEPH_SNAPDIR) {
+			if (inode->i_nlink == 0)
+				unlinked = true;
+			inode = ceph_get_snapdir(inode);
+		} else if (ceph_snap(inode) == vino.snap) {
+			ihold(inode);
+		} else {
+			/* mds does not support lookup snapped inode */
+			err = -EOPNOTSUPP;
+			inode = NULL;
+		}
+	}
+	ceph_mdsc_put_request(req);
+
+	if (want_parent) {
+		dout("snapfh_to_parent %llx.%llx err=%d\n",
+		     vino.ino, vino.snap, err);
+	} else {
+		dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d\n",
+		      vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+	}
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+	/* see comments in ceph_get_parent() */
+	return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
 }
 
 /*
@@ -110,6 +269,11 @@
 {
 	struct ceph_nfs_fh *fh = (void *)fid->raw;
 
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+		return __snapfh_to_dentry(sb, sfh, false);
+	}
+
 	if (fh_type != FILEID_INO32_GEN  &&
 	    fh_type != FILEID_INO32_GEN_PARENT)
 		return NULL;
@@ -163,13 +327,49 @@
 
 static struct dentry *ceph_get_parent(struct dentry *child)
 {
-	/* don't re-export snaps */
-	if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
-		return ERR_PTR(-EINVAL);
+	struct inode *inode = d_inode(child);
+	struct dentry *dn;
 
-	dout("get_parent %p ino %llx.%llx\n",
-	     child, ceph_vinop(d_inode(child)));
-	return __get_parent(child->d_sb, child, 0);
+	if (ceph_snap(inode) != CEPH_NOSNAP) {
+		struct inode* dir;
+		bool unlinked = false;
+		/* do not support non-directory */
+		if (!d_is_dir(child)) {
+			dn = ERR_PTR(-EINVAL);
+			goto out;
+		}
+		dir = __lookup_inode(inode->i_sb, ceph_ino(inode));
+		if (IS_ERR(dir)) {
+			dn = ERR_CAST(dir);
+			goto out;
+		}
+		/* There can be multiple paths to access a snapped inode.
+		 * For simplicity, treat the snapdir of the head inode as
+		 * the parent */
+		if (ceph_snap(inode) != CEPH_SNAPDIR) {
+			struct inode *snapdir = ceph_get_snapdir(dir);
+			if (dir->i_nlink == 0)
+				unlinked = true;
+			iput(dir);
+			if (IS_ERR(snapdir)) {
+				dn = ERR_CAST(snapdir);
+				goto out;
+			}
+			dir = snapdir;
+		}
+		/* If the directory has already been deleted, further
+		 * get_parent will fail. Do not mark the snapdir dentry as
+		 * disconnected; this prevents exportfs from doing further
+		 * get_parent calls. */
+		if (unlinked)
+			dn = d_obtain_root(dir);
+		else
+			dn = d_obtain_alias(dir);
+	} else {
+		dn = __get_parent(child->d_sb, child, 0);
+	}
+out:
+	dout("get_parent %p ino %llx.%llx err=%ld\n",
+	     child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
+	return dn;
 }
 
 /*
@@ -182,6 +382,11 @@
 	struct ceph_nfs_confh *cfh = (void *)fid->raw;
 	struct dentry *dentry;
 
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+		return __snapfh_to_dentry(sb, sfh, true);
+	}
+
 	if (fh_type != FILEID_INO32_GEN_PARENT)
 		return NULL;
 	if (fh_len < sizeof(*cfh) / 4)
@@ -194,14 +399,115 @@
 	return dentry;
 }
 
+static int __get_snap_name(struct dentry *parent, char *name,
+			   struct dentry *child)
+{
+	struct inode *inode = d_inode(child);
+	struct inode *dir = d_inode(parent);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_request *req = NULL;
+	char *last_name = NULL;
+	unsigned next_offset = 2;
+	int err = -EINVAL;
+
+	if (ceph_ino(inode) != ceph_ino(dir))
+		goto out;
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		if (ceph_snap(dir) == CEPH_NOSNAP) {
+			strcpy(name, fsc->mount_options->snapdir_name);
+			err = 0;
+		}
+		goto out;
+	}
+	if (ceph_snap(dir) != CEPH_SNAPDIR)
+		goto out;
+
+	while (1) {
+		struct ceph_mds_reply_info_parsed *rinfo;
+		struct ceph_mds_reply_dir_entry *rde;
+		int i;
+
+		req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP,
+					       USE_AUTH_MDS);
+		if (IS_ERR(req)) {
+			err = PTR_ERR(req);
+			req = NULL;
+			goto out;
+		}
+		err = ceph_alloc_readdir_reply_buffer(req, inode);
+		if (err)
+			goto out;
+
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_readdir_offset = next_offset;
+		req->r_args.readdir.flags =
+				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+		if (last_name) {
+			req->r_path2 = last_name;
+			last_name = NULL;
+		}
+
+		req->r_inode = dir;
+		ihold(dir);
+		req->r_dentry = dget(parent);
+
+		inode_lock(dir);
+		err = ceph_mdsc_do_request(fsc->mdsc, NULL, req);
+		inode_unlock(dir);
+
+		if (err < 0)
+			goto out;
+
+		rinfo = &req->r_reply_info;
+		for (i = 0; i < rinfo->dir_nr; i++) {
+			rde = rinfo->dir_entries + i;
+			BUG_ON(!rde->inode.in);
+			if (ceph_snap(inode) ==
+			    le64_to_cpu(rde->inode.in->snapid)) {
+				memcpy(name, rde->name, rde->name_len);
+				name[rde->name_len] = '\0';
+				err = 0;
+				goto out;
+			}
+		}
+
+		if (rinfo->dir_end)
+			break;
+
+		BUG_ON(rinfo->dir_nr <= 0);
+		rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
+		next_offset += rinfo->dir_nr;
+		last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
+		if (!last_name) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		ceph_mdsc_put_request(req);
+		req = NULL;
+	}
+	err = -ENOENT;
+out:
+	if (req)
+		ceph_mdsc_put_request(req);
+	kfree(last_name);
+	dout("get_snap_name %p ino %llx.%llx err=%d\n",
+	     child, ceph_vinop(inode), err);
+	return err;
+}
+
 static int ceph_get_name(struct dentry *parent, char *name,
 			 struct dentry *child)
 {
 	struct ceph_mds_client *mdsc;
 	struct ceph_mds_request *req;
+	struct inode *inode = d_inode(child);
 	int err;
 
-	mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return __get_snap_name(parent, name, child);
+
+	mdsc = ceph_inode_to_client(inode)->mdsc;
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
 				       USE_ANY_MDS);
 	if (IS_ERR(req))
@@ -209,8 +515,8 @@
 
 	inode_lock(d_inode(parent));
 
-	req->r_inode = d_inode(child);
-	ihold(d_inode(child));
+	req->r_inode = inode;
+	ihold(inode);
 	req->r_ino2 = ceph_vino(d_inode(parent));
 	req->r_parent = d_inode(parent);
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -224,10 +530,10 @@
 		memcpy(name, rinfo->dname, rinfo->dname_len);
 		name[rinfo->dname_len] = 0;
 		dout("get_name %p ino %llx.%llx name %s\n",
-		     child, ceph_vinop(d_inode(child)), name);
+		     child, ceph_vinop(inode), name);
 	} else {
 		dout("get_name %p ino %llx.%llx err %d\n",
-		     child, ceph_vinop(d_inode(child)), err);
+		     child, ceph_vinop(inode), err);
 	}
 
 	ceph_mdsc_put_request(req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 92ab204..8de6339 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/striper.h>
 
 #include <linux/module.h>
 #include <linux/sched.h>
@@ -9,10 +10,12 @@
 #include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/iversion.h>
 
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "io.h"
 
 static __le32 ceph_flags_sys2wire(u32 flags)
 {
@@ -199,6 +202,7 @@
 static int ceph_init_file_info(struct inode *inode, struct file *file,
 					int fmode, bool isdir)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi;
 
 	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
@@ -209,7 +213,7 @@
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
 		if (!dfi) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			ceph_put_fmode(ci, fmode); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -220,7 +224,7 @@
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
 		if (!fi) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			ceph_put_fmode(ci, fmode); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -230,6 +234,8 @@
 	fi->fmode = fmode;
 	spin_lock_init(&fi->rw_contexts_lock);
 	INIT_LIST_HEAD(&fi->rw_contexts);
+	fi->meta_err = errseq_sample(&ci->i_meta_err);
+	fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
 
 	return 0;
 }
@@ -246,6 +252,7 @@
 	case S_IFREG:
 		ceph_fscache_register_inode_cookie(inode);
 		ceph_fscache_file_set_cookie(inode, file);
+		/* fall through */
 	case S_IFDIR:
 		ret = ceph_init_file_info(inode, file, fmode,
 						S_ISDIR(inode->i_mode));
@@ -435,7 +442,7 @@
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct dentry *dn;
-	struct ceph_acls_info acls = {};
+	struct ceph_acl_sec_ctx as_ctx = {};
 	int mask;
 	int err;
 
@@ -449,25 +456,31 @@
 	if (flags & O_CREAT) {
 		if (ceph_quota_is_max_files_exceeded(dir))
 			return -EDQUOT;
-		err = ceph_pre_init_acls(dir, &mode, &acls);
+		err = ceph_pre_init_acls(dir, &mode, &as_ctx);
 		if (err < 0)
 			return err;
+		err = ceph_security_init_secctx(dentry, mode, &as_ctx);
+		if (err < 0)
+			goto out_ctx;
+	} else if (!d_in_lookup(dentry)) {
+		/* If it's not being looked up, it's negative */
+		return -ENOENT;
 	}
 
 	/* do the open */
 	req = prepare_open_request(dir->i_sb, flags, mode);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
-		goto out_acl;
+		goto out_ctx;
 	}
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	if (flags & O_CREAT) {
 		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
 		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-		if (acls.pagelist) {
-			req->r_pagelist = acls.pagelist;
-			acls.pagelist = NULL;
+		if (as_ctx.pagelist) {
+			req->r_pagelist = as_ctx.pagelist;
+			as_ctx.pagelist = NULL;
 		}
 	}
 
@@ -505,7 +518,7 @@
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
 		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
-			ceph_init_inode_acls(d_inode(dentry), &acls);
+			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
 			file->f_mode |= FMODE_CREATED;
 		}
 		err = finish_open(file, dentry, ceph_open);
@@ -514,8 +527,8 @@
 	if (!req->r_err && req->r_target_inode)
 		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
 	ceph_mdsc_put_request(req);
-out_acl:
-	ceph_release_acls_info(&acls);
+out_ctx:
+	ceph_release_acl_sec_ctx(&as_ctx);
 	dout("atomic_open result=%d\n", err);
 	return err;
 }
@@ -557,90 +570,26 @@
 };
 
 /*
- * Read a range of bytes striped over one or more objects.  Iterate over
- * objects we stripe over.  (That's not atomic, but good enough for now.)
+ * Completely synchronous read and write methods.  Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans an object boundary, just do multiple reads.  (That's not
+ * atomic, but good enough for now.)
  *
  * If we get a short result from the OSD, check against i_size; we need to
  * only return a short read to the caller if we hit EOF.
  */
-static int striped_read(struct inode *inode,
-			u64 pos, u64 len,
-			struct page **pages, int num_pages,
-			int page_align, int *checkeof)
-{
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 this_len;
-	loff_t i_size;
-	int page_idx;
-	int ret, read = 0;
-	bool hit_stripe, was_short;
-
-	/*
-	 * we may need to do multiple reads.  not atomic, unfortunately.
-	 */
-more:
-	this_len = len;
-	page_idx = (page_align + read) >> PAGE_SHIFT;
-	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
-				  &ci->i_layout, pos, &this_len,
-				  ci->i_truncate_seq, ci->i_truncate_size,
-				  pages + page_idx, num_pages - page_idx,
-				  ((page_align + read) & ~PAGE_MASK));
-	if (ret == -ENOENT)
-		ret = 0;
-	hit_stripe = this_len < len;
-	was_short = ret >= 0 && ret < this_len;
-	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
-	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
-	i_size = i_size_read(inode);
-	if (ret >= 0) {
-		if (was_short && (pos + ret < i_size)) {
-			int zlen = min(this_len - ret, i_size - pos - ret);
-			int zoff = page_align + read + ret;
-			dout(" zero gap %llu to %llu\n",
-			     pos + ret, pos + ret + zlen);
-			ceph_zero_page_vector_range(zoff, zlen, pages);
-			ret += zlen;
-		}
-
-		read += ret;
-		pos += ret;
-		len -= ret;
-
-		/* hit stripe and need continue*/
-		if (len && hit_stripe && pos < i_size)
-			goto more;
-	}
-
-	if (read > 0) {
-		ret = read;
-		/* did we bounce off eof? */
-		if (pos + len > i_size)
-			*checkeof = CHECK_EOF;
-	}
-
-	dout("striped_read returns %d\n", ret);
-	return ret;
-}
-
-/*
- * Completely synchronous read and write methods.  Direct from __user
- * buffer to osd, or directly to user pages (if O_DIRECT).
- *
- * If the read spans object boundary, just do multiple reads.
- */
 static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
-			      int *checkeof)
+			      int *retry_op)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
-	struct page **pages;
-	u64 off = iocb->ki_pos;
-	int num_pages;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	ssize_t ret;
-	size_t len = iov_iter_count(to);
+	u64 off = iocb->ki_pos;
+	u64 len = iov_iter_count(to);
 
 	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -653,61 +602,125 @@
 	 * but it will at least behave sensibly when they are
 	 * in sequence.
 	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, off,
-						off + len);
+	ret = filemap_write_and_wait_range(inode->i_mapping,
+					   off, off + len - 1);
 	if (ret < 0)
 		return ret;
 
-	if (unlikely(to->type & ITER_PIPE)) {
+	ret = 0;
+	while ((len = iov_iter_count(to)) > 0) {
+		struct ceph_osd_request *req;
+		struct page **pages;
+		int num_pages;
 		size_t page_off;
-		ret = iov_iter_get_pages_alloc(to, &pages, len,
-					       &page_off);
-		if (ret <= 0)
-			return -ENOMEM;
-		num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+		u64 i_size;
+		bool more;
 
-		ret = striped_read(inode, off, ret, pages, num_pages,
-				   page_off, checkeof);
-		if (ret > 0) {
-			iov_iter_advance(to, ret);
-			off += ret;
-		} else {
-			iov_iter_advance(to, 0);
+		req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					ci->i_vino, off, &len, 0, 1,
+					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+					NULL, ci->i_truncate_seq,
+					ci->i_truncate_size, false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			break;
 		}
-		ceph_put_page_vector(pages, num_pages, false);
-	} else {
-		num_pages = calc_pages_for(off, len);
-		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-		if (IS_ERR(pages))
-			return PTR_ERR(pages);
 
-		ret = striped_read(inode, off, len, pages, num_pages,
-				   (off & ~PAGE_MASK), checkeof);
-		if (ret > 0) {
-			int l, k = 0;
-			size_t left = ret;
+		more = len < iov_iter_count(to);
 
-			while (left) {
-				size_t page_off = off & ~PAGE_MASK;
-				size_t copy = min_t(size_t, left,
-						    PAGE_SIZE - page_off);
-				l = copy_page_to_iter(pages[k++], page_off,
-						      copy, to);
-				off += l;
-				left -= l;
-				if (l < copy)
-					break;
+		if (unlikely(iov_iter_is_pipe(to))) {
+			ret = iov_iter_get_pages_alloc(to, &pages, len,
+						       &page_off);
+			if (ret <= 0) {
+				ceph_osdc_put_request(req);
+				ret = -ENOMEM;
+				break;
+			}
+			num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+			if (ret < len) {
+				len = ret;
+				osd_req_op_extent_update(req, 0, len);
+				more = false;
+			}
+		} else {
+			num_pages = calc_pages_for(off, len);
+			page_off = off & ~PAGE_MASK;
+			pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+			if (IS_ERR(pages)) {
+				ceph_osdc_put_request(req);
+				ret = PTR_ERR(pages);
+				break;
 			}
 		}
-		ceph_release_page_vector(pages, num_pages);
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+						 false, false);
+		ret = ceph_osdc_start_request(osdc, req, false);
+		if (!ret)
+			ret = ceph_osdc_wait_request(osdc, req);
+		ceph_osdc_put_request(req);
+
+		i_size = i_size_read(inode);
+		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
+		     off, len, ret, i_size, (more ? " MORE" : ""));
+
+		if (ret == -ENOENT)
+			ret = 0;
+		if (ret >= 0 && ret < len && (off + ret < i_size)) {
+			int zlen = min(len - ret, i_size - off - ret);
+			int zoff = page_off + ret;
+			dout("sync_read zero gap %llu~%llu\n",
+			     off + ret, off + ret + zlen);
+			ceph_zero_page_vector_range(zoff, zlen, pages);
+			ret += zlen;
+		}
+
+		if (unlikely(iov_iter_is_pipe(to))) {
+			if (ret > 0) {
+				iov_iter_advance(to, ret);
+				off += ret;
+			} else {
+				iov_iter_advance(to, 0);
+			}
+			ceph_put_page_vector(pages, num_pages, false);
+		} else {
+			int idx = 0;
+			size_t left = ret > 0 ? ret : 0;
+			while (left > 0) {
+				size_t len, copied;
+				page_off = off & ~PAGE_MASK;
+				len = min_t(size_t, left, PAGE_SIZE - page_off);
+				copied = copy_page_to_iter(pages[idx++],
+							   page_off, len, to);
+				off += copied;
+				left -= copied;
+				if (copied < len) {
+					ret = -EFAULT;
+					break;
+				}
+			}
+			ceph_release_page_vector(pages, num_pages);
+		}
+
+		if (ret < 0) {
+			if (ret == -EBLACKLISTED)
+				fsc->blacklisted = true;
+			break;
+		}
+
+		if (off >= i_size || !more)
+			break;
 	}
 
 	if (off > iocb->ki_pos) {
+		if (ret >= 0 &&
+		    iov_iter_count(to) > 0 && off >= i_size_read(inode))
+			*retry_op = CHECK_EOF;
 		ret = off - iocb->ki_pos;
 		iocb->ki_pos = off;
 	}
 
-	dout("sync_read result %zd\n", ret);
+	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
 	return ret;
 }
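One subtlety in the rewritten read loop: a short OSD read while i_size says more data exists means the tail of the request landed in a hole, which must read back as zeroes up to EOF. The length computation, reduced to a hypothetical helper:

	static u64 zero_fill_len(ssize_t got, u64 asked, u64 off, u64 i_size)
	{
		if (got < 0 || (u64)got >= asked || off + got >= i_size)
			return 0;
		/* the smaller of: rest of the request, bytes left before EOF */
		return min(asked - got, i_size - off - got);
	}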
 
@@ -740,6 +753,9 @@
 	if (!atomic_dec_and_test(&aio_req->pending_reqs))
 		return;
 
+	if (aio_req->iocb->ki_flags & IOCB_DIRECT)
+		inode_dio_end(inode);
+
 	ret = aio_req->error;
 	if (!ret)
 		ret = aio_req->total_len;
@@ -795,7 +811,7 @@
 		if (aio_work) {
 			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
 			aio_work->req = req;
-			queue_work(ceph_inode_to_client(inode)->wb_wq,
+			queue_work(ceph_inode_to_client(inode)->inode_wq,
 				   &aio_work->work);
 			return;
 		}
@@ -821,7 +837,7 @@
 				aio_req->total_len = rc + zlen;
 			}
 
-			iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+			iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
 				      osd_data->num_bvecs,
 				      osd_data->bvec_pos.iter.bi_size);
 			iov_iter_advance(&i, rc);
@@ -865,7 +881,7 @@
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
 			false, GFP_NOFS);
 	if (!req) {
 		ret = -ENOMEM;
@@ -877,6 +893,11 @@
 	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
 	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
+	req->r_ops[0] = orig_req->r_ops[0];
+
+	req->r_mtime = aio_req->mtime;
+	req->r_data_offset = req->r_ops[0].extent.offset;
+
 	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
 	if (ret) {
 		ceph_osdc_put_request(req);
@@ -884,11 +905,6 @@
 		goto out;
 	}
 
-	req->r_ops[0] = orig_req->r_ops[0];
-
-	req->r_mtime = aio_req->mtime;
-	req->r_data_offset = req->r_ops[0].extent.offset;
-
 	ceph_osdc_put_request(orig_req);
 
 	req->r_callback = ceph_aio_complete_req;
@@ -921,7 +937,7 @@
 	struct ceph_aio_request *aio_req = NULL;
 	int num_pages = 0;
 	int flags;
-	int ret;
+	int ret = 0;
 	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
@@ -933,16 +949,12 @@
 
 	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
 	     (write ? "write" : "read"), file, pos, (unsigned)count,
-	     snapc, snapc->seq);
-
-	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
-	if (ret < 0)
-		return ret;
+	     snapc, snapc ? snapc->seq : 0);
 
 	if (write) {
 		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
 					pos >> PAGE_SHIFT,
-					(pos + count) >> PAGE_SHIFT);
+					(pos + count - 1) >> PAGE_SHIFT);
 		if (ret2 < 0)
 			dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
@@ -1010,7 +1022,7 @@
 			 * may block.
 			 */
 			truncate_inode_pages_range(inode->i_mapping, pos,
-					(pos+len) | (PAGE_SIZE - 1));
+						   PAGE_ALIGN(pos + len) - 1);
 
 			req->r_mtime = mtime;
 		}
@@ -1025,7 +1037,7 @@
 			req->r_callback = ceph_aio_complete_req;
 			req->r_inode = inode;
 			req->r_priv = aio_req;
-			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+			list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
 
 			pos += len;
 			continue;
@@ -1044,8 +1056,7 @@
 				int zlen = min_t(size_t, len - ret,
 						 size - pos - ret);
 
-				iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
-					      len);
+				iov_iter_bvec(&i, READ, bvecs, num_pages, len);
 				iov_iter_advance(&i, ret);
 				iov_iter_zero(zlen, &i);
 				ret += zlen;
@@ -1083,11 +1094,12 @@
 					      CEPH_CAP_FILE_RD);
 
 		list_splice(&aio_req->osd_reqs, &osd_reqs);
+		inode_dio_begin(inode);
 		while (!list_empty(&osd_reqs)) {
 			req = list_first_entry(&osd_reqs,
 					       struct ceph_osd_request,
-					       r_unsafe_item);
-			list_del_init(&req->r_unsafe_item);
+					       r_private_item);
+			list_del_init(&req->r_private_item);
 			if (ret >= 0)
 				ret = ceph_osdc_start_request(req->r_osdc,
 							      req, false);
@@ -1139,13 +1151,14 @@
 	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
 	     file, pos, (unsigned)count, snapc, snapc->seq);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+	ret = filemap_write_and_wait_range(inode->i_mapping,
+					   pos, pos + count - 1);
 	if (ret < 0)
 		return ret;
 
 	ret = invalidate_inode_pages2_range(inode->i_mapping,
 					    pos >> PAGE_SHIFT,
-					    (pos + count) >> PAGE_SHIFT);
+					    (pos + count - 1) >> PAGE_SHIFT);
 	if (ret < 0)
 		dout("invalidate_inode_pages2_range returned %d\n", ret);
 
@@ -1255,13 +1268,24 @@
 	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
 	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
 
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_start_io_direct(inode);
+	else
+		ceph_start_io_read(inode);
+
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
-	if (ret < 0)
+	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
+			    &got, &pinned_page);
+	if (ret < 0) {
+		if (iocb->ki_flags & IOCB_DIRECT)
+			ceph_end_io_direct(inode);
+		else
+			ceph_end_io_read(inode);
 		return ret;
+	}
 
 	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_flags & IOCB_DIRECT) ||
@@ -1292,6 +1316,7 @@
 		ret = generic_file_read_iter(iocb, to);
 		ceph_del_rw_context(fi, &rw_ctx);
 	}
+
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
 	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
 	if (pinned_page) {
@@ -1299,6 +1324,12 @@
 		pinned_page = NULL;
 	}
 	ceph_put_cap_refs(ci, got);
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_end_io_direct(inode);
+	else
+		ceph_end_io_read(inode);
+
 	if (retry_op > HAVE_RETRIED && ret >= 0) {
 		int statret;
 		struct page *page = NULL;
@@ -1398,7 +1429,10 @@
 		return -ENOMEM;
 
 retry_snap:
-	inode_lock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_start_io_direct(inode);
+	else
+		ceph_start_io_write(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1435,6 +1469,8 @@
 	if (err)
 		goto out;
 
+	inode_inc_iversion_raw(inode);
+
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		err = ceph_uninline_data(file, NULL);
 		if (err < 0)
@@ -1454,7 +1490,7 @@
 	else
 		want = CEPH_CAP_FILE_BUFFER;
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
+	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
 			    &got, NULL);
 	if (err < 0)
 		goto out;
@@ -1467,7 +1503,6 @@
 	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
 		struct ceph_snap_context *snapc;
 		struct iov_iter data;
-		inode_unlock(inode);
 
 		spin_lock(&ci->i_ceph_lock);
 		if (__ceph_have_pending_cap_snap(ci)) {
@@ -1484,11 +1519,14 @@
 
 		/* we might need to revert back to that point */
 		data = *from;
-		if (iocb->ki_flags & IOCB_DIRECT)
+		if (iocb->ki_flags & IOCB_DIRECT) {
 			written = ceph_direct_read_write(iocb, &data, snapc,
 							 &prealloc_cf);
-		else
+			ceph_end_io_direct(inode);
+		} else {
 			written = ceph_sync_write(iocb, &data, pos, snapc);
+			ceph_end_io_write(inode);
+		}
 		if (written > 0)
 			iov_iter_advance(from, written);
 		ceph_put_snap_context(snapc);
@@ -1503,7 +1541,7 @@
 		written = generic_perform_write(file, from, pos);
 		if (likely(written >= 0))
 			iocb->ki_pos = pos + written;
-		inode_unlock(inode);
+		ceph_end_io_write(inode);
 	}
 
 	if (written >= 0) {
@@ -1538,9 +1576,11 @@
 	}
 
 	goto out_unlocked;
-
 out:
-	inode_unlock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_end_io_direct(inode);
+	else
+		ceph_end_io_write(inode);
 out_unlocked:
 	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
@@ -1735,7 +1775,6 @@
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_cap_flush *prealloc_cf;
 	int want, got = 0;
 	int dirty;
@@ -1743,10 +1782,7 @@
 	loff_t endoff = 0;
 	loff_t size;
 
-	if ((offset + length) > max(i_size_read(inode), fsc->max_file_size))
-		return -EFBIG;
-
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
 	if (!S_ISREG(inode->i_mode))
@@ -1763,18 +1799,6 @@
 		goto unlock;
 	}
 
-	if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
-	    ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
-		ret = -EDQUOT;
-		goto unlock;
-	}
-
-	if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) &&
-	    !(mode & FALLOC_FL_PUNCH_HOLE)) {
-		ret = -ENOSPC;
-		goto unlock;
-	}
-
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		ret = ceph_uninline_data(file, NULL);
 		if (ret < 0)
@@ -1782,32 +1806,24 @@
 	}
 
 	size = i_size_read(inode);
-	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
-		endoff = offset + length;
-		ret = inode_newsize_ok(inode, endoff);
-		if (ret)
-			goto unlock;
-	}
+
+	/* Are we punching a hole beyond EOF? */
+	if (offset >= size)
+		goto unlock;
+	if ((offset + length) > size)
+		length = size - offset;
 
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_BUFFER;
 
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
 	if (ret < 0)
 		goto unlock;
 
-	if (mode & FALLOC_FL_PUNCH_HOLE) {
-		if (offset < size)
-			ceph_zero_pagecache_range(inode, offset, length);
-		ret = ceph_zero_objects(inode, offset, length);
-	} else if (endoff > size) {
-		truncate_pagecache_range(inode, size, -1);
-		if (ceph_inode_set_size(inode, endoff))
-			ceph_check_caps(ceph_inode(inode),
-				CHECK_CAPS_AUTHONLY, NULL);
-	}
+	ceph_zero_pagecache_range(inode, offset, length);
+	ret = ceph_zero_objects(inode, offset, length);
 
 	if (!ret) {
 		spin_lock(&ci->i_ceph_lock);
@@ -1817,9 +1833,6 @@
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
-		if ((endoff > size) &&
-		    ceph_quota_is_max_bytes_approaching(inode, endoff))
-			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
 	}
 
 	ceph_put_cap_refs(ci, got);
@@ -1829,6 +1842,339 @@
 	return ret;
 }
 
+/*
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
+ * src_ci.  Two attempts are made to obtain both caps, and a negative error
+ * is returned if this fails; a non-negative value is returned on success.
+ */
+static int get_rd_wr_caps(struct file *src_filp, int *src_got,
+			  struct file *dst_filp,
+			  loff_t dst_endoff, int *dst_got)
+{
+	int ret = 0;
+	bool retrying = false;
+
+retry_caps:
+	ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+			    dst_endoff, dst_got, NULL);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Since we're already holding the FILE_WR capability for the dst file,
+	 * we would risk a deadlock by using ceph_get_caps.  Thus, we'll do some
+	 * retry dance instead to try to get both capabilities.
+	 */
+	ret = ceph_try_get_caps(file_inode(src_filp),
+				CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+				false, src_got);
+	if (ret <= 0) {
+		/* Start by dropping dst_ci caps and getting src_ci caps */
+		ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
+		if (retrying) {
+			if (!ret)
+				/* ceph_try_get_caps masks EAGAIN */
+				ret = -EAGAIN;
+			return ret;
+		}
+		ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
+				    CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
+		if (ret < 0)
+			return ret;
+		/* ... drop src_ci caps too, and retry */
+		ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
+		retrying = true;
+		goto retry_caps;
+	}
+	return ret;
+}
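
The drop-and-retry above is the classic way to take two resources without an ABBA deadlock: hold the first, *try* the second, and on contention release everything, block once on the contended resource, then start over. A simplified sketch of the same shape with pthread mutexes (illustrative only; get_rd_wr_caps() additionally gives up after its second attempt):

    #include <pthread.h>
    #include <sched.h>
    #include <stdbool.h>

    /* Acquire both mutexes without risking an ABBA deadlock. */
    static void lock_both(pthread_mutex_t *first, pthread_mutex_t *second)
    {
            bool retried = false;

            for (;;) {
                    pthread_mutex_lock(first);
                    if (pthread_mutex_trylock(second) == 0)
                            return;         /* got both */
                    pthread_mutex_unlock(first);
                    if (retried)
                            sched_yield();  /* back off a little */
                    /* Block once on the contended lock, then drop it and
                     * retry -- mirroring the ceph_get_caps()/put step. */
                    pthread_mutex_lock(second);
                    pthread_mutex_unlock(second);
                    retried = true;
            }
    }
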
+
+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
+			   struct ceph_inode_info *dst_ci, int dst_got)
+{
+	ceph_put_cap_refs(src_ci, src_got);
+	ceph_put_cap_refs(dst_ci, dst_got);
+}
+
+/*
+ * This function does several size-related checks, returning an error if:
+ *  - source file is smaller than off+len
+ *  - destination file size is not OK (inode_newsize_ok())
+ *  - max bytes quota is exceeded
+ */
+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+			   loff_t src_off, loff_t dst_off, size_t len)
+{
+	loff_t size, endoff;
+
+	size = i_size_read(src_inode);
+	/*
+	 * Don't copy beyond source file EOF.  Instead of simply setting length
+	 * to (size - src_off), just drop to VFS default implementation, as the
+	 * local i_size may be stale due to other clients writing to the source
+	 * inode.
+	 */
+	if (src_off + len > size) {
+		dout("Copy beyond EOF (%llu + %zu > %llu)\n",
+		     src_off, len, size);
+		return -EOPNOTSUPP;
+	}
+	size = i_size_read(dst_inode);
+
+	endoff = dst_off + len;
+	if (inode_newsize_ok(dst_inode, endoff))
+		return -EOPNOTSUPP;
+
+	if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
+		return -EDQUOT;
+
+	return 0;
+}
+
+static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
+				      struct file *dst_file, loff_t dst_off,
+				      size_t len, unsigned int flags)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *dst_inode = file_inode(dst_file);
+	struct ceph_inode_info *src_ci = ceph_inode(src_inode);
+	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
+	struct ceph_cap_flush *prealloc_cf;
+	struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
+	struct ceph_object_locator src_oloc, dst_oloc;
+	struct ceph_object_id src_oid, dst_oid;
+	loff_t endoff = 0, size;
+	ssize_t ret = -EIO;
+	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
+	u32 src_objlen, dst_objlen, object_size;
+	int src_got = 0, dst_got = 0, err, dirty;
+	bool do_final_copy = false;
+
+	if (src_inode->i_sb != dst_inode->i_sb) {
+		struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
+
+		if (ceph_fsid_compare(&src_fsc->client->fsid,
+				      &dst_fsc->client->fsid)) {
+			dout("Copying files across clusters: src: %pU dst: %pU\n",
+			     &src_fsc->client->fsid, &dst_fsc->client->fsid);
+			return -EXDEV;
+		}
+	}
+	if (ceph_snap(dst_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	/*
+	 * Some of the checks below will return -EOPNOTSUPP, which will force a
+	 * fallback to the default VFS copy_file_range implementation.  This is
+	 * desirable in several cases (for example, when 'len' is smaller
+	 * than the object size, or whenever the generic copy would be more
+	 * efficient).
+	 */
+
+	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Striped file layouts require that we copy partial objects, but the
+	 * OSD copy-from operation only supports full-object copies.  Limit
+	 * this to non-striped file layouts for now.
+	 */
+	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
+	    (src_ci->i_layout.stripe_count != 1) ||
+	    (dst_ci->i_layout.stripe_count != 1) ||
+	    (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
+		dout("Invalid src/dst files layout\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (len < src_ci->i_layout.object_size)
+		return -EOPNOTSUPP; /* no remote copy will be done */
+
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
+	/* Start by sync'ing the source and destination files */
+	ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+	if (ret < 0) {
+		dout("failed to write src file (%zd)\n", ret);
+		goto out;
+	}
+	ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
+	if (ret < 0) {
+		dout("failed to write dst file (%zd)\n", ret);
+		goto out;
+	}
+
+	/*
+	 * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
+	 * clients may have dirty data in their caches.  And OSDs know nothing
+	 * about caps, so they can't safely do the remote object copies.
+	 */
+	err = get_rd_wr_caps(src_file, &src_got,
+			     dst_file, (dst_off + len), &dst_got);
+	if (err < 0) {
+		dout("get_rd_wr_caps returned %d\n", err);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+	if (ret < 0)
+		goto out_caps;
+
+	size = i_size_read(dst_inode);
+	endoff = dst_off + len;
+
+	/* Drop dst file cached pages */
+	ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+					    dst_off >> PAGE_SHIFT,
+					    endoff >> PAGE_SHIFT);
+	if (ret < 0) {
+		dout("Failed to invalidate inode pages (%zd)\n", ret);
+		ret = 0; /* XXX */
+	}
+	src_oloc.pool = src_ci->i_layout.pool_id;
+	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+	dst_oloc.pool = dst_ci->i_layout.pool_id;
+	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+				      src_ci->i_layout.object_size,
+				      &src_objnum, &src_objoff, &src_objlen);
+	ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+				      dst_ci->i_layout.object_size,
+				      &dst_objnum, &dst_objoff, &dst_objlen);
+	/* object-level offsets need to be the same */
+	if (src_objoff != dst_objoff) {
+		ret = -EOPNOTSUPP;
+		goto out_caps;
+	}
+
+	/*
+	 * Do a manual copy if the object offset isn't object aligned.
+	 * 'src_objlen' contains the bytes left until the end of the object,
+	 * starting at src_off.
+	 */
+	if (src_objoff) {
+		/*
+		 * we need to temporarily drop all caps as we'll be calling
+		 * {read,write}_iter, which will get caps again.
+		 */
+		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+		ret = do_splice_direct(src_file, &src_off, dst_file,
+				       &dst_off, src_objlen, flags);
+		if (ret < 0) {
+			dout("do_splice_direct returned %zd\n", ret);
+			goto out;
+		}
+		len -= ret;
+		err = get_rd_wr_caps(src_file, &src_got,
+				     dst_file, (dst_off + len), &dst_got);
+		if (err < 0)
+			goto out;
+		err = is_file_size_ok(src_inode, dst_inode,
+				      src_off, dst_off, len);
+		if (err < 0)
+			goto out_caps;
+	}
+	object_size = src_ci->i_layout.object_size;
+	while (len >= object_size) {
+		ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+					      object_size, &src_objnum,
+					      &src_objoff, &src_objlen);
+		ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+					      object_size, &dst_objnum,
+					      &dst_objoff, &dst_objlen);
+		ceph_oid_init(&src_oid);
+		ceph_oid_printf(&src_oid, "%llx.%08llx",
+				src_ci->i_vino.ino, src_objnum);
+		ceph_oid_init(&dst_oid);
+		ceph_oid_printf(&dst_oid, "%llx.%08llx",
+				dst_ci->i_vino.ino, dst_objnum);
+		/* Do an object remote copy */
+		err = ceph_osdc_copy_from(
+			&src_fsc->client->osdc,
+			src_ci->i_vino.snap, 0,
+			&src_oid, &src_oloc,
+			CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+			CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+			&dst_oid, &dst_oloc,
+			CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+			CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+		if (err) {
+			dout("ceph_osdc_copy_from returned %d\n", err);
+			if (!ret)
+				ret = err;
+			goto out_caps;
+		}
+		len -= object_size;
+		src_off += object_size;
+		dst_off += object_size;
+		ret += object_size;
+	}
+
+	if (len)
+		/* We still need one final local copy */
+		do_final_copy = true;
+
+	file_update_time(dst_file);
+	inode_inc_iversion_raw(dst_inode);
+
+	if (endoff > size) {
+		int caps_flags = 0;
+
+		/* Let the MDS know about dst file size change */
+		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+			caps_flags |= CHECK_CAPS_NODELAY;
+		if (ceph_inode_set_size(dst_inode, endoff))
+			caps_flags |= CHECK_CAPS_AUTHONLY;
+		if (caps_flags)
+			ceph_check_caps(dst_ci, caps_flags, NULL);
+	}
+	/* Mark Fw dirty */
+	spin_lock(&dst_ci->i_ceph_lock);
+	dst_ci->i_inline_version = CEPH_INLINE_NONE;
+	dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+	spin_unlock(&dst_ci->i_ceph_lock);
+	if (dirty)
+		__mark_inode_dirty(dst_inode, dirty);
+
+out_caps:
+	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+
+	if (do_final_copy) {
+		err = do_splice_direct(src_file, &src_off, dst_file,
+				       &dst_off, len, flags);
+		if (err < 0) {
+			dout("do_splice_direct returned %d\n", err);
+			goto out;
+		}
+		len -= err;
+		ret += err;
+	}
+
+out:
+	ceph_free_cap_flush(prealloc_cf);
+
+	return ret;
+}
+
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
+				     len, flags);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+					      dst_off, len, flags);
+	return ret;
+}
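
From userspace this whole path is exercised through copy_file_range(2); when the checks above return -EOPNOTSUPP or -EXDEV, the wrapper silently retries with generic_copy_file_range(), so callers never see those errors here. A minimal caller, with hypothetical paths on a CephFS mount:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int in = open("/mnt/cephfs/src", O_RDONLY);
            int out = open("/mnt/cephfs/dst", O_WRONLY | O_CREAT, 0644);
            loff_t off_in = 0, off_out = 0;
            ssize_t n;

            if (in < 0 || out < 0) {
                    perror("open");
                    return 1;
            }
            /* Large, object-aligned spans may be offloaded to OSD
             * copy-from operations; misaligned head/tail bytes go through
             * the do_splice_direct() fallback seen above. */
            do {
                    n = copy_file_range(in, &off_in, out, &off_out,
                                        64 * 1024 * 1024, 0);
            } while (n > 0);
            if (n < 0)
                    perror("copy_file_range");
            close(in);
            close(out);
            return 0;
    }
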
+
 const struct file_operations ceph_file_fops = {
 	.open = ceph_open,
 	.release = ceph_release,
@@ -1844,5 +2190,5 @@
 	.unlocked_ioctl = ceph_ioctl,
 	.compat_ioctl	= ceph_ioctl,
 	.fallocate	= ceph_fallocate,
+	.copy_file_range = ceph_copy_file_range,
 };
-
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4055ab4..c074075 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -13,6 +13,7 @@
 #include <linux/posix_acl.h>
 #include <linux/random.h>
 #include <linux/sort.h>
+#include <linux/iversion.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -33,9 +34,7 @@
 
 static const struct inode_operations ceph_symlink_iops;
 
-static void ceph_invalidate_work(struct work_struct *work);
-static void ceph_writeback_work(struct work_struct *work);
-static void ceph_vmtruncate_work(struct work_struct *work);
+static void ceph_inode_work(struct work_struct *work);
 
 /*
  * find or create an inode, given the ceph ino number
@@ -44,6 +43,7 @@
 {
 	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
 	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	inode_set_iversion_raw(inode, 0);
 	return 0;
 }
 
@@ -497,7 +497,7 @@
 	ci->i_wrbuffer_ref = 0;
 	ci->i_wrbuffer_ref_head = 0;
 	atomic_set(&ci->i_filelock_ref, 0);
-	atomic_set(&ci->i_shared_gen, 0);
+	atomic_set(&ci->i_shared_gen, 1);
 	ci->i_rdcache_gen = 0;
 	ci->i_rdcache_revoking = 0;
 
@@ -509,35 +509,39 @@
 	INIT_LIST_HEAD(&ci->i_snap_realm_item);
 	INIT_LIST_HEAD(&ci->i_snap_flush_item);
 
-	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
-	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
-
-	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+	INIT_WORK(&ci->i_work, ceph_inode_work);
+	ci->i_work_mask = 0;
+	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
 
 	ceph_fscache_inode_init(ci);
 
+	ci->i_meta_err = 0;
+
 	return &ci->vfs_inode;
 }
 
-static void ceph_i_callback(struct rcu_head *head)
+void ceph_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
+	kfree(ci->i_symlink);
 	kmem_cache_free(ceph_inode_cachep, ci);
 }
 
-void ceph_destroy_inode(struct inode *inode)
+void ceph_evict_inode(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_inode_frag *frag;
 	struct rb_node *n;
 
-	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
 
 	ceph_fscache_unregister_inode_cookie(ci);
 
-	ceph_queue_caps_release(inode);
+	__ceph_remove_caps(ci);
 
 	if (__ceph_has_any_quota(ci))
 		ceph_adjust_quota_realms_count(inode, false);
@@ -548,20 +552,24 @@
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
-		struct ceph_snap_realm *realm = ci->i_snap_realm;
-
-		dout(" dropping residual ref to snap realm %p\n", realm);
-		spin_lock(&realm->inodes_with_caps_lock);
-		list_del_init(&ci->i_snap_realm_item);
-		ci->i_snap_realm = NULL;
-		if (realm->ino == ci->i_vino.ino)
-			realm->inode = NULL;
-		spin_unlock(&realm->inodes_with_caps_lock);
-		ceph_put_snap_realm(mdsc, realm);
+					ceph_inode_to_client(inode)->mdsc;
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			struct ceph_snap_realm *realm = ci->i_snap_realm;
+			dout(" dropping residual ref to snap realm %p\n",
+			     realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_del_init(&ci->i_snap_realm_item);
+			ci->i_snap_realm = NULL;
+			if (realm->ino == ci->i_vino.ino)
+				realm->inode = NULL;
+			spin_unlock(&realm->inodes_with_caps_lock);
+			ceph_put_snap_realm(mdsc, realm);
+		} else {
+			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+			ci->i_snap_realm = NULL;
+		}
 	}
 
-	kfree(ci->i_symlink);
 	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
 		frag = rb_entry(n, struct ceph_inode_frag, node);
 		rb_erase(n, &ci->i_fragtree);
@@ -576,18 +584,6 @@
 		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
 	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
-
-	call_rcu(&inode->i_rcu, ceph_i_callback);
-}
-
-int ceph_drop_inode(struct inode *inode)
-{
-	/*
-	 * Positve dentry and corresponding inode are always accompanied
-	 * in MDS reply. So no need to keep inode in the cache after
-	 * dropping all its aliases.
-	 */
-	return 1;
 }
 
 static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -742,6 +738,7 @@
 	int issued, new_issued, info_caps;
 	struct timespec64 mtime, atime, ctime;
 	struct ceph_buffer *xattr_blob = NULL;
+	struct ceph_buffer *old_blob = NULL;
 	struct ceph_string *pool_ns = NULL;
 	struct ceph_cap *new_cap = NULL;
 	int err = 0;
@@ -776,6 +773,9 @@
 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
 						     iinfo->pool_ns_len);
 
+	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
 	spin_lock(&ci->i_ceph_lock);
 
 	/*
@@ -794,13 +794,21 @@
 	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
 		new_version = true;
 
+	/* Update change_attribute */
+	inode_set_max_iversion_raw(inode, iinfo->change_attr);
+
 	__ceph_caps_issued(ci, &issued);
 	issued |= __ceph_caps_dirty(ci);
 	new_issued = ~issued & info_caps;
 
 	/* update inode */
 	inode->i_rdev = le32_to_cpu(info->rdev);
-	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+	/* directories have fl_stripe_unit set to zero */
+	if (le32_to_cpu(info->layout.fl_stripe_unit))
+		inode->i_blkbits =
+			fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+	else
+		inode->i_blkbits = CEPH_BLOCK_SHIFT;
 
 	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
 
@@ -812,6 +820,8 @@
 		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
 		     from_kuid(&init_user_ns, inode->i_uid),
 		     from_kgid(&init_user_ns, inode->i_gid));
+		ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
+		ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
 	}
 
 	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
@@ -869,6 +879,7 @@
 			ci->i_rbytes = le64_to_cpu(info->rbytes);
 			ci->i_rfiles = le64_to_cpu(info->rfiles);
 			ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+			ci->i_dir_pin = iinfo->dir_pin;
 			ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
 		}
 	}
@@ -878,13 +889,14 @@
 	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))  &&
 	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
 		if (ci->i_xattrs.blob)
-			ceph_buffer_put(ci->i_xattrs.blob);
+			old_blob = ci->i_xattrs.blob;
 		ci->i_xattrs.blob = xattr_blob;
 		if (xattr_blob)
 			memcpy(ci->i_xattrs.blob->vec.iov_base,
 			       iinfo->xattr_data, iinfo->xattr_len);
 		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
 		ceph_forget_all_cached_acls(inode);
+		ceph_security_invalidate_secctx(inode);
 		xattr_blob = NULL;
 	}
 
@@ -899,6 +911,7 @@
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
+		inode->i_blkbits = PAGE_SHIFT;
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		inode->i_op = &ceph_file_iops;
 		break;
@@ -1017,70 +1030,48 @@
 out:
 	if (new_cap)
 		ceph_put_cap(mdsc, new_cap);
-	if (xattr_blob)
-		ceph_buffer_put(xattr_blob);
+	ceph_buffer_put(old_blob);
+	ceph_buffer_put(xattr_blob);
 	ceph_put_string(pool_ns);
 	return err;
 }
 
 /*
- * caller should hold session s_mutex.
+ * caller should hold session s_mutex and dentry->d_lock.
  */
-static void update_dentry_lease(struct dentry *dentry,
-				struct ceph_mds_reply_lease *lease,
-				struct ceph_mds_session *session,
-				unsigned long from_time,
-				struct ceph_vino *tgt_vino,
-				struct ceph_vino *dir_vino)
+static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
+				  struct ceph_mds_reply_lease *lease,
+				  struct ceph_mds_session *session,
+				  unsigned long from_time,
+				  struct ceph_mds_session **old_lease_session)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 	long unsigned duration = le32_to_cpu(lease->duration_ms);
 	long unsigned ttl = from_time + (duration * HZ) / 1000;
 	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
-	struct inode *dir;
-	struct ceph_mds_session *old_lease_session = NULL;
 
-	/*
-	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
-	 * we expect a negative dentry.
-	 */
-	if (!tgt_vino && d_really_is_positive(dentry))
-		return;
-
-	if (tgt_vino && (d_really_is_negative(dentry) ||
-			!ceph_ino_compare(d_inode(dentry), tgt_vino)))
-		return;
-
-	spin_lock(&dentry->d_lock);
 	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
 	     dentry, duration, ttl);
 
-	dir = d_inode(dentry->d_parent);
-
-	/* make sure parent matches dir_vino */
-	if (!ceph_ino_compare(dir, dir_vino))
-		goto out_unlock;
-
 	/* only track leases on regular dentries */
 	if (ceph_snap(dir) != CEPH_NOSNAP)
-		goto out_unlock;
+		return;
 
 	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-
-	if (duration == 0)
-		goto out_unlock;
+	if (duration == 0) {
+		__ceph_dentry_dir_lease_touch(di);
+		return;
+	}
 
 	if (di->lease_gen == session->s_cap_gen &&
 	    time_before(ttl, di->time))
-		goto out_unlock;  /* we already have a newer lease. */
+		return;  /* we already have a newer lease. */
 
 	if (di->lease_session && di->lease_session != session) {
-		old_lease_session = di->lease_session;
+		*old_lease_session = di->lease_session;
 		di->lease_session = NULL;
 	}
 
-	ceph_dentry_lru_touch(dentry);
-
 	if (!di->lease_session)
 		di->lease_session = ceph_get_mds_session(session);
 	di->lease_gen = session->s_cap_gen;
@@ -1088,6 +1079,64 @@
 	di->lease_renew_after = half_ttl;
 	di->lease_renew_from = 0;
 	di->time = ttl;
+
+	__ceph_dentry_lease_touch(di);
+}
+
+static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
+					struct ceph_mds_reply_lease *lease,
+					struct ceph_mds_session *session,
+					unsigned long from_time)
+{
+	struct ceph_mds_session *old_lease_session = NULL;
+	spin_lock(&dentry->d_lock);
+	__update_dentry_lease(dir, dentry, lease, session, from_time,
+			      &old_lease_session);
+	spin_unlock(&dentry->d_lock);
+	if (old_lease_session)
+		ceph_put_mds_session(old_lease_session);
+}
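
Note the split: __update_dentry_lease() runs entirely under d_lock and merely hands the displaced session pointer out to the caller, which drops it only after the spinlock is released, since ceph_put_mds_session() may do teardown work that must not run under a spinlock. A condensed sketch of that hand-the-reference-out pattern (types and names invented for illustration):

    #include <pthread.h>
    #include <stdlib.h>

    struct session { int refs; };

    static void put_session(struct session *s)
    {
            /* potentially heavy teardown -- must not run under the lock */
            if (s && --s->refs == 0)
                    free(s);
    }

    /* Swap in a new session under the lock but defer the put: the old
     * pointer is returned so the caller can release it after unlock. */
    static struct session *replace_session(struct session **slot,
                                           struct session *new_s,
                                           pthread_mutex_t *lock)
    {
            struct session *old;

            pthread_mutex_lock(lock);
            old = *slot;
            *slot = new_s;
            pthread_mutex_unlock(lock);
            return old;     /* caller: put_session(old) */
    }
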
+
+/*
+ * update dentry lease without having parent inode locked
+ */
+static void update_dentry_lease_careful(struct dentry *dentry,
+					struct ceph_mds_reply_lease *lease,
+					struct ceph_mds_session *session,
+					unsigned long from_time,
+					char *dname, u32 dname_len,
+					struct ceph_vino *pdvino,
+					struct ceph_vino *ptvino)
+
+{
+	struct inode *dir;
+	struct ceph_mds_session *old_lease_session = NULL;
+
+	spin_lock(&dentry->d_lock);
+	/* make sure dentry's name matches target */
+	if (dentry->d_name.len != dname_len ||
+	    memcmp(dentry->d_name.name, dname, dname_len))
+		goto out_unlock;
+
+	dir = d_inode(dentry->d_parent);
+	/* make sure parent matches dvino */
+	if (!ceph_ino_compare(dir, pdvino))
+		goto out_unlock;
+
+	/* make sure dentry's inode matches target. NULL ptvino means that
+	 * we expect a negative dentry */
+	if (ptvino) {
+		if (d_really_is_negative(dentry))
+			goto out_unlock;
+		if (!ceph_ino_compare(d_inode(dentry), ptvino))
+			goto out_unlock;
+	} else {
+		if (d_really_is_positive(dentry))
+			goto out_unlock;
+	}
+
+	__update_dentry_lease(dir, dentry, lease, session,
+			      from_time, &old_lease_session);
 out_unlock:
 	spin_unlock(&dentry->d_lock);
 	if (old_lease_session)
@@ -1098,8 +1147,9 @@
  * splice a dentry to an inode.
  * caller must hold directory i_mutex for this to be safe.
  */
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
+static int splice_dentry(struct dentry **pdn, struct inode *in)
 {
+	struct dentry *dn = *pdn;
 	struct dentry *realdn;
 
 	BUG_ON(d_inode(dn));
@@ -1132,28 +1182,23 @@
 	if (IS_ERR(realdn)) {
 		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
 		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
-		dn = realdn;
-		/*
-		 * Caller should release 'dn' in the case of error.
-		 * If 'req->r_dentry' is passed to this function,
-		 * caller should leave 'req->r_dentry' untouched.
-		 */
-		goto out;
-	} else if (realdn) {
+		return PTR_ERR(realdn);
+	}
+
+	if (realdn) {
 		dout("dn %p (%d) spliced with %p (%d) "
 		     "inode %p ino %llx.%llx\n",
 		     dn, d_count(dn),
 		     realdn, d_count(realdn),
 		     d_inode(realdn), ceph_vinop(d_inode(realdn)));
 		dput(dn);
-		dn = realdn;
+		*pdn = realdn;
 	} else {
 		BUG_ON(!ceph_dentry(dn));
 		dout("dn %p attached to %p ino %llx.%llx\n",
 		     dn, d_inode(dn), ceph_vinop(d_inode(dn)));
 	}
-out:
-	return dn;
+	return 0;
 }
 
 /*
@@ -1200,7 +1245,9 @@
 			WARN_ON_ONCE(1);
 		}
 
-		if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+		if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+		    test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+		    !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
 			struct qstr dname;
 			struct dentry *dn, *parent;
 
@@ -1338,7 +1385,12 @@
 			dout("dn %p gets new offset %lld\n", req->r_old_dentry,
 			     ceph_dentry(req->r_old_dentry)->offset);
 
-			dn = req->r_old_dentry;  /* use old_dentry */
+			/* swap r_dentry and r_old_dentry in case
+			 * splice_dentry() gets called later. This is safe
+			 * because no other place uses them. */
+			req->r_dentry = req->r_old_dentry;
+			req->r_old_dentry = dn;
+			dn = req->r_dentry;
 		}
 
 		/* null dentry? */
@@ -1351,10 +1403,9 @@
 			} else if (have_lease) {
 				if (d_unhashed(dn))
 					d_add(dn, NULL);
-				update_dentry_lease(dn, rinfo->dlease,
-						    session,
-						    req->r_request_started,
-						    NULL, &dvino);
+				update_dentry_lease(dir, dn,
+						    rinfo->dlease, session,
+						    req->r_request_started);
 			}
 			goto done;
 		}
@@ -1363,12 +1414,10 @@
 		if (d_really_is_negative(dn)) {
 			ceph_dir_clear_ordered(dir);
 			ihold(in);
-			dn = splice_dentry(dn, in);
-			if (IS_ERR(dn)) {
-				err = PTR_ERR(dn);
+			err = splice_dentry(&req->r_dentry, in);
+			if (err < 0)
 				goto done;
-			}
-			req->r_dentry = dn;  /* may have spliced */
+			dn = req->r_dentry;  /* may have spliced */
 		} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
 			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
 			     dn, d_inode(dn), ceph_vinop(d_inode(dn)),
@@ -1378,52 +1427,41 @@
 		}
 
 		if (have_lease) {
-			tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-			tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-			update_dentry_lease(dn, rinfo->dlease, session,
-					    req->r_request_started,
-					    &tvino, &dvino);
+			update_dentry_lease(dir, dn,
+					    rinfo->dlease, session,
+					    req->r_request_started);
 		}
 		dout(" final dn %p\n", dn);
 	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
 		    req->r_op == CEPH_MDS_OP_MKSNAP) &&
+	           test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
 		   !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
-		struct dentry *dn = req->r_dentry;
 		struct inode *dir = req->r_parent;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
-		BUG_ON(!dn);
 		BUG_ON(!dir);
 		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
-		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		BUG_ON(!req->r_dentry);
+		dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
 		ceph_dir_clear_ordered(dir);
 		ihold(in);
-		dn = splice_dentry(dn, in);
-		if (IS_ERR(dn)) {
-			err = PTR_ERR(dn);
+		err = splice_dentry(&req->r_dentry, in);
+		if (err < 0)
 			goto done;
-		}
-		req->r_dentry = dn;  /* may have spliced */
-	} else if (rinfo->head->is_dentry) {
+	} else if (rinfo->head->is_dentry && req->r_dentry) {
+		/* parent inode is not locked, be careful */
 		struct ceph_vino *ptvino = NULL;
-
-		if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
-		    le32_to_cpu(rinfo->dlease->duration_ms)) {
-			dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
-			dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
-
-			if (rinfo->head->is_target) {
-				tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-				tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-				ptvino = &tvino;
-			}
-
-			update_dentry_lease(req->r_dentry, rinfo->dlease,
-				session, req->r_request_started, ptvino,
-				&dvino);
-		} else {
-			dout("%s: no dentry lease or dir cap\n", __func__);
+		dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+		dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+		if (rinfo->head->is_target) {
+			tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+			tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+			ptvino = &tvino;
 		}
+		update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
+					    session, req->r_request_started,
+					    rinfo->dname, rinfo->dname_len,
+					    &dvino, ptvino);
 	}
 done:
 	dout("fill_trace done err=%d\n", err);
@@ -1461,7 +1499,8 @@
 			pr_err("fill_inode badness on %p got %d\n", in, rc);
 			err = rc;
 		}
-		iput(in);
+		/* avoid calling iput_final() in mds dispatch threads */
+		ceph_async_iput(in);
 	}
 
 	return err;
@@ -1584,7 +1623,7 @@
 	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
 		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-		struct ceph_vino tvino, dvino;
+		struct ceph_vino tvino;
 
 		dname.name = rde->name;
 		dname.len = rde->name_len;
@@ -1659,39 +1698,35 @@
 				 &req->r_caps_reservation);
 		if (ret < 0) {
 			pr_err("fill_inode badness on %p\n", in);
-			if (d_really_is_negative(dn))
-				iput(in);
+			if (d_really_is_negative(dn)) {
+				/* avoid calling iput_final() in mds
+				 * dispatch threads */
+				ceph_async_iput(in);
+			}
 			d_drop(dn);
 			err = ret;
 			goto next_item;
 		}
 
 		if (d_really_is_negative(dn)) {
-			struct dentry *realdn;
-
 			if (ceph_security_xattr_deadlock(in)) {
 				dout(" skip splicing dn %p to inode %p"
 				     " (security xattr deadlock)\n", dn, in);
-				iput(in);
+				ceph_async_iput(in);
 				skipped++;
 				goto next_item;
 			}
 
-			realdn = splice_dentry(dn, in);
-			if (IS_ERR(realdn)) {
-				err = PTR_ERR(realdn);
-				d_drop(dn);
-				dn = NULL;
+			err = splice_dentry(&dn, in);
+			if (err < 0)
 				goto next_item;
-			}
-			dn = realdn;
 		}
 
 		ceph_dentry(dn)->offset = rde->offset;
 
-		dvino = ceph_vino(d_inode(parent));
-		update_dentry_lease(dn, rde->lease, req->r_session,
-				    req->r_request_started, &tvino, &dvino);
+		update_dentry_lease(d_inode(parent), dn,
+				    rde->lease, req->r_session,
+				    req->r_request_started);
 
 		if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
 			ret = fill_readdir_cache(d_inode(parent), dn,
@@ -1700,8 +1735,7 @@
 				err = ret;
 		}
 next_item:
-		if (dn)
-			dput(dn);
+		dput(dn);
 	}
 out:
 	if (err == 0 && skipped == 0) {
@@ -1730,56 +1764,86 @@
 }
 
 /*
+ * Put a reference to an inode, but avoid calling iput_final() in the current
+ * thread.  iput_final() may wait for readahead pages; the wait can cause a
+ * deadlock in some contexts.
+ */
+void ceph_async_iput(struct inode *inode)
+{
+	if (!inode)
+		return;
+	for (;;) {
+		if (atomic_add_unless(&inode->i_count, -1, 1))
+			break;
+		if (queue_work(ceph_inode_to_client(inode)->inode_wq,
+			       &ceph_inode(inode)->i_work))
+			break;
+		/* queue work failed, i_count must be at least 2 */
+	}
+}
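
The loop above hinges on atomic_add_unless(): drop the reference inline unless it is the last one, and punt the final iput() to the workqueue instead. The same decrement-unless-would-be-final step can be written with C11 atomics (a sketch, not kernel code):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Decrement *cnt unless it currently equals 1.  Returns true if the
     * decrement happened (a non-final put); false means the caller holds
     * the last reference and must defer the release to another context. */
    static bool put_unless_last(atomic_int *cnt)
    {
            int old = atomic_load(cnt);

            while (old != 1) {
                    /* on failure, 'old' reloads with the current value */
                    if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                            return true;
            }
            return false;
    }
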
+
+/*
  * Write back inode data in a worker thread.  (This can't be done
  * in the message handler context.)
  */
 void ceph_queue_writeback(struct inode *inode)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
+
 	ihold(inode);
-	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
-		       &ceph_inode(inode)->i_wb_work)) {
+	if (queue_work(ceph_inode_to_client(inode)->inode_wq,
+		       &ci->i_work)) {
 		dout("ceph_queue_writeback %p\n", inode);
 	} else {
-		dout("ceph_queue_writeback %p failed\n", inode);
+		dout("ceph_queue_writeback %p already queued, mask=%lx\n",
+		     inode, ci->i_work_mask);
 		iput(inode);
 	}
 }
 
-static void ceph_writeback_work(struct work_struct *work)
-{
-	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
-						  i_wb_work);
-	struct inode *inode = &ci->vfs_inode;
-
-	dout("writeback %p\n", inode);
-	filemap_fdatawrite(&inode->i_data);
-	iput(inode);
-}
-
 /*
  * queue an async invalidation
  */
 void ceph_queue_invalidate(struct inode *inode)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
+
 	ihold(inode);
-	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
-		       &ceph_inode(inode)->i_pg_inv_work)) {
+	if (queue_work(ceph_inode_to_client(inode)->inode_wq,
+		       &ceph_inode(inode)->i_work)) {
 		dout("ceph_queue_invalidate %p\n", inode);
 	} else {
-		dout("ceph_queue_invalidate %p failed\n", inode);
+		dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
+		     inode, ci->i_work_mask);
 		iput(inode);
 	}
 }
 
 /*
- * Invalidate inode pages in a worker thread.  (This can't be done
- * in the message handler context.)
+ * Queue an async vmtruncate.  If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
  */
-static void ceph_invalidate_work(struct work_struct *work)
+void ceph_queue_vmtruncate(struct inode *inode)
 {
-	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
-						  i_pg_inv_work);
-	struct inode *inode = &ci->vfs_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+
+	ihold(inode);
+	if (queue_work(ceph_inode_to_client(inode)->inode_wq,
+		       &ci->i_work)) {
+		dout("ceph_queue_vmtruncate %p\n", inode);
+	} else {
+		dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+		     inode, ci->i_work_mask);
+		iput(inode);
+	}
+}
+
+static void ceph_do_invalidate_pages(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	u32 orig_gen;
 	int check = 0;
@@ -1831,44 +1895,6 @@
 out:
 	if (check)
 		ceph_check_caps(ci, 0, NULL);
-	iput(inode);
-}
-
-
-/*
- * called by trunc_wq;
- *
- * We also truncate in a separate thread as well.
- */
-static void ceph_vmtruncate_work(struct work_struct *work)
-{
-	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
-						  i_vmtruncate_work);
-	struct inode *inode = &ci->vfs_inode;
-
-	dout("vmtruncate_work %p\n", inode);
-	__ceph_do_pending_vmtruncate(inode);
-	iput(inode);
-}
-
-/*
- * Queue an async vmtruncate.  If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-
-	ihold(inode);
-
-	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
-		       &ci->i_vmtruncate_work)) {
-		dout("ceph_queue_vmtruncate %p\n", inode);
-	} else {
-		dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
-		     inode, ci->i_truncate_pending);
-		iput(inode);
-	}
 }
 
 /*
@@ -1932,6 +1958,25 @@
 	wake_up_all(&ci->i_cap_wq);
 }
 
+static void ceph_inode_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						 i_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
+		dout("writeback %p\n", inode);
+		filemap_fdatawrite(&inode->i_data);
+	}
+	if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
+		ceph_do_invalidate_pages(inode);
+
+	if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
+		__ceph_do_pending_vmtruncate(inode);
+
+	iput(inode);
+}
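
Collapsing the three workers into one works because each producer records its job in i_work_mask before queueing the (single) work item, and the worker consumes the bits with test_and_clear_bit(); one queued execution can therefore service several coalesced requests. The core of that pattern, sketched with C11 atomics (the kernel clears one bit at a time rather than swapping the whole mask):

    #include <stdatomic.h>

    enum { WORK_WRITEBACK = 1u << 0, WORK_INVALIDATE = 1u << 1 };

    static atomic_uint pending;

    /* Producer: record the job, then queue the single work item.
     * Re-queueing is idempotent, so concurrent producers coalesce. */
    static void request(unsigned int bit)
    {
            atomic_fetch_or(&pending, bit);
            /* the queue_work(wq, &work) equivalent goes here */
    }

    /* Worker: service every job recorded before (or while) we ran. */
    static void worker(void)
    {
            unsigned int bits = atomic_exchange(&pending, 0);

            if (bits & WORK_WRITEBACK) {
                    /* write back dirty pages */
            }
            if (bits & WORK_INVALIDATE) {
                    /* invalidate the page cache */
            }
    }
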
+
 /*
  * symlinks
  */
@@ -1945,7 +1990,7 @@
 int __ceph_setattr(struct inode *inode, struct iattr *attr)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	const unsigned int ia_valid = attr->ia_valid;
+	unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_cap_flush *prealloc_cf;
@@ -2050,6 +2095,26 @@
 				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
 		}
 	}
+	if (ia_valid & ATTR_SIZE) {
+		dout("setattr %p size %lld -> %lld\n", inode,
+		     inode->i_size, attr->ia_size);
+		if ((issued & CEPH_CAP_FILE_EXCL) &&
+		    attr->ia_size > inode->i_size) {
+			i_size_write(inode, attr->ia_size);
+			inode->i_blocks = calc_inode_blocks(attr->ia_size);
+			ci->i_reported_size = attr->ia_size;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+			ia_valid |= ATTR_MTIME;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != inode->i_size) {
+			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+			req->r_args.setattr.old_size =
+				cpu_to_le64(inode->i_size);
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+		}
+	}
 	if (ia_valid & ATTR_MTIME) {
 		dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
 		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
@@ -2072,25 +2137,6 @@
 				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
 		}
 	}
-	if (ia_valid & ATTR_SIZE) {
-		dout("setattr %p size %lld -> %lld\n", inode,
-		     inode->i_size, attr->ia_size);
-		if ((issued & CEPH_CAP_FILE_EXCL) &&
-		    attr->ia_size > inode->i_size) {
-			i_size_write(inode, attr->ia_size);
-			inode->i_blocks = calc_inode_blocks(attr->ia_size);
-			ci->i_reported_size = attr->ia_size;
-			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   attr->ia_size != inode->i_size) {
-			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-			req->r_args.setattr.old_size =
-				cpu_to_le64(inode->i_size);
-			mask |= CEPH_SETATTR_SIZE;
-			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
-				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
-		}
-	}
 
 	/* these do nothing */
 	if (ia_valid & ATTR_CTIME) {
@@ -2255,42 +2301,82 @@
 	return err;
 }
 
+/* Craft a mask of needed caps given a set of requested statx attrs. */
+static int statx_to_caps(u32 want)
+{
+	int mask = 0;
+
+	if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
+		mask |= CEPH_CAP_AUTH_SHARED;
+
+	if (want & (STATX_NLINK|STATX_CTIME))
+		mask |= CEPH_CAP_LINK_SHARED;
+
+	if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
+		    STATX_BLOCKS))
+		mask |= CEPH_CAP_FILE_SHARED;
+
+	if (want & (STATX_CTIME))
+		mask |= CEPH_CAP_XATTR_SHARED;
+
+	return mask;
+}
+
 /*
- * Get all attributes.  Hopefully somedata we'll have a statlite()
- * and can limit the fields we require to be accurate.
+ * Get all the attributes. If we have sufficient caps for the requested attrs,
+ * then we can avoid talking to the MDS at all.
  */
 int ceph_getattr(const struct path *path, struct kstat *stat,
 		 u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int err;
+	u32 valid_mask = STATX_BASIC_STATS;
+	int err = 0;
 
-	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
-	if (!err) {
-		generic_fillattr(inode, stat);
-		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
-		if (ceph_snap(inode) != CEPH_NOSNAP)
-			stat->dev = ceph_snap(inode);
-		else
-			stat->dev = 0;
-		if (S_ISDIR(inode->i_mode)) {
-			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
-						RBYTES))
-				stat->size = ci->i_rbytes;
-			else
-				stat->size = ci->i_files + ci->i_subdirs;
-			stat->blocks = 0;
-			stat->blksize = 65536;
-			/*
-			 * Some applications rely on the number of st_nlink
-			 * value on directories to be either 0 (if unlinked)
-			 * or 2 + number of subdirectories.
-			 */
-			if (stat->nlink == 1)
-				/* '.' + '..' + subdirs */
-				stat->nlink = 1 + 1 + ci->i_subdirs;
-		}
+	/* Skip the getattr altogether if we're asked not to sync */
+	if (!(flags & AT_STATX_DONT_SYNC)) {
+		err = ceph_do_getattr(inode, statx_to_caps(request_mask),
+				      flags & AT_STATX_FORCE_SYNC);
+		if (err)
+			return err;
 	}
+
+	generic_fillattr(inode, stat);
+	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+
+	/*
+	 * btime on newly-allocated inodes is 0, so if this is still set to
+	 * that, then assume that it's not valid.
+	 */
+	if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
+		stat->btime = ci->i_btime;
+		valid_mask |= STATX_BTIME;
+	}
+
+	if (ceph_snap(inode) == CEPH_NOSNAP)
+		stat->dev = inode->i_sb->s_dev;
+	else
+		stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+					RBYTES))
+			stat->size = ci->i_rbytes;
+		else
+			stat->size = ci->i_files + ci->i_subdirs;
+		stat->blocks = 0;
+		stat->blksize = 65536;
+		/*
+		 * Some applications rely on the number of st_nlink
+		 * value on directories to be either 0 (if unlinked)
+		 * or 2 + number of subdirectories.
+		 */
+		if (stat->nlink == 1)
+			/* '.' + '..' + subdirs */
+			stat->nlink = 1 + 1 + ci->i_subdirs;
+	}
+
+	stat->result_mask = request_mask & valid_mask;
 	return err;
 }
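
The practical effect for userspace: a statx(2) call that passes AT_STATX_DONT_SYNC (or whose requested fields map onto caps the client already holds) is answered from the local inode without an MDS round trip. A small caller showing the flag and the STATX_BTIME validity check (path hypothetical):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct statx stx;

            /* DONT_SYNC lets ceph_getattr() skip ceph_do_getattr() and
             * answer purely from cached attributes. */
            if (statx(AT_FDCWD, "/mnt/cephfs/file", AT_STATX_DONT_SYNC,
                      STATX_SIZE | STATX_BTIME, &stx) < 0) {
                    perror("statx");
                    return 1;
            }
            printf("size: %llu\n", (unsigned long long)stx.stx_size);
            /* btime is only reported when the server supplied one */
            if (stx.stx_mask & STATX_BTIME)
                    printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
            return 0;
    }
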
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 0000000..97602ea
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ * Copyright (c) 2019 Jeff Layton
+ *
+ * I/O and data path helper functionality.
+ *
+ * Heavily borrowed from equivalent code in fs/nfs/io.c
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+
+#include "super.h"
+#include "io.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		inode_dio_wait(inode);
+	}
+}
+
+/**
+ * ceph_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+ceph_start_io_read(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_read(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
+
+/**
+ * ceph_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+ceph_start_io_write(struct inode *inode)
+{
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ceph_inode(inode), inode);
+}
+
+/**
+ * ceph_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_write(struct inode *inode)
+{
+	up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags |= CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		/* FIXME: unmap_mapping_range? */
+		filemap_write_and_wait(inode->i_mapping);
+	}
+}
+
+/**
+ * ceph_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+ceph_start_io_direct(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_buffered(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_direct(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
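
The four start/end helpers implement a two-mode exclusion scheme on one rwsem: I/O of the currently-active mode shares the lock, while a mode switch briefly takes it exclusively to flip CEPH_I_ODIRECT and drain the other side, then downgrades. Since POSIX rwlocks have no downgrade_write(), a userspace rendering of the read-side fast path has to retry instead (a sketch; names are invented):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static pthread_rwlock_t rw = PTHREAD_RWLOCK_INITIALIZER;
    static atomic_bool odirect_mode;

    /* Enter buffered-read mode, mirroring ceph_start_io_read(). */
    static void start_io_read(void)
    {
            for (;;) {
                    pthread_rwlock_rdlock(&rw);     /* be an optimist */
                    if (!atomic_load(&odirect_mode))
                            return;                 /* fast path */
                    pthread_rwlock_unlock(&rw);
                    pthread_rwlock_wrlock(&rw);     /* slow path */
                    atomic_store(&odirect_mode, false);
                    /* the inode_dio_wait() equivalent would run here */
                    pthread_rwlock_unlock(&rw);
                    /* no downgrade in POSIX: retry the fast path */
            }
    }

    static void end_io_read(void)
    {
            pthread_rwlock_unlock(&rw);
    }
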
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 0000000..fa594cd
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_IO_H
+#define _FS_CEPH_IO_H
+
+void ceph_start_io_read(struct inode *inode);
+void ceph_end_io_read(struct inode *inode);
+void ceph_start_io_write(struct inode *inode);
+void ceph_end_io_write(struct inode *inode);
+void ceph_start_io_direct(struct inode *inode);
+void ceph_end_io_direct(struct inode *inode);
+
+#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 9dae2ec..544e9e8 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,14 +32,18 @@
 
 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
 {
-	struct inode *inode = file_inode(src->fl_file);
+	struct ceph_file_info *fi = dst->fl_file->private_data;
+	struct inode *inode = file_inode(dst->fl_file);
 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+	atomic_inc(&fi->num_locks);
 }
 
 static void ceph_fl_release_lock(struct file_lock *fl)
 {
+	struct ceph_file_info *fi = fl->fl_file->private_data;
 	struct inode *inode = file_inode(fl->fl_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	atomic_dec(&fi->num_locks);
 	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
 		/* clear error when all locks are released */
 		spin_lock(&ci->i_ceph_lock);
@@ -73,7 +77,7 @@
 		 * window. Caller function will decrease the counter.
 		 */
 		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+		fl->fl_ops->fl_copy_lock(fl, NULL);
 	}
 
 	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
@@ -111,8 +115,7 @@
 		req->r_wait_for_completion = ceph_lock_wait_for_completion;
 
 	err = ceph_mdsc_do_request(mdsc, inode, req);
-
-	if (operation == CEPH_MDS_OP_GETFILELOCK) {
+	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
 		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
 			fl->fl_type = F_RDLCK;
@@ -237,15 +240,6 @@
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 		err = -EIO;
-	} else if (op == CEPH_MDS_OP_SETFILELOCK) {
-		/*
-		 * increasing i_filelock_ref closes race window between
-		 * handling request reply and adding file_lock struct to
-		 * inode. Otherwise, i_auth_cap may get trimmed in the
-		 * window. Caller function will decrease the counter.
-		 */
-		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ci->i_filelock_ref);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
@@ -299,10 +293,6 @@
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 		err = -EIO;
-	} else {
-		/* see comment in ceph_lock */
-		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ci->i_filelock_ref);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bc43c82..a516329 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -20,6 +20,8 @@
 #include <linux/ceph/auth.h>
 #include <linux/ceph/debugfs.h>
 
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
  * managing the file system namespace (the directory hierarchy and
@@ -46,13 +48,17 @@
  */
 
 struct ceph_reconnect_state {
-	int nr_caps;
+	struct ceph_mds_session *session;
+	int nr_caps, nr_realms;
 	struct ceph_pagelist *pagelist;
 	unsigned msg_version;
+	bool allow_multi;
 };
 
 static void __wake_requests(struct ceph_mds_client *mdsc,
 			    struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
 
 static const struct ceph_connection_operations mds_con_ops;
 
@@ -61,6 +67,29 @@
  * mds reply parsing
  */
 
+static int parse_reply_info_quota(void **p, void *end,
+				  struct ceph_mds_reply_info_in *info)
+{
+	u8 struct_v, struct_compat;
+	u32 struct_len;
+
+	ceph_decode_8_safe(p, end, struct_v, bad);
+	ceph_decode_8_safe(p, end, struct_compat, bad);
+	/* struct_v is expected to be >= 1. we only
+	 * understand encoding with struct_compat == 1. */
+	if (!struct_v || struct_compat != 1)
+		goto bad;
+	ceph_decode_32_safe(p, end, struct_len, bad);
+	ceph_decode_need(p, end, struct_len, bad);
+	end = *p + struct_len;
+	ceph_decode_64_safe(p, end, info->max_bytes, bad);
+	ceph_decode_64_safe(p, end, info->max_files, bad);
+	*p = end;
+	return 0;
+bad:
+	return -EIO;
+}
+
 /*
  * parse individual inode info
  */
@@ -68,8 +97,24 @@
 			       struct ceph_mds_reply_info_in *info,
 			       u64 features)
 {
-	int err = -EIO;
+	int err = 0;
+	u8 struct_v = 0;
 
+	if (features == (u64)-1) {
+		u32 struct_len;
+		u8 struct_compat;
+		ceph_decode_8_safe(p, end, struct_v, bad);
+		ceph_decode_8_safe(p, end, struct_compat, bad);
+		/* struct_v is expected to be >= 1. we only understand
+		 * encoding with struct_compat == 1. */
+		if (!struct_v || struct_compat != 1)
+			goto bad;
+		ceph_decode_32_safe(p, end, struct_len, bad);
+		ceph_decode_need(p, end, struct_len, bad);
+		end = *p + struct_len;
+	}
+
+	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
 	info->in = *p;
 	*p += sizeof(struct ceph_mds_reply_inode) +
 		sizeof(*info->in->fragtree.splits) *
@@ -80,62 +125,160 @@
 	info->symlink = *p;
 	*p += info->symlink_len;
 
-	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
-		ceph_decode_copy_safe(p, end, &info->dir_layout,
-				      sizeof(info->dir_layout), bad);
-	else
-		memset(&info->dir_layout, 0, sizeof(info->dir_layout));
-
+	ceph_decode_copy_safe(p, end, &info->dir_layout,
+			      sizeof(info->dir_layout), bad);
 	ceph_decode_32_safe(p, end, info->xattr_len, bad);
 	ceph_decode_need(p, end, info->xattr_len, bad);
 	info->xattr_data = *p;
 	*p += info->xattr_len;
 
-	if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+	if (features == (u64)-1) {
+		/* inline data */
 		ceph_decode_64_safe(p, end, info->inline_version, bad);
 		ceph_decode_32_safe(p, end, info->inline_len, bad);
 		ceph_decode_need(p, end, info->inline_len, bad);
 		info->inline_data = *p;
 		*p += info->inline_len;
-	} else
-		info->inline_version = CEPH_INLINE_NONE;
-
-	if (features & CEPH_FEATURE_MDS_QUOTA) {
-		u8 struct_v, struct_compat;
-		u32 struct_len;
-
-		/*
-		 * both struct_v and struct_compat are expected to be >= 1
-		 */
-		ceph_decode_8_safe(p, end, struct_v, bad);
-		ceph_decode_8_safe(p, end, struct_compat, bad);
-		if (!struct_v || !struct_compat)
-			goto bad;
-		ceph_decode_32_safe(p, end, struct_len, bad);
-		ceph_decode_need(p, end, struct_len, bad);
-		ceph_decode_64_safe(p, end, info->max_bytes, bad);
-		ceph_decode_64_safe(p, end, info->max_files, bad);
-	} else {
-		info->max_bytes = 0;
-		info->max_files = 0;
-	}
-
-	info->pool_ns_len = 0;
-	info->pool_ns_data = NULL;
-	if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+		/* quota */
+		err = parse_reply_info_quota(p, end, info);
+		if (err < 0)
+			goto out_bad;
+		/* pool namespace */
 		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
 		if (info->pool_ns_len > 0) {
 			ceph_decode_need(p, end, info->pool_ns_len, bad);
 			info->pool_ns_data = *p;
 			*p += info->pool_ns_len;
 		}
-	}
 
+		/* btime */
+		ceph_decode_need(p, end, sizeof(info->btime), bad);
+		ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+
+		/* change attribute */
+		ceph_decode_64_safe(p, end, info->change_attr, bad);
+
+		/* dir pin */
+		if (struct_v >= 2) {
+			ceph_decode_32_safe(p, end, info->dir_pin, bad);
+		} else {
+			info->dir_pin = -ENODATA;
+		}
+
+		/* snapshot birth time, remains zero for v<=2 */
+		if (struct_v >= 3) {
+			ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
+			ceph_decode_copy(p, &info->snap_btime,
+					 sizeof(info->snap_btime));
+		} else {
+			memset(&info->snap_btime, 0, sizeof(info->snap_btime));
+		}
+
+		*p = end;
+	} else {
+		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+			ceph_decode_64_safe(p, end, info->inline_version, bad);
+			ceph_decode_32_safe(p, end, info->inline_len, bad);
+			ceph_decode_need(p, end, info->inline_len, bad);
+			info->inline_data = *p;
+			*p += info->inline_len;
+		} else
+			info->inline_version = CEPH_INLINE_NONE;
+
+		if (features & CEPH_FEATURE_MDS_QUOTA) {
+			err = parse_reply_info_quota(p, end, info);
+			if (err < 0)
+				goto out_bad;
+		} else {
+			info->max_bytes = 0;
+			info->max_files = 0;
+		}
+
+		info->pool_ns_len = 0;
+		info->pool_ns_data = NULL;
+		if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+			ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+			if (info->pool_ns_len > 0) {
+				ceph_decode_need(p, end, info->pool_ns_len, bad);
+				info->pool_ns_data = *p;
+				*p += info->pool_ns_len;
+			}
+		}
+
+		if (features & CEPH_FEATURE_FS_BTIME) {
+			ceph_decode_need(p, end, sizeof(info->btime), bad);
+			ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+			ceph_decode_64_safe(p, end, info->change_attr, bad);
+		}
+
+		info->dir_pin = -ENODATA;
+		/* info->snap_btime remains zero */
+	}
 	return 0;
 bad:
+	err = -EIO;
+out_bad:
 	return err;
 }
 
+static int parse_reply_info_dir(void **p, void *end,
+				struct ceph_mds_reply_dirfrag **dirfrag,
+				u64 features)
+{
+	if (features == (u64)-1) {
+		u8 struct_v, struct_compat;
+		u32 struct_len;
+		ceph_decode_8_safe(p, end, struct_v, bad);
+		ceph_decode_8_safe(p, end, struct_compat, bad);
+		/* struct_v is expected to be >= 1. we only understand
+		 * encoding with struct_compat == 1. */
+		if (!struct_v || struct_compat != 1)
+			goto bad;
+		ceph_decode_32_safe(p, end, struct_len, bad);
+		ceph_decode_need(p, end, struct_len, bad);
+		end = *p + struct_len;
+	}
+
+	ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+	*dirfrag = *p;
+	*p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+	if (unlikely(*p > end))
+		goto bad;
+	if (features == (u64)-1)
+		*p = end;
+	return 0;
+bad:
+	return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+				  struct ceph_mds_reply_lease **lease,
+				  u64 features)
+{
+	if (features == (u64)-1) {
+		u8 struct_v, struct_compat;
+		u32 struct_len;
+		ceph_decode_8_safe(p, end, struct_v, bad);
+		ceph_decode_8_safe(p, end, struct_compat, bad);
+		/* struct_v is expected to be >= 1. we only understand
+		 * encoding with struct_compat == 1. */
+		if (!struct_v || struct_compat != 1)
+			goto bad;
+		ceph_decode_32_safe(p, end, struct_len, bad);
+		ceph_decode_need(p, end, struct_len, bad);
+		end = *p + struct_len;
+	}
+
+	ceph_decode_need(p, end, sizeof(**lease), bad);
+	*lease = *p;
+	*p += sizeof(**lease);
+	if (features == (u64)-1)
+		*p = end;
+	return 0;
+bad:
+	return -EIO;
+}
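
All three helpers above decode the same feature-bit-free envelope, {u8 version, u8 compat, u32 length, payload}, then snap *p to the encoded end so fields added by newer MDS versions are skipped transparently. A standalone sketch of that decode in plain C (assumes a little-endian host; the kernel goes through ceph_decode_*/le32_to_cpu instead):

    #include <stdint.h>
    #include <string.h>

    /* Decode a {u8 ver, u8 compat, u32 len, payload...} envelope.
     * Returns the payload length and sets *payload, advancing *p past
     * the whole envelope -- including unknown trailing fields -- or
     * returns -1 on a short buffer or unsupported compat version. */
    static long decode_envelope(const uint8_t **p, const uint8_t *end,
                                const uint8_t **payload)
    {
            uint32_t len;

            if (end - *p < 6)
                    return -1;
            if ((*p)[0] == 0 || (*p)[1] != 1)  /* ver >= 1, compat == 1 */
                    return -1;
            memcpy(&len, *p + 2, sizeof(len)); /* wire format is LE */
            if ((uint64_t)(end - *p) - 6 < len)
                    return -1;
            *payload = *p + 6;
            *p += 6 + (size_t)len;
            return (long)len;
    }
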
+
 /*
  * parse a normal reply, which may contain a (dir+)dentry and/or a
  * target inode.
@@ -151,20 +294,18 @@
 		if (err < 0)
 			goto out_bad;
 
-		if (unlikely(*p + sizeof(*info->dirfrag) > end))
-			goto bad;
-		info->dirfrag = *p;
-		*p += sizeof(*info->dirfrag) +
-			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
-		if (unlikely(*p > end))
-			goto bad;
+		err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+		if (err < 0)
+			goto out_bad;
 
 		ceph_decode_32_safe(p, end, info->dname_len, bad);
 		ceph_decode_need(p, end, info->dname_len, bad);
 		info->dname = *p;
 		*p += info->dname_len;
-		info->dlease = *p;
-		*p += sizeof(*info->dlease);
+
+		err = parse_reply_info_lease(p, end, &info->dlease, features);
+		if (err < 0)
+			goto out_bad;
 	}
 
 	if (info->head->is_target) {
@@ -187,20 +328,16 @@
 /*
  * parse readdir results
  */
-static int parse_reply_info_dir(void **p, void *end,
+static int parse_reply_info_readdir(void **p, void *end,
 				struct ceph_mds_reply_info_parsed *info,
 				u64 features)
 {
 	u32 num, i = 0;
 	int err;
 
-	info->dir_dir = *p;
-	if (*p + sizeof(*info->dir_dir) > end)
-		goto bad;
-	*p += sizeof(*info->dir_dir) +
-		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
-	if (*p > end)
-		goto bad;
+	err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+	if (err < 0)
+		goto out_bad;
 
 	ceph_decode_need(p, end, sizeof(num) + 2, bad);
 	num = ceph_decode_32(p);
@@ -226,15 +363,16 @@
 	while (num) {
 		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
 		/* dentry */
-		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		rde->name_len = ceph_decode_32(p);
+		ceph_decode_32_safe(p, end, rde->name_len, bad);
 		ceph_decode_need(p, end, rde->name_len, bad);
 		rde->name = *p;
 		*p += rde->name_len;
 		dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
-		rde->lease = *p;
-		*p += sizeof(struct ceph_mds_reply_lease);
 
+		/* dentry lease */
+		err = parse_reply_info_lease(p, end, &rde->lease, features);
+		if (err)
+			goto out_bad;
 		/* inode */
 		err = parse_reply_info_in(p, end, &rde->inode, features);
 		if (err < 0)
@@ -246,8 +384,8 @@
 	}
 
 done:
-	if (*p != end)
-		goto bad;
+	/* Skip over any unrecognized fields */
+	*p = end;
 	return 0;
 
 bad:
@@ -268,12 +406,10 @@
 		goto bad;
 
 	info->filelock_reply = *p;
-	*p += sizeof(*info->filelock_reply);
 
-	if (unlikely(*p != end))
-		goto bad;
+	/* Skip over any unrecognized fields */
+	*p = end;
 	return 0;
-
 bad:
 	return -EIO;
 }
@@ -285,19 +421,23 @@
 				  struct ceph_mds_reply_info_parsed *info,
 				  u64 features)
 {
-	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+	if (features == (u64)-1 ||
+	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
+		/* Malformed reply? */
 		if (*p == end) {
 			info->has_create_ino = false;
 		} else {
 			info->has_create_ino = true;
-			info->ino = ceph_decode_64(p);
+			ceph_decode_64_safe(p, end, info->ino, bad);
 		}
+	} else {
+		if (*p != end)
+			goto bad;
 	}
 
-	if (unlikely(*p != end))
-		goto bad;
+	/* Skip over any unrecognized fields */
+	*p = end;
 	return 0;
-
 bad:
 	return -EIO;
 }
@@ -314,7 +454,7 @@
 	if (op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
 	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
-		return parse_reply_info_dir(p, end, info, features);
+		return parse_reply_info_readdir(p, end, info, features);
 	else if (op == CEPH_MDS_OP_CREATE)
 		return parse_reply_info_create(p, end, info, features);
 	else
@@ -426,15 +566,9 @@
 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
 						   int mds)
 {
-	struct ceph_mds_session *session;
-
 	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
 		return NULL;
-	session = mdsc->sessions[mds];
-	dout("lookup_mds_session %p %d\n", session,
-	     refcount_read(&session->s_ref));
-	get_session(session);
-	return session;
+	return get_session(mdsc->sessions[mds]);
 }
 
 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -498,7 +632,7 @@
 	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
 
 	spin_lock_init(&s->s_gen_ttl_lock);
-	s->s_cap_gen = 0;
+	s->s_cap_gen = 1;
 	s->s_cap_ttl = jiffies - 1;
 
 	spin_lock_init(&s->s_cap_lock);
@@ -506,7 +640,6 @@
 	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
 	s->s_nr_caps = 0;
-	s->s_trim_caps = 0;
 	refcount_set(&s->s_ref, 1);
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
@@ -514,6 +647,8 @@
 	s->s_cap_reconnect = 0;
 	s->s_cap_iterator = NULL;
 	INIT_LIST_HEAD(&s->s_cap_releases);
+	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+
 	INIT_LIST_HEAD(&s->s_cap_flushing);
 
 	mdsc->sessions[mds] = s;
@@ -539,6 +674,7 @@
 	dout("__unregister_session mds%d %p\n", s->s_mds, s);
 	BUG_ON(mdsc->sessions[s->s_mds] != s);
 	mdsc->sessions[s->s_mds] = NULL;
+	s->s_state = 0;
 	ceph_con_close(&s->s_con);
 	ceph_put_mds_session(s);
 	atomic_dec(&mdsc->num_sessions);
@@ -569,11 +705,12 @@
 		ceph_msg_put(req->r_reply);
 	if (req->r_inode) {
 		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-		iput(req->r_inode);
+		/* avoid calling iput_final() in mds dispatch threads */
+		ceph_async_iput(req->r_inode);
 	}
 	if (req->r_parent)
 		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
-	iput(req->r_target_inode);
+	ceph_async_iput(req->r_target_inode);
 	if (req->r_dentry)
 		dput(req->r_dentry);
 	if (req->r_old_dentry)
@@ -587,7 +724,7 @@
 		 */
 		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
-		iput(req->r_old_dentry_dir);
+		ceph_async_iput(req->r_old_dentry_dir);
 	}
 	kfree(req->r_path1);
 	kfree(req->r_path2);
@@ -595,6 +732,7 @@
 		ceph_pagelist_release(req->r_pagelist);
 	put_request_session(req);
 	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
+	WARN_ON_ONCE(!list_empty(&req->r_wait));
 	kfree(req);
 }
 
@@ -697,7 +835,8 @@
 	}
 
 	if (req->r_unsafe_dir) {
-		iput(req->r_unsafe_dir);
+		/* avoid calling iput_final() in mds dispatch threads */
+		ceph_async_iput(req->r_unsafe_dir);
 		req->r_unsafe_dir = NULL;
 	}
 
@@ -780,7 +919,7 @@
 		struct inode *dir;
 
 		rcu_read_lock();
-		parent = req->r_dentry->d_parent;
+		parent = READ_ONCE(req->r_dentry->d_parent);
 		dir = req->r_parent ? : d_inode_rcu(parent);
 
 		if (!dir || dir->i_sb != mdsc->fsc->sb) {
@@ -862,7 +1001,7 @@
 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
 	if (!cap) {
 		spin_unlock(&ci->i_ceph_lock);
-		iput(inode);
+		ceph_async_iput(inode);
 		goto random;
 	}
 	mds = cap->session->s_mds;
@@ -871,7 +1010,9 @@
 	     cap == ci->i_auth_cap ? "auth " : "", cap);
 	spin_unlock(&ci->i_ceph_lock);
 out:
-	iput(inode);
+	/* avoid calling iput_final() while holding mdsc->mutex or
+	 * in mds dispatch threads */
+	ceph_async_iput(inode);
 	return mds;
 
 random:
@@ -1129,6 +1270,7 @@
 {
 	struct ceph_mds_request *req;
 	struct rb_node *p;
+	struct ceph_inode_info *ci;
 
 	dout("cleanup_session_requests mds%d\n", session->s_mds);
 	mutex_lock(&mdsc->mutex);
@@ -1137,6 +1279,16 @@
 				       struct ceph_mds_request, r_unsafe_item);
 		pr_warn_ratelimited(" dropping unsafe request %llu\n",
 				    req->r_tid);
+		if (req->r_target_inode) {
+			/* dropping unsafe change of inode's attributes */
+			ci = ceph_inode(req->r_target_inode);
+			errseq_set(&ci->i_meta_err, -EIO);
+		}
+		if (req->r_unsafe_dir) {
+			/* dropping unsafe directory operation */
+			ci = ceph_inode(req->r_unsafe_dir);
+			errseq_set(&ci->i_meta_err, -EIO);
+		}
 		__unregister_request(mdsc, req);
 	}
 	/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1157,9 +1309,9 @@
  *
  * Caller must hold session s_mutex.
  */
-static int iterate_session_caps(struct ceph_mds_session *session,
-				 int (*cb)(struct inode *, struct ceph_cap *,
-					    void *), void *arg)
+int ceph_iterate_session_caps(struct ceph_mds_session *session,
+			      int (*cb)(struct inode *, struct ceph_cap *,
+					void *), void *arg)
 {
 	struct list_head *p;
 	struct ceph_cap *cap;
@@ -1181,7 +1333,9 @@
 		spin_unlock(&session->s_cap_lock);
 
 		if (last_inode) {
-			iput(last_inode);
+			/* avoid calling iput_final() while holding
+			 * s_mutex or in mds dispatch threads */
+			ceph_async_iput(last_inode);
 			last_inode = NULL;
 		}
 		if (old_cap) {
@@ -1201,13 +1355,10 @@
 			cap->session = NULL;
 			list_del_init(&cap->session_caps);
 			session->s_nr_caps--;
-			if (cap->queue_release) {
-				list_add_tail(&cap->session_caps,
-					      &session->s_cap_releases);
-				session->s_num_cap_releases++;
-			} else {
+			if (cap->queue_release)
+				__ceph_queue_cap_release(session, cap);
+			else
 				old_cap = cap;  /* put_cap it w/o locks held */
-			}
 		}
 		if (ret < 0)
 			goto out;
@@ -1217,7 +1368,7 @@
 	session->s_cap_iterator = NULL;
 	spin_unlock(&session->s_cap_lock);
 
-	iput(last_inode);
+	ceph_async_iput(last_inode);
 	if (old_cap)
 		ceph_put_cap(session->s_mdsc, old_cap);
 
@@ -1230,22 +1381,25 @@
 	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	LIST_HEAD(to_remove);
-	bool drop = false;
+	bool dirty_dropped = false;
 	bool invalidate = false;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
+	if (cap->mds_wanted | cap->issued)
+		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
 		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc = fsc->mdsc;
 
-		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
-
-		if (ci->i_wrbuffer_ref > 0 &&
-		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
-			invalidate = true;
+		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			if (inode->i_data.nrpages > 0)
+				invalidate = true;
+			if (ci->i_wrbuffer_ref > 0)
+				mapping_set_error(&inode->i_data, -EIO);
+		}
 
 		while (!list_empty(&ci->i_cap_flush_list)) {
 			cf = list_first_entry(&ci->i_cap_flush_list,
@@ -1265,7 +1419,7 @@
 				inode, ceph_ino(inode));
 			ci->i_dirty_caps = 0;
 			list_del_init(&ci->i_dirty_item);
-			drop = true;
+			dirty_dropped = true;
 		}
 		if (!list_empty(&ci->i_flushing_item)) {
 			pr_warn_ratelimited(
@@ -1275,10 +1429,22 @@
 			ci->i_flushing_caps = 0;
 			list_del_init(&ci->i_flushing_item);
 			mdsc->num_cap_flushing--;
-			drop = true;
+			dirty_dropped = true;
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 
+		if (dirty_dropped) {
+			errseq_set(&ci->i_meta_err, -EIO);
+
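+			/* no writers or dirty caps remain: drop head snap context */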
+			if (ci->i_wrbuffer_ref_head == 0 &&
+			    ci->i_wr_ref == 0 &&
+			    ci->i_dirty_caps == 0 &&
+			    ci->i_flushing_caps == 0) {
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
+		}
+
 		if (atomic_read(&ci->i_filelock_ref) > 0) {
 			/* make further file lock syscall return -EIO */
 			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
@@ -1303,7 +1469,7 @@
 	wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
-	if (drop)
+	if (dirty_dropped)
 		iput(inode);
 	return 0;
 }
@@ -1318,7 +1484,7 @@
 	LIST_HEAD(dispose);
 
 	dout("remove_session_caps on %p\n", session);
-	iterate_session_caps(session, remove_session_caps_cb, fsc);
+	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
 
 	wake_up_all(&fsc->mdsc->cap_flushing_wq);
 
@@ -1344,7 +1510,8 @@
 			spin_unlock(&session->s_cap_lock);
 
 			inode = ceph_find_inode(sb, vino);
-			iput(inode);
+			/* avoid calling iput_final() while holding s_mutex */
+			ceph_async_iput(inode);
 
 			spin_lock(&session->s_cap_lock);
 		}
@@ -1359,6 +1526,12 @@
 	dispose_cap_releases(session->s_mdsc, &dispose);
 }
 
+enum {
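+	/* passed to wake_up_session_caps() through the iterator argument */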
+	RECONNECT,
+	RENEWCAPS,
+	FORCE_RO,
+};
+
 /*
  * wake up any threads waiting on this session's caps.  if the cap is
  * old (didn't get renewed on the client reconnect), remove it now.
@@ -1369,23 +1542,34 @@
 			      void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned long ev = (unsigned long)arg;
 
-	if (arg) {
+	if (ev == RECONNECT) {
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_wanted_max_size = 0;
 		ci->i_requested_max_size = 0;
 		spin_unlock(&ci->i_ceph_lock);
+	} else if (ev == RENEWCAPS) {
+		if (cap->cap_gen < cap->session->s_cap_gen) {
+			/* mds did not re-issue stale cap */
+			spin_lock(&ci->i_ceph_lock);
+			cap->issued = cap->implemented = CEPH_CAP_PIN;
+			/* make sure mds knows what we want */
+			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
+				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+			spin_unlock(&ci->i_ceph_lock);
+		}
+	} else if (ev == FORCE_RO) {
 	}
 	wake_up_all(&ci->i_cap_wq);
 	return 0;
 }
 
-static void wake_up_session_caps(struct ceph_mds_session *session,
-				 int reconnect)
+static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
 {
 	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
-	iterate_session_caps(session, wake_up_session_cb,
-			     (void *)(unsigned long)reconnect);
+	ceph_iterate_session_caps(session, wake_up_session_cb,
+				  (void *)(unsigned long)ev);
 }
 
 /*
@@ -1470,7 +1654,7 @@
 	spin_unlock(&session->s_cap_lock);
 
 	if (wake)
-		wake_up_session_caps(session, 0);
+		wake_up_session_caps(session, RENEWCAPS);
 }
 
 /*
@@ -1538,11 +1722,11 @@
  */
 static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 {
-	struct ceph_mds_session *session = arg;
+	int *remaining = arg;
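+	/* countdown lives on the caller's stack, not in the session */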
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int used, wanted, oissued, mine;
 
-	if (session->s_trim_caps <= 0)
+	if (*remaining <= 0)
 		return -1;
 
 	spin_lock(&ci->i_ceph_lock);
@@ -1579,7 +1763,7 @@
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
 		__ceph_remove_cap(cap, true);
-		session->s_trim_caps--;
+		(*remaining)--;
 	} else {
 		struct dentry *dentry;
 		/* try dropping referring dentries */
@@ -1591,7 +1775,7 @@
 			d_prune_aliases(inode);
 			count = atomic_read(&inode->i_count);
 			if (count == 1)
-				session->s_trim_caps--;
+				(*remaining)--;
 			dout("trim_caps_cb %p cap %p pruned, count now %d\n",
 			     inode, cap, count);
 		} else {
@@ -1617,15 +1801,15 @@
 	dout("trim_caps mds%d start: %d / %d, trim %d\n",
 	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
 	if (trim_caps > 0) {
-		session->s_trim_caps = trim_caps;
-		iterate_session_caps(session, trim_caps_cb, session);
+		int remaining = trim_caps;
+
+		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
 		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
 		     session->s_mds, session->s_nr_caps, max_caps,
-			trim_caps - session->s_trim_caps);
-		session->s_trim_caps = 0;
+		     trim_caps - remaining);
 	}
 
-	ceph_send_cap_releases(mdsc, session);
+	ceph_flush_cap_releases(mdsc, session);
 	return 0;
 }
 
@@ -1668,8 +1852,8 @@
 /*
  * called under s_mutex
  */
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
-			    struct ceph_mds_session *session)
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
 {
 	struct ceph_msg *msg = NULL;
 	struct ceph_mds_cap_release *head;
@@ -1711,7 +1895,8 @@
 		num_cap_releases--;
 
 		head = msg->front.iov_base;
-		le32_add_cpu(&head->num, 1);
+		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
+				   &head->num);
 		item = msg->front.iov_base + msg->front.iov_len;
 		item->ino = cpu_to_le64(cap->cap_ino);
 		item->cap_id = cpu_to_le64(cap->cap_id);
@@ -1761,6 +1946,81 @@
 	spin_unlock(&session->s_cap_lock);
 }
 
+static void ceph_cap_release_work(struct work_struct *work)
+{
+	struct ceph_mds_session *session =
+		container_of(work, struct ceph_mds_session, s_cap_release_work);
+
+	mutex_lock(&session->s_mutex);
+	if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+	    session->s_state == CEPH_MDS_SESSION_HUNG)
+		ceph_send_cap_releases(session->s_mdsc, session);
+	mutex_unlock(&session->s_mutex);
+	ceph_put_mds_session(session);
+}
+
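+/*
+ * Queue this session's pending cap releases on the fs client's cap
+ * workqueue.  A session reference is taken for the work item and is
+ * dropped by ceph_cap_release_work(), or here if queueing fails.
+ */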
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session)
+{
+	if (mdsc->stopping)
+		return;
+
+	get_session(session);
+	if (queue_work(mdsc->fsc->cap_wq,
+		       &session->s_cap_release_work)) {
+		dout("cap release work queued\n");
+	} else {
+		ceph_put_mds_session(session);
+		dout("failed to queue cap release work\n");
+	}
+}
+
+/*
+ * caller holds session->s_cap_lock
+ */
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+			      struct ceph_cap *cap)
+{
+	list_add_tail(&cap->session_caps, &session->s_cap_releases);
+	session->s_num_cap_releases++;
+
+	if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+		ceph_flush_cap_releases(session->s_mdsc, session);
+}
+
+static void ceph_cap_reclaim_work(struct work_struct *work)
+{
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, cap_reclaim_work);
+	int ret = ceph_trim_dentries(mdsc);
+	if (ret == -EAGAIN)
+		ceph_queue_cap_reclaim_work(mdsc);
+}
+
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+	if (mdsc->stopping)
+		return;
+
+	if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+		dout("caps reclaim work queued\n");
+	} else {
+		dout("failed to queue caps reclaim work\n");
+	}
+}
+
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+	int val;
+	if (!nr)
+		return;
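+	/*
+	 * Batch the work: only queue it each time the pending count
+	 * reaches a multiple of CEPH_CAPS_PER_RELEASE.
+	 */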
+	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+	if (!(val % CEPH_CAPS_PER_RELEASE)) {
+		atomic_set(&mdsc->cap_reclaim_pending, 0);
+		ceph_queue_cap_reclaim_work(mdsc);
+	}
+}
+
 /*
  * requests
  */
@@ -1864,43 +2124,29 @@
  * Encode hidden .snap dirs as a double /, i.e.
  *   foo/.snap/bar -> foo//bar
  */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
 			   int stop_on_nosnap)
 {
 	struct dentry *temp;
 	char *path;
-	int len, pos;
+	int pos;
 	unsigned seq;
+	u64 base;
 
 	if (!dentry)
 		return ERR_PTR(-EINVAL);
 
-retry:
-	len = 0;
-	seq = read_seqbegin(&rename_lock);
-	rcu_read_lock();
-	for (temp = dentry; !IS_ROOT(temp);) {
-		struct inode *inode = d_inode(temp);
-		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
-			len++;  /* slash only */
-		else if (stop_on_nosnap && inode &&
-			 ceph_snap(inode) == CEPH_NOSNAP)
-			break;
-		else
-			len += 1 + temp->d_name.len;
-		temp = temp->d_parent;
-	}
-	rcu_read_unlock();
-	if (len)
-		len--;  /* no leading '/' */
-
-	path = kmalloc(len+1, GFP_NOFS);
+	path = __getname();
 	if (!path)
 		return ERR_PTR(-ENOMEM);
-	pos = len;
-	path[pos] = 0;	/* trailing null */
+retry:
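+	/* walk towards the root, filling the PATH_MAX buffer backwards */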
+	pos = PATH_MAX - 1;
+	path[pos] = '\0';
+
+	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+	temp = dentry;
+	for (;;) {
 		struct inode *inode;
 
 		spin_lock(&temp->d_lock);
@@ -1908,9 +2154,10 @@
 		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
 			dout("build_path path+%d: %p SNAPDIR\n",
 			     pos, temp);
-		} else if (stop_on_nosnap && inode &&
+		} else if (stop_on_nosnap && inode && dentry != temp &&
 			   ceph_snap(inode) == CEPH_NOSNAP) {
 			spin_unlock(&temp->d_lock);
+			pos++; /* get rid of any prepended '/' */
 			break;
 		} else {
 			pos -= temp->d_name.len;
@@ -1918,43 +2165,50 @@
 				spin_unlock(&temp->d_lock);
 				break;
 			}
-			strncpy(path + pos, temp->d_name.name,
-				temp->d_name.len);
+			memcpy(path + pos, temp->d_name.name, temp->d_name.len);
 		}
 		spin_unlock(&temp->d_lock);
-		if (pos)
-			path[--pos] = '/';
-		temp = temp->d_parent;
+		temp = READ_ONCE(temp->d_parent);
+
+		/* Are we at the root? */
+		if (IS_ROOT(temp))
+			break;
+
+		/* Are we out of buffer? */
+		if (--pos < 0)
+			break;
+
+		path[pos] = '/';
 	}
+	base = ceph_ino(d_inode(temp));
 	rcu_read_unlock();
-	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+	if (pos < 0 || read_seqretry(&rename_lock, seq)) {
 		pr_err("build_path did not end path lookup where "
-		       "expected, namelen is %d, pos is %d\n", len, pos);
+		       "expected, pos is %d\n", pos);
 		/* presumably this is only possible if racing with a
 		   rename of one of the parent directories (we can not
 		   lock the dentries above us to prevent this, but
 		   retrying should be harmless) */
-		kfree(path);
 		goto retry;
 	}
 
-	*base = ceph_ino(d_inode(temp));
-	*plen = len;
+	*pbase = base;
+	*plen = PATH_MAX - 1 - pos;
 	dout("build_path on %p %d built %llx '%.*s'\n",
-	     dentry, d_count(dentry), *base, len, path);
-	return path;
+	     dentry, d_count(dentry), base, *plen, path + pos);
+	return path + pos;
 }
 
 static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 			     const char **ppath, int *ppathlen, u64 *pino,
-			     int *pfreepath)
+			     bool *pfreepath, bool parent_locked)
 {
 	char *path;
 
 	rcu_read_lock();
 	if (!dir)
 		dir = d_inode_rcu(dentry->d_parent);
-	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) {
 		*pino = ceph_ino(dir);
 		rcu_read_unlock();
 		*ppath = dentry->d_name.name;
@@ -1966,13 +2220,13 @@
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
-	*pfreepath = 1;
+	*pfreepath = true;
 	return 0;
 }
 
 static int build_inode_path(struct inode *inode,
 			    const char **ppath, int *ppathlen, u64 *pino,
-			    int *pfreepath)
+			    bool *pfreepath)
 {
 	struct dentry *dentry;
 	char *path;
@@ -1988,7 +2242,7 @@
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
-	*pfreepath = 1;
+	*pfreepath = true;
 	return 0;
 }
 
@@ -1999,7 +2253,7 @@
 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 				  struct inode *rdiri, const char *rpath,
 				  u64 rino, const char **ppath, int *pathlen,
-				  u64 *ino, int *freepath)
+				  u64 *ino, bool *freepath, bool parent_locked)
 {
 	int r = 0;
 
@@ -2009,7 +2263,7 @@
 		     ceph_snap(rinode));
 	} else if (rdentry) {
 		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
-					freepath);
+					freepath, parent_locked);
 		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
 		     *ppath);
 	} else if (rpath || rino) {
@@ -2035,7 +2289,7 @@
 	const char *path2 = NULL;
 	u64 ino1 = 0, ino2 = 0;
 	int pathlen1 = 0, pathlen2 = 0;
-	int freepath1 = 0, freepath2 = 0;
+	bool freepath1 = false, freepath2 = false;
 	int len;
 	u16 releases;
 	void *p, *end;
@@ -2043,16 +2297,19 @@
 
 	ret = set_request_path_attr(req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
-			      &path1, &pathlen1, &ino1, &freepath1);
+			      &path1, &pathlen1, &ino1, &freepath1,
+			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
+					&req->r_req_flags));
 	if (ret < 0) {
 		msg = ERR_PTR(ret);
 		goto out;
 	}
 
+	/* If r_old_dentry is set, then assume that its parent is locked */
 	ret = set_request_path_attr(NULL, req->r_old_dentry,
 			      req->r_old_dentry_dir,
 			      req->r_path2, req->r_ino2.ino,
-			      &path2, &pathlen2, &ino2, &freepath2);
+			      &path2, &pathlen2, &ino2, &freepath2, true);
 	if (ret < 0) {
 		msg = ERR_PTR(ret);
 		goto out_free1;
@@ -2067,11 +2324,11 @@
 		(!!req->r_inode_drop + !!req->r_dentry_drop +
 		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
 	if (req->r_dentry_drop)
-		len += req->r_dentry->d_name.len;
+		len += pathlen1;
 	if (req->r_old_dentry_drop)
-		len += req->r_old_dentry->d_name.len;
+		len += pathlen2;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
 	if (!msg) {
 		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
@@ -2136,7 +2393,6 @@
 
 	if (req->r_pagelist) {
 		struct ceph_pagelist *pagelist = req->r_pagelist;
-		refcount_inc(&pagelist->refcnt);
 		ceph_msg_data_add_pagelist(msg, pagelist);
 		msg->hdr.data_len = cpu_to_le32(pagelist->length);
 	} else {
@@ -2147,10 +2403,10 @@
 
 out_free2:
 	if (freepath2)
-		kfree((char *)path2);
+		ceph_mdsc_free_path((char *)path2, pathlen2);
 out_free1:
 	if (freepath1)
-		kfree((char *)path1);
+		ceph_mdsc_free_path((char *)path1, pathlen1);
 out:
 	return msg;
 }
@@ -2164,8 +2420,7 @@
 {
 	if (req->r_callback)
 		req->r_callback(mdsc, req);
-	else
-		complete_all(&req->r_completion);
+	complete_all(&req->r_completion);
 }
 
 /*
@@ -2407,28 +2662,11 @@
 	}
 }
 
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
 			      struct ceph_mds_request *req)
 {
-	dout("submit_request on %p\n", req);
-	mutex_lock(&mdsc->mutex);
-	__register_request(mdsc, req, NULL);
-	__do_request(mdsc, req);
-	mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchrously perform an mds request.  Take care of all of the
- * session setup, forwarding, retry details.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
-			 struct inode *dir,
-			 struct ceph_mds_request *req)
-{
 	int err;
 
-	dout("do_request on %p\n", req);
-
 	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
 	if (req->r_inode)
 		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
@@ -2438,18 +2676,21 @@
 		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
 
-	/* issue */
+	dout("submit_request on %p for inode %p\n", req, dir);
 	mutex_lock(&mdsc->mutex);
 	__register_request(mdsc, req, dir);
 	__do_request(mdsc, req);
+	err = req->r_err;
+	mutex_unlock(&mdsc->mutex);
+	return err;
+}
 
-	if (req->r_err) {
-		err = req->r_err;
-		goto out;
-	}
+static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+				  struct ceph_mds_request *req)
+{
+	int err;
 
 	/* wait */
-	mutex_unlock(&mdsc->mutex);
 	dout("do_request waiting\n");
 	if (!req->r_timeout && req->r_wait_for_completion) {
 		err = req->r_wait_for_completion(mdsc, req);
@@ -2490,8 +2731,26 @@
 		err = req->r_err;
 	}
 
-out:
 	mutex_unlock(&mdsc->mutex);
+	return err;
+}
+
+/*
+ * Synchronously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+			 struct inode *dir,
+			 struct ceph_mds_request *req)
+{
+	int err;
+
+	dout("do_request on %p\n", req);
+
+	/* issue */
+	err = ceph_mdsc_submit_request(mdsc, dir, req);
+	if (!err)
+		err = ceph_mdsc_wait_request(mdsc, req);
 	dout("do_request %p done, result %d\n", req, err);
 	return err;
 }
@@ -2641,7 +2900,10 @@
 
 	dout("handle_reply tid %lld result %d\n", tid, result);
 	rinfo = &req->r_reply_info;
-	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+		err = parse_reply_info(msg, rinfo, (u64)-1);
+	else
+		err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
 	mutex_unlock(&mdsc->mutex);
 
 	mutex_lock(&session->s_mutex);
@@ -2672,7 +2934,6 @@
 		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
 				    req->r_op == CEPH_MDS_OP_LSSNAP))
 			ceph_readdir_prepopulate(req, req->r_session);
-		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
 	current->journal_info = NULL;
 	mutex_unlock(&req->r_fill_mutex);
@@ -2681,12 +2942,18 @@
 	if (realm)
 		ceph_put_snap_realm(mdsc, realm);
 
-	if (err == 0 && req->r_target_inode &&
-	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
-		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
-		spin_lock(&ci->i_unsafe_lock);
-		list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
-		spin_unlock(&ci->i_unsafe_lock);
+	if (err == 0) {
+		if (req->r_target_inode &&
+		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+			struct ceph_inode_info *ci =
+				ceph_inode(req->r_target_inode);
+			spin_lock(&ci->i_unsafe_lock);
+			list_add_tail(&req->r_unsafe_target_item,
+				      &ci->i_unsafe_iops);
+			spin_unlock(&ci->i_unsafe_lock);
+		}
+
+		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
 out_err:
 	mutex_lock(&mdsc->mutex);
@@ -2765,6 +3032,30 @@
 	pr_err("mdsc_handle_forward decode error err=%d\n", err);
 }
 
+static int __decode_session_metadata(void **p, void *end,
+				     bool *blacklisted)
+{
+	/* map<string,string> */
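+	/* look for an "error_string" entry that mentions "blacklisted" */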
+	u32 n;
+	bool err_str;
+	ceph_decode_32_safe(p, end, n, bad);
+	while (n-- > 0) {
+		u32 len;
+		ceph_decode_32_safe(p, end, len, bad);
+		ceph_decode_need(p, end, len, bad);
+		err_str = !strncmp(*p, "error_string", len);
+		*p += len;
+		ceph_decode_32_safe(p, end, len, bad);
+		ceph_decode_need(p, end, len, bad);
+		if (err_str && strnstr(*p, "blacklisted", len))
+			*blacklisted = true;
+		*p += len;
+	}
+	return 0;
+bad:
+	return -1;
+}
+
 /*
  * handle a mds session control message
  */
@@ -2772,18 +3063,37 @@
 			   struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
+	int mds = session->s_mds;
+	int msg_version = le16_to_cpu(msg->hdr.version);
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	struct ceph_mds_session_head *h;
 	u32 op;
 	u64 seq;
-	int mds = session->s_mds;
-	struct ceph_mds_session_head *h = msg->front.iov_base;
+	unsigned long features = 0;
 	int wake = 0;
+	bool blacklisted = false;
 
 	/* decode */
-	if (msg->front.iov_len < sizeof(*h))
-		goto bad;
+	ceph_decode_need(&p, end, sizeof(*h), bad);
+	h = p;
+	p += sizeof(*h);
+
 	op = le32_to_cpu(h->op);
 	seq = le64_to_cpu(h->seq);
 
+	if (msg_version >= 3) {
+		u32 len;
+		/* version >= 2, metadata */
+		if (__decode_session_metadata(&p, end, &blacklisted) < 0)
+			goto bad;
+		/* version >= 3, feature bits */
+		ceph_decode_32_safe(&p, end, len, bad);
+		ceph_decode_need(&p, end, len, bad);
+		memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+		p += len;
+	}
+
 	mutex_lock(&mdsc->mutex);
 	if (op == CEPH_SESSION_CLOSE) {
 		get_session(session);
@@ -2809,6 +3119,7 @@
 		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
 			pr_info("mds%d reconnect success\n", session->s_mds);
 		session->s_state = CEPH_MDS_SESSION_OPEN;
+		session->s_features = features;
 		renewed_caps(mdsc, session, 0);
 		wake = 1;
 		if (mdsc->stopping)
@@ -2852,7 +3163,7 @@
 		spin_lock(&session->s_cap_lock);
 		session->s_readonly = true;
 		spin_unlock(&session->s_cap_lock);
-		wake_up_session_caps(session, 0);
+		wake_up_session_caps(session, FORCE_RO);
 		break;
 
 	case CEPH_SESSION_REJECT:
@@ -2861,6 +3172,8 @@
 		session->s_state = CEPH_MDS_SESSION_REJECTED;
 		cleanup_session_requests(mdsc, session);
 		remove_session_caps(session);
+		if (blacklisted)
+			mdsc->fsc->blacklisted = true;
 		wake = 2; /* for good measure */
 		break;
 
@@ -2935,6 +3248,82 @@
 	mutex_unlock(&mdsc->mutex);
 }
 
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
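+	/*
+	 * Called when the reconnect message outgrows RECONNECT_MAX_SIZE:
+	 * patch the count placeholder at the head of the current pagelist,
+	 * send it, and continue encoding into a fresh pagelist.
+	 */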
+	struct ceph_msg *reply;
+	struct ceph_pagelist *_pagelist;
+	struct page *page;
+	__le32 *addr;
+	int err = -ENOMEM;
+
+	if (!recon_state->allow_multi)
+		return -ENOSPC;
+
+	/* can't handle a message that contains both caps and realms */
+	BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+	/* pre-allocate new pagelist */
+	_pagelist = ceph_pagelist_alloc(GFP_NOFS);
+	if (!_pagelist)
+		return -ENOMEM;
+
+	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+	if (!reply)
+		goto fail_msg;
+
+	/* placeholder for nr_caps */
+	err = ceph_pagelist_encode_32(_pagelist, 0);
+	if (err < 0)
+		goto fail;
+
+	if (recon_state->nr_caps) {
+		/* currently encoding caps */
+		err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+		if (err)
+			goto fail;
+	} else {
+		/* placeholder for nr_realms (currently encoding realms) */
+		err = ceph_pagelist_encode_32(_pagelist, 0);
+		if (err < 0)
+			goto fail;
+	}
+
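+	/* mark this message as "more to follow" (the final message encodes 0) */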
+	err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+	if (err)
+		goto fail;
+
+	page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+	addr = kmap_atomic(page);
+	if (recon_state->nr_caps) {
+		/* currently encoding caps */
+		*addr = cpu_to_le32(recon_state->nr_caps);
+	} else {
+		/* currently encoding realms */
+		*(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+	}
+	kunmap_atomic(addr);
+
+	reply->hdr.version = cpu_to_le16(5);
+	reply->hdr.compat_version = cpu_to_le16(4);
+
+	reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+	ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+	ceph_con_send(&recon_state->session->s_con, reply);
+	ceph_pagelist_release(recon_state->pagelist);
+
+	recon_state->pagelist = _pagelist;
+	recon_state->nr_caps = 0;
+	recon_state->nr_realms = 0;
+	recon_state->msg_version = 5;
+	return 0;
+fail:
+	ceph_msg_put(reply);
+fail_msg:
+	ceph_pagelist_release(_pagelist);
+	return err;
+}
+
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
@@ -2948,31 +3337,12 @@
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_reconnect_state *recon_state = arg;
 	struct ceph_pagelist *pagelist = recon_state->pagelist;
-	char *path;
-	int pathlen, err;
-	u64 pathbase;
+	int err;
 	u64 snap_follows;
-	struct dentry *dentry;
 
 	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
 	     inode, ceph_vinop(inode), cap, cap->cap_id,
 	     ceph_cap_string(cap->issued));
-	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
-	if (err)
-		return err;
-
-	dentry = d_find_alias(inode);
-	if (dentry) {
-		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
-			goto out_dput;
-		}
-	} else {
-		path = NULL;
-		pathlen = 0;
-		pathbase = 0;
-	}
 
 	spin_lock(&ci->i_ceph_lock);
 	cap->seq = 0;        /* reset cap seq */
@@ -2985,7 +3355,7 @@
 		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
 		rec.v2.issued = cpu_to_le32(cap->issued);
 		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v2.pathbase = cpu_to_le64(pathbase);
+		rec.v2.pathbase = 0;
 		rec.v2.flock_len = (__force __le32)
 			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
 	} else {
@@ -2996,7 +3366,7 @@
 		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
 		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
 		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v1.pathbase = cpu_to_le64(pathbase);
+		rec.v1.pathbase = 0;
 	}
 
 	if (list_empty(&ci->i_cap_snaps)) {
@@ -3012,7 +3382,7 @@
 	if (recon_state->msg_version >= 2) {
 		int num_fcntl_locks, num_flock_locks;
 		struct ceph_filelock *flocks = NULL;
-		size_t struct_len, total_len = 0;
+		size_t struct_len, total_len = sizeof(u64);
 		u8 struct_v = 0;
 
 encode_again:
@@ -3028,7 +3398,7 @@
 					       GFP_NOFS);
 			if (!flocks) {
 				err = -ENOMEM;
-				goto out_free;
+				goto out_err;
 			}
 			err = ceph_encode_locks_to_buffer(inode, flocks,
 							  num_fcntl_locks,
@@ -3038,7 +3408,7 @@
 				flocks = NULL;
 				if (err == -ENOSPC)
 					goto encode_again;
-				goto out_free;
+				goto out_err;
 			}
 		} else {
 			kfree(flocks);
@@ -3047,7 +3417,7 @@
 
 		if (recon_state->msg_version >= 3) {
 			/* version, compat_version and struct_len */
-			total_len = 2 * sizeof(u8) + sizeof(u32);
+			total_len += 2 * sizeof(u8) + sizeof(u32);
 			struct_v = 2;
 		}
 		/*
@@ -3058,44 +3428,132 @@
 			    sizeof(struct ceph_filelock);
 		rec.v2.flock_len = cpu_to_le32(struct_len);
 
-		struct_len += sizeof(rec.v2);
-		struct_len += sizeof(u32) + pathlen;
+		struct_len += sizeof(u32) + sizeof(rec.v2);
 
 		if (struct_v >= 2)
 			struct_len += sizeof(u64); /* snap_follows */
 
 		total_len += struct_len;
-		err = ceph_pagelist_reserve(pagelist, total_len);
 
-		if (!err) {
-			if (recon_state->msg_version >= 3) {
-				ceph_pagelist_encode_8(pagelist, struct_v);
-				ceph_pagelist_encode_8(pagelist, 1);
-				ceph_pagelist_encode_32(pagelist, struct_len);
-			}
-			ceph_pagelist_encode_string(pagelist, path, pathlen);
-			ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
-			ceph_locks_to_pagelist(flocks, pagelist,
-					       num_fcntl_locks,
-					       num_flock_locks);
-			if (struct_v >= 2)
-				ceph_pagelist_encode_64(pagelist, snap_follows);
+		if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+			err = send_reconnect_partial(recon_state);
+			if (err)
+				goto out_freeflocks;
+			pagelist = recon_state->pagelist;
 		}
+
+		err = ceph_pagelist_reserve(pagelist, total_len);
+		if (err)
+			goto out_freeflocks;
+
+		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+		if (recon_state->msg_version >= 3) {
+			ceph_pagelist_encode_8(pagelist, struct_v);
+			ceph_pagelist_encode_8(pagelist, 1);
+			ceph_pagelist_encode_32(pagelist, struct_len);
+		}
+		ceph_pagelist_encode_string(pagelist, NULL, 0);
+		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+		ceph_locks_to_pagelist(flocks, pagelist,
+				       num_fcntl_locks, num_flock_locks);
+		if (struct_v >= 2)
+			ceph_pagelist_encode_64(pagelist, snap_follows);
+out_freeflocks:
 		kfree(flocks);
 	} else {
-		size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
-		err = ceph_pagelist_reserve(pagelist, size);
-		if (!err) {
-			ceph_pagelist_encode_string(pagelist, path, pathlen);
-			ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+		u64 pathbase = 0;
+		int pathlen = 0;
+		char *path = NULL;
+		struct dentry *dentry;
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			path = ceph_mdsc_build_path(dentry,
+						&pathlen, &pathbase, 0);
+			dput(dentry);
+			if (IS_ERR(path)) {
+				err = PTR_ERR(path);
+				goto out_err;
+			}
+			rec.v1.pathbase = cpu_to_le64(pathbase);
 		}
+
+		err = ceph_pagelist_reserve(pagelist,
+					    sizeof(u64) + sizeof(u32) +
+					    pathlen + sizeof(rec.v1));
+		if (err) {
+			goto out_freepath;
+		}
+
+		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+		ceph_pagelist_encode_string(pagelist, path, pathlen);
+		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+out_freepath:
+		ceph_mdsc_free_path(path, pathlen);
 	}
 
-	recon_state->nr_caps++;
-out_free:
-	kfree(path);
-out_dput:
-	dput(dentry);
+out_err:
+	if (err >= 0)
+		recon_state->nr_caps++;
+	return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+			      struct ceph_reconnect_state *recon_state)
+{
+	struct rb_node *p;
+	struct ceph_pagelist *pagelist = recon_state->pagelist;
+	int err = 0;
+
+	if (recon_state->msg_version >= 4) {
+		err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+		if (err < 0)
+			goto fail;
+	}
+
+	/*
+	 * snaprealms.  we provide mds with the ino, seq (version), and
+	 * parent for all of our realms.  If the mds has any newer info,
+	 * it will tell us.
+	 */
+	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+		struct ceph_snap_realm *realm =
+		       rb_entry(p, struct ceph_snap_realm, node);
+		struct ceph_mds_snaprealm_reconnect sr_rec;
+
+		if (recon_state->msg_version >= 4) {
+			size_t need = sizeof(u8) * 2 + sizeof(u32) +
+				      sizeof(sr_rec);
+
+			if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+				err = send_reconnect_partial(recon_state);
+				if (err)
+					goto fail;
+				pagelist = recon_state->pagelist;
+			}
+
+			err = ceph_pagelist_reserve(pagelist, need);
+			if (err)
+				goto fail;
+
+			ceph_pagelist_encode_8(pagelist, 1);
+			ceph_pagelist_encode_8(pagelist, 1);
+			ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+		}
+
+		dout(" adding snap realm %llx seq %lld parent %llx\n",
+		     realm->ino, realm->seq, realm->parent_ino);
+		sr_rec.ino = cpu_to_le64(realm->ino);
+		sr_rec.seq = cpu_to_le64(realm->seq);
+		sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+		if (err)
+			goto fail;
+
+		recon_state->nr_realms++;
+	}
+fail:
 	return err;
 }
 
@@ -3116,22 +3574,20 @@
 			       struct ceph_mds_session *session)
 {
 	struct ceph_msg *reply;
-	struct rb_node *p;
 	int mds = session->s_mds;
 	int err = -ENOMEM;
-	int s_nr_caps;
-	struct ceph_pagelist *pagelist;
-	struct ceph_reconnect_state recon_state;
+	struct ceph_reconnect_state recon_state = {
+		.session = session,
+	};
 	LIST_HEAD(dispose);
 
 	pr_info("mds%d reconnect start\n", mds);
 
-	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
-	if (!pagelist)
+	recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+	if (!recon_state.pagelist)
 		goto fail_nopagelist;
-	ceph_pagelist_init(pagelist);
 
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
 	if (!reply)
 		goto fail_nomsg;
 
@@ -3172,65 +3628,90 @@
 	/* replay unsafe requests */
 	replay_unsafe_requests(mdsc, session);
 
+	ceph_early_kick_flushing_caps(mdsc, session);
+
 	down_read(&mdsc->snap_rwsem);
 
-	/* traverse this session's caps */
-	s_nr_caps = session->s_nr_caps;
-	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+	/* placeholder for nr_caps */
+	err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
 	if (err)
 		goto fail;
 
-	recon_state.nr_caps = 0;
-	recon_state.pagelist = pagelist;
-	if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+	if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
 		recon_state.msg_version = 3;
-	else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+		recon_state.allow_multi = true;
+	} else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+		recon_state.msg_version = 3;
+	} else {
 		recon_state.msg_version = 2;
-	else
-		recon_state.msg_version = 1;
-	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
-	if (err < 0)
-		goto fail;
+	}
+	/* traverse this session's caps */
+	err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
 
 	spin_lock(&session->s_cap_lock);
 	session->s_cap_reconnect = 0;
 	spin_unlock(&session->s_cap_lock);
 
-	/*
-	 * snaprealms.  we provide mds with the ino, seq (version), and
-	 * parent for all of our realms.  If the mds has any newer info,
-	 * it will tell us.
-	 */
-	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
-		struct ceph_snap_realm *realm =
-			rb_entry(p, struct ceph_snap_realm, node);
-		struct ceph_mds_snaprealm_reconnect sr_rec;
+	if (err < 0)
+		goto fail;
 
-		dout(" adding snap realm %llx seq %lld parent %llx\n",
-		     realm->ino, realm->seq, realm->parent_ino);
-		sr_rec.ino = cpu_to_le64(realm->ino);
-		sr_rec.seq = cpu_to_le64(realm->seq);
-		sr_rec.parent = cpu_to_le64(realm->parent_ino);
-		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
-		if (err)
+	/* check if all realms can be encoded into current message */
+	if (mdsc->num_snap_realms) {
+		size_t total_len =
+			recon_state.pagelist->length +
+			mdsc->num_snap_realms *
+			sizeof(struct ceph_mds_snaprealm_reconnect);
+		if (recon_state.msg_version >= 4) {
+			/* number of realms */
+			total_len += sizeof(u32);
+			/* version, compat_version and struct_len */
+			total_len += mdsc->num_snap_realms *
+				     (2 * sizeof(u8) + sizeof(u32));
+		}
+		if (total_len > RECONNECT_MAX_SIZE) {
+			if (!recon_state.allow_multi) {
+				err = -ENOSPC;
+				goto fail;
+			}
+			if (recon_state.nr_caps) {
+				err = send_reconnect_partial(&recon_state);
+				if (err)
+					goto fail;
+			}
+			recon_state.msg_version = 5;
+		}
+	}
+
+	err = encode_snap_realms(mdsc, &recon_state);
+	if (err < 0)
+		goto fail;
+
+	if (recon_state.msg_version >= 5) {
+		err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+		if (err < 0)
 			goto fail;
 	}
 
-	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
-
-	/* raced with cap release? */
-	if (s_nr_caps != recon_state.nr_caps) {
-		struct page *page = list_first_entry(&pagelist->head,
-						     struct page, lru);
+	if (recon_state.nr_caps || recon_state.nr_realms) {
+		struct page *page =
+			list_first_entry(&recon_state.pagelist->head,
+					struct page, lru);
 		__le32 *addr = kmap_atomic(page);
-		*addr = cpu_to_le32(recon_state.nr_caps);
+		if (recon_state.nr_caps) {
+			WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+			*addr = cpu_to_le32(recon_state.nr_caps);
+		} else if (recon_state.msg_version >= 4) {
+			*(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+		}
 		kunmap_atomic(addr);
 	}
 
-	reply->hdr.data_len = cpu_to_le32(pagelist->length);
-	ceph_msg_data_add_pagelist(reply, pagelist);
+	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+	if (recon_state.msg_version >= 4)
+		reply->hdr.compat_version = cpu_to_le16(4);
 
-	ceph_early_kick_flushing_caps(mdsc, session);
+	reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+	ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
 
 	ceph_con_send(&session->s_con, reply);
 
@@ -3241,6 +3722,7 @@
 	mutex_unlock(&mdsc->mutex);
 
 	up_read(&mdsc->snap_rwsem);
+	ceph_pagelist_release(recon_state.pagelist);
 	return;
 
 fail:
@@ -3248,7 +3730,7 @@
 	up_read(&mdsc->snap_rwsem);
 	mutex_unlock(&session->s_mutex);
 fail_nomsg:
-	ceph_pagelist_release(pagelist);
+	ceph_pagelist_release(recon_state.pagelist);
 fail_nopagelist:
 	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
 	return;
@@ -3286,42 +3768,35 @@
 		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
 		     ceph_session_state_name(s->s_state));
 
-		if (i >= newmap->m_num_mds ||
-		    memcmp(ceph_mdsmap_get_addr(oldmap, i),
+		if (i >= newmap->m_num_mds) {
+			/* force close session for stopped mds */
+			get_session(s);
+			__unregister_session(mdsc, s);
+			__wake_requests(mdsc, &s->s_waiting);
+			mutex_unlock(&mdsc->mutex);
+
+			mutex_lock(&s->s_mutex);
+			cleanup_session_requests(mdsc, s);
+			remove_session_caps(s);
+			mutex_unlock(&s->s_mutex);
+
+			ceph_put_mds_session(s);
+
+			mutex_lock(&mdsc->mutex);
+			kick_requests(mdsc, i);
+			continue;
+		}
+
+		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
 			   ceph_mdsmap_get_addr(newmap, i),
 			   sizeof(struct ceph_entity_addr))) {
-			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
-				/* the session never opened, just close it
-				 * out now */
-				get_session(s);
-				__unregister_session(mdsc, s);
-				__wake_requests(mdsc, &s->s_waiting);
-				ceph_put_mds_session(s);
-			} else if (i >= newmap->m_num_mds) {
-				/* force close session for stopped mds */
-				get_session(s);
-				__unregister_session(mdsc, s);
-				__wake_requests(mdsc, &s->s_waiting);
-				kick_requests(mdsc, i);
-				mutex_unlock(&mdsc->mutex);
-
-				mutex_lock(&s->s_mutex);
-				cleanup_session_requests(mdsc, s);
-				remove_session_caps(s);
-				mutex_unlock(&s->s_mutex);
-
-				ceph_put_mds_session(s);
-
-				mutex_lock(&mdsc->mutex);
-			} else {
-				/* just close it */
-				mutex_unlock(&mdsc->mutex);
-				mutex_lock(&s->s_mutex);
-				mutex_lock(&mdsc->mutex);
-				ceph_con_close(&s->s_con);
-				mutex_unlock(&s->s_mutex);
-				s->s_state = CEPH_MDS_SESSION_RESTARTING;
-			}
+			/* just close it */
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&s->s_mutex);
+			mutex_lock(&mdsc->mutex);
+			ceph_con_close(&s->s_con);
+			mutex_unlock(&s->s_mutex);
+			s->s_state = CEPH_MDS_SESSION_RESTARTING;
 		} else if (oldstate == newstate) {
 			continue;  /* nothing new with this mds */
 		}
@@ -3346,7 +3821,7 @@
 				pr_info("mds%d recovery completed\n", s->s_mds);
 			kick_requests(mdsc, i);
 			ceph_kick_flushing_caps(mdsc, s);
-			wake_up_session_caps(s, 1);
+			wake_up_session_caps(s, RECONNECT);
 		}
 	}
 
@@ -3479,8 +3954,9 @@
 	ceph_con_send(&session->s_con, msg);
 
 out:
-	iput(inode);
 	mutex_unlock(&session->s_mutex);
+	/* avoid calling iput_final() in mds dispatch threads */
+	ceph_async_iput(inode);
 	return;
 
 bad:
@@ -3489,31 +3965,33 @@
 }
 
 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
-			      struct inode *inode,
 			      struct dentry *dentry, char action,
 			      u32 seq)
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_lease *lease;
-	int len = sizeof(*lease) + sizeof(u32);
-	int dnamelen = 0;
+	struct inode *dir;
+	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
 
-	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
-	     inode, dentry, ceph_lease_op_name(action), session->s_mds);
-	dnamelen = dentry->d_name.len;
-	len += dnamelen;
+	dout("lease_send_msg identry %p %s to mds%d\n",
+	     dentry, ceph_lease_op_name(action), session->s_mds);
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
 	if (!msg)
 		return;
 	lease = msg->front.iov_base;
 	lease->action = action;
-	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
-	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
 	lease->seq = cpu_to_le32(seq);
-	put_unaligned_le32(dnamelen, lease + 1);
-	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
 
+	spin_lock(&dentry->d_lock);
+	dir = d_inode(dentry->d_parent);
+	lease->ino = cpu_to_le64(ceph_ino(dir));
+	lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
+
+	put_unaligned_le32(dentry->d_name.len, lease + 1);
+	memcpy((void *)(lease + 1) + 4,
+	       dentry->d_name.name, dentry->d_name.len);
+	spin_unlock(&dentry->d_lock);
 	/*
 	 * if this is a preemptive lease RELEASE, no need to
 	 * flush request stream, since the actual request will
@@ -3545,7 +4023,27 @@
 	mutex_unlock(&mdsc->mutex);
 }
 
+static void maybe_recover_session(struct ceph_mds_client *mdsc)
+{
+	struct ceph_fs_client *fsc = mdsc->fsc;
 
+	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
+		return;
+
+	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
+		return;
+
+	if (!READ_ONCE(fsc->blacklisted))
+		return;
+
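+	/* rate-limit automatic reconnects to one every 30 minutes */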
+	if (fsc->last_auto_reconnect &&
+	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
+		return;
+
+	pr_info("auto reconnect after blacklisted\n");
+	fsc->last_auto_reconnect = jiffies;
+	ceph_force_reconnect(fsc->sb);
+}
 
 /*
  * delayed work -- periodically trim expired leases, renew caps with mds
@@ -3566,7 +4064,6 @@
 	int renew_caps;
 
 	dout("mdsc delayed_work\n");
-	ceph_check_delayed_caps(mdsc);
 
 	mutex_lock(&mdsc->mutex);
 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -3592,7 +4089,9 @@
 				pr_info("mds%d hung\n", s->s_mds);
 			}
 		}
-		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+		if (s->s_state == CEPH_MDS_SESSION_NEW ||
+		    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+		    s->s_state == CEPH_MDS_SESSION_REJECTED) {
 			/* this mds is failed or recovering, just wait */
 			ceph_put_mds_session(s);
 			continue;
@@ -3614,6 +4113,14 @@
 	}
 	mutex_unlock(&mdsc->mutex);
 
+	ceph_check_delayed_caps(mdsc);
+
+	ceph_queue_cap_reclaim_work(mdsc);
+
+	ceph_trim_snapid_map(mdsc);
+
+	maybe_recover_session(mdsc);
+
 	schedule_delayed(mdsc);
 }
 
@@ -3642,10 +4149,13 @@
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
 	atomic64_set(&mdsc->quotarealms_count, 0);
+	mdsc->quotarealms_inodes = RB_ROOT;
+	mutex_init(&mdsc->quotarealms_inodes_mutex);
 	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->snap_empty);
+	mdsc->num_snap_realms = 0;
 	spin_lock_init(&mdsc->snap_empty_lock);
 	mdsc->last_tid = 0;
 	mdsc->oldest_tid = 0;
@@ -3663,11 +4173,19 @@
 	mdsc->num_cap_flushing = 0;
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
-	spin_lock_init(&mdsc->dentry_lru_lock);
-	INIT_LIST_HEAD(&mdsc->dentry_lru);
+	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+	atomic_set(&mdsc->cap_reclaim_pending, 0);
+
+	spin_lock_init(&mdsc->dentry_list_lock);
+	INIT_LIST_HEAD(&mdsc->dentry_leases);
+	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
 
 	ceph_caps_init(mdsc);
-	ceph_adjust_min_caps(mdsc, fsc->min_caps);
+	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+	spin_lock_init(&mdsc->snapid_map_lock);
+	mdsc->snapid_map_tree = RB_ROOT;
+	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
 
 	init_rwsem(&mdsc->pool_perm_rwsem);
 	mdsc->pool_perm_tree = RB_ROOT;
@@ -3699,6 +4217,7 @@
 		while ((req = __get_oldest_req(mdsc))) {
 			dout("wait_requests timed out on tid %llu\n",
 			     req->r_tid);
+			list_del_init(&req->r_wait);
 			__unregister_request(mdsc, req);
 		}
 	}
@@ -3724,6 +4243,8 @@
 	 * their inode/dcache refs
 	 */
 	ceph_msgr_flush();
+
+	ceph_cleanup_quotarealms_inodes(mdsc);
 }
 
 /*
@@ -3862,8 +4383,10 @@
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
 	mutex_unlock(&mdsc->mutex);
 
+	ceph_cleanup_snapid_map(mdsc);
 	ceph_cleanup_empty_realms(mdsc);
 
+	cancel_work_sync(&mdsc->cap_reclaim_work);
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
 
 	dout("stopped\n");
@@ -3881,7 +4404,12 @@
 		session = __ceph_lookup_mds_session(mdsc, mds);
 		if (!session)
 			continue;
+
+		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+			__unregister_session(mdsc, session);
+		__wake_requests(mdsc, &session->s_waiting);
 		mutex_unlock(&mdsc->mutex);
+
 		mutex_lock(&session->s_mutex);
 		__close_session(mdsc, session);
 		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -3890,6 +4418,7 @@
 		}
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
+
 		mutex_lock(&mdsc->mutex);
 		kick_requests(mdsc, mds);
 	}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 32fcce0..5cd131b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -17,14 +17,19 @@
 #include <linux/ceph/auth.h>
 
 /* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC    8
+#define CEPHFS_FEATURE_MIMIC		8
+#define CEPHFS_FEATURE_REPLY_ENCODING	9
+#define CEPHFS_FEATURE_RECLAIM_CLIENT	10
+#define CEPHFS_FEATURE_LAZY_CAP_WANTED	11
+#define CEPHFS_FEATURE_MULTI_RECONNECT	12
 
-#define CEPHFS_FEATURES_ALL {           \
-  0, 1, 2, 3, 4, 5, 6, 7,		\
-  CEPHFS_FEATURE_MIMIC,                 \
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED {	\
+	0, 1, 2, 3, 4, 5, 6, 7,			\
+	CEPHFS_FEATURE_MIMIC,			\
+	CEPHFS_FEATURE_REPLY_ENCODING,		\
+	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
+	CEPHFS_FEATURE_MULTI_RECONNECT,		\
 }
-
-#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
 #define CEPHFS_FEATURES_CLIENT_REQUIRED {}
 
 
@@ -63,6 +68,10 @@
 	char *pool_ns_data;
 	u64 max_bytes;
 	u64 max_files;
+	s32 dir_pin;
+	struct ceph_timespec btime;
+	struct ceph_timespec snap_btime;
+	u64 change_attr;
 };
 
 struct ceph_mds_reply_dir_entry {
@@ -139,9 +148,9 @@
 	CEPH_MDS_SESSION_OPENING = 2,
 	CEPH_MDS_SESSION_OPEN = 3,
 	CEPH_MDS_SESSION_HUNG = 4,
-	CEPH_MDS_SESSION_CLOSING = 5,
-	CEPH_MDS_SESSION_RESTARTING = 6,
-	CEPH_MDS_SESSION_RECONNECTING = 7,
+	CEPH_MDS_SESSION_RESTARTING = 5,
+	CEPH_MDS_SESSION_RECONNECTING = 6,
+	CEPH_MDS_SESSION_CLOSING = 7,
 	CEPH_MDS_SESSION_REJECTED = 8,
 };
 
@@ -150,6 +159,7 @@
 	int               s_mds;
 	int               s_state;
 	unsigned long     s_ttl;      /* time until mds kills us */
+	unsigned long	  s_features;
 	u64               s_seq;      /* incoming msg seq # */
 	struct mutex      s_mutex;    /* serialize session messages */
 
@@ -165,19 +175,20 @@
 	/* protected by s_cap_lock */
 	spinlock_t        s_cap_lock;
 	struct list_head  s_caps;     /* all caps issued by this session */
-	int               s_nr_caps, s_trim_caps;
+	struct ceph_cap  *s_cap_iterator;
+	int               s_nr_caps;
 	int               s_num_cap_releases;
 	int		  s_cap_reconnect;
 	int		  s_readonly;
 	struct list_head  s_cap_releases; /* waiting cap_release messages */
-	struct ceph_cap  *s_cap_iterator;
+	struct work_struct s_cap_release_work;
 
 	/* protected by mutex */
 	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
 	unsigned long     s_renew_requested; /* last time we sent a renew req */
 	u64               s_renew_seq;
 
-	refcount_t          s_ref;
+	refcount_t        s_ref;
 	struct list_head  s_waiting;  /* waiting requests */
 	struct list_head  s_unsafe;   /* unsafe requests */
 };
@@ -308,6 +319,27 @@
 	char pool_ns[];
 };
 
+struct ceph_snapid_map {
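+	/* maps a snapshot id to the anonymous dev_t reported in st_dev */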
+	struct rb_node node;
+	struct list_head lru;
+	atomic_t ref;
+	u64 snap;
+	dev_t dev;
+	unsigned long last_used;
+};
+
+/*
+ * node for list of quotarealm inodes that are not visible from the filesystem
+ * mountpoint, but required to handle, e.g. quotas.
+ */
+struct ceph_quotarealm_inode {
+	struct rb_node node;
+	u64 ino;
+	unsigned long timeout; /* last time a lookup failed for this inode */
+	struct mutex mutex;
+	struct inode *inode;
+};
+
 /*
  * mds client state
  */
@@ -327,6 +359,12 @@
 	int                     stopping;      /* true if shutting down */
 
 	atomic64_t		quotarealms_count; /* # realms with quota */
+	/*
+	 * We keep a list of inodes we don't see in the mountpoint but that we
+	 * need to track quota realms.
+	 */
+	struct rb_root		quotarealms_inodes;
+	struct mutex		quotarealms_inodes_mutex;
 
 	/*
 	 * snap_rwsem will cover cap linkage into snaprealms, and
@@ -339,6 +377,7 @@
 	struct rw_semaphore     snap_rwsem;
 	struct rb_root          snap_realms;
 	struct list_head        snap_empty;
+	int			num_snap_realms;
 	spinlock_t              snap_empty_lock;  /* protect snap_empty */
 
 	u64                    last_tid;      /* most recent mds request */
@@ -360,6 +399,9 @@
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
 
+	struct work_struct cap_reclaim_work;
+	atomic_t	   cap_reclaim_pending;
+
 	/*
 	 * Cap reservations
 	 *
@@ -376,13 +418,18 @@
 						unreserved) */
 	int		caps_total_count;    /* total caps allocated */
 	int		caps_use_count;      /* in use */
+	int		caps_use_max;	     /* max used caps */
 	int		caps_reserve_count;  /* unused, reserved */
 	int		caps_avail_count;    /* unused, unreserved */
 	int		caps_min_count;      /* keep at least this many
 						(unreserved) */
-	spinlock_t	  dentry_lru_lock;
-	struct list_head  dentry_lru;
-	int		  num_dentry;
+	spinlock_t	  dentry_list_lock;
+	struct list_head  dentry_leases;     /* fifo list */
+	struct list_head  dentry_dir_leases; /* lru list */
+
+	spinlock_t		snapid_map_lock;
+	struct rb_root		snapid_map_tree;
+	struct list_head	snapid_map_lru;
 
 	struct rw_semaphore     pool_perm_rwsem;
 	struct rb_root		pool_perm_tree;
@@ -421,8 +468,9 @@
 					   struct inode *dir);
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
-extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
-				     struct ceph_mds_request *req);
+extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+				    struct inode *dir,
+				    struct ceph_mds_request *req);
 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 				struct inode *dir,
 				struct ceph_mds_request *req);
@@ -436,17 +484,29 @@
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
-				   struct ceph_mds_session *session);
-
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+				    struct ceph_cap *cap);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+				    struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
+				     int (*cb)(struct inode *,
+					       struct ceph_cap *, void *),
+				     void *arg);
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
+static inline void ceph_mdsc_free_path(char *path, int len)
+{
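+	/*
+	 * ceph_mdsc_build_path() fills a PATH_MAX buffer (from __getname)
+	 * from the end backwards and returns a pointer into the middle of
+	 * it; step back to the buffer base before calling __putname().
+	 */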
+	if (path)
+		__putname(path - (PATH_MAX - 1 - len));
+}
+
 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
 				  int stop_on_nosnap);
 
 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
-				     struct inode *inode,
 				     struct dentry *dentry, char action,
 				     u32 seq);
 
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 44e53ab..ce2d00d 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -35,7 +35,6 @@
 
 	/* pick */
 	n = prandom_u32() % n;
-	i = 0;
 	for (i = 0; n > 0; i++, n--)
 		while (m->m_info[i].state <= 0)
 			i++;
@@ -108,7 +107,7 @@
 	struct ceph_mdsmap *m;
 	const void *start = *p;
 	int i, j, n;
-	int err = -EINVAL;
+	int err;
 	u8 mdsmap_v, mdsmap_cv;
 	u16 mdsmap_ev;
 
@@ -184,8 +183,9 @@
 		inc = ceph_decode_32(p);
 		state = ceph_decode_32(p);
 		state_seq = ceph_decode_64(p);
-		ceph_decode_copy(p, &addr, sizeof(addr));
-		ceph_decode_addr(&addr);
+		err = ceph_decode_entity_addr(p, end, &addr);
+		if (err)
+			goto corrupt;
 		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
@@ -206,7 +206,7 @@
 
 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
 		     i+1, n, global_id, mds, inc,
-		     ceph_pr_addr(&addr.in_addr),
+		     ceph_pr_addr(&addr),
 		     ceph_mds_state_name(state));
 
 		if (mds < 0 || state <= 0)
@@ -358,7 +358,7 @@
 nomem:
 	err = -ENOMEM;
 	goto out_err;
-bad:
+corrupt:
 	pr_err("corrupt mdsmap\n");
 	print_hex_dump(KERN_DEBUG, "mdsmap: ",
 		       DUMP_PREFIX_OFFSET, 16, 1,
@@ -366,6 +366,9 @@
 out_err:
 	ceph_mdsmap_destroy(m);
 	return ERR_PTR(err);
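+	/*
+	 * The ceph_decode_*_safe() helpers above jump to 'bad' on short
+	 * input; keep their historical -EINVAL and reuse the 'corrupt'
+	 * reporting path.
+	 */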
+bad:
+	err = -EINVAL;
+	goto corrupt;
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 03f4d24..de56dee 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -3,19 +3,6 @@
  * quota.c - CephFS quota
  *
  * Copyright (C) 2017-2018 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <linux/statfs.h>
@@ -35,7 +22,16 @@
 static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-	return atomic64_read(&mdsc->quotarealms_count) > 0;
+	struct super_block *sb = mdsc->fsc->sb;
+
+	if (atomic64_read(&mdsc->quotarealms_count) > 0)
+		return true;
+	/* if root is the real CephFS root, we don't have quota realms */
+	if (sb->s_root->d_inode &&
+	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+		return false;
+	/* otherwise, we can't know for sure */
+	return true;
 }
 
 void ceph_handle_quota(struct ceph_mds_client *mdsc,
@@ -78,7 +74,121 @@
 		            le64_to_cpu(h->max_files));
 	spin_unlock(&ci->i_ceph_lock);
 
-	iput(inode);
+	/* avoid calling iput_final() in dispatch thread */
+	ceph_async_iput(inode);
+}
+
+static struct ceph_quotarealm_inode *
+find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
+{
+	struct ceph_quotarealm_inode *qri = NULL;
+	struct rb_node **node, *parent = NULL;
+
+	mutex_lock(&mdsc->quotarealms_inodes_mutex);
+	node = &(mdsc->quotarealms_inodes.rb_node);
+	while (*node) {
+		parent = *node;
+		qri = container_of(*node, struct ceph_quotarealm_inode, node);
+
+		if (ino < qri->ino)
+			node = &((*node)->rb_left);
+		else if (ino > qri->ino)
+			node = &((*node)->rb_right);
+		else
+			break;
+	}
+	if (!qri || (qri->ino != ino)) {
+		/* Not found, create a new one and insert it */
+		qri = kmalloc(sizeof(*qri), GFP_KERNEL);
+		if (qri) {
+			qri->ino = ino;
+			qri->inode = NULL;
+			qri->timeout = 0;
+			mutex_init(&qri->mutex);
+			rb_link_node(&qri->node, parent, node);
+			rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
+		} else
+			pr_warn("Failed to alloc quotarealms_inode\n");
+	}
+	mutex_unlock(&mdsc->quotarealms_inodes_mutex);
+
+	return qri;
+}
+
+/*
+ * This function will try to look up a realm inode which isn't visible in the
+ * filesystem mountpoint.  A list of these (invisible) inodes is maintained in
+ * the mdsc and freed only when the filesystem is unmounted.
+ *
+ * Note that these inodes are kept in this list even if the lookup fails, which
+ * avoids repeating useless lookup requests.
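+ *
+ * The per-entry qri->mutex serializes concurrent lookups of the same realm
+ * inode, so only one lookup request per realm is in flight at a time.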
+ */
+static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
+					     struct super_block *sb,
+					     struct ceph_snap_realm *realm)
+{
+	struct ceph_quotarealm_inode *qri;
+	struct inode *in;
+
+	qri = find_quotarealm_inode(mdsc, realm->ino);
+	if (!qri)
+		return NULL;
+
+	mutex_lock(&qri->mutex);
+	if (qri->inode && ceph_is_any_caps(qri->inode)) {
+		/* A request has already returned the inode */
+		mutex_unlock(&qri->mutex);
+		return qri->inode;
+	}
+	/* Check if this inode lookup has failed recently */
+	if (qri->timeout &&
+	    time_before_eq(jiffies, qri->timeout)) {
+		mutex_unlock(&qri->mutex);
+		return NULL;
+	}
+	if (qri->inode) {
+		/* get caps */
+		int ret = __ceph_do_getattr(qri->inode, NULL,
+					    CEPH_STAT_CAP_INODE, true);
+		if (ret >= 0)
+			in = qri->inode;
+		else
+			in = ERR_PTR(ret);
+	} else {
+		in = ceph_lookup_inode(sb, realm->ino);
+	}
+
+	if (IS_ERR(in)) {
+		pr_warn("Can't lookup inode %llx (err: %ld)\n",
+			realm->ino, PTR_ERR(in));
+		qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
+	} else {
+		qri->timeout = 0;
+		qri->inode = in;
+	}
+	mutex_unlock(&qri->mutex);
+
+	return in;
+}
+
+void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
+{
+	struct ceph_quotarealm_inode *qri;
+	struct rb_node *node;
+
+	/*
+	 * It should now be safe to clean up the quotarealms_inodes tree;
+	 * take mdsc->quotarealms_inodes_mutex anyway, just in case.
+	 */
+	mutex_lock(&mdsc->quotarealms_inodes_mutex);
+	while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) {
+		node = rb_first(&mdsc->quotarealms_inodes);
+		qri = rb_entry(node, struct ceph_quotarealm_inode, node);
+		rb_erase(node, &mdsc->quotarealms_inodes);
+		iput(qri->inode);
+		kfree(qri);
+	}
+	mutex_unlock(&mdsc->quotarealms_inodes_mutex);
 }
 
 /*
@@ -89,9 +199,15 @@
  *
  * Note that the caller is responsible for calling ceph_put_snap_realm() on the
  * returned realm.
+ *
+ * Callers of this function need to hold mdsc->snap_rwsem.  However, if there's
+ * a need to do an inode lookup, this rwsem will be temporarily dropped.  Hence
+ * the 'retry' argument: if the rwsem needs to be dropped and 'retry' is 'false',
+ * this function will return -EAGAIN; otherwise, the snaprealms walk-through
+ * will be restarted.
  */
 static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
-					       struct inode *inode)
+					       struct inode *inode, bool retry)
 {
 	struct ceph_inode_info *ci = NULL;
 	struct ceph_snap_realm *realm, *next;
@@ -101,6 +217,7 @@
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return NULL;
 
+restart:
 	realm = ceph_inode(inode)->i_snap_realm;
 	if (realm)
 		ceph_get_snap_realm(mdsc, realm);
@@ -108,15 +225,30 @@
 		pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
 				   "null i_snap_realm\n", ceph_vinop(inode));
 	while (realm) {
+		bool has_inode;
+
 		spin_lock(&realm->inodes_with_caps_lock);
-		in = realm->inode ? igrab(realm->inode) : NULL;
+		has_inode = realm->inode;
+		in = has_inode ? igrab(realm->inode) : NULL;
 		spin_unlock(&realm->inodes_with_caps_lock);
-		if (!in)
+		if (has_inode && !in)
 			break;
+		if (!in) {
+			up_read(&mdsc->snap_rwsem);
+			in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+			down_read(&mdsc->snap_rwsem);
+			if (IS_ERR_OR_NULL(in))
+				break;
+			ceph_put_snap_realm(mdsc, realm);
+			if (!retry)
+				return ERR_PTR(-EAGAIN);
+			goto restart;
+		}
 
 		ci = ceph_inode(in);
 		has_quota = __ceph_has_any_quota(ci);
-		iput(in);
+		/* avoid calling iput_final() while holding mdsc->snap_rwsem */
+		ceph_async_iput(in);
 
 		next = realm->parent;
 		if (has_quota || !next)
@@ -138,9 +270,22 @@
 	struct ceph_snap_realm *old_realm, *new_realm;
 	bool is_same;
 
+restart:
+	/*
+	 * We need to look up two quota realms atomically, i.e. with snap_rwsem.
+	 * However, get_quota_realm may drop it temporarily.  By setting the
+	 * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
+	 * dropped and we can then restart the whole operation.
+	 */
 	down_read(&mdsc->snap_rwsem);
-	old_realm = get_quota_realm(mdsc, old);
-	new_realm = get_quota_realm(mdsc, new);
+	old_realm = get_quota_realm(mdsc, old, true);
+	new_realm = get_quota_realm(mdsc, new, false);
+	if (PTR_ERR(new_realm) == -EAGAIN) {
+		up_read(&mdsc->snap_rwsem);
+		if (old_realm)
+			ceph_put_snap_realm(mdsc, old_realm);
+		goto restart;
+	}
 	is_same = (old_realm == new_realm);
 	up_read(&mdsc->snap_rwsem);
 
@@ -179,6 +324,7 @@
 		return false;
 
 	down_read(&mdsc->snap_rwsem);
+restart:
 	realm = ceph_inode(inode)->i_snap_realm;
 	if (realm)
 		ceph_get_snap_realm(mdsc, realm);
@@ -186,12 +332,23 @@
 		pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
 				   "null i_snap_realm\n", ceph_vinop(inode));
 	while (realm) {
-		spin_lock(&realm->inodes_with_caps_lock);
-		in = realm->inode ? igrab(realm->inode) : NULL;
-		spin_unlock(&realm->inodes_with_caps_lock);
-		if (!in)
-			break;
+		bool has_inode;
 
+		spin_lock(&realm->inodes_with_caps_lock);
+		has_inode = realm->inode;
+		in = has_inode ? igrab(realm->inode) : NULL;
+		spin_unlock(&realm->inodes_with_caps_lock);
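+		/* the realm inode exists but is being evicted: give up */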
+		if (has_inode && !in)
+			break;
+		if (!in) {
+			up_read(&mdsc->snap_rwsem);
+			in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+			down_read(&mdsc->snap_rwsem);
+			if (IS_ERR_OR_NULL(in))
+				break;
+			ceph_put_snap_realm(mdsc, realm);
+			goto restart;
+		}
 		ci = ceph_inode(in);
 		spin_lock(&ci->i_ceph_lock);
 		if (op == QUOTA_CHECK_MAX_FILES_OP) {
@@ -228,7 +385,8 @@
 			pr_warn("Invalid quota check op (%d)\n", op);
 			exceeded = true; /* Just break the loop */
 		}
-		iput(in);
+		/* avoid calling iput_final() while holding mdsc->snap_rwsem */
+		ceph_async_iput(in);
 
 		next = realm->parent;
 		if (exceeded || !next)
@@ -327,7 +485,7 @@
 	bool is_updated = false;
 
 	down_read(&mdsc->snap_rwsem);
-	realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
+	realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
 	up_read(&mdsc->snap_rwsem);
 	if (!realm)
 		return false;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27e..ccfcc66 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,14 @@
 
 #include <linux/sort.h>
 #include <linux/slab.h>
-
+#include <linux/iversion.h>
 #include "super.h"
 #include "mds_client.h"
-
 #include <linux/ceph/decode.h>
 
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT	(5 * 60 * HZ)
+
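+/*
+ * A snapid map ties a snapshot id to an anonymous dev_t so that inodes
+ * belonging to different snapshots can expose distinct st_dev values.
+ */
+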
 /*
  * Snapshots in ceph are driven in large part by cooperation from the
  * client.  In contrast to local file systems or file servers that
@@ -124,6 +126,8 @@
 	INIT_LIST_HEAD(&realm->inodes_with_caps);
 	spin_lock_init(&realm->inodes_with_caps_lock);
 	__insert_snap_realm(&mdsc->snap_realms, realm);
+	mdsc->num_snap_realms++;
+
 	dout("create_snap_realm %llx %p\n", realm->ino, realm);
 	return realm;
 }
@@ -175,6 +179,7 @@
 	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
 
 	rb_erase(&realm->node, &mdsc->snap_realms);
+	mdsc->num_snap_realms--;
 
 	if (realm->parent) {
 		list_del_init(&realm->child_item);
@@ -460,6 +465,7 @@
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
 	struct ceph_snap_context *old_snapc, *new_snapc;
+	struct ceph_buffer *old_blob = NULL;
 	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -536,7 +542,7 @@
 	capsnap->gid = inode->i_gid;
 
 	if (dirty & CEPH_CAP_XATTR_EXCL) {
-		__ceph_build_xattrs_blob(ci);
+		old_blob = __ceph_build_xattrs_blob(ci);
 		capsnap->xattr_blob =
 			ceph_buffer_get(ci->i_xattrs.blob);
 		capsnap->xattr_version = ci->i_xattrs.version;
@@ -568,12 +574,18 @@
 	old_snapc = NULL;
 
 update_snapc:
-	if (ci->i_head_snapc) {
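+	/*
+	 * If nothing pins the head (no buffered writes, no write refs, and
+	 * no dirty or flushing caps), drop the head snap context instead of
+	 * switching it to the new one.
+	 */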
+	if (ci->i_wrbuffer_ref_head == 0 &&
+	    ci->i_wr_ref == 0 &&
+	    ci->i_dirty_caps == 0 &&
+	    ci->i_flushing_caps == 0) {
+		ci->i_head_snapc = NULL;
+	} else {
 		ci->i_head_snapc = ceph_get_snap_context(new_snapc);
 		dout(" new snapc is %p\n", new_snapc);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
+	ceph_buffer_put(old_blob);
 	kfree(capsnap);
 	ceph_put_snap_context(old_snapc);
 }
@@ -597,6 +609,8 @@
 	capsnap->mtime = inode->i_mtime;
 	capsnap->atime = inode->i_atime;
 	capsnap->ctime = inode->i_ctime;
+	capsnap->btime = ci->i_btime;
+	capsnap->change_attr = inode_peek_iversion_raw(inode);
 	capsnap->time_warp_seq = ci->i_time_warp_seq;
 	capsnap->truncate_size = ci->i_truncate_size;
 	capsnap->truncate_seq = ci->i_truncate_seq;
@@ -616,7 +630,8 @@
 	     capsnap->size);
 
 	spin_lock(&mdsc->snap_flush_lock);
-	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+	if (list_empty(&ci->i_snap_flush_item))
+		list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
 	spin_unlock(&mdsc->snap_flush_lock);
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
@@ -638,13 +653,15 @@
 		if (!inode)
 			continue;
 		spin_unlock(&realm->inodes_with_caps_lock);
-		iput(lastinode);
+		/* avoid calling iput_final() while holding
+		 * mdsc->snap_rwsem or in mds dispatch threads */
+		ceph_async_iput(lastinode);
 		lastinode = inode;
 		ceph_queue_cap_snap(ci);
 		spin_lock(&realm->inodes_with_caps_lock);
 	}
 	spin_unlock(&realm->inodes_with_caps_lock);
-	iput(lastinode);
+	ceph_async_iput(lastinode);
 
 	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
 }
@@ -796,7 +813,9 @@
 		ihold(inode);
 		spin_unlock(&mdsc->snap_flush_lock);
 		ceph_flush_snaps(ci, &session);
-		iput(inode);
+		/* avoid calling iput_final() while holding
+		 * session->s_mutex or in mds dispatch threads */
+		ceph_async_iput(inode);
 		spin_lock(&mdsc->snap_flush_lock);
 	}
 	spin_unlock(&mdsc->snap_flush_lock);
@@ -940,12 +959,14 @@
 			ceph_get_snap_realm(mdsc, realm);
 			ceph_put_snap_realm(mdsc, oldrealm);
 
-			iput(inode);
+			/* avoid calling iput_final() while holding
+			 * mdsc->snap_rwsem or in mds dispatch threads */
+			ceph_async_iput(inode);
 			continue;
 
 skip_inode:
 			spin_unlock(&ci->i_ceph_lock);
-			iput(inode);
+			ceph_async_iput(inode);
 		}
 
 		/* we may have taken some of the old realm's children. */
@@ -985,3 +1006,154 @@
 		up_write(&mdsc->snap_rwsem);
 	return;
 }
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+					    u64 snap)
+{
+	struct ceph_snapid_map *sm, *exist;
+	struct rb_node **p, *parent;
+	int ret;
+
+	exist = NULL;
+	spin_lock(&mdsc->snapid_map_lock);
+	p = &mdsc->snapid_map_tree.rb_node;
+	while (*p) {
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap) {
+			p = &(*p)->rb_left;
+		} else if (snap < exist->snap) {
+			p = &(*p)->rb_right;
+		} else {
+			if (atomic_inc_return(&exist->ref) == 1)
+				list_del_init(&exist->lru);
+			break;
+		}
+		exist = NULL;
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+
+	sm = kmalloc(sizeof(*sm), GFP_NOFS);
+	if (!sm)
+		return NULL;
+
+	ret = get_anon_bdev(&sm->dev);
+	if (ret < 0) {
+		kfree(sm);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&sm->lru);
+	atomic_set(&sm->ref, 1);
+	sm->snap = snap;
+
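+	/*
+	 * The allocation above happened without snapid_map_lock held, so a
+	 * racing task may have inserted the same snapid meanwhile; search
+	 * again and only link the new node if it is still missing.
+	 */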
+	exist = NULL;
+	parent = NULL;
+	p = &mdsc->snapid_map_tree.rb_node;
+	spin_lock(&mdsc->snapid_map_lock);
+	while (*p) {
+		parent = *p;
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap)
+			p = &(*p)->rb_left;
+		else if (snap < exist->snap)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	if (exist) {
+		if (atomic_inc_return(&exist->ref) == 1)
+			list_del_init(&exist->lru);
+	} else {
+		rb_link_node(&sm->node, parent, p);
+		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+
+	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+	return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+			 struct ceph_snapid_map *sm)
+{
+	if (!sm)
+		return;
+	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+		if (!RB_EMPTY_NODE(&sm->node)) {
+			sm->last_used = jiffies;
+			list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+			spin_unlock(&mdsc->snapid_map_lock);
+		} else {
+			/* already cleaned up by
+			 * ceph_cleanup_snapid_map() */
+			spin_unlock(&mdsc->snapid_map_lock);
+			kfree(sm);
+		}
+	}
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	unsigned long now;
+	LIST_HEAD(to_free);
+
+	spin_lock(&mdsc->snapid_map_lock);
+	now = jiffies;
+
+	while (!list_empty(&mdsc->snapid_map_lru)) {
+		sm = list_first_entry(&mdsc->snapid_map_lru,
+				      struct ceph_snapid_map, lru);
+		if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+			break;
+
+		rb_erase(&sm->node, &mdsc->snapid_map_tree);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+	}
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	struct rb_node *p;
+	LIST_HEAD(to_free);
+
+	spin_lock(&mdsc->snapid_map_lock);
+	while ((p = rb_first(&mdsc->snapid_map_tree))) {
+		sm = rb_entry(p, struct ceph_snapid_map, node);
+		rb_erase(p, &mdsc->snapid_map_tree);
+		RB_CLEAR_NODE(p);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		free_anon_bdev(sm->dev);
+		if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+			pr_err("snapid map %llx -> %x still in use\n",
+			       sm->snap, sm->dev);
+		}
+		kfree(sm);
+	}
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index eab1359..b47f43f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/ceph/ceph_debug.h>
 
@@ -133,6 +134,7 @@
 	Opt_rasize,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
+	Opt_caps_max,
 	Opt_readdir_max_entries,
 	Opt_readdir_max_bytes,
 	Opt_congestion_kb,
@@ -141,6 +143,7 @@
 	Opt_snapdirname,
 	Opt_mds_namespace,
 	Opt_fscache_uniq,
+	Opt_recover_session,
 	Opt_last_string,
 	/* string args above */
 	Opt_dirstat,
@@ -165,6 +168,8 @@
 	Opt_noacl,
 	Opt_quotadf,
 	Opt_noquotadf,
+	Opt_copyfrom,
+	Opt_nocopyfrom,
 };
 
 static match_table_t fsopt_tokens = {
@@ -173,12 +178,14 @@
 	{Opt_rasize, "rasize=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_caps_max, "caps_max=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
 	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
 	{Opt_mds_namespace, "mds_namespace=%s"},
+	{Opt_recover_session, "recover_session=%s"},
 	{Opt_fscache_uniq, "fsc=%s"},
 	/* string args above */
 	{Opt_dirstat, "dirstat"},
@@ -203,6 +210,8 @@
 	{Opt_noacl, "noacl"},
 	{Opt_quotadf, "quotadf"},
 	{Opt_noquotadf, "noquotadf"},
+	{Opt_copyfrom, "copyfrom"},
+	{Opt_nocopyfrom, "nocopyfrom"},
 	{-1, NULL}
 };
 
@@ -247,7 +256,19 @@
 		if (!fsopt->mds_namespace)
 			return -ENOMEM;
 		break;
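+	/*
+	 * recover_session=no|clean: "clean" lets the client automatically
+	 * reconnect and recover after being blacklisted by the MDS; with
+	 * "no" (the default) recovery requires manual intervention.
+	 */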
+	case Opt_recover_session:
+		if (!strncmp(argstr[0].from, "no",
+			     argstr[0].to - argstr[0].from)) {
+			fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
+		} else if (!strncmp(argstr[0].from, "clean",
+				    argstr[0].to - argstr[0].from)) {
+			fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
+		} else {
+			return -EINVAL;
+		}
+		break;
 	case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
 		kfree(fsopt->fscache_uniq);
 		fsopt->fscache_uniq = kstrndup(argstr[0].from,
 					       argstr[0].to-argstr[0].from,
@@ -256,7 +277,10 @@
 			return -ENOMEM;
 		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
 		break;
-		/* misc */
+#else
+		pr_err("fscache support is disabled\n");
+		return -EINVAL;
+#endif
 	case Opt_wsize:
 		if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
 			return -EINVAL;
@@ -282,6 +306,11 @@
 			return -EINVAL;
 		fsopt->caps_wanted_delay_max = intval;
 		break;
+	case Opt_caps_max:
+		if (intval < 0)
+			return -EINVAL;
+		fsopt->caps_max = intval;
+		break;
 	case Opt_readdir_max_entries:
 		if (intval < 1)
 			return -EINVAL;
@@ -328,10 +357,15 @@
 		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
 		break;
 	case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
 		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
 		kfree(fsopt->fscache_uniq);
 		fsopt->fscache_uniq = NULL;
 		break;
+#else
+		pr_err("fscache support is disabled\n");
+		return -EINVAL;
+#endif
 	case Opt_nofscache:
 		fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
 		kfree(fsopt->fscache_uniq);
@@ -355,6 +389,12 @@
 	case Opt_noquotadf:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
 		break;
+	case Opt_copyfrom:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+		break;
+	case Opt_nocopyfrom:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+		break;
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	case Opt_acl:
 		fsopt->sb_flags |= SB_POSIXACL;
@@ -520,7 +560,7 @@
 	seq_putc(m, ',');
 	pos = m->count;
 
-	ret = ceph_print_client_options(m, fsc->client);
+	ret = ceph_print_client_options(m, fsc->client, false);
 	if (ret)
 		return ret;
 
@@ -553,8 +593,15 @@
 		seq_puts(m, ",noacl");
 #endif
 
+	if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
+		seq_puts(m, ",copyfrom");
+
 	if (fsopt->mds_namespace)
 		seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
+		seq_show_option(m, "recover_session", "clean");
+
 	if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
 	if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -563,6 +610,8 @@
 		seq_printf(m, ",rasize=%d", fsopt->rasize);
 	if (fsopt->congestion_kb != default_congestion_kb())
 		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_max)
+		seq_printf(m, ",caps_max=%d", fsopt->caps_max);
 	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 		seq_printf(m, ",caps_wanted_delay_min=%d",
 			 fsopt->caps_wanted_delay_min);
@@ -627,7 +676,7 @@
 	opt = NULL; /* fsc->client now owns this */
 
 	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
-	fsc->client->osdc.abort_on_full = true;
+	ceph_set_opt(fsc->client, ABORT_ON_FULL);
 
 	if (!fsopt->mds_namespace) {
 		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
@@ -641,6 +690,7 @@
 
 	fsc->sb = NULL;
 	fsc->mount_state = CEPH_MOUNT_MOUNTING;
+	fsc->filp_gen = 1;
 
 	atomic_long_set(&fsc->writeback_count, 0);
 
@@ -649,15 +699,12 @@
 	 * The number of concurrent works can be high but they don't need
 	 * to be processed in parallel, limit concurrency.
 	 */
-	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
-	if (!fsc->wb_wq)
+	fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
+	if (!fsc->inode_wq)
 		goto fail_client;
-	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
-	if (!fsc->pg_inv_wq)
-		goto fail_wb_wq;
-	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
-	if (!fsc->trunc_wq)
-		goto fail_pg_inv_wq;
+	fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+	if (!fsc->cap_wq)
+		goto fail_inode_wq;
 
 	/* set up mempools */
 	err = -ENOMEM;
@@ -665,19 +712,14 @@
 	size = sizeof (struct page *) * (page_count ? page_count : 1);
 	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 	if (!fsc->wb_pagevec_pool)
-		goto fail_trunc_wq;
-
-	/* caps */
-	fsc->min_caps = fsopt->max_readdir;
+		goto fail_cap_wq;
 
 	return fsc;
 
-fail_trunc_wq:
-	destroy_workqueue(fsc->trunc_wq);
-fail_pg_inv_wq:
-	destroy_workqueue(fsc->pg_inv_wq);
-fail_wb_wq:
-	destroy_workqueue(fsc->wb_wq);
+fail_cap_wq:
+	destroy_workqueue(fsc->cap_wq);
+fail_inode_wq:
+	destroy_workqueue(fsc->inode_wq);
 fail_client:
 	ceph_destroy_client(fsc->client);
 fail:
@@ -690,18 +732,17 @@
 
 static void flush_fs_workqueues(struct ceph_fs_client *fsc)
 {
-	flush_workqueue(fsc->wb_wq);
-	flush_workqueue(fsc->pg_inv_wq);
-	flush_workqueue(fsc->trunc_wq);
+	flush_workqueue(fsc->inode_wq);
+	flush_workqueue(fsc->cap_wq);
 }
 
 static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
 
-	destroy_workqueue(fsc->wb_wq);
-	destroy_workqueue(fsc->pg_inv_wq);
-	destroy_workqueue(fsc->trunc_wq);
+	ceph_mdsc_destroy(fsc);
+	destroy_workqueue(fsc->inode_wq);
+	destroy_workqueue(fsc->cap_wq);
 
 	mempool_destroy(fsc->wb_pagevec_pool);
 
@@ -816,16 +857,24 @@
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
 	ceph_mdsc_force_umount(fsc->mdsc);
-	return;
+	fsc->filp_gen++; // invalidate open files
+}
+
+static int ceph_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	return 0;
 }
 
 static const struct super_operations ceph_super_ops = {
 	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
+	.free_inode	= ceph_free_inode,
 	.write_inode    = ceph_write_inode,
-	.drop_inode	= ceph_drop_inode,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= ceph_evict_inode,
 	.sync_fs        = ceph_sync_fs,
 	.put_super	= ceph_put_super,
+	.remount_fs	= ceph_remount,
 	.show_options   = ceph_show_options,
 	.statfs		= ceph_statfs,
 	.umount_begin   = ceph_umount_begin,
@@ -916,9 +965,7 @@
 			dout("mount opening path %s\n", path);
 		}
 
-		err = ceph_fs_debugfs_init(fsc);
-		if (err < 0)
-			goto out;
+		ceph_fs_debugfs_init(fsc);
 
 		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
@@ -959,7 +1006,9 @@
 	s->s_d_op = &ceph_dentry_ops;
 	s->s_export_op = &ceph_export_ops;
 
-	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
+	s->s_time_gran = 1;
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
 
 	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
 	if (ret != 0)
@@ -1068,7 +1117,6 @@
 	}
 
 	if (ceph_sb_to_client(sb) != fsc) {
-		ceph_mdsc_destroy(fsc);
 		destroy_fs_client(fsc);
 		fsc = ceph_sb_to_client(sb);
 		dout("get_sb got existing client %p\n", fsc);
@@ -1094,7 +1142,6 @@
 	goto out_final;
 
 out:
-	ceph_mdsc_destroy(fsc);
 	destroy_fs_client(fsc);
 out_final:
 	dout("ceph_mount fail %ld\n", PTR_ERR(res));
@@ -1118,8 +1165,6 @@
 
 	ceph_fscache_unregister_fs(fsc);
 
-	ceph_mdsc_destroy(fsc);
-
 	destroy_fs_client(fsc);
 	free_anon_bdev(dev);
 }
@@ -1133,6 +1178,33 @@
 };
 MODULE_ALIAS_FS("ceph");
 
+int ceph_force_reconnect(struct super_block *sb)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	int err = 0;
+
+	ceph_umount_begin(sb);
+
+	/* Make sure all page caches get invalidated;
+	 * see remove_session_caps_cb() */
+	flush_workqueue(fsc->inode_wq);
+
+	/* In case we were blacklisted; this also resets
+	 * all mon/osd connections */
+	ceph_reset_client_addr(fsc->client);
+
+	ceph_osdc_clear_abort_err(&fsc->client->osdc);
+
+	fsc->blacklisted = false;
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
+
+	if (sb->s_root) {
+		err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
+					CEPH_STAT_CAP_INODE, true);
+	}
+	return err;
+}
+
 static int __init init_ceph(void)
 {
 	int ret = init_caches();
@@ -1140,17 +1212,15 @@
 		goto out;
 
 	ceph_flock_init();
-	ceph_xattr_init();
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
-		goto out_xattr;
+		goto out_caches;
 
 	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
 
 	return 0;
 
-out_xattr:
-	ceph_xattr_exit();
+out_caches:
 	destroy_caches();
 out:
 	return ret;
@@ -1160,7 +1230,6 @@
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
-	ceph_xattr_exit();
 	destroy_caches();
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 582e28f..f98d924 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/posix_acl.h>
 #include <linux/refcount.h>
+#include <linux/security.h>
 
 #include <linux/ceph/libceph.h>
 
@@ -31,6 +32,7 @@
 #define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
+#define CEPH_MOUNT_OPT_CLEANRECOVER    (1<<1) /* auto reconnect (clean mode) after being blacklisted */
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
@@ -40,8 +42,11 @@
 #define CEPH_MOUNT_OPT_NOPOOLPERM      (1<<11) /* no pool permission check */
 #define CEPH_MOUNT_OPT_MOUNTWAIT       (1<<12) /* mount waits if no mds is up */
 #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
 
-#define CEPH_MOUNT_OPT_DEFAULT    CEPH_MOUNT_OPT_DCACHE
+#define CEPH_MOUNT_OPT_DEFAULT			\
+	(CEPH_MOUNT_OPT_DCACHE |		\
+	 CEPH_MOUNT_OPT_NOCOPYFROM)
 
 #define ceph_set_mount_opt(fsc, opt) \
 	(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -76,6 +81,7 @@
 	int rasize;           /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
 	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int caps_max;
 	int max_readdir;       /* max readdir result (entries) */
 	int max_readdir_bytes; /* max readdir result (bytes) */
 
@@ -97,18 +103,22 @@
 	struct ceph_client *client;
 
 	unsigned long mount_state;
-	int min_caps;                  /* min caps i added */
+
+	unsigned long last_auto_reconnect;
+	bool blacklisted;
+
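+	/* bumped on forced umount so open files can detect that the
+	 * mount they were opened on has been invalidated */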
+	u32 filp_gen;
 	loff_t max_file_size;
 
 	struct ceph_mds_client *mdsc;
 
 	/* writeback */
 	mempool_t *wb_pagevec_pool;
-	struct workqueue_struct *wb_wq;
-	struct workqueue_struct *pg_inv_wq;
-	struct workqueue_struct *trunc_wq;
 	atomic_long_t writeback_count;
 
+	struct workqueue_struct *inode_wq;
+	struct workqueue_struct *cap_wq;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_dentry_lru, *debugfs_caps;
 	struct dentry *debugfs_congestion_kb;
@@ -194,7 +204,8 @@
 	u64 xattr_version;
 
 	u64 size;
-	struct timespec64 mtime, atime, ctime;
+	u64 change_attr;
+	struct timespec64 mtime, atime, ctime, btime;
 	u64 time_warp_seq;
 	u64 truncate_size;
 	u32 truncate_seq;
@@ -257,17 +268,22 @@
  * Ceph dentry state
  */
 struct ceph_dentry_info {
+	struct dentry *dentry;
 	struct ceph_mds_session *lease_session;
+	struct list_head lease_list;
+	unsigned flags;
 	int lease_shared_gen;
 	u32 lease_gen;
 	u32 lease_seq;
 	unsigned long lease_renew_after, lease_renew_from;
-	struct list_head lru;
-	struct dentry *dentry;
 	unsigned long time;
 	u64 offset;
 };
 
+#define CEPH_DENTRY_REFERENCED		1
+#define CEPH_DENTRY_LEASE_LIST		2
+#define CEPH_DENTRY_SHRINK_LIST		4
+
 struct ceph_inode_xattrs_info {
 	/*
 	 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -315,6 +331,8 @@
 	/* quotas */
 	u64 i_max_bytes, i_max_files;
 
+	s32 i_dir_pin;
+
 	struct rb_root i_fragtree;
 	int i_fragtree_nsplits;
 	struct mutex i_fragtree_mutex;
@@ -367,20 +385,25 @@
 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
-	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	union {
+		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+	};
 	int i_snap_realm_counter; /* snap realm (if caps) */
 	struct list_head i_snap_realm_item;
 	struct list_head i_snap_flush_item;
+	struct timespec64 i_btime;
+	struct timespec64 i_snap_btime;
 
-	struct work_struct i_wb_work;  /* writeback work */
-	struct work_struct i_pg_inv_work;  /* page invalidation work */
-
-	struct work_struct i_vmtruncate_work;
+	struct work_struct i_work;
+	unsigned long  i_work_mask;
 
 #ifdef CONFIG_CEPH_FSCACHE
 	struct fscache_cookie *fscache;
 	u32 i_fscache_gen;
 #endif
+	errseq_t i_meta_err;
+
 	struct inode vfs_inode; /* at end */
 };
 
@@ -485,17 +508,23 @@
 #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
 #define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
 #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH		(1 << 3)  /* do not flush dirty caps */
-#define CEPH_I_POOL_PERM	(1 << 4)  /* pool rd/wr bits are valid */
-#define CEPH_I_POOL_RD		(1 << 5)  /* can read from pool */
-#define CEPH_I_POOL_WR		(1 << 6)  /* can write to pool */
-#define CEPH_I_SEC_INITED	(1 << 7)  /* security initialized */
-#define CEPH_I_CAP_DROPPED	(1 << 8)  /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH	(1 << 9)  /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS	(1 << 10) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE	(1 << 11) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK	(1 << 12) /* have seen file lock errors */
+#define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
+#define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
+#define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
+#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need to flush snaps */
+#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
+#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
 
+/*
+ * Bit numbers for i_work_mask: pending work items on a ceph inode.
+ */
+#define CEPH_I_WORK_WRITEBACK		0 /* writeback */
+#define CEPH_I_WORK_INVALIDATE_PAGES	1 /* invalidate pages */
+#define CEPH_I_WORK_VMTRUNCATE		2 /* vmtruncate */
 
 /*
  * We set the ERROR_WRITE bit when we start seeing write errors on an inode
@@ -526,7 +555,12 @@
 					   long long release_count,
 					   long long ordered_count)
 {
-	smp_mb__before_atomic();
+	/*
+	 * Makes sure operations that setup readdir cache (update page
+	 * cache and i_size) are strongly ordered w.r.t. the following
+	 * atomic64_set() operations.
+	 */
+	smp_mb();
 	atomic64_set(&ci->i_complete_seq[0], release_count);
 	atomic64_set(&ci->i_complete_seq[1], ordered_count);
 }
@@ -584,7 +618,7 @@
 			    struct ceph_inode_frag *pfrag,
 			    int *found);
 
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
 {
 	return (struct ceph_dentry_info *)dentry->d_fsdata;
 }
@@ -653,7 +687,8 @@
 
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+				     struct ceph_mount_options *fsopt);
 extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -676,6 +711,10 @@
 
 	spinlock_t rw_contexts_lock;
 	struct list_head rw_contexts;
+
+	errseq_t meta_err;
+	u32 filp_gen;
+	atomic_t num_locks;
 };
 
 struct ceph_dir_file_info {
@@ -807,7 +846,7 @@
 	 * This allows larger machines to have larger/more transfers.
 	 * Limit the default to 256M
 	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
 	if (congestion_kb > 256*1024)
 		congestion_kb = 256*1024;
 
@@ -815,7 +854,8 @@
 }
 
 
-
+/* super.c */
+extern int ceph_force_reconnect(struct super_block *sb);
 /* snap.c */
 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
 					       u64 ino);
@@ -834,6 +874,14 @@
 				  struct ceph_cap_snap *capsnap);
 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
 
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+						   u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+				struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
 /*
  * a cap_snap is "pending" if it is still awaiting an in-progress
  * sync write (that may/may not still update size, mtime, etc.).
@@ -849,8 +897,8 @@
 extern const struct inode_operations ceph_file_iops;
 
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
-extern void ceph_destroy_inode(struct inode *inode);
-extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
+extern void ceph_free_inode(struct inode *inode);
 
 extern struct inode *ceph_get_inode(struct super_block *sb,
 				    struct ceph_vino vino);
@@ -871,9 +919,9 @@
 extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
 extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 extern void ceph_queue_vmtruncate(struct inode *inode);
-
 extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
+extern void ceph_async_iput(struct inode *inode);
 
 extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 			     int mask, bool force);
@@ -891,12 +939,22 @@
 int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
 ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern void __init ceph_xattr_init(void);
-extern void ceph_xattr_exit(void);
 extern const struct xattr_handler *ceph_xattr_handlers[];
 
+struct ceph_acl_sec_ctx {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	void *default_acl;
+	void *acl;
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+	void *sec_ctx;
+	u32 sec_ctxlen;
+#endif
+	struct ceph_pagelist *pagelist;
+};
+
 #ifdef CONFIG_SECURITY
 extern bool ceph_security_xattr_deadlock(struct inode *in);
 extern bool ceph_security_xattr_wanted(struct inode *in);
@@ -911,21 +969,35 @@
 }
 #endif
 
-/* acl.c */
-struct ceph_acls_info {
-	void *default_acl;
-	void *acl;
-	struct ceph_pagelist *pagelist;
-};
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+				     struct ceph_acl_sec_ctx *ctx);
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+	security_inode_invalidate_secctx(inode);
+}
+#else
+static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+					    struct ceph_acl_sec_ctx *ctx)
+{
+	return 0;
+}
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+}
+#endif
 
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
+
+/* acl.c */
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 
 struct posix_acl *ceph_get_acl(struct inode *, int);
 int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
-		       struct ceph_acls_info *info);
-void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
-void ceph_release_acls_info(struct ceph_acls_info *info);
+		       struct ceph_acl_sec_ctx *as_ctx);
+void ceph_init_inode_acls(struct inode *inode,
+			  struct ceph_acl_sec_ctx *as_ctx);
 
 static inline void ceph_forget_all_cached_acls(struct inode *inode)
 {
@@ -938,15 +1010,12 @@
 #define ceph_set_acl NULL
 
 static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
-				     struct ceph_acls_info *info)
+				     struct ceph_acl_sec_ctx *as_ctx)
 {
 	return 0;
 }
 static inline void ceph_init_inode_acls(struct inode *inode,
-					struct ceph_acls_info *info)
-{
-}
-static inline void ceph_release_acls_info(struct ceph_acls_info *info)
+					struct ceph_acl_sec_ctx *as_ctx)
 {
 }
 static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
@@ -972,11 +1041,11 @@
 			 unsigned cap, unsigned seq, u64 realmino, int flags,
 			 struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void __ceph_remove_caps(struct ceph_inode_info *ci);
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);
 extern int ceph_is_any_caps(struct inode *inode);
 
-extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
 		      int datasync);
@@ -986,7 +1055,6 @@
 				    struct ceph_mds_session *session);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
 					     int mds);
-extern int ceph_get_cap_mds(struct inode *inode);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1005,10 +1073,10 @@
 				      struct inode *dir,
 				      int mds, int drop, int unless);
 
-extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+extern int ceph_get_caps(struct file *filp, int need, int want,
 			 loff_t endoff, int *got, struct page **pinned_page);
-extern int ceph_try_get_caps(struct ceph_inode_info *ci,
-			     int need, int want, int *got);
+extern int ceph_try_get_caps(struct inode *inode,
+			     int need, int want, bool nonblock, int *got);
 
 /* for counting open files by mode */
 extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
@@ -1018,7 +1086,7 @@
 extern const struct address_space_operations ceph_aops;
 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
-extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern int ceph_pool_perm_check(struct inode *inode, int need);
 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 
 /* file.c */
@@ -1046,10 +1114,10 @@
 extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 					 struct dentry *dentry, int err);
 
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
 extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
 extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
 
@@ -1058,6 +1126,7 @@
 
 /* export.c */
 extern const struct export_operations ceph_export_ops;
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);
 
 /* locks.c */
 extern __init void ceph_flock_init(void);
@@ -1073,7 +1142,7 @@
 				  int num_fcntl_locks, int num_flock_locks);
 
 /* debugfs.c */
-extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
 
 /* quota.c */
@@ -1108,5 +1177,6 @@
 						loff_t newlen);
 extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
 				     struct kstatfs *buf);
+extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
 
 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5cc8b94..cb18ee6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,6 +8,7 @@
 #include <linux/ceph/decode.h>
 
 #include <linux/xattr.h>
+#include <linux/security.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/slab.h>
 
@@ -17,26 +18,10 @@
 static int __remove_xattr(struct ceph_inode_info *ci,
 			  struct ceph_inode_xattr *xattr);
 
-static const struct xattr_handler ceph_other_xattr_handler;
-
-/*
- * List of handlers for synthetic system.* attributes. Other
- * attributes are handled directly.
- */
-const struct xattr_handler *ceph_xattr_handlers[] = {
-#ifdef CONFIG_CEPH_FS_POSIX_ACL
-	&posix_acl_access_xattr_handler,
-	&posix_acl_default_xattr_handler,
-#endif
-	&ceph_other_xattr_handler,
-	NULL,
-};
-
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
-	       !strncmp(name, XATTR_SECURITY_PREFIX,
-			XATTR_SECURITY_PREFIX_LEN) ||
+	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
@@ -48,8 +33,8 @@
 struct ceph_vxattr {
 	char *name;
 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
-	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
-			      size_t size);
+	ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+			       size_t size);
 	bool (*exists_cb)(struct ceph_inode_info *ci);
 	unsigned int flags;
 };
@@ -68,8 +53,8 @@
 		rcu_dereference_raw(fl->pool_ns) != NULL);
 }
 
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+				    size_t size)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -79,7 +64,7 @@
 	const char *ns_field = " pool_namespace=";
 	char buf[128];
 	size_t len, total_len = 0;
-	int ret;
+	ssize_t ret;
 
 	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
 
@@ -96,18 +81,15 @@
 		len = snprintf(buf, sizeof(buf),
 		"stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
 		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
-	        ci->i_layout.object_size, (unsigned long long)pool);
+		ci->i_layout.object_size, pool);
 		total_len = len;
 	}
 
 	if (pool_ns)
 		total_len += strlen(ns_field) + pool_ns->len;
 
-	if (!size) {
-		ret = total_len;
-	} else if (total_len > size) {
-		ret = -ERANGE;
-	} else {
+	ret = total_len;
+	if (size >= total_len) {
 		memcpy(val, buf, len);
 		ret = len;
 		if (pool_name) {
@@ -128,28 +110,55 @@
 	return ret;
 }
 
-static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
-					       char *val, size_t size)
-{
-	return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
-}
-
-static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
-						char *val, size_t size)
-{
-	return snprintf(val, size, "%u", ci->i_layout.stripe_count);
-}
-
-static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
-					       char *val, size_t size)
-{
-	return snprintf(val, size, "%u", ci->i_layout.object_size);
-}
-
-static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
-					char *val, size_t size)
+/*
+ * The convention with strings in xattrs is that they should not be NULL
+ * terminated, since we're returning the length with them. snprintf always
+ * NULL terminates however, so call it on a temporary buffer and then memcpy
+ * the result into place.
+ */
+static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
 {
 	int ret;
+	va_list args;
+	char buf[96]; /* NB: reevaluate size if new vxattrs are added */
+
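+	/*
+	 * A zero size is a length-only probe (getxattr with a NULL buffer):
+	 * pass 0 to vsnprintf() so nothing is written and only the would-be
+	 * length is computed.
+	 */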
+	va_start(args, fmt);
+	ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args);
+	va_end(args);
+
+	/* Sanity check */
+	if (size && ret + 1 > sizeof(buf)) {
+		WARN_ONCE(true, "Returned length too big (%d)", ret);
+		return -E2BIG;
+	}
+
+	if (ret <= size)
+		memcpy(val, buf, ret);
+	return ret;
+}
+
+static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit);
+}
+
+static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+						 char *val, size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count);
+}
+
+static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size);
+}
+
+static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+					 char *val, size_t size)
+{
+	ssize_t ret;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	s64 pool = ci->i_layout.pool_id;
@@ -157,21 +166,27 @@
 
 	down_read(&osdc->lock);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
-	if (pool_name)
-		ret = snprintf(val, size, "%s", pool_name);
-	else
-		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+	if (pool_name) {
+		ret = strlen(pool_name);
+		if (ret <= size)
+			memcpy(val, pool_name, ret);
+	} else {
+		ret = ceph_fmt_xattr(val, size, "%lld", pool);
+	}
 	up_read(&osdc->lock);
 	return ret;
 }
 
-static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
-						  char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+						   char *val, size_t size)
 {
-	int ret = 0;
+	ssize_t ret = 0;
 	struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+
 	if (ns) {
-		ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+		ret = ns->len;
+		if (ret <= size)
+			memcpy(val, ns->str, ret);
 		ceph_put_string(ns);
 	}
 	return ret;
@@ -179,57 +194,69 @@
 
 /* directories */
 
-static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
-					size_t size)
-{
-	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
-				      size_t size)
-{
-	return snprintf(val, size, "%lld", ci->i_files);
-}
-
-static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
-					size_t size)
-{
-	return snprintf(val, size, "%lld", ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
-	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
-	return snprintf(val, size, "%lld", ci->i_rfiles);
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_files);
 }
 
-static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
-	return snprintf(val, size, "%lld", ci->i_rsubdirs);
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
-				       size_t size)
+static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+					  size_t size)
 {
-	return snprintf(val, size, "%lld", ci->i_rbytes);
+	return ceph_fmt_xattr(val, size, "%lld",
+				ci->i_rfiles + ci->i_rsubdirs);
 }
 
-static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
-				       size_t size)
+static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
-	return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec,
-			ci->i_rctime.tv_nsec);
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles);
+}
+
+static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+					  size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes);
+}
+
+static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
+				ci->i_rctime.tv_nsec);
+}
+
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
+{
+	return ci->i_dir_pin != -ENODATA;
+}
+
+static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+				     size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin);
 }
 
 /* quotas */
-
 static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
 {
 	bool ret = false;
@@ -243,23 +270,36 @@
 	return ret;
 }
 
-static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
-				  size_t size)
+static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
+				   size_t size)
 {
-	return snprintf(val, size, "max_bytes=%llu max_files=%llu",
-			ci->i_max_bytes, ci->i_max_files);
+	return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu",
+				ci->i_max_bytes, ci->i_max_files);
 }
 
-static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
-					    char *val, size_t size)
+static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
+					     char *val, size_t size)
 {
-	return snprintf(val, size, "%llu", ci->i_max_bytes);
+	return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes);
 }
 
-static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
-					    char *val, size_t size)
+static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
+					     char *val, size_t size)
 {
-	return snprintf(val, size, "%llu", ci->i_max_files);
+	return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files);
+}
+
+/* snapshots */
+static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci)
+{
+	return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0);
+}
+
+static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec,
+				ci->i_snap_btime.tv_nsec);
 }
 
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
@@ -315,6 +355,13 @@
 	XATTR_RSTAT_FIELD(dir, rbytes),
 	XATTR_RSTAT_FIELD(dir, rctime),
 	{
+		.name = "ceph.dir.pin",
+		.name_size = sizeof("ceph.dir.pin"),
+		.getxattr_cb = ceph_vxattrcb_dir_pin,
+		.exists_cb = ceph_vxattrcb_dir_pin_exists,
+		.flags = VXATTR_FLAG_HIDDEN,
+	},
+	{
 		.name = "ceph.quota",
 		.name_size = sizeof("ceph.quota"),
 		.getxattr_cb = ceph_vxattrcb_quota,
@@ -323,9 +370,15 @@
 	},
 	XATTR_QUOTA_FIELD(quota, max_bytes),
 	XATTR_QUOTA_FIELD(quota, max_files),
+	{
+		.name = "ceph.snap.btime",
+		.name_size = sizeof("ceph.snap.btime"),
+		.getxattr_cb = ceph_vxattrcb_snap_btime,
+		.exists_cb = ceph_vxattrcb_snap_btime_exists,
+		.flags = VXATTR_FLAG_READONLY,
+	},
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
-static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
@@ -342,9 +395,15 @@
 	XATTR_LAYOUT_FIELD(file, layout, object_size),
 	XATTR_LAYOUT_FIELD(file, layout, pool),
 	XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
+	{
+		.name = "ceph.snap.btime",
+		.name_size = sizeof("ceph.snap.btime"),
+		.getxattr_cb = ceph_vxattrcb_snap_btime,
+		.exists_cb = ceph_vxattrcb_snap_btime_exists,
+		.flags = VXATTR_FLAG_READONLY,
+	},
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
-static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 
 static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
@@ -355,47 +414,6 @@
 	return NULL;
 }
 
-static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
-	if (vxattrs == ceph_dir_vxattrs)
-		return ceph_dir_vxattrs_name_size;
-	if (vxattrs == ceph_file_vxattrs)
-		return ceph_file_vxattrs_name_size;
-	BUG_ON(vxattrs);
-	return 0;
-}
-
-/*
- * Compute the aggregate size (including terminating '\0') of all
- * virtual extended attribute names in the given vxattr table.
- */
-static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
-	struct ceph_vxattr *vxattr;
-	size_t size = 0;
-
-	for (vxattr = vxattrs; vxattr->name; vxattr++) {
-		if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
-			size += vxattr->name_size;
-	}
-
-	return size;
-}
-
-/* Routines called at initialization and exit time */
-
-void __init ceph_xattr_init(void)
-{
-	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
-	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
-}
-
-void ceph_xattr_exit(void)
-{
-	ceph_dir_vxattrs_name_size = 0;
-	ceph_file_vxattrs_name_size = 0;
-}
-
 static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 						const char *name)
 {
@@ -505,8 +523,8 @@
 		dout("__set_xattr_val p=%p\n", p);
 	}
 
-	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
-	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+	dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
+	     ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
 
 	return 0;
 }
@@ -737,12 +755,15 @@
 
 /*
  * If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
+ * and swap into place.  It returns the old i_xattrs.blob (or NULL) so
+ * that the caller can free it after dropping i_ceph_lock, which is
+ * likely to be held here.
  */
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 {
 	struct rb_node *p;
 	struct ceph_inode_xattr *xattr = NULL;
+	struct ceph_buffer *old_blob = NULL;
 	void *dest;
 
 	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
@@ -773,12 +794,14 @@
 			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
 
 		if (ci->i_xattrs.blob)
-			ceph_buffer_put(ci->i_xattrs.blob);
+			old_blob = ci->i_xattrs.blob;
 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = NULL;
 		ci->i_xattrs.dirty = false;
 		ci->i_xattrs.version++;
 	}
+
+	return old_blob;
 }
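
The new return value implements a common locking pattern: swap the buffer pointer while the spinlock is held, hand the displaced buffer back, and let the caller drop the last reference only after unlocking, since releasing it under i_ceph_lock is unsafe. A minimal user-space sketch of the same idea; the names (cache, cache_lock, replace_buf) are invented, and a pthread mutex stands in for the spinlock:

```c
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static char *cache;

/* Swap in the new buffer and return the displaced one; the caller
 * frees it after dropping the lock. */
static char *replace_buf_locked(char *newbuf)
{
	char *old = cache;

	cache = newbuf;
	return old;
}

void replace_buf(char *newbuf)
{
	char *old;

	pthread_mutex_lock(&cache_lock);
	old = replace_buf_locked(newbuf);
	pthread_mutex_unlock(&cache_lock);
	free(old);	/* deallocation happens outside the critical section */
}
```
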
 
 static inline int __get_request_mask(struct inode *in) {
@@ -805,7 +828,7 @@
 	struct ceph_inode_xattr *xattr;
 	struct ceph_vxattr *vxattr = NULL;
 	int req_mask;
-	int err;
+	ssize_t err;
 
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
@@ -817,8 +840,11 @@
 		if (err)
 			return err;
 		err = -ENODATA;
-		if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
+		if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
 			err = vxattr->getxattr_cb(ci, value, size);
+			if (size && size < err)
+				err = -ERANGE;
+		}
 		return err;
 	}
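
The added size check gives vxattrs the standard getxattr(2) semantics: a zero-size call reports the buffer length required, and a buffer that turns out too small fails with -ERANGE. A user-space sketch of a caller relying on that contract (the mount path and attribute name are placeholders):

```c
#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/cephfs/dir";
	char *buf;
	ssize_t len;

	len = getxattr(path, "ceph.dir.rbytes", NULL, 0); /* probe length */
	if (len < 0) {
		perror("getxattr probe");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	/* If the value grew in the meantime, this fails with errno == ERANGE. */
	len = getxattr(path, "ceph.dir.rbytes", buf, len);
	if (len < 0)
		perror("getxattr fetch");
	else
		printf("%.*s\n", (int)len, buf);
	free(buf);
	return 0;
}
```
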
 
@@ -867,7 +893,8 @@
 	memcpy(value, xattr->val, xattr->val_len);
 
 	if (current->journal_info &&
-	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
 		ci->i_ceph_flags |= CEPH_I_SEC_INITED;
 out:
 	spin_unlock(&ci->i_ceph_lock);
@@ -878,12 +905,9 @@
 {
 	struct inode *inode = d_inode(dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
-	u32 vir_namelen = 0;
+	bool len_only = (size == 0);
 	u32 namelen;
 	int err;
-	u32 len;
-	int i;
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -901,38 +925,18 @@
 	err = __build_xattrs(inode);
 	if (err < 0)
 		goto out;
-	/*
-	 * Start with virtual dir xattr names (if any) (including
-	 * terminating '\0' characters for each).
-	 */
-	vir_namelen = ceph_vxattrs_name_size(vxattrs);
 
-	/* adding 1 byte per each variable due to the null termination */
+	/* add 1 byte for each xattr due to the null termination */
 	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
-	err = -ERANGE;
-	if (size && vir_namelen + namelen > size)
-		goto out;
-
-	err = namelen + vir_namelen;
-	if (size == 0)
-		goto out;
-
-	names = __copy_xattr_names(ci, names);
-
-	/* virtual xattr names, too */
-	err = namelen;
-	if (vxattrs) {
-		for (i = 0; vxattrs[i].name; i++) {
-			if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) &&
-			    !(vxattrs[i].exists_cb &&
-			      !vxattrs[i].exists_cb(ci))) {
-				len = sprintf(names, "%s", vxattrs[i].name);
-				names += len + 1;
-				err += len + 1;
-			}
+	if (!len_only) {
+		if (namelen > size) {
+			err = -ERANGE;
+			goto out;
 		}
+		names = __copy_xattr_names(ci, names);
+		size -= namelen;
 	}
-
+	err = namelen;
 out:
 	spin_unlock(&ci->i_ceph_lock);
 	return err;
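
The rewritten listxattr keeps the matching user-visible contract: size == 0 returns the total length needed, otherwise the names are copied back as consecutive NUL-terminated strings or the call fails with -ERANGE. A user-space sketch (the path is a placeholder):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/cephfs/file";
	char *names, *p;
	ssize_t len;

	len = listxattr(path, NULL, 0);	/* length-only probe */
	if (len <= 0)
		return len < 0;
	names = malloc(len);
	if (!names)
		return 1;
	len = listxattr(path, names, len);
	if (len < 0) {
		perror("listxattr");
		free(names);
		return 1;
	}
	for (p = names; p < names + len; p += strlen(p) + 1)
		printf("%s\n", p);	/* one NUL-terminated name per entry */
	free(names);
	return 0;
}
```
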
@@ -951,11 +955,10 @@
 
 	if (size > 0) {
 		/* copy value into pagelist */
-		pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+		pagelist = ceph_pagelist_alloc(GFP_NOFS);
 		if (!pagelist)
 			return -ENOMEM;
 
-		ceph_pagelist_init(pagelist);
 		err = ceph_pagelist_append(pagelist, value, size);
 		if (err)
 			goto out;
@@ -1011,6 +1014,7 @@
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_cap_flush *prealloc_cf = NULL;
+	struct ceph_buffer *old_blob = NULL;
 	int issued;
 	int err;
 	int dirty = 0;
@@ -1084,13 +1088,15 @@
 		struct ceph_buffer *blob;
 
 		spin_unlock(&ci->i_ceph_lock);
-		dout(" preaallocating new blob size=%d\n", required_blob_size);
+		ceph_buffer_put(old_blob); /* Shouldn't be required */
+		dout(" pre-allocating new blob size=%d\n", required_blob_size);
 		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
 		if (!blob)
 			goto do_sync_unlocked;
 		spin_lock(&ci->i_ceph_lock);
+		/* prealloc_blob can't be released while holding i_ceph_lock */
 		if (ci->i_xattrs.prealloc_blob)
-			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+			old_blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = blob;
 		goto retry;
 	}
@@ -1106,6 +1112,7 @@
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
+	ceph_buffer_put(old_blob);
 	if (lock_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
 	if (dirty)
@@ -1189,4 +1196,101 @@
 	spin_unlock(&ci->i_ceph_lock);
 	return ret;
 }
+
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+			   struct ceph_acl_sec_ctx *as_ctx)
+{
+	struct ceph_pagelist *pagelist = as_ctx->pagelist;
+	const char *name;
+	size_t name_len;
+	int err;
+
+	err = security_dentry_init_security(dentry, mode, &dentry->d_name,
+					    &as_ctx->sec_ctx,
+					    &as_ctx->sec_ctxlen);
+	if (err < 0) {
+		WARN_ON_ONCE(err != -EOPNOTSUPP);
+		err = 0; /* do nothing */
+		goto out;
+	}
+
+	err = -ENOMEM;
+	if (!pagelist) {
+		pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+		if (!pagelist)
+			goto out;
+		err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+		if (err)
+			goto out;
+		ceph_pagelist_encode_32(pagelist, 1);
+	}
+
+	/*
+	 * FIXME: Make security_dentry_init_security() generic. Currently
+	 * it only supports a single security module, and only SELinux
+	 * provides a dentry_init_security hook.
+	 */
+	name = XATTR_NAME_SELINUX;
+	name_len = strlen(name);
+	err = ceph_pagelist_reserve(pagelist,
+				    4 * 2 + name_len + as_ctx->sec_ctxlen);
+	if (err)
+		goto out;
+
+	if (as_ctx->pagelist) {
+		/* update count of KV pairs */
+		BUG_ON(pagelist->length <= sizeof(__le32));
+		if (list_is_singular(&pagelist->head)) {
+			le32_add_cpu((__le32*)pagelist->mapped_tail, 1);
+		} else {
+			struct page *page = list_first_entry(&pagelist->head,
+							     struct page, lru);
+			void *addr = kmap_atomic(page);
+			le32_add_cpu((__le32*)addr, 1);
+			kunmap_atomic(addr);
+		}
+	} else {
+		as_ctx->pagelist = pagelist;
+	}
+
+	ceph_pagelist_encode_32(pagelist, name_len);
+	ceph_pagelist_append(pagelist, name, name_len);
+
+	ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen);
+	ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+
+	err = 0;
+out:
+	if (pagelist && !as_ctx->pagelist)
+		ceph_pagelist_release(pagelist);
+	return err;
+}
+#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
+#endif /* CONFIG_SECURITY */
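
From the code above, the pagelist appears to carry a little-endian 32-bit pair count followed by length-prefixed name and value byte strings for each xattr. A standalone sketch of that inferred layout; this illustrates the encoding as read from the function, not an authoritative statement of the MDS wire format:

```c
#include <stdint.h>
#include <string.h>

static size_t put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
	return 4;
}

/* Encode a single name/value pair preceded by a pair count of 1, as
 * the function above does for the security label. Caller provides a
 * sufficiently large buffer. */
size_t encode_xattr_kv(uint8_t *buf, const char *name,
		       const void *val, uint32_t val_len)
{
	size_t off = 0;
	uint32_t name_len = (uint32_t)strlen(name);

	off += put_le32(buf + off, 1);		/* number of KV pairs */
	off += put_le32(buf + off, name_len);
	memcpy(buf + off, name, name_len); off += name_len;
	off += put_le32(buf + off, val_len);
	memcpy(buf + off, val, val_len); off += val_len;
	return off;
}
```
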
+
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
+{
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	posix_acl_release(as_ctx->acl);
+	posix_acl_release(as_ctx->default_acl);
 #endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+	security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+#endif
+	if (as_ctx->pagelist)
+		ceph_pagelist_release(as_ctx->pagelist);
+}
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&ceph_other_xattr_handler,
+	NULL,
+};
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a279c58..00dfe17 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -88,22 +88,31 @@
 /*
  * Register a single major with a specified minor range.
  *
- * If major == 0 this functions will dynamically allocate a major and return
- * its number.
+ * If major == 0 this function will dynamically allocate an unused major.
+ * If major > 0 this function will attempt to reserve the range of minors
+ * with the given major.
  *
- * If major > 0 this function will attempt to reserve the passed range of
- * minors and will return zero on success.
- *
- * Returns a -ve errno on failure.
  */
 static struct char_device_struct *
 __register_chrdev_region(unsigned int major, unsigned int baseminor,
 			   int minorct, const char *name)
 {
-	struct char_device_struct *cd, **cp;
-	int ret = 0;
+	struct char_device_struct *cd, *curr, *prev = NULL;
+	int ret;
 	int i;
 
+	if (major >= CHRDEV_MAJOR_MAX) {
+		pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n",
+		       name, major, CHRDEV_MAJOR_MAX-1);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (minorct > MINORMASK + 1 - baseminor) {
+		pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n",
+			name, baseminor, baseminor + minorct - 1, 0, MINORMASK);
+		return ERR_PTR(-EINVAL);
+	}
+
 	cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL);
 	if (cd == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -120,10 +129,21 @@
 		major = ret;
 	}
 
-	if (major >= CHRDEV_MAJOR_MAX) {
-		pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n",
-		       name, major, CHRDEV_MAJOR_MAX-1);
-		ret = -EINVAL;
+	ret = -EBUSY;
+	i = major_to_index(major);
+	for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) {
+		if (curr->major < major)
+			continue;
+
+		if (curr->major > major)
+			break;
+
+		if (curr->baseminor + curr->minorct <= baseminor)
+			continue;
+
+		if (curr->baseminor >= baseminor + minorct)
+			break;
+
 		goto out;
 	}
 
@@ -132,37 +152,14 @@
 	cd->minorct = minorct;
 	strlcpy(cd->name, name, sizeof(cd->name));
 
-	i = major_to_index(major);
-
-	for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
-		if ((*cp)->major > major ||
-		    ((*cp)->major == major &&
-		     (((*cp)->baseminor >= baseminor) ||
-		      ((*cp)->baseminor + (*cp)->minorct > baseminor))))
-			break;
-
-	/* Check for overlapping minor ranges.  */
-	if (*cp && (*cp)->major == major) {
-		int old_min = (*cp)->baseminor;
-		int old_max = (*cp)->baseminor + (*cp)->minorct - 1;
-		int new_min = baseminor;
-		int new_max = baseminor + minorct - 1;
-
-		/* New driver overlaps from the left.  */
-		if (new_max >= old_min && new_max <= old_max) {
-			ret = -EBUSY;
-			goto out;
-		}
-
-		/* New driver overlaps from the right.  */
-		if (new_min <= old_max && new_min >= old_min) {
-			ret = -EBUSY;
-			goto out;
-		}
+	if (!prev) {
+		cd->next = curr;
+		chrdevs[i] = cd;
+	} else {
+		cd->next = prev->next;
+		prev->next = cd;
 	}
 
-	cd->next = *cp;
-	*cp = cd;
 	mutex_unlock(&chrdevs_lock);
 	return cd;
 out:
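
The new loop replaces the old two-branch overlap test with a scan of a list kept sorted by (major, baseminor): a candidate is rejected as soon as an existing entry with the same major neither ends at or before the new base nor starts at or after its end. That is the standard half-open interval disjointness test, sketched standalone below:

```c
#include <assert.h>
#include <stdbool.h>

/* Two half-open minor ranges [a, a+alen) and [b, b+blen) overlap
 * unless one ends at or before the other begins. */
static bool ranges_overlap(unsigned a, unsigned alen,
			   unsigned b, unsigned blen)
{
	return !(a + alen <= b || b + blen <= a);
}

int main(void)
{
	assert(!ranges_overlap(0, 16, 16, 16));	/* adjacent, no overlap */
	assert(ranges_overlap(0, 17, 16, 16));	/* one minor shared */
	assert(ranges_overlap(8, 4, 0, 256));	/* fully contained */
	return 0;
}
```
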
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 85dadb9..22cf04f 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CIFS
 	tristate "SMB3 and CIFS support (advanced network filesystem)"
 	depends on INET
@@ -9,12 +10,14 @@
 	select CRYPTO_SHA512
 	select CRYPTO_CMAC
 	select CRYPTO_HMAC
-	select CRYPTO_ARC4
+	select CRYPTO_LIB_ARC4
 	select CRYPTO_AEAD2
 	select CRYPTO_CCM
+	select CRYPTO_GCM
 	select CRYPTO_ECB
 	select CRYPTO_AES
-	select CRYPTO_DES
+	select CRYPTO_LIB_DES
+	select KEYS
 	help
 	  This is the client VFS module for the SMB3 family of NAS protocols,
 	  (including support for the most recent, most secure dialect SMB3.1.1)
@@ -108,7 +111,7 @@
 
 config CIFS_UPCALL
 	bool "Kerberos/SPNEGO advanced session setup"
-	depends on CIFS && KEYS
+	depends on CIFS
 	select DNS_RESOLVER
 	help
 	  Enables an upcall mechanism for CIFS which accesses userspace helper
@@ -117,25 +120,25 @@
 	  secure Kerberos authentication is required). If unsure, say Y.
 
 config CIFS_XATTR
-        bool "CIFS extended attributes"
-        depends on CIFS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page for details).
-          CIFS maps the name of extended attributes beginning with the user
-          namespace prefix to SMB/CIFS EAs.  EAs are stored on Windows
-          servers without the user namespace prefix, but their names are
-          seen by Linux cifs clients prefaced by the user namespace prefix.
-          The system namespace (used by some filesystems to store ACLs) is
-          not supported at this time.
+	bool "CIFS extended attributes"
+	depends on CIFS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page for details).
+	  CIFS maps the name of extended attributes beginning with the user
+	  namespace prefix to SMB/CIFS EAs.  EAs are stored on Windows
+	  servers without the user namespace prefix, but their names are
+	  seen by Linux cifs clients prefaced by the user namespace prefix.
+	  The system namespace (used by some filesystems to store ACLs) is
+	  not supported at this time.
 
-          If unsure, say Y.
+	  If unsure, say Y.
 
 config CIFS_POSIX
-        bool "CIFS POSIX Extensions"
-        depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
-        help
-          Enabling this option will cause the cifs client to attempt to
+	bool "CIFS POSIX Extensions"
+	depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
+	help
+	  Enabling this option will cause the cifs client to attempt to
 	  negotiate a newer dialect with servers, such as Samba 3.0.5
 	  or later, that optionally can handle more POSIX like (rather
 	  than Windows like) file behavior.  It also enables
@@ -143,75 +146,76 @@
 	  (such as Samba 3.10 and later) which can negotiate
 	  CIFS POSIX ACL support.  If unsure, say N.
 
-config CIFS_ACL
-	  bool "Provide CIFS ACL support"
-	  depends on CIFS_XATTR && KEYS
-	  help
-	    Allows fetching CIFS/NTFS ACL from the server.  The DACL blob
-	    is handed over to the application/caller.  See the man
-	    page for getcifsacl for more information.  If unsure, say Y.
-
 config CIFS_DEBUG
 	bool "Enable CIFS debugging routines"
 	default y
 	depends on CIFS
 	help
-	   Enabling this option adds helpful debugging messages to
-	   the cifs code which increases the size of the cifs module.
-	   If unsure, say Y.
+	  Enabling this option adds helpful debugging messages to
+	  the cifs code which increases the size of the cifs module.
+	  If unsure, say Y.
+
 config CIFS_DEBUG2
 	bool "Enable additional CIFS debugging routines"
 	depends on CIFS_DEBUG
 	help
-	   Enabling this option adds a few more debugging routines
-	   to the cifs code which slightly increases the size of
-	   the cifs module and can cause additional logging of debug
-	   messages in some error paths, slowing performance. This
-	   option can be turned off unless you are debugging
-	   cifs problems.  If unsure, say N.
+	  Enabling this option adds a few more debugging routines
+	  to the cifs code which slightly increases the size of
+	  the cifs module and can cause additional logging of debug
+	  messages in some error paths, slowing performance. This
+	  option can be turned off unless you are debugging
+	  cifs problems.  If unsure, say N.
 
 config CIFS_DEBUG_DUMP_KEYS
 	bool "Dump encryption keys for offline decryption (Unsafe)"
 	depends on CIFS_DEBUG
 	help
-	   Enabling this will dump the encryption and decryption keys
-	   used to communicate on an encrypted share connection on the
-	   console. This allows Wireshark to decrypt and dissect
-	   encrypted network captures. Enable this carefully.
-	   If unsure, say N.
+	  Enabling this will dump the encryption and decryption keys
+	  used to communicate on an encrypted share connection on the
+	  console. This allows Wireshark to decrypt and dissect
+	  encrypted network captures. Enable this carefully.
+	  If unsure, say N.
 
 config CIFS_DFS_UPCALL
-	  bool "DFS feature support"
-	  depends on CIFS && KEYS
-	  select DNS_RESOLVER
-	  help
-	    Distributed File System (DFS) support is used to access shares
-	    transparently in an enterprise name space, even if the share
-	    moves to a different server.  This feature also enables
-	    an upcall mechanism for CIFS which contacts userspace helper
-	    utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say Y.
+	bool "DFS feature support"
+	depends on CIFS
+	select DNS_RESOLVER
+	help
+	  Distributed File System (DFS) support is used to access shares
+	  transparently in an enterprise name space, even if the share
+	  moves to a different server.  This feature also enables
+	  an upcall mechanism for CIFS which contacts userspace helper
+	  utilities to provide server name resolution (host names to
+	  IP addresses) which is needed in order to reconnect to
+	  servers if their addresses change or for implicit mounts of
+	  DFS junction points. If unsure, say Y.
 
 config CIFS_NFSD_EXPORT
-	  bool "Allow nfsd to export CIFS file system"
-	  depends on CIFS && BROKEN
-	  help
-	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+	bool "Allow nfsd to export CIFS file system"
+	depends on CIFS && BROKEN
+	help
+	  Allows NFS server to export a CIFS mounted share (nfsd over cifs)
 
 config CIFS_SMB_DIRECT
-	bool "SMB Direct support (Experimental)"
+	bool "SMB Direct support"
 	depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
 	help
-	  Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
+	  Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
 	  SMB Direct allows transferring SMB packets over RDMA. If unsure,
 	  say N.
 
 config CIFS_FSCACHE
-	  bool "Provide CIFS client caching support"
-	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
-	  help
-	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
-	    to be cached locally on disk through the general filesystem cache
-	    manager. If unsure, say N.
+	bool "Provide CIFS client caching support"
+	depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+	help
+	  Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+	  to be cached locally on disk through the general filesystem cache
+	  manager. If unsure, say N.
 
+config CIFS_ROOT
+	bool "SMB root file system (Experimental)"
+	depends on CIFS=y && IP_PNP
+	help
+	  Enables root file system support over SMB protocol.
+
+	  Most people say N here.
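
For context, a sketch of how a CIFS_ROOT kernel might be booted; the server address and share below are placeholders, and the exact cifsroot= option syntax should be confirmed against the cifsroot documentation shipped with this feature:

```
root=/dev/cifs rw ip=dhcp cifsroot=//192.168.1.1/root,username=guest
```
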
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 8581799..51bae93 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,15 +10,16 @@
 	  cifs_unicode.o nterr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
-	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o
+	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_XATTR) += xattr.o
-cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
 
 cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
 
 cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
+
+cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 3d19595..689162e 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -1,20 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 * The ASN.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
  * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
  *
  * Copyright (c) 2000 RP Internet (www.rpi.net.au).
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
 #include <linux/module.h>
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 0657679..0b4eee3 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   fs/cifs_debug.c
  *
  *   Copyright (C) International Business Machines  Corp., 2000,2005
  *
  *   Modified by Steve French (sfrench@us.ibm.com)
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
 #include <linux/string.h>
@@ -30,6 +17,9 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
 #ifdef CONFIG_CIFS_SMB_DIRECT
 #include "smbdirect.h"
 #endif
@@ -112,7 +102,12 @@
 		seq_puts(m, " type: CDROM ");
 	else
 		seq_printf(m, " type: %d ", dev_type);
-	if (tcon->seal)
+
+	seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number);
+
+	if ((tcon->seal) ||
+	    (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+	    (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
 		seq_printf(m, " Encrypted");
 	if (tcon->nocase)
 		seq_printf(m, " nocase");
@@ -132,7 +127,7 @@
 	struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
 	struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
 
-	seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed);
+	seq_printf(m, "\tSpeed: %zu bps\n", iface->speed);
 	seq_puts(m, "\t\tCapabilities: ");
 	if (iface->rdma_capable)
 		seq_puts(m, "rdma ");
@@ -145,6 +140,58 @@
 		seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
 }
 
+static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
+{
+	struct list_head *stmp, *tmp, *tmp1, *tmp2;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct cifsFileInfo *cfile;
+
+	seq_puts(m, "# Version:1\n");
+	seq_puts(m, "# Format:\n");
+	seq_puts(m, "# <tree id> <persistent fid> <flags> <count> <pid> <uid>");
+#ifdef CONFIG_CIFS_DEBUG2
+	seq_printf(m, " <filename> <mid>\n");
+#else
+	seq_printf(m, " <filename>\n");
+#endif /* CIFS_DEBUG2 */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(stmp, &cifs_tcp_ses_list) {
+		server = list_entry(stmp, struct TCP_Server_Info,
+				    tcp_ses_list);
+		list_for_each(tmp, &server->smb_ses_list) {
+			ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+			list_for_each(tmp1, &ses->tcon_list) {
+				tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+				spin_lock(&tcon->open_file_lock);
+				list_for_each(tmp2, &tcon->openFileList) {
+					cfile = list_entry(tmp2, struct cifsFileInfo,
+						     tlist);
+					seq_printf(m,
+						"0x%x 0x%llx 0x%x %d %d %d %s",
+						tcon->tid,
+						cfile->fid.persistent_fid,
+						cfile->f_flags,
+						cfile->count,
+						cfile->pid,
+						from_kuid(&init_user_ns, cfile->uid),
+						cfile->dentry->d_name.name);
+#ifdef CONFIG_CIFS_DEBUG2
+					seq_printf(m, " 0x%llx\n", cfile->fid.mid);
+#else
+					seq_printf(m, "\n");
+#endif /* CIFS_DEBUG2 */
+				}
+				spin_unlock(&tcon->open_file_lock);
+			}
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+	seq_putc(m, '\n');
+	return 0;
+}
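
To make the format lines above concrete, a hypothetical /proc/fs/cifs/open_files listing with CONFIG_CIFS_DEBUG2 disabled might read as follows (every value is invented):

```
# Version:1
# Format:
# <tree id> <persistent fid> <flags> <count> <pid> <uid> <filename>
0x5 0x4000000000061 0x8000 1 1234 1000 test.txt
```
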
+
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
 	struct list_head *tmp1, *tmp2, *tmp3;
@@ -193,10 +240,9 @@
 #ifdef CONFIG_CIFS_XATTR
 	seq_printf(m, ",XATTR");
 #endif
-#ifdef CONFIG_CIFS_ACL
 	seq_printf(m, ",ACL");
-#endif
 	seq_putc(m, '\n');
+	seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
 	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
 	seq_printf(m, "Servers:");
 
@@ -251,12 +297,10 @@
 			atomic_read(&server->smbd_conn->send_credits),
 			atomic_read(&server->smbd_conn->receive_credits),
 			server->smbd_conn->receive_credit_target);
-		seq_printf(m, "\nPending send_pending: %x send_payload_pending:"
-			" %x smbd_send_pending: %x smbd_recv_pending: %x",
+		seq_printf(m, "\nPending send_pending: %x "
+			"send_payload_pending: %x",
 			atomic_read(&server->smbd_conn->send_pending),
-			atomic_read(&server->smbd_conn->send_payload_pending),
-			server->smbd_conn->smbd_send_pending,
-			server->smbd_conn->smbd_recv_pending);
+			atomic_read(&server->smbd_conn->send_payload_pending));
 		seq_printf(m, "\nReceive buffers count_receive_queue: %x "
 			"count_empty_packet_queue: %x",
 			server->smbd_conn->count_receive_queue,
@@ -273,6 +317,12 @@
 #endif
 		seq_printf(m, "\nNumber of credits: %d Dialect 0x%x",
 			server->credits,  server->dialect);
+		if (server->compress_algorithm == SMB3_COMPRESS_LZNT1)
+			seq_printf(m, " COMPRESS_LZNT1");
+		else if (server->compress_algorithm == SMB3_COMPRESS_LZ77)
+			seq_printf(m, " COMPRESS_LZ77");
+		else if (server->compress_algorithm == SMB3_COMPRESS_LZ77_HUFF)
+			seq_printf(m, " COMPRESS_LZ77_HUFF");
 		if (server->sign)
 			seq_printf(m, " signed");
 		if (server->posix_ext_supported)
@@ -285,7 +335,7 @@
 			if ((ses->serverDomain == NULL) ||
 				(ses->serverOS == NULL) ||
 				(ses->serverNOS == NULL)) {
-				seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t",
+				seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
 					i, ses->serverName, ses->ses_count,
 					ses->capabilities, ses->status);
 				if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
@@ -296,16 +346,18 @@
 				seq_printf(m,
 				    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
 				    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
-				    " session status: %d\t",
+				    " session status: %d ",
 				i, ses->serverName, ses->serverDomain,
 				ses->ses_count, ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->status);
 			}
 			if (server->rdma)
 				seq_printf(m, "RDMA\n\t");
-			seq_printf(m, "TCP status: %d\n\tLocal Users To "
+			seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To "
 				   "Server: %d SecMode: 0x%x Req On Wire: %d",
-				   server->tcpStatus, server->srv_count,
+				   server->tcpStatus,
+				   server->reconnect_instance,
+				   server->srv_count,
 				   server->sec_mode, in_flight(server));
 
 #ifdef CONFIG_CIFS_STATS2
@@ -313,6 +365,12 @@
 				atomic_read(&server->in_send),
 				atomic_read(&server->num_waiters));
 #endif
+			/* dump session id helpful for use with network trace */
+			seq_printf(m, " SessionId: 0x%llx", ses->Suid);
+			if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+				seq_puts(m, " encrypted");
+			if (ses->sign)
+				seq_puts(m, " signed");
 
 			seq_puts(m, "\n\tShares:");
 			j = 0;
@@ -352,7 +410,7 @@
 				seq_printf(m, "\n\tServer interfaces: %zu\n",
 					   ses->iface_count);
 			for (j = 0; j < ses->iface_count; j++) {
-				seq_printf(m, "\t%d)\n", j);
+				seq_printf(m, "\t%d)", j);
 				cifs_dump_iface(m, &ses->iface_list[j]);
 			}
 			spin_unlock(&ses->iface_lock);
@@ -394,9 +452,15 @@
 		list_for_each(tmp1, &cifs_tcp_ses_list) {
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
+			server->max_in_flight = 0;
 #ifdef CONFIG_CIFS_STATS2
-			for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++)
+			for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
+				atomic_set(&server->num_cmds[i], 0);
 				atomic_set(&server->smb2slowcmd[i], 0);
+				server->time_per_cmd[i] = 0;
+				server->slowest_cmd[i] = 0;
+				server->fastest_cmd[i] = 0;
+			}
 #endif /* CONFIG_CIFS_STATS2 */
 			list_for_each(tmp2, &server->smb_ses_list) {
 				ses = list_entry(tmp2, struct cifs_ses,
@@ -463,10 +527,21 @@
 	list_for_each(tmp1, &cifs_tcp_ses_list) {
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
+		seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight);
 #ifdef CONFIG_CIFS_STATS2
+		seq_puts(m, "\nTotal time spent processing by command. Time ");
+		seq_printf(m, "units are jiffies (%d per second)\n", HZ);
+		seq_puts(m, "  SMB3 CMD\tNumber\tTotal Time\tFastest\tSlowest\n");
+		seq_puts(m, "  --------\t------\t----------\t-------\t-------\n");
+		for (j = 0; j < NUMBER_OF_SMB2_COMMANDS; j++)
+			seq_printf(m, "  %d\t\t%d\t%llu\t\t%u\t%u\n", j,
+				atomic_read(&server->num_cmds[j]),
+				server->time_per_cmd[j],
+				server->fastest_cmd[j],
+				server->slowest_cmd[j]);
 		for (j = 0; j < NUMBER_OF_SMB2_COMMANDS; j++)
 			if (atomic_read(&server->smb2slowcmd[j]))
-				seq_printf(m, "%d slow responses from %s for command %d\n",
+				seq_printf(m, "  %d slow responses from %s for command %d\n",
 					atomic_read(&server->smb2slowcmd[j]),
 					server->hostname, j);
 #endif /* STATS2 */
@@ -563,6 +638,9 @@
 	proc_create_single("DebugData", 0, proc_fs_cifs,
 			cifs_debug_data_proc_show);
 
+	proc_create_single("open_files", 0400, proc_fs_cifs,
+			cifs_debug_files_proc_show);
+
 	proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
 	proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
 	proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
@@ -572,6 +650,11 @@
 		    &cifs_security_flags_proc_fops);
 	proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
 		    &cifs_lookup_cache_proc_fops);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops);
+#endif
+
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
 		&cifs_rdma_readwrite_threshold_proc_fops);
@@ -599,12 +682,17 @@
 		return;
 
 	remove_proc_entry("DebugData", proc_fs_cifs);
+	remove_proc_entry("open_files", proc_fs_cifs);
 	remove_proc_entry("cifsFYI", proc_fs_cifs);
 	remove_proc_entry("traceSMB", proc_fs_cifs);
 	remove_proc_entry("Stats", proc_fs_cifs);
 	remove_proc_entry("SecurityFlags", proc_fs_cifs);
 	remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
 	remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	remove_proc_entry("dfscache", proc_fs_cifs);
+#endif
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
 	remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index f4f3f08..100b005 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *
  *   Copyright (c) International Business Machines  Corp., 2000,2002
  *   Modified by Steve French (sfrench@us.ibm.com)
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
 */
 
 #ifndef _H_CIFS_DEBUG
@@ -47,6 +33,29 @@
  */
 #ifdef CONFIG_CIFS_DEBUG
 
+
+/*
+ * When adding tracepoints and debug messages we have various choices.
+ * Some considerations:
+ *
+ * Use cifs_dbg(VFS, ...) for things we always want logged and the user to see.
+ *     cifs_info(...) slightly less important; admins can filter via loglevel > 6
+ *     cifs_dbg(FYI, ...) minor debugging messages, off by default
+ *     trace_smb3_*  ftrace functions are preferred for complex debug messages
+ *                 intended for developers or experienced admins, off by default
+ */
+
+/* Information level messages, minor events */
+#define cifs_info_func(ratefunc, fmt, ...)			\
+do {								\
+	pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); 	\
+} while (0)
+
+#define cifs_info(fmt, ...)					\
+do { 								\
+	cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); 	\
+} while (0)
+
 /* information message: e.g., configuration, major event */
 #define cifs_dbg_func(ratefunc, type, fmt, ...)			\
 do {								\
@@ -71,6 +80,60 @@
 			type, fmt, ##__VA_ARGS__);	\
 } while (0)
 
+#define cifs_server_dbg_func(ratefunc, type, fmt, ...)		\
+do {								\
+	const char *sn = "";					\
+	if (server && server->hostname)				\
+		sn = server->hostname;				\
+	if ((type) & FYI && cifsFYI & CIFS_INFO) {		\
+		pr_debug_ ## ratefunc("%s: \\\\%s "	fmt,	\
+			__FILE__, sn, ##__VA_ARGS__);		\
+	} else if ((type) & VFS) {				\
+		pr_err_ ## ratefunc("CIFS VFS: \\\\%s " fmt,	\
+			sn, ##__VA_ARGS__);			\
+	} else if ((type) & NOISY && (NOISY != 0)) {		\
+		pr_debug_ ## ratefunc("\\\\%s " fmt,		\
+			sn, ##__VA_ARGS__);			\
+	}							\
+} while (0)
+
+#define cifs_server_dbg(type, fmt, ...)			\
+do {							\
+	if ((type) & ONCE)				\
+		cifs_server_dbg_func(once,		\
+			type, fmt, ##__VA_ARGS__);	\
+	else						\
+		cifs_server_dbg_func(ratelimited,	\
+			type, fmt, ##__VA_ARGS__);	\
+} while (0)
+
+#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...)		\
+do {								\
+	const char *tn = "";					\
+	if (tcon && tcon->treeName)				\
+		tn = tcon->treeName;				\
+	if ((type) & FYI && cifsFYI & CIFS_INFO) {		\
+		pr_debug_ ## ratefunc("%s: %s "	fmt,		\
+			__FILE__, tn, ##__VA_ARGS__);		\
+	} else if ((type) & VFS) {				\
+		pr_err_ ## ratefunc("CIFS VFS: %s " fmt,	\
+			tn, ##__VA_ARGS__);			\
+	} else if ((type) & NOISY && (NOISY != 0)) {		\
+		pr_debug_ ## ratefunc("%s " fmt,		\
+			tn, ##__VA_ARGS__);			\
+	}							\
+} while (0)
+
+#define cifs_tcon_dbg(type, fmt, ...)			\
+do {							\
+	if ((type) & ONCE)				\
+		cifs_tcon_dbg_func(once,		\
+			type, fmt, ##__VA_ARGS__);	\
+	else						\
+		cifs_tcon_dbg_func(ratelimited,	\
+			type, fmt, ##__VA_ARGS__);	\
+} while (0)
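
Both new macros deliberately reference a local variable (server or tcon) that must be in scope at the call site instead of taking it as a parameter. A standalone illustration of that scope-capture pattern, with an invented server_dbg macro and struct:

```c
#include <stdio.h>

struct srv { const char *hostname; };

/* The macro body names "server", so every call site must have a local
 * variable of that name, exactly like cifs_server_dbg() above. */
#define server_dbg(fmt, ...) \
	fprintf(stderr, "\\\\%s " fmt, server->hostname, ##__VA_ARGS__)

int main(void)
{
	struct srv s = { "fileserver" };
	struct srv *server = &s;	/* required by the macro */

	server_dbg("reconnect failed rc=%d\n", -112);
	return 0;
}
```
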
+
 /*
  *	debug OFF
  *	---------
@@ -81,6 +144,24 @@
 	if (0)								\
 		pr_debug(fmt, ##__VA_ARGS__);				\
 } while (0)
+
+#define cifs_server_dbg(type, fmt, ...)					\
+do {									\
+	if (0)								\
+		pr_debug("\\\\%s " fmt,					\
+			 server->hostname, ##__VA_ARGS__);		\
+} while (0)
+
+#define cifs_tcon_dbg(type, fmt, ...)					\
+do {									\
+	if (0)								\
+		pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__);	\
+} while (0)
+
+#define cifs_info(fmt, ...)						\
+do {									\
+	pr_info("CIFS: "fmt, ##__VA_ARGS__);				\
+} while (0)
 #endif
 
 #endif				/* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6b61df1..41957b8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Contains the CIFS DFS referral mounting routines used for handling
  *   traversal via DFS junction point
@@ -6,10 +7,6 @@
  *   Copyright (C) International Business Machines  Corp., 2008
  *   Author(s): Igor Mammedov (niallain@gmail.com)
  *		Steve French (sfrench@us.ibm.com)
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation; either version
- *   2 of the License, or (at your option) any later version.
  */
 
 #include <linux/dcache.h>
@@ -25,6 +22,7 @@
 #include "dns_resolve.h"
 #include "cifs_debug.h"
 #include "cifs_unicode.h"
+#include "dfs_cache.h"
 
 static LIST_HEAD(cifs_dfs_automount_list);
 
@@ -126,7 +124,7 @@
  * @sb_mountdata:	parent/root DFS mount options (template)
  * @fullpath:		full path in UNC format
  * @ref:		server's referral
- * @devname:		pointer for saving device name
+ * @devname:		optional pointer for saving device name
  *
  * creates mount options for submount based on template options sb_mountdata
  * and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
@@ -140,6 +138,7 @@
 				   char **devname)
 {
 	int rc;
+	char *name;
 	char *mountdata = NULL;
 	const char *prepath = NULL;
 	int md_len;
@@ -158,17 +157,17 @@
 			prepath++;
 	}
 
-	*devname = cifs_build_devname(ref->node_name, prepath);
-	if (IS_ERR(*devname)) {
-		rc = PTR_ERR(*devname);
-		*devname = NULL;
+	name = cifs_build_devname(ref->node_name, prepath);
+	if (IS_ERR(name)) {
+		rc = PTR_ERR(name);
+		name = NULL;
 		goto compose_mount_options_err;
 	}
 
-	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+	rc = dns_resolve_server_name_to_ip(name, &srvIP);
 	if (rc < 0) {
 		cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
-			 __func__, *devname, rc);
+			 __func__, name, rc);
 		goto compose_mount_options_err;
 	}
 
@@ -224,6 +223,9 @@
 	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
 
+	if (devname)
+		*devname = name;
+
 	/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
 	/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
 
@@ -234,8 +236,7 @@
 compose_mount_options_err:
 	kfree(mountdata);
 	mountdata = ERR_PTR(rc);
-	kfree(*devname);
-	*devname = NULL;
+	kfree(name);
 	goto compose_mount_options_out;
 }
 
@@ -251,29 +252,39 @@
 {
 	struct vfsmount *mnt;
 	char *mountdata;
-	char *devname = NULL;
+	char *devname;
+
+	/*
+	 * Always pass down the DFS full path to smb3_do_mount() so we
+	 * can use it later for failover.
+	 */
+	devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+	if (!devname)
+		return ERR_PTR(-ENOMEM);
+
+	convert_delimiter(devname, '/');
 
 	/* strip first '\' from fullpath */
 	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
-			fullpath + 1, ref, &devname);
-
-	if (IS_ERR(mountdata))
+					       fullpath + 1, ref, NULL);
+	if (IS_ERR(mountdata)) {
+		kfree(devname);
 		return (struct vfsmount *)mountdata;
+	}
 
 	mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
 	kfree(mountdata);
 	kfree(devname);
 	return mnt;
-
 }
 
 static void dump_referral(const struct dfs_info3_param *ref)
 {
 	cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
 	cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
-	cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+	cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
 		 ref->flags, ref->server_type);
-	cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+	cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
 		 ref->ref_flag, ref->path_consumed);
 }
 
@@ -282,16 +293,15 @@
  */
 static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 {
-	struct dfs_info3_param *referrals = NULL;
-	unsigned int num_referrals = 0;
+	struct dfs_info3_param referral = {0};
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_ses *ses;
-	char *full_path;
+	struct cifs_tcon *tcon;
+	char *full_path, *root_path;
 	unsigned int xid;
-	int i;
+	int len;
 	int rc;
 	struct vfsmount *mnt;
-	struct tcon_link *tlink;
 
 	cifs_dbg(FYI, "in %s\n", __func__);
 	BUG_ON(IS_ROOT(mntpt));
@@ -304,54 +314,80 @@
 	 */
 	mnt = ERR_PTR(-ENOMEM);
 
+	cifs_sb = CIFS_SB(mntpt->d_sb);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
+		mnt = ERR_PTR(-EREMOTE);
+		goto cdda_exit;
+	}
+
 	/* always use tree name prefix */
 	full_path = build_path_from_dentry_optional_prefix(mntpt, true);
 	if (full_path == NULL)
 		goto cdda_exit;
 
-	cifs_sb = CIFS_SB(mntpt->d_sb);
-	tlink = cifs_sb_tlink(cifs_sb);
-	if (IS_ERR(tlink)) {
-		mnt = ERR_CAST(tlink);
+	cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
+
+	if (!cifs_sb_master_tlink(cifs_sb)) {
+		cifs_dbg(FYI, "%s: master tlink is NULL\n", __func__);
 		goto free_full_path;
 	}
-	ses = tlink_tcon(tlink)->ses;
 
-	xid = get_xid();
-	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
-		&num_referrals, &referrals,
-		cifs_remap(cifs_sb));
-	free_xid(xid);
-
-	cifs_put_tlink(tlink);
-
-	mnt = ERR_PTR(-ENOENT);
-	for (i = 0; i < num_referrals; i++) {
-		int len;
-		dump_referral(referrals + i);
-		/* connect to a node */
-		len = strlen(referrals[i].node_name);
-		if (len < 2) {
-			cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
-				 __func__, referrals[i].node_name);
-			mnt = ERR_PTR(-EINVAL);
-			break;
-		}
-		mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
-				full_path, referrals + i);
-		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
-			 __func__, referrals[i].node_name, mnt);
-		if (!IS_ERR(mnt))
-			goto success;
+	tcon = cifs_sb_master_tcon(cifs_sb);
+	if (!tcon) {
+		cifs_dbg(FYI, "%s: master tcon is NULL\n", __func__);
+		goto free_full_path;
 	}
 
-	/* no valid submounts were found; return error from get_dfs_path() by
-	 * preference */
-	if (rc != 0)
-		mnt = ERR_PTR(rc);
+	root_path = kstrdup(tcon->treeName, GFP_KERNEL);
+	if (!root_path) {
+		mnt = ERR_PTR(-ENOMEM);
+		goto free_full_path;
+	}
+	cifs_dbg(FYI, "%s: root path: %s\n", __func__, root_path);
 
-success:
-	free_dfs_info_array(referrals, num_referrals);
+	ses = tcon->ses;
+	xid = get_xid();
+
+	/*
+	 * If DFS root has been expired, then unconditionally fetch it again to
+	 * refresh DFS referral cache.
+	 */
+	rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+			    root_path + 1, NULL, NULL);
+	if (!rc) {
+		rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+				    cifs_remap(cifs_sb), full_path + 1,
+				    &referral, NULL);
+	}
+
+	free_xid(xid);
+
+	if (rc) {
+		mnt = ERR_PTR(rc);
+		goto free_root_path;
+	}
+
+	dump_referral(&referral);
+
+	len = strlen(referral.node_name);
+	if (len < 2) {
+		cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+			 __func__, referral.node_name);
+		mnt = ERR_PTR(-EINVAL);
+		goto free_dfs_ref;
+	}
+	/*
+	 * cifs_mount() will retry every available node server in case
+	 * of failures.
+	 */
+	mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral);
+	cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__,
+		 referral.node_name, mnt);
+
+free_dfs_ref:
+	free_dfs_info_param(&referral);
+free_root_path:
+	kfree(root_path);
 free_full_path:
 	kfree(full_path);
 cdda_exit:
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9731d0d..6e7c442 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -51,12 +51,17 @@
 					      */
 #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
 #define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
+#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */
+#define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */
+#define CIFS_MOUNT_RO_CACHE	0x20000000  /* assumes share will not change */
+#define CIFS_MOUNT_RW_CACHE	0x40000000  /* assumes only client accessing */
 
 struct cifs_sb_info {
 	struct rb_root tlink_tree;
 	spinlock_t tlink_tree_lock;
 	struct tcon_link *master_tlink;
 	struct nls_table *local_nls;
+	unsigned int bsize;
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned long actimeo; /* attribute cache timeout (jiffies) */
@@ -71,6 +76,20 @@
 	char   *mountdata; /* options received at mount time or via DFS refs */
 	struct delayed_work prune_tlinks;
 	struct rcu_head rcu;
+
+	/* only used when CIFS_MOUNT_USE_PREFIX_PATH is set */
 	char *prepath;
+
+	/*
+	 * Path initially provided by the mount call. We might connect
+	 * to something different via DFS but we want to keep it to do
+	 * failover properly.
+	 */
+	char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
+	/*
+	 * Indicate whether serverino option was turned off later
+	 * (cifs_autodisable_serverino) in order to match new mounts.
+	 */
+	bool mnt_cifs_serverino_autodisabled;
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 57ff075..0f0dc1c 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,8 +43,32 @@
 	/*	snapshots[]; */
 } __packed;
 
+/* query_info flags */
+#define PASSTHRU_QUERY_INFO	0x00000000
+#define PASSTHRU_FSCTL		0x00000001
+#define PASSTHRU_SET_INFO	0x00000002
+struct smb_query_info {
+	__u32   info_type;
+	__u32   file_info_class;
+	__u32   additional_information;
+	__u32   flags;
+	__u32	input_buffer_length;
+	__u32	output_buffer_length;
+	/* char buffer[]; */
+} __packed;
+
+struct smb3_key_debug_info {
+	__u64	Suid;
+	__u16	cipher_type;
+	__u8	auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+	__u8	smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+	__u8	smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+} __packed;
+
 #define CIFS_IOCTL_MAGIC	0xCF
 #define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)
 #define CIFS_IOC_SET_INTEGRITY  _IO(CIFS_IOCTL_MAGIC, 4)
 #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
 #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
+#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
+#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
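
A hypothetical user-space caller of the new CIFS_DUMP_KEY ioctl, assuming the definitions from this header are available to the build (the include path is a placeholder) and with error handling abbreviated; the file argument must live on a CIFS mount:

```c
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "cifs_ioctl.h"	/* struct smb3_key_debug_info, CIFS_DUMP_KEY */

int main(int argc, char **argv)
{
	struct smb3_key_debug_info kd = { 0 };
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, CIFS_DUMP_KEY, &kd) < 0) {
		perror("CIFS_DUMP_KEY");
		return 1;
	}
	printf("session id 0x%llx cipher type 0x%x\n",
	       (unsigned long long)kd.Suid, kd.cipher_type);
	close(fd);
	return 0;
}
```
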
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index a2b2355..498777d 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   fs/cifs/cifs_unicode.c
  *
  *   Copyright (c) International Business Machines  Corp., 2000,2009
  *   Modified by Steve French (sfrench@us.ibm.com)
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
 #include <linux/slab.h>
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 8360b74..80b3d84 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * cifs_unicode:  Unicode kernel case support
  *
@@ -7,28 +8,12 @@
  *
  *   Copyright (c) International Business Machines  Corp., 2000,2009
  *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
  * Notes:
  *     These APIs are based on the C library functions.  The semantics
  *     should match the C functions but with expanded size operands.
  *
  *     The upper/lower functions are based on a table created by mkupr.
  *     This is a compressed table of upper and lower case conversion.
- *
  */
 #ifndef _CIFS_UNICODE_H
 #define _CIFS_UNICODE_H
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 0ac7c5a..7b272fc 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (c) International Business Machines  Corp., 2000,2002
  *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
  * uniupr.h - Unicode compressed case ranges
- *
 */
 
 #ifndef UNIUPR_NOUPPER
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1d377b7..f842944 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -701,10 +701,9 @@
 }
 #endif
 
-
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
-		       struct cifs_fattr *fattr)
+		       struct cifs_fattr *fattr, bool mode_from_special_sid)
 {
 	int i;
 	int num_aces = 0;
@@ -757,22 +756,34 @@
 #ifdef CONFIG_CIFS_DEBUG2
 			dump_ace(ppace[i], end_of_acl);
 #endif
-			if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
+			if (mode_from_special_sid &&
+			    (compare_sids(&(ppace[i]->sid),
+					  &sid_unix_NFS_mode) == 0)) {
+				/*
+				 * Full permissions are:
+				 * 07777 = S_ISUID | S_ISGID | S_ISVTX |
+				 *         S_IRWXU | S_IRWXG | S_IRWXO
+				 */
+				fattr->cf_mode &= ~07777;
+				fattr->cf_mode |=
+					le32_to_cpu(ppace[i]->sid.sub_auth[2]);
+				break;
+			} else if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &user_mask);
-			if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
+			else if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &group_mask);
-			if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
+			else if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &other_mask);
-			if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
+			else if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
@@ -795,22 +806,49 @@
 
 
 static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
-			struct cifs_sid *pgrpsid, __u64 nmode)
+			struct cifs_sid *pgrpsid, __u64 nmode, bool modefromsid)
 {
 	u16 size = 0;
+	u32 num_aces = 0;
 	struct cifs_acl *pnndacl;
 
 	pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
 
+	if (modefromsid) {
+		struct cifs_ace *pntace =
+			(struct cifs_ace *)((char *)pnndacl + size);
+		int i;
+
+		pntace->type = ACCESS_ALLOWED;
+		pntace->flags = 0x0;
+		pntace->access_req = 0;
+		pntace->sid.num_subauth = 3;
+		pntace->sid.revision = 1;
+		for (i = 0; i < NUM_AUTHS; i++)
+			pntace->sid.authority[i] =
+				sid_unix_NFS_mode.authority[i];
+		pntace->sid.sub_auth[0] = sid_unix_NFS_mode.sub_auth[0];
+		pntace->sid.sub_auth[1] = sid_unix_NFS_mode.sub_auth[1];
+		pntace->sid.sub_auth[2] = cpu_to_le32(nmode & 07777);
+
+		/* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */
+		pntace->size = cpu_to_le16(28);
+		size += 28;
+		num_aces++;
+	}
+
 	size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
 					pownersid, nmode, S_IRWXU);
+	num_aces++;
 	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
 					pgrpsid, nmode, S_IRWXG);
+	num_aces++;
 	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
 					 &sid_everyone, nmode, S_IRWXO);
+	num_aces++;
 
+	pndacl->num_aces = cpu_to_le32(num_aces);
 	pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
-	pndacl->num_aces = cpu_to_le32(3);
 
 	return 0;
 }
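
With modefromsid, the mode bits travel verbatim in the third sub-authority of the special NFS SID: set_chmod_dacl() stores mode & 07777 there, and parse_dacl() masks those bits back into cf_mode. A standalone round-trip sketch; the endianness conversions the kernel performs (cpu_to_le32/le32_to_cpu) are omitted for brevity:

```c
#include <assert.h>
#include <stdint.h>

struct mini_sid { uint32_t sub_auth[3]; };	/* only the relevant field */

static void encode_mode(struct mini_sid *sid, uint32_t mode)
{
	sid->sub_auth[2] = mode & 07777;	/* setuid/setgid/sticky + rwx */
}

static uint32_t decode_mode(const struct mini_sid *sid, uint32_t cf_mode)
{
	return (cf_mode & ~07777u) | sid->sub_auth[2];
}

int main(void)
{
	struct mini_sid sid;

	encode_mode(&sid, 0100644);		/* regular file, mode 0644 */
	assert(decode_mode(&sid, 0100000) == 0100644);
	return 0;
}
```
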
@@ -851,7 +889,8 @@
 
 /* Convert CIFS ACL to POSIX form */
 static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
-		struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
+		struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr,
+		bool get_mode_from_special_sid)
 {
 	int rc = 0;
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
@@ -900,7 +939,7 @@
 
 	if (dacloffset)
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
-			   group_sid_ptr, fattr);
+			   group_sid_ptr, fattr, get_mode_from_special_sid);
 	else
 		cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
 
@@ -909,7 +948,8 @@
 
 /* Convert permission bits from mode to equivalent CIFS ACL */
 static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
-	__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
+	__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid,
+	bool mode_from_sid, int *aclflag)
 {
 	int rc = 0;
 	__u32 dacloffset;
@@ -934,7 +974,7 @@
 		ndacl_ptr->num_aces = 0;
 
 		rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
-					nmode);
+				    nmode, mode_from_sid);
 		sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
 		/* copy sec desc control portion & owner and group sids */
 		copy_sec_desc(pntsd, pnntsd, sidsoffset);
@@ -1128,8 +1168,8 @@
 /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
 int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
-		  struct inode *inode, const char *path,
-		  const struct cifs_fid *pfid)
+		  struct inode *inode, bool mode_from_special_sid,
+		  const char *path, const struct cifs_fid *pfid)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	u32 acllen = 0;
@@ -1156,8 +1196,11 @@
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
 		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
+	} else if (mode_from_special_sid) {
+		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true);
 	} else {
-		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
+		/* get approximated mode from ACL */
+		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false);
 		kfree(pntsd);
 		if (rc)
 			cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
@@ -1181,6 +1224,7 @@
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
 	struct smb_version_operations *ops;
+	bool mode_from_sid;
 
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
@@ -1218,8 +1262,13 @@
 		return -ENOMEM;
 	}
 
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
+		mode_from_sid = true;
+	else
+		mode_from_sid = false;
+
 	rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
-				&aclflag);
+			    mode_from_sid, &aclflag);
 
 	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
 
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index dd95a6f..439b99c 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -45,7 +45,7 @@
  */
 #define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
 			      sizeof(struct cifs_acl) + \
-			      (sizeof(struct cifs_ace) * 3))
+			      (sizeof(struct cifs_ace) * 4))
 
 /*
  * Maximum size of a string representation of a SID:
@@ -90,8 +90,39 @@
 	__le32 num_aces;
 } __attribute__((packed));
 
+/* ACE types - see MS-DTYP 2.4.4.1 */
+#define ACCESS_ALLOWED_ACE_TYPE	0x00
+#define ACCESS_DENIED_ACE_TYPE	0x01
+#define SYSTEM_AUDIT_ACE_TYPE	0x02
+#define SYSTEM_ALARM_ACE_TYPE	0x03
+#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
+#define ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
+#define SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
+#define SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
+#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACCESS_DENIED_CALLBACK_ACE_TYPE	0x0A
+#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE  0x0C
+#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE	0x0D
+#define SYSTEM_ALARM_CALLBACK_ACE_TYPE	0x0E /* Reserved */
+#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
+#define SYSTEM_MANDATORY_LABEL_ACE_TYPE	0x11
+#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
+#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
+
+/* ACE flags */
+#define OBJECT_INHERIT_ACE	0x01
+#define CONTAINER_INHERIT_ACE	0x02
+#define NO_PROPAGATE_INHERIT_ACE 0x04
+#define INHERIT_ONLY_ACE	0x08
+#define INHERITED_ACE		0x10
+#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
+#define FAILED_ACCESS_ACE_FLAG	0x80
+
 struct cifs_ace {
-	__u8 type;
+	__u8 type; /* see above and MS-DTYP 2.4.4.1 */
 	__u8 flags;
 	__le16 size;
 	__le32 access_req;
@@ -99,6 +130,54 @@
 } __attribute__((packed));
 
 /*
+ * The current SMB3 form of security descriptor is similar to what was used for
+ * cifs (see above) but some fields are split, and the fields in the struct
+ * below match the names in the spec, MS-DTYP (see sections 2.4.5 and
+ * 2.4.6). Note that "CamelCase" fields are used in this struct in order to
+ * match the MS-DTYP and MS-SMB2 specs which define the wire format.
+ */
+struct smb3_sd {
+	__u8 Revision; /* revision level, MUST be one */
+	__u8 Sbz1; /* only meaningful if 'RM' flag set below */
+	__le16 Control;
+	__le32 OffsetOwner;
+	__le32 OffsetGroup;
+	__le32 OffsetSacl;
+	__le32 OffsetDacl;
+} __packed;
+
+/* Meaning of 'Control' field flags */
+#define ACL_CONTROL_SR	0x0001	/* Self relative */
+#define ACL_CONTROL_RM	0x0002	/* Resource manager control bits */
+#define ACL_CONTROL_PS	0x0004	/* SACL protected from inherits */
+#define ACL_CONTROL_PD	0x0008	/* DACL protected from inherits */
+#define ACL_CONTROL_SI	0x0010	/* SACL Auto-Inherited */
+#define ACL_CONTROL_DI	0x0020	/* DACL Auto-Inherited */
+#define ACL_CONTROL_SC	0x0040	/* SACL computed through inheritance */
+#define ACL_CONTROL_DC	0x0080	/* DACL computed through inheritance */
+#define ACL_CONTROL_SS	0x0100	/* Create server ACL */
+#define ACL_CONTROL_DT	0x0200	/* DACL provided by trusted source */
+#define ACL_CONTROL_SD	0x0400	/* SACL defaulted */
+#define ACL_CONTROL_SP	0x0800	/* SACL is present on object */
+#define ACL_CONTROL_DD	0x1000	/* DACL defaulted */
+#define ACL_CONTROL_DP	0x2000	/* DACL is present on object */
+#define ACL_CONTROL_GD	0x4000	/* Group was defaulted */
+#define ACL_CONTROL_OD	0x8000	/* User was defaulted */
+
+/* Meaning of AclRevision flags */
+#define ACL_REVISION	0x02 /* See section 2.4.4.1 of MS-DTYP */
+#define ACL_REVISION_DS	0x04 /* Additional AceTypes allowed */
+
+struct smb3_acl {
+	u8 AclRevision; /* revision level */
+	u8 Sbz1; /* MBZ */
+	__le16 AclSize;
+	__le16 AceCount;
+	__le16 Sbz2; /* MBZ */
+} __packed;
+
+
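The offsets in smb3_sd above are relative to the start of the descriptor itself (the SR bit in Control marks it self-relative), with the owner SID, group SID, and ACLs packed after the fixed header. A minimal illustrative sketch, assuming the DACL is placed immediately after the header in the same buffer:

	/* Sketch only: an empty self-relative SD with a bare DACL header. */
	static void build_empty_sd(struct smb3_sd *sd, struct smb3_acl *acl)
	{
		sd->Revision = 1;			/* MUST be one */
		sd->Sbz1 = 0;
		sd->Control = cpu_to_le16(ACL_CONTROL_SR | ACL_CONTROL_DP);
		sd->OffsetOwner = 0;			/* no owner SID */
		sd->OffsetGroup = 0;			/* no group SID */
		sd->OffsetSacl = 0;			/* no SACL */
		sd->OffsetDacl = cpu_to_le32(sizeof(struct smb3_sd));

		acl->AclRevision = ACL_REVISION;	/* 0x02 */
		acl->Sbz1 = 0;
		acl->AclSize = cpu_to_le16(sizeof(struct smb3_acl));
		acl->AceCount = cpu_to_le16(0);		/* no ACEs yet */
		acl->Sbz2 = 0;
	}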
+/*
  * Minimum security identifier can be one for system defined Users
  * and Groups such as NULL SID and World or Built-in accounts such
  * as Administrator and Guest and consists of
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 85b31cf..97b7497 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -33,7 +33,8 @@
 #include <linux/ctype.h>
 #include <linux/random.h>
 #include <linux/highmem.h>
-#include <crypto/skcipher.h>
+#include <linux/fips.h>
+#include <crypto/arc4.h>
 #include <crypto/aead.h>
 
 int __cifs_calc_signature(struct smb_rqst *rqst,
@@ -224,7 +225,7 @@
 	if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
 		struct smb_com_lock_req *pSMB =
 			(struct smb_com_lock_req *)cifs_pdu;
-	    if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
+		if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
 			return 0;
 	}
 
@@ -304,12 +305,17 @@
 int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 			char *lnm_session_key)
 {
-	int i;
+	int i, len;
 	int rc;
 	char password_with_pad[CIFS_ENCPWD_SIZE] = {0};
 
-	if (password)
-		strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
+	if (password) {
+		for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
+			if (!password[len])
+				break;
+
+		memcpy(password_with_pad, password, len);
+	}
 
 	if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
 		memcpy(lnm_session_key, password_with_pad,
@@ -767,63 +773,32 @@
 int
 calc_seckey(struct cifs_ses *ses)
 {
-	int rc;
-	struct crypto_skcipher *tfm_arc4;
-	struct scatterlist sgin, sgout;
-	struct skcipher_request *req;
-	unsigned char *sec_key;
+	unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+	struct arc4_ctx *ctx_arc4;
 
-	sec_key = kmalloc(CIFS_SESS_KEY_SIZE, GFP_KERNEL);
-	if (sec_key == NULL)
-		return -ENOMEM;
+	if (fips_enabled)
+		return -ENODEV;
 
 	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
 
-	tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm_arc4)) {
-		rc = PTR_ERR(tfm_arc4);
-		cifs_dbg(VFS, "could not allocate crypto API arc4\n");
-		goto out;
+	ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL);
+	if (!ctx_arc4) {
+		cifs_dbg(VFS, "could not allocate arc4 context\n");
+		return -ENOMEM;
 	}
 
-	rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
-					CIFS_SESS_KEY_SIZE);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set response as a key\n",
-			 __func__);
-		goto out_free_cipher;
-	}
-
-	req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL);
-	if (!req) {
-		rc = -ENOMEM;
-		cifs_dbg(VFS, "could not allocate crypto API arc4 request\n");
-		goto out_free_cipher;
-	}
-
-	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
-	sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
-
-	skcipher_request_set_callback(req, 0, NULL, NULL);
-	skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL);
-
-	rc = crypto_skcipher_encrypt(req);
-	skcipher_request_free(req);
-	if (rc) {
-		cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
-		goto out_free_cipher;
-	}
+	arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
+	arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
+		   CIFS_CPHTXT_SIZE);
 
 	/* make secondary_key/nonce as session key */
 	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
 	/* and make len as that of session key only */
 	ses->auth_key.len = CIFS_SESS_KEY_SIZE;
 
-out_free_cipher:
-	crypto_free_skcipher(tfm_arc4);
-out:
-	kfree(sec_key);
-	return rc;
+	memzero_explicit(sec_key, CIFS_SESS_KEY_SIZE);
+	kzfree(ctx_arc4);
+	return 0;
 }
 
 void
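The rewrite above drops the crypto API skcipher plumbing in favor of the kernel's arc4 library (<crypto/arc4.h>), which keeps its state in a caller-owned struct arc4_ctx, so no allocation, scatterlists, or request objects are needed. A minimal usage sketch (names are illustrative):

	#include <linux/string.h>
	#include <crypto/arc4.h>

	/* Sketch: one-shot RC4 with the library interface. */
	static void arc4_oneshot(const u8 *key, unsigned int keylen,
				 u8 *out, const u8 *in, unsigned int len)
	{
		struct arc4_ctx ctx;

		arc4_setkey(&ctx, key, keylen);		/* schedule the key */
		arc4_crypt(&ctx, out, in, len);		/* out may equal in */
		memzero_explicit(&ctx, sizeof(ctx));	/* scrub key state */
	}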
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index fb32f3d..1a135d1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -52,6 +52,18 @@
 #include "cifs_spnego.h"
 #include "fscache.h"
 #include "smb2pdu.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
+
+/*
+ * DOS dates from 1980/1/1 through 2107/12/31
+ * Protocol specifications indicate the range should be to 119, which
+ * limits maximum year to 2099. But this range has not been checked.
+ */
+#define SMB_DATE_MAX (127<<9 | 12<<5 | 31)
+#define SMB_DATE_MIN (0<<9 | 1<<5 | 1)
+#define SMB_TIME_MAX (23<<11 | 59<<5 | 29)
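These limits pack a DOS date as year-since-1980 (7 bits), month (4 bits), and day (5 bits), and a DOS time as hours, minutes, and 2-second units, so SMB_DATE_MAX corresponds to 2107/12/31. A small sketch of the unpacking the macros assume:

	/* Sketch: unpack a 16-bit DOS date as laid out by SMB_DATE_MIN/MAX. */
	static void dos_date_unpack(u16 date, int *year, int *month, int *day)
	{
		*year  = ((date >> 9) & 0x7f) + 1980;	/* 0..127 => 1980..2107 */
		*month = (date >> 5) & 0x0f;		/* 1..12 */
		*day   = date & 0x1f;			/* 1..31 */
	}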
 
 int cifsFYI = 0;
 bool traceSMB;
@@ -81,6 +93,14 @@
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
 				   "CIFS/SMB1 dialect (N/A for SMB3) "
 				   "Default: 32767 Range: 2 to 32767.");
+#ifdef CONFIG_CIFS_STATS2
+unsigned int slow_rsp_threshold = 1;
+module_param(slow_rsp_threshold, uint, 0644);
+MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
+				   "before logging that a response is delayed. "
+				   "Default: 1 (0 disables the message).");
+#endif /* STATS2 */
+
 module_param(enable_oplocks, bool, 0644);
 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
 
@@ -98,6 +118,7 @@
 extern mempool_t *cifs_mid_poolp;
 
 struct workqueue_struct	*cifsiod_wq;
+struct workqueue_struct	*decrypt_wq;
 struct workqueue_struct	*cifsoplockd_wq;
 __u32 cifs_lock_secret;
 
@@ -131,6 +152,7 @@
 	struct inode *inode;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
+	struct timespec64 ts;
 	int rc = 0;
 
 	cifs_sb = CIFS_SB(sb);
@@ -147,8 +169,34 @@
 	else
 		sb->s_maxbytes = MAX_NON_LFS;
 
-	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
-	sb->s_time_gran = 100;
+	/*
+	 * Some very old servers like DOS and OS/2 used 2 second granularity
+	 * (while all current servers use 100ns granularity - see MS-DTYP)
+	 * but 1 second is the maximum allowed granularity for the VFS
+	 * so for old servers set time granularity to 1 second while for
+	 * everything else (current servers) set it to 100ns.
+	 */
+	if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) &&
+	    ((tcon->ses->capabilities &
+	      tcon->ses->server->vals->cap_nt_find) == 0) &&
+	    !tcon->unix_ext) {
+		sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */
+		ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
+		sb->s_time_min = ts.tv_sec;
+		ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX),
+				    cpu_to_le16(SMB_TIME_MAX), 0);
+		sb->s_time_max = ts.tv_sec;
+	} else {
+		/*
+		 * Almost every server, including all SMB2+, uses DCE TIME
+		 * ie 100 nanosecond units, since 1601.  See MS-DTYP and MS-FSCC
+		 */
+		sb->s_time_gran = 100;
+		ts = cifs_NTtimeToUnix(0);
+		sb->s_time_min = ts.tv_sec;
+		ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
+		sb->s_time_max = ts.tv_sec;
+	}
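cifs_NTtimeToUnix() converts from the wire format SMB2+ uses: 100-nanosecond intervals since 1601-01-01. The arithmetic underneath is essentially the sketch below; NTFS_TIME_OFFSET is the 11644473600-second gap between the 1601 and 1970 epochs (the in-tree helper also handles pre-1970 values, which this sketch simply clamps):

	#include <linux/math64.h>

	#define NTFS_TIME_OFFSET ((u64)(369 * 365 + 89) * 24 * 3600 * 10000000)

	static struct timespec64 nt_to_unix(u64 nt) /* 100ns units since 1601 */
	{
		struct timespec64 ts;
		u32 rem;

		nt = (nt > NTFS_TIME_OFFSET) ? nt - NTFS_TIME_OFFSET : 0;
		ts.tv_sec = div_u64_rem(nt, 10000000, &rem);
		ts.tv_nsec = rem * 100;
		return ts;
	}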
 
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
@@ -292,6 +340,7 @@
 	cifs_inode->uniqueid = 0;
 	cifs_inode->createtime = 0;
 	cifs_inode->epoch = 0;
+	spin_lock_init(&cifs_inode->open_file_lock);
 	generate_random_uuid(cifs_inode->lease_key);
 
 	/*
@@ -304,16 +353,10 @@
 	return &cifs_inode->vfs_inode;
 }
 
-static void cifs_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
-}
-
 static void
-cifs_destroy_inode(struct inode *inode)
+cifs_free_inode(struct inode *inode)
 {
-	call_rcu(&inode->i_rcu, cifs_i_callback);
+	kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
 }
 
 static void
@@ -370,7 +413,7 @@
 		seq_puts(s, "ntlm");
 		break;
 	case Kerberos:
-		seq_puts(s, "krb5");
+		seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns, ses->cred_uid));
 		break;
 	case RawNTLMSSP:
 		seq_puts(s, "ntlmssp");
@@ -394,6 +437,10 @@
 		seq_puts(s, "strict");
 	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
 		seq_puts(s, "none");
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
+		seq_puts(s, "singleclient"); /* assume only one client access */
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)
+		seq_puts(s, "ro"); /* read only caching assumed */
 	else
 		seq_puts(s, "loose");
 }
@@ -427,6 +474,8 @@
 	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
+	if (tcon->no_lease)
+		seq_puts(s, ",nolease");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_puts(s, ",multiuser");
 	else if (tcon->ses->user_name)
@@ -478,6 +527,8 @@
 		seq_puts(s, ",seal");
 	if (tcon->nocase)
 		seq_puts(s, ",nocase");
+	if (tcon->local_lease)
+		seq_puts(s, ",locallease");
 	if (tcon->retry)
 		seq_puts(s, ",hard");
 	else
@@ -492,6 +543,8 @@
 		seq_puts(s, ",unix");
 	else
 		seq_puts(s, ",nounix");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+		seq_puts(s, ",nodfs");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
 		seq_puts(s, ",posixpaths");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -516,6 +569,8 @@
 		seq_puts(s, ",nobrl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
 		seq_puts(s, ",nohandlecache");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
+		seq_puts(s, ",modefromsid");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
 		seq_puts(s, ",cifsacl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
@@ -541,10 +596,20 @@
 
 	seq_printf(s, ",rsize=%u", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+	seq_printf(s, ",bsize=%u", cifs_sb->bsize);
+	if (tcon->ses->server->min_offload)
+		seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
 	seq_printf(s, ",echo_interval=%lu",
 			tcon->ses->server->echo_interval / HZ);
+
+	/* Only display max_credits if it was overridden on mount */
+	if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
+		seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+
 	if (tcon->snapshot_time)
 		seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
+	if (tcon->handle_timeout)
+		seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
 	/* convert actimeo and display it in seconds */
 	seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
 
@@ -614,7 +679,7 @@
 static const struct super_operations cifs_super_ops = {
 	.statfs = cifs_statfs,
 	.alloc_inode = cifs_alloc_inode,
-	.destroy_inode = cifs_destroy_inode,
+	.free_inode = cifs_free_inode,
 	.drop_inode	= cifs_drop_inode,
 	.evict_inode	= cifs_evict_inode,
 /*	.delete_inode	= cifs_delete_inode,  */  /* Do not need above
@@ -707,7 +772,14 @@
 	struct cifs_mnt_data mnt_data;
 	struct dentry *root;
 
-	cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+	/*
+	 * Print the attempted mount operation to the kernel/CIFS log:
+	 * verbosely via cifs_dbg(FYI) when cifsFYI is set, otherwise as
+	 * a single informational line.
+	 */
+	if (cifsFYI)
+		cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+	else
+		cifs_info("Attempting to mount %s\n", dev_name);
 
 	volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3);
 	if (IS_ERR(volume_info))
@@ -859,6 +931,9 @@
 
 static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 {
+	struct cifsFileInfo *cfile = file->private_data;
+	struct cifs_tcon *tcon;
+
 	/*
 	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
 	 * the cached file length
@@ -890,6 +965,12 @@
 		if (rc < 0)
 			return (loff_t)rc;
 	}
+	if (cfile && cfile->tlink) {
+		tcon = tlink_tcon(cfile->tlink);
+		if (tcon->ses->server->ops->llseek)
+			return tcon->ses->server->ops->llseek(file, tcon,
+							      offset, whence);
+	}
 	return generic_file_llseek(file, offset, whence);
 }
 
@@ -967,6 +1048,7 @@
 	.getattr = cifs_getattr,
 	.permission = cifs_permission,
 	.listxattr = cifs_listxattr,
+	.fiemap = cifs_fiemap,
 };
 
 const struct inode_operations cifs_symlink_inode_ops = {
@@ -975,8 +1057,9 @@
 	.listxattr = cifs_listxattr,
 };
 
-static int cifs_clone_file_range(struct file *src_file, loff_t off,
-		struct file *dst_file, loff_t destoff, u64 len)
+static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, loff_t len,
+		unsigned int remap_flags)
 {
 	struct inode *src_inode = file_inode(src_file);
 	struct inode *target_inode = file_inode(dst_file);
@@ -986,6 +1069,9 @@
 	unsigned int xid;
 	int rc;
 
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
 	cifs_dbg(FYI, "clone range\n");
 
 	xid = get_xid();
@@ -1028,7 +1114,7 @@
 	unlock_two_nondirectories(src_inode, target_inode);
 out:
 	free_xid(xid);
-	return rc;
+	return rc < 0 ? rc : len;
 }
 
 ssize_t cifs_file_copychunk_range(unsigned int xid,
@@ -1046,11 +1132,6 @@
 
 	cifs_dbg(FYI, "copychunk range\n");
 
-	if (src_inode == target_inode) {
-		rc = -EINVAL;
-		goto out;
-	}
-
 	if (!src_file->private_data || !dst_file->private_data) {
 		rc = -EBADF;
 		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
@@ -1068,6 +1149,10 @@
 		goto out;
 	}
 
+	rc = -EOPNOTSUPP;
+	if (!target_tcon->ses->server->ops->copychunk_range)
+		goto out;
+
 	/*
 	 * Note: cifs case is easier than btrfs since server responsible for
 	 * checks for proper open modes and file type and if it wants
@@ -1079,11 +1164,12 @@
 	/* should we flush first and last page first */
 	truncate_inode_pages(&target_inode->i_data, 0);
 
-	if (target_tcon->ses->server->ops->copychunk_range)
+	rc = file_modified(dst_file);
+	if (!rc)
 		rc = target_tcon->ses->server->ops->copychunk_range(xid,
 			smb_file_src, smb_file_target, off, len, destoff);
-	else
-		rc = -EOPNOTSUPP;
+
+	file_accessed(src_file);
 
 	/* force revalidate of size and timestamps of target file now
 	 * that target is updated on the server
@@ -1120,6 +1206,10 @@
 	rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
 					len, flags);
 	free_xid(xid);
+
+	if (rc == -EOPNOTSUPP || rc == -EXDEV)
+		rc = generic_copy_file_range(src_file, off, dst_file,
+					     destoff, len, flags);
 	return rc;
 }
 
@@ -1137,7 +1227,7 @@
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -1156,15 +1246,14 @@
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
 
 const struct file_operations cifs_file_direct_ops = {
-	/* BB reevaluate whether they can be done with directio, no cache */
-	.read_iter = cifs_user_readv,
-	.write_iter = cifs_user_writev,
+	.read_iter = cifs_direct_readv,
+	.write_iter = cifs_direct_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.lock = cifs_lock,
@@ -1175,7 +1264,7 @@
 	.splice_write = iter_file_splice_write,
 	.unlocked_ioctl  = cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
@@ -1194,7 +1283,7 @@
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -1212,15 +1301,14 @@
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
-	/* BB reevaluate whether they can be done with directio, no cache */
-	.read_iter = cifs_user_readv,
-	.write_iter = cifs_user_writev,
+	.read_iter = cifs_direct_readv,
+	.write_iter = cifs_direct_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.fsync = cifs_fsync,
@@ -1230,7 +1318,7 @@
 	.splice_write = iter_file_splice_write,
 	.unlocked_ioctl  = cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
@@ -1242,7 +1330,7 @@
 	.read    = generic_read_dir,
 	.unlocked_ioctl  = cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
-	.clone_file_range = cifs_clone_file_range,
+	.remap_file_range = cifs_remap_file_range,
 	.llseek = generic_file_llseek,
 	.fsync = cifs_dir_fsync,
 };
@@ -1421,6 +1509,11 @@
 #ifdef CONFIG_CIFS_STATS2
 	atomic_set(&totBufAllocCount, 0);
 	atomic_set(&totSmBufAllocCount, 0);
+	if (slow_rsp_threshold < 1)
+		cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
+	else if (slow_rsp_threshold > 32767)
+		cifs_dbg(VFS,
+		       "slow response threshold set higher than recommended (0 to 32767)\n");
 #endif /* CONFIG_CIFS_STATS2 */
 
 	atomic_set(&midCount, 0);
@@ -1447,11 +1540,25 @@
 		goto out_clean_proc;
 	}
 
+	/*
+	 * Consider setting a nonzero concurrency limit in the future, perhaps
+	 * min(num_of_cores - 1, 3), so that we don't launch too many worker
+	 * threads; Documentation/workqueue.txt recommends setting it to 0.
+	 */
+
+	/* WQ_UNBOUND allows decrypt tasks to run on any CPU */
+	decrypt_wq = alloc_workqueue("smb3decryptd",
+				     WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+	if (!decrypt_wq) {
+		rc = -ENOMEM;
+		goto out_destroy_cifsiod_wq;
+	}
+
 	cifsoplockd_wq = alloc_workqueue("cifsoplockd",
 					 WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 	if (!cifsoplockd_wq) {
 		rc = -ENOMEM;
-		goto out_destroy_cifsiod_wq;
+		goto out_destroy_decrypt_wq;
 	}
 
 	rc = cifs_fscache_register();
@@ -1470,17 +1577,20 @@
 	if (rc)
 		goto out_destroy_mids;
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	rc = dfs_cache_init();
+	if (rc)
+		goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_DFS_UPCALL */
 #ifdef CONFIG_CIFS_UPCALL
 	rc = init_cifs_spnego();
 	if (rc)
-		goto out_destroy_request_bufs;
+		goto out_destroy_dfs_cache;
 #endif /* CONFIG_CIFS_UPCALL */
 
-#ifdef CONFIG_CIFS_ACL
 	rc = init_cifs_idmap();
 	if (rc)
 		goto out_register_key_type;
-#endif /* CONFIG_CIFS_ACL */
 
 	rc = register_filesystem(&cifs_fs_type);
 	if (rc)
@@ -1495,12 +1605,14 @@
 	return 0;
 
 out_init_cifs_idmap:
-#ifdef CONFIG_CIFS_ACL
 	exit_cifs_idmap();
 out_register_key_type:
-#endif
 #ifdef CONFIG_CIFS_UPCALL
 	exit_cifs_spnego();
+out_destroy_dfs_cache:
+#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	dfs_cache_destroy();
 out_destroy_request_bufs:
 #endif
 	cifs_destroy_request_bufs();
@@ -1512,6 +1624,8 @@
 	cifs_fscache_unregister();
 out_destroy_cifsoplockd_wq:
 	destroy_workqueue(cifsoplockd_wq);
+out_destroy_decrypt_wq:
+	destroy_workqueue(decrypt_wq);
 out_destroy_cifsiod_wq:
 	destroy_workqueue(cifsiod_wq);
 out_clean_proc:
@@ -1526,29 +1640,29 @@
 	unregister_filesystem(&cifs_fs_type);
 	unregister_filesystem(&smb3_fs_type);
 	cifs_dfs_release_automount_timer();
-#ifdef CONFIG_CIFS_ACL
 	exit_cifs_idmap();
-#endif
 #ifdef CONFIG_CIFS_UPCALL
 	exit_cifs_spnego();
 #endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	dfs_cache_destroy();
+#endif
 	cifs_destroy_request_bufs();
 	cifs_destroy_mids();
 	cifs_destroy_inodecache();
 	cifs_fscache_unregister();
 	destroy_workqueue(cifsoplockd_wq);
+	destroy_workqueue(decrypt_wq);
 	destroy_workqueue(cifsiod_wq);
 	cifs_proc_clean();
 }
 
-MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
+MODULE_AUTHOR("Steve French");
 MODULE_LICENSE("GPL");	/* combination of LGPL + GPL source behaves as GPL */
 MODULE_DESCRIPTION
-    ("VFS to access servers complying with the SNIA CIFS Specification "
-     "e.g. Samba and Windows");
+	("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
+	"also older servers complying with the SNIA CIFS Specification)");
 MODULE_VERSION(CIFS_VERSION);
-MODULE_SOFTDEP("pre: arc4");
-MODULE_SOFTDEP("pre: des");
 MODULE_SOFTDEP("pre: ecb");
 MODULE_SOFTDEP("pre: hmac");
 MODULE_SOFTDEP("pre: md4");
@@ -1560,5 +1674,6 @@
 MODULE_SOFTDEP("pre: sha512");
 MODULE_SOFTDEP("pre: aead2");
 MODULE_SOFTDEP("pre: ccm");
+MODULE_SOFTDEP("pre: gcm");
 module_init(init_cifs)
 module_exit(exit_cifs)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f047e87..bc4ca94 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,6 +84,8 @@
 extern int cifs_zap_mapping(struct inode *inode);
 extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int cifs_setattr(struct dentry *, struct iattr *);
+extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start,
+		       u64 len);
 
 extern const struct inode_operations cifs_file_inode_ops;
 extern const struct inode_operations cifs_symlink_inode_ops;
@@ -101,8 +103,10 @@
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, loff_t, loff_t, int);
@@ -148,5 +152,5 @@
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.13"
+#define CIFS_VERSION   "2.23"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9dcaed0..d78bfcc 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -33,6 +33,7 @@
 
 #define CIFS_MAGIC_NUMBER 0xFF534D42      /* the first four bytes of SMB PDUs */
 
+#define SMB_PATH_MAX 260
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
 
@@ -59,6 +60,12 @@
 #define CIFS_MAX_ACTIMEO (1 << 30)
 
 /*
+ * Max persistent and resilient handle timeout (milliseconds).
+ * Windows durable max was 960000 (16 minutes)
+ */
+#define SMB3_MAX_HANDLE_TIMEOUT 960000
+
+/*
  * MAX_REQ is the maximum number of requests that WE will send
  * on one socket concurrently.
  */
@@ -215,6 +222,7 @@
 struct cifs_search_info;
 struct cifsInodeInfo;
 struct cifs_open_parms;
+struct cifs_credits;
 
 struct smb_version_operations {
 	int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
@@ -229,12 +237,15 @@
 	/* check response: verify signature, map error */
 	int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
 			     bool);
-	void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
-			    const int);
+	void (*add_credits)(struct TCP_Server_Info *server,
+			    const struct cifs_credits *credits,
+			    const int optype);
 	void (*set_credits)(struct TCP_Server_Info *, const int);
 	int * (*get_credits_field)(struct TCP_Server_Info *, const int);
 	unsigned int (*get_credits)(struct mid_q_entry *);
 	__u64 (*get_next_mid)(struct TCP_Server_Info *);
+	void (*revert_current_mid)(struct TCP_Server_Info *server,
+				   const unsigned int val);
 	/* data offset from read response message */
 	unsigned int (*read_data_offset)(char *);
 	/*
@@ -320,8 +331,9 @@
 			umode_t mode, struct cifs_tcon *tcon,
 			const char *full_path,
 			struct cifs_sb_info *cifs_sb);
-	int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
-		     struct cifs_sb_info *);
+	int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode,
+		     struct cifs_tcon *tcon, const char *name,
+		     struct cifs_sb_info *sb);
 	/* set info on created directory */
 	void (*mkdir_setinfo)(struct inode *, const char *,
 			      struct cifs_sb_info *, struct cifs_tcon *,
@@ -344,7 +356,8 @@
 			       struct cifs_sb_info *);
 	/* query symlink target */
 	int (*query_symlink)(const unsigned int, struct cifs_tcon *,
-			     const char *, char **, struct cifs_sb_info *);
+			     struct cifs_sb_info *, const char *,
+			     char **, bool);
 	/* open a file for non-posix mounts */
 	int (*open)(const unsigned int, struct cifs_open_parms *,
 		    __u32 *, FILE_ALL_INFO *);
@@ -382,8 +395,8 @@
 			 struct cifs_fid *);
 	/* calculate a size of SMB message */
 	unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
-	/* check for STATUS_PENDING and process it in a positive case */
-	bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+	/* check for STATUS_PENDING and process the response if yes */
+	bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
 	/* check for STATUS_NETWORK_SESSION_EXPIRED */
 	bool (*is_session_expired)(char *);
 	/* send oplock break response */
@@ -451,7 +464,11 @@
 	unsigned int (*wp_retry_size)(struct inode *);
 	/* get mtu credits */
 	int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
-				unsigned int *, unsigned int *);
+				unsigned int *, struct cifs_credits *);
+	/* adjust previously taken mtu credits to request size */
+	int (*adjust_credits)(struct TCP_Server_Info *server,
+			      struct cifs_credits *credits,
+			      const unsigned int payload_size);
 	/* check if we need to issue closedir */
 	bool (*dir_needs_close)(struct cifsFileInfo *);
 	long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -465,6 +482,24 @@
 	enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
 			    enum securityEnum);
 	int (*next_header)(char *);
+	/* ioctl passthrough for query_info */
+	int (*ioctl_query_info)(const unsigned int xid,
+				struct cifs_tcon *tcon,
+				__le16 *path, int is_dir,
+				unsigned long p);
+	/* make unix special files (block, char, fifo, socket) */
+	int (*make_node)(unsigned int xid,
+			 struct inode *inode,
+			 struct dentry *dentry,
+			 struct cifs_tcon *tcon,
+			 char *full_path,
+			 umode_t mode,
+			 dev_t device_number);
+	/* version specific fiemap implementation */
+	int (*fiemap)(struct cifs_tcon *tcon, struct cifsFileInfo *,
+		      struct fiemap_extent_info *, u64, u64);
+	/* version specific llseek implementation */
+	loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int);
 };
 
 struct smb_version_values {
@@ -508,6 +543,7 @@
 	umode_t dir_mode;
 	enum securityEnum sectype; /* sectype requested via mnt opts */
 	bool sign; /* was signing requested via mnt opts? */
+	bool ignore_signature:1;
 	bool retry:1;
 	bool intr:1;
 	bool setuids:1;
@@ -516,6 +552,7 @@
 	bool override_gid:1;
 	bool dynperm:1;
 	bool noperm:1;
+	bool mode_ace:1;
 	bool no_psx_acl:1; /* set if posix acl support should be disabled */
 	bool cifs_acl:1;
 	bool backupuid_specified; /* mount option  backupuid  is specified */
@@ -524,6 +561,8 @@
 	bool server_ino:1; /* use inode numbers from server ie UniqueId */
 	bool direct_io:1;
 	bool strict_io:1; /* strict cache behavior */
+	bool cache_ro:1;
+	bool cache_rw:1;
 	bool remap:1;      /* set to remap seven reserved chars in filenames */
 	bool sfu_remap:1;  /* remap seven reserved chars ala SFU */
 	bool posix_paths:1; /* unset to not ask for posix pathnames. */
@@ -541,6 +580,7 @@
 	bool noblocksnd:1;
 	bool noautotune:1;
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+	bool no_lease:1;     /* disable requesting leases */
 	bool fsc:1;	/* enable fscache */
 	bool mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool multiuser:1;
@@ -551,8 +591,10 @@
 	bool resilient:1; /* noresilient not required since not forced for CA */
 	bool domainauto:1;
 	bool rdma:1;
+	unsigned int bsize;
 	unsigned int rsize;
 	unsigned int wsize;
+	unsigned int min_offload;
 	bool sockopt_tcp_nodelay:1;
 	unsigned long actimeo; /* attribute cache timeout (jiffies) */
 	struct smb_version_operations *ops;
@@ -563,9 +605,16 @@
 	struct nls_table *local_nls;
 	unsigned int echo_interval; /* echo interval in secs */
 	__u64 snapshot_time; /* needed for timewarp tokens */
+	__u32 handle_timeout; /* persistent and durable handle timeout in ms */
 	unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
+	__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
+	bool rootfs:1; /* if it's a SMB root file system */
 };
 
+/*
+ * CIFS superblock mount flags (mnt_cifs_flags) to consider when
+ * trying to reuse existing superblock for a new mount
+ */
 #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
 			 CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
 			 CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
@@ -576,8 +625,15 @@
 			 CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
 			 CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
 			 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
-			 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
+			 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID | \
+			 CIFS_MOUNT_UID_FROM_ACL | CIFS_MOUNT_NO_HANDLE_CACHE | \
+			 CIFS_MOUNT_NO_DFS | CIFS_MOUNT_MODE_FROM_SID | \
+			 CIFS_MOUNT_RO_CACHE | CIFS_MOUNT_RW_CACHE)
 
+/*
+ * Generic VFS superblock mount flags (s_flags) to consider when
+ * trying to reuse existing superblock for a new mount
+ */
 #define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \
 		      SB_NODEV | SB_SYNCHRONOUS)
 
@@ -624,12 +680,14 @@
 	unsigned int credits;  /* send no more requests at once */
 	unsigned int max_credits; /* can override large 32000 default at mnt */
 	unsigned int in_flight;  /* number of requests on the wire to server */
+	unsigned int max_in_flight; /* max number of requests that were on wire */
 	spinlock_t req_lock;  /* protect the two values above */
 	struct mutex srv_mutex;
 	struct task_struct *tsk;
 	char server_GUID[16];
 	__u16 sec_mode;
 	bool sign; /* is signing enabled on this connection? */
+	bool ignore_signature:1; /* skip validation of signatures in SMB2/3 rsp */
 	bool session_estab; /* mark when very first sess is established */
 	int echo_credits;  /* echo reserved slots */
 	int oplock_credits;  /* oplock break reserved slots */
@@ -654,6 +712,7 @@
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
+	__u32 reconnect_instance; /* incremented on each reconnect */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
@@ -683,10 +742,16 @@
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t in_send; /* requests trying to send */
 	atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
+	atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */
 	atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */
+	__u64 time_per_cmd[NUMBER_OF_SMB2_COMMANDS]; /* total time per cmd */
+	__u32 slowest_cmd[NUMBER_OF_SMB2_COMMANDS];
+	__u32 fastest_cmd[NUMBER_OF_SMB2_COMMANDS];
 #endif /* STATS2 */
 	unsigned int	max_read;
 	unsigned int	max_write;
+	unsigned int	min_offload;
+	__le16	compress_algorithm;
 	__le16	cipher_type;
 	 /* save initial negprot hash */
 	__u8	preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
@@ -694,6 +759,19 @@
 	struct delayed_work reconnect; /* reconnect workqueue job */
 	struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
 	unsigned long echo_interval;
+
+	/*
+	 * Number of targets available for reconnect. The more targets
+	 * there are, the longer tasks may have to wait for the demultiplex
+	 * thread to finish reconnecting.
+	 */
+	int nr_targets;
+	bool noblockcnt; /* use non-blocking connect() */
+};
+
+struct cifs_credits {
+	unsigned int value;
+	unsigned int instance;
 };
 
 static inline unsigned int
@@ -707,28 +785,28 @@
 }
 
 static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
 {
 	int num;
 	spin_lock(&server->req_lock);
 	num = *credits;
 	spin_unlock(&server->req_lock);
-	return num > 0;
+	return num >= num_credits;
 }
 
 static inline void
-add_credits(struct TCP_Server_Info *server, const unsigned int add,
+add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
 	    const int optype)
 {
-	server->ops->add_credits(server, add, optype);
+	server->ops->add_credits(server, credits, optype);
 }
 
 static inline void
-add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
-			const int optype)
+add_credits_and_wake_if(struct TCP_Server_Info *server,
+			const struct cifs_credits *credits, const int optype)
 {
-	if (add) {
-		server->ops->add_credits(server, add, optype);
+	if (credits->value) {
+		server->ops->add_credits(server, credits, optype);
 		wake_up(&server->request_q);
 	}
 }
@@ -739,6 +817,14 @@
 	server->ops->set_credits(server, val);
 }
 
+static inline int
+adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
+	       const unsigned int payload_size)
+{
+	return server->ops->adjust_credits ?
+		server->ops->adjust_credits(server, credits, payload_size) : 0;
+}
+
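struct cifs_credits pairs the credit count with the reconnect instance it was granted under, so credits obtained before a reconnect are not accidentally returned to the new session. A rough sketch of the lifecycle around one large request (not taken from this patch; error paths trimmed):

	/* Sketch: acquire, trim, and release credits for one request. */
	static int send_one_chunk(struct TCP_Server_Info *server,
				  unsigned int wsize, unsigned int payload)
	{
		struct cifs_credits credits = { .value = 0, .instance = 0 };
		unsigned int size;
		int rc;

		/* block until credits for an up-to-wsize request exist */
		rc = server->ops->wait_mtu_credits(server, wsize, &size,
						   &credits);
		if (rc)
			return rc;

		/* hand back credits reserved beyond the actual payload */
		rc = adjust_credits(server, &credits, payload);
		if (!rc) {
			/* ... submit; the completion path returns any
			 * remaining credits via add_credits() ... */
			return 0;
		}

		/* failed: return what we still hold and wake waiters */
		add_credits_and_wake_if(server, &credits, 0);
		return rc;
	}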
 static inline __le64
 get_next_mid64(struct TCP_Server_Info *server)
 {
@@ -756,6 +842,22 @@
 	return cpu_to_le16(mid);
 }
 
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+	if (server->ops->revert_current_mid)
+		server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+			    const struct smb2_sync_hdr *shdr)
+{
+	unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+	return revert_current_mid(server, num > 0 ? num : 1);
+}
+
 static inline __u16
 get_mid(const struct smb_hdr *smb)
 {
@@ -798,6 +900,7 @@
  * a single wsize request with a single call.
  */
 #define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
 
 /*
  * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
@@ -909,11 +1012,14 @@
 
 struct cached_fid {
 	bool is_valid:1;	/* Do we have a useable root fid */
+	bool file_all_info_is_valid:1;
+
 	struct kref refcount;
 	struct cifs_fid *fid;
 	struct mutex fid_mutex;
 	struct cifs_tcon *tcon;
 	struct work_struct lease_break;
+	struct smb2_file_all_info file_all_info;
 };
 
 /*
@@ -924,6 +1030,8 @@
 	struct list_head tcon_list;
 	int tc_count;
 	struct list_head rlist; /* reconnect list */
+	atomic_t num_local_opens;  /* num of all opens including disconnected */
+	atomic_t num_remote_opens; /* num of all network opens on server */
 	struct list_head openFileList;
 	spinlock_t open_file_lock; /* protects list above */
 	struct cifs_ses *ses;	/* pointer to session associated with */
@@ -986,12 +1094,14 @@
 	bool need_reopen_files:1; /* need to reopen tcon file handles */
 	bool use_resilient:1; /* use resilient instead of durable handles */
 	bool use_persistent:1; /* use persistent instead of durable handles */
+	bool no_lease:1;    /* Do not request leases on files or directories */
 	__le32 capabilities;
 	__u32 share_flags;
 	__u32 maximal_access;
 	__u32 vol_serial_number;
 	__le64 vol_create_time;
 	__u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */
+	__u32 handle_timeout; /* persistent and durable handle timeout in ms */
 	__u32 ss_flags;		/* sector size flags */
 	__u32 perf_sector_size; /* best sector size for perf */
 	__u32 max_chunks;
@@ -1004,6 +1114,11 @@
 	struct list_head pending_opens;	/* list of incomplete opens */
 	struct cached_fid crfid; /* Cached root fid */
 	/* BB add field for back pointer to sb struct(s)? */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	char *dfs_path;
+	int remap:2;
+	struct list_head ulist; /* cache update list */
+#endif
 };
 
 /*
@@ -1072,7 +1187,8 @@
 	__u64 offset;
 	__u64 length;
 	__u32 pid;
-	__u32 type;
+	__u16 type;
+	__u16 flags;
 };
 
 /*
@@ -1094,6 +1210,7 @@
 	bool smallBuf:1; /* so we know which buf_release function to call */
 };
 
+#define ACL_NO_MODE	((umode_t)(-1))
 struct cifs_open_parms {
 	struct cifs_tcon *tcon;
 	struct cifs_sb_info *cifs_sb;
@@ -1114,6 +1231,9 @@
 	__u8 create_guid[16];
 	struct cifs_pending_open *pending_open;
 	unsigned int epoch;
+#ifdef CONFIG_CIFS_DEBUG2
+	__u64 mid;
+#endif /* CIFS_DEBUG2 */
 	bool purge_cache;
 };
 
@@ -1172,6 +1292,11 @@
 	unsigned int		len;
 	unsigned int		total_len;
 	bool			should_dirty;
+	/*
+	 * Indicates if this aio_ctx is for direct_io,
+	 * If yes, iter is a copy of the user passed iov_iter
+	 */
+	bool			direct_io;
 };
 
 struct cifs_readdata;
@@ -1203,7 +1328,7 @@
 	unsigned int			pagesz;
 	unsigned int			page_offset;
 	unsigned int			tailsz;
-	unsigned int			credits;
+	struct cifs_credits		credits;
 	unsigned int			nr_pages;
 	struct page			**pages;
 };
@@ -1229,7 +1354,7 @@
 	unsigned int			pagesz;
 	unsigned int			page_offset;
 	unsigned int			tailsz;
-	unsigned int			credits;
+	struct cifs_credits		credits;
 	unsigned int			nr_pages;
 	struct page			**pages;
 };
@@ -1245,6 +1370,7 @@
 }
 
 struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
 #define CIFS_CACHE_READ_FLG	1
@@ -1254,9 +1380,9 @@
 #define CIFS_CACHE_RW_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
 #define CIFS_CACHE_RHW_FLG	(CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
 
-#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG)
+#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
 #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG)
+#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
 
 /*
  * One of these for each file inode
@@ -1265,9 +1391,15 @@
 struct cifsInodeInfo {
 	bool can_cache_brlcks;
 	struct list_head llist;	/* locks held by this inode */
+	/*
+	 * NOTE: Some code paths call down_read(lock_sem) twice, so
+	 * we must always use cifs_down_write() instead of down_write()
+	 * for this semaphore to avoid deadlocks.
+	 */
 	struct rw_semaphore lock_sem;	/* protect the fields above */
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
+	spinlock_t	open_file_lock;	/* protects openFileList */
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
 	unsigned int oplock;		/* oplock/lease level we have */
 	unsigned int epoch;		/* used to track lease state changes */
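The cifs_down_write() helper referenced in the lock_sem note above (and declared in cifsproto.h below) presumably avoids the re-entrant reader deadlock by polling for the write lock rather than queueing behind readers that already hold the semaphore; a plausible sketch:

	#include <linux/delay.h>
	#include <linux/rwsem.h>

	/* Sketch of a polling write lock matching the lock_sem note. */
	void cifs_down_write(struct rw_semaphore *sem)
	{
		while (!down_write_trylock(sem))
			msleep(10);	/* let double readers drain */
	}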
@@ -1391,6 +1523,7 @@
 	struct kref refcount;
 	struct TCP_Server_Info *server;	/* server corresponding to this mid */
 	__u64 mid;		/* multiplex id */
+	__u16 credits;		/* number of credits consumed by this mid */
 	__u32 pid;		/* process id */
 	__u32 sequence_number;  /* for CIFS signing */
 	unsigned long when_alloc;  /* when mid was created */
@@ -1407,6 +1540,7 @@
 	int mid_state;	/* wish this were enum but can not pass to wait_event */
 	unsigned int mid_flags;
 	__le16 command;		/* smb command code */
+	unsigned int optype;	/* operation type */
 	bool large_buf:1;	/* if valid response, is pointer to large buf */
 	bool multiRsp:1;	/* multiple trans2 responses for one request  */
 	bool multiEnd:1;	/* both received */
@@ -1489,6 +1623,7 @@
 	int ref_flag;
 	char *path_name;
 	char *node_name;
+	int ttl;
 };
 
 /*
@@ -1526,7 +1661,6 @@
 	if (param) {
 		kfree(param->path_name);
 		kfree(param->node_name);
-		kfree(param);
 	}
 }
 
@@ -1543,6 +1677,25 @@
 	kfree(param);
 }
 
+static inline bool is_interrupt_error(int error)
+{
+	switch (error) {
+	case -EINTR:
+	case -ERESTARTSYS:
+	case -ERESTARTNOHAND:
+	case -ERESTARTNOINTR:
+		return true;
+	}
+	return false;
+}
+
+static inline bool is_retryable_error(int error)
+{
+	if (is_interrupt_error(error) || error == -EAGAIN)
+		return true;
+	return false;
+}
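These helpers let callers treat signal-interrupted sends and -EAGAIN as transient and resend, while failing fast on hard errors. A hedged usage sketch (the issue callback is illustrative, not an existing interface):

	/* Sketch: resend only on transient errors. */
	static int send_with_retry(int (*issue)(void *arg), void *arg,
				   int tries)
	{
		int rc;

		do {
			rc = issue(arg);
		} while (--tries > 0 && is_retryable_error(rc));

		return rc;
	}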
+
 #define   MID_FREE 0
 #define   MID_REQUEST_ALLOCATED 1
 #define   MID_REQUEST_SUBMITTED 2
@@ -1563,11 +1716,11 @@
 
 /* Type of Request to SendReceive2 */
 #define   CIFS_BLOCKING_OP      1    /* operation can block */
-#define   CIFS_ASYNC_OP         2    /* do not wait for response */
+#define   CIFS_NON_BLOCKING     2    /* do not block waiting for credits */
 #define   CIFS_TIMEOUT_MASK 0x003    /* only one of above set in req */
 #define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
 #define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
-#define   CIFS_NO_RESP      0x040    /* no response buffer required */
+#define   CIFS_NO_RSP_BUF   0x040    /* no response buffer required */
 
 /* Type of request operation */
 #define   CIFS_ECHO_OP      0x080    /* echo request */
@@ -1577,6 +1730,7 @@
 
 #define   CIFS_HAS_CREDITS 0x0400    /* already has credits */
 #define   CIFS_TRANSFORM_REQ 0x0800    /* transform request before sending */
+#define   CIFS_NO_SRV_RSP    0x1000    /* there is no server response */
 
 /* Security Flags: indicate type of session setup needed */
 #define   CIFSSEC_MAY_SIGN	0x00001
@@ -1645,13 +1799,18 @@
  *  GlobalMid_Lock protects:
  *	list operations on pending_mid_q and oplockQ
  *      updates to XID counters, multiplex id  and SMB sequence numbers
+ *      list operations on global DnotifyReqList
  *  tcp_ses_lock protects:
  *	list operations on tcp and SMB session lists
  *  tcon->open_file_lock protects the list of open files hanging off the tcon
+ *  inode->open_file_lock protects the openFileList hanging off the inode
  *  cfile->file_info_lock protects counters and fields in cifs file struct
  *  f_owner.lock protects certain per file struct operations
  *  mapping->page_lock protects certain per page operations
  *
+ *  Note that the cifs_tcon.open_file_lock should be taken before,
+ *  not after, the cifsInodeInfo.open_file_lock.
+ *
  *  Semaphores
  *  ----------
  *  sesSem     operations on smb session
@@ -1715,6 +1874,7 @@
 #ifdef CONFIG_CIFS_STATS2
 GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
 GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+extern unsigned int slow_rsp_threshold; /* number of secs before logging */
 #endif
 GLOBAL_EXTERN atomic_t smBufAllocCount;
 GLOBAL_EXTERN atomic_t midCount;
@@ -1732,7 +1892,6 @@
 extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
 extern bool disable_legacy_dialects;  /* forbid vers=1.0 and vers=2.0 mounts */
 
-#ifdef CONFIG_CIFS_ACL
 GLOBAL_EXTERN struct rb_root uidtree;
 GLOBAL_EXTERN struct rb_root gidtree;
 GLOBAL_EXTERN spinlock_t siduidlock;
@@ -1741,12 +1900,13 @@
 GLOBAL_EXTERN struct rb_root sidgidtree;
 GLOBAL_EXTERN spinlock_t uidsidlock;
 GLOBAL_EXTERN spinlock_t gidsidlock;
-#endif /* CONFIG_CIFS_ACL */
 
 void cifs_oplock_break(struct work_struct *work);
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
 
 extern const struct slow_work_ops cifs_oplock_break_ops;
 extern struct workqueue_struct *cifsiod_wq;
+extern struct workqueue_struct *decrypt_wq;
 extern struct workqueue_struct *cifsoplockd_wq;
 extern __u32 cifs_lock_secret;
 
@@ -1770,6 +1930,7 @@
 extern struct smb_version_operations smb30_operations;
 extern struct smb_version_values smb30_values;
 #define SMB302_VERSION_STRING	"3.02"
+#define ALT_SMB302_VERSION_STRING "3.0.2"
 /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
 extern struct smb_version_values smb302_values;
 #define SMB311_VERSION_STRING	"3.1.1"
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 1ce733f..79d842e 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1539,6 +1539,9 @@
 	char	PathBuffer[0];
 } __attribute__((packed));
 
+/* Flag above */
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
 /* For IO_REPARSE_TAG_NFS */
 #define NFS_SPECFILE_LNK	0x00000000014B4E4C
 #define NFS_SPECFILE_CHR	0x0000000000524843
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 20adda4..fe597d3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -22,6 +22,9 @@
 #define _CIFSPROTO_H
 #include <linux/nls.h>
 #include "trace.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
 
 struct statfs;
 struct smb_vol;
@@ -90,7 +93,8 @@
 extern int cifs_call_async(struct TCP_Server_Info *server,
 			struct smb_rqst *rqst,
 			mid_receive_t *receive, mid_callback_t *callback,
-			mid_handle_t *handle, void *cbdata, const int flags);
+			mid_handle_t *handle, void *cbdata, const int flags,
+			const struct cifs_credits *exist_credits);
 extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
 			  struct smb_rqst *rqst, int *resp_buf_type,
 			  const int flags, struct kvec *resp_iov);
@@ -112,7 +116,7 @@
 			struct TCP_Server_Info *server, bool log_error);
 extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
 				 unsigned int size, unsigned int *num,
-				 unsigned int *credits);
+				 struct cifs_credits *credits);
 extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
 			struct kvec *, int /* nvec to send */,
 			int * /* type of buf returned */, const int flags,
@@ -130,7 +134,14 @@
 extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 			    unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
+				  bool fsuid_only,
+				  struct cifsFileInfo **ret_file);
+extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+				  struct cifsFileInfo **ret_file);
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
+				  struct cifsFileInfo **ret_file);
 extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
 			struct TCP_Server_Info *server);
@@ -159,6 +170,7 @@
 			     struct file_lock *flock, const unsigned int xid);
 extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
 
+extern void cifs_down_write(struct rw_semaphore *sem);
 extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
 					      struct file *file,
 					      struct tcon_link *tlink,
@@ -190,6 +202,7 @@
 				      const unsigned int xid);
 extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr, struct inode *inode,
+			      bool get_mode_from_special_sid,
 			      const char *path, const struct cifs_fid *pfid);
 extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
 					kuid_t, kgid_t);
@@ -213,13 +226,13 @@
 extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
 extern struct smb_vol *cifs_get_volume_info(char *mount_data,
 					    const char *devname, bool is_smb3);
-extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
+extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol);
 extern void cifs_umount(struct cifs_sb_info *);
 extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
 extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon);
 
 extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
-				    __u64 length, __u8 type,
+				    __u64 length, __u8 type, __u16 flags,
 				    struct cifsLockInfo **conf_lock,
 				    int rw_check);
 extern void cifs_add_pending_open(struct cifs_fid *fid,
@@ -294,11 +307,6 @@
 			   unsigned int *num_of_nodes,
 			   const struct nls_table *nls_codepage, int remap);
 
-extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
-			const char *old_path,
-			const struct nls_table *nls_codepage,
-			unsigned int *num_referrals,
-			struct dfs_info3_param **referrals, int remap);
 extern int parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
 			       unsigned int *num_of_nodes,
 			       struct dfs_info3_param **target_nodes,
@@ -365,7 +373,8 @@
 				  const struct nls_table *nls_codepage,
 				  int remap);
 
-extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode,
+			umode_t mode, struct cifs_tcon *tcon,
 			const char *name, struct cifs_sb_info *cifs_sb);
 extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *name, struct cifs_sb_info *cifs_sb);
@@ -525,6 +534,20 @@
 extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 			unsigned char *p24);
 
+extern int
+cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
+		       const char *devname, bool is_smb3);
+extern void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info);
+
+extern struct TCP_Server_Info *
+cifs_find_tcp_session(struct smb_vol *vol);
+
+extern void cifs_put_smb_ses(struct cifs_ses *ses);
+
+extern struct cifs_ses *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info);
+
 void cifs_readdata_release(struct kref *refcount);
 int cifs_async_readv(struct cifs_readdata *rdata);
 int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
@@ -562,4 +585,18 @@
 extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
 				unsigned int *len, unsigned int *offset);
 
+void extract_unc_hostname(const char *unc, const char **h, size_t *len);
+int copy_path_name(char *dst, const char *src);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
+			       const char *old_path,
+			       const struct nls_table *nls_codepage,
+			       struct dfs_info3_param *referral, int remap)
+{
+	return dfs_cache_find(xid, ses, nls_codepage, remap, old_path,
+			      referral, NULL);
+}
+#endif
+
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c
new file mode 100644
index 0000000..37edbfb
--- /dev/null
+++ b/fs/cifs/cifsroot.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SMB root file system support
+ *
+ * Copyright (c) 2019 Paulo Alcantara <palcantara@suse.de>
+ */
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/root_dev.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <net/ipconfig.h>
+
+#define DEFAULT_MNT_OPTS \
+	"vers=1.0,cifsacl,mfsymlinks,rsize=1048576,wsize=65536,uid=0,gid=0," \
+	"hard,rootfs"
+
+static char root_dev[2048] __initdata = "";
+static char root_opts[1024] __initdata = DEFAULT_MNT_OPTS;
+
+static __be32 __init parse_srvaddr(char *start, char *end)
+{
+	/* TODO: ipv6 support */
+	char addr[sizeof("aaa.bbb.ccc.ddd")];
+	int i = 0;
+
+	while (start < end && i < sizeof(addr) - 1) {
+		if (isdigit(*start) || *start == '.')
+			addr[i++] = *start;
+		start++;
+	}
+	addr[i] = '\0';
+	return in_aton(addr);
+}
+
+/* cifsroot=//<server-ip>/<share>[,options] */
+static int __init cifs_root_setup(char *line)
+{
+	char *s;
+	int len;
+	__be32 srvaddr = htonl(INADDR_NONE);
+
+	ROOT_DEV = Root_CIFS;
+
+	if (strlen(line) > 3 && line[0] == '/' && line[1] == '/') {
+		s = strchr(&line[2], '/');
+		if (!s || s[1] == '\0')
+			return 1;
+
+		/* make s point to ',' or '\0' at end of line */
+		s = strchrnul(s, ',');
+		/* len is strlen(unc) + '\0' */
+		len = s - line + 1;
+		if (len > sizeof(root_dev)) {
+			printk(KERN_ERR "Root-CIFS: UNC path too long\n");
+			return 1;
+		}
+		strlcpy(root_dev, line, len);
+		srvaddr = parse_srvaddr(&line[2], s);
+		if (*s) {
+			int n = snprintf(root_opts,
+					 sizeof(root_opts), "%s,%s",
+					 DEFAULT_MNT_OPTS, s + 1);
+			if (n >= sizeof(root_opts)) {
+				printk(KERN_ERR "Root-CIFS: mount options string too long\n");
+				root_opts[sizeof(root_opts)-1] = '\0';
+				return 1;
+			}
+		}
+	}
+
+	root_server_addr = srvaddr;
+
+	return 1;
+}
+
+__setup("cifsroot=", cifs_root_setup);
+
+int __init cifs_root_data(char **dev, char **opts)
+{
+	if (!root_dev[0] || root_server_addr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-CIFS: no SMB server address\n");
+		return -1;
+	}
+
+	*dev = root_dev;
+	*opts = root_opts;
+
+	return 0;
+}
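With this in place, an SMB share can back the root file system; the share and any additional mount options are given on the kernel command line, for example (server address and credentials below are placeholders):

	root=/dev/cifs rw ip=dhcp \
	    cifsroot=//192.168.1.10/root,username=guest,password=guest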
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5657b79..4f554f0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -44,6 +44,9 @@
 #include "cifs_debug.h"
 #include "fscache.h"
 #include "smbdirect.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
 
 #ifdef CONFIG_CIFS_POSIX
 static struct {
@@ -118,6 +121,86 @@
 	 */
 }
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+				 struct cifs_tcon *tcon)
+{
+	int rc;
+	struct dfs_cache_tgt_list tl;
+	struct dfs_cache_tgt_iterator *it = NULL;
+	char *tree;
+	const char *tcp_host;
+	size_t tcp_host_len;
+	const char *dfs_host;
+	size_t dfs_host_len;
+
+	tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+	if (!tree)
+		return -ENOMEM;
+
+	if (tcon->ipc) {
+		scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+			  tcon->ses->server->hostname);
+		rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+		goto out;
+	}
+
+	if (!tcon->dfs_path) {
+		rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+		goto out;
+	}
+
+	rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
+	if (rc)
+		goto out;
+
+	extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
+			     &tcp_host_len);
+
+	for (it = dfs_cache_get_tgt_iterator(&tl); it;
+	     it = dfs_cache_get_next_tgt(&tl, it)) {
+		const char *tgt = dfs_cache_get_tgt_name(it);
+
+		extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+
+		if (dfs_host_len != tcp_host_len
+		    || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+			cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s\n",
+				 __func__,
+				 (int)dfs_host_len, dfs_host,
+				 (int)tcp_host_len, tcp_host);
+			continue;
+		}
+
+		scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+
+		rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+		if (!rc)
+			break;
+		if (rc == -EREMOTE)
+			break;
+	}
+
+	if (!rc) {
+		if (it)
+			rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1,
+							    it);
+		else
+			rc = -ENOENT;
+	}
+	dfs_cache_free_tgts(&tl);
+out:
+	kfree(tree);
+	return rc;
+}
+#else
+static inline int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+					struct cifs_tcon *tcon)
+{
+	return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+}
+#endif
+
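
Before the reconnect-path changes below, a minimal sketch of the dfs_cache
target-iterator pattern that __cifs_reconnect_tcon() relies on (illustrative
only; the dfs_cache_* API itself is added elsewhere in this update):

	struct dfs_cache_tgt_list tl;
	struct dfs_cache_tgt_iterator *it;

	/* fill tl from the referral cache without issuing a new referral */
	if (!dfs_cache_noreq_find(path, NULL, &tl)) {
		for (it = dfs_cache_get_tgt_iterator(&tl); it;
		     it = dfs_cache_get_next_tgt(&tl, it)) {
			const char *tgt = dfs_cache_get_tgt_name(it);
			/* try to tree-connect to tgt; on success, record it
			 * via dfs_cache_noreq_update_tgthint() */
		}
		dfs_cache_free_tgts(&tl);	/* always release the list */
	}

Note that the hostname comparison above deliberately skips targets whose host
differs from the TCP session's hostname: a tree connect can only be retried
against the server this socket is already connected to.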
 /* reconnect the socket, tcon, and smb session if needed */
 static int
 cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
@@ -126,6 +209,7 @@
 	struct cifs_ses *ses;
 	struct TCP_Server_Info *server;
 	struct nls_table *nls_codepage;
+	int retries;
 
 	/*
 	 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
@@ -152,9 +236,12 @@
 		}
 	}
 
+	retries = server->nr_targets;
+
 	/*
-	 * Give demultiplex thread up to 10 seconds to reconnect, should be
-	 * greater than cifs socket timeout which is 7 seconds
+	 * Give the demultiplex thread up to 10 seconds for each target
+	 * available for reconnect -- should be greater than the cifs socket
+	 * timeout, which is 7 seconds.
 	 */
 	while (server->tcpStatus == CifsNeedReconnect) {
 		rc = wait_event_interruptible_timeout(server->response_q,
@@ -170,6 +257,9 @@
 		if (server->tcpStatus != CifsNeedReconnect)
 			break;
 
+		if (--retries)
+			continue;
+
 		/*
 		 * on "soft" mounts we wait once. Hard mounts keep
 		 * retrying until process is killed or server comes
@@ -179,6 +269,7 @@
 			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
 			return -EHOSTDOWN;
 		}
+		retries = server->nr_targets;
 	}
 
 	if (!ses->need_reconnect && !tcon->need_reconnect)
@@ -214,7 +305,7 @@
 	}
 
 	cifs_mark_open_files_invalid(tcon);
-	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+	rc = __cifs_reconnect_tcon(nls_codepage, tcon);
 	mutex_unlock(&ses->session_mutex);
 	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
 
@@ -731,9 +822,10 @@
 cifs_echo_callback(struct mid_q_entry *mid)
 {
 	struct TCP_Server_Info *server = mid->callback_data;
+	struct cifs_credits credits = { .value = 1, .instance = 0 };
 
 	DeleteMidQEntry(mid);
-	add_credits(server, 1, CIFS_ECHO_OP);
+	add_credits(server, &credits, CIFS_ECHO_OP);
 }
 
 int
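
The echo callback above reflects an API change threaded through the rest of
this update: add_credits() now takes a pointer to struct cifs_credits instead
of a bare integer. A minimal sketch of the new convention, using only the
fields visible in these hunks:

	struct cifs_credits credits = { .value = 1, .instance = 0 };

	/* was: add_credits(server, 1, CIFS_ECHO_OP); */
	add_credits(server, &credits, CIFS_ECHO_OP);

The extra .instance field pairs with the reconnect_instance counter
initialized in connect.c later in this patch; the plausible intent is to let
the server code recognize (and drop) credits granted on a previous incarnation
of the connection, which a plain integer could not express.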
@@ -768,7 +860,7 @@
 	iov[1].iov_base = (char *)smb + 4;
 
 	rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
-			     server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+			     server, CIFS_NON_BLOCKING | CIFS_ECHO_OP, NULL);
 	if (rc)
 		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
 
@@ -850,10 +942,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else { /* BB add path length overrun check */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, fileName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, fileName);
 	}
 
 	params = 6 + name_len;
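
This is the first of many hunks replacing the open-coded strnlen()/strncpy()
pairs (and their "BB improve check for buffer overruns BB" reminders) with the
copy_path_name() helper declared in cifsproto.h above. Its body lands in
misc.c, outside this excerpt; the call sites assume behavior along these lines
(a sketch, not the verbatim implementation):

	/* bounded copy of a path into a PATH_MAX-sized SMB name buffer;
	 * returns the number of bytes copied *including* the trailing nul */
	int copy_path_name(char *dst, const char *src)
	{
		int name_len = strlcpy(dst, src, PATH_MAX);

		if (name_len >= PATH_MAX)	/* source was truncated */
			name_len = PATH_MAX - 1;

		return name_len + 1;		/* count the trailing nul */
	}

The "params = 6 + name_len" computations that follow each call depend on the
returned length counting that nul byte, matching the old
"name_len++; /* trailing null */" arithmetic.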
@@ -923,10 +1013,8 @@
 					      remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {		/* BB improve check for buffer overruns BB */
-		name_len = strnlen(name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->fileName, name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->fileName, name);
 	}
 	pSMB->SearchAttributes =
 	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
@@ -970,10 +1058,8 @@
 					      remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {		/* BB improve check for buffer overruns BB */
-		name_len = strnlen(name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->DirName, name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->DirName, name);
 	}
 
 	pSMB->BufferFormat = 0x04;
@@ -992,7 +1078,8 @@
 }
 
 int
-CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode,
+	     struct cifs_tcon *tcon, const char *name,
 	     struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
@@ -1015,10 +1102,8 @@
 					      remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {		/* BB improve check for buffer overruns BB */
-		name_len = strnlen(name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->DirName, name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->DirName, name);
 	}
 
 	pSMB->BufferFormat = 0x04;
@@ -1065,10 +1150,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, name);
 	}
 
 	params = 6 + name_len;
@@ -1232,11 +1315,9 @@
 				      fileName, PATH_MAX, nls_codepage, remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
-	} else {                /* BB improve check for buffer overruns BB */
+	} else {
 		count = 0;      /* no pad */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->fileName, fileName, name_len);
+		name_len = copy_path_name(pSMB->fileName, fileName);
 	}
 	if (*pOplock & REQ_OPLOCK)
 		pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK);
@@ -1313,7 +1394,7 @@
 CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
 	  FILE_ALL_INFO *buf)
 {
-	int rc = -EACCES;
+	int rc;
 	OPEN_REQ *req = NULL;
 	OPEN_RSP *rsp = NULL;
 	int bytes_returned;
@@ -1350,11 +1431,8 @@
 		/* BB improve check for buffer overruns BB */
 		/* no pad */
 		count = 0;
-		name_len = strnlen(path, PATH_MAX);
-		/* trailing null */
-		name_len++;
+		name_len = copy_path_name(req->fileName, path);
 		req->NameLength = cpu_to_le16(name_len);
-		strncpy(req->fileName, path, name_len);
 	}
 
 	if (*oplock & REQ_OPLOCK)
@@ -1458,18 +1536,26 @@
 }
 
 static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+		     bool malformed)
 {
 	int length;
-	struct cifs_readdata *rdata = mid->callback_data;
 
 	length = cifs_discard_remaining_data(server);
-	dequeue_mid(mid, rdata->result);
+	dequeue_mid(mid, malformed);
 	mid->resp_buf = server->smallbuf;
 	server->smallbuf = NULL;
 	return length;
 }
 
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	struct cifs_readdata *rdata = mid->callback_data;
+
+	return __cifs_readv_discard(server, mid, rdata->result);
+}
+
 int
 cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 {
@@ -1506,17 +1592,29 @@
 	}
 
 	if (server->ops->is_status_pending &&
-	    server->ops->is_status_pending(buf, server, 0)) {
+	    server->ops->is_status_pending(buf, server)) {
 		cifs_discard_remaining_data(server);
 		return -1;
 	}
 
+	/* set up first two iov for signature check and to get credits */
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = server->vals->header_preamble_size;
+	rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
+	rdata->iov[1].iov_len =
+		server->total_read - server->vals->header_preamble_size;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+	cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
 	/* Was the SMB read successful? */
 	rdata->result = server->ops->map_error(buf, false);
 	if (rdata->result != 0) {
 		cifs_dbg(FYI, "%s: server returned error %d\n",
 			 __func__, rdata->result);
-		return cifs_readv_discard(server, mid);
+		/* normal error on read response */
+		return __cifs_readv_discard(server, mid, false);
 	}
 
 	/* Is there enough to get to the rest of the READ_RSP header? */
@@ -1560,14 +1658,6 @@
 		server->total_read += length;
 	}
 
-	/* set up first iov for signature check */
-	rdata->iov[0].iov_base = buf;
-	rdata->iov[0].iov_len = 4;
-	rdata->iov[1].iov_base = buf + 4;
-	rdata->iov[1].iov_len = server->total_read - 4;
-	cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
-		 rdata->iov[0].iov_base, server->total_read);
-
 	/* how much data is in the response? */
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	use_rdma_mr = rdata->mr;
@@ -1607,9 +1697,11 @@
 	struct smb_rqst rqst = { .rq_iov = rdata->iov,
 				 .rq_nvec = 2,
 				 .rq_pages = rdata->pages,
+				 .rq_offset = rdata->page_offset,
 				 .rq_npages = rdata->nr_pages,
 				 .rq_pagesz = rdata->pagesz,
 				 .rq_tailsz = rdata->tailsz };
+	struct cifs_credits credits = { .value = 1, .instance = 0 };
 
 	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
 		 __func__, mid->mid, mid->mid_state, rdata->result,
@@ -1647,7 +1739,7 @@
 
 	queue_work(cifsiod_wq, &rdata->work);
 	DeleteMidQEntry(mid);
-	add_credits(server, 1, 0);
+	add_credits(server, &credits, 0);
 }
 
 /* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1706,7 +1798,7 @@
 
 	kref_get(&rdata->refcount);
 	rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
-			     cifs_readv_callback, NULL, rdata, 0);
+			     cifs_readv_callback, NULL, rdata, 0, NULL);
 
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -2020,18 +2112,22 @@
 		wdata2->tailsz = tailsz;
 		wdata2->bytes = cur_len;
 
-		wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+		rc = cifs_get_writable_file(CIFS_I(inode), false,
+					    &wdata2->cfile);
 		if (!wdata2->cfile) {
-			cifs_dbg(VFS, "No writable handles for inode\n");
-			rc = -EBADF;
-			break;
+			cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+				 rc);
+			if (!is_retryable_error(rc))
+				rc = -EBADF;
+		} else {
+			wdata2->pid = wdata2->cfile->pid;
+			rc = server->ops->async_writev(wdata2,
+						       cifs_writedata_release);
 		}
-		wdata2->pid = wdata2->cfile->pid;
-		rc = server->ops->async_writev(wdata2, cifs_writedata_release);
 
 		for (j = 0; j < nr_pages; j++) {
 			unlock_page(wdata2->pages[j]);
-			if (rc != 0 && rc != -EAGAIN) {
+			if (rc != 0 && !is_retryable_error(rc)) {
 				SetPageError(wdata2->pages[j]);
 				end_page_writeback(wdata2->pages[j]);
 				put_page(wdata2->pages[j]);
@@ -2040,8 +2136,9 @@
 
 		if (rc) {
 			kref_put(&wdata2->refcount, cifs_writedata_release);
-			if (rc == -EAGAIN)
+			if (is_retryable_error(rc))
 				continue;
+			i += nr_pages;
 			break;
 		}
 
@@ -2049,7 +2146,15 @@
 		i += nr_pages;
 	} while (i < wdata->nr_pages);
 
-	mapping_set_error(inode->i_mapping, rc);
+	/* cleanup remaining pages from the original wdata */
+	for (; i < wdata->nr_pages; i++) {
+		SetPageError(wdata->pages[i]);
+		end_page_writeback(wdata->pages[i]);
+		put_page(wdata->pages[i]);
+	}
+
+	if (rc != 0 && !is_retryable_error(rc))
+		mapping_set_error(inode->i_mapping, rc);
 	kref_put(&wdata->refcount, cifs_writedata_release);
 }
 
@@ -2122,6 +2227,7 @@
 	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
 	unsigned int written;
 	WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+	struct cifs_credits credits = { .value = 1, .instance = 0 };
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
@@ -2157,7 +2263,7 @@
 
 	queue_work(cifsiod_wq, &wdata->work);
 	DeleteMidQEntry(mid);
-	add_credits(tcon->ses->server, 1, 0);
+	add_credits(tcon->ses->server, &credits, 0);
 }
 
 /* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2210,6 +2316,7 @@
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 2;
 	rqst.rq_pages = wdata->pages;
+	rqst.rq_offset = wdata->page_offset;
 	rqst.rq_npages = wdata->nr_pages;
 	rqst.rq_pagesz = wdata->pagesz;
 	rqst.rq_tailsz = wdata->tailsz;
@@ -2234,7 +2341,7 @@
 
 	kref_get(&wdata->refcount);
 	rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
-				cifs_writev_callback, NULL, wdata, 0);
+			     cifs_writev_callback, NULL, wdata, 0, NULL);
 
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
@@ -2387,8 +2494,8 @@
 	iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
 
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
-	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP,
-			  &rsp_iov);
+	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type,
+			  CIFS_NO_RSP_BUF, &rsp_iov);
 	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
@@ -2419,7 +2526,7 @@
 
 	if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
 		/* no response expected */
-		flags = CIFS_ASYNC_OP | CIFS_OBREAK_OP;
+		flags = CIFS_NO_SRV_RSP | CIFS_NON_BLOCKING | CIFS_OBREAK_OP;
 		pSMB->Timeout = 0;
 	} else if (waitFlag) {
 		flags = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
@@ -2691,15 +2798,10 @@
 				       remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2;	/* convert to bytes */
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(from_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->OldFileName, from_name, name_len);
-		name_len2 = strnlen(to_name, PATH_MAX);
-		name_len2++;	/* trailing null */
+	} else {
+		name_len = copy_path_name(pSMB->OldFileName, from_name);
+		name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, to_name);
 		pSMB->OldFileName[name_len] = 0x04;  /* 2nd buffer format */
-		strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
-		name_len2++;	/* trailing null */
 		name_len2++;	/* signature byte */
 	}
 
@@ -2841,15 +2943,10 @@
 				       toName, PATH_MAX, nls_codepage, remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2; /* convert to bytes */
-	} else { 	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fromName, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->OldFileName, fromName, name_len);
-		name_len2 = strnlen(toName, PATH_MAX);
-		name_len2++;    /* trailing null */
+	} else {
+		name_len = copy_path_name(pSMB->OldFileName, fromName);
 		pSMB->OldFileName[name_len] = 0x04;  /* 2nd buffer format */
-		strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2);
-		name_len2++;    /* trailing null */
+		name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, toName);
 		name_len2++;    /* signature byte */
 	}
 
@@ -2900,10 +2997,8 @@
 		name_len++;	/* trailing null */
 		name_len *= 2;
 
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fromName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, fromName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, fromName);
 	}
 	params = 6 + name_len;
 	pSMB->MaxSetupCount = 0;
@@ -2923,10 +3018,8 @@
 					PATH_MAX, nls_codepage, remap);
 		name_len_target++;	/* trailing null */
 		name_len_target *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len_target = strnlen(toName, PATH_MAX);
-		name_len_target++;	/* trailing null */
-		strncpy(data_offset, toName, name_len_target);
+	} else {
+		name_len_target = copy_path_name(data_offset, toName);
 	}
 
 	pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -2988,10 +3081,8 @@
 		name_len++;	/* trailing null */
 		name_len *= 2;
 
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(toName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, toName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, toName);
 	}
 	params = 6 + name_len;
 	pSMB->MaxSetupCount = 0;
@@ -3010,10 +3101,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len_target++;	/* trailing null */
 		name_len_target *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len_target = strnlen(fromName, PATH_MAX);
-		name_len_target++;	/* trailing null */
-		strncpy(data_offset, fromName, name_len_target);
+	} else {
+		name_len_target = copy_path_name(data_offset, fromName);
 	}
 
 	pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -3092,15 +3181,10 @@
 				       remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2;	/* convert to bytes */
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(from_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->OldFileName, from_name, name_len);
-		name_len2 = strnlen(to_name, PATH_MAX);
-		name_len2++;	/* trailing null */
+	} else {
+		name_len = copy_path_name(pSMB->OldFileName, from_name);
 		pSMB->OldFileName[name_len] = 0x04;	/* 2nd buffer format */
-		strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
-		name_len2++;	/* trailing null */
+		name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, to_name);
 		name_len2++;	/* signature byte */
 	}
 
@@ -3150,10 +3234,8 @@
 					   remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, searchName);
 	}
 
 	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
@@ -3479,11 +3561,9 @@
 	return size;
 }
 
-static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
+static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 				     const struct posix_acl_xattr_entry *local_ace)
 {
-	__u16 rc = 0; /* 0 = ACL converted ok */
-
 	cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
 	cifs_ace->cifs_e_tag =  le16_to_cpu(local_ace->e_tag);
 	/* BB is there a better way to handle the large uid? */
@@ -3496,7 +3576,6 @@
 	cifs_dbg(FYI, "perm %d tag %d id %d\n",
 		 ace->e_perm, ace->e_tag, ace->e_id);
 */
-	return rc;
 }
 
 /* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
@@ -3532,13 +3611,8 @@
 		cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
 		return 0;
 	}
-	for (i = 0; i < count; i++) {
-		rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
-		if (rc != 0) {
-			/* ACE not converted */
-			break;
-		}
-	}
+	for (i = 0; i < count; i++)
+		convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
 	if (rc == 0) {
 		rc = (__u16)(count * sizeof(struct cifs_posix_ace));
 		rc += sizeof(struct cifs_posix_acl);
@@ -3578,10 +3652,8 @@
 		name_len *= 2;
 		pSMB->FileName[name_len] = 0;
 		pSMB->FileName[name_len+1] = 0;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, searchName);
 	}
 
 	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
@@ -3663,10 +3735,8 @@
 					   PATH_MAX, nls_codepage, remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->FileName, fileName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, fileName);
 	}
 	params = 6 + name_len;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -3799,7 +3869,6 @@
 
 #endif /* CONFIG_POSIX */
 
-#ifdef CONFIG_CIFS_ACL
 /*
  * Initialize NT TRANSACT SMB into small smb request buffer.  This assumes that
  * all NT TRANSACTS that we init here have total parm and data under about 400
@@ -4043,7 +4112,6 @@
 	return (rc);
 }
 
-#endif /* CONFIG_CIFS_ACL */
 
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
@@ -4073,9 +4141,7 @@
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {
-		name_len = strnlen(search_name, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->FileName, search_name, name_len);
+		name_len = copy_path_name(pSMB->FileName, search_name);
 	}
 	pSMB->BufferFormat = 0x04;
 	name_len++; /* account for buffer type byte */
@@ -4210,10 +4276,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(search_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, search_name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, search_name);
 	}
 
 	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
@@ -4379,10 +4443,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, searchName);
 	}
 
 	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
@@ -4482,17 +4544,16 @@
 			pSMB->FileName[name_len+1] = 0;
 			name_len += 2;
 		}
-	} else {	/* BB add check for overrun of SMB buf BB */
-		name_len = strnlen(searchName, PATH_MAX);
-/* BB fix here and in unicode clause above ie
-		if (name_len > buffersize-header)
-			free buffer exit; BB */
-		strncpy(pSMB->FileName, searchName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, searchName);
 		if (msearch) {
-			pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
-			pSMB->FileName[name_len+1] = '*';
-			pSMB->FileName[name_len+2] = 0;
-			name_len += 3;
+			if (WARN_ON_ONCE(name_len > PATH_MAX-2))
+				name_len = PATH_MAX-2;
+			/* overwrite nul byte */
+			pSMB->FileName[name_len-1] = CIFS_DIR_SEP(cifs_sb);
+			pSMB->FileName[name_len] = '*';
+			pSMB->FileName[name_len+1] = 0;
+			name_len += 2;
 		}
 	}
 
@@ -4787,10 +4848,8 @@
 					   remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(search_name, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->FileName, search_name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, search_name);
 	}
 
 	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
@@ -4897,9 +4956,7 @@
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(search_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->RequestFileName, search_name, name_len);
+		name_len = copy_path_name(pSMB->RequestFileName, search_name);
 	}
 
 	if (ses->server->sign)
@@ -5027,6 +5084,13 @@
 				le16_to_cpu(response_data->BytesPerSector) *
 				le32_to_cpu(response_data->
 					SectorsPerAllocationUnit);
+			/*
+			 * much prefer larger but if server doesn't report
+			 * a valid size then 4K is a reasonable minimum
+			 */
+			if (FSData->f_bsize < 512)
+				FSData->f_bsize = 4096;
+
 			FSData->f_blocks =
 			       le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
@@ -5107,6 +5171,13 @@
 			    le32_to_cpu(response_data->BytesPerSector) *
 			    le32_to_cpu(response_data->
 					SectorsPerAllocationUnit);
+			/*
+			 * much prefer larger but if server doesn't report
+			 * a valid size then 4K is a reasonable minimum
+			 */
+			if (FSData->f_bsize < 512)
+				FSData->f_bsize = 4096;
+
 			FSData->f_blocks =
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
@@ -5470,6 +5541,13 @@
 				 data_offset);
 			FSData->f_bsize =
 					le32_to_cpu(response_data->BlockSize);
+			/*
+			 * much prefer larger but if server doesn't report
+			 * a valid size then 4K is a reasonable minimum
+			 */
+			if (FSData->f_bsize < 512)
+				FSData->f_bsize = 4096;
+
 			FSData->f_blocks =
 					le64_to_cpu(response_data->TotalBlocks);
 			FSData->f_bfree =
@@ -5531,10 +5609,8 @@
 				       PATH_MAX, cifs_sb->local_nls, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(file_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, file_name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, file_name);
 	}
 	params = 6 + name_len;
 	data_count = sizeof(struct file_end_of_file_info);
@@ -5827,10 +5903,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, fileName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, fileName);
 	}
 
 	params = 6 + name_len;
@@ -5908,10 +5982,8 @@
 				       PATH_MAX, nls_codepage);
 		name_len++;     /* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;     /* trailing null */
-		strncpy(pSMB->fileName, fileName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->fileName, fileName);
 	}
 	pSMB->attr = cpu_to_le16(dos_attrs);
 	pSMB->BufferFormat = 0x04;
@@ -6071,10 +6143,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(file_name, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, file_name, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, file_name);
 	}
 
 	params = 6 + name_len;
@@ -6166,10 +6236,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		list_len++;	/* trailing null */
 		list_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		list_len = strnlen(searchName, PATH_MAX);
-		list_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, list_len);
+	} else {
+		list_len = copy_path_name(pSMB->FileName, searchName);
 	}
 
 	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
@@ -6348,10 +6416,8 @@
 				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, fileName, name_len);
+	} else {
+		name_len = copy_path_name(pSMB->FileName, fileName);
 	}
 
 	params = 6 + name_len;
@@ -6425,93 +6491,3 @@
 	return rc;
 }
 #endif
-
-#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
-/*
- *	Years ago the kernel added a "dnotify" function for Samba server,
- *	to allow network clients (such as Windows) to display updated
- *	lists of files in directory listings automatically when
- *	files are added by one user when another user has the
- *	same directory open on their desktop.  The Linux cifs kernel
- *	client hooked into the kernel side of this interface for
- *	the same reason, but ironically when the VFS moved from
- *	"dnotify" to "inotify" it became harder to plug in Linux
- *	network file system clients (the most obvious use case
- *	for notify interfaces is when multiple users can update
- *	the contents of the same directory - exactly what network
- *	file systems can do) although the server (Samba) could
- *	still use it.  For the short term we leave the worker
- *	function ifdeffed out (below) until inotify is fixed
- *	in the VFS to make it easier to plug in network file
- *	system clients.  If inotify turns out to be permanently
- *	incompatible for network fs clients, we could instead simply
- *	expose this config flag by adding a future cifs (and smb2) notify ioctl.
- */
-int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
-		  const int notify_subdirs, const __u16 netfid,
-		  __u32 filter, struct file *pfile, int multishot,
-		  const struct nls_table *nls_codepage)
-{
-	int rc = 0;
-	struct smb_com_transaction_change_notify_req *pSMB = NULL;
-	struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
-	struct dir_notify_req *dnotify_req;
-	int bytes_returned;
-
-	cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid);
-	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
-
-	pSMB->TotalParameterCount = 0 ;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le32(2);
-	pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
-	pSMB->MaxSetupCount = 4;
-	pSMB->Reserved = 0;
-	pSMB->ParameterOffset = 0;
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 4; /* single byte does not need le conversion */
-	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	if (notify_subdirs)
-		pSMB->WatchTree = 1; /* one byte - no le conversion needed */
-	pSMB->Reserved2 = 0;
-	pSMB->CompletionFilter = cpu_to_le32(filter);
-	pSMB->Fid = netfid; /* file handle always le */
-	pSMB->ByteCount = 0;
-
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *)pSMBr, &bytes_returned,
-			 CIFS_ASYNC_OP);
-	if (rc) {
-		cifs_dbg(FYI, "Error in Notify = %d\n", rc);
-	} else {
-		/* Add file to outstanding requests */
-		/* BB change to kmem cache alloc */
-		dnotify_req = kmalloc(
-						sizeof(struct dir_notify_req),
-						 GFP_KERNEL);
-		if (dnotify_req) {
-			dnotify_req->Pid = pSMB->hdr.Pid;
-			dnotify_req->PidHigh = pSMB->hdr.PidHigh;
-			dnotify_req->Mid = pSMB->hdr.Mid;
-			dnotify_req->Tid = pSMB->hdr.Tid;
-			dnotify_req->Uid = pSMB->hdr.Uid;
-			dnotify_req->netfid = netfid;
-			dnotify_req->pfile = pfile;
-			dnotify_req->filter = filter;
-			dnotify_req->multishot = multishot;
-			spin_lock(&GlobalMid_Lock);
-			list_add_tail(&dnotify_req->lhead,
-					&GlobalDnotifyReqList);
-			spin_unlock(&GlobalMid_Lock);
-		} else
-			rc = -ENOMEM;
-	}
-	cifs_buf_release(pSMB);
-	return rc;
-}
-#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 52d71b6..ccaa8ba 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -56,6 +56,11 @@
 #include "fscache.h"
 #include "smb2proto.h"
 #include "smbdirect.h"
+#include "dns_resolve.h"
+#include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
 
 extern mempool_t *cifs_req_poolp;
 extern bool disable_legacy_dialects;
@@ -69,7 +74,7 @@
 	Opt_user_xattr, Opt_nouser_xattr,
 	Opt_forceuid, Opt_noforceuid,
 	Opt_forcegid, Opt_noforcegid,
-	Opt_noblocksend, Opt_noautotune,
+	Opt_noblocksend, Opt_noautotune, Opt_nolease,
 	Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
 	Opt_mapposix, Opt_nomapposix,
 	Opt_mapchars, Opt_nomapchars, Opt_sfu,
@@ -86,19 +91,21 @@
 	Opt_serverino, Opt_noserverino,
 	Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl,
 	Opt_acl, Opt_noacl, Opt_locallease,
-	Opt_sign, Opt_seal, Opt_noac,
+	Opt_sign, Opt_ignore_signature, Opt_seal, Opt_noac,
 	Opt_fsc, Opt_mfsymlinks,
 	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 	Opt_persistent, Opt_nopersistent,
 	Opt_resilient, Opt_noresilient,
-	Opt_domainauto, Opt_rdma,
+	Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs,
+	Opt_compress,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
 	Opt_cruid, Opt_gid, Opt_file_mode,
 	Opt_dirmode, Opt_port,
-	Opt_rsize, Opt_wsize, Opt_actimeo,
-	Opt_echo_interval, Opt_max_credits,
+	Opt_min_enc_offload,
+	Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
+	Opt_echo_interval, Opt_max_credits, Opt_handletimeout,
 	Opt_snapshot,
 
 	/* Mount options which take string value */
@@ -128,6 +135,7 @@
 	{ Opt_noforcegid, "noforcegid" },
 	{ Opt_noblocksend, "noblocksend" },
 	{ Opt_noautotune, "noautotune" },
+	{ Opt_nolease, "nolease" },
 	{ Opt_hard, "hard" },
 	{ Opt_soft, "soft" },
 	{ Opt_perm, "perm" },
@@ -170,12 +178,14 @@
 	{ Opt_serverino, "serverino" },
 	{ Opt_noserverino, "noserverino" },
 	{ Opt_rwpidforward, "rwpidforward" },
+	{ Opt_modesid, "modefromsid" },
 	{ Opt_cifsacl, "cifsacl" },
 	{ Opt_nocifsacl, "nocifsacl" },
 	{ Opt_acl, "acl" },
 	{ Opt_noacl, "noacl" },
 	{ Opt_locallease, "locallease" },
 	{ Opt_sign, "sign" },
+	{ Opt_ignore_signature, "signloosely" },
 	{ Opt_seal, "seal" },
 	{ Opt_noac, "noac" },
 	{ Opt_fsc, "fsc" },
@@ -199,12 +209,16 @@
 	{ Opt_dirmode, "dirmode=%s" },
 	{ Opt_dirmode, "dir_mode=%s" },
 	{ Opt_port, "port=%s" },
+	{ Opt_min_enc_offload, "esize=%s" },
+	{ Opt_blocksize, "bsize=%s" },
 	{ Opt_rsize, "rsize=%s" },
 	{ Opt_wsize, "wsize=%s" },
 	{ Opt_actimeo, "actimeo=%s" },
+	{ Opt_handletimeout, "handletimeout=%s" },
 	{ Opt_echo_interval, "echo_interval=%s" },
 	{ Opt_max_credits, "max_credits=%s" },
 	{ Opt_snapshot, "snapshot=%s" },
+	{ Opt_compress, "compress=%s" },
 
 	{ Opt_blank_user, "user=" },
 	{ Opt_blank_user, "username=" },
@@ -250,7 +264,9 @@
 	{ Opt_ignore, "dev" },
 	{ Opt_ignore, "mand" },
 	{ Opt_ignore, "nomand" },
+	{ Opt_ignore, "relatime" },
 	{ Opt_ignore, "_netdev" },
+	{ Opt_rootfs, "rootfs" },
 
 	{ Opt_err, NULL }
 };
@@ -287,6 +303,8 @@
 	Opt_cache_loose,
 	Opt_cache_strict,
 	Opt_cache_none,
+	Opt_cache_ro,
+	Opt_cache_rw,
 	Opt_cache_err
 };
 
@@ -294,6 +312,8 @@
 	{ Opt_cache_loose, "loose" },
 	{ Opt_cache_strict, "strict" },
 	{ Opt_cache_none, "none" },
+	{ Opt_cache_ro, "ro" },
+	{ Opt_cache_rw, "singleclient" },
 	{ Opt_cache_err, NULL }
 };
 
@@ -303,6 +323,7 @@
 	{ Smb_21, SMB21_VERSION_STRING },
 	{ Smb_30, SMB30_VERSION_STRING },
 	{ Smb_302, SMB302_VERSION_STRING },
+	{ Smb_302, ALT_SMB302_VERSION_STRING },
 	{ Smb_311, SMB311_VERSION_STRING },
 	{ Smb_311, ALT_SMB311_VERSION_STRING },
 	{ Smb_3any, SMB3ANY_VERSION_STRING },
@@ -314,8 +335,132 @@
 static int generic_ip_connect(struct TCP_Server_Info *server);
 static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
 static void cifs_prune_tlinks(struct work_struct *work);
-static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
-					const char *devname, bool is_smb3);
+static char *extract_hostname(const char *unc);
+
+/*
+ * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may
+ * get their ip addresses changed at some point.
+ *
+ * This should be called with server->srv_mutex held.
+ */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+	int rc;
+	int len;
+	char *unc, *ipaddr = NULL;
+
+	if (!server->hostname)
+		return -EINVAL;
+
+	len = strlen(server->hostname) + 3;
+
+	unc = kmalloc(len, GFP_KERNEL);
+	if (!unc) {
+		cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
+		return -ENOMEM;
+	}
+	scnprintf(unc, len, "\\\\%s", server->hostname);
+
+	rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
+	kfree(unc);
+
+	if (rc < 0) {
+		cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
+			 __func__, server->hostname, rc);
+		return rc;
+	}
+
+	rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
+				  strlen(ipaddr));
+	kfree(ipaddr);
+
+	return !rc ? -1 : 0;
+}
+#else
+static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+	return 0;
+}
+#endif
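
One subtlety in reconn_set_ipaddr() above: cifs_convert_address() returns
nonzero on success, hence the inverted "return !rc ? -1 : 0". The happy path,
annotated with hypothetical values for illustration:

	/* server->hostname = "srv1.example.com"
	 * len  = strlen(hostname) + 3     covers the two backslashes + nul
	 * unc  = "\\srv1.example.com"     (two literal backslashes)
	 * dns_resolve_server_name_to_ip(unc, &ipaddr) -> ipaddr = "10.0.0.2"
	 * cifs_convert_address() then rewrites server->dstaddr in place
	 */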
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+struct super_cb_data {
+	struct TCP_Server_Info *server;
+	struct cifs_sb_info *cifs_sb;
+};
+
+/* These functions must be called with server->srv_mutex held */
+
+static void super_cb(struct super_block *sb, void *arg)
+{
+	struct super_cb_data *d = arg;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
+
+	if (d->cifs_sb)
+		return;
+
+	cifs_sb = CIFS_SB(sb);
+	tcon = cifs_sb_master_tcon(cifs_sb);
+	if (tcon->ses->server == d->server)
+		d->cifs_sb = cifs_sb;
+}
+
+static inline struct cifs_sb_info *
+find_super_by_tcp(struct TCP_Server_Info *server)
+{
+	struct super_cb_data d = {
+		.server = server,
+		.cifs_sb = NULL,
+	};
+
+	iterate_supers_type(&cifs_fs_type, super_cb, &d);
+	return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+}
+
+static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
+				    struct cifs_sb_info *cifs_sb,
+				    struct dfs_cache_tgt_list *tgt_list,
+				    struct dfs_cache_tgt_iterator **tgt_it)
+{
+	const char *name;
+
+	if (!cifs_sb || !cifs_sb->origin_fullpath || !tgt_list ||
+	    !server->nr_targets)
+		return;
+
+	if (!*tgt_it) {
+		*tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+	} else {
+		*tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+		if (!*tgt_it)
+			*tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+	}
+
+	cifs_dbg(FYI, "%s: UNC: %s\n", __func__, cifs_sb->origin_fullpath);
+
+	name = dfs_cache_get_tgt_name(*tgt_it);
+
+	kfree(server->hostname);
+
+	server->hostname = extract_hostname(name);
+	if (IS_ERR(server->hostname)) {
+		cifs_dbg(FYI,
+			 "%s: failed to extract hostname from target: %ld\n",
+			 __func__, PTR_ERR(server->hostname));
+	}
+}
+
+static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb,
+					   struct dfs_cache_tgt_list *tl,
+					   struct dfs_cache_tgt_iterator **it)
+{
+	if (!cifs_sb->origin_fullpath)
+		return -EOPNOTSUPP;
+	return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl);
+}
+#endif
 
 /*
  * cifs tcp session reconnection
@@ -334,8 +479,35 @@
 	struct cifs_tcon *tcon;
 	struct mid_q_entry *mid_entry;
 	struct list_head retry_list;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct cifs_sb_info *cifs_sb = NULL;
+	struct dfs_cache_tgt_list tgt_list = {0};
+	struct dfs_cache_tgt_iterator *tgt_it = NULL;
+#endif
 
 	spin_lock(&GlobalMid_Lock);
+	server->nr_targets = 1;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	spin_unlock(&GlobalMid_Lock);
+	cifs_sb = find_super_by_tcp(server);
+	if (IS_ERR(cifs_sb)) {
+		rc = PTR_ERR(cifs_sb);
+		cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
+			 __func__, rc);
+		cifs_sb = NULL;
+	} else {
+		rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
+		if (rc && (rc != -EOPNOTSUPP)) {
+			cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
+				 __func__);
+		} else {
+			server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list);
+		}
+	}
+	cifs_dbg(FYI, "%s: will retry %d target(s)\n", __func__,
+		 server->nr_targets);
+	spin_lock(&GlobalMid_Lock);
+#endif
 	if (server->tcpStatus == CifsExiting) {
 		/* the demux thread will exit normally
 		next time through the loop */
@@ -347,7 +519,7 @@
 	server->maxBuf = 0;
 	server->max_read = 0;
 
-	cifs_dbg(FYI, "Reconnecting tcp session\n");
+	cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
 	trace_smb3_reconnect(server->CurrentMid, server->hostname);
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
@@ -392,9 +564,11 @@
 	spin_lock(&GlobalMid_Lock);
 	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		kref_get(&mid_entry->refcount);
 		if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
 			mid_entry->mid_state = MID_RETRY_NEEDED;
 		list_move(&mid_entry->qhead, &retry_list);
+		mid_entry->mid_flags |= MID_DELETED;
 	}
 	spin_unlock(&GlobalMid_Lock);
 	mutex_unlock(&server->srv_mutex);
@@ -404,23 +578,44 @@
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 		list_del_init(&mid_entry->qhead);
 		mid_entry->callback(mid_entry);
+		cifs_mid_q_entry_release(mid_entry);
+	}
+
+	if (cifs_rdma_enabled(server)) {
+		mutex_lock(&server->srv_mutex);
+		smbd_destroy(server);
+		mutex_unlock(&server->srv_mutex);
 	}
 
 	do {
 		try_to_freeze();
 
-		/* we should try only the port we connected to before */
 		mutex_lock(&server->srv_mutex);
+		/*
+		 * Set up next DFS target server (if any) for reconnect. If DFS
+		 * feature is disabled, then we will retry last server we
+		 * connected to before.
+		 */
 		if (cifs_rdma_enabled(server))
 			rc = smbd_reconnect(server);
 		else
 			rc = generic_ip_connect(server);
 		if (rc) {
 			cifs_dbg(FYI, "reconnect error %d\n", rc);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+			reconn_inval_dfs_target(server, cifs_sb, &tgt_list,
+						&tgt_it);
+#endif
+			rc = reconn_set_ipaddr(server);
+			if (rc) {
+				cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+					 __func__, rc);
+			}
 			mutex_unlock(&server->srv_mutex);
 			msleep(3000);
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
+			set_credits(server, 1);
 			spin_lock(&GlobalMid_Lock);
 			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsNeedNegotiate;
@@ -429,6 +624,22 @@
 		}
 	} while (server->tcpStatus == CifsNeedReconnect);
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	if (tgt_it) {
+		rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1,
+						    tgt_it);
+		if (rc) {
+			cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n",
+				 __func__, rc);
+		}
+		rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server);
+		if (rc) {
+			cifs_server_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n",
+				 __func__, rc);
+		}
+		dfs_cache_free_tgts(&tgt_list);
+	}
+#endif
 	if (server->tcpStatus == CifsNeedNegotiate)
 		mod_delayed_work(cifsiod_wq, &server->echo, 0);
 
@@ -479,7 +690,7 @@
 	if (!server->bigbuf) {
 		server->bigbuf = (char *)cifs_buf_get();
 		if (!server->bigbuf) {
-			cifs_dbg(VFS, "No memory for large SMB response\n");
+			cifs_server_dbg(VFS, "No memory for large SMB response\n");
 			msleep(3000);
 			/* retry will check if exiting */
 			return false;
@@ -492,7 +703,7 @@
 	if (!server->smallbuf) {
 		server->smallbuf = (char *)cifs_small_buf_get();
 		if (!server->smallbuf) {
-			cifs_dbg(VFS, "No memory for SMB response\n");
+			cifs_server_dbg(VFS, "No memory for SMB response\n");
 			msleep(1000);
 			/* retry will check if exiting */
 			return false;
@@ -510,7 +721,7 @@
 server_unresponsive(struct TCP_Server_Info *server)
 {
 	/*
-	 * We need to wait 2 echo intervals to make sure we handle such
+	 * We need to wait 3 echo intervals to make sure we handle such
 	 * situations right:
 	 * 1s  client sends a normal SMB request
 	 * 2s  client gets a response
@@ -522,9 +733,9 @@
 	 */
 	if ((server->tcpStatus == CifsGood ||
 	    server->tcpStatus == CifsNeedNegotiate) &&
-	    time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
-		cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
-			 server->hostname, (2 * server->echo_interval) / HZ);
+	    time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
+		cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
+			 (3 * server->echo_interval) / HZ);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return true;
@@ -533,6 +744,21 @@
 	return false;
 }
 
+static inline bool
+zero_credits(struct TCP_Server_Info *server)
+{
+	int val;
+
+	spin_lock(&server->req_lock);
+	val = server->credits + server->echo_credits + server->oplock_credits;
+	if (server->in_flight == 0 && val == 0) {
+		spin_unlock(&server->req_lock);
+		return true;
+	}
+	spin_unlock(&server->req_lock);
+	return false;
+}
+
 static int
 cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
 {
@@ -545,6 +771,12 @@
 	for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
 		try_to_freeze();
 
+		/* reconnect if no credits and no requests in flight */
+		if (zero_credits(server)) {
+			cifs_reconnect(server);
+			return -ECONNABORTED;
+		}
+
 		if (server_unresponsive(server))
 			return -ECONNABORTED;
 		if (cifs_rdma_enabled(server) && server->smbd_conn)
@@ -588,7 +820,7 @@
 {
 	struct msghdr smb_msg;
 	struct kvec iov = {.iov_base = buf, .iov_len = to_read};
-	iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read);
+	iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
 
 	return cifs_readv_from_socket(server, &smb_msg);
 }
@@ -600,7 +832,7 @@
 	struct msghdr smb_msg;
 	struct bio_vec bv = {
 		.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
-	iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
+	iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
 	return cifs_readv_from_socket(server, &smb_msg);
 }
 
@@ -641,7 +873,7 @@
 		wake_up(&server->response_q);
 		break;
 	default:
-		cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
+		cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
 		cifs_reconnect(server);
 	}
 
@@ -666,8 +898,10 @@
 	if (mid->mid_flags & MID_DELETED)
 		printk_once(KERN_WARNING
 			    "trying to dequeue a deleted mid\n");
-	else
+	else {
 		list_del_init(&mid->qhead);
+		mid->mid_flags |= MID_DELETED;
+	}
 	spin_unlock(&GlobalMid_Lock);
 }
 
@@ -720,10 +954,8 @@
 	wake_up_all(&server->request_q);
 	/* give those requests time to exit */
 	msleep(125);
-	if (cifs_rdma_enabled(server) && server->smbd_conn) {
-		smbd_destroy(server->smbd_conn);
-		server->smbd_conn = NULL;
-	}
+	if (cifs_rdma_enabled(server))
+		smbd_destroy(server);
 	if (server->ssocket) {
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
@@ -739,8 +971,10 @@
 		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+			kref_get(&mid_entry->refcount);
 			mid_entry->mid_state = MID_SHUTDOWN;
 			list_move(&mid_entry->qhead, &dispose_list);
+			mid_entry->mid_flags |= MID_DELETED;
 		}
 		spin_unlock(&GlobalMid_Lock);
 
@@ -750,6 +984,7 @@
 			cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
 			list_del_init(&mid_entry->qhead);
 			mid_entry->callback(mid_entry);
+			cifs_mid_q_entry_release(mid_entry);
 		}
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
@@ -790,7 +1025,7 @@
 	/* make sure this will fit in a large buffer */
 	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
 		server->vals->header_preamble_size) {
-		cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
+		cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return -ECONNABORTED;
@@ -845,7 +1080,7 @@
 	}
 
 	if (server->ops->is_status_pending &&
-	    server->ops->is_status_pending(buf, server, length))
+	    server->ops->is_status_pending(buf, server))
 		return -1;
 
 	if (!mid)
@@ -855,6 +1090,26 @@
 	return 0;
 }
 
+static void
+smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+	/*
+	 * SMB1 does not use credits.
+	 */
+	if (server->vals->header_preamble_size)
+		return;
+
+	if (shdr->CreditRequest) {
+		spin_lock(&server->req_lock);
+		server->credits += le16_to_cpu(shdr->CreditRequest);
+		spin_unlock(&server->req_lock);
+		wake_up(&server->request_q);
+	}
+}
+
+
 static int
 cifs_demultiplex_thread(void *p)
 {
@@ -875,6 +1130,7 @@
 		mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
 
 	set_freezable();
+	allow_kernel_signal(SIGKILL);
 	while (server->tcpStatus != CifsExiting) {
 		if (try_to_freeze())
 			continue;
@@ -910,7 +1166,7 @@
 		/* make sure we have enough to get to the MID */
 		if (server->pdu_size < HEADER_SIZE(server) - 1 -
 		    server->vals->header_preamble_size) {
-			cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
+			cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
 				 server->pdu_size);
 			cifs_reconnect(server);
 			wake_up(&server->response_q);
@@ -961,10 +1217,6 @@
 			continue;
 		}
 
-		if (server->large_buf)
-			buf = server->bigbuf;
-
-
 		server->lstrp = jiffies;
 
 		for (i = 0; i < num_mids; i++) {
@@ -984,13 +1236,15 @@
 			} else if (server->ops->is_oplock_break &&
 				   server->ops->is_oplock_break(bufs[i],
 								server)) {
+				smb2_add_credits_from_hdr(bufs[i], server);
 				cifs_dbg(FYI, "Received oplock break\n");
 			} else {
-				cifs_dbg(VFS, "No task to wake, unknown frame "
+				cifs_server_dbg(VFS, "No task to wake, unknown frame "
 					 "received! NumMids %d\n",
 					 atomic_read(&midCount));
 				cifs_dump_mem("Received Data is: ", bufs[i],
 					      HEADER_SIZE(server));
+				smb2_add_credits_from_hdr(bufs[i], server);
 #ifdef CONFIG_CIFS_DEBUG2
 				if (server->ops->dump_detail)
 					server->ops->dump_detail(bufs[i],
@@ -1042,7 +1296,12 @@
 
 	/* skip double chars at beginning of string */
 	/* BB: check validity of these bytes? */
-	src = unc + 2;
+	if (strlen(unc) < 3)
+		return ERR_PTR(-EINVAL);
+	for (src = unc; *src && *src == '\\'; src++)
+		;
+	if (!*src)
+		return ERR_PTR(-EINVAL);
 
 	/* delimiter between hostname and sharename is always '\\' now */
 	delim = strchr(src, '\\');
@@ -1176,14 +1435,32 @@
 	case Opt_cache_loose:
 		vol->direct_io = false;
 		vol->strict_io = false;
+		vol->cache_ro = false;
+		vol->cache_rw = false;
 		break;
 	case Opt_cache_strict:
 		vol->direct_io = false;
 		vol->strict_io = true;
+		vol->cache_ro = false;
+		vol->cache_rw = false;
 		break;
 	case Opt_cache_none:
 		vol->direct_io = true;
 		vol->strict_io = false;
+		vol->cache_ro = false;
+		vol->cache_rw = false;
+		break;
+	case Opt_cache_ro:
+		vol->direct_io = false;
+		vol->strict_io = false;
+		vol->cache_ro = true;
+		vol->cache_rw = false;
+		break;
+	case Opt_cache_rw:
+		vol->direct_io = false;
+		vol->strict_io = false;
+		vol->cache_ro = false;
+		vol->cache_rw = true;
 		break;
 	default:
 		cifs_dbg(VFS, "bad cache= option: %s\n", value);
@@ -1273,6 +1550,11 @@
 	const char *delims = "/\\";
 	size_t len;
 
+	if (unlikely(!devname || !*devname)) {
+		cifs_dbg(VFS, "Device name not specified.\n");
+		return -EINVAL;
+	}
+
 	/* make sure we have a valid UNC double delimiter prefix */
 	len = strspn(devname, delims);
 	if (len != 2)
@@ -1358,7 +1640,7 @@
 	vol->cred_uid = current_uid();
 	vol->linux_uid = current_uid();
 	vol->linux_gid = current_gid();
-
+	vol->bsize = 1024 * 1024; /* can improve cp performance significantly */
 	/*
 	 * default to SFM style remapping of seven reserved characters
 	 * unless user overrides it or we negotiate CIFS POSIX where
@@ -1381,6 +1663,9 @@
 
 	vol->actimeo = CIFS_DEF_ACTIMEO;
 
+	/* Most clients set timeout to 0, which lets the server use its default */
+	vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
+
 	/* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */
 	vol->ops = &smb30_operations;
 	vol->vals = &smbdefault_values;
@@ -1463,6 +1748,9 @@
 		case Opt_noautotune:
 			vol->noautotune = 1;
 			break;
+		case Opt_nolease:
+			vol->no_lease = 1;
+			break;
 		case Opt_hard:
 			vol->retry = 1;
 			break;
@@ -1498,6 +1786,11 @@
 		case Opt_nodfs:
 			vol->nodfs = 1;
 			break;
+		case Opt_rootfs:
+#ifdef CONFIG_CIFS_ROOT
+			vol->rootfs = true;
+#endif
+			break;
 		case Opt_posixpaths:
 			vol->posix_paths = 1;
 			break;
@@ -1584,6 +1877,9 @@
 		case Opt_rwpidforward:
 			vol->rwpidforward = 1;
 			break;
+		case Opt_modesid:
+			vol->mode_ace = 1;
+			break;
 		case Opt_cifsacl:
 			vol->cifs_acl = 1;
 			break;
@@ -1602,6 +1898,10 @@
 		case Opt_sign:
 			vol->sign = true;
 			break;
+		case Opt_ignore_signature:
+			vol->sign = true;
+			vol->ignore_signature = true;
+			break;
 		case Opt_seal:
 			/* we do not do the following in secFlags because seal
 			 * is a per tree connection (mount) not a per socket
@@ -1665,6 +1965,11 @@
 		case Opt_rdma:
 			vol->rdma = true;
 			break;
+		case Opt_compress:
+			vol->compression = UNKNOWN_TYPE;
+			cifs_dbg(VFS,
+				"SMB3 compression support is experimental\n");
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
@@ -1731,6 +2036,33 @@
 			}
 			port = (unsigned short)option;
 			break;
+		case Opt_min_enc_offload:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "Invalid minimum encrypted read offload size (esize)\n");
+				goto cifs_parse_mount_err;
+			}
+			vol->min_offload = option;
+			break;
+		case Opt_blocksize:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid blocksize value\n",
+					__func__);
+				goto cifs_parse_mount_err;
+			}
+			/*
+			 * inode blocksize realistically should never need to be
+			 * less than 16K or greater than 16M, and the default is 1MB.
+			 * Note that small inode block sizes (e.g. 64K) can lead
+			 * to very poor performance of common tools like cp and scp.
+			 */
+			if ((option < CIFS_MAX_MSGSIZE) ||
+			   (option > (4 * SMB3_DEFAULT_IOSIZE))) {
+				cifs_dbg(VFS, "%s: Invalid blocksize\n",
+					__func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->bsize = option;
+			break;
 		case Opt_rsize:
 			if (get_option_ul(args, &option)) {
 				cifs_dbg(VFS, "%s: Invalid rsize value\n",
@@ -1759,6 +2091,18 @@
 				goto cifs_parse_mount_err;
 			}
 			break;
+		case Opt_handletimeout:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid handletimeout value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->handle_timeout = option;
+			if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) {
+				cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n");
+				goto cifs_parse_mount_err;
+			}
+			break;
 		case Opt_echo_interval:
 			if (get_option_ul(args, &option)) {
 				cifs_dbg(VFS, "%s: Invalid echo interval value\n",
@@ -1826,7 +2170,7 @@
 				vol->password = NULL;
 				break;
 			}
-			/* Yes it is. Drop down to Opt_pass below.*/
+			/* Fallthrough - to Opt_pass below. */
 		case Opt_pass:
 			/* Obtain the value string */
 			value = strchr(data, '=');
@@ -2170,6 +2514,10 @@
 {
 	__be16 port, *sport;
 
+	/* SMBDirect manages its own ports, don't match it here */
+	if (server->rdma)
+		return true;
+
 	switch (addr->sa_family) {
 	case AF_INET:
 		sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
@@ -2262,8 +2610,15 @@
 	if (vol->nosharesock)
 		return 0;
 
-	/* BB update this for smb3any and default case */
-	if ((server->vals != vol->vals) || (server->ops != vol->ops))
+	/* If multidialect negotiation, see if an existing session matches one */
+	if (strcmp(vol->vals->version_string, SMB3ANY_VERSION_STRING) == 0) {
+		if (server->vals->protocol_id < SMB30_PROT_ID)
+			return 0;
+	} else if (strcmp(vol->vals->version_string,
+		   SMBDEFAULT_VERSION_STRING) == 0) {
+		if (server->vals->protocol_id < SMB21_PROT_ID)
+			return 0;
+	} else if ((server->vals != vol->vals) || (server->ops != vol->ops))
 		return 0;
 
 	if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
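
In effect this relaxes TCP-session reuse for the multidialect mounts. A
hypothetical pair of mounts that can now share one socket (names are examples
only):

	mount -t cifs //srv/a /mnt/a -o vers=3.0
	mount -t cifs //srv/b /mnt/b           # vers defaults to SMB2.1+

The second mount negotiates SMBDEFAULT, and since the existing session's
protocol_id is at least SMB21_PROT_ID, match_server() no longer rejects it
just because server->vals != vol->vals.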
@@ -2285,10 +2640,16 @@
 	if (server->rdma != vol->rdma)
 		return 0;
 
+	if (server->ignore_signature != vol->ignore_signature)
+		return 0;
+
+	if (server->min_offload != vol->min_offload)
+		return 0;
+
 	return 1;
 }
 
-static struct TCP_Server_Info *
+struct TCP_Server_Info *
 cifs_find_tcp_session(struct smb_vol *vol)
 {
 	struct TCP_Server_Info *server;
@@ -2349,7 +2710,7 @@
 
 	task = xchg(&server->tsk, NULL);
 	if (task)
-		force_sig(SIGKILL, task);
+		send_sig(SIGKILL, task, 1);
 }
 
 static struct TCP_Server_Info *
@@ -2380,11 +2741,13 @@
 		goto out_err_crypto_release;
 	}
 
-	tcp_ses->noblocksnd = volume_info->noblocksnd;
+	tcp_ses->noblockcnt = volume_info->rootfs;
+	tcp_ses->noblocksnd = volume_info->noblocksnd || volume_info->rootfs;
 	tcp_ses->noautotune = volume_info->noautotune;
 	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
 	tcp_ses->rdma = volume_info->rdma;
 	tcp_ses->in_flight = 0;
+	tcp_ses->max_in_flight = 0;
 	tcp_ses->credits = 1;
 	init_waitqueue_head(&tcp_ses->response_q);
 	init_waitqueue_head(&tcp_ses->request_q);
@@ -2396,7 +2759,9 @@
 		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
 	tcp_ses->session_estab = false;
 	tcp_ses->sequence_number = 0;
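+	/* reconnect_instance starts at 1 and is bumped on every reconnect */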
+	tcp_ses->reconnect_instance = 1;
 	tcp_ses->lstrp = jiffies;
+	tcp_ses->compress_algorithm = cpu_to_le16(volume_info->compression);
 	spin_lock_init(&tcp_ses->req_lock);
 	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
 	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -2457,8 +2822,11 @@
 		module_put(THIS_MODULE);
 		goto out_err_crypto_release;
 	}
+	tcp_ses->min_offload = volume_info->min_offload;
 	tcp_ses->tcpStatus = CifsNeedNegotiate;
 
+	tcp_ses->nr_targets = 1;
+	tcp_ses->ignore_signature = volume_info->ignore_signature;
 	/* thread spawned, put it on the list */
 	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
@@ -2535,16 +2903,17 @@
 	struct nls_table *nls_codepage;
 	char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
 	bool seal = false;
+	struct TCP_Server_Info *server = ses->server;
 
 	/*
 	 * If the mount request that resulted in the creation of the
 	 * session requires encryption, force IPC to be encrypted too.
 	 */
 	if (volume_info->seal) {
-		if (ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)
+		if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)
 			seal = true;
 		else {
-			cifs_dbg(VFS,
+			cifs_server_dbg(VFS,
 				 "IPC: server doesn't support encryption\n");
 			return -EOPNOTSUPP;
 		}
@@ -2554,7 +2923,7 @@
 	if (tcon == NULL)
 		return -ENOMEM;
 
-	snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
+	scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname);
 
 	/* cannot fail */
 	nls_codepage = load_nls_default();
@@ -2563,11 +2932,11 @@
 	tcon->ses = ses;
 	tcon->ipc = true;
 	tcon->seal = seal;
-	rc = ses->server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage);
+	rc = server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage);
 	free_xid(xid);
 
 	if (rc) {
-		cifs_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
+		cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
 		tconInfoFree(tcon);
 		goto out;
 	}
@@ -2627,8 +2996,7 @@
 	return NULL;
 }
 
-static void
-cifs_put_smb_ses(struct cifs_ses *ses)
+void cifs_put_smb_ses(struct cifs_ses *ses)
 {
 	unsigned int rc, xid;
 	struct TCP_Server_Info *server = ses->server;
@@ -2654,7 +3022,7 @@
 		xid = get_xid();
 		rc = server->ops->logoff(xid, ses);
 		if (rc)
-			cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+			cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
 				__func__, rc);
 		_free_xid(xid);
 	}
@@ -2677,6 +3045,7 @@
 cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 {
 	int rc = 0;
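+	/* set when the credentials key was found by domain rather than address */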
+	int is_domain = 0;
 	const char *delim, *payload;
 	char *desc;
 	ssize_t len;
@@ -2724,6 +3093,7 @@
 			rc = PTR_ERR(key);
 			goto out_err;
 		}
+		is_domain = 1;
 	}
 
 	down_read(&key->sem);
@@ -2781,6 +3151,26 @@
 		goto out_key_put;
 	}
 
+	/*
+	 * If we have a domain key then we must set the domainName in the
+	 * volume info for the request.
+	 */
+	if (is_domain && ses->domainName) {
+		vol->domainname = kstrndup(ses->domainName,
+					   strlen(ses->domainName),
+					   GFP_KERNEL);
+		if (!vol->domainname) {
+			cifs_dbg(FYI, "Unable to allocate %zu bytes for domain\n",
+				 strlen(ses->domainName));
+			rc = -ENOMEM;
+			kfree(vol->username);
+			vol->username = NULL;
+			kzfree(vol->password);
+			vol->password = NULL;
+			goto out_key_put;
+		}
+	}
+
 out_key_put:
 	up_read(&key->sem);
 	key_put(key);
@@ -2805,7 +3195,7 @@
  * already got a server reference (server refcount +1). See
  * cifs_get_tcon() for refcount explanations.
  */
-static struct cifs_ses *
+struct cifs_ses *
 cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
 	int rc = -ENOMEM;
@@ -2886,7 +3276,6 @@
 
 	ses->sectype = volume_info->sectype;
 	ses->sign = volume_info->sign;
-
 	mutex_lock(&ses->session_mutex);
 	rc = cifs_negotiate_protocol(xid, ses);
 	if (!rc)
@@ -2922,6 +3311,10 @@
 		return 0;
 	if (tcon->snapshot_time != volume_info->snapshot_time)
 		return 0;
+	if (tcon->handle_timeout != volume_info->handle_timeout)
+		return 0;
+	if (tcon->no_lease != volume_info->no_lease)
+		return 0;
 	return 1;
 }
 
@@ -3036,6 +3429,16 @@
 			tcon->snapshot_time = volume_info->snapshot_time;
 	}
 
+	if (volume_info->handle_timeout) {
+		if (ses->server->vals->protocol_id == 0) {
+			cifs_dbg(VFS,
+			     "Use SMB2.1 or later for handle timeout option\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+		} else {
+			tcon->handle_timeout = volume_info->handle_timeout;
+		}
+	}
+
 	tcon->ses = ses;
 	if (volume_info->password) {
 		tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
@@ -3085,10 +3488,6 @@
 	if (rc)
 		goto out_fail;
 
-	if (volume_info->nodfs) {
-		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-		cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
-	}
 	tcon->use_persistent = false;
 	/* check if SMB2 or later, CIFS does not support persistent handles */
 	if (volume_info->persistent) {
@@ -3121,6 +3520,14 @@
 		tcon->use_resilient = true;
 	}
 
+	/* If the user really knows what they are doing they can override */
+	if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) {
+		if (volume_info->cache_ro)
+			cifs_dbg(VFS, "cache=ro requested on mount but NO_CACHING flag set on share\n");
+		else if (volume_info->cache_rw)
+			cifs_dbg(VFS, "cache=singleclient requested on mount but NO_CACHING flag set on share\n");
+	}
+
 	/*
 	 * We can have only one retry value for a connection to a share so for
 	 * resources mounted more than once to the same server share the last
@@ -3130,6 +3537,7 @@
 	tcon->nocase = volume_info->nocase;
 	tcon->nohandlecache = volume_info->nohandlecache;
 	tcon->local_lease = volume_info->local_lease;
+	tcon->no_lease = volume_info->no_lease;
 	INIT_LIST_HEAD(&tcon->pending_opens);
 
 	spin_lock(&cifs_tcp_ses_lock);
@@ -3168,12 +3576,16 @@
 {
 	struct cifs_sb_info *old = CIFS_SB(sb);
 	struct cifs_sb_info *new = mnt_data->cifs_sb;
+	unsigned int oldflags = old->mnt_cifs_flags & CIFS_MOUNT_MASK;
+	unsigned int newflags = new->mnt_cifs_flags & CIFS_MOUNT_MASK;
 
 	if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK))
 		return 0;
 
-	if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
-	    (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
+	if (old->mnt_cifs_serverino_autodisabled)
+		newflags &= ~CIFS_MOUNT_SERVER_INUM;
+
+	if (oldflags != newflags)
 		return 0;
 
 	/*
@@ -3258,25 +3670,6 @@
 	return rc;
 }
 
-int
-get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
-	     const struct nls_table *nls_codepage, unsigned int *num_referrals,
-	     struct dfs_info3_param **referrals, int remap)
-{
-	int rc = 0;
-
-	if (!ses->server->ops->get_dfs_refer)
-		return -ENOSYS;
-
-	*num_referrals = 0;
-	*referrals = NULL;
-
-	rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
-					     referrals, num_referrals,
-					     nls_codepage, remap);
-	return rc;
-}
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key cifs_key[2];
 static struct lock_class_key cifs_slock_key[2];
@@ -3340,10 +3733,10 @@
 			saddr4 = (struct sockaddr_in *)&server->srcaddr;
 			saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
 			if (saddr6->sin6_family == AF_INET6)
-				cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
+				cifs_server_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
 					 &saddr6->sin6_addr, rc);
 			else
-				cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
+				cifs_server_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
 					 &saddr4->sin_addr.s_addr, rc);
 		}
 	}
@@ -3447,7 +3840,7 @@
 		rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
 				   IPPROTO_TCP, &socket, 1);
 		if (rc < 0) {
-			cifs_dbg(VFS, "Error %d creating socket\n", rc);
+			cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
 			server->ssocket = NULL;
 			return rc;
 		}
@@ -3495,7 +3888,15 @@
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
-	rc = socket->ops->connect(socket, saddr, slen, 0);
+	rc = socket->ops->connect(socket, saddr, slen,
+				  server->noblockcnt ? O_NONBLOCK : 0);
+	/*
+	 * When mounting SMB root file systems, we do not want to block in
+	 * connect. Otherwise bail out and then let cifs_reconnect() perform
+	 * reconnect failover - if possible.
+	 */
+	if (server->noblockcnt && rc == -EINPROGRESS)
+		rc = 0;
 	if (rc < 0) {
 		cifs_dbg(FYI, "Error %d connecting to server\n", rc);
 		sock_release(socket);
@@ -3646,6 +4047,7 @@
 	spin_lock_init(&cifs_sb->tlink_tree_lock);
 	cifs_sb->tlink_tree = RB_ROOT;
 
+	cifs_sb->bsize = pvolume_info->bsize;
 	/*
 	 * Temporarily set r/wsize for matching superblock. If we end up using
 	 * new sb then client will later negotiate it downward if needed.
@@ -3663,6 +4065,8 @@
 	cifs_sb->actimeo = pvolume_info->actimeo;
 	cifs_sb->local_nls = pvolume_info->local_nls;
 
+	if (pvolume_info->nodfs)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS;
 	if (pvolume_info->noperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
 	if (pvolume_info->setuids)
@@ -3689,6 +4093,8 @@
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
 	if (pvolume_info->rwpidforward)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
+	if (pvolume_info->mode_ace)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID;
 	if (pvolume_info->cifs_acl)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
 	if (pvolume_info->backupuid_specified) {
@@ -3716,6 +4122,14 @@
 		cifs_dbg(FYI, "mounting share using direct i/o\n");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
+	if (pvolume_info->cache_ro) {
+		cifs_dbg(VFS, "mounting share with read only caching. Ensure that the share will not be modified while in use.\n");
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RO_CACHE;
+	} else if (pvolume_info->cache_rw) {
+		cifs_dbg(VFS, "mounting share in single client RW caching mode. Ensure that no other systems will be accessing the share.\n");
+		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_RO_CACHE |
+					    CIFS_MOUNT_RW_CACHE);
+	}
 	if (pvolume_info->mfsymlinks) {
 		if (pvolume_info->sfu_emul) {
 			/*
@@ -3746,8 +4160,8 @@
 	return 0;
 }
 
-static void
-cleanup_volume_info_contents(struct smb_vol *volume_info)
+void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info)
 {
 	kfree(volume_info->username);
 	kzfree(volume_info->password);
@@ -3762,10 +4176,146 @@
 {
 	if (!volume_info)
 		return;
-	cleanup_volume_info_contents(volume_info);
+	cifs_cleanup_volume_info_contents(volume_info);
 	kfree(volume_info);
 }
 
+/* Release all established connections */
+static inline void mount_put_conns(struct cifs_sb_info *cifs_sb,
+				   unsigned int xid,
+				   struct TCP_Server_Info *server,
+				   struct cifs_ses *ses, struct cifs_tcon *tcon)
+{
+	if (tcon)
+		cifs_put_tcon(tcon);
+	else if (ses)
+		cifs_put_smb_ses(ses);
+	else if (server)
+		cifs_put_tcp_session(server, 0);
+	cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+	free_xid(xid);
+}
+
+/* Get connections for tcp, ses and tcon */
+static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+			   unsigned int *xid,
+			   struct TCP_Server_Info **nserver,
+			   struct cifs_ses **nses, struct cifs_tcon **ntcon)
+{
+	int rc = 0;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+
+	*nserver = NULL;
+	*nses = NULL;
+	*ntcon = NULL;
+
+	*xid = get_xid();
+
+	/* get a reference to a tcp session */
+	server = cifs_get_tcp_session(vol);
+	if (IS_ERR(server)) {
+		rc = PTR_ERR(server);
+		return rc;
+	}
+
+	*nserver = server;
+
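+	/* fall back to the default credit limit if the requested one is out of range */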
+	if ((vol->max_credits < 20) || (vol->max_credits > 60000))
+		server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
+	else
+		server->max_credits = vol->max_credits;
+
+	/* get a reference to a SMB session */
+	ses = cifs_get_smb_ses(server, vol);
+	if (IS_ERR(ses)) {
+		rc = PTR_ERR(ses);
+		return rc;
+	}
+
+	*nses = ses;
+
+	if ((vol->persistent == true) && (!(ses->server->capabilities &
+					    SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
+		cifs_server_dbg(VFS, "persistent handles not supported by server\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* search for existing tcon to this server share */
+	tcon = cifs_get_tcon(ses, vol);
+	if (IS_ERR(tcon)) {
+		rc = PTR_ERR(tcon);
+		return rc;
+	}
+
+	*ntcon = tcon;
+
+	/* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+	if (tcon->posix_extensions)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+
+	/* tell server which Unix caps we support */
+	if (cap_unix(tcon->ses)) {
+		/*
+		 * reset_cifs_unix_caps() also checks the mount options to see
+		 * whether unix extensions were disabled for just this mount.
+		 */
+		reset_cifs_unix_caps(*xid, tcon, cifs_sb, vol);
+		if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+		    (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+		     CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP))
+			return -EACCES;
+	} else
+		tcon->unix_ext = 0; /* server does not support them */
+
+	/* do not care if the following call succeeds - informational */
+	if (!tcon->pipe && server->ops->qfs_tcon) {
+		server->ops->qfs_tcon(*xid, tcon);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
+			if (tcon->fsDevInfo.DeviceCharacteristics &
+			    cpu_to_le32(FILE_READ_ONLY_DEVICE))
+				cifs_dbg(VFS, "mounted to read only share\n");
+			else if ((cifs_sb->mnt_cifs_flags &
+				  CIFS_MOUNT_RW_CACHE) == 0)
+				cifs_dbg(VFS, "read only mount of RW share\n");
+			/* no need to log a RW mount of a typical RW share */
+		}
+	}
+
+	cifs_sb->wsize = server->ops->negotiate_wsize(tcon, vol);
+	cifs_sb->rsize = server->ops->negotiate_rsize(tcon, vol);
+
+	return 0;
+}
+
+static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+			     struct cifs_tcon *tcon)
+{
+	struct tcon_link *tlink;
+
+	/* hang the tcon off of the superblock */
+	tlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+	if (tlink == NULL)
+		return -ENOMEM;
+
+	tlink->tl_uid = ses->linux_uid;
+	tlink->tl_tcon = tcon;
+	tlink->tl_time = jiffies;
+	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+	cifs_sb->master_tlink = tlink;
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+	return 0;
+}
 
 #ifdef CONFIG_CIFS_DFS_UPCALL
 /*
@@ -3774,22 +4324,26 @@
  */
 static char *
 build_unc_path_to_root(const struct smb_vol *vol,
-		const struct cifs_sb_info *cifs_sb)
+		       const struct cifs_sb_info *cifs_sb, bool useppath)
 {
 	char *full_path, *pos;
-	unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+	unsigned int pplen = useppath && vol->prepath ?
+		strlen(vol->prepath) + 1 : 0;
 	unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
 
+	if (unc_len > MAX_TREE_SIZE)
+		return ERR_PTR(-EINVAL);
+
 	full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	strncpy(full_path, vol->UNC, unc_len);
+	memcpy(full_path, vol->UNC, unc_len);
 	pos = full_path + unc_len;
 
 	if (pplen) {
 		*pos = CIFS_DIR_SEP(cifs_sb);
-		strncpy(pos + 1, vol->prepath, pplen);
+		memcpy(pos + 1, vol->prepath, pplen);
 		pos += pplen;
 	}
 
@@ -3799,8 +4353,9 @@
 	return full_path;
 }
 
-/*
- * Perform a dfs referral query for a share and (optionally) prefix
+/**
+ * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb
  *
  * If a referral is found, cifs_sb->mountdata will be (re-)allocated
  * to a string containing updated options for the submount.  Otherwise it
@@ -3815,36 +4370,36 @@
 		    int check_prefix)
 {
 	int rc;
-	unsigned int num_referrals = 0;
-	struct dfs_info3_param *referrals = NULL;
+	struct dfs_info3_param referral = {0};
 	char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
 
-	full_path = build_unc_path_to_root(volume_info, cifs_sb);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+		return -EREMOTE;
+
+	full_path = build_unc_path_to_root(volume_info, cifs_sb, true);
 	if (IS_ERR(full_path))
 		return PTR_ERR(full_path);
 
 	/* For DFS paths, skip the first '\' of the UNC */
 	ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
 
-	rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
-			  &num_referrals, &referrals, cifs_remap(cifs_sb));
-
-	if (!rc && num_referrals > 0) {
+	rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+			    ref_path, &referral, NULL);
+	if (!rc) {
 		char *fake_devname = NULL;
 
 		mdata = cifs_compose_mount_options(cifs_sb->mountdata,
-						   full_path + 1, referrals,
+						   full_path + 1, &referral,
 						   &fake_devname);
-
-		free_dfs_info_array(referrals, num_referrals);
+		free_dfs_info_param(&referral);
 
 		if (IS_ERR(mdata)) {
 			rc = PTR_ERR(mdata);
 			mdata = NULL;
 		} else {
-			cleanup_volume_info_contents(volume_info);
+			cifs_cleanup_volume_info_contents(volume_info);
 			rc = cifs_setup_volume_info(volume_info, mdata,
-							fake_devname, false);
+						    fake_devname, false);
 		}
 		kfree(fake_devname);
 		kfree(cifs_sb->mountdata);
@@ -3853,9 +4408,146 @@
 	kfree(full_path);
 	return rc;
 }
+
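+/* Advance @tgt_it to the next cached target; -EHOSTDOWN once exhausted */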
+static inline int get_next_dfs_tgt(const char *path,
+				   struct dfs_cache_tgt_list *tgt_list,
+				   struct dfs_cache_tgt_iterator **tgt_it)
+{
+	if (!*tgt_it)
+		*tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+	else
+		*tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+	return !*tgt_it ? -EHOSTDOWN : 0;
+}
+
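+/* Point vol->UNC (plus prepath and dstaddr) at the new DFS target */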
+static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
+			   struct smb_vol *fake_vol, struct smb_vol *vol)
+{
+	const char *tgt = dfs_cache_get_tgt_name(tgt_it);
+	int len = strlen(tgt) + 2;
+	char *new_unc;
+
+	new_unc = kmalloc(len, GFP_KERNEL);
+	if (!new_unc)
+		return -ENOMEM;
+	scnprintf(new_unc, len, "\\%s", tgt);
+
+	kfree(vol->UNC);
+	vol->UNC = new_unc;
+
+	if (fake_vol->prepath) {
+		kfree(vol->prepath);
+		vol->prepath = fake_vol->prepath;
+		fake_vol->prepath = NULL;
+	}
+	memcpy(&vol->dstaddr, &fake_vol->dstaddr, sizeof(vol->dstaddr));
+
+	return 0;
+}
+
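+/*
+ * Tear down the current connection and try to set up a new one against the
+ * given DFS target, updating @vol on success.
+ */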
+static int setup_dfs_tgt_conn(const char *path,
+			      const struct dfs_cache_tgt_iterator *tgt_it,
+			      struct cifs_sb_info *cifs_sb,
+			      struct smb_vol *vol,
+			      unsigned int *xid,
+			      struct TCP_Server_Info **server,
+			      struct cifs_ses **ses,
+			      struct cifs_tcon **tcon)
+{
+	int rc;
+	struct dfs_info3_param ref = {0};
+	char *mdata = NULL, *fake_devname = NULL;
+	struct smb_vol fake_vol = {NULL};
+
+	cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
+
+	rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
+	if (rc)
+		return rc;
+
+	mdata = cifs_compose_mount_options(cifs_sb->mountdata, path, &ref,
+					   &fake_devname);
+	free_dfs_info_param(&ref);
+
+	if (IS_ERR(mdata)) {
+		rc = PTR_ERR(mdata);
+		mdata = NULL;
+	} else {
+		cifs_dbg(FYI, "%s: fake_devname: %s\n", __func__, fake_devname);
+		rc = cifs_setup_volume_info(&fake_vol, mdata, fake_devname,
+					    false);
+	}
+	kfree(mdata);
+	kfree(fake_devname);
+
+	if (!rc) {
+		/*
+		 * We use a 'fake_vol' here because we need to pass it down to
+		 * the mount_{get,put} functions to test the connection against
+		 * new DFS targets.
+		 */
+		mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
+		rc = mount_get_conns(&fake_vol, cifs_sb, xid, server, ses,
+				     tcon);
+		if (!rc) {
+			/*
+			 * We were able to connect to new target server.
+			 * Update current volume info with new target server.
+			 */
+			rc = update_vol_info(tgt_it, &fake_vol, vol);
+		}
+	}
+	cifs_cleanup_volume_info_contents(&fake_vol);
+	return rc;
+}
+
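+/*
+ * Try each cached DFS target in turn until one connects, then record it as
+ * the new target hint in the referral cache.
+ */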
+static int mount_do_dfs_failover(const char *path,
+				 struct cifs_sb_info *cifs_sb,
+				 struct smb_vol *vol,
+				 struct cifs_ses *root_ses,
+				 unsigned int *xid,
+				 struct TCP_Server_Info **server,
+				 struct cifs_ses **ses,
+				 struct cifs_tcon **tcon)
+{
+	int rc;
+	struct dfs_cache_tgt_list tgt_list;
+	struct dfs_cache_tgt_iterator *tgt_it = NULL;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+		return -EOPNOTSUPP;
+
+	rc = dfs_cache_noreq_find(path, NULL, &tgt_list);
+	if (rc)
+		return rc;
+
+	for (;;) {
+		/* Get next DFS target server - if any */
+		rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it);
+		if (rc)
+			break;
+		/* Connect to next DFS target */
+		rc = setup_dfs_tgt_conn(path, tgt_it, cifs_sb, vol, xid, server,
+					ses, tcon);
+		if (!rc || rc == -EACCES || rc == -EOPNOTSUPP)
+			break;
+	}
+	if (!rc) {
+		/*
+		 * Update DFS target hint in DFS referral cache with the target
+		 * server we successfully reconnected to.
+		 */
+		rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses,
+					      cifs_sb->local_nls,
+					      cifs_remap(cifs_sb), path,
+					      tgt_it);
+	}
+	dfs_cache_free_tgts(&tgt_list);
+	return rc;
+}
 #endif
 
-static int
+int
 cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 			const char *devname, bool is_smb3)
 {
@@ -3918,11 +4610,13 @@
 					unsigned int xid,
 					struct cifs_tcon *tcon,
 					struct cifs_sb_info *cifs_sb,
-					char *full_path)
+					char *full_path,
+					int added_treename)
 {
 	int rc;
 	char *s;
 	char sep, tmp;
+	int skip = added_treename ? 1 : 0;
 
 	sep = CIFS_DIR_SEP(cifs_sb);
 	s = full_path;
@@ -3937,7 +4631,14 @@
 		/* next separator */
 		while (*s && *s != sep)
 			s++;
-
+		/*
+		 * if the treename is added, we then have to skip the first
+		 * part within the separators
+		 */
+		if (skip) {
+			skip = 0;
+			continue;
+		}
 		/*
 		 * temporarily null-terminate the path at the end of
 		 * the current component
@@ -3951,107 +4652,107 @@
 	return rc;
 }
 
-int
-cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
+/*
+ * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
+ * 0 if it is accessible, or another negative errno on failure.
+ */
+static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
+			  const unsigned int xid,
+			  struct TCP_Server_Info *server,
+			  struct cifs_tcon *tcon)
 {
 	int rc;
+	char *full_path;
+
+	if (!server->ops->is_path_accessible)
+		return -EOPNOTSUPP;
+
+	/*
+	 * cifs_build_path_to_root works only when we have a valid tcon
+	 */
+	full_path = cifs_build_path_to_root(vol, cifs_sb, tcon,
+					    tcon->Flags & SMB_SHARE_IS_IN_DFS);
+	if (full_path == NULL)
+		return -ENOMEM;
+
+	cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
+
+	rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+					     full_path);
+	if (rc != 0 && rc != -EREMOTE) {
+		kfree(full_path);
+		return rc;
+	}
+
+	if (rc != -EREMOTE) {
+		rc = cifs_are_all_path_components_accessible(server, xid, tcon,
+			cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS);
+		if (rc != 0) {
+			cifs_server_dbg(VFS, "cannot query dirs between root and final path, enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+			rc = 0;
+		}
+	}
+
+	kfree(full_path);
+	return rc;
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+	int rc = 0;
 	unsigned int xid;
 	struct cifs_ses *ses;
-	struct cifs_tcon *tcon;
+	struct cifs_tcon *root_tcon = NULL;
+	struct cifs_tcon *tcon = NULL;
 	struct TCP_Server_Info *server;
-	char   *full_path;
-	struct tcon_link *tlink;
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	int referral_walks_count = 0;
-#endif
+	char *root_path = NULL, *full_path = NULL;
+	char *old_mountdata, *origin_mountdata = NULL;
+	int count;
 
-#ifdef CONFIG_CIFS_DFS_UPCALL
-try_mount_again:
-	/* cleanup activities if we're chasing a referral */
-	if (referral_walks_count) {
-		if (tcon)
-			cifs_put_tcon(tcon);
-		else if (ses)
-			cifs_put_smb_ses(ses);
-
-		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
-
-		free_xid(xid);
-	}
-#endif
-	rc = 0;
-	tcon = NULL;
-	ses = NULL;
-	server = NULL;
-	full_path = NULL;
-	tlink = NULL;
-
-	xid = get_xid();
-
-	/* get a reference to a tcp session */
-	server = cifs_get_tcp_session(volume_info);
-	if (IS_ERR(server)) {
-		rc = PTR_ERR(server);
-		goto out;
-	}
-	if ((volume_info->max_credits < 20) ||
-	     (volume_info->max_credits > 60000))
-		server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
-	else
-		server->max_credits = volume_info->max_credits;
-	/* get a reference to a SMB session */
-	ses = cifs_get_smb_ses(server, volume_info);
-	if (IS_ERR(ses)) {
-		rc = PTR_ERR(ses);
-		ses = NULL;
-		goto mount_fail_check;
-	}
-
-	if ((volume_info->persistent == true) && ((ses->server->capabilities &
-		SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
-		cifs_dbg(VFS, "persistent handles not supported by server\n");
-		rc = -EOPNOTSUPP;
-		goto mount_fail_check;
-	}
-
-	/* search for existing tcon to this server share */
-	tcon = cifs_get_tcon(ses, volume_info);
-	if (IS_ERR(tcon)) {
-		rc = PTR_ERR(tcon);
-		tcon = NULL;
-		if (rc == -EACCES)
-			goto mount_fail_check;
-
-		goto remote_path_check;
-	}
-
-	/* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
-	if (tcon->posix_extensions)
-		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
-
-	/* tell server which Unix caps we support */
-	if (cap_unix(tcon->ses)) {
-		/* reset of caps checks mount to see if unix extensions
-		   disabled for just this mount */
-		reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
-		if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
-		    (le64_to_cpu(tcon->fsUnixInfo.Capability) &
-		     CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
-			rc = -EACCES;
-			goto mount_fail_check;
+	rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+	if (!rc && tcon) {
+		/* If not a standalone DFS root, then check if path is remote */
+		rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+				    cifs_remap(cifs_sb), vol->UNC + 1, NULL,
+				    NULL);
+		if (rc) {
+			rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+			if (!rc)
+				goto out;
+			if (rc != -EREMOTE)
+				goto error;
 		}
-	} else
-		tcon->unix_ext = 0; /* server does not support them */
+	}
+	/*
+	 * If the first DFS target server went offline and we failed to connect
+	 * to it, the server and ses pointers are NULL at this point, though we
+	 * still have a chance to get a cached DFS referral in
+	 * expand_dfs_referral() and retry the next target available in it.
+	 *
+	 * If a NULL ses ptr is passed to dfs_cache_find(), a lookup will be
+	 * performed against the DFS path and *no* requests will be sent to the
+	 * server for any new DFS referrals. Hence it's safe to skip checking
+	 * whether the server or ses ptr is NULL.
+	 */
+	if (rc == -EACCES || rc == -EOPNOTSUPP)
+		goto error;
 
-	/* do not care if a following call succeed - informational */
-	if (!tcon->pipe && server->ops->qfs_tcon)
-		server->ops->qfs_tcon(xid, tcon);
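+	/* root_path omits any prepath; full_path below includes it */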
+	root_path = build_unc_path_to_root(vol, cifs_sb, false);
+	if (IS_ERR(root_path)) {
+		rc = PTR_ERR(root_path);
+		root_path = NULL;
+		goto error;
+	}
 
-	cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
-	cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
-
-remote_path_check:
-#ifdef CONFIG_CIFS_DFS_UPCALL
+	full_path = build_unc_path_to_root(vol, cifs_sb, true);
+	if (IS_ERR(full_path)) {
+		rc = PTR_ERR(full_path);
+		full_path = NULL;
+		goto error;
+	}
 	/*
 	 * Perform an unconditional check for whether there are DFS
 	 * referrals for this path without prefix, to provide support
@@ -4059,119 +4760,182 @@
 	 * with PATH_NOT_COVERED to requests that include the prefix.
 	 * Chase the referral if found, otherwise continue normally.
 	 */
-	if (referral_walks_count == 0) {
-		int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb,
-						false);
-		if (!refrc) {
-			referral_walks_count++;
-			goto try_mount_again;
-		}
-	}
-#endif
+	old_mountdata = cifs_sb->mountdata;
+	(void)expand_dfs_referral(xid, ses, vol, cifs_sb, false);
 
-	/* check if a whole path is not remote */
-	if (!rc && tcon) {
-		if (!server->ops->is_path_accessible) {
-			rc = -ENOSYS;
-			goto mount_fail_check;
+	if (cifs_sb->mountdata == NULL) {
+		rc = -ENOENT;
+		goto error;
+	}
+
+	/* Save DFS root volume information for DFS refresh worker */
+	origin_mountdata = kstrndup(cifs_sb->mountdata,
+				    strlen(cifs_sb->mountdata), GFP_KERNEL);
+	if (!origin_mountdata) {
+		rc = -ENOMEM;
+		goto error;
+	}
+
+	if (cifs_sb->mountdata != old_mountdata) {
+		/* If we were redirected, reconnect to new target server */
+		mount_put_conns(cifs_sb, xid, server, ses, tcon);
+		rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+	}
+	if (rc) {
+		if (rc == -EACCES || rc == -EOPNOTSUPP)
+			goto error;
+		/* Perform DFS failover to any other DFS targets */
+		rc = mount_do_dfs_failover(root_path + 1, cifs_sb, vol, NULL,
+					   &xid, &server, &ses, &tcon);
+		if (rc)
+			goto error;
+	}
+
+	kfree(root_path);
+	root_path = build_unc_path_to_root(vol, cifs_sb, false);
+	if (IS_ERR(root_path)) {
+		rc = PTR_ERR(root_path);
+		root_path = NULL;
+		goto error;
+	}
+	/* Cache out resolved root server */
+	(void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+			     root_path + 1, NULL, NULL);
+	/*
+	 * Save root tcon for additional DFS requests to update or create a new
+	 * DFS cache entry, or even perform DFS failover.
+	 */
+	spin_lock(&cifs_tcp_ses_lock);
+	tcon->tc_count++;
+	tcon->dfs_path = root_path;
+	root_path = NULL;
+	tcon->remap = cifs_remap(cifs_sb);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	root_tcon = tcon;
+
+	for (count = 1; ;) {
+		if (!rc && tcon) {
+			rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+			if (!rc || rc != -EREMOTE)
+				break;
 		}
 		/*
-		 * cifs_build_path_to_root works only when we have a valid tcon
+		 * BB: when we implement proper loop detection,
+		 *     we will remove this check. But now we need it
+		 *     to prevent an indefinite loop if 'DFS tree' is
+		 *     misconfigured (i.e. has loops).
 		 */
-		full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon,
-					tcon->Flags & SMB_SHARE_IS_IN_DFS);
-		if (full_path == NULL) {
-			rc = -ENOMEM;
-			goto mount_fail_check;
-		}
-		rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
-						     full_path);
-		if (rc != 0 && rc != -EREMOTE) {
-			kfree(full_path);
-			goto mount_fail_check;
-		}
-
-		if (rc != -EREMOTE) {
-			rc = cifs_are_all_path_components_accessible(server,
-							     xid, tcon, cifs_sb,
-							     full_path);
-			if (rc != 0) {
-				cifs_dbg(VFS, "cannot query dirs between root and final path, "
-					 "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
-				cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
-				rc = 0;
-			}
-		}
-		kfree(full_path);
-	}
-
-	/* get referral if needed */
-	if (rc == -EREMOTE) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
-		if (referral_walks_count > MAX_NESTED_LINKS) {
-			/*
-			 * BB: when we implement proper loop detection,
-			 *     we will remove this check. But now we need it
-			 *     to prevent an indefinite loop if 'DFS tree' is
-			 *     misconfigured (i.e. has loops).
-			 */
+		if (count++ > MAX_NESTED_LINKS) {
 			rc = -ELOOP;
-			goto mount_fail_check;
+			break;
 		}
 
-		rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true);
-
-		if (!rc) {
-			referral_walks_count++;
-			goto try_mount_again;
+		kfree(full_path);
+		full_path = build_unc_path_to_root(vol, cifs_sb, true);
+		if (IS_ERR(full_path)) {
+			rc = PTR_ERR(full_path);
+			full_path = NULL;
+			break;
 		}
-		goto mount_fail_check;
-#else /* No DFS support, return error on mount */
-		rc = -EOPNOTSUPP;
-#endif
+
+		old_mountdata = cifs_sb->mountdata;
+		rc = expand_dfs_referral(xid, root_tcon->ses, vol, cifs_sb,
+					 true);
+		if (rc)
+			break;
+
+		if (cifs_sb->mountdata != old_mountdata) {
+			mount_put_conns(cifs_sb, xid, server, ses, tcon);
+			rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
+					     &tcon);
+		}
+		if (rc) {
+			if (rc == -EACCES || rc == -EOPNOTSUPP)
+				break;
+			/* Perform DFS failover to any other DFS targets */
+			rc = mount_do_dfs_failover(full_path + 1, cifs_sb, vol,
+						   root_tcon->ses, &xid,
+						   &server, &ses, &tcon);
+			if (rc == -EACCES || rc == -EOPNOTSUPP || !server ||
+			    !ses)
+				goto error;
+		}
 	}
+	cifs_put_tcon(root_tcon);
 
 	if (rc)
-		goto mount_fail_check;
+		goto error;
 
-	/* now, hang the tcon off of the superblock */
-	tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
-	if (tlink == NULL) {
+	spin_lock(&cifs_tcp_ses_lock);
+	if (!tcon->dfs_path) {
+		/* Save full path in new tcon to do failover when reconnecting tcons */
+		tcon->dfs_path = full_path;
+		full_path = NULL;
+		tcon->remap = cifs_remap(cifs_sb);
+	}
+	cifs_sb->origin_fullpath = kstrndup(tcon->dfs_path,
+					    strlen(tcon->dfs_path),
+					    GFP_ATOMIC);
+	if (!cifs_sb->origin_fullpath) {
+		spin_unlock(&cifs_tcp_ses_lock);
 		rc = -ENOMEM;
-		goto mount_fail_check;
+		goto error;
 	}
+	spin_unlock(&cifs_tcp_ses_lock);
 
-	tlink->tl_uid = ses->linux_uid;
-	tlink->tl_tcon = tcon;
-	tlink->tl_time = jiffies;
-	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
-	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
-
-	cifs_sb->master_tlink = tlink;
-	spin_lock(&cifs_sb->tlink_tree_lock);
-	tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
-	spin_unlock(&cifs_sb->tlink_tree_lock);
-
-	queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
-				TLINK_IDLE_EXPIRE);
-
-mount_fail_check:
-	/* on error free sesinfo and tcon struct if needed */
+	rc = dfs_cache_add_vol(origin_mountdata, vol, cifs_sb->origin_fullpath);
 	if (rc) {
-		/* If find_unc succeeded then rc == 0 so we can not end */
-		/* up accidentally freeing someone elses tcon struct */
-		if (tcon)
-			cifs_put_tcon(tcon);
-		else if (ses)
-			cifs_put_smb_ses(ses);
-		else
-			cifs_put_tcp_session(server, 0);
+		kfree(cifs_sb->origin_fullpath);
+		goto error;
 	}
-
+	/*
+	 * After reconnecting to a different server, unique ids won't
+	 * match anymore, so we disable serverino. This prevents
+	 * dentry revalidation from treating dentries as stale (ESTALE).
+	 */
+	cifs_autodisable_serverino(cifs_sb);
 out:
 	free_xid(xid);
+	return mount_setup_tlink(cifs_sb, ses, tcon);
+
+error:
+	kfree(full_path);
+	kfree(root_path);
+	kfree(origin_mountdata);
+	mount_put_conns(cifs_sb, xid, server, ses, tcon);
 	return rc;
 }
+#else
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+	int rc = 0;
+	unsigned int xid;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
+
+	rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+	if (rc)
+		goto error;
+
+	if (tcon) {
+		rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+		if (rc == -EREMOTE)
+			rc = -EOPNOTSUPP;
+		if (rc)
+			goto error;
+	}
+
+	free_xid(xid);
+
+	return mount_setup_tlink(cifs_sb, ses, tcon);
+
+error:
+	mount_put_conns(cifs_sb, xid, server, ses, tcon);
+	return rc;
+}
+#endif
 
 /*
  * Issue a TREE_CONNECT request.
@@ -4367,6 +5131,10 @@
 
 	kfree(cifs_sb->mountdata);
 	kfree(cifs_sb->prepath);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	dfs_cache_del_vol(cifs_sb->origin_fullpath);
+	kfree(cifs_sb->origin_fullpath);
+#endif
 	call_rcu(&cifs_sb->rcu, delayed_free);
 }
 
@@ -4383,8 +5151,6 @@
 	if (!server->ops->need_neg(server))
 		return 0;
 
-	set_credits(server, 1);
-
 	rc = server->ops->negotiate(xid, ses);
 	if (rc == 0) {
 		spin_lock(&GlobalMid_Lock);
@@ -4424,7 +5190,7 @@
 		rc = server->ops->sess_setup(xid, ses, nls_info);
 
 	if (rc)
-		cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
+		cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
 
 	return rc;
 }
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
new file mode 100644
index 0000000..1692c0c
--- /dev/null
+++ b/fs/cifs/dfs_cache.c
@@ -0,0 +1,1478 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018-2019 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/jhash.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/nls.h>
+#include <linux/workqueue.h>
+#include "cifsglob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2glob.h"
+
+#include "dfs_cache.h"
+
+#define DFS_CACHE_HTABLE_SIZE 32
+#define DFS_CACHE_MAX_ENTRIES 64
+
+#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
+				    DFSREF_STORAGE_SERVER))
+
+struct dfs_cache_tgt {
+	char *t_name;
+	struct list_head t_list;
+};
+
+struct dfs_cache_entry {
+	struct hlist_node ce_hlist;
+	const char *ce_path;
+	int ce_ttl;
+	int ce_srvtype;
+	int ce_flags;
+	struct timespec64 ce_etime;
+	int ce_path_consumed;
+	int ce_numtgts;
+	struct list_head ce_tlist;
+	struct dfs_cache_tgt *ce_tgthint;
+	struct rcu_head ce_rcu;
+};
+
+static struct kmem_cache *dfs_cache_slab __read_mostly;
+
+struct dfs_cache_vol_info {
+	char *vi_fullpath;
+	struct smb_vol vi_vol;
+	char *vi_mntdata;
+	struct list_head vi_list;
+};
+
+struct dfs_cache {
+	struct mutex dc_lock;
+	struct nls_table *dc_nlsc;
+	struct list_head dc_vol_list;
+	int dc_ttl;
+	struct delayed_work dc_refresh;
+};
+
+static struct dfs_cache dfs_cache;
+
+/*
+ * Number of entries in the cache
+ */
+static size_t dfs_cache_count;
+
+static DEFINE_MUTEX(dfs_cache_list_lock);
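+/* Hash table of referral cache entries, keyed by a jhash of the UNC path */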
+static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE];
+
+static void refresh_cache_worker(struct work_struct *work);
+
+static inline bool is_path_valid(const char *path)
+{
+	return path && (strchr(path + 1, '\\') || strchr(path + 1, '/'));
+}
+
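+/*
+ * Normalize @path to use backslash delimiters. The input is returned as-is
+ * when already normalized, so callers must release the result with
+ * free_normalized_path().
+ */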
+static inline int get_normalized_path(const char *path, char **npath)
+{
+	if (*path == '\\') {
+		*npath = (char *)path;
+	} else {
+		*npath = kstrndup(path, strlen(path), GFP_KERNEL);
+		if (!*npath)
+			return -ENOMEM;
+		convert_delimiter(*npath, '\\');
+	}
+	return 0;
+}
+
+static inline void free_normalized_path(const char *path, char *npath)
+{
+	if (path != npath)
+		kfree(npath);
+}
+
+static inline bool cache_entry_expired(const struct dfs_cache_entry *ce)
+{
+	struct timespec64 ts;
+
+	ktime_get_coarse_real_ts64(&ts);
+	return timespec64_compare(&ts, &ce->ce_etime) >= 0;
+}
+
+static inline void free_tgts(struct dfs_cache_entry *ce)
+{
+	struct dfs_cache_tgt *t, *n;
+
+	list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) {
+		list_del(&t->t_list);
+		kfree(t->t_name);
+		kfree(t);
+	}
+}
+
+static void free_cache_entry(struct rcu_head *rcu)
+{
+	struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry,
+						  ce_rcu);
+	kmem_cache_free(dfs_cache_slab, ce);
+}
+
+static inline void flush_cache_ent(struct dfs_cache_entry *ce)
+{
+	if (hlist_unhashed(&ce->ce_hlist))
+		return;
+
+	hlist_del_init_rcu(&ce->ce_hlist);
+	kfree_const(ce->ce_path);
+	free_tgts(ce);
+	dfs_cache_count--;
+	call_rcu(&ce->ce_rcu, free_cache_entry);
+}
+
+static void flush_cache_ents(void)
+{
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) {
+		struct hlist_head *l = &dfs_cache_htable[i];
+		struct dfs_cache_entry *ce;
+
+		hlist_for_each_entry_rcu(ce, l, ce_hlist)
+			flush_cache_ent(ce);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * dfs cache /proc file
+ */
+static int dfscache_proc_show(struct seq_file *m, void *v)
+{
+	int bucket;
+	struct dfs_cache_entry *ce;
+	struct dfs_cache_tgt *t;
+
+	seq_puts(m, "DFS cache\n---------\n");
+
+	mutex_lock(&dfs_cache_list_lock);
+
+	rcu_read_lock();
+	hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+		seq_printf(m,
+			   "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+			   "interlink=%s,path_consumed=%d,expired=%s\n",
+			   ce->ce_path,
+			   ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link",
+			   ce->ce_ttl, ce->ce_etime.tv_nsec,
+			   IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+			   ce->ce_path_consumed,
+			   cache_entry_expired(ce) ? "yes" : "no");
+
+		list_for_each_entry(t, &ce->ce_tlist, t_list) {
+			seq_printf(m, "  %s%s\n",
+				   t->t_name,
+				   ce->ce_tgthint == t ? " (target hint)" : "");
+		}
+
+	}
+	rcu_read_unlock();
+
+	mutex_unlock(&dfs_cache_list_lock);
+	return 0;
+}
+
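+/* Writing '0' to the dfscache proc file flushes all cached entries */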
+static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer,
+				   size_t count, loff_t *ppos)
+{
+	char c;
+	int rc;
+
+	rc = get_user(c, buffer);
+	if (rc)
+		return rc;
+
+	if (c != '0')
+		return -EINVAL;
+
+	cifs_dbg(FYI, "clearing dfs cache\n");
+	mutex_lock(&dfs_cache_list_lock);
+	flush_cache_ents();
+	mutex_unlock(&dfs_cache_list_lock);
+
+	return count;
+}
+
+static int dfscache_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dfscache_proc_show, NULL);
+}
+
+const struct file_operations dfscache_proc_fops = {
+	.open		= dfscache_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= dfscache_proc_write,
+};
+
+#ifdef CONFIG_CIFS_DEBUG2
+static inline void dump_tgts(const struct dfs_cache_entry *ce)
+{
+	struct dfs_cache_tgt *t;
+
+	cifs_dbg(FYI, "target list:\n");
+	list_for_each_entry(t, &ce->ce_tlist, t_list) {
+		cifs_dbg(FYI, "  %s%s\n", t->t_name,
+			 ce->ce_tgthint == t ? " (target hint)" : "");
+	}
+}
+
+static inline void dump_ce(const struct dfs_cache_entry *ce)
+{
+	cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+		 "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path,
+		 ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl,
+		 ce->ce_etime.tv_nsec,
+		 IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+		 ce->ce_path_consumed,
+		 cache_entry_expired(ce) ? "yes" : "no");
+	dump_tgts(ce);
+}
+
+static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs)
+{
+	int i;
+
+	cifs_dbg(FYI, "DFS referrals returned by the server:\n");
+	for (i = 0; i < numrefs; i++) {
+		const struct dfs_info3_param *ref = &refs[i];
+
+		cifs_dbg(FYI,
+			 "\n"
+			 "flags:         0x%x\n"
+			 "path_consumed: %d\n"
+			 "server_type:   0x%x\n"
+			 "ref_flag:      0x%x\n"
+			 "path_name:     %s\n"
+			 "node_name:     %s\n"
+			 "ttl:           %d (%dm)\n",
+			 ref->flags, ref->path_consumed, ref->server_type,
+			 ref->ref_flag, ref->path_name, ref->node_name,
+			 ref->ttl, ref->ttl / 60);
+	}
+}
+#else
+#define dump_tgts(e)
+#define dump_ce(e)
+#define dump_refs(r, n)
+#endif
+
+/**
+ * dfs_cache_init - Initialize DFS referral cache.
+ *
+ * Return zero if initialized successfully, otherwise non-zero.
+ */
+int dfs_cache_init(void)
+{
+	int i;
+
+	dfs_cache_slab = kmem_cache_create("cifs_dfs_cache",
+					   sizeof(struct dfs_cache_entry), 0,
+					   SLAB_HWCACHE_ALIGN, NULL);
+	if (!dfs_cache_slab)
+		return -ENOMEM;
+
+	for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++)
+		INIT_HLIST_HEAD(&dfs_cache_htable[i]);
+
+	INIT_LIST_HEAD(&dfs_cache.dc_vol_list);
+	mutex_init(&dfs_cache.dc_lock);
+	INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker);
+	dfs_cache.dc_ttl = -1;
+	dfs_cache.dc_nlsc = load_nls_default();
+
+	cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
+	return 0;
+}
+
+static inline unsigned int cache_entry_hash(const void *data, int size)
+{
+	unsigned int h;
+
+	h = jhash(data, size, 0);
+	return h & (DFS_CACHE_HTABLE_SIZE - 1);
+}
+
+/* Check whether second path component of @path is SYSVOL or NETLOGON */
+static inline bool is_sysvol_or_netlogon(const char *path)
+{
+	const char *s;
+	char sep = path[0];
+
+	s = strchr(path + 1, sep) + 1;
+	return !strncasecmp(s, "sysvol", strlen("sysvol")) ||
+		!strncasecmp(s, "netlogon", strlen("netlogon"));
+}
+
+/* Return target hint of a DFS cache entry */
+static inline char *get_tgt_name(const struct dfs_cache_entry *ce)
+{
+	struct dfs_cache_tgt *t = ce->ce_tgthint;
+
+	return t ? t->t_name : ERR_PTR(-ENOENT);
+}
+
+/* Return expire time out of a new entry's TTL */
+static inline struct timespec64 get_expire_time(int ttl)
+{
+	struct timespec64 ts = {
+		.tv_sec = ttl,
+		.tv_nsec = 0,
+	};
+	struct timespec64 now;
+
+	ktime_get_coarse_real_ts64(&now);
+	return timespec64_add(now, ts);
+}
+
+/* Allocate a new DFS target */
+static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
+{
+	struct dfs_cache_tgt *t;
+
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+	t->t_name = kstrndup(name, strlen(name), GFP_KERNEL);
+	if (!t->t_name) {
+		kfree(t);
+		return ERR_PTR(-ENOMEM);
+	}
+	INIT_LIST_HEAD(&t->t_list);
+	return t;
+}
+
+/*
+ * Copy DFS referral information to a cache entry and conditionally update
+ * target hint.
+ */
+static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
+			 struct dfs_cache_entry *ce, const char *tgthint)
+{
+	int i;
+
+	ce->ce_ttl = refs[0].ttl;
+	ce->ce_etime = get_expire_time(ce->ce_ttl);
+	ce->ce_srvtype = refs[0].server_type;
+	ce->ce_flags = refs[0].ref_flag;
+	ce->ce_path_consumed = refs[0].path_consumed;
+
+	for (i = 0; i < numrefs; i++) {
+		struct dfs_cache_tgt *t;
+
+		t = alloc_tgt(refs[i].node_name);
+		if (IS_ERR(t)) {
+			free_tgts(ce);
+			return PTR_ERR(t);
+		}
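+		/* keep the previous target hint at the head of the list */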
+		if (tgthint && !strcasecmp(t->t_name, tgthint)) {
+			list_add(&t->t_list, &ce->ce_tlist);
+			tgthint = NULL;
+		} else {
+			list_add_tail(&t->t_list, &ce->ce_tlist);
+		}
+		ce->ce_numtgts++;
+	}
+
+	ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist,
+						  struct dfs_cache_tgt, t_list);
+
+	return 0;
+}
+
+/* Allocate a new cache entry */
+static struct dfs_cache_entry *
+alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
+		  int numrefs)
+{
+	struct dfs_cache_entry *ce;
+	int rc;
+
+	ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL);
+	if (!ce)
+		return ERR_PTR(-ENOMEM);
+
+	ce->ce_path = kstrdup_const(path, GFP_KERNEL);
+	if (!ce->ce_path) {
+		kmem_cache_free(dfs_cache_slab, ce);
+		return ERR_PTR(-ENOMEM);
+	}
+	INIT_HLIST_NODE(&ce->ce_hlist);
+	INIT_LIST_HEAD(&ce->ce_tlist);
+
+	rc = copy_ref_data(refs, numrefs, ce, NULL);
+	if (rc) {
+		kfree_const(ce->ce_path);
+		kmem_cache_free(dfs_cache_slab, ce);
+		ce = ERR_PTR(rc);
+	}
+	return ce;
+}
+
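+/* Evict the cache entry with the earliest expiry time */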
+static void remove_oldest_entry(void)
+{
+	int bucket;
+	struct dfs_cache_entry *ce;
+	struct dfs_cache_entry *to_del = NULL;
+
+	rcu_read_lock();
+	hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+		if (!to_del || timespec64_compare(&ce->ce_etime,
+						  &to_del->ce_etime) < 0)
+			to_del = ce;
+	}
+	if (!to_del) {
+		cifs_dbg(FYI, "%s: no entry to remove\n", __func__);
+		goto out;
+	}
+	cifs_dbg(FYI, "%s: removing entry\n", __func__);
+	dump_ce(to_del);
+	flush_cache_ent(to_del);
+out:
+	rcu_read_unlock();
+}
+
+/* Add a new DFS cache entry */
+static inline struct dfs_cache_entry *
+add_cache_entry(unsigned int hash, const char *path,
+		const struct dfs_info3_param *refs, int numrefs)
+{
+	struct dfs_cache_entry *ce;
+
+	ce = alloc_cache_entry(path, refs, numrefs);
+	if (IS_ERR(ce))
+		return ce;
+
+	hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]);
+
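+	/* (re)arm the refresh worker with the shortest TTL seen so far */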
+	mutex_lock(&dfs_cache.dc_lock);
+	if (dfs_cache.dc_ttl < 0) {
+		dfs_cache.dc_ttl = ce->ce_ttl;
+		queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+				   dfs_cache.dc_ttl * HZ);
+	} else {
+		dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl);
+		mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+				 dfs_cache.dc_ttl * HZ);
+	}
+	mutex_unlock(&dfs_cache.dc_lock);
+
+	return ce;
+}
+
+static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
+						  const char *path)
+{
+	struct dfs_cache_entry *ce;
+	bool found = false;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) {
+		if (!strcasecmp(path, ce->ce_path)) {
+#ifdef CONFIG_CIFS_DEBUG2
+			char *name = get_tgt_name(ce);
+
+			if (IS_ERR(name)) {
+				rcu_read_unlock();
+				return ERR_CAST(name);
+			}
+			cifs_dbg(FYI, "%s: cache hit\n", __func__);
+			cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name);
+#endif
+			found = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found ? ce : ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find a DFS cache entry in the hash table for @path. The match is
+ * case-insensitive and uses whole path components.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
+ */
+static inline struct dfs_cache_entry *find_cache_entry(const char *path,
+						       unsigned int *hash)
+{
+	*hash = cache_entry_hash(path, strlen(path));
+	return __find_cache_entry(*hash, path);
+}
+
+static inline void destroy_slab_cache(void)
+{
+	rcu_barrier();
+	kmem_cache_destroy(dfs_cache_slab);
+}
+
+static inline void free_vol(struct dfs_cache_vol_info *vi)
+{
+	list_del(&vi->vi_list);
+	kfree(vi->vi_fullpath);
+	kfree(vi->vi_mntdata);
+	cifs_cleanup_volume_info_contents(&vi->vi_vol);
+	kfree(vi);
+}
+
+static inline void free_vol_list(void)
+{
+	struct dfs_cache_vol_info *vi, *nvi;
+
+	list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list)
+		free_vol(vi);
+}
+
+/**
+ * dfs_cache_destroy - destroy DFS referral cache
+ */
+void dfs_cache_destroy(void)
+{
+	cancel_delayed_work_sync(&dfs_cache.dc_refresh);
+	unload_nls(dfs_cache.dc_nlsc);
+	free_vol_list();
+	mutex_destroy(&dfs_cache.dc_lock);
+
+	flush_cache_ents();
+	destroy_slab_cache();
+	mutex_destroy(&dfs_cache_list_lock);
+
+	cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
+}
+
+static inline struct dfs_cache_entry *
+__update_cache_entry(const char *path, const struct dfs_info3_param *refs,
+		     int numrefs)
+{
+	int rc;
+	unsigned int h;
+	struct dfs_cache_entry *ce;
+	char *s, *th = NULL;
+
+	ce = find_cache_entry(path, &h);
+	if (IS_ERR(ce))
+		return ce;
+
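+	/* preserve the current target hint across the refresh below */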
+	if (ce->ce_tgthint) {
+		s = ce->ce_tgthint->t_name;
+		th = kstrndup(s, strlen(s), GFP_KERNEL);
+		if (!th)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	free_tgts(ce);
+	ce->ce_numtgts = 0;
+
+	rc = copy_ref_data(refs, numrefs, ce, th);
+	kfree(th);
+
+	if (rc)
+		ce = ERR_PTR(rc);
+
+	return ce;
+}
+
+/* Update an expired cache entry by getting a new DFS referral from server */
+static struct dfs_cache_entry *
+update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
+		   const struct nls_table *nls_codepage, int remap,
+		   const char *path, struct dfs_cache_entry *ce)
+{
+	int rc;
+	struct dfs_info3_param *refs = NULL;
+	int numrefs = 0;
+
+	cifs_dbg(FYI, "%s: update expired cache entry\n", __func__);
+	/*
+	 * Check if caller provided enough parameters to update an expired
+	 * entry.
+	 */
+	if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
+		return ERR_PTR(-ETIME);
+	if (unlikely(!nls_codepage))
+		return ERR_PTR(-ETIME);
+
+	cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path);
+
+	rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs,
+					     nls_codepage, remap);
+	if (rc)
+		ce = ERR_PTR(rc);
+	else
+		ce = __update_cache_entry(path, refs, numrefs);
+
+	dump_refs(refs, numrefs);
+	free_dfs_info_array(refs, numrefs);
+
+	return ce;
+}
+
+/*
+ * Find, create or update a DFS cache entry.
+ *
+ * If the entry wasn't found, it will create a new one. If it was found but
+ * expired, it will update the entry accordingly.
+ *
+ * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
+ * handle them properly.
+ */
+static struct dfs_cache_entry *
+do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+		  const struct nls_table *nls_codepage, int remap,
+		  const char *path, bool noreq)
+{
+	int rc;
+	unsigned int h;
+	struct dfs_cache_entry *ce;
+	struct dfs_info3_param *nrefs;
+	int numnrefs;
+
+	cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
+
+	ce = find_cache_entry(path, &h);
+	if (IS_ERR(ce)) {
+		cifs_dbg(FYI, "%s: cache miss\n", __func__);
+		/*
+		 * If @noreq is set, no requests will be sent to the server for
+		 * either updating or getting a new DFS referral.
+		 */
+		if (noreq)
+			return ce;
+		/*
+		 * No cache entry was found, so check for valid parameters that
+		 * will be required to get a new DFS referral and then create a
+		 * new cache entry.
+		 */
+		if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) {
+			ce = ERR_PTR(-EOPNOTSUPP);
+			return ce;
+		}
+		if (unlikely(!nls_codepage)) {
+			ce = ERR_PTR(-EINVAL);
+			return ce;
+		}
+
+		nrefs = NULL;
+		numnrefs = 0;
+
+		cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__,
+			 path);
+
+		rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs,
+						     &numnrefs, nls_codepage,
+						     remap);
+		if (rc) {
+			ce = ERR_PTR(rc);
+			return ce;
+		}
+
+		dump_refs(nrefs, numnrefs);
+
+		cifs_dbg(FYI, "%s: new cache entry\n", __func__);
+
+		if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) {
+			cifs_dbg(FYI, "%s: reached max cache size (%d)\n",
+				 __func__, DFS_CACHE_MAX_ENTRIES);
+			remove_oldest_entry();
+		}
+		ce = add_cache_entry(h, path, nrefs, numnrefs);
+		free_dfs_info_array(nrefs, numnrefs);
+
+		if (IS_ERR(ce))
+			return ce;
+
+		dfs_cache_count++;
+	}
+
+	dump_ce(ce);
+
+	/* Just return the found cache entry in case @noreq is set */
+	if (noreq)
+		return ce;
+
+	if (cache_entry_expired(ce)) {
+		cifs_dbg(FYI, "%s: expired cache entry\n", __func__);
+		ce = update_cache_entry(xid, ses, nls_codepage, remap, path,
+					ce);
+		if (IS_ERR(ce)) {
+			cifs_dbg(FYI, "%s: failed to update expired entry\n",
+				 __func__);
+		}
+	}
+	return ce;
+}
+
+/* Set up a new DFS referral from a given cache entry */
+static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
+		     struct dfs_info3_param *ref, const char *tgt)
+{
+	int rc;
+
+	cifs_dbg(FYI, "%s: set up new ref\n", __func__);
+
+	memset(ref, 0, sizeof(*ref));
+
+	ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL);
+	if (!ref->path_name)
+		return -ENOMEM;
+
+	ref->path_consumed = ce->ce_path_consumed;
+
+	ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL);
+	if (!ref->node_name) {
+		rc = -ENOMEM;
+		goto err_free_path;
+	}
+
+	ref->ttl = ce->ce_ttl;
+	ref->server_type = ce->ce_srvtype;
+	ref->ref_flag = ce->ce_flags;
+
+	return 0;
+
+err_free_path:
+	kfree(ref->path_name);
+	ref->path_name = NULL;
+	return rc;
+}
+
+/* Return target list of a DFS cache entry */
+static int get_tgt_list(const struct dfs_cache_entry *ce,
+			struct dfs_cache_tgt_list *tl)
+{
+	int rc;
+	struct list_head *head = &tl->tl_list;
+	struct dfs_cache_tgt *t;
+	struct dfs_cache_tgt_iterator *it, *nit;
+
+	memset(tl, 0, sizeof(*tl));
+	INIT_LIST_HEAD(head);
+
+	list_for_each_entry(t, &ce->ce_tlist, t_list) {
+		it = kzalloc(sizeof(*it), GFP_KERNEL);
+		if (!it) {
+			rc = -ENOMEM;
+			goto err_free_it;
+		}
+
+		it->it_name = kstrndup(t->t_name, strlen(t->t_name),
+				       GFP_KERNEL);
+		if (!it->it_name) {
+			kfree(it);
+			rc = -ENOMEM;
+			goto err_free_it;
+		}
+
+		if (ce->ce_tgthint == t)
+			list_add(&it->it_list, head);
+		else
+			list_add_tail(&it->it_list, head);
+	}
+	tl->tl_numtgts = ce->ce_numtgts;
+
+	return 0;
+
+err_free_it:
+	list_for_each_entry_safe(it, nit, head, it_list) {
+		kfree(it->it_name);
+		kfree(it);
+	}
+	return rc;
+}
+
+/**
+ * dfs_cache_find - find a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral
+ * for @path and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * These parameters are passed down to the get_dfs_refer() call if it
+ * needs to be issued:
+ * @xid: syscall xid
+ * @ses: smb session to issue the request on
+ * @nls_codepage: charset conversion
+ * @remap: path character remapping type
+ * @path: path to lookup in DFS referral cache.
+ *
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return zero if the target was found, otherwise non-zero.
+ */
+int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+		   const struct nls_table *nls_codepage, int remap,
+		   const char *path, struct dfs_info3_param *ref,
+		   struct dfs_cache_tgt_list *tgt_list)
+{
+	int rc;
+	char *npath;
+	struct dfs_cache_entry *ce;
+
+	if (unlikely(!is_path_valid(path)))
+		return -EINVAL;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		return rc;
+
+	mutex_lock(&dfs_cache_list_lock);
+	ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+	if (!IS_ERR(ce)) {
+		if (ref)
+			rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+		else
+			rc = 0;
+		if (!rc && tgt_list)
+			rc = get_tgt_list(ce, tgt_list);
+	} else {
+		rc = PTR_ERR(ce);
+	}
+	mutex_unlock(&dfs_cache_list_lock);
+	free_normalized_path(path, npath);
+	return rc;
+}
+
+/**
+ * dfs_cache_noreq_find - find a DFS cache entry without sending any requests to
+ * the currently connected server.
+ *
+ * NOTE: This function will neither update an expired cache entry nor
+ * create a new one if @path hasn't been found. It relies solely on an
+ * existing cache entry.
+ *
+ * @path: path to lookup in the DFS referral cache.
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return 0 if successful.
+ * Return -ENOENT if the entry was not found.
+ * Return non-zero for other errors.
+ */
+int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+			 struct dfs_cache_tgt_list *tgt_list)
+{
+	int rc;
+	char *npath;
+	struct dfs_cache_entry *ce;
+
+	if (unlikely(!is_path_valid(path)))
+		return -EINVAL;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		return rc;
+
+	mutex_lock(&dfs_cache_list_lock);
+	ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+	if (IS_ERR(ce)) {
+		rc = PTR_ERR(ce);
+		goto out;
+	}
+
+	if (ref)
+		rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+	else
+		rc = 0;
+	if (!rc && tgt_list)
+		rc = get_tgt_list(ce, tgt_list);
+out:
+	mutex_unlock(&dfs_cache_list_lock);
+	free_normalized_path(path, npath);
+	return rc;
+}
+
+/**
+ * dfs_cache_update_tgthint - update target hint of a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral for @path
+ * and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * @xid: syscall xid
+ * @ses: smb session
+ * @nls_codepage: charset conversion
+ * @remap: type of character remapping for paths
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
+			     const struct nls_table *nls_codepage, int remap,
+			     const char *path,
+			     const struct dfs_cache_tgt_iterator *it)
+{
+	int rc;
+	char *npath;
+	struct dfs_cache_entry *ce;
+	struct dfs_cache_tgt *t;
+
+	if (unlikely(!is_path_valid(path)))
+		return -EINVAL;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		return rc;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+	mutex_lock(&dfs_cache_list_lock);
+	ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+	if (IS_ERR(ce)) {
+		rc = PTR_ERR(ce);
+		goto out;
+	}
+
+	rc = 0;
+
+	t = ce->ce_tgthint;
+
+	if (likely(!strcasecmp(it->it_name, t->t_name)))
+		goto out;
+
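+	/* Hint differs from @it; find the matching target and make it the hint */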
+	list_for_each_entry(t, &ce->ce_tlist, t_list) {
+		if (!strcasecmp(t->t_name, it->it_name)) {
+			ce->ce_tgthint = t;
+			cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+				 it->it_name);
+			break;
+		}
+	}
+
+out:
+	mutex_unlock(&dfs_cache_list_lock);
+	free_normalized_path(path, npath);
+	return rc;
+}
+
+/**
+ * dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry
+ * without sending any requests to the currently connected server.
+ *
+ * NOTE: This function will neither update an expired cache entry nor
+ * create a new one if @path hasn't been found. It relies solely on an
+ * existing cache entry.
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: target iterator which contains the target hint to update the cache
+ * entry with.
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_noreq_update_tgthint(const char *path,
+				   const struct dfs_cache_tgt_iterator *it)
+{
+	int rc;
+	char *npath;
+	struct dfs_cache_entry *ce;
+	struct dfs_cache_tgt *t;
+
+	if (unlikely(!is_path_valid(path)) || !it)
+		return -EINVAL;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		return rc;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+	mutex_lock(&dfs_cache_list_lock);
+
+	ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+	if (IS_ERR(ce)) {
+		rc = PTR_ERR(ce);
+		goto out;
+	}
+
+	rc = 0;
+
+	t = ce->ce_tgthint;
+
+	if (unlikely(!strcasecmp(it->it_name, t->t_name)))
+		goto out;
+
+	list_for_each_entry(t, &ce->ce_tlist, t_list) {
+		if (!strcasecmp(t->t_name, it->it_name)) {
+			ce->ce_tgthint = t;
+			cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+				 it->it_name);
+			break;
+		}
+	}
+
+out:
+	mutex_unlock(&dfs_cache_list_lock);
+	free_normalized_path(path, npath);
+	return rc;
+}
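+
+/*
+ * Example (sketch; try_target() is a stand-in for the caller's own
+ * reconnect attempt): after losing the current target, walk the cached
+ * target list and persist the first one that works as the new hint:
+ *
+ *	for (it = dfs_cache_get_tgt_iterator(&tl); it;
+ *	     it = dfs_cache_get_next_tgt(&tl, it)) {
+ *		if (!try_target(dfs_cache_get_tgt_name(it))) {
+ *			dfs_cache_noreq_update_tgthint(path, it);
+ *			break;
+ *		}
+ *	}
+ */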
+
+/**
+ * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given
+ * target iterator (@it).
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator.
+ * @ref: DFS referral pointer used to return the gathered information.
+ *
+ * Return zero if the DFS referral was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_get_tgt_referral(const char *path,
+			       const struct dfs_cache_tgt_iterator *it,
+			       struct dfs_info3_param *ref)
+{
+	int rc;
+	char *npath;
+	struct dfs_cache_entry *ce;
+	unsigned int h;
+
+	if (!it || !ref)
+		return -EINVAL;
+	if (unlikely(!is_path_valid(path)))
+		return -EINVAL;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		return rc;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+	mutex_lock(&dfs_cache_list_lock);
+
+	ce = find_cache_entry(npath, &h);
+	if (IS_ERR(ce)) {
+		rc = PTR_ERR(ce);
+		goto out;
+	}
+
+	cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name);
+
+	rc = setup_ref(path, ce, ref, it->it_name);
+
+out:
+	mutex_unlock(&dfs_cache_list_lock);
+	free_normalized_path(path, npath);
+	return rc;
+}
+
+static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
+{
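+	/* Shallow-copy all fields, then deep-copy the string members */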
+	memcpy(new, vol, sizeof(*new));
+
+	if (vol->username) {
+		new->username = kstrndup(vol->username, strlen(vol->username),
+					GFP_KERNEL);
+		if (!new->username)
+			return -ENOMEM;
+	}
+	if (vol->password) {
+		new->password = kstrndup(vol->password, strlen(vol->password),
+					 GFP_KERNEL);
+		if (!new->password)
+			goto err_free_username;
+	}
+	if (vol->UNC) {
+		cifs_dbg(FYI, "%s: vol->UNC: %s\n", __func__, vol->UNC);
+		new->UNC = kstrndup(vol->UNC, strlen(vol->UNC), GFP_KERNEL);
+		if (!new->UNC)
+			goto err_free_password;
+	}
+	if (vol->domainname) {
+		new->domainname = kstrndup(vol->domainname,
+					  strlen(vol->domainname), GFP_KERNEL);
+		if (!new->domainname)
+			goto err_free_unc;
+	}
+	if (vol->iocharset) {
+		new->iocharset = kstrndup(vol->iocharset,
+					  strlen(vol->iocharset), GFP_KERNEL);
+		if (!new->iocharset)
+			goto err_free_domainname;
+	}
+	if (vol->prepath) {
+		cifs_dbg(FYI, "%s: vol->prepath: %s\n", __func__, vol->prepath);
+		new->prepath = kstrndup(vol->prepath, strlen(vol->prepath),
+					GFP_KERNEL);
+		if (!new->prepath)
+			goto err_free_iocharset;
+	}
+
+	return 0;
+
+err_free_iocharset:
+	kfree(new->iocharset);
+err_free_domainname:
+	kfree(new->domainname);
+err_free_unc:
+	kfree(new->UNC);
+err_free_password:
+	kzfree(new->password);
+err_free_username:
+	kfree(new->username);
+	/* @new is embedded in the caller's structure; do not kfree() it */
+	return -ENOMEM;
+}
+
+/**
+ * dfs_cache_add_vol - add a cifs volume during mount() that will be handled by
+ * DFS cache refresh worker.
+ *
+ * @mntdata: mount data.
+ * @vol: cifs volume.
+ * @fullpath: origin full path.
+ *
+ * Return zero if volume was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath)
+{
+	int rc;
+	struct dfs_cache_vol_info *vi;
+
+	if (!vol || !fullpath || !mntdata)
+		return -EINVAL;
+
+	cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+	vi = kzalloc(sizeof(*vi), GFP_KERNEL);
+	if (!vi)
+		return -ENOMEM;
+
+	vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+	if (!vi->vi_fullpath) {
+		rc = -ENOMEM;
+		goto err_free_vi;
+	}
+
+	rc = dup_vol(vol, &vi->vi_vol);
+	if (rc)
+		goto err_free_fullpath;
+
+	vi->vi_mntdata = mntdata;
+
+	mutex_lock(&dfs_cache.dc_lock);
+	list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list);
+	mutex_unlock(&dfs_cache.dc_lock);
+	return 0;
+
+err_free_fullpath:
+	kfree(vi->vi_fullpath);
+err_free_vi:
+	kfree(vi);
+	return rc;
+}
+
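+/* Must be called with dfs_cache.dc_lock held */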
+static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
+{
+	struct dfs_cache_vol_info *vi;
+
+	list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) {
+		cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__,
+			 vi->vi_fullpath);
+		if (!strcasecmp(vi->vi_fullpath, fullpath))
+			return vi;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+/**
+ * dfs_cache_update_vol - update vol info in DFS cache after failover
+ *
+ * @fullpath: fullpath to look up in volume list.
+ * @server: TCP server pointer.
+ *
+ * Return zero if volume was updated, otherwise non-zero.
+ */
+int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
+{
+	int rc;
+	struct dfs_cache_vol_info *vi;
+
+	if (!fullpath || !server)
+		return -EINVAL;
+
+	cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+	mutex_lock(&dfs_cache.dc_lock);
+
+	vi = find_vol(fullpath);
+	if (IS_ERR(vi)) {
+		rc = PTR_ERR(vi);
+		goto out;
+	}
+
+	cifs_dbg(FYI, "%s: updating volume info\n", __func__);
+	memcpy(&vi->vi_vol.dstaddr, &server->dstaddr,
+	       sizeof(vi->vi_vol.dstaddr));
+	rc = 0;
+
+out:
+	mutex_unlock(&dfs_cache.dc_lock);
+	return rc;
+}
+
+/**
+ * dfs_cache_del_vol - remove volume info in DFS cache during umount()
+ *
+ * @fullpath: fullpath to look up in volume list.
+ */
+void dfs_cache_del_vol(const char *fullpath)
+{
+	struct dfs_cache_vol_info *vi;
+
+	if (!fullpath || !*fullpath)
+		return;
+
+	cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+	mutex_lock(&dfs_cache.dc_lock);
+	vi = find_vol(fullpath);
+	if (!IS_ERR(vi))
+		free_vol(vi);
+	mutex_unlock(&dfs_cache.dc_lock);
+}
+
+/* Get all tcons that are within a DFS namespace and can be refreshed */
+static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
+{
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+
+	INIT_LIST_HEAD(head);
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+			if (!tcon->need_reconnect && !tcon->need_reopen_files &&
+			    tcon->dfs_path) {
+				tcon->tc_count++;
+				list_add_tail(&tcon->ulist, head);
+			}
+		}
+		if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect &&
+		    ses->tcon_ipc->dfs_path) {
+			list_add_tail(&ses->tcon_ipc->ulist, head);
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+}
+
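+/*
+ * A normalized DFS path looks like \server\share[\path...]; any component
+ * past the share means @path refers to a DFS link rather than a DFS root.
+ */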
+static inline bool is_dfs_link(const char *path)
+{
+	char *s;
+
+	s = strchr(path + 1, '\\');
+	if (!s)
+		return false;
+	return !!strchr(s + 1, '\\');
+}
+
+static inline char *get_dfs_root(const char *path)
+{
+	char *s, *npath;
+
+	s = strchr(path + 1, '\\');
+	if (!s)
+		return ERR_PTR(-EINVAL);
+
+	s = strchr(s + 1, '\\');
+	if (!s)
+		return ERR_PTR(-EINVAL);
+
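+	/* Keep only the \server\share prefix of @path */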
+	npath = kstrndup(path, s - path, GFP_KERNEL);
+	if (!npath)
+		return ERR_PTR(-ENOMEM);
+
+	return npath;
+}
+
+/* Find root SMB session out of a DFS link path */
+static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
+				      struct cifs_tcon *tcon, const char *path)
+{
+	char *rpath;
+	int rc;
+	struct dfs_info3_param ref = {0};
+	char *mdata = NULL, *devname = NULL;
+	bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct smb_vol vol;
+
+	rpath = get_dfs_root(path);
+	if (IS_ERR(rpath))
+		return ERR_CAST(rpath);
+
+	memset(&vol, 0, sizeof(vol));
+
+	rc = dfs_cache_noreq_find(rpath, &ref, NULL);
+	if (rc) {
+		ses = ERR_PTR(rc);
+		goto out;
+	}
+
+	mdata = cifs_compose_mount_options(vi->vi_mntdata, rpath, &ref,
+					   &devname);
+	free_dfs_info_param(&ref);
+
+	if (IS_ERR(mdata)) {
+		ses = ERR_CAST(mdata);
+		mdata = NULL;
+		goto out;
+	}
+
+	rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3);
+	kfree(devname);
+
+	if (rc) {
+		ses = ERR_PTR(rc);
+		goto out;
+	}
+
+	server = cifs_find_tcp_session(&vol);
+	if (IS_ERR_OR_NULL(server)) {
+		ses = ERR_PTR(-EHOSTDOWN);
+		goto out;
+	}
+	if (server->tcpStatus != CifsGood) {
+		cifs_put_tcp_session(server, 0);
+		ses = ERR_PTR(-EHOSTDOWN);
+		goto out;
+	}
+
+	ses = cifs_get_smb_ses(server, &vol);
+
+out:
+	cifs_cleanup_volume_info_contents(&vol);
+	kfree(mdata);
+	kfree(rpath);
+
+	return ses;
+}
+
+/* Refresh DFS cache entry from a given tcon */
+static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
+			    struct cifs_tcon *tcon)
+{
+	int rc = 0;
+	unsigned int xid;
+	char *path, *npath;
+	unsigned int h;
+	struct dfs_cache_entry *ce;
+	struct dfs_info3_param *refs = NULL;
+	int numrefs = 0;
+	struct cifs_ses *root_ses = NULL, *ses;
+
+	xid = get_xid();
+
+	path = tcon->dfs_path + 1;
+
+	rc = get_normalized_path(path, &npath);
+	if (rc)
+		goto out;
+
+	mutex_lock(&dfs_cache_list_lock);
+	ce = find_cache_entry(npath, &h);
+	mutex_unlock(&dfs_cache_list_lock);
+
+	if (IS_ERR(ce)) {
+		rc = PTR_ERR(ce);
+		goto out;
+	}
+
+	if (!cache_entry_expired(ce))
+		goto out;
+
+	/* If it's a DFS Link, then use root SMB session for refreshing it */
+	if (is_dfs_link(npath)) {
+		ses = root_ses = find_root_ses(vi, tcon, npath);
+		if (IS_ERR(ses)) {
+			rc = PTR_ERR(ses);
+			root_ses = NULL;
+			goto out;
+		}
+	} else {
+		ses = tcon->ses;
+	}
+
+	if (unlikely(!ses->server->ops->get_dfs_refer)) {
+		rc = -EOPNOTSUPP;
+	} else {
+		rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs,
+						     &numrefs, dc->dc_nlsc,
+						     tcon->remap);
+		if (!rc) {
+			mutex_lock(&dfs_cache_list_lock);
+			ce = __update_cache_entry(npath, refs, numrefs);
+			mutex_unlock(&dfs_cache_list_lock);
+			dump_refs(refs, numrefs);
+			free_dfs_info_array(refs, numrefs);
+			if (IS_ERR(ce))
+				rc = PTR_ERR(ce);
+		}
+	}
+
+out:
+	if (root_ses)
+		cifs_put_smb_ses(root_ses);
+
+	free_xid(xid);
+	free_normalized_path(path, npath);
+}
+
+/*
+ * Worker that will refresh DFS cache based on lowest TTL value from a DFS
+ * referral.
+ */
+static void refresh_cache_worker(struct work_struct *work)
+{
+	struct dfs_cache *dc = container_of(work, struct dfs_cache,
+					    dc_refresh.work);
+	struct dfs_cache_vol_info *vi;
+	struct TCP_Server_Info *server;
+	LIST_HEAD(list);
+	struct cifs_tcon *tcon, *ntcon;
+
+	mutex_lock(&dc->dc_lock);
+
+	list_for_each_entry(vi, &dc->dc_vol_list, vi_list) {
+		server = cifs_find_tcp_session(&vi->vi_vol);
+		if (IS_ERR_OR_NULL(server))
+			continue;
+		if (server->tcpStatus != CifsGood)
+			goto next;
+		get_tcons(server, &list);
+		list_for_each_entry_safe(tcon, ntcon, &list, ulist) {
+			do_refresh_tcon(dc, vi, tcon);
+			list_del_init(&tcon->ulist);
+			cifs_put_tcon(tcon);
+		}
+next:
+		cifs_put_tcp_session(server, 0);
+	}
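+	/* Rearm the worker; dc_ttl holds the lowest TTL seen in a referral */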
+	queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ);
+	mutex_unlock(&dc->dc_lock);
+}
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
new file mode 100644
index 0000000..76c7329
--- /dev/null
+++ b/fs/cifs/dfs_cache.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018-2019 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#ifndef _CIFS_DFS_CACHE_H
+#define _CIFS_DFS_CACHE_H
+
+#include <linux/nls.h>
+#include <linux/list.h>
+#include "cifsglob.h"
+
+struct dfs_cache_tgt_list {
+	int tl_numtgts;
+	struct list_head tl_list;
+};
+
+struct dfs_cache_tgt_iterator {
+	char *it_name;
+	struct list_head it_list;
+};
+
+extern int dfs_cache_init(void);
+extern void dfs_cache_destroy(void);
+extern const struct file_operations dfscache_proc_fops;
+
+extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+			  const struct nls_table *nls_codepage, int remap,
+			  const char *path, struct dfs_info3_param *ref,
+			  struct dfs_cache_tgt_list *tgt_list);
+extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+				struct dfs_cache_tgt_list *tgt_list);
+extern int dfs_cache_update_tgthint(const unsigned int xid,
+				    struct cifs_ses *ses,
+				    const struct nls_table *nls_codepage,
+				    int remap, const char *path,
+				    const struct dfs_cache_tgt_iterator *it);
+extern int
+dfs_cache_noreq_update_tgthint(const char *path,
+			       const struct dfs_cache_tgt_iterator *it);
+extern int dfs_cache_get_tgt_referral(const char *path,
+				      const struct dfs_cache_tgt_iterator *it,
+				      struct dfs_info3_param *ref);
+extern int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol,
+			const char *fullpath);
+extern int dfs_cache_update_vol(const char *fullpath,
+				struct TCP_Server_Info *server);
+extern void dfs_cache_del_vol(const char *fullpath);
+
+static inline struct dfs_cache_tgt_iterator *
+dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
+		       struct dfs_cache_tgt_iterator *it)
+{
+	if (!tl || list_empty(&tl->tl_list) || !it ||
+	    list_is_last(&it->it_list, &tl->tl_list))
+		return NULL;
+	return list_next_entry(it, it_list);
+}
+
+static inline struct dfs_cache_tgt_iterator *
+dfs_cache_get_tgt_iterator(struct dfs_cache_tgt_list *tl)
+{
+	if (!tl)
+		return NULL;
+	return list_first_entry_or_null(&tl->tl_list,
+					struct dfs_cache_tgt_iterator,
+					it_list);
+}
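+
+/*
+ * Example (sketch): walking every target of a cached referral, given a
+ * dfs_cache_tgt_list @tl filled in by dfs_cache_find():
+ *
+ *	struct dfs_cache_tgt_iterator *it;
+ *
+ *	for (it = dfs_cache_get_tgt_iterator(&tl); it;
+ *	     it = dfs_cache_get_next_tgt(&tl, it))
+ *		pr_info("target: %s\n", dfs_cache_get_tgt_name(it));
+ */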
+
+static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl)
+{
+	struct dfs_cache_tgt_iterator *it, *nit;
+
+	if (!tl || list_empty(&tl->tl_list))
+		return;
+	list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) {
+		list_del(&it->it_list);
+		kfree(it->it_name);
+		kfree(it);
+	}
+	tl->tl_numtgts = 0;
+}
+
+static inline const char *
+dfs_cache_get_tgt_name(const struct dfs_cache_tgt_iterator *it)
+{
+	return it ? it->it_name : NULL;
+}
+
+static inline int
+dfs_cache_get_nr_tgts(const struct dfs_cache_tgt_list *tl)
+{
+	return tl ? tl->tl_numtgts : 0;
+}
+
+#endif /* _CIFS_DFS_CACHE_H */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 907e85d..7ce689d 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -69,11 +69,10 @@
 		return full_path;
 
 	if (dfsplen)
-		strncpy(full_path, tcon->treeName, dfsplen);
+		memcpy(full_path, tcon->treeName, dfsplen);
 	full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
-	strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
+	memcpy(full_path + dfsplen + 1, vol->prepath, pplen);
 	convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
-	full_path[dfsplen + pplen] = 0; /* add trailing null */
 	return full_path;
 }
 
@@ -126,7 +125,7 @@
 	}
 	rcu_read_unlock();
 
-	full_path = kmalloc(namelen+1, GFP_KERNEL);
+	full_path = kmalloc(namelen+1, GFP_ATOMIC);
 	if (full_path == NULL)
 		return full_path;
 	full_path[namelen] = 0;	/* trailing null */
@@ -621,20 +620,10 @@
 {
 	int rc = -EPERM;
 	unsigned int xid;
-	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
 	struct cifs_sb_info *cifs_sb;
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
-	struct cifs_io_parms io_parms;
 	char *full_path = NULL;
-	struct inode *newinode = NULL;
-	__u32 oplock = 0;
-	struct cifs_fid fid;
-	struct cifs_open_parms oparms;
-	FILE_ALL_INFO *buf = NULL;
-	unsigned int bytes_written;
-	struct win_dev *pdev;
-	struct kvec iov[2];
 
 	if (!old_valid_dev(device_number))
 		return -EINVAL;
@@ -654,103 +643,12 @@
 		goto mknod_out;
 	}
 
-	if (tcon->unix_ext) {
-		struct cifs_unix_set_info_args args = {
-			.mode	= mode & ~current_umask(),
-			.ctime	= NO_CHANGE_64,
-			.atime	= NO_CHANGE_64,
-			.mtime	= NO_CHANGE_64,
-			.device	= device_number,
-		};
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-			args.uid = current_fsuid();
-			args.gid = current_fsgid();
-		} else {
-			args.uid = INVALID_UID; /* no change */
-			args.gid = INVALID_GID; /* no change */
-		}
-		rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-					    cifs_sb->local_nls,
-					    cifs_remap(cifs_sb));
-		if (rc)
-			goto mknod_out;
-
-		rc = cifs_get_inode_info_unix(&newinode, full_path,
-						inode->i_sb, xid);
-
-		if (rc == 0)
-			d_instantiate(direntry, newinode);
-		goto mknod_out;
-	}
-
-	if (!S_ISCHR(mode) && !S_ISBLK(mode))
-		goto mknod_out;
-
-	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
-		goto mknod_out;
-
-
-	cifs_dbg(FYI, "sfu compat create special file\n");
-
-	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-	if (buf == NULL) {
-		rc = -ENOMEM;
-		goto mknod_out;
-	}
-
-	if (backup_cred(cifs_sb))
-		create_options |= CREATE_OPEN_BACKUP_INTENT;
-
-	oparms.tcon = tcon;
-	oparms.cifs_sb = cifs_sb;
-	oparms.desired_access = GENERIC_WRITE;
-	oparms.create_options = create_options;
-	oparms.disposition = FILE_CREATE;
-	oparms.path = full_path;
-	oparms.fid = &fid;
-	oparms.reconnect = false;
-
-	if (tcon->ses->server->oplocks)
-		oplock = REQ_OPLOCK;
-	else
-		oplock = 0;
-	rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
-	if (rc)
-		goto mknod_out;
-
-	/*
-	 * BB Do not bother to decode buf since no local inode yet to put
-	 * timestamps in, but we can reuse it safely.
-	 */
-
-	pdev = (struct win_dev *)buf;
-	io_parms.pid = current->tgid;
-	io_parms.tcon = tcon;
-	io_parms.offset = 0;
-	io_parms.length = sizeof(struct win_dev);
-	iov[1].iov_base = buf;
-	iov[1].iov_len = sizeof(struct win_dev);
-	if (S_ISCHR(mode)) {
-		memcpy(pdev->type, "IntxCHR", 8);
-		pdev->major = cpu_to_le64(MAJOR(device_number));
-		pdev->minor = cpu_to_le64(MINOR(device_number));
-		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
-							&bytes_written, iov, 1);
-	} else if (S_ISBLK(mode)) {
-		memcpy(pdev->type, "IntxBLK", 8);
-		pdev->major = cpu_to_le64(MAJOR(device_number));
-		pdev->minor = cpu_to_le64(MINOR(device_number));
-		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
-							&bytes_written, iov, 1);
-	}
-	tcon->ses->server->ops->close(xid, tcon, &fid);
-	d_drop(direntry);
-
-	/* FIXME: add code here to set EAs */
+	rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
+					       full_path, mode,
+					       device_number);
 
 mknod_out:
 	kfree(full_path);
-	kfree(buf);
 	free_xid(xid);
 	cifs_put_tlink(tlink);
 	return rc;
@@ -840,10 +738,16 @@
 static int
 cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
 {
+	struct inode *inode;
+
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	if (d_really_is_positive(direntry)) {
+		inode = d_inode(direntry);
+		if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
+			CIFS_I(inode)->time = 0; /* force reval */
+
 		if (cifs_revalidate_dentry(direntry))
 			return 0;
 		else {
@@ -854,7 +758,7 @@
 			 * attributes will have been updated by
 			 * cifs_revalidate_dentry().
 			 */
-			if (IS_AUTOMOUNT(d_inode(direntry)) &&
+			if (IS_AUTOMOUNT(inode) &&
 			   !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
 				spin_lock(&direntry->d_lock);
 				direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 7ede730..534cbba 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -77,7 +77,8 @@
 		goto name_is_IP_address;
 
 	/* Perform the upcall */
-	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+	rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len,
+		       NULL, ip_addr, NULL, false);
 	if (rc < 0)
 		cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
 			 __func__, len, len, hostname);
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index ce8b7f6..eb0bb8c 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
  */
 
  /*
-  * See Documentation/filesystems/nfs/Exporting
+  * See Documentation/filesystems/nfs/exporting.rst
   * and examples in fs/exportfs
   *
   * Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8d41ca7..fa7b0fa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -33,6 +33,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/mm.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -252,6 +253,12 @@
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
 					 xid, fid);
 
+	if (rc) {
+		server->ops->close(xid, tcon, fid);
+		if (rc == -ESTALE)
+			rc = -EOPENSTALE;
+	}
+
 out:
 	kfree(buf);
 	return rc;
@@ -274,6 +281,17 @@
 	return has_locks;
 }
 
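+/*
+ * Take @sem for writing by polling: keep retrying the write trylock,
+ * sleeping 10ms between attempts, instead of blocking in down_write().
+ */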
+void
+cifs_down_write(struct rw_semaphore *sem)
+{
+	while (!down_write_trylock(sem))
+		msleep(10);
+}
+
 struct cifsFileInfo *
 cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 		  struct tcon_link *tlink, __u32 oplock)
@@ -299,7 +313,7 @@
 	INIT_LIST_HEAD(&fdlocks->locks);
 	fdlocks->cfile = cfile;
 	cfile->llist = fdlocks;
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	list_add(&fdlocks->llist, &cinode->llist);
 	up_write(&cinode->lock_sem);
 
@@ -334,12 +348,15 @@
 	server->ops->set_fid(cfile, fid, oplock);
 
 	list_add(&cfile->tlist, &tcon->openFileList);
+	atomic_inc(&tcon->num_local_opens);
 
 	/* if readable file instance put first in list*/
+	spin_lock(&cinode->open_file_lock);
 	if (file->f_mode & FMODE_READ)
 		list_add(&cfile->flist, &cinode->openFileList);
 	else
 		list_add_tail(&cfile->flist, &cinode->openFileList);
+	spin_unlock(&cinode->open_file_lock);
 	spin_unlock(&tcon->open_file_lock);
 
 	if (fid->purge_cache)
@@ -358,13 +375,31 @@
 	return cifs_file;
 }
 
-/*
- * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server. Must be called without holding
- * tcon->open_file_lock and cifs_file->file_info_lock.
+/**
+ * cifsFileInfo_put - release a reference to file private data
+ *
+ * Always potentially waits for the oplock handler; see _cifsFileInfo_put().
  */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
+	_cifsFileInfo_put(cifs_file, true);
+}
+
+/**
+ * _cifsFileInfo_put - release a reference to file private data
+ *
+ * This may involve closing the filehandle @cifs_file out on the
+ * server. Must be called without holding tcon->open_file_lock and
+ * cifs_file->file_info_lock.
+ *
+ * If @wait_for_oplock_handler is true and we are releasing the last
+ * reference, wait for any running oplock break handler of the file
+ * and cancel any pending one. If calling this function from the
+ * oplock break handler, you need to pass false.
+ *
+ */
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+{
 	struct inode *inode = d_inode(cifs_file->dentry);
 	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -377,10 +412,11 @@
 	bool oplock_break_cancelled;
 
 	spin_lock(&tcon->open_file_lock);
-
+	spin_lock(&cifsi->open_file_lock);
 	spin_lock(&cifs_file->file_info_lock);
 	if (--cifs_file->count > 0) {
 		spin_unlock(&cifs_file->file_info_lock);
+		spin_unlock(&cifsi->open_file_lock);
 		spin_unlock(&tcon->open_file_lock);
 		return;
 	}
@@ -395,6 +431,7 @@
 	/* remove it from the lists */
 	list_del(&cifs_file->flist);
 	list_del(&cifs_file->tlist);
+	atomic_dec(&tcon->num_local_opens);
 
 	if (list_empty(&cifsi->openFileList)) {
 		cifs_dbg(FYI, "closing last open instance for inode %p\n",
@@ -409,9 +446,11 @@
 		cifs_set_oplock_level(cifsi, 0);
 	}
 
+	spin_unlock(&cifsi->open_file_lock);
 	spin_unlock(&tcon->open_file_lock);
 
-	oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
+	oplock_break_cancelled = wait_oplock_handler ?
+		cancel_work_sync(&cifs_file->oplock_break) : false;
 
 	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
 		struct TCP_Server_Info *server = tcon->ses->server;
@@ -432,7 +471,7 @@
 	 * Delete any outstanding lock records. We'll lose them when the file
 	 * is closed anyway.
 	 */
-	down_write(&cifsi->lock_sem);
+	cifs_down_write(&cifsi->lock_sem);
 	list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
 		list_del(&li->llist);
 		cifs_del_lock_waiters(li);
@@ -730,7 +769,8 @@
 
 	if (can_flush) {
 		rc = filemap_write_and_wait(inode->i_mapping);
-		mapping_set_error(inode->i_mapping, rc);
+		if (!is_interrupt_error(rc))
+			mapping_set_error(inode->i_mapping, rc);
 
 		if (tcon->unix_ext)
 			rc = cifs_get_inode_info_unix(&inode, full_path,
@@ -864,7 +904,7 @@
 }
 
 static struct cifsLockInfo *
-cifs_lock_init(__u64 offset, __u64 length, __u8 type)
+cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 flags)
 {
 	struct cifsLockInfo *lock =
 		kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
@@ -874,6 +914,7 @@
 	lock->length = length;
 	lock->type = type;
 	lock->pid = current->tgid;
+	lock->flags = flags;
 	INIT_LIST_HEAD(&lock->blist);
 	init_waitqueue_head(&lock->block_q);
 	return lock;
@@ -896,7 +937,8 @@
 /* @rw_check : 0 - no op, 1 - read, 2 - write */
 static bool
 cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
-			    __u64 length, __u8 type, struct cifsFileInfo *cfile,
+			    __u64 length, __u8 type, __u16 flags,
+			    struct cifsFileInfo *cfile,
 			    struct cifsLockInfo **conf_lock, int rw_check)
 {
 	struct cifsLockInfo *li;
@@ -918,6 +960,10 @@
 		    ((server->ops->compare_fids(cfile, cur_cfile) &&
 		     current->tgid == li->pid) || type == li->type))
 			continue;
+		if (rw_check == CIFS_LOCK_OP &&
+		    (flags & FL_OFDLCK) && (li->flags & FL_OFDLCK) &&
+		    server->ops->compare_fids(cfile, cur_cfile))
+			continue;
 		if (conf_lock)
 			*conf_lock = li;
 		return true;
@@ -927,8 +973,8 @@
 
 bool
 cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
-			__u8 type, struct cifsLockInfo **conf_lock,
-			int rw_check)
+			__u8 type, __u16 flags,
+			struct cifsLockInfo **conf_lock, int rw_check)
 {
 	bool rc = false;
 	struct cifs_fid_locks *cur;
@@ -936,7 +982,8 @@
 
 	list_for_each_entry(cur, &cinode->llist, llist) {
 		rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
-						 cfile, conf_lock, rw_check);
+						 flags, cfile, conf_lock,
+						 rw_check);
 		if (rc)
 			break;
 	}
@@ -964,7 +1011,8 @@
 	down_read(&cinode->lock_sem);
 
 	exist = cifs_find_lock_conflict(cfile, offset, length, type,
-					&conf_lock, CIFS_LOCK_OP);
+					flock->fl_flags, &conf_lock,
+					CIFS_LOCK_OP);
 	if (exist) {
 		flock->fl_start = conf_lock->offset;
 		flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -986,7 +1034,7 @@
 cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
 {
 	struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	list_add_tail(&lock->llist, &cfile->llist->locks);
 	up_write(&cinode->lock_sem);
 }
@@ -995,7 +1043,7 @@
  * Set the byte-range lock (mandatory style). Returns:
  * 1) 0, if we set the lock and don't need to request to the server;
  * 2) 1, if no locks prevent us but we need to request to the server;
- * 3) -EACCESS, if there is a lock that prevents us and wait is false.
+ * 3) -EACCES, if there is a lock that prevents us and wait is false.
  */
 static int
 cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
@@ -1008,10 +1056,11 @@
 
 try_again:
 	exist = false;
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 
 	exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
-					lock->type, &conf_lock, CIFS_LOCK_OP);
+					lock->type, lock->flags, &conf_lock,
+					CIFS_LOCK_OP);
 	if (!exist && cinode->can_cache_brlcks) {
 		list_add_tail(&lock->llist, &cfile->llist->locks);
 		up_write(&cinode->lock_sem);
@@ -1030,7 +1079,7 @@
 					(lock->blist.next == &lock->blist));
 		if (!rc)
 			goto try_again;
-		down_write(&cinode->lock_sem);
+		cifs_down_write(&cinode->lock_sem);
 		list_del_init(&lock->blist);
 	}
 
@@ -1083,7 +1132,7 @@
 		return rc;
 
 try_again:
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	if (!cinode->can_cache_brlcks) {
 		up_write(&cinode->lock_sem);
 		return rc;
@@ -1092,10 +1141,10 @@
 	rc = posix_lock_file(file, flock, NULL);
 	up_write(&cinode->lock_sem);
 	if (rc == FILE_LOCK_DEFERRED) {
-		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
+		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker);
 		if (!rc)
 			goto try_again;
-		posix_unblock_lock(flock);
+		locks_delete_block(flock);
 	}
 	return rc;
 }
@@ -1120,14 +1169,18 @@
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf) {
+	if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) {
 		free_xid(xid);
 		return -EINVAL;
 	}
 
+	BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+		     PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+			PAGE_SIZE);
 	max_num = (max_buf - sizeof(struct smb_hdr)) /
 						sizeof(LOCKING_ANDX_RANGE);
 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
@@ -1285,7 +1338,7 @@
 	int rc = 0;
 
 	/* we are going to update can_cache_brlcks here - need a write access */
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	if (!cinode->can_cache_brlcks) {
 		up_write(&cinode->lock_sem);
 		return rc;
@@ -1321,7 +1374,7 @@
 		cifs_dbg(FYI, "Lease on file - not implemented yet\n");
 	if (flock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
-	       FL_ACCESS | FL_LEASE | FL_CLOSE)))
+	       FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
 		cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
 
 	*type = server->vals->large_lock_type;
@@ -1460,19 +1513,23 @@
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf)
+	if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
 		return -EINVAL;
 
+	BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+		     PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+			PAGE_SIZE);
 	max_num = (max_buf - sizeof(struct smb_hdr)) /
 						sizeof(LOCKING_ANDX_RANGE);
 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	for (i = 0; i < 2; i++) {
 		cur = buf;
 		num = 0;
@@ -1584,7 +1641,8 @@
 	if (lock) {
 		struct cifsLockInfo *lock;
 
-		lock = cifs_lock_init(flock->fl_start, length, type);
+		lock = cifs_lock_init(flock->fl_start, length, type,
+				      flock->fl_flags);
 		if (!lock)
 			return -ENOMEM;
 
@@ -1623,8 +1681,20 @@
 		rc = server->ops->mand_unlock_range(cfile, flock, xid);
 
 out:
-	if (flock->fl_flags & FL_POSIX && !rc)
+	if (flock->fl_flags & FL_POSIX) {
+		/*
+		 * If this is a request to remove all locks because we
+		 * are closing the file, it doesn't matter if the
+		 * unlocking failed as both cifs.ko and the SMB server
+		 * remove the lock on file close
+		 */
+		if (rc) {
+			cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+			if (!(flock->fl_flags & FL_CLOSE))
+				return rc;
+		}
 		rc = locks_lock_file_wait(file, flock);
+	}
 	return rc;
 }
 
@@ -1636,9 +1706,7 @@
 	bool posix_lck = false;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
-	struct cifsInodeInfo *cinode;
 	struct cifsFileInfo *cfile;
-	__u16 netfid;
 	__u32 type;
 
 	rc = -EACCES;
@@ -1653,10 +1721,7 @@
 
 	cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
 			tcon->ses->server);
-
 	cifs_sb = CIFS_FILE_SB(file);
-	netfid = cfile->fid.netfid;
-	cinode = CIFS_I(file_inode(file));
 
 	if (cap_unix(tcon->ses) &&
 	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
@@ -1708,7 +1773,6 @@
 	int rc = 0;
 	unsigned int bytes_written = 0;
 	unsigned int total_written;
-	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
 	struct TCP_Server_Info *server;
 	unsigned int xid;
@@ -1716,8 +1780,6 @@
 	struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry));
 	struct cifs_io_parms io_parms;
 
-	cifs_sb = CIFS_SB(dentry->d_sb);
-
 	cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
 		 write_size, *offset, dentry);
 
@@ -1791,13 +1853,12 @@
 {
 	struct cifsFileInfo *open_file = NULL;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
-	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* only filter by fsuid on multiuser mounts */
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		fsuid_only = false;
 
-	spin_lock(&tcon->open_file_lock);
+	spin_lock(&cifs_inode->open_file_lock);
 	/* we could simply get the first_list_entry since write-only entries
 	   are always at the end of the list but since the first entry might
 	   have a close pending, we go through the whole list */
@@ -1809,7 +1870,7 @@
 				/* found a good file */
 				/* lock it so it will not be closed on us */
 				cifsFileInfo_get(open_file);
-				spin_unlock(&tcon->open_file_lock);
+				spin_unlock(&cifs_inode->open_file_lock);
 				return open_file;
 			} /* else might as well continue, and look for
 			     another, or simply have the caller reopen it
@@ -1817,42 +1878,46 @@
 		} else /* write only file */
 			break; /* write only files are last so must be done */
 	}
-	spin_unlock(&tcon->open_file_lock);
+	spin_unlock(&cifs_inode->open_file_lock);
 	return NULL;
 }
 
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
-					bool fsuid_only)
+/* Return -EBADF if no handle is found and general rc otherwise */
+int
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+		       struct cifsFileInfo **ret_file)
 {
 	struct cifsFileInfo *open_file, *inv_file = NULL;
 	struct cifs_sb_info *cifs_sb;
-	struct cifs_tcon *tcon;
 	bool any_available = false;
-	int rc;
+	int rc = -EBADF;
 	unsigned int refind = 0;
 
-	/* Having a null inode here (because mapping->host was set to zero by
-	the VFS or MM) should not happen but we had reports of on oops (due to
-	it being zero) during stress testcases so we need to check for it */
+	*ret_file = NULL;
+
+	/*
+	 * Having a null inode here (because mapping->host was set to zero by
+	 * the VFS or MM) should not happen, but we had reports of an oops (due
+	 * to it being zero) during stress test cases, so we need to check for it
+	 */
 
 	if (cifs_inode == NULL) {
 		cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
 		dump_stack();
-		return NULL;
+		return rc;
 	}
 
 	cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
-	tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* only filter by fsuid on multiuser mounts */
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		fsuid_only = false;
 
-	spin_lock(&tcon->open_file_lock);
+	spin_lock(&cifs_inode->open_file_lock);
 refind_writable:
 	if (refind > MAX_REOPEN_ATT) {
-		spin_unlock(&tcon->open_file_lock);
-		return NULL;
+		spin_unlock(&cifs_inode->open_file_lock);
+		return rc;
 	}
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (!any_available && open_file->pid != current->tgid)
@@ -1863,8 +1928,9 @@
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
 				cifsFileInfo_get(open_file);
-				spin_unlock(&tcon->open_file_lock);
-				return open_file;
+				spin_unlock(&cifs_inode->open_file_lock);
+				*ret_file = open_file;
+				return 0;
 			} else {
 				if (!inv_file)
 					inv_file = open_file;
@@ -1882,26 +1948,110 @@
 		cifsFileInfo_get(inv_file);
 	}
 
-	spin_unlock(&tcon->open_file_lock);
+	spin_unlock(&cifs_inode->open_file_lock);
 
 	if (inv_file) {
 		rc = cifs_reopen_file(inv_file, false);
-		if (!rc)
-			return inv_file;
-		else {
-			spin_lock(&tcon->open_file_lock);
-			list_move_tail(&inv_file->flist,
-					&cifs_inode->openFileList);
-			spin_unlock(&tcon->open_file_lock);
-			cifsFileInfo_put(inv_file);
-			++refind;
-			inv_file = NULL;
-			spin_lock(&tcon->open_file_lock);
-			goto refind_writable;
+		if (!rc) {
+			*ret_file = inv_file;
+			return 0;
 		}
+
+		spin_lock(&cifs_inode->open_file_lock);
+		list_move_tail(&inv_file->flist, &cifs_inode->openFileList);
+		spin_unlock(&cifs_inode->open_file_lock);
+		cifsFileInfo_put(inv_file);
+		++refind;
+		inv_file = NULL;
+		spin_lock(&cifs_inode->open_file_lock);
+		goto refind_writable;
 	}
 
-	return NULL;
+	return rc;
+}
+
+struct cifsFileInfo *
+find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+{
+	struct cifsFileInfo *cfile;
+	int rc;
+
+	rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+	if (rc)
+		cifs_dbg(FYI, "couldn't find writable handle rc=%d\n", rc);
+
+	return cfile;
+}
+
+int
+cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+		       struct cifsFileInfo **ret_file)
+{
+	struct list_head *tmp;
+	struct cifsFileInfo *cfile;
+	struct cifsInodeInfo *cinode;
+	char *full_path;
+
+	*ret_file = NULL;
+
+	spin_lock(&tcon->open_file_lock);
+	list_for_each(tmp, &tcon->openFileList) {
+		cfile = list_entry(tmp, struct cifsFileInfo,
+			     tlist);
+		full_path = build_path_from_dentry(cfile->dentry);
+		if (full_path == NULL) {
+			spin_unlock(&tcon->open_file_lock);
+			return -ENOMEM;
+		}
+		if (strcmp(full_path, name)) {
+			kfree(full_path);
+			continue;
+		}
+
+		kfree(full_path);
+		cinode = CIFS_I(d_inode(cfile->dentry));
+		spin_unlock(&tcon->open_file_lock);
+		return cifs_get_writable_file(cinode, false, ret_file);
+	}
+
+	spin_unlock(&tcon->open_file_lock);
+	return -ENOENT;
+}
+
+int
+cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
+		       struct cifsFileInfo **ret_file)
+{
+	struct list_head *tmp;
+	struct cifsFileInfo *cfile;
+	struct cifsInodeInfo *cinode;
+	char *full_path;
+
+	*ret_file = NULL;
+
+	spin_lock(&tcon->open_file_lock);
+	list_for_each(tmp, &tcon->openFileList) {
+		cfile = list_entry(tmp, struct cifsFileInfo,
+			     tlist);
+		full_path = build_path_from_dentry(cfile->dentry);
+		if (full_path == NULL) {
+			spin_unlock(&tcon->open_file_lock);
+			return -ENOMEM;
+		}
+		if (strcmp(full_path, name)) {
+			kfree(full_path);
+			continue;
+		}
+
+		kfree(full_path);
+		cinode = CIFS_I(d_inode(cfile->dentry));
+		spin_unlock(&tcon->open_file_lock);
+		*ret_file = find_readable_file(cinode, false);
+		return *ret_file ? 0 : -ENOENT;
+	}
+
+	spin_unlock(&tcon->open_file_lock);
+	return -ENOENT;
 }
 
 static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
@@ -1938,8 +2088,8 @@
 	if (mapping->host->i_size - offset < (loff_t)to)
 		to = (unsigned)(mapping->host->i_size - offset);
 
-	open_file = find_writable_file(CIFS_I(mapping->host), false);
-	if (open_file) {
+	rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+	if (!rc) {
 		bytes_written = cifs_write(open_file, open_file->pid,
 					   write_data, to - from, &offset);
 		cifsFileInfo_put(open_file);
@@ -1949,9 +2099,12 @@
 			rc = 0;
 		else if (bytes_written < 0)
 			rc = bytes_written;
+		else
+			rc = -EFAULT;
 	} else {
-		cifs_dbg(FYI, "No writeable filehandles for inode\n");
-		rc = -EIO;
+		cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
+		if (!is_retryable_error(rc))
+			rc = -EIO;
 	}
 
 	kunmap(page);
@@ -2058,9 +2211,9 @@
 wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
 		 struct address_space *mapping, struct writeback_control *wbc)
 {
-	int rc = 0;
-	struct TCP_Server_Info *server;
-	unsigned int i;
+	int rc;
+	struct TCP_Server_Info *server =
+				tlink_tcon(wdata->cfile->tlink)->ses->server;
 
 	wdata->sync_mode = wbc->sync_mode;
 	wdata->nr_pages = nr_pages;
@@ -2070,21 +2223,16 @@
 			page_offset(wdata->pages[nr_pages - 1]),
 			(loff_t)PAGE_SIZE);
 	wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
+	wdata->pid = wdata->cfile->pid;
 
-	if (wdata->cfile != NULL)
-		cifsFileInfo_put(wdata->cfile);
-	wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
-	if (!wdata->cfile) {
-		cifs_dbg(VFS, "No writable handles for inode\n");
-		rc = -EBADF;
-	} else {
-		wdata->pid = wdata->cfile->pid;
-		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+	rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+	if (rc)
+		return rc;
+
+	if (wdata->cfile->invalidHandle)
+		rc = -EAGAIN;
+	else
 		rc = server->ops->async_writev(wdata, cifs_writedata_release);
-	}
-
-	for (i = 0; i < nr_pages; ++i)
-		unlock_page(wdata->pages[i]);
 
 	return rc;
 }
@@ -2092,12 +2240,16 @@
 static int cifs_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
-	struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+	struct inode *inode = mapping->host;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct TCP_Server_Info *server;
 	bool done = false, scanned = false, range_whole = false;
 	pgoff_t end, index;
 	struct cifs_writedata *wdata;
+	struct cifsFileInfo *cfile = NULL;
 	int rc = 0;
+	int saved_rc = 0;
+	unsigned int xid;
 
 	/*
 	 * If wsize is smaller than the page cache size, default to writing
@@ -2106,6 +2258,7 @@
 	if (cifs_sb->wsize < PAGE_SIZE)
 		return generic_writepages(mapping, wbc);
 
+	xid = get_xid();
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
@@ -2119,13 +2272,27 @@
 	server = cifs_sb_master_tcon(cifs_sb)->ses->server;
 retry:
 	while (!done && index <= end) {
-		unsigned int i, nr_pages, found_pages, wsize, credits;
+		unsigned int i, nr_pages, found_pages, wsize;
 		pgoff_t next = 0, tofind, saved_index = index;
+		struct cifs_credits credits_on_stack;
+		struct cifs_credits *credits = &credits_on_stack;
+		int get_file_rc = 0;
+
+		if (cfile)
+			cifsFileInfo_put(cfile);
+
+		rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+
+		/* in case of an error store it to return later */
+		if (rc)
+			get_file_rc = rc;
 
 		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
-						   &wsize, &credits);
-		if (rc)
+						   &wsize, credits);
+		if (rc != 0) {
+			done = true;
 			break;
+		}
 
 		tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
 
@@ -2133,6 +2300,7 @@
 						  &found_pages);
 		if (!wdata) {
 			rc = -ENOMEM;
+			done = true;
 			add_credits_and_wake_if(server, credits, 0);
 			break;
 		}
@@ -2153,15 +2321,28 @@
 			continue;
 		}
 
-		wdata->credits = credits;
+		wdata->credits = credits_on_stack;
+		wdata->cfile = cfile;
+		cfile = NULL;
 
-		rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+		if (!wdata->cfile) {
+			cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
+				 get_file_rc);
+			if (is_retryable_error(get_file_rc))
+				rc = get_file_rc;
+			else
+				rc = -EBADF;
+		} else
+			rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+
+		for (i = 0; i < nr_pages; ++i)
+			unlock_page(wdata->pages[i]);
 
 		/* send failure -- clean up the mess */
 		if (rc != 0) {
-			add_credits_and_wake_if(server, wdata->credits, 0);
+			add_credits_and_wake_if(server, &wdata->credits, 0);
 			for (i = 0; i < nr_pages; ++i) {
-				if (rc == -EAGAIN)
+				if (is_retryable_error(rc))
 					redirty_page_for_writepage(wbc,
 							   wdata->pages[i]);
 				else
@@ -2169,7 +2350,7 @@
 				end_page_writeback(wdata->pages[i]);
 				put_page(wdata->pages[i]);
 			}
-			if (rc != -EAGAIN)
+			if (!is_retryable_error(rc))
 				mapping_set_error(mapping, rc);
 		}
 		kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2179,6 +2360,15 @@
 			continue;
 		}
 
+		/* Return immediately if we received a signal during writing */
+		if (is_interrupt_error(rc)) {
+			done = true;
+			break;
+		}
+
+		if (rc != 0 && saved_rc == 0)
+			saved_rc = rc;
+
 		wbc->nr_to_write -= nr_pages;
 		if (wbc->nr_to_write <= 0)
 			done = true;
@@ -2196,9 +2386,15 @@
 		goto retry;
 	}
 
+	if (saved_rc != 0)
+		rc = saved_rc;
+
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 
+	if (cfile)
+		cifsFileInfo_put(cfile);
+	free_xid(xid);
 	return rc;
 }
 
@@ -2227,8 +2423,8 @@
 	set_page_writeback(page);
 retry_write:
 	rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
-	if (rc == -EAGAIN) {
-		if (wbc->sync_mode == WB_SYNC_ALL)
+	if (is_retryable_error(rc)) {
+		if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
 			goto retry_write;
 		redirty_page_for_writepage(wbc, page);
 	} else if (rc != 0) {
@@ -2325,7 +2521,6 @@
 	rc = file_write_and_wait_range(file, start, end);
 	if (rc)
 		return rc;
-	inode_lock(inode);
 
 	xid = get_xid();
 
@@ -2350,7 +2545,6 @@
 	}
 
 	free_xid(xid);
-	inode_unlock(inode);
 	return rc;
 }
 
@@ -2362,12 +2556,10 @@
 	struct TCP_Server_Info *server;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
-	struct inode *inode = file->f_mapping->host;
 
 	rc = file_write_and_wait_range(file, start, end);
 	if (rc)
 		return rc;
-	inode_lock(inode);
 
 	xid = get_xid();
 
@@ -2384,7 +2576,6 @@
 	}
 
 	free_xid(xid);
-	inode_unlock(inode);
 	return rc;
 }
 
@@ -2524,6 +2715,72 @@
 }
 
 static int
+cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
+	struct cifs_aio_ctx *ctx)
+{
+	unsigned int wsize;
+	struct cifs_credits credits;
+	int rc;
+	struct TCP_Server_Info *server =
+		tlink_tcon(wdata->cfile->tlink)->ses->server;
+
+	do {
+		if (wdata->cfile->invalidHandle) {
+			rc = cifs_reopen_file(wdata->cfile, false);
+			if (rc == -EAGAIN)
+				continue;
+			else if (rc)
+				break;
+		}
+
+		/*
+		 * Wait for credits to resend this wdata.
+		 * Note: we attempt to resend the whole wdata rather than
+		 * in segments.
+		 */
+		do {
+			rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+						&wsize, &credits);
+			if (rc)
+				goto fail;
+
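+			/*
+			 * Not enough credits for the whole wdata; give them
+			 * back and wait a second before asking again.
+			 */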
+			if (wsize < wdata->bytes) {
+				add_credits_and_wake_if(server, &credits, 0);
+				msleep(1000);
+			}
+		} while (wsize < wdata->bytes);
+		wdata->credits = credits;
+
+		rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+		if (!rc) {
+			if (wdata->cfile->invalidHandle)
+				rc = -EAGAIN;
+			else
+				rc = server->ops->async_writev(wdata,
+					cifs_uncached_writedata_release);
+		}
+
+		/* If the write was successfully sent, we are done */
+		if (!rc) {
+			list_add_tail(&wdata->list, wdata_list);
+			return 0;
+		}
+
+		/* Roll back credits and retry if needed */
+		add_credits_and_wake_if(server, &wdata->credits, 0);
+	} while (rc == -EAGAIN);
+
+fail:
+	kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+	return rc;
+}
+
+static int
 cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		     struct cifsFileInfo *open_file,
 		     struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
@@ -2537,6 +2791,9 @@
 	loff_t saved_offset = offset;
 	pid_t pid;
 	struct TCP_Server_Info *server;
+	struct page **pagevec;
+	size_t start;
+	unsigned int xid;
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 		pid = open_file->pid;
@@ -2544,47 +2801,106 @@
 		pid = current->tgid;
 
 	server = tlink_tcon(open_file->tlink)->ses->server;
+	xid = get_xid();
 
 	do {
-		unsigned int wsize, credits;
+		unsigned int wsize;
+		struct cifs_credits credits_on_stack;
+		struct cifs_credits *credits = &credits_on_stack;
+
+		if (open_file->invalidHandle) {
+			rc = cifs_reopen_file(open_file, false);
+			if (rc == -EAGAIN)
+				continue;
+			else if (rc)
+				break;
+		}
 
 		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
-						   &wsize, &credits);
+						   &wsize, credits);
 		if (rc)
 			break;
 
-		nr_pages = get_numpages(wsize, len, &cur_len);
-		wdata = cifs_writedata_alloc(nr_pages,
+		cur_len = min_t(const size_t, len, wsize);
+
+		if (ctx->direct_io) {
+			ssize_t result;
+
+			result = iov_iter_get_pages_alloc(
+				from, &pagevec, cur_len, &start);
+			if (result < 0) {
+				cifs_dbg(VFS,
+					"direct_writev couldn't get user pages "
+					"(rc=%zd) iter type %d iov_offset %zd "
+					"count %zd\n",
+					result, from->type,
+					from->iov_offset, from->count);
+				dump_stack();
+
+				rc = result;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+			cur_len = (size_t)result;
+			iov_iter_advance(from, cur_len);
+
+			nr_pages =
+				(cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+			wdata = cifs_writedata_direct_alloc(pagevec,
 					     cifs_uncached_writev_complete);
-		if (!wdata) {
-			rc = -ENOMEM;
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
+			if (!wdata) {
+				rc = -ENOMEM;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
 
-		rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
-		if (rc) {
-			kfree(wdata);
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
 
-		num_pages = nr_pages;
-		rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
-		if (rc) {
-			for (i = 0; i < nr_pages; i++)
-				put_page(wdata->pages[i]);
-			kfree(wdata);
-			add_credits_and_wake_if(server, credits, 0);
-			break;
-		}
+			wdata->page_offset = start;
+			wdata->tailsz =
+				nr_pages > 1 ?
+					cur_len - (PAGE_SIZE - start) -
+					(nr_pages - 2) * PAGE_SIZE :
+					cur_len;
+		} else {
+			nr_pages = get_numpages(wsize, len, &cur_len);
+			wdata = cifs_writedata_alloc(nr_pages,
+					     cifs_uncached_writev_complete);
+			if (!wdata) {
+				rc = -ENOMEM;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
 
-		/*
-		 * Bring nr_pages down to the number of pages we actually used,
-		 * and free any pages that we didn't use.
-		 */
-		for ( ; nr_pages > num_pages; nr_pages--)
-			put_page(wdata->pages[nr_pages - 1]);
+			rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+			if (rc) {
+				kvfree(wdata->pages);
+				kfree(wdata);
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+
+			num_pages = nr_pages;
+			rc = wdata_fill_from_iovec(
+				wdata, from, &cur_len, &num_pages);
+			if (rc) {
+				for (i = 0; i < nr_pages; i++)
+					put_page(wdata->pages[i]);
+				kvfree(wdata->pages);
+				kfree(wdata);
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+
+			/*
+			 * Bring nr_pages down to the number of pages we
+			 * actually used, and free any pages that we didn't use.
+			 */
+			for ( ; nr_pages > num_pages; nr_pages--)
+				put_page(wdata->pages[nr_pages - 1]);
+
+			wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+		}
 
 		wdata->sync_mode = WB_SYNC_ALL;
 		wdata->nr_pages = nr_pages;
@@ -2593,17 +2909,22 @@
 		wdata->pid = pid;
 		wdata->bytes = cur_len;
 		wdata->pagesz = PAGE_SIZE;
-		wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
-		wdata->credits = credits;
+		wdata->credits = credits_on_stack;
 		wdata->ctx = ctx;
 		kref_get(&ctx->refcount);
 
-		if (!wdata->cfile->invalidHandle ||
-		    !(rc = cifs_reopen_file(wdata->cfile, false)))
-			rc = server->ops->async_writev(wdata,
+		rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+		if (!rc) {
+			if (wdata->cfile->invalidHandle)
+				rc = -EAGAIN;
+			else
+				rc = server->ops->async_writev(wdata,
 					cifs_uncached_writedata_release);
+		}
+
 		if (rc) {
-			add_credits_and_wake_if(server, wdata->credits, 0);
+			add_credits_and_wake_if(server, &wdata->credits, 0);
 			kref_put(&wdata->refcount,
 				 cifs_uncached_writedata_release);
 			if (rc == -EAGAIN) {
@@ -2619,6 +2940,7 @@
 		len -= cur_len;
 	} while (len > 0);
 
+	free_xid(xid);
 	return rc;
 }
 
@@ -2628,7 +2950,6 @@
 	struct cifs_tcon *tcon;
 	struct cifs_sb_info *cifs_sb;
 	struct dentry *dentry = ctx->cfile->dentry;
-	unsigned int i;
 	int rc;
 
 	tcon = tlink_tcon(ctx->cfile->tlink);
@@ -2668,18 +2989,23 @@
 				INIT_LIST_HEAD(&tmp_list);
 				list_del_init(&wdata->list);
 
-				iov_iter_advance(&tmp_from,
+				if (ctx->direct_io)
+					rc = cifs_resend_wdata(
+						wdata, &tmp_list, ctx);
+				else {
+					iov_iter_advance(&tmp_from,
 						 wdata->offset - ctx->pos);
 
-				rc = cifs_write_from_iter(wdata->offset,
+					rc = cifs_write_from_iter(wdata->offset,
 						wdata->bytes, &tmp_from,
 						ctx->cfile, cifs_sb, &tmp_list,
 						ctx);
 
-				list_splice(&tmp_list, &ctx->list);
+					kref_put(&wdata->refcount,
+						cifs_uncached_writedata_release);
+				}
 
-				kref_put(&wdata->refcount,
-					 cifs_uncached_writedata_release);
+				list_splice(&tmp_list, &ctx->list);
 				goto restart_loop;
 			}
 		}
@@ -2687,9 +3013,6 @@
 		kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 	}
 
-	for (i = 0; i < ctx->npages; i++)
-		put_page(ctx->bv[i].bv_page);
-
 	cifs_stats_bytes_written(tcon, ctx->total_len);
 	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
 
@@ -2703,7 +3026,8 @@
 		complete(&ctx->done);
 }
 
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __cifs_writev(
+	struct kiocb *iocb, struct iov_iter *from, bool direct)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t total_written = 0;
@@ -2712,13 +3036,18 @@
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_aio_ctx *ctx;
 	struct iov_iter saved_from = *from;
+	size_t len = iov_iter_count(from);
 	int rc;
 
 	/*
-	 * BB - optimize the way when signing is disabled. We can drop this
-	 * extra memory-to-memory copying and use iovec buffers for constructing
-	 * write request.
+	 * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
+	 * In this case, fall back to non-direct write function.
+	 * This could be improved by getting pages directly in ITER_KVEC.
 	 */
+	if (direct && from->type & ITER_KVEC) {
+		cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
+		direct = false;
+	}
 
 	rc = generic_write_checks(iocb, from);
 	if (rc <= 0)
@@ -2742,10 +3071,16 @@
 
 	ctx->pos = iocb->ki_pos;
 
-	rc = setup_aio_ctx_iter(ctx, from, WRITE);
-	if (rc) {
-		kref_put(&ctx->refcount, cifs_aio_ctx_release);
-		return rc;
+	if (direct) {
+		ctx->direct_io = true;
+		ctx->iter = *from;
+		ctx->len = len;
+	} else {
+		rc = setup_aio_ctx_iter(ctx, from, WRITE);
+		if (rc) {
+			kref_put(&ctx->refcount, cifs_aio_ctx_release);
+			return rc;
+		}
 	}
 
 	/* grab a lock here due to read response handlers can access ctx */
@@ -2795,6 +3130,16 @@
 	return total_written;
 }
 
+ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	return __cifs_writev(iocb, from, true);
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	return __cifs_writev(iocb, from, false);
+}
+
 static ssize_t
 cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -2817,8 +3162,8 @@
 		goto out;
 
 	if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
-				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP))
+				     server->vals->exclusive_lock_type, 0,
+				     NULL, CIFS_WRITE_OP))
 		rc = __generic_file_write_iter(iocb, from);
 	else
 		rc = -EACCES;
@@ -2863,14 +3208,16 @@
 	 * these pages but not on the region from pos to ppos+len-1.
 	 */
 	written = cifs_user_writev(iocb, from);
-	if (written > 0 && CIFS_CACHE_READ(cinode)) {
+	if (CIFS_CACHE_READ(cinode)) {
 		/*
-		 * Windows 7 server can delay breaking level2 oplock if a write
-		 * request comes - break it on the client to prevent reading
-		 * an old data.
+		 * We have read level caching and we have just sent a write
+		 * request to the server thus making data in the cache stale.
+		 * Zap the cache and set oplock/lease level to NONE to avoid
+		 * reading stale data from the cache. All subsequent read
+		 * operations will read new data from the server.
 		 */
 		cifs_zap_mapping(inode);
-		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+		cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
 			 inode);
 		cinode->oplock = 0;
 	}
@@ -2947,7 +3294,9 @@
 	}
 
 	if (rc) {
-		for (i = 0; i < nr_pages; i++) {
+		unsigned int nr_page_failed = i;
+
+		for (i = 0; i < nr_page_failed; i++) {
 			put_page(rdata->pages[i]);
 			rdata->pages[i] = NULL;
 		}
@@ -2965,7 +3314,6 @@
 	kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
 	for (i = 0; i < rdata->nr_pages; i++) {
 		put_page(rdata->pages[i]);
-		rdata->pages[i] = NULL;
 	}
 	cifs_readdata_release(refcount);
 }
@@ -2990,7 +3338,7 @@
 		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
 		size_t written;
 
-		if (unlikely(iter->type & ITER_PIPE)) {
+		if (unlikely(iov_iter_is_pipe(iter))) {
 			void *addr = kmap_atomic(page);
 
 			written = copy_to_iter(addr, copy, iter);
@@ -3092,17 +3440,84 @@
 	return uncached_fill_pages(server, rdata, iter, iter->count);
 }
 
+static int cifs_resend_rdata(struct cifs_readdata *rdata,
+			struct list_head *rdata_list,
+			struct cifs_aio_ctx *ctx)
+{
+	unsigned int rsize;
+	struct cifs_credits credits;
+	int rc;
+	struct TCP_Server_Info *server =
+		tlink_tcon(rdata->cfile->tlink)->ses->server;
+
+	do {
+		if (rdata->cfile->invalidHandle) {
+			rc = cifs_reopen_file(rdata->cfile, true);
+			if (rc == -EAGAIN)
+				continue;
+			else if (rc)
+				break;
+		}
+
+		/*
+		 * Wait for credits to resend this rdata.
+		 * Note: we are attempting to resend the whole rdata rather
+		 * than in segments.
+		 */
+		do {
+			rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+						&rsize, &credits);
+
+			if (rc)
+				goto fail;
+
+			if (rsize < rdata->bytes) {
+				add_credits_and_wake_if(server, &credits, 0);
+				msleep(1000);
+			}
+		} while (rsize < rdata->bytes);
+		rdata->credits = credits;
+
+		rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+		if (!rc) {
+			if (rdata->cfile->invalidHandle)
+				rc = -EAGAIN;
+			else
+				rc = server->ops->async_readv(rdata);
+		}
+
+		/* If the read was successfully sent, we are done */
+		if (!rc) {
+			/* Add to aio pending list */
+			list_add_tail(&rdata->list, rdata_list);
+			return 0;
+		}
+
+		/* Roll back credits and retry if needed */
+		add_credits_and_wake_if(server, &rdata->credits, 0);
+	} while (rc == -EAGAIN);
+
+fail:
+	kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+	return rc;
+}
+
 static int
 cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		     struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
 		     struct cifs_aio_ctx *ctx)
 {
 	struct cifs_readdata *rdata;
-	unsigned int npages, rsize, credits;
+	unsigned int npages, rsize;
+	struct cifs_credits credits_on_stack;
+	struct cifs_credits *credits = &credits_on_stack;
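+	/* credits acquired into the stack copy are handed off to each rdata */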
 	size_t cur_len;
 	int rc;
 	pid_t pid;
 	struct TCP_Server_Info *server;
+	struct page **pagevec;
+	size_t start;
+	struct iov_iter direct_iov = ctx->iter;
 
 	server = tlink_tcon(open_file->tlink)->ses->server;
 
@@ -3111,27 +3526,84 @@
 	else
 		pid = current->tgid;
 
+	if (ctx->direct_io)
+		iov_iter_advance(&direct_iov, offset - ctx->pos);
+
 	do {
+		if (open_file->invalidHandle) {
+			rc = cifs_reopen_file(open_file, true);
+			if (rc == -EAGAIN)
+				continue;
+			else if (rc)
+				break;
+		}
+
 		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
-						   &rsize, &credits);
+						   &rsize, credits);
 		if (rc)
 			break;
 
 		cur_len = min_t(const size_t, len, rsize);
-		npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
 
-		/* allocate a readdata struct */
-		rdata = cifs_readdata_alloc(npages,
+		if (ctx->direct_io) {
+			ssize_t result;
+
+			result = iov_iter_get_pages_alloc(
+					&direct_iov, &pagevec,
+					cur_len, &start);
+			if (result < 0) {
+				cifs_dbg(VFS,
+					"couldn't get user pages (rc=%zd)"
+					" iter type %d"
+					" iov_offset %zd count %zd\n",
+					result, direct_iov.type,
+					direct_iov.iov_offset,
+					direct_iov.count);
+				dump_stack();
+
+				rc = result;
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+			cur_len = (size_t)result;
+			iov_iter_advance(&direct_iov, cur_len);
+
+			rdata = cifs_readdata_direct_alloc(
+					pagevec, cifs_uncached_readv_complete);
+			if (!rdata) {
+				add_credits_and_wake_if(server, credits, 0);
+				rc = -ENOMEM;
+				break;
+			}
+
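+			/*
+			 * The user buffer may start at an offset within the
+			 * first page: that page holds PAGE_SIZE - start
+			 * bytes, the middle pages are full, and tailsz is
+			 * whatever remains in the last page.
+			 */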
+			npages = DIV_ROUND_UP(cur_len + start, PAGE_SIZE);
+			rdata->page_offset = start;
+			rdata->tailsz = npages > 1 ?
+				cur_len - (PAGE_SIZE - start) -
+					(npages - 2) * PAGE_SIZE :
+				cur_len;
+
+		} else {
+
+			npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
+			/* allocate a readdata struct */
+			rdata = cifs_readdata_alloc(npages,
 					    cifs_uncached_readv_complete);
-		if (!rdata) {
-			add_credits_and_wake_if(server, credits, 0);
-			rc = -ENOMEM;
-			break;
-		}
+			if (!rdata) {
+				add_credits_and_wake_if(server, credits, 0);
+				rc = -ENOMEM;
+				break;
+			}
 
-		rc = cifs_read_allocate_pages(rdata, npages);
-		if (rc)
-			goto error;
+			rc = cifs_read_allocate_pages(rdata, npages);
+			if (rc) {
+				kvfree(rdata->pages);
+				kfree(rdata);
+				add_credits_and_wake_if(server, credits, 0);
+				break;
+			}
+
+			rdata->tailsz = PAGE_SIZE;
+		}
 
 		rdata->cfile = cifsFileInfo_get(open_file);
 		rdata->nr_pages = npages;
@@ -3139,23 +3611,29 @@
 		rdata->bytes = cur_len;
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
-		rdata->tailsz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_uncached_read_into_pages;
 		rdata->copy_into_pages = cifs_uncached_copy_into_pages;
-		rdata->credits = credits;
+		rdata->credits = credits_on_stack;
 		rdata->ctx = ctx;
 		kref_get(&ctx->refcount);
 
-		if (!rdata->cfile->invalidHandle ||
-		    !(rc = cifs_reopen_file(rdata->cfile, true)))
-			rc = server->ops->async_readv(rdata);
-error:
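+		/*
+		 * Reconcile the reserved credits with the actual size of
+		 * this read; a handle that went stale in the meantime is
+		 * turned into -EAGAIN so the request is retried.
+		 */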
+		rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+		if (!rc) {
+			if (rdata->cfile->invalidHandle)
+				rc = -EAGAIN;
+			else
+				rc = server->ops->async_readv(rdata);
+		}
+
 		if (rc) {
-			add_credits_and_wake_if(server, rdata->credits, 0);
+			add_credits_and_wake_if(server, &rdata->credits, 0);
 			kref_put(&rdata->refcount,
-				 cifs_uncached_readdata_release);
-			if (rc == -EAGAIN)
+				cifs_uncached_readdata_release);
+			if (rc == -EAGAIN) {
+				iov_iter_revert(&direct_iov, cur_len);
 				continue;
+			}
 			break;
 		}
 
@@ -3173,11 +3651,8 @@
 	struct cifs_readdata *rdata, *tmp;
 	struct iov_iter *to = &ctx->iter;
 	struct cifs_sb_info *cifs_sb;
-	struct cifs_tcon *tcon;
-	unsigned int i;
 	int rc;
 
-	tcon = tlink_tcon(ctx->cfile->tlink);
 	cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
 
 	mutex_lock(&ctx->aio_mutex);
@@ -3211,47 +3686,55 @@
 				 * reading.
 				 */
 				if (got_bytes && got_bytes < rdata->bytes) {
-					rc = cifs_readdata_to_iov(rdata, to);
+					rc = 0;
+					if (!ctx->direct_io)
+						rc = cifs_readdata_to_iov(rdata, to);
 					if (rc) {
 						kref_put(&rdata->refcount,
-						cifs_uncached_readdata_release);
+							cifs_uncached_readdata_release);
 						continue;
 					}
 				}
 
-				rc = cifs_send_async_read(
+				if (ctx->direct_io) {
+					/*
+					 * Re-use rdata as this is a
+					 * direct I/O
+					 */
+					rc = cifs_resend_rdata(
+						rdata,
+						&tmp_list, ctx);
+				} else {
+					rc = cifs_send_async_read(
 						rdata->offset + got_bytes,
 						rdata->bytes - got_bytes,
 						rdata->cfile, cifs_sb,
 						&tmp_list, ctx);
 
+					kref_put(&rdata->refcount,
+						cifs_uncached_readdata_release);
+				}
+
 				list_splice(&tmp_list, &ctx->list);
 
-				kref_put(&rdata->refcount,
-					 cifs_uncached_readdata_release);
 				goto again;
 			} else if (rdata->result)
 				rc = rdata->result;
-			else
+			else if (!ctx->direct_io)
 				rc = cifs_readdata_to_iov(rdata, to);
 
 			/* if there was a short read -- discard anything left */
 			if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
 				rc = -ENODATA;
+
+			ctx->total_len += rdata->got_bytes;
 		}
 		list_del_init(&rdata->list);
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}
 
-	for (i = 0; i < ctx->npages; i++) {
-		if (ctx->should_dirty)
-			set_page_dirty(ctx->bv[i].bv_page);
-		put_page(ctx->bv[i].bv_page);
-	}
-
-	ctx->total_len = ctx->len - iov_iter_count(to);
-
-	cifs_stats_bytes_read(tcon, ctx->total_len);
+	if (!ctx->direct_io)
+		ctx->total_len = ctx->len - iov_iter_count(to);
 
 	/* mask nodata case */
 	if (rc == -ENODATA)
@@ -3267,18 +3750,28 @@
 		complete(&ctx->done);
 }
 
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t __cifs_readv(
+	struct kiocb *iocb, struct iov_iter *to, bool direct)
 {
-	struct file *file = iocb->ki_filp;
-	ssize_t rc;
 	size_t len;
-	ssize_t total_read = 0;
-	loff_t offset = iocb->ki_pos;
+	struct file *file = iocb->ki_filp;
 	struct cifs_sb_info *cifs_sb;
-	struct cifs_tcon *tcon;
 	struct cifsFileInfo *cfile;
+	struct cifs_tcon *tcon;
+	ssize_t rc, total_read = 0;
+	loff_t offset = iocb->ki_pos;
 	struct cifs_aio_ctx *ctx;
 
+	/*
+	 * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, so
+	 * fall back to the data-copy read path.
+	 * This could be improved by getting pages directly in ITER_KVEC.
+	 */
+	if (direct && to->type & ITER_KVEC) {
+		cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
+		direct = false;
+	}
+
 	len = iov_iter_count(to);
 	if (!len)
 		return 0;
@@ -3302,17 +3795,23 @@
 	if (!is_sync_kiocb(iocb))
 		ctx->iocb = iocb;
 
-	if (to->type == ITER_IOVEC)
+	if (iter_is_iovec(to))
 		ctx->should_dirty = true;
 
-	rc = setup_aio_ctx_iter(ctx, to, READ);
-	if (rc) {
-		kref_put(&ctx->refcount, cifs_aio_ctx_release);
-		return rc;
+	if (direct) {
+		ctx->pos = offset;
+		ctx->direct_io = true;
+		ctx->iter = *to;
+		ctx->len = len;
+	} else {
+		rc = setup_aio_ctx_iter(ctx, to, READ);
+		if (rc) {
+			kref_put(&ctx->refcount, cifs_aio_ctx_release);
+			return rc;
+		}
+		len = ctx->len;
 	}
 
-	len = ctx->len;
-
 	/* grab a lock here due to read response handlers can access ctx */
 	mutex_lock(&ctx->aio_mutex);
 
@@ -3354,6 +3853,16 @@
 	return rc;
 }
 
+ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	return __cifs_readv(iocb, to, true);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	return __cifs_readv(iocb, to, false);
+}
+
 ssize_t
 cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -3388,7 +3897,7 @@
 	down_read(&cinode->lock_sem);
 	if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
 				     tcon->ses->server->vals->shared_lock_type,
-				     NULL, CIFS_READ_OP))
+				     0, NULL, CIFS_READ_OP))
 		rc = generic_file_read_iter(iocb, to);
 	up_read(&cinode->lock_sem);
 	return rc;
@@ -3687,7 +4196,7 @@
 
 	INIT_LIST_HEAD(tmplist);
 
-	page = list_entry(page_list->prev, struct page, lru);
+	page = lru_to_page(page_list);
 
 	/*
 	 * Lock the page and put it in the cache. Since no one else
@@ -3743,7 +4252,9 @@
 	struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
 	struct TCP_Server_Info *server;
 	pid_t pid;
+	unsigned int xid;
 
+	xid = get_xid();
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
 	 * immediately if the cookie is negative
@@ -3753,8 +4264,10 @@
 	 */
 	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
 					 &num_pages);
-	if (rc == 0)
+	if (rc == 0) {
+		free_xid(xid);
 		return rc;
+	}
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 		pid = open_file->pid;
@@ -3783,10 +4296,19 @@
 		loff_t offset;
 		struct page *page, *tpage;
 		struct cifs_readdata *rdata;
-		unsigned credits;
+		struct cifs_credits credits_on_stack;
+		struct cifs_credits *credits = &credits_on_stack;
+
+		if (open_file->invalidHandle) {
+			rc = cifs_reopen_file(open_file, true);
+			if (rc == -EAGAIN)
+				continue;
+			else if (rc)
+				break;
+		}
 
 		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
-						   &rsize, &credits);
+						   &rsize, credits);
 		if (rc)
 			break;
 
@@ -3798,6 +4320,7 @@
 		 */
 		if (unlikely(rsize < PAGE_SIZE)) {
 			add_credits_and_wake_if(server, credits, 0);
+			free_xid(xid);
 			return 0;
 		}
 
@@ -3831,18 +4354,24 @@
 		rdata->tailsz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_readpages_read_into_pages;
 		rdata->copy_into_pages = cifs_readpages_copy_into_pages;
-		rdata->credits = credits;
+		rdata->credits = credits_on_stack;
 
 		list_for_each_entry_safe(page, tpage, &tmplist, lru) {
 			list_del(&page->lru);
 			rdata->pages[rdata->nr_pages++] = page;
 		}
 
-		if (!rdata->cfile->invalidHandle ||
-		    !(rc = cifs_reopen_file(rdata->cfile, true)))
-			rc = server->ops->async_readv(rdata);
+		rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+		if (!rc) {
+			if (rdata->cfile->invalidHandle)
+				rc = -EAGAIN;
+			else
+				rc = server->ops->async_readv(rdata);
+		}
+
 		if (rc) {
-			add_credits_and_wake_if(server, rdata->credits, 0);
+			add_credits_and_wake_if(server, &rdata->credits, 0);
 			for (i = 0; i < rdata->nr_pages; i++) {
 				page = rdata->pages[i];
 				lru_cache_add_file(page);
@@ -3862,6 +4391,7 @@
 	 * allocator.
 	 */
 	cifs_fscache_readpages_cancel(mapping->host, page_list);
+	free_xid(xid);
 	return rc;
 }
 
@@ -3889,8 +4419,12 @@
 	else
 		cifs_dbg(FYI, "Bytes read %d\n", rc);
 
-	file_inode(file)->i_atime =
-		current_time(file_inode(file));
+	/* we do not want atime to be less than mtime; it broke some apps */
+	file_inode(file)->i_atime = current_time(file_inode(file));
+	if (timespec64_compare(&(file_inode(file)->i_atime),
+			       &(file_inode(file)->i_mtime)) < 0)
+		file_inode(file)->i_atime = file_inode(file)->i_mtime;
+	else
+		file_inode(file)->i_atime = current_time(file_inode(file));
 
 	if (PAGE_SIZE > rc)
 		memset(read_data + rc, 0, PAGE_SIZE - rc);
@@ -3937,17 +4471,15 @@
 static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
-	struct cifs_tcon *tcon =
-		cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb));
 
-	spin_lock(&tcon->open_file_lock);
+	spin_lock(&cifs_inode->open_file_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
-			spin_unlock(&tcon->open_file_lock);
+			spin_unlock(&cifs_inode->open_file_lock);
 			return 1;
 		}
 	}
-	spin_unlock(&tcon->open_file_lock);
+	spin_unlock(&cifs_inode->open_file_lock);
 	return 0;
 }
 
@@ -4148,6 +4680,7 @@
 							     cinode);
 		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
 	}
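+	/* matching put for the get taken in cifs_queue_oplock_break() */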
+	_cifsFileInfo_put(cfile, false /* do not wait for ourselves */);
 	cifs_done_oplock_break(cinode);
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 020f49c..df93778 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -162,7 +162,11 @@
 	cifs_revalidate_cache(inode, fattr);
 
 	spin_lock(&inode->i_lock);
-	inode->i_atime = fattr->cf_atime;
+	/* we do not want atime to be less than mtime; it broke some apps */
+	if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
+		inode->i_atime = fattr->cf_mtime;
+	else
+		inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
 	inode->i_rdev = fattr->cf_rdev;
@@ -329,7 +333,7 @@
 	fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran);
 	fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
 	fattr->cf_nlink = 2;
-	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
+	fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
 }
 
 static int
@@ -410,6 +414,7 @@
 		/* if uniqueid is different, return error */
 		if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
 		    CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+			CIFS_I(*pinode)->time = 0; /* force reval */
 			rc = -ESTALE;
 			goto cgiiu_exit;
 		}
@@ -417,6 +422,7 @@
 		/* if filetype is different, return error */
 		if (unlikely(((*pinode)->i_mode & S_IFMT) !=
 		    (fattr.cf_mode & S_IFMT))) {
+			CIFS_I(*pinode)->time = 0; /* force reval */
 			rc = -ESTALE;
 			goto cgiiu_exit;
 		}
@@ -726,7 +732,6 @@
 		    FILE_ALL_INFO *data, struct super_block *sb, int xid,
 		    const struct cifs_fid *fid)
 {
-	bool validinum = false;
 	__u16 srchflgs;
 	int rc = 0, tmprc = ENOSYS;
 	struct cifs_tcon *tcon;
@@ -780,43 +785,49 @@
 	} else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
 		   (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
 		      == 0)) {
-			/*
-			 * For SMB2 and later the backup intent flag is already
-			 * sent if needed on open and there is no path based
-			 * FindFirst operation to use to retry with
-			 */
+		/*
+		 * For SMB2 and later the backup intent flag is already
+		 * sent if needed on open and there is no path based
+		 * FindFirst operation to use to retry with
+		 */
 
-			srchinf = kzalloc(sizeof(struct cifs_search_info),
-						GFP_KERNEL);
-			if (srchinf == NULL) {
-				rc = -ENOMEM;
-				goto cgii_exit;
-			}
+		srchinf = kzalloc(sizeof(struct cifs_search_info),
+					GFP_KERNEL);
+		if (srchinf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 
-			srchinf->endOfSearch = false;
+		srchinf->endOfSearch = false;
+		if (tcon->unix_ext)
+			srchinf->info_level = SMB_FIND_FILE_UNIX;
+		else if ((tcon->ses->capabilities &
+			 tcon->ses->server->vals->cap_nt_find) == 0)
+			srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
+		else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
 			srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+		else /* no srvino useful for fallback to some netapp */
+			srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
 
-			srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
-					CIFS_SEARCH_CLOSE_AT_END |
-					CIFS_SEARCH_BACKUP_SEARCH;
+		srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+				CIFS_SEARCH_CLOSE_AT_END |
+				CIFS_SEARCH_BACKUP_SEARCH;
 
-			rc = CIFSFindFirst(xid, tcon, full_path,
-				cifs_sb, NULL, srchflgs, srchinf, false);
-			if (!rc) {
-				data =
-				(FILE_ALL_INFO *)srchinf->srch_entries_start;
+		rc = CIFSFindFirst(xid, tcon, full_path,
+			cifs_sb, NULL, srchflgs, srchinf, false);
+		if (!rc) {
+			data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
 
-				cifs_dir_info_to_fattr(&fattr,
-				(FILE_DIRECTORY_INFO *)data, cifs_sb);
-				fattr.cf_uniqueid = le64_to_cpu(
-				((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
-				validinum = true;
+			cifs_dir_info_to_fattr(&fattr,
+			(FILE_DIRECTORY_INFO *)data, cifs_sb);
+			fattr.cf_uniqueid = le64_to_cpu(
+			((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
 
-				cifs_buf_release(srchinf->ntwrk_buf_start);
-			}
-			kfree(srchinf);
-			if (rc)
-				goto cgii_exit;
+			cifs_buf_release(srchinf->ntwrk_buf_start);
+		}
+		kfree(srchinf);
+		if (rc)
+			goto cgii_exit;
 	} else
 		goto cgii_exit;
 
@@ -829,31 +840,29 @@
 	 */
 	if (*inode == NULL) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-			if (validinum == false) {
-				if (server->ops->get_srv_inum)
-					tmprc = server->ops->get_srv_inum(xid,
-						tcon, cifs_sb, full_path,
-						&fattr.cf_uniqueid, data);
-				if (tmprc) {
-					cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
-						 tmprc);
-					fattr.cf_uniqueid = iunique(sb, ROOT_I);
-					cifs_autodisable_serverino(cifs_sb);
-				} else if ((fattr.cf_uniqueid == 0) &&
-						strlen(full_path) == 0) {
-					/* some servers ret bad root ino ie 0 */
-					cifs_dbg(FYI, "Invalid (0) inodenum\n");
-					fattr.cf_flags |=
-						CIFS_FATTR_FAKE_ROOT_INO;
-					fattr.cf_uniqueid =
-						simple_hashstr(tcon->treeName);
-				}
+			if (server->ops->get_srv_inum)
+				tmprc = server->ops->get_srv_inum(xid,
+								  tcon, cifs_sb, full_path,
+								  &fattr.cf_uniqueid, data);
+			if (tmprc) {
+				cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+					 tmprc);
+				fattr.cf_uniqueid = iunique(sb, ROOT_I);
+				cifs_autodisable_serverino(cifs_sb);
+			} else if ((fattr.cf_uniqueid == 0) &&
+				   strlen(full_path) == 0) {
+				/* some servers ret bad root ino ie 0 */
+				cifs_dbg(FYI, "Invalid (0) inodenum\n");
+				fattr.cf_flags |=
+					CIFS_FATTR_FAKE_ROOT_INO;
+				fattr.cf_uniqueid =
+					simple_hashstr(tcon->treeName);
 			}
 		} else
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
 	} else {
-		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
-		    validinum == false && server->ops->get_srv_inum) {
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		    && server->ops->get_srv_inum) {
 			/*
 			 * Pass a NULL tcon to ensure we don't make a round
 			 * trip to the server. This only works for SMB2+.
@@ -885,17 +894,24 @@
 			cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
 	}
 
-#ifdef CONFIG_CIFS_ACL
 	/* fill in 0777 bits from ACL */
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) {
+		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true,
+				       full_path, fid);
+		if (rc) {
+			cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
+				__func__, rc);
+			goto cgii_exit;
+		}
+	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
+				       full_path, fid);
 		if (rc) {
 			cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
 				 __func__, rc);
 			goto cgii_exit;
 		}
 	}
-#endif /* CONFIG_CIFS_ACL */
 
 	/* fill in remaining high mode bits e.g. SUID, VTX */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -919,6 +935,7 @@
 		/* if uniqueid is different, return error */
 		if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
 		    CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+			CIFS_I(*inode)->time = 0; /* force reval */
 			rc = -ESTALE;
 			goto cgii_exit;
 		}
@@ -926,6 +943,7 @@
 		/* if filetype is different, return error */
 		if (unlikely(((*inode)->i_mode & S_IFMT) !=
 		    (fattr.cf_mode & S_IFMT))) {
+			CIFS_I(*inode)->time = 0; /* force reval */
 			rc = -ESTALE;
 			goto cgii_exit;
 		}
@@ -1309,8 +1327,8 @@
 /*
  * If d_inode(dentry) is null (usually meaning the cached dentry
  * is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACCESS
- * but will return the EACCESS to the caller. Note that the VFS does not call
+ * if that fails we can not attempt the fall back mechanisms on EACCES
+ * but will return the EACCES to the caller. Note that the VFS does not call
  * unlink on negative dentries currently.
  */
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
@@ -1608,13 +1626,14 @@
 	}
 
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
-	rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
+	rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
 		d_drop(direntry);
 		goto mkdir_out;
 	}
 
+	/* TODO: skip this for smb2/smb3 */
 	rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
 			      xid);
 mkdir_out:
@@ -1728,6 +1747,10 @@
 	if (rc == 0 || rc != -EBUSY)
 		goto do_rename_exit;
 
+	/* Don't fall back to using SMB on SMB 2+ mount */
+	if (server->vals->protocol_id != 0)
+		goto do_rename_exit;
+
 	/* open-file renames don't work across directories */
 	if (to_dentry->d_parent != from_dentry->d_parent)
 		goto do_rename_exit;
@@ -2073,7 +2096,7 @@
 		return rc;
 
 	generic_fillattr(inode, stat);
-	stat->blksize = CIFS_MAX_MSGSIZE;
+	stat->blksize = cifs_sb->bsize;
 	stat->ino = CIFS_I(inode)->uniqueid;
 
 	/* old CIFS Unix Extensions doesn't return create time */
@@ -2105,6 +2128,43 @@
 	return rc;
 }
 
+int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
+		u64 len)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifsFileInfo *cfile;
+	int rc;
+
+	/*
+	 * We need to be sure that all dirty pages are written as they
+	 * might fill holes on the server.
+	 */
+	if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
+	    inode->i_mapping->nrpages != 0) {
+		rc = filemap_fdatawait(inode->i_mapping);
+		if (rc) {
+			mapping_set_error(inode->i_mapping, rc);
+			return rc;
+		}
+	}
+
+	cfile = find_readable_file(cifs_i, false);
+	if (cfile == NULL)
+		return -EINVAL;
+
+	if (server->ops->fiemap) {
+		rc = server->ops->fiemap(tcon, cfile, fei, start, len);
+		cifsFileInfo_put(cfile);
+		return rc;
+	}
+
+	cifsFileInfo_put(cfile);
+	return -ENOTSUPP;
+}
+
 static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	pgoff_t index = from >> PAGE_SHIFT;
@@ -2250,6 +2310,11 @@
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
+	if (is_interrupt_error(rc)) {
+		rc = -ERESTARTSYS;
+		goto out;
+	}
+
 	mapping_set_error(inode->i_mapping, rc);
 	rc = 0;
 
@@ -2355,6 +2420,8 @@
 	struct inode *inode = d_inode(direntry);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifsFileInfo *wfile;
+	struct cifs_tcon *tcon;
 	char *full_path = NULL;
 	int rc = -EACCES;
 	__u32 dosattr = 0;
@@ -2362,7 +2429,7 @@
 
 	xid = get_xid();
 
-	cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
+	cifs_dbg(FYI, "setattr on file %pd attrs->ia_valid 0x%x\n",
 		 direntry, attrs->ia_valid);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
@@ -2393,9 +2460,28 @@
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
+	if (is_interrupt_error(rc)) {
+		rc = -ERESTARTSYS;
+		goto cifs_setattr_exit;
+	}
+
 	mapping_set_error(inode->i_mapping, rc);
 	rc = 0;
 
+	if (attrs->ia_valid & ATTR_MTIME) {
+		rc = cifs_get_writable_file(cifsInode, false, &wfile);
+		if (!rc) {
+			tcon = tlink_tcon(wfile->tlink);
+			rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
+			cifsFileInfo_put(wfile);
+			if (rc)
+				goto cifs_setattr_exit;
+		} else if (rc != -EBADF)
+			goto cifs_setattr_exit;
+		else
+			rc = 0;
+	}
+
 	if (attrs->ia_valid & ATTR_SIZE) {
 		rc = cifs_set_file_size(inode, attrs, xid, full_path);
 		if (rc != 0)
@@ -2408,8 +2494,8 @@
 	if (attrs->ia_valid & ATTR_GID)
 		gid = attrs->ia_gid;
 
-#ifdef CONFIG_CIFS_ACL
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) ||
+	    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) {
 		if (uid_valid(uid) || gid_valid(gid)) {
 			rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
 							uid, gid);
@@ -2420,7 +2506,6 @@
 			}
 		}
 	} else
-#endif /* CONFIG_CIFS_ACL */
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
 		attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
 
@@ -2431,8 +2516,8 @@
 	if (attrs->ia_valid & ATTR_MODE) {
 		mode = attrs->ia_mode;
 		rc = 0;
-#ifdef CONFIG_CIFS_ACL
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) ||
+		    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) {
 			rc = id_mode_to_cifs_acl(inode, full_path, mode,
 						INVALID_UID, INVALID_GID);
 			if (rc) {
@@ -2441,7 +2526,6 @@
 				goto cifs_setattr_exit;
 			}
 		} else
-#endif /* CONFIG_CIFS_ACL */
 		if (((mode & S_IWUGO) == 0) &&
 		    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
 
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 54f32f9..1a01e10 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -32,8 +32,51 @@
 #include "cifs_debug.h"
 #include "cifsfs.h"
 #include "cifs_ioctl.h"
+#include "smb2proto.h"
 #include <linux/btrfs.h>
 
+static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
+				  unsigned long p)
+{
+	struct inode *inode = file_inode(filep);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct dentry *dentry = filep->f_path.dentry;
+	unsigned char *path;
+	__le16 *utf16_path = NULL, root_path;
+	int rc = 0;
+
+	path = build_path_from_dentry(dentry);
+	if (path == NULL)
+		return -ENOMEM;
+
+	cifs_dbg(FYI, "%s %s\n", __func__, path);
+
+	if (!path[0]) {
+		root_path = 0;
+		utf16_path = &root_path;
+	} else {
+		utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb);
+		if (!utf16_path) {
+			rc = -ENOMEM;
+			goto ici_exit;
+		}
+	}
+
+	if (tcon->ses->server->ops->ioctl_query_info)
+		rc = tcon->ses->server->ops->ioctl_query_info(
+				xid, tcon, utf16_path,
+				filep->private_data ? 0 : 1, p);
+	else
+		rc = -EOPNOTSUPP;
+
+ ici_exit:
+	if (utf16_path != &root_path)
+		kfree(utf16_path);
+	kfree(path);
+	return rc;
+}
+
 static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
 			unsigned long srcfd)
 {
@@ -121,9 +164,9 @@
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
 	struct inode *inode = file_inode(filep);
+	struct smb3_key_debug_info pkey_inf;
 	int rc = -ENOTTY; /* strange error - but the precedent */
 	unsigned int xid;
-	struct cifs_sb_info *cifs_sb;
 	struct cifsFileInfo *pSMBFile = filep->private_data;
 	struct cifs_tcon *tcon;
 	__u64	ExtAttrBits = 0;
@@ -131,7 +174,6 @@
 
 	xid = get_xid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
 	cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
 	switch (command) {
 		case FS_IOC_GETFLAGS:
@@ -196,6 +238,9 @@
 		case CIFS_IOC_COPYCHUNK_FILE:
 			rc = cifs_ioctl_copychunk(xid, filep, arg);
 			break;
+		case CIFS_QUERY_INFO:
+			rc = cifs_ioctl_query_info(xid, filep, arg);
+			break;
 		case CIFS_IOC_SET_INTEGRITY:
 			if (pSMBFile == NULL)
 				break;
@@ -226,6 +271,34 @@
 			else
 				rc = -EOPNOTSUPP;
 			break;
+		case CIFS_DUMP_KEY:
+			if (pSMBFile == NULL)
+				break;
+			if (!capable(CAP_SYS_ADMIN)) {
+				rc = -EACCES;
+				break;
+			}
+
+			tcon = tlink_tcon(pSMBFile->tlink);
+			if (!smb3_encryption_required(tcon)) {
+				rc = -EOPNOTSUPP;
+				break;
+			}
+			pkey_inf.cipher_type =
+				le16_to_cpu(tcon->ses->server->cipher_type);
+			pkey_inf.Suid = tcon->ses->Suid;
+			memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response,
+					16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+			memcpy(pkey_inf.smb3decryptionkey,
+			      tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+			memcpy(pkey_inf.smb3encryptionkey,
+			      tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE);
+			if (copy_to_user((void __user *)arg, &pkey_inf,
+					sizeof(struct smb3_key_debug_info)))
+				rc = -EFAULT;
+			else
+				rc = 0;
+			break;
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2148b0f..b736acd 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -103,9 +103,9 @@
 		return rc;
 	}
 
-	snprintf(md5_str2, sizeof(md5_str2),
-		 CIFS_MF_SYMLINK_MD5_FORMAT,
-		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+	scnprintf(md5_str2, sizeof(md5_str2),
+		  CIFS_MF_SYMLINK_MD5_FORMAT,
+		  CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
 
 	if (strncmp(md5_str1, md5_str2, 17) != 0)
 		return -EINVAL;
@@ -142,10 +142,10 @@
 		return rc;
 	}
 
-	snprintf(buf, buf_len,
-		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
-		 link_len,
-		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+	scnprintf(buf, buf_len,
+		  CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+		  link_len,
+		  CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
 
 	ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
 	memcpy(buf + ofs, link_str, link_len);
@@ -648,9 +648,16 @@
 		rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
 				      &target_path);
 
-	if (rc != 0 && server->ops->query_symlink)
-		rc = server->ops->query_symlink(xid, tcon, full_path,
-						&target_path, cifs_sb);
+	if (rc != 0 && server->ops->query_symlink) {
+		struct cifsInodeInfo *cifsi = CIFS_I(inode);
+		bool reparse_point = false;
+
+		if (cifsi->cifsAttrs & ATTR_REPARSE)
+			reparse_point = true;
+
+		rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
+						&target_path, reparse_point);
+	}
 
 	kfree(full_path);
 	free_xid(xid);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 6926685..5ad83bd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -111,19 +111,27 @@
 tconInfoAlloc(void)
 {
 	struct cifs_tcon *ret_buf;
-	ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
-	if (ret_buf) {
-		atomic_inc(&tconInfoAllocCount);
-		ret_buf->tidStatus = CifsNew;
-		++ret_buf->tc_count;
-		INIT_LIST_HEAD(&ret_buf->openFileList);
-		INIT_LIST_HEAD(&ret_buf->tcon_list);
-		spin_lock_init(&ret_buf->open_file_lock);
-		mutex_init(&ret_buf->crfid.fid_mutex);
-		ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid),
-					     GFP_KERNEL);
-		spin_lock_init(&ret_buf->stat_lock);
+
+	ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
+	if (!ret_buf)
+		return NULL;
+	ret_buf->crfid.fid = kzalloc(sizeof(*ret_buf->crfid.fid), GFP_KERNEL);
+	if (!ret_buf->crfid.fid) {
+		kfree(ret_buf);
+		return NULL;
 	}
+
+	atomic_inc(&tconInfoAllocCount);
+	ret_buf->tidStatus = CifsNew;
+	++ret_buf->tc_count;
+	INIT_LIST_HEAD(&ret_buf->openFileList);
+	INIT_LIST_HEAD(&ret_buf->tcon_list);
+	spin_lock_init(&ret_buf->open_file_lock);
+	mutex_init(&ret_buf->crfid.fid_mutex);
+	spin_lock_init(&ret_buf->stat_lock);
+	atomic_set(&ret_buf->num_local_opens, 0);
+	atomic_set(&ret_buf->num_remote_opens, 0);
+
 	return ret_buf;
 }
 
@@ -138,6 +146,9 @@
 	kfree(buf_to_free->nativeFileSystem);
 	kzfree(buf_to_free->password);
 	kfree(buf_to_free->crfid.fid);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	kfree(buf_to_free->dfs_path);
+#endif
 	kfree(buf_to_free);
 }
 
@@ -490,8 +501,7 @@
 					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 					   &pCifsInode->flags);
 
-				queue_work(cifsoplockd_wq,
-					   &netfile->oplock_break);
+				cifs_queue_oplock_break(netfile);
 				netfile->oplock_break_cancelled = false;
 
 				spin_unlock(&tcon->open_file_lock);
@@ -523,9 +533,18 @@
 cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		struct cifs_tcon *tcon = NULL;
+
+		if (cifs_sb->master_tlink)
+			tcon = cifs_sb_master_tcon(cifs_sb);
+
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
-		cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
-			 cifs_sb_master_tcon(cifs_sb)->treeName);
+		cifs_sb->mnt_cifs_serverino_autodisabled = true;
+		cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n",
+			 tcon ? tcon->treeName : "new server");
+		cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n");
+		cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n");
+
 	}
 }
 
@@ -588,6 +607,28 @@
 	spin_unlock(&cinode->writers_lock);
 }
 
+/**
+ * cifs_queue_oplock_break - queue the oplock break handler for cfile
+ *
+ * This function is called from the demultiplex thread when it
+ * receives an oplock break for @cfile.
+ *
+ * Assumes the tcon->open_file_lock is held.
+ * Assumes cfile->file_info_lock is NOT held.
+ */
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
+{
+	/*
+	 * Bump the handle refcount now while we hold the
+	 * open_file_lock to enforce the validity of it for the oplock
+	 * break handler. The matching put is done at the end of the
+	 * handler.
+	 */
+	cifsFileInfo_get(cfile);
+
+	queue_work(cifsoplockd_wq, &cfile->oplock_break);
+}
+
 void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
 {
 	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
@@ -730,6 +771,8 @@
 			goto parse_DFS_referrals_exit;
 		}
 
+		node->ttl = le32_to_cpu(ref->TimeToLive);
+
 		ref++;
 	}
 
@@ -747,6 +790,11 @@
 {
 	struct cifs_aio_ctx *ctx;
 
+	/*
+	 * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
+	 * to false so that we know when we have to unreference pages within
+	 * cifs_aio_ctx_release()
+	 */
 	ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
 	if (!ctx)
 		return NULL;
@@ -765,7 +813,23 @@
 					struct cifs_aio_ctx, refcount);
 
 	cifsFileInfo_put(ctx->cfile);
-	kvfree(ctx->bv);
+
+	/*
+	 * ctx->bv is only set if setup_aio_ctx_iter() was called successfully,
+	 * which means that iov_iter_get_pages() succeeded and thus that we
+	 * have taken references on the pages.
+	 */
+	if (ctx->bv) {
+		unsigned i;
+
+		for (i = 0; i < ctx->npages; i++) {
+			if (ctx->should_dirty)
+				set_page_dirty(ctx->bv[i].bv_page);
+			put_page(ctx->bv[i].bv_page);
+		}
+		kvfree(ctx->bv);
+	}
+
 	kfree(ctx);
 }
 
@@ -786,7 +850,7 @@
 	struct page **pages = NULL;
 	struct bio_vec *bv = NULL;
 
-	if (iter->type & ITER_KVEC) {
+	if (iov_iter_is_kvec(iter)) {
 		memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
 		ctx->len = count;
 		iov_iter_advance(iter, count);
@@ -857,7 +921,7 @@
 	ctx->bv = bv;
 	ctx->len = saved_len - count;
 	ctx->npages = npages;
-	iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+	iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len);
 	return 0;
 }
 
@@ -896,7 +960,6 @@
 	}
 
 	(*sdesc)->shash.tfm = *shash;
-	(*sdesc)->shash.flags = 0x0;
 	return 0;
 }
 
@@ -931,3 +994,42 @@
 	else if (page == 0)
 		*len = rqst->rq_pagesz - rqst->rq_offset;
 }
+
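+/*
+ * For example, "\\server\share" yields *h pointing at the 's' of
+ * "server" with *len == 6; no copy is made, *h points into @unc.
+ */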
+void extract_unc_hostname(const char *unc, const char **h, size_t *len)
+{
+	const char *end;
+
+	/* skip initial slashes */
+	while (*unc && (*unc == '\\' || *unc == '/'))
+		unc++;
+
+	end = unc;
+
+	while (*end && !(*end == '\\' || *end == '/'))
+		end++;
+
+	*h = unc;
+	*len = end - unc;
+}
+
+/**
+ * copy_path_name - copy src path to dst, possibly truncating
+ *
+ * returns number of bytes written (including trailing nul)
+ */
+int copy_path_name(char *dst, const char *src)
+{
+	int name_len;
+
+	/*
+	 * PATH_MAX includes nul, so if strlen(src) >= PATH_MAX it
+	 * will truncate and strlen(dst) will be PATH_MAX-1
+	 */
+	name_len = strscpy(dst, src, PATH_MAX);
+	if (WARN_ON_ONCE(name_len < 0))
+		name_len = PATH_MAX-1;
+
+	/* we count the trailing nul */
+	name_len++;
+	return name_len;
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index fdd908e..9b41436 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   fs/cifs/netmisc.c
  *
@@ -6,20 +7,6 @@
  *
  *   Error mapping routines from Samba libsmb/errormap.c
  *   Copyright (C) Andrew Tridgell 2001
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/net.h>
@@ -130,10 +117,6 @@
 	{0, 0}
 };
 
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
-	{0, 0}
-};
-
 /*
  * Convert a string containing text IPv4 or IPv6 address to binary form.
  *
@@ -962,8 +945,8 @@
 struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 {
 	struct timespec64 ts;
-	time64_t sec;
-	int min, days, month, year;
+	time64_t sec, days;
+	int min, day, month, year;
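+	/* "day" is the day of the month; "days" is a 64-bit running total */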
 	u16 date = le16_to_cpu(le_date);
 	u16 time = le16_to_cpu(le_time);
 	SMB_TIME *st = (SMB_TIME *)&time;
@@ -979,15 +962,15 @@
 	sec += 60 * 60 * st->Hours;
 	if (st->Hours > 24)
 		cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
-	days = sd->Day;
+	day = sd->Day;
 	month = sd->Month;
-	if (days < 1 || days > 31 || month < 1 || month > 12) {
-		cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
-		days = clamp(days, 1, 31);
+	if (day < 1 || day > 31 || month < 1 || month > 12) {
+		cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, day);
+		day = clamp(day, 1, 31);
 		month = clamp(month, 1, 12);
 	}
 	month -= 1;
-	days += total_days_of_prev_months[month];
+	days = day + total_days_of_prev_months[month];
 	days += 3652; /* account for difference in days between 1980 and 1970 */
 	year = sd->Year;
 	days += year * 365;
diff --git a/fs/cifs/nterr.c b/fs/cifs/nterr.c
index b6023c6..358a766 100644
--- a/fs/cifs/nterr.c
+++ b/fs/cifs/nterr.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Unix SMB/Netbios implementation.
  *  Version 1.9.
  *  RPC Pipe client / server routines
  *  Copyright (C) Luke Kenneth Casson Leighton 1997-2001.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /* NT error codes - see nterr.h */
diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h
index 7a0eae5..edd4741 100644
--- a/fs/cifs/nterr.h
+++ b/fs/cifs/nterr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
    Unix SMB/Netbios implementation.
    Version 1.9.
@@ -7,19 +8,6 @@
    Copyright (C) Luke Kenneth Casson Leighton 1996-2000
    Copyright (C) Paul Ashton                  1998-2000
 
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index e169e1a..3925a7b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -655,7 +655,14 @@
 		/* scan and find it */
 		int i;
 		char *cur_ent;
-		char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
+		char *end_of_smb;
+
+		if (cfile->srch_inf.ntwrk_buf_start == NULL) {
+			cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n");
+			return -EIO;
+		}
+
+		end_of_smb = cfile->srch_inf.ntwrk_buf_start +
 			server->ops->calc_smb_size(
 					cfile->srch_inf.ntwrk_buf_start,
 					server);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aa23c00..85bd644 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -159,13 +159,16 @@
 				 const struct nls_table *nls_cp)
 {
 	char *bcc_ptr = *pbcc_area;
+	int len;
 
 	/* copy user */
 	/* BB what about null user mounts - check that we do this BB */
 	/* copy user */
 	if (ses->user_name != NULL) {
-		strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
-		bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
+		len = strscpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
+		if (WARN_ON_ONCE(len < 0))
+			len = CIFS_MAX_USERNAME_LEN - 1;
+		bcc_ptr += len;
 	}
 	/* else null user mount */
 	*bcc_ptr = 0;
@@ -173,8 +176,10 @@
 
 	/* copy domain */
 	if (ses->domainName != NULL) {
-		strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
-		bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+		len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+		if (WARN_ON_ONCE(len < 0))
+			len = CIFS_MAX_DOMAINNAME_LEN - 1;
+		bcc_ptr += len;
 	} /* else we will send a null domain name
 	     so the server will default to its own domain */
 	*bcc_ptr = 0;
@@ -242,9 +247,10 @@
 
 	kfree(ses->serverOS);
 
-	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
+	ses->serverOS = kmalloc(len + 1, GFP_KERNEL);
 	if (ses->serverOS) {
-		strncpy(ses->serverOS, bcc_ptr, len);
+		memcpy(ses->serverOS, bcc_ptr, len);
+		ses->serverOS[len] = 0;
 		if (strncmp(ses->serverOS, "OS/2", 4) == 0)
 			cifs_dbg(FYI, "OS/2 server\n");
 	}
@@ -258,9 +264,11 @@
 
 	kfree(ses->serverNOS);
 
-	ses->serverNOS = kzalloc(len + 1, GFP_KERNEL);
-	if (ses->serverNOS)
-		strncpy(ses->serverNOS, bcc_ptr, len);
+	ses->serverNOS = kmalloc(len + 1, GFP_KERNEL);
+	if (ses->serverNOS) {
+		memcpy(ses->serverNOS, bcc_ptr, len);
+		ses->serverNOS[len] = 0;
+	}
 
 	bcc_ptr += len + 1;
 	bleft -= len + 1;
@@ -534,9 +542,9 @@
 			if (global_secflags & CIFSSEC_MAY_NTLM)
 				return NTLM;
 		default:
-			/* Fallthrough to attempt LANMAN authentication next */
 			break;
 		}
+		/* Fallthrough - to attempt LANMAN authentication next */
 	case CIFS_NEGFLAVOR_LANMAN:
 		switch (requested) {
 		case LANMAN:
@@ -690,7 +698,6 @@
 	char *bcc_ptr;
 	struct cifs_ses *ses = sess_data->ses;
 	char lnm_session_key[CIFS_AUTH_RESP_SIZE];
-	__u32 capabilities;
 	__u16 bytes_remaining;
 
 	/* lanman 2 style sessionsetup */
@@ -701,7 +708,7 @@
 
 	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 	bcc_ptr = sess_data->iov[2].iov_base;
-	capabilities = cifs_ssetup_hdr(ses, pSMB);
+	(void)cifs_ssetup_hdr(ses, pSMB);
 
 	pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
 
@@ -1154,14 +1161,12 @@
 static int
 _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
 {
-	struct smb_hdr *smb_buf;
 	SESSION_SETUP_ANDX *pSMB;
 	struct cifs_ses *ses = sess_data->ses;
 	__u32 capabilities;
 	char *bcc_ptr;
 
 	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
-	smb_buf = (struct smb_hdr *)pSMB;
 
 	capabilities = cifs_ssetup_hdr(ses, pSMB);
 	if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 378151e..5148106 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  SMB1 (CIFS) version specific operations
  *
  *  Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
- *
- *  This library is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License v2 as published
- *  by the Free Software Foundation.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *  the GNU Lesser General Public License for more details.
- *
- *  You should have received a copy of the GNU Lesser General Public License
- *  along with this library; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/pagemap.h>
@@ -117,11 +105,11 @@
 }
 
 static void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
-		 const int optype)
+cifs_add_credits(struct TCP_Server_Info *server,
+		 const struct cifs_credits *credits, const int optype)
 {
 	spin_lock(&server->req_lock);
-	server->credits += add;
+	server->credits += credits->value;
 	server->in_flight--;
 	spin_unlock(&server->req_lock);
 	wake_up(&server->request_q);
@@ -183,6 +171,9 @@
 	/* we do not want to loop forever */
 	last_mid = cur_mid;
 	cur_mid++;
+	/* avoid 0xFFFF MID */
+	if (cur_mid == 0xffff)
+		cur_mid++;
 
 	/*
 	 * This nested loop looks more expensive than it is.
@@ -308,7 +299,7 @@
 	remaining = tgt_total_cnt - total_in_tgt;
 
 	if (remaining < 0) {
-		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
 			 tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
 	}
@@ -929,19 +920,18 @@
 {
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	int rc;
-	unsigned int num_referrals = 0;
-	struct dfs_info3_param *referrals = NULL;
+	struct dfs_info3_param referral = {0};
 
-	rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
-			  &num_referrals, &referrals, 0);
+	rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, &referral,
+			  0);
 
-	if (!rc && num_referrals > 0) {
-		*symlinkinfo = kstrndup(referrals->node_name,
-					strlen(referrals->node_name),
+	if (!rc) {
+		*symlinkinfo = kstrndup(referral.node_name,
+					strlen(referral.node_name),
 					GFP_KERNEL);
+		free_dfs_info_param(&referral);
 		if (!*symlinkinfo)
 			rc = -ENOMEM;
-		free_dfs_info_array(referrals, num_referrals);
 	}
 	return rc;
 #else /* No DFS support */
@@ -951,8 +941,8 @@
 
 static int
 cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
-		   const char *full_path, char **target_path,
-		   struct cifs_sb_info *cifs_sb)
+		   struct cifs_sb_info *cifs_sb, const char *full_path,
+		   char **target_path, bool is_reparse_point)
 {
 	int rc;
 	int oplock = 0;
@@ -961,6 +951,11 @@
 
 	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
 
+	if (is_reparse_point) {
+		cifs_dbg(VFS, "reparse points not handled for SMB1 symlinks\n");
+		return -EOPNOTSUPP;
+	}
+
 	/* Check for unix extensions */
 	if (cap_unix(tcon->ses)) {
 		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
@@ -1028,6 +1023,131 @@
 	return false;
 }
 
+static int
+cifs_make_node(unsigned int xid, struct inode *inode,
+	       struct dentry *dentry, struct cifs_tcon *tcon,
+	       char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct inode *newinode = NULL;
+	int rc = -EPERM;
+	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+	FILE_ALL_INFO *buf = NULL;
+	struct cifs_io_parms io_parms;
+	__u32 oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	unsigned int bytes_written;
+	struct win_dev *pdev;
+	struct kvec iov[2];
+
+	if (tcon->unix_ext) {
+		/*
+		 * SMB1 Unix Extensions: requires server support but
+		 * works with all special files
+		 */
+		struct cifs_unix_set_info_args args = {
+			.mode	= mode & ~current_umask(),
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= dev,
+		};
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = current_fsuid();
+			args.gid = current_fsgid();
+		} else {
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
+		}
+		rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+					    cifs_sb->local_nls,
+					    cifs_remap(cifs_sb));
+		if (rc)
+			goto out;
+
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
+					      inode->i_sb, xid);
+
+		if (rc == 0)
+			d_instantiate(dentry, newinode);
+		goto out;
+	}
+
+	/*
+	 * SMB1 SFU emulation: should work with all servers, but only
+	 * supports block and char devices (no sockets or fifos)
+	 */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+		goto out;
+
+	if (!S_ISCHR(mode) && !S_ISBLK(mode))
+		goto out;
+
+	cifs_dbg(FYI, "sfu compat create special file\n");
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	if (tcon->ses->server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+	rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+	if (rc)
+		goto out;
+
+	/*
+	 * BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely.
+	 */
+
+	pdev = (struct win_dev *)buf;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = sizeof(struct win_dev);
+	iov[1].iov_base = buf;
+	iov[1].iov_len = sizeof(struct win_dev);
+	if (S_ISCHR(mode)) {
+		memcpy(pdev->type, "IntxCHR", 8);
+		pdev->major = cpu_to_le64(MAJOR(dev));
+		pdev->minor = cpu_to_le64(MINOR(dev));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	} else if (S_ISBLK(mode)) {
+		memcpy(pdev->type, "IntxBLK", 8);
+		pdev->major = cpu_to_le64(MAJOR(dev));
+		pdev->minor = cpu_to_le64(MINOR(dev));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	}
+	tcon->ses->server->ops->close(xid, tcon, &fid);
+	d_drop(dentry);
+
+	/* FIXME: add code here to set EAs */
+out:
+	kfree(buf);
+	return rc;
+}
+
+
+
 struct smb_version_operations smb1_operations = {
 	.send_cancel = send_nt_cancel,
 	.compare_fids = cifs_compare_fids,
@@ -1106,15 +1226,15 @@
 	.query_all_EAs = CIFSSMBQAllEAs,
 	.set_EA = CIFSSMBSetEA,
 #endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
 	.get_acl = get_cifs_acl,
 	.get_acl_by_fid = get_cifs_acl_by_fid,
 	.set_acl = set_cifs_acl,
-#endif /* CIFS_ACL */
+	.make_node = cifs_make_node,
 };
 
 struct smb_version_values smb1_values = {
 	.version_string = SMB1_VERSION_STRING,
+	.protocol_id = SMB10_PROT_ID,
 	.large_lock_type = LOCKING_ANDX_LARGE_FILES,
 	.exclusive_lock_type = 0,
 	.shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 4ed10dd..8b0b512 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -68,13 +68,15 @@
 
 
 	 if (oparms->tcon->use_resilient) {
-		nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+		/* default timeout is 0, servers pick default (120 seconds) */
+		nr_ioctl_req.Timeout =
+			cpu_to_le32(oparms->tcon->handle_timeout);
 		nr_ioctl_req.Reserved = 0;
 		rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
 			fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
 			true /* is_fsctl */,
 			(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
-			NULL, NULL /* no return info */);
+			CIFSMaxBufSize, NULL, NULL /* no return info */);
 		if (rc == -EOPNOTSUPP) {
 			cifs_dbg(VFS,
 			     "resiliency not supported by server, disabling\n");
@@ -86,14 +88,20 @@
 	}
 
 	if (buf) {
-		/* open response does not have IndexNumber field - get it */
-		rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
+		/* if the open response lacks the IndexNumber field, get it */
+		if (smb2_data->IndexNumber == 0) {
+			rc = SMB2_get_srv_num(xid, oparms->tcon,
+				      fid->persistent_fid,
 				      fid->volatile_fid,
 				      &smb2_data->IndexNumber);
-		if (rc) {
-			/* let get_inode_info disable server inode numbers */
-			smb2_data->IndexNumber = 0;
-			rc = 0;
+			if (rc) {
+				/*
+				 * let get_inode_info disable server inode
+				 * numbers
+				 */
+				smb2_data->IndexNumber = 0;
+				rc = 0;
+			}
 		}
 		move_smb2_info_to_cifs(buf, smb2_data);
 	}
@@ -122,12 +130,14 @@
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf)
+	if (max_buf < sizeof(struct smb2_lock_element))
 		return -EINVAL;
 
+	BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
 	max_num = max_buf / sizeof(struct smb2_lock_element);
 	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
 	if (!buf)
@@ -135,7 +145,7 @@
 
 	cur = buf;
 
-	down_write(&cinode->lock_sem);
+	cifs_down_write(&cinode->lock_sem);
 	list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
 		if (flock->fl_start > li->offset ||
 		    (flock->fl_start + length) <
@@ -264,6 +274,8 @@
 		return -EINVAL;
 	}
 
+	BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
 	max_num = max_buf / sizeof(struct smb2_lock_element);
 	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
 	if (!buf) {
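
The two lock hunks above share a sizing idiom: snapshot the racy maxBuf once, reject it if it cannot hold even a single lock element, clamp it to one page, and derive the kcalloc() count from the result. A stand-alone model of that arithmetic follows; the function name and the PAGE_SIZE stand-in are illustrative.

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096		/* stand-in for the kernel constant */

/*
 * Snapshot max_buf once, refuse anything smaller than one element,
 * clamp to a page, and return how many lock elements fit.
 */
static size_t lock_elements_per_buf(size_t max_buf, size_t elem_size)
{
	if (max_buf < elem_size)
		return 0;	/* caller maps this to -EINVAL */
	if (max_buf > PAGE_SIZE)
		max_buf = PAGE_SIZE;
	return max_buf / elem_size;
}

int main(void)
{
	/* e.g. a 24-byte element and a 64 KiB server maxBuf */
	printf("%zu elements per request\n",
	       lock_elements_per_buf(65536, 24));
	return 0;
}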
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 0ffa180..dd10f0c 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -33,7 +33,7 @@
 
 /*
  * Identifiers for functions that use the open, operation, close pattern
- * in smb2inode.c:smb2_open_op_close()
+ * in smb2inode.c:smb2_compound_op()
  */
 #define SMB2_OP_SET_DELETE 1
 #define SMB2_OP_SET_INFO 2
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1eef179..4121ac1 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -37,96 +37,390 @@
 #include "smb2pdu.h"
 #include "smb2proto.h"
 
-static int
-smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
-		   struct cifs_sb_info *cifs_sb, const char *full_path,
-		   __u32 desired_access, __u32 create_disposition,
-		   __u32 create_options, void *data, int command)
+static void
+free_set_inf_compound(struct smb_rqst *rqst)
 {
-	int rc, tmprc = 0;
+	if (rqst[1].rq_iov)
+		SMB2_set_info_free(&rqst[1]);
+	if (rqst[2].rq_iov)
+		SMB2_close_free(&rqst[2]);
+}
+
+static int
+smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
+		 struct cifs_sb_info *cifs_sb, const char *full_path,
+		 __u32 desired_access, __u32 create_disposition,
+		 __u32 create_options, umode_t mode, void *ptr, int command,
+		 struct cifsFileInfo *cfile)
+{
+	int rc;
 	__le16 *utf16_path = NULL;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
-	bool use_cached_root_handle = false;
+	struct cifs_ses *ses = tcon->ses;
+	int num_rqst = 0;
+	struct smb_rqst rqst[3];
+	int resp_buftype[3];
+	struct kvec rsp_iov[3];
+	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+	struct kvec qi_iov[1];
+	struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+	struct kvec close_iov[1];
+	struct smb2_query_info_rsp *qi_rsp = NULL;
+	int flags = 0;
+	__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+	unsigned int size[2];
+	void *data[2];
+	struct smb2_file_rename_info rename_info;
+	struct smb2_file_link_info link_info;
+	int len;
 
-	if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
-	    (desired_access == FILE_READ_ATTRIBUTES) &&
-	    (create_disposition == FILE_OPEN) &&
-	    (tcon->nohandlecache == false)) {
-		rc = open_shroot(xid, tcon, &fid);
-		if (rc == 0)
-			use_cached_root_handle = true;
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	memset(rqst, 0, sizeof(rqst));
+	resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+	memset(rsp_iov, 0, sizeof(rsp_iov));
+
+	/* We already have a handle so we can skip the open */
+	if (cfile)
+		goto after_open;
+
+	/* Open */
+	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+	if (!utf16_path) {
+		rc = -ENOMEM;
+		goto finished;
 	}
 
-	if (use_cached_root_handle == false) {
-		utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
-		if (!utf16_path)
-			return -ENOMEM;
+	oparms.tcon = tcon;
+	oparms.desired_access = desired_access;
+	oparms.disposition = create_disposition;
+	oparms.create_options = create_options;
+	if (backup_cred(cifs_sb))
+		oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+	oparms.mode = mode;
 
-		oparms.tcon = tcon;
-		oparms.desired_access = desired_access;
-		oparms.disposition = create_disposition;
-		oparms.create_options = create_options;
-		oparms.fid = &fid;
-		oparms.reconnect = false;
+	memset(&open_iov, 0, sizeof(open_iov));
+	rqst[num_rqst].rq_iov = open_iov;
+	rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
+	rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms,
+			    utf16_path);
+	kfree(utf16_path);
+	if (rc)
+		goto finished;
 
-		rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
-			       NULL);
-		if (rc) {
-			kfree(utf16_path);
-			return rc;
-		}
-	}
+	smb2_set_next_command(tcon, &rqst[num_rqst]);
+ after_open:
+	num_rqst++;
+	rc = 0;
 
+	/* Operation */
 	switch (command) {
-	case SMB2_OP_DELETE:
-		break;
 	case SMB2_OP_QUERY_INFO:
-		tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
-					fid.volatile_fid,
-					(struct smb2_file_all_info *)data);
+		memset(&qi_iov, 0, sizeof(qi_iov));
+		rqst[num_rqst].rq_iov = qi_iov;
+		rqst[num_rqst].rq_nvec = 1;
+
+		if (cfile)
+			rc = SMB2_query_info_init(tcon, &rqst[num_rqst],
+				cfile->fid.persistent_fid,
+				cfile->fid.volatile_fid,
+				FILE_ALL_INFORMATION,
+				SMB2_O_INFO_FILE, 0,
+				sizeof(struct smb2_file_all_info) +
+					  PATH_MAX * 2, 0, NULL);
+		else {
+			rc = SMB2_query_info_init(tcon, &rqst[num_rqst],
+				COMPOUND_FID,
+				COMPOUND_FID,
+				 FILE_ALL_INFORMATION,
+				SMB2_O_INFO_FILE, 0,
+				sizeof(struct smb2_file_all_info) +
+					  PATH_MAX * 2, 0, NULL);
+			if (!rc) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			}
+		}
+
+		if (rc)
+			goto finished;
+		num_rqst++;
+		trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
+						     full_path);
+		break;
+	case SMB2_OP_DELETE:
+		trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
 		break;
 	case SMB2_OP_MKDIR:
 		/*
 		 * Directories are created through parameters in the
 		 * SMB2_open() call.
 		 */
+		trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
 		break;
 	case SMB2_OP_RMDIR:
-		tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid,
-				   fid.volatile_fid);
-		break;
-	case SMB2_OP_RENAME:
-		tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
-				    fid.volatile_fid, (__le16 *)data);
-		break;
-	case SMB2_OP_HARDLINK:
-		tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
-					  fid.volatile_fid, (__le16 *)data);
+		memset(&si_iov, 0, sizeof(si_iov));
+		rqst[num_rqst].rq_iov = si_iov;
+		rqst[num_rqst].rq_nvec = 1;
+
+		size[0] = 1; /* sizeof(__u8), see MS-FSCC section 2.4.11 */
+		data[0] = &delete_pending[0];
+
+		rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+					COMPOUND_FID, current->tgid,
+					FILE_DISPOSITION_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+		if (rc)
+			goto finished;
+		smb2_set_next_command(tcon, &rqst[num_rqst]);
+		smb2_set_related(&rqst[num_rqst++]);
+		trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
 		break;
 	case SMB2_OP_SET_EOF:
-		tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
-				     fid.volatile_fid, current->tgid,
-				     (__le64 *)data, false);
+		memset(&si_iov, 0, sizeof(si_iov));
+		rqst[num_rqst].rq_iov = si_iov;
+		rqst[num_rqst].rq_nvec = 1;
+
+		size[0] = 8; /* sizeof __le64 */
+		data[0] = ptr;
+
+		rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+					COMPOUND_FID, current->tgid,
+					FILE_END_OF_FILE_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+		if (rc)
+			goto finished;
+		smb2_set_next_command(tcon, &rqst[num_rqst]);
+		smb2_set_related(&rqst[num_rqst++]);
+		trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
 		break;
 	case SMB2_OP_SET_INFO:
-		tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
-				      fid.volatile_fid,
-				      (FILE_BASIC_INFO *)data);
+		memset(&si_iov, 0, sizeof(si_iov));
+		rqst[num_rqst].rq_iov = si_iov;
+		rqst[num_rqst].rq_nvec = 1;
+
+		size[0] = sizeof(FILE_BASIC_INFO);
+		data[0] = ptr;
+
+		if (cfile)
+			rc = SMB2_set_info_init(tcon, &rqst[num_rqst],
+				cfile->fid.persistent_fid,
+				cfile->fid.volatile_fid, current->tgid,
+				FILE_BASIC_INFORMATION,
+				SMB2_O_INFO_FILE, 0, data, size);
+		else {
+			rc = SMB2_set_info_init(tcon, &rqst[num_rqst],
+				COMPOUND_FID,
+				COMPOUND_FID, current->tgid,
+				FILE_BASIC_INFORMATION,
+				SMB2_O_INFO_FILE, 0, data, size);
+			if (!rc) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			}
+		}
+
+		if (rc)
+			goto finished;
+		num_rqst++;
+		trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
+						   full_path);
+		break;
+	case SMB2_OP_RENAME:
+		memset(&si_iov, 0, sizeof(si_iov));
+		rqst[num_rqst].rq_iov = si_iov;
+		rqst[num_rqst].rq_nvec = 2;
+
+		len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+		rename_info.ReplaceIfExists = 1;
+		rename_info.RootDirectory = 0;
+		rename_info.FileNameLength = cpu_to_le32(len);
+
+		size[0] = sizeof(struct smb2_file_rename_info);
+		data[0] = &rename_info;
+
+		size[1] = len + 2 /* null */;
+		data[1] = (__le16 *)ptr;
+
+		if (cfile)
+			rc = SMB2_set_info_init(tcon, &rqst[num_rqst],
+						cfile->fid.persistent_fid,
+						cfile->fid.volatile_fid,
+					current->tgid, FILE_RENAME_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+		else {
+			rc = SMB2_set_info_init(tcon, &rqst[num_rqst],
+					COMPOUND_FID, COMPOUND_FID,
+					current->tgid, FILE_RENAME_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+			if (!rc) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			}
+		}
+		if (rc)
+			goto finished;
+		num_rqst++;
+		trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
+		break;
+	case SMB2_OP_HARDLINK:
+		memset(&si_iov, 0, sizeof(si_iov));
+		rqst[num_rqst].rq_iov = si_iov;
+		rqst[num_rqst].rq_nvec = 2;
+
+		len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+		link_info.ReplaceIfExists = 0;
+		link_info.RootDirectory = 0;
+		link_info.FileNameLength = cpu_to_le32(len);
+
+		size[0] = sizeof(struct smb2_file_link_info);
+		data[0] = &link_info;
+
+		size[1] = len + 2 /* null */;
+		data[1] = (__le16 *)ptr;
+
+		rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+					COMPOUND_FID, current->tgid,
+					FILE_LINK_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+		if (rc)
+			goto finished;
+		smb2_set_next_command(tcon, &rqst[num_rqst]);
+		smb2_set_related(&rqst[num_rqst++]);
+		trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
 		break;
 	default:
 		cifs_dbg(VFS, "Invalid command\n");
-		break;
+		rc = -EINVAL;
+	}
+	if (rc)
+		goto finished;
+
+	/* We already have a handle so we can skip the close */
+	if (cfile)
+		goto after_close;
+	/* Close */
+	memset(&close_iov, 0, sizeof(close_iov));
+	rqst[num_rqst].rq_iov = close_iov;
+	rqst[num_rqst].rq_nvec = 1;
+	rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+			     COMPOUND_FID);
+	smb2_set_related(&rqst[num_rqst]);
+	if (rc)
+		goto finished;
+ after_close:
+	num_rqst++;
+
+	if (cfile) {
+		cifsFileInfo_put(cfile);
+		cfile = NULL;
+		rc = compound_send_recv(xid, ses, flags, num_rqst - 2,
+					&rqst[1], &resp_buftype[1],
+					&rsp_iov[1]);
+	} else
+		rc = compound_send_recv(xid, ses, flags, num_rqst,
+					rqst, resp_buftype,
+					rsp_iov);
+
+ finished:
+	if (cfile)
+		cifsFileInfo_put(cfile);
+
+	SMB2_open_free(&rqst[0]);
+	if (rc == -EREMCHG) {
+		printk_once(KERN_WARNING "server share %s deleted\n",
+			    tcon->treeName);
+		tcon->need_reconnect = true;
 	}
 
-	if (use_cached_root_handle)
-		close_shroot(&tcon->crfid);
-	else
-		rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
-	if (tmprc)
-		rc = tmprc;
-	kfree(utf16_path);
+	switch (command) {
+	case SMB2_OP_QUERY_INFO:
+		if (rc == 0) {
+			qi_rsp = (struct smb2_query_info_rsp *)
+				rsp_iov[1].iov_base;
+			rc = smb2_validate_and_copy_iov(
+				le16_to_cpu(qi_rsp->OutputBufferOffset),
+				le32_to_cpu(qi_rsp->OutputBufferLength),
+				&rsp_iov[1], sizeof(struct smb2_file_all_info),
+				ptr);
+		}
+		if (rqst[1].rq_iov)
+			SMB2_query_info_free(&rqst[1]);
+		if (rqst[2].rq_iov)
+			SMB2_close_free(&rqst[2]);
+		if (rc)
+			trace_smb3_query_info_compound_err(xid,  ses->Suid,
+						tcon->tid, rc);
+		else
+			trace_smb3_query_info_compound_done(xid, ses->Suid,
+						tcon->tid);
+		break;
+	case SMB2_OP_DELETE:
+		if (rc)
+			trace_smb3_delete_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+		if (rqst[1].rq_iov)
+			SMB2_close_free(&rqst[1]);
+		break;
+	case SMB2_OP_MKDIR:
+		if (rc)
+			trace_smb3_mkdir_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
+		if (rqst[1].rq_iov)
+			SMB2_close_free(&rqst[1]);
+		break;
+	case SMB2_OP_HARDLINK:
+		if (rc)
+			trace_smb3_hardlink_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+		free_set_inf_compound(rqst);
+		break;
+	case SMB2_OP_RENAME:
+		if (rc)
+			trace_smb3_rename_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+		free_set_inf_compound(rqst);
+		break;
+	case SMB2_OP_RMDIR:
+		if (rc)
+			trace_smb3_rmdir_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+		free_set_inf_compound(rqst);
+		break;
+	case SMB2_OP_SET_EOF:
+		if (rc)
+			trace_smb3_set_eof_err(xid,  ses->Suid, tcon->tid, rc);
+		else
+			trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+		free_set_inf_compound(rqst);
+		break;
+	case SMB2_OP_SET_INFO:
+		if (rc)
+			trace_smb3_set_info_compound_err(xid,  ses->Suid,
+						tcon->tid, rc);
+		else
+			trace_smb3_set_info_compound_done(xid, ses->Suid,
+						tcon->tid);
+		free_set_inf_compound(rqst);
+		break;
+	}
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
 	return rc;
 }
 
@@ -147,6 +441,10 @@
 {
 	int rc;
 	struct smb2_file_all_info *smb2_data;
+	__u32 create_options = 0;
+	struct cifs_fid fid;
+	bool no_cached_open = tcon->nohandlecache;
+	struct cifsFileInfo *cfile;
 
 	*adjust_tz = false;
 	*symlink = false;
@@ -156,16 +454,41 @@
 	if (smb2_data == NULL)
 		return -ENOMEM;
 
-	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
-				smb2_data, SMB2_OP_QUERY_INFO);
+	/* If it is a root and its handle is cached then use it */
+	if (!strlen(full_path) && !no_cached_open) {
+		rc = open_shroot(xid, tcon, &fid);
+		if (rc)
+			goto out;
+
+		if (tcon->crfid.file_all_info_is_valid) {
+			move_smb2_info_to_cifs(data,
+					       &tcon->crfid.file_all_info);
+		} else {
+			rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+					     fid.volatile_fid, smb2_data);
+			if (!rc)
+				move_smb2_info_to_cifs(data, smb2_data);
+		}
+		close_shroot(&tcon->crfid);
+		goto out;
+	}
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	cifs_get_readable_path(tcon, full_path, &cfile);
+	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+			      FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
+			      ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
 	if (rc == -EOPNOTSUPP) {
 		*symlink = true;
+		create_options |= OPEN_REPARSE_POINT;
+
 		/* Failed on a symbolic link - query a reparse point info */
-		rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-					FILE_READ_ATTRIBUTES, FILE_OPEN,
-					OPEN_REPARSE_POINT, smb2_data,
-					SMB2_OP_QUERY_INFO);
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+				      FILE_READ_ATTRIBUTES, FILE_OPEN,
+				      create_options, ACL_NO_MODE,
+				      smb2_data, SMB2_OP_QUERY_INFO, NULL);
 	}
 	if (rc)
 		goto out;
@@ -177,12 +500,14 @@
 }
 
 int
-smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
+	   struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
-	return smb2_open_op_close(xid, tcon, cifs_sb, name,
-				  FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				  CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
+	return smb2_compound_op(xid, tcon, cifs_sb, name,
+				FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+				CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
+				NULL);
 }
 
 void
@@ -192,6 +517,7 @@
 {
 	FILE_BASIC_INFO data;
 	struct cifsInodeInfo *cifs_i;
+	struct cifsFileInfo *cfile;
 	u32 dosattrs;
 	int tmprc;
 
@@ -199,9 +525,11 @@
 	cifs_i = CIFS_I(inode);
 	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
 	data.Attributes = cpu_to_le32(dosattrs);
-	tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
-				   FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				   CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
+	cifs_get_writable_path(tcon, name, &cfile);
+	tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
+				 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+				 CREATE_NOT_FILE, ACL_NO_MODE,
+				 &data, SMB2_OP_SET_INFO, cfile);
 	if (tmprc == 0)
 		cifs_i->cifsAttrs = dosattrs;
 }
@@ -210,24 +538,25 @@
 smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
-	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  CREATE_NOT_FILE,
-				  NULL, SMB2_OP_RMDIR);
+	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+				CREATE_NOT_FILE, ACL_NO_MODE,
+				NULL, SMB2_OP_RMDIR, NULL);
 }
 
 int
 smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	    struct cifs_sb_info *cifs_sb)
 {
-	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
-				  NULL, SMB2_OP_DELETE);
+	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+				CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+				ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL);
 }
 
 static int
 smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 		   const char *from_name, const char *to_name,
-		   struct cifs_sb_info *cifs_sb, __u32 access, int command)
+		   struct cifs_sb_info *cifs_sb, __u32 access, int command,
+		   struct cifsFileInfo *cfile)
 {
 	__le16 *smb2_to_name = NULL;
 	int rc;
@@ -237,9 +566,9 @@
 		rc = -ENOMEM;
 		goto smb2_rename_path;
 	}
-
-	rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
-				FILE_OPEN, 0, smb2_to_name, command);
+	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
+			      FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
+			      command, cfile);
 smb2_rename_path:
 	kfree(smb2_to_name);
 	return rc;
@@ -250,8 +579,12 @@
 		 const char *from_name, const char *to_name,
 		 struct cifs_sb_info *cifs_sb)
 {
-	return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
-				  DELETE, SMB2_OP_RENAME);
+	struct cifsFileInfo *cfile;
+
+	cifs_get_writable_path(tcon, from_name, &cfile);
+
+	return smb2_set_path_attr(xid, tcon, from_name, to_name,
+				  cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
 }
 
 int
@@ -260,7 +593,8 @@
 		     struct cifs_sb_info *cifs_sb)
 {
 	return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
-				  FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK);
+				  FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK,
+				  NULL);
 }
 
 int
@@ -269,9 +603,10 @@
 		   struct cifs_sb_info *cifs_sb, bool set_alloc)
 {
 	__le64 eof = cpu_to_le64(size);
-	return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				  FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
-				  SMB2_OP_SET_EOF);
+
+	return smb2_compound_op(xid, tcon, cifs_sb, full_path,
+				FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
+				&eof, SMB2_OP_SET_EOF, NULL);
 }
 
 int
@@ -291,9 +626,9 @@
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
 
-	rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
-				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
-				SMB2_OP_SET_INFO);
+	rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
+			      FILE_WRITE_ATTRIBUTES, FILE_OPEN,
+			      0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL);
 	cifs_put_tlink(tlink);
 	return rc;
 }
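
The smb2_compound_op() rework above always builds up to three request slots: open, operation, close. When a cached open handle (cfile) is available, only the middle slot is submitted, which is why the send path passes num_rqst - 2 starting at rqst[1]. A minimal stand-alone model of that slot selection, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

enum { SLOT_OPEN, SLOT_OP, SLOT_CLOSE, NUM_SLOTS };

/* Returns the first slot and count to submit, mirroring the cfile shortcut. */
static void pick_slots(bool have_open_handle, int *first, int *count)
{
	if (have_open_handle) {
		*first = SLOT_OP;	/* skip open and close */
		*count = 1;
	} else {
		*first = SLOT_OPEN;	/* open + operation + close */
		*count = NUM_SLOTS;
	}
}

int main(void)
{
	int first, count;

	pick_slots(true, &first, &count);
	printf("cached handle: submit %d request(s) from slot %d\n",
	       count, first);
	pick_slots(false, &first, &count);
	printf("fresh path:    submit %d request(s) from slot %d\n",
	       count, first);
	return 0;
}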
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 20a2d30..7fde377 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -288,7 +288,7 @@
 	{STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"},
 	{STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"},
 	{STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"},
-	{STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"},
+	{STATUS_NOT_IMPLEMENTED, -EOPNOTSUPP, "STATUS_NOT_IMPLEMENTED"},
 	{STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"},
 	{STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"},
 	{STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"},
@@ -379,8 +379,8 @@
 	{STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"},
 	{STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"},
 	{STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"},
-	{STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"},
-	{STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"},
+	{STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"},
+	{STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"},
 	{STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"},
 	{STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS,
 	"STATUS_CTL_FILE_NOT_SUPPORTED"},
@@ -457,7 +457,7 @@
 	{STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"},
 	{STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO,
 	"STATUS_ALLOTTED_SPACE_EXCEEDED"},
-	{STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO,
+	{STATUS_INSUFFICIENT_RESOURCES, -EAGAIN,
 				"STATUS_INSUFFICIENT_RESOURCES"},
 	{STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"},
 	{STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"},
@@ -511,7 +511,7 @@
 	{STATUS_PRINT_QUEUE_FULL, -EIO, "STATUS_PRINT_QUEUE_FULL"},
 	{STATUS_NO_SPOOL_SPACE, -EIO, "STATUS_NO_SPOOL_SPACE"},
 	{STATUS_PRINT_CANCELLED, -EIO, "STATUS_PRINT_CANCELLED"},
-	{STATUS_NETWORK_NAME_DELETED, -EIO, "STATUS_NETWORK_NAME_DELETED"},
+	{STATUS_NETWORK_NAME_DELETED, -EREMCHG, "STATUS_NETWORK_NAME_DELETED"},
 	{STATUS_NETWORK_ACCESS_DENIED, -EACCES, "STATUS_NETWORK_ACCESS_DENIED"},
 	{STATUS_BAD_DEVICE_TYPE, -EIO, "STATUS_BAD_DEVICE_TYPE"},
 	{STATUS_BAD_NETWORK_NAME, -ENOENT, "STATUS_BAD_NETWORK_NAME"},
@@ -1036,7 +1036,8 @@
 	{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
 	"STATUS_UNFINISHED_CONTEXT_DELETED"},
 	{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
-	{STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+	/* Note that ENOATTR and ENODATA are the same errno */
+	{STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"},
 	{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
 	{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
 	"STATUS_WRONG_CREDENTIAL_HANDLE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a9c475..e311f58 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@
 	__u8 lease_state;
 	struct list_head *tmp;
 	struct cifsFileInfo *cfile;
-	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_pending_open *open;
 	struct cifsInodeInfo *cinode;
 	int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,14 +536,26 @@
 		cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
 			 le32_to_cpu(rsp->NewLeaseState));
 
-		server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
 		if (ack_req)
 			cfile->oplock_break_cancelled = false;
 		else
 			cfile->oplock_break_cancelled = true;
 
-		queue_work(cifsoplockd_wq, &cfile->oplock_break);
+		set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+		/*
+		 * Set or clear flags depending on the lease state being READ.
+		 * HANDLE caching flag should be added when the client starts
+		 * to defer closing remote file handles with HANDLE leases.
+		 */
+		if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+			set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+				&cinode->flags);
+		else
+			clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+				  &cinode->flags);
+
+		cifs_queue_oplock_break(cfile);
 		kfree(lw);
 		return true;
 	}
@@ -701,8 +712,8 @@
 					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 					   &cinode->flags);
 				spin_unlock(&cfile->file_info_lock);
-				queue_work(cifsoplockd_wq,
-					   &cfile->oplock_break);
+
+				cifs_queue_oplock_break(cfile);
 
 				spin_unlock(&tcon->open_file_lock);
 				spin_unlock(&cifs_tcp_ses_lock);
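
The lease-break change above stops calling set_oplock_level() directly from the demultiplex thread; instead it records the pending break in inode flag bits and lets the queued oplock worker act on them, downgrading to a level II oplock only when the new lease state still grants read caching. A stand-alone model of the flag logic; the bit values are stand-ins, not the kernel's:

#include <stdio.h>

/* Hypothetical stand-ins for the lease/flag bits used above. */
#define LEASE_READ_CACHING	0x01
#define F_PENDING_OPLOCK_BREAK	(1u << 0)
#define F_DOWNGRADE_TO_L2	(1u << 1)

/* Mirrors the set/clear decisions in the lease-break handler. */
static unsigned int lease_break_flags(unsigned int flags,
				      unsigned int new_lease_state)
{
	flags |= F_PENDING_OPLOCK_BREAK;
	if (new_lease_state & LEASE_READ_CACHING)
		flags |= F_DOWNGRADE_TO_L2;	/* keep level II caching */
	else
		flags &= ~F_DOWNGRADE_TO_L2;	/* break to none */
	return flags;
}

int main(void)
{
	printf("read lease kept:  0x%x\n",
	       lease_break_flags(0, LEASE_READ_CACHING));
	printf("lease fully lost: 0x%x\n", lease_break_flags(0, 0));
	return 0;
}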
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 812da3e..cd55af9 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  SMB2 version specific operations
  *
  *  Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
- *
- *  This library is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License v2 as published
- *  by the Free Software Foundation.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *  the GNU Lesser General Public License for more details.
- *
- *  You should have received a copy of the GNU Lesser General Public License
- *  along with this library; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/pagemap.h>
@@ -34,6 +22,7 @@
 #include "cifs_ioctl.h"
 #include "smbdirect.h"
 
+/* Change credits for different ops and return the total number of credits */
 static int
 change_conf(struct TCP_Server_Info *server)
 {
@@ -41,17 +30,15 @@
 	server->oplock_credits = server->echo_credits = 0;
 	switch (server->credits) {
 	case 0:
-		return -1;
+		return 0;
 	case 1:
 		server->echoes = false;
 		server->oplocks = false;
-		cifs_dbg(VFS, "disabling echoes and oplocks\n");
 		break;
 	case 2:
 		server->echoes = true;
 		server->oplocks = false;
 		server->echo_credits = 1;
-		cifs_dbg(FYI, "disabling oplocks\n");
 		break;
 	default:
 		server->echoes = true;
@@ -64,17 +51,30 @@
 		server->echo_credits = 1;
 	}
 	server->credits -= server->echo_credits + server->oplock_credits;
-	return 0;
+	return server->credits + server->echo_credits + server->oplock_credits;
 }
 
 static void
-smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
-		 const int optype)
+smb2_add_credits(struct TCP_Server_Info *server,
+		 const struct cifs_credits *credits, const int optype)
 {
-	int *val, rc = 0;
+	int *val, rc = -1;
+	unsigned int add = credits->value;
+	unsigned int instance = credits->instance;
+	bool reconnect_detected = false;
+
 	spin_lock(&server->req_lock);
 	val = server->ops->get_credits_field(server, optype);
-	*val += add;
+
+	/* e.g. found a case where a write overlapping reconnect messed up credits */
+	if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
+		trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
+			server->hostname, *val);
+	if ((instance == 0) || (instance == server->reconnect_instance))
+		*val += add;
+	else
+		reconnect_detected = true;
+
 	if (*val > 65000) {
 		*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
 		printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
@@ -95,8 +95,31 @@
 	}
 	spin_unlock(&server->req_lock);
 	wake_up(&server->request_q);
-	if (rc)
-		cifs_reconnect(server);
+
+	if (reconnect_detected)
+		cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
+			 add, instance);
+
+	if (server->tcpStatus == CifsNeedReconnect
+	    || server->tcpStatus == CifsExiting)
+		return;
+
+	switch (rc) {
+	case -1:
+		/* change_conf hasn't been executed */
+		break;
+	case 0:
+		cifs_server_dbg(VFS, "Possible client or server bug - zero credits\n");
+		break;
+	case 1:
+		cifs_server_dbg(VFS, "disabling echoes and oplocks\n");
+		break;
+	case 2:
+		cifs_dbg(FYI, "disabling oplocks\n");
+		break;
+	default:
+		cifs_dbg(FYI, "add %u credits total=%d\n", add, rc);
+	}
 }
 
 static void
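
The credit rework above tags every granted credit with the server's reconnect_instance, so credits earned before a reconnect can be detected and dropped rather than re-banked against the new session. A stand-alone model of that instance check; the 65000 cap mirrors the hunk, everything else is illustrative:

#include <stdbool.h>
#include <stdio.h>

struct server_credits {
	unsigned int credits;
	unsigned int reconnect_instance;
};

static bool add_credits(struct server_credits *srv,
			unsigned int value, unsigned int instance)
{
	/* instance 0 means "don't care", as in the hunk above */
	if (instance == 0 || instance == srv->reconnect_instance) {
		srv->credits += value;
		if (srv->credits > 65000)
			srv->credits = 65000;	/* avoid server bugs near 64K */
		return true;
	}
	return false;	/* stale credits from an old server instance */
}

int main(void)
{
	struct server_credits srv = { .credits = 10, .reconnect_instance = 2 };

	printf("current instance accepted: %d\n", add_credits(&srv, 5, 2));
	printf("stale instance rejected:   %d\n", add_credits(&srv, 5, 1));
	printf("credits now: %u\n", srv.credits);
	return 0;
}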
@@ -104,7 +127,12 @@
 {
 	spin_lock(&server->req_lock);
 	server->credits = val;
+	if (val == 1)
+		server->reconnect_instance++;
 	spin_unlock(&server->req_lock);
+	/* don't log while holding the lock */
+	if (val == 1)
+		cifs_dbg(FYI, "set credits to 1 due to smb2 reconnect\n");
 }
 
 static int *
@@ -125,12 +153,16 @@
 {
 	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
 
-	return le16_to_cpu(shdr->CreditRequest);
+	if (mid->mid_state == MID_RESPONSE_RECEIVED
+	    || mid->mid_state == MID_RESPONSE_MALFORMED)
+		return le16_to_cpu(shdr->CreditRequest);
+
+	return 0;
 }
 
 static int
 smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
-		      unsigned int *num, unsigned int *credits)
+		      unsigned int *num, struct cifs_credits *credits)
 {
 	int rc = 0;
 	unsigned int scredits;
@@ -141,7 +173,7 @@
 			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
 			rc = wait_event_killable(server->request_q,
-					has_credits(server, &server->credits));
+				has_credits(server, &server->credits, 1));
 			cifs_num_waiters_dec(server);
 			if (rc)
 				return rc;
@@ -154,20 +186,25 @@
 
 			scredits = server->credits;
 			/* can deadlock with reopen */
-			if (scredits == 1) {
+			if (scredits <= 8) {
 				*num = SMB2_MAX_BUFFER_SIZE;
-				*credits = 0;
+				credits->value = 0;
+				credits->instance = 0;
 				break;
 			}
 
-			/* leave one credit for a possible reopen */
-			scredits--;
+			/* leave some credits for reopen and other ops */
+			scredits -= 8;
 			*num = min_t(unsigned int, size,
 				     scredits * SMB2_MAX_BUFFER_SIZE);
 
-			*credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
-			server->credits -= *credits;
+			credits->value =
+				DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+			credits->instance = server->reconnect_instance;
+			server->credits -= credits->value;
 			server->in_flight++;
+			if (server->in_flight > server->max_in_flight)
+				server->max_in_flight = server->in_flight;
 			break;
 		}
 	}
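
smb2_wait_mtu_credits() now holds back 8 credits (instead of 1) for reopens and other small operations before carving the remainder into large-MTU reads or writes. A sketch of that carve-out arithmetic, with SMB2_MAX_BUFFER_SIZE as a stand-in value and a hypothetical function name:

#include <stdio.h>

#define SMB2_MAX_BUFFER_SIZE 65536	/* one credit's worth of payload */
#define RESERVED_CREDITS 8		/* kept back for reopen etc. */

static unsigned int mtu_request_size(unsigned int credits_avail,
				     unsigned int wanted,
				     unsigned int *credits_charged)
{
	if (credits_avail <= RESERVED_CREDITS) {
		*credits_charged = 0;	/* fall back to a single buffer */
		return SMB2_MAX_BUFFER_SIZE;
	}
	credits_avail -= RESERVED_CREDITS;

	unsigned int num = wanted;

	if (num > credits_avail * SMB2_MAX_BUFFER_SIZE)
		num = credits_avail * SMB2_MAX_BUFFER_SIZE;
	*credits_charged = (num + SMB2_MAX_BUFFER_SIZE - 1)
				/ SMB2_MAX_BUFFER_SIZE;
	return num;
}

int main(void)
{
	unsigned int charged;
	unsigned int num = mtu_request_size(20, 1 << 20, &charged);

	printf("granted %u bytes for %u credits\n", num, charged);
	return 0;
}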
@@ -175,6 +212,38 @@
 	return rc;
 }
 
+static int
+smb2_adjust_credits(struct TCP_Server_Info *server,
+		    struct cifs_credits *credits,
+		    const unsigned int payload_size)
+{
+	int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+
+	if (!credits->value || credits->value == new_val)
+		return 0;
+
+	if (credits->value < new_val) {
+		WARN_ONCE(1, "request has less credits (%d) than required (%d)",
+			  credits->value, new_val);
+		return -ENOTSUPP;
+	}
+
+	spin_lock(&server->req_lock);
+
+	if (server->reconnect_instance != credits->instance) {
+		spin_unlock(&server->req_lock);
+		cifs_server_dbg(VFS, "trying to return %d credits to old session\n",
+			 credits->value - new_val);
+		return -EAGAIN;
+	}
+
+	server->credits += credits->value - new_val;
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+	credits->value = new_val;
+	return 0;
+}
+
 static __u64
 smb2_get_next_mid(struct TCP_Server_Info *server)
 {
@@ -186,6 +255,15 @@
 	return mid;
 }
 
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+	spin_lock(&GlobalMid_Lock);
+	if (server->CurrentMid >= val)
+		server->CurrentMid -= val;
+	spin_unlock(&GlobalMid_Lock);
+}
+
 static struct mid_q_entry *
 smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 {
@@ -194,7 +272,7 @@
 	__u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
 	if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
-		cifs_dbg(VFS, "encrypted frame parsing not supported yet");
+		cifs_server_dbg(VFS, "Encrypted frame parsing not supported yet\n");
 		return NULL;
 	}
 
@@ -218,10 +296,10 @@
 #ifdef CONFIG_CIFS_DEBUG2
 	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 
-	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
+	cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
 		 shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
 		 shdr->ProcessId);
-	cifs_dbg(VFS, "smb buf %p len %u\n", buf,
+	cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
 		 server->ops->calc_smb_size(buf, server));
 #endif
 }
@@ -236,6 +314,7 @@
 smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 {
 	int rc;
+
 	ses->server->CurrentMid = 0;
 	rc = SMB2_negotiate(xid, ses);
 	/* BB we probably don't need to retry with modern servers */
@@ -270,6 +349,31 @@
 }
 
 static unsigned int
+smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	wsize = volume_info->wsize ? volume_info->wsize : SMB3_DEFAULT_IOSIZE;
+	wsize = min_t(unsigned int, wsize, server->max_write);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	if (server->rdma) {
+		if (server->sign)
+			wsize = min_t(unsigned int,
+				wsize, server->smbd_conn->max_fragmented_send_size);
+		else
+			wsize = min_t(unsigned int,
+				wsize, server->smbd_conn->max_readwrite_size);
+	}
+#endif
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+	return wsize;
+}
+
+static unsigned int
 smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 {
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -295,6 +399,31 @@
 	return rsize;
 }
 
+static unsigned int
+smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+	unsigned int rsize;
+
+	/* start with specified rsize, or default */
+	rsize = volume_info->rsize ? volume_info->rsize : SMB3_DEFAULT_IOSIZE;
+	rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	if (server->rdma) {
+		if (server->sign)
+			rsize = min_t(unsigned int,
+				rsize, server->smbd_conn->max_fragmented_recv_size);
+		else
+			rsize = min_t(unsigned int,
+				rsize, server->smbd_conn->max_readwrite_size);
+	}
+#endif
+
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+	return rsize;
+}
 
 static int
 parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
@@ -443,13 +572,13 @@
 	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
 			FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
 			NULL /* no data input */, 0 /* no data input */,
-			(char **)&out_buf, &ret_data_len);
+			CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
 	if (rc == -EOPNOTSUPP) {
 		cifs_dbg(FYI,
 			 "server does not support query network interfaces\n");
 		goto out;
 	} else if (rc != 0) {
-		cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+		cifs_tcon_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
 		goto out;
 	}
 
@@ -481,6 +610,7 @@
 		SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
 			   cfid->fid->volatile_fid);
 		cfid->is_valid = false;
+		cfid->file_all_info_is_valid = false;
 	}
 }
 
@@ -505,9 +635,18 @@
  */
 int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
 {
-	struct cifs_open_parms oparams;
-	int rc;
-	__le16 srch_path = 0; /* Null - since an open of top of share */
+	struct cifs_ses *ses = tcon->ses;
+	struct TCP_Server_Info *server = ses->server;
+	struct cifs_open_parms oparms;
+	struct smb2_create_rsp *o_rsp = NULL;
+	struct smb2_query_info_rsp *qi_rsp = NULL;
+	int resp_buftype[2];
+	struct smb_rqst rqst[2];
+	struct kvec rsp_iov[2];
+	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+	struct kvec qi_iov[1];
+	int rc, flags = 0;
+	__le16 utf16_path = 0; /* Null - since an open of top of share */
 	u8 oplock = SMB2_OPLOCK_LEVEL_II;
 
 	mutex_lock(&tcon->crfid.fid_mutex);
@@ -519,22 +658,139 @@
 		return 0;
 	}
 
-	oparams.tcon = tcon;
-	oparams.create_options = 0;
-	oparams.desired_access = FILE_READ_ATTRIBUTES;
-	oparams.disposition = FILE_OPEN;
-	oparams.fid = pfid;
-	oparams.reconnect = false;
+	/*
+	 * We do not hold the lock for the open because in case
+	 * SMB2_open needs to reconnect, it will end up calling
+	 * cifs_mark_open_files_invalid(), which takes the lock again,
+	 * thus causing a deadlock.
+	 */
 
-	rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL);
-	if (rc == 0) {
-		memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
-		tcon->crfid.tcon = tcon;
-		tcon->crfid.is_valid = true;
-		kref_init(&tcon->crfid.refcount);
-		kref_get(&tcon->crfid.refcount);
-	}
 	mutex_unlock(&tcon->crfid.fid_mutex);
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	memset(rqst, 0, sizeof(rqst));
+	resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+	memset(rsp_iov, 0, sizeof(rsp_iov));
+
+	/* Open */
+	memset(&open_iov, 0, sizeof(open_iov));
+	rqst[0].rq_iov = open_iov;
+	rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+	oparms.tcon = tcon;
+	oparms.create_options = 0;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.fid = pfid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path);
+	if (rc)
+		goto oshr_free;
+	smb2_set_next_command(tcon, &rqst[0]);
+
+	memset(&qi_iov, 0, sizeof(qi_iov));
+	rqst[1].rq_iov = qi_iov;
+	rqst[1].rq_nvec = 1;
+
+	rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+				  COMPOUND_FID, FILE_ALL_INFORMATION,
+				  SMB2_O_INFO_FILE, 0,
+				  sizeof(struct smb2_file_all_info) +
+				  PATH_MAX * 2, 0, NULL);
+	if (rc)
+		goto oshr_free;
+
+	smb2_set_related(&rqst[1]);
+
+	rc = compound_send_recv(xid, ses, flags, 2, rqst,
+				resp_buftype, rsp_iov);
+	mutex_lock(&tcon->crfid.fid_mutex);
+
+	/*
+	 * Now we need to check again as the cached root might have
+	 * been successfully re-opened by a concurrent process.
+	 */
+
+	if (tcon->crfid.is_valid) {
+		/* work was already done */
+
+		/* stash fids for close() later */
+		struct cifs_fid fid = {
+			.persistent_fid = pfid->persistent_fid,
+			.volatile_fid = pfid->volatile_fid,
+		};
+
+		/*
+		 * caller expects this func to set pfid to a valid
+		 * cached root, so we copy the existing one and get a
+		 * reference.
+		 */
+		memcpy(pfid, tcon->crfid.fid, sizeof(*pfid));
+		kref_get(&tcon->crfid.refcount);
+
+		mutex_unlock(&tcon->crfid.fid_mutex);
+
+		if (rc == 0) {
+			/* close extra handle outside of crit sec */
+			SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+		}
+		goto oshr_free;
+	}
+
+	/* Cached root is still invalid, continue normally */
+
+	if (rc) {
+		if (rc == -EREMCHG) {
+			tcon->need_reconnect = true;
+			printk_once(KERN_WARNING "server share %s deleted\n",
+				    tcon->treeName);
+		}
+		goto oshr_exit;
+	}
+
+	atomic_inc(&tcon->num_remote_opens);
+
+	o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+	oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+	oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+	oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+	memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
+	tcon->crfid.tcon = tcon;
+	tcon->crfid.is_valid = true;
+	kref_init(&tcon->crfid.refcount);
+
+	/* BB TBD check to see if oplock level check can be removed below */
+	if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
+		kref_get(&tcon->crfid.refcount);
+		smb2_parse_contexts(server, o_rsp,
+				&oparms.fid->epoch,
+				oparms.fid->lease_key, &oplock, NULL);
+	} else
+		goto oshr_exit;
+
+	qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+	if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+		goto oshr_exit;
+	if (!smb2_validate_and_copy_iov(
+				le16_to_cpu(qi_rsp->OutputBufferOffset),
+				sizeof(struct smb2_file_all_info),
+				&rsp_iov[1], sizeof(struct smb2_file_all_info),
+				(char *)&tcon->crfid.file_all_info))
+		tcon->crfid.file_all_info_is_valid = 1;
+
+oshr_exit:
+	mutex_unlock(&tcon->crfid.fid_mutex);
+oshr_free:
+	SMB2_open_free(&rqst[0]);
+	SMB2_query_info_free(&rqst[1]);
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
 	return rc;
 }
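
open_shroot() above deliberately drops crfid.fid_mutex across the compound open (holding it could deadlock through cifs_mark_open_files_invalid()), then re-takes it and re-checks is_valid, closing its own extra handle if a concurrent caller populated the cache first. A user-space model of that double-checked pattern; all names here are hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cached_fid {
	pthread_mutex_t lock;
	bool is_valid;
	int refcount;
	int fid;
};

static int slow_open(void) { return 42; }	/* stand-in for SMB2 open */
static void close_fid(int fid) { (void)fid; }

static int open_cached_root(struct cached_fid *c, int *out)
{
	pthread_mutex_lock(&c->lock);
	if (c->is_valid) {		/* fast path: reuse cached handle */
		c->refcount++;
		*out = c->fid;
		pthread_mutex_unlock(&c->lock);
		return 0;
	}
	pthread_mutex_unlock(&c->lock);	/* don't hold lock across open */

	int fid = slow_open();

	pthread_mutex_lock(&c->lock);
	if (c->is_valid) {		/* someone else won the race */
		c->refcount++;
		*out = c->fid;
		pthread_mutex_unlock(&c->lock);
		close_fid(fid);		/* close the extra handle */
		return 0;
	}
	c->fid = fid;
	c->is_valid = true;
	c->refcount = 1;
	*out = fid;
	pthread_mutex_unlock(&c->lock);
	return 0;
}

int main(void)
{
	struct cached_fid c = { PTHREAD_MUTEX_INITIALIZER, false, 0, -1 };
	int fid;

	open_cached_root(&c, &fid);
	printf("got fid %d, refcount %d\n", fid, c.refcount);
	return 0;
}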
 
@@ -578,8 +834,6 @@
 		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	else
 		close_shroot(&tcon->crfid);
-
-	return;
 }
 
 static void
@@ -607,7 +861,6 @@
 	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
 			FS_DEVICE_INFORMATION);
 	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
-	return;
 }
 
 static int
@@ -695,9 +948,8 @@
 		value = &src->ea_data[src->ea_name_length + 1];
 		value_len = (size_t)le16_to_cpu(src->ea_value_length);
 
-		if (name_len == 0) {
+		if (name_len == 0)
 			break;
-		}
 
 		if (src_size < 8 + name_len + 1 + value_len) {
 			cifs_dbg(FYI, "EA entry goes beyond length of list\n");
@@ -770,72 +1022,50 @@
 {
 	int rc;
 	__le16 *utf16_path;
-	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
-	struct cifs_open_parms oparms;
-	struct cifs_fid fid;
-	struct smb2_file_full_ea_info *smb2_data;
-	int ea_buf_size = SMB2_MIN_EA_BUF;
+	struct kvec rsp_iov = {NULL, 0};
+	int buftype = CIFS_NO_BUFFER;
+	struct smb2_query_info_rsp *rsp;
+	struct smb2_file_full_ea_info *info = NULL;
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
-	oparms.tcon = tcon;
-	oparms.desired_access = FILE_READ_EA;
-	oparms.disposition = FILE_OPEN;
-	if (backup_cred(cifs_sb))
-		oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
-	else
-		oparms.create_options = 0;
-	oparms.fid = &fid;
-	oparms.reconnect = false;
-
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
-	kfree(utf16_path);
+	rc = smb2_query_info_compound(xid, tcon, utf16_path,
+				      FILE_READ_EA,
+				      FILE_FULL_EA_INFORMATION,
+				      SMB2_O_INFO_FILE,
+				      CIFSMaxBufSize -
+				      MAX_SMB2_CREATE_RESPONSE_SIZE -
+				      MAX_SMB2_CLOSE_RESPONSE_SIZE,
+				      &rsp_iov, &buftype, cifs_sb);
 	if (rc) {
-		cifs_dbg(FYI, "open failed rc=%d\n", rc);
-		return rc;
+		/*
+		 * If ea_name is NULL (listxattr) and there are no EAs,
+		 * return 0 as it's not an error. Otherwise, the specified
+		 * ea_name was not found.
+		 */
+		if (!ea_name && rc == -ENODATA)
+			rc = 0;
+		goto qeas_exit;
 	}
 
-	while (1) {
-		smb2_data = kzalloc(ea_buf_size, GFP_KERNEL);
-		if (smb2_data == NULL) {
-			SMB2_close(xid, tcon, fid.persistent_fid,
-				   fid.volatile_fid);
-			return -ENOMEM;
-		}
+	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
+	rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
+			       le32_to_cpu(rsp->OutputBufferLength),
+			       &rsp_iov,
+			       sizeof(struct smb2_file_full_ea_info));
+	if (rc)
+		goto qeas_exit;
 
-		rc = SMB2_query_eas(xid, tcon, fid.persistent_fid,
-				    fid.volatile_fid,
-				    ea_buf_size, smb2_data);
+	info = (struct smb2_file_full_ea_info *)(
+			le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
+	rc = move_smb2_ea_to_cifs(ea_data, buf_size, info,
+			le32_to_cpu(rsp->OutputBufferLength), ea_name);
 
-		if (rc != -E2BIG)
-			break;
-
-		kfree(smb2_data);
-		ea_buf_size <<= 1;
-
-		if (ea_buf_size > SMB2_MAX_EA_BUF) {
-			cifs_dbg(VFS, "EA size is too large\n");
-			SMB2_close(xid, tcon, fid.persistent_fid,
-				   fid.volatile_fid);
-			return -ENOMEM;
-		}
-	}
-
-	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
-
-	/*
-	 * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's
-	 * not an error. Otherwise, the specified ea_name was not found.
-	 */
-	if (!rc)
-		rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data,
-					  SMB2_MAX_EA_BUF, ea_name);
-	else if (!ea_name && rc == -ENODATA)
-		rc = 0;
-
-	kfree(smb2_data);
+ qeas_exit:
+	kfree(utf16_path);
+	free_rsp_buf(buftype, rsp_iov.iov_base);
 	return rc;
 }
 
@@ -846,14 +1076,27 @@
 	    const __u16 ea_value_len, const struct nls_table *nls_codepage,
 	    struct cifs_sb_info *cifs_sb)
 {
-	int rc;
-	__le16 *utf16_path;
-	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
-	struct cifs_open_parms oparms;
-	struct cifs_fid fid;
-	struct smb2_file_full_ea_info *ea;
+	struct cifs_ses *ses = tcon->ses;
+	__le16 *utf16_path = NULL;
 	int ea_name_len = strlen(ea_name);
+	int flags = 0;
 	int len;
+	struct smb_rqst rqst[3];
+	int resp_buftype[3];
+	struct kvec rsp_iov[3];
+	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+	struct cifs_open_parms oparms;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_fid fid;
+	struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+	unsigned int size[1];
+	void *data[1];
+	struct smb2_file_full_ea_info *ea = NULL;
+	struct kvec close_iov[1];
+	int rc;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
 
 	if (ea_name_len > 255)
 		return -EINVAL;
@@ -862,6 +1105,26 @@
 	if (!utf16_path)
 		return -ENOMEM;
 
+	memset(rqst, 0, sizeof(rqst));
+	resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+	memset(rsp_iov, 0, sizeof(rsp_iov));
+
+	if (ses->server->ops->query_all_EAs) {
+		if (!ea_value) {
+			rc = ses->server->ops->query_all_EAs(xid, tcon, path,
+							     ea_name, NULL, 0,
+							     cifs_sb);
+			if (rc == -ENODATA)
+				goto sea_exit;
+		}
+	}
+
+	/* Open */
+	memset(&open_iov, 0, sizeof(open_iov));
+	rqst[0].rq_iov = open_iov;
+	rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+	memset(&oparms, 0, sizeof(oparms));
 	oparms.tcon = tcon;
 	oparms.desired_access = FILE_WRITE_EA;
 	oparms.disposition = FILE_OPEN;
@@ -872,18 +1135,22 @@
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
-	kfree(utf16_path);
-	if (rc) {
-		cifs_dbg(FYI, "open failed rc=%d\n", rc);
-		return rc;
-	}
+	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
+	if (rc)
+		goto sea_exit;
+	smb2_set_next_command(tcon, &rqst[0]);
+
+
+	memset(&si_iov, 0, sizeof(si_iov));
+	rqst[1].rq_iov = si_iov;
+	rqst[1].rq_nvec = 1;
 
 	len = sizeof(ea) + ea_name_len + ea_value_len + 1;
 	ea = kzalloc(len, GFP_KERNEL);
 	if (ea == NULL) {
-		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto sea_exit;
 	}
 
 	ea->ea_name_length = ea_name_len;
@@ -891,12 +1158,37 @@
 	memcpy(ea->ea_data, ea_name, ea_name_len + 1);
 	memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len);
 
-	rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea,
-			 len);
+	size[0] = len;
+	data[0] = ea;
+
+	rc = SMB2_set_info_init(tcon, &rqst[1], COMPOUND_FID,
+				COMPOUND_FID, current->tgid,
+				FILE_FULL_EA_INFORMATION,
+				SMB2_O_INFO_FILE, 0, data, size);
+	smb2_set_next_command(tcon, &rqst[1]);
+	smb2_set_related(&rqst[1]);
+
+	/* Close */
+	memset(&close_iov, 0, sizeof(close_iov));
+	rqst[2].rq_iov = close_iov;
+	rqst[2].rq_nvec = 1;
+	rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+	smb2_set_related(&rqst[2]);
+
+	rc = compound_send_recv(xid, ses, flags, 3, rqst,
+				resp_buftype, rsp_iov);
+	/* no need to bump num_remote_opens because handle immediately closed */
+
+ sea_exit:
 	kfree(ea);
-
-	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
-
+	kfree(utf16_path);
+	SMB2_open_free(&rqst[0]);
+	SMB2_set_info_free(&rqst[1]);
+	SMB2_close_free(&rqst[2]);
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
 	return rc;
 }
 #endif
@@ -911,6 +1203,7 @@
 smb2_clear_stats(struct cifs_tcon *tcon)
 {
 	int i;
+
 	for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
 		atomic_set(&tcon->stats.smb2_stats.smb2_com_sent[i], 0);
 		atomic_set(&tcon->stats.smb2_stats.smb2_com_failed[i], 0);
@@ -963,6 +1256,9 @@
 	seq_printf(m, "\nBytes read: %llu  Bytes written: %llu",
 		   (long long)(tcon->bytes_read),
 		   (long long)(tcon->bytes_written));
+	seq_printf(m, "\nOpen files: %d total (local), %d open on server",
+		   atomic_read(&tcon->num_local_opens),
+		   atomic_read(&tcon->num_remote_opens));
 	seq_printf(m, "\nTreeConnects: %d total %d failed",
 		   atomic_read(&sent[SMB2_TREE_CONNECT_HE]),
 		   atomic_read(&failed[SMB2_TREE_CONNECT_HE]));
@@ -1015,6 +1311,9 @@
 
 	cfile->fid.persistent_fid = fid->persistent_fid;
 	cfile->fid.volatile_fid = fid->volatile_fid;
+#ifdef CONFIG_CIFS_DEBUG2
+	cfile->fid.mid = fid->mid;
+#endif /* CIFS_DEBUG2 */
 	server->ops->set_oplock_level(cinode, oplock, fid->epoch,
 				      &fid->purge_cache);
 	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
@@ -1039,15 +1338,15 @@
 
 	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
 			FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
-			NULL, 0 /* no input */,
+			NULL, 0 /* no input */, CIFSMaxBufSize,
 			(char **)&res_key, &ret_data_len);
 
 	if (rc) {
-		cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
+		cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
 		goto req_res_key_exit;
 	}
 	if (ret_data_len < sizeof(struct resume_key_req)) {
-		cifs_dbg(VFS, "Invalid refcopy resume key length\n");
+		cifs_tcon_dbg(VFS, "Invalid refcopy resume key length\n");
 		rc = -EINVAL;
 		goto req_res_key_exit;
 	}
@@ -1058,6 +1357,224 @@
 	return rc;
 }
 
+static int
+smb2_ioctl_query_info(const unsigned int xid,
+		      struct cifs_tcon *tcon,
+		      __le16 *path, int is_dir,
+		      unsigned long p)
+{
+	struct cifs_ses *ses = tcon->ses;
+	char __user *arg = (char __user *)p;
+	struct smb_query_info qi;
+	struct smb_query_info __user *pqi;
+	int rc = 0;
+	int flags = 0;
+	struct smb2_query_info_rsp *qi_rsp = NULL;
+	struct smb2_ioctl_rsp *io_rsp = NULL;
+	void *buffer = NULL;
+	struct smb_rqst rqst[3];
+	int resp_buftype[3];
+	struct kvec rsp_iov[3];
+	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+	struct cifs_open_parms oparms;
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_fid fid;
+	struct kvec qi_iov[1];
+	struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+	struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+	struct kvec close_iov[1];
+	unsigned int size[2];
+	void *data[2];
+
+	memset(rqst, 0, sizeof(rqst));
+	resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+	memset(rsp_iov, 0, sizeof(rsp_iov));
+
+	if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
+		return -EFAULT;
+
+	if (qi.output_buffer_length > 1024)
+		return -EINVAL;
+
+	if (!ses || !(ses->server))
+		return -EIO;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL);
+	if (buffer == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(buffer, arg + sizeof(struct smb_query_info),
+			   qi.output_buffer_length)) {
+		rc = -EFAULT;
+		goto iqinf_exit;
+	}
+
+	/* Open */
+	memset(&open_iov, 0, sizeof(open_iov));
+	rqst[0].rq_iov = open_iov;
+	rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+	memset(&oparms, 0, sizeof(oparms));
+	oparms.tcon = tcon;
+	oparms.disposition = FILE_OPEN;
+	if (is_dir)
+		oparms.create_options = CREATE_NOT_FILE;
+	else
+		oparms.create_options = CREATE_NOT_DIR;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	if (qi.flags & PASSTHRU_FSCTL) {
+		switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) {
+		case FSCTL_DEVICE_ACCESS_FILE_READ_WRITE_ACCESS:
+			oparms.desired_access = FILE_READ_DATA | FILE_WRITE_DATA | FILE_READ_ATTRIBUTES | SYNCHRONIZE;
+			break;
+		case FSCTL_DEVICE_ACCESS_FILE_ANY_ACCESS:
+			oparms.desired_access = GENERIC_ALL;
+			break;
+		case FSCTL_DEVICE_ACCESS_FILE_READ_ACCESS:
+			oparms.desired_access = GENERIC_READ;
+			break;
+		case FSCTL_DEVICE_ACCESS_FILE_WRITE_ACCESS:
+			oparms.desired_access = GENERIC_WRITE;
+			break;
+		}
+	} else if (qi.flags & PASSTHRU_SET_INFO) {
+		oparms.desired_access = GENERIC_WRITE;
+	} else {
+		oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL;
+	}
+
+	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path);
+	if (rc)
+		goto iqinf_exit;
+	smb2_set_next_command(tcon, &rqst[0]);
+
+	/* Query */
+	if (qi.flags & PASSTHRU_FSCTL) {
+		/* Can eventually relax perm check since server enforces too */
+		if (!capable(CAP_SYS_ADMIN))
+			rc = -EPERM;
+		else  {
+			memset(&io_iov, 0, sizeof(io_iov));
+			rqst[1].rq_iov = io_iov;
+			rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+			rc = SMB2_ioctl_init(tcon, &rqst[1],
+					     COMPOUND_FID, COMPOUND_FID,
+					     qi.info_type, true, buffer,
+					     qi.output_buffer_length,
+					     CIFSMaxBufSize);
+		}
+	} else if (qi.flags == PASSTHRU_SET_INFO) {
+		/* Can eventually relax perm check since server enforces too */
+		if (!capable(CAP_SYS_ADMIN))
+			rc = -EPERM;
+		else  {
+			memset(&si_iov, 0, sizeof(si_iov));
+			rqst[1].rq_iov = si_iov;
+			rqst[1].rq_nvec = 1;
+
+			size[0] = 8;
+			data[0] = buffer;
+
+			rc = SMB2_set_info_init(tcon, &rqst[1],
+					COMPOUND_FID, COMPOUND_FID,
+					current->tgid,
+					FILE_END_OF_FILE_INFORMATION,
+					SMB2_O_INFO_FILE, 0, data, size);
+		}
+	} else if (qi.flags == PASSTHRU_QUERY_INFO) {
+		memset(&qi_iov, 0, sizeof(qi_iov));
+		rqst[1].rq_iov = qi_iov;
+		rqst[1].rq_nvec = 1;
+
+		rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+				  COMPOUND_FID, qi.file_info_class,
+				  qi.info_type, qi.additional_information,
+				  qi.input_buffer_length,
+				  qi.output_buffer_length, buffer);
+	} else { /* unknown flags */
+		cifs_tcon_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags);
+		rc = -EINVAL;
+	}
+
+	if (rc)
+		goto iqinf_exit;
+	smb2_set_next_command(tcon, &rqst[1]);
+	smb2_set_related(&rqst[1]);
+
+	/* Close */
+	memset(&close_iov, 0, sizeof(close_iov));
+	rqst[2].rq_iov = close_iov;
+	rqst[2].rq_nvec = 1;
+
+	rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+	if (rc)
+		goto iqinf_exit;
+	smb2_set_related(&rqst[2]);
+
+	rc = compound_send_recv(xid, ses, flags, 3, rqst,
+				resp_buftype, rsp_iov);
+	if (rc)
+		goto iqinf_exit;
+
+	/* No need to bump num_remote_opens since handle immediately closed */
+	if (qi.flags & PASSTHRU_FSCTL) {
+		pqi = (struct smb_query_info __user *)arg;
+		io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
+		if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
+			qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
+		if (qi.input_buffer_length > 0 &&
+		    le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length > rsp_iov[1].iov_len) {
+			rc = -EFAULT;
+			goto iqinf_exit;
+		}
+		if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+				 sizeof(qi.input_buffer_length))) {
+			rc = -EFAULT;
+			goto iqinf_exit;
+		}
+		if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
+				 (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
+				 qi.input_buffer_length)) {
+			rc = -EFAULT;
+			goto iqinf_exit;
+		}
+	} else {
+		pqi = (struct smb_query_info __user *)arg;
+		qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+		if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
+			qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
+		if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+				 sizeof(qi.input_buffer_length))) {
+			rc = -EFAULT;
+			goto iqinf_exit;
+		}
+		if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
+			rc = -EFAULT;
+			goto iqinf_exit;
+		}
+	}
+
+ iqinf_exit:
+	kfree(buffer);
+	SMB2_open_free(&rqst[0]);
+	if (qi.flags & PASSTHRU_FSCTL)
+		SMB2_ioctl_free(&rqst[1]);
+	else
+		SMB2_query_info_free(&rqst[1]);
+
+	SMB2_close_free(&rqst[2]);
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+	return rc;
+}
+
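+
A note on the user interface parsed by smb2_ioctl_query_info() above: the ioctl argument is a fixed header followed immediately by output_buffer_length bytes of payload, and results are written back over the same region. A sketch of assembling such a variable-length blob; the struct here is a simplified hypothetical stand-in, not the kernel's struct smb_query_info:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct query_info_hdr {
	uint32_t info_type;
	uint32_t file_info_class;
	uint32_t additional_information;
	uint32_t flags;
	uint32_t input_buffer_length;
	uint32_t output_buffer_length;
};

int main(void)
{
	const char payload[] = "example";
	size_t total = sizeof(struct query_info_hdr) + sizeof(payload);
	unsigned char *req = malloc(total);
	struct query_info_hdr hdr = {
		.output_buffer_length = sizeof(payload),
	};

	if (!req)
		return 1;
	memcpy(req, &hdr, sizeof(hdr));		/* fixed header first */
	memcpy(req + sizeof(hdr), payload, sizeof(payload)); /* then payload */
	printf("request blob: %zu bytes (header %zu + payload %zu)\n",
	       total, sizeof(hdr), sizeof(payload));
	free(req);
	return 0;
}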
 static ssize_t
 smb2_copychunk_range(const unsigned int xid,
 			struct cifsFileInfo *srcfile,
@@ -1078,7 +1595,7 @@
 	if (pcchunk == NULL)
 		return -ENOMEM;
 
-	cifs_dbg(FYI, "in smb2_copychunk_range - about to call request res key\n");
+	cifs_dbg(FYI, "%s: about to call request res key\n", __func__);
 	/* Request a key from the server to identify the source of the copy */
 	rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
 				srcfile->fid.persistent_fid,
@@ -1105,12 +1622,12 @@
 		rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
 			trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
 			true /* is_fsctl */, (char *)pcchunk,
-			sizeof(struct copychunk_ioctl),	(char **)&retbuf,
-			&ret_data_len);
+			sizeof(struct copychunk_ioctl),	CIFSMaxBufSize,
+			(char **)&retbuf, &ret_data_len);
 		if (rc == 0) {
 			if (ret_data_len !=
 					sizeof(struct copychunk_ioctl_rsp)) {
-				cifs_dbg(VFS, "invalid cchunk response size\n");
+				cifs_tcon_dbg(VFS, "invalid cchunk response size\n");
 				rc = -EIO;
 				goto cchunk_out;
 			}
@@ -1124,12 +1641,12 @@
 			 */
 			if (le32_to_cpu(retbuf->TotalBytesWritten) >
 			    le32_to_cpu(pcchunk->Length)) {
-				cifs_dbg(VFS, "invalid copy chunk response\n");
+				cifs_tcon_dbg(VFS, "invalid copy chunk response\n");
 				rc = -EIO;
 				goto cchunk_out;
 			}
 			if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
-				cifs_dbg(VFS, "invalid num chunks written\n");
+				cifs_tcon_dbg(VFS, "invalid num chunks written\n");
 				rc = -EIO;
 				goto cchunk_out;
 			}
@@ -1198,6 +1715,7 @@
 smb2_read_data_offset(char *buf)
 {
 	struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+
 	return rsp->DataOffset;
 }
 
@@ -1266,7 +1784,7 @@
 	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
 			cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
 			true /* is_fsctl */,
-			&setsparse, 1, NULL, NULL);
+			&setsparse, 1, CIFSMaxBufSize, NULL, NULL);
 	if (rc) {
 		tcon->broken_sparse_sup = true;
 		cifs_dbg(FYI, "set sparse rc = %d\n", rc);
@@ -1302,7 +1820,7 @@
 	}
 
 	return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
-			    cfile->fid.volatile_fid, cfile->pid, &eof, false);
+			    cfile->fid.volatile_fid, cfile->pid, &eof);
 }
 
 static int
@@ -1326,7 +1844,7 @@
 	dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off);
 	dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off);
 	dup_ext_buf.ByteCount = cpu_to_le64(len);
-	cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld",
+	cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n",
 		src_off, dest_off, len);
 
 	rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
@@ -1339,11 +1857,11 @@
 			true /* is_fsctl */,
 			(char *)&dup_ext_buf,
 			sizeof(struct duplicate_extents_to_file),
-			NULL,
+			CIFSMaxBufSize, NULL,
 			&ret_data_len);
 
 	if (ret_data_len > 0)
-		cifs_dbg(FYI, "non-zero response length in duplicate extents");
+		cifs_dbg(FYI, "Non-zero response length in duplicate extents\n");
 
 duplicate_extents_out:
 	return rc;
@@ -1374,7 +1892,7 @@
 			true /* is_fsctl */,
 			(char *)&integr_info,
 			sizeof(struct fsctl_set_integrity_information_req),
-			NULL,
+			CIFSMaxBufSize, NULL,
 			&ret_data_len);
 
 }
@@ -1382,6 +1900,8 @@
 /* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */
 #define GMT_TOKEN_SIZE 50
 
+#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */
+
 /*
  * Input buffer contains (empty) struct smb_snapshot array with size filled in
  * For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2
@@ -1393,13 +1913,37 @@
 	char *retbuf = NULL;
 	unsigned int ret_data_len = 0;
 	int rc;
+	u32 max_response_size;
 	struct smb_snapshot_array snapshot_in;
 
+	/*
+	 * On the first query to enumerate the list of snapshots available
+	 * for this volume, the buffer begins with 0 (the number of snapshots
+	 * that can be returned is zero, since at that point we do not know
+	 * how big the buffer needs to be). On the second query,
+	 * it (ret_data_len) is set to the number of snapshots, so we know
+	 * to set the maximum response size larger (see below).
+	 */
+	if (get_user(ret_data_len, (unsigned int __user *)ioc_buf))
+		return -EFAULT;
+
+	/*
+	 * Note that for snapshot queries, servers like Azure expect the
+	 * first query to be minimal size (just used to get the number/size
+	 * of previous versions), so the response size must be specified as
+	 * EXACTLY sizeof(struct snapshot_array), which is 16 when rounded
+	 * up to a multiple of eight bytes.
+	 */
+	if (ret_data_len == 0)
+		max_response_size = MIN_SNAPSHOT_ARRAY_SIZE;
+	else
+		max_response_size = CIFSMaxBufSize;
+
 	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
 			cfile->fid.volatile_fid,
 			FSCTL_SRV_ENUMERATE_SNAPSHOTS,
 			true /* is_fsctl */,
-			NULL, 0 /* no input data */,
+			NULL, 0 /* no input data */, max_response_size,
 			(char **)&retbuf,
 			&ret_data_len);
 	cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n",
@@ -1506,18 +2050,18 @@
 }
 
 /*
-* If we negotiate SMB2 protocol and get STATUS_PENDING - update
-* the number of credits and return true. Otherwise - return false.
-*/
+ * If we negotiate SMB2 protocol and get STATUS_PENDING - update
+ * the number of credits and return true. Otherwise - return false.
+ */
 static bool
-smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
 {
 	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
 
 	if (shdr->Status != STATUS_PENDING)
 		return false;
 
-	if (!length) {
+	if (shdr->CreditRequest) {
 		spin_lock(&server->req_lock);
 		server->credits += le16_to_cpu(shdr->CreditRequest);
 		spin_unlock(&server->req_lock);
@@ -1557,61 +2101,107 @@
 				 CIFS_CACHE_READ(cinode) ? 1 : 0);
 }
 
-static void
+void
 smb2_set_related(struct smb_rqst *rqst)
 {
 	struct smb2_sync_hdr *shdr;
 
 	shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+	if (shdr == NULL) {
+		cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
+		return;
+	}
 	shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
 }
 
 char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
 
-static void
-smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+void
+smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
 {
 	struct smb2_sync_hdr *shdr;
+	struct cifs_ses *ses = tcon->ses;
+	struct TCP_Server_Info *server = ses->server;
 	unsigned long len = smb_rqst_len(server, rqst);
-
-	/* SMB headers in a compound are 8 byte aligned. */
-	if (len & 7) {
-		rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
-		rqst->rq_iov[rqst->rq_nvec].iov_len = 8 - (len & 7);
-		rqst->rq_nvec++;
-		len = smb_rqst_len(server, rqst);
-	}
+	int i, num_padding;
 
 	shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+	if (shdr == NULL) {
+		cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
+		return;
+	}
+
+	/* SMB headers in a compound are 8 byte aligned. */
+
+	/* No padding needed */
+	if (!(len & 7))
+		goto finished;
+
+	num_padding = 8 - (len & 7);
+	if (!smb3_encryption_required(tcon)) {
+		/*
+		 * If we do not have encryption then we can just add an extra
+		 * iov for the padding.
+		 */
+		rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
+		rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
+		rqst->rq_nvec++;
+		len += num_padding;
+	} else {
+		/*
+		 * We can not add a small padding iov for the encryption case
+		 * because the encryption framework can not handle the padding
+		 * iovs.
+		 * We have to flatten this into a single buffer and add
+		 * the padding to it.
+		 */
+		for (i = 1; i < rqst->rq_nvec; i++) {
+			memcpy(rqst->rq_iov[0].iov_base +
+			       rqst->rq_iov[0].iov_len,
+			       rqst->rq_iov[i].iov_base,
+			       rqst->rq_iov[i].iov_len);
+			rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+		}
+		memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+		       0, num_padding);
+		rqst->rq_iov[0].iov_len += num_padding;
+		len += num_padding;
+		rqst->rq_nvec = 1;
+	}
+
+ finished:
 	shdr->NextCommand = cpu_to_le32(len);
 }
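The padding arithmetic above is the usual round-up-to-eight. A standalone
worked example of the identity it relies on:

#include <assert.h>

static unsigned long round_up_8(unsigned long len)
{
	return (len + 7) & ~7UL;	/* round up to a multiple of 8 */
}

int main(void)
{
	unsigned long len = 73;				/* unaligned frame length */
	unsigned long num_padding = 8 - (len & 7);	/* 7 bytes of padding */

	/* Both forms land on the next 8-byte boundary: 80. */
	assert(len + num_padding == round_up_8(len));
	return 0;
}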
 
-static int
-smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
-	     struct kstatfs *buf)
+/*
+ * Passes the query info response back to the caller on success.
+ * The caller needs to free this with free_rsp_buf().
+ */
+int
+smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
+			 __le16 *utf16_path, u32 desired_access,
+			 u32 class, u32 type, u32 output_len,
+			 struct kvec *rsp, int *buftype,
+			 struct cifs_sb_info *cifs_sb)
 {
-	struct smb2_query_info_rsp *rsp;
-	struct smb2_fs_full_size_info *info = NULL;
+	struct cifs_ses *ses = tcon->ses;
+	int flags = 0;
 	struct smb_rqst rqst[3];
 	int resp_buftype[3];
 	struct kvec rsp_iov[3];
 	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
 	struct kvec qi_iov[1];
 	struct kvec close_iov[1];
-	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = ses->server;
-	__le16 srch_path = 0; /* Null - open root of share */
 	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
-	int flags = 0;
 	int rc;
 
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
 	memset(rqst, 0, sizeof(rqst));
-	memset(resp_buftype, 0, sizeof(resp_buftype));
+	resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
 	memset(rsp_iov, 0, sizeof(rsp_iov));
 
 	memset(&open_iov, 0, sizeof(open_iov));
@@ -1619,28 +2209,31 @@
 	rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
 
 	oparms.tcon = tcon;
-	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.desired_access = desired_access;
 	oparms.disposition = FILE_OPEN;
-	oparms.create_options = 0;
+	if (cifs_sb && backup_cred(cifs_sb))
+		oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+	else
+		oparms.create_options = 0;
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &srch_path);
+	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
 	if (rc)
-		goto qfs_exit;
-	smb2_set_next_command(server, &rqst[0]);
+		goto qic_exit;
+	smb2_set_next_command(tcon, &rqst[0]);
 
 	memset(&qi_iov, 0, sizeof(qi_iov));
 	rqst[1].rq_iov = qi_iov;
 	rqst[1].rq_nvec = 1;
 
 	rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
-				  FS_FULL_SIZE_INFORMATION,
-				  SMB2_O_INFO_FILESYSTEM, 0,
-				  sizeof(struct smb2_fs_full_size_info));
+				  class, type, 0,
+				  output_len, 0,
+				  NULL);
 	if (rc)
-		goto qfs_exit;
-	smb2_set_next_command(server, &rqst[1]);
+		goto qic_exit;
+	smb2_set_next_command(tcon, &rqst[1]);
 	smb2_set_related(&rqst[1]);
 
 	memset(&close_iov, 0, sizeof(close_iov));
@@ -1649,32 +2242,66 @@
 
 	rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
 	if (rc)
-		goto qfs_exit;
+		goto qic_exit;
 	smb2_set_related(&rqst[2]);
 
 	rc = compound_send_recv(xid, ses, flags, 3, rqst,
 				resp_buftype, rsp_iov);
+	if (rc) {
+		free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+		if (rc == -EREMCHG) {
+			tcon->need_reconnect = true;
+			printk_once(KERN_WARNING "server share %s deleted\n",
+				    tcon->treeName);
+		}
+		goto qic_exit;
+	}
+	*rsp = rsp_iov[1];
+	*buftype = resp_buftype[1];
+
+ qic_exit:
+	SMB2_open_free(&rqst[0]);
+	SMB2_query_info_free(&rqst[1]);
+	SMB2_close_free(&rqst[2]);
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+	return rc;
+}
+
+static int
+smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+	     struct kstatfs *buf)
+{
+	struct smb2_query_info_rsp *rsp;
+	struct smb2_fs_full_size_info *info = NULL;
+	__le16 utf16_path = 0; /* Null - open root of share */
+	struct kvec rsp_iov = {NULL, 0};
+	int buftype = CIFS_NO_BUFFER;
+	int rc;
+
+
+	rc = smb2_query_info_compound(xid, tcon, &utf16_path,
+				      FILE_READ_ATTRIBUTES,
+				      FS_FULL_SIZE_INFORMATION,
+				      SMB2_O_INFO_FILESYSTEM,
+				      sizeof(struct smb2_fs_full_size_info),
+				      &rsp_iov, &buftype, NULL);
 	if (rc)
 		goto qfs_exit;
 
-	rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 	buf->f_type = SMB2_MAGIC_NUMBER;
 	info = (struct smb2_fs_full_size_info *)(
 		le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
 	rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
 			       le32_to_cpu(rsp->OutputBufferLength),
-			       &rsp_iov[1],
+			       &rsp_iov,
 			       sizeof(struct smb2_fs_full_size_info));
 	if (!rc)
 		smb2_copy_fs_info_to_kstatfs(info, buf);
 
 qfs_exit:
-	SMB2_open_free(&rqst[0]);
-	SMB2_query_info_free(&rqst[1]);
-	SMB2_close_free(&rqst[2]);
-	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
-	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
-	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+	free_rsp_buf(buftype, rsp_iov.iov_base);
 	return rc;
 }
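With the open/query/close plumbing factored out, smb2_queryfs() is just one
consumer of smb2_query_info_compound(). A hypothetical second consumer
querying filesystem attributes would differ only in the class and size
arguments; FS_ATTRIBUTE_INFORMATION and FILE_SYSTEM_ATTRIBUTE_INFO are
existing kernel definitions (smb2pdu.h / cifspdu.h), but this wrapper
itself is only a sketch:

/* Hedged sketch of a hypothetical caller; not part of the patch. */
static int example_query_fs_attrs(const unsigned int xid,
				  struct cifs_tcon *tcon)
{
	__le16 utf16_path = 0;		/* null path: root of the share */
	struct kvec rsp_iov = {NULL, 0};
	int buftype = CIFS_NO_BUFFER;
	int rc;

	rc = smb2_query_info_compound(xid, tcon, &utf16_path,
				      FILE_READ_ATTRIBUTES,
				      FS_ATTRIBUTE_INFORMATION,
				      SMB2_O_INFO_FILESYSTEM,
				      sizeof(FILE_SYSTEM_ATTRIBUTE_INFO),
				      &rsp_iov, &buftype, NULL);
	if (!rc) {
		/* validate and parse rsp_iov as smb2_queryfs() does above */
	}
	free_rsp_buf(buftype, rsp_iov.iov_base);
	return rc;
}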
 
@@ -1760,7 +2387,7 @@
 	struct get_dfs_referral_rsp *dfs_rsp = NULL;
 	u32 dfs_req_size = 0, dfs_rsp_size = 0;
 
-	cifs_dbg(FYI, "smb2_get_dfs_refer path <%s>\n", search_name);
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, search_name);
 
 	/*
 	 * Try to use the IPC tcon, otherwise just use any
@@ -1808,13 +2435,13 @@
 		rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
 				FSCTL_DFS_GET_REFERRALS,
 				true /* is_fsctl */,
-				(char *)dfs_req, dfs_req_size,
+				(char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
 				(char **)&dfs_rsp, &dfs_rsp_size);
 	} while (rc == -EAGAIN);
 
 	if (rc) {
 		if ((rc != -ENOENT) && (rc != -EOPNOTSUPP))
-			cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc);
+			cifs_tcon_dbg(VFS, "ioctl error in %s rc=%d\n", __func__, rc);
 		goto out;
 	}
 
@@ -1823,7 +2450,7 @@
 				 nls_codepage, remap, search_name,
 				 true /* is_unicode */);
 	if (rc) {
-		cifs_dbg(VFS, "parse error in smb2_get_dfs_refer rc=%d\n", rc);
+		cifs_tcon_dbg(VFS, "parse error in %s rc=%d\n", __func__, rc);
 		goto out;
 	}
 
@@ -1839,74 +2466,265 @@
 	kfree(dfs_rsp);
 	return rc;
 }
+
+static int
+parse_reparse_posix(struct reparse_posix_data *symlink_buf,
+		      u32 plen, char **target_path,
+		      struct cifs_sb_info *cifs_sb)
+{
+	unsigned int len;
+
+	/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+	len = le16_to_cpu(symlink_buf->ReparseDataLength);
+
+	if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) {
+		cifs_dbg(VFS, "%lld not a supported symlink type\n",
+			le64_to_cpu(symlink_buf->InodeType));
+		return -EOPNOTSUPP;
+	}
+
+	*target_path = cifs_strndup_from_utf16(
+				symlink_buf->PathBuffer,
+				len, true, cifs_sb->local_nls);
+	if (!(*target_path))
+		return -ENOMEM;
+
+	convert_delimiter(*target_path, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+
+	return 0;
+}
+
+static int
+parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
+		      u32 plen, char **target_path,
+		      struct cifs_sb_info *cifs_sb)
+{
+	unsigned int sub_len;
+	unsigned int sub_offset;
+
+	/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
+
+	sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset);
+	sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength);
+	if (sub_offset + 20 > plen ||
+	    sub_offset + sub_len + 20 > plen) {
+		cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
+		return -EIO;
+	}
+
+	*target_path = cifs_strndup_from_utf16(
+				symlink_buf->PathBuffer + sub_offset,
+				sub_len, true, cifs_sb->local_nls);
+	if (!(*target_path))
+		return -ENOMEM;
+
+	convert_delimiter(*target_path, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+
+	return 0;
+}
+
+static int
+parse_reparse_point(struct reparse_data_buffer *buf,
+		    u32 plen, char **target_path,
+		    struct cifs_sb_info *cifs_sb)
+{
+	if (plen < sizeof(struct reparse_data_buffer)) {
+		cifs_dbg(VFS, "reparse buffer is too small. Must be "
+			 "at least 8 bytes but was %d\n", plen);
+		return -EIO;
+	}
+
+	if (plen < le16_to_cpu(buf->ReparseDataLength) +
+	    sizeof(struct reparse_data_buffer)) {
+		cifs_dbg(VFS, "srv returned invalid reparse buf "
+			 "length: %d\n", plen);
+		return -EIO;
+	}
+
+	/* See MS-FSCC 2.1.2 */
+	switch (le32_to_cpu(buf->ReparseTag)) {
+	case IO_REPARSE_TAG_NFS:
+		return parse_reparse_posix(
+			(struct reparse_posix_data *)buf,
+			plen, target_path, cifs_sb);
+	case IO_REPARSE_TAG_SYMLINK:
+		return parse_reparse_symlink(
+			(struct reparse_symlink_data_buffer *)buf,
+			plen, target_path, cifs_sb);
+	default:
+		cifs_dbg(VFS, "srv returned unknown symlink buffer "
+			 "tag:0x%08x\n", le32_to_cpu(buf->ReparseTag));
+		return -EOPNOTSUPP;
+	}
+}
+
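The constant 20 in parse_reparse_symlink()'s bounds checks is the fixed
preamble of the MS-FSCC 2.1.2.4 symbolic link reparse data buffer --
everything that precedes PathBuffer. As a reference sketch (field names
follow the kernel's struct reparse_symlink_data_buffer; offsets are per
MS-FSCC, and this layout struct is for documentation only):

#include <linux/types.h>

/*
 * PathBuffer starts at byte 20, which is why the validation above checks
 * sub_offset + sub_len + 20 against plen.
 */
struct symlink_reparse_layout {
	__le32 ReparseTag;		/* offset  0: 0xA000000C for symlinks */
	__le16 ReparseDataLength;	/* offset  4: bytes after the 8-byte header */
	__le16 Reserved;		/* offset  6 */
	__le16 SubstituteNameOffset;	/* offset  8: relative to PathBuffer */
	__le16 SubstituteNameLength;	/* offset 10 */
	__le16 PrintNameOffset;		/* offset 12 */
	__le16 PrintNameLength;		/* offset 14 */
	__le32 Flags;			/* offset 16: e.g. SYMLINK_FLAG_RELATIVE */
	__u8   PathBuffer[];		/* offset 20: UTF-16 names start here */
};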
 #define SMB2_SYMLINK_STRUCT_SIZE \
 	(sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
 
 static int
 smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
-		   const char *full_path, char **target_path,
-		   struct cifs_sb_info *cifs_sb)
+		   struct cifs_sb_info *cifs_sb, const char *full_path,
+		   char **target_path, bool is_reparse_point)
 {
 	int rc;
-	__le16 *utf16_path;
+	__le16 *utf16_path = NULL;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
 	struct kvec err_iov = {NULL, 0};
 	struct smb2_err_rsp *err_buf = NULL;
-	int resp_buftype;
 	struct smb2_symlink_err_rsp *symlink;
 	unsigned int sub_len;
 	unsigned int sub_offset;
 	unsigned int print_len;
 	unsigned int print_offset;
+	int flags = 0;
+	struct smb_rqst rqst[3];
+	int resp_buftype[3];
+	struct kvec rsp_iov[3];
+	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+	struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+	struct kvec close_iov[1];
+	struct smb2_create_rsp *create_rsp;
+	struct smb2_ioctl_rsp *ioctl_rsp;
+	struct reparse_data_buffer *reparse_buf;
+	u32 plen;
 
 	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
 
+	*target_path = NULL;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	memset(rqst, 0, sizeof(rqst));
+	resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+	memset(rsp_iov, 0, sizeof(rsp_iov));
+
 	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
+	/* Open */
+	memset(&open_iov, 0, sizeof(open_iov));
+	rqst[0].rq_iov = open_iov;
+	rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+	memset(&oparms, 0, sizeof(oparms));
 	oparms.tcon = tcon;
 	oparms.desired_access = FILE_READ_ATTRIBUTES;
 	oparms.disposition = FILE_OPEN;
+
 	if (backup_cred(cifs_sb))
 		oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
 	else
 		oparms.create_options = 0;
+	if (is_reparse_point)
+		oparms.create_options = OPEN_REPARSE_POINT;
+
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov,
-		       &resp_buftype);
+	rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
+	if (rc)
+		goto querty_exit;
+	smb2_set_next_command(tcon, &rqst[0]);
+
+
+	/* IOCTL */
+	memset(&io_iov, 0, sizeof(io_iov));
+	rqst[1].rq_iov = io_iov;
+	rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+	rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid,
+			     fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
+			     true /* is_fsctl */, NULL, 0, CIFSMaxBufSize);
+	if (rc)
+		goto querty_exit;
+
+	smb2_set_next_command(tcon, &rqst[1]);
+	smb2_set_related(&rqst[1]);
+
+
+	/* Close */
+	memset(&close_iov, 0, sizeof(close_iov));
+	rqst[2].rq_iov = close_iov;
+	rqst[2].rq_nvec = 1;
+
+	rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+	if (rc)
+		goto querty_exit;
+
+	smb2_set_related(&rqst[2]);
+
+	rc = compound_send_recv(xid, tcon->ses, flags, 3, rqst,
+				resp_buftype, rsp_iov);
+
+	create_rsp = rsp_iov[0].iov_base;
+	if (create_rsp && create_rsp->sync_hdr.Status)
+		err_iov = rsp_iov[0];
+	ioctl_rsp = rsp_iov[1].iov_base;
+
+	/*
+	 * Open was successful and we got an ioctl response.
+	 */
+	if ((rc == 0) && (is_reparse_point)) {
+		/* See MS-FSCC 2.3.23 */
+
+		reparse_buf = (struct reparse_data_buffer *)
+			((char *)ioctl_rsp +
+			 le32_to_cpu(ioctl_rsp->OutputOffset));
+		plen = le32_to_cpu(ioctl_rsp->OutputCount);
+
+		if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
+		    rsp_iov[1].iov_len) {
+			cifs_tcon_dbg(VFS, "srv returned invalid ioctl len: %d\n",
+				 plen);
+			rc = -EIO;
+			goto querty_exit;
+		}
+
+		rc = parse_reparse_point(reparse_buf, plen, target_path,
+					 cifs_sb);
+		goto querty_exit;
+	}
+
 	if (!rc || !err_iov.iov_base) {
 		rc = -ENOENT;
-		goto free_path;
+		goto querty_exit;
 	}
 
 	err_buf = err_iov.iov_base;
 	if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
 	    err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
-		rc = -ENOENT;
+		rc = -EINVAL;
+		goto querty_exit;
+	}
+
+	symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
+	if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
+	    le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
+		rc = -EINVAL;
 		goto querty_exit;
 	}
 
 	/* open must fail on symlink - reset rc */
 	rc = 0;
-	symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
 	sub_len = le16_to_cpu(symlink->SubstituteNameLength);
 	sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
 	print_len = le16_to_cpu(symlink->PrintNameLength);
 	print_offset = le16_to_cpu(symlink->PrintNameOffset);
 
 	if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
-		rc = -ENOENT;
+		rc = -EINVAL;
 		goto querty_exit;
 	}
 
 	if (err_iov.iov_len <
 	    SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
-		rc = -ENOENT;
+		rc = -EINVAL;
 		goto querty_exit;
 	}
 
@@ -1921,13 +2739,17 @@
 	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
 
  querty_exit:
-	free_rsp_buf(resp_buftype, err_buf);
- free_path:
+	cifs_dbg(FYI, "query symlink rc %d\n", rc);
 	kfree(utf16_path);
+	SMB2_open_free(&rqst[0]);
+	SMB2_ioctl_free(&rqst[1]);
+	SMB2_close_free(&rqst[2]);
+	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+	free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_ACL
 static struct cifs_ntsd *
 get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
 		const struct cifs_fid *cifsfid, u32 *pacllen)
@@ -2012,7 +2834,6 @@
 	return pntsd;
 }
 
-#ifdef CONFIG_CIFS_ACL
 static int
 set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		struct inode *inode, const char *path, int aclflag)
@@ -2070,7 +2891,6 @@
 	free_xid(xid);
 	return rc;
 }
-#endif /* CIFS_ACL */
 
 /* Retrieve an ACL from the server */
 static struct cifs_ntsd *
@@ -2090,64 +2910,68 @@
 	cifsFileInfo_put(open_file);
 	return pntsd;
 }
-#endif
 
 static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
 			    loff_t offset, loff_t len, bool keep_size)
 {
+	struct cifs_ses *ses = tcon->ses;
 	struct inode *inode;
 	struct cifsInodeInfo *cifsi;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct file_zero_data_information fsctl_buf;
 	long rc;
 	unsigned int xid;
+	__le64 eof;
 
 	xid = get_xid();
 
 	inode = d_inode(cfile->dentry);
 	cifsi = CIFS_I(inode);
 
+	trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+			      ses->Suid, offset, len);
+
+
 	/* if file not oplocked can't be sure whether asking to extend size */
 	if (!CIFS_CACHE_READ(cifsi))
 		if (keep_size == false) {
 			rc = -EOPNOTSUPP;
+			trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
+				tcon->tid, ses->Suid, offset, len, rc);
 			free_xid(xid);
 			return rc;
 		}
 
-	/*
-	 * Must check if file sparse since fallocate -z (zero range) assumes
-	 * non-sparse allocation
-	 */
-	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
-		rc = -EOPNOTSUPP;
-		free_xid(xid);
-		return rc;
-	}
-
-	/*
-	 * need to make sure we are not asked to extend the file since the SMB3
-	 * fsctl does not change the file size. In the future we could change
-	 * this to zero the first part of the range then set the file size
-	 * which for a non sparse file would zero the newly extended range
-	 */
-	if (keep_size == false)
-		if (i_size_read(inode) < offset + len) {
-			rc = -EOPNOTSUPP;
-			free_xid(xid);
-			return rc;
-		}
-
-	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+	cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
 
 	fsctl_buf.FileOffset = cpu_to_le64(offset);
 	fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
 
 	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
-			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
-			true /* is_fctl */, (char *)&fsctl_buf,
-			sizeof(struct file_zero_data_information), NULL, NULL);
+			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true,
+			(char *)&fsctl_buf,
+			sizeof(struct file_zero_data_information),
+			0, NULL, NULL);
+	if (rc)
+		goto zero_range_exit;
+
+	/*
+	 * do we also need to change the size of the file?
+	 */
+	if (keep_size == false && i_size_read(inode) < offset + len) {
+		eof = cpu_to_le64(offset + len);
+		rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+				  cfile->fid.volatile_fid, cfile->pid, &eof);
+	}
+
+ zero_range_exit:
 	free_xid(xid);
+	if (rc)
+		trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+			      ses->Suid, offset, len, rc);
+	else
+		trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
+			      ses->Suid, offset, len);
 	return rc;
 }
 
@@ -2155,7 +2979,6 @@
 			    loff_t offset, loff_t len)
 {
 	struct inode *inode;
-	struct cifsInodeInfo *cifsi;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct file_zero_data_information fsctl_buf;
 	long rc;
@@ -2165,7 +2988,6 @@
 	xid = get_xid();
 
 	inode = d_inode(cfile->dentry);
-	cifsi = CIFS_I(inode);
 
 	/* Need to make file sparse, if not already, before freeing range. */
 	/* Consider adding equivalent for compressed since it could also work */
@@ -2175,7 +2997,7 @@
 		return rc;
 	}
 
-	cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+	cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
 
 	fsctl_buf.FileOffset = cpu_to_le64(offset);
 	fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
@@ -2183,7 +3005,8 @@
 	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
 			cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
 			true /* is_fsctl */, (char *)&fsctl_buf,
-			sizeof(struct file_zero_data_information), NULL, NULL);
+			sizeof(struct file_zero_data_information),
+			CIFSMaxBufSize, NULL, NULL);
 	free_xid(xid);
 	return rc;
 }
@@ -2196,15 +3019,20 @@
 	struct cifsFileInfo *cfile = file->private_data;
 	long rc = -EOPNOTSUPP;
 	unsigned int xid;
+	__le64 eof;
 
 	xid = get_xid();
 
 	inode = d_inode(cfile->dentry);
 	cifsi = CIFS_I(inode);
 
+	trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+				tcon->ses->Suid, off, len);
 	/* if file not oplocked can't be sure whether asking to extend size */
 	if (!CIFS_CACHE_READ(cifsi))
 		if (keep_size == false) {
+			trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+				tcon->tid, tcon->ses->Suid, off, len, rc);
 			free_xid(xid);
 			return rc;
 		}
@@ -2224,6 +3052,12 @@
 		/* BB: in future add else clause to extend file */
 		else
 			rc = -EOPNOTSUPP;
+		if (rc)
+			trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+				tcon->tid, tcon->ses->Suid, off, len, rc);
+		else
+			trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
+				tcon->tid, tcon->ses->Suid, off, len);
 		free_xid(xid);
 		return rc;
 	}
@@ -2239,19 +3073,193 @@
 		 */
 		if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
 			rc = -EOPNOTSUPP;
+			trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+				tcon->tid, tcon->ses->Suid, off, len, rc);
 			free_xid(xid);
 			return rc;
 		}
 
-		rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+		smb2_set_sparse(xid, tcon, cfile, inode, false);
+		rc = 0;
+	} else {
+		smb2_set_sparse(xid, tcon, cfile, inode, false);
+		rc = 0;
+		if (i_size_read(inode) < off + len) {
+			eof = cpu_to_le64(off + len);
+			rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+					  cfile->fid.volatile_fid, cfile->pid,
+					  &eof);
+		}
 	}
-	/* BB: else ... in future add code to extend file and set sparse */
 
+	if (rc)
+		trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
+				tcon->ses->Suid, off, len, rc);
+	else
+		trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid,
+				tcon->ses->Suid, off, len);
 
 	free_xid(xid);
 	return rc;
 }
 
+static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon,
+			  loff_t offset, int whence)
+{
+	struct cifsFileInfo *wrcfile, *cfile = file->private_data;
+	struct cifsInodeInfo *cifsi;
+	struct inode *inode;
+	int rc = 0;
+	struct file_allocated_range_buffer in_data, *out_data = NULL;
+	u32 out_data_len;
+	unsigned int xid;
+
+	if (whence != SEEK_HOLE && whence != SEEK_DATA)
+		return generic_file_llseek(file, offset, whence);
+
+	inode = d_inode(cfile->dentry);
+	cifsi = CIFS_I(inode);
+
+	if (offset < 0 || offset >= i_size_read(inode))
+		return -ENXIO;
+
+	xid = get_xid();
+	/*
+	 * We need to be sure that all dirty pages are written as they
+	 * might fill holes on the server.
+	 * Note that we also MUST flush any written pages since at least
+	 * some servers (Windows2016) will not reflect recent writes in
+	 * QUERY_ALLOCATED_RANGES until SMB2_flush is called.
+	 */
+	wrcfile = find_writable_file(cifsi, false);
+	if (wrcfile) {
+		filemap_write_and_wait(inode->i_mapping);
+		smb2_flush_file(xid, tcon, &wrcfile->fid);
+		cifsFileInfo_put(wrcfile);
+	}
+
+	if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
+		if (whence == SEEK_HOLE)
+			offset = i_size_read(inode);
+		goto lseek_exit;
+	}
+
+	in_data.file_offset = cpu_to_le64(offset);
+	in_data.length = cpu_to_le64(i_size_read(inode));
+
+	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid,
+			FSCTL_QUERY_ALLOCATED_RANGES, true,
+			(char *)&in_data, sizeof(in_data),
+			sizeof(struct file_allocated_range_buffer),
+			(char **)&out_data, &out_data_len);
+	if (rc == -E2BIG)
+		rc = 0;
+	if (rc)
+		goto lseek_exit;
+
+	if (whence == SEEK_HOLE && out_data_len == 0)
+		goto lseek_exit;
+
+	if (whence == SEEK_DATA && out_data_len == 0) {
+		rc = -ENXIO;
+		goto lseek_exit;
+	}
+
+	if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+		rc = -EINVAL;
+		goto lseek_exit;
+	}
+	if (whence == SEEK_DATA) {
+		offset = le64_to_cpu(out_data->file_offset);
+		goto lseek_exit;
+	}
+	if (offset < le64_to_cpu(out_data->file_offset))
+		goto lseek_exit;
+
+	offset = le64_to_cpu(out_data->file_offset) + le64_to_cpu(out_data->length);
+
+ lseek_exit:
+	free_xid(xid);
+	kfree(out_data);
+	if (!rc)
+		return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+	else
+		return rc;
+}
+
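From userspace this all surfaces through the ordinary lseek() interface;
each SEEK_DATA/SEEK_HOLE call on a sparse cifs file turns into an
FSCTL_QUERY_ALLOCATED_RANGES round trip as above. A minimal extent walker
using only standard interfaces:

#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t data = 0, hole, end;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0 && data < end) {
		hole = lseek(fd, data, SEEK_HOLE);	/* end of this extent */
		printf("data extent: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}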
+static int smb3_fiemap(struct cifs_tcon *tcon,
+		       struct cifsFileInfo *cfile,
+		       struct fiemap_extent_info *fei, u64 start, u64 len)
+{
+	unsigned int xid;
+	struct file_allocated_range_buffer in_data, *out_data;
+	u32 out_data_len;
+	int i, num, rc, flags, last_blob;
+	u64 next;
+
+	if (fiemap_check_flags(fei, FIEMAP_FLAG_SYNC))
+		return -EBADR;
+
+	xid = get_xid();
+ again:
+	in_data.file_offset = cpu_to_le64(start);
+	in_data.length = cpu_to_le64(len);
+
+	rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid,
+			FSCTL_QUERY_ALLOCATED_RANGES, true,
+			(char *)&in_data, sizeof(in_data),
+			1024 * sizeof(struct file_allocated_range_buffer),
+			(char **)&out_data, &out_data_len);
+	if (rc == -E2BIG) {
+		last_blob = 0;
+		rc = 0;
+	} else
+		last_blob = 1;
+	if (rc)
+		goto out;
+
+	if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+		rc = -EINVAL;
+		goto out;
+	}
+	if (out_data_len % sizeof(struct file_allocated_range_buffer)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	num = out_data_len / sizeof(struct file_allocated_range_buffer);
+	for (i = 0; i < num; i++) {
+		flags = 0;
+		if (i == num - 1 && last_blob)
+			flags |= FIEMAP_EXTENT_LAST;
+
+		rc = fiemap_fill_next_extent(fei,
+				le64_to_cpu(out_data[i].file_offset),
+				le64_to_cpu(out_data[i].file_offset),
+				le64_to_cpu(out_data[i].length),
+				flags);
+		if (rc < 0)
+			goto out;
+		if (rc == 1) {
+			rc = 0;
+			goto out;
+		}
+	}
+
+	if (!last_blob) {
+		next = le64_to_cpu(out_data[num - 1].file_offset) +
+		  le64_to_cpu(out_data[num - 1].length);
+		len = len - (next - start);
+		start = next;
+		goto again;
+	}
+
+ out:
+	free_xid(xid);
+	kfree(out_data);
+	return rc;
+}
 
 static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
 			   loff_t off, loff_t len)
@@ -2283,6 +3291,15 @@
 }
 
 static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+		       struct cifsInodeInfo *cinode, bool set_level2)
+{
+	server->ops->set_oplock_level(cinode,
+				      set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+				      0, 0, NULL);
+}
+
+static void
 smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		      unsigned int epoch, bool *purge_cache)
 {
@@ -2310,26 +3327,33 @@
 		       unsigned int epoch, bool *purge_cache)
 {
 	char message[5] = {0};
+	unsigned int new_oplock = 0;
 
 	oplock &= 0xFF;
 	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
 		return;
 
-	cinode->oplock = 0;
+	/* Check if the server granted an oplock rather than a lease */
+	if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+		return smb2_set_oplock_level(cinode, oplock, epoch,
+					     purge_cache);
+
 	if (oplock & SMB2_LEASE_READ_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_READ_FLG;
+		new_oplock |= CIFS_CACHE_READ_FLG;
 		strcat(message, "R");
 	}
 	if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
+		new_oplock |= CIFS_CACHE_HANDLE_FLG;
 		strcat(message, "H");
 	}
 	if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_WRITE_FLG;
+		new_oplock |= CIFS_CACHE_WRITE_FLG;
 		strcat(message, "W");
 	}
-	if (!cinode->oplock)
-		strcat(message, "None");
+	if (!new_oplock)
+		strncpy(message, "None", sizeof(message));
+
+	cinode->oplock = new_oplock;
 	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
 		 &cinode->vfs_inode);
 }
@@ -2486,7 +3510,7 @@
 
 static void
 fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
-		   struct smb_rqst *old_rq)
+		   struct smb_rqst *old_rq, __le16 cipher_type)
 {
 	struct smb2_sync_hdr *shdr =
 			(struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
@@ -2495,7 +3519,10 @@
 	tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
 	tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
 	tr_hdr->Flags = cpu_to_le16(0x01);
-	get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+	if (cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+		get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+	else
+		get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
 	memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
 }
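The two nonce sizes fall out of how each AEAD mode assembles its IV: GCM
consumes a 12-byte nonce directly, while CCM (RFC 3610) spends the first
IV byte on the length-field descriptor, leaving 11 bytes of nonce. A
sketch mirroring the IV assembly that crypt_message() performs later in
this file (SMB3_AES128GCM_NONCE is 12 and SMB3_AES128CCM_NONCE is 11 in
the kernel headers; this helper itself is illustrative, not kernel code):

/* Mirrors the cipher-dependent IV setup in crypt_message() below. */
static void assemble_iv(u8 *iv, const u8 *nonce, __le16 cipher_type)
{
	if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) {
		memcpy(iv, nonce, 12);		/* GCM: 12-byte nonce is the IV */
	} else {
		iv[0] = 3;			/* CCM flags byte: L - 1 for a 4-byte length field */
		memcpy(iv + 1, nonce, 11);	/* 11-byte CCM nonce follows */
	}
}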
 
@@ -2505,7 +3532,15 @@
 static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
 				   unsigned int buflen)
 {
-	sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+	void *addr;
+	/*
+	 * VMAP_STACK (at least) puts stack into the vmalloc address space
+	 */
+	if (is_vmalloc_addr(buf))
+		addr = vmalloc_to_page(buf);
+	else
+		addr = virt_to_page(buf);
+	sg_set_page(sg, addr, buflen, offset_in_page(buf));
 }
 
 /* Assumes the first rqst has a transform header as the first iov.
@@ -2543,7 +3578,7 @@
 			smb2_sg_set_buf(&sg[idx++],
 					rqst[i].rq_iov[j].iov_base + skip,
 					rqst[i].rq_iov[j].iov_len - skip);
-		}
+			}
 
 		for (j = 0; j < rqst[i].rq_npages; j++) {
 			unsigned int len, offset;
@@ -2603,14 +3638,14 @@
 
 	rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
+		cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
 			 enc ? "en" : "de");
 		return 0;
 	}
 
 	rc = smb3_crypto_aead_allocate(server);
 	if (rc) {
-		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+		cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__);
 		return rc;
 	}
 
@@ -2618,19 +3653,19 @@
 						server->secmech.ccmaesdecrypt;
 	rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
+		cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
 		return rc;
 	}
 
 	rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc);
+		cifs_server_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc);
 		return rc;
 	}
 
 	req = aead_request_alloc(tfm, GFP_KERNEL);
 	if (!req) {
-		cifs_dbg(VFS, "%s: Failed to alloc aead request", __func__);
+		cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__);
 		return -ENOMEM;
 	}
 
@@ -2641,7 +3676,7 @@
 
 	sg = init_sg(num_rqst, rqst, sign);
 	if (!sg) {
-		cifs_dbg(VFS, "%s: Failed to init sg", __func__);
+		cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__);
 		rc = -ENOMEM;
 		goto free_req;
 	}
@@ -2649,12 +3684,17 @@
 	iv_len = crypto_aead_ivsize(tfm);
 	iv = kzalloc(iv_len, GFP_KERNEL);
 	if (!iv) {
-		cifs_dbg(VFS, "%s: Failed to alloc IV", __func__);
+		cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__);
 		rc = -ENOMEM;
 		goto free_sg;
 	}
-	iv[0] = 3;
-	memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+
+	if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+		memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+	else {
+		iv[0] = 3;
+		memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
+	}
 
 	aead_request_set_crypt(req, sg, sg, crypt_len, iv);
 	aead_request_set_ad(req, assoc_data_len);
@@ -2754,10 +3794,10 @@
 	}
 
 	/* fill the 1st iov with a transform header */
-	fill_transform_hdr(tr_hdr, orig_len, old_rq);
+	fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type);
 
 	rc = crypt_message(server, num_rqst, new_rq, 1);
-	cifs_dbg(FYI, "encrypt message returned %d", rc);
+	cifs_dbg(FYI, "Encrypt message returned %d\n", rc);
 	if (rc)
 		goto err_free;
 
@@ -2798,7 +3838,7 @@
 	rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE;
 
 	rc = crypt_message(server, 1, &rqst, 0);
-	cifs_dbg(FYI, "decrypt message returned %d\n", rc);
+	cifs_dbg(FYI, "Decrypt message returned %d\n", rc);
 
 	if (rc)
 		return rc;
@@ -2886,7 +3926,7 @@
 	bool use_rdma_mr = false;
 
 	if (shdr->Command != SMB2_READ) {
-		cifs_dbg(VFS, "only big read responses are supported\n");
+		cifs_server_dbg(VFS, "only big read responses are supported\n");
 		return -ENOTSUPP;
 	}
 
@@ -2898,14 +3938,26 @@
 	}
 
 	if (server->ops->is_status_pending &&
-			server->ops->is_status_pending(buf, server, 0))
+			server->ops->is_status_pending(buf, server))
 		return -1;
 
-	rdata->result = server->ops->map_error(buf, false);
+	/* set up first two iov to get credits */
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = 0;
+	rdata->iov[1].iov_base = buf;
+	rdata->iov[1].iov_len =
+		min_t(unsigned int, buf_len, server->vals->read_rsp_size);
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+	cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+	rdata->result = server->ops->map_error(buf, true);
 	if (rdata->result != 0) {
 		cifs_dbg(FYI, "%s: server returned error %d\n",
 			 __func__, rdata->result);
-		dequeue_mid(mid, rdata->result);
+		/* normal error on read response */
+		dequeue_mid(mid, false);
 		return 0;
 	}
 
@@ -2963,13 +4015,13 @@
 			return 0;
 		}
 
-		iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len);
+		iov_iter_bvec(&iter, WRITE, bvec, npages, data_len);
 	} else if (buf_len >= data_offset + data_len) {
 		/* read response payload is in buf */
 		WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
 		iov.iov_base = buf + data_offset;
 		iov.iov_len = data_len;
-		iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len);
+		iov_iter_kvec(&iter, WRITE, &iov, 1, data_len);
 	} else {
 		/* read response payload cannot be in both buf and pages */
 		WARN_ONCE(1, "buf can not contain only a part of read data");
@@ -2978,14 +4030,6 @@
 		return 0;
 	}
 
-	/* set up first iov for signature check */
-	rdata->iov[0].iov_base = buf;
-	rdata->iov[0].iov_len = 4;
-	rdata->iov[1].iov_base = buf + 4;
-	rdata->iov[1].iov_len = server->vals->read_rsp_size - 4;
-	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
-		 rdata->iov[0].iov_base, server->vals->read_rsp_size);
-
 	length = rdata->copy_into_pages(server, rdata, &iter);
 
 	kfree(bvec);
@@ -2997,8 +4041,56 @@
 	return length;
 }
 
+struct smb2_decrypt_work {
+	struct work_struct decrypt;
+	struct TCP_Server_Info *server;
+	struct page **ppages;
+	char *buf;
+	unsigned int npages;
+	unsigned int len;
+};
+
+
+static void smb2_decrypt_offload(struct work_struct *work)
+{
+	struct smb2_decrypt_work *dw = container_of(work,
+				struct smb2_decrypt_work, decrypt);
+	int i, rc;
+	struct mid_q_entry *mid;
+
+	rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size,
+			      dw->ppages, dw->npages, dw->len);
+	if (rc) {
+		cifs_dbg(VFS, "error decrypting rc=%d\n", rc);
+		goto free_pages;
+	}
+
+	dw->server->lstrp = jiffies;
+	mid = smb2_find_mid(dw->server, dw->buf);
+	if (mid == NULL)
+		cifs_dbg(FYI, "mid not found\n");
+	else {
+		mid->decrypted = true;
+		rc = handle_read_data(dw->server, mid, dw->buf,
+				      dw->server->vals->read_rsp_size,
+				      dw->ppages, dw->npages, dw->len);
+		mid->callback(mid);
+		cifs_mid_q_entry_release(mid);
+	}
+
+free_pages:
+	for (i = dw->npages-1; i >= 0; i--)
+		put_page(dw->ppages[i]);
+
+	kfree(dw->ppages);
+	cifs_small_buf_release(dw->buf);
+	kfree(dw);
+}
+
+
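The shape of this offload is the standard workqueue hand-off: package the
request state, queue it, and let the worker free everything. A
stripped-down skeleton of the pattern (the decrypt_wq workqueue itself is
created elsewhere, reportedly with alloc_workqueue() in cifsfs.c; this
sketch is generic and not the cifs code):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct offload_job {
	struct work_struct work;
	void *payload;				/* whatever the worker needs */
};

static void offload_worker(struct work_struct *work)
{
	struct offload_job *job = container_of(work, struct offload_job, work);

	/* ... expensive processing of job->payload ... */
	kfree(job);				/* the worker owns the job */
}

static int queue_offload(struct workqueue_struct *wq, void *payload)
{
	struct offload_job *job = kmalloc(sizeof(*job), GFP_KERNEL);

	if (!job)
		return -ENOMEM;			/* caller falls back to inline work */
	job->payload = payload;
	INIT_WORK(&job->work, offload_worker);
	queue_work(wq, &job->work);
	return 0;
}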
 static int
-receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
+receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
+		       int *num_mids)
 {
 	char *buf = server->smallbuf;
 	struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
@@ -3008,7 +4100,9 @@
 	unsigned int buflen = server->pdu_size;
 	int rc;
 	int i = 0;
+	struct smb2_decrypt_work *dw;
 
+	*num_mids = 1;
 	len = min_t(unsigned int, buflen, server->vals->read_rsp_size +
 		sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
 
@@ -3044,6 +4138,32 @@
 	if (rc)
 		goto free_pages;
 
+	/*
+	 * For large reads, offload decryption to a different thread for
+	 * better performance; this lets us use more cores for decryption,
+	 * which can be expensive.
+	 */
+
+	if ((server->min_offload) && (server->in_flight > 1) &&
+	    (server->pdu_size >= server->min_offload)) {
+		dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL);
+		if (dw == NULL)
+			goto non_offloaded_decrypt;
+
+		dw->buf = server->smallbuf;
+		server->smallbuf = (char *)cifs_small_buf_get();
+
+		INIT_WORK(&dw->decrypt, smb2_decrypt_offload);
+
+		dw->npages = npages;
+		dw->server = server;
+		dw->ppages = pages;
+		dw->len = len;
+		queue_work(decrypt_wq, &dw->decrypt);
+		*num_mids = 0; /* worker thread takes care of finding mid */
+		return -1;
+	}
+
+non_offloaded_decrypt:
 	rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
 			      pages, npages, len);
 	if (rc)
@@ -3077,7 +4197,6 @@
 {
 	int ret, length;
 	char *buf = server->smallbuf;
-	char *tmpbuf;
 	struct smb2_sync_hdr *shdr;
 	unsigned int pdu_length = server->pdu_size;
 	unsigned int buf_size;
@@ -3107,18 +4226,15 @@
 		return length;
 
 	next_is_large = server->large_buf;
- one_more:
+one_more:
 	shdr = (struct smb2_sync_hdr *)buf;
 	if (shdr->NextCommand) {
-		if (next_is_large) {
-			tmpbuf = server->bigbuf;
+		if (next_is_large)
 			next_buffer = (char *)cifs_buf_get();
-		} else {
-			tmpbuf = server->smallbuf;
+		else
 			next_buffer = (char *)cifs_small_buf_get();
-		}
 		memcpy(next_buffer,
-		       tmpbuf + le32_to_cpu(shdr->NextCommand),
+		       buf + le32_to_cpu(shdr->NextCommand),
 		       pdu_length - le32_to_cpu(shdr->NextCommand));
 	}
 
@@ -3132,7 +4248,7 @@
 	}
 
 	if (*num_mids >= MAX_COMPOUND) {
-		cifs_dbg(VFS, "too many PDUs in compound\n");
+		cifs_server_dbg(VFS, "too many PDUs in compound\n");
 		return -1;
 	}
 	bufs[*num_mids] = buf;
@@ -3147,12 +4263,21 @@
 		pdu_length -= le32_to_cpu(shdr->NextCommand);
 		server->large_buf = next_is_large;
 		if (next_is_large)
-			server->bigbuf = next_buffer;
+			server->bigbuf = buf = next_buffer;
 		else
-			server->smallbuf = next_buffer;
-
-		buf += le32_to_cpu(shdr->NextCommand);
+			server->smallbuf = buf = next_buffer;
 		goto one_more;
+	} else if (ret != 0) {
+		/*
+		 * ret != 0 here means that we didn't get to handle_mid() thus
+		 * server->smallbuf and server->bigbuf are still valid. We need
+		 * to free next_buffer because it is not going to be used
+		 * anywhere.
+		 */
+		if (next_is_large)
+			free_rsp_buf(CIFS_LARGE_BUFFER, next_buffer);
+		else
+			free_rsp_buf(CIFS_SMALL_BUFFER, next_buffer);
 	}
 
 	return ret;
@@ -3169,7 +4294,7 @@
 
 	if (pdu_length < sizeof(struct smb2_transform_hdr) +
 						sizeof(struct smb2_sync_hdr)) {
-		cifs_dbg(VFS, "Transform message is too small (%u)\n",
+		cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
 			 pdu_length);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
@@ -3177,15 +4302,16 @@
 	}
 
 	if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
-		cifs_dbg(VFS, "Transform message is broken\n");
+		cifs_server_dbg(VFS, "Transform message is broken\n");
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return -ECONNABORTED;
 	}
 
 	/* TODO: add support for compounds containing READ. */
-	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
-		return receive_encrypted_read(server, &mids[0]);
+	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) {
+		return receive_encrypted_read(server, &mids[0], num_mids);
+	}
 
 	return receive_encrypted_standard(server, mids, bufs, num_mids);
 }
@@ -3212,6 +4338,104 @@
 	return le32_to_cpu(hdr->NextCommand);
 }
 
+static int
+smb2_make_node(unsigned int xid, struct inode *inode,
+	       struct dentry *dentry, struct cifs_tcon *tcon,
+	       char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	int rc = -EPERM;
+	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+	FILE_ALL_INFO *buf = NULL;
+	struct cifs_io_parms io_parms;
+	__u32 oplock = 0;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
+	unsigned int bytes_written;
+	struct win_dev *pdev;
+	struct kvec iov[2];
+
+	/*
+	 * Check if mounted with the 'sfu' mount parm.
+	 * SFU emulation should work with all servers, but only
+	 * supports block and char devices (no socket & fifo),
+	 * and was used by default in earlier versions of Windows.
+	 */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+		goto out;
+
+	/*
+	 * TODO: Add ability to create instead via reparse point. Windows (e.g.
+	 * their current NFS server) uses this approach to expose special files
+	 * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+	 */
+
+	if (!S_ISCHR(mode) && !S_ISBLK(mode))
+		goto out;
+
+	cifs_dbg(FYI, "sfu compat create special file\n");
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (backup_cred(cifs_sb))
+		create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	if (tcon->ses->server->oplocks)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+	rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+	if (rc)
+		goto out;
+
+	/*
+	 * BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely.
+	 */
+
+	pdev = (struct win_dev *)buf;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = sizeof(struct win_dev);
+	iov[1].iov_base = buf;
+	iov[1].iov_len = sizeof(struct win_dev);
+	if (S_ISCHR(mode)) {
+		memcpy(pdev->type, "IntxCHR", 8);
+		pdev->major = cpu_to_le64(MAJOR(dev));
+		pdev->minor = cpu_to_le64(MINOR(dev));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	} else if (S_ISBLK(mode)) {
+		memcpy(pdev->type, "IntxBLK", 8);
+		pdev->major = cpu_to_le64(MAJOR(dev));
+		pdev->minor = cpu_to_le64(MINOR(dev));
+		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+							&bytes_written, iov, 1);
+	}
+	tcon->ses->server->ops->close(xid, tcon, &fid);
+	d_drop(dentry);
+
+	/* FIXME: add code here to set EAs */
+out:
+	kfree(buf);
+	return rc;
+}
+
+
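The SFU record written above is a small fixed header at offset 0 of the
created file. For reference, its layout (matching struct win_dev in the
kernel's cifspdu.h) is:

#include <linux/types.h>

/* The 24-byte SFU record smb2_make_node() writes at file offset 0. */
struct win_dev {
	unsigned char type[8];	/* "IntxCHR\0" or "IntxBLK\0" */
	__le64 major;
	__le64 minor;
} __attribute__((packed));

So a block device 8:1, for example, is stored as the eight bytes
"IntxBLK\0" followed by little-endian 64-bit major 8 and minor 1.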
 struct smb_version_operations smb20_operations = {
 	.compare_fids = smb2_compare_fids,
 	.setup_request = smb2_setup_request,
@@ -3223,6 +4447,7 @@
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = cifs_wait_mtu_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3298,12 +4523,14 @@
 	.query_all_EAs = smb2_query_eas,
 	.set_EA = smb2_set_ea,
 #endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
 	.get_acl = get_smb2_acl,
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
 	.next_header = smb2_next_header,
+	.ioctl_query_info = smb2_ioctl_query_info,
+	.make_node = smb2_make_node,
+	.fiemap = smb3_fiemap,
+	.llseek = smb3_llseek,
 };
 
 struct smb_version_operations smb21_operations = {
@@ -3316,7 +4543,9 @@
 	.get_credits_field = smb2_get_credits_field,
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.adjust_credits = smb2_adjust_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3327,7 +4556,7 @@
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -3393,12 +4622,14 @@
 	.query_all_EAs = smb2_query_eas,
 	.set_EA = smb2_set_ea,
 #endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
 	.get_acl = get_smb2_acl,
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
 	.next_header = smb2_next_header,
+	.ioctl_query_info = smb2_ioctl_query_info,
+	.make_node = smb2_make_node,
+	.fiemap = smb3_fiemap,
+	.llseek = smb3_llseek,
 };
 
 struct smb_version_operations smb30_operations = {
@@ -3411,7 +4642,9 @@
 	.get_credits_field = smb2_get_credits_field,
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.adjust_credits = smb2_adjust_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3423,11 +4656,11 @@
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
-	.negotiate_wsize = smb2_negotiate_wsize,
-	.negotiate_rsize = smb2_negotiate_rsize,
+	.negotiate_wsize = smb3_negotiate_wsize,
+	.negotiate_rsize = smb3_negotiate_rsize,
 	.sess_setup = SMB2_sess_setup,
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
@@ -3497,12 +4730,14 @@
 	.query_all_EAs = smb2_query_eas,
 	.set_EA = smb2_set_ea,
 #endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
 	.get_acl = get_smb2_acl,
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
 	.next_header = smb2_next_header,
+	.ioctl_query_info = smb2_ioctl_query_info,
+	.make_node = smb2_make_node,
+	.fiemap = smb3_fiemap,
+	.llseek = smb3_llseek,
 };
 
 struct smb_version_operations smb311_operations = {
@@ -3515,7 +4750,9 @@
 	.get_credits_field = smb2_get_credits_field,
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.adjust_credits = smb2_adjust_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3527,11 +4764,11 @@
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
-	.negotiate_wsize = smb2_negotiate_wsize,
-	.negotiate_rsize = smb2_negotiate_rsize,
+	.negotiate_wsize = smb3_negotiate_wsize,
+	.negotiate_rsize = smb3_negotiate_rsize,
 	.sess_setup = SMB2_sess_setup,
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
@@ -3602,12 +4839,14 @@
 	.query_all_EAs = smb2_query_eas,
 	.set_EA = smb2_set_ea,
 #endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
 	.get_acl = get_smb2_acl,
 	.get_acl_by_fid = get_smb2_acl_by_fid,
 	.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
 	.next_header = smb2_next_header,
+	.ioctl_query_info = smb2_ioctl_query_info,
+	.make_node = smb2_make_node,
+	.fiemap = smb3_fiemap,
+	.llseek = smb3_llseek,
 };
 
 struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f54d07b..0514986 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -50,6 +50,9 @@
 #include "cifs_spnego.h"
 #include "smbdirect.h"
 #include "trace.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
 
 /*
  *  The following table defines the expected "StructureSize" of SMB2 requests
@@ -104,13 +107,13 @@
 		struct TCP_Server_Info *server = tcon->ses->server;
 
 		spin_lock(&server->req_lock);
-		/* Request up to 2 credits but don't go over the limit. */
+		/* Request up to 10 credits but don't go over the limit. */
 		if (server->credits >= server->max_credits)
 			shdr->CreditRequest = cpu_to_le16(0);
 		else
 			shdr->CreditRequest = cpu_to_le16(
 				min_t(int, server->max_credits -
-						server->credits, 2));
+						server->credits, 10));
 		spin_unlock(&server->req_lock);
 	} else {
 		shdr->CreditRequest = cpu_to_le16(2);
@@ -152,6 +155,86 @@
 	return;
 }
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int __smb2_reconnect(const struct nls_table *nlsc,
+			    struct cifs_tcon *tcon)
+{
+	int rc;
+	struct dfs_cache_tgt_list tl;
+	struct dfs_cache_tgt_iterator *it = NULL;
+	char *tree;
+	const char *tcp_host;
+	size_t tcp_host_len;
+	const char *dfs_host;
+	size_t dfs_host_len;
+
+	tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+	if (!tree)
+		return -ENOMEM;
+
+	if (tcon->ipc) {
+		scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+			  tcon->ses->server->hostname);
+		rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+		goto out;
+	}
+
+	if (!tcon->dfs_path) {
+		rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+		goto out;
+	}
+
+	rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
+	if (rc)
+		goto out;
+
+	extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
+			     &tcp_host_len);
+
+	for (it = dfs_cache_get_tgt_iterator(&tl); it;
+	     it = dfs_cache_get_next_tgt(&tl, it)) {
+		const char *tgt = dfs_cache_get_tgt_name(it);
+
+		extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+
+		if (dfs_host_len != tcp_host_len ||
+		    strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+			cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s\n",
+				 __func__,
+				 (int)dfs_host_len, dfs_host,
+				 (int)tcp_host_len, tcp_host);
+			continue;
+		}
+
+		scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+
+		rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+		if (!rc)
+			break;
+		if (rc == -EREMOTE)
+			break;
+	}
+
+	if (!rc) {
+		if (it)
+			rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1,
+							    it);
+		else
+			rc = -ENOENT;
+	}
+	dfs_cache_free_tgts(&tl);
+out:
+	kfree(tree);
+	return rc;
+}
+#else
+static inline int __smb2_reconnect(const struct nls_table *nlsc,
+				   struct cifs_tcon *tcon)
+{
+	return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+}
+#endif
+
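The host comparison in the loop needs only the server component of each
UNC target. extract_unc_hostname() (defined in misc.c) behaves roughly
like the sketch below; the helper name unc_hostname here is hypothetical,
and the kernel's version also accepts forward slashes:

#include <stdio.h>
#include <string.h>

/* Hedged sketch: pull the host out of a UNC path like \\srv\share\path. */
static void unc_hostname(const char *unc, const char **host, size_t *len)
{
	while (*unc == '\\')
		unc++;				/* skip the "\\" prefix */
	*host = unc;
	*len = strcspn(unc, "\\");		/* stop before "\share\..." */
}

int main(void)
{
	const char *h;
	size_t l;

	unc_hostname("\\\\srv1\\dfsroot\\link", &h, &l);
	printf("host: %.*s\n", (int)l, h);	/* prints "host: srv1" */
	return 0;
}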
 static int
 smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
 {
@@ -159,6 +242,7 @@
 	struct nls_table *nls_codepage;
 	struct cifs_ses *ses;
 	struct TCP_Server_Info *server;
+	int retries;
 
 	/*
 	 * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -168,7 +252,7 @@
 	if (tcon == NULL)
 		return 0;
 
-	if (smb2_command == SMB2_TREE_CONNECT)
+	if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
 		return 0;
 
 	if (tcon->tidStatus == CifsExiting) {
@@ -192,9 +276,12 @@
 	ses = tcon->ses;
 	server = ses->server;
 
+	retries = server->nr_targets;
+
 	/*
-	 * Give demultiplex thread up to 10 seconds to reconnect, should be
-	 * greater than cifs socket timeout which is 7 seconds
+	 * Give the demultiplex thread up to 10 seconds for each target
+	 * available for reconnect -- this should be greater than the cifs
+	 * socket timeout, which is 7 seconds.
 	 */
 	while (server->tcpStatus == CifsNeedReconnect) {
 		/*
@@ -225,6 +312,9 @@
 		if (server->tcpStatus != CifsNeedReconnect)
 			break;
 
+		if (--retries)
+			continue;
+
 		/*
 		 * on "soft" mounts we wait once. Hard mounts keep
 		 * retrying until process is killed or server comes
@@ -234,6 +324,7 @@
 			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
 			return -EHOSTDOWN;
 		}
+		retries = server->nr_targets;
 	}
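
The retries counter introduced above budgets one 10-second reconnect wait per DFS target (server->nr_targets): the soft-mount give-up check only runs once the countdown reaches zero, after which the budget is refilled and hard mounts keep waiting. A sketch of that countdown shape (iteration counts are illustrative):

#include <stdio.h>

/*
 * Sketch of the per-target retry budget above: the give-up check only
 * fires when the countdown hits zero, then the budget is refilled
 * (the hard-mount case) and waiting continues.
 */
int main(void)
{
	int nr_targets = 3;
	int retries = nr_targets;

	for (int attempt = 1; attempt <= 7; attempt++) {
		if (--retries)
			continue;	/* budget left: wait again */
		printf("give-up check after attempt %d\n", attempt);
		retries = nr_targets;
	}
	return 0;	/* prints after attempts 3 and 6 */
}
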
 
 	if (!tcon->ses->need_reconnect && !tcon->need_reconnect)
@@ -271,7 +362,7 @@
 	if (tcon->use_persistent)
 		tcon->need_reopen_files = true;
 
-	rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
+	rc = __smb2_reconnect(nls_codepage, tcon);
 	mutex_unlock(&tcon->ses->session_mutex);
 
 	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
@@ -368,14 +459,7 @@
 	return rc;
 }
 
-
-/* offset is sizeof smb2_negotiate_req but rounded up to 8 bytes */
-#define OFFSET_OF_NEG_CONTEXT 0x68  /* sizeof(struct smb2_negotiate_req) */
-
-
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES	cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES		cpu_to_le16(2)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE		cpu_to_le16(0x100)
+/* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */
 
 static void
 build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
@@ -389,13 +473,40 @@
 }
 
 static void
+build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt)
+{
+	pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES;
+	pneg_ctxt->DataLength =
+		cpu_to_le16(sizeof(struct smb2_compression_capabilities_context)
+			  - sizeof(struct smb2_neg_context));
+	pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(3);
+	pneg_ctxt->CompressionAlgorithms[0] = SMB3_COMPRESS_LZ77;
+	pneg_ctxt->CompressionAlgorithms[1] = SMB3_COMPRESS_LZ77_HUFF;
+	pneg_ctxt->CompressionAlgorithms[2] = SMB3_COMPRESS_LZNT1;
+}
+
+static void
 build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
 {
 	pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
-	pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */
-	pneg_ctxt->CipherCount = cpu_to_le16(1);
-/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */
-	pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM;
+	pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */
+	pneg_ctxt->CipherCount = cpu_to_le16(2);
+	pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+	pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+}
+
+static unsigned int
+build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
+{
+	struct nls_table *cp = load_nls_default();
+
+	pneg_ctxt->ContextType = SMB2_NETNAME_NEGOTIATE_CONTEXT_ID;
+
+	/* copy up to max of first 100 bytes of server name to NetName field */
+	pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
+	/* context size is DataLength + minimal smb2_neg_context */
+	return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
+			sizeof(struct smb2_neg_context), 8) * 8;
 }
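
build_netname_ctxt() returns the context's on-the-wire size rounded up to the 8-byte alignment SMB2 negotiate contexts require, so the next context starts aligned. A worked sketch of the computation (the 8-byte header size is that of the kernel's struct smb2_neg_context: ContextType, DataLength, Reserved):

#include <stdio.h>

/* ContextType + DataLength + Reserved of struct smb2_neg_context */
#define NEG_CTXT_HDR_SIZE 8

/*
 * Sketch of the size computation above: UTF-16 name bytes plus the
 * context header, rounded up so the next context stays 8-aligned.
 */
static unsigned int netname_ctxt_len(unsigned int name_chars)
{
	unsigned int len = 2 * name_chars + NEG_CTXT_HDR_SIZE;

	return (len + 7) / 8 * 8;	/* DIV_ROUND_UP(len, 8) * 8 */
}

int main(void)
{
	/* "srv1" -> 16 (no pad needed); "files" -> 18 padded up to 24 */
	printf("%u %u\n", netname_ctxt_len(4), netname_ctxt_len(5));
	return 0;
}
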
 
 static void
@@ -403,16 +514,47 @@
 {
 	pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
 	pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+	/* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+	pneg_ctxt->Name[0] = 0x93;
+	pneg_ctxt->Name[1] = 0xAD;
+	pneg_ctxt->Name[2] = 0x25;
+	pneg_ctxt->Name[3] = 0x50;
+	pneg_ctxt->Name[4] = 0x9C;
+	pneg_ctxt->Name[5] = 0xB4;
+	pneg_ctxt->Name[6] = 0x11;
+	pneg_ctxt->Name[7] = 0xE7;
+	pneg_ctxt->Name[8] = 0xB4;
+	pneg_ctxt->Name[9] = 0x23;
+	pneg_ctxt->Name[10] = 0x83;
+	pneg_ctxt->Name[11] = 0xDE;
+	pneg_ctxt->Name[12] = 0x96;
+	pneg_ctxt->Name[13] = 0x8B;
+	pneg_ctxt->Name[14] = 0xCD;
+	pneg_ctxt->Name[15] = 0x7C;
 }
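
The sixteen Name[] stores above spell out the SMB2_CREATE_TAG_POSIX GUID byte by byte. A hypothetical equivalent keeps the tag in a table and installs it with one memcpy() (the byte values are copied from the hunk above; the helper name is illustrative):

#include <stdint.h>
#include <string.h>

/* Hypothetical condensation of the per-byte stores above. */
static const uint8_t smb2_posix_tag[16] = {
	0x93, 0xAD, 0x25, 0x50, 0x9C, 0xB4, 0x11, 0xE7,
	0xB4, 0x23, 0x83, 0xDE, 0x96, 0x8B, 0xCD, 0x7C,
};

static void fill_posix_name(uint8_t name[16])
{
	memcpy(name, smb2_posix_tag, sizeof(smb2_posix_tag));
}
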
 
 static void
 assemble_neg_contexts(struct smb2_negotiate_req *req,
-		      unsigned int *total_len)
+		      struct TCP_Server_Info *server, unsigned int *total_len)
 {
-	char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
+	char *pneg_ctxt = (char *)req;
 	unsigned int ctxt_len;
 
-	*total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */
+	if (*total_len > 200) {
+		/* In case the length is corrupted, don't overrun the smb buffer */
+		cifs_server_dbg(VFS, "Bad frame length assembling neg contexts\n");
+		return;
+	}
+
+	/*
+	 * round up total_len of fixed part of SMB3 negotiate request to 8
+	 * byte boundary before adding negotiate contexts
+	 */
+	*total_len = roundup(*total_len, 8);
+
+	pneg_ctxt = (*total_len) + (char *)req;
+	req->NegotiateContextOffset = cpu_to_le32(*total_len);
+
 	build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
 	ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
 	*total_len += ctxt_len;
@@ -423,11 +565,25 @@
 	*total_len += ctxt_len;
 	pneg_ctxt += ctxt_len;
 
+	if (server->compress_algorithm) {
+		build_compression_ctxt((struct smb2_compression_capabilities_context *)
+				pneg_ctxt);
+		ctxt_len = DIV_ROUND_UP(
+			sizeof(struct smb2_compression_capabilities_context),
+				8) * 8;
+		*total_len += ctxt_len;
+		pneg_ctxt += ctxt_len;
+		req->NegotiateContextCount = cpu_to_le16(5);
+	} else
+		req->NegotiateContextCount = cpu_to_le16(4);
+
+	ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+					server->hostname);
+	*total_len += ctxt_len;
+	pneg_ctxt += ctxt_len;
+
 	build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
 	*total_len += sizeof(struct smb2_posix_neg_context);
-
-	req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
-	req->NegotiateContextCount = cpu_to_le16(3);
 }
 
 static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
@@ -445,6 +601,27 @@
 		printk_once(KERN_WARNING "unknown SMB3 hash algorithm\n");
 }
 
+static void decode_compress_ctx(struct TCP_Server_Info *server,
+			 struct smb2_compression_capabilities_context *ctxt)
+{
+	unsigned int len = le16_to_cpu(ctxt->DataLength);
+
+	/* minimum valid payload: a one-element compression capability struct */
+	if (len < 10) {
+		printk_once(KERN_WARNING "server sent bad compression context\n");
+		return;
+	}
+	if (le16_to_cpu(ctxt->CompressionAlgorithmCount) != 1) {
+		printk_once(KERN_WARNING "illegal SMB3 compress algorithm count\n");
+		return;
+	}
+	if (le16_to_cpu(ctxt->CompressionAlgorithms[0]) > 3) {
+		printk_once(KERN_WARNING "unknown compression algorithm\n");
+		return;
+	}
+	server->compress_algorithm = ctxt->CompressionAlgorithms[0];
+}
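
The len < 10 floor in decode_compress_ctx() matches the smallest valid payload: CompressionAlgorithmCount (2 bytes) + Padding (2) + the Flags dword (4, named Reserved1 in the kernel struct) + one algorithm id (2) = 10 bytes. A compile-time restatement of that arithmetic (field sizes spelled out as an assumption mirrored from the struct layout):

#include <assert.h>

/* Restatement of the len < 10 floor above. */
enum {
	CCTX_COUNT_SZ = 2,	/* CompressionAlgorithmCount */
	CCTX_PAD_SZ   = 2,	/* Padding */
	CCTX_FLAGS_SZ = 4,	/* Flags / Reserved1 */
	CCTX_ALG_SZ   = 2,	/* one CompressionAlgorithms[] entry */
};

static_assert(CCTX_COUNT_SZ + CCTX_PAD_SZ + CCTX_FLAGS_SZ + CCTX_ALG_SZ == 10,
	      "smallest valid compression context payload is 10 bytes");
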
+
 static int decode_encrypt_ctx(struct TCP_Server_Info *server,
 			      struct smb2_encryption_neg_context *ctxt)
 {
@@ -483,7 +660,7 @@
 
 	cifs_dbg(FYI, "decoding %d negotiate contexts\n", ctxt_cnt);
 	if (len_of_smb <= offset) {
-		cifs_dbg(VFS, "Invalid response: negotiate context offset\n");
+		cifs_server_dbg(VFS, "Invalid response: negotiate context offset\n");
 		return -EINVAL;
 	}
 
@@ -509,10 +686,13 @@
 		else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES)
 			rc = decode_encrypt_ctx(server,
 				(struct smb2_encryption_neg_context *)pctx);
+		else if (pctx->ContextType == SMB2_COMPRESSION_CAPABILITIES)
+			decode_compress_ctx(server,
+				(struct smb2_compression_capabilities_context *)pctx);
 		else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE)
 			server->posix_ext_supported = true;
 		else
-			cifs_dbg(VFS, "unknown negcontext of type %d ignored\n",
+			cifs_server_dbg(VFS, "unknown negcontext of type %d ignored\n",
 				le16_to_cpu(pctx->ContextType));
 
 		if (rc)
@@ -571,6 +751,8 @@
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_posix_buf(mode);
+	if (mode == ACL_NO_MODE)
+		cifs_dbg(FYI, "illegal mode\n");
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_posix);
@@ -637,13 +819,14 @@
 		req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
 		req->DialectCount = cpu_to_le16(2);
 		total_len += 4;
-	} else if (strcmp(ses->server->vals->version_string,
+	} else if (strcmp(server->vals->version_string,
 		   SMBDEFAULT_VERSION_STRING) == 0) {
 		req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
 		req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
 		req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
-		req->DialectCount = cpu_to_le16(3);
-		total_len += 6;
+		req->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
+		req->DialectCount = cpu_to_le16(4);
+		total_len += 8;
 	} else {
 		/* otherwise send specific dialect */
 		req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
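
With SMB3.1.1 added to the default negotiate set above, the request now carries four __le16 dialect ids, so total_len grows by 8 bytes (4 x 2) where it previously grew by 6. A sketch of that sizing (dialect id values per MS-SMB2):

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the dialect-array sizing above: total_len grows by one
 * __le16 per dialect, so four dialects add 8 bytes.
 */
int main(void)
{
	const uint16_t dialects[] = { 0x0210, 0x0300, 0x0302, 0x0311 };

	printf("DialectCount=%zu extra_bytes=%zu\n",
	       sizeof(dialects) / sizeof(dialects[0]),
	       sizeof(dialects));	/* -> 4 and 8 */
	return 0;
}
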
@@ -659,16 +842,18 @@
 	else
 		req->SecurityMode = 0;
 
-	req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
+	req->Capabilities = cpu_to_le32(server->vals->req_capabilities);
 
 	/* ClientGUID must be zero for SMB2.02 dialect */
-	if (ses->server->vals->protocol_id == SMB20_PROT_ID)
+	if (server->vals->protocol_id == SMB20_PROT_ID)
 		memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
 	else {
 		memcpy(req->ClientGUID, server->client_guid,
 			SMB2_CLIENT_GUID_SIZE);
-		if (ses->server->vals->protocol_id == SMB311_PROT_ID)
-			assemble_neg_contexts(req, &total_len);
+		if ((server->vals->protocol_id == SMB311_PROT_ID) ||
+		    (strcmp(server->vals->version_string,
+		     SMBDEFAULT_VERSION_STRING) == 0))
+			assemble_neg_contexts(req, server, &total_len);
 	}
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
@@ -685,38 +870,42 @@
 	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
 	 */
 	if (rc == -EOPNOTSUPP) {
-		cifs_dbg(VFS, "Dialect not supported by server. Consider "
+		cifs_server_dbg(VFS, "Dialect not supported by server. Consider "
 			"specifying vers=1.0 or vers=2.0 on mount for accessing"
 			" older servers\n");
 		goto neg_exit;
 	} else if (rc != 0)
 		goto neg_exit;
 
-	if (strcmp(ses->server->vals->version_string,
+	if (strcmp(server->vals->version_string,
 		   SMB3ANY_VERSION_STRING) == 0) {
 		if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
-			cifs_dbg(VFS,
+			cifs_server_dbg(VFS,
 				"SMB2 dialect returned but not requested\n");
 			return -EIO;
 		} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
-			cifs_dbg(VFS,
+			cifs_server_dbg(VFS,
 				"SMB2.1 dialect returned but not requested\n");
 			return -EIO;
 		}
-	} else if (strcmp(ses->server->vals->version_string,
+	} else if (strcmp(server->vals->version_string,
 		   SMBDEFAULT_VERSION_STRING) == 0) {
 		if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
-			cifs_dbg(VFS,
+			cifs_server_dbg(VFS,
 				"SMB2 dialect returned but not requested\n");
 			return -EIO;
 		} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
 			/* ops were set to 3.0 by default for the "default" dialect case, so update */
-			ses->server->ops = &smb21_operations;
+			server->ops = &smb21_operations;
+			server->vals = &smb21_values;
+		} else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
+			server->ops = &smb311_operations;
+			server->vals = &smb311_values;
 		}
 	} else if (le16_to_cpu(rsp->DialectRevision) !=
-				ses->server->vals->protocol_id) {
+				server->vals->protocol_id) {
 		/* if requested single dialect ensure returned dialect matched */
-		cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n",
+		cifs_server_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n",
 			le16_to_cpu(rsp->DialectRevision));
 		return -EIO;
 	}
@@ -734,7 +923,7 @@
 	else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
 		cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n");
 	else {
-		cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n",
+		cifs_server_dbg(VFS, "Illegal dialect returned by server 0x%x\n",
 			 le16_to_cpu(rsp->DialectRevision));
 		rc = -EIO;
 		goto neg_exit;
@@ -794,7 +983,7 @@
 			rc = smb311_decode_neg_context(rsp, server,
 						       rsp_iov.iov_len);
 		else
-			cifs_dbg(VFS, "Missing expected negotiate contexts\n");
+			cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
 	}
 neg_exit:
 	free_rsp_buf(resp_buftype, rsp);
@@ -808,11 +997,12 @@
 	struct validate_negotiate_info_rsp *pneg_rsp = NULL;
 	u32 rsplen;
 	u32 inbuflen; /* max of 4 dialects */
+	struct TCP_Server_Info *server = tcon->ses->server;
 
 	cifs_dbg(FYI, "validate negotiate\n");
 
 	/* In SMB3.11 preauth integrity supersedes validate negotiate */
-	if (tcon->ses->server->dialect == SMB311_PROT_ID)
+	if (server->dialect == SMB311_PROT_ID)
 		return 0;
 
 	/*
@@ -831,15 +1021,15 @@
 	}
 
 	if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
-		cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n");
+		cifs_tcon_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n");
 
 	pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS);
 	if (!pneg_inbuf)
 		return -ENOMEM;
 
 	pneg_inbuf->Capabilities =
-			cpu_to_le32(tcon->ses->server->vals->req_capabilities);
-	memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid,
+			cpu_to_le32(server->vals->req_capabilities);
+	memcpy(pneg_inbuf->Guid, server->client_guid,
 					SMB2_CLIENT_GUID_SIZE);
 
 	if (tcon->ses->sign)
@@ -852,26 +1042,27 @@
 		pneg_inbuf->SecurityMode = 0;
 
 
-	if (strcmp(tcon->ses->server->vals->version_string,
+	if (strcmp(server->vals->version_string,
 		SMB3ANY_VERSION_STRING) == 0) {
 		pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
 		pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
 		pneg_inbuf->DialectCount = cpu_to_le16(2);
 		/* structure is big enough for 4 dialects, sending only 2 */
 		inbuflen = sizeof(*pneg_inbuf) -
-				sizeof(pneg_inbuf->Dialects[0]);
-	} else if (strcmp(tcon->ses->server->vals->version_string,
+				(2 * sizeof(pneg_inbuf->Dialects[0]));
+	} else if (strcmp(server->vals->version_string,
 		SMBDEFAULT_VERSION_STRING) == 0) {
 		pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
 		pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
 		pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
-		pneg_inbuf->DialectCount = cpu_to_le16(3);
+		pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
+		pneg_inbuf->DialectCount = cpu_to_le16(4);
 		/* structure is big enough for 4 dialects */
 		inbuflen = sizeof(*pneg_inbuf);
 	} else {
 		/* otherwise specific dialect was requested */
 		pneg_inbuf->Dialects[0] =
-			cpu_to_le16(tcon->ses->server->vals->protocol_id);
+			cpu_to_le16(server->vals->protocol_id);
 		pneg_inbuf->DialectCount = cpu_to_le16(1);
 		/* structure is big enough for 4 dialects, sending only 1 */
 		inbuflen = sizeof(*pneg_inbuf) -
@@ -880,17 +1071,25 @@
 
 	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
 		FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
-		(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
-	if (rc != 0) {
-		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+		(char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
+		(char **)&pneg_rsp, &rsplen);
+	if (rc == -EOPNOTSUPP) {
+		/*
+		 * Old Windows versions and some Netapp SMB servers can return
+		 * a "not supported" error; the client should accept it.
+		 */
+		cifs_tcon_dbg(VFS, "Server does not support validate negotiate\n");
+		rc = 0;
+		goto out_free_inbuf;
+	} else if (rc != 0) {
+		cifs_tcon_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
 		rc = -EIO;
 		goto out_free_inbuf;
 	}
 
 	rc = -EIO;
 	if (rsplen != sizeof(*pneg_rsp)) {
-		cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
+		cifs_tcon_dbg(VFS, "invalid protocol negotiate response size: %d\n",
 			 rsplen);
 
 		/* relax check since Mac returns max bufsize allowed on ioctl */
@@ -899,16 +1098,16 @@
 	}
 
 	/* check validate negotiate info response matches what we got earlier */
-	if (pneg_rsp->Dialect != cpu_to_le16(tcon->ses->server->dialect))
+	if (pneg_rsp->Dialect != cpu_to_le16(server->dialect))
 		goto vneg_out;
 
-	if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+	if (pneg_rsp->SecurityMode != cpu_to_le16(server->sec_mode))
 		goto vneg_out;
 
 	/* do not validate server guid because not saved at negprot time yet */
 
 	if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
-	      SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+	      SMB2_LARGE_FILES) != server->capabilities)
 		goto vneg_out;
 
 	/* validate negotiate successful */
@@ -917,7 +1116,7 @@
 	goto out_free_rsp;
 
 vneg_out:
-	cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+	cifs_tcon_dbg(VFS, "protocol revalidation - security settings mismatch\n");
 out_free_rsp:
 	kfree(pneg_rsp);
 out_free_inbuf:
@@ -999,7 +1198,12 @@
 	else
 		req->SecurityMode = 0;
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	req->Capabilities = cpu_to_le32(SMB2_GLOBAL_CAP_DFS);
+#else
 	req->Capabilities = 0;
+#endif /* DFS_UPCALL */
+
 	req->Channel = 0; /* MBZ */
 
 	sess_data->iov[0].iov_base = (char *)req;
@@ -1366,7 +1570,7 @@
 		sess_data->func(sess_data);
 
 	if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign))
-		cifs_dbg(VFS, "signing requested but authenticated as guest\n");
+		cifs_server_dbg(VFS, "signing requested but authenticated as guest\n");
 	rc = sess_data->result;
 out:
 	kfree(sess_data);
@@ -1409,7 +1613,7 @@
 	else if (server->sign)
 		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
-	flags |= CIFS_NO_RESP;
+	flags |= CIFS_NO_RSP_BUF;
 
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
@@ -1459,10 +1663,11 @@
 	__le16 *unc_path = NULL;
 	int flags = 0;
 	unsigned int total_len;
+	struct TCP_Server_Info *server = ses->server;
 
 	cifs_dbg(FYI, "TCON\n");
 
-	if (!(ses->server) || !tree)
+	if (!server || !tree)
 		return -EIO;
 
 	unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
@@ -1478,7 +1683,7 @@
 
 	/* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
 	tcon->tid = 0;
-
+	atomic_set(&tcon->num_remote_opens, 0);
 	rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req,
 			     &total_len);
 	if (rc) {
@@ -1500,19 +1705,29 @@
 	iov[1].iov_base = unc_path;
 	iov[1].iov_len = unc_path_len;
 
-	/* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
-	if ((ses->server->dialect == SMB311_PROT_ID) &&
-	    !smb3_encryption_required(tcon))
+	/*
+	 * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
+	 * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+	 * (Samba servers don't always set the flag so also check if null user)
+	 */
+	if ((server->dialect == SMB311_PROT_ID) &&
+	    !smb3_encryption_required(tcon) &&
+	    !(ses->session_flags &
+		    (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
+	    ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
 		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	memset(&rqst, 0, sizeof(struct smb_rqst));
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 2;
 
+	/* Need 64 credits for max size write; ask for more in case not there yet */
+	req->sync_hdr.CreditRequest = cpu_to_le16(64);
+
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
 	cifs_small_buf_release(req);
 	rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
-
+	trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc);
 	if (rc != 0) {
 		if (tcon) {
 			cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
@@ -1534,7 +1749,7 @@
 		cifs_dbg(FYI, "connection to printer\n");
 		break;
 	default:
-		cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
+		cifs_server_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
 		rc = -EOPNOTSUPP;
 		goto tcon_error_exit;
 	}
@@ -1549,23 +1764,24 @@
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
-		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
+		cifs_tcon_dbg(VFS, "DFS capability contradicts DFS flag\n");
 
 	if (tcon->seal &&
-	    !(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
-		cifs_dbg(VFS, "Encryption is requested but not supported\n");
+	    !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+		cifs_tcon_dbg(VFS, "Encryption is requested but not supported\n");
 
 	init_copy_chunk_defaults(tcon);
-	if (tcon->ses->server->ops->validate_negotiate)
-		rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
+	if (server->ops->validate_negotiate)
+		rc = server->ops->validate_negotiate(xid, tcon);
 tcon_exit:
+
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(unc_path);
 	return rc;
 
 tcon_error_exit:
 	if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
-		cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+		cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
 	}
 	goto tcon_exit;
 }
@@ -1599,7 +1815,7 @@
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
-	flags |= CIFS_NO_RESP;
+	flags |= CIFS_NO_RSP_BUF;
 
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
@@ -1665,9 +1881,21 @@
 	return buf;
 }
 
-static __u8
-parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
-		  unsigned int *epoch, char *lease_key)
+static void
+parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf)
+{
+	struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc;
+
+	cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n",
+		pdisk_id->DiskFileId, pdisk_id->VolumeId);
+	buf->IndexNumber = pdisk_id->DiskFileId;
+}
+
+void
+smb2_parse_contexts(struct TCP_Server_Info *server,
+		       struct smb2_create_rsp *rsp,
+		       unsigned int *epoch, char *lease_key, __u8 *oplock,
+		       struct smb2_file_all_info *buf)
 {
 	char *data_offset;
 	struct create_context *cc;
@@ -1675,15 +1903,24 @@
 	unsigned int remaining;
 	char *name;
 
+	*oplock = 0;
 	data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
 	remaining = le32_to_cpu(rsp->CreateContextsLength);
 	cc = (struct create_context *)data_offset;
+
+	/* Initialize inode number to 0 in case no valid data in qfid context */
+	if (buf)
+		buf->IndexNumber = 0;
+
 	while (remaining >= sizeof(struct create_context)) {
 		name = le16_to_cpu(cc->NameOffset) + (char *)cc;
 		if (le16_to_cpu(cc->NameLength) == 4 &&
-		    strncmp(name, "RqLs", 4) == 0)
-			return server->ops->parse_lease_buf(cc, epoch,
-							    lease_key);
+		    strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4) == 0)
+			*oplock = server->ops->parse_lease_buf(cc, epoch,
+							   lease_key);
+		else if (buf && (le16_to_cpu(cc->NameLength) == 4) &&
+		    strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0)
+			parse_query_id_ctxt(cc, buf);
 
 		next = le32_to_cpu(cc->Next);
 		if (!next)
@@ -1692,7 +1929,10 @@
 		cc = (struct create_context *)((char *)cc + next);
 	}
 
-	return 0;
+	if (rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+		*oplock = rsp->OplockLevel;
+
+	return;
 }
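
smb2_parse_contexts() generalizes the old lease-only parser: it walks the response's create-context chain through each context's relative Next offset, picks up both the lease ("RqLs") and query-on-disk-id ("QFid") contexts, and falls back to rsp->OplockLevel when no lease context was returned. A standalone sketch of the chain walk (the real wire header carries more fields; like the loop above, this trusts Next, and a hardened parser would also bound-check it against the remaining length):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the create-context chain walk above: contexts are linked
 * by a relative Next offset, and Next == 0 ends the chain.
 */
struct ctx_hdr {			/* wire header, simplified */
	uint32_t next;
	uint16_t name_offset;
	uint16_t name_length;
};

static const struct ctx_hdr *find_ctx(const char *buf, size_t remaining,
				      const char *tag)
{
	const struct ctx_hdr *cc = (const struct ctx_hdr *)buf;

	while (remaining >= sizeof(*cc)) {
		const char *name = (const char *)cc + cc->name_offset;

		if (cc->name_length == 4 && memcmp(name, tag, 4) == 0)
			return cc;
		if (!cc->next)
			break;
		remaining -= cc->next;
		cc = (const struct ctx_hdr *)((const char *)cc + cc->next);
	}
	return NULL;
}
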
 
 static int
@@ -1718,8 +1958,9 @@
 }
 
 static struct create_durable_v2 *
-create_durable_v2_buf(struct cifs_fid *pfid)
+create_durable_v2_buf(struct cifs_open_parms *oparms)
 {
+	struct cifs_fid *pfid = oparms->fid;
 	struct create_durable_v2 *buf;
 
 	buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
@@ -1733,7 +1974,14 @@
 				(struct create_durable_v2, Name));
 	buf->ccontext.NameLength = cpu_to_le16(4);
 
-	buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+	/*
+	 * NB: Handle timeout defaults to 0, which allows server to choose
+	 * (most servers default to 120 seconds) and most clients default to 0.
+	 * This can be overridden at mount ("handletimeout=") if the user wants
+	 * a different persistent (or resilient) handle timeout for all opens
+	 * on a particular SMB3 mount.
+	 */
+	buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
 	buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
 	generate_random_uuid(buf->dcontext.CreateGuid);
 	memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
@@ -1786,7 +2034,7 @@
 	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
-	iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+	iov[num].iov_base = create_durable_v2_buf(oparms);
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_durable_v2);
@@ -1901,6 +2149,48 @@
 	return 0;
 }
 
+static struct crt_query_id_ctxt *
+create_query_id_buf(void)
+{
+	struct crt_query_id_ctxt *buf;
+
+	buf = kzalloc(sizeof(struct crt_query_id_ctxt), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(0);
+	buf->ccontext.DataLength = cpu_to_le32(0);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct crt_query_id_ctxt, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	/* SMB2_CREATE_QUERY_ON_DISK_ID is "QFid" */
+	buf->Name[0] = 'Q';
+	buf->Name[1] = 'F';
+	buf->Name[2] = 'i';
+	buf->Name[3] = 'd';
+	return buf;
+}
+
+/* See MS-SMB2 2.2.13.2.9 */
+static int
+add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	iov[num].iov_base = create_query_id_buf();
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct crt_query_id_ctxt);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset = cpu_to_le32(
+				sizeof(struct smb2_create_req) +
+				iov[num - 1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt));
+	*num_iovec = num + 1;
+	return 0;
+}
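
add_query_id_context() follows the same convention as the other add_*_context() helpers: append one iovec per context, set CreateContextsOffset only for the first context in the chain, and grow CreateContextsLength by each context's size. A sketch of that accounting (struct and helper names simplified, not the kernel's):

#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the create-context accounting above: the offset is set
 * once, for the first context appended; the length grows per context.
 */
struct create_req_sketch {
	uint32_t contexts_offset;
	uint32_t contexts_length;
};

static void account_context(struct create_req_sketch *req,
			    size_t fixed_req_size, size_t prev_iov_len,
			    size_t ctxt_size)
{
	if (!req->contexts_offset)	/* first context in the chain */
		req->contexts_offset =
			(uint32_t)(fixed_req_size + prev_iov_len);
	req->contexts_length += (uint32_t)ctxt_size;
}
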
+
 static int
 alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
 			    const char *treename, const __le16 *path)
@@ -1954,7 +2244,6 @@
 	struct smb_rqst rqst;
 	struct smb2_create_req *req;
 	struct smb2_create_rsp *rsp = NULL;
-	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[3]; /* make sure at least one for each open context */
 	struct kvec rsp_iov = {NULL, 0};
@@ -1977,9 +2266,7 @@
 	if (!utf16_path)
 		return -ENOMEM;
 
-	if (ses && (ses->server))
-		server = ses->server;
-	else {
+	if (!ses || !(ses->server)) {
 		rc = -EIO;
 		goto err_free_path;
 	}
@@ -2067,6 +2354,9 @@
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = n_iov;
 
+	/* no need to inc num_remote_opens because we close it just below */
+	trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
+				    FILE_WRITE_ATTRIBUTES);
 	/* resource #4: response buffer */
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
 	if (rc) {
@@ -2129,6 +2419,7 @@
 	/* File attributes ignored on open (used in create though) */
 	req->FileAttributes = cpu_to_le32(file_attributes);
 	req->ShareAccess = FILE_SHARE_ALL_LE;
+
 	req->CreateDisposition = cpu_to_le32(oparms->disposition);
 	req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
 	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req));
@@ -2172,7 +2463,7 @@
 	iov[1].iov_len = uni_path_len;
 	iov[1].iov_base = path;
 
-	if (!server->oplocks)
+	if ((!server->oplocks) || (tcon->no_lease))
 		*oplock = SMB2_OPLOCK_LEVEL_NONE;
 
 	if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
@@ -2230,6 +2521,26 @@
 			return rc;
 	}
 
+	if ((oparms->disposition == FILE_CREATE) &&
+	    (oparms->mode != ACL_NO_MODE)) {
+		if (n_iov > 2) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[n_iov-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(iov[n_iov-1].iov_len);
+		}
+
+		/* rc = add_sd_context(iov, &n_iov, oparms->mode); */
+		if (rc)
+			return rc;
+	}
+
+	if (n_iov > 2) {
+		struct create_context *ccontext =
+			(struct create_context *)iov[n_iov-1].iov_base;
+		ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
+	}
+	add_query_id_context(iov, &n_iov);
 
 	rqst->rq_nvec = n_iov;
 	return 0;
@@ -2243,10 +2554,12 @@
 {
 	int i;
 
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base);
-	for (i = 1; i < rqst->rq_nvec; i++)
-		if (rqst->rq_iov[i].iov_base != smb2_padding)
-			kfree(rqst->rq_iov[i].iov_base);
+	if (rqst && rqst->rq_iov) {
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base);
+		for (i = 1; i < rqst->rq_nvec; i++)
+			if (rqst->rq_iov[i].iov_base != smb2_padding)
+				kfree(rqst->rq_iov[i].iov_base);
+	}
 }
 
 int
@@ -2261,7 +2574,7 @@
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[SMB2_CREATE_IOV_SIZE];
 	struct kvec rsp_iov = {NULL, 0};
-	int resp_buftype;
+	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 	int flags = 0;
 
@@ -2283,6 +2596,9 @@
 	if (rc)
 		goto creat_exit;
 
+	trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid,
+		oparms->create_options, oparms->desired_access);
+
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
 			    &rsp_iov);
 	rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -2297,14 +2613,23 @@
 		}
 		trace_smb3_open_err(xid, tcon->tid, ses->Suid,
 				    oparms->create_options, oparms->desired_access, rc);
+		if (rc == -EREMCHG) {
+			printk_once(KERN_WARNING "server share %s deleted\n",
+				    tcon->treeName);
+			tcon->need_reconnect = true;
+		}
 		goto creat_exit;
 	} else
 		trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
 				     ses->Suid, oparms->create_options,
 				     oparms->desired_access);
 
+	atomic_inc(&tcon->num_remote_opens);
 	oparms->fid->persistent_fid = rsp->PersistentFileId;
 	oparms->fid->volatile_fid = rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+	oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
 
 	if (buf) {
 		memcpy(buf, &rsp->CreationTime, 32);
@@ -2315,37 +2640,133 @@
 		buf->DeletePending = 0;
 	}
 
-	if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
-		*oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
-					    oparms->fid->lease_key);
-	else
-		*oplock = rsp->OplockLevel;
+
+	smb2_parse_contexts(server, rsp, &oparms->fid->epoch,
+			    oparms->fid->lease_key, oplock, buf);
 creat_exit:
 	SMB2_open_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
 }
 
+int
+SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+		u64 persistent_fid, u64 volatile_fid, u32 opcode,
+		bool is_fsctl, char *in_data, u32 indatalen,
+		__u32 max_response_size)
+{
+	struct smb2_ioctl_req *req;
+	struct kvec *iov = rqst->rq_iov;
+	unsigned int total_len;
+	int rc;
+	char *in_data_buf;
+
+	rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
+	if (rc)
+		return rc;
+
+	if (indatalen) {
+		/*
+		 * indatalen is usually small, a couple of bytes at most, so
+		 * just allocate through the generic pool
+		 */
+		in_data_buf = kmemdup(in_data, indatalen, GFP_NOFS);
+		if (!in_data_buf) {
+			cifs_small_buf_release(req);
+			return -ENOMEM;
+		}
+	}
+
+	req->CtlCode = cpu_to_le32(opcode);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	iov[0].iov_base = (char *)req;
+	/*
+	 * If no input data, the size of ioctl struct in
+	 * protocol spec still includes a 1 byte data buffer,
+	 * but if input data passed to ioctl, we do not
+	 * want to double count this, so we do not send
+	 * the dummy one byte of data in iovec[0] if sending
+	 * input data (in iovec[1]).
+	 */
+	if (indatalen) {
+		req->InputCount = cpu_to_le32(indatalen);
+		/* do not set InputOffset if no input data */
+		req->InputOffset =
+		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
+		rqst->rq_nvec = 2;
+		iov[0].iov_len = total_len - 1;
+		iov[1].iov_base = in_data_buf;
+		iov[1].iov_len = indatalen;
+	} else {
+		rqst->rq_nvec = 1;
+		iov[0].iov_len = total_len;
+	}
+
+	req->OutputOffset = 0;
+	req->OutputCount = 0; /* MBZ */
+
+	/*
+	 * In most cases max_response_size is set to 16K (CIFSMaxBufSize).
+	 * We could increase the default MaxOutputResponse, but that could
+	 * require more credits. Windows typically sets this smaller, but for
+	 * some ioctls it may be useful to allow the server to send more. There
+	 * is no point limiting what the server can send as long as it fits in
+	 * one credit. We cannot handle more than CIFS_MAX_BUF_SIZE yet, but
+	 * may want to increase this limit in the future.
+	 * Note that for snapshot queries, servers like Azure expect the first
+	 * query to be minimal in size (just used to get the number/size of
+	 * previous versions), so the response size must be specified as
+	 * EXACTLY sizeof(struct snapshot_array), which is 16 when rounded up
+	 * to a multiple of eight bytes. Currently that is the only case where
+	 * we set max response size smaller.
+	 */
+	req->MaxOutputResponse = cpu_to_le32(max_response_size);
+
+	if (is_fsctl)
+		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+	else
+		req->Flags = 0;
+
+	/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
+	if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
+		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+
+	return 0;
+}
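
The iov sizing in SMB2_ioctl_init() accounts for the one-byte dummy Buffer[] that the fixed smb2_ioctl_req frame already contains: with input data, iov[0] is trimmed to total_len - 1 and the payload rides in iov[1]; without input, the full total_len is sent so the struct's declared size stays intact. A worked check of the resulting wire length (the frame sizes are illustrative, not the real struct sizes):

#include <stdio.h>

/*
 * Sketch of the iov sizing above: the fixed SMB2_IOCTL frame already
 * counts a 1-byte dummy Buffer, which must not be sent twice when the
 * real payload travels in a second iovec.
 */
static unsigned int ioctl_wire_len(unsigned int total_len,
				   unsigned int indatalen)
{
	if (indatalen)
		return (total_len - 1) + indatalen; /* drop the dummy byte */
	return total_len;
}

int main(void)
{
	/* illustrative frame size: 120 bytes fixed, 24 bytes of input */
	printf("%u %u\n", ioctl_wire_len(120, 0), ioctl_wire_len(120, 24));
	return 0;	/* -> 120 and 143 */
}
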
+
+void
+SMB2_ioctl_free(struct smb_rqst *rqst)
+{
+	int i;
+	if (rqst && rqst->rq_iov) {
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+		for (i = 1; i < rqst->rq_nvec; i++)
+			if (rqst->rq_iov[i].iov_base != smb2_padding)
+				kfree(rqst->rq_iov[i].iov_base);
+	}
+}
+
+
 /*
  *	SMB2 IOCTL is used for both IOCTLs and FSCTLs
  */
 int
 SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	   u64 volatile_fid, u32 opcode, bool is_fsctl,
-	   char *in_data, u32 indatalen,
+	   char *in_data, u32 indatalen, u32 max_out_data_len,
 	   char **out_data, u32 *plen /* returned data len */)
 {
 	struct smb_rqst rqst;
-	struct smb2_ioctl_req *req;
-	struct smb2_ioctl_rsp *rsp;
+	struct smb2_ioctl_rsp *rsp = NULL;
 	struct cifs_ses *ses;
-	struct kvec iov[2];
-	struct kvec rsp_iov;
-	int resp_buftype;
-	int n_iov;
+	struct kvec iov[SMB2_IOCTL_IOV_SIZE];
+	struct kvec rsp_iov = {NULL, 0};
+	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 	int flags = 0;
-	unsigned int total_len;
+	struct TCP_Server_Info *server;
 
 	cifs_dbg(FYI, "SMB2 IOCTL\n");
 
@@ -2361,85 +2782,34 @@
 	else
 		return -EIO;
 
-	if (!ses || !(ses->server))
+	if (!ses)
 		return -EIO;
-
-	rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
-	if (rc)
-		return rc;
+	server = ses->server;
+	if (!server)
+		return -EIO;
 
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
-	req->CtlCode = cpu_to_le32(opcode);
-	req->PersistentFileId = persistent_fid;
-	req->VolatileFileId = volatile_fid;
-
-	if (indatalen) {
-		req->InputCount = cpu_to_le32(indatalen);
-		/* do not set InputOffset if no input data */
-		req->InputOffset =
-		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
-		iov[1].iov_base = in_data;
-		iov[1].iov_len = indatalen;
-		n_iov = 2;
-	} else
-		n_iov = 1;
-
-	req->OutputOffset = 0;
-	req->OutputCount = 0; /* MBZ */
-
-	/*
-	 * Could increase MaxOutputResponse, but that would require more
-	 * than one credit. Windows typically sets this smaller, but for some
-	 * ioctls it may be useful to allow server to send more. No point
-	 * limiting what the server can send as long as fits in one credit
-	 * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE
-	 * (by default, note that it can be overridden to make max larger)
-	 * in responses (except for read responses which can be bigger.
-	 * We may want to bump this limit up
-	 */
-	req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
-
-	if (is_fsctl)
-		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
-	else
-		req->Flags = 0;
-
-	iov[0].iov_base = (char *)req;
-
-	/*
-	 * If no input data, the size of ioctl struct in
-	 * protocol spec still includes a 1 byte data buffer,
-	 * but if input data passed to ioctl, we do not
-	 * want to double count this, so we do not send
-	 * the dummy one byte of data in iovec[0] if sending
-	 * input data (in iovec[1]).
-	 */
-
-	if (indatalen) {
-		iov[0].iov_len = total_len - 1;
-	} else
-		iov[0].iov_len = total_len;
-
-	/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
-	if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
-		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
-
 	memset(&rqst, 0, sizeof(struct smb_rqst));
+	memset(&iov, 0, sizeof(iov));
 	rqst.rq_iov = iov;
-	rqst.rq_nvec = n_iov;
+	rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+	rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode,
+			     is_fsctl, in_data, indatalen, max_out_data_len);
+	if (rc)
+		goto ioctl_exit;
 
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
 			    &rsp_iov);
-	cifs_small_buf_release(req);
 	rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0)
 		trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid,
 				ses->Suid, 0, opcode, rc);
 
-	if ((rc != 0) && (rc != -EINVAL)) {
+	if ((rc != 0) && (rc != -EINVAL) && (rc != -E2BIG)) {
 		cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
 		goto ioctl_exit;
 	} else if (rc == -EINVAL) {
@@ -2448,6 +2818,11 @@
 			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
 			goto ioctl_exit;
 		}
+	} else if (rc == -E2BIG) {
+		if (opcode != FSCTL_QUERY_ALLOCATED_RANGES) {
+			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+			goto ioctl_exit;
+		}
 	}
 
 	/* check if caller wants to look at return data or just return rc */
@@ -2460,28 +2835,29 @@
 	if (*plen == 0)
 		goto ioctl_exit; /* server returned no data */
 	else if (*plen > rsp_iov.iov_len || *plen > 0xFF00) {
-		cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+		cifs_tcon_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
 		*plen = 0;
 		rc = -EIO;
 		goto ioctl_exit;
 	}
 
 	if (rsp_iov.iov_len - *plen < le32_to_cpu(rsp->OutputOffset)) {
-		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+		cifs_tcon_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
 			le32_to_cpu(rsp->OutputOffset));
 		*plen = 0;
 		rc = -EIO;
 		goto ioctl_exit;
 	}
 
-	*out_data = kmalloc(*plen, GFP_KERNEL);
+	*out_data = kmemdup((char *)rsp + le32_to_cpu(rsp->OutputOffset),
+			    *plen, GFP_KERNEL);
 	if (*out_data == NULL) {
 		rc = -ENOMEM;
 		goto ioctl_exit;
 	}
 
-	memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
 ioctl_exit:
+	SMB2_ioctl_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
 }
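
The copy-out above switches from kmalloc() plus memcpy() to kmemdup(), the idiomatic allocate-and-copy helper. A user-space equivalent for illustration:

#include <stdlib.h>
#include <string.h>

/* User-space equivalent of kmemdup(): allocate and copy in one step. */
static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}
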
@@ -2504,7 +2880,8 @@
 	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
 			FSCTL_SET_COMPRESSION, true /* is_fsctl */,
 			(char *)&fsctl_input /* data input */,
-			2 /* in data len */, &ret_data /* out data */, NULL);
+			2 /* in data len */, CIFSMaxBufSize /* max out data */,
+			&ret_data /* out data */, NULL);
 
 	cifs_dbg(FYI, "set compression rc %d\n", rc);
 
@@ -2535,7 +2912,8 @@
 void
 SMB2_close_free(struct smb_rqst *rqst)
 {
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+	if (rqst && rqst->rq_iov)
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
 }
 
 int
@@ -2547,7 +2925,7 @@
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[1];
 	struct kvec rsp_iov;
-	int resp_buftype;
+	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 
 	cifs_dbg(FYI, "Close\n");
@@ -2563,6 +2941,7 @@
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 1;
 
+	trace_smb3_close_enter(xid, persistent_fid, tcon->tid, ses->Suid);
 	rc = SMB2_close_init(tcon, &rqst, persistent_fid, volatile_fid);
 	if (rc)
 		goto close_exit;
@@ -2575,7 +2954,11 @@
 		trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid,
 				     rc);
 		goto close_exit;
-	}
+	} else
+		trace_smb3_close_done(xid, persistent_fid, tcon->tid,
+				      ses->Suid);
+
+	atomic_dec(&tcon->num_remote_opens);
 
 	/* BB FIXME - decode close response, update inode for caching */
 
@@ -2627,10 +3010,10 @@
  * If SMB buffer fields are valid, copy into temporary buffer to hold result.
  * Caller must free buffer.
  */
-static int
-validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
-		      struct kvec *iov, unsigned int minbufsize,
-		      char *data)
+int
+smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
+			   struct kvec *iov, unsigned int minbufsize,
+			   char *data)
 {
 	char *begin_of_buf = offset + (char *)iov->iov_base;
 	int rc;
@@ -2651,7 +3034,7 @@
 SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
 		     u64 persistent_fid, u64 volatile_fid,
 		     u8 info_class, u8 info_type, u32 additional_info,
-		     size_t output_len)
+		     size_t output_len, size_t input_len, void *input)
 {
 	struct smb2_query_info_req *req;
 	struct kvec *iov = rqst->rq_iov;
@@ -2669,23 +3052,25 @@
 	req->VolatileFileId = volatile_fid;
 	req->AdditionalInformation = cpu_to_le32(additional_info);
 
-	/*
-	 * We do not use the input buffer (do not send extra byte)
-	 */
-	req->InputBufferOffset = 0;
-
 	req->OutputBufferLength = cpu_to_le32(output_len);
+	if (input_len) {
+		req->InputBufferLength = cpu_to_le32(input_len);
+		/* total_len for smb query request never close to le16 max */
+		req->InputBufferOffset = cpu_to_le16(total_len - 1);
+		memcpy(req->Buffer, input, input_len);
+	}
 
 	iov[0].iov_base = (char *)req;
 	/* 1 for Buffer */
-	iov[0].iov_len = total_len - 1;
+	iov[0].iov_len = total_len - 1 + input_len;
 	return 0;
 }
 
 void
 SMB2_query_info_free(struct smb_rqst *rqst)
 {
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+	if (rqst && rqst->rq_iov)
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
 }
 
 static int
@@ -2699,13 +3084,18 @@
 	struct kvec iov[1];
 	struct kvec rsp_iov;
 	int rc = 0;
-	int resp_buftype;
+	int resp_buftype = CIFS_NO_BUFFER;
 	struct cifs_ses *ses = tcon->ses;
+	struct TCP_Server_Info *server;
 	int flags = 0;
+	bool allocated = false;
 
 	cifs_dbg(FYI, "Query Info\n");
 
-	if (!ses || !(ses->server))
+	if (!ses)
+		return -EIO;
+	server = ses->server;
+	if (!server)
 		return -EIO;
 
 	if (smb3_encryption_required(tcon))
@@ -2718,10 +3108,13 @@
 
 	rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid,
 				  info_class, info_type, additional_info,
-				  output_len);
+				  output_len, 0, NULL);
 	if (rc)
 		goto qinf_exit;
 
+	trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
+				    ses->Suid, info_class, (__u32)info_type);
+
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
 	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 
@@ -2732,23 +3125,33 @@
 		goto qinf_exit;
 	}
 
+	trace_smb3_query_info_done(xid, persistent_fid, tcon->tid,
+				ses->Suid, info_class, (__u32)info_type);
+
 	if (dlen) {
 		*dlen = le32_to_cpu(rsp->OutputBufferLength);
 		if (!*data) {
 			*data = kmalloc(*dlen, GFP_KERNEL);
 			if (!*data) {
-				cifs_dbg(VFS,
+				cifs_tcon_dbg(VFS,
 					"Error %d allocating memory for acl\n",
 					rc);
 				*dlen = 0;
+				rc = -ENOMEM;
 				goto qinf_exit;
 			}
+			allocated = true;
 		}
 	}
 
-	rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
-				   le32_to_cpu(rsp->OutputBufferLength),
-				   &rsp_iov, min_len, *data);
+	rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
+					le32_to_cpu(rsp->OutputBufferLength),
+					&rsp_iov, min_len, *data);
+	if (rc && allocated) {
+		kfree(*data);
+		*data = NULL;
+		*dlen = 0;
+	}
 
 qinf_exit:
 	SMB2_query_info_free(&rqst);
@@ -2756,18 +3159,6 @@
 	return rc;
 }
 
-int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
-		   u64 persistent_fid, u64 volatile_fid,
-		   int ea_buf_size, struct smb2_file_full_ea_info *data)
-{
-	return query_info(xid, tcon, persistent_fid, volatile_fid,
-			  FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0,
-			  ea_buf_size,
-			  sizeof(struct smb2_file_full_ea_info),
-			  (void **)&data,
-			  NULL);
-}
-
 int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data)
 {
@@ -2803,6 +3194,92 @@
 }
 
 /*
+ * CHANGE_NOTIFY Request is sent to get notifications on changes to a directory
+ * See MS-SMB2 2.2.35 and 2.2.36
+ */
+
+static int
+SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
+		struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid,
+		u32 completion_filter, bool watch_tree)
+{
+	struct smb2_change_notify_req *req;
+	struct kvec *iov = rqst->rq_iov;
+	unsigned int total_len;
+	int rc;
+
+	rc = smb2_plain_req_init(SMB2_CHANGE_NOTIFY, tcon, (void **) &req, &total_len);
+	if (rc)
+		return rc;
+
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+	req->OutputBufferLength =
+		cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
+	req->CompletionFilter = cpu_to_le32(completion_filter);
+	if (watch_tree)
+		req->Flags = cpu_to_le16(SMB2_WATCH_TREE);
+	else
+		req->Flags = 0;
+
+	iov[0].iov_base = (char *)req;
+	iov[0].iov_len = total_len;
+
+	return 0;
+}
+
+int
+SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
+		u64 persistent_fid, u64 volatile_fid, bool watch_tree,
+		u32 completion_filter)
+{
+	struct cifs_ses *ses = tcon->ses;
+	struct smb_rqst rqst;
+	struct kvec iov[1];
+	struct kvec rsp_iov = {NULL, 0};
+	int resp_buftype = CIFS_NO_BUFFER;
+	int flags = 0;
+	int rc = 0;
+
+	cifs_dbg(FYI, "change notify\n");
+	if (!ses || !(ses->server))
+		return -EIO;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	memset(&rqst, 0, sizeof(struct smb_rqst));
+	memset(&iov, 0, sizeof(iov));
+	rqst.rq_iov = iov;
+	rqst.rq_nvec = 1;
+
+	rc = SMB2_notify_init(xid, &rqst, tcon, persistent_fid, volatile_fid,
+			      completion_filter, watch_tree);
+	if (rc)
+		goto cnotify_exit;
+
+	trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid,
+				(u8)watch_tree, completion_filter);
+	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
+
+	if (rc != 0) {
+		cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE);
+		trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid,
+				(u8)watch_tree, completion_filter, rc);
+	} else
+		trace_smb3_notify_done(xid, persistent_fid, tcon->tid,
+				ses->Suid, (u8)watch_tree, completion_filter);
+
+ cnotify_exit:
+	if (rqst.rq_iov)
+		cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */
+	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+	return rc;
+}
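
SMB2_change_notify() is fire-and-observe: the response body is discarded and only the return code and trace events are surfaced. The completion_filter argument is a bitmask of FILE_NOTIFY_CHANGE_* flags from MS-SMB2 2.2.35; a hypothetical sketch of composing one (the bit values are from the spec, the helper name is illustrative):

#include <stdint.h>

/* Hypothetical filter composition (not from the patch). */
#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
#define FILE_NOTIFY_CHANGE_DIR_NAME  0x00000002

static uint32_t rename_watch_filter(void)
{
	return FILE_NOTIFY_CHANGE_FILE_NAME | FILE_NOTIFY_CHANGE_DIR_NAME;
}

Such a mask would be passed as completion_filter, with watch_tree selecting whether the server also reports changes under subdirectories.
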
+
+
+
+/*
  * This is a no-op for now. We're not really interested in the reply, but
  * rather in the fact that the server sent one and that server->lstrp
  * gets updated.
@@ -2814,13 +3291,16 @@
 {
 	struct TCP_Server_Info *server = mid->callback_data;
 	struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
-	unsigned int credits_received = 1;
+	struct cifs_credits credits = { .value = 0, .instance = 0 };
 
-	if (mid->mid_state == MID_RESPONSE_RECEIVED)
-		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+	if (mid->mid_state == MID_RESPONSE_RECEIVED
+	    || mid->mid_state == MID_RESPONSE_MALFORMED) {
+		credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+		credits.instance = server->reconnect_instance;
+	}
 
 	DeleteMidQEntry(mid);
-	add_credits(server, credits_received, CIFS_ECHO_OP);
+	add_credits(server, &credits, CIFS_ECHO_OP);
 }
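
Credits are now accounted as a (value, instance) pair rather than a bare count: the instance records which reconnect generation granted them, so credits earned before a reconnect cannot inflate the fresh session's budget. A sketch of that guard (the real add_credits() also handles locking and waiter wakeups):

/*
 * Sketch of instance-tagged crediting: a grant stamped with an older
 * reconnect generation is discarded instead of inflating the budget.
 */
struct credits_sketch {
	unsigned int value;
	unsigned int instance;	/* reconnect generation of the grant */
};

static void add_credits_sketch(unsigned int *budget,
			       unsigned int server_instance,
			       const struct credits_sketch *cr)
{
	if (cr->instance == server_instance)
		*budget += cr->value;
}
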
 
 void smb2_reconnect_server(struct work_struct *work)
@@ -2850,9 +3330,14 @@
 				tcon_exist = true;
 			}
 		}
+		/*
+		 * IPC has the same lifetime as its session and uses its
+		 * refcount.
+		 */
 		if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
 			list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
 			tcon_exist = true;
+			ses->ses_count++;
 		}
 	}
 	/*
@@ -2871,7 +3356,10 @@
 		else
 			resched = true;
 		list_del_init(&tcon->rlist);
-		cifs_put_tcon(tcon);
+		if (tcon->ipc)
+			cifs_put_smb_ses(tcon->ses);
+		else
+			cifs_put_tcon(tcon);
 	}
 
 	cifs_dbg(FYI, "Reconnecting tcons finished\n");
@@ -2912,7 +3400,7 @@
 	iov[0].iov_base = (char *)req;
 
 	rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
-			     server, CIFS_ECHO_OP);
+			     server, CIFS_ECHO_OP, NULL);
 	if (rc)
 		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
 
@@ -2920,51 +3408,76 @@
 	return rc;
 }
 
-int
-SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
-	   u64 volatile_fid)
+void
+SMB2_flush_free(struct smb_rqst *rqst)
 {
-	struct smb_rqst rqst;
+	if (rqst && rqst->rq_iov)
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+int
+SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
+		struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid)
+{
 	struct smb2_flush_req *req;
-	struct cifs_ses *ses = tcon->ses;
-	struct kvec iov[1];
-	struct kvec rsp_iov;
-	int resp_buftype;
-	int rc = 0;
-	int flags = 0;
+	struct kvec *iov = rqst->rq_iov;
 	unsigned int total_len;
-
-	cifs_dbg(FYI, "Flush\n");
-
-	if (!ses || !(ses->server))
-		return -EIO;
+	int rc;
 
 	rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len);
 	if (rc)
 		return rc;
 
-	if (smb3_encryption_required(tcon))
-		flags |= CIFS_TRANSFORM_REQ;
-
 	req->PersistentFileId = persistent_fid;
 	req->VolatileFileId = volatile_fid;
 
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
 
+	return 0;
+}
+
+int
+SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid)
+{
+	struct cifs_ses *ses = tcon->ses;
+	struct smb_rqst rqst;
+	struct kvec iov[1];
+	struct kvec rsp_iov = {NULL, 0};
+	int resp_buftype = CIFS_NO_BUFFER;
+	int flags = 0;
+	int rc = 0;
+
+	cifs_dbg(FYI, "flush\n");
+	if (!ses || !(ses->server))
+		return -EIO;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	memset(&rqst, 0, sizeof(struct smb_rqst));
+	memset(&iov, 0, sizeof(iov));
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 1;
 
+	rc = SMB2_flush_init(xid, &rqst, tcon, persistent_fid, volatile_fid);
+	if (rc)
+		goto flush_exit;
+
+	trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid);
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
-	cifs_small_buf_release(req);
 
 	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
 		trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid,
 				     rc);
-	}
+	} else
+		trace_smb3_flush_done(xid, persistent_fid, tcon->tid,
+				      ses->Suid);
 
+ flush_exit:
+	SMB2_flush_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	return rc;
 }
@@ -3003,6 +3516,11 @@
 	req->MinimumCount = 0;
 	req->Length = cpu_to_le32(io_parms->length);
 	req->Offset = cpu_to_le64(io_parms->offset);
+
+	trace_smb3_read_enter(0 /* xid */,
+			io_parms->persistent_fid,
+			io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+			io_parms->offset, io_parms->length);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	/*
 	 * If we want to do a RDMA write, fill in and append
@@ -3020,7 +3538,7 @@
 				rdata->nr_pages, rdata->page_offset,
 				rdata->tailsz, true, need_invalidate);
 		if (!rdata->mr)
-			return -ENOBUFS;
+			return -EAGAIN;
 
 		req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
 		if (need_invalidate)
@@ -3073,9 +3591,9 @@
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct smb2_sync_hdr *shdr =
 				(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
-	unsigned int credits_received = 1;
-	struct smb_rqst rqst = { .rq_iov = rdata->iov,
-				 .rq_nvec = 2,
+	struct cifs_credits credits = { .value = 0, .instance = 0 };
+	struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
+				 .rq_nvec = 1,
 				 .rq_pages = rdata->pages,
 				 .rq_offset = rdata->page_offset,
 				 .rq_npages = rdata->nr_pages,
@@ -3088,14 +3606,15 @@
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
-		credits_received = le16_to_cpu(shdr->CreditRequest);
+		credits.value = le16_to_cpu(shdr->CreditRequest);
+		credits.instance = server->reconnect_instance;
 		/* result already set, check signature */
 		if (server->sign && !mid->decrypted) {
 			int rc;
 
 			rc = smb2_verify_signature(&rqst, server);
 			if (rc)
-				cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+				cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n",
 					 rc);
 		}
 		/* FIXME: should this be counted toward the initiating task? */
@@ -3112,9 +3631,12 @@
 		task_io_account_read(rdata->got_bytes);
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
+	case MID_RESPONSE_MALFORMED:
+		credits.value = le16_to_cpu(shdr->CreditRequest);
+		credits.instance = server->reconnect_instance;
+		/* fall through */
 	default:
-		if (rdata->result != -ENODATA)
-			rdata->result = -EIO;
+		rdata->result = -EIO;
 	}
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	/*
@@ -3127,12 +3649,21 @@
 		rdata->mr = NULL;
 	}
 #endif
-	if (rdata->result)
+	if (rdata->result && rdata->result != -ENODATA) {
 		cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+		trace_smb3_read_err(0 /* xid */,
+				    rdata->cfile->fid.persistent_fid,
+				    tcon->tid, tcon->ses->Suid, rdata->offset,
+				    rdata->bytes, rdata->result);
+	} else
+		trace_smb3_read_done(0 /* xid */,
+				     rdata->cfile->fid.persistent_fid,
+				     tcon->tid, tcon->ses->Suid,
+				     rdata->offset, rdata->got_bytes);
 
 	queue_work(cifsiod_wq, &rdata->work);
 	DeleteMidQEntry(mid);
-	add_credits(server, credits_received, 0);
+	add_credits(server, &credits, 0);
 }
 
 /* smb2_async_readv - send an async read, and set up mid to handle result */
@@ -3162,17 +3693,8 @@
 
 	rc = smb2_new_read_req(
 		(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
-	if (rc) {
-		if (rc == -EAGAIN && rdata->credits) {
-			/* credits was reset by reconnect */
-			rdata->credits = 0;
-			/* reduce in_flight value since we won't send the req */
-			spin_lock(&server->req_lock);
-			server->in_flight--;
-			spin_unlock(&server->req_lock);
-		}
+	if (rc)
 		return rc;
-	}
 
 	if (smb3_encryption_required(io_parms.tcon))
 		flags |= CIFS_TRANSFORM_REQ;
@@ -3182,33 +3704,34 @@
 
 	shdr = (struct smb2_sync_hdr *)buf;
 
-	if (rdata->credits) {
+	if (rdata->credits.value > 0) {
 		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
 						SMB2_MAX_BUFFER_SIZE));
-		shdr->CreditRequest = shdr->CreditCharge;
-		spin_lock(&server->req_lock);
-		server->credits += rdata->credits -
-						le16_to_cpu(shdr->CreditCharge);
-		spin_unlock(&server->req_lock);
-		wake_up(&server->request_q);
+		shdr->CreditRequest =
+			cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+
+		rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+		if (rc)
+			goto async_readv_out;
+
 		flags |= CIFS_HAS_CREDITS;
 	}
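
Large reads are charged one credit per 64KB of payload (SMB2_MAX_BUFFER_SIZE), and the request now asks for CreditCharge + 1 so the client's credit balance does not shrink across large I/O. Worked example: a 1MiB read carries CreditCharge 16 and CreditRequest 17:

#include <stdio.h>

#define SMB2_MAX_BUFFER_SIZE 65536	/* one credit per 64KB of payload */

/* Worked example of the charge math above for a 1MiB read. */
int main(void)
{
	unsigned int bytes = 1048576;
	unsigned int charge =
		(bytes + SMB2_MAX_BUFFER_SIZE - 1) / SMB2_MAX_BUFFER_SIZE;

	printf("CreditCharge=%u CreditRequest=%u\n", charge, charge + 1);
	return 0;	/* -> CreditCharge=16 CreditRequest=17 */
}
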
 
 	kref_get(&rdata->refcount);
 	rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
 			     cifs_readv_receive, smb2_readv_callback,
-			     smb3_handle_read_data, rdata, flags);
+			     smb3_handle_read_data, rdata, flags,
+			     &rdata->credits);
 	if (rc) {
 		kref_put(&rdata->refcount, cifs_readdata_release);
 		cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
-		trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
-				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
-				   io_parms.offset, io_parms.length);
-	} else
-		trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
-				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
-				   io_parms.offset, io_parms.length);
+		trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+				    io_parms.tcon->tid,
+				    io_parms.tcon->ses->Suid,
+				    io_parms.offset, io_parms.length, rc);
+	}
 
+async_readv_out:
 	cifs_small_buf_release(buf);
 	return rc;
 }
@@ -3218,7 +3741,7 @@
 	  unsigned int *nbytes, char **buf, int *buf_type)
 {
 	struct smb_rqst rqst;
-	int resp_buftype, rc = -EACCES;
+	int resp_buftype, rc;
 	struct smb2_read_plain_req *req = NULL;
 	struct smb2_read_rsp *rsp = NULL;
 	struct kvec iov[1];
@@ -3243,25 +3766,30 @@
 	rqst.rq_nvec = 1;
 
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
-	cifs_small_buf_release(req);
-
 	rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
 		if (rc != -ENODATA) {
 			cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
 			cifs_dbg(VFS, "Send error in read = %d\n", rc);
-		}
-		trace_smb3_read_err(rc, xid, req->PersistentFileId,
+			trace_smb3_read_err(xid, req->PersistentFileId,
+					    io_parms->tcon->tid, ses->Suid,
+					    io_parms->offset, io_parms->length,
+					    rc);
+		} else
+			trace_smb3_read_done(xid, req->PersistentFileId,
 				    io_parms->tcon->tid, ses->Suid,
-				    io_parms->offset, io_parms->length);
+				    io_parms->offset, 0);
 		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+		cifs_small_buf_release(req);
 		return rc == -ENODATA ? 0 : rc;
 	} else
 		trace_smb3_read_done(xid, req->PersistentFileId,
 				    io_parms->tcon->tid, ses->Suid,
 				    io_parms->offset, io_parms->length);
 
+	cifs_small_buf_release(req);
+
 	*nbytes = le32_to_cpu(rsp->DataLength);
 	if ((*nbytes > CIFS_MAX_MSGSIZE) ||
 	    (*nbytes > io_parms->length)) {
@@ -3293,14 +3821,16 @@
 {
 	struct cifs_writedata *wdata = mid->callback_data;
 	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
 	unsigned int written;
 	struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
-	unsigned int credits_received = 1;
+	struct cifs_credits credits = { .value = 0, .instance = 0 };
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
-		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
-		wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+		credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+		credits.instance = server->reconnect_instance;
+		wdata->result = smb2_check_receive(mid, server, 0);
 		if (wdata->result != 0)
 			break;
 
@@ -3323,6 +3853,10 @@
 	case MID_RETRY_NEEDED:
 		wdata->result = -EAGAIN;
 		break;
+	case MID_RESPONSE_MALFORMED:
+		credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+		credits.instance = server->reconnect_instance;
+		/* fall through */
 	default:
 		wdata->result = -EIO;
 		break;
@@ -3340,12 +3874,21 @@
 		wdata->mr = NULL;
 	}
 #endif
-	if (wdata->result)
+	if (wdata->result) {
 		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+		trace_smb3_write_err(0 /* no xid */,
+				     wdata->cfile->fid.persistent_fid,
+				     tcon->tid, tcon->ses->Suid, wdata->offset,
+				     wdata->bytes, wdata->result);
+	} else
+		trace_smb3_write_done(0 /* no xid */,
+				      wdata->cfile->fid.persistent_fid,
+				      tcon->tid, tcon->ses->Suid,
+				      wdata->offset, wdata->bytes);
 
 	queue_work(cifsiod_wq, &wdata->work);
 	DeleteMidQEntry(mid);
-	add_credits(tcon->ses->server, credits_received, 0);
+	add_credits(server, &credits, 0);
 }
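/*
 * Editorial sketch (not part of the patch): the cifs_credits pair used
 * in the callback above. Credits are now tagged with the reconnect
 * instance they were granted under, so stale credits from before a
 * reconnect are never returned to the new session. Field names follow
 * the patch; the helper is illustrative only.
 */
struct cifs_credits_sketch {
	unsigned int value;	/* number of credits held */
	unsigned int instance;	/* server->reconnect_instance at grant time */
};

static inline int credits_are_current(const struct cifs_credits_sketch *c,
				      unsigned int reconnect_instance)
{
	return c->instance == reconnect_instance;
}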
 
 /* smb2_async_writev - send an async write, and set up mid to handle result */
@@ -3363,17 +3906,8 @@
 	unsigned int total_len;
 
 	rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
-	if (rc) {
-		if (rc == -EAGAIN && wdata->credits) {
-			/* credits was reset by reconnect */
-			wdata->credits = 0;
-			/* reduce in_flight value since we won't send the req */
-			spin_lock(&server->req_lock);
-			server->in_flight--;
-			spin_unlock(&server->req_lock);
-		}
-		goto async_writev_out;
-	}
+	if (rc)
+		return rc;
 
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
@@ -3390,6 +3924,9 @@
 	req->DataOffset = cpu_to_le16(
 				offsetof(struct smb2_write_req, Buffer));
 	req->RemainingBytes = 0;
+
+	trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
+		tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	/*
 	 * If we want to do a server RDMA read, fill in and append
@@ -3406,7 +3943,7 @@
 				wdata->nr_pages, wdata->page_offset,
 				wdata->tailsz, false, need_invalidate);
 		if (!wdata->mr) {
-			rc = -ENOBUFS;
+			rc = -EAGAIN;
 			goto async_writev_out;
 		}
 		req->Length = 0;
@@ -3459,21 +3996,22 @@
 	req->Length = cpu_to_le32(wdata->bytes);
 #endif
 
-	if (wdata->credits) {
+	if (wdata->credits.value > 0) {
 		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
 						    SMB2_MAX_BUFFER_SIZE));
-		shdr->CreditRequest = shdr->CreditCharge;
-		spin_lock(&server->req_lock);
-		server->credits += wdata->credits -
-						le16_to_cpu(shdr->CreditCharge);
-		spin_unlock(&server->req_lock);
-		wake_up(&server->request_q);
+		shdr->CreditRequest =
+			cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+
+		rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+		if (rc)
+			goto async_writev_out;
+
 		flags |= CIFS_HAS_CREDITS;
 	}
 
 	kref_get(&wdata->refcount);
 	rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
-			     wdata, flags);
+			     wdata, flags, &wdata->credits);
 
 	if (rc) {
 		trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
@@ -3481,10 +4019,7 @@
 				     wdata->bytes, rc);
 		kref_put(&wdata->refcount, release);
 		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
-	} else
-		trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
-				     tcon->tid, tcon->ses->Suid, wdata->offset,
-				     wdata->bytes);
+	}
 
 async_writev_out:
 	cifs_small_buf_release(req);
@@ -3539,6 +4074,10 @@
 				offsetof(struct smb2_write_req, Buffer));
 	req->RemainingBytes = 0;
 
+	trace_smb3_write_enter(xid, io_parms->persistent_fid,
+		io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+		io_parms->offset, io_parms->length);
+
 	iov[0].iov_base = (char *)req;
 	/* 1 for Buffer */
 	iov[0].iov_len = total_len - 1;
@@ -3549,7 +4088,6 @@
 
 	rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst,
 			    &resp_buftype, flags, &rsp_iov);
-	cifs_small_buf_release(req);
 	rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
@@ -3567,6 +4105,7 @@
 				     io_parms->offset, *nbytes);
 	}
 
+	cifs_small_buf_release(req);
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
 }
@@ -3665,7 +4204,7 @@
 		info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
 		break;
 	default:
-		cifs_dbg(VFS, "info level %u isn't supported\n",
+		cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
 			 srch_inf->info_level);
 		rc = -EINVAL;
 		goto qdir_exit;
@@ -3701,6 +4240,9 @@
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 2;
 
+	trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
+			tcon->ses->Suid, index, output_size);
+
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
 	cifs_small_buf_release(req);
 	rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3708,18 +4250,26 @@
 	if (rc) {
 		if (rc == -ENODATA &&
 		    rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+			trace_smb3_query_dir_done(xid, persistent_fid,
+				tcon->tid, tcon->ses->Suid, index, 0);
 			srch_inf->endOfSearch = true;
 			rc = 0;
+		} else {
+			trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+				tcon->ses->Suid, index, 0, rc);
+			cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
 		}
-		cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
 		goto qdir_exit;
 	}
 
 	rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
 			       le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
 			       info_buf_size);
-	if (rc)
+	if (rc) {
+		trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+			tcon->ses->Suid, index, 0, rc);
 		goto qdir_exit;
+	}
 
 	srch_inf->unicode = true;
 
@@ -3745,8 +4295,10 @@
 	else if (resp_buftype == CIFS_SMALL_BUFFER)
 		srch_inf->smallBuf = true;
 	else
-		cifs_dbg(VFS, "illegal search buffer type\n");
+		cifs_tcon_dbg(VFS, "illegal search buffer type\n");
 
+	trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
+			tcon->ses->Suid, index, srch_inf->entries_in_buffer);
 	return rc;
 
 qdir_exit:
@@ -3754,45 +4306,22 @@
 	return rc;
 }
 
-static int
-send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+int
+SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
 	       u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
-	       u8 info_type, u32 additional_info, unsigned int num,
+	       u8 info_type, u32 additional_info,
 		void **data, unsigned int *size)
 {
-	struct smb_rqst rqst;
 	struct smb2_set_info_req *req;
-	struct smb2_set_info_rsp *rsp = NULL;
-	struct kvec *iov;
-	struct kvec rsp_iov;
-	int rc = 0;
-	int resp_buftype;
-	unsigned int i;
-	struct cifs_ses *ses = tcon->ses;
-	int flags = 0;
-	unsigned int total_len;
-
-	if (!ses || !(ses->server))
-		return -EIO;
-
-	if (!num)
-		return -EINVAL;
-
-	iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
-	if (!iov)
-		return -ENOMEM;
+	struct kvec *iov = rqst->rq_iov;
+	unsigned int i, total_len;
+	int rc;
 
 	rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len);
-	if (rc) {
-		kfree(iov);
+	if (rc)
 		return rc;
-	}
-
-	if (smb3_encryption_required(tcon))
-		flags |= CIFS_TRANSFORM_REQ;
 
 	req->sync_hdr.ProcessId = cpu_to_le32(pid);
-
 	req->InfoType = info_type;
 	req->FileInfoClass = info_class;
 	req->PersistentFileId = persistent_fid;
@@ -3810,19 +4339,66 @@
 	/* 1 for Buffer */
 	iov[0].iov_len = total_len - 1;
 
-	for (i = 1; i < num; i++) {
+	for (i = 1; i < rqst->rq_nvec; i++) {
 		le32_add_cpu(&req->BufferLength, size[i]);
 		iov[i].iov_base = (char *)data[i];
 		iov[i].iov_len = size[i];
 	}
 
+	return 0;
+}
+
+void
+SMB2_set_info_free(struct smb_rqst *rqst)
+{
+	if (rqst && rqst->rq_iov)
+		cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
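/*
 * Editorial sketch (not part of the patch): the intended pairing of the
 * init/free helpers above, which lets a SET_INFO request be embedded in
 * a compound. Declarations of tcon, the fids, info_class, data and size
 * are elided; error handling is minimal.
 */
	struct kvec iov[2];
	struct smb_rqst rqst = { .rq_iov = iov, .rq_nvec = 2 };
	int rc;

	rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid,
				current->tgid, info_class, SMB2_O_INFO_FILE,
				0, data, size);
	if (!rc) {
		/* send rqst here, alone or chained into a compound */
		SMB2_set_info_free(&rqst);
	}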
+
+static int
+send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+	       u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
+	       u8 info_type, u32 additional_info, unsigned int num,
+		void **data, unsigned int *size)
+{
+	struct smb_rqst rqst;
+	struct smb2_set_info_rsp *rsp = NULL;
+	struct kvec *iov;
+	struct kvec rsp_iov;
+	int rc = 0;
+	int resp_buftype;
+	struct cifs_ses *ses = tcon->ses;
+	int flags = 0;
+
+	if (!ses || !(ses->server))
+		return -EIO;
+
+	if (!num)
+		return -EINVAL;
+
+	if (smb3_encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
+	if (!iov)
+		return -ENOMEM;
+
 	memset(&rqst, 0, sizeof(struct smb_rqst));
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = num;
 
+	rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid,
+				info_class, info_type, additional_info,
+				data, size);
+	if (rc) {
+		kfree(iov);
+		return rc;
+	}
+
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
 			    &rsp_iov);
-	cifs_buf_release(req);
+	SMB2_set_info_free(&rqst);
 	rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0) {
@@ -3837,88 +4413,8 @@
 }
 
 int
-SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
-	    u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
-	struct smb2_file_rename_info info;
-	void **data;
-	unsigned int size[2];
-	int rc;
-	int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
-	data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
-			      /* 0 = fail if target already exists */
-	info.RootDirectory = 0;  /* MBZ for network ops (why does spec say?) */
-	info.FileNameLength = cpu_to_le32(len);
-
-	data[0] = &info;
-	size[0] = sizeof(struct smb2_file_rename_info);
-
-	data[1] = target_file;
-	size[1] = len + 2 /* null */;
-
-	rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
-		current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE,
-		0, 2, data, size);
-	kfree(data);
-	return rc;
-}
-
-int
-SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
-		  u64 persistent_fid, u64 volatile_fid)
-{
-	__u8 delete_pending = 1;
-	void *data;
-	unsigned int size;
-
-	data = &delete_pending;
-	size = 1; /* sizeof __u8 */
-
-	return send_set_info(xid, tcon, persistent_fid, volatile_fid,
-		current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE,
-		0, 1, &data, &size);
-}
-
-int
-SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
-		  u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
-	struct smb2_file_link_info info;
-	void **data;
-	unsigned int size[2];
-	int rc;
-	int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
-	data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
-			      /* 0 = fail if link already exists */
-	info.RootDirectory = 0;  /* MBZ for network ops (why does spec say?) */
-	info.FileNameLength = cpu_to_le32(len);
-
-	data[0] = &info;
-	size[0] = sizeof(struct smb2_file_link_info);
-
-	data[1] = target_file;
-	size[1] = len + 2 /* null */;
-
-	rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
-			current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE,
-			0, 2, data, size);
-	kfree(data);
-	return rc;
-}
-
-int
 SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
-	     u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
+	     u64 volatile_fid, u32 pid, __le64 *eof)
 {
 	struct smb2_file_eof_info info;
 	void *data;
@@ -3929,28 +4425,12 @@
 	data = &info;
 	size = sizeof(struct smb2_file_eof_info);
 
-	if (is_falloc)
-		return send_set_info(xid, tcon, persistent_fid, volatile_fid,
-			pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE,
-			0, 1, &data, &size);
-	else
-		return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+	return send_set_info(xid, tcon, persistent_fid, volatile_fid,
 			pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
 			0, 1, &data, &size);
 }
 
 int
-SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
-	      u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
-{
-	unsigned int size;
-	size = sizeof(FILE_BASIC_INFO);
-	return send_set_info(xid, tcon, persistent_fid, volatile_fid,
-		current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE,
-		0, 1, (void **)&buf, &size);
-}
-
-int
 SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
 		u64 persistent_fid, u64 volatile_fid,
 		struct cifs_ntsd *pnntsd, int pacllen, int aclflag)
@@ -3999,7 +4479,7 @@
 	req->OplockLevel = oplock_level;
 	req->sync_hdr.CreditRequest = cpu_to_le16(1);
 
-	flags |= CIFS_NO_RESP;
+	flags |= CIFS_NO_RSP_BUF;
 
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
@@ -4054,7 +4534,6 @@
 build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
 		   int outbuf_len, u64 persistent_fid, u64 volatile_fid)
 {
-	struct TCP_Server_Info *server;
 	int rc;
 	struct smb2_query_info_req *req;
 	unsigned int total_len;
@@ -4064,8 +4543,6 @@
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
 		return -EIO;
 
-	server = tcon->ses->server;
-
 	rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req,
 			     &total_len);
 	if (rc)
@@ -4276,7 +4753,7 @@
 	struct kvec rsp_iov;
 	int resp_buf_type;
 	unsigned int count;
-	int flags = CIFS_NO_RESP;
+	int flags = CIFS_NO_RSP_BUF;
 	unsigned int total_len;
 
 	cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
@@ -4350,6 +4827,8 @@
 	struct kvec iov[1];
 	struct kvec rsp_iov;
 	int resp_buf_type;
+	__u64 *please_key_high;
+	__u64 *please_key_low;
 
 	cifs_dbg(FYI, "SMB2_lease_break\n");
 	rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req,
@@ -4367,7 +4846,7 @@
 	memcpy(req->LeaseKey, lease_key, 16);
 	req->LeaseState = lease_state;
 
-	flags |= CIFS_NO_RESP;
+	flags |= CIFS_NO_RSP_BUF;
 
 	iov[0].iov_base = (char *)req;
 	iov[0].iov_len = total_len;
@@ -4379,10 +4858,16 @@
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov);
 	cifs_small_buf_release(req);
 
+	please_key_low = (__u64 *)lease_key;
+	please_key_high = (__u64 *)(lease_key+8);
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+		trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+			ses->Suid, *please_key_low, *please_key_high, rc);
 		cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
-	}
+	} else
+		trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+			ses->Suid, *please_key_low, *please_key_high);
 
 	return rc;
 }
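/*
 * Editorial sketch (standalone, user-space; not part of the patch): the
 * tracepoints above log the 16-byte lease key as two 64-bit halves. The
 * kernel casts the key pointer directly; memcpy below is the
 * alignment-safe equivalent of the same split.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t lease_key[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
				  9, 10, 11, 12, 13, 14, 15, 16 };
	uint64_t low, high;

	memcpy(&low, lease_key, 8);		/* bytes 0..7  */
	memcpy(&high, lease_key + 8, 8);	/* bytes 8..15 */
	printf("lease key halves: %#llx %#llx\n",
	       (unsigned long long)low, (unsigned long long)high);
	return 0;
}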
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 8fb7887..0abfde6 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -84,8 +84,9 @@
 
 #define NUMBER_OF_SMB2_COMMANDS	0x0013
 
-/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */
-#define MAX_SMB2_HDR_SIZE 0x00b0
+/* 52 transform hdr + 64 hdr + 88 create rsp */
+#define SMB2_TRANSFORM_HEADER_SIZE 52
+#define MAX_SMB2_HDR_SIZE 204
 
 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
 #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
@@ -122,7 +123,7 @@
 	__le16 StructureSize2; /* size of wct area (varies, request specific) */
 } __packed;
 
-#define SMB3_AES128CMM_NONCE 11
+#define SMB3_AES128CCM_NONCE 11
 #define SMB3_AES128GCM_NONCE 12
 
 struct smb2_transform_hdr {
@@ -142,7 +143,9 @@
 #define SMB2_FLAGS_ASYNC_COMMAND	cpu_to_le32(0x00000002)
 #define SMB2_FLAGS_RELATED_OPERATIONS	cpu_to_le32(0x00000004)
 #define SMB2_FLAGS_SIGNED		cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_PRIORITY_MASK	cpu_to_le32(0x00000070) /* SMB3.1.1 */
 #define SMB2_FLAGS_DFS_OPERATIONS	cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATION	cpu_to_le32(0x20000000) /* SMB3 & up */
 
 /*
  *	Definitions for SMB2 Protocol Data Units (network frames)
@@ -165,6 +168,8 @@
 	__u8   ErrorData[1];  /* variable length */
 } __packed;
 
+#define SYMLINK_ERROR_TAG 0x4c4d5953
+
 struct smb2_symlink_err_rsp {
 	__le32 SymLinkLength;
 	__le32 SymLinkErrorTag;
@@ -226,6 +231,7 @@
 } __packed;
 
 /* Dialects */
+#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
 #define SMB20_PROT_ID 0x0202
 #define SMB21_PROT_ID 0x0210
 #define SMB30_PROT_ID 0x0300
@@ -250,6 +256,14 @@
 #define SMB2_NT_FIND			0x00100000
 #define SMB2_LARGE_FILES		0x00200000
 
+
+/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES	cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES		cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES		cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID	cpu_to_le16(5)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE		cpu_to_le16(0x100)
+
 struct smb2_neg_context {
 	__le16	ContextType;
 	__le16	DataLength;
@@ -284,15 +298,42 @@
 	__le16	DataLength;
 	__le32	Reserved;
 	__le16	CipherCount; /* AES-128-GCM and AES-128-CCM */
-	__le16	Ciphers[1]; /* Ciphers[0] since only one used now */
+	__le16	Ciphers[2];
 } __packed;
 
-#define POSIX_CTXT_DATA_LEN	8
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE	cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1	cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77	cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF	cpu_to_le16(0x0003)
+
+struct smb2_compression_capabilities_context {
+	__le16	ContextType; /* 3 */
+	__le16  DataLength;
+	__u32	Reserved;
+	__le16	CompressionAlgorithmCount;
+	__u16	Padding;
+	__u32	Reserved1;
+	__le16	CompressionAlgorithms[3];
+} __packed;
+
+/*
+ * For the netname negotiate context (smb2_netname_negotiate_context_id),
+ * see MS-SMB2 2.2.3.1.4. The struct simply contains NetName, an array
+ * of Unicode characters.
+ */
+struct smb2_netname_neg_context {
+	__le16	ContextType; /* 5 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le16	NetName[0]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN	16
 struct smb2_posix_neg_context {
 	__le16	ContextType; /* 0x100 */
 	__le16	DataLength;
 	__le32	Reserved;
-	__le64	Reserved1; /* In case needed for future (eg version or caps) */
+	__u8	Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
 } __packed;
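/*
 * Editorial sketch (not part of the patch): walking the negotiate
 * contexts in a response. Each context begins with the 8-byte common
 * header of struct smb2_neg_context above, and context data is padded
 * to an 8-byte boundary per MS-SMB2 2.2.3.1. Kernel helpers
 * (le16_to_cpu, ALIGN) are assumed; bounds checks are elided.
 */
	struct smb2_neg_context *ctxt = (struct smb2_neg_context *)start;

	while ((char *)ctxt < end) {
		unsigned int len = le16_to_cpu(ctxt->DataLength);

		if (ctxt->ContextType == SMB2_ENCRYPTION_CAPABILITIES) {
			/* parse struct smb2_encryption_neg_context here */
		}
		/* advance past the 8-byte header plus padded data */
		ctxt = (struct smb2_neg_context *)
			((char *)ctxt + 8 + ALIGN(len, 8));
	}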
 
 struct smb2_negotiate_rsp {
@@ -610,9 +651,12 @@
 #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2	"DH2Q"
 #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2	"DH2C"
 #define SMB2_CREATE_APP_INSTANCE_ID	0x45BCA66AEFA7F74A9008FA462E144D74
+#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
 #define SVHDX_OPEN_DEVICE_CONTEX	0x9CCBCF9E04C1E643980E158DA1F6EC83
 #define SMB2_CREATE_TAG_POSIX		0x93AD25509CB411E7B42383DE968BCD7C
 
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
 
 /*
  * Maximum number of iovs we need for an open/create request.
@@ -622,9 +666,10 @@
  * [3] : durable context
  * [4] : posix context
  * [5] : time warp context
- * [6] : compound padding
+ * [6] : query id context
+ * [7] : compound padding
  */
-#define SMB2_CREATE_IOV_SIZE 7
+#define SMB2_CREATE_IOV_SIZE 8
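/*
 * Editorial sketch (not part of the patch): how the define above is
 * typically consumed; a caller reserves the full kvec array and lets
 * the open-init helper append whichever create contexts apply.
 * Fragment only, with surrounding declarations elided.
 */
	struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
	struct smb_rqst rqst = {
		.rq_iov = open_iov,
		.rq_nvec = SMB2_CREATE_IOV_SIZE,
	};

	/* SMB2_open_init(tcon, &rqst, &oplock, &oparms, path) fills iovs */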
 
 struct smb2_create_req {
 	struct smb2_sync_hdr sync_hdr;
@@ -646,11 +691,18 @@
 	__u8   Buffer[0];
 } __packed;
 
+/*
+ * Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
+ * 88 (fixed part of create response) + 520 (path) + 208 (contexts),
+ * 880 bytes in total.
+ */
+#define MAX_SMB2_CREATE_RESPONSE_SIZE 880
+
 struct smb2_create_rsp {
 	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 89 */
 	__u8   OplockLevel;
-	__u8   Reserved;
+	__u8   Flag;  /* 0x01 if reparse point */
 	__le32 CreateAction;
 	__le64 CreationTime;
 	__le64 LastAccessTime;
@@ -767,6 +819,15 @@
 	__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
 } __packed;
 
+/* See MS-SMB2 2.2.14.2.9 */
+struct create_on_disk_id {
+	struct create_context ccontext;
+	__u8   Name[8];
+	__le64 DiskFileId;
+	__le64 VolumeId;
+	__u32  Reserved[4];
+} __packed;
+
 /* See MS-SMB2 2.2.14.2.12 */
 struct durable_reconnect_context_v2_rsp {
 	__le32 Timeout;
@@ -777,6 +838,7 @@
 	struct create_context ccontext;
 	__u8   Name[8];
 	struct durable_reconnect_context_v2 dcontext;
+	__u8   Pad[4];
 } __packed;
 
 /* See MS-SMB2 2.2.13.2.5 */
@@ -787,6 +849,12 @@
 
 } __packed;
 
+/* See MS-SMB2 2.2.13.2.9 */
+struct crt_query_id_ctxt {
+	struct create_context ccontext;
+	__u8	Name[8];
+} __packed;
+
 #define COPY_CHUNK_RES_KEY_SIZE	24
 struct resume_key_req {
 	char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -832,6 +900,11 @@
 	__le32	ClusterSizeInBytes;
 } __packed;
 
+struct file_allocated_range_buffer {
+	__le64	file_offset;
+	__le64	length;
+} __packed;
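/*
 * Editorial sketch (not part of the patch): FSCTL_QUERY_ALLOCATED_RANGES
 * takes one of these structs as input (the region of interest) and
 * returns an array of them describing the allocated extents; anything
 * not covered is a hole (see MS-FSCC). Illustrative fill-in only.
 */
	struct file_allocated_range_buffer in_data;

	in_data.file_offset = cpu_to_le64(0);
	in_data.length = cpu_to_le64(i_size_read(inode));
	/* returned out_data[] entries enumerate the allocated ranges */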
+
 /* Integrity ChecksumAlgorithm choices for above */
 #define	CHECKSUM_TYPE_NONE	0x0000
 #define	CHECKSUM_TYPE_CRC64	0x0002
@@ -840,6 +913,53 @@
 /* Integrity flags for above */
 #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF	0x00000001
 
+/* Reparse structures - see MS-FSCC 2.1.2 */
+
+/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
+
+struct reparse_data_buffer {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__u8	DataBuffer[0]; /* Variable Length */
+} __packed;
+
+struct reparse_guid_data_buffer {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__u8	ReparseGuid[16];
+	__u8	DataBuffer[0]; /* Variable Length */
+} __packed;
+
+struct reparse_mount_point_data_buffer {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__le16	SubstituteNameOffset;
+	__le16	SubstituteNameLength;
+	__le16	PrintNameOffset;
+	__le16	PrintNameLength;
+	__u8	PathBuffer[0]; /* Variable Length */
+} __packed;
+
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
+struct reparse_symlink_data_buffer {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__le16	SubstituteNameOffset;
+	__le16	SubstituteNameLength;
+	__le16	PrintNameOffset;
+	__le16	PrintNameLength;
+	__le32	Flags;
+	__u8	PathBuffer[0]; /* Variable Length */
+} __packed;
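/*
 * Editorial sketch (standalone, user-space; not part of the patch):
 * extracting the substitute name from a symlink reparse buffer using
 * the offsets above. The offsets are relative to the start of
 * PathBuffer; validation against ReparseDataLength is elided.
 */
#include <stddef.h>
#include <stdint.h>

struct symlink_reparse {		/* mirrors the struct above */
	uint32_t ReparseTag;
	uint16_t ReparseDataLength;
	uint16_t Reserved;
	uint16_t SubstituteNameOffset;
	uint16_t SubstituteNameLength;
	uint16_t PrintNameOffset;
	uint16_t PrintNameLength;
	uint32_t Flags;
	uint8_t  PathBuffer[];		/* UTF-16LE names */
};

static const uint8_t *substitute_name(const struct symlink_reparse *s,
				      size_t *len_bytes)
{
	*len_bytes = s->SubstituteNameLength;
	return s->PathBuffer + s->SubstituteNameOffset;
}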
+
+/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+
 /* See MS-DFSC 2.2.2 */
 struct fsctl_get_dfs_referral_req {
 	__le16 MaxReferralLevel;
@@ -861,7 +981,7 @@
 	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
 	__le16 SecurityMode;
 	__le16 DialectCount;
-	__le16 Dialects[3]; /* BB expand this if autonegotiate > 3 dialects */
+	__le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
 } __packed;
 
 struct validate_negotiate_info_rsp {
@@ -914,6 +1034,13 @@
 	__le64 ByteCount;  /* Bytes to be copied */
 } __packed;
 
+/*
+ * Maximum number of iovs we need for an ioctl request.
+ * [0] : struct smb2_ioctl_req
+ * [1] : in_data
+ */
+#define SMB2_IOCTL_IOV_SIZE 2
+
 struct smb2_ioctl_req {
 	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize;	/* Must be 57 */
@@ -959,6 +1086,11 @@
 	__u64  VolatileFileId; /* opaque endianness */
 } __packed;
 
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
 struct smb2_close_rsp {
 	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* 60 */
@@ -990,6 +1122,7 @@
 
 /* For read request Flags field below, following flag is defined for SMB3.02 */
 #define SMB2_READFLAG_READ_UNBUFFERED	0x01
+#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
 
 /* Channel field for read and write: exactly one of following flags can be set*/
 #define SMB2_CHANNEL_NONE	cpu_to_le32(0x00000000)
@@ -1056,6 +1189,42 @@
 	__u8   Buffer[1];
 } __packed;
 
+/* notify flags */
+#define SMB2_WATCH_TREE			0x0001
+
+/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
+#define FILE_NOTIFY_CHANGE_FILE_NAME		0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME		0x00000002
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES		0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE			0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE		0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS		0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION		0x00000040
+#define FILE_NOTIFY_CHANGE_EA			0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY		0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME		0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE		0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE		0x00000800
+
+struct smb2_change_notify_req {
+	struct smb2_sync_hdr sync_hdr;
+	__le16	StructureSize;
+	__le16	Flags;
+	__le32	OutputBufferLength;
+	__u64	PersistentFileId; /* opaque endianness */
+	__u64	VolatileFileId; /* opaque endianness */
+	__le32	CompletionFilter;
+	__u32	Reserved;
+} __packed;
+
+struct smb2_change_notify_rsp {
+	struct smb2_sync_hdr sync_hdr;
+	__le16	StructureSize;  /* Must be 9 */
+	__le16	OutputBufferOffset;
+	__le32	OutputBufferLength;
+	__u8	Buffer[1]; /* array of file notify structs */
+} __packed;
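/*
 * Editorial sketch (not part of the patch): filling a change-notify
 * request from the structs above. StructureSize is 32 per MS-SMB2
 * 2.2.35; the completion filter shown is just an example combination,
 * and the surrounding declarations are elided.
 */
	req->StructureSize = cpu_to_le16(32);
	req->Flags = watch_tree ? cpu_to_le16(SMB2_WATCH_TREE) : 0;
	req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE);
	req->PersistentFileId = persistent_fid;
	req->VolatileFileId = volatile_fid;
	req->CompletionFilter = cpu_to_le32(FILE_NOTIFY_CHANGE_FILE_NAME |
					    FILE_NOTIFY_CHANGE_DIR_NAME);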
+
 #define SMB2_LOCKFLAG_SHARED_LOCK	0x0001
 #define SMB2_LOCKFLAG_EXCLUSIVE_LOCK	0x0002
 #define SMB2_LOCKFLAG_UNLOCK		0x0004
@@ -1174,6 +1343,15 @@
 	__u8   Buffer[1];
 } __packed;
 
+/*
+ * Maximum number of iovs we need for a set-info request.
+ * The largest one is rename/hardlink
+ * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
+ * [1] : path
+ * [2] : compound padding
+ */
+#define SMB2_SET_INFO_IOV_SIZE 3
+
 struct smb2_set_info_req {
 	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 33 */
@@ -1352,9 +1530,6 @@
 	char   FileName[0];     /* Name to be assigned to new link */
 } __packed; /* level 11 Set */
 
-#define SMB2_MIN_EA_BUF  2048
-#define SMB2_MAX_EA_BUF 65536
-
 struct smb2_file_full_ea_info { /* encoding of response for level 15 */
 	__le32 next_entry_offset;
 	__u8   flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b407657..71b2930 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,7 +84,8 @@
 			       umode_t mode, struct cifs_tcon *tcon,
 			       const char *full_path,
 			       struct cifs_sb_info *cifs_sb);
-extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int smb2_mkdir(const unsigned int xid, struct inode *inode,
+		      umode_t mode, struct cifs_tcon *tcon,
 		      const char *name, struct cifs_sb_info *cifs_sb);
 extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
 			       struct cifs_sb_info *cifs_sb,
@@ -116,6 +117,9 @@
 extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server);
 extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
 				  struct smb_rqst *rqst);
+extern void smb2_set_next_command(struct cifs_tcon *tcon,
+				  struct smb_rqst *rqst);
+extern void smb2_set_related(struct smb_rqst *rqst);
 
 /*
  * SMB2 Worker functions - most of protocol specific implementation details
@@ -139,8 +143,17 @@
 extern void SMB2_open_free(struct smb_rqst *rqst);
 extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
 		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
-		     bool is_fsctl, char *in_data, u32 indatalen,
+		     bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
 		     char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+			   u64 persistent_fid, u64 volatile_fid, u32 opcode,
+			   bool is_fsctl, char *in_data, u32 indatalen,
+			   __u32 max_response_size);
+extern void SMB2_ioctl_free(struct smb_rqst *rqst);
+extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
+			u64 persistent_fid, u64 volatile_fid, bool watch_tree,
+			u32 completion_filter);
+
 extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
@@ -150,17 +163,18 @@
 extern void SMB2_close_free(struct smb_rqst *rqst);
 extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
-extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
-			  u64 persistent_file_id, u64 volatile_file_id,
-			  int ea_buf_size,
-			  struct smb2_file_full_ea_info *data);
+extern int SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
+			   struct cifs_tcon *tcon,
+			   u64 persistent_file_id, u64 volatile_file_id);
+extern void SMB2_flush_free(struct smb_rqst *rqst);
 extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
 			   u64 persistent_file_id, u64 volatile_file_id,
 			   struct smb2_file_all_info *data);
 extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
 				u64 persistent_fid, u64 volatile_fid,
 				u8 info_class, u8 info_type,
-				u32 additional_info, size_t output_len);
+				u32 additional_info, size_t output_len,
+				size_t input_len, void *input);
 extern void SMB2_query_info_free(struct smb_rqst *rqst);
 extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
 			   u64 persistent_file_id, u64 volatile_file_id,
@@ -179,20 +193,14 @@
 extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 				u64 persistent_fid, u64 volatile_fid, int index,
 				struct cifs_search_info *srch_inf);
-extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
-		       u64 persistent_fid, u64 volatile_fid,
-		       __le16 *target_file);
-extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
-		      u64 persistent_fid, u64 volatile_fid);
-extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
-			     u64 persistent_fid, u64 volatile_fid,
-			     __le16 *target_file);
 extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
 			u64 persistent_fid, u64 volatile_fid, u32 pid,
-			__le64 *eof, bool is_fallocate);
-extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
-			 u64 persistent_fid, u64 volatile_fid,
-			 FILE_BASIC_INFO *buf);
+			__le64 *eof);
+extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+			      u64 persistent_fid, u64 volatile_fid, u32 pid,
+			      u8 info_class, u8 info_type, u32 additional_info,
+			      void **data, unsigned int *size);
+extern void SMB2_set_info_free(struct smb_rqst *rqst);
 extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
 			u64 persistent_fid, u64 volatile_fid,
 			struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
@@ -229,13 +237,27 @@
 
 extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
 					enum securityEnum);
+extern void smb2_parse_contexts(struct TCP_Server_Info *server,
+				struct smb2_create_rsp *rsp,
+				unsigned int *epoch, char *lease_key,
+				__u8 *oplock, struct smb2_file_all_info *buf);
 extern int smb3_encryption_required(const struct cifs_tcon *tcon);
 extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
 			     struct kvec *iov, unsigned int min_buf_size);
+extern int smb2_validate_and_copy_iov(unsigned int offset,
+				      unsigned int buffer_length,
+				      struct kvec *iov,
+				      unsigned int minbufsize, char *data);
 extern void smb2_copy_fs_info_to_kstatfs(
 	 struct smb2_fs_full_size_info *pfs_inf,
 	 struct kstatfs *kst);
 extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
 extern int smb311_update_preauth_hash(struct cifs_ses *ses,
 				      struct kvec *iov, int nvec);
+extern int smb2_query_info_compound(const unsigned int xid,
+				    struct cifs_tcon *tcon,
+				    __le16 *utf16_path, u32 desired_access,
+				    u32 class, u32 type, u32 output_len,
+				    struct kvec *rsp, int *buftype,
+				    struct cifs_sb_info *cifs_sb);
 #endif			/* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 3d5f621..7505056 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -30,9 +30,9 @@
  */
 
 #define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
 
 struct ntstatus {
 	/* Facility is the high 12 bits of the following field */
@@ -40,1743 +40,1743 @@
 	__le32 Code;
 };
 
-#define STATUS_SUCCESS __constant_cpu_to_le32(0x00000000)
-#define STATUS_WAIT_0 __constant_cpu_to_le32(0x00000000)
-#define STATUS_WAIT_1 __constant_cpu_to_le32(0x00000001)
-#define STATUS_WAIT_2 __constant_cpu_to_le32(0x00000002)
-#define STATUS_WAIT_3 __constant_cpu_to_le32(0x00000003)
-#define STATUS_WAIT_63 __constant_cpu_to_le32(0x0000003F)
-#define STATUS_ABANDONED __constant_cpu_to_le32(0x00000080)
-#define STATUS_ABANDONED_WAIT_0 __constant_cpu_to_le32(0x00000080)
-#define STATUS_ABANDONED_WAIT_63 __constant_cpu_to_le32(0x000000BF)
-#define STATUS_USER_APC __constant_cpu_to_le32(0x000000C0)
-#define STATUS_KERNEL_APC __constant_cpu_to_le32(0x00000100)
-#define STATUS_ALERTED __constant_cpu_to_le32(0x00000101)
-#define STATUS_TIMEOUT __constant_cpu_to_le32(0x00000102)
-#define STATUS_PENDING __constant_cpu_to_le32(0x00000103)
-#define STATUS_REPARSE __constant_cpu_to_le32(0x00000104)
-#define STATUS_MORE_ENTRIES __constant_cpu_to_le32(0x00000105)
-#define STATUS_NOT_ALL_ASSIGNED __constant_cpu_to_le32(0x00000106)
-#define STATUS_SOME_NOT_MAPPED __constant_cpu_to_le32(0x00000107)
-#define STATUS_OPLOCK_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x00000108)
-#define STATUS_VOLUME_MOUNTED __constant_cpu_to_le32(0x00000109)
-#define STATUS_RXACT_COMMITTED __constant_cpu_to_le32(0x0000010A)
-#define STATUS_NOTIFY_CLEANUP __constant_cpu_to_le32(0x0000010B)
-#define STATUS_NOTIFY_ENUM_DIR __constant_cpu_to_le32(0x0000010C)
-#define STATUS_NO_QUOTAS_FOR_ACCOUNT __constant_cpu_to_le32(0x0000010D)
-#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED __constant_cpu_to_le32(0x0000010E)
-#define STATUS_PAGE_FAULT_TRANSITION __constant_cpu_to_le32(0x00000110)
-#define STATUS_PAGE_FAULT_DEMAND_ZERO __constant_cpu_to_le32(0x00000111)
-#define STATUS_PAGE_FAULT_COPY_ON_WRITE __constant_cpu_to_le32(0x00000112)
-#define STATUS_PAGE_FAULT_GUARD_PAGE __constant_cpu_to_le32(0x00000113)
-#define STATUS_PAGE_FAULT_PAGING_FILE __constant_cpu_to_le32(0x00000114)
-#define STATUS_CACHE_PAGE_LOCKED __constant_cpu_to_le32(0x00000115)
-#define STATUS_CRASH_DUMP __constant_cpu_to_le32(0x00000116)
-#define STATUS_BUFFER_ALL_ZEROS __constant_cpu_to_le32(0x00000117)
-#define STATUS_REPARSE_OBJECT __constant_cpu_to_le32(0x00000118)
-#define STATUS_RESOURCE_REQUIREMENTS_CHANGED __constant_cpu_to_le32(0x00000119)
-#define STATUS_TRANSLATION_COMPLETE __constant_cpu_to_le32(0x00000120)
-#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY __constant_cpu_to_le32(0x00000121)
-#define STATUS_NOTHING_TO_TERMINATE __constant_cpu_to_le32(0x00000122)
-#define STATUS_PROCESS_NOT_IN_JOB __constant_cpu_to_le32(0x00000123)
-#define STATUS_PROCESS_IN_JOB __constant_cpu_to_le32(0x00000124)
-#define STATUS_VOLSNAP_HIBERNATE_READY __constant_cpu_to_le32(0x00000125)
-#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY __constant_cpu_to_le32(0x00000126)
-#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED __constant_cpu_to_le32(0x00000127)
-#define STATUS_INTERRUPT_STILL_CONNECTED __constant_cpu_to_le32(0x00000128)
-#define STATUS_PROCESS_CLONED __constant_cpu_to_le32(0x00000129)
-#define STATUS_FILE_LOCKED_WITH_ONLY_READERS __constant_cpu_to_le32(0x0000012A)
-#define STATUS_FILE_LOCKED_WITH_WRITERS __constant_cpu_to_le32(0x0000012B)
-#define STATUS_RESOURCEMANAGER_READ_ONLY __constant_cpu_to_le32(0x00000202)
-#define STATUS_WAIT_FOR_OPLOCK __constant_cpu_to_le32(0x00000367)
-#define DBG_EXCEPTION_HANDLED __constant_cpu_to_le32(0x00010001)
-#define DBG_CONTINUE __constant_cpu_to_le32(0x00010002)
-#define STATUS_FLT_IO_COMPLETE __constant_cpu_to_le32(0x001C0001)
-#define STATUS_OBJECT_NAME_EXISTS __constant_cpu_to_le32(0x40000000)
-#define STATUS_THREAD_WAS_SUSPENDED __constant_cpu_to_le32(0x40000001)
-#define STATUS_WORKING_SET_LIMIT_RANGE __constant_cpu_to_le32(0x40000002)
-#define STATUS_IMAGE_NOT_AT_BASE __constant_cpu_to_le32(0x40000003)
-#define STATUS_RXACT_STATE_CREATED __constant_cpu_to_le32(0x40000004)
-#define STATUS_SEGMENT_NOTIFICATION __constant_cpu_to_le32(0x40000005)
-#define STATUS_LOCAL_USER_SESSION_KEY __constant_cpu_to_le32(0x40000006)
-#define STATUS_BAD_CURRENT_DIRECTORY __constant_cpu_to_le32(0x40000007)
-#define STATUS_SERIAL_MORE_WRITES __constant_cpu_to_le32(0x40000008)
-#define STATUS_REGISTRY_RECOVERED __constant_cpu_to_le32(0x40000009)
-#define STATUS_FT_READ_RECOVERY_FROM_BACKUP __constant_cpu_to_le32(0x4000000A)
-#define STATUS_FT_WRITE_RECOVERY __constant_cpu_to_le32(0x4000000B)
-#define STATUS_SERIAL_COUNTER_TIMEOUT __constant_cpu_to_le32(0x4000000C)
-#define STATUS_NULL_LM_PASSWORD __constant_cpu_to_le32(0x4000000D)
-#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH __constant_cpu_to_le32(0x4000000E)
-#define STATUS_RECEIVE_PARTIAL __constant_cpu_to_le32(0x4000000F)
-#define STATUS_RECEIVE_EXPEDITED __constant_cpu_to_le32(0x40000010)
-#define STATUS_RECEIVE_PARTIAL_EXPEDITED __constant_cpu_to_le32(0x40000011)
-#define STATUS_EVENT_DONE __constant_cpu_to_le32(0x40000012)
-#define STATUS_EVENT_PENDING __constant_cpu_to_le32(0x40000013)
-#define STATUS_CHECKING_FILE_SYSTEM __constant_cpu_to_le32(0x40000014)
-#define STATUS_FATAL_APP_EXIT __constant_cpu_to_le32(0x40000015)
-#define STATUS_PREDEFINED_HANDLE __constant_cpu_to_le32(0x40000016)
-#define STATUS_WAS_UNLOCKED __constant_cpu_to_le32(0x40000017)
-#define STATUS_SERVICE_NOTIFICATION __constant_cpu_to_le32(0x40000018)
-#define STATUS_WAS_LOCKED __constant_cpu_to_le32(0x40000019)
-#define STATUS_LOG_HARD_ERROR __constant_cpu_to_le32(0x4000001A)
-#define STATUS_ALREADY_WIN32 __constant_cpu_to_le32(0x4000001B)
-#define STATUS_WX86_UNSIMULATE __constant_cpu_to_le32(0x4000001C)
-#define STATUS_WX86_CONTINUE __constant_cpu_to_le32(0x4000001D)
-#define STATUS_WX86_SINGLE_STEP __constant_cpu_to_le32(0x4000001E)
-#define STATUS_WX86_BREAKPOINT __constant_cpu_to_le32(0x4000001F)
-#define STATUS_WX86_EXCEPTION_CONTINUE __constant_cpu_to_le32(0x40000020)
-#define STATUS_WX86_EXCEPTION_LASTCHANCE __constant_cpu_to_le32(0x40000021)
-#define STATUS_WX86_EXCEPTION_CHAIN __constant_cpu_to_le32(0x40000022)
-#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE __constant_cpu_to_le32(0x40000023)
-#define STATUS_NO_YIELD_PERFORMED __constant_cpu_to_le32(0x40000024)
-#define STATUS_TIMER_RESUME_IGNORED __constant_cpu_to_le32(0x40000025)
-#define STATUS_ARBITRATION_UNHANDLED __constant_cpu_to_le32(0x40000026)
-#define STATUS_CARDBUS_NOT_SUPPORTED __constant_cpu_to_le32(0x40000027)
-#define STATUS_WX86_CREATEWX86TIB __constant_cpu_to_le32(0x40000028)
-#define STATUS_MP_PROCESSOR_MISMATCH __constant_cpu_to_le32(0x40000029)
-#define STATUS_HIBERNATED __constant_cpu_to_le32(0x4000002A)
-#define STATUS_RESUME_HIBERNATION __constant_cpu_to_le32(0x4000002B)
-#define STATUS_FIRMWARE_UPDATED __constant_cpu_to_le32(0x4000002C)
-#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES __constant_cpu_to_le32(0x4000002D)
-#define STATUS_MESSAGE_RETRIEVED __constant_cpu_to_le32(0x4000002E)
-#define STATUS_SYSTEM_POWERSTATE_TRANSITION __constant_cpu_to_le32(0x4000002F)
-#define STATUS_ALPC_CHECK_COMPLETION_LIST __constant_cpu_to_le32(0x40000030)
-#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION __constant_cpu_to_le32(0x40000031)
-#define STATUS_ACCESS_AUDIT_BY_POLICY __constant_cpu_to_le32(0x40000032)
-#define STATUS_ABANDON_HIBERFILE __constant_cpu_to_le32(0x40000033)
-#define STATUS_BIZRULES_NOT_ENABLED __constant_cpu_to_le32(0x40000034)
-#define STATUS_WAKE_SYSTEM __constant_cpu_to_le32(0x40000294)
-#define STATUS_DS_SHUTTING_DOWN __constant_cpu_to_le32(0x40000370)
-#define DBG_REPLY_LATER __constant_cpu_to_le32(0x40010001)
-#define DBG_UNABLE_TO_PROVIDE_HANDLE __constant_cpu_to_le32(0x40010002)
-#define DBG_TERMINATE_THREAD __constant_cpu_to_le32(0x40010003)
-#define DBG_TERMINATE_PROCESS __constant_cpu_to_le32(0x40010004)
-#define DBG_CONTROL_C __constant_cpu_to_le32(0x40010005)
-#define DBG_PRINTEXCEPTION_C __constant_cpu_to_le32(0x40010006)
-#define DBG_RIPEXCEPTION __constant_cpu_to_le32(0x40010007)
-#define DBG_CONTROL_BREAK __constant_cpu_to_le32(0x40010008)
-#define DBG_COMMAND_EXCEPTION __constant_cpu_to_le32(0x40010009)
-#define RPC_NT_UUID_LOCAL_ONLY __constant_cpu_to_le32(0x40020056)
-#define RPC_NT_SEND_INCOMPLETE __constant_cpu_to_le32(0x400200AF)
-#define STATUS_CTX_CDM_CONNECT __constant_cpu_to_le32(0x400A0004)
-#define STATUS_CTX_CDM_DISCONNECT __constant_cpu_to_le32(0x400A0005)
-#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT __constant_cpu_to_le32(0x4015000D)
-#define STATUS_RECOVERY_NOT_NEEDED __constant_cpu_to_le32(0x40190034)
-#define STATUS_RM_ALREADY_STARTED __constant_cpu_to_le32(0x40190035)
-#define STATUS_LOG_NO_RESTART __constant_cpu_to_le32(0x401A000C)
-#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST __constant_cpu_to_le32(0x401B00EC)
-#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED __constant_cpu_to_le32(0x401E000A)
-#define STATUS_GRAPHICS_DRIVER_MISMATCH __constant_cpu_to_le32(0x401E0117)
-#define STATUS_GRAPHICS_MODE_NOT_PINNED __constant_cpu_to_le32(0x401E0307)
-#define STATUS_GRAPHICS_NO_PREFERRED_MODE __constant_cpu_to_le32(0x401E031E)
-#define STATUS_GRAPHICS_DATASET_IS_EMPTY __constant_cpu_to_le32(0x401E034B)
-#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET __constant_cpu_to_le32(0x401E034C)
-#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED __constant_cpu_to_le32(0x401E0351)
-#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS __constant_cpu_to_le32(0x401E042F)
-#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED __constant_cpu_to_le32(0x401E0437)
-#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY __constant_cpu_to_le32(0x401E0439)
-#define STATUS_GRAPHICS_START_DEFERRED __constant_cpu_to_le32(0x401E043A)
-#define STATUS_NDIS_INDICATION_REQUIRED __constant_cpu_to_le32(0x40230001)
-#define STATUS_GUARD_PAGE_VIOLATION __constant_cpu_to_le32(0x80000001)
-#define STATUS_DATATYPE_MISALIGNMENT __constant_cpu_to_le32(0x80000002)
-#define STATUS_BREAKPOINT __constant_cpu_to_le32(0x80000003)
-#define STATUS_SINGLE_STEP __constant_cpu_to_le32(0x80000004)
-#define STATUS_BUFFER_OVERFLOW __constant_cpu_to_le32(0x80000005)
-#define STATUS_NO_MORE_FILES __constant_cpu_to_le32(0x80000006)
-#define STATUS_WAKE_SYSTEM_DEBUGGER __constant_cpu_to_le32(0x80000007)
-#define STATUS_HANDLES_CLOSED __constant_cpu_to_le32(0x8000000A)
-#define STATUS_NO_INHERITANCE __constant_cpu_to_le32(0x8000000B)
-#define STATUS_GUID_SUBSTITUTION_MADE __constant_cpu_to_le32(0x8000000C)
-#define STATUS_PARTIAL_COPY __constant_cpu_to_le32(0x8000000D)
-#define STATUS_DEVICE_PAPER_EMPTY __constant_cpu_to_le32(0x8000000E)
-#define STATUS_DEVICE_POWERED_OFF __constant_cpu_to_le32(0x8000000F)
-#define STATUS_DEVICE_OFF_LINE __constant_cpu_to_le32(0x80000010)
-#define STATUS_DEVICE_BUSY __constant_cpu_to_le32(0x80000011)
-#define STATUS_NO_MORE_EAS __constant_cpu_to_le32(0x80000012)
-#define STATUS_INVALID_EA_NAME __constant_cpu_to_le32(0x80000013)
-#define STATUS_EA_LIST_INCONSISTENT __constant_cpu_to_le32(0x80000014)
-#define STATUS_INVALID_EA_FLAG __constant_cpu_to_le32(0x80000015)
-#define STATUS_VERIFY_REQUIRED __constant_cpu_to_le32(0x80000016)
-#define STATUS_EXTRANEOUS_INFORMATION __constant_cpu_to_le32(0x80000017)
-#define STATUS_RXACT_COMMIT_NECESSARY __constant_cpu_to_le32(0x80000018)
-#define STATUS_NO_MORE_ENTRIES __constant_cpu_to_le32(0x8000001A)
-#define STATUS_FILEMARK_DETECTED __constant_cpu_to_le32(0x8000001B)
-#define STATUS_MEDIA_CHANGED __constant_cpu_to_le32(0x8000001C)
-#define STATUS_BUS_RESET __constant_cpu_to_le32(0x8000001D)
-#define STATUS_END_OF_MEDIA __constant_cpu_to_le32(0x8000001E)
-#define STATUS_BEGINNING_OF_MEDIA __constant_cpu_to_le32(0x8000001F)
-#define STATUS_MEDIA_CHECK __constant_cpu_to_le32(0x80000020)
-#define STATUS_SETMARK_DETECTED __constant_cpu_to_le32(0x80000021)
-#define STATUS_NO_DATA_DETECTED __constant_cpu_to_le32(0x80000022)
-#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000023)
-#define STATUS_SERVER_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000024)
-#define STATUS_ALREADY_DISCONNECTED __constant_cpu_to_le32(0x80000025)
-#define STATUS_LONGJUMP __constant_cpu_to_le32(0x80000026)
-#define STATUS_CLEANER_CARTRIDGE_INSTALLED __constant_cpu_to_le32(0x80000027)
-#define STATUS_PLUGPLAY_QUERY_VETOED __constant_cpu_to_le32(0x80000028)
-#define STATUS_UNWIND_CONSOLIDATE __constant_cpu_to_le32(0x80000029)
-#define STATUS_REGISTRY_HIVE_RECOVERED __constant_cpu_to_le32(0x8000002A)
-#define STATUS_DLL_MIGHT_BE_INSECURE __constant_cpu_to_le32(0x8000002B)
-#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE __constant_cpu_to_le32(0x8000002C)
-#define STATUS_STOPPED_ON_SYMLINK __constant_cpu_to_le32(0x8000002D)
-#define STATUS_DEVICE_REQUIRES_CLEANING __constant_cpu_to_le32(0x80000288)
-#define STATUS_DEVICE_DOOR_OPEN __constant_cpu_to_le32(0x80000289)
-#define STATUS_DATA_LOST_REPAIR __constant_cpu_to_le32(0x80000803)
-#define DBG_EXCEPTION_NOT_HANDLED __constant_cpu_to_le32(0x80010001)
-#define STATUS_CLUSTER_NODE_ALREADY_UP __constant_cpu_to_le32(0x80130001)
-#define STATUS_CLUSTER_NODE_ALREADY_DOWN __constant_cpu_to_le32(0x80130002)
-#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE __constant_cpu_to_le32(0x80130003)
-#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE __constant_cpu_to_le32(0x80130004)
-#define STATUS_CLUSTER_NODE_ALREADY_MEMBER __constant_cpu_to_le32(0x80130005)
-#define STATUS_COULD_NOT_RESIZE_LOG __constant_cpu_to_le32(0x80190009)
-#define STATUS_NO_TXF_METADATA __constant_cpu_to_le32(0x80190029)
-#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN __constant_cpu_to_le32(0x80190031)
-#define STATUS_TXF_METADATA_ALREADY_PRESENT __constant_cpu_to_le32(0x80190041)
-#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET __constant_cpu_to_le32(0x80190042)
-#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED __constant_cpu_to_le32(0x801B00EB)
-#define STATUS_FLT_BUFFER_TOO_SMALL __constant_cpu_to_le32(0x801C0001)
-#define STATUS_FVE_PARTIAL_METADATA __constant_cpu_to_le32(0x80210001)
-#define STATUS_UNSUCCESSFUL __constant_cpu_to_le32(0xC0000001)
-#define STATUS_NOT_IMPLEMENTED __constant_cpu_to_le32(0xC0000002)
-#define STATUS_INVALID_INFO_CLASS __constant_cpu_to_le32(0xC0000003)
-#define STATUS_INFO_LENGTH_MISMATCH __constant_cpu_to_le32(0xC0000004)
-#define STATUS_ACCESS_VIOLATION __constant_cpu_to_le32(0xC0000005)
-#define STATUS_IN_PAGE_ERROR __constant_cpu_to_le32(0xC0000006)
-#define STATUS_PAGEFILE_QUOTA __constant_cpu_to_le32(0xC0000007)
-#define STATUS_INVALID_HANDLE __constant_cpu_to_le32(0xC0000008)
-#define STATUS_BAD_INITIAL_STACK __constant_cpu_to_le32(0xC0000009)
-#define STATUS_BAD_INITIAL_PC __constant_cpu_to_le32(0xC000000A)
-#define STATUS_INVALID_CID __constant_cpu_to_le32(0xC000000B)
-#define STATUS_TIMER_NOT_CANCELED __constant_cpu_to_le32(0xC000000C)
-#define STATUS_INVALID_PARAMETER __constant_cpu_to_le32(0xC000000D)
-#define STATUS_NO_SUCH_DEVICE __constant_cpu_to_le32(0xC000000E)
-#define STATUS_NO_SUCH_FILE __constant_cpu_to_le32(0xC000000F)
-#define STATUS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0000010)
-#define STATUS_END_OF_FILE __constant_cpu_to_le32(0xC0000011)
-#define STATUS_WRONG_VOLUME __constant_cpu_to_le32(0xC0000012)
-#define STATUS_NO_MEDIA_IN_DEVICE __constant_cpu_to_le32(0xC0000013)
-#define STATUS_UNRECOGNIZED_MEDIA __constant_cpu_to_le32(0xC0000014)
-#define STATUS_NONEXISTENT_SECTOR __constant_cpu_to_le32(0xC0000015)
-#define STATUS_MORE_PROCESSING_REQUIRED __constant_cpu_to_le32(0xC0000016)
-#define STATUS_NO_MEMORY __constant_cpu_to_le32(0xC0000017)
-#define STATUS_CONFLICTING_ADDRESSES __constant_cpu_to_le32(0xC0000018)
-#define STATUS_NOT_MAPPED_VIEW __constant_cpu_to_le32(0xC0000019)
-#define STATUS_UNABLE_TO_FREE_VM __constant_cpu_to_le32(0xC000001A)
-#define STATUS_UNABLE_TO_DELETE_SECTION __constant_cpu_to_le32(0xC000001B)
-#define STATUS_INVALID_SYSTEM_SERVICE __constant_cpu_to_le32(0xC000001C)
-#define STATUS_ILLEGAL_INSTRUCTION __constant_cpu_to_le32(0xC000001D)
-#define STATUS_INVALID_LOCK_SEQUENCE __constant_cpu_to_le32(0xC000001E)
-#define STATUS_INVALID_VIEW_SIZE __constant_cpu_to_le32(0xC000001F)
-#define STATUS_INVALID_FILE_FOR_SECTION __constant_cpu_to_le32(0xC0000020)
-#define STATUS_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0000021)
-#define STATUS_ACCESS_DENIED __constant_cpu_to_le32(0xC0000022)
-#define STATUS_BUFFER_TOO_SMALL __constant_cpu_to_le32(0xC0000023)
-#define STATUS_OBJECT_TYPE_MISMATCH __constant_cpu_to_le32(0xC0000024)
-#define STATUS_NONCONTINUABLE_EXCEPTION __constant_cpu_to_le32(0xC0000025)
-#define STATUS_INVALID_DISPOSITION __constant_cpu_to_le32(0xC0000026)
-#define STATUS_UNWIND __constant_cpu_to_le32(0xC0000027)
-#define STATUS_BAD_STACK __constant_cpu_to_le32(0xC0000028)
-#define STATUS_INVALID_UNWIND_TARGET __constant_cpu_to_le32(0xC0000029)
-#define STATUS_NOT_LOCKED __constant_cpu_to_le32(0xC000002A)
-#define STATUS_PARITY_ERROR __constant_cpu_to_le32(0xC000002B)
-#define STATUS_UNABLE_TO_DECOMMIT_VM __constant_cpu_to_le32(0xC000002C)
-#define STATUS_NOT_COMMITTED __constant_cpu_to_le32(0xC000002D)
-#define STATUS_INVALID_PORT_ATTRIBUTES __constant_cpu_to_le32(0xC000002E)
-#define STATUS_PORT_MESSAGE_TOO_LONG __constant_cpu_to_le32(0xC000002F)
-#define STATUS_INVALID_PARAMETER_MIX __constant_cpu_to_le32(0xC0000030)
-#define STATUS_INVALID_QUOTA_LOWER __constant_cpu_to_le32(0xC0000031)
-#define STATUS_DISK_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000032)
-#define STATUS_OBJECT_NAME_INVALID __constant_cpu_to_le32(0xC0000033)
-#define STATUS_OBJECT_NAME_NOT_FOUND __constant_cpu_to_le32(0xC0000034)
-#define STATUS_OBJECT_NAME_COLLISION __constant_cpu_to_le32(0xC0000035)
-#define STATUS_PORT_DISCONNECTED __constant_cpu_to_le32(0xC0000037)
-#define STATUS_DEVICE_ALREADY_ATTACHED __constant_cpu_to_le32(0xC0000038)
-#define STATUS_OBJECT_PATH_INVALID __constant_cpu_to_le32(0xC0000039)
-#define STATUS_OBJECT_PATH_NOT_FOUND __constant_cpu_to_le32(0xC000003A)
-#define STATUS_OBJECT_PATH_SYNTAX_BAD __constant_cpu_to_le32(0xC000003B)
-#define STATUS_DATA_OVERRUN __constant_cpu_to_le32(0xC000003C)
-#define STATUS_DATA_LATE_ERROR __constant_cpu_to_le32(0xC000003D)
-#define STATUS_DATA_ERROR __constant_cpu_to_le32(0xC000003E)
-#define STATUS_CRC_ERROR __constant_cpu_to_le32(0xC000003F)
-#define STATUS_SECTION_TOO_BIG __constant_cpu_to_le32(0xC0000040)
-#define STATUS_PORT_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000041)
-#define STATUS_INVALID_PORT_HANDLE __constant_cpu_to_le32(0xC0000042)
-#define STATUS_SHARING_VIOLATION __constant_cpu_to_le32(0xC0000043)
-#define STATUS_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000044)
-#define STATUS_INVALID_PAGE_PROTECTION __constant_cpu_to_le32(0xC0000045)
-#define STATUS_MUTANT_NOT_OWNED __constant_cpu_to_le32(0xC0000046)
-#define STATUS_SEMAPHORE_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000047)
-#define STATUS_PORT_ALREADY_SET __constant_cpu_to_le32(0xC0000048)
-#define STATUS_SECTION_NOT_IMAGE __constant_cpu_to_le32(0xC0000049)
-#define STATUS_SUSPEND_COUNT_EXCEEDED __constant_cpu_to_le32(0xC000004A)
-#define STATUS_THREAD_IS_TERMINATING __constant_cpu_to_le32(0xC000004B)
-#define STATUS_BAD_WORKING_SET_LIMIT __constant_cpu_to_le32(0xC000004C)
-#define STATUS_INCOMPATIBLE_FILE_MAP __constant_cpu_to_le32(0xC000004D)
-#define STATUS_SECTION_PROTECTION __constant_cpu_to_le32(0xC000004E)
-#define STATUS_EAS_NOT_SUPPORTED __constant_cpu_to_le32(0xC000004F)
-#define STATUS_EA_TOO_LARGE __constant_cpu_to_le32(0xC0000050)
-#define STATUS_NONEXISTENT_EA_ENTRY __constant_cpu_to_le32(0xC0000051)
-#define STATUS_NO_EAS_ON_FILE __constant_cpu_to_le32(0xC0000052)
-#define STATUS_EA_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000053)
-#define STATUS_FILE_LOCK_CONFLICT __constant_cpu_to_le32(0xC0000054)
-#define STATUS_LOCK_NOT_GRANTED __constant_cpu_to_le32(0xC0000055)
-#define STATUS_DELETE_PENDING __constant_cpu_to_le32(0xC0000056)
-#define STATUS_CTL_FILE_NOT_SUPPORTED __constant_cpu_to_le32(0xC0000057)
-#define STATUS_UNKNOWN_REVISION __constant_cpu_to_le32(0xC0000058)
-#define STATUS_REVISION_MISMATCH __constant_cpu_to_le32(0xC0000059)
-#define STATUS_INVALID_OWNER __constant_cpu_to_le32(0xC000005A)
-#define STATUS_INVALID_PRIMARY_GROUP __constant_cpu_to_le32(0xC000005B)
-#define STATUS_NO_IMPERSONATION_TOKEN __constant_cpu_to_le32(0xC000005C)
-#define STATUS_CANT_DISABLE_MANDATORY __constant_cpu_to_le32(0xC000005D)
-#define STATUS_NO_LOGON_SERVERS __constant_cpu_to_le32(0xC000005E)
-#define STATUS_NO_SUCH_LOGON_SESSION __constant_cpu_to_le32(0xC000005F)
-#define STATUS_NO_SUCH_PRIVILEGE __constant_cpu_to_le32(0xC0000060)
-#define STATUS_PRIVILEGE_NOT_HELD __constant_cpu_to_le32(0xC0000061)
-#define STATUS_INVALID_ACCOUNT_NAME __constant_cpu_to_le32(0xC0000062)
-#define STATUS_USER_EXISTS __constant_cpu_to_le32(0xC0000063)
-#define STATUS_NO_SUCH_USER __constant_cpu_to_le32(0xC0000064)
-#define STATUS_GROUP_EXISTS __constant_cpu_to_le32(0xC0000065)
-#define STATUS_NO_SUCH_GROUP __constant_cpu_to_le32(0xC0000066)
-#define STATUS_MEMBER_IN_GROUP __constant_cpu_to_le32(0xC0000067)
-#define STATUS_MEMBER_NOT_IN_GROUP __constant_cpu_to_le32(0xC0000068)
-#define STATUS_LAST_ADMIN __constant_cpu_to_le32(0xC0000069)
-#define STATUS_WRONG_PASSWORD __constant_cpu_to_le32(0xC000006A)
-#define STATUS_ILL_FORMED_PASSWORD __constant_cpu_to_le32(0xC000006B)
-#define STATUS_PASSWORD_RESTRICTION __constant_cpu_to_le32(0xC000006C)
-#define STATUS_LOGON_FAILURE __constant_cpu_to_le32(0xC000006D)
-#define STATUS_ACCOUNT_RESTRICTION __constant_cpu_to_le32(0xC000006E)
-#define STATUS_INVALID_LOGON_HOURS __constant_cpu_to_le32(0xC000006F)
-#define STATUS_INVALID_WORKSTATION __constant_cpu_to_le32(0xC0000070)
-#define STATUS_PASSWORD_EXPIRED __constant_cpu_to_le32(0xC0000071)
-#define STATUS_ACCOUNT_DISABLED __constant_cpu_to_le32(0xC0000072)
-#define STATUS_NONE_MAPPED __constant_cpu_to_le32(0xC0000073)
-#define STATUS_TOO_MANY_LUIDS_REQUESTED __constant_cpu_to_le32(0xC0000074)
-#define STATUS_LUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000075)
-#define STATUS_INVALID_SUB_AUTHORITY __constant_cpu_to_le32(0xC0000076)
-#define STATUS_INVALID_ACL __constant_cpu_to_le32(0xC0000077)
-#define STATUS_INVALID_SID __constant_cpu_to_le32(0xC0000078)
-#define STATUS_INVALID_SECURITY_DESCR __constant_cpu_to_le32(0xC0000079)
-#define STATUS_PROCEDURE_NOT_FOUND __constant_cpu_to_le32(0xC000007A)
-#define STATUS_INVALID_IMAGE_FORMAT __constant_cpu_to_le32(0xC000007B)
-#define STATUS_NO_TOKEN __constant_cpu_to_le32(0xC000007C)
-#define STATUS_BAD_INHERITANCE_ACL __constant_cpu_to_le32(0xC000007D)
-#define STATUS_RANGE_NOT_LOCKED __constant_cpu_to_le32(0xC000007E)
-#define STATUS_DISK_FULL __constant_cpu_to_le32(0xC000007F)
-#define STATUS_SERVER_DISABLED __constant_cpu_to_le32(0xC0000080)
-#define STATUS_SERVER_NOT_DISABLED __constant_cpu_to_le32(0xC0000081)
-#define STATUS_TOO_MANY_GUIDS_REQUESTED __constant_cpu_to_le32(0xC0000082)
-#define STATUS_GUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000083)
-#define STATUS_INVALID_ID_AUTHORITY __constant_cpu_to_le32(0xC0000084)
-#define STATUS_AGENTS_EXHAUSTED __constant_cpu_to_le32(0xC0000085)
-#define STATUS_INVALID_VOLUME_LABEL __constant_cpu_to_le32(0xC0000086)
-#define STATUS_SECTION_NOT_EXTENDED __constant_cpu_to_le32(0xC0000087)
-#define STATUS_NOT_MAPPED_DATA __constant_cpu_to_le32(0xC0000088)
-#define STATUS_RESOURCE_DATA_NOT_FOUND __constant_cpu_to_le32(0xC0000089)
-#define STATUS_RESOURCE_TYPE_NOT_FOUND __constant_cpu_to_le32(0xC000008A)
-#define STATUS_RESOURCE_NAME_NOT_FOUND __constant_cpu_to_le32(0xC000008B)
-#define STATUS_ARRAY_BOUNDS_EXCEEDED __constant_cpu_to_le32(0xC000008C)
-#define STATUS_FLOAT_DENORMAL_OPERAND __constant_cpu_to_le32(0xC000008D)
-#define STATUS_FLOAT_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC000008E)
-#define STATUS_FLOAT_INEXACT_RESULT __constant_cpu_to_le32(0xC000008F)
-#define STATUS_FLOAT_INVALID_OPERATION __constant_cpu_to_le32(0xC0000090)
-#define STATUS_FLOAT_OVERFLOW __constant_cpu_to_le32(0xC0000091)
-#define STATUS_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000092)
-#define STATUS_FLOAT_UNDERFLOW __constant_cpu_to_le32(0xC0000093)
-#define STATUS_INTEGER_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC0000094)
-#define STATUS_INTEGER_OVERFLOW __constant_cpu_to_le32(0xC0000095)
-#define STATUS_PRIVILEGED_INSTRUCTION __constant_cpu_to_le32(0xC0000096)
-#define STATUS_TOO_MANY_PAGING_FILES __constant_cpu_to_le32(0xC0000097)
-#define STATUS_FILE_INVALID __constant_cpu_to_le32(0xC0000098)
-#define STATUS_ALLOTTED_SPACE_EXCEEDED __constant_cpu_to_le32(0xC0000099)
-#define STATUS_INSUFFICIENT_RESOURCES __constant_cpu_to_le32(0xC000009A)
-#define STATUS_DFS_EXIT_PATH_FOUND __constant_cpu_to_le32(0xC000009B)
-#define STATUS_DEVICE_DATA_ERROR __constant_cpu_to_le32(0xC000009C)
-#define STATUS_DEVICE_NOT_CONNECTED __constant_cpu_to_le32(0xC000009D)
-#define STATUS_DEVICE_POWER_FAILURE __constant_cpu_to_le32(0xC000009E)
-#define STATUS_FREE_VM_NOT_AT_BASE __constant_cpu_to_le32(0xC000009F)
-#define STATUS_MEMORY_NOT_ALLOCATED __constant_cpu_to_le32(0xC00000A0)
-#define STATUS_WORKING_SET_QUOTA __constant_cpu_to_le32(0xC00000A1)
-#define STATUS_MEDIA_WRITE_PROTECTED __constant_cpu_to_le32(0xC00000A2)
-#define STATUS_DEVICE_NOT_READY __constant_cpu_to_le32(0xC00000A3)
-#define STATUS_INVALID_GROUP_ATTRIBUTES __constant_cpu_to_le32(0xC00000A4)
-#define STATUS_BAD_IMPERSONATION_LEVEL __constant_cpu_to_le32(0xC00000A5)
-#define STATUS_CANT_OPEN_ANONYMOUS __constant_cpu_to_le32(0xC00000A6)
-#define STATUS_BAD_VALIDATION_CLASS __constant_cpu_to_le32(0xC00000A7)
-#define STATUS_BAD_TOKEN_TYPE __constant_cpu_to_le32(0xC00000A8)
-#define STATUS_BAD_MASTER_BOOT_RECORD __constant_cpu_to_le32(0xC00000A9)
-#define STATUS_INSTRUCTION_MISALIGNMENT __constant_cpu_to_le32(0xC00000AA)
-#define STATUS_INSTANCE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AB)
-#define STATUS_PIPE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AC)
-#define STATUS_INVALID_PIPE_STATE __constant_cpu_to_le32(0xC00000AD)
-#define STATUS_PIPE_BUSY __constant_cpu_to_le32(0xC00000AE)
-#define STATUS_ILLEGAL_FUNCTION __constant_cpu_to_le32(0xC00000AF)
-#define STATUS_PIPE_DISCONNECTED __constant_cpu_to_le32(0xC00000B0)
-#define STATUS_PIPE_CLOSING __constant_cpu_to_le32(0xC00000B1)
-#define STATUS_PIPE_CONNECTED __constant_cpu_to_le32(0xC00000B2)
-#define STATUS_PIPE_LISTENING __constant_cpu_to_le32(0xC00000B3)
-#define STATUS_INVALID_READ_MODE __constant_cpu_to_le32(0xC00000B4)
-#define STATUS_IO_TIMEOUT __constant_cpu_to_le32(0xC00000B5)
-#define STATUS_FILE_FORCED_CLOSED __constant_cpu_to_le32(0xC00000B6)
-#define STATUS_PROFILING_NOT_STARTED __constant_cpu_to_le32(0xC00000B7)
-#define STATUS_PROFILING_NOT_STOPPED __constant_cpu_to_le32(0xC00000B8)
-#define STATUS_COULD_NOT_INTERPRET __constant_cpu_to_le32(0xC00000B9)
-#define STATUS_FILE_IS_A_DIRECTORY __constant_cpu_to_le32(0xC00000BA)
-#define STATUS_NOT_SUPPORTED __constant_cpu_to_le32(0xC00000BB)
-#define STATUS_REMOTE_NOT_LISTENING __constant_cpu_to_le32(0xC00000BC)
-#define STATUS_DUPLICATE_NAME __constant_cpu_to_le32(0xC00000BD)
-#define STATUS_BAD_NETWORK_PATH __constant_cpu_to_le32(0xC00000BE)
-#define STATUS_NETWORK_BUSY __constant_cpu_to_le32(0xC00000BF)
-#define STATUS_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC00000C0)
-#define STATUS_TOO_MANY_COMMANDS __constant_cpu_to_le32(0xC00000C1)
-#define STATUS_ADAPTER_HARDWARE_ERROR __constant_cpu_to_le32(0xC00000C2)
-#define STATUS_INVALID_NETWORK_RESPONSE __constant_cpu_to_le32(0xC00000C3)
-#define STATUS_UNEXPECTED_NETWORK_ERROR __constant_cpu_to_le32(0xC00000C4)
-#define STATUS_BAD_REMOTE_ADAPTER __constant_cpu_to_le32(0xC00000C5)
-#define STATUS_PRINT_QUEUE_FULL __constant_cpu_to_le32(0xC00000C6)
-#define STATUS_NO_SPOOL_SPACE __constant_cpu_to_le32(0xC00000C7)
-#define STATUS_PRINT_CANCELLED __constant_cpu_to_le32(0xC00000C8)
-#define STATUS_NETWORK_NAME_DELETED __constant_cpu_to_le32(0xC00000C9)
-#define STATUS_NETWORK_ACCESS_DENIED __constant_cpu_to_le32(0xC00000CA)
-#define STATUS_BAD_DEVICE_TYPE __constant_cpu_to_le32(0xC00000CB)
-#define STATUS_BAD_NETWORK_NAME __constant_cpu_to_le32(0xC00000CC)
-#define STATUS_TOO_MANY_NAMES __constant_cpu_to_le32(0xC00000CD)
-#define STATUS_TOO_MANY_SESSIONS __constant_cpu_to_le32(0xC00000CE)
-#define STATUS_SHARING_PAUSED __constant_cpu_to_le32(0xC00000CF)
-#define STATUS_REQUEST_NOT_ACCEPTED __constant_cpu_to_le32(0xC00000D0)
-#define STATUS_REDIRECTOR_PAUSED __constant_cpu_to_le32(0xC00000D1)
-#define STATUS_NET_WRITE_FAULT __constant_cpu_to_le32(0xC00000D2)
-#define STATUS_PROFILING_AT_LIMIT __constant_cpu_to_le32(0xC00000D3)
-#define STATUS_NOT_SAME_DEVICE __constant_cpu_to_le32(0xC00000D4)
-#define STATUS_FILE_RENAMED __constant_cpu_to_le32(0xC00000D5)
-#define STATUS_VIRTUAL_CIRCUIT_CLOSED __constant_cpu_to_le32(0xC00000D6)
-#define STATUS_NO_SECURITY_ON_OBJECT __constant_cpu_to_le32(0xC00000D7)
-#define STATUS_CANT_WAIT __constant_cpu_to_le32(0xC00000D8)
-#define STATUS_PIPE_EMPTY __constant_cpu_to_le32(0xC00000D9)
-#define STATUS_CANT_ACCESS_DOMAIN_INFO __constant_cpu_to_le32(0xC00000DA)
-#define STATUS_CANT_TERMINATE_SELF __constant_cpu_to_le32(0xC00000DB)
-#define STATUS_INVALID_SERVER_STATE __constant_cpu_to_le32(0xC00000DC)
-#define STATUS_INVALID_DOMAIN_STATE __constant_cpu_to_le32(0xC00000DD)
-#define STATUS_INVALID_DOMAIN_ROLE __constant_cpu_to_le32(0xC00000DE)
-#define STATUS_NO_SUCH_DOMAIN __constant_cpu_to_le32(0xC00000DF)
-#define STATUS_DOMAIN_EXISTS __constant_cpu_to_le32(0xC00000E0)
-#define STATUS_DOMAIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00000E1)
-#define STATUS_OPLOCK_NOT_GRANTED __constant_cpu_to_le32(0xC00000E2)
-#define STATUS_INVALID_OPLOCK_PROTOCOL __constant_cpu_to_le32(0xC00000E3)
-#define STATUS_INTERNAL_DB_CORRUPTION __constant_cpu_to_le32(0xC00000E4)
-#define STATUS_INTERNAL_ERROR __constant_cpu_to_le32(0xC00000E5)
-#define STATUS_GENERIC_NOT_MAPPED __constant_cpu_to_le32(0xC00000E6)
-#define STATUS_BAD_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC00000E7)
-#define STATUS_INVALID_USER_BUFFER __constant_cpu_to_le32(0xC00000E8)
-#define STATUS_UNEXPECTED_IO_ERROR __constant_cpu_to_le32(0xC00000E9)
-#define STATUS_UNEXPECTED_MM_CREATE_ERR __constant_cpu_to_le32(0xC00000EA)
-#define STATUS_UNEXPECTED_MM_MAP_ERROR __constant_cpu_to_le32(0xC00000EB)
-#define STATUS_UNEXPECTED_MM_EXTEND_ERR __constant_cpu_to_le32(0xC00000EC)
-#define STATUS_NOT_LOGON_PROCESS __constant_cpu_to_le32(0xC00000ED)
-#define STATUS_LOGON_SESSION_EXISTS __constant_cpu_to_le32(0xC00000EE)
-#define STATUS_INVALID_PARAMETER_1 __constant_cpu_to_le32(0xC00000EF)
-#define STATUS_INVALID_PARAMETER_2 __constant_cpu_to_le32(0xC00000F0)
-#define STATUS_INVALID_PARAMETER_3 __constant_cpu_to_le32(0xC00000F1)
-#define STATUS_INVALID_PARAMETER_4 __constant_cpu_to_le32(0xC00000F2)
-#define STATUS_INVALID_PARAMETER_5 __constant_cpu_to_le32(0xC00000F3)
-#define STATUS_INVALID_PARAMETER_6 __constant_cpu_to_le32(0xC00000F4)
-#define STATUS_INVALID_PARAMETER_7 __constant_cpu_to_le32(0xC00000F5)
-#define STATUS_INVALID_PARAMETER_8 __constant_cpu_to_le32(0xC00000F6)
-#define STATUS_INVALID_PARAMETER_9 __constant_cpu_to_le32(0xC00000F7)
-#define STATUS_INVALID_PARAMETER_10 __constant_cpu_to_le32(0xC00000F8)
-#define STATUS_INVALID_PARAMETER_11 __constant_cpu_to_le32(0xC00000F9)
-#define STATUS_INVALID_PARAMETER_12 __constant_cpu_to_le32(0xC00000FA)
-#define STATUS_REDIRECTOR_NOT_STARTED __constant_cpu_to_le32(0xC00000FB)
-#define STATUS_REDIRECTOR_STARTED __constant_cpu_to_le32(0xC00000FC)
-#define STATUS_STACK_OVERFLOW __constant_cpu_to_le32(0xC00000FD)
-#define STATUS_NO_SUCH_PACKAGE __constant_cpu_to_le32(0xC00000FE)
-#define STATUS_BAD_FUNCTION_TABLE __constant_cpu_to_le32(0xC00000FF)
-#define STATUS_VARIABLE_NOT_FOUND __constant_cpu_to_le32(0xC0000100)
-#define STATUS_DIRECTORY_NOT_EMPTY __constant_cpu_to_le32(0xC0000101)
-#define STATUS_FILE_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000102)
-#define STATUS_NOT_A_DIRECTORY __constant_cpu_to_le32(0xC0000103)
-#define STATUS_BAD_LOGON_SESSION_STATE __constant_cpu_to_le32(0xC0000104)
-#define STATUS_LOGON_SESSION_COLLISION __constant_cpu_to_le32(0xC0000105)
-#define STATUS_NAME_TOO_LONG __constant_cpu_to_le32(0xC0000106)
-#define STATUS_FILES_OPEN __constant_cpu_to_le32(0xC0000107)
-#define STATUS_CONNECTION_IN_USE __constant_cpu_to_le32(0xC0000108)
-#define STATUS_MESSAGE_NOT_FOUND __constant_cpu_to_le32(0xC0000109)
-#define STATUS_PROCESS_IS_TERMINATING __constant_cpu_to_le32(0xC000010A)
-#define STATUS_INVALID_LOGON_TYPE __constant_cpu_to_le32(0xC000010B)
-#define STATUS_NO_GUID_TRANSLATION __constant_cpu_to_le32(0xC000010C)
-#define STATUS_CANNOT_IMPERSONATE __constant_cpu_to_le32(0xC000010D)
-#define STATUS_IMAGE_ALREADY_LOADED __constant_cpu_to_le32(0xC000010E)
-#define STATUS_ABIOS_NOT_PRESENT __constant_cpu_to_le32(0xC000010F)
-#define STATUS_ABIOS_LID_NOT_EXIST __constant_cpu_to_le32(0xC0000110)
-#define STATUS_ABIOS_LID_ALREADY_OWNED __constant_cpu_to_le32(0xC0000111)
-#define STATUS_ABIOS_NOT_LID_OWNER __constant_cpu_to_le32(0xC0000112)
-#define STATUS_ABIOS_INVALID_COMMAND __constant_cpu_to_le32(0xC0000113)
-#define STATUS_ABIOS_INVALID_LID __constant_cpu_to_le32(0xC0000114)
-#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE __constant_cpu_to_le32(0xC0000115)
-#define STATUS_ABIOS_INVALID_SELECTOR __constant_cpu_to_le32(0xC0000116)
-#define STATUS_NO_LDT __constant_cpu_to_le32(0xC0000117)
-#define STATUS_INVALID_LDT_SIZE __constant_cpu_to_le32(0xC0000118)
-#define STATUS_INVALID_LDT_OFFSET __constant_cpu_to_le32(0xC0000119)
-#define STATUS_INVALID_LDT_DESCRIPTOR __constant_cpu_to_le32(0xC000011A)
-#define STATUS_INVALID_IMAGE_NE_FORMAT __constant_cpu_to_le32(0xC000011B)
-#define STATUS_RXACT_INVALID_STATE __constant_cpu_to_le32(0xC000011C)
-#define STATUS_RXACT_COMMIT_FAILURE __constant_cpu_to_le32(0xC000011D)
-#define STATUS_MAPPED_FILE_SIZE_ZERO __constant_cpu_to_le32(0xC000011E)
-#define STATUS_TOO_MANY_OPENED_FILES __constant_cpu_to_le32(0xC000011F)
-#define STATUS_CANCELLED __constant_cpu_to_le32(0xC0000120)
-#define STATUS_CANNOT_DELETE __constant_cpu_to_le32(0xC0000121)
-#define STATUS_INVALID_COMPUTER_NAME __constant_cpu_to_le32(0xC0000122)
-#define STATUS_FILE_DELETED __constant_cpu_to_le32(0xC0000123)
-#define STATUS_SPECIAL_ACCOUNT __constant_cpu_to_le32(0xC0000124)
-#define STATUS_SPECIAL_GROUP __constant_cpu_to_le32(0xC0000125)
-#define STATUS_SPECIAL_USER __constant_cpu_to_le32(0xC0000126)
-#define STATUS_MEMBERS_PRIMARY_GROUP __constant_cpu_to_le32(0xC0000127)
-#define STATUS_FILE_CLOSED __constant_cpu_to_le32(0xC0000128)
-#define STATUS_TOO_MANY_THREADS __constant_cpu_to_le32(0xC0000129)
-#define STATUS_THREAD_NOT_IN_PROCESS __constant_cpu_to_le32(0xC000012A)
-#define STATUS_TOKEN_ALREADY_IN_USE __constant_cpu_to_le32(0xC000012B)
-#define STATUS_PAGEFILE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC000012C)
-#define STATUS_COMMITMENT_LIMIT __constant_cpu_to_le32(0xC000012D)
-#define STATUS_INVALID_IMAGE_LE_FORMAT __constant_cpu_to_le32(0xC000012E)
-#define STATUS_INVALID_IMAGE_NOT_MZ __constant_cpu_to_le32(0xC000012F)
-#define STATUS_INVALID_IMAGE_PROTECT __constant_cpu_to_le32(0xC0000130)
-#define STATUS_INVALID_IMAGE_WIN_16 __constant_cpu_to_le32(0xC0000131)
-#define STATUS_LOGON_SERVER_CONFLICT __constant_cpu_to_le32(0xC0000132)
-#define STATUS_TIME_DIFFERENCE_AT_DC __constant_cpu_to_le32(0xC0000133)
-#define STATUS_SYNCHRONIZATION_REQUIRED __constant_cpu_to_le32(0xC0000134)
-#define STATUS_DLL_NOT_FOUND __constant_cpu_to_le32(0xC0000135)
-#define STATUS_OPEN_FAILED __constant_cpu_to_le32(0xC0000136)
-#define STATUS_IO_PRIVILEGE_FAILED __constant_cpu_to_le32(0xC0000137)
-#define STATUS_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000138)
-#define STATUS_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000139)
-#define STATUS_CONTROL_C_EXIT __constant_cpu_to_le32(0xC000013A)
-#define STATUS_LOCAL_DISCONNECT __constant_cpu_to_le32(0xC000013B)
-#define STATUS_REMOTE_DISCONNECT __constant_cpu_to_le32(0xC000013C)
-#define STATUS_REMOTE_RESOURCES __constant_cpu_to_le32(0xC000013D)
-#define STATUS_LINK_FAILED __constant_cpu_to_le32(0xC000013E)
-#define STATUS_LINK_TIMEOUT __constant_cpu_to_le32(0xC000013F)
-#define STATUS_INVALID_CONNECTION __constant_cpu_to_le32(0xC0000140)
-#define STATUS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0000141)
-#define STATUS_DLL_INIT_FAILED __constant_cpu_to_le32(0xC0000142)
-#define STATUS_MISSING_SYSTEMFILE __constant_cpu_to_le32(0xC0000143)
-#define STATUS_UNHANDLED_EXCEPTION __constant_cpu_to_le32(0xC0000144)
-#define STATUS_APP_INIT_FAILURE __constant_cpu_to_le32(0xC0000145)
-#define STATUS_PAGEFILE_CREATE_FAILED __constant_cpu_to_le32(0xC0000146)
-#define STATUS_NO_PAGEFILE __constant_cpu_to_le32(0xC0000147)
-#define STATUS_INVALID_LEVEL __constant_cpu_to_le32(0xC0000148)
-#define STATUS_WRONG_PASSWORD_CORE __constant_cpu_to_le32(0xC0000149)
-#define STATUS_ILLEGAL_FLOAT_CONTEXT __constant_cpu_to_le32(0xC000014A)
-#define STATUS_PIPE_BROKEN __constant_cpu_to_le32(0xC000014B)
-#define STATUS_REGISTRY_CORRUPT __constant_cpu_to_le32(0xC000014C)
-#define STATUS_REGISTRY_IO_FAILED __constant_cpu_to_le32(0xC000014D)
-#define STATUS_NO_EVENT_PAIR __constant_cpu_to_le32(0xC000014E)
-#define STATUS_UNRECOGNIZED_VOLUME __constant_cpu_to_le32(0xC000014F)
-#define STATUS_SERIAL_NO_DEVICE_INITED __constant_cpu_to_le32(0xC0000150)
-#define STATUS_NO_SUCH_ALIAS __constant_cpu_to_le32(0xC0000151)
-#define STATUS_MEMBER_NOT_IN_ALIAS __constant_cpu_to_le32(0xC0000152)
-#define STATUS_MEMBER_IN_ALIAS __constant_cpu_to_le32(0xC0000153)
-#define STATUS_ALIAS_EXISTS __constant_cpu_to_le32(0xC0000154)
-#define STATUS_LOGON_NOT_GRANTED __constant_cpu_to_le32(0xC0000155)
-#define STATUS_TOO_MANY_SECRETS __constant_cpu_to_le32(0xC0000156)
-#define STATUS_SECRET_TOO_LONG __constant_cpu_to_le32(0xC0000157)
-#define STATUS_INTERNAL_DB_ERROR __constant_cpu_to_le32(0xC0000158)
-#define STATUS_FULLSCREEN_MODE __constant_cpu_to_le32(0xC0000159)
-#define STATUS_TOO_MANY_CONTEXT_IDS __constant_cpu_to_le32(0xC000015A)
-#define STATUS_LOGON_TYPE_NOT_GRANTED __constant_cpu_to_le32(0xC000015B)
-#define STATUS_NOT_REGISTRY_FILE __constant_cpu_to_le32(0xC000015C)
-#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000015D)
-#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR __constant_cpu_to_le32(0xC000015E)
-#define STATUS_FT_MISSING_MEMBER __constant_cpu_to_le32(0xC000015F)
-#define STATUS_ILL_FORMED_SERVICE_ENTRY __constant_cpu_to_le32(0xC0000160)
-#define STATUS_ILLEGAL_CHARACTER __constant_cpu_to_le32(0xC0000161)
-#define STATUS_UNMAPPABLE_CHARACTER __constant_cpu_to_le32(0xC0000162)
-#define STATUS_UNDEFINED_CHARACTER __constant_cpu_to_le32(0xC0000163)
-#define STATUS_FLOPPY_VOLUME __constant_cpu_to_le32(0xC0000164)
-#define STATUS_FLOPPY_ID_MARK_NOT_FOUND __constant_cpu_to_le32(0xC0000165)
-#define STATUS_FLOPPY_WRONG_CYLINDER __constant_cpu_to_le32(0xC0000166)
-#define STATUS_FLOPPY_UNKNOWN_ERROR __constant_cpu_to_le32(0xC0000167)
-#define STATUS_FLOPPY_BAD_REGISTERS __constant_cpu_to_le32(0xC0000168)
-#define STATUS_DISK_RECALIBRATE_FAILED __constant_cpu_to_le32(0xC0000169)
-#define STATUS_DISK_OPERATION_FAILED __constant_cpu_to_le32(0xC000016A)
-#define STATUS_DISK_RESET_FAILED __constant_cpu_to_le32(0xC000016B)
-#define STATUS_SHARED_IRQ_BUSY __constant_cpu_to_le32(0xC000016C)
-#define STATUS_FT_ORPHANING __constant_cpu_to_le32(0xC000016D)
-#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT __constant_cpu_to_le32(0xC000016E)
-#define STATUS_PARTITION_FAILURE __constant_cpu_to_le32(0xC0000172)
-#define STATUS_INVALID_BLOCK_LENGTH __constant_cpu_to_le32(0xC0000173)
-#define STATUS_DEVICE_NOT_PARTITIONED __constant_cpu_to_le32(0xC0000174)
-#define STATUS_UNABLE_TO_LOCK_MEDIA __constant_cpu_to_le32(0xC0000175)
-#define STATUS_UNABLE_TO_UNLOAD_MEDIA __constant_cpu_to_le32(0xC0000176)
-#define STATUS_EOM_OVERFLOW __constant_cpu_to_le32(0xC0000177)
-#define STATUS_NO_MEDIA __constant_cpu_to_le32(0xC0000178)
-#define STATUS_NO_SUCH_MEMBER __constant_cpu_to_le32(0xC000017A)
-#define STATUS_INVALID_MEMBER __constant_cpu_to_le32(0xC000017B)
-#define STATUS_KEY_DELETED __constant_cpu_to_le32(0xC000017C)
-#define STATUS_NO_LOG_SPACE __constant_cpu_to_le32(0xC000017D)
-#define STATUS_TOO_MANY_SIDS __constant_cpu_to_le32(0xC000017E)
-#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000017F)
-#define STATUS_KEY_HAS_CHILDREN __constant_cpu_to_le32(0xC0000180)
-#define STATUS_CHILD_MUST_BE_VOLATILE __constant_cpu_to_le32(0xC0000181)
-#define STATUS_DEVICE_CONFIGURATION_ERROR __constant_cpu_to_le32(0xC0000182)
-#define STATUS_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC0000183)
-#define STATUS_INVALID_DEVICE_STATE __constant_cpu_to_le32(0xC0000184)
-#define STATUS_IO_DEVICE_ERROR __constant_cpu_to_le32(0xC0000185)
-#define STATUS_DEVICE_PROTOCOL_ERROR __constant_cpu_to_le32(0xC0000186)
-#define STATUS_BACKUP_CONTROLLER __constant_cpu_to_le32(0xC0000187)
-#define STATUS_LOG_FILE_FULL __constant_cpu_to_le32(0xC0000188)
-#define STATUS_TOO_LATE __constant_cpu_to_le32(0xC0000189)
-#define STATUS_NO_TRUST_LSA_SECRET __constant_cpu_to_le32(0xC000018A)
-#define STATUS_NO_TRUST_SAM_ACCOUNT __constant_cpu_to_le32(0xC000018B)
-#define STATUS_TRUSTED_DOMAIN_FAILURE __constant_cpu_to_le32(0xC000018C)
-#define STATUS_TRUSTED_RELATIONSHIP_FAILURE __constant_cpu_to_le32(0xC000018D)
-#define STATUS_EVENTLOG_FILE_CORRUPT __constant_cpu_to_le32(0xC000018E)
-#define STATUS_EVENTLOG_CANT_START __constant_cpu_to_le32(0xC000018F)
-#define STATUS_TRUST_FAILURE __constant_cpu_to_le32(0xC0000190)
-#define STATUS_MUTANT_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000191)
-#define STATUS_NETLOGON_NOT_STARTED __constant_cpu_to_le32(0xC0000192)
-#define STATUS_ACCOUNT_EXPIRED __constant_cpu_to_le32(0xC0000193)
-#define STATUS_POSSIBLE_DEADLOCK __constant_cpu_to_le32(0xC0000194)
-#define STATUS_NETWORK_CREDENTIAL_CONFLICT __constant_cpu_to_le32(0xC0000195)
-#define STATUS_REMOTE_SESSION_LIMIT __constant_cpu_to_le32(0xC0000196)
-#define STATUS_EVENTLOG_FILE_CHANGED __constant_cpu_to_le32(0xC0000197)
-#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000198)
-#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000199)
-#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT __constant_cpu_to_le32(0xC000019A)
-#define STATUS_DOMAIN_TRUST_INCONSISTENT __constant_cpu_to_le32(0xC000019B)
-#define STATUS_FS_DRIVER_REQUIRED __constant_cpu_to_le32(0xC000019C)
-#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL __constant_cpu_to_le32(0xC000019D)
-#define STATUS_NETWORK_OPEN_RESTRICTION __constant_cpu_to_le32(0xC0000201)
-#define STATUS_NO_USER_SESSION_KEY __constant_cpu_to_le32(0xC0000202)
-#define STATUS_USER_SESSION_DELETED __constant_cpu_to_le32(0xC0000203)
-#define STATUS_RESOURCE_LANG_NOT_FOUND __constant_cpu_to_le32(0xC0000204)
-#define STATUS_INSUFF_SERVER_RESOURCES __constant_cpu_to_le32(0xC0000205)
-#define STATUS_INVALID_BUFFER_SIZE __constant_cpu_to_le32(0xC0000206)
-#define STATUS_INVALID_ADDRESS_COMPONENT __constant_cpu_to_le32(0xC0000207)
-#define STATUS_INVALID_ADDRESS_WILDCARD __constant_cpu_to_le32(0xC0000208)
-#define STATUS_TOO_MANY_ADDRESSES __constant_cpu_to_le32(0xC0000209)
-#define STATUS_ADDRESS_ALREADY_EXISTS __constant_cpu_to_le32(0xC000020A)
-#define STATUS_ADDRESS_CLOSED __constant_cpu_to_le32(0xC000020B)
-#define STATUS_CONNECTION_DISCONNECTED __constant_cpu_to_le32(0xC000020C)
-#define STATUS_CONNECTION_RESET __constant_cpu_to_le32(0xC000020D)
-#define STATUS_TOO_MANY_NODES __constant_cpu_to_le32(0xC000020E)
-#define STATUS_TRANSACTION_ABORTED __constant_cpu_to_le32(0xC000020F)
-#define STATUS_TRANSACTION_TIMED_OUT __constant_cpu_to_le32(0xC0000210)
-#define STATUS_TRANSACTION_NO_RELEASE __constant_cpu_to_le32(0xC0000211)
-#define STATUS_TRANSACTION_NO_MATCH __constant_cpu_to_le32(0xC0000212)
-#define STATUS_TRANSACTION_RESPONDED __constant_cpu_to_le32(0xC0000213)
-#define STATUS_TRANSACTION_INVALID_ID __constant_cpu_to_le32(0xC0000214)
-#define STATUS_TRANSACTION_INVALID_TYPE __constant_cpu_to_le32(0xC0000215)
-#define STATUS_NOT_SERVER_SESSION __constant_cpu_to_le32(0xC0000216)
-#define STATUS_NOT_CLIENT_SESSION __constant_cpu_to_le32(0xC0000217)
-#define STATUS_CANNOT_LOAD_REGISTRY_FILE __constant_cpu_to_le32(0xC0000218)
-#define STATUS_DEBUG_ATTACH_FAILED __constant_cpu_to_le32(0xC0000219)
-#define STATUS_SYSTEM_PROCESS_TERMINATED __constant_cpu_to_le32(0xC000021A)
-#define STATUS_DATA_NOT_ACCEPTED __constant_cpu_to_le32(0xC000021B)
-#define STATUS_NO_BROWSER_SERVERS_FOUND __constant_cpu_to_le32(0xC000021C)
-#define STATUS_VDM_HARD_ERROR __constant_cpu_to_le32(0xC000021D)
-#define STATUS_DRIVER_CANCEL_TIMEOUT __constant_cpu_to_le32(0xC000021E)
-#define STATUS_REPLY_MESSAGE_MISMATCH __constant_cpu_to_le32(0xC000021F)
-#define STATUS_MAPPED_ALIGNMENT __constant_cpu_to_le32(0xC0000220)
-#define STATUS_IMAGE_CHECKSUM_MISMATCH __constant_cpu_to_le32(0xC0000221)
-#define STATUS_LOST_WRITEBEHIND_DATA __constant_cpu_to_le32(0xC0000222)
-#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID __constant_cpu_to_le32(0xC0000223)
-#define STATUS_PASSWORD_MUST_CHANGE __constant_cpu_to_le32(0xC0000224)
-#define STATUS_NOT_FOUND __constant_cpu_to_le32(0xC0000225)
-#define STATUS_NOT_TINY_STREAM __constant_cpu_to_le32(0xC0000226)
-#define STATUS_RECOVERY_FAILURE __constant_cpu_to_le32(0xC0000227)
-#define STATUS_STACK_OVERFLOW_READ __constant_cpu_to_le32(0xC0000228)
-#define STATUS_FAIL_CHECK __constant_cpu_to_le32(0xC0000229)
-#define STATUS_DUPLICATE_OBJECTID __constant_cpu_to_le32(0xC000022A)
-#define STATUS_OBJECTID_EXISTS __constant_cpu_to_le32(0xC000022B)
-#define STATUS_CONVERT_TO_LARGE __constant_cpu_to_le32(0xC000022C)
-#define STATUS_RETRY __constant_cpu_to_le32(0xC000022D)
-#define STATUS_FOUND_OUT_OF_SCOPE __constant_cpu_to_le32(0xC000022E)
-#define STATUS_ALLOCATE_BUCKET __constant_cpu_to_le32(0xC000022F)
-#define STATUS_PROPSET_NOT_FOUND __constant_cpu_to_le32(0xC0000230)
-#define STATUS_MARSHALL_OVERFLOW __constant_cpu_to_le32(0xC0000231)
-#define STATUS_INVALID_VARIANT __constant_cpu_to_le32(0xC0000232)
-#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND __constant_cpu_to_le32(0xC0000233)
-#define STATUS_ACCOUNT_LOCKED_OUT __constant_cpu_to_le32(0xC0000234)
-#define STATUS_HANDLE_NOT_CLOSABLE __constant_cpu_to_le32(0xC0000235)
-#define STATUS_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000236)
-#define STATUS_GRACEFUL_DISCONNECT __constant_cpu_to_le32(0xC0000237)
-#define STATUS_ADDRESS_ALREADY_ASSOCIATED __constant_cpu_to_le32(0xC0000238)
-#define STATUS_ADDRESS_NOT_ASSOCIATED __constant_cpu_to_le32(0xC0000239)
-#define STATUS_CONNECTION_INVALID __constant_cpu_to_le32(0xC000023A)
-#define STATUS_CONNECTION_ACTIVE __constant_cpu_to_le32(0xC000023B)
-#define STATUS_NETWORK_UNREACHABLE __constant_cpu_to_le32(0xC000023C)
-#define STATUS_HOST_UNREACHABLE __constant_cpu_to_le32(0xC000023D)
-#define STATUS_PROTOCOL_UNREACHABLE __constant_cpu_to_le32(0xC000023E)
-#define STATUS_PORT_UNREACHABLE __constant_cpu_to_le32(0xC000023F)
-#define STATUS_REQUEST_ABORTED __constant_cpu_to_le32(0xC0000240)
-#define STATUS_CONNECTION_ABORTED __constant_cpu_to_le32(0xC0000241)
-#define STATUS_BAD_COMPRESSION_BUFFER __constant_cpu_to_le32(0xC0000242)
-#define STATUS_USER_MAPPED_FILE __constant_cpu_to_le32(0xC0000243)
-#define STATUS_AUDIT_FAILED __constant_cpu_to_le32(0xC0000244)
-#define STATUS_TIMER_RESOLUTION_NOT_SET __constant_cpu_to_le32(0xC0000245)
-#define STATUS_CONNECTION_COUNT_LIMIT __constant_cpu_to_le32(0xC0000246)
-#define STATUS_LOGIN_TIME_RESTRICTION __constant_cpu_to_le32(0xC0000247)
-#define STATUS_LOGIN_WKSTA_RESTRICTION __constant_cpu_to_le32(0xC0000248)
-#define STATUS_IMAGE_MP_UP_MISMATCH __constant_cpu_to_le32(0xC0000249)
-#define STATUS_INSUFFICIENT_LOGON_INFO __constant_cpu_to_le32(0xC0000250)
-#define STATUS_BAD_DLL_ENTRYPOINT __constant_cpu_to_le32(0xC0000251)
-#define STATUS_BAD_SERVICE_ENTRYPOINT __constant_cpu_to_le32(0xC0000252)
-#define STATUS_LPC_REPLY_LOST __constant_cpu_to_le32(0xC0000253)
-#define STATUS_IP_ADDRESS_CONFLICT1 __constant_cpu_to_le32(0xC0000254)
-#define STATUS_IP_ADDRESS_CONFLICT2 __constant_cpu_to_le32(0xC0000255)
-#define STATUS_REGISTRY_QUOTA_LIMIT __constant_cpu_to_le32(0xC0000256)
-#define STATUS_PATH_NOT_COVERED __constant_cpu_to_le32(0xC0000257)
-#define STATUS_NO_CALLBACK_ACTIVE __constant_cpu_to_le32(0xC0000258)
-#define STATUS_LICENSE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000259)
-#define STATUS_PWD_TOO_SHORT __constant_cpu_to_le32(0xC000025A)
-#define STATUS_PWD_TOO_RECENT __constant_cpu_to_le32(0xC000025B)
-#define STATUS_PWD_HISTORY_CONFLICT __constant_cpu_to_le32(0xC000025C)
-#define STATUS_PLUGPLAY_NO_DEVICE __constant_cpu_to_le32(0xC000025E)
-#define STATUS_UNSUPPORTED_COMPRESSION __constant_cpu_to_le32(0xC000025F)
-#define STATUS_INVALID_HW_PROFILE __constant_cpu_to_le32(0xC0000260)
-#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH __constant_cpu_to_le32(0xC0000261)
-#define STATUS_DRIVER_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000262)
-#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000263)
-#define STATUS_RESOURCE_NOT_OWNED __constant_cpu_to_le32(0xC0000264)
-#define STATUS_TOO_MANY_LINKS __constant_cpu_to_le32(0xC0000265)
-#define STATUS_QUOTA_LIST_INCONSISTENT __constant_cpu_to_le32(0xC0000266)
-#define STATUS_FILE_IS_OFFLINE __constant_cpu_to_le32(0xC0000267)
-#define STATUS_EVALUATION_EXPIRATION __constant_cpu_to_le32(0xC0000268)
-#define STATUS_ILLEGAL_DLL_RELOCATION __constant_cpu_to_le32(0xC0000269)
-#define STATUS_LICENSE_VIOLATION __constant_cpu_to_le32(0xC000026A)
-#define STATUS_DLL_INIT_FAILED_LOGOFF __constant_cpu_to_le32(0xC000026B)
-#define STATUS_DRIVER_UNABLE_TO_LOAD __constant_cpu_to_le32(0xC000026C)
-#define STATUS_DFS_UNAVAILABLE __constant_cpu_to_le32(0xC000026D)
-#define STATUS_VOLUME_DISMOUNTED __constant_cpu_to_le32(0xC000026E)
-#define STATUS_WX86_INTERNAL_ERROR __constant_cpu_to_le32(0xC000026F)
-#define STATUS_WX86_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000270)
-#define STATUS_VALIDATE_CONTINUE __constant_cpu_to_le32(0xC0000271)
-#define STATUS_NO_MATCH __constant_cpu_to_le32(0xC0000272)
-#define STATUS_NO_MORE_MATCHES __constant_cpu_to_le32(0xC0000273)
-#define STATUS_NOT_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000275)
-#define STATUS_IO_REPARSE_TAG_INVALID __constant_cpu_to_le32(0xC0000276)
-#define STATUS_IO_REPARSE_TAG_MISMATCH __constant_cpu_to_le32(0xC0000277)
-#define STATUS_IO_REPARSE_DATA_INVALID __constant_cpu_to_le32(0xC0000278)
-#define STATUS_IO_REPARSE_TAG_NOT_HANDLED __constant_cpu_to_le32(0xC0000279)
-#define STATUS_REPARSE_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000280)
-#define STATUS_DIRECTORY_IS_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000281)
-#define STATUS_RANGE_LIST_CONFLICT __constant_cpu_to_le32(0xC0000282)
-#define STATUS_SOURCE_ELEMENT_EMPTY __constant_cpu_to_le32(0xC0000283)
-#define STATUS_DESTINATION_ELEMENT_FULL __constant_cpu_to_le32(0xC0000284)
-#define STATUS_ILLEGAL_ELEMENT_ADDRESS __constant_cpu_to_le32(0xC0000285)
-#define STATUS_MAGAZINE_NOT_PRESENT __constant_cpu_to_le32(0xC0000286)
-#define STATUS_REINITIALIZATION_NEEDED __constant_cpu_to_le32(0xC0000287)
-#define STATUS_ENCRYPTION_FAILED __constant_cpu_to_le32(0xC000028A)
-#define STATUS_DECRYPTION_FAILED __constant_cpu_to_le32(0xC000028B)
-#define STATUS_RANGE_NOT_FOUND __constant_cpu_to_le32(0xC000028C)
-#define STATUS_NO_RECOVERY_POLICY __constant_cpu_to_le32(0xC000028D)
-#define STATUS_NO_EFS __constant_cpu_to_le32(0xC000028E)
-#define STATUS_WRONG_EFS __constant_cpu_to_le32(0xC000028F)
-#define STATUS_NO_USER_KEYS __constant_cpu_to_le32(0xC0000290)
-#define STATUS_FILE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0000291)
-#define STATUS_NOT_EXPORT_FORMAT __constant_cpu_to_le32(0xC0000292)
-#define STATUS_FILE_ENCRYPTED __constant_cpu_to_le32(0xC0000293)
-#define STATUS_WMI_GUID_NOT_FOUND __constant_cpu_to_le32(0xC0000295)
-#define STATUS_WMI_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC0000296)
-#define STATUS_WMI_ITEMID_NOT_FOUND __constant_cpu_to_le32(0xC0000297)
-#define STATUS_WMI_TRY_AGAIN __constant_cpu_to_le32(0xC0000298)
-#define STATUS_SHARED_POLICY __constant_cpu_to_le32(0xC0000299)
-#define STATUS_POLICY_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC000029A)
-#define STATUS_POLICY_ONLY_IN_DS __constant_cpu_to_le32(0xC000029B)
-#define STATUS_VOLUME_NOT_UPGRADED __constant_cpu_to_le32(0xC000029C)
-#define STATUS_REMOTE_STORAGE_NOT_ACTIVE __constant_cpu_to_le32(0xC000029D)
-#define STATUS_REMOTE_STORAGE_MEDIA_ERROR __constant_cpu_to_le32(0xC000029E)
-#define STATUS_NO_TRACKING_SERVICE __constant_cpu_to_le32(0xC000029F)
-#define STATUS_SERVER_SID_MISMATCH __constant_cpu_to_le32(0xC00002A0)
-#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE __constant_cpu_to_le32(0xC00002A1)
-#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX __constant_cpu_to_le32(0xC00002A2)
-#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED __constant_cpu_to_le32(0xC00002A3)
-#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS __constant_cpu_to_le32(0xC00002A4)
-#define STATUS_DS_BUSY __constant_cpu_to_le32(0xC00002A5)
-#define STATUS_DS_UNAVAILABLE __constant_cpu_to_le32(0xC00002A6)
-#define STATUS_DS_NO_RIDS_ALLOCATED __constant_cpu_to_le32(0xC00002A7)
-#define STATUS_DS_NO_MORE_RIDS __constant_cpu_to_le32(0xC00002A8)
-#define STATUS_DS_INCORRECT_ROLE_OWNER __constant_cpu_to_le32(0xC00002A9)
-#define STATUS_DS_RIDMGR_INIT_ERROR __constant_cpu_to_le32(0xC00002AA)
-#define STATUS_DS_OBJ_CLASS_VIOLATION __constant_cpu_to_le32(0xC00002AB)
-#define STATUS_DS_CANT_ON_NON_LEAF __constant_cpu_to_le32(0xC00002AC)
-#define STATUS_DS_CANT_ON_RDN __constant_cpu_to_le32(0xC00002AD)
-#define STATUS_DS_CANT_MOD_OBJ_CLASS __constant_cpu_to_le32(0xC00002AE)
-#define STATUS_DS_CROSS_DOM_MOVE_FAILED __constant_cpu_to_le32(0xC00002AF)
-#define STATUS_DS_GC_NOT_AVAILABLE __constant_cpu_to_le32(0xC00002B0)
-#define STATUS_DIRECTORY_SERVICE_REQUIRED __constant_cpu_to_le32(0xC00002B1)
-#define STATUS_REPARSE_ATTRIBUTE_CONFLICT __constant_cpu_to_le32(0xC00002B2)
-#define STATUS_CANT_ENABLE_DENY_ONLY __constant_cpu_to_le32(0xC00002B3)
-#define STATUS_FLOAT_MULTIPLE_FAULTS __constant_cpu_to_le32(0xC00002B4)
-#define STATUS_FLOAT_MULTIPLE_TRAPS __constant_cpu_to_le32(0xC00002B5)
-#define STATUS_DEVICE_REMOVED __constant_cpu_to_le32(0xC00002B6)
-#define STATUS_JOURNAL_DELETE_IN_PROGRESS __constant_cpu_to_le32(0xC00002B7)
-#define STATUS_JOURNAL_NOT_ACTIVE __constant_cpu_to_le32(0xC00002B8)
-#define STATUS_NOINTERFACE __constant_cpu_to_le32(0xC00002B9)
-#define STATUS_DS_ADMIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00002C1)
-#define STATUS_DRIVER_FAILED_SLEEP __constant_cpu_to_le32(0xC00002C2)
-#define STATUS_MUTUAL_AUTHENTICATION_FAILED __constant_cpu_to_le32(0xC00002C3)
-#define STATUS_CORRUPT_SYSTEM_FILE __constant_cpu_to_le32(0xC00002C4)
-#define STATUS_DATATYPE_MISALIGNMENT_ERROR __constant_cpu_to_le32(0xC00002C5)
-#define STATUS_WMI_READ_ONLY __constant_cpu_to_le32(0xC00002C6)
-#define STATUS_WMI_SET_FAILURE __constant_cpu_to_le32(0xC00002C7)
-#define STATUS_COMMITMENT_MINIMUM __constant_cpu_to_le32(0xC00002C8)
-#define STATUS_REG_NAT_CONSUMPTION __constant_cpu_to_le32(0xC00002C9)
-#define STATUS_TRANSPORT_FULL __constant_cpu_to_le32(0xC00002CA)
-#define STATUS_DS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002CB)
-#define STATUS_ONLY_IF_CONNECTED __constant_cpu_to_le32(0xC00002CC)
-#define STATUS_DS_SENSITIVE_GROUP_VIOLATION __constant_cpu_to_le32(0xC00002CD)
-#define STATUS_PNP_RESTART_ENUMERATION __constant_cpu_to_le32(0xC00002CE)
-#define STATUS_JOURNAL_ENTRY_DELETED __constant_cpu_to_le32(0xC00002CF)
-#define STATUS_DS_CANT_MOD_PRIMARYGROUPID __constant_cpu_to_le32(0xC00002D0)
-#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE __constant_cpu_to_le32(0xC00002D1)
-#define STATUS_PNP_REBOOT_REQUIRED __constant_cpu_to_le32(0xC00002D2)
-#define STATUS_POWER_STATE_INVALID __constant_cpu_to_le32(0xC00002D3)
-#define STATUS_DS_INVALID_GROUP_TYPE __constant_cpu_to_le32(0xC00002D4)
-#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D5)
-#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D6)
-#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D7)
-#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC00002D8)
-#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D9)
-#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER __constant_cpu_to_le32(0xC00002DA)
-#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002DB)
-#define STATUS_DS_HAVE_PRIMARY_MEMBERS __constant_cpu_to_le32(0xC00002DC)
-#define STATUS_WMI_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002DD)
-#define STATUS_INSUFFICIENT_POWER __constant_cpu_to_le32(0xC00002DE)
-#define STATUS_SAM_NEED_BOOTKEY_PASSWORD __constant_cpu_to_le32(0xC00002DF)
-#define STATUS_SAM_NEED_BOOTKEY_FLOPPY __constant_cpu_to_le32(0xC00002E0)
-#define STATUS_DS_CANT_START __constant_cpu_to_le32(0xC00002E1)
-#define STATUS_DS_INIT_FAILURE __constant_cpu_to_le32(0xC00002E2)
-#define STATUS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002E3)
-#define STATUS_DS_GC_REQUIRED __constant_cpu_to_le32(0xC00002E4)
-#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY __constant_cpu_to_le32(0xC00002E5)
-#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS __constant_cpu_to_le32(0xC00002E6)
-#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC00002E7)
-#define STATUS_MULTIPLE_FAULT_VIOLATION __constant_cpu_to_le32(0xC00002E8)
-#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED __constant_cpu_to_le32(0xC00002E9)
-#define STATUS_CANNOT_MAKE __constant_cpu_to_le32(0xC00002EA)
-#define STATUS_SYSTEM_SHUTDOWN __constant_cpu_to_le32(0xC00002EB)
-#define STATUS_DS_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002EC)
-#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002ED)
-#define STATUS_UNFINISHED_CONTEXT_DELETED __constant_cpu_to_le32(0xC00002EE)
-#define STATUS_NO_TGT_REPLY __constant_cpu_to_le32(0xC00002EF)
-#define STATUS_OBJECTID_NOT_FOUND __constant_cpu_to_le32(0xC00002F0)
-#define STATUS_NO_IP_ADDRESSES __constant_cpu_to_le32(0xC00002F1)
-#define STATUS_WRONG_CREDENTIAL_HANDLE __constant_cpu_to_le32(0xC00002F2)
-#define STATUS_CRYPTO_SYSTEM_INVALID __constant_cpu_to_le32(0xC00002F3)
-#define STATUS_MAX_REFERRALS_EXCEEDED __constant_cpu_to_le32(0xC00002F4)
-#define STATUS_MUST_BE_KDC __constant_cpu_to_le32(0xC00002F5)
-#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002F6)
-#define STATUS_TOO_MANY_PRINCIPALS __constant_cpu_to_le32(0xC00002F7)
-#define STATUS_NO_PA_DATA __constant_cpu_to_le32(0xC00002F8)
-#define STATUS_PKINIT_NAME_MISMATCH __constant_cpu_to_le32(0xC00002F9)
-#define STATUS_SMARTCARD_LOGON_REQUIRED __constant_cpu_to_le32(0xC00002FA)
-#define STATUS_KDC_INVALID_REQUEST __constant_cpu_to_le32(0xC00002FB)
-#define STATUS_KDC_UNABLE_TO_REFER __constant_cpu_to_le32(0xC00002FC)
-#define STATUS_KDC_UNKNOWN_ETYPE __constant_cpu_to_le32(0xC00002FD)
-#define STATUS_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FE)
-#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FF)
-#define STATUS_NOT_SUPPORTED_ON_SBS __constant_cpu_to_le32(0xC0000300)
-#define STATUS_WMI_GUID_DISCONNECTED __constant_cpu_to_le32(0xC0000301)
-#define STATUS_WMI_ALREADY_DISABLED __constant_cpu_to_le32(0xC0000302)
-#define STATUS_WMI_ALREADY_ENABLED __constant_cpu_to_le32(0xC0000303)
-#define STATUS_MFT_TOO_FRAGMENTED __constant_cpu_to_le32(0xC0000304)
-#define STATUS_COPY_PROTECTION_FAILURE __constant_cpu_to_le32(0xC0000305)
-#define STATUS_CSS_AUTHENTICATION_FAILURE __constant_cpu_to_le32(0xC0000306)
-#define STATUS_CSS_KEY_NOT_PRESENT __constant_cpu_to_le32(0xC0000307)
-#define STATUS_CSS_KEY_NOT_ESTABLISHED __constant_cpu_to_le32(0xC0000308)
-#define STATUS_CSS_SCRAMBLED_SECTOR __constant_cpu_to_le32(0xC0000309)
-#define STATUS_CSS_REGION_MISMATCH __constant_cpu_to_le32(0xC000030A)
-#define STATUS_CSS_RESETS_EXHAUSTED __constant_cpu_to_le32(0xC000030B)
-#define STATUS_PKINIT_FAILURE __constant_cpu_to_le32(0xC0000320)
-#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE __constant_cpu_to_le32(0xC0000321)
-#define STATUS_NO_KERB_KEY __constant_cpu_to_le32(0xC0000322)
-#define STATUS_HOST_DOWN __constant_cpu_to_le32(0xC0000350)
-#define STATUS_UNSUPPORTED_PREAUTH __constant_cpu_to_le32(0xC0000351)
-#define STATUS_EFS_ALG_BLOB_TOO_BIG __constant_cpu_to_le32(0xC0000352)
-#define STATUS_PORT_NOT_SET __constant_cpu_to_le32(0xC0000353)
-#define STATUS_DEBUGGER_INACTIVE __constant_cpu_to_le32(0xC0000354)
-#define STATUS_DS_VERSION_CHECK_FAILURE __constant_cpu_to_le32(0xC0000355)
-#define STATUS_AUDITING_DISABLED __constant_cpu_to_le32(0xC0000356)
-#define STATUS_PRENT4_MACHINE_ACCOUNT __constant_cpu_to_le32(0xC0000357)
-#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC0000358)
-#define STATUS_INVALID_IMAGE_WIN_32 __constant_cpu_to_le32(0xC0000359)
-#define STATUS_INVALID_IMAGE_WIN_64 __constant_cpu_to_le32(0xC000035A)
-#define STATUS_BAD_BINDINGS __constant_cpu_to_le32(0xC000035B)
-#define STATUS_NETWORK_SESSION_EXPIRED __constant_cpu_to_le32(0xC000035C)
-#define STATUS_APPHELP_BLOCK __constant_cpu_to_le32(0xC000035D)
-#define STATUS_ALL_SIDS_FILTERED __constant_cpu_to_le32(0xC000035E)
-#define STATUS_NOT_SAFE_MODE_DRIVER __constant_cpu_to_le32(0xC000035F)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT __constant_cpu_to_le32(0xC0000361)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH __constant_cpu_to_le32(0xC0000362)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER __constant_cpu_to_le32(0xC0000363)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER __constant_cpu_to_le32(0xC0000364)
-#define STATUS_FAILED_DRIVER_ENTRY __constant_cpu_to_le32(0xC0000365)
-#define STATUS_DEVICE_ENUMERATION_ERROR __constant_cpu_to_le32(0xC0000366)
-#define STATUS_MOUNT_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000368)
-#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER __constant_cpu_to_le32(0xC0000369)
-#define STATUS_MCA_OCCURED __constant_cpu_to_le32(0xC000036A)
-#define STATUS_DRIVER_BLOCKED_CRITICAL __constant_cpu_to_le32(0xC000036B)
-#define STATUS_DRIVER_BLOCKED __constant_cpu_to_le32(0xC000036C)
-#define STATUS_DRIVER_DATABASE_ERROR __constant_cpu_to_le32(0xC000036D)
-#define STATUS_SYSTEM_HIVE_TOO_LARGE __constant_cpu_to_le32(0xC000036E)
-#define STATUS_INVALID_IMPORT_OF_NON_DLL __constant_cpu_to_le32(0xC000036F)
-#define STATUS_NO_SECRETS __constant_cpu_to_le32(0xC0000371)
-#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY __constant_cpu_to_le32(0xC0000372)
-#define STATUS_FAILED_STACK_SWITCH __constant_cpu_to_le32(0xC0000373)
-#define STATUS_HEAP_CORRUPTION __constant_cpu_to_le32(0xC0000374)
-#define STATUS_SMARTCARD_WRONG_PIN __constant_cpu_to_le32(0xC0000380)
-#define STATUS_SMARTCARD_CARD_BLOCKED __constant_cpu_to_le32(0xC0000381)
-#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED __constant_cpu_to_le32(0xC0000382)
-#define STATUS_SMARTCARD_NO_CARD __constant_cpu_to_le32(0xC0000383)
-#define STATUS_SMARTCARD_NO_KEY_CONTAINER __constant_cpu_to_le32(0xC0000384)
-#define STATUS_SMARTCARD_NO_CERTIFICATE __constant_cpu_to_le32(0xC0000385)
-#define STATUS_SMARTCARD_NO_KEYSET __constant_cpu_to_le32(0xC0000386)
-#define STATUS_SMARTCARD_IO_ERROR __constant_cpu_to_le32(0xC0000387)
-#define STATUS_DOWNGRADE_DETECTED __constant_cpu_to_le32(0xC0000388)
-#define STATUS_SMARTCARD_CERT_REVOKED __constant_cpu_to_le32(0xC0000389)
-#define STATUS_ISSUING_CA_UNTRUSTED __constant_cpu_to_le32(0xC000038A)
-#define STATUS_REVOCATION_OFFLINE_C __constant_cpu_to_le32(0xC000038B)
-#define STATUS_PKINIT_CLIENT_FAILURE __constant_cpu_to_le32(0xC000038C)
-#define STATUS_SMARTCARD_CERT_EXPIRED __constant_cpu_to_le32(0xC000038D)
-#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD __constant_cpu_to_le32(0xC000038E)
-#define STATUS_SMARTCARD_SILENT_CONTEXT __constant_cpu_to_le32(0xC000038F)
-#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000401)
-#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000402)
-#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000403)
-#define STATUS_DS_NAME_NOT_UNIQUE __constant_cpu_to_le32(0xC0000404)
-#define STATUS_DS_DUPLICATE_ID_FOUND __constant_cpu_to_le32(0xC0000405)
-#define STATUS_DS_GROUP_CONVERSION_ERROR __constant_cpu_to_le32(0xC0000406)
-#define STATUS_VOLSNAP_PREPARE_HIBERNATE __constant_cpu_to_le32(0xC0000407)
-#define STATUS_USER2USER_REQUIRED __constant_cpu_to_le32(0xC0000408)
-#define STATUS_STACK_BUFFER_OVERRUN __constant_cpu_to_le32(0xC0000409)
-#define STATUS_NO_S4U_PROT_SUPPORT __constant_cpu_to_le32(0xC000040A)
-#define STATUS_CROSSREALM_DELEGATION_FAILURE __constant_cpu_to_le32(0xC000040B)
-#define STATUS_REVOCATION_OFFLINE_KDC __constant_cpu_to_le32(0xC000040C)
-#define STATUS_ISSUING_CA_UNTRUSTED_KDC __constant_cpu_to_le32(0xC000040D)
-#define STATUS_KDC_CERT_EXPIRED __constant_cpu_to_le32(0xC000040E)
-#define STATUS_KDC_CERT_REVOKED __constant_cpu_to_le32(0xC000040F)
-#define STATUS_PARAMETER_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000410)
-#define STATUS_HIBERNATION_FAILURE __constant_cpu_to_le32(0xC0000411)
-#define STATUS_DELAY_LOAD_FAILED __constant_cpu_to_le32(0xC0000412)
-#define STATUS_AUTHENTICATION_FIREWALL_FAILED __constant_cpu_to_le32(0xC0000413)
-#define STATUS_VDM_DISALLOWED __constant_cpu_to_le32(0xC0000414)
-#define STATUS_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC0000415)
-#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE __constant_cpu_to_le32(0xC0000416)
-#define STATUS_INVALID_CRUNTIME_PARAMETER __constant_cpu_to_le32(0xC0000417)
-#define STATUS_NTLM_BLOCKED __constant_cpu_to_le32(0xC0000418)
-#define STATUS_ASSERTION_FAILURE __constant_cpu_to_le32(0xC0000420)
-#define STATUS_VERIFIER_STOP __constant_cpu_to_le32(0xC0000421)
-#define STATUS_CALLBACK_POP_STACK __constant_cpu_to_le32(0xC0000423)
-#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED __constant_cpu_to_le32(0xC0000424)
-#define STATUS_HIVE_UNLOADED __constant_cpu_to_le32(0xC0000425)
-#define STATUS_COMPRESSION_DISABLED __constant_cpu_to_le32(0xC0000426)
-#define STATUS_FILE_SYSTEM_LIMITATION __constant_cpu_to_le32(0xC0000427)
-#define STATUS_INVALID_IMAGE_HASH __constant_cpu_to_le32(0xC0000428)
-#define STATUS_NOT_CAPABLE __constant_cpu_to_le32(0xC0000429)
-#define STATUS_REQUEST_OUT_OF_SEQUENCE __constant_cpu_to_le32(0xC000042A)
-#define STATUS_IMPLEMENTATION_LIMIT __constant_cpu_to_le32(0xC000042B)
-#define STATUS_ELEVATION_REQUIRED __constant_cpu_to_le32(0xC000042C)
-#define STATUS_BEYOND_VDL __constant_cpu_to_le32(0xC0000432)
-#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS __constant_cpu_to_le32(0xC0000433)
-#define STATUS_PTE_CHANGED __constant_cpu_to_le32(0xC0000434)
-#define STATUS_PURGE_FAILED __constant_cpu_to_le32(0xC0000435)
-#define STATUS_CRED_REQUIRES_CONFIRMATION __constant_cpu_to_le32(0xC0000440)
-#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE __constant_cpu_to_le32(0xC0000441)
-#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER __constant_cpu_to_le32(0xC0000442)
-#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000443)
-#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000444)
-#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE __constant_cpu_to_le32(0xC0000445)
-#define STATUS_INVALID_LABEL __constant_cpu_to_le32(0xC0000446)
-#define STATUS_DRIVER_PROCESS_TERMINATED __constant_cpu_to_le32(0xC0000450)
-#define STATUS_AMBIGUOUS_SYSTEM_DEVICE __constant_cpu_to_le32(0xC0000451)
-#define STATUS_SYSTEM_DEVICE_NOT_FOUND __constant_cpu_to_le32(0xC0000452)
-#define STATUS_RESTART_BOOT_APPLICATION __constant_cpu_to_le32(0xC0000453)
-#define STATUS_INVALID_TASK_NAME __constant_cpu_to_le32(0xC0000500)
-#define STATUS_INVALID_TASK_INDEX __constant_cpu_to_le32(0xC0000501)
-#define STATUS_THREAD_ALREADY_IN_TASK __constant_cpu_to_le32(0xC0000502)
-#define STATUS_CALLBACK_BYPASS __constant_cpu_to_le32(0xC0000503)
-#define STATUS_PORT_CLOSED __constant_cpu_to_le32(0xC0000700)
-#define STATUS_MESSAGE_LOST __constant_cpu_to_le32(0xC0000701)
-#define STATUS_INVALID_MESSAGE __constant_cpu_to_le32(0xC0000702)
-#define STATUS_REQUEST_CANCELED __constant_cpu_to_le32(0xC0000703)
-#define STATUS_RECURSIVE_DISPATCH __constant_cpu_to_le32(0xC0000704)
-#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED __constant_cpu_to_le32(0xC0000705)
-#define STATUS_LPC_INVALID_CONNECTION_USAGE __constant_cpu_to_le32(0xC0000706)
-#define STATUS_LPC_REQUESTS_NOT_ALLOWED __constant_cpu_to_le32(0xC0000707)
-#define STATUS_RESOURCE_IN_USE __constant_cpu_to_le32(0xC0000708)
-#define STATUS_HARDWARE_MEMORY_ERROR __constant_cpu_to_le32(0xC0000709)
-#define STATUS_THREADPOOL_HANDLE_EXCEPTION __constant_cpu_to_le32(0xC000070A)
-#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070B)
-#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070C)
-#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070D)
-#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070E)
-#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION __constant_cpu_to_le32(0xC000070F)
-#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000710)
-#define STATUS_APC_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000711)
-#define STATUS_PROCESS_IS_PROTECTED __constant_cpu_to_le32(0xC0000712)
-#define STATUS_MCA_EXCEPTION __constant_cpu_to_le32(0xC0000713)
-#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE __constant_cpu_to_le32(0xC0000714)
-#define STATUS_SYMLINK_CLASS_DISABLED __constant_cpu_to_le32(0xC0000715)
-#define STATUS_INVALID_IDN_NORMALIZATION __constant_cpu_to_le32(0xC0000716)
-#define STATUS_NO_UNICODE_TRANSLATION __constant_cpu_to_le32(0xC0000717)
-#define STATUS_ALREADY_REGISTERED __constant_cpu_to_le32(0xC0000718)
-#define STATUS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0000719)
-#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST __constant_cpu_to_le32(0xC000071A)
-#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY __constant_cpu_to_le32(0xC000071B)
-#define STATUS_INVALID_THREAD __constant_cpu_to_le32(0xC000071C)
-#define STATUS_CALLBACK_RETURNED_TRANSACTION __constant_cpu_to_le32(0xC000071D)
-#define STATUS_CALLBACK_RETURNED_LDR_LOCK __constant_cpu_to_le32(0xC000071E)
-#define STATUS_CALLBACK_RETURNED_LANG __constant_cpu_to_le32(0xC000071F)
-#define STATUS_CALLBACK_RETURNED_PRI_BACK __constant_cpu_to_le32(0xC0000720)
-#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY __constant_cpu_to_le32(0xC0000721)
-#define STATUS_DISK_REPAIR_DISABLED __constant_cpu_to_le32(0xC0000800)
-#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS __constant_cpu_to_le32(0xC0000801)
-#define STATUS_DISK_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000802)
-#define STATUS_CONTENT_BLOCKED __constant_cpu_to_le32(0xC0000804)
-#define STATUS_BAD_CLUSTERS __constant_cpu_to_le32(0xC0000805)
-#define STATUS_VOLUME_DIRTY __constant_cpu_to_le32(0xC0000806)
-#define STATUS_FILE_CHECKED_OUT __constant_cpu_to_le32(0xC0000901)
-#define STATUS_CHECKOUT_REQUIRED __constant_cpu_to_le32(0xC0000902)
-#define STATUS_BAD_FILE_TYPE __constant_cpu_to_le32(0xC0000903)
-#define STATUS_FILE_TOO_LARGE __constant_cpu_to_le32(0xC0000904)
-#define STATUS_FORMS_AUTH_REQUIRED __constant_cpu_to_le32(0xC0000905)
-#define STATUS_VIRUS_INFECTED __constant_cpu_to_le32(0xC0000906)
-#define STATUS_VIRUS_DELETED __constant_cpu_to_le32(0xC0000907)
-#define STATUS_BAD_MCFG_TABLE __constant_cpu_to_le32(0xC0000908)
-#define STATUS_WOW_ASSERTION __constant_cpu_to_le32(0xC0009898)
-#define STATUS_INVALID_SIGNATURE __constant_cpu_to_le32(0xC000A000)
-#define STATUS_HMAC_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A001)
-#define STATUS_IPSEC_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A010)
-#define STATUS_ND_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A011)
-#define STATUS_HOPLIMIT_EXCEEDED __constant_cpu_to_le32(0xC000A012)
-#define STATUS_PROTOCOL_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A013)
-#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED __constant_cpu_to_le32(0xC000A080)
-#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR __constant_cpu_to_le32(0xC000A081)
-#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR __constant_cpu_to_le32(0xC000A082)
-#define STATUS_XML_PARSE_ERROR __constant_cpu_to_le32(0xC000A083)
-#define STATUS_XMLDSIG_ERROR __constant_cpu_to_le32(0xC000A084)
-#define STATUS_WRONG_COMPARTMENT __constant_cpu_to_le32(0xC000A085)
-#define STATUS_AUTHIP_FAILURE __constant_cpu_to_le32(0xC000A086)
-#define DBG_NO_STATE_CHANGE __constant_cpu_to_le32(0xC0010001)
-#define DBG_APP_NOT_IDLE __constant_cpu_to_le32(0xC0010002)
-#define RPC_NT_INVALID_STRING_BINDING __constant_cpu_to_le32(0xC0020001)
-#define RPC_NT_WRONG_KIND_OF_BINDING __constant_cpu_to_le32(0xC0020002)
-#define RPC_NT_INVALID_BINDING __constant_cpu_to_le32(0xC0020003)
-#define RPC_NT_PROTSEQ_NOT_SUPPORTED __constant_cpu_to_le32(0xC0020004)
-#define RPC_NT_INVALID_RPC_PROTSEQ __constant_cpu_to_le32(0xC0020005)
-#define RPC_NT_INVALID_STRING_UUID __constant_cpu_to_le32(0xC0020006)
-#define RPC_NT_INVALID_ENDPOINT_FORMAT __constant_cpu_to_le32(0xC0020007)
-#define RPC_NT_INVALID_NET_ADDR __constant_cpu_to_le32(0xC0020008)
-#define RPC_NT_NO_ENDPOINT_FOUND __constant_cpu_to_le32(0xC0020009)
-#define RPC_NT_INVALID_TIMEOUT __constant_cpu_to_le32(0xC002000A)
-#define RPC_NT_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC002000B)
-#define RPC_NT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000C)
-#define RPC_NT_TYPE_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000D)
-#define RPC_NT_ALREADY_LISTENING __constant_cpu_to_le32(0xC002000E)
-#define RPC_NT_NO_PROTSEQS_REGISTERED __constant_cpu_to_le32(0xC002000F)
-#define RPC_NT_NOT_LISTENING __constant_cpu_to_le32(0xC0020010)
-#define RPC_NT_UNKNOWN_MGR_TYPE __constant_cpu_to_le32(0xC0020011)
-#define RPC_NT_UNKNOWN_IF __constant_cpu_to_le32(0xC0020012)
-#define RPC_NT_NO_BINDINGS __constant_cpu_to_le32(0xC0020013)
-#define RPC_NT_NO_PROTSEQS __constant_cpu_to_le32(0xC0020014)
-#define RPC_NT_CANT_CREATE_ENDPOINT __constant_cpu_to_le32(0xC0020015)
-#define RPC_NT_OUT_OF_RESOURCES __constant_cpu_to_le32(0xC0020016)
-#define RPC_NT_SERVER_UNAVAILABLE __constant_cpu_to_le32(0xC0020017)
-#define RPC_NT_SERVER_TOO_BUSY __constant_cpu_to_le32(0xC0020018)
-#define RPC_NT_INVALID_NETWORK_OPTIONS __constant_cpu_to_le32(0xC0020019)
-#define RPC_NT_NO_CALL_ACTIVE __constant_cpu_to_le32(0xC002001A)
-#define RPC_NT_CALL_FAILED __constant_cpu_to_le32(0xC002001B)
-#define RPC_NT_CALL_FAILED_DNE __constant_cpu_to_le32(0xC002001C)
-#define RPC_NT_PROTOCOL_ERROR __constant_cpu_to_le32(0xC002001D)
-#define RPC_NT_UNSUPPORTED_TRANS_SYN __constant_cpu_to_le32(0xC002001F)
-#define RPC_NT_UNSUPPORTED_TYPE __constant_cpu_to_le32(0xC0020021)
-#define RPC_NT_INVALID_TAG __constant_cpu_to_le32(0xC0020022)
-#define RPC_NT_INVALID_BOUND __constant_cpu_to_le32(0xC0020023)
-#define RPC_NT_NO_ENTRY_NAME __constant_cpu_to_le32(0xC0020024)
-#define RPC_NT_INVALID_NAME_SYNTAX __constant_cpu_to_le32(0xC0020025)
-#define RPC_NT_UNSUPPORTED_NAME_SYNTAX __constant_cpu_to_le32(0xC0020026)
-#define RPC_NT_UUID_NO_ADDRESS __constant_cpu_to_le32(0xC0020028)
-#define RPC_NT_DUPLICATE_ENDPOINT __constant_cpu_to_le32(0xC0020029)
-#define RPC_NT_UNKNOWN_AUTHN_TYPE __constant_cpu_to_le32(0xC002002A)
-#define RPC_NT_MAX_CALLS_TOO_SMALL __constant_cpu_to_le32(0xC002002B)
-#define RPC_NT_STRING_TOO_LONG __constant_cpu_to_le32(0xC002002C)
-#define RPC_NT_PROTSEQ_NOT_FOUND __constant_cpu_to_le32(0xC002002D)
-#define RPC_NT_PROCNUM_OUT_OF_RANGE __constant_cpu_to_le32(0xC002002E)
-#define RPC_NT_BINDING_HAS_NO_AUTH __constant_cpu_to_le32(0xC002002F)
-#define RPC_NT_UNKNOWN_AUTHN_SERVICE __constant_cpu_to_le32(0xC0020030)
-#define RPC_NT_UNKNOWN_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020031)
-#define RPC_NT_INVALID_AUTH_IDENTITY __constant_cpu_to_le32(0xC0020032)
-#define RPC_NT_UNKNOWN_AUTHZ_SERVICE __constant_cpu_to_le32(0xC0020033)
-#define EPT_NT_INVALID_ENTRY __constant_cpu_to_le32(0xC0020034)
-#define EPT_NT_CANT_PERFORM_OP __constant_cpu_to_le32(0xC0020035)
-#define EPT_NT_NOT_REGISTERED __constant_cpu_to_le32(0xC0020036)
-#define RPC_NT_NOTHING_TO_EXPORT __constant_cpu_to_le32(0xC0020037)
-#define RPC_NT_INCOMPLETE_NAME __constant_cpu_to_le32(0xC0020038)
-#define RPC_NT_INVALID_VERS_OPTION __constant_cpu_to_le32(0xC0020039)
-#define RPC_NT_NO_MORE_MEMBERS __constant_cpu_to_le32(0xC002003A)
-#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED __constant_cpu_to_le32(0xC002003B)
-#define RPC_NT_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC002003C)
-#define RPC_NT_ENTRY_ALREADY_EXISTS __constant_cpu_to_le32(0xC002003D)
-#define RPC_NT_ENTRY_NOT_FOUND __constant_cpu_to_le32(0xC002003E)
-#define RPC_NT_NAME_SERVICE_UNAVAILABLE __constant_cpu_to_le32(0xC002003F)
-#define RPC_NT_INVALID_NAF_ID __constant_cpu_to_le32(0xC0020040)
-#define RPC_NT_CANNOT_SUPPORT __constant_cpu_to_le32(0xC0020041)
-#define RPC_NT_NO_CONTEXT_AVAILABLE __constant_cpu_to_le32(0xC0020042)
-#define RPC_NT_INTERNAL_ERROR __constant_cpu_to_le32(0xC0020043)
-#define RPC_NT_ZERO_DIVIDE __constant_cpu_to_le32(0xC0020044)
-#define RPC_NT_ADDRESS_ERROR __constant_cpu_to_le32(0xC0020045)
-#define RPC_NT_FP_DIV_ZERO __constant_cpu_to_le32(0xC0020046)
-#define RPC_NT_FP_UNDERFLOW __constant_cpu_to_le32(0xC0020047)
-#define RPC_NT_FP_OVERFLOW __constant_cpu_to_le32(0xC0020048)
-#define RPC_NT_CALL_IN_PROGRESS __constant_cpu_to_le32(0xC0020049)
-#define RPC_NT_NO_MORE_BINDINGS __constant_cpu_to_le32(0xC002004A)
-#define RPC_NT_GROUP_MEMBER_NOT_FOUND __constant_cpu_to_le32(0xC002004B)
-#define EPT_NT_CANT_CREATE __constant_cpu_to_le32(0xC002004C)
-#define RPC_NT_INVALID_OBJECT __constant_cpu_to_le32(0xC002004D)
-#define RPC_NT_NO_INTERFACES __constant_cpu_to_le32(0xC002004F)
-#define RPC_NT_CALL_CANCELLED __constant_cpu_to_le32(0xC0020050)
-#define RPC_NT_BINDING_INCOMPLETE __constant_cpu_to_le32(0xC0020051)
-#define RPC_NT_COMM_FAILURE __constant_cpu_to_le32(0xC0020052)
-#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020053)
-#define RPC_NT_NO_PRINC_NAME __constant_cpu_to_le32(0xC0020054)
-#define RPC_NT_NOT_RPC_ERROR __constant_cpu_to_le32(0xC0020055)
-#define RPC_NT_SEC_PKG_ERROR __constant_cpu_to_le32(0xC0020057)
-#define RPC_NT_NOT_CANCELLED __constant_cpu_to_le32(0xC0020058)
-#define RPC_NT_INVALID_ASYNC_HANDLE __constant_cpu_to_le32(0xC0020062)
-#define RPC_NT_INVALID_ASYNC_CALL __constant_cpu_to_le32(0xC0020063)
-#define RPC_NT_PROXY_ACCESS_DENIED __constant_cpu_to_le32(0xC0020064)
-#define RPC_NT_NO_MORE_ENTRIES __constant_cpu_to_le32(0xC0030001)
-#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL __constant_cpu_to_le32(0xC0030002)
-#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE __constant_cpu_to_le32(0xC0030003)
-#define RPC_NT_SS_IN_NULL_CONTEXT __constant_cpu_to_le32(0xC0030004)
-#define RPC_NT_SS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0030005)
-#define RPC_NT_SS_CONTEXT_DAMAGED __constant_cpu_to_le32(0xC0030006)
-#define RPC_NT_SS_HANDLES_MISMATCH __constant_cpu_to_le32(0xC0030007)
-#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE __constant_cpu_to_le32(0xC0030008)
-#define RPC_NT_NULL_REF_POINTER __constant_cpu_to_le32(0xC0030009)
-#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE __constant_cpu_to_le32(0xC003000A)
-#define RPC_NT_BYTE_COUNT_TOO_SMALL __constant_cpu_to_le32(0xC003000B)
-#define RPC_NT_BAD_STUB_DATA __constant_cpu_to_le32(0xC003000C)
-#define RPC_NT_INVALID_ES_ACTION __constant_cpu_to_le32(0xC0030059)
-#define RPC_NT_WRONG_ES_VERSION __constant_cpu_to_le32(0xC003005A)
-#define RPC_NT_WRONG_STUB_VERSION __constant_cpu_to_le32(0xC003005B)
-#define RPC_NT_INVALID_PIPE_OBJECT __constant_cpu_to_le32(0xC003005C)
-#define RPC_NT_INVALID_PIPE_OPERATION __constant_cpu_to_le32(0xC003005D)
-#define RPC_NT_WRONG_PIPE_VERSION __constant_cpu_to_le32(0xC003005E)
-#define RPC_NT_PIPE_CLOSED __constant_cpu_to_le32(0xC003005F)
-#define RPC_NT_PIPE_DISCIPLINE_ERROR __constant_cpu_to_le32(0xC0030060)
-#define RPC_NT_PIPE_EMPTY __constant_cpu_to_le32(0xC0030061)
-#define STATUS_PNP_BAD_MPS_TABLE __constant_cpu_to_le32(0xC0040035)
-#define STATUS_PNP_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040036)
-#define STATUS_PNP_IRQ_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040037)
-#define STATUS_PNP_INVALID_ID __constant_cpu_to_le32(0xC0040038)
-#define STATUS_IO_REISSUE_AS_CACHED __constant_cpu_to_le32(0xC0040039)
-#define STATUS_CTX_WINSTATION_NAME_INVALID __constant_cpu_to_le32(0xC00A0001)
-#define STATUS_CTX_INVALID_PD __constant_cpu_to_le32(0xC00A0002)
-#define STATUS_CTX_PD_NOT_FOUND __constant_cpu_to_le32(0xC00A0003)
-#define STATUS_CTX_CLOSE_PENDING __constant_cpu_to_le32(0xC00A0006)
-#define STATUS_CTX_NO_OUTBUF __constant_cpu_to_le32(0xC00A0007)
-#define STATUS_CTX_MODEM_INF_NOT_FOUND __constant_cpu_to_le32(0xC00A0008)
-#define STATUS_CTX_INVALID_MODEMNAME __constant_cpu_to_le32(0xC00A0009)
-#define STATUS_CTX_RESPONSE_ERROR __constant_cpu_to_le32(0xC00A000A)
-#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT __constant_cpu_to_le32(0xC00A000B)
-#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER __constant_cpu_to_le32(0xC00A000C)
-#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE __constant_cpu_to_le32(0xC00A000D)
-#define STATUS_CTX_MODEM_RESPONSE_BUSY __constant_cpu_to_le32(0xC00A000E)
-#define STATUS_CTX_MODEM_RESPONSE_VOICE __constant_cpu_to_le32(0xC00A000F)
-#define STATUS_CTX_TD_ERROR __constant_cpu_to_le32(0xC00A0010)
-#define STATUS_CTX_LICENSE_CLIENT_INVALID __constant_cpu_to_le32(0xC00A0012)
-#define STATUS_CTX_LICENSE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00A0013)
-#define STATUS_CTX_LICENSE_EXPIRED __constant_cpu_to_le32(0xC00A0014)
-#define STATUS_CTX_WINSTATION_NOT_FOUND __constant_cpu_to_le32(0xC00A0015)
-#define STATUS_CTX_WINSTATION_NAME_COLLISION __constant_cpu_to_le32(0xC00A0016)
-#define STATUS_CTX_WINSTATION_BUSY __constant_cpu_to_le32(0xC00A0017)
-#define STATUS_CTX_BAD_VIDEO_MODE __constant_cpu_to_le32(0xC00A0018)
-#define STATUS_CTX_GRAPHICS_INVALID __constant_cpu_to_le32(0xC00A0022)
-#define STATUS_CTX_NOT_CONSOLE __constant_cpu_to_le32(0xC00A0024)
-#define STATUS_CTX_CLIENT_QUERY_TIMEOUT __constant_cpu_to_le32(0xC00A0026)
-#define STATUS_CTX_CONSOLE_DISCONNECT __constant_cpu_to_le32(0xC00A0027)
-#define STATUS_CTX_CONSOLE_CONNECT __constant_cpu_to_le32(0xC00A0028)
-#define STATUS_CTX_SHADOW_DENIED __constant_cpu_to_le32(0xC00A002A)
-#define STATUS_CTX_WINSTATION_ACCESS_DENIED __constant_cpu_to_le32(0xC00A002B)
-#define STATUS_CTX_INVALID_WD __constant_cpu_to_le32(0xC00A002E)
-#define STATUS_CTX_WD_NOT_FOUND __constant_cpu_to_le32(0xC00A002F)
-#define STATUS_CTX_SHADOW_INVALID __constant_cpu_to_le32(0xC00A0030)
-#define STATUS_CTX_SHADOW_DISABLED __constant_cpu_to_le32(0xC00A0031)
-#define STATUS_RDP_PROTOCOL_ERROR __constant_cpu_to_le32(0xC00A0032)
-#define STATUS_CTX_CLIENT_LICENSE_NOT_SET __constant_cpu_to_le32(0xC00A0033)
-#define STATUS_CTX_CLIENT_LICENSE_IN_USE __constant_cpu_to_le32(0xC00A0034)
-#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE __constant_cpu_to_le32(0xC00A0035)
-#define STATUS_CTX_SHADOW_NOT_RUNNING __constant_cpu_to_le32(0xC00A0036)
-#define STATUS_CTX_LOGON_DISABLED __constant_cpu_to_le32(0xC00A0037)
-#define STATUS_CTX_SECURITY_LAYER_ERROR __constant_cpu_to_le32(0xC00A0038)
-#define STATUS_TS_INCOMPATIBLE_SESSIONS __constant_cpu_to_le32(0xC00A0039)
-#define STATUS_MUI_FILE_NOT_FOUND __constant_cpu_to_le32(0xC00B0001)
-#define STATUS_MUI_INVALID_FILE __constant_cpu_to_le32(0xC00B0002)
-#define STATUS_MUI_INVALID_RC_CONFIG __constant_cpu_to_le32(0xC00B0003)
-#define STATUS_MUI_INVALID_LOCALE_NAME __constant_cpu_to_le32(0xC00B0004)
-#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME __constant_cpu_to_le32(0xC00B0005)
-#define STATUS_MUI_FILE_NOT_LOADED __constant_cpu_to_le32(0xC00B0006)
-#define STATUS_RESOURCE_ENUM_USER_STOP __constant_cpu_to_le32(0xC00B0007)
-#define STATUS_CLUSTER_INVALID_NODE __constant_cpu_to_le32(0xC0130001)
-#define STATUS_CLUSTER_NODE_EXISTS __constant_cpu_to_le32(0xC0130002)
-#define STATUS_CLUSTER_JOIN_IN_PROGRESS __constant_cpu_to_le32(0xC0130003)
-#define STATUS_CLUSTER_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130004)
-#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130005)
-#define STATUS_CLUSTER_NETWORK_EXISTS __constant_cpu_to_le32(0xC0130006)
-#define STATUS_CLUSTER_NETWORK_NOT_FOUND __constant_cpu_to_le32(0xC0130007)
-#define STATUS_CLUSTER_NETINTERFACE_EXISTS __constant_cpu_to_le32(0xC0130008)
-#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC0130009)
-#define STATUS_CLUSTER_INVALID_REQUEST __constant_cpu_to_le32(0xC013000A)
-#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER __constant_cpu_to_le32(0xC013000B)
-#define STATUS_CLUSTER_NODE_DOWN __constant_cpu_to_le32(0xC013000C)
-#define STATUS_CLUSTER_NODE_UNREACHABLE __constant_cpu_to_le32(0xC013000D)
-#define STATUS_CLUSTER_NODE_NOT_MEMBER __constant_cpu_to_le32(0xC013000E)
-#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC013000F)
-#define STATUS_CLUSTER_INVALID_NETWORK __constant_cpu_to_le32(0xC0130010)
-#define STATUS_CLUSTER_NO_NET_ADAPTERS __constant_cpu_to_le32(0xC0130011)
-#define STATUS_CLUSTER_NODE_UP __constant_cpu_to_le32(0xC0130012)
-#define STATUS_CLUSTER_NODE_PAUSED __constant_cpu_to_le32(0xC0130013)
-#define STATUS_CLUSTER_NODE_NOT_PAUSED __constant_cpu_to_le32(0xC0130014)
-#define STATUS_CLUSTER_NO_SECURITY_CONTEXT __constant_cpu_to_le32(0xC0130015)
-#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL __constant_cpu_to_le32(0xC0130016)
-#define STATUS_CLUSTER_POISONED __constant_cpu_to_le32(0xC0130017)
-#define STATUS_ACPI_INVALID_OPCODE __constant_cpu_to_le32(0xC0140001)
-#define STATUS_ACPI_STACK_OVERFLOW __constant_cpu_to_le32(0xC0140002)
-#define STATUS_ACPI_ASSERT_FAILED __constant_cpu_to_le32(0xC0140003)
-#define STATUS_ACPI_INVALID_INDEX __constant_cpu_to_le32(0xC0140004)
-#define STATUS_ACPI_INVALID_ARGUMENT __constant_cpu_to_le32(0xC0140005)
-#define STATUS_ACPI_FATAL __constant_cpu_to_le32(0xC0140006)
-#define STATUS_ACPI_INVALID_SUPERNAME __constant_cpu_to_le32(0xC0140007)
-#define STATUS_ACPI_INVALID_ARGTYPE __constant_cpu_to_le32(0xC0140008)
-#define STATUS_ACPI_INVALID_OBJTYPE __constant_cpu_to_le32(0xC0140009)
-#define STATUS_ACPI_INVALID_TARGETTYPE __constant_cpu_to_le32(0xC014000A)
-#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT __constant_cpu_to_le32(0xC014000B)
-#define STATUS_ACPI_ADDRESS_NOT_MAPPED __constant_cpu_to_le32(0xC014000C)
-#define STATUS_ACPI_INVALID_EVENTTYPE __constant_cpu_to_le32(0xC014000D)
-#define STATUS_ACPI_HANDLER_COLLISION __constant_cpu_to_le32(0xC014000E)
-#define STATUS_ACPI_INVALID_DATA __constant_cpu_to_le32(0xC014000F)
-#define STATUS_ACPI_INVALID_REGION __constant_cpu_to_le32(0xC0140010)
-#define STATUS_ACPI_INVALID_ACCESS_SIZE __constant_cpu_to_le32(0xC0140011)
-#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK __constant_cpu_to_le32(0xC0140012)
-#define STATUS_ACPI_ALREADY_INITIALIZED __constant_cpu_to_le32(0xC0140013)
-#define STATUS_ACPI_NOT_INITIALIZED __constant_cpu_to_le32(0xC0140014)
-#define STATUS_ACPI_INVALID_MUTEX_LEVEL __constant_cpu_to_le32(0xC0140015)
-#define STATUS_ACPI_MUTEX_NOT_OWNED __constant_cpu_to_le32(0xC0140016)
-#define STATUS_ACPI_MUTEX_NOT_OWNER __constant_cpu_to_le32(0xC0140017)
-#define STATUS_ACPI_RS_ACCESS __constant_cpu_to_le32(0xC0140018)
-#define STATUS_ACPI_INVALID_TABLE __constant_cpu_to_le32(0xC0140019)
-#define STATUS_ACPI_REG_HANDLER_FAILED __constant_cpu_to_le32(0xC0140020)
-#define STATUS_ACPI_POWER_REQUEST_FAILED __constant_cpu_to_le32(0xC0140021)
-#define STATUS_SXS_SECTION_NOT_FOUND __constant_cpu_to_le32(0xC0150001)
-#define STATUS_SXS_CANT_GEN_ACTCTX __constant_cpu_to_le32(0xC0150002)
-#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT __constant_cpu_to_le32(0xC0150003)
-#define STATUS_SXS_ASSEMBLY_NOT_FOUND __constant_cpu_to_le32(0xC0150004)
-#define STATUS_SXS_MANIFEST_FORMAT_ERROR __constant_cpu_to_le32(0xC0150005)
-#define STATUS_SXS_MANIFEST_PARSE_ERROR __constant_cpu_to_le32(0xC0150006)
-#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED __constant_cpu_to_le32(0xC0150007)
-#define STATUS_SXS_KEY_NOT_FOUND __constant_cpu_to_le32(0xC0150008)
-#define STATUS_SXS_VERSION_CONFLICT __constant_cpu_to_le32(0xC0150009)
-#define STATUS_SXS_WRONG_SECTION_TYPE __constant_cpu_to_le32(0xC015000A)
-#define STATUS_SXS_THREAD_QUERIES_DISABLED __constant_cpu_to_le32(0xC015000B)
-#define STATUS_SXS_ASSEMBLY_MISSING __constant_cpu_to_le32(0xC015000C)
-#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET __constant_cpu_to_le32(0xC015000E)
-#define STATUS_SXS_EARLY_DEACTIVATION __constant_cpu_to_le32(0xC015000F)
-#define STATUS_SXS_INVALID_DEACTIVATION __constant_cpu_to_le32(0xC0150010)
-#define STATUS_SXS_MULTIPLE_DEACTIVATION __constant_cpu_to_le32(0xC0150011)
-#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY __constant_cpu_to_le32(0xC0150012)
-#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED __constant_cpu_to_le32(0xC0150013)
-#define STATUS_SXS_CORRUPT_ACTIVATION_STACK __constant_cpu_to_le32(0xC0150014)
-#define STATUS_SXS_CORRUPTION __constant_cpu_to_le32(0xC0150015)
-#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE __constant_cpu_to_le32(0xC0150016)
-#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME __constant_cpu_to_le32(0xC0150017)
-#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE __constant_cpu_to_le32(0xC0150018)
-#define STATUS_SXS_IDENTITY_PARSE_ERROR __constant_cpu_to_le32(0xC0150019)
-#define STATUS_SXS_COMPONENT_STORE_CORRUPT __constant_cpu_to_le32(0xC015001A)
-#define STATUS_SXS_FILE_HASH_MISMATCH __constant_cpu_to_le32(0xC015001B)
-#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT __constant_cpu_to_le32(0xC015001C)
-#define STATUS_SXS_IDENTITIES_DIFFERENT __constant_cpu_to_le32(0xC015001D)
-#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT __constant_cpu_to_le32(0xC015001E)
-#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY __constant_cpu_to_le32(0xC015001F)
-#define STATUS_ADVANCED_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150020)
-#define STATUS_XML_ENCODING_MISMATCH __constant_cpu_to_le32(0xC0150021)
-#define STATUS_SXS_MANIFEST_TOO_BIG __constant_cpu_to_le32(0xC0150022)
-#define STATUS_SXS_SETTING_NOT_REGISTERED __constant_cpu_to_le32(0xC0150023)
-#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE __constant_cpu_to_le32(0xC0150024)
-#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150025)
-#define STATUS_GENERIC_COMMAND_FAILED __constant_cpu_to_le32(0xC0150026)
-#define STATUS_SXS_FILE_HASH_MISSING __constant_cpu_to_le32(0xC0150027)
-#define STATUS_TRANSACTIONAL_CONFLICT __constant_cpu_to_le32(0xC0190001)
-#define STATUS_INVALID_TRANSACTION __constant_cpu_to_le32(0xC0190002)
-#define STATUS_TRANSACTION_NOT_ACTIVE __constant_cpu_to_le32(0xC0190003)
-#define STATUS_TM_INITIALIZATION_FAILED __constant_cpu_to_le32(0xC0190004)
-#define STATUS_RM_NOT_ACTIVE __constant_cpu_to_le32(0xC0190005)
-#define STATUS_RM_METADATA_CORRUPT __constant_cpu_to_le32(0xC0190006)
-#define STATUS_TRANSACTION_NOT_JOINED __constant_cpu_to_le32(0xC0190007)
-#define STATUS_DIRECTORY_NOT_RM __constant_cpu_to_le32(0xC0190008)
-#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC019000A)
-#define STATUS_LOG_RESIZE_INVALID_SIZE __constant_cpu_to_le32(0xC019000B)
-#define STATUS_REMOTE_FILE_VERSION_MISMATCH __constant_cpu_to_le32(0xC019000C)
-#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS __constant_cpu_to_le32(0xC019000F)
-#define STATUS_TRANSACTION_PROPAGATION_FAILED __constant_cpu_to_le32(0xC0190010)
-#define STATUS_CRM_PROTOCOL_NOT_FOUND __constant_cpu_to_le32(0xC0190011)
-#define STATUS_TRANSACTION_SUPERIOR_EXISTS __constant_cpu_to_le32(0xC0190012)
-#define STATUS_TRANSACTION_REQUEST_NOT_VALID __constant_cpu_to_le32(0xC0190013)
-#define STATUS_TRANSACTION_NOT_REQUESTED __constant_cpu_to_le32(0xC0190014)
-#define STATUS_TRANSACTION_ALREADY_ABORTED __constant_cpu_to_le32(0xC0190015)
-#define STATUS_TRANSACTION_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0190016)
-#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER __constant_cpu_to_le32(0xC0190017)
-#define STATUS_CURRENT_TRANSACTION_NOT_VALID __constant_cpu_to_le32(0xC0190018)
-#define STATUS_LOG_GROWTH_FAILED __constant_cpu_to_le32(0xC0190019)
-#define STATUS_OBJECT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC0190021)
-#define STATUS_STREAM_MINIVERSION_NOT_FOUND __constant_cpu_to_le32(0xC0190022)
-#define STATUS_STREAM_MINIVERSION_NOT_VALID __constant_cpu_to_le32(0xC0190023)
-#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION __constant_cpu_to_le32(0xC0190024)
-#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT __constant_cpu_to_le32(0xC0190025)
-#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS __constant_cpu_to_le32(0xC0190026)
-#define STATUS_HANDLE_NO_LONGER_VALID __constant_cpu_to_le32(0xC0190028)
-#define STATUS_LOG_CORRUPTION_DETECTED __constant_cpu_to_le32(0xC0190030)
-#define STATUS_RM_DISCONNECTED __constant_cpu_to_le32(0xC0190032)
-#define STATUS_ENLISTMENT_NOT_SUPERIOR __constant_cpu_to_le32(0xC0190033)
-#define STATUS_FILE_IDENTITY_NOT_PERSISTENT __constant_cpu_to_le32(0xC0190036)
-#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY __constant_cpu_to_le32(0xC0190037)
-#define STATUS_CANT_CROSS_RM_BOUNDARY __constant_cpu_to_le32(0xC0190038)
-#define STATUS_TXF_DIR_NOT_EMPTY __constant_cpu_to_le32(0xC0190039)
-#define STATUS_INDOUBT_TRANSACTIONS_EXIST __constant_cpu_to_le32(0xC019003A)
-#define STATUS_TM_VOLATILE __constant_cpu_to_le32(0xC019003B)
-#define STATUS_ROLLBACK_TIMER_EXPIRED __constant_cpu_to_le32(0xC019003C)
-#define STATUS_TXF_ATTRIBUTE_CORRUPT __constant_cpu_to_le32(0xC019003D)
-#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC019003E)
-#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED __constant_cpu_to_le32(0xC019003F)
-#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC0190040)
-#define STATUS_TRANSACTION_REQUIRED_PROMOTION __constant_cpu_to_le32(0xC0190043)
-#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION __constant_cpu_to_le32(0xC0190044)
-#define STATUS_TRANSACTIONS_NOT_FROZEN __constant_cpu_to_le32(0xC0190045)
-#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS __constant_cpu_to_le32(0xC0190046)
-#define STATUS_NOT_SNAPSHOT_VOLUME __constant_cpu_to_le32(0xC0190047)
-#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES __constant_cpu_to_le32(0xC0190048)
-#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190049)
-#define STATUS_TM_IDENTITY_MISMATCH __constant_cpu_to_le32(0xC019004A)
-#define STATUS_FLOATED_SECTION __constant_cpu_to_le32(0xC019004B)
-#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK __constant_cpu_to_le32(0xC019004C)
-#define STATUS_CANNOT_ABORT_TRANSACTIONS __constant_cpu_to_le32(0xC019004D)
-#define STATUS_TRANSACTION_NOT_FOUND __constant_cpu_to_le32(0xC019004E)
-#define STATUS_RESOURCEMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC019004F)
-#define STATUS_ENLISTMENT_NOT_FOUND __constant_cpu_to_le32(0xC0190050)
-#define STATUS_TRANSACTIONMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC0190051)
-#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE __constant_cpu_to_le32(0xC0190052)
-#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION __constant_cpu_to_le32(0xC0190053)
-#define STATUS_TRANSACTION_NOT_ROOT __constant_cpu_to_le32(0xC0190054)
-#define STATUS_TRANSACTION_OBJECT_EXPIRED __constant_cpu_to_le32(0xC0190055)
-#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190056)
-#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED __constant_cpu_to_le32(0xC0190057)
-#define STATUS_TRANSACTION_RECORD_TOO_LONG __constant_cpu_to_le32(0xC0190058)
-#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION __constant_cpu_to_le32(0xC0190059)
-#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION __constant_cpu_to_le32(0xC019005A)
-#define STATUS_TRANSACTION_INTEGRITY_VIOLATED __constant_cpu_to_le32(0xC019005B)
-#define STATUS_LOG_SECTOR_INVALID __constant_cpu_to_le32(0xC01A0001)
-#define STATUS_LOG_SECTOR_PARITY_INVALID __constant_cpu_to_le32(0xC01A0002)
-#define STATUS_LOG_SECTOR_REMAPPED __constant_cpu_to_le32(0xC01A0003)
-#define STATUS_LOG_BLOCK_INCOMPLETE __constant_cpu_to_le32(0xC01A0004)
-#define STATUS_LOG_INVALID_RANGE __constant_cpu_to_le32(0xC01A0005)
-#define STATUS_LOG_BLOCKS_EXHAUSTED __constant_cpu_to_le32(0xC01A0006)
-#define STATUS_LOG_READ_CONTEXT_INVALID __constant_cpu_to_le32(0xC01A0007)
-#define STATUS_LOG_RESTART_INVALID __constant_cpu_to_le32(0xC01A0008)
-#define STATUS_LOG_BLOCK_VERSION __constant_cpu_to_le32(0xC01A0009)
-#define STATUS_LOG_BLOCK_INVALID __constant_cpu_to_le32(0xC01A000A)
-#define STATUS_LOG_READ_MODE_INVALID __constant_cpu_to_le32(0xC01A000B)
-#define STATUS_LOG_METADATA_CORRUPT __constant_cpu_to_le32(0xC01A000D)
-#define STATUS_LOG_METADATA_INVALID __constant_cpu_to_le32(0xC01A000E)
-#define STATUS_LOG_METADATA_INCONSISTENT __constant_cpu_to_le32(0xC01A000F)
-#define STATUS_LOG_RESERVATION_INVALID __constant_cpu_to_le32(0xC01A0010)
-#define STATUS_LOG_CANT_DELETE __constant_cpu_to_le32(0xC01A0011)
-#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC01A0012)
-#define STATUS_LOG_START_OF_LOG __constant_cpu_to_le32(0xC01A0013)
-#define STATUS_LOG_POLICY_ALREADY_INSTALLED __constant_cpu_to_le32(0xC01A0014)
-#define STATUS_LOG_POLICY_NOT_INSTALLED __constant_cpu_to_le32(0xC01A0015)
-#define STATUS_LOG_POLICY_INVALID __constant_cpu_to_le32(0xC01A0016)
-#define STATUS_LOG_POLICY_CONFLICT __constant_cpu_to_le32(0xC01A0017)
-#define STATUS_LOG_PINNED_ARCHIVE_TAIL __constant_cpu_to_le32(0xC01A0018)
-#define STATUS_LOG_RECORD_NONEXISTENT __constant_cpu_to_le32(0xC01A0019)
-#define STATUS_LOG_RECORDS_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001A)
-#define STATUS_LOG_SPACE_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001B)
-#define STATUS_LOG_TAIL_INVALID __constant_cpu_to_le32(0xC01A001C)
-#define STATUS_LOG_FULL __constant_cpu_to_le32(0xC01A001D)
-#define STATUS_LOG_MULTIPLEXED __constant_cpu_to_le32(0xC01A001E)
-#define STATUS_LOG_DEDICATED __constant_cpu_to_le32(0xC01A001F)
-#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC01A0020)
-#define STATUS_LOG_ARCHIVE_IN_PROGRESS __constant_cpu_to_le32(0xC01A0021)
-#define STATUS_LOG_EPHEMERAL __constant_cpu_to_le32(0xC01A0022)
-#define STATUS_LOG_NOT_ENOUGH_CONTAINERS __constant_cpu_to_le32(0xC01A0023)
-#define STATUS_LOG_CLIENT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC01A0024)
-#define STATUS_LOG_CLIENT_NOT_REGISTERED __constant_cpu_to_le32(0xC01A0025)
-#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS __constant_cpu_to_le32(0xC01A0026)
-#define STATUS_LOG_CONTAINER_READ_FAILED __constant_cpu_to_le32(0xC01A0027)
-#define STATUS_LOG_CONTAINER_WRITE_FAILED __constant_cpu_to_le32(0xC01A0028)
-#define STATUS_LOG_CONTAINER_OPEN_FAILED __constant_cpu_to_le32(0xC01A0029)
-#define STATUS_LOG_CONTAINER_STATE_INVALID __constant_cpu_to_le32(0xC01A002A)
-#define STATUS_LOG_STATE_INVALID __constant_cpu_to_le32(0xC01A002B)
-#define STATUS_LOG_PINNED __constant_cpu_to_le32(0xC01A002C)
-#define STATUS_LOG_METADATA_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002D)
-#define STATUS_LOG_INCONSISTENT_SECURITY __constant_cpu_to_le32(0xC01A002E)
-#define STATUS_LOG_APPENDED_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002F)
-#define STATUS_LOG_PINNED_RESERVATION __constant_cpu_to_le32(0xC01A0030)
-#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC01B00EA)
-#define STATUS_FLT_NO_HANDLER_DEFINED __constant_cpu_to_le32(0xC01C0001)
-#define STATUS_FLT_CONTEXT_ALREADY_DEFINED __constant_cpu_to_le32(0xC01C0002)
-#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST __constant_cpu_to_le32(0xC01C0003)
-#define STATUS_FLT_DISALLOW_FAST_IO __constant_cpu_to_le32(0xC01C0004)
-#define STATUS_FLT_INVALID_NAME_REQUEST __constant_cpu_to_le32(0xC01C0005)
-#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION __constant_cpu_to_le32(0xC01C0006)
-#define STATUS_FLT_NOT_INITIALIZED __constant_cpu_to_le32(0xC01C0007)
-#define STATUS_FLT_FILTER_NOT_READY __constant_cpu_to_le32(0xC01C0008)
-#define STATUS_FLT_POST_OPERATION_CLEANUP __constant_cpu_to_le32(0xC01C0009)
-#define STATUS_FLT_INTERNAL_ERROR __constant_cpu_to_le32(0xC01C000A)
-#define STATUS_FLT_DELETING_OBJECT __constant_cpu_to_le32(0xC01C000B)
-#define STATUS_FLT_MUST_BE_NONPAGED_POOL __constant_cpu_to_le32(0xC01C000C)
-#define STATUS_FLT_DUPLICATE_ENTRY __constant_cpu_to_le32(0xC01C000D)
-#define STATUS_FLT_CBDQ_DISABLED __constant_cpu_to_le32(0xC01C000E)
-#define STATUS_FLT_DO_NOT_ATTACH __constant_cpu_to_le32(0xC01C000F)
-#define STATUS_FLT_DO_NOT_DETACH __constant_cpu_to_le32(0xC01C0010)
-#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION __constant_cpu_to_le32(0xC01C0011)
-#define STATUS_FLT_INSTANCE_NAME_COLLISION __constant_cpu_to_le32(0xC01C0012)
-#define STATUS_FLT_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC01C0013)
-#define STATUS_FLT_VOLUME_NOT_FOUND __constant_cpu_to_le32(0xC01C0014)
-#define STATUS_FLT_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC01C0015)
-#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND __constant_cpu_to_le32(0xC01C0016)
-#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION __constant_cpu_to_le32(0xC01C0017)
-#define STATUS_FLT_NAME_CACHE_MISS __constant_cpu_to_le32(0xC01C0018)
-#define STATUS_FLT_NO_DEVICE_OBJECT __constant_cpu_to_le32(0xC01C0019)
-#define STATUS_FLT_VOLUME_ALREADY_MOUNTED __constant_cpu_to_le32(0xC01C001A)
-#define STATUS_FLT_ALREADY_ENLISTED __constant_cpu_to_le32(0xC01C001B)
-#define STATUS_FLT_CONTEXT_ALREADY_LINKED __constant_cpu_to_le32(0xC01C001C)
-#define STATUS_FLT_NO_WAITER_FOR_REPLY __constant_cpu_to_le32(0xC01C0020)
-#define STATUS_MONITOR_NO_DESCRIPTOR __constant_cpu_to_le32(0xC01D0001)
-#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC01D0002)
-#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM __constant_cpu_to_le32(0xC01D0003)
-#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0004)
-#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED __constant_cpu_to_le32(0xC01D0005)
-#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0006)
-#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0007)
-#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA __constant_cpu_to_le32(0xC01D0008)
-#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0009)
-#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER __constant_cpu_to_le32(0xC01E0000)
-#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER __constant_cpu_to_le32(0xC01E0001)
-#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER __constant_cpu_to_le32(0xC01E0002)
-#define STATUS_GRAPHICS_ADAPTER_WAS_RESET __constant_cpu_to_le32(0xC01E0003)
-#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL __constant_cpu_to_le32(0xC01E0004)
-#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED __constant_cpu_to_le32(0xC01E0005)
-#define STATUS_GRAPHICS_PRESENT_OCCLUDED __constant_cpu_to_le32(0xC01E0006)
-#define STATUS_GRAPHICS_PRESENT_DENIED __constant_cpu_to_le32(0xC01E0007)
-#define STATUS_GRAPHICS_CANNOTCOLORCONVERT __constant_cpu_to_le32(0xC01E0008)
-#define STATUS_GRAPHICS_NO_VIDEO_MEMORY __constant_cpu_to_le32(0xC01E0100)
-#define STATUS_GRAPHICS_CANT_LOCK_MEMORY __constant_cpu_to_le32(0xC01E0101)
-#define STATUS_GRAPHICS_ALLOCATION_BUSY __constant_cpu_to_le32(0xC01E0102)
-#define STATUS_GRAPHICS_TOO_MANY_REFERENCES __constant_cpu_to_le32(0xC01E0103)
-#define STATUS_GRAPHICS_TRY_AGAIN_LATER __constant_cpu_to_le32(0xC01E0104)
-#define STATUS_GRAPHICS_TRY_AGAIN_NOW __constant_cpu_to_le32(0xC01E0105)
-#define STATUS_GRAPHICS_ALLOCATION_INVALID __constant_cpu_to_le32(0xC01E0106)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE __constant_cpu_to_le32(0xC01E0107)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED __constant_cpu_to_le32(0xC01E0108)
-#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION __constant_cpu_to_le32(0xC01E0109)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE __constant_cpu_to_le32(0xC01E0110)
-#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION __constant_cpu_to_le32(0xC01E0111)
-#define STATUS_GRAPHICS_ALLOCATION_CLOSED __constant_cpu_to_le32(0xC01E0112)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE __constant_cpu_to_le32(0xC01E0113)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE __constant_cpu_to_le32(0xC01E0114)
-#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE __constant_cpu_to_le32(0xC01E0115)
-#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST __constant_cpu_to_le32(0xC01E0116)
-#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE __constant_cpu_to_le32(0xC01E0200)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0300)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0301)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0302)
-#define STATUS_GRAPHICS_INVALID_VIDPN __constant_cpu_to_le32(0xC01E0303)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE __constant_cpu_to_le32(0xC01E0304)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET __constant_cpu_to_le32(0xC01E0305)
-#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0306)
-#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET __constant_cpu_to_le32(0xC01E0308)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET __constant_cpu_to_le32(0xC01E0309)
-#define STATUS_GRAPHICS_INVALID_FREQUENCY __constant_cpu_to_le32(0xC01E030A)
-#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION __constant_cpu_to_le32(0xC01E030B)
-#define STATUS_GRAPHICS_INVALID_TOTAL_REGION __constant_cpu_to_le32(0xC01E030C)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE __constant_cpu_to_le32(0xC01E0310)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE __constant_cpu_to_le32(0xC01E0311)
-#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET __constant_cpu_to_le32(0xC01E0312)
-#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0313)
-#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET __constant_cpu_to_le32(0xC01E0314)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET __constant_cpu_to_le32(0xC01E0315)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET __constant_cpu_to_le32(0xC01E0316)
-#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0317)
-#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0318)
-#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH __constant_cpu_to_le32(0xC01E0319)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E031A)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET __constant_cpu_to_le32(0xC01E031B)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE __constant_cpu_to_le32(0xC01E031C)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET __constant_cpu_to_le32(0xC01E031D)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E031F)
-#define STATUS_GRAPHICS_STALE_MODESET __constant_cpu_to_le32(0xC01E0320)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET __constant_cpu_to_le32(0xC01E0321)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE __constant_cpu_to_le32(0xC01E0322)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN __constant_cpu_to_le32(0xC01E0323)
-#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0324)
-#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION __constant_cpu_to_le32(0xC01E0325)
-#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES __constant_cpu_to_le32(0xC01E0326)
-#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0327)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE __constant_cpu_to_le32(0xC01E0328)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET __constant_cpu_to_le32(0xC01E0329)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET __constant_cpu_to_le32(0xC01E032A)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR __constant_cpu_to_le32(0xC01E032B)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET __constant_cpu_to_le32(0xC01E032C)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E032D)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E032E)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE __constant_cpu_to_le32(0xC01E032F)
-#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED __constant_cpu_to_le32(0xC01E0330)
-#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0331)
-#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0332)
-#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET __constant_cpu_to_le32(0xC01E0333)
-#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER __constant_cpu_to_le32(0xC01E0334)
-#define STATUS_GRAPHICS_NO_VIDPNMGR __constant_cpu_to_le32(0xC01E0335)
-#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0336)
-#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0337)
-#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED __constant_cpu_to_le32(0xC01E0338)
-#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0339)
-#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE __constant_cpu_to_le32(0xC01E033A)
-#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE __constant_cpu_to_le32(0xC01E033B)
-#define STATUS_GRAPHICS_INVALID_STRIDE __constant_cpu_to_le32(0xC01E033C)
-#define STATUS_GRAPHICS_INVALID_PIXELFORMAT __constant_cpu_to_le32(0xC01E033D)
-#define STATUS_GRAPHICS_INVALID_COLORBASIS __constant_cpu_to_le32(0xC01E033E)
-#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE __constant_cpu_to_le32(0xC01E033F)
-#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0340)
-#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT __constant_cpu_to_le32(0xC01E0341)
-#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE __constant_cpu_to_le32(0xC01E0342)
-#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0343)
-#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL __constant_cpu_to_le32(0xC01E0344)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION __constant_cpu_to_le32(0xC01E0345)
-#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0346)
-#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP __constant_cpu_to_le32(0xC01E0347)
-#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0348)
-#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0349)
-#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET __constant_cpu_to_le32(0xC01E034A)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON __constant_cpu_to_le32(0xC01E034D)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE __constant_cpu_to_le32(0xC01E034E)
-#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE __constant_cpu_to_le32(0xC01E034F)
-#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS __constant_cpu_to_le32(0xC01E0350)
-#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING __constant_cpu_to_le32(0xC01E0352)
-#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED __constant_cpu_to_le32(0xC01E0353)
-#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS __constant_cpu_to_le32(0xC01E0354)
-#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT __constant_cpu_to_le32(0xC01E0355)
-#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM __constant_cpu_to_le32(0xC01E0356)
-#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN __constant_cpu_to_le32(0xC01E0357)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT __constant_cpu_to_le32(0xC01E0358)
-#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED __constant_cpu_to_le32(0xC01E0359)
-#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION __constant_cpu_to_le32(0xC01E035A)
-#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE __constant_cpu_to_le32(0xC01E035B)
-#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET __constant_cpu_to_le32(0xC01E035C)
-#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED __constant_cpu_to_le32(0xC01E0400)
-#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0401)
-#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER __constant_cpu_to_le32(0xC01E0430)
-#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0431)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0432)
-#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY __constant_cpu_to_le32(0xC01E0433)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED __constant_cpu_to_le32(0xC01E0434)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON __constant_cpu_to_le32(0xC01E0435)
-#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE __constant_cpu_to_le32(0xC01E0436)
-#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER __constant_cpu_to_le32(0xC01E0438)
-#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED __constant_cpu_to_le32(0xC01E043B)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS __constant_cpu_to_le32(0xC01E051C)
-#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST __constant_cpu_to_le32(0xC01E051D)
-#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E051E)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS __constant_cpu_to_le32(0xC01E051F)
-#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0520)
-#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST __constant_cpu_to_le32(0xC01E0521)
-#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0500)
-#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0501)
-#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0502)
-#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS __constant_cpu_to_le32(0xC01E0503)
-#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E0504)
-#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST __constant_cpu_to_le32(0xC01E0505)
-#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E0506)
-#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E0507)
-#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0508)
-#define STATUS_GRAPHICS_OPM_INVALID_POINTER __constant_cpu_to_le32(0xC01E050A)
-#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E050B)
-#define STATUS_GRAPHICS_OPM_INVALID_HANDLE __constant_cpu_to_le32(0xC01E050C)
-#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E050D)
-#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH __constant_cpu_to_le32(0xC01E050E)
-#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED __constant_cpu_to_le32(0xC01E050F)
-#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED __constant_cpu_to_le32(0xC01E0510)
-#define STATUS_GRAPHICS_PVP_HFS_FAILED __constant_cpu_to_le32(0xC01E0511)
-#define STATUS_GRAPHICS_OPM_INVALID_SRM __constant_cpu_to_le32(0xC01E0512)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP __constant_cpu_to_le32(0xC01E0513)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP __constant_cpu_to_le32(0xC01E0514)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA __constant_cpu_to_le32(0xC01E0515)
-#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET __constant_cpu_to_le32(0xC01E0516)
-#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH __constant_cpu_to_le32(0xC01E0517)
-#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE __constant_cpu_to_le32(0xC01E0518)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E051A)
-#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E051B)
-#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0580)
-#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC01E0581)
-#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA __constant_cpu_to_le32(0xC01E0582)
-#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA __constant_cpu_to_le32(0xC01E0583)
-#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0584)
-#define STATUS_GRAPHICS_DDCCI_INVALID_DATA __constant_cpu_to_le32(0xC01E0585)
-#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE __constant_cpu_to_le32(0xC01E0586)
-#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING __constant_cpu_to_le32(0xC01E0587)
-#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E0588)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND __constant_cpu_to_le32(0xC01E0589)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH __constant_cpu_to_le32(0xC01E058A)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM __constant_cpu_to_le32(0xC01E058B)
-#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE __constant_cpu_to_le32(0xC01E058C)
-#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E058D)
-#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED __constant_cpu_to_le32(0xC01E05E0)
-#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E05E1)
-#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E05E2)
-#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E05E3)
-#define STATUS_GRAPHICS_INVALID_POINTER __constant_cpu_to_le32(0xC01E05E4)
-#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E05E5)
-#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E05E6)
-#define STATUS_GRAPHICS_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E05E7)
-#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E05E8)
-#define STATUS_FVE_LOCKED_VOLUME __constant_cpu_to_le32(0xC0210000)
-#define STATUS_FVE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0210001)
-#define STATUS_FVE_BAD_INFORMATION __constant_cpu_to_le32(0xC0210002)
-#define STATUS_FVE_TOO_SMALL __constant_cpu_to_le32(0xC0210003)
-#define STATUS_FVE_FAILED_WRONG_FS __constant_cpu_to_le32(0xC0210004)
-#define STATUS_FVE_FAILED_BAD_FS __constant_cpu_to_le32(0xC0210005)
-#define STATUS_FVE_FS_NOT_EXTENDED __constant_cpu_to_le32(0xC0210006)
-#define STATUS_FVE_FS_MOUNTED __constant_cpu_to_le32(0xC0210007)
-#define STATUS_FVE_NO_LICENSE __constant_cpu_to_le32(0xC0210008)
-#define STATUS_FVE_ACTION_NOT_ALLOWED __constant_cpu_to_le32(0xC0210009)
-#define STATUS_FVE_BAD_DATA __constant_cpu_to_le32(0xC021000A)
-#define STATUS_FVE_VOLUME_NOT_BOUND __constant_cpu_to_le32(0xC021000B)
-#define STATUS_FVE_NOT_DATA_VOLUME __constant_cpu_to_le32(0xC021000C)
-#define STATUS_FVE_CONV_READ_ERROR __constant_cpu_to_le32(0xC021000D)
-#define STATUS_FVE_CONV_WRITE_ERROR __constant_cpu_to_le32(0xC021000E)
-#define STATUS_FVE_OVERLAPPED_UPDATE __constant_cpu_to_le32(0xC021000F)
-#define STATUS_FVE_FAILED_SECTOR_SIZE __constant_cpu_to_le32(0xC0210010)
-#define STATUS_FVE_FAILED_AUTHENTICATION __constant_cpu_to_le32(0xC0210011)
-#define STATUS_FVE_NOT_OS_VOLUME __constant_cpu_to_le32(0xC0210012)
-#define STATUS_FVE_KEYFILE_NOT_FOUND __constant_cpu_to_le32(0xC0210013)
-#define STATUS_FVE_KEYFILE_INVALID __constant_cpu_to_le32(0xC0210014)
-#define STATUS_FVE_KEYFILE_NO_VMK __constant_cpu_to_le32(0xC0210015)
-#define STATUS_FVE_TPM_DISABLED __constant_cpu_to_le32(0xC0210016)
-#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO __constant_cpu_to_le32(0xC0210017)
-#define STATUS_FVE_TPM_INVALID_PCR __constant_cpu_to_le32(0xC0210018)
-#define STATUS_FVE_TPM_NO_VMK __constant_cpu_to_le32(0xC0210019)
-#define STATUS_FVE_PIN_INVALID __constant_cpu_to_le32(0xC021001A)
-#define STATUS_FVE_AUTH_INVALID_APPLICATION __constant_cpu_to_le32(0xC021001B)
-#define STATUS_FVE_AUTH_INVALID_CONFIG __constant_cpu_to_le32(0xC021001C)
-#define STATUS_FVE_DEBUGGER_ENABLED __constant_cpu_to_le32(0xC021001D)
-#define STATUS_FVE_DRY_RUN_FAILED __constant_cpu_to_le32(0xC021001E)
-#define STATUS_FVE_BAD_METADATA_POINTER __constant_cpu_to_le32(0xC021001F)
-#define STATUS_FVE_OLD_METADATA_COPY __constant_cpu_to_le32(0xC0210020)
-#define STATUS_FVE_REBOOT_REQUIRED __constant_cpu_to_le32(0xC0210021)
-#define STATUS_FVE_RAW_ACCESS __constant_cpu_to_le32(0xC0210022)
-#define STATUS_FVE_RAW_BLOCKED __constant_cpu_to_le32(0xC0210023)
-#define STATUS_FWP_CALLOUT_NOT_FOUND __constant_cpu_to_le32(0xC0220001)
-#define STATUS_FWP_CONDITION_NOT_FOUND __constant_cpu_to_le32(0xC0220002)
-#define STATUS_FWP_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC0220003)
-#define STATUS_FWP_LAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220004)
-#define STATUS_FWP_PROVIDER_NOT_FOUND __constant_cpu_to_le32(0xC0220005)
-#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND __constant_cpu_to_le32(0xC0220006)
-#define STATUS_FWP_SUBLAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220007)
-#define STATUS_FWP_NOT_FOUND __constant_cpu_to_le32(0xC0220008)
-#define STATUS_FWP_ALREADY_EXISTS __constant_cpu_to_le32(0xC0220009)
-#define STATUS_FWP_IN_USE __constant_cpu_to_le32(0xC022000A)
-#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS __constant_cpu_to_le32(0xC022000B)
-#define STATUS_FWP_WRONG_SESSION __constant_cpu_to_le32(0xC022000C)
-#define STATUS_FWP_NO_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000D)
-#define STATUS_FWP_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000E)
-#define STATUS_FWP_TXN_ABORTED __constant_cpu_to_le32(0xC022000F)
-#define STATUS_FWP_SESSION_ABORTED __constant_cpu_to_le32(0xC0220010)
-#define STATUS_FWP_INCOMPATIBLE_TXN __constant_cpu_to_le32(0xC0220011)
-#define STATUS_FWP_TIMEOUT __constant_cpu_to_le32(0xC0220012)
-#define STATUS_FWP_NET_EVENTS_DISABLED __constant_cpu_to_le32(0xC0220013)
-#define STATUS_FWP_INCOMPATIBLE_LAYER __constant_cpu_to_le32(0xC0220014)
-#define STATUS_FWP_KM_CLIENTS_ONLY __constant_cpu_to_le32(0xC0220015)
-#define STATUS_FWP_LIFETIME_MISMATCH __constant_cpu_to_le32(0xC0220016)
-#define STATUS_FWP_BUILTIN_OBJECT __constant_cpu_to_le32(0xC0220017)
-#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS __constant_cpu_to_le32(0xC0220018)
-#define STATUS_FWP_TOO_MANY_CALLOUTS __constant_cpu_to_le32(0xC0220018)
-#define STATUS_FWP_NOTIFICATION_DROPPED __constant_cpu_to_le32(0xC0220019)
-#define STATUS_FWP_TRAFFIC_MISMATCH __constant_cpu_to_le32(0xC022001A)
-#define STATUS_FWP_INCOMPATIBLE_SA_STATE __constant_cpu_to_le32(0xC022001B)
-#define STATUS_FWP_NULL_POINTER __constant_cpu_to_le32(0xC022001C)
-#define STATUS_FWP_INVALID_ENUMERATOR __constant_cpu_to_le32(0xC022001D)
-#define STATUS_FWP_INVALID_FLAGS __constant_cpu_to_le32(0xC022001E)
-#define STATUS_FWP_INVALID_NET_MASK __constant_cpu_to_le32(0xC022001F)
-#define STATUS_FWP_INVALID_RANGE __constant_cpu_to_le32(0xC0220020)
-#define STATUS_FWP_INVALID_INTERVAL __constant_cpu_to_le32(0xC0220021)
-#define STATUS_FWP_ZERO_LENGTH_ARRAY __constant_cpu_to_le32(0xC0220022)
-#define STATUS_FWP_NULL_DISPLAY_NAME __constant_cpu_to_le32(0xC0220023)
-#define STATUS_FWP_INVALID_ACTION_TYPE __constant_cpu_to_le32(0xC0220024)
-#define STATUS_FWP_INVALID_WEIGHT __constant_cpu_to_le32(0xC0220025)
-#define STATUS_FWP_MATCH_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220026)
-#define STATUS_FWP_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220027)
-#define STATUS_FWP_OUT_OF_BOUNDS __constant_cpu_to_le32(0xC0220028)
-#define STATUS_FWP_RESERVED __constant_cpu_to_le32(0xC0220029)
-#define STATUS_FWP_DUPLICATE_CONDITION __constant_cpu_to_le32(0xC022002A)
-#define STATUS_FWP_DUPLICATE_KEYMOD __constant_cpu_to_le32(0xC022002B)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002C)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER __constant_cpu_to_le32(0xC022002D)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002E)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT __constant_cpu_to_le32(0xC022002F)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD __constant_cpu_to_le32(0xC0220030)
-#define STATUS_FWP_INCOMPATIBLE_DH_GROUP __constant_cpu_to_le32(0xC0220031)
-#define STATUS_FWP_EM_NOT_SUPPORTED __constant_cpu_to_le32(0xC0220032)
-#define STATUS_FWP_NEVER_MATCH __constant_cpu_to_le32(0xC0220033)
-#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0220034)
-#define STATUS_FWP_INVALID_PARAMETER __constant_cpu_to_le32(0xC0220035)
-#define STATUS_FWP_TOO_MANY_SUBLAYERS __constant_cpu_to_le32(0xC0220036)
-#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED __constant_cpu_to_le32(0xC0220037)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG __constant_cpu_to_le32(0xC0220038)
-#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG __constant_cpu_to_le32(0xC0220039)
-#define STATUS_FWP_TCPIP_NOT_READY __constant_cpu_to_le32(0xC0220100)
-#define STATUS_FWP_INJECT_HANDLE_CLOSING __constant_cpu_to_le32(0xC0220101)
-#define STATUS_FWP_INJECT_HANDLE_STALE __constant_cpu_to_le32(0xC0220102)
-#define STATUS_FWP_CANNOT_PEND __constant_cpu_to_le32(0xC0220103)
-#define STATUS_NDIS_CLOSING __constant_cpu_to_le32(0xC0230002)
-#define STATUS_NDIS_BAD_VERSION __constant_cpu_to_le32(0xC0230004)
-#define STATUS_NDIS_BAD_CHARACTERISTICS __constant_cpu_to_le32(0xC0230005)
-#define STATUS_NDIS_ADAPTER_NOT_FOUND __constant_cpu_to_le32(0xC0230006)
-#define STATUS_NDIS_OPEN_FAILED __constant_cpu_to_le32(0xC0230007)
-#define STATUS_NDIS_DEVICE_FAILED __constant_cpu_to_le32(0xC0230008)
-#define STATUS_NDIS_MULTICAST_FULL __constant_cpu_to_le32(0xC0230009)
-#define STATUS_NDIS_MULTICAST_EXISTS __constant_cpu_to_le32(0xC023000A)
-#define STATUS_NDIS_MULTICAST_NOT_FOUND __constant_cpu_to_le32(0xC023000B)
-#define STATUS_NDIS_REQUEST_ABORTED __constant_cpu_to_le32(0xC023000C)
-#define STATUS_NDIS_RESET_IN_PROGRESS __constant_cpu_to_le32(0xC023000D)
-#define STATUS_NDIS_INVALID_PACKET __constant_cpu_to_le32(0xC023000F)
-#define STATUS_NDIS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0230010)
-#define STATUS_NDIS_ADAPTER_NOT_READY __constant_cpu_to_le32(0xC0230011)
-#define STATUS_NDIS_INVALID_LENGTH __constant_cpu_to_le32(0xC0230014)
-#define STATUS_NDIS_INVALID_DATA __constant_cpu_to_le32(0xC0230015)
-#define STATUS_NDIS_BUFFER_TOO_SHORT __constant_cpu_to_le32(0xC0230016)
-#define STATUS_NDIS_INVALID_OID __constant_cpu_to_le32(0xC0230017)
-#define STATUS_NDIS_ADAPTER_REMOVED __constant_cpu_to_le32(0xC0230018)
-#define STATUS_NDIS_UNSUPPORTED_MEDIA __constant_cpu_to_le32(0xC0230019)
-#define STATUS_NDIS_GROUP_ADDRESS_IN_USE __constant_cpu_to_le32(0xC023001A)
-#define STATUS_NDIS_FILE_NOT_FOUND __constant_cpu_to_le32(0xC023001B)
-#define STATUS_NDIS_ERROR_READING_FILE __constant_cpu_to_le32(0xC023001C)
-#define STATUS_NDIS_ALREADY_MAPPED __constant_cpu_to_le32(0xC023001D)
-#define STATUS_NDIS_RESOURCE_CONFLICT __constant_cpu_to_le32(0xC023001E)
-#define STATUS_NDIS_MEDIA_DISCONNECTED __constant_cpu_to_le32(0xC023001F)
-#define STATUS_NDIS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0230022)
-#define STATUS_NDIS_PAUSED __constant_cpu_to_le32(0xC023002A)
-#define STATUS_NDIS_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC023002B)
-#define STATUS_NDIS_UNSUPPORTED_REVISION __constant_cpu_to_le32(0xC023002C)
-#define STATUS_NDIS_INVALID_PORT __constant_cpu_to_le32(0xC023002D)
-#define STATUS_NDIS_INVALID_PORT_STATE __constant_cpu_to_le32(0xC023002E)
-#define STATUS_NDIS_LOW_POWER_STATE __constant_cpu_to_le32(0xC023002F)
-#define STATUS_NDIS_NOT_SUPPORTED __constant_cpu_to_le32(0xC02300BB)
-#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED __constant_cpu_to_le32(0xC0232000)
-#define STATUS_NDIS_DOT11_MEDIA_IN_USE __constant_cpu_to_le32(0xC0232001)
-#define STATUS_NDIS_DOT11_POWER_STATE_INVALID __constant_cpu_to_le32(0xC0232002)
-#define STATUS_IPSEC_BAD_SPI __constant_cpu_to_le32(0xC0360001)
-#define STATUS_IPSEC_SA_LIFETIME_EXPIRED __constant_cpu_to_le32(0xC0360002)
-#define STATUS_IPSEC_WRONG_SA __constant_cpu_to_le32(0xC0360003)
-#define STATUS_IPSEC_REPLAY_CHECK_FAILED __constant_cpu_to_le32(0xC0360004)
-#define STATUS_IPSEC_INVALID_PACKET __constant_cpu_to_le32(0xC0360005)
-#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED __constant_cpu_to_le32(0xC0360006)
-#define STATUS_IPSEC_CLEAR_TEXT_DROP __constant_cpu_to_le32(0xC0360007)
+#define STATUS_SUCCESS cpu_to_le32(0x00000000)
+#define STATUS_WAIT_0 cpu_to_le32(0x00000000)
+#define STATUS_WAIT_1 cpu_to_le32(0x00000001)
+#define STATUS_WAIT_2 cpu_to_le32(0x00000002)
+#define STATUS_WAIT_3 cpu_to_le32(0x00000003)
+#define STATUS_WAIT_63 cpu_to_le32(0x0000003F)
+#define STATUS_ABANDONED cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_0 cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_63 cpu_to_le32(0x000000BF)
+#define STATUS_USER_APC cpu_to_le32(0x000000C0)
+#define STATUS_KERNEL_APC cpu_to_le32(0x00000100)
+#define STATUS_ALERTED cpu_to_le32(0x00000101)
+#define STATUS_TIMEOUT cpu_to_le32(0x00000102)
+#define STATUS_PENDING cpu_to_le32(0x00000103)
+#define STATUS_REPARSE cpu_to_le32(0x00000104)
+#define STATUS_MORE_ENTRIES cpu_to_le32(0x00000105)
+#define STATUS_NOT_ALL_ASSIGNED cpu_to_le32(0x00000106)
+#define STATUS_SOME_NOT_MAPPED cpu_to_le32(0x00000107)
+#define STATUS_OPLOCK_BREAK_IN_PROGRESS cpu_to_le32(0x00000108)
+#define STATUS_VOLUME_MOUNTED cpu_to_le32(0x00000109)
+#define STATUS_RXACT_COMMITTED cpu_to_le32(0x0000010A)
+#define STATUS_NOTIFY_CLEANUP cpu_to_le32(0x0000010B)
+#define STATUS_NOTIFY_ENUM_DIR cpu_to_le32(0x0000010C)
+#define STATUS_NO_QUOTAS_FOR_ACCOUNT cpu_to_le32(0x0000010D)
+#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED cpu_to_le32(0x0000010E)
+#define STATUS_PAGE_FAULT_TRANSITION cpu_to_le32(0x00000110)
+#define STATUS_PAGE_FAULT_DEMAND_ZERO cpu_to_le32(0x00000111)
+#define STATUS_PAGE_FAULT_COPY_ON_WRITE cpu_to_le32(0x00000112)
+#define STATUS_PAGE_FAULT_GUARD_PAGE cpu_to_le32(0x00000113)
+#define STATUS_PAGE_FAULT_PAGING_FILE cpu_to_le32(0x00000114)
+#define STATUS_CACHE_PAGE_LOCKED cpu_to_le32(0x00000115)
+#define STATUS_CRASH_DUMP cpu_to_le32(0x00000116)
+#define STATUS_BUFFER_ALL_ZEROS cpu_to_le32(0x00000117)
+#define STATUS_REPARSE_OBJECT cpu_to_le32(0x00000118)
+#define STATUS_RESOURCE_REQUIREMENTS_CHANGED cpu_to_le32(0x00000119)
+#define STATUS_TRANSLATION_COMPLETE cpu_to_le32(0x00000120)
+#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY cpu_to_le32(0x00000121)
+#define STATUS_NOTHING_TO_TERMINATE cpu_to_le32(0x00000122)
+#define STATUS_PROCESS_NOT_IN_JOB cpu_to_le32(0x00000123)
+#define STATUS_PROCESS_IN_JOB cpu_to_le32(0x00000124)
+#define STATUS_VOLSNAP_HIBERNATE_READY cpu_to_le32(0x00000125)
+#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY cpu_to_le32(0x00000126)
+#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED cpu_to_le32(0x00000127)
+#define STATUS_INTERRUPT_STILL_CONNECTED cpu_to_le32(0x00000128)
+#define STATUS_PROCESS_CLONED cpu_to_le32(0x00000129)
+#define STATUS_FILE_LOCKED_WITH_ONLY_READERS cpu_to_le32(0x0000012A)
+#define STATUS_FILE_LOCKED_WITH_WRITERS cpu_to_le32(0x0000012B)
+#define STATUS_RESOURCEMANAGER_READ_ONLY cpu_to_le32(0x00000202)
+#define STATUS_WAIT_FOR_OPLOCK cpu_to_le32(0x00000367)
+#define DBG_EXCEPTION_HANDLED cpu_to_le32(0x00010001)
+#define DBG_CONTINUE cpu_to_le32(0x00010002)
+#define STATUS_FLT_IO_COMPLETE cpu_to_le32(0x001C0001)
+#define STATUS_OBJECT_NAME_EXISTS cpu_to_le32(0x40000000)
+#define STATUS_THREAD_WAS_SUSPENDED cpu_to_le32(0x40000001)
+#define STATUS_WORKING_SET_LIMIT_RANGE cpu_to_le32(0x40000002)
+#define STATUS_IMAGE_NOT_AT_BASE cpu_to_le32(0x40000003)
+#define STATUS_RXACT_STATE_CREATED cpu_to_le32(0x40000004)
+#define STATUS_SEGMENT_NOTIFICATION cpu_to_le32(0x40000005)
+#define STATUS_LOCAL_USER_SESSION_KEY cpu_to_le32(0x40000006)
+#define STATUS_BAD_CURRENT_DIRECTORY cpu_to_le32(0x40000007)
+#define STATUS_SERIAL_MORE_WRITES cpu_to_le32(0x40000008)
+#define STATUS_REGISTRY_RECOVERED cpu_to_le32(0x40000009)
+#define STATUS_FT_READ_RECOVERY_FROM_BACKUP cpu_to_le32(0x4000000A)
+#define STATUS_FT_WRITE_RECOVERY cpu_to_le32(0x4000000B)
+#define STATUS_SERIAL_COUNTER_TIMEOUT cpu_to_le32(0x4000000C)
+#define STATUS_NULL_LM_PASSWORD cpu_to_le32(0x4000000D)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH cpu_to_le32(0x4000000E)
+#define STATUS_RECEIVE_PARTIAL cpu_to_le32(0x4000000F)
+#define STATUS_RECEIVE_EXPEDITED cpu_to_le32(0x40000010)
+#define STATUS_RECEIVE_PARTIAL_EXPEDITED cpu_to_le32(0x40000011)
+#define STATUS_EVENT_DONE cpu_to_le32(0x40000012)
+#define STATUS_EVENT_PENDING cpu_to_le32(0x40000013)
+#define STATUS_CHECKING_FILE_SYSTEM cpu_to_le32(0x40000014)
+#define STATUS_FATAL_APP_EXIT cpu_to_le32(0x40000015)
+#define STATUS_PREDEFINED_HANDLE cpu_to_le32(0x40000016)
+#define STATUS_WAS_UNLOCKED cpu_to_le32(0x40000017)
+#define STATUS_SERVICE_NOTIFICATION cpu_to_le32(0x40000018)
+#define STATUS_WAS_LOCKED cpu_to_le32(0x40000019)
+#define STATUS_LOG_HARD_ERROR cpu_to_le32(0x4000001A)
+#define STATUS_ALREADY_WIN32 cpu_to_le32(0x4000001B)
+#define STATUS_WX86_UNSIMULATE cpu_to_le32(0x4000001C)
+#define STATUS_WX86_CONTINUE cpu_to_le32(0x4000001D)
+#define STATUS_WX86_SINGLE_STEP cpu_to_le32(0x4000001E)
+#define STATUS_WX86_BREAKPOINT cpu_to_le32(0x4000001F)
+#define STATUS_WX86_EXCEPTION_CONTINUE cpu_to_le32(0x40000020)
+#define STATUS_WX86_EXCEPTION_LASTCHANCE cpu_to_le32(0x40000021)
+#define STATUS_WX86_EXCEPTION_CHAIN cpu_to_le32(0x40000022)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE cpu_to_le32(0x40000023)
+#define STATUS_NO_YIELD_PERFORMED cpu_to_le32(0x40000024)
+#define STATUS_TIMER_RESUME_IGNORED cpu_to_le32(0x40000025)
+#define STATUS_ARBITRATION_UNHANDLED cpu_to_le32(0x40000026)
+#define STATUS_CARDBUS_NOT_SUPPORTED cpu_to_le32(0x40000027)
+#define STATUS_WX86_CREATEWX86TIB cpu_to_le32(0x40000028)
+#define STATUS_MP_PROCESSOR_MISMATCH cpu_to_le32(0x40000029)
+#define STATUS_HIBERNATED cpu_to_le32(0x4000002A)
+#define STATUS_RESUME_HIBERNATION cpu_to_le32(0x4000002B)
+#define STATUS_FIRMWARE_UPDATED cpu_to_le32(0x4000002C)
+#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES cpu_to_le32(0x4000002D)
+#define STATUS_MESSAGE_RETRIEVED cpu_to_le32(0x4000002E)
+#define STATUS_SYSTEM_POWERSTATE_TRANSITION cpu_to_le32(0x4000002F)
+#define STATUS_ALPC_CHECK_COMPLETION_LIST cpu_to_le32(0x40000030)
+#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION cpu_to_le32(0x40000031)
+#define STATUS_ACCESS_AUDIT_BY_POLICY cpu_to_le32(0x40000032)
+#define STATUS_ABANDON_HIBERFILE cpu_to_le32(0x40000033)
+#define STATUS_BIZRULES_NOT_ENABLED cpu_to_le32(0x40000034)
+#define STATUS_WAKE_SYSTEM cpu_to_le32(0x40000294)
+#define STATUS_DS_SHUTTING_DOWN cpu_to_le32(0x40000370)
+#define DBG_REPLY_LATER cpu_to_le32(0x40010001)
+#define DBG_UNABLE_TO_PROVIDE_HANDLE cpu_to_le32(0x40010002)
+#define DBG_TERMINATE_THREAD cpu_to_le32(0x40010003)
+#define DBG_TERMINATE_PROCESS cpu_to_le32(0x40010004)
+#define DBG_CONTROL_C cpu_to_le32(0x40010005)
+#define DBG_PRINTEXCEPTION_C cpu_to_le32(0x40010006)
+#define DBG_RIPEXCEPTION cpu_to_le32(0x40010007)
+#define DBG_CONTROL_BREAK cpu_to_le32(0x40010008)
+#define DBG_COMMAND_EXCEPTION cpu_to_le32(0x40010009)
+#define RPC_NT_UUID_LOCAL_ONLY cpu_to_le32(0x40020056)
+#define RPC_NT_SEND_INCOMPLETE cpu_to_le32(0x400200AF)
+#define STATUS_CTX_CDM_CONNECT cpu_to_le32(0x400A0004)
+#define STATUS_CTX_CDM_DISCONNECT cpu_to_le32(0x400A0005)
+#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT cpu_to_le32(0x4015000D)
+#define STATUS_RECOVERY_NOT_NEEDED cpu_to_le32(0x40190034)
+#define STATUS_RM_ALREADY_STARTED cpu_to_le32(0x40190035)
+#define STATUS_LOG_NO_RESTART cpu_to_le32(0x401A000C)
+#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST cpu_to_le32(0x401B00EC)
+#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED cpu_to_le32(0x401E000A)
+#define STATUS_GRAPHICS_DRIVER_MISMATCH cpu_to_le32(0x401E0117)
+#define STATUS_GRAPHICS_MODE_NOT_PINNED cpu_to_le32(0x401E0307)
+#define STATUS_GRAPHICS_NO_PREFERRED_MODE cpu_to_le32(0x401E031E)
+#define STATUS_GRAPHICS_DATASET_IS_EMPTY cpu_to_le32(0x401E034B)
+#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET cpu_to_le32(0x401E034C)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED cpu_to_le32(0x401E0351)
+#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS cpu_to_le32(0x401E042F)
+#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED cpu_to_le32(0x401E0437)
+#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY cpu_to_le32(0x401E0439)
+#define STATUS_GRAPHICS_START_DEFERRED cpu_to_le32(0x401E043A)
+#define STATUS_NDIS_INDICATION_REQUIRED cpu_to_le32(0x40230001)
+#define STATUS_GUARD_PAGE_VIOLATION cpu_to_le32(0x80000001)
+#define STATUS_DATATYPE_MISALIGNMENT cpu_to_le32(0x80000002)
+#define STATUS_BREAKPOINT cpu_to_le32(0x80000003)
+#define STATUS_SINGLE_STEP cpu_to_le32(0x80000004)
+#define STATUS_BUFFER_OVERFLOW cpu_to_le32(0x80000005)
+#define STATUS_NO_MORE_FILES cpu_to_le32(0x80000006)
+#define STATUS_WAKE_SYSTEM_DEBUGGER cpu_to_le32(0x80000007)
+#define STATUS_HANDLES_CLOSED cpu_to_le32(0x8000000A)
+#define STATUS_NO_INHERITANCE cpu_to_le32(0x8000000B)
+#define STATUS_GUID_SUBSTITUTION_MADE cpu_to_le32(0x8000000C)
+#define STATUS_PARTIAL_COPY cpu_to_le32(0x8000000D)
+#define STATUS_DEVICE_PAPER_EMPTY cpu_to_le32(0x8000000E)
+#define STATUS_DEVICE_POWERED_OFF cpu_to_le32(0x8000000F)
+#define STATUS_DEVICE_OFF_LINE cpu_to_le32(0x80000010)
+#define STATUS_DEVICE_BUSY cpu_to_le32(0x80000011)
+#define STATUS_NO_MORE_EAS cpu_to_le32(0x80000012)
+#define STATUS_INVALID_EA_NAME cpu_to_le32(0x80000013)
+#define STATUS_EA_LIST_INCONSISTENT cpu_to_le32(0x80000014)
+#define STATUS_INVALID_EA_FLAG cpu_to_le32(0x80000015)
+#define STATUS_VERIFY_REQUIRED cpu_to_le32(0x80000016)
+#define STATUS_EXTRANEOUS_INFORMATION cpu_to_le32(0x80000017)
+#define STATUS_RXACT_COMMIT_NECESSARY cpu_to_le32(0x80000018)
+#define STATUS_NO_MORE_ENTRIES cpu_to_le32(0x8000001A)
+#define STATUS_FILEMARK_DETECTED cpu_to_le32(0x8000001B)
+#define STATUS_MEDIA_CHANGED cpu_to_le32(0x8000001C)
+#define STATUS_BUS_RESET cpu_to_le32(0x8000001D)
+#define STATUS_END_OF_MEDIA cpu_to_le32(0x8000001E)
+#define STATUS_BEGINNING_OF_MEDIA cpu_to_le32(0x8000001F)
+#define STATUS_MEDIA_CHECK cpu_to_le32(0x80000020)
+#define STATUS_SETMARK_DETECTED cpu_to_le32(0x80000021)
+#define STATUS_NO_DATA_DETECTED cpu_to_le32(0x80000022)
+#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES cpu_to_le32(0x80000023)
+#define STATUS_SERVER_HAS_OPEN_HANDLES cpu_to_le32(0x80000024)
+#define STATUS_ALREADY_DISCONNECTED cpu_to_le32(0x80000025)
+#define STATUS_LONGJUMP cpu_to_le32(0x80000026)
+#define STATUS_CLEANER_CARTRIDGE_INSTALLED cpu_to_le32(0x80000027)
+#define STATUS_PLUGPLAY_QUERY_VETOED cpu_to_le32(0x80000028)
+#define STATUS_UNWIND_CONSOLIDATE cpu_to_le32(0x80000029)
+#define STATUS_REGISTRY_HIVE_RECOVERED cpu_to_le32(0x8000002A)
+#define STATUS_DLL_MIGHT_BE_INSECURE cpu_to_le32(0x8000002B)
+#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE cpu_to_le32(0x8000002C)
+#define STATUS_STOPPED_ON_SYMLINK cpu_to_le32(0x8000002D)
+#define STATUS_DEVICE_REQUIRES_CLEANING cpu_to_le32(0x80000288)
+#define STATUS_DEVICE_DOOR_OPEN cpu_to_le32(0x80000289)
+#define STATUS_DATA_LOST_REPAIR cpu_to_le32(0x80000803)
+#define DBG_EXCEPTION_NOT_HANDLED cpu_to_le32(0x80010001)
+#define STATUS_CLUSTER_NODE_ALREADY_UP cpu_to_le32(0x80130001)
+#define STATUS_CLUSTER_NODE_ALREADY_DOWN cpu_to_le32(0x80130002)
+#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE cpu_to_le32(0x80130003)
+#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE cpu_to_le32(0x80130004)
+#define STATUS_CLUSTER_NODE_ALREADY_MEMBER cpu_to_le32(0x80130005)
+#define STATUS_COULD_NOT_RESIZE_LOG cpu_to_le32(0x80190009)
+#define STATUS_NO_TXF_METADATA cpu_to_le32(0x80190029)
+#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN cpu_to_le32(0x80190031)
+#define STATUS_TXF_METADATA_ALREADY_PRESENT cpu_to_le32(0x80190041)
+#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET cpu_to_le32(0x80190042)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED cpu_to_le32(0x801B00EB)
+#define STATUS_FLT_BUFFER_TOO_SMALL cpu_to_le32(0x801C0001)
+#define STATUS_FVE_PARTIAL_METADATA cpu_to_le32(0x80210001)
+#define STATUS_UNSUCCESSFUL cpu_to_le32(0xC0000001)
+#define STATUS_NOT_IMPLEMENTED cpu_to_le32(0xC0000002)
+#define STATUS_INVALID_INFO_CLASS cpu_to_le32(0xC0000003)
+#define STATUS_INFO_LENGTH_MISMATCH cpu_to_le32(0xC0000004)
+#define STATUS_ACCESS_VIOLATION cpu_to_le32(0xC0000005)
+#define STATUS_IN_PAGE_ERROR cpu_to_le32(0xC0000006)
+#define STATUS_PAGEFILE_QUOTA cpu_to_le32(0xC0000007)
+#define STATUS_INVALID_HANDLE cpu_to_le32(0xC0000008)
+#define STATUS_BAD_INITIAL_STACK cpu_to_le32(0xC0000009)
+#define STATUS_BAD_INITIAL_PC cpu_to_le32(0xC000000A)
+#define STATUS_INVALID_CID cpu_to_le32(0xC000000B)
+#define STATUS_TIMER_NOT_CANCELED cpu_to_le32(0xC000000C)
+#define STATUS_INVALID_PARAMETER cpu_to_le32(0xC000000D)
+#define STATUS_NO_SUCH_DEVICE cpu_to_le32(0xC000000E)
+#define STATUS_NO_SUCH_FILE cpu_to_le32(0xC000000F)
+#define STATUS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0000010)
+#define STATUS_END_OF_FILE cpu_to_le32(0xC0000011)
+#define STATUS_WRONG_VOLUME cpu_to_le32(0xC0000012)
+#define STATUS_NO_MEDIA_IN_DEVICE cpu_to_le32(0xC0000013)
+#define STATUS_UNRECOGNIZED_MEDIA cpu_to_le32(0xC0000014)
+#define STATUS_NONEXISTENT_SECTOR cpu_to_le32(0xC0000015)
+#define STATUS_MORE_PROCESSING_REQUIRED cpu_to_le32(0xC0000016)
+#define STATUS_NO_MEMORY cpu_to_le32(0xC0000017)
+#define STATUS_CONFLICTING_ADDRESSES cpu_to_le32(0xC0000018)
+#define STATUS_NOT_MAPPED_VIEW cpu_to_le32(0xC0000019)
+#define STATUS_UNABLE_TO_FREE_VM cpu_to_le32(0xC000001A)
+#define STATUS_UNABLE_TO_DELETE_SECTION cpu_to_le32(0xC000001B)
+#define STATUS_INVALID_SYSTEM_SERVICE cpu_to_le32(0xC000001C)
+#define STATUS_ILLEGAL_INSTRUCTION cpu_to_le32(0xC000001D)
+#define STATUS_INVALID_LOCK_SEQUENCE cpu_to_le32(0xC000001E)
+#define STATUS_INVALID_VIEW_SIZE cpu_to_le32(0xC000001F)
+#define STATUS_INVALID_FILE_FOR_SECTION cpu_to_le32(0xC0000020)
+#define STATUS_ALREADY_COMMITTED cpu_to_le32(0xC0000021)
+#define STATUS_ACCESS_DENIED cpu_to_le32(0xC0000022)
+#define STATUS_BUFFER_TOO_SMALL cpu_to_le32(0xC0000023)
+#define STATUS_OBJECT_TYPE_MISMATCH cpu_to_le32(0xC0000024)
+#define STATUS_NONCONTINUABLE_EXCEPTION cpu_to_le32(0xC0000025)
+#define STATUS_INVALID_DISPOSITION cpu_to_le32(0xC0000026)
+#define STATUS_UNWIND cpu_to_le32(0xC0000027)
+#define STATUS_BAD_STACK cpu_to_le32(0xC0000028)
+#define STATUS_INVALID_UNWIND_TARGET cpu_to_le32(0xC0000029)
+#define STATUS_NOT_LOCKED cpu_to_le32(0xC000002A)
+#define STATUS_PARITY_ERROR cpu_to_le32(0xC000002B)
+#define STATUS_UNABLE_TO_DECOMMIT_VM cpu_to_le32(0xC000002C)
+#define STATUS_NOT_COMMITTED cpu_to_le32(0xC000002D)
+#define STATUS_INVALID_PORT_ATTRIBUTES cpu_to_le32(0xC000002E)
+#define STATUS_PORT_MESSAGE_TOO_LONG cpu_to_le32(0xC000002F)
+#define STATUS_INVALID_PARAMETER_MIX cpu_to_le32(0xC0000030)
+#define STATUS_INVALID_QUOTA_LOWER cpu_to_le32(0xC0000031)
+#define STATUS_DISK_CORRUPT_ERROR cpu_to_le32(0xC0000032)
+#define STATUS_OBJECT_NAME_INVALID cpu_to_le32(0xC0000033)
+#define STATUS_OBJECT_NAME_NOT_FOUND cpu_to_le32(0xC0000034)
+#define STATUS_OBJECT_NAME_COLLISION cpu_to_le32(0xC0000035)
+#define STATUS_PORT_DISCONNECTED cpu_to_le32(0xC0000037)
+#define STATUS_DEVICE_ALREADY_ATTACHED cpu_to_le32(0xC0000038)
+#define STATUS_OBJECT_PATH_INVALID cpu_to_le32(0xC0000039)
+#define STATUS_OBJECT_PATH_NOT_FOUND cpu_to_le32(0xC000003A)
+#define STATUS_OBJECT_PATH_SYNTAX_BAD cpu_to_le32(0xC000003B)
+#define STATUS_DATA_OVERRUN cpu_to_le32(0xC000003C)
+#define STATUS_DATA_LATE_ERROR cpu_to_le32(0xC000003D)
+#define STATUS_DATA_ERROR cpu_to_le32(0xC000003E)
+#define STATUS_CRC_ERROR cpu_to_le32(0xC000003F)
+#define STATUS_SECTION_TOO_BIG cpu_to_le32(0xC0000040)
+#define STATUS_PORT_CONNECTION_REFUSED cpu_to_le32(0xC0000041)
+#define STATUS_INVALID_PORT_HANDLE cpu_to_le32(0xC0000042)
+#define STATUS_SHARING_VIOLATION cpu_to_le32(0xC0000043)
+#define STATUS_QUOTA_EXCEEDED cpu_to_le32(0xC0000044)
+#define STATUS_INVALID_PAGE_PROTECTION cpu_to_le32(0xC0000045)
+#define STATUS_MUTANT_NOT_OWNED cpu_to_le32(0xC0000046)
+#define STATUS_SEMAPHORE_LIMIT_EXCEEDED cpu_to_le32(0xC0000047)
+#define STATUS_PORT_ALREADY_SET cpu_to_le32(0xC0000048)
+#define STATUS_SECTION_NOT_IMAGE cpu_to_le32(0xC0000049)
+#define STATUS_SUSPEND_COUNT_EXCEEDED cpu_to_le32(0xC000004A)
+#define STATUS_THREAD_IS_TERMINATING cpu_to_le32(0xC000004B)
+#define STATUS_BAD_WORKING_SET_LIMIT cpu_to_le32(0xC000004C)
+#define STATUS_INCOMPATIBLE_FILE_MAP cpu_to_le32(0xC000004D)
+#define STATUS_SECTION_PROTECTION cpu_to_le32(0xC000004E)
+#define STATUS_EAS_NOT_SUPPORTED cpu_to_le32(0xC000004F)
+#define STATUS_EA_TOO_LARGE cpu_to_le32(0xC0000050)
+#define STATUS_NONEXISTENT_EA_ENTRY cpu_to_le32(0xC0000051)
+#define STATUS_NO_EAS_ON_FILE cpu_to_le32(0xC0000052)
+#define STATUS_EA_CORRUPT_ERROR cpu_to_le32(0xC0000053)
+#define STATUS_FILE_LOCK_CONFLICT cpu_to_le32(0xC0000054)
+#define STATUS_LOCK_NOT_GRANTED cpu_to_le32(0xC0000055)
+#define STATUS_DELETE_PENDING cpu_to_le32(0xC0000056)
+#define STATUS_CTL_FILE_NOT_SUPPORTED cpu_to_le32(0xC0000057)
+#define STATUS_UNKNOWN_REVISION cpu_to_le32(0xC0000058)
+#define STATUS_REVISION_MISMATCH cpu_to_le32(0xC0000059)
+#define STATUS_INVALID_OWNER cpu_to_le32(0xC000005A)
+#define STATUS_INVALID_PRIMARY_GROUP cpu_to_le32(0xC000005B)
+#define STATUS_NO_IMPERSONATION_TOKEN cpu_to_le32(0xC000005C)
+#define STATUS_CANT_DISABLE_MANDATORY cpu_to_le32(0xC000005D)
+#define STATUS_NO_LOGON_SERVERS cpu_to_le32(0xC000005E)
+#define STATUS_NO_SUCH_LOGON_SESSION cpu_to_le32(0xC000005F)
+#define STATUS_NO_SUCH_PRIVILEGE cpu_to_le32(0xC0000060)
+#define STATUS_PRIVILEGE_NOT_HELD cpu_to_le32(0xC0000061)
+#define STATUS_INVALID_ACCOUNT_NAME cpu_to_le32(0xC0000062)
+#define STATUS_USER_EXISTS cpu_to_le32(0xC0000063)
+#define STATUS_NO_SUCH_USER cpu_to_le32(0xC0000064)
+#define STATUS_GROUP_EXISTS cpu_to_le32(0xC0000065)
+#define STATUS_NO_SUCH_GROUP cpu_to_le32(0xC0000066)
+#define STATUS_MEMBER_IN_GROUP cpu_to_le32(0xC0000067)
+#define STATUS_MEMBER_NOT_IN_GROUP cpu_to_le32(0xC0000068)
+#define STATUS_LAST_ADMIN cpu_to_le32(0xC0000069)
+#define STATUS_WRONG_PASSWORD cpu_to_le32(0xC000006A)
+#define STATUS_ILL_FORMED_PASSWORD cpu_to_le32(0xC000006B)
+#define STATUS_PASSWORD_RESTRICTION cpu_to_le32(0xC000006C)
+#define STATUS_LOGON_FAILURE cpu_to_le32(0xC000006D)
+#define STATUS_ACCOUNT_RESTRICTION cpu_to_le32(0xC000006E)
+#define STATUS_INVALID_LOGON_HOURS cpu_to_le32(0xC000006F)
+#define STATUS_INVALID_WORKSTATION cpu_to_le32(0xC0000070)
+#define STATUS_PASSWORD_EXPIRED cpu_to_le32(0xC0000071)
+#define STATUS_ACCOUNT_DISABLED cpu_to_le32(0xC0000072)
+#define STATUS_NONE_MAPPED cpu_to_le32(0xC0000073)
+#define STATUS_TOO_MANY_LUIDS_REQUESTED cpu_to_le32(0xC0000074)
+#define STATUS_LUIDS_EXHAUSTED cpu_to_le32(0xC0000075)
+#define STATUS_INVALID_SUB_AUTHORITY cpu_to_le32(0xC0000076)
+#define STATUS_INVALID_ACL cpu_to_le32(0xC0000077)
+#define STATUS_INVALID_SID cpu_to_le32(0xC0000078)
+#define STATUS_INVALID_SECURITY_DESCR cpu_to_le32(0xC0000079)
+#define STATUS_PROCEDURE_NOT_FOUND cpu_to_le32(0xC000007A)
+#define STATUS_INVALID_IMAGE_FORMAT cpu_to_le32(0xC000007B)
+#define STATUS_NO_TOKEN cpu_to_le32(0xC000007C)
+#define STATUS_BAD_INHERITANCE_ACL cpu_to_le32(0xC000007D)
+#define STATUS_RANGE_NOT_LOCKED cpu_to_le32(0xC000007E)
+#define STATUS_DISK_FULL cpu_to_le32(0xC000007F)
+#define STATUS_SERVER_DISABLED cpu_to_le32(0xC0000080)
+#define STATUS_SERVER_NOT_DISABLED cpu_to_le32(0xC0000081)
+#define STATUS_TOO_MANY_GUIDS_REQUESTED cpu_to_le32(0xC0000082)
+#define STATUS_GUIDS_EXHAUSTED cpu_to_le32(0xC0000083)
+#define STATUS_INVALID_ID_AUTHORITY cpu_to_le32(0xC0000084)
+#define STATUS_AGENTS_EXHAUSTED cpu_to_le32(0xC0000085)
+#define STATUS_INVALID_VOLUME_LABEL cpu_to_le32(0xC0000086)
+#define STATUS_SECTION_NOT_EXTENDED cpu_to_le32(0xC0000087)
+#define STATUS_NOT_MAPPED_DATA cpu_to_le32(0xC0000088)
+#define STATUS_RESOURCE_DATA_NOT_FOUND cpu_to_le32(0xC0000089)
+#define STATUS_RESOURCE_TYPE_NOT_FOUND cpu_to_le32(0xC000008A)
+#define STATUS_RESOURCE_NAME_NOT_FOUND cpu_to_le32(0xC000008B)
+#define STATUS_ARRAY_BOUNDS_EXCEEDED cpu_to_le32(0xC000008C)
+#define STATUS_FLOAT_DENORMAL_OPERAND cpu_to_le32(0xC000008D)
+#define STATUS_FLOAT_DIVIDE_BY_ZERO cpu_to_le32(0xC000008E)
+#define STATUS_FLOAT_INEXACT_RESULT cpu_to_le32(0xC000008F)
+#define STATUS_FLOAT_INVALID_OPERATION cpu_to_le32(0xC0000090)
+#define STATUS_FLOAT_OVERFLOW cpu_to_le32(0xC0000091)
+#define STATUS_FLOAT_STACK_CHECK cpu_to_le32(0xC0000092)
+#define STATUS_FLOAT_UNDERFLOW cpu_to_le32(0xC0000093)
+#define STATUS_INTEGER_DIVIDE_BY_ZERO cpu_to_le32(0xC0000094)
+#define STATUS_INTEGER_OVERFLOW cpu_to_le32(0xC0000095)
+#define STATUS_PRIVILEGED_INSTRUCTION cpu_to_le32(0xC0000096)
+#define STATUS_TOO_MANY_PAGING_FILES cpu_to_le32(0xC0000097)
+#define STATUS_FILE_INVALID cpu_to_le32(0xC0000098)
+#define STATUS_ALLOTTED_SPACE_EXCEEDED cpu_to_le32(0xC0000099)
+#define STATUS_INSUFFICIENT_RESOURCES cpu_to_le32(0xC000009A)
+#define STATUS_DFS_EXIT_PATH_FOUND cpu_to_le32(0xC000009B)
+#define STATUS_DEVICE_DATA_ERROR cpu_to_le32(0xC000009C)
+#define STATUS_DEVICE_NOT_CONNECTED cpu_to_le32(0xC000009D)
+#define STATUS_DEVICE_POWER_FAILURE cpu_to_le32(0xC000009E)
+#define STATUS_FREE_VM_NOT_AT_BASE cpu_to_le32(0xC000009F)
+#define STATUS_MEMORY_NOT_ALLOCATED cpu_to_le32(0xC00000A0)
+#define STATUS_WORKING_SET_QUOTA cpu_to_le32(0xC00000A1)
+#define STATUS_MEDIA_WRITE_PROTECTED cpu_to_le32(0xC00000A2)
+#define STATUS_DEVICE_NOT_READY cpu_to_le32(0xC00000A3)
+#define STATUS_INVALID_GROUP_ATTRIBUTES cpu_to_le32(0xC00000A4)
+#define STATUS_BAD_IMPERSONATION_LEVEL cpu_to_le32(0xC00000A5)
+#define STATUS_CANT_OPEN_ANONYMOUS cpu_to_le32(0xC00000A6)
+#define STATUS_BAD_VALIDATION_CLASS cpu_to_le32(0xC00000A7)
+#define STATUS_BAD_TOKEN_TYPE cpu_to_le32(0xC00000A8)
+#define STATUS_BAD_MASTER_BOOT_RECORD cpu_to_le32(0xC00000A9)
+#define STATUS_INSTRUCTION_MISALIGNMENT cpu_to_le32(0xC00000AA)
+#define STATUS_INSTANCE_NOT_AVAILABLE cpu_to_le32(0xC00000AB)
+#define STATUS_PIPE_NOT_AVAILABLE cpu_to_le32(0xC00000AC)
+#define STATUS_INVALID_PIPE_STATE cpu_to_le32(0xC00000AD)
+#define STATUS_PIPE_BUSY cpu_to_le32(0xC00000AE)
+#define STATUS_ILLEGAL_FUNCTION cpu_to_le32(0xC00000AF)
+#define STATUS_PIPE_DISCONNECTED cpu_to_le32(0xC00000B0)
+#define STATUS_PIPE_CLOSING cpu_to_le32(0xC00000B1)
+#define STATUS_PIPE_CONNECTED cpu_to_le32(0xC00000B2)
+#define STATUS_PIPE_LISTENING cpu_to_le32(0xC00000B3)
+#define STATUS_INVALID_READ_MODE cpu_to_le32(0xC00000B4)
+#define STATUS_IO_TIMEOUT cpu_to_le32(0xC00000B5)
+#define STATUS_FILE_FORCED_CLOSED cpu_to_le32(0xC00000B6)
+#define STATUS_PROFILING_NOT_STARTED cpu_to_le32(0xC00000B7)
+#define STATUS_PROFILING_NOT_STOPPED cpu_to_le32(0xC00000B8)
+#define STATUS_COULD_NOT_INTERPRET cpu_to_le32(0xC00000B9)
+#define STATUS_FILE_IS_A_DIRECTORY cpu_to_le32(0xC00000BA)
+#define STATUS_NOT_SUPPORTED cpu_to_le32(0xC00000BB)
+#define STATUS_REMOTE_NOT_LISTENING cpu_to_le32(0xC00000BC)
+#define STATUS_DUPLICATE_NAME cpu_to_le32(0xC00000BD)
+#define STATUS_BAD_NETWORK_PATH cpu_to_le32(0xC00000BE)
+#define STATUS_NETWORK_BUSY cpu_to_le32(0xC00000BF)
+#define STATUS_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC00000C0)
+#define STATUS_TOO_MANY_COMMANDS cpu_to_le32(0xC00000C1)
+#define STATUS_ADAPTER_HARDWARE_ERROR cpu_to_le32(0xC00000C2)
+#define STATUS_INVALID_NETWORK_RESPONSE cpu_to_le32(0xC00000C3)
+#define STATUS_UNEXPECTED_NETWORK_ERROR cpu_to_le32(0xC00000C4)
+#define STATUS_BAD_REMOTE_ADAPTER cpu_to_le32(0xC00000C5)
+#define STATUS_PRINT_QUEUE_FULL cpu_to_le32(0xC00000C6)
+#define STATUS_NO_SPOOL_SPACE cpu_to_le32(0xC00000C7)
+#define STATUS_PRINT_CANCELLED cpu_to_le32(0xC00000C8)
+#define STATUS_NETWORK_NAME_DELETED cpu_to_le32(0xC00000C9)
+#define STATUS_NETWORK_ACCESS_DENIED cpu_to_le32(0xC00000CA)
+#define STATUS_BAD_DEVICE_TYPE cpu_to_le32(0xC00000CB)
+#define STATUS_BAD_NETWORK_NAME cpu_to_le32(0xC00000CC)
+#define STATUS_TOO_MANY_NAMES cpu_to_le32(0xC00000CD)
+#define STATUS_TOO_MANY_SESSIONS cpu_to_le32(0xC00000CE)
+#define STATUS_SHARING_PAUSED cpu_to_le32(0xC00000CF)
+#define STATUS_REQUEST_NOT_ACCEPTED cpu_to_le32(0xC00000D0)
+#define STATUS_REDIRECTOR_PAUSED cpu_to_le32(0xC00000D1)
+#define STATUS_NET_WRITE_FAULT cpu_to_le32(0xC00000D2)
+#define STATUS_PROFILING_AT_LIMIT cpu_to_le32(0xC00000D3)
+#define STATUS_NOT_SAME_DEVICE cpu_to_le32(0xC00000D4)
+#define STATUS_FILE_RENAMED cpu_to_le32(0xC00000D5)
+#define STATUS_VIRTUAL_CIRCUIT_CLOSED cpu_to_le32(0xC00000D6)
+#define STATUS_NO_SECURITY_ON_OBJECT cpu_to_le32(0xC00000D7)
+#define STATUS_CANT_WAIT cpu_to_le32(0xC00000D8)
+#define STATUS_PIPE_EMPTY cpu_to_le32(0xC00000D9)
+#define STATUS_CANT_ACCESS_DOMAIN_INFO cpu_to_le32(0xC00000DA)
+#define STATUS_CANT_TERMINATE_SELF cpu_to_le32(0xC00000DB)
+#define STATUS_INVALID_SERVER_STATE cpu_to_le32(0xC00000DC)
+#define STATUS_INVALID_DOMAIN_STATE cpu_to_le32(0xC00000DD)
+#define STATUS_INVALID_DOMAIN_ROLE cpu_to_le32(0xC00000DE)
+#define STATUS_NO_SUCH_DOMAIN cpu_to_le32(0xC00000DF)
+#define STATUS_DOMAIN_EXISTS cpu_to_le32(0xC00000E0)
+#define STATUS_DOMAIN_LIMIT_EXCEEDED cpu_to_le32(0xC00000E1)
+#define STATUS_OPLOCK_NOT_GRANTED cpu_to_le32(0xC00000E2)
+#define STATUS_INVALID_OPLOCK_PROTOCOL cpu_to_le32(0xC00000E3)
+#define STATUS_INTERNAL_DB_CORRUPTION cpu_to_le32(0xC00000E4)
+#define STATUS_INTERNAL_ERROR cpu_to_le32(0xC00000E5)
+#define STATUS_GENERIC_NOT_MAPPED cpu_to_le32(0xC00000E6)
+#define STATUS_BAD_DESCRIPTOR_FORMAT cpu_to_le32(0xC00000E7)
+#define STATUS_INVALID_USER_BUFFER cpu_to_le32(0xC00000E8)
+#define STATUS_UNEXPECTED_IO_ERROR cpu_to_le32(0xC00000E9)
+#define STATUS_UNEXPECTED_MM_CREATE_ERR cpu_to_le32(0xC00000EA)
+#define STATUS_UNEXPECTED_MM_MAP_ERROR cpu_to_le32(0xC00000EB)
+#define STATUS_UNEXPECTED_MM_EXTEND_ERR cpu_to_le32(0xC00000EC)
+#define STATUS_NOT_LOGON_PROCESS cpu_to_le32(0xC00000ED)
+#define STATUS_LOGON_SESSION_EXISTS cpu_to_le32(0xC00000EE)
+#define STATUS_INVALID_PARAMETER_1 cpu_to_le32(0xC00000EF)
+#define STATUS_INVALID_PARAMETER_2 cpu_to_le32(0xC00000F0)
+#define STATUS_INVALID_PARAMETER_3 cpu_to_le32(0xC00000F1)
+#define STATUS_INVALID_PARAMETER_4 cpu_to_le32(0xC00000F2)
+#define STATUS_INVALID_PARAMETER_5 cpu_to_le32(0xC00000F3)
+#define STATUS_INVALID_PARAMETER_6 cpu_to_le32(0xC00000F4)
+#define STATUS_INVALID_PARAMETER_7 cpu_to_le32(0xC00000F5)
+#define STATUS_INVALID_PARAMETER_8 cpu_to_le32(0xC00000F6)
+#define STATUS_INVALID_PARAMETER_9 cpu_to_le32(0xC00000F7)
+#define STATUS_INVALID_PARAMETER_10 cpu_to_le32(0xC00000F8)
+#define STATUS_INVALID_PARAMETER_11 cpu_to_le32(0xC00000F9)
+#define STATUS_INVALID_PARAMETER_12 cpu_to_le32(0xC00000FA)
+#define STATUS_REDIRECTOR_NOT_STARTED cpu_to_le32(0xC00000FB)
+#define STATUS_REDIRECTOR_STARTED cpu_to_le32(0xC00000FC)
+#define STATUS_STACK_OVERFLOW cpu_to_le32(0xC00000FD)
+#define STATUS_NO_SUCH_PACKAGE cpu_to_le32(0xC00000FE)
+#define STATUS_BAD_FUNCTION_TABLE cpu_to_le32(0xC00000FF)
+#define STATUS_VARIABLE_NOT_FOUND cpu_to_le32(0xC0000100)
+#define STATUS_DIRECTORY_NOT_EMPTY cpu_to_le32(0xC0000101)
+#define STATUS_FILE_CORRUPT_ERROR cpu_to_le32(0xC0000102)
+#define STATUS_NOT_A_DIRECTORY cpu_to_le32(0xC0000103)
+#define STATUS_BAD_LOGON_SESSION_STATE cpu_to_le32(0xC0000104)
+#define STATUS_LOGON_SESSION_COLLISION cpu_to_le32(0xC0000105)
+#define STATUS_NAME_TOO_LONG cpu_to_le32(0xC0000106)
+#define STATUS_FILES_OPEN cpu_to_le32(0xC0000107)
+#define STATUS_CONNECTION_IN_USE cpu_to_le32(0xC0000108)
+#define STATUS_MESSAGE_NOT_FOUND cpu_to_le32(0xC0000109)
+#define STATUS_PROCESS_IS_TERMINATING cpu_to_le32(0xC000010A)
+#define STATUS_INVALID_LOGON_TYPE cpu_to_le32(0xC000010B)
+#define STATUS_NO_GUID_TRANSLATION cpu_to_le32(0xC000010C)
+#define STATUS_CANNOT_IMPERSONATE cpu_to_le32(0xC000010D)
+#define STATUS_IMAGE_ALREADY_LOADED cpu_to_le32(0xC000010E)
+#define STATUS_ABIOS_NOT_PRESENT cpu_to_le32(0xC000010F)
+#define STATUS_ABIOS_LID_NOT_EXIST cpu_to_le32(0xC0000110)
+#define STATUS_ABIOS_LID_ALREADY_OWNED cpu_to_le32(0xC0000111)
+#define STATUS_ABIOS_NOT_LID_OWNER cpu_to_le32(0xC0000112)
+#define STATUS_ABIOS_INVALID_COMMAND cpu_to_le32(0xC0000113)
+#define STATUS_ABIOS_INVALID_LID cpu_to_le32(0xC0000114)
+#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE cpu_to_le32(0xC0000115)
+#define STATUS_ABIOS_INVALID_SELECTOR cpu_to_le32(0xC0000116)
+#define STATUS_NO_LDT cpu_to_le32(0xC0000117)
+#define STATUS_INVALID_LDT_SIZE cpu_to_le32(0xC0000118)
+#define STATUS_INVALID_LDT_OFFSET cpu_to_le32(0xC0000119)
+#define STATUS_INVALID_LDT_DESCRIPTOR cpu_to_le32(0xC000011A)
+#define STATUS_INVALID_IMAGE_NE_FORMAT cpu_to_le32(0xC000011B)
+#define STATUS_RXACT_INVALID_STATE cpu_to_le32(0xC000011C)
+#define STATUS_RXACT_COMMIT_FAILURE cpu_to_le32(0xC000011D)
+#define STATUS_MAPPED_FILE_SIZE_ZERO cpu_to_le32(0xC000011E)
+#define STATUS_TOO_MANY_OPENED_FILES cpu_to_le32(0xC000011F)
+#define STATUS_CANCELLED cpu_to_le32(0xC0000120)
+#define STATUS_CANNOT_DELETE cpu_to_le32(0xC0000121)
+#define STATUS_INVALID_COMPUTER_NAME cpu_to_le32(0xC0000122)
+#define STATUS_FILE_DELETED cpu_to_le32(0xC0000123)
+#define STATUS_SPECIAL_ACCOUNT cpu_to_le32(0xC0000124)
+#define STATUS_SPECIAL_GROUP cpu_to_le32(0xC0000125)
+#define STATUS_SPECIAL_USER cpu_to_le32(0xC0000126)
+#define STATUS_MEMBERS_PRIMARY_GROUP cpu_to_le32(0xC0000127)
+#define STATUS_FILE_CLOSED cpu_to_le32(0xC0000128)
+#define STATUS_TOO_MANY_THREADS cpu_to_le32(0xC0000129)
+#define STATUS_THREAD_NOT_IN_PROCESS cpu_to_le32(0xC000012A)
+#define STATUS_TOKEN_ALREADY_IN_USE cpu_to_le32(0xC000012B)
+#define STATUS_PAGEFILE_QUOTA_EXCEEDED cpu_to_le32(0xC000012C)
+#define STATUS_COMMITMENT_LIMIT cpu_to_le32(0xC000012D)
+#define STATUS_INVALID_IMAGE_LE_FORMAT cpu_to_le32(0xC000012E)
+#define STATUS_INVALID_IMAGE_NOT_MZ cpu_to_le32(0xC000012F)
+#define STATUS_INVALID_IMAGE_PROTECT cpu_to_le32(0xC0000130)
+#define STATUS_INVALID_IMAGE_WIN_16 cpu_to_le32(0xC0000131)
+#define STATUS_LOGON_SERVER_CONFLICT cpu_to_le32(0xC0000132)
+#define STATUS_TIME_DIFFERENCE_AT_DC cpu_to_le32(0xC0000133)
+#define STATUS_SYNCHRONIZATION_REQUIRED cpu_to_le32(0xC0000134)
+#define STATUS_DLL_NOT_FOUND cpu_to_le32(0xC0000135)
+#define STATUS_OPEN_FAILED cpu_to_le32(0xC0000136)
+#define STATUS_IO_PRIVILEGE_FAILED cpu_to_le32(0xC0000137)
+#define STATUS_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000138)
+#define STATUS_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000139)
+#define STATUS_CONTROL_C_EXIT cpu_to_le32(0xC000013A)
+#define STATUS_LOCAL_DISCONNECT cpu_to_le32(0xC000013B)
+#define STATUS_REMOTE_DISCONNECT cpu_to_le32(0xC000013C)
+#define STATUS_REMOTE_RESOURCES cpu_to_le32(0xC000013D)
+#define STATUS_LINK_FAILED cpu_to_le32(0xC000013E)
+#define STATUS_LINK_TIMEOUT cpu_to_le32(0xC000013F)
+#define STATUS_INVALID_CONNECTION cpu_to_le32(0xC0000140)
+#define STATUS_INVALID_ADDRESS cpu_to_le32(0xC0000141)
+#define STATUS_DLL_INIT_FAILED cpu_to_le32(0xC0000142)
+#define STATUS_MISSING_SYSTEMFILE cpu_to_le32(0xC0000143)
+#define STATUS_UNHANDLED_EXCEPTION cpu_to_le32(0xC0000144)
+#define STATUS_APP_INIT_FAILURE cpu_to_le32(0xC0000145)
+#define STATUS_PAGEFILE_CREATE_FAILED cpu_to_le32(0xC0000146)
+#define STATUS_NO_PAGEFILE cpu_to_le32(0xC0000147)
+#define STATUS_INVALID_LEVEL cpu_to_le32(0xC0000148)
+#define STATUS_WRONG_PASSWORD_CORE cpu_to_le32(0xC0000149)
+#define STATUS_ILLEGAL_FLOAT_CONTEXT cpu_to_le32(0xC000014A)
+#define STATUS_PIPE_BROKEN cpu_to_le32(0xC000014B)
+#define STATUS_REGISTRY_CORRUPT cpu_to_le32(0xC000014C)
+#define STATUS_REGISTRY_IO_FAILED cpu_to_le32(0xC000014D)
+#define STATUS_NO_EVENT_PAIR cpu_to_le32(0xC000014E)
+#define STATUS_UNRECOGNIZED_VOLUME cpu_to_le32(0xC000014F)
+#define STATUS_SERIAL_NO_DEVICE_INITED cpu_to_le32(0xC0000150)
+#define STATUS_NO_SUCH_ALIAS cpu_to_le32(0xC0000151)
+#define STATUS_MEMBER_NOT_IN_ALIAS cpu_to_le32(0xC0000152)
+#define STATUS_MEMBER_IN_ALIAS cpu_to_le32(0xC0000153)
+#define STATUS_ALIAS_EXISTS cpu_to_le32(0xC0000154)
+#define STATUS_LOGON_NOT_GRANTED cpu_to_le32(0xC0000155)
+#define STATUS_TOO_MANY_SECRETS cpu_to_le32(0xC0000156)
+#define STATUS_SECRET_TOO_LONG cpu_to_le32(0xC0000157)
+#define STATUS_INTERNAL_DB_ERROR cpu_to_le32(0xC0000158)
+#define STATUS_FULLSCREEN_MODE cpu_to_le32(0xC0000159)
+#define STATUS_TOO_MANY_CONTEXT_IDS cpu_to_le32(0xC000015A)
+#define STATUS_LOGON_TYPE_NOT_GRANTED cpu_to_le32(0xC000015B)
+#define STATUS_NOT_REGISTRY_FILE cpu_to_le32(0xC000015C)
+#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000015D)
+#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR cpu_to_le32(0xC000015E)
+#define STATUS_FT_MISSING_MEMBER cpu_to_le32(0xC000015F)
+#define STATUS_ILL_FORMED_SERVICE_ENTRY cpu_to_le32(0xC0000160)
+#define STATUS_ILLEGAL_CHARACTER cpu_to_le32(0xC0000161)
+#define STATUS_UNMAPPABLE_CHARACTER cpu_to_le32(0xC0000162)
+#define STATUS_UNDEFINED_CHARACTER cpu_to_le32(0xC0000163)
+#define STATUS_FLOPPY_VOLUME cpu_to_le32(0xC0000164)
+#define STATUS_FLOPPY_ID_MARK_NOT_FOUND cpu_to_le32(0xC0000165)
+#define STATUS_FLOPPY_WRONG_CYLINDER cpu_to_le32(0xC0000166)
+#define STATUS_FLOPPY_UNKNOWN_ERROR cpu_to_le32(0xC0000167)
+#define STATUS_FLOPPY_BAD_REGISTERS cpu_to_le32(0xC0000168)
+#define STATUS_DISK_RECALIBRATE_FAILED cpu_to_le32(0xC0000169)
+#define STATUS_DISK_OPERATION_FAILED cpu_to_le32(0xC000016A)
+#define STATUS_DISK_RESET_FAILED cpu_to_le32(0xC000016B)
+#define STATUS_SHARED_IRQ_BUSY cpu_to_le32(0xC000016C)
+#define STATUS_FT_ORPHANING cpu_to_le32(0xC000016D)
+#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT cpu_to_le32(0xC000016E)
+#define STATUS_PARTITION_FAILURE cpu_to_le32(0xC0000172)
+#define STATUS_INVALID_BLOCK_LENGTH cpu_to_le32(0xC0000173)
+#define STATUS_DEVICE_NOT_PARTITIONED cpu_to_le32(0xC0000174)
+#define STATUS_UNABLE_TO_LOCK_MEDIA cpu_to_le32(0xC0000175)
+#define STATUS_UNABLE_TO_UNLOAD_MEDIA cpu_to_le32(0xC0000176)
+#define STATUS_EOM_OVERFLOW cpu_to_le32(0xC0000177)
+#define STATUS_NO_MEDIA cpu_to_le32(0xC0000178)
+#define STATUS_NO_SUCH_MEMBER cpu_to_le32(0xC000017A)
+#define STATUS_INVALID_MEMBER cpu_to_le32(0xC000017B)
+#define STATUS_KEY_DELETED cpu_to_le32(0xC000017C)
+#define STATUS_NO_LOG_SPACE cpu_to_le32(0xC000017D)
+#define STATUS_TOO_MANY_SIDS cpu_to_le32(0xC000017E)
+#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000017F)
+#define STATUS_KEY_HAS_CHILDREN cpu_to_le32(0xC0000180)
+#define STATUS_CHILD_MUST_BE_VOLATILE cpu_to_le32(0xC0000181)
+#define STATUS_DEVICE_CONFIGURATION_ERROR cpu_to_le32(0xC0000182)
+#define STATUS_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC0000183)
+#define STATUS_INVALID_DEVICE_STATE cpu_to_le32(0xC0000184)
+#define STATUS_IO_DEVICE_ERROR cpu_to_le32(0xC0000185)
+#define STATUS_DEVICE_PROTOCOL_ERROR cpu_to_le32(0xC0000186)
+#define STATUS_BACKUP_CONTROLLER cpu_to_le32(0xC0000187)
+#define STATUS_LOG_FILE_FULL cpu_to_le32(0xC0000188)
+#define STATUS_TOO_LATE cpu_to_le32(0xC0000189)
+#define STATUS_NO_TRUST_LSA_SECRET cpu_to_le32(0xC000018A)
+#define STATUS_NO_TRUST_SAM_ACCOUNT cpu_to_le32(0xC000018B)
+#define STATUS_TRUSTED_DOMAIN_FAILURE cpu_to_le32(0xC000018C)
+#define STATUS_TRUSTED_RELATIONSHIP_FAILURE cpu_to_le32(0xC000018D)
+#define STATUS_EVENTLOG_FILE_CORRUPT cpu_to_le32(0xC000018E)
+#define STATUS_EVENTLOG_CANT_START cpu_to_le32(0xC000018F)
+#define STATUS_TRUST_FAILURE cpu_to_le32(0xC0000190)
+#define STATUS_MUTANT_LIMIT_EXCEEDED cpu_to_le32(0xC0000191)
+#define STATUS_NETLOGON_NOT_STARTED cpu_to_le32(0xC0000192)
+#define STATUS_ACCOUNT_EXPIRED cpu_to_le32(0xC0000193)
+#define STATUS_POSSIBLE_DEADLOCK cpu_to_le32(0xC0000194)
+#define STATUS_NETWORK_CREDENTIAL_CONFLICT cpu_to_le32(0xC0000195)
+#define STATUS_REMOTE_SESSION_LIMIT cpu_to_le32(0xC0000196)
+#define STATUS_EVENTLOG_FILE_CHANGED cpu_to_le32(0xC0000197)
+#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT cpu_to_le32(0xC0000198)
+#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT cpu_to_le32(0xC0000199)
+#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT cpu_to_le32(0xC000019A)
+#define STATUS_DOMAIN_TRUST_INCONSISTENT cpu_to_le32(0xC000019B)
+#define STATUS_FS_DRIVER_REQUIRED cpu_to_le32(0xC000019C)
+#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL cpu_to_le32(0xC000019D)
+#define STATUS_NETWORK_OPEN_RESTRICTION cpu_to_le32(0xC0000201)
+#define STATUS_NO_USER_SESSION_KEY cpu_to_le32(0xC0000202)
+#define STATUS_USER_SESSION_DELETED cpu_to_le32(0xC0000203)
+#define STATUS_RESOURCE_LANG_NOT_FOUND cpu_to_le32(0xC0000204)
+#define STATUS_INSUFF_SERVER_RESOURCES cpu_to_le32(0xC0000205)
+#define STATUS_INVALID_BUFFER_SIZE cpu_to_le32(0xC0000206)
+#define STATUS_INVALID_ADDRESS_COMPONENT cpu_to_le32(0xC0000207)
+#define STATUS_INVALID_ADDRESS_WILDCARD cpu_to_le32(0xC0000208)
+#define STATUS_TOO_MANY_ADDRESSES cpu_to_le32(0xC0000209)
+#define STATUS_ADDRESS_ALREADY_EXISTS cpu_to_le32(0xC000020A)
+#define STATUS_ADDRESS_CLOSED cpu_to_le32(0xC000020B)
+#define STATUS_CONNECTION_DISCONNECTED cpu_to_le32(0xC000020C)
+#define STATUS_CONNECTION_RESET cpu_to_le32(0xC000020D)
+#define STATUS_TOO_MANY_NODES cpu_to_le32(0xC000020E)
+#define STATUS_TRANSACTION_ABORTED cpu_to_le32(0xC000020F)
+#define STATUS_TRANSACTION_TIMED_OUT cpu_to_le32(0xC0000210)
+#define STATUS_TRANSACTION_NO_RELEASE cpu_to_le32(0xC0000211)
+#define STATUS_TRANSACTION_NO_MATCH cpu_to_le32(0xC0000212)
+#define STATUS_TRANSACTION_RESPONDED cpu_to_le32(0xC0000213)
+#define STATUS_TRANSACTION_INVALID_ID cpu_to_le32(0xC0000214)
+#define STATUS_TRANSACTION_INVALID_TYPE cpu_to_le32(0xC0000215)
+#define STATUS_NOT_SERVER_SESSION cpu_to_le32(0xC0000216)
+#define STATUS_NOT_CLIENT_SESSION cpu_to_le32(0xC0000217)
+#define STATUS_CANNOT_LOAD_REGISTRY_FILE cpu_to_le32(0xC0000218)
+#define STATUS_DEBUG_ATTACH_FAILED cpu_to_le32(0xC0000219)
+#define STATUS_SYSTEM_PROCESS_TERMINATED cpu_to_le32(0xC000021A)
+#define STATUS_DATA_NOT_ACCEPTED cpu_to_le32(0xC000021B)
+#define STATUS_NO_BROWSER_SERVERS_FOUND cpu_to_le32(0xC000021C)
+#define STATUS_VDM_HARD_ERROR cpu_to_le32(0xC000021D)
+#define STATUS_DRIVER_CANCEL_TIMEOUT cpu_to_le32(0xC000021E)
+#define STATUS_REPLY_MESSAGE_MISMATCH cpu_to_le32(0xC000021F)
+#define STATUS_MAPPED_ALIGNMENT cpu_to_le32(0xC0000220)
+#define STATUS_IMAGE_CHECKSUM_MISMATCH cpu_to_le32(0xC0000221)
+#define STATUS_LOST_WRITEBEHIND_DATA cpu_to_le32(0xC0000222)
+#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID cpu_to_le32(0xC0000223)
+#define STATUS_PASSWORD_MUST_CHANGE cpu_to_le32(0xC0000224)
+#define STATUS_NOT_FOUND cpu_to_le32(0xC0000225)
+#define STATUS_NOT_TINY_STREAM cpu_to_le32(0xC0000226)
+#define STATUS_RECOVERY_FAILURE cpu_to_le32(0xC0000227)
+#define STATUS_STACK_OVERFLOW_READ cpu_to_le32(0xC0000228)
+#define STATUS_FAIL_CHECK cpu_to_le32(0xC0000229)
+#define STATUS_DUPLICATE_OBJECTID cpu_to_le32(0xC000022A)
+#define STATUS_OBJECTID_EXISTS cpu_to_le32(0xC000022B)
+#define STATUS_CONVERT_TO_LARGE cpu_to_le32(0xC000022C)
+#define STATUS_RETRY cpu_to_le32(0xC000022D)
+#define STATUS_FOUND_OUT_OF_SCOPE cpu_to_le32(0xC000022E)
+#define STATUS_ALLOCATE_BUCKET cpu_to_le32(0xC000022F)
+#define STATUS_PROPSET_NOT_FOUND cpu_to_le32(0xC0000230)
+#define STATUS_MARSHALL_OVERFLOW cpu_to_le32(0xC0000231)
+#define STATUS_INVALID_VARIANT cpu_to_le32(0xC0000232)
+#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND cpu_to_le32(0xC0000233)
+#define STATUS_ACCOUNT_LOCKED_OUT cpu_to_le32(0xC0000234)
+#define STATUS_HANDLE_NOT_CLOSABLE cpu_to_le32(0xC0000235)
+#define STATUS_CONNECTION_REFUSED cpu_to_le32(0xC0000236)
+#define STATUS_GRACEFUL_DISCONNECT cpu_to_le32(0xC0000237)
+#define STATUS_ADDRESS_ALREADY_ASSOCIATED cpu_to_le32(0xC0000238)
+#define STATUS_ADDRESS_NOT_ASSOCIATED cpu_to_le32(0xC0000239)
+#define STATUS_CONNECTION_INVALID cpu_to_le32(0xC000023A)
+#define STATUS_CONNECTION_ACTIVE cpu_to_le32(0xC000023B)
+#define STATUS_NETWORK_UNREACHABLE cpu_to_le32(0xC000023C)
+#define STATUS_HOST_UNREACHABLE cpu_to_le32(0xC000023D)
+#define STATUS_PROTOCOL_UNREACHABLE cpu_to_le32(0xC000023E)
+#define STATUS_PORT_UNREACHABLE cpu_to_le32(0xC000023F)
+#define STATUS_REQUEST_ABORTED cpu_to_le32(0xC0000240)
+#define STATUS_CONNECTION_ABORTED cpu_to_le32(0xC0000241)
+#define STATUS_BAD_COMPRESSION_BUFFER cpu_to_le32(0xC0000242)
+#define STATUS_USER_MAPPED_FILE cpu_to_le32(0xC0000243)
+#define STATUS_AUDIT_FAILED cpu_to_le32(0xC0000244)
+#define STATUS_TIMER_RESOLUTION_NOT_SET cpu_to_le32(0xC0000245)
+#define STATUS_CONNECTION_COUNT_LIMIT cpu_to_le32(0xC0000246)
+#define STATUS_LOGIN_TIME_RESTRICTION cpu_to_le32(0xC0000247)
+#define STATUS_LOGIN_WKSTA_RESTRICTION cpu_to_le32(0xC0000248)
+#define STATUS_IMAGE_MP_UP_MISMATCH cpu_to_le32(0xC0000249)
+#define STATUS_INSUFFICIENT_LOGON_INFO cpu_to_le32(0xC0000250)
+#define STATUS_BAD_DLL_ENTRYPOINT cpu_to_le32(0xC0000251)
+#define STATUS_BAD_SERVICE_ENTRYPOINT cpu_to_le32(0xC0000252)
+#define STATUS_LPC_REPLY_LOST cpu_to_le32(0xC0000253)
+#define STATUS_IP_ADDRESS_CONFLICT1 cpu_to_le32(0xC0000254)
+#define STATUS_IP_ADDRESS_CONFLICT2 cpu_to_le32(0xC0000255)
+#define STATUS_REGISTRY_QUOTA_LIMIT cpu_to_le32(0xC0000256)
+#define STATUS_PATH_NOT_COVERED cpu_to_le32(0xC0000257)
+#define STATUS_NO_CALLBACK_ACTIVE cpu_to_le32(0xC0000258)
+#define STATUS_LICENSE_QUOTA_EXCEEDED cpu_to_le32(0xC0000259)
+#define STATUS_PWD_TOO_SHORT cpu_to_le32(0xC000025A)
+#define STATUS_PWD_TOO_RECENT cpu_to_le32(0xC000025B)
+#define STATUS_PWD_HISTORY_CONFLICT cpu_to_le32(0xC000025C)
+#define STATUS_PLUGPLAY_NO_DEVICE cpu_to_le32(0xC000025E)
+#define STATUS_UNSUPPORTED_COMPRESSION cpu_to_le32(0xC000025F)
+#define STATUS_INVALID_HW_PROFILE cpu_to_le32(0xC0000260)
+#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH cpu_to_le32(0xC0000261)
+#define STATUS_DRIVER_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000262)
+#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000263)
+#define STATUS_RESOURCE_NOT_OWNED cpu_to_le32(0xC0000264)
+#define STATUS_TOO_MANY_LINKS cpu_to_le32(0xC0000265)
+#define STATUS_QUOTA_LIST_INCONSISTENT cpu_to_le32(0xC0000266)
+#define STATUS_FILE_IS_OFFLINE cpu_to_le32(0xC0000267)
+#define STATUS_EVALUATION_EXPIRATION cpu_to_le32(0xC0000268)
+#define STATUS_ILLEGAL_DLL_RELOCATION cpu_to_le32(0xC0000269)
+#define STATUS_LICENSE_VIOLATION cpu_to_le32(0xC000026A)
+#define STATUS_DLL_INIT_FAILED_LOGOFF cpu_to_le32(0xC000026B)
+#define STATUS_DRIVER_UNABLE_TO_LOAD cpu_to_le32(0xC000026C)
+#define STATUS_DFS_UNAVAILABLE cpu_to_le32(0xC000026D)
+#define STATUS_VOLUME_DISMOUNTED cpu_to_le32(0xC000026E)
+#define STATUS_WX86_INTERNAL_ERROR cpu_to_le32(0xC000026F)
+#define STATUS_WX86_FLOAT_STACK_CHECK cpu_to_le32(0xC0000270)
+#define STATUS_VALIDATE_CONTINUE cpu_to_le32(0xC0000271)
+#define STATUS_NO_MATCH cpu_to_le32(0xC0000272)
+#define STATUS_NO_MORE_MATCHES cpu_to_le32(0xC0000273)
+#define STATUS_NOT_A_REPARSE_POINT cpu_to_le32(0xC0000275)
+#define STATUS_IO_REPARSE_TAG_INVALID cpu_to_le32(0xC0000276)
+#define STATUS_IO_REPARSE_TAG_MISMATCH cpu_to_le32(0xC0000277)
+#define STATUS_IO_REPARSE_DATA_INVALID cpu_to_le32(0xC0000278)
+#define STATUS_IO_REPARSE_TAG_NOT_HANDLED cpu_to_le32(0xC0000279)
+#define STATUS_REPARSE_POINT_NOT_RESOLVED cpu_to_le32(0xC0000280)
+#define STATUS_DIRECTORY_IS_A_REPARSE_POINT cpu_to_le32(0xC0000281)
+#define STATUS_RANGE_LIST_CONFLICT cpu_to_le32(0xC0000282)
+#define STATUS_SOURCE_ELEMENT_EMPTY cpu_to_le32(0xC0000283)
+#define STATUS_DESTINATION_ELEMENT_FULL cpu_to_le32(0xC0000284)
+#define STATUS_ILLEGAL_ELEMENT_ADDRESS cpu_to_le32(0xC0000285)
+#define STATUS_MAGAZINE_NOT_PRESENT cpu_to_le32(0xC0000286)
+#define STATUS_REINITIALIZATION_NEEDED cpu_to_le32(0xC0000287)
+#define STATUS_ENCRYPTION_FAILED cpu_to_le32(0xC000028A)
+#define STATUS_DECRYPTION_FAILED cpu_to_le32(0xC000028B)
+#define STATUS_RANGE_NOT_FOUND cpu_to_le32(0xC000028C)
+#define STATUS_NO_RECOVERY_POLICY cpu_to_le32(0xC000028D)
+#define STATUS_NO_EFS cpu_to_le32(0xC000028E)
+#define STATUS_WRONG_EFS cpu_to_le32(0xC000028F)
+#define STATUS_NO_USER_KEYS cpu_to_le32(0xC0000290)
+#define STATUS_FILE_NOT_ENCRYPTED cpu_to_le32(0xC0000291)
+#define STATUS_NOT_EXPORT_FORMAT cpu_to_le32(0xC0000292)
+#define STATUS_FILE_ENCRYPTED cpu_to_le32(0xC0000293)
+#define STATUS_WMI_GUID_NOT_FOUND cpu_to_le32(0xC0000295)
+#define STATUS_WMI_INSTANCE_NOT_FOUND cpu_to_le32(0xC0000296)
+#define STATUS_WMI_ITEMID_NOT_FOUND cpu_to_le32(0xC0000297)
+#define STATUS_WMI_TRY_AGAIN cpu_to_le32(0xC0000298)
+#define STATUS_SHARED_POLICY cpu_to_le32(0xC0000299)
+#define STATUS_POLICY_OBJECT_NOT_FOUND cpu_to_le32(0xC000029A)
+#define STATUS_POLICY_ONLY_IN_DS cpu_to_le32(0xC000029B)
+#define STATUS_VOLUME_NOT_UPGRADED cpu_to_le32(0xC000029C)
+#define STATUS_REMOTE_STORAGE_NOT_ACTIVE cpu_to_le32(0xC000029D)
+#define STATUS_REMOTE_STORAGE_MEDIA_ERROR cpu_to_le32(0xC000029E)
+#define STATUS_NO_TRACKING_SERVICE cpu_to_le32(0xC000029F)
+#define STATUS_SERVER_SID_MISMATCH cpu_to_le32(0xC00002A0)
+#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE cpu_to_le32(0xC00002A1)
+#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX cpu_to_le32(0xC00002A2)
+#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED cpu_to_le32(0xC00002A3)
+#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS cpu_to_le32(0xC00002A4)
+#define STATUS_DS_BUSY cpu_to_le32(0xC00002A5)
+#define STATUS_DS_UNAVAILABLE cpu_to_le32(0xC00002A6)
+#define STATUS_DS_NO_RIDS_ALLOCATED cpu_to_le32(0xC00002A7)
+#define STATUS_DS_NO_MORE_RIDS cpu_to_le32(0xC00002A8)
+#define STATUS_DS_INCORRECT_ROLE_OWNER cpu_to_le32(0xC00002A9)
+#define STATUS_DS_RIDMGR_INIT_ERROR cpu_to_le32(0xC00002AA)
+#define STATUS_DS_OBJ_CLASS_VIOLATION cpu_to_le32(0xC00002AB)
+#define STATUS_DS_CANT_ON_NON_LEAF cpu_to_le32(0xC00002AC)
+#define STATUS_DS_CANT_ON_RDN cpu_to_le32(0xC00002AD)
+#define STATUS_DS_CANT_MOD_OBJ_CLASS cpu_to_le32(0xC00002AE)
+#define STATUS_DS_CROSS_DOM_MOVE_FAILED cpu_to_le32(0xC00002AF)
+#define STATUS_DS_GC_NOT_AVAILABLE cpu_to_le32(0xC00002B0)
+#define STATUS_DIRECTORY_SERVICE_REQUIRED cpu_to_le32(0xC00002B1)
+#define STATUS_REPARSE_ATTRIBUTE_CONFLICT cpu_to_le32(0xC00002B2)
+#define STATUS_CANT_ENABLE_DENY_ONLY cpu_to_le32(0xC00002B3)
+#define STATUS_FLOAT_MULTIPLE_FAULTS cpu_to_le32(0xC00002B4)
+#define STATUS_FLOAT_MULTIPLE_TRAPS cpu_to_le32(0xC00002B5)
+#define STATUS_DEVICE_REMOVED cpu_to_le32(0xC00002B6)
+#define STATUS_JOURNAL_DELETE_IN_PROGRESS cpu_to_le32(0xC00002B7)
+#define STATUS_JOURNAL_NOT_ACTIVE cpu_to_le32(0xC00002B8)
+#define STATUS_NOINTERFACE cpu_to_le32(0xC00002B9)
+#define STATUS_DS_ADMIN_LIMIT_EXCEEDED cpu_to_le32(0xC00002C1)
+#define STATUS_DRIVER_FAILED_SLEEP cpu_to_le32(0xC00002C2)
+#define STATUS_MUTUAL_AUTHENTICATION_FAILED cpu_to_le32(0xC00002C3)
+#define STATUS_CORRUPT_SYSTEM_FILE cpu_to_le32(0xC00002C4)
+#define STATUS_DATATYPE_MISALIGNMENT_ERROR cpu_to_le32(0xC00002C5)
+#define STATUS_WMI_READ_ONLY cpu_to_le32(0xC00002C6)
+#define STATUS_WMI_SET_FAILURE cpu_to_le32(0xC00002C7)
+#define STATUS_COMMITMENT_MINIMUM cpu_to_le32(0xC00002C8)
+#define STATUS_REG_NAT_CONSUMPTION cpu_to_le32(0xC00002C9)
+#define STATUS_TRANSPORT_FULL cpu_to_le32(0xC00002CA)
+#define STATUS_DS_SAM_INIT_FAILURE cpu_to_le32(0xC00002CB)
+#define STATUS_ONLY_IF_CONNECTED cpu_to_le32(0xC00002CC)
+#define STATUS_DS_SENSITIVE_GROUP_VIOLATION cpu_to_le32(0xC00002CD)
+#define STATUS_PNP_RESTART_ENUMERATION cpu_to_le32(0xC00002CE)
+#define STATUS_JOURNAL_ENTRY_DELETED cpu_to_le32(0xC00002CF)
+#define STATUS_DS_CANT_MOD_PRIMARYGROUPID cpu_to_le32(0xC00002D0)
+#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE cpu_to_le32(0xC00002D1)
+#define STATUS_PNP_REBOOT_REQUIRED cpu_to_le32(0xC00002D2)
+#define STATUS_POWER_STATE_INVALID cpu_to_le32(0xC00002D3)
+#define STATUS_DS_INVALID_GROUP_TYPE cpu_to_le32(0xC00002D4)
+#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D5)
+#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D6)
+#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D7)
+#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC00002D8)
+#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D9)
+#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER cpu_to_le32(0xC00002DA)
+#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER cpu_to_le32(0xC00002DB)
+#define STATUS_DS_HAVE_PRIMARY_MEMBERS cpu_to_le32(0xC00002DC)
+#define STATUS_WMI_NOT_SUPPORTED cpu_to_le32(0xC00002DD)
+#define STATUS_INSUFFICIENT_POWER cpu_to_le32(0xC00002DE)
+#define STATUS_SAM_NEED_BOOTKEY_PASSWORD cpu_to_le32(0xC00002DF)
+#define STATUS_SAM_NEED_BOOTKEY_FLOPPY cpu_to_le32(0xC00002E0)
+#define STATUS_DS_CANT_START cpu_to_le32(0xC00002E1)
+#define STATUS_DS_INIT_FAILURE cpu_to_le32(0xC00002E2)
+#define STATUS_SAM_INIT_FAILURE cpu_to_le32(0xC00002E3)
+#define STATUS_DS_GC_REQUIRED cpu_to_le32(0xC00002E4)
+#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY cpu_to_le32(0xC00002E5)
+#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS cpu_to_le32(0xC00002E6)
+#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED cpu_to_le32(0xC00002E7)
+#define STATUS_MULTIPLE_FAULT_VIOLATION cpu_to_le32(0xC00002E8)
+#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED cpu_to_le32(0xC00002E9)
+#define STATUS_CANNOT_MAKE cpu_to_le32(0xC00002EA)
+#define STATUS_SYSTEM_SHUTDOWN cpu_to_le32(0xC00002EB)
+#define STATUS_DS_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002EC)
+#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002ED)
+#define STATUS_UNFINISHED_CONTEXT_DELETED cpu_to_le32(0xC00002EE)
+#define STATUS_NO_TGT_REPLY cpu_to_le32(0xC00002EF)
+#define STATUS_OBJECTID_NOT_FOUND cpu_to_le32(0xC00002F0)
+#define STATUS_NO_IP_ADDRESSES cpu_to_le32(0xC00002F1)
+#define STATUS_WRONG_CREDENTIAL_HANDLE cpu_to_le32(0xC00002F2)
+#define STATUS_CRYPTO_SYSTEM_INVALID cpu_to_le32(0xC00002F3)
+#define STATUS_MAX_REFERRALS_EXCEEDED cpu_to_le32(0xC00002F4)
+#define STATUS_MUST_BE_KDC cpu_to_le32(0xC00002F5)
+#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED cpu_to_le32(0xC00002F6)
+#define STATUS_TOO_MANY_PRINCIPALS cpu_to_le32(0xC00002F7)
+#define STATUS_NO_PA_DATA cpu_to_le32(0xC00002F8)
+#define STATUS_PKINIT_NAME_MISMATCH cpu_to_le32(0xC00002F9)
+#define STATUS_SMARTCARD_LOGON_REQUIRED cpu_to_le32(0xC00002FA)
+#define STATUS_KDC_INVALID_REQUEST cpu_to_le32(0xC00002FB)
+#define STATUS_KDC_UNABLE_TO_REFER cpu_to_le32(0xC00002FC)
+#define STATUS_KDC_UNKNOWN_ETYPE cpu_to_le32(0xC00002FD)
+#define STATUS_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FE)
+#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FF)
+#define STATUS_NOT_SUPPORTED_ON_SBS cpu_to_le32(0xC0000300)
+#define STATUS_WMI_GUID_DISCONNECTED cpu_to_le32(0xC0000301)
+#define STATUS_WMI_ALREADY_DISABLED cpu_to_le32(0xC0000302)
+#define STATUS_WMI_ALREADY_ENABLED cpu_to_le32(0xC0000303)
+#define STATUS_MFT_TOO_FRAGMENTED cpu_to_le32(0xC0000304)
+#define STATUS_COPY_PROTECTION_FAILURE cpu_to_le32(0xC0000305)
+#define STATUS_CSS_AUTHENTICATION_FAILURE cpu_to_le32(0xC0000306)
+#define STATUS_CSS_KEY_NOT_PRESENT cpu_to_le32(0xC0000307)
+#define STATUS_CSS_KEY_NOT_ESTABLISHED cpu_to_le32(0xC0000308)
+#define STATUS_CSS_SCRAMBLED_SECTOR cpu_to_le32(0xC0000309)
+#define STATUS_CSS_REGION_MISMATCH cpu_to_le32(0xC000030A)
+#define STATUS_CSS_RESETS_EXHAUSTED cpu_to_le32(0xC000030B)
+#define STATUS_PKINIT_FAILURE cpu_to_le32(0xC0000320)
+#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE cpu_to_le32(0xC0000321)
+#define STATUS_NO_KERB_KEY cpu_to_le32(0xC0000322)
+#define STATUS_HOST_DOWN cpu_to_le32(0xC0000350)
+#define STATUS_UNSUPPORTED_PREAUTH cpu_to_le32(0xC0000351)
+#define STATUS_EFS_ALG_BLOB_TOO_BIG cpu_to_le32(0xC0000352)
+#define STATUS_PORT_NOT_SET cpu_to_le32(0xC0000353)
+#define STATUS_DEBUGGER_INACTIVE cpu_to_le32(0xC0000354)
+#define STATUS_DS_VERSION_CHECK_FAILURE cpu_to_le32(0xC0000355)
+#define STATUS_AUDITING_DISABLED cpu_to_le32(0xC0000356)
+#define STATUS_PRENT4_MACHINE_ACCOUNT cpu_to_le32(0xC0000357)
+#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC0000358)
+#define STATUS_INVALID_IMAGE_WIN_32 cpu_to_le32(0xC0000359)
+#define STATUS_INVALID_IMAGE_WIN_64 cpu_to_le32(0xC000035A)
+#define STATUS_BAD_BINDINGS cpu_to_le32(0xC000035B)
+#define STATUS_NETWORK_SESSION_EXPIRED cpu_to_le32(0xC000035C)
+#define STATUS_APPHELP_BLOCK cpu_to_le32(0xC000035D)
+#define STATUS_ALL_SIDS_FILTERED cpu_to_le32(0xC000035E)
+#define STATUS_NOT_SAFE_MODE_DRIVER cpu_to_le32(0xC000035F)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT cpu_to_le32(0xC0000361)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH cpu_to_le32(0xC0000362)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER cpu_to_le32(0xC0000363)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER cpu_to_le32(0xC0000364)
+#define STATUS_FAILED_DRIVER_ENTRY cpu_to_le32(0xC0000365)
+#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366)
+#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368)
+#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369)
+#define STATUS_MCA_OCCURED cpu_to_le32(0xC000036A)
+#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B)
+#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C)
+#define STATUS_DRIVER_DATABASE_ERROR cpu_to_le32(0xC000036D)
+#define STATUS_SYSTEM_HIVE_TOO_LARGE cpu_to_le32(0xC000036E)
+#define STATUS_INVALID_IMPORT_OF_NON_DLL cpu_to_le32(0xC000036F)
+#define STATUS_NO_SECRETS cpu_to_le32(0xC0000371)
+#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY cpu_to_le32(0xC0000372)
+#define STATUS_FAILED_STACK_SWITCH cpu_to_le32(0xC0000373)
+#define STATUS_HEAP_CORRUPTION cpu_to_le32(0xC0000374)
+#define STATUS_SMARTCARD_WRONG_PIN cpu_to_le32(0xC0000380)
+#define STATUS_SMARTCARD_CARD_BLOCKED cpu_to_le32(0xC0000381)
+#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED cpu_to_le32(0xC0000382)
+#define STATUS_SMARTCARD_NO_CARD cpu_to_le32(0xC0000383)
+#define STATUS_SMARTCARD_NO_KEY_CONTAINER cpu_to_le32(0xC0000384)
+#define STATUS_SMARTCARD_NO_CERTIFICATE cpu_to_le32(0xC0000385)
+#define STATUS_SMARTCARD_NO_KEYSET cpu_to_le32(0xC0000386)
+#define STATUS_SMARTCARD_IO_ERROR cpu_to_le32(0xC0000387)
+#define STATUS_DOWNGRADE_DETECTED cpu_to_le32(0xC0000388)
+#define STATUS_SMARTCARD_CERT_REVOKED cpu_to_le32(0xC0000389)
+#define STATUS_ISSUING_CA_UNTRUSTED cpu_to_le32(0xC000038A)
+#define STATUS_REVOCATION_OFFLINE_C cpu_to_le32(0xC000038B)
+#define STATUS_PKINIT_CLIENT_FAILURE cpu_to_le32(0xC000038C)
+#define STATUS_SMARTCARD_CERT_EXPIRED cpu_to_le32(0xC000038D)
+#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD cpu_to_le32(0xC000038E)
+#define STATUS_SMARTCARD_SILENT_CONTEXT cpu_to_le32(0xC000038F)
+#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000401)
+#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000402)
+#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000403)
+#define STATUS_DS_NAME_NOT_UNIQUE cpu_to_le32(0xC0000404)
+#define STATUS_DS_DUPLICATE_ID_FOUND cpu_to_le32(0xC0000405)
+#define STATUS_DS_GROUP_CONVERSION_ERROR cpu_to_le32(0xC0000406)
+#define STATUS_VOLSNAP_PREPARE_HIBERNATE cpu_to_le32(0xC0000407)
+#define STATUS_USER2USER_REQUIRED cpu_to_le32(0xC0000408)
+#define STATUS_STACK_BUFFER_OVERRUN cpu_to_le32(0xC0000409)
+#define STATUS_NO_S4U_PROT_SUPPORT cpu_to_le32(0xC000040A)
+#define STATUS_CROSSREALM_DELEGATION_FAILURE cpu_to_le32(0xC000040B)
+#define STATUS_REVOCATION_OFFLINE_KDC cpu_to_le32(0xC000040C)
+#define STATUS_ISSUING_CA_UNTRUSTED_KDC cpu_to_le32(0xC000040D)
+#define STATUS_KDC_CERT_EXPIRED cpu_to_le32(0xC000040E)
+#define STATUS_KDC_CERT_REVOKED cpu_to_le32(0xC000040F)
+#define STATUS_PARAMETER_QUOTA_EXCEEDED cpu_to_le32(0xC0000410)
+#define STATUS_HIBERNATION_FAILURE cpu_to_le32(0xC0000411)
+#define STATUS_DELAY_LOAD_FAILED cpu_to_le32(0xC0000412)
+#define STATUS_AUTHENTICATION_FIREWALL_FAILED cpu_to_le32(0xC0000413)
+#define STATUS_VDM_DISALLOWED cpu_to_le32(0xC0000414)
+#define STATUS_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC0000415)
+#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE cpu_to_le32(0xC0000416)
+#define STATUS_INVALID_CRUNTIME_PARAMETER cpu_to_le32(0xC0000417)
+#define STATUS_NTLM_BLOCKED cpu_to_le32(0xC0000418)
+#define STATUS_ASSERTION_FAILURE cpu_to_le32(0xC0000420)
+#define STATUS_VERIFIER_STOP cpu_to_le32(0xC0000421)
+#define STATUS_CALLBACK_POP_STACK cpu_to_le32(0xC0000423)
+#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED cpu_to_le32(0xC0000424)
+#define STATUS_HIVE_UNLOADED cpu_to_le32(0xC0000425)
+#define STATUS_COMPRESSION_DISABLED cpu_to_le32(0xC0000426)
+#define STATUS_FILE_SYSTEM_LIMITATION cpu_to_le32(0xC0000427)
+#define STATUS_INVALID_IMAGE_HASH cpu_to_le32(0xC0000428)
+#define STATUS_NOT_CAPABLE cpu_to_le32(0xC0000429)
+#define STATUS_REQUEST_OUT_OF_SEQUENCE cpu_to_le32(0xC000042A)
+#define STATUS_IMPLEMENTATION_LIMIT cpu_to_le32(0xC000042B)
+#define STATUS_ELEVATION_REQUIRED cpu_to_le32(0xC000042C)
+#define STATUS_BEYOND_VDL cpu_to_le32(0xC0000432)
+#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS cpu_to_le32(0xC0000433)
+#define STATUS_PTE_CHANGED cpu_to_le32(0xC0000434)
+#define STATUS_PURGE_FAILED cpu_to_le32(0xC0000435)
+#define STATUS_CRED_REQUIRES_CONFIRMATION cpu_to_le32(0xC0000440)
+#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE cpu_to_le32(0xC0000441)
+#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER cpu_to_le32(0xC0000442)
+#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE cpu_to_le32(0xC0000443)
+#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE cpu_to_le32(0xC0000444)
+#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE cpu_to_le32(0xC0000445)
+#define STATUS_INVALID_LABEL cpu_to_le32(0xC0000446)
+#define STATUS_DRIVER_PROCESS_TERMINATED cpu_to_le32(0xC0000450)
+#define STATUS_AMBIGUOUS_SYSTEM_DEVICE cpu_to_le32(0xC0000451)
+#define STATUS_SYSTEM_DEVICE_NOT_FOUND cpu_to_le32(0xC0000452)
+#define STATUS_RESTART_BOOT_APPLICATION cpu_to_le32(0xC0000453)
+#define STATUS_INVALID_TASK_NAME cpu_to_le32(0xC0000500)
+#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
+#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
+#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
+#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
+#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
+#define STATUS_REQUEST_CANCELED cpu_to_le32(0xC0000703)
+#define STATUS_RECURSIVE_DISPATCH cpu_to_le32(0xC0000704)
+#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED cpu_to_le32(0xC0000705)
+#define STATUS_LPC_INVALID_CONNECTION_USAGE cpu_to_le32(0xC0000706)
+#define STATUS_LPC_REQUESTS_NOT_ALLOWED cpu_to_le32(0xC0000707)
+#define STATUS_RESOURCE_IN_USE cpu_to_le32(0xC0000708)
+#define STATUS_HARDWARE_MEMORY_ERROR cpu_to_le32(0xC0000709)
+#define STATUS_THREADPOOL_HANDLE_EXCEPTION cpu_to_le32(0xC000070A)
+#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED cpu_to_le32(0xC000070B)
+#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED cpu_to_le32(0xC000070C)
+#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED cpu_to_le32(0xC000070D)
+#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED cpu_to_le32(0xC000070E)
+#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION cpu_to_le32(0xC000070F)
+#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000710)
+#define STATUS_APC_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000711)
+#define STATUS_PROCESS_IS_PROTECTED cpu_to_le32(0xC0000712)
+#define STATUS_MCA_EXCEPTION cpu_to_le32(0xC0000713)
+#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE cpu_to_le32(0xC0000714)
+#define STATUS_SYMLINK_CLASS_DISABLED cpu_to_le32(0xC0000715)
+#define STATUS_INVALID_IDN_NORMALIZATION cpu_to_le32(0xC0000716)
+#define STATUS_NO_UNICODE_TRANSLATION cpu_to_le32(0xC0000717)
+#define STATUS_ALREADY_REGISTERED cpu_to_le32(0xC0000718)
+#define STATUS_CONTEXT_MISMATCH cpu_to_le32(0xC0000719)
+#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST cpu_to_le32(0xC000071A)
+#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY cpu_to_le32(0xC000071B)
+#define STATUS_INVALID_THREAD cpu_to_le32(0xC000071C)
+#define STATUS_CALLBACK_RETURNED_TRANSACTION cpu_to_le32(0xC000071D)
+#define STATUS_CALLBACK_RETURNED_LDR_LOCK cpu_to_le32(0xC000071E)
+#define STATUS_CALLBACK_RETURNED_LANG cpu_to_le32(0xC000071F)
+#define STATUS_CALLBACK_RETURNED_PRI_BACK cpu_to_le32(0xC0000720)
+#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY cpu_to_le32(0xC0000721)
+#define STATUS_DISK_REPAIR_DISABLED cpu_to_le32(0xC0000800)
+#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS cpu_to_le32(0xC0000801)
+#define STATUS_DISK_QUOTA_EXCEEDED cpu_to_le32(0xC0000802)
+#define STATUS_CONTENT_BLOCKED cpu_to_le32(0xC0000804)
+#define STATUS_BAD_CLUSTERS cpu_to_le32(0xC0000805)
+#define STATUS_VOLUME_DIRTY cpu_to_le32(0xC0000806)
+#define STATUS_FILE_CHECKED_OUT cpu_to_le32(0xC0000901)
+#define STATUS_CHECKOUT_REQUIRED cpu_to_le32(0xC0000902)
+#define STATUS_BAD_FILE_TYPE cpu_to_le32(0xC0000903)
+#define STATUS_FILE_TOO_LARGE cpu_to_le32(0xC0000904)
+#define STATUS_FORMS_AUTH_REQUIRED cpu_to_le32(0xC0000905)
+#define STATUS_VIRUS_INFECTED cpu_to_le32(0xC0000906)
+#define STATUS_VIRUS_DELETED cpu_to_le32(0xC0000907)
+#define STATUS_BAD_MCFG_TABLE cpu_to_le32(0xC0000908)
+#define STATUS_WOW_ASSERTION cpu_to_le32(0xC0009898)
+#define STATUS_INVALID_SIGNATURE cpu_to_le32(0xC000A000)
+#define STATUS_HMAC_NOT_SUPPORTED cpu_to_le32(0xC000A001)
+#define STATUS_IPSEC_QUEUE_OVERFLOW cpu_to_le32(0xC000A010)
+#define STATUS_ND_QUEUE_OVERFLOW cpu_to_le32(0xC000A011)
+#define STATUS_HOPLIMIT_EXCEEDED cpu_to_le32(0xC000A012)
+#define STATUS_PROTOCOL_NOT_SUPPORTED cpu_to_le32(0xC000A013)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED cpu_to_le32(0xC000A080)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR cpu_to_le32(0xC000A081)
+#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR cpu_to_le32(0xC000A082)
+#define STATUS_XML_PARSE_ERROR cpu_to_le32(0xC000A083)
+#define STATUS_XMLDSIG_ERROR cpu_to_le32(0xC000A084)
+#define STATUS_WRONG_COMPARTMENT cpu_to_le32(0xC000A085)
+#define STATUS_AUTHIP_FAILURE cpu_to_le32(0xC000A086)
+#define DBG_NO_STATE_CHANGE cpu_to_le32(0xC0010001)
+#define DBG_APP_NOT_IDLE cpu_to_le32(0xC0010002)
+#define RPC_NT_INVALID_STRING_BINDING cpu_to_le32(0xC0020001)
+#define RPC_NT_WRONG_KIND_OF_BINDING cpu_to_le32(0xC0020002)
+#define RPC_NT_INVALID_BINDING cpu_to_le32(0xC0020003)
+#define RPC_NT_PROTSEQ_NOT_SUPPORTED cpu_to_le32(0xC0020004)
+#define RPC_NT_INVALID_RPC_PROTSEQ cpu_to_le32(0xC0020005)
+#define RPC_NT_INVALID_STRING_UUID cpu_to_le32(0xC0020006)
+#define RPC_NT_INVALID_ENDPOINT_FORMAT cpu_to_le32(0xC0020007)
+#define RPC_NT_INVALID_NET_ADDR cpu_to_le32(0xC0020008)
+#define RPC_NT_NO_ENDPOINT_FOUND cpu_to_le32(0xC0020009)
+#define RPC_NT_INVALID_TIMEOUT cpu_to_le32(0xC002000A)
+#define RPC_NT_OBJECT_NOT_FOUND cpu_to_le32(0xC002000B)
+#define RPC_NT_ALREADY_REGISTERED cpu_to_le32(0xC002000C)
+#define RPC_NT_TYPE_ALREADY_REGISTERED cpu_to_le32(0xC002000D)
+#define RPC_NT_ALREADY_LISTENING cpu_to_le32(0xC002000E)
+#define RPC_NT_NO_PROTSEQS_REGISTERED cpu_to_le32(0xC002000F)
+#define RPC_NT_NOT_LISTENING cpu_to_le32(0xC0020010)
+#define RPC_NT_UNKNOWN_MGR_TYPE cpu_to_le32(0xC0020011)
+#define RPC_NT_UNKNOWN_IF cpu_to_le32(0xC0020012)
+#define RPC_NT_NO_BINDINGS cpu_to_le32(0xC0020013)
+#define RPC_NT_NO_PROTSEQS cpu_to_le32(0xC0020014)
+#define RPC_NT_CANT_CREATE_ENDPOINT cpu_to_le32(0xC0020015)
+#define RPC_NT_OUT_OF_RESOURCES cpu_to_le32(0xC0020016)
+#define RPC_NT_SERVER_UNAVAILABLE cpu_to_le32(0xC0020017)
+#define RPC_NT_SERVER_TOO_BUSY cpu_to_le32(0xC0020018)
+#define RPC_NT_INVALID_NETWORK_OPTIONS cpu_to_le32(0xC0020019)
+#define RPC_NT_NO_CALL_ACTIVE cpu_to_le32(0xC002001A)
+#define RPC_NT_CALL_FAILED cpu_to_le32(0xC002001B)
+#define RPC_NT_CALL_FAILED_DNE cpu_to_le32(0xC002001C)
+#define RPC_NT_PROTOCOL_ERROR cpu_to_le32(0xC002001D)
+#define RPC_NT_UNSUPPORTED_TRANS_SYN cpu_to_le32(0xC002001F)
+#define RPC_NT_UNSUPPORTED_TYPE cpu_to_le32(0xC0020021)
+#define RPC_NT_INVALID_TAG cpu_to_le32(0xC0020022)
+#define RPC_NT_INVALID_BOUND cpu_to_le32(0xC0020023)
+#define RPC_NT_NO_ENTRY_NAME cpu_to_le32(0xC0020024)
+#define RPC_NT_INVALID_NAME_SYNTAX cpu_to_le32(0xC0020025)
+#define RPC_NT_UNSUPPORTED_NAME_SYNTAX cpu_to_le32(0xC0020026)
+#define RPC_NT_UUID_NO_ADDRESS cpu_to_le32(0xC0020028)
+#define RPC_NT_DUPLICATE_ENDPOINT cpu_to_le32(0xC0020029)
+#define RPC_NT_UNKNOWN_AUTHN_TYPE cpu_to_le32(0xC002002A)
+#define RPC_NT_MAX_CALLS_TOO_SMALL cpu_to_le32(0xC002002B)
+#define RPC_NT_STRING_TOO_LONG cpu_to_le32(0xC002002C)
+#define RPC_NT_PROTSEQ_NOT_FOUND cpu_to_le32(0xC002002D)
+#define RPC_NT_PROCNUM_OUT_OF_RANGE cpu_to_le32(0xC002002E)
+#define RPC_NT_BINDING_HAS_NO_AUTH cpu_to_le32(0xC002002F)
+#define RPC_NT_UNKNOWN_AUTHN_SERVICE cpu_to_le32(0xC0020030)
+#define RPC_NT_UNKNOWN_AUTHN_LEVEL cpu_to_le32(0xC0020031)
+#define RPC_NT_INVALID_AUTH_IDENTITY cpu_to_le32(0xC0020032)
+#define RPC_NT_UNKNOWN_AUTHZ_SERVICE cpu_to_le32(0xC0020033)
+#define EPT_NT_INVALID_ENTRY cpu_to_le32(0xC0020034)
+#define EPT_NT_CANT_PERFORM_OP cpu_to_le32(0xC0020035)
+#define EPT_NT_NOT_REGISTERED cpu_to_le32(0xC0020036)
+#define RPC_NT_NOTHING_TO_EXPORT cpu_to_le32(0xC0020037)
+#define RPC_NT_INCOMPLETE_NAME cpu_to_le32(0xC0020038)
+#define RPC_NT_INVALID_VERS_OPTION cpu_to_le32(0xC0020039)
+#define RPC_NT_NO_MORE_MEMBERS cpu_to_le32(0xC002003A)
+#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED cpu_to_le32(0xC002003B)
+#define RPC_NT_INTERFACE_NOT_FOUND cpu_to_le32(0xC002003C)
+#define RPC_NT_ENTRY_ALREADY_EXISTS cpu_to_le32(0xC002003D)
+#define RPC_NT_ENTRY_NOT_FOUND cpu_to_le32(0xC002003E)
+#define RPC_NT_NAME_SERVICE_UNAVAILABLE cpu_to_le32(0xC002003F)
+#define RPC_NT_INVALID_NAF_ID cpu_to_le32(0xC0020040)
+#define RPC_NT_CANNOT_SUPPORT cpu_to_le32(0xC0020041)
+#define RPC_NT_NO_CONTEXT_AVAILABLE cpu_to_le32(0xC0020042)
+#define RPC_NT_INTERNAL_ERROR cpu_to_le32(0xC0020043)
+#define RPC_NT_ZERO_DIVIDE cpu_to_le32(0xC0020044)
+#define RPC_NT_ADDRESS_ERROR cpu_to_le32(0xC0020045)
+#define RPC_NT_FP_DIV_ZERO cpu_to_le32(0xC0020046)
+#define RPC_NT_FP_UNDERFLOW cpu_to_le32(0xC0020047)
+#define RPC_NT_FP_OVERFLOW cpu_to_le32(0xC0020048)
+#define RPC_NT_CALL_IN_PROGRESS cpu_to_le32(0xC0020049)
+#define RPC_NT_NO_MORE_BINDINGS cpu_to_le32(0xC002004A)
+#define RPC_NT_GROUP_MEMBER_NOT_FOUND cpu_to_le32(0xC002004B)
+#define EPT_NT_CANT_CREATE cpu_to_le32(0xC002004C)
+#define RPC_NT_INVALID_OBJECT cpu_to_le32(0xC002004D)
+#define RPC_NT_NO_INTERFACES cpu_to_le32(0xC002004F)
+#define RPC_NT_CALL_CANCELLED cpu_to_le32(0xC0020050)
+#define RPC_NT_BINDING_INCOMPLETE cpu_to_le32(0xC0020051)
+#define RPC_NT_COMM_FAILURE cpu_to_le32(0xC0020052)
+#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL cpu_to_le32(0xC0020053)
+#define RPC_NT_NO_PRINC_NAME cpu_to_le32(0xC0020054)
+#define RPC_NT_NOT_RPC_ERROR cpu_to_le32(0xC0020055)
+#define RPC_NT_SEC_PKG_ERROR cpu_to_le32(0xC0020057)
+#define RPC_NT_NOT_CANCELLED cpu_to_le32(0xC0020058)
+#define RPC_NT_INVALID_ASYNC_HANDLE cpu_to_le32(0xC0020062)
+#define RPC_NT_INVALID_ASYNC_CALL cpu_to_le32(0xC0020063)
+#define RPC_NT_PROXY_ACCESS_DENIED cpu_to_le32(0xC0020064)
+#define RPC_NT_NO_MORE_ENTRIES cpu_to_le32(0xC0030001)
+#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL cpu_to_le32(0xC0030002)
+#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE cpu_to_le32(0xC0030003)
+#define RPC_NT_SS_IN_NULL_CONTEXT cpu_to_le32(0xC0030004)
+#define RPC_NT_SS_CONTEXT_MISMATCH cpu_to_le32(0xC0030005)
+#define RPC_NT_SS_CONTEXT_DAMAGED cpu_to_le32(0xC0030006)
+#define RPC_NT_SS_HANDLES_MISMATCH cpu_to_le32(0xC0030007)
+#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE cpu_to_le32(0xC0030008)
+#define RPC_NT_NULL_REF_POINTER cpu_to_le32(0xC0030009)
+#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE cpu_to_le32(0xC003000A)
+#define RPC_NT_BYTE_COUNT_TOO_SMALL cpu_to_le32(0xC003000B)
+#define RPC_NT_BAD_STUB_DATA cpu_to_le32(0xC003000C)
+#define RPC_NT_INVALID_ES_ACTION cpu_to_le32(0xC0030059)
+#define RPC_NT_WRONG_ES_VERSION cpu_to_le32(0xC003005A)
+#define RPC_NT_WRONG_STUB_VERSION cpu_to_le32(0xC003005B)
+#define RPC_NT_INVALID_PIPE_OBJECT cpu_to_le32(0xC003005C)
+#define RPC_NT_INVALID_PIPE_OPERATION cpu_to_le32(0xC003005D)
+#define RPC_NT_WRONG_PIPE_VERSION cpu_to_le32(0xC003005E)
+#define RPC_NT_PIPE_CLOSED cpu_to_le32(0xC003005F)
+#define RPC_NT_PIPE_DISCIPLINE_ERROR cpu_to_le32(0xC0030060)
+#define RPC_NT_PIPE_EMPTY cpu_to_le32(0xC0030061)
+#define STATUS_PNP_BAD_MPS_TABLE cpu_to_le32(0xC0040035)
+#define STATUS_PNP_TRANSLATION_FAILED cpu_to_le32(0xC0040036)
+#define STATUS_PNP_IRQ_TRANSLATION_FAILED cpu_to_le32(0xC0040037)
+#define STATUS_PNP_INVALID_ID cpu_to_le32(0xC0040038)
+#define STATUS_IO_REISSUE_AS_CACHED cpu_to_le32(0xC0040039)
+#define STATUS_CTX_WINSTATION_NAME_INVALID cpu_to_le32(0xC00A0001)
+#define STATUS_CTX_INVALID_PD cpu_to_le32(0xC00A0002)
+#define STATUS_CTX_PD_NOT_FOUND cpu_to_le32(0xC00A0003)
+#define STATUS_CTX_CLOSE_PENDING cpu_to_le32(0xC00A0006)
+#define STATUS_CTX_NO_OUTBUF cpu_to_le32(0xC00A0007)
+#define STATUS_CTX_MODEM_INF_NOT_FOUND cpu_to_le32(0xC00A0008)
+#define STATUS_CTX_INVALID_MODEMNAME cpu_to_le32(0xC00A0009)
+#define STATUS_CTX_RESPONSE_ERROR cpu_to_le32(0xC00A000A)
+#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT cpu_to_le32(0xC00A000B)
+#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER cpu_to_le32(0xC00A000C)
+#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE cpu_to_le32(0xC00A000D)
+#define STATUS_CTX_MODEM_RESPONSE_BUSY cpu_to_le32(0xC00A000E)
+#define STATUS_CTX_MODEM_RESPONSE_VOICE cpu_to_le32(0xC00A000F)
+#define STATUS_CTX_TD_ERROR cpu_to_le32(0xC00A0010)
+#define STATUS_CTX_LICENSE_CLIENT_INVALID cpu_to_le32(0xC00A0012)
+#define STATUS_CTX_LICENSE_NOT_AVAILABLE cpu_to_le32(0xC00A0013)
+#define STATUS_CTX_LICENSE_EXPIRED cpu_to_le32(0xC00A0014)
+#define STATUS_CTX_WINSTATION_NOT_FOUND cpu_to_le32(0xC00A0015)
+#define STATUS_CTX_WINSTATION_NAME_COLLISION cpu_to_le32(0xC00A0016)
+#define STATUS_CTX_WINSTATION_BUSY cpu_to_le32(0xC00A0017)
+#define STATUS_CTX_BAD_VIDEO_MODE cpu_to_le32(0xC00A0018)
+#define STATUS_CTX_GRAPHICS_INVALID cpu_to_le32(0xC00A0022)
+#define STATUS_CTX_NOT_CONSOLE cpu_to_le32(0xC00A0024)
+#define STATUS_CTX_CLIENT_QUERY_TIMEOUT cpu_to_le32(0xC00A0026)
+#define STATUS_CTX_CONSOLE_DISCONNECT cpu_to_le32(0xC00A0027)
+#define STATUS_CTX_CONSOLE_CONNECT cpu_to_le32(0xC00A0028)
+#define STATUS_CTX_SHADOW_DENIED cpu_to_le32(0xC00A002A)
+#define STATUS_CTX_WINSTATION_ACCESS_DENIED cpu_to_le32(0xC00A002B)
+#define STATUS_CTX_INVALID_WD cpu_to_le32(0xC00A002E)
+#define STATUS_CTX_WD_NOT_FOUND cpu_to_le32(0xC00A002F)
+#define STATUS_CTX_SHADOW_INVALID cpu_to_le32(0xC00A0030)
+#define STATUS_CTX_SHADOW_DISABLED cpu_to_le32(0xC00A0031)
+#define STATUS_RDP_PROTOCOL_ERROR cpu_to_le32(0xC00A0032)
+#define STATUS_CTX_CLIENT_LICENSE_NOT_SET cpu_to_le32(0xC00A0033)
+#define STATUS_CTX_CLIENT_LICENSE_IN_USE cpu_to_le32(0xC00A0034)
+#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE cpu_to_le32(0xC00A0035)
+#define STATUS_CTX_SHADOW_NOT_RUNNING cpu_to_le32(0xC00A0036)
+#define STATUS_CTX_LOGON_DISABLED cpu_to_le32(0xC00A0037)
+#define STATUS_CTX_SECURITY_LAYER_ERROR cpu_to_le32(0xC00A0038)
+#define STATUS_TS_INCOMPATIBLE_SESSIONS cpu_to_le32(0xC00A0039)
+#define STATUS_MUI_FILE_NOT_FOUND cpu_to_le32(0xC00B0001)
+#define STATUS_MUI_INVALID_FILE cpu_to_le32(0xC00B0002)
+#define STATUS_MUI_INVALID_RC_CONFIG cpu_to_le32(0xC00B0003)
+#define STATUS_MUI_INVALID_LOCALE_NAME cpu_to_le32(0xC00B0004)
+#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME cpu_to_le32(0xC00B0005)
+#define STATUS_MUI_FILE_NOT_LOADED cpu_to_le32(0xC00B0006)
+#define STATUS_RESOURCE_ENUM_USER_STOP cpu_to_le32(0xC00B0007)
+#define STATUS_CLUSTER_INVALID_NODE cpu_to_le32(0xC0130001)
+#define STATUS_CLUSTER_NODE_EXISTS cpu_to_le32(0xC0130002)
+#define STATUS_CLUSTER_JOIN_IN_PROGRESS cpu_to_le32(0xC0130003)
+#define STATUS_CLUSTER_NODE_NOT_FOUND cpu_to_le32(0xC0130004)
+#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND cpu_to_le32(0xC0130005)
+#define STATUS_CLUSTER_NETWORK_EXISTS cpu_to_le32(0xC0130006)
+#define STATUS_CLUSTER_NETWORK_NOT_FOUND cpu_to_le32(0xC0130007)
+#define STATUS_CLUSTER_NETINTERFACE_EXISTS cpu_to_le32(0xC0130008)
+#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND cpu_to_le32(0xC0130009)
+#define STATUS_CLUSTER_INVALID_REQUEST cpu_to_le32(0xC013000A)
+#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER cpu_to_le32(0xC013000B)
+#define STATUS_CLUSTER_NODE_DOWN cpu_to_le32(0xC013000C)
+#define STATUS_CLUSTER_NODE_UNREACHABLE cpu_to_le32(0xC013000D)
+#define STATUS_CLUSTER_NODE_NOT_MEMBER cpu_to_le32(0xC013000E)
+#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS cpu_to_le32(0xC013000F)
+#define STATUS_CLUSTER_INVALID_NETWORK cpu_to_le32(0xC0130010)
+#define STATUS_CLUSTER_NO_NET_ADAPTERS cpu_to_le32(0xC0130011)
+#define STATUS_CLUSTER_NODE_UP cpu_to_le32(0xC0130012)
+#define STATUS_CLUSTER_NODE_PAUSED cpu_to_le32(0xC0130013)
+#define STATUS_CLUSTER_NODE_NOT_PAUSED cpu_to_le32(0xC0130014)
+#define STATUS_CLUSTER_NO_SECURITY_CONTEXT cpu_to_le32(0xC0130015)
+#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL cpu_to_le32(0xC0130016)
+#define STATUS_CLUSTER_POISONED cpu_to_le32(0xC0130017)
+#define STATUS_ACPI_INVALID_OPCODE cpu_to_le32(0xC0140001)
+#define STATUS_ACPI_STACK_OVERFLOW cpu_to_le32(0xC0140002)
+#define STATUS_ACPI_ASSERT_FAILED cpu_to_le32(0xC0140003)
+#define STATUS_ACPI_INVALID_INDEX cpu_to_le32(0xC0140004)
+#define STATUS_ACPI_INVALID_ARGUMENT cpu_to_le32(0xC0140005)
+#define STATUS_ACPI_FATAL cpu_to_le32(0xC0140006)
+#define STATUS_ACPI_INVALID_SUPERNAME cpu_to_le32(0xC0140007)
+#define STATUS_ACPI_INVALID_ARGTYPE cpu_to_le32(0xC0140008)
+#define STATUS_ACPI_INVALID_OBJTYPE cpu_to_le32(0xC0140009)
+#define STATUS_ACPI_INVALID_TARGETTYPE cpu_to_le32(0xC014000A)
+#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT cpu_to_le32(0xC014000B)
+#define STATUS_ACPI_ADDRESS_NOT_MAPPED cpu_to_le32(0xC014000C)
+#define STATUS_ACPI_INVALID_EVENTTYPE cpu_to_le32(0xC014000D)
+#define STATUS_ACPI_HANDLER_COLLISION cpu_to_le32(0xC014000E)
+#define STATUS_ACPI_INVALID_DATA cpu_to_le32(0xC014000F)
+#define STATUS_ACPI_INVALID_REGION cpu_to_le32(0xC0140010)
+#define STATUS_ACPI_INVALID_ACCESS_SIZE cpu_to_le32(0xC0140011)
+#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK cpu_to_le32(0xC0140012)
+#define STATUS_ACPI_ALREADY_INITIALIZED cpu_to_le32(0xC0140013)
+#define STATUS_ACPI_NOT_INITIALIZED cpu_to_le32(0xC0140014)
+#define STATUS_ACPI_INVALID_MUTEX_LEVEL cpu_to_le32(0xC0140015)
+#define STATUS_ACPI_MUTEX_NOT_OWNED cpu_to_le32(0xC0140016)
+#define STATUS_ACPI_MUTEX_NOT_OWNER cpu_to_le32(0xC0140017)
+#define STATUS_ACPI_RS_ACCESS cpu_to_le32(0xC0140018)
+#define STATUS_ACPI_INVALID_TABLE cpu_to_le32(0xC0140019)
+#define STATUS_ACPI_REG_HANDLER_FAILED cpu_to_le32(0xC0140020)
+#define STATUS_ACPI_POWER_REQUEST_FAILED cpu_to_le32(0xC0140021)
+#define STATUS_SXS_SECTION_NOT_FOUND cpu_to_le32(0xC0150001)
+#define STATUS_SXS_CANT_GEN_ACTCTX cpu_to_le32(0xC0150002)
+#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT cpu_to_le32(0xC0150003)
+#define STATUS_SXS_ASSEMBLY_NOT_FOUND cpu_to_le32(0xC0150004)
+#define STATUS_SXS_MANIFEST_FORMAT_ERROR cpu_to_le32(0xC0150005)
+#define STATUS_SXS_MANIFEST_PARSE_ERROR cpu_to_le32(0xC0150006)
+#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED cpu_to_le32(0xC0150007)
+#define STATUS_SXS_KEY_NOT_FOUND cpu_to_le32(0xC0150008)
+#define STATUS_SXS_VERSION_CONFLICT cpu_to_le32(0xC0150009)
+#define STATUS_SXS_WRONG_SECTION_TYPE cpu_to_le32(0xC015000A)
+#define STATUS_SXS_THREAD_QUERIES_DISABLED cpu_to_le32(0xC015000B)
+#define STATUS_SXS_ASSEMBLY_MISSING cpu_to_le32(0xC015000C)
+#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET cpu_to_le32(0xC015000E)
+#define STATUS_SXS_EARLY_DEACTIVATION cpu_to_le32(0xC015000F)
+#define STATUS_SXS_INVALID_DEACTIVATION cpu_to_le32(0xC0150010)
+#define STATUS_SXS_MULTIPLE_DEACTIVATION cpu_to_le32(0xC0150011)
+#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY cpu_to_le32(0xC0150012)
+#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED cpu_to_le32(0xC0150013)
+#define STATUS_SXS_CORRUPT_ACTIVATION_STACK cpu_to_le32(0xC0150014)
+#define STATUS_SXS_CORRUPTION cpu_to_le32(0xC0150015)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE cpu_to_le32(0xC0150016)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME cpu_to_le32(0xC0150017)
+#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE cpu_to_le32(0xC0150018)
+#define STATUS_SXS_IDENTITY_PARSE_ERROR cpu_to_le32(0xC0150019)
+#define STATUS_SXS_COMPONENT_STORE_CORRUPT cpu_to_le32(0xC015001A)
+#define STATUS_SXS_FILE_HASH_MISMATCH cpu_to_le32(0xC015001B)
+#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT cpu_to_le32(0xC015001C)
+#define STATUS_SXS_IDENTITIES_DIFFERENT cpu_to_le32(0xC015001D)
+#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT cpu_to_le32(0xC015001E)
+#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY cpu_to_le32(0xC015001F)
+#define STATUS_ADVANCED_INSTALLER_FAILED cpu_to_le32(0xC0150020)
+#define STATUS_XML_ENCODING_MISMATCH cpu_to_le32(0xC0150021)
+#define STATUS_SXS_MANIFEST_TOO_BIG cpu_to_le32(0xC0150022)
+#define STATUS_SXS_SETTING_NOT_REGISTERED cpu_to_le32(0xC0150023)
+#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE cpu_to_le32(0xC0150024)
+#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED cpu_to_le32(0xC0150025)
+#define STATUS_GENERIC_COMMAND_FAILED cpu_to_le32(0xC0150026)
+#define STATUS_SXS_FILE_HASH_MISSING cpu_to_le32(0xC0150027)
+#define STATUS_TRANSACTIONAL_CONFLICT cpu_to_le32(0xC0190001)
+#define STATUS_INVALID_TRANSACTION cpu_to_le32(0xC0190002)
+#define STATUS_TRANSACTION_NOT_ACTIVE cpu_to_le32(0xC0190003)
+#define STATUS_TM_INITIALIZATION_FAILED cpu_to_le32(0xC0190004)
+#define STATUS_RM_NOT_ACTIVE cpu_to_le32(0xC0190005)
+#define STATUS_RM_METADATA_CORRUPT cpu_to_le32(0xC0190006)
+#define STATUS_TRANSACTION_NOT_JOINED cpu_to_le32(0xC0190007)
+#define STATUS_DIRECTORY_NOT_RM cpu_to_le32(0xC0190008)
+#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE cpu_to_le32(0xC019000A)
+#define STATUS_LOG_RESIZE_INVALID_SIZE cpu_to_le32(0xC019000B)
+#define STATUS_REMOTE_FILE_VERSION_MISMATCH cpu_to_le32(0xC019000C)
+#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS cpu_to_le32(0xC019000F)
+#define STATUS_TRANSACTION_PROPAGATION_FAILED cpu_to_le32(0xC0190010)
+#define STATUS_CRM_PROTOCOL_NOT_FOUND cpu_to_le32(0xC0190011)
+#define STATUS_TRANSACTION_SUPERIOR_EXISTS cpu_to_le32(0xC0190012)
+#define STATUS_TRANSACTION_REQUEST_NOT_VALID cpu_to_le32(0xC0190013)
+#define STATUS_TRANSACTION_NOT_REQUESTED cpu_to_le32(0xC0190014)
+#define STATUS_TRANSACTION_ALREADY_ABORTED cpu_to_le32(0xC0190015)
+#define STATUS_TRANSACTION_ALREADY_COMMITTED cpu_to_le32(0xC0190016)
+#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER cpu_to_le32(0xC0190017)
+#define STATUS_CURRENT_TRANSACTION_NOT_VALID cpu_to_le32(0xC0190018)
+#define STATUS_LOG_GROWTH_FAILED cpu_to_le32(0xC0190019)
+#define STATUS_OBJECT_NO_LONGER_EXISTS cpu_to_le32(0xC0190021)
+#define STATUS_STREAM_MINIVERSION_NOT_FOUND cpu_to_le32(0xC0190022)
+#define STATUS_STREAM_MINIVERSION_NOT_VALID cpu_to_le32(0xC0190023)
+#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION cpu_to_le32(0xC0190024)
+#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT cpu_to_le32(0xC0190025)
+#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS cpu_to_le32(0xC0190026)
+#define STATUS_HANDLE_NO_LONGER_VALID cpu_to_le32(0xC0190028)
+#define STATUS_LOG_CORRUPTION_DETECTED cpu_to_le32(0xC0190030)
+#define STATUS_RM_DISCONNECTED cpu_to_le32(0xC0190032)
+#define STATUS_ENLISTMENT_NOT_SUPERIOR cpu_to_le32(0xC0190033)
+#define STATUS_FILE_IDENTITY_NOT_PERSISTENT cpu_to_le32(0xC0190036)
+#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY cpu_to_le32(0xC0190037)
+#define STATUS_CANT_CROSS_RM_BOUNDARY cpu_to_le32(0xC0190038)
+#define STATUS_TXF_DIR_NOT_EMPTY cpu_to_le32(0xC0190039)
+#define STATUS_INDOUBT_TRANSACTIONS_EXIST cpu_to_le32(0xC019003A)
+#define STATUS_TM_VOLATILE cpu_to_le32(0xC019003B)
+#define STATUS_ROLLBACK_TIMER_EXPIRED cpu_to_le32(0xC019003C)
+#define STATUS_TXF_ATTRIBUTE_CORRUPT cpu_to_le32(0xC019003D)
+#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC019003E)
+#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED cpu_to_le32(0xC019003F)
+#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE cpu_to_le32(0xC0190040)
+#define STATUS_TRANSACTION_REQUIRED_PROMOTION cpu_to_le32(0xC0190043)
+#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION cpu_to_le32(0xC0190044)
+#define STATUS_TRANSACTIONS_NOT_FROZEN cpu_to_le32(0xC0190045)
+#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS cpu_to_le32(0xC0190046)
+#define STATUS_NOT_SNAPSHOT_VOLUME cpu_to_le32(0xC0190047)
+#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES cpu_to_le32(0xC0190048)
+#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190049)
+#define STATUS_TM_IDENTITY_MISMATCH cpu_to_le32(0xC019004A)
+#define STATUS_FLOATED_SECTION cpu_to_le32(0xC019004B)
+#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK cpu_to_le32(0xC019004C)
+#define STATUS_CANNOT_ABORT_TRANSACTIONS cpu_to_le32(0xC019004D)
+#define STATUS_TRANSACTION_NOT_FOUND cpu_to_le32(0xC019004E)
+#define STATUS_RESOURCEMANAGER_NOT_FOUND cpu_to_le32(0xC019004F)
+#define STATUS_ENLISTMENT_NOT_FOUND cpu_to_le32(0xC0190050)
+#define STATUS_TRANSACTIONMANAGER_NOT_FOUND cpu_to_le32(0xC0190051)
+#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE cpu_to_le32(0xC0190052)
+#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION cpu_to_le32(0xC0190053)
+#define STATUS_TRANSACTION_NOT_ROOT cpu_to_le32(0xC0190054)
+#define STATUS_TRANSACTION_OBJECT_EXPIRED cpu_to_le32(0xC0190055)
+#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190056)
+#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED cpu_to_le32(0xC0190057)
+#define STATUS_TRANSACTION_RECORD_TOO_LONG cpu_to_le32(0xC0190058)
+#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION cpu_to_le32(0xC0190059)
+#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION cpu_to_le32(0xC019005A)
+#define STATUS_TRANSACTION_INTEGRITY_VIOLATED cpu_to_le32(0xC019005B)
+#define STATUS_LOG_SECTOR_INVALID cpu_to_le32(0xC01A0001)
+#define STATUS_LOG_SECTOR_PARITY_INVALID cpu_to_le32(0xC01A0002)
+#define STATUS_LOG_SECTOR_REMAPPED cpu_to_le32(0xC01A0003)
+#define STATUS_LOG_BLOCK_INCOMPLETE cpu_to_le32(0xC01A0004)
+#define STATUS_LOG_INVALID_RANGE cpu_to_le32(0xC01A0005)
+#define STATUS_LOG_BLOCKS_EXHAUSTED cpu_to_le32(0xC01A0006)
+#define STATUS_LOG_READ_CONTEXT_INVALID cpu_to_le32(0xC01A0007)
+#define STATUS_LOG_RESTART_INVALID cpu_to_le32(0xC01A0008)
+#define STATUS_LOG_BLOCK_VERSION cpu_to_le32(0xC01A0009)
+#define STATUS_LOG_BLOCK_INVALID cpu_to_le32(0xC01A000A)
+#define STATUS_LOG_READ_MODE_INVALID cpu_to_le32(0xC01A000B)
+#define STATUS_LOG_METADATA_CORRUPT cpu_to_le32(0xC01A000D)
+#define STATUS_LOG_METADATA_INVALID cpu_to_le32(0xC01A000E)
+#define STATUS_LOG_METADATA_INCONSISTENT cpu_to_le32(0xC01A000F)
+#define STATUS_LOG_RESERVATION_INVALID cpu_to_le32(0xC01A0010)
+#define STATUS_LOG_CANT_DELETE cpu_to_le32(0xC01A0011)
+#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED cpu_to_le32(0xC01A0012)
+#define STATUS_LOG_START_OF_LOG cpu_to_le32(0xC01A0013)
+#define STATUS_LOG_POLICY_ALREADY_INSTALLED cpu_to_le32(0xC01A0014)
+#define STATUS_LOG_POLICY_NOT_INSTALLED cpu_to_le32(0xC01A0015)
+#define STATUS_LOG_POLICY_INVALID cpu_to_le32(0xC01A0016)
+#define STATUS_LOG_POLICY_CONFLICT cpu_to_le32(0xC01A0017)
+#define STATUS_LOG_PINNED_ARCHIVE_TAIL cpu_to_le32(0xC01A0018)
+#define STATUS_LOG_RECORD_NONEXISTENT cpu_to_le32(0xC01A0019)
+#define STATUS_LOG_RECORDS_RESERVED_INVALID cpu_to_le32(0xC01A001A)
+#define STATUS_LOG_SPACE_RESERVED_INVALID cpu_to_le32(0xC01A001B)
+#define STATUS_LOG_TAIL_INVALID cpu_to_le32(0xC01A001C)
+#define STATUS_LOG_FULL cpu_to_le32(0xC01A001D)
+#define STATUS_LOG_MULTIPLEXED cpu_to_le32(0xC01A001E)
+#define STATUS_LOG_DEDICATED cpu_to_le32(0xC01A001F)
+#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS cpu_to_le32(0xC01A0020)
+#define STATUS_LOG_ARCHIVE_IN_PROGRESS cpu_to_le32(0xC01A0021)
+#define STATUS_LOG_EPHEMERAL cpu_to_le32(0xC01A0022)
+#define STATUS_LOG_NOT_ENOUGH_CONTAINERS cpu_to_le32(0xC01A0023)
+#define STATUS_LOG_CLIENT_ALREADY_REGISTERED cpu_to_le32(0xC01A0024)
+#define STATUS_LOG_CLIENT_NOT_REGISTERED cpu_to_le32(0xC01A0025)
+#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS cpu_to_le32(0xC01A0026)
+#define STATUS_LOG_CONTAINER_READ_FAILED cpu_to_le32(0xC01A0027)
+#define STATUS_LOG_CONTAINER_WRITE_FAILED cpu_to_le32(0xC01A0028)
+#define STATUS_LOG_CONTAINER_OPEN_FAILED cpu_to_le32(0xC01A0029)
+#define STATUS_LOG_CONTAINER_STATE_INVALID cpu_to_le32(0xC01A002A)
+#define STATUS_LOG_STATE_INVALID cpu_to_le32(0xC01A002B)
+#define STATUS_LOG_PINNED cpu_to_le32(0xC01A002C)
+#define STATUS_LOG_METADATA_FLUSH_FAILED cpu_to_le32(0xC01A002D)
+#define STATUS_LOG_INCONSISTENT_SECURITY cpu_to_le32(0xC01A002E)
+#define STATUS_LOG_APPENDED_FLUSH_FAILED cpu_to_le32(0xC01A002F)
+#define STATUS_LOG_PINNED_RESERVATION cpu_to_le32(0xC01A0030)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC01B00EA)
+#define STATUS_FLT_NO_HANDLER_DEFINED cpu_to_le32(0xC01C0001)
+#define STATUS_FLT_CONTEXT_ALREADY_DEFINED cpu_to_le32(0xC01C0002)
+#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST cpu_to_le32(0xC01C0003)
+#define STATUS_FLT_DISALLOW_FAST_IO cpu_to_le32(0xC01C0004)
+#define STATUS_FLT_INVALID_NAME_REQUEST cpu_to_le32(0xC01C0005)
+#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION cpu_to_le32(0xC01C0006)
+#define STATUS_FLT_NOT_INITIALIZED cpu_to_le32(0xC01C0007)
+#define STATUS_FLT_FILTER_NOT_READY cpu_to_le32(0xC01C0008)
+#define STATUS_FLT_POST_OPERATION_CLEANUP cpu_to_le32(0xC01C0009)
+#define STATUS_FLT_INTERNAL_ERROR cpu_to_le32(0xC01C000A)
+#define STATUS_FLT_DELETING_OBJECT cpu_to_le32(0xC01C000B)
+#define STATUS_FLT_MUST_BE_NONPAGED_POOL cpu_to_le32(0xC01C000C)
+#define STATUS_FLT_DUPLICATE_ENTRY cpu_to_le32(0xC01C000D)
+#define STATUS_FLT_CBDQ_DISABLED cpu_to_le32(0xC01C000E)
+#define STATUS_FLT_DO_NOT_ATTACH cpu_to_le32(0xC01C000F)
+#define STATUS_FLT_DO_NOT_DETACH cpu_to_le32(0xC01C0010)
+#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION cpu_to_le32(0xC01C0011)
+#define STATUS_FLT_INSTANCE_NAME_COLLISION cpu_to_le32(0xC01C0012)
+#define STATUS_FLT_FILTER_NOT_FOUND cpu_to_le32(0xC01C0013)
+#define STATUS_FLT_VOLUME_NOT_FOUND cpu_to_le32(0xC01C0014)
+#define STATUS_FLT_INSTANCE_NOT_FOUND cpu_to_le32(0xC01C0015)
+#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND cpu_to_le32(0xC01C0016)
+#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION cpu_to_le32(0xC01C0017)
+#define STATUS_FLT_NAME_CACHE_MISS cpu_to_le32(0xC01C0018)
+#define STATUS_FLT_NO_DEVICE_OBJECT cpu_to_le32(0xC01C0019)
+#define STATUS_FLT_VOLUME_ALREADY_MOUNTED cpu_to_le32(0xC01C001A)
+#define STATUS_FLT_ALREADY_ENLISTED cpu_to_le32(0xC01C001B)
+#define STATUS_FLT_CONTEXT_ALREADY_LINKED cpu_to_le32(0xC01C001C)
+#define STATUS_FLT_NO_WAITER_FOR_REPLY cpu_to_le32(0xC01C0020)
+#define STATUS_MONITOR_NO_DESCRIPTOR cpu_to_le32(0xC01D0001)
+#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT cpu_to_le32(0xC01D0002)
+#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM cpu_to_le32(0xC01D0003)
+#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK cpu_to_le32(0xC01D0004)
+#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED cpu_to_le32(0xC01D0005)
+#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK cpu_to_le32(0xC01D0006)
+#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK cpu_to_le32(0xC01D0007)
+#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA cpu_to_le32(0xC01D0008)
+#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK cpu_to_le32(0xC01D0009)
+#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER cpu_to_le32(0xC01E0000)
+#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER cpu_to_le32(0xC01E0001)
+#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER cpu_to_le32(0xC01E0002)
+#define STATUS_GRAPHICS_ADAPTER_WAS_RESET cpu_to_le32(0xC01E0003)
+#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL cpu_to_le32(0xC01E0004)
+#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED cpu_to_le32(0xC01E0005)
+#define STATUS_GRAPHICS_PRESENT_OCCLUDED cpu_to_le32(0xC01E0006)
+#define STATUS_GRAPHICS_PRESENT_DENIED cpu_to_le32(0xC01E0007)
+#define STATUS_GRAPHICS_CANNOTCOLORCONVERT cpu_to_le32(0xC01E0008)
+#define STATUS_GRAPHICS_NO_VIDEO_MEMORY cpu_to_le32(0xC01E0100)
+#define STATUS_GRAPHICS_CANT_LOCK_MEMORY cpu_to_le32(0xC01E0101)
+#define STATUS_GRAPHICS_ALLOCATION_BUSY cpu_to_le32(0xC01E0102)
+#define STATUS_GRAPHICS_TOO_MANY_REFERENCES cpu_to_le32(0xC01E0103)
+#define STATUS_GRAPHICS_TRY_AGAIN_LATER cpu_to_le32(0xC01E0104)
+#define STATUS_GRAPHICS_TRY_AGAIN_NOW cpu_to_le32(0xC01E0105)
+#define STATUS_GRAPHICS_ALLOCATION_INVALID cpu_to_le32(0xC01E0106)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE cpu_to_le32(0xC01E0107)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED cpu_to_le32(0xC01E0108)
+#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION cpu_to_le32(0xC01E0109)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE cpu_to_le32(0xC01E0110)
+#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION cpu_to_le32(0xC01E0111)
+#define STATUS_GRAPHICS_ALLOCATION_CLOSED cpu_to_le32(0xC01E0112)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE cpu_to_le32(0xC01E0113)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE cpu_to_le32(0xC01E0114)
+#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE cpu_to_le32(0xC01E0115)
+#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST cpu_to_le32(0xC01E0116)
+#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE cpu_to_le32(0xC01E0200)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0300)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED cpu_to_le32(0xC01E0301)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED cpu_to_le32(0xC01E0302)
+#define STATUS_GRAPHICS_INVALID_VIDPN cpu_to_le32(0xC01E0303)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE cpu_to_le32(0xC01E0304)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET cpu_to_le32(0xC01E0305)
+#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED cpu_to_le32(0xC01E0306)
+#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET cpu_to_le32(0xC01E0308)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET cpu_to_le32(0xC01E0309)
+#define STATUS_GRAPHICS_INVALID_FREQUENCY cpu_to_le32(0xC01E030A)
+#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION cpu_to_le32(0xC01E030B)
+#define STATUS_GRAPHICS_INVALID_TOTAL_REGION cpu_to_le32(0xC01E030C)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE cpu_to_le32(0xC01E0310)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE cpu_to_le32(0xC01E0311)
+#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET cpu_to_le32(0xC01E0312)
+#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY cpu_to_le32(0xC01E0313)
+#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET cpu_to_le32(0xC01E0314)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET cpu_to_le32(0xC01E0315)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET cpu_to_le32(0xC01E0316)
+#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET cpu_to_le32(0xC01E0317)
+#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET cpu_to_le32(0xC01E0318)
+#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH cpu_to_le32(0xC01E0319)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY cpu_to_le32(0xC01E031A)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET cpu_to_le32(0xC01E031B)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE cpu_to_le32(0xC01E031C)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET cpu_to_le32(0xC01E031D)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET cpu_to_le32(0xC01E031F)
+#define STATUS_GRAPHICS_STALE_MODESET cpu_to_le32(0xC01E0320)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET cpu_to_le32(0xC01E0321)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE cpu_to_le32(0xC01E0322)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN cpu_to_le32(0xC01E0323)
+#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0324)
+#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION cpu_to_le32(0xC01E0325)
+#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES cpu_to_le32(0xC01E0326)
+#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0327)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE cpu_to_le32(0xC01E0328)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET cpu_to_le32(0xC01E0329)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET cpu_to_le32(0xC01E032A)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR cpu_to_le32(0xC01E032B)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET cpu_to_le32(0xC01E032C)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET cpu_to_le32(0xC01E032D)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E032E)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE cpu_to_le32(0xC01E032F)
+#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED cpu_to_le32(0xC01E0330)
+#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0331)
+#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0332)
+#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET cpu_to_le32(0xC01E0333)
+#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER cpu_to_le32(0xC01E0334)
+#define STATUS_GRAPHICS_NO_VIDPNMGR cpu_to_le32(0xC01E0335)
+#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN cpu_to_le32(0xC01E0336)
+#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0337)
+#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED cpu_to_le32(0xC01E0338)
+#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0339)
+#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE cpu_to_le32(0xC01E033A)
+#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE cpu_to_le32(0xC01E033B)
+#define STATUS_GRAPHICS_INVALID_STRIDE cpu_to_le32(0xC01E033C)
+#define STATUS_GRAPHICS_INVALID_PIXELFORMAT cpu_to_le32(0xC01E033D)
+#define STATUS_GRAPHICS_INVALID_COLORBASIS cpu_to_le32(0xC01E033E)
+#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE cpu_to_le32(0xC01E033F)
+#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0340)
+#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT cpu_to_le32(0xC01E0341)
+#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE cpu_to_le32(0xC01E0342)
+#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN cpu_to_le32(0xC01E0343)
+#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL cpu_to_le32(0xC01E0344)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION cpu_to_le32(0xC01E0345)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED cpu_to_le32(0xC01E0346)
+#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP cpu_to_le32(0xC01E0347)
+#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED cpu_to_le32(0xC01E0348)
+#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED cpu_to_le32(0xC01E0349)
+#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET cpu_to_le32(0xC01E034A)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON cpu_to_le32(0xC01E034D)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE cpu_to_le32(0xC01E034E)
+#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE cpu_to_le32(0xC01E034F)
+#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS cpu_to_le32(0xC01E0350)
+#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING cpu_to_le32(0xC01E0352)
+#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED cpu_to_le32(0xC01E0353)
+#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS cpu_to_le32(0xC01E0354)
+#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT cpu_to_le32(0xC01E0355)
+#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM cpu_to_le32(0xC01E0356)
+#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN cpu_to_le32(0xC01E0357)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT cpu_to_le32(0xC01E0358)
+#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED cpu_to_le32(0xC01E0359)
+#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION cpu_to_le32(0xC01E035A)
+#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE cpu_to_le32(0xC01E035B)
+#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET cpu_to_le32(0xC01E035C)
+#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED cpu_to_le32(0xC01E0400)
+#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED cpu_to_le32(0xC01E0401)
+#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER cpu_to_le32(0xC01E0430)
+#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED cpu_to_le32(0xC01E0431)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED cpu_to_le32(0xC01E0432)
+#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY cpu_to_le32(0xC01E0433)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED cpu_to_le32(0xC01E0434)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON cpu_to_le32(0xC01E0435)
+#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE cpu_to_le32(0xC01E0436)
+#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER cpu_to_le32(0xC01E0438)
+#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED cpu_to_le32(0xC01E043B)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS cpu_to_le32(0xC01E051C)
+#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST cpu_to_le32(0xC01E051D)
+#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC01E051E)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS cpu_to_le32(0xC01E051F)
+#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED cpu_to_le32(0xC01E0520)
+#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST cpu_to_le32(0xC01E0521)
+#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED cpu_to_le32(0xC01E0500)
+#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED cpu_to_le32(0xC01E0501)
+#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED cpu_to_le32(0xC01E0502)
+#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS cpu_to_le32(0xC01E0503)
+#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E0504)
+#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST cpu_to_le32(0xC01E0505)
+#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME cpu_to_le32(0xC01E0506)
+#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP cpu_to_le32(0xC01E0507)
+#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E0508)
+#define STATUS_GRAPHICS_OPM_INVALID_POINTER cpu_to_le32(0xC01E050A)
+#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR cpu_to_le32(0xC01E050B)
+#define STATUS_GRAPHICS_OPM_INVALID_HANDLE cpu_to_le32(0xC01E050C)
+#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE cpu_to_le32(0xC01E050D)
+#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH cpu_to_le32(0xC01E050E)
+#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED cpu_to_le32(0xC01E050F)
+#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED cpu_to_le32(0xC01E0510)
+#define STATUS_GRAPHICS_PVP_HFS_FAILED cpu_to_le32(0xC01E0511)
+#define STATUS_GRAPHICS_OPM_INVALID_SRM cpu_to_le32(0xC01E0512)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP cpu_to_le32(0xC01E0513)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP cpu_to_le32(0xC01E0514)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA cpu_to_le32(0xC01E0515)
+#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET cpu_to_le32(0xC01E0516)
+#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH cpu_to_le32(0xC01E0517)
+#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE cpu_to_le32(0xC01E0518)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS cpu_to_le32(0xC01E051A)
+#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E051B)
+#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED cpu_to_le32(0xC01E0580)
+#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC01E0581)
+#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA cpu_to_le32(0xC01E0582)
+#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA cpu_to_le32(0xC01E0583)
+#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED cpu_to_le32(0xC01E0584)
+#define STATUS_GRAPHICS_DDCCI_INVALID_DATA cpu_to_le32(0xC01E0585)
+#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE cpu_to_le32(0xC01E0586)
+#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING cpu_to_le32(0xC01E0587)
+#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR cpu_to_le32(0xC01E0588)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND cpu_to_le32(0xC01E0589)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH cpu_to_le32(0xC01E058A)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM cpu_to_le32(0xC01E058B)
+#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE cpu_to_le32(0xC01E058C)
+#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS cpu_to_le32(0xC01E058D)
+#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED cpu_to_le32(0xC01E05E0)
+#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME cpu_to_le32(0xC01E05E1)
+#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP cpu_to_le32(0xC01E05E2)
+#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E05E3)
+#define STATUS_GRAPHICS_INVALID_POINTER cpu_to_le32(0xC01E05E4)
+#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE cpu_to_le32(0xC01E05E5)
+#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E05E6)
+#define STATUS_GRAPHICS_INTERNAL_ERROR cpu_to_le32(0xC01E05E7)
+#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E05E8)
+#define STATUS_FVE_LOCKED_VOLUME cpu_to_le32(0xC0210000)
+#define STATUS_FVE_NOT_ENCRYPTED cpu_to_le32(0xC0210001)
+#define STATUS_FVE_BAD_INFORMATION cpu_to_le32(0xC0210002)
+#define STATUS_FVE_TOO_SMALL cpu_to_le32(0xC0210003)
+#define STATUS_FVE_FAILED_WRONG_FS cpu_to_le32(0xC0210004)
+#define STATUS_FVE_FAILED_BAD_FS cpu_to_le32(0xC0210005)
+#define STATUS_FVE_FS_NOT_EXTENDED cpu_to_le32(0xC0210006)
+#define STATUS_FVE_FS_MOUNTED cpu_to_le32(0xC0210007)
+#define STATUS_FVE_NO_LICENSE cpu_to_le32(0xC0210008)
+#define STATUS_FVE_ACTION_NOT_ALLOWED cpu_to_le32(0xC0210009)
+#define STATUS_FVE_BAD_DATA cpu_to_le32(0xC021000A)
+#define STATUS_FVE_VOLUME_NOT_BOUND cpu_to_le32(0xC021000B)
+#define STATUS_FVE_NOT_DATA_VOLUME cpu_to_le32(0xC021000C)
+#define STATUS_FVE_CONV_READ_ERROR cpu_to_le32(0xC021000D)
+#define STATUS_FVE_CONV_WRITE_ERROR cpu_to_le32(0xC021000E)
+#define STATUS_FVE_OVERLAPPED_UPDATE cpu_to_le32(0xC021000F)
+#define STATUS_FVE_FAILED_SECTOR_SIZE cpu_to_le32(0xC0210010)
+#define STATUS_FVE_FAILED_AUTHENTICATION cpu_to_le32(0xC0210011)
+#define STATUS_FVE_NOT_OS_VOLUME cpu_to_le32(0xC0210012)
+#define STATUS_FVE_KEYFILE_NOT_FOUND cpu_to_le32(0xC0210013)
+#define STATUS_FVE_KEYFILE_INVALID cpu_to_le32(0xC0210014)
+#define STATUS_FVE_KEYFILE_NO_VMK cpu_to_le32(0xC0210015)
+#define STATUS_FVE_TPM_DISABLED cpu_to_le32(0xC0210016)
+#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO cpu_to_le32(0xC0210017)
+#define STATUS_FVE_TPM_INVALID_PCR cpu_to_le32(0xC0210018)
+#define STATUS_FVE_TPM_NO_VMK cpu_to_le32(0xC0210019)
+#define STATUS_FVE_PIN_INVALID cpu_to_le32(0xC021001A)
+#define STATUS_FVE_AUTH_INVALID_APPLICATION cpu_to_le32(0xC021001B)
+#define STATUS_FVE_AUTH_INVALID_CONFIG cpu_to_le32(0xC021001C)
+#define STATUS_FVE_DEBUGGER_ENABLED cpu_to_le32(0xC021001D)
+#define STATUS_FVE_DRY_RUN_FAILED cpu_to_le32(0xC021001E)
+#define STATUS_FVE_BAD_METADATA_POINTER cpu_to_le32(0xC021001F)
+#define STATUS_FVE_OLD_METADATA_COPY cpu_to_le32(0xC0210020)
+#define STATUS_FVE_REBOOT_REQUIRED cpu_to_le32(0xC0210021)
+#define STATUS_FVE_RAW_ACCESS cpu_to_le32(0xC0210022)
+#define STATUS_FVE_RAW_BLOCKED cpu_to_le32(0xC0210023)
+#define STATUS_FWP_CALLOUT_NOT_FOUND cpu_to_le32(0xC0220001)
+#define STATUS_FWP_CONDITION_NOT_FOUND cpu_to_le32(0xC0220002)
+#define STATUS_FWP_FILTER_NOT_FOUND cpu_to_le32(0xC0220003)
+#define STATUS_FWP_LAYER_NOT_FOUND cpu_to_le32(0xC0220004)
+#define STATUS_FWP_PROVIDER_NOT_FOUND cpu_to_le32(0xC0220005)
+#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND cpu_to_le32(0xC0220006)
+#define STATUS_FWP_SUBLAYER_NOT_FOUND cpu_to_le32(0xC0220007)
+#define STATUS_FWP_NOT_FOUND cpu_to_le32(0xC0220008)
+#define STATUS_FWP_ALREADY_EXISTS cpu_to_le32(0xC0220009)
+#define STATUS_FWP_IN_USE cpu_to_le32(0xC022000A)
+#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS cpu_to_le32(0xC022000B)
+#define STATUS_FWP_WRONG_SESSION cpu_to_le32(0xC022000C)
+#define STATUS_FWP_NO_TXN_IN_PROGRESS cpu_to_le32(0xC022000D)
+#define STATUS_FWP_TXN_IN_PROGRESS cpu_to_le32(0xC022000E)
+#define STATUS_FWP_TXN_ABORTED cpu_to_le32(0xC022000F)
+#define STATUS_FWP_SESSION_ABORTED cpu_to_le32(0xC0220010)
+#define STATUS_FWP_INCOMPATIBLE_TXN cpu_to_le32(0xC0220011)
+#define STATUS_FWP_TIMEOUT cpu_to_le32(0xC0220012)
+#define STATUS_FWP_NET_EVENTS_DISABLED cpu_to_le32(0xC0220013)
+#define STATUS_FWP_INCOMPATIBLE_LAYER cpu_to_le32(0xC0220014)
+#define STATUS_FWP_KM_CLIENTS_ONLY cpu_to_le32(0xC0220015)
+#define STATUS_FWP_LIFETIME_MISMATCH cpu_to_le32(0xC0220016)
+#define STATUS_FWP_BUILTIN_OBJECT cpu_to_le32(0xC0220017)
+#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_TOO_MANY_CALLOUTS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_NOTIFICATION_DROPPED cpu_to_le32(0xC0220019)
+#define STATUS_FWP_TRAFFIC_MISMATCH cpu_to_le32(0xC022001A)
+#define STATUS_FWP_INCOMPATIBLE_SA_STATE cpu_to_le32(0xC022001B)
+#define STATUS_FWP_NULL_POINTER cpu_to_le32(0xC022001C)
+#define STATUS_FWP_INVALID_ENUMERATOR cpu_to_le32(0xC022001D)
+#define STATUS_FWP_INVALID_FLAGS cpu_to_le32(0xC022001E)
+#define STATUS_FWP_INVALID_NET_MASK cpu_to_le32(0xC022001F)
+#define STATUS_FWP_INVALID_RANGE cpu_to_le32(0xC0220020)
+#define STATUS_FWP_INVALID_INTERVAL cpu_to_le32(0xC0220021)
+#define STATUS_FWP_ZERO_LENGTH_ARRAY cpu_to_le32(0xC0220022)
+#define STATUS_FWP_NULL_DISPLAY_NAME cpu_to_le32(0xC0220023)
+#define STATUS_FWP_INVALID_ACTION_TYPE cpu_to_le32(0xC0220024)
+#define STATUS_FWP_INVALID_WEIGHT cpu_to_le32(0xC0220025)
+#define STATUS_FWP_MATCH_TYPE_MISMATCH cpu_to_le32(0xC0220026)
+#define STATUS_FWP_TYPE_MISMATCH cpu_to_le32(0xC0220027)
+#define STATUS_FWP_OUT_OF_BOUNDS cpu_to_le32(0xC0220028)
+#define STATUS_FWP_RESERVED cpu_to_le32(0xC0220029)
+#define STATUS_FWP_DUPLICATE_CONDITION cpu_to_le32(0xC022002A)
+#define STATUS_FWP_DUPLICATE_KEYMOD cpu_to_le32(0xC022002B)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002C)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER cpu_to_le32(0xC022002D)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002E)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT cpu_to_le32(0xC022002F)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD cpu_to_le32(0xC0220030)
+#define STATUS_FWP_INCOMPATIBLE_DH_GROUP cpu_to_le32(0xC0220031)
+#define STATUS_FWP_EM_NOT_SUPPORTED cpu_to_le32(0xC0220032)
+#define STATUS_FWP_NEVER_MATCH cpu_to_le32(0xC0220033)
+#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH cpu_to_le32(0xC0220034)
+#define STATUS_FWP_INVALID_PARAMETER cpu_to_le32(0xC0220035)
+#define STATUS_FWP_TOO_MANY_SUBLAYERS cpu_to_le32(0xC0220036)
+#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED cpu_to_le32(0xC0220037)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG cpu_to_le32(0xC0220038)
+#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG cpu_to_le32(0xC0220039)
+#define STATUS_FWP_TCPIP_NOT_READY cpu_to_le32(0xC0220100)
+#define STATUS_FWP_INJECT_HANDLE_CLOSING cpu_to_le32(0xC0220101)
+#define STATUS_FWP_INJECT_HANDLE_STALE cpu_to_le32(0xC0220102)
+#define STATUS_FWP_CANNOT_PEND cpu_to_le32(0xC0220103)
+#define STATUS_NDIS_CLOSING cpu_to_le32(0xC0230002)
+#define STATUS_NDIS_BAD_VERSION cpu_to_le32(0xC0230004)
+#define STATUS_NDIS_BAD_CHARACTERISTICS cpu_to_le32(0xC0230005)
+#define STATUS_NDIS_ADAPTER_NOT_FOUND cpu_to_le32(0xC0230006)
+#define STATUS_NDIS_OPEN_FAILED cpu_to_le32(0xC0230007)
+#define STATUS_NDIS_DEVICE_FAILED cpu_to_le32(0xC0230008)
+#define STATUS_NDIS_MULTICAST_FULL cpu_to_le32(0xC0230009)
+#define STATUS_NDIS_MULTICAST_EXISTS cpu_to_le32(0xC023000A)
+#define STATUS_NDIS_MULTICAST_NOT_FOUND cpu_to_le32(0xC023000B)
+#define STATUS_NDIS_REQUEST_ABORTED cpu_to_le32(0xC023000C)
+#define STATUS_NDIS_RESET_IN_PROGRESS cpu_to_le32(0xC023000D)
+#define STATUS_NDIS_INVALID_PACKET cpu_to_le32(0xC023000F)
+#define STATUS_NDIS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0230010)
+#define STATUS_NDIS_ADAPTER_NOT_READY cpu_to_le32(0xC0230011)
+#define STATUS_NDIS_INVALID_LENGTH cpu_to_le32(0xC0230014)
+#define STATUS_NDIS_INVALID_DATA cpu_to_le32(0xC0230015)
+#define STATUS_NDIS_BUFFER_TOO_SHORT cpu_to_le32(0xC0230016)
+#define STATUS_NDIS_INVALID_OID cpu_to_le32(0xC0230017)
+#define STATUS_NDIS_ADAPTER_REMOVED cpu_to_le32(0xC0230018)
+#define STATUS_NDIS_UNSUPPORTED_MEDIA cpu_to_le32(0xC0230019)
+#define STATUS_NDIS_GROUP_ADDRESS_IN_USE cpu_to_le32(0xC023001A)
+#define STATUS_NDIS_FILE_NOT_FOUND cpu_to_le32(0xC023001B)
+#define STATUS_NDIS_ERROR_READING_FILE cpu_to_le32(0xC023001C)
+#define STATUS_NDIS_ALREADY_MAPPED cpu_to_le32(0xC023001D)
+#define STATUS_NDIS_RESOURCE_CONFLICT cpu_to_le32(0xC023001E)
+#define STATUS_NDIS_MEDIA_DISCONNECTED cpu_to_le32(0xC023001F)
+#define STATUS_NDIS_INVALID_ADDRESS cpu_to_le32(0xC0230022)
+#define STATUS_NDIS_PAUSED cpu_to_le32(0xC023002A)
+#define STATUS_NDIS_INTERFACE_NOT_FOUND cpu_to_le32(0xC023002B)
+#define STATUS_NDIS_UNSUPPORTED_REVISION cpu_to_le32(0xC023002C)
+#define STATUS_NDIS_INVALID_PORT cpu_to_le32(0xC023002D)
+#define STATUS_NDIS_INVALID_PORT_STATE cpu_to_le32(0xC023002E)
+#define STATUS_NDIS_LOW_POWER_STATE cpu_to_le32(0xC023002F)
+#define STATUS_NDIS_NOT_SUPPORTED cpu_to_le32(0xC02300BB)
+#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED cpu_to_le32(0xC0232000)
+#define STATUS_NDIS_DOT11_MEDIA_IN_USE cpu_to_le32(0xC0232001)
+#define STATUS_NDIS_DOT11_POWER_STATE_INVALID cpu_to_le32(0xC0232002)
+#define STATUS_IPSEC_BAD_SPI cpu_to_le32(0xC0360001)
+#define STATUS_IPSEC_SA_LIFETIME_EXPIRED cpu_to_le32(0xC0360002)
+#define STATUS_IPSEC_WRONG_SA cpu_to_le32(0xC0360003)
+#define STATUS_IPSEC_REPLAY_CHECK_FAILED cpu_to_le32(0xC0360004)
+#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005)
+#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006)
+#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007)
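Note that the STATUS_* table above is pre-swapped: every value is wrapped in cpu_to_le32() at definition time, so SMB2 code can compare a status word taken straight off the wire against these constants with no per-use byte swap on either endianness. A minimal sketch of that usage; smb2_is_ipsec_error is a hypothetical helper, not part of this patch:

static bool smb2_is_ipsec_error(__le32 wire_status)
{
	/* both operands are __le32, so the comparison is endian-safe as-is */
	return wire_status == STATUS_IPSEC_BAD_SPI ||
	       wire_status == STATUS_IPSEC_INTEGRITY_CHECK_FAILED;
}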
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c6..148d794 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -176,7 +176,7 @@
 
 	ses = smb2_find_smb_ses(server, shdr->SessionId);
 	if (!ses) {
-		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
 		return 0;
 	}
 
@@ -185,21 +185,21 @@
 
 	rc = smb2_crypto_shash_allocate(server);
 	if (rc) {
-		cifs_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
+		cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_setkey(server->secmech.hmacsha256,
 				 ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
 	}
 
 	shash = &server->secmech.sdeschmacsha256->shash;
 	rc = crypto_shash_init(shash);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init sha256", __func__);
+		cifs_server_dbg(VFS, "%s: Could not init sha256\n", __func__);
 		return rc;
 	}
 
@@ -215,7 +215,7 @@
 		rc = crypto_shash_update(shash, iov[0].iov_base,
 					 iov[0].iov_len);
 		if (rc) {
-			cifs_dbg(VFS, "%s: Could not update with payload\n",
+			cifs_server_dbg(VFS, "%s: Could not update with payload\n",
 				 __func__);
 			return rc;
 		}
@@ -239,68 +239,69 @@
 	int rc = 0;
 	unsigned char prfhash[SMB2_HMACSHA256_SIZE];
 	unsigned char *hashptr = prfhash;
+	struct TCP_Server_Info *server = ses->server;
 
 	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
 	memset(key, 0x0, key_size);
 
-	rc = smb3_crypto_shash_allocate(ses->server);
+	rc = smb3_crypto_shash_allocate(server);
 	if (rc) {
-		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+		cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_setkey(ses->server->secmech.hmacsha256,
+	rc = crypto_shash_setkey(server->secmech.hmacsha256,
 		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash);
+	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
 				i, 4);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
 				label.iov_base, label.iov_len);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
 				&zero, 1);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
 				context.iov_base, context.iov_len);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
 				L, 4);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
 				hashptr);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
 		goto smb3signkey_ret;
 	}
 
@@ -436,7 +437,7 @@
 
 	ses = smb2_find_smb_ses(server, shdr->SessionId);
 	if (!ses) {
-		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
 		return 0;
 	}
 
@@ -446,7 +447,7 @@
 	rc = crypto_shash_setkey(server->secmech.cmacaes,
 				 ses->smb3signingkey, SMB2_CMACAES_SIZE);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
 		return rc;
 	}
 
@@ -457,7 +458,7 @@
 	 */
 	rc = crypto_shash_init(shash);
 	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+		cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
 		return rc;
 	}
 
@@ -473,7 +474,7 @@
 		rc = crypto_shash_update(shash, iov[0].iov_base,
 					 iov[0].iov_len);
 		if (rc) {
-			cifs_dbg(VFS, "%s: Could not update with payload\n",
+			cifs_server_dbg(VFS, "%s: Could not update with payload\n",
 				 __func__);
 			return rc;
 		}
@@ -521,6 +522,7 @@
 	if ((shdr->Command == SMB2_NEGOTIATE) ||
 	    (shdr->Command == SMB2_SESSION_SETUP) ||
 	    (shdr->Command == SMB2_OPLOCK_BREAK) ||
+	    server->ignore_signature ||
 	    (!server->session_estab))
 		return 0;
 
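The hunk above adds server->ignore_signature to the conditions under which request signing is skipped. Collapsed into a predicate, the request goes out unsigned when any of the following holds; smb2_should_skip_signing is a hypothetical name for the inlined check, shown only for readability:

static bool smb2_should_skip_signing(const struct TCP_Server_Info *server,
				     const struct smb2_sync_hdr *shdr)
{
	return shdr->Command == SMB2_NEGOTIATE ||
	       shdr->Command == SMB2_SESSION_SETUP ||
	       shdr->Command == SMB2_OPLOCK_BREAK ||
	       server->ignore_signature ||	/* new in this patch */
	       !server->session_estab;
}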
@@ -576,6 +578,7 @@
 		     struct TCP_Server_Info *server)
 {
 	struct mid_q_entry *temp;
+	unsigned int credits = le16_to_cpu(shdr->CreditCharge);
 
 	if (server == NULL) {
 		cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +589,7 @@
 	memset(temp, 0, sizeof(struct mid_q_entry));
 	kref_init(&temp->refcount);
 	temp->mid = le64_to_cpu(shdr->MessageId);
+	temp->credits = credits > 0 ? credits : 1;
 	temp->pid = current->pid;
 	temp->command = shdr->Command; /* Always LE */
 	temp->when_alloc = jiffies;
@@ -600,6 +604,8 @@
 
 	atomic_inc(&midCount);
 	temp->mid_state = MID_REQUEST_ALLOCATED;
+	trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
+		le16_to_cpu(shdr->Command), temp->mid);
 	return temp;
 }
 
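The new temp->credits field records the header's CreditCharge, clamped to a minimum of one, so the correct number of credits can be handed back if the mid is later cancelled. For background, SMB2 charges one credit per 64 KiB of payload with a floor of one per request; a sketch of that rule, where smb2_credit_charge_for_len is a hypothetical helper and not part of this patch:

static unsigned int smb2_credit_charge_for_len(unsigned int len)
{
	/* one credit per 64 KiB of payload, never fewer than one */
	return DIV_ROUND_UP(len, 65536U) ?: 1;
}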
@@ -615,6 +621,10 @@
 		return -EAGAIN;
 	}
 
+	if (ses->server->tcpStatus == CifsNeedNegotiate &&
+	   shdr->Command != SMB2_NEGOTIATE)
+		return -EAGAIN;
+
 	if (ses->status == CifsNew) {
 		if ((shdr->Command != SMB2_SESSION_SETUP) &&
 		    (shdr->Command != SMB2_NEGOTIATE))
@@ -634,6 +644,7 @@
 	spin_lock(&GlobalMid_Lock);
 	list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
 	spin_unlock(&GlobalMid_Lock);
+
 	return 0;
 }
 
@@ -656,7 +667,7 @@
 
 		rc = smb2_verify_signature(&rqst, server);
 		if (rc)
-			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+			cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n",
 				 rc);
 	}
 
@@ -674,13 +685,18 @@
 	smb2_seq_num_into_buf(ses->server, shdr);
 
 	rc = smb2_get_mid_entry(ses, shdr, &mid);
-	if (rc)
+	if (rc) {
+		revert_current_mid_from_hdr(ses->server, shdr);
 		return ERR_PTR(rc);
+	}
+
 	rc = smb2_sign_rqst(rqst, ses->server);
 	if (rc) {
+		revert_current_mid_from_hdr(ses->server, shdr);
 		cifs_delete_mid(mid);
 		return ERR_PTR(rc);
 	}
+
 	return mid;
 }
 
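Both error paths above now call revert_current_mid_from_hdr() before returning: smb2_seq_num_into_buf() has already consumed a MessageId, and failing to hand it back on error would leave the client's MID counter ahead of what the server expects. The unwind ordering, annotated (this condenses the code in the hunk rather than adding anything new):

rc = smb2_get_mid_entry(ses, shdr, &mid);
if (rc) {
	revert_current_mid_from_hdr(ses->server, shdr);	/* un-consume the MID */
	return ERR_PTR(rc);
}
rc = smb2_sign_rqst(rqst, ses->server);
if (rc) {
	revert_current_mid_from_hdr(ses->server, shdr);	/* un-consume the MID */
	cifs_delete_mid(mid);				/* and unqueue the mid */
	return ERR_PTR(rc);
}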
@@ -692,14 +708,21 @@
 			(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
 	struct mid_q_entry *mid;
 
+	if (server->tcpStatus == CifsNeedNegotiate &&
+	   shdr->Command != SMB2_NEGOTIATE)
+		return ERR_PTR(-EAGAIN);
+
 	smb2_seq_num_into_buf(server, shdr);
 
 	mid = smb2_mid_entry_alloc(shdr, server);
-	if (mid == NULL)
+	if (mid == NULL) {
+		revert_current_mid_from_hdr(server, shdr);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	rc = smb2_sign_rqst(rqst, server);
 	if (rc) {
+		revert_current_mid_from_hdr(server, shdr);
 		DeleteMidQEntry(mid);
 		return ERR_PTR(rc);
 	}
@@ -713,9 +736,12 @@
 	struct crypto_aead *tfm;
 
 	if (!server->secmech.ccmaesencrypt) {
-		tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+		if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+			tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+		else
+			tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
 		if (IS_ERR(tfm)) {
-			cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n",
+			cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n",
 				 __func__);
 			return PTR_ERR(tfm);
 		}
@@ -723,11 +749,14 @@
 	}
 
 	if (!server->secmech.ccmaesdecrypt) {
-		tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+		if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+			tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+		else
+			tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
 		if (IS_ERR(tfm)) {
 			crypto_free_aead(server->secmech.ccmaesencrypt);
 			server->secmech.ccmaesencrypt = NULL;
-			cifs_dbg(VFS, "%s: Failed to alloc decrypt aead\n",
+			cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n",
 				 __func__);
 			return PTR_ERR(tfm);
 		}
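Both allocation sites above now choose the AEAD transform from the negotiated cipher: "gcm(aes)" when the server negotiated SMB2_ENCRYPTION_AES128_GCM, and "ccm(aes)" (the SMB 3.0/3.0.2 cipher) otherwise. The duplicated conditional could be read as the factoring below; smb3_alloc_aead is a hypothetical helper, not introduced by this patch:

static struct crypto_aead *smb3_alloc_aead(struct TCP_Server_Info *server)
{
	const char *alg = server->cipher_type == SMB2_ENCRYPTION_AES128_GCM ?
				"gcm(aes)" : "ccm(aes)";

	/* crypto_alloc_aead() returns an ERR_PTR() on failure */
	return crypto_alloc_aead(alg, 0, 0);
}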
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5fdb9a5..3c91fa9 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) 2017, Microsoft Corporation.
  *
  *   Author(s): Long Li <longli@microsoft.com>
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
  */
 #include <linux/module.h>
 #include <linux/highmem.h>
@@ -164,95 +155,6 @@
 #define log_rdma_mr(level, fmt, args...) \
 		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
 
-/*
- * Destroy the transport and related RDMA and memory resources
- * Need to go through all the pending counters and make sure on one is using
- * the transport while it is destroyed
- */
-static void smbd_destroy_rdma_work(struct work_struct *work)
-{
-	struct smbd_response *response;
-	struct smbd_connection *info =
-		container_of(work, struct smbd_connection, destroy_work);
-	unsigned long flags;
-
-	log_rdma_event(INFO, "destroying qp\n");
-	ib_drain_qp(info->id->qp);
-	rdma_destroy_qp(info->id);
-
-	/* Unblock all I/O waiting on the send queue */
-	wake_up_interruptible_all(&info->wait_send_queue);
-
-	log_rdma_event(INFO, "cancelling idle timer\n");
-	cancel_delayed_work_sync(&info->idle_timer_work);
-	log_rdma_event(INFO, "cancelling send immediate work\n");
-	cancel_delayed_work_sync(&info->send_immediate_work);
-
-	log_rdma_event(INFO, "wait for all send to finish\n");
-	wait_event(info->wait_smbd_send_pending,
-		info->smbd_send_pending == 0);
-
-	log_rdma_event(INFO, "wait for all recv to finish\n");
-	wake_up_interruptible(&info->wait_reassembly_queue);
-	wait_event(info->wait_smbd_recv_pending,
-		info->smbd_recv_pending == 0);
-
-	log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
-	wait_event(info->wait_send_pending,
-		atomic_read(&info->send_pending) == 0);
-	wait_event(info->wait_send_payload_pending,
-		atomic_read(&info->send_payload_pending) == 0);
-
-	log_rdma_event(INFO, "freeing mr list\n");
-	wake_up_interruptible_all(&info->wait_mr);
-	wait_event(info->wait_for_mr_cleanup,
-		atomic_read(&info->mr_used_count) == 0);
-	destroy_mr_list(info);
-
-	/* It's not posssible for upper layer to get to reassembly */
-	log_rdma_event(INFO, "drain the reassembly queue\n");
-	do {
-		spin_lock_irqsave(&info->reassembly_queue_lock, flags);
-		response = _get_first_reassembly(info);
-		if (response) {
-			list_del(&response->list);
-			spin_unlock_irqrestore(
-				&info->reassembly_queue_lock, flags);
-			put_receive_buffer(info, response);
-		} else
-			spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
-	} while (response);
-
-	info->reassembly_data_length = 0;
-
-	log_rdma_event(INFO, "free receive buffers\n");
-	wait_event(info->wait_receive_queues,
-		info->count_receive_queue + info->count_empty_packet_queue
-			== info->receive_credit_max);
-	destroy_receive_buffers(info);
-
-	ib_free_cq(info->send_cq);
-	ib_free_cq(info->recv_cq);
-	ib_dealloc_pd(info->pd);
-	rdma_destroy_id(info->id);
-
-	/* free mempools */
-	mempool_destroy(info->request_mempool);
-	kmem_cache_destroy(info->request_cache);
-
-	mempool_destroy(info->response_mempool);
-	kmem_cache_destroy(info->response_cache);
-
-	info->transport_status = SMBD_DESTROYED;
-	wake_up_all(&info->wait_destroy);
-}
-
-static int smbd_process_disconnected(struct smbd_connection *info)
-{
-	schedule_work(&info->destroy_work);
-	return 0;
-}
-
 static void smbd_disconnect_rdma_work(struct work_struct *work)
 {
 	struct smbd_connection *info =
@@ -319,7 +221,9 @@
 		}
 
 		info->transport_status = SMBD_DISCONNECTED;
-		smbd_process_disconnected(info);
+		wake_up_interruptible(&info->disconn_wait);
+		wake_up_interruptible(&info->wait_reassembly_queue);
+		wake_up_interruptible_all(&info->wait_send_queue);
 		break;
 
 	default:
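With smbd_destroy_rdma_work() and smbd_process_disconnected() removed above, the RDMA_CM_EVENT_DISCONNECTED handler no longer schedules asynchronous teardown; it only marks the transport disconnected and wakes every waiter. The actual resource teardown moves into the rewritten smbd_destroy() later in this file, which initiates the disconnect and then blocks. Condensed, the handshake between the two sides looks like this (both fragments paraphrase code from this patch):

/* CM event handler side */
info->transport_status = SMBD_DISCONNECTED;
wake_up_interruptible(&info->disconn_wait);

/* smbd_destroy() side, upper-layer context */
rdma_disconnect(server->smbd_conn->id);
wait_event_interruptible(info->disconn_wait,
			 info->transport_status == SMBD_DISCONNECTED);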
@@ -940,7 +844,7 @@
 
 	if (info->transport_status != SMBD_CONNECTED) {
 		log_outgoing(ERR, "disconnected not sending\n");
-		return -ENOENT;
+		return -EAGAIN;
 	}
 	atomic_dec(&info->send_credits);
 
@@ -990,7 +894,7 @@
 	request->sge[0].addr = ib_dma_map_single(info->id->device,
 						 (void *)packet,
 						 header_length,
-						 DMA_BIDIRECTIONAL);
+						 DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
 		mempool_free(request, info->request_mempool);
 		rc = -EIO;
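The direction change above (and the matching one in the scatterlist loop below) reflects that send buffers only ever move data toward the device, so DMA_TO_DEVICE is the accurate direction and lets the DMA layer skip the cache maintenance a bidirectional mapping would require. For reference, the recurring map-and-check idiom, sketched with names from this file:

addr = ib_dma_map_single(info->id->device, buf, len, DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, addr))
	return -EIO;	/* never use an unchecked DMA address */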
@@ -1066,6 +970,7 @@
 				wake_up(&info->wait_send_pending);
 		}
 		smbd_disconnect_rdma_connection(info);
+		rc = -EAGAIN;
 	} else
 		/* Reset timer for idle connection after packet is sent */
 		mod_delayed_work(info->workqueue, &info->idle_timer_work,
@@ -1091,7 +996,7 @@
 	for_each_sg(sgl, sg, num_sgs, i) {
 		request->sge[i+1].addr =
 			ib_dma_map_page(info->id->device, sg_page(sg),
-			       sg->offset, sg->length, DMA_BIDIRECTIONAL);
+			       sg->offset, sg->length, DMA_TO_DEVICE);
 		if (ib_dma_mapping_error(
 				info->id->device, request->sge[i+1].addr)) {
 			rc = -EIO;
@@ -1478,17 +1383,97 @@
 			info->keep_alive_interval*HZ);
 }
 
-/* Destroy this SMBD connection, called from upper layer */
-void smbd_destroy(struct smbd_connection *info)
+/*
+ * Destroy the transport and related RDMA and memory resources
+ * Need to go through all the pending counters and make sure no one is using
+ * the transport while it is being destroyed
+ */
+void smbd_destroy(struct TCP_Server_Info *server)
 {
+	struct smbd_connection *info = server->smbd_conn;
+	struct smbd_response *response;
+	unsigned long flags;
+
+	if (!info) {
+		log_rdma_event(INFO, "rdma session already destroyed\n");
+		return;
+	}
+
 	log_rdma_event(INFO, "destroying rdma session\n");
+	if (info->transport_status != SMBD_DISCONNECTED) {
+		rdma_disconnect(server->smbd_conn->id);
+		log_rdma_event(INFO, "wait for transport being disconnected\n");
+		wait_event_interruptible(
+			info->disconn_wait,
+			info->transport_status == SMBD_DISCONNECTED);
+	}
 
-	/* Kick off the disconnection process */
-	smbd_disconnect_rdma_connection(info);
+	log_rdma_event(INFO, "destroying qp\n");
+	ib_drain_qp(info->id->qp);
+	rdma_destroy_qp(info->id);
 
-	log_rdma_event(INFO, "wait for transport being destroyed\n");
-	wait_event(info->wait_destroy,
-		info->transport_status == SMBD_DESTROYED);
+	log_rdma_event(INFO, "cancelling idle timer\n");
+	cancel_delayed_work_sync(&info->idle_timer_work);
+	log_rdma_event(INFO, "cancelling send immediate work\n");
+	cancel_delayed_work_sync(&info->send_immediate_work);
+
+	log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
+	wait_event(info->wait_send_pending,
+		atomic_read(&info->send_pending) == 0);
+	wait_event(info->wait_send_payload_pending,
+		atomic_read(&info->send_payload_pending) == 0);
+
+	/* It's not possible for the upper layer to get to reassembly */
+	log_rdma_event(INFO, "drain the reassembly queue\n");
+	do {
+		spin_lock_irqsave(&info->reassembly_queue_lock, flags);
+		response = _get_first_reassembly(info);
+		if (response) {
+			list_del(&response->list);
+			spin_unlock_irqrestore(
+				&info->reassembly_queue_lock, flags);
+			put_receive_buffer(info, response);
+		} else
+			spin_unlock_irqrestore(
+				&info->reassembly_queue_lock, flags);
+	} while (response);
+	info->reassembly_data_length = 0;
+
+	log_rdma_event(INFO, "free receive buffers\n");
+	wait_event(info->wait_receive_queues,
+		info->count_receive_queue + info->count_empty_packet_queue
+			== info->receive_credit_max);
+	destroy_receive_buffers(info);
+
+	/*
+	 * For performance reasons, memory registration and deregistration
+	 * are not locked by srv_mutex. It is possible some processes are
+	 * blocked on transport srv_mutex while holding memory registration.
+	 * Release the transport srv_mutex to allow them to hit the failure
+	 * path when sending data, and then release memory registrations.
+	 */
+	log_rdma_event(INFO, "freeing mr list\n");
+	wake_up_interruptible_all(&info->wait_mr);
+	while (atomic_read(&info->mr_used_count)) {
+		mutex_unlock(&server->srv_mutex);
+		msleep(1000);
+		mutex_lock(&server->srv_mutex);
+	}
+	destroy_mr_list(info);
+
+	ib_free_cq(info->send_cq);
+	ib_free_cq(info->recv_cq);
+	ib_dealloc_pd(info->pd);
+	rdma_destroy_id(info->id);
+
+	/* free mempools */
+	mempool_destroy(info->request_mempool);
+	kmem_cache_destroy(info->request_cache);
+
+	mempool_destroy(info->response_mempool);
+	kmem_cache_destroy(info->response_cache);
+
+	info->transport_status = SMBD_DESTROYED;
 
 	destroy_workqueue(info->workqueue);
 	kfree(info);
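
A note on the hunk above: smbd_destroy() cannot hold srv_mutex while waiting
for mr_used_count to drain, because the senders holding those registrations
may themselves be blocked on srv_mutex. The unlock/sleep/relock loop reduces
to this pattern (a sketch with generic names, not code from this patch;
needs <linux/mutex.h>, <linux/atomic.h> and <linux/delay.h>):

	/* Drain a use count whose holders may be blocked on the mutex we hold. */
	static void drain_users(struct mutex *lock, atomic_t *users)
	{
		while (atomic_read(users)) {
			mutex_unlock(lock);	/* let blocked holders run and release */
			msleep(1000);		/* coarse poll, as in smbd_destroy() */
			mutex_lock(lock);
		}
	}
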
@@ -1513,17 +1498,9 @@
 	 */
 	if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
 		log_rdma_event(INFO, "disconnecting transport\n");
-		smbd_disconnect_rdma_connection(server->smbd_conn);
+		smbd_destroy(server);
 	}
 
-	/* wait until the transport is destroyed */
-	if (!wait_event_timeout(server->smbd_conn->wait_destroy,
-		server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ))
-		return -EAGAIN;
-
-	destroy_workqueue(server->smbd_conn->workqueue);
-	kfree(server->smbd_conn);
-
 create_conn:
 	log_rdma_event(INFO, "creating rdma session\n");
 	server->smbd_conn = smbd_get_connection(
@@ -1550,7 +1527,7 @@
 	char name[MAX_NAME_LEN];
 	int rc;
 
-	snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+	scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
 	info->request_cache =
 		kmem_cache_create(
 			name,
@@ -1566,7 +1543,7 @@
 	if (!info->request_mempool)
 		goto out1;
 
-	snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+	scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
 	info->response_cache =
 		kmem_cache_create(
 			name,
@@ -1582,7 +1559,7 @@
 	if (!info->response_mempool)
 		goto out3;
 
-	snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+	scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
 	info->workqueue = create_workqueue(name);
 	if (!info->workqueue)
 		goto out4;
@@ -1677,15 +1654,17 @@
 
 	info->send_cq = NULL;
 	info->recv_cq = NULL;
-	info->send_cq = ib_alloc_cq(info->id->device, info,
-			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
+	info->send_cq =
+		ib_alloc_cq_any(info->id->device, info,
+				info->send_credit_target, IB_POLL_SOFTIRQ);
 	if (IS_ERR(info->send_cq)) {
 		info->send_cq = NULL;
 		goto alloc_cq_failed;
 	}
 
-	info->recv_cq = ib_alloc_cq(info->id->device, info,
-			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
+	info->recv_cq =
+		ib_alloc_cq_any(info->id->device, info,
+				info->receive_credit_max, IB_POLL_SOFTIRQ);
 	if (IS_ERR(info->recv_cq)) {
 		info->recv_cq = NULL;
 		goto alloc_cq_failed;
@@ -1724,7 +1703,7 @@
 		info->responder_resources);
 
 	/* Need to send IRD/ORD in private data for iWARP */
-	info->id->device->get_port_immutable(
+	info->id->device->ops.get_port_immutable(
 		info->id->device, info->id->port_num, &port_immutable);
 	if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
 		ird_ord_hdr[0] = info->responder_resources;
@@ -1739,12 +1718,13 @@
 	conn_param.retry_count = SMBD_CM_RETRY;
 	conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
 	conn_param.flow_control = 0;
-	init_waitqueue_head(&info->wait_destroy);
 
 	log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
 		&addr_in->sin_addr, port);
 
 	init_waitqueue_head(&info->conn_wait);
+	init_waitqueue_head(&info->disconn_wait);
+	init_waitqueue_head(&info->wait_reassembly_queue);
 	rc = rdma_connect(info->id, &conn_param);
 	if (rc) {
 		log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
@@ -1768,19 +1748,11 @@
 	}
 
 	init_waitqueue_head(&info->wait_send_queue);
-	init_waitqueue_head(&info->wait_reassembly_queue);
-
 	INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
 	INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
 	queue_delayed_work(info->workqueue, &info->idle_timer_work,
 		info->keep_alive_interval*HZ);
 
-	init_waitqueue_head(&info->wait_smbd_send_pending);
-	info->smbd_send_pending = 0;
-
-	init_waitqueue_head(&info->wait_smbd_recv_pending);
-	info->smbd_recv_pending = 0;
-
 	init_waitqueue_head(&info->wait_send_pending);
 	atomic_set(&info->send_pending, 0);
 
@@ -1788,7 +1760,6 @@
 	atomic_set(&info->send_payload_pending, 0);
 
 	INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
-	INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
 	INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
 	INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
 	info->new_credits_offered = 0;
@@ -1810,7 +1781,7 @@
 
 allocate_mr_failed:
 	/* At this point, need to a full transport shutdown */
-	smbd_destroy(info);
+	smbd_destroy(server);
 	return NULL;
 
 negotiation_failed:
@@ -1882,11 +1853,6 @@
 	int rc;
 
 again:
-	if (info->transport_status != SMBD_CONNECTED) {
-		log_read(ERR, "disconnected\n");
-		return -ENODEV;
-	}
-
 	/*
 	 * No need to hold the reassembly queue lock all the time as we are
 	 * the only one reading from the front of the queue. The transport
@@ -2000,7 +1966,12 @@
 			info->transport_status != SMBD_CONNECTED);
 	/* Don't return any data if interrupted */
 	if (rc)
-		return -ENODEV;
+		return rc;
+
+	if (info->transport_status != SMBD_CONNECTED) {
+		log_read(ERR, "disconnected\n");
+		return 0;
+	}
 
 	goto again;
 }
@@ -2052,16 +2023,22 @@
 	unsigned int to_read, page_offset;
 	int rc;
 
-	info->smbd_recv_pending++;
+	if (iov_iter_rw(&msg->msg_iter) == WRITE) {
+		/* It's a bug in upper layer to get there */
+		cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n",
+			 iov_iter_rw(&msg->msg_iter));
+		rc = -EINVAL;
+		goto out;
+	}
 
-	switch (msg->msg_iter.type) {
-	case READ | ITER_KVEC:
+	switch (iov_iter_type(&msg->msg_iter)) {
+	case ITER_KVEC:
 		buf = msg->msg_iter.kvec->iov_base;
 		to_read = msg->msg_iter.kvec->iov_len;
 		rc = smbd_recv_buf(info, buf, to_read);
 		break;
 
-	case READ | ITER_BVEC:
+	case ITER_BVEC:
 		page = msg->msg_iter.bvec->bv_page;
 		page_offset = msg->msg_iter.bvec->bv_offset;
 		to_read = msg->msg_iter.bvec->bv_len;
@@ -2071,13 +2048,11 @@
 	default:
 		/* It's a bug in upper layer to get there */
 		cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
-			msg->msg_iter.type);
+			 iov_iter_type(&msg->msg_iter));
 		rc = -EINVAL;
 	}
 
-	info->smbd_recv_pending--;
-	wake_up(&info->wait_smbd_recv_pending);
-
+out:
 	/* SMBDirect will read it all or nothing */
 	if (rc > 0)
 		msg->msg_iter.count = 0;
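
For reference, the ITER_KVEC branch above matches a caller that packages a
flat buffer like this (a hedged sketch; the variable names are illustrative):

	struct kvec iov = { .iov_base = buf, .iov_len = to_read };
	struct msghdr msg = { };

	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, to_read);
	rc = smbd_recv(info, &msg);	/* reads all of to_read, or nothing */
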
@@ -2090,7 +2065,8 @@
  * rqst: the data to write
  * return value: 0 if successfully write, otherwise error code
  */
-int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+int smbd_send(struct TCP_Server_Info *server,
+	int num_rqst, struct smb_rqst *rqst_array)
 {
 	struct smbd_connection *info = server->smbd_conn;
 	struct kvec vec;
@@ -2102,54 +2078,51 @@
 		info->max_send_size - sizeof(struct smbd_data_transfer);
 	struct kvec *iov;
 	int rc;
+	struct smb_rqst *rqst;
+	int rqst_idx;
 
-	info->smbd_send_pending++;
 	if (info->transport_status != SMBD_CONNECTED) {
-		rc = -ENODEV;
+		rc = -EAGAIN;
 		goto done;
 	}
 
 	/*
-	 * Skip the RFC1002 length defined in MS-SMB2 section 2.1
-	 * It is used only for TCP transport in the iov[0]
-	 * In future we may want to add a transport layer under protocol
-	 * layer so this will only be issued to TCP transport
-	 */
-
-	if (rqst->rq_iov[0].iov_len != 4) {
-		log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len);
-		return -EINVAL;
-	}
-
-	/*
 	 * Add in the page array if there is one. The caller needs to set
 	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
 	 * ends at page boundary
 	 */
-	buflen = smb_rqst_len(server, rqst);
+	remaining_data_length = 0;
+	for (i = 0; i < num_rqst; i++)
+		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
 
-	if (buflen + sizeof(struct smbd_data_transfer) >
+	if (remaining_data_length + sizeof(struct smbd_data_transfer) >
 		info->max_fragmented_send_size) {
 		log_write(ERR, "payload size %d > max size %d\n",
-			buflen, info->max_fragmented_send_size);
+			remaining_data_length, info->max_fragmented_send_size);
 		rc = -EINVAL;
 		goto done;
 	}
 
-	iov = &rqst->rq_iov[1];
+	log_write(INFO, "num_rqst=%d total length=%u\n",
+			num_rqst, remaining_data_length);
 
-	cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
-	for (i = 0; i < rqst->rq_nvec-1; i++)
+	rqst_idx = 0;
+next_rqst:
+	rqst = &rqst_array[rqst_idx];
+	iov = rqst->rq_iov;
+
+	cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
+		rqst_idx, smb_rqst_len(server, rqst));
+	for (i = 0; i < rqst->rq_nvec; i++)
 		dump_smb(iov[i].iov_base, iov[i].iov_len);
 
-	remaining_data_length = buflen;
 
-	log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
-		"rq_tailsz=%d buflen=%d\n",
-		rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
-		rqst->rq_tailsz, buflen);
+	log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
+		"rq_tailsz=%d buflen=%lu\n",
+		rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
+		rqst->rq_tailsz, smb_rqst_len(server, rqst));
 
-	start = i = iov[0].iov_len ? 0 : 1;
+	start = i = 0;
 	buflen = 0;
 	while (true) {
 		buflen += iov[i].iov_len;
@@ -2197,14 +2170,14 @@
 						goto done;
 				}
 				i++;
-				if (i == rqst->rq_nvec-1)
+				if (i == rqst->rq_nvec)
 					break;
 			}
 			start = i;
 			buflen = 0;
 		} else {
 			i++;
-			if (i == rqst->rq_nvec-1) {
+			if (i == rqst->rq_nvec) {
 				/* send out all remaining vecs */
 				remaining_data_length -= buflen;
 				log_write(INFO,
@@ -2248,6 +2221,10 @@
 		}
 	}
 
+	rqst_idx++;
+	if (rqst_idx < num_rqst)
+		goto next_rqst;
+
 done:
 	/*
 	 * As an optimization, we don't wait for individual I/O to finish
@@ -2259,9 +2236,6 @@
 	wait_event(info->wait_send_payload_pending,
 		atomic_read(&info->send_payload_pending) == 0);
 
-	info->smbd_send_pending--;
-	wake_up(&info->wait_smbd_send_pending);
-
 	return rc;
 }
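
smbd_send() now takes the whole compound chain, mirroring the smb_send_rqst()
change later in this patch; a hedged caller sketch (request construction
elided, MAX_COMPOUND from cifsglob.h):

	struct smb_rqst rqst_array[MAX_COMPOUND];
	int rc;

	/* rqst_array[0..num_rqst-1] filled in by the protocol layer */
	rc = smbd_send(server, num_rqst, rqst_array);
	if (rc == -EAGAIN)
		cifs_dbg(FYI, "transport disconnected, caller will reconnect\n");
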
 
@@ -2295,8 +2269,12 @@
 	int rc;
 
 	list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
-		if (smbdirect_mr->state == MR_INVALIDATED ||
-			smbdirect_mr->state == MR_ERROR) {
+		if (smbdirect_mr->state == MR_INVALIDATED)
+			ib_dma_unmap_sg(
+				info->id->device, smbdirect_mr->sgl,
+				smbdirect_mr->sgl_count,
+				smbdirect_mr->dir);
+		else if (smbdirect_mr->state == MR_ERROR) {
 
 			/* recover this MR entry */
 			rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2320,25 +2298,21 @@
 				smbd_disconnect_rdma_connection(info);
 				continue;
 			}
+		} else
+			/* This MR is being used, don't recover it */
+			continue;
 
-			if (smbdirect_mr->state == MR_INVALIDATED)
-				ib_dma_unmap_sg(
-					info->id->device, smbdirect_mr->sgl,
-					smbdirect_mr->sgl_count,
-					smbdirect_mr->dir);
+		smbdirect_mr->state = MR_READY;
 
-			smbdirect_mr->state = MR_READY;
-
-			/* smbdirect_mr->state is updated by this function
-			 * and is read and updated by I/O issuing CPUs trying
-			 * to get a MR, the call to atomic_inc_return
-			 * implicates a memory barrier and guarantees this
-			 * value is updated before waking up any calls to
-			 * get_mr() from the I/O issuing CPUs
-			 */
-			if (atomic_inc_return(&info->mr_ready_count) == 1)
-				wake_up_interruptible(&info->wait_mr);
-		}
+		/* smbdirect_mr->state is updated by this function
+		 * and is read and updated by I/O issuing CPUs trying
+		 * to get a MR, the call to atomic_inc_return
+		 * implicates a memory barrier and guarantees this
+		 * value is updated before waking up any calls to
+		 * get_mr() from the I/O issuing CPUs
+		 */
+		if (atomic_inc_return(&info->mr_ready_count) == 1)
+			wake_up_interruptible(&info->wait_mr);
 	}
 }
 
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index a110962..6ff880a 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -1,17 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) 2017, Microsoft Corporation.
  *
  *   Author(s): Long Li <longli@microsoft.com>
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
  */
 #ifndef _SMBDIRECT_H
 #define _SMBDIRECT_H
@@ -70,12 +61,11 @@
 	int ri_rc;
 	struct completion ri_done;
 	wait_queue_head_t conn_wait;
-	wait_queue_head_t wait_destroy;
+	wait_queue_head_t disconn_wait;
 
 	struct completion negotiate_completion;
 	bool negotiate_done;
 
-	struct work_struct destroy_work;
 	struct work_struct disconnect_work;
 	struct work_struct recv_done_work;
 	struct work_struct post_send_credits_work;
@@ -123,13 +113,6 @@
 	wait_queue_head_t wait_for_mr_cleanup;
 
 	/* Activity accoutning */
-	/* Pending reqeusts issued from upper layer */
-	int smbd_send_pending;
-	wait_queue_head_t wait_smbd_send_pending;
-
-	int smbd_recv_pending;
-	wait_queue_head_t wait_smbd_recv_pending;
-
 	atomic_t send_pending;
 	wait_queue_head_t wait_send_pending;
 	atomic_t send_payload_pending;
@@ -288,11 +271,12 @@
 /* Reconnect SMBDirect session */
 int smbd_reconnect(struct TCP_Server_Info *server);
 /* Destroy SMBDirect session */
-void smbd_destroy(struct smbd_connection *info);
+void smbd_destroy(struct TCP_Server_Info *server);
 
 /* Interface for carrying upper layer I/O through send/recv */
 int smbd_recv(struct smbd_connection *info, struct msghdr *msg);
-int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst);
+int smbd_send(struct TCP_Server_Info *server,
+	int num_rqst, struct smb_rqst *rqst);
 
 enum mr_state {
 	MR_READY,
@@ -330,9 +314,9 @@
 static inline void *smbd_get_connection(
 	struct TCP_Server_Info *server, struct sockaddr *dstaddr) {return NULL;}
 static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; }
-static inline void smbd_destroy(struct smbd_connection *info) {}
+static inline void smbd_destroy(struct TCP_Server_Info *server) {}
 static inline int smbd_recv(struct smbd_connection *info, struct msghdr *msg) {return -1; }
-static inline int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) {return -1; }
+static inline int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst) {return -1; }
 #endif
 
 #endif
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a0b80ac..39a9384 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
    Unix SMB/Netbios implementation.
    Version 1.9.
@@ -8,28 +9,16 @@
    Copyright (C) Andrew Bartlett <abartlet@samba.org> 2002-2003
    Modified by Steve French (sfrench@us.ibm.com) 2002-2003
 
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
-#include <linux/crypto.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/fips.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/random.h>
+#include <crypto/des.h>
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
 #include "cifspdu.h"
@@ -70,19 +59,18 @@
 smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
 {
 	unsigned char key2[8];
-	struct crypto_cipher *tfm_des;
+	struct des_ctx ctx;
 
 	str_to_key(key, key2);
 
-	tfm_des = crypto_alloc_cipher("des", 0, 0);
-	if (IS_ERR(tfm_des)) {
-		cifs_dbg(VFS, "could not allocate des crypto API\n");
-		return PTR_ERR(tfm_des);
+	if (fips_enabled) {
+		cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n");
+		return -ENOENT;
 	}
 
-	crypto_cipher_setkey(tfm_des, key2, 8);
-	crypto_cipher_encrypt_one(tfm_des, out, in);
-	crypto_free_cipher(tfm_des);
+	des_expand_key(&ctx, key2, DES_KEY_SIZE);
+	des_encrypt(&ctx, out, in);
+	memzero_explicit(&ctx, sizeof(ctx));
 
 	return 0;
 }
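
The conversion above drops the crypto_cipher allocation in favor of the
synchronous DES library in <crypto/des.h>; the one-block call pattern it
relies on looks like this (a hypothetical helper, not part of this patch;
memzero_explicit() is from <linux/string.h>):

	static int des_ecb_one_block(u8 *out, const u8 *in,
				     const u8 key[DES_KEY_SIZE])
	{
		struct des_ctx ctx;

		des_expand_key(&ctx, key, DES_KEY_SIZE);	/* DES_KEY_SIZE == 8 */
		des_encrypt(&ctx, out, in);
		memzero_explicit(&ctx, sizeof(ctx));	/* scrub the key schedule */
		return 0;
	}
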
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index f996dae..1ff2852 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -35,6 +35,33 @@
  * below). Additional detail on less common ones can be found in MS-FSCC
  * section 2.3.
  */
+
+/*
+ * FSCTL values are 32 bits and are constructed as
+ * <device 16bits> <access 2bits> <function 12bits> <method 2bits>
+ */
+/* Device */
+#define FSCTL_DEVICE_DFS                 (0x0006 << 16)
+#define FSCTL_DEVICE_FILE_SYSTEM         (0x0009 << 16)
+#define FSCTL_DEVICE_NAMED_PIPE          (0x0011 << 16)
+#define FSCTL_DEVICE_NETWORK_FILE_SYSTEM (0x0014 << 16)
+#define FSCTL_DEVICE_MASK                0xffff0000
+/* Access */
+#define FSCTL_DEVICE_ACCESS_FILE_ANY_ACCESS        (0x00 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_READ_ACCESS       (0x01 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_WRITE_ACCESS      (0x02 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_READ_WRITE_ACCESS (0x03 << 14)
+#define FSCTL_DEVICE_ACCESS_MASK                   0x0000c000
+/* Function */
+#define FSCTL_DEVICE_FUNCTION_MASK       0x00003ffc
+/* Method */
+#define FSCTL_DEVICE_METHOD_BUFFERED   0x00
+#define FSCTL_DEVICE_METHOD_IN_DIRECT  0x01
+#define FSCTL_DEVICE_METHOD_OUT_DIRECT 0x02
+#define FSCTL_DEVICE_METHOD_NEITHER    0x03
+#define FSCTL_DEVICE_METHOD_MASK       0x00000003
+
+
 #define FSCTL_DFS_GET_REFERRALS      0x00060194
 #define FSCTL_DFS_GET_REFERRALS_EX   0x000601B0
 #define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
@@ -76,7 +103,7 @@
 #define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
 #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
 #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
-#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
 #define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
 #define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
 #define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
@@ -117,6 +144,17 @@
 #define IO_REPARSE_APPXSTREAM	     0xC0000014
 /* NFS symlinks, Win 8/SMB3 and later */
 #define IO_REPARSE_TAG_NFS           0x80000014
+/*
+ * AzureFileSync - see
+ * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
+ */
+#define IO_REPARSE_TAG_AZ_FILE_SYNC  0x8000001e
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK    0xA000001D
+#define IO_REPARSE_TAG_AF_UNIX	     0x80000023
+#define IO_REPARSE_TAG_LX_FIFO	     0x80000024
+#define IO_REPARSE_TAG_LX_CHR	     0x80000025
+#define IO_REPARSE_TAG_LX_BLK	     0x80000026
 
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
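
The masks above are enough to decompose any FSCTL code. As a worked example,
FSCTL_QUERY_ALLOCATED_RANGES (0x000940CF) splits into device, access,
function and method fields like this:

	u32 fsctl    = FSCTL_QUERY_ALLOCATED_RANGES;		/* 0x000940CF */
	u32 device   = fsctl & FSCTL_DEVICE_MASK;		/* 0x00090000, FILE_SYSTEM */
	u32 access   = fsctl & FSCTL_DEVICE_ACCESS_MASK;	/* 0x00004000, READ_ACCESS */
	u32 function = (fsctl & FSCTL_DEVICE_FUNCTION_MASK) >> 2; /* 0x033 */
	u32 method   = fsctl & FSCTL_DEVICE_METHOD_MASK;	/* 0x3, METHOD_NEITHER */
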
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
index bd4a546..4654837 100644
--- a/fs/cifs/trace.c
+++ b/fs/cifs/trace.c
@@ -3,16 +3,6 @@
  *   Copyright (C) 2018, Microsoft Corporation.
  *
  *   Author(s): Steve French <stfrench@microsoft.com>
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
  */
 #define CREATE_TRACE_POINTS
 #include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d4aed52..e7e350b 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -3,16 +3,6 @@
  *   Copyright (C) 2018, Microsoft Corporation.
  *
  *   Author(s): Steve French <stfrench@microsoft.com>
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
  */
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM cifs
@@ -68,6 +58,9 @@
 
 DEFINE_SMB3_RW_ERR_EVENT(write_err);
 DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_RW_ERR_EVENT(zero_err);
+DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
 
 
 /* For logging successful read or write */
@@ -110,12 +103,55 @@
 		__u32	len),			\
 	TP_ARGS(xid, fid, tid, sesid, offset, len))
 
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
 DEFINE_SMB3_RW_DONE_EVENT(write_done);
 DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_RW_DONE_EVENT(zero_done);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
 
 /*
  * For handle based calls other than read and write, and get/set info
  */
+DECLARE_EVENT_CLASS(smb3_fd_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid),
+	TP_ARGS(xid, fid, tid, sesid),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+	),
+	TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid)
+)
+
+#define DEFINE_SMB3_FD_EVENT(name)          \
+DEFINE_EVENT(smb3_fd_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid),			\
+	TP_ARGS(xid, fid, tid, sesid))
+
+DEFINE_SMB3_FD_EVENT(flush_enter);
+DEFINE_SMB3_FD_EVENT(flush_done);
+DEFINE_SMB3_FD_EVENT(close_enter);
+DEFINE_SMB3_FD_EVENT(close_done);
+
 DECLARE_EVENT_CLASS(smb3_fd_err_class,
 	TP_PROTO(unsigned int xid,
 		__u64	fid,
@@ -158,6 +194,50 @@
 /*
  * For handle based query/set info calls
  */
+DECLARE_EVENT_CLASS(smb3_inf_enter_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		__u32	tid,
+		__u64	sesid,
+		__u8	infclass,
+		__u32	type),
+	TP_ARGS(xid, fid, tid, sesid, infclass, type),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u8, infclass)
+		__field(__u32, type)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->infclass = infclass;
+		__entry->type = type;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+		__entry->infclass, __entry->type)
+)
+
+#define DEFINE_SMB3_INF_ENTER_EVENT(name)          \
+DEFINE_EVENT(smb3_inf_enter_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u64	fid,			\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u8	infclass,		\
+		__u32	type),			\
+	TP_ARGS(xid, fid, tid, sesid, infclass, type))
+
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_done);
+DEFINE_SMB3_INF_ENTER_EVENT(notify_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(notify_done);
+
 DECLARE_EVENT_CLASS(smb3_inf_err_class,
 	TP_PROTO(unsigned int xid,
 		__u64	fid,
@@ -203,8 +283,126 @@
 
 DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
 DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(notify_err);
 DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
 
+DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid,
+		const char *full_path),
+	TP_ARGS(xid, tid, sesid, full_path),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__string(path, full_path)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__assign_str(path, full_path);
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
+		__entry->xid, __entry->sesid, __entry->tid,
+		__get_str(path))
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name)     \
+DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		const char *full_path),		\
+	TP_ARGS(xid, tid, sesid, full_path))
+
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid),
+	TP_ARGS(xid, tid, sesid),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name)     \
+DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid),			\
+	TP_ARGS(xid, tid, sesid))
+
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid,
+		int	rc),
+	TP_ARGS(xid, tid, sesid, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->rc = rc;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid,
+		__entry->rc)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name)     \
+DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		int rc),			\
+	TP_ARGS(xid, tid, sesid, rc))
+
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+
 /*
  * For logging SMB3 Status code and Command for responses which return errors
  */
@@ -280,6 +478,7 @@
 		__u64	mid),			\
 	TP_ARGS(tid, sesid, cmd, mid))
 
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter);
 DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
 DEFINE_SMB3_CMD_DONE_EVENT(ses_expired);
 
@@ -374,8 +573,89 @@
 DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done);
 
 /*
- * For smb2/smb3 open call
+ * For SMB2/SMB3 tree connect
  */
+
+DECLARE_EVENT_CLASS(smb3_tcon_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid,
+		const char *unc_name,
+		int	rc),
+	TP_ARGS(xid, tid, sesid, unc_name, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__string(name, unc_name)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__assign_str(name, unc_name);
+		__entry->rc = rc;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid,
+		__get_str(name), __entry->rc)
+)
+
+#define DEFINE_SMB3_TCON_EVENT(name)          \
+DEFINE_EVENT(smb3_tcon_class, smb3_##name,    \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		const char *unc_name,		\
+		int	rc),			\
+	TP_ARGS(xid, tid, sesid, unc_name, rc))
+
+DEFINE_SMB3_TCON_EVENT(tcon);
+
+
+/*
+ * For smb2/smb3 open (including create and mkdir) calls
+ */
+
+DECLARE_EVENT_CLASS(smb3_open_enter_class,
+	TP_PROTO(unsigned int xid,
+		__u32	tid,
+		__u64	sesid,
+		int	create_options,
+		int	desired_access),
+	TP_ARGS(xid, tid, sesid, create_options, desired_access),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(int, create_options)
+		__field(int, desired_access)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->create_options = create_options;
+		__entry->desired_access = desired_access;
+	),
+	TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid,
+		__entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_ENTER_EVENT(name)        \
+DEFINE_EVENT(smb3_open_enter_class, smb3_##name,  \
+	TP_PROTO(unsigned int xid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		int	create_options,		\
+		int	desired_access),	\
+	TP_ARGS(xid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter);
+DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter);
+
 DECLARE_EVENT_CLASS(smb3_open_err_class,
 	TP_PROTO(unsigned int xid,
 		__u32	tid,
@@ -460,6 +740,85 @@
 DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
 DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done);
 
+
+DECLARE_EVENT_CLASS(smb3_lease_done_class,
+	TP_PROTO(__u32	lease_state,
+		__u32	tid,
+		__u64	sesid,
+		__u64	lease_key_low,
+		__u64	lease_key_high),
+	TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high),
+	TP_STRUCT__entry(
+		__field(__u32, lease_state)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, lease_key_low)
+		__field(__u64, lease_key_high)
+	),
+	TP_fast_assign(
+		__entry->lease_state = lease_state;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->lease_key_low = lease_key_low;
+		__entry->lease_key_high = lease_key_high;
+	),
+	TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x",
+		__entry->sesid, __entry->tid, __entry->lease_key_high,
+		__entry->lease_key_low, __entry->lease_state)
+)
+
+#define DEFINE_SMB3_LEASE_DONE_EVENT(name)        \
+DEFINE_EVENT(smb3_lease_done_class, smb3_##name,  \
+	TP_PROTO(__u32	lease_state,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	lease_key_low,		\
+		__u64	lease_key_high),	\
+	TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
+
+DECLARE_EVENT_CLASS(smb3_lease_err_class,
+	TP_PROTO(__u32	lease_state,
+		__u32	tid,
+		__u64	sesid,
+		__u64	lease_key_low,
+		__u64	lease_key_high,
+		int	rc),
+	TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc),
+	TP_STRUCT__entry(
+		__field(__u32, lease_state)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, lease_key_low)
+		__field(__u64, lease_key_high)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->lease_state = lease_state;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->lease_key_low = lease_key_low;
+		__entry->lease_key_high = lease_key_high;
+		__entry->rc = rc;
+	),
+	TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x rc=%d",
+		__entry->sesid, __entry->tid, __entry->lease_key_high,
+		__entry->lease_key_low, __entry->lease_state, __entry->rc)
+)
+
+#define DEFINE_SMB3_LEASE_ERR_EVENT(name)        \
+DEFINE_EVENT(smb3_lease_err_class, smb3_##name,  \
+	TP_PROTO(__u32	lease_state,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	lease_key_low,		\
+		__u64	lease_key_high,		\
+		int	rc),			\
+	TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
+
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+
 DECLARE_EVENT_CLASS(smb3_reconnect_class,
 	TP_PROTO(__u64	currmid,
 		char *hostname),
@@ -486,6 +845,37 @@
 DEFINE_SMB3_RECONNECT_EVENT(reconnect);
 DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect);
 
+DECLARE_EVENT_CLASS(smb3_credit_class,
+	TP_PROTO(__u64	currmid,
+		char *hostname,
+		int credits),
+	TP_ARGS(currmid, hostname, credits),
+	TP_STRUCT__entry(
+		__field(__u64, currmid)
+		__field(char *, hostname)
+		__field(int, credits)
+	),
+	TP_fast_assign(
+		__entry->currmid = currmid;
+		__entry->hostname = hostname;
+		__entry->credits = credits;
+	),
+	TP_printk("server=%s current_mid=0x%llx credits=%d",
+		__entry->hostname,
+		__entry->currmid,
+		__entry->credits)
+)
+
+#define DEFINE_SMB3_CREDIT_EVENT(name)        \
+DEFINE_EVENT(smb3_credit_class, smb3_##name,  \
+	TP_PROTO(__u64	currmid,		\
+		char *hostname,			\
+		int  credits),			\
+	TP_ARGS(currmid, hostname, credits))
+
+DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
+
 #endif /* _CIFS_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
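
Each DEFINE_SMB3_*_EVENT above expands to a trace_smb3_<name>() call that can
be fired from the corresponding code path; a hedged sketch for the new tcon
event (the argument sources are illustrative):

	/* In an SMB2/SMB3 tree connect path: */
	trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree_name, rc);

Once built in, the event can be enabled at runtime through tracefs, e.g. via
/sys/kernel/debug/tracing/events/cifs/smb3_tcon/enable.
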
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 333729c..ca3de62 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -33,6 +33,7 @@
 #include <linux/uaccess.h>
 #include <asm/processor.h>
 #include <linux/mempool.h>
+#include <linux/sched/signal.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -85,25 +86,14 @@
 
 static void _cifs_mid_q_entry_release(struct kref *refcount)
 {
-	struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry,
-					       refcount);
-
-	mempool_free(mid, cifs_mid_poolp);
-}
-
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
-{
-	spin_lock(&GlobalMid_Lock);
-	kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
-	spin_unlock(&GlobalMid_Lock);
-}
-
-void
-DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
+	struct mid_q_entry *midEntry =
+			container_of(refcount, struct mid_q_entry, refcount);
 #ifdef CONFIG_CIFS_STATS2
 	__le16 command = midEntry->server->vals->lock_cmd;
+	__u16 smb_cmd = le16_to_cpu(midEntry->command);
 	unsigned long now;
+	unsigned long roundtrip_time;
+	struct TCP_Server_Info *server = midEntry->server;
 #endif
 	midEntry->mid_state = MID_FREE;
 	atomic_dec(&midCount);
@@ -113,28 +103,68 @@
 		cifs_small_buf_release(midEntry->resp_buf);
 #ifdef CONFIG_CIFS_STATS2
 	now = jiffies;
-	/* commands taking longer than one second are indications that
-	   something is wrong, unless it is quite a slow link or server */
-	if (time_after(now, midEntry->when_alloc + HZ) &&
-	    (midEntry->command != command)) {
-		/* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */
-		if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) &&
-		    (le16_to_cpu(midEntry->command) >= 0))
-			cifs_stats_inc(&midEntry->server->smb2slowcmd[le16_to_cpu(midEntry->command)]);
+	if (now < midEntry->when_alloc)
+		cifs_server_dbg(VFS, "invalid mid allocation time\n");
+	roundtrip_time = now - midEntry->when_alloc;
 
-		trace_smb3_slow_rsp(le16_to_cpu(midEntry->command),
-			       midEntry->mid, midEntry->pid,
+	if (smb_cmd < NUMBER_OF_SMB2_COMMANDS) {
+		if (atomic_read(&server->num_cmds[smb_cmd]) == 0) {
+			server->slowest_cmd[smb_cmd] = roundtrip_time;
+			server->fastest_cmd[smb_cmd] = roundtrip_time;
+		} else {
+			if (server->slowest_cmd[smb_cmd] < roundtrip_time)
+				server->slowest_cmd[smb_cmd] = roundtrip_time;
+			else if (server->fastest_cmd[smb_cmd] > roundtrip_time)
+				server->fastest_cmd[smb_cmd] = roundtrip_time;
+		}
+		cifs_stats_inc(&server->num_cmds[smb_cmd]);
+		server->time_per_cmd[smb_cmd] += roundtrip_time;
+	}
+	/*
+	 * commands taking longer than one second (default) can be indications
+	 * that something is wrong, unless it is quite a slow link or a very
+	 * busy server. Note that this calc is unlikely or impossible to wrap
+	 * as long as slow_rsp_threshold is not set way above recommended max
+	 * value (32767 ie 9 hours) and is generally harmless even if wrong
+	 * since only affects debug counters - so leaving the calc as simple
+	 * comparison rather than doing multiple conversions and overflow
+	 * checks
+	 */
+	if ((slow_rsp_threshold != 0) &&
+	    time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) &&
+	    (midEntry->command != command)) {
+		/*
+		 * smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command
+		 * NB: le16_to_cpu returns unsigned so can not be negative below
+		 */
+		if (smb_cmd < NUMBER_OF_SMB2_COMMANDS)
+			cifs_stats_inc(&server->smb2slowcmd[smb_cmd]);
+
+		trace_smb3_slow_rsp(smb_cmd, midEntry->mid, midEntry->pid,
 			       midEntry->when_sent, midEntry->when_received);
 		if (cifsFYI & CIFS_TIMER) {
 			pr_debug(" CIFS slow rsp: cmd %d mid %llu",
 			       midEntry->command, midEntry->mid);
-			pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
+			cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
 			       now - midEntry->when_alloc,
 			       now - midEntry->when_sent,
 			       now - midEntry->when_received);
 		}
 	}
 #endif
+
+	mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+{
+	spin_lock(&GlobalMid_Lock);
+	kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
+	spin_unlock(&GlobalMid_Lock);
+}
+
+void DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
 	cifs_mid_q_entry_release(midEntry);
 }
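
With num_cmds, time_per_cmd, fastest_cmd and slowest_cmd now tracked per SMB2
command in the hunk above, a mean round trip falls out directly; a hedged
sketch (field types assumed from their use above, all values in jiffies):

	u32 n = atomic_read(&server->num_cmds[smb_cmd]);
	unsigned long avg = n ? server->time_per_cmd[smb_cmd] / n : 0;

	cifs_dbg(FYI, "cmd %u: n=%u avg=%lu min=%lu max=%lu jiffies\n",
		 smb_cmd, n, avg, server->fastest_cmd[smb_cmd],
		 server->slowest_cmd[smb_cmd]);
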
 
@@ -142,8 +172,10 @@
 cifs_delete_mid(struct mid_q_entry *mid)
 {
 	spin_lock(&GlobalMid_Lock);
-	list_del_init(&mid->qhead);
-	mid->mid_flags |= MID_DELETED;
+	if (!(mid->mid_flags & MID_DELETED)) {
+		list_del_init(&mid->qhead);
+		mid->mid_flags |= MID_DELETED;
+	}
 	spin_unlock(&GlobalMid_Lock);
 
 	DeleteMidQEntry(mid);
@@ -201,7 +233,7 @@
 			retries++;
 			if (retries >= 14 ||
 			    (!server->noblocksnd && (retries > 2))) {
-				cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+				cifs_server_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
 					 ssocket);
 				return -EAGAIN;
 			}
@@ -215,7 +247,7 @@
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cifs_dbg(VFS, "tcp sent no data\n");
+			cifs_server_dbg(VFS, "tcp sent no data\n");
 			msleep(500);
 			continue;
 		}
@@ -280,6 +312,7 @@
 	int n_vec;
 	unsigned int send_length = 0;
 	unsigned int i, j;
+	sigset_t mask, oldmask;
 	size_t total_len = 0, sent, size;
 	struct socket *ssocket = server->ssocket;
 	struct msghdr smb_msg;
@@ -287,11 +320,17 @@
 	__be32 rfc1002_marker;
 
 	if (cifs_rdma_enabled(server) && server->smbd_conn) {
-		rc = smbd_send(server, rqst);
+		rc = smbd_send(server, num_rqst, rqst);
 		goto smbd_done;
 	}
+
 	if (ssocket == NULL)
-		return -ENOTSOCK;
+		return -EAGAIN;
+
+	if (signal_pending(current)) {
+		cifs_dbg(FYI, "signal is pending before sending any data\n");
+		return -EINTR;
+	}
 
 	/* cork the socket */
 	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -301,17 +340,26 @@
 		send_length += smb_rqst_len(server, &rqst[j]);
 	rfc1002_marker = cpu_to_be32(send_length);
 
+	/*
+	 * We should not allow signals to interrupt the network send because
+	 * any partial send will cause session reconnects thus increasing
+	 * latency of system calls and overload a server with unnecessary
+	 * requests.
+	 */
+
+	sigfillset(&mask);
+	sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
 	/* Generate a rfc1002 marker for SMB2+ */
 	if (server->vals->header_preamble_size == 0) {
 		struct kvec hiov = {
 			.iov_base = &rfc1002_marker,
 			.iov_len  = 4
 		};
-		iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov,
-			      1, 4);
+		iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
 		rc = smb_send_kvec(server, &smb_msg, &sent);
 		if (rc < 0)
-			goto uncork;
+			goto unmask;
 
 		total_len += sent;
 		send_length += 4;
@@ -329,12 +377,11 @@
 			size += iov[i].iov_len;
 		}
 
-		iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC,
-			      iov, n_vec, size);
+		iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size);
 
 		rc = smb_send_kvec(server, &smb_msg, &sent);
 		if (rc < 0)
-			goto uncork;
+			goto unmask;
 
 		total_len += sent;
 
@@ -346,7 +393,7 @@
 			rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
 					     &bvec.bv_offset);
 
-			iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC,
+			iov_iter_bvec(&smb_msg.msg_iter, WRITE,
 				      &bvec, 1, bvec.bv_len);
 			rc = smb_send_kvec(server, &smb_msg, &sent);
 			if (rc < 0)
@@ -356,7 +403,25 @@
 		}
 	}
 
-uncork:
+unmask:
+	sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+	/*
+	 * If signal is pending but we have already sent the whole packet to
+	 * the server we need to return success status to allow a corresponding
+	 * mid entry to be kept in the pending requests queue, thus allowing
+	 * the client to handle responses from the server.
+	 *
+	 * If only part of the packet has been sent there is no need to hide
+	 * interrupt because the session will be reconnected anyway, so there
+	 * won't be any response from the server to handle.
+	 */
+
+	if (signal_pending(current) && (total_len != send_length)) {
+		cifs_dbg(FYI, "signal is pending after attempt to send\n");
+		rc = -EINTR;
+	}
+
 	/* uncork it */
 	val = 0;
 	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -376,9 +441,9 @@
 	}
 smbd_done:
 	if (rc < 0 && rc != -EINTR)
-		cifs_dbg(VFS, "Error %d sending data on socket to server\n",
+		cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
 			 rc);
-	else
+	else if (rc > 0)
 		rc = 0;
 
 	return rc;
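
The unmask logic above is the usual block/send/restore shape; condensed, the
pattern is (a sketch, where partial_send stands for total_len != send_length):

	sigset_t mask, oldmask;

	sigfillset(&mask);
	sigprocmask(SIG_BLOCK, &mask, &oldmask);	/* no signals mid-send */
	/* ... send every kvec and bvec of the request ... */
	sigprocmask(SIG_SETMASK, &oldmask, NULL);	/* restore caller's mask */
	if (signal_pending(current) && partial_send)
		rc = -EINTR;	/* a partial send forces a reconnect anyway */
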
@@ -409,8 +474,8 @@
 	cur_rqst[0].rq_nvec = 1;
 
 	if (!server->ops->init_transform_rq) {
-		cifs_dbg(VFS, "Encryption requested but transform callback "
-			 "is missing\n");
+		cifs_server_dbg(VFS, "Encryption requested but transform "
+				"callback is missing\n");
 		return -EIO;
 	}
 
@@ -441,29 +506,57 @@
 }
 
 static int
-wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
-		      int *credits)
+wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
+		      const int timeout, const int flags,
+		      unsigned int *instance)
 {
 	int rc;
+	int *credits;
+	int optype;
+	long int t;
+
+	if (timeout < 0)
+		t = MAX_JIFFY_OFFSET;
+	else
+		t = msecs_to_jiffies(timeout);
+
+	optype = flags & CIFS_OP_MASK;
+
+	*instance = 0;
+
+	credits = server->ops->get_credits_field(server, optype);
+	/* Since an echo is already inflight, no need to wait to send another */
+	if (*credits <= 0 && optype == CIFS_ECHO_OP)
+		return -EAGAIN;
 
 	spin_lock(&server->req_lock);
-	if (timeout == CIFS_ASYNC_OP) {
+	if ((flags & CIFS_TIMEOUT_MASK) == CIFS_NON_BLOCKING) {
 		/* oplock breaks must not be held up */
 		server->in_flight++;
+		if (server->in_flight > server->max_in_flight)
+			server->max_in_flight = server->in_flight;
 		*credits -= 1;
+		*instance = server->reconnect_instance;
 		spin_unlock(&server->req_lock);
 		return 0;
 	}
 
 	while (1) {
-		if (*credits <= 0) {
+		if (*credits < num_credits) {
 			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
-			rc = wait_event_killable(server->request_q,
-						 has_credits(server, credits));
+			rc = wait_event_killable_timeout(server->request_q,
+				has_credits(server, credits, num_credits), t);
 			cifs_num_waiters_dec(server);
-			if (rc)
-				return rc;
+			if (!rc) {
+				trace_smb3_credit_timeout(server->CurrentMid,
+					server->hostname, num_credits);
+				cifs_server_dbg(VFS, "wait timed out after %d ms\n",
+					 timeout);
+				return -ENOTSUPP;
+			}
+			if (rc == -ERESTARTSYS)
+				return -ERESTARTSYS;
 			spin_lock(&server->req_lock);
 		} else {
 			if (server->tcpStatus == CifsExiting) {
@@ -472,14 +565,55 @@
 			}
 
 			/*
+			 * For normal commands, reserve the last MAX_COMPOUND
+			 * credits to compound requests.
+			 * Otherwise these compounds could be permanently
+			 * starved for credits by single-credit requests.
+			 *
+			 * To prevent spinning CPU, block this thread until
+			 * there are >MAX_COMPOUND credits available.
+			 * But only do this if we already have a lot of
+			 * credits in flight to avoid triggering this check
+			 * for servers that are slow to hand out credits on
+			 * new sessions.
+			 */
+			if (!optype && num_credits == 1 &&
+			    server->in_flight > 2 * MAX_COMPOUND &&
+			    *credits <= MAX_COMPOUND) {
+				spin_unlock(&server->req_lock);
+				cifs_num_waiters_inc(server);
+				rc = wait_event_killable_timeout(
+					server->request_q,
+					has_credits(server, credits,
+						    MAX_COMPOUND + 1),
+					t);
+				cifs_num_waiters_dec(server);
+				if (!rc) {
+					trace_smb3_credit_timeout(
+						server->CurrentMid,
+						server->hostname, num_credits);
+					cifs_server_dbg(VFS, "wait timed out after %d ms\n",
+						 timeout);
+					return -ENOTSUPP;
+				}
+				if (rc == -ERESTARTSYS)
+					return -ERESTARTSYS;
+				spin_lock(&server->req_lock);
+				continue;
+			}
+
+			/*
 			 * Can not count locking commands against total
 			 * as they are allowed to block on server.
 			 */
 
 			/* update # of requests on the wire to server */
-			if (timeout != CIFS_BLOCKING_OP) {
-				*credits -= 1;
-				server->in_flight++;
+			if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
+				*credits -= num_credits;
+				server->in_flight += num_credits;
+				if (server->in_flight > server->max_in_flight)
+					server->max_in_flight = server->in_flight;
+				*instance = server->reconnect_instance;
 			}
 			spin_unlock(&server->req_lock);
 			break;
@@ -489,24 +623,45 @@
 }
 
 static int
-wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
-		      const int optype)
+wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+		      unsigned int *instance)
 {
-	int *val;
+	return wait_for_free_credits(server, 1, -1, flags,
+				     instance);
+}
 
-	val = server->ops->get_credits_field(server, optype);
-	/* Since an echo is already inflight, no need to wait to send another */
-	if (*val <= 0 && optype == CIFS_ECHO_OP)
-		return -EAGAIN;
-	return wait_for_free_credits(server, timeout, val);
+static int
+wait_for_compound_request(struct TCP_Server_Info *server, int num,
+			  const int flags, unsigned int *instance)
+{
+	int *credits;
+
+	credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
+
+	spin_lock(&server->req_lock);
+	if (*credits < num) {
+		/*
+		 * Return immediately if not too many requests in flight since
+		 * we will likely be stuck on waiting for credits.
+		 */
+		if (server->in_flight < num - *credits) {
+			spin_unlock(&server->req_lock);
+			return -ENOTSUPP;
+		}
+	}
+	spin_unlock(&server->req_lock);
+
+	return wait_for_free_credits(server, num, 60000, flags,
+				     instance);
 }
 
 int
 cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
-		      unsigned int *num, unsigned int *credits)
+		      unsigned int *num, struct cifs_credits *credits)
 {
 	*num = size;
-	*credits = 0;
+	credits->value = 0;
+	credits->instance = server->reconnect_instance;
 	return 0;
 }
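
struct cifs_credits pairs a credit count with the reconnect_instance it was
granted under, which is what lets senders detect stale credits; the check, as
used repeatedly in the hunks below, is essentially:

	struct cifs_credits credits = { .value = 1, .instance = instance };

	if (credits.instance != server->reconnect_instance) {
		/* credits predate a reconnect: give them back and retry */
		add_credits_and_wake_if(server, &credits, optype);
		return -EAGAIN;
	}
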
 
@@ -593,27 +748,43 @@
 int
 cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 		mid_receive_t *receive, mid_callback_t *callback,
-		mid_handle_t *handle, void *cbdata, const int flags)
+		mid_handle_t *handle, void *cbdata, const int flags,
+		const struct cifs_credits *exist_credits)
 {
-	int rc, timeout, optype;
+	int rc;
 	struct mid_q_entry *mid;
-	unsigned int credits = 0;
+	struct cifs_credits credits = { .value = 0, .instance = 0 };
+	unsigned int instance;
+	int optype;
 
-	timeout = flags & CIFS_TIMEOUT_MASK;
 	optype = flags & CIFS_OP_MASK;
 
 	if ((flags & CIFS_HAS_CREDITS) == 0) {
-		rc = wait_for_free_request(server, timeout, optype);
+		rc = wait_for_free_request(server, flags, &instance);
 		if (rc)
 			return rc;
-		credits = 1;
-	}
+		credits.value = 1;
+		credits.instance = instance;
+	} else
+		instance = exist_credits->instance;
 
 	mutex_lock(&server->srv_mutex);
+
+	/*
+	 * We can't use credits obtained from the previous session to send this
+	 * request. Check if there were reconnects after we obtained credits and
+	 * return -EAGAIN in such cases to let callers handle it.
+	 */
+	if (instance != server->reconnect_instance) {
+		mutex_unlock(&server->srv_mutex);
+		add_credits_and_wake_if(server, &credits, optype);
+		return -EAGAIN;
+	}
+
 	mid = server->ops->setup_async_request(server, rqst);
 	if (IS_ERR(mid)) {
 		mutex_unlock(&server->srv_mutex);
-		add_credits_and_wake_if(server, credits, optype);
+		add_credits_and_wake_if(server, &credits, optype);
 		return PTR_ERR(mid);
 	}
 
@@ -638,6 +809,7 @@
 	cifs_in_send_dec(server);
 
 	if (rc < 0) {
+		revert_current_mid(server, mid->credits);
 		server->sequence_number -= 2;
 		cifs_delete_mid(mid);
 	}
@@ -647,7 +819,7 @@
 	if (rc == 0)
 		return 0;
 
-	add_credits_and_wake_if(server, credits, optype);
+	add_credits_and_wake_if(server, &credits, optype);
 	return rc;
 }
 
@@ -671,7 +843,7 @@
 
 	iov[0].iov_base = in_buf;
 	iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
-	flags |= CIFS_NO_RESP;
+	flags |= CIFS_NO_RSP_BUF;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
 	cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
 
@@ -701,8 +873,11 @@
 		rc = -EHOSTDOWN;
 		break;
 	default:
-		list_del_init(&mid->qhead);
-		cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
+		if (!(mid->mid_flags & MID_DELETED)) {
+			list_del_init(&mid->qhead);
+			mid->mid_flags |= MID_DELETED;
+		}
+		cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
 			 __func__, mid->mid, mid->mid_state);
 		rc = -EIO;
 	}
@@ -743,7 +918,7 @@
 		rc = cifs_verify_signature(&rqst, server,
 					   mid->sequence_number);
 		if (rc)
-			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+			cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n",
 				 rc);
 	}
 
@@ -774,8 +949,29 @@
 }
 
 static void
-cifs_noop_callback(struct mid_q_entry *mid)
+cifs_compound_callback(struct mid_q_entry *mid)
 {
+	struct TCP_Server_Info *server = mid->server;
+	struct cifs_credits credits;
+
+	credits.value = server->ops->get_credits(mid);
+	credits.instance = server->reconnect_instance;
+
+	add_credits(server, &credits, mid->optype);
+}
+
+static void
+cifs_compound_last_callback(struct mid_q_entry *mid)
+{
+	cifs_compound_callback(mid);
+	cifs_wake_up_task(mid);
+}
+
+static void
+cifs_cancelled_callback(struct mid_q_entry *mid)
+{
+	cifs_compound_callback(mid);
+	DeleteMidQEntry(mid);
 }
 
 int
@@ -783,13 +979,16 @@
 		   const int flags, const int num_rqst, struct smb_rqst *rqst,
 		   int *resp_buf_type, struct kvec *resp_iov)
 {
-	int i, j, rc = 0;
-	int timeout, optype;
+	int i, j, optype, rc = 0;
 	struct mid_q_entry *midQ[MAX_COMPOUND];
-	unsigned int credits = 0;
+	bool cancelled_mid[MAX_COMPOUND] = {false};
+	struct cifs_credits credits[MAX_COMPOUND] = {
+		{ .value = 0, .instance = 0 }
+	};
+	unsigned int instance;
 	char *buf;
+	struct TCP_Server_Info *server;
 
-	timeout = flags & CIFS_TIMEOUT_MASK;
 	optype = flags & CIFS_OP_MASK;
 
 	for (i = 0; i < num_rqst; i++)
@@ -800,59 +999,107 @@
 		return -EIO;
 	}
 
-	if (ses->server->tcpStatus == CifsExiting)
+	server = ses->server;
+	if (server->tcpStatus == CifsExiting)
 		return -ENOENT;
 
 	/*
-	 * Ensure that we do not send more than 50 overlapping requests
-	 * to the same server. We may make this configurable later or
-	 * use ses->maxReq.
+	 * Wait for credits to become available for all the requests.
+	 * This approach still leaves the possibility of being stuck waiting for
+	 * credits if the server doesn't grant credits to the outstanding
+	 * requests and if the client is completely idle, not generating any
+	 * other requests.
+	 * This can be handled by the eventual session reconnect.
 	 */
-	rc = wait_for_free_request(ses->server, timeout, optype);
+	rc = wait_for_compound_request(server, num_rqst, flags,
+				       &instance);
 	if (rc)
 		return rc;
 
+	for (i = 0; i < num_rqst; i++) {
+		credits[i].value = 1;
+		credits[i].instance = instance;
+	}
+
 	/*
 	 * Make sure that we sign in the same order that we send on this socket
 	 * and avoid races inside tcp sendmsg code that could cause corruption
 	 * of smb data.
 	 */
 
-	mutex_lock(&ses->server->srv_mutex);
+	mutex_lock(&server->srv_mutex);
+
+	/*
+	 * All the parts of the compound chain must use credits obtained from
+	 * the same session. We cannot use credits obtained from the previous
+	 * session to send this request. Check if there were reconnects after
+	 * we obtained credits and return -EAGAIN in such cases to let callers
+	 * handle it.
+	 */
+	if (instance != server->reconnect_instance) {
+		mutex_unlock(&server->srv_mutex);
+		for (j = 0; j < num_rqst; j++)
+			add_credits(server, &credits[j], optype);
+		return -EAGAIN;
+	}
 
 	for (i = 0; i < num_rqst; i++) {
-		midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
+		midQ[i] = server->ops->setup_request(ses, &rqst[i]);
 		if (IS_ERR(midQ[i])) {
+			revert_current_mid(server, i);
 			for (j = 0; j < i; j++)
 				cifs_delete_mid(midQ[j]);
-			mutex_unlock(&ses->server->srv_mutex);
+			mutex_unlock(&server->srv_mutex);
+
 			/* Update # of requests on wire to server */
-			add_credits(ses->server, 1, optype);
+			for (j = 0; j < num_rqst; j++)
+				add_credits(server, &credits[j], optype);
 			return PTR_ERR(midQ[i]);
 		}
 
 		midQ[i]->mid_state = MID_REQUEST_SUBMITTED;
+		midQ[i]->optype = optype;
 		/*
-		 * We don't invoke the callback compounds unless it is the last
-		 * request.
+		 * Invoke callback for every part of the compound chain
+		 * to calculate credits properly. Wake up this thread only when
+		 * the last element is received.
 		 */
 		if (i < num_rqst - 1)
-			midQ[i]->callback = cifs_noop_callback;
+			midQ[i]->callback = cifs_compound_callback;
+		else
+			midQ[i]->callback = cifs_compound_last_callback;
 	}
-	cifs_in_send_inc(ses->server);
-	rc = smb_send_rqst(ses->server, num_rqst, rqst, flags);
-	cifs_in_send_dec(ses->server);
+	cifs_in_send_inc(server);
+	rc = smb_send_rqst(server, num_rqst, rqst, flags);
+	cifs_in_send_dec(server);
 
 	for (i = 0; i < num_rqst; i++)
 		cifs_save_when_sent(midQ[i]);
 
-	if (rc < 0)
-		ses->server->sequence_number -= 2;
+	if (rc < 0) {
+		revert_current_mid(server, num_rqst);
+		server->sequence_number -= 2;
+	}
 
-	mutex_unlock(&ses->server->srv_mutex);
+	mutex_unlock(&server->srv_mutex);
 
-	if (rc < 0)
+	/*
+	 * If sending failed for some reason, or this is an oplock break that
+	 * we will not receive a response to, return the credits.
+	 */
+	if (rc < 0 || (flags & CIFS_NO_SRV_RSP)) {
+		for (i = 0; i < num_rqst; i++)
+			add_credits(server, &credits[i], optype);
 		goto out;
+	}
+
+	/*
+	 * At this point the request is passed to the network stack - we assume
+	 * that any credits taken from the server structure on the client have
+	 * been spent and we can't return them back. Once we receive responses
+	 * we will collect credits granted by the server in the mid callbacks
+	 * and add those credits to the server structure.
+	 */
 
 	/*
 	 * Compounding is never used during session establish.
@@ -861,41 +1108,36 @@
 		smb311_update_preauth_hash(ses, rqst[0].rq_iov,
 					   rqst[0].rq_nvec);
 
-	if (timeout == CIFS_ASYNC_OP)
-		goto out;
-
 	for (i = 0; i < num_rqst; i++) {
-		rc = wait_for_response(ses->server, midQ[i]);
-		if (rc != 0) {
-			cifs_dbg(FYI, "Cancelling wait for mid %llu\n",
-				 midQ[i]->mid);
-			send_cancel(ses->server, &rqst[i], midQ[i]);
+		rc = wait_for_response(server, midQ[i]);
+		if (rc != 0)
+			break;
+	}
+	if (rc != 0) {
+		for (; i < num_rqst; i++) {
+			cifs_server_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
+				 midQ[i]->mid, le16_to_cpu(midQ[i]->command));
+			send_cancel(server, &rqst[i], midQ[i]);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
 				midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
-				midQ[i]->callback = DeleteMidQEntry;
-				spin_unlock(&GlobalMid_Lock);
-				add_credits(ses->server, 1, optype);
-				return rc;
+				midQ[i]->callback = cifs_cancelled_callback;
+				cancelled_mid[i] = true;
+				credits[i].value = 0;
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
 	}
 
-	for (i = 0; i < num_rqst; i++)
-		if (midQ[i]->resp_buf)
-			credits += ses->server->ops->get_credits(midQ[i]);
-	if (!credits)
-		credits = 1;
-
 	for (i = 0; i < num_rqst; i++) {
 		if (rc < 0)
 			goto out;
 
-		rc = cifs_sync_mid_result(midQ[i], ses->server);
+		rc = cifs_sync_mid_result(midQ[i], server);
 		if (rc != 0) {
-			add_credits(ses->server, credits, optype);
-			return rc;
+			/* mark this mid as cancelled so it is not freed below */
+			cancelled_mid[i] = true;
+			goto out;
 		}
 
 		if (!midQ[i]->resp_buf ||
@@ -908,18 +1150,18 @@
 		buf = (char *)midQ[i]->resp_buf;
 		resp_iov[i].iov_base = buf;
 		resp_iov[i].iov_len = midQ[i]->resp_buf_size +
-			ses->server->vals->header_preamble_size;
+			server->vals->header_preamble_size;
 
 		if (midQ[i]->large_buf)
 			resp_buf_type[i] = CIFS_LARGE_BUFFER;
 		else
 			resp_buf_type[i] = CIFS_SMALL_BUFFER;
 
-		rc = ses->server->ops->check_receive(midQ[i], ses->server,
+		rc = server->ops->check_receive(midQ[i], server,
 						     flags & CIFS_LOG_ERROR);
 
 		/* mark it so buf will not be freed by cifs_delete_mid */
-		if ((flags & CIFS_NO_RESP) == 0)
+		if ((flags & CIFS_NO_RSP_BUF) == 0)
 			midQ[i]->resp_buf = NULL;
 
 	}
@@ -942,9 +1184,10 @@
 	 * This is prevented above by using a noop callback that will not
 	 * wake this thread except for the very last PDU.
 	 */
-	for (i = 0; i < num_rqst; i++)
-		cifs_delete_mid(midQ[i]);
-	add_credits(ses->server, credits, optype);
+	for (i = 0; i < num_rqst; i++) {
+		if (!cancelled_mid[i])
+			cifs_delete_mid(midQ[i]);
+	}
 
 	return rc;
 }
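
The credit handling above hinges on one invariant: every request in a compound carries its own struct cifs_credits, and each batch records the server's reconnect_instance at the time it was granted. A reconnect bumps the instance, so credits taken under a now-dead session can be recognized and returned instead of spent. A minimal userspace model of that bookkeeping — the types and values are illustrative, not the kernel's:

    #include <stdbool.h>

    /* Illustrative model of the patch's credit accounting, not the
     * kernel's actual types. */
    struct credits {
        unsigned int value;      /* requests these credits cover */
        unsigned int instance;   /* server->reconnect_instance at grant time */
    };

    struct server {
        unsigned int reconnect_instance;
        unsigned int avail;
    };

    /* Credits granted under an older instance belong to a session that
     * no longer exists; they must not be spent. */
    static bool credits_valid(const struct server *srv,
                              const struct credits *c)
    {
        return c->instance == srv->reconnect_instance;
    }

    static void return_credits(struct server *srv, const struct credits *c)
    {
        if (credits_valid(srv, c))
            srv->avail += c->value;
    }

    int main(void)
    {
        struct server srv = { .reconnect_instance = 2, .avail = 10 };
        struct credits stale = { .value = 1, .instance = 1 };

        return_credits(&srv, &stale);    /* no-op: instance 1 != 2 */
        return srv.avail == 10 ? 0 : 1;
    }

This is why the -EAGAIN path taken under srv_mutex returns every per-request credit before asking the caller to retry.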
@@ -999,24 +1242,27 @@
 int
 SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	    struct smb_hdr *in_buf, struct smb_hdr *out_buf,
-	    int *pbytes_returned, const int timeout)
+	    int *pbytes_returned, const int flags)
 {
 	int rc = 0;
 	struct mid_q_entry *midQ;
 	unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
 	struct kvec iov = { .iov_base = in_buf, .iov_len = len };
 	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+	struct cifs_credits credits = { .value = 1, .instance = 0 };
+	struct TCP_Server_Info *server;
 
 	if (ses == NULL) {
 		cifs_dbg(VFS, "Null smb session\n");
 		return -EIO;
 	}
-	if (ses->server == NULL) {
+	server = ses->server;
+	if (server == NULL) {
 		cifs_dbg(VFS, "Null tcp session\n");
 		return -EIO;
 	}
 
-	if (ses->server->tcpStatus == CifsExiting)
+	if (server->tcpStatus == CifsExiting)
 		return -ENOENT;
 
 	/* Ensure that we do not send more than 50 overlapping requests
@@ -1024,12 +1270,12 @@
 	   use ses->maxReq */
 
 	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+		cifs_server_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
 			 len);
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses->server, timeout, 0);
+	rc = wait_for_free_request(server, flags, &credits.instance);
 	if (rc)
 		return rc;
 
@@ -1037,73 +1283,70 @@
 	   and avoid races inside tcp sendmsg code that could cause corruption
 	   of smb data */
 
-	mutex_lock(&ses->server->srv_mutex);
+	mutex_lock(&server->srv_mutex);
 
 	rc = allocate_mid(ses, in_buf, &midQ);
 	if (rc) {
 		mutex_unlock(&ses->server->srv_mutex);
 		/* Update # of requests on wire to server */
-		add_credits(ses->server, 1, 0);
+		add_credits(server, &credits, 0);
 		return rc;
 	}
 
-	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
+	rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
 	if (rc) {
-		mutex_unlock(&ses->server->srv_mutex);
+		mutex_unlock(&server->srv_mutex);
 		goto out;
 	}
 
 	midQ->mid_state = MID_REQUEST_SUBMITTED;
 
-	cifs_in_send_inc(ses->server);
-	rc = smb_send(ses->server, in_buf, len);
-	cifs_in_send_dec(ses->server);
+	cifs_in_send_inc(server);
+	rc = smb_send(server, in_buf, len);
+	cifs_in_send_dec(server);
 	cifs_save_when_sent(midQ);
 
 	if (rc < 0)
-		ses->server->sequence_number -= 2;
+		server->sequence_number -= 2;
 
-	mutex_unlock(&ses->server->srv_mutex);
+	mutex_unlock(&server->srv_mutex);
 
 	if (rc < 0)
 		goto out;
 
-	if (timeout == CIFS_ASYNC_OP)
-		goto out;
-
-	rc = wait_for_response(ses->server, midQ);
+	rc = wait_for_response(server, midQ);
 	if (rc != 0) {
-		send_cancel(ses->server, &rqst, midQ);
+		send_cancel(server, &rqst, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 			/* no longer considered to be "in-flight" */
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
-			add_credits(ses->server, 1, 0);
+			add_credits(server, &credits, 0);
 			return rc;
 		}
 		spin_unlock(&GlobalMid_Lock);
 	}
 
-	rc = cifs_sync_mid_result(midQ, ses->server);
+	rc = cifs_sync_mid_result(midQ, server);
 	if (rc != 0) {
-		add_credits(ses->server, 1, 0);
+		add_credits(server, &credits, 0);
 		return rc;
 	}
 
 	if (!midQ->resp_buf || !out_buf ||
 	    midQ->mid_state != MID_RESPONSE_RECEIVED) {
 		rc = -EIO;
-		cifs_dbg(VFS, "Bad MID state?\n");
+		cifs_server_dbg(VFS, "Bad MID state?\n");
 		goto out;
 	}
 
 	*pbytes_returned = get_rfc1002_length(midQ->resp_buf);
 	memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
-	rc = cifs_check_receive(midQ, ses->server, 0);
+	rc = cifs_check_receive(midQ, server, 0);
 out:
 	cifs_delete_mid(midQ);
-	add_credits(ses->server, 1, 0);
+	add_credits(server, &credits, 0);
 
 	return rc;
 }
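
Both compound_send_recv and SendReceive share the same cancellation idiom: a waiter that gives up never frees the mid itself. Holding GlobalMid_Lock, it re-points the mid's callback at a destructor (cifs_cancelled_callback or DeleteMidQEntry) and leaves; whichever thread eventually handles the response runs the destructor. A simplified sketch of that ownership hand-off, with hypothetical types:

    #include <pthread.h>
    #include <stdlib.h>

    struct mid;
    typedef void (*mid_cb)(struct mid *);

    struct mid {
        pthread_mutex_t *lock;   /* stands in for GlobalMid_Lock */
        int submitted;           /* still MID_REQUEST_SUBMITTED? */
        mid_cb callback;         /* run later by the receive thread */
    };

    static void delete_mid(struct mid *m)
    {
        free(m);                 /* plays the role of DeleteMidQEntry */
    }

    /* The waiter swaps the callback under the lock, so the receive
     * thread either wakes the waiter (old callback) or frees the mid
     * (new one), and neither side touches a mid the other freed. */
    static void abandon_mid(struct mid *m)
    {
        pthread_mutex_lock(m->lock);
        if (m->submitted)
            m->callback = delete_mid;
        pthread_mutex_unlock(m->lock);
    }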
@@ -1145,19 +1388,22 @@
 	unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
 	struct kvec iov = { .iov_base = in_buf, .iov_len = len };
 	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+	unsigned int instance;
+	struct TCP_Server_Info *server;
 
 	if (tcon == NULL || tcon->ses == NULL) {
 		cifs_dbg(VFS, "Null smb session\n");
 		return -EIO;
 	}
 	ses = tcon->ses;
+	server = ses->server;
 
-	if (ses->server == NULL) {
+	if (server == NULL) {
 		cifs_dbg(VFS, "Null tcp session\n");
 		return -EIO;
 	}
 
-	if (ses->server->tcpStatus == CifsExiting)
+	if (server->tcpStatus == CifsExiting)
 		return -ENOENT;
 
 	/* Ensure that we do not send more than 50 overlapping requests
@@ -1165,12 +1411,12 @@
 	   use ses->maxReq */
 
 	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+		cifs_tcon_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
 			 len);
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+	rc = wait_for_free_request(server, CIFS_BLOCKING_OP, &instance);
 	if (rc)
 		return rc;
 
@@ -1178,31 +1424,31 @@
 	   and avoid races inside tcp sendmsg code that could cause corruption
 	   of smb data */
 
-	mutex_lock(&ses->server->srv_mutex);
+	mutex_lock(&server->srv_mutex);
 
 	rc = allocate_mid(ses, in_buf, &midQ);
 	if (rc) {
-		mutex_unlock(&ses->server->srv_mutex);
+		mutex_unlock(&server->srv_mutex);
 		return rc;
 	}
 
-	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
+	rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
 	if (rc) {
 		cifs_delete_mid(midQ);
-		mutex_unlock(&ses->server->srv_mutex);
+		mutex_unlock(&server->srv_mutex);
 		return rc;
 	}
 
 	midQ->mid_state = MID_REQUEST_SUBMITTED;
-	cifs_in_send_inc(ses->server);
-	rc = smb_send(ses->server, in_buf, len);
-	cifs_in_send_dec(ses->server);
+	cifs_in_send_inc(server);
+	rc = smb_send(server, in_buf, len);
+	cifs_in_send_dec(server);
 	cifs_save_when_sent(midQ);
 
 	if (rc < 0)
-		ses->server->sequence_number -= 2;
+		server->sequence_number -= 2;
 
-	mutex_unlock(&ses->server->srv_mutex);
+	mutex_unlock(&server->srv_mutex);
 
 	if (rc < 0) {
 		cifs_delete_mid(midQ);
@@ -1210,21 +1456,21 @@
 	}
 
 	/* Wait for a reply - allow signals to interrupt. */
-	rc = wait_event_interruptible(ses->server->response_q,
+	rc = wait_event_interruptible(server->response_q,
 		(!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
-		((ses->server->tcpStatus != CifsGood) &&
-		 (ses->server->tcpStatus != CifsNew)));
+		((server->tcpStatus != CifsGood) &&
+		 (server->tcpStatus != CifsNew)));
 
 	/* Were we interrupted by a signal ? */
 	if ((rc == -ERESTARTSYS) &&
 		(midQ->mid_state == MID_REQUEST_SUBMITTED) &&
-		((ses->server->tcpStatus == CifsGood) ||
-		 (ses->server->tcpStatus == CifsNew))) {
+		((server->tcpStatus == CifsGood) ||
+		 (server->tcpStatus == CifsNew))) {
 
 		if (in_buf->Command == SMB_COM_TRANSACTION2) {
 			/* POSIX lock. We send a NT_CANCEL SMB to cause the
 			   blocking lock to return. */
-			rc = send_cancel(ses->server, &rqst, midQ);
+			rc = send_cancel(server, &rqst, midQ);
 			if (rc) {
 				cifs_delete_mid(midQ);
 				return rc;
@@ -1243,9 +1489,9 @@
 			}
 		}
 
-		rc = wait_for_response(ses->server, midQ);
+		rc = wait_for_response(server, midQ);
 		if (rc) {
-			send_cancel(ses->server, &rqst, midQ);
+			send_cancel(server, &rqst, midQ);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 				/* no longer considered to be "in-flight" */
@@ -1260,20 +1506,20 @@
 		rstart = 1;
 	}
 
-	rc = cifs_sync_mid_result(midQ, ses->server);
+	rc = cifs_sync_mid_result(midQ, server);
 	if (rc != 0)
 		return rc;
 
 	/* rcvd frame is ok */
 	if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
 		rc = -EIO;
-		cifs_dbg(VFS, "Bad MID state?\n");
+		cifs_tcon_dbg(VFS, "Bad MID state?\n");
 		goto out;
 	}
 
 	*pbytes_returned = get_rfc1002_length(midQ->resp_buf);
 	memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
-	rc = cifs_check_receive(midQ, ses->server, 0);
+	rc = cifs_check_receive(midQ, server, 0);
 out:
 	cifs_delete_mid(midQ);
 	if (rstart && rc == -EACCES)
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
index 1506d4f..1a23a1d 100644
--- a/fs/cifs/winucase.c
+++ b/fs/cifs/winucase.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * fs/cifs/winucase.c
  *
  * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
  *
- * This program is free software;  you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY;  without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- * the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program;  if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
  * The const tables in this file were converted from the following info
  * provided by Microsoft:
  *
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 50ddb79..db4ba8f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -31,7 +31,7 @@
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
 
-#define MAX_EA_VALUE_SIZE 65535
+#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
 #define CIFS_XATTR_ATTRIB "cifs.dosattrib"  /* full name: user.cifs.dosattrib */
 #define CIFS_XATTR_CREATETIME "cifs.creationtime"  /* user.cifs.creationtime */
@@ -96,7 +96,6 @@
 		break;
 
 	case XATTR_CIFS_ACL: {
-#ifdef CONFIG_CIFS_ACL
 		struct cifs_ntsd *pacl;
 
 		if (!value)
@@ -117,7 +116,6 @@
 				CIFS_I(inode)->time = 0;
 			kfree(pacl);
 		}
-#endif /* CONFIG_CIFS_ACL */
 		break;
 	}
 
@@ -247,7 +245,6 @@
 		break;
 
 	case XATTR_CIFS_ACL: {
-#ifdef CONFIG_CIFS_ACL
 		u32 acllen;
 		struct cifs_ntsd *pacl;
 
@@ -270,7 +267,6 @@
 			rc = acllen;
 			kfree(pacl);
 		}
-#endif  /* CONFIG_CIFS_ACL */
 		break;
 	}
 
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
index c0e5a7f..ae6759f 100644
--- a/fs/coda/Kconfig
+++ b/fs/coda/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CODA_FS
 	tristate "Coda file system support (advanced network fs)"
 	depends on INET
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 1bab69a..78befb8 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux Coda filesystem routines.
 #
@@ -5,7 +6,8 @@
 obj-$(CONFIG_CODA_FS) += coda.o
 
 coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
-	     coda_linux.o symlink.o pioctl.o sysctl.o 
+	     coda_linux.o symlink.o pioctl.o
+coda-$(CONFIG_SYSCTL) += sysctl.o
 
 # If you want debugging output, please uncomment the following line.
 
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 201fc08..3b8c451 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -21,7 +21,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 845b5a6..06855f6 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,8 +8,8 @@
 #include <linux/time.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
 #include <linux/pagemap.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
@@ -137,11 +137,6 @@
 	struct inode *inode;
 	unsigned long hash = coda_f2i(fid);
 
-	if ( !sb ) {
-		pr_warn("%s: no sb!\n", __func__);
-		return NULL;
-	}
-
 	inode = ilookup5(sb, hash, coda_test_inode, fid);
 	if ( !inode )
 		return NULL;
@@ -153,6 +148,16 @@
 	return inode;
 }
 
+struct coda_file_info *coda_ftoc(struct file *file)
+{
+	struct coda_file_info *cfi = file->private_data;
+
+	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+
+	return cfi;
+
+}
+
 /* the CONTROL inode is made without asking attributes from Venus */
 struct inode *coda_cnode_makectl(struct super_block *sb)
 {
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index d702ba1..1763ff9 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -40,10 +40,9 @@
 	int		   cfi_magic;	  /* magic number */
 	struct file	  *cfi_container; /* container file for this cnode */
 	unsigned int	   cfi_mapcount;  /* nr of times this file is mapped */
+	bool		   cfi_access_intent; /* is access intent supported */
 };
 
-#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
-
 /* flags */
 #define C_VATTR       0x1   /* Validity of vattr in inode */
 #define C_FLUSH       0x2   /* used after a flush */
@@ -54,6 +53,7 @@
 struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
 struct inode *coda_cnode_makectl(struct super_block *sb);
 struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
+struct coda_file_info *coda_ftoc(struct file *file);
 void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
 
 #endif
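
Replacing the CODA_FTOC() cast macro with the coda_ftoc() helper centralizes the magic-number sanity check that call sites previously repeated as an explicit BUG_ON. A userspace model of the same refactor, with a placeholder magic value:

    #include <assert.h>

    #define CODA_MAGIC 0xC0DEC0DE   /* placeholder, not the kernel's value */

    struct coda_file_info {
        int cfi_magic;
        /* ... container file, mapcount, access intent ... */
    };

    /* one checked accessor instead of a bare cast macro at each call site */
    static struct coda_file_info *ftoc(void *private_data)
    {
        struct coda_file_info *cfi = private_data;

        assert(cfi && cfi->cfi_magic == CODA_MAGIC);
        return cfi;
    }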
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index bb0b3e0..f82b59c 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -13,9 +13,19 @@
 void coda_destroy_inodecache(void);
 int __init coda_init_inodecache(void);
 int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
+
+#ifdef CONFIG_SYSCTL
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
+#else
+static inline void coda_sysctl_init(void)
+{
+}
 
+static inline void coda_sysctl_clean(void)
+{
+}
+#endif
 #endif  /*  _CODA_INT_  */
 
 
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index f3d543d..2e1a5a1 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* initialize the debugging variables */
@@ -66,6 +66,25 @@
 	return coda_flags;
 }
 
+static struct timespec64 coda_to_timespec64(struct coda_timespec ts)
+{
+	struct timespec64 ts64 = {
+		.tv_sec = ts.tv_sec,
+		.tv_nsec = ts.tv_nsec,
+	};
+
+	return ts64;
+}
+
+static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
+{
+	struct coda_timespec ts = {
+		.tv_sec = ts64.tv_sec,
+		.tv_nsec = ts64.tv_nsec,
+	};
+
+	return ts;
+}
 
 /* utility functions below */
 void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
@@ -105,11 +124,11 @@
 	if (attr->va_size != -1)
 		inode->i_blocks = (attr->va_size + 511) >> 9;
 	if (attr->va_atime.tv_sec != -1) 
-		inode->i_atime = timespec_to_timespec64(attr->va_atime);
+		inode->i_atime = coda_to_timespec64(attr->va_atime);
 	if (attr->va_mtime.tv_sec != -1)
-		inode->i_mtime = timespec_to_timespec64(attr->va_mtime);
+		inode->i_mtime = coda_to_timespec64(attr->va_mtime);
         if (attr->va_ctime.tv_sec != -1)
-		inode->i_ctime = timespec_to_timespec64(attr->va_ctime);
+		inode->i_ctime = coda_to_timespec64(attr->va_ctime);
 }
 
 
@@ -130,12 +149,12 @@
         vattr->va_uid = (vuid_t) -1; 
         vattr->va_gid = (vgid_t) -1;
         vattr->va_size = (off_t) -1;
-	vattr->va_atime.tv_sec = (time_t) -1;
-	vattr->va_atime.tv_nsec =  (time_t) -1;
-        vattr->va_mtime.tv_sec = (time_t) -1;
-        vattr->va_mtime.tv_nsec = (time_t) -1;
-	vattr->va_ctime.tv_sec = (time_t) -1;
-	vattr->va_ctime.tv_nsec = (time_t) -1;
+	vattr->va_atime.tv_sec = (int64_t) -1;
+	vattr->va_atime.tv_nsec = (long) -1;
+	vattr->va_mtime.tv_sec = (int64_t) -1;
+	vattr->va_mtime.tv_nsec = (long) -1;
+	vattr->va_ctime.tv_sec = (int64_t) -1;
+	vattr->va_ctime.tv_nsec = (long) -1;
         vattr->va_type = C_VNON;
 	vattr->va_fileid = -1;
 	vattr->va_gen = -1;
@@ -175,13 +194,13 @@
                 vattr->va_size = iattr->ia_size;
 	}
         if ( valid & ATTR_ATIME ) {
-		vattr->va_atime = timespec64_to_timespec(iattr->ia_atime);
+		vattr->va_atime = timespec64_to_coda(iattr->ia_atime);
 	}
         if ( valid & ATTR_MTIME ) {
-		vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime);
+		vattr->va_mtime = timespec64_to_coda(iattr->ia_mtime);
 	}
         if ( valid & ATTR_CTIME ) {
-		vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime);
+		vattr->va_ctime = timespec64_to_coda(iattr->ia_ctime);
 	}
 }
 
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 126155c..d5ebd36 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,22 +59,6 @@
 void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
 unsigned short coda_flags_to_cflags(unsigned short);
 
-/* sysctl.h */
-void coda_sysctl_init(void);
-void coda_sysctl_clean(void);
-
-#define CODA_ALLOC(ptr, cast, size) do { \
-    if (size < PAGE_SIZE) \
-        ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
-    else \
-        ptr = (cast)vzalloc((unsigned long) size); \
-    if (!ptr) \
-	pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
-} while (0)
-
-
-#define CODA_FREE(ptr, size) kvfree((ptr))
-
 /* inode to cnode access functions */
 
 static inline struct coda_inode_info *ITOC(struct inode *inode)
diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h
new file mode 100644
index 0000000..52da08c
--- /dev/null
+++ b/fs/coda/coda_psdev.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CODA_PSDEV_H
+#define __CODA_PSDEV_H
+
+#include <linux/backing-dev.h>
+#include <linux/magic.h>
+#include <linux/mutex.h>
+
+#define CODA_PSDEV_MAJOR 67
+#define MAX_CODADEVS  5	   /* how many do we allow */
+
+struct kstatfs;
+
+/* messages between coda filesystem in kernel and Venus */
+struct upc_req {
+	struct list_head	uc_chain;
+	caddr_t			uc_data;
+	u_short			uc_flags;
+	u_short			uc_inSize;  /* Size is at most 5000 bytes */
+	u_short			uc_outSize;
+	u_short			uc_opcode;  /* copied from data to save lookup */
+	int			uc_unique;
+	wait_queue_head_t	uc_sleep;   /* process' wait queue */
+};
+
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
+
+/* communication pending/processing queues */
+struct venus_comm {
+	u_long		    vc_seq;
+	wait_queue_head_t   vc_waitq; /* Venus wait queue */
+	struct list_head    vc_pending;
+	struct list_head    vc_processing;
+	int                 vc_inuse;
+	struct super_block *vc_sb;
+	struct mutex	    vc_mutex;
+};
+
+static inline struct venus_comm *coda_vcp(struct super_block *sb)
+{
+	return (struct venus_comm *)((sb)->s_fs_info);
+}
+
+/* upcalls */
+int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
+int venus_getattr(struct super_block *sb, struct CodaFid *fid,
+		  struct coda_vattr *attr);
+int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
+int venus_lookup(struct super_block *sb, struct CodaFid *fid,
+		 const char *name, int length, int *type,
+		 struct CodaFid *resfid);
+int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
+		kuid_t uid);
+int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
+	       struct file **f);
+int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length,
+		struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_create(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length, int excl, int mode,
+		 struct CodaFid *newfid, struct coda_vattr *attrs);
+int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid,
+		const char *name, int length);
+int venus_remove(struct super_block *sb, struct CodaFid *dirfid,
+		 const char *name, int length);
+int venus_readlink(struct super_block *sb, struct CodaFid *fid,
+		   char *buffer, int *length);
+int venus_rename(struct super_block *sb, struct CodaFid *new_fid,
+		 struct CodaFid *old_fid, size_t old_length,
+		 size_t new_length, const char *old_name,
+		 const char *new_name);
+int venus_link(struct super_block *sb, struct CodaFid *fid,
+		  struct CodaFid *dirfid, const char *name, int len );
+int venus_symlink(struct super_block *sb, struct CodaFid *fid,
+		  const char *name, int len, const char *symname, int symlen);
+int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
+int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
+		 unsigned int cmd, struct PioctlData *data);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes);
+int venus_fsync(struct super_block *sb, struct CodaFid *fid);
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
+int venus_access_intent(struct super_block *sb, struct CodaFid *fid,
+			bool *access_intent_supported,
+			size_t count, loff_t ppos, int type);
+
+/*
+ * Statistics
+ */
+
+extern struct venus_comm coda_comms[];
+#endif
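
This header also pins down the kernel/Venus message lifecycle: an upcall waits on vc_pending until the cache manager reads it, then sits on vc_processing until the matching reply arrives. A rough userspace model of that two-queue hand-off — list handling is simplified, and only the uc_unique idea is borrowed from the header:

    #include <stddef.h>

    struct upc_req {
        struct upc_req *next;
        int uc_unique;               /* matches replies to requests */
    };

    struct venus_comm {
        struct upc_req *pending;     /* not yet read by Venus */
        struct upc_req *processing;  /* read, awaiting a reply */
    };

    static void push(struct upc_req **q, struct upc_req *r)
    {
        r->next = *q;
        *q = r;
    }

    /* Venus reads a request: it migrates from pending to processing. */
    static struct upc_req *venus_read(struct venus_comm *vc)
    {
        struct upc_req *r = vc->pending;

        if (r) {
            vc->pending = r->next;
            push(&vc->processing, r);
        }
        return r;
    }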
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 00876dd..ca40c25 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,7 +23,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
@@ -47,8 +47,8 @@
 	int type = 0;
 
 	if (length > CODA_MAXNAMLEN) {
-		pr_err("name too long: lookup, %s (%*s)\n",
-		       coda_i2s(dir), (int)length, name);
+		pr_err("name too long: lookup, %s %zu\n",
+		       coda_i2s(dir), length);
 		return ERR_PTR(-ENAMETOOLONG);
 	}
 
@@ -356,8 +356,7 @@
 	ino_t ino;
 	int ret;
 
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	cfi = coda_ftoc(coda_file);
 	host_file = cfi->cfi_container;
 
 	cii = ITOC(file_inode(coda_file));
@@ -426,8 +425,7 @@
 	struct file *host_file;
 	int ret;
 
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	cfi = coda_ftoc(coda_file);
 	host_file = cfi->cfi_container;
 
 	if (host_file->f_op->iterate || host_file->f_op->iterate_shared) {
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 1cbc1f2..128d63d 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -20,22 +20,43 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/uio.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_int.h"
 
+struct coda_vm_ops {
+	atomic_t refcnt;
+	struct file *coda_file;
+	const struct vm_operations_struct *host_vm_ops;
+	struct vm_operations_struct vm_ops;
+};
+
 static ssize_t
 coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *coda_file = iocb->ki_filp;
-	struct coda_file_info *cfi = CODA_FTOC(coda_file);
+	struct inode *coda_inode = file_inode(coda_file);
+	struct coda_file_info *cfi = coda_ftoc(coda_file);
+	loff_t ki_pos = iocb->ki_pos;
+	size_t count = iov_iter_count(to);
+	ssize_t ret;
 
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ki_pos, CODA_ACCESS_TYPE_READ);
+	if (ret)
+		goto finish_read;
 
-	return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
+	ret = vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0);
+
+finish_read:
+	venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+			    &cfi->cfi_access_intent,
+			    count, ki_pos, CODA_ACCESS_TYPE_READ_FINISH);
+	return ret;
 }
 
 static ssize_t
@@ -43,13 +64,18 @@
 {
 	struct file *coda_file = iocb->ki_filp;
 	struct inode *coda_inode = file_inode(coda_file);
-	struct coda_file_info *cfi = CODA_FTOC(coda_file);
-	struct file *host_file;
+	struct coda_file_info *cfi = coda_ftoc(coda_file);
+	struct file *host_file = cfi->cfi_container;
+	loff_t ki_pos = iocb->ki_pos;
+	size_t count = iov_iter_count(to);
 	ssize_t ret;
 
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ki_pos, CODA_ACCESS_TYPE_WRITE);
+	if (ret)
+		goto finish_write;
 
-	host_file = cfi->cfi_container;
 	file_start_write(host_file);
 	inode_lock(coda_inode);
 	ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
@@ -58,26 +84,73 @@
 	coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
 	inode_unlock(coda_inode);
 	file_end_write(host_file);
+
+finish_write:
+	venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+			    &cfi->cfi_access_intent,
+			    count, ki_pos, CODA_ACCESS_TYPE_WRITE_FINISH);
 	return ret;
 }
 
+static void
+coda_vm_open(struct vm_area_struct *vma)
+{
+	struct coda_vm_ops *cvm_ops =
+		container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+	atomic_inc(&cvm_ops->refcnt);
+
+	if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
+		cvm_ops->host_vm_ops->open(vma);
+}
+
+static void
+coda_vm_close(struct vm_area_struct *vma)
+{
+	struct coda_vm_ops *cvm_ops =
+		container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+	if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
+		cvm_ops->host_vm_ops->close(vma);
+
+	if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+		vma->vm_ops = cvm_ops->host_vm_ops;
+		fput(cvm_ops->coda_file);
+		kfree(cvm_ops);
+	}
+}
+
 static int
 coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 {
-	struct coda_file_info *cfi;
+	struct inode *coda_inode = file_inode(coda_file);
+	struct coda_file_info *cfi = coda_ftoc(coda_file);
+	struct file *host_file = cfi->cfi_container;
+	struct inode *host_inode = file_inode(host_file);
 	struct coda_inode_info *cii;
-	struct file *host_file;
-	struct inode *coda_inode, *host_inode;
-
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
-	host_file = cfi->cfi_container;
+	struct coda_vm_ops *cvm_ops;
+	loff_t ppos;
+	size_t count;
+	int ret;
 
 	if (!host_file->f_op->mmap)
 		return -ENODEV;
 
-	coda_inode = file_inode(coda_file);
-	host_inode = file_inode(host_file);
+	if (WARN_ON(coda_file != vma->vm_file))
+		return -EIO;
+
+	count = vma->vm_end - vma->vm_start;
+	ppos = vma->vm_pgoff * PAGE_SIZE;
+
+	ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+				  &cfi->cfi_access_intent,
+				  count, ppos, CODA_ACCESS_TYPE_MMAP);
+	if (ret)
+		return ret;
+
+	cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL);
+	if (!cvm_ops)
+		return -ENOMEM;
 
 	cii = ITOC(coda_inode);
 	spin_lock(&cii->c_lock);
@@ -89,6 +162,7 @@
 	 * the container file on us! */
 	else if (coda_inode->i_mapping != host_inode->i_mapping) {
 		spin_unlock(&cii->c_lock);
+		kfree(cvm_ops);
 		return -EBUSY;
 	}
 
@@ -97,7 +171,29 @@
 	cfi->cfi_mapcount++;
 	spin_unlock(&cii->c_lock);
 
-	return call_mmap(host_file, vma);
+	vma->vm_file = get_file(host_file);
+	ret = call_mmap(vma->vm_file, vma);
+
+	if (ret) {
+		/* if call_mmap fails, our caller will put coda_file so we
+		 * should drop the reference to the host_file that we got.
+		 */
+		fput(host_file);
+		kfree(cvm_ops);
+	} else {
+		/* here we add redirects for the open/close vm_operations */
+		cvm_ops->host_vm_ops = vma->vm_ops;
+		if (vma->vm_ops)
+			cvm_ops->vm_ops = *vma->vm_ops;
+
+		cvm_ops->vm_ops.open = coda_vm_open;
+		cvm_ops->vm_ops.close = coda_vm_close;
+		cvm_ops->coda_file = coda_file;
+		atomic_set(&cvm_ops->refcnt, 1);
+
+		vma->vm_ops = &cvm_ops->vm_ops;
+	}
+	return ret;
 }
 
 int coda_open(struct inode *coda_inode, struct file *coda_file)
@@ -127,6 +223,8 @@
 	cfi->cfi_magic = CODA_MAGIC;
 	cfi->cfi_mapcount = 0;
 	cfi->cfi_container = host_file;
+	/* assume access intents are supported unless we hear otherwise */
+	cfi->cfi_access_intent = true;
 
 	BUG_ON(coda_file->private_data != NULL);
 	coda_file->private_data = cfi;
@@ -142,8 +240,7 @@
 	struct inode *host_inode;
 	int err;
 
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	cfi = coda_ftoc(coda_file);
 
 	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
 			  coda_flags, coda_file->f_cred->fsuid);
@@ -185,8 +282,7 @@
 		return err;
 	inode_lock(coda_inode);
 
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+	cfi = coda_ftoc(coda_file);
 	host_file = cfi->cfi_container;
 
 	err = vfs_fsync(host_file, datasync);
@@ -207,4 +303,3 @@
 	.fsync		= coda_fsync,
 	.splice_read	= generic_file_splice_read,
 };
-
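
The coda_vm_ops wrapper introduced above copies the host filesystem's vm_operations_struct and overrides only open and close, so Coda can refcount mappings and hold a reference on coda_file for as long as any VMA uses it, restoring the host table when the last mapping goes away. A cut-down model of the copy-and-override trick, with hypothetical types:

    #include <stdlib.h>

    struct vma;
    struct vm_ops {
        void (*open)(struct vma *);
        void (*close)(struct vma *);
    };

    struct wrapper {
        int refcnt;
        const struct vm_ops *host;   /* original table, restored later */
        struct vm_ops ops;           /* copy with open/close overridden */
    };

    /* in the kernel these reach the wrapper via container_of(vm_ops) */
    static void wrapped_open(struct vma *v)  { (void)v; /* refcnt++ */ }
    static void wrapped_close(struct vma *v) { (void)v; /* refcnt-- */ }

    /* Copy the host table so fault handlers are inherited unchanged,
     * then swap in our open/close hooks. */
    static struct wrapper *wrap(const struct vm_ops *host)
    {
        struct wrapper *w = calloc(1, sizeof(*w));

        if (!w)
            return NULL;
        w->refcnt = 1;
        w->host = host;
        if (host)
            w->ops = *host;
        w->ops.open = wrapped_open;
        w->ops.close = wrapped_close;
        return w;
    }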
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 97424cf..b1c70e2 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -27,7 +27,7 @@
 #include <linux/vmalloc.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
@@ -54,17 +54,11 @@
 	return &ei->vfs_inode;
 }
 
-static void coda_i_callback(struct rcu_head *head)
+static void coda_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
-static void coda_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, coda_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
@@ -104,7 +98,7 @@
 static const struct super_operations coda_super_operations =
 {
 	.alloc_inode	= coda_alloc_inode,
-	.destroy_inode	= coda_destroy_inode,
+	.free_inode	= coda_free_inode,
 	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
@@ -194,6 +188,9 @@
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
 	sb->s_d_op = &coda_dentry_operations;
+	sb->s_time_gran = 1;
+	sb->s_time_min = S64_MIN;
+	sb->s_time_max = S64_MAX;
 
 	error = super_setup_bdi(sb);
 	if (error)
@@ -242,6 +239,7 @@
 	vcp->vc_sb = NULL;
 	sb->s_fs_info = NULL;
 	mutex_unlock(&vcp->vc_mutex);
+	mutex_destroy(&vcp->vc_mutex);
 
 	pr_info("Bye bye.\n");
 }
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index e0c17b7..3aec27e 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -20,8 +20,7 @@
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 /* pioctl ops */
@@ -64,11 +63,8 @@
 	 * Look up the pathname. Note that the pathname is in
 	 * user memory, and namei takes care of this
 	 */
-	if (data.follow)
-		error = user_path(data.path, &path);
-	else
-		error = user_lpath(data.path, &path);
-
+	error = user_path_at(AT_FDCWD, data.path,
+			     data.follow ? LOOKUP_FOLLOW : 0, &path);
 	if (error)
 		return error;
 
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index c5234c2..240669f 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *      	An implementation of a loadable kernel mode driver providing
  *		multiple kernel/user space bidirectional communications links.
  *
  * 		Author: 	Alan Cox <alan@lxorguk.ukuu.org.uk>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
  * 
  *              Adapted to become the Linux 2.0 Coda pseudo device
  *              Peter  Braam  <braam@maths.ox.ac.uk> 
@@ -39,12 +35,10 @@
 #include <linux/device.h>
 #include <linux/pid_namespace.h>
 #include <asm/io.h>
-#include <linux/poll.h>
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 #include "coda_int.h"
@@ -105,8 +99,12 @@
 	ssize_t retval = 0, count = 0;
 	int error;
 
+	/* make sure there is enough to copy out the (opcode, unique) values */
+	if (nbytes < (2 * sizeof(u_int32_t)))
+		return -EINVAL;
+
         /* Peek at the opcode, uniquefier */
-	if (copy_from_user(&hdr, buf, 2 * sizeof(u_long)))
+	if (copy_from_user(&hdr, buf, 2 * sizeof(u_int32_t)))
 	        return -EFAULT;
 
         if (DOWNCALL(hdr.opcode)) {
@@ -124,17 +122,21 @@
 				hdr.opcode, hdr.unique);
 		        nbytes = size;
 		}
-		CODA_ALLOC(dcbuf, union outputArgs *, nbytes);
+		dcbuf = kvmalloc(nbytes, GFP_KERNEL);
+		if (!dcbuf) {
+			retval = -ENOMEM;
+			goto out;
+		}
 		if (copy_from_user(dcbuf, buf, nbytes)) {
-			CODA_FREE(dcbuf, nbytes);
+			kvfree(dcbuf);
 			retval = -EFAULT;
 			goto out;
 		}
 
 		/* what downcall errors does Venus handle ? */
-		error = coda_downcall(vcp, hdr.opcode, dcbuf);
+		error = coda_downcall(vcp, hdr.opcode, dcbuf, nbytes);
 
-		CODA_FREE(dcbuf, nbytes);
+		kvfree(dcbuf);
 		if (error) {
 			pr_warn("%s: coda_downcall error: %d\n",
 				__func__, error);
@@ -187,8 +189,11 @@
 	if (req->uc_opcode == CODA_OPEN_BY_FD) {
 		struct coda_open_by_fd_out *outp =
 			(struct coda_open_by_fd_out *)req->uc_data;
-		if (!outp->oh.result)
+		if (!outp->oh.result) {
 			outp->fh = fget(outp->fd);
+			if (!outp->fh)
+				return -EBADF;
+		}
 	}
 
         wake_up(&req->uc_sleep);
@@ -257,7 +262,7 @@
 		goto out;
 	}
 
-	CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
+	kvfree(req->uc_data);
 	kfree(req);
 out:
 	mutex_unlock(&vcp->vc_mutex);
@@ -319,7 +324,7 @@
 
 		/* Async requests need to be freed here */
 		if (req->uc_flags & CODA_REQ_ASYNC) {
-			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
+			kvfree(req->uc_data);
 			kfree(req);
 			continue;
 		}
@@ -352,13 +357,13 @@
 	.llseek		= noop_llseek,
 };
 
-static int init_coda_psdev(void)
+static int __init init_coda_psdev(void)
 {
 	int i, err = 0;
 	if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) {
 		pr_err("%s: unable to get major %d\n",
 		       __func__, CODA_PSDEV_MAJOR);
-              return -EIO;
+		return -EIO;
 	}
 	coda_psdev_class = class_create(THIS_MODULE, "coda");
 	if (IS_ERR(coda_psdev_class)) {
@@ -383,7 +388,7 @@
 MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
 MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
 MODULE_LICENSE("GPL");
-MODULE_VERSION("6.6");
+MODULE_VERSION("7.0");
 
 static int __init init_coda(void)
 {
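
Two hardening details in the psdev write path above deserve emphasis: nbytes is now validated before anything is copied, and the peek copies two fixed-width u_int32_t values rather than two u_long, whose size differs between 32-bit and 64-bit kernels. A minimal model of the hardened header peek:

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>

    struct hdr { uint32_t opcode, unique; };

    static int peek_header(struct hdr *out, const void *buf, size_t nbytes)
    {
        /* reject writes too short to hold (opcode, unique) */
        if (nbytes < 2 * sizeof(uint32_t))
            return -EINVAL;
        memcpy(out, buf, 2 * sizeof(uint32_t));
        return 0;
    }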
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 202297d..8907d05 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -17,8 +17,7 @@
 #include <linux/pagemap.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
-
+#include "coda_psdev.h"
 #include "coda_linux.h"
 
 static int coda_symlink_filler(struct file *file, struct page *page)
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 0301d45..fda3b70 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -12,7 +12,6 @@
 
 #include "coda_int.h"
 
-#ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
 
 static struct ctl_table coda_table[] = {
@@ -62,13 +61,3 @@
 		fs_table_header = NULL;
 	}
 }
-
-#else
-void coda_sysctl_init(void)
-{
-}
-
-void coda_sysctl_clean(void)
-{
-}
-#endif
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1175a17..eb3b189 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,7 +33,7 @@
 #include <linux/vfs.h>
 
 #include <linux/coda.h>
-#include <linux/coda_psdev.h>
+#include "coda_psdev.h"
 #include "coda_linux.h"
 #include "coda_cache.h"
 
@@ -46,7 +46,7 @@
 {
 	union inputArgs *inp;
 
-	CODA_ALLOC(inp, union inputArgs *, size);
+	inp = kvzalloc(size, GFP_KERNEL);
         if (!inp)
 		return ERR_PTR(-ENOMEM);
 
@@ -85,7 +85,7 @@
 	if (!error)
 		*fidp = outp->coda_root.VFid;
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -104,7 +104,7 @@
 	if (!error)
 		*attr = outp->coda_getattr.attr;
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -123,7 +123,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-        CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -153,7 +153,7 @@
 		*type = outp->coda_lookup.vtype;
 	}
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -173,7 +173,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -194,7 +194,7 @@
 	if (!error)
 		*fh = outp->coda_open_by_fd.fh;
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }	
 
@@ -224,7 +224,7 @@
 		*newfid = outp->coda_mkdir.VFid;
 	}
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;        
 }
 
@@ -262,7 +262,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -295,7 +295,7 @@
 		*newfid = outp->coda_create.VFid;
 	}
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;        
 }
 
@@ -318,7 +318,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -340,7 +340,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -370,7 +370,7 @@
 		*(buffer + retlen) = '\0';
 	}
 
-        CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -398,7 +398,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -433,7 +433,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
@@ -449,7 +449,7 @@
 	inp->coda_fsync.VFid = *fid;
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -467,7 +467,7 @@
 
 	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -543,7 +543,7 @@
 	}
 
  exit:
-	CODA_FREE(inp, insize);
+	kvfree(inp);
 	return error;
 }
 
@@ -553,7 +553,7 @@
         union outputArgs *outp;
         int insize, outsize, error;
         
-	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
+	insize = SIZE(statfs);
 	UPARG(CODA_STATFS);
 
 	error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp);
@@ -565,10 +565,51 @@
 		sfs->f_ffree  = outp->coda_statfs.stat.f_ffree;
 	}
 
-        CODA_FREE(inp, insize);
+	kvfree(inp);
         return error;
 }
 
+int venus_access_intent(struct super_block *sb, struct CodaFid *fid,
+			bool *access_intent_supported,
+			size_t count, loff_t ppos, int type)
+{
+	union inputArgs *inp;
+	union outputArgs *outp;
+	int insize, outsize, error;
+	bool finalizer =
+		type == CODA_ACCESS_TYPE_READ_FINISH ||
+		type == CODA_ACCESS_TYPE_WRITE_FINISH;
+
+	if (!*access_intent_supported && !finalizer)
+		return 0;
+
+	insize = SIZE(access_intent);
+	UPARG(CODA_ACCESS_INTENT);
+
+	inp->coda_access_intent.VFid = *fid;
+	inp->coda_access_intent.count = count;
+	inp->coda_access_intent.pos = ppos;
+	inp->coda_access_intent.type = type;
+
+	error = coda_upcall(coda_vcp(sb), insize,
+			    finalizer ? NULL : &outsize, inp);
+
+	/*
+	 * we have to free the request buffer for synchronous upcalls
+	 * or when asynchronous upcalls fail, but not when asynchronous
+	 * upcalls succeed
+	 */
+	if (!finalizer || error)
+		kvfree(inp);
+
+	/* Chunked access is not supported, or the Coda client is too old */
+	if (error == -EOPNOTSUPP) {
+		*access_intent_supported = false;
+		error = 0;
+	}
+	return error;
+}
+
 /*
  * coda_upcall and coda_downcall routines.
  */
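
venus_access_intent leans on the new asynchronous mode of coda_upcall: passing a NULL outsize queues the request without waiting, and a successfully queued async request is freed by the consumer side, which is why the caller frees inp only for synchronous calls or on failure. A sketch of that ownership convention (not the real coda_upcall signature):

    #include <stdlib.h>

    /* On async success the queue owns buf; otherwise the caller does. */
    static int upcall(void *buf, int *outsize, int (*enqueue)(void *))
    {
        int async = (outsize == NULL);
        int err = enqueue(buf);

        if (async && !err)
            return 0;       /* consumer will free buf later */
        /* synchronous call finished, or async enqueue failed */
        free(buf);
        return err;
    }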
@@ -598,10 +639,12 @@
  * has seen them,
  * - CODA_CLOSE or CODA_RELEASE upcall  (to avoid reference count problems)
  * - CODA_STORE				(to avoid data loss)
+ * - CODA_ACCESS_INTENT                 (to avoid reference count problems)
  */
 #define CODA_INTERRUPTIBLE(r) (!coda_hard && \
 			       (((r)->uc_opcode != CODA_CLOSE && \
 				 (r)->uc_opcode != CODA_STORE && \
+				 (r)->uc_opcode != CODA_ACCESS_INTENT && \
 				 (r)->uc_opcode != CODA_RELEASE) || \
 				(r)->uc_flags & CODA_REQ_READ))
 
@@ -687,21 +730,25 @@
 		goto exit;
 	}
 
-	req->uc_data = (void *)buffer;
-	req->uc_flags = 0;
-	req->uc_inSize = inSize;
-	req->uc_outSize = *outSize ? *outSize : inSize;
-	req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode;
-	req->uc_unique = ++vcp->vc_seq;
-	init_waitqueue_head(&req->uc_sleep);
+	buffer->ih.unique = ++vcp->vc_seq;
 
-	/* Fill in the common input args. */
-	((union inputArgs *)buffer)->ih.unique = req->uc_unique;
+	req->uc_data = (void *)buffer;
+	req->uc_flags = outSize ? 0 : CODA_REQ_ASYNC;
+	req->uc_inSize = inSize;
+	req->uc_outSize = (outSize && *outSize) ? *outSize : inSize;
+	req->uc_opcode = buffer->ih.opcode;
+	req->uc_unique = buffer->ih.unique;
+	init_waitqueue_head(&req->uc_sleep);
 
 	/* Append msg to pending queue and poke Venus. */
 	list_add_tail(&req->uc_chain, &vcp->vc_pending);
-
 	wake_up_interruptible(&vcp->vc_waitq);
+
+	if (req->uc_flags & CODA_REQ_ASYNC) {
+		mutex_unlock(&vcp->vc_mutex);
+		return 0;
+	}
+
 	/* We can be interrupted while we wait for Venus to process
 	 * our request.  If the interrupt occurs before Venus has read
 	 * the request, we dequeue and return. If it occurs after the
@@ -743,20 +790,20 @@
 	sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
 	if (!sig_req) goto exit;
 
-	CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr));
-	if (!sig_req->uc_data) {
+	sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL);
+	if (!sig_inputArgs) {
 		kfree(sig_req);
 		goto exit;
 	}
 
 	error = -EINTR;
-	sig_inputArgs = (union inputArgs *)sig_req->uc_data;
 	sig_inputArgs->ih.opcode = CODA_SIGNAL;
 	sig_inputArgs->ih.unique = req->uc_unique;
 
 	sig_req->uc_flags = CODA_REQ_ASYNC;
 	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
 	sig_req->uc_unique = sig_inputArgs->ih.unique;
+	sig_req->uc_data = (void *)sig_inputArgs;
 	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
 	sig_req->uc_outSize = sizeof(struct coda_in_hdr);
 
@@ -804,12 +851,44 @@
  *
  * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out,
+		  size_t nbytes)
 {
 	struct inode *inode = NULL;
 	struct CodaFid *fid = NULL, *newfid;
 	struct super_block *sb;
 
+	/*
+	 * Make sure we have received enough data from the cache
+	 * manager to populate the necessary fields in the buffer
+	 */
+	switch (opcode) {
+	case CODA_PURGEUSER:
+		if (nbytes < sizeof(struct coda_purgeuser_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPDIR:
+		if (nbytes < sizeof(struct coda_zapdir_out))
+			return -EINVAL;
+		break;
+
+	case CODA_ZAPFILE:
+		if (nbytes < sizeof(struct coda_zapfile_out))
+			return -EINVAL;
+		break;
+
+	case CODA_PURGEFID:
+		if (nbytes < sizeof(struct coda_purgefid_out))
+			return -EINVAL;
+		break;
+
+	case CODA_REPLACE:
+		if (nbytes < sizeof(struct coda_replace_out))
+			return -EINVAL;
+		break;
+	}
+
 	/* Handle invalidation requests. */
 	mutex_lock(&vcp->vc_mutex);
 	sb = vcp->vc_sb;
@@ -879,4 +958,3 @@
 	iput(inode);
 	return 0;
 }
-
diff --git a/fs/compat.c b/fs/compat.c
index 4a0aaaf..436d228 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/compat.c
  *
@@ -9,10 +10,6 @@
  *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
  *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
  *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
  */
 
 #include <linux/compat.h>
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 504b3c3..b7f9ffa 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * 32-bit compatibility support for ELF format executables and core dumps.
  *
  * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
  *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- *
  * Red Hat Author: Roland McGrath.
  *
  * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
@@ -52,7 +49,7 @@
 #define elf_prpsinfo	compat_elf_prpsinfo
 
 #undef ns_to_timeval
-#define ns_to_timeval ns_to_compat_timeval
+#define ns_to_timeval ns_to_old_timeval32
 
 /*
  * To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a9b0094..a7ec2d3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -22,37 +22,21 @@
 #include <linux/smp.h>
 #include <linux/ioctl.h>
 #include <linux/if.h>
-#include <linux/if_bridge.h>
 #include <linux/raid/md_u.h>
-#include <linux/kd.h>
-#include <linux/route.h>
-#include <linux/in6.h>
-#include <linux/ipv6_route.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/vt.h>
 #include <linux/falloc.h>
-#include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/ppp_defs.h>
 #include <linux/ppp-ioctl.h>
 #include <linux/if_pppox.h>
 #include <linux/mtio.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
-#include <linux/fb.h>
-#include <linux/videodev2.h>
-#include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/blkdev.h>
-#include <linux/elevator.h>
 #include <linux/rtc.h>
 #include <linux/pci.h>
 #include <linux/serial.h>
-#include <linux/if_tun.h>
 #include <linux/ctype.h>
 #include <linux/syscalls.h>
-#include <linux/atalk.h>
 #include <linux/gfp.h>
 #include <linux/cec.h>
 
@@ -74,44 +58,17 @@
 #endif
 
 #include <linux/uaccess.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_bonding.h>
 #include <linux/watchdog.h>
 
 #include <linux/soundcard.h>
-#include <linux/lp.h>
-#include <linux/ppdev.h>
-
-#include <linux/atm.h>
-#include <linux/atmarp.h>
-#include <linux/atmclip.h>
-#include <linux/atmdev.h>
-#include <linux/atmioc.h>
-#include <linux/atmlec.h>
-#include <linux/atmmpc.h>
-#include <linux/atmsvc.h>
-#include <linux/atm_tcp.h>
-#include <linux/sonet.h>
-#include <linux/atm_suni.h>
-
-#include <linux/usb.h>
-#include <linux/usbdevice_fs.h>
-#include <linux/nbd.h>
-#include <linux/random.h>
-#include <linux/filter.h>
 
 #include <linux/hiddev.h>
 
-#define __DVB_CORE__
-#include <linux/dvb/audio.h>
-#include <linux/dvb/dmx.h>
-#include <linux/dvb/frontend.h>
-#include <linux/dvb/video.h>
 
 #include <linux/sort.h>
 
 #ifdef CONFIG_SPARC
+#include <linux/fb.h>
 #include <asm/fbio.h>
 #endif
 
@@ -133,71 +90,6 @@
 	return vfs_ioctl(file, cmd, arg);
 }
 
-struct compat_video_event {
-	int32_t		type;
-	compat_time_t	timestamp;
-	union {
-	        video_size_t size;
-		unsigned int frame_rate;
-	} u;
-};
-
-static int do_video_get_event(struct file *file,
-		unsigned int cmd, struct compat_video_event __user *up)
-{
-	struct video_event __user *kevent =
-		compat_alloc_user_space(sizeof(*kevent));
-	int err;
-
-	if (kevent == NULL)
-		return -EFAULT;
-
-	err = do_ioctl(file, cmd, (unsigned long)kevent);
-	if (!err) {
-		err  = convert_in_user(&kevent->type, &up->type);
-		err |= convert_in_user(&kevent->timestamp, &up->timestamp);
-		err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
-		err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
-		err |= convert_in_user(&kevent->u.size.aspect_ratio,
-				&up->u.size.aspect_ratio);
-		if (err)
-			err = -EFAULT;
-	}
-
-	return err;
-}
-
-struct compat_video_still_picture {
-        compat_uptr_t iFrame;
-        int32_t size;
-};
-
-static int do_video_stillpicture(struct file *file,
-		unsigned int cmd, struct compat_video_still_picture __user *up)
-{
-	struct video_still_picture __user *up_native;
-	compat_uptr_t fp;
-	int32_t size;
-	int err;
-
-	err  = get_user(fp, &up->iFrame);
-	err |= get_user(size, &up->size);
-	if (err)
-		return -EFAULT;
-
-	up_native =
-		compat_alloc_user_space(sizeof(struct video_still_picture));
-
-	err =  put_user(compat_ptr(fp), &up_native->iFrame);
-	err |= put_user(size, &up_native->size);
-	if (err)
-		return -EFAULT;
-
-	err = do_ioctl(file, cmd, (unsigned long) up_native);
-
-	return err;
-}
-
 #ifdef CONFIG_BLOCK
 typedef struct sg_io_hdr32 {
 	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
@@ -544,85 +436,6 @@
 #define HCIUARTSETFLAGS		_IOW('U', 203, int)
 #define HCIUARTGETFLAGS		_IOR('U', 204, int)
 
-#define BNEPCONNADD	_IOW('B', 200, int)
-#define BNEPCONNDEL	_IOW('B', 201, int)
-#define BNEPGETCONNLIST	_IOR('B', 210, int)
-#define BNEPGETCONNINFO	_IOR('B', 211, int)
-#define BNEPGETSUPPFEAT	_IOR('B', 212, int)
-
-#define CMTPCONNADD	_IOW('C', 200, int)
-#define CMTPCONNDEL	_IOW('C', 201, int)
-#define CMTPGETCONNLIST	_IOR('C', 210, int)
-#define CMTPGETCONNINFO	_IOR('C', 211, int)
-
-#define HIDPCONNADD	_IOW('H', 200, int)
-#define HIDPCONNDEL	_IOW('H', 201, int)
-#define HIDPGETCONNLIST	_IOR('H', 210, int)
-#define HIDPGETCONNINFO	_IOR('H', 211, int)
-
-
-struct serial_struct32 {
-        compat_int_t    type;
-        compat_int_t    line;
-        compat_uint_t   port;
-        compat_int_t    irq;
-        compat_int_t    flags;
-        compat_int_t    xmit_fifo_size;
-        compat_int_t    custom_divisor;
-        compat_int_t    baud_base;
-        unsigned short  close_delay;
-        char    io_type;
-        char    reserved_char[1];
-        compat_int_t    hub6;
-        unsigned short  closing_wait; /* time to wait before closing */
-        unsigned short  closing_wait2; /* no longer used... */
-        compat_uint_t   iomem_base;
-        unsigned short  iomem_reg_shift;
-        unsigned int    port_high;
-     /* compat_ulong_t  iomap_base FIXME */
-        compat_int_t    reserved[1];
-};
-
-static int serial_struct_ioctl(struct file *file,
-		unsigned cmd, struct serial_struct32 __user *ss32)
-{
-        typedef struct serial_struct32 SS32;
-        int err;
-	struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
-        __u32 udata;
-	unsigned int base;
-	unsigned char *iomem_base;
-
-	if (ss == NULL)
-		return -EFAULT;
-        if (cmd == TIOCSSERIAL) {
-		if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
-		    get_user(udata, &ss32->iomem_base))
-			return -EFAULT;
-		iomem_base = compat_ptr(udata);
-		if (put_user(iomem_base, &ss->iomem_base) ||
-		    convert_in_user(&ss32->iomem_reg_shift,
-		      &ss->iomem_reg_shift) ||
-		    convert_in_user(&ss32->port_high, &ss->port_high) ||
-		    put_user(0UL, &ss->iomap_base))
-			return -EFAULT;
-        }
-	err = do_ioctl(file, cmd, (unsigned long)ss);
-        if (cmd == TIOCGSERIAL && err >= 0) {
-		if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
-		    get_user(iomem_base, &ss->iomem_base))
-			return -EFAULT;
-		base = (unsigned long)iomem_base  >> 32 ?
-			0xffffffff : (unsigned)(unsigned long)iomem_base;
-		if (put_user(base, &ss32->iomem_base) ||
-		    convert_in_user(&ss->iomem_reg_shift,
-		      &ss32->iomem_reg_shift) ||
-		    convert_in_user(&ss->port_high, &ss32->port_high))
-			return -EFAULT;
-        }
-        return err;
-}
-
 #define RTC_IRQP_READ32		_IOR('p', 0x0b, compat_ulong_t)
 #define RTC_IRQP_SET32		_IOW('p', 0x0c, compat_ulong_t)
 #define RTC_EPOCH_READ32	_IOR('p', 0x0d, compat_ulong_t)
@@ -707,60 +520,8 @@
 
 static unsigned int ioctl_pointer[] = {
 /* compatible ioctls first */
-COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
-
-/* Big T */
-COMPATIBLE_IOCTL(TCGETA)
-COMPATIBLE_IOCTL(TCSETA)
-COMPATIBLE_IOCTL(TCSETAW)
-COMPATIBLE_IOCTL(TCSETAF)
-COMPATIBLE_IOCTL(TCSBRK)
-COMPATIBLE_IOCTL(TCXONC)
-COMPATIBLE_IOCTL(TCFLSH)
-COMPATIBLE_IOCTL(TCGETS)
-COMPATIBLE_IOCTL(TCSETS)
-COMPATIBLE_IOCTL(TCSETSW)
-COMPATIBLE_IOCTL(TCSETSF)
-COMPATIBLE_IOCTL(TIOCLINUX)
-COMPATIBLE_IOCTL(TIOCSBRK)
-COMPATIBLE_IOCTL(TIOCGDEV)
-COMPATIBLE_IOCTL(TIOCCBRK)
-COMPATIBLE_IOCTL(TIOCGSID)
-COMPATIBLE_IOCTL(TIOCGICOUNT)
-COMPATIBLE_IOCTL(TIOCGEXCL)
 /* Little t */
-COMPATIBLE_IOCTL(TIOCGETD)
-COMPATIBLE_IOCTL(TIOCSETD)
-COMPATIBLE_IOCTL(TIOCEXCL)
-COMPATIBLE_IOCTL(TIOCNXCL)
-COMPATIBLE_IOCTL(TIOCCONS)
-COMPATIBLE_IOCTL(TIOCGSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSWINSZ)
-COMPATIBLE_IOCTL(TIOCGWINSZ)
-COMPATIBLE_IOCTL(TIOCMGET)
-COMPATIBLE_IOCTL(TIOCMBIC)
-COMPATIBLE_IOCTL(TIOCMBIS)
-COMPATIBLE_IOCTL(TIOCMSET)
-COMPATIBLE_IOCTL(TIOCNOTTY)
-COMPATIBLE_IOCTL(TIOCSTI)
 COMPATIBLE_IOCTL(TIOCOUTQ)
-COMPATIBLE_IOCTL(TIOCSPGRP)
-COMPATIBLE_IOCTL(TIOCGPGRP)
-COMPATIBLE_IOCTL(TIOCSERGETLSR)
-#ifdef TIOCSRS485
-COMPATIBLE_IOCTL(TIOCSRS485)
-#endif
-#ifdef TIOCGRS485
-COMPATIBLE_IOCTL(TIOCGRS485)
-#endif
-#ifdef TCGETS2
-COMPATIBLE_IOCTL(TCGETS2)
-COMPATIBLE_IOCTL(TCSETS2)
-COMPATIBLE_IOCTL(TCSETSW2)
-COMPATIBLE_IOCTL(TCSETSF2)
-#endif
 /* Little f */
 COMPATIBLE_IOCTL(FIOCLEX)
 COMPATIBLE_IOCTL(FIONCLEX)
@@ -775,23 +536,6 @@
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
 COMPATIBLE_IOCTL(FITRIM)
-COMPATIBLE_IOCTL(KDGETKEYCODE)
-COMPATIBLE_IOCTL(KDSETKEYCODE)
-COMPATIBLE_IOCTL(KDGKBTYPE)
-COMPATIBLE_IOCTL(KDGETMODE)
-COMPATIBLE_IOCTL(KDGKBMODE)
-COMPATIBLE_IOCTL(KDGKBMETA)
-COMPATIBLE_IOCTL(KDGKBENT)
-COMPATIBLE_IOCTL(KDSKBENT)
-COMPATIBLE_IOCTL(KDGKBSENT)
-COMPATIBLE_IOCTL(KDSKBSENT)
-COMPATIBLE_IOCTL(KDGKBDIACR)
-COMPATIBLE_IOCTL(KDSKBDIACR)
-COMPATIBLE_IOCTL(KDGKBDIACRUC)
-COMPATIBLE_IOCTL(KDSKBDIACRUC)
-COMPATIBLE_IOCTL(KDKBDREP)
-COMPATIBLE_IOCTL(KDGKBLED)
-COMPATIBLE_IOCTL(KDGETLED)
 #ifdef CONFIG_BLOCK
 /* Big S */
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -894,9 +638,6 @@
 COMPATIBLE_IOCTL(PPPIOCATTCHAN)
 COMPATIBLE_IOCTL(PPPIOCGCHAN)
 COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* PPPOX */
-COMPATIBLE_IOCTL(PPPOEIOCSFWD)
-COMPATIBLE_IOCTL(PPPOEIOCDFWD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
@@ -1106,19 +847,6 @@
 COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
 COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
 COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
-COMPATIBLE_IOCTL(BNEPCONNADD)
-COMPATIBLE_IOCTL(BNEPCONNDEL)
-COMPATIBLE_IOCTL(BNEPGETCONNLIST)
-COMPATIBLE_IOCTL(BNEPGETCONNINFO)
-COMPATIBLE_IOCTL(BNEPGETSUPPFEAT)
-COMPATIBLE_IOCTL(CMTPCONNADD)
-COMPATIBLE_IOCTL(CMTPCONNDEL)
-COMPATIBLE_IOCTL(CMTPGETCONNLIST)
-COMPATIBLE_IOCTL(CMTPGETCONNINFO)
-COMPATIBLE_IOCTL(HIDPCONNADD)
-COMPATIBLE_IOCTL(HIDPCONNDEL)
-COMPATIBLE_IOCTL(HIDPGETCONNLIST)
-COMPATIBLE_IOCTL(HIDPGETCONNINFO)
 /* CAPI */
 COMPATIBLE_IOCTL(CAPI_REGISTER)
 COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
@@ -1133,11 +861,6 @@
 COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
 COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
 COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
-/* Siemens Gigaset */
-COMPATIBLE_IOCTL(GIGASET_REDIR)
-COMPATIBLE_IOCTL(GIGASET_CONFIG)
-COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
-COMPATIBLE_IOCTL(GIGASET_VERSION)
 /* Misc. */
 COMPATIBLE_IOCTL(0x41545900)		/* ATYIO_CLKR */
 COMPATIBLE_IOCTL(0x41545901)		/* ATYIO_CLKW */
@@ -1162,82 +885,12 @@
 COMPATIBLE_IOCTL(HIDIOCSFLAG)
 COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
 COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* dvb */
-COMPATIBLE_IOCTL(AUDIO_STOP)
-COMPATIBLE_IOCTL(AUDIO_PLAY)
-COMPATIBLE_IOCTL(AUDIO_PAUSE)
-COMPATIBLE_IOCTL(AUDIO_CONTINUE)
-COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
-COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
-COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
-COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
-COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
-COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(AUDIO_SET_ID)
-COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
-COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(DMX_START)
-COMPATIBLE_IOCTL(DMX_STOP)
-COMPATIBLE_IOCTL(DMX_SET_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
-COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
-COMPATIBLE_IOCTL(DMX_GET_STC)
-COMPATIBLE_IOCTL(DMX_REQBUFS)
-COMPATIBLE_IOCTL(DMX_QUERYBUF)
-COMPATIBLE_IOCTL(DMX_EXPBUF)
-COMPATIBLE_IOCTL(DMX_QBUF)
-COMPATIBLE_IOCTL(DMX_DQBUF)
-COMPATIBLE_IOCTL(VIDEO_STOP)
-COMPATIBLE_IOCTL(VIDEO_PLAY)
-COMPATIBLE_IOCTL(VIDEO_FREEZE)
-COMPATIBLE_IOCTL(VIDEO_CONTINUE)
-COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
-COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
-COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
-COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
-COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
-/* cec */
-COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_G_MODE)
-COMPATIBLE_IOCTL(CEC_S_MODE)
-COMPATIBLE_IOCTL(CEC_TRANSMIT)
-COMPATIBLE_IOCTL(CEC_RECEIVE)
-COMPATIBLE_IOCTL(CEC_DQEVENT)
-
 /* joystick */
 COMPATIBLE_IOCTL(JSIOCGVERSION)
 COMPATIBLE_IOCTL(JSIOCGAXES)
 COMPATIBLE_IOCTL(JSIOCGBUTTONS)
 COMPATIBLE_IOCTL(JSIOCGNAME(0))
 
-#ifdef TIOCGLTC
-COMPATIBLE_IOCTL(TIOCGLTC)
-COMPATIBLE_IOCTL(TIOCSLTC)
-#endif
-#ifdef TIOCSTART
-/*
- * For these two we have definitions in ioctls.h and/or termios.h on
- * some architectures but no actual implemention.  Some applications
- * like bash call them if they are defined in the headers, so we provide
- * entries here to avoid syslog message spew.
- */
-COMPATIBLE_IOCTL(TIOCSTART)
-COMPATIBLE_IOCTL(TIOCSTOP)
-#endif
-
 /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
    but we don't want warnings on other file systems. So declare
    them as compatible here. */
@@ -1293,22 +946,12 @@
 	case MTIOCPOS32:
 		return mt_ioctl_trans(file, cmd, argp);
 #endif
-	/* Serial */
-	case TIOCGSERIAL:
-	case TIOCSSERIAL:
-		return serial_struct_ioctl(file, cmd, argp);
 	/* Not implemented in the native kernel */
 	case RTC_IRQP_READ32:
 	case RTC_IRQP_SET32:
 	case RTC_EPOCH_READ32:
 	case RTC_EPOCH_SET32:
 		return rtc_ioctl(file, cmd, argp);
-
-	/* dvb */
-	case VIDEO_GET_EVENT:
-		return do_video_get_event(file, cmd, argp);
-	case VIDEO_STILLPICTURE:
-		return do_video_stillpicture(file, cmd, argp);
 	}
 
 	/*
@@ -1316,24 +959,11 @@
 	 * so we must not do a compat_ptr() translation.
 	 */
 	switch (cmd) {
-	/* Big T */
-	case TCSBRKP:
-	case TIOCMIWAIT:
-	case TIOCSCTTY:
 	/* RAID */
 	case HOT_REMOVE_DISK:
 	case HOT_ADD_DISK:
 	case SET_DISK_FAULTY:
 	case SET_BITMAP_FILE:
-	/* Big K */
-	case KDSIGACCEPT:
-	case KIOCSOUND:
-	case KDMKTONE:
-	case KDSETMODE:
-	case KDSKBMODE:
-	case KDSKBMETA:
-	case KDSKBLED:
-	case KDSETLED:
 		return vfs_ioctl(file, cmd, arg);
 	}
 
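The serial/tty, keyboard (KD*), Bluetooth (CMTP/HIDP/BNEP), PPPoE, Gigaset, dvb and cec entries deleted above did not lose their compat support; by this kernel generation the handling lives in the owning subsystems rather than in this central translation table. A minimal sketch of the per-driver pattern that replaces the table, assuming the v5.4-era compat_ptr_ioctl() helper; the driver name and commands below are hypothetical, not taken from this patch:

#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/module.h>

/* Native handler: all FOO_* commands take a pointer-sized argument. */
static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;

	/* ... handle FOO_* commands against argp ... */
	(void)argp;
	return -ENOTTY;
}

static const struct file_operations foo_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= foo_ioctl,
	/* 32-bit callers: compat_ptr() the argument, reuse foo_ioctl() */
	.compat_ioctl	= compat_ptr_ioctl,
};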
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 9febcde..272b644 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem"
 	select SYSFS
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
index 00ffb27..0200498 100644
--- a/fs/configfs/Makefile
+++ b/fs/configfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the configfs virtual filesystem
 #
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index ccc31fa..22dce2d 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset:8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * configfs_internal.h - Internal stuff for configfs
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -34,12 +20,21 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 
+struct configfs_fragment {
+	atomic_t frag_count;
+	struct rw_semaphore frag_sem;
+	bool frag_dead;
+};
+
+void put_fragment(struct configfs_fragment *);
+struct configfs_fragment *get_fragment(struct configfs_fragment *);
+
 struct configfs_dirent {
 	atomic_t		s_count;
 	int			s_dependent_count;
 	struct list_head	s_sibling;
 	struct list_head	s_children;
-	struct list_head	s_links;
+	int			s_links;
 	void			* s_element;
 	int			s_type;
 	umode_t			s_mode;
@@ -48,6 +43,7 @@
 #ifdef CONFIG_LOCKDEP
 	int			s_depth;
 #endif
+	struct configfs_fragment *s_frag;
 };
 
 #define CONFIGFS_ROOT		0x0001
@@ -70,13 +66,13 @@
 extern int configfs_is_root(struct config_item *item);
 
 extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
-extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
+extern struct inode *configfs_create(struct dentry *, umode_t mode);
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_create_bin_file(struct config_item *,
 				    const struct configfs_bin_attribute *);
-extern int configfs_make_dirent(struct configfs_dirent *,
-				struct dentry *, void *, umode_t, int);
+extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
+				void *, umode_t, int, struct configfs_fragment *);
 extern int configfs_dirent_is_ready(struct configfs_dirent *);
 
 extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
@@ -88,7 +84,6 @@
 extern struct dentry *configfs_pin_fs(void);
 extern void configfs_release_fs(void);
 
-extern struct rw_semaphore configfs_rename_sem;
 extern const struct file_operations configfs_dir_operations;
 extern const struct file_operations configfs_file_operations;
 extern const struct file_operations configfs_bin_file_operations;
@@ -101,14 +96,8 @@
 			    const char *symname);
 extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
 
-struct configfs_symlink {
-	struct list_head sl_list;
-	struct config_item *sl_target;
-};
-
-extern int configfs_create_link(struct configfs_symlink *sl,
-				struct dentry *parent,
-				struct dentry *dentry);
+int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
+		struct dentry *dentry, char *body);
 
 static inline struct config_item * to_item(struct dentry * dentry)
 {
@@ -136,11 +125,7 @@
 	spin_lock(&dentry->d_lock);
 	if (!d_unhashed(dentry)) {
 		struct configfs_dirent * sd = dentry->d_fsdata;
-		if (sd->s_type & CONFIGFS_ITEM_LINK) {
-			struct configfs_symlink * sl = sd->s_element;
-			item = config_item_get(sl->sl_target);
-		} else
-			item = config_item_get(sd->s_element);
+		item = config_item_get(sd->s_element);
 	}
 	spin_unlock(&dentry->d_lock);
 
@@ -151,6 +136,7 @@
 {
 	if (!(sd->s_type & CONFIGFS_ROOT)) {
 		kfree(sd->s_iattr);
+		put_fragment(sd->s_frag);
 		kmem_cache_free(configfs_dir_cachep, sd);
 	}
 }
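The configfs_fragment introduced above is the heart of this series: one refcounted guard object is shared by every dirent created under the same mkdir/register call, and teardown flips frag_dead exactly once. A self-contained sketch of the idea under hypothetical names (guard, guarded_show); this is an illustration, not code from the patch:

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/rwsem.h>
#include <linux/slab.h>

struct guard {
	atomic_t		count;	/* one ref per dirent sharing it */
	struct rw_semaphore	sem;
	bool			dead;
};

static struct guard *guard_get(struct guard *g)
{
	atomic_inc(&g->count);
	return g;
}

static void guard_put(struct guard *g)
{
	if (g && atomic_dec_and_test(&g->count))
		kfree(g);
}

/*
 * Reader side: attribute callbacks run under the read lock and bail
 * out once the subtree has been killed, instead of touching freed items.
 */
static int guarded_show(struct guard *g, int (*show)(void *), void *obj)
{
	int ret = -ENOENT;

	down_read(&g->sem);
	if (!g->dead)
		ret = show(obj);
	up_read(&g->sem);
	return ret;
}

This is exactly the shape fill_read_buffer() and the bin-attribute paths take in the file.c hunks further down.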
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 39843fa..cf7b7e1 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * dir.c - Operations for configfs directories.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -27,6 +13,7 @@
 #undef DEBUG
 
 #include <linux/fs.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -35,7 +22,6 @@
 #include <linux/configfs.h>
 #include "configfs_internal.h"
 
-DECLARE_RWSEM(configfs_rename_sem);
 /*
  * Protects mutations of configfs_dirent linkage together with proper i_mutex
  * Also protects mutations of symlinks linkage to target configfs_dirent
@@ -58,15 +44,13 @@
 	if (sd) {
 		/* Coordinate with configfs_readdir */
 		spin_lock(&configfs_dirent_lock);
-		/* Coordinate with configfs_attach_attr where will increase
-		 * sd->s_count and update sd->s_dentry to new allocated one.
-		 * Only set sd->dentry to null when this dentry is the only
-		 * sd owner.
-		 * If not do so, configfs_d_iput may run just after
-		 * configfs_attach_attr and set sd->s_dentry to null
-		 * even it's still in use.
+		/*
+		 * Set sd->s_dentry to null only when this dentry is the one
+		 * that is going to be killed.  Otherwise configfs_d_iput may
+		 * run just after configfs_attach_attr and set sd->s_dentry to
+		 * NULL even if it's still in use.
 		 */
-		if (atomic_read(&sd->s_count) <= 2)
+		if (sd->s_dentry == dentry)
 			sd->s_dentry = NULL;
 
 		spin_unlock(&configfs_dirent_lock);
@@ -166,11 +150,38 @@
 
 #endif /* CONFIG_LOCKDEP */
 
+static struct configfs_fragment *new_fragment(void)
+{
+	struct configfs_fragment *p;
+
+	p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL);
+	if (p) {
+		atomic_set(&p->frag_count, 1);
+		init_rwsem(&p->frag_sem);
+		p->frag_dead = false;
+	}
+	return p;
+}
+
+void put_fragment(struct configfs_fragment *frag)
+{
+	if (frag && atomic_dec_and_test(&frag->frag_count))
+		kfree(frag);
+}
+
+struct configfs_fragment *get_fragment(struct configfs_fragment *frag)
+{
+	if (likely(frag))
+		atomic_inc(&frag->frag_count);
+	return frag;
+}
+
 /*
  * Allocates a new configfs_dirent and links it to the parent configfs_dirent
  */
 static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
-						   void *element, int type)
+						   void *element, int type,
+						   struct configfs_fragment *frag)
 {
 	struct configfs_dirent * sd;
 
@@ -179,7 +190,6 @@
 		return ERR_PTR(-ENOMEM);
 
 	atomic_set(&sd->s_count, 1);
-	INIT_LIST_HEAD(&sd->s_links);
 	INIT_LIST_HEAD(&sd->s_children);
 	sd->s_element = element;
 	sd->s_type = type;
@@ -190,6 +200,7 @@
 		kmem_cache_free(configfs_dir_cachep, sd);
 		return ERR_PTR(-ENOENT);
 	}
+	sd->s_frag = get_fragment(frag);
 	list_add(&sd->s_sibling, &parent_sd->s_children);
 	spin_unlock(&configfs_dirent_lock);
 
@@ -224,11 +235,11 @@
 
 int configfs_make_dirent(struct configfs_dirent * parent_sd,
 			 struct dentry * dentry, void * element,
-			 umode_t mode, int type)
+			 umode_t mode, int type, struct configfs_fragment *frag)
 {
 	struct configfs_dirent * sd;
 
-	sd = configfs_new_dirent(parent_sd, element, type);
+	sd = configfs_new_dirent(parent_sd, element, type, frag);
 	if (IS_ERR(sd))
 		return PTR_ERR(sd);
 
@@ -240,30 +251,16 @@
 	return 0;
 }
 
-static void init_dir(struct inode * inode)
+static void configfs_remove_dirent(struct dentry *dentry)
 {
-	inode->i_op = &configfs_dir_inode_operations;
-	inode->i_fop = &configfs_dir_operations;
+	struct configfs_dirent *sd = dentry->d_fsdata;
 
-	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inc_nlink(inode);
-}
-
-static void configfs_init_file(struct inode * inode)
-{
-	inode->i_size = PAGE_SIZE;
-	inode->i_fop = &configfs_file_operations;
-}
-
-static void configfs_init_bin_file(struct inode *inode)
-{
-	inode->i_size = 0;
-	inode->i_fop = &configfs_bin_file_operations;
-}
-
-static void init_symlink(struct inode * inode)
-{
-	inode->i_op = &configfs_symlink_inode_operations;
+	if (!sd)
+		return;
+	spin_lock(&configfs_dirent_lock);
+	list_del_init(&sd->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
+	configfs_put(sd);
 }
 
 /**
@@ -275,11 +272,13 @@
  *	until it is validated by configfs_dir_set_ready()
  */
 
-static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
+				struct configfs_fragment *frag)
 {
 	int error;
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct dentry *p = dentry->d_parent;
+	struct inode *inode;
 
 	BUG_ON(!item);
 
@@ -288,25 +287,30 @@
 		return error;
 
 	error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
-				     CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+				     CONFIGFS_DIR | CONFIGFS_USET_CREATING,
+				     frag);
 	if (unlikely(error))
 		return error;
 
 	configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata);
-	error = configfs_create(dentry, mode, init_dir);
-	if (!error) {
-		inc_nlink(d_inode(p));
-		item->ci_dentry = dentry;
-	} else {
-		struct configfs_dirent *sd = dentry->d_fsdata;
-		if (sd) {
-			spin_lock(&configfs_dirent_lock);
-			list_del_init(&sd->s_sibling);
-			spin_unlock(&configfs_dirent_lock);
-			configfs_put(sd);
-		}
-	}
-	return error;
+	inode = configfs_create(dentry, mode);
+	if (IS_ERR(inode))
+		goto out_remove;
+
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	/* already hashed */
+	dget(dentry);  /* pin directory dentries in core */
+	inc_nlink(d_inode(p));
+	item->ci_dentry = dentry;
+	return 0;
+
+out_remove:
+	configfs_remove_dirent(dentry);
+	return PTR_ERR(inode);
 }
 
 /*
@@ -347,40 +351,40 @@
 	return ret;
 }
 
-int configfs_create_link(struct configfs_symlink *sl,
-			 struct dentry *parent,
-			 struct dentry *dentry)
+int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
+		struct dentry *dentry, char *body)
 {
 	int err = 0;
 	umode_t mode = S_IFLNK | S_IRWXUGO;
+	struct configfs_dirent *p = parent->d_fsdata;
+	struct inode *inode;
 
-	err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
-				   CONFIGFS_ITEM_LINK);
-	if (!err) {
-		err = configfs_create(dentry, mode, init_symlink);
-		if (err) {
-			struct configfs_dirent *sd = dentry->d_fsdata;
-			if (sd) {
-				spin_lock(&configfs_dirent_lock);
-				list_del_init(&sd->s_sibling);
-				spin_unlock(&configfs_dirent_lock);
-				configfs_put(sd);
-			}
-		}
-	}
-	return err;
+	err = configfs_make_dirent(p, dentry, target, mode, CONFIGFS_ITEM_LINK,
+			p->s_frag);
+	if (err)
+		return err;
+
+	inode = configfs_create(dentry, mode);
+	if (IS_ERR(inode))
+		goto out_remove;
+
+	inode->i_link = body;
+	inode->i_op = &configfs_symlink_inode_operations;
+	d_instantiate(dentry, inode);
+	dget(dentry);  /* pin link dentries in core */
+	return 0;
+
+out_remove:
+	configfs_remove_dirent(dentry);
+	return PTR_ERR(inode);
 }
 
 static void remove_dir(struct dentry * d)
 {
 	struct dentry * parent = dget(d->d_parent);
-	struct configfs_dirent * sd;
 
-	sd = d->d_fsdata;
-	spin_lock(&configfs_dirent_lock);
-	list_del_init(&sd->s_sibling);
-	spin_unlock(&configfs_dirent_lock);
-	configfs_put(sd);
+	configfs_remove_dirent(d);
+
 	if (d_really_is_positive(d))
 		simple_rmdir(d_inode(parent),d);
 
@@ -421,20 +425,27 @@
 static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
 {
 	struct configfs_attribute * attr = sd->s_element;
-	int error;
+	struct inode *inode;
 
 	spin_lock(&configfs_dirent_lock);
 	dentry->d_fsdata = configfs_get(sd);
 	sd->s_dentry = dentry;
 	spin_unlock(&configfs_dirent_lock);
 
-	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
-				(sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
-					configfs_init_bin_file :
-					configfs_init_file);
-	if (error)
+	inode = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG);
+	if (IS_ERR(inode)) {
 		configfs_put(sd);
-	return error;
+		return PTR_ERR(inode);
+	}
+	if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) {
+		inode->i_size = 0;
+		inode->i_fop = &configfs_bin_file_operations;
+	} else {
+		inode->i_size = PAGE_SIZE;
+		inode->i_fop = &configfs_file_operations;
+	}
+	d_add(dentry, inode);
+	return 0;
 }
 
 static struct dentry * configfs_lookup(struct inode *dir,
@@ -504,7 +515,7 @@
 	parent_sd->s_type |= CONFIGFS_USET_DROPPING;
 
 	ret = -EBUSY;
-	if (!list_empty(&parent_sd->s_links))
+	if (parent_sd->s_links)
 		goto out;
 
 	ret = 0;
@@ -614,7 +625,8 @@
 
 static int configfs_attach_group(struct config_item *parent_item,
 				 struct config_item *item,
-				 struct dentry *dentry);
+				 struct dentry *dentry,
+				 struct configfs_fragment *frag);
 static void configfs_detach_group(struct config_item *item);
 
 static void detach_groups(struct config_group *group)
@@ -662,7 +674,8 @@
  * try using vfs_mkdir.  Just a thought.
  */
 static int create_default_group(struct config_group *parent_group,
-				struct config_group *group)
+				struct config_group *group,
+				struct configfs_fragment *frag)
 {
 	int ret;
 	struct configfs_dirent *sd;
@@ -678,7 +691,7 @@
 		d_add(child, NULL);
 
 		ret = configfs_attach_group(&parent_group->cg_item,
-					    &group->cg_item, child);
+					    &group->cg_item, child, frag);
 		if (!ret) {
 			sd = child->d_fsdata;
 			sd->s_type |= CONFIGFS_USET_DEFAULT;
@@ -692,13 +705,14 @@
 	return ret;
 }
 
-static int populate_groups(struct config_group *group)
+static int populate_groups(struct config_group *group,
+			   struct configfs_fragment *frag)
 {
 	struct config_group *new_group;
 	int ret = 0;
 
 	list_for_each_entry(new_group, &group->default_groups, group_entry) {
-		ret = create_default_group(group, new_group);
+		ret = create_default_group(group, new_group, frag);
 		if (ret) {
 			detach_groups(group);
 			break;
@@ -812,11 +826,12 @@
  */
 static int configfs_attach_item(struct config_item *parent_item,
 				struct config_item *item,
-				struct dentry *dentry)
+				struct dentry *dentry,
+				struct configfs_fragment *frag)
 {
 	int ret;
 
-	ret = configfs_create_dir(item, dentry);
+	ret = configfs_create_dir(item, dentry, frag);
 	if (!ret) {
 		ret = populate_attrs(item);
 		if (ret) {
@@ -846,12 +861,13 @@
 
 static int configfs_attach_group(struct config_item *parent_item,
 				 struct config_item *item,
-				 struct dentry *dentry)
+				 struct dentry *dentry,
+				 struct configfs_fragment *frag)
 {
 	int ret;
 	struct configfs_dirent *sd;
 
-	ret = configfs_attach_item(parent_item, item, dentry);
+	ret = configfs_attach_item(parent_item, item, dentry, frag);
 	if (!ret) {
 		sd = dentry->d_fsdata;
 		sd->s_type |= CONFIGFS_USET_DIR;
@@ -867,7 +883,7 @@
 		 */
 		inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
 		configfs_adjust_dir_dirent_depth_before_populate(sd);
-		ret = populate_groups(to_config_group(item));
+		ret = populate_groups(to_config_group(item), frag);
 		if (ret) {
 			configfs_detach_item(item);
 			d_inode(dentry)->i_flags |= S_DEAD;
@@ -1262,6 +1278,7 @@
 	struct configfs_dirent *sd;
 	const struct config_item_type *type;
 	struct module *subsys_owner = NULL, *new_item_owner = NULL;
+	struct configfs_fragment *frag;
 	char *name;
 
 	sd = dentry->d_parent->d_fsdata;
@@ -1280,6 +1297,12 @@
 		goto out;
 	}
 
+	frag = new_fragment();
+	if (!frag) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	/* Get a working ref for the duration of this function */
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
@@ -1382,9 +1405,9 @@
 	spin_unlock(&configfs_dirent_lock);
 
 	if (group)
-		ret = configfs_attach_group(parent_item, item, dentry);
+		ret = configfs_attach_group(parent_item, item, dentry, frag);
 	else
-		ret = configfs_attach_item(parent_item, item, dentry);
+		ret = configfs_attach_item(parent_item, item, dentry, frag);
 
 	spin_lock(&configfs_dirent_lock);
 	sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
@@ -1421,6 +1444,7 @@
 	 * reference.
 	 */
 	config_item_put(parent_item);
+	put_fragment(frag);
 
 out:
 	return ret;
@@ -1432,6 +1456,7 @@
 	struct config_item *item;
 	struct configfs_subsystem *subsys;
 	struct configfs_dirent *sd;
+	struct configfs_fragment *frag;
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
@@ -1489,6 +1514,16 @@
 		}
 	} while (ret == -EAGAIN);
 
+	frag = sd->s_frag;
+	if (down_write_killable(&frag->frag_sem)) {
+		spin_lock(&configfs_dirent_lock);
+		configfs_detach_rollback(dentry);
+		spin_unlock(&configfs_dirent_lock);
+		return -EINTR;
+	}
+	frag->frag_dead = true;
+	up_write(&frag->frag_sem);
+
 	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
 
@@ -1538,44 +1573,6 @@
 	.setattr	= configfs_setattr,
 };
 
-#if 0
-int configfs_rename_dir(struct config_item * item, const char *new_name)
-{
-	int error = 0;
-	struct dentry * new_dentry, * parent;
-
-	if (!strcmp(config_item_name(item), new_name))
-		return -EINVAL;
-
-	if (!item->parent)
-		return -EINVAL;
-
-	down_write(&configfs_rename_sem);
-	parent = item->parent->dentry;
-
-	inode_lock(d_inode(parent));
-
-	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
-	if (!IS_ERR(new_dentry)) {
-		if (d_really_is_negative(new_dentry)) {
-			error = config_item_set_name(item, "%s", new_name);
-			if (!error) {
-				d_add(new_dentry, NULL);
-				d_move(item->dentry, new_dentry);
-			}
-			else
-				d_delete(new_dentry);
-		} else
-			error = -EEXIST;
-		dput(new_dentry);
-	}
-	inode_unlock(d_inode(parent));
-	up_write(&configfs_rename_sem);
-
-	return error;
-}
-#endif
-
 static int configfs_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry * dentry = file->f_path.dentry;
@@ -1589,7 +1586,7 @@
 	 */
 	err = -ENOENT;
 	if (configfs_dirent_is_ready(parent_sd)) {
-		file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
+		file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
 		if (IS_ERR(file->private_data))
 			err = PTR_ERR(file->private_data);
 		else
@@ -1690,9 +1687,11 @@
 	switch (whence) {
 		case 1:
 			offset += file->f_pos;
+			/* fall through */
 		case 0:
 			if (offset >= 0)
 				break;
+			/* fall through */
 		default:
 			return -EINVAL;
 	}
@@ -1745,8 +1744,13 @@
 {
 	struct configfs_subsystem *subsys = parent_group->cg_subsys;
 	struct dentry *parent;
+	struct configfs_fragment *frag;
 	int ret;
 
+	frag = new_fragment();
+	if (!frag)
+		return -ENOMEM;
+
 	mutex_lock(&subsys->su_mutex);
 	link_group(parent_group, group);
 	mutex_unlock(&subsys->su_mutex);
@@ -1754,13 +1758,22 @@
 	parent = parent_group->cg_item.ci_dentry;
 
 	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
-	ret = create_default_group(parent_group, group);
-	if (!ret) {
-		spin_lock(&configfs_dirent_lock);
-		configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
-		spin_unlock(&configfs_dirent_lock);
-	}
+	ret = create_default_group(parent_group, group, frag);
+	if (ret)
+		goto err_out;
+
+	spin_lock(&configfs_dirent_lock);
+	configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+	spin_unlock(&configfs_dirent_lock);
 	inode_unlock(d_inode(parent));
+	put_fragment(frag);
+	return 0;
+err_out:
+	inode_unlock(d_inode(parent));
+	mutex_lock(&subsys->su_mutex);
+	unlink_group(group);
+	mutex_unlock(&subsys->su_mutex);
+	put_fragment(frag);
 	return ret;
 }
 EXPORT_SYMBOL(configfs_register_group);
@@ -1776,16 +1789,12 @@
 	struct configfs_subsystem *subsys = group->cg_subsys;
 	struct dentry *dentry = group->cg_item.ci_dentry;
 	struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_fragment *frag = sd->s_frag;
 
-	mutex_lock(&subsys->su_mutex);
-	if (!group->cg_item.ci_parent->ci_group) {
-		/*
-		 * The parent has already been unlinked and detached
-		 * due to a rmdir.
-		 */
-		goto unlink_group;
-	}
-	mutex_unlock(&subsys->su_mutex);
+	down_write(&frag->frag_sem);
+	frag->frag_dead = true;
+	up_write(&frag->frag_sem);
 
 	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
 	spin_lock(&configfs_dirent_lock);
@@ -1795,13 +1804,13 @@
 	configfs_detach_group(&group->cg_item);
 	d_inode(dentry)->i_flags |= S_DEAD;
 	dont_mount(dentry);
+	fsnotify_rmdir(d_inode(parent), dentry);
 	d_delete(dentry);
 	inode_unlock(d_inode(parent));
 
 	dput(dentry);
 
 	mutex_lock(&subsys->su_mutex);
-unlink_group:
 	unlink_group(group);
 	mutex_unlock(&subsys->su_mutex);
 }
@@ -1858,10 +1867,17 @@
 	struct dentry *dentry;
 	struct dentry *root;
 	struct configfs_dirent *sd;
+	struct configfs_fragment *frag;
+
+	frag = new_fragment();
+	if (!frag)
+		return -ENOMEM;
 
 	root = configfs_pin_fs();
-	if (IS_ERR(root))
+	if (IS_ERR(root)) {
+		put_fragment(frag);
 		return PTR_ERR(root);
+	}
 
 	if (!group->cg_item.ci_name)
 		group->cg_item.ci_name = group->cg_item.ci_namebuf;
@@ -1877,7 +1893,7 @@
 		d_add(dentry, NULL);
 
 		err = configfs_attach_group(sd->s_element, &group->cg_item,
-					    dentry);
+					    dentry, frag);
 		if (err) {
 			BUG_ON(d_inode(dentry));
 			d_drop(dentry);
@@ -1895,6 +1911,7 @@
 		unlink_group(group);
 		configfs_release_fs();
 	}
+	put_fragment(frag);
 
 	return err;
 }
@@ -1904,12 +1921,18 @@
 	struct config_group *group = &subsys->su_group;
 	struct dentry *dentry = group->cg_item.ci_dentry;
 	struct dentry *root = dentry->d_sb->s_root;
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_fragment *frag = sd->s_frag;
 
 	if (dentry->d_parent != root) {
 		pr_err("Tried to unregister non-subsystem!\n");
 		return;
 	}
 
+	down_write(&frag->frag_sem);
+	frag->frag_dead = true;
+	up_write(&frag->frag_sem);
+
 	inode_lock_nested(d_inode(root),
 			  I_MUTEX_PARENT);
 	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
@@ -1923,6 +1946,7 @@
 	configfs_detach_group(&group->cg_item);
 	d_inode(dentry)->i_flags |= S_DEAD;
 	dont_mount(dentry);
+	fsnotify_rmdir(d_inode(root), dentry);
 	inode_unlock(d_inode(dentry));
 
 	d_delete(dentry);
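On the teardown side, note the ordering that configfs_rmdir() and configfs_unregister_subsystem() now follow: the fragment is poisoned under the write side of frag_sem before anything is detached, so no attribute callback can still be running when items go away. A sketch of the killable variant used by rmdir, again under the hypothetical guard type from the note above:

/* Assumes the guard type sketched after configfs_internal.h above. */
static int kill_subtree(struct guard *g)
{
	/*
	 * Waiting out in-flight callbacks may block for a long time, so
	 * let a fatal signal abort; the caller must then roll back its
	 * speculative detach (cf. configfs_detach_rollback() above).
	 */
	if (down_write_killable(&g->sem))
		return -EINTR;
	g->dead = true;		/* every later down_read() sees this */
	up_write(&g->sem);
	return 0;
}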
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 62580db..fb65b70 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * file.c - operations for regular (text) files.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -53,40 +39,44 @@
 	bool			write_in_progress;
 	char			*bin_buffer;
 	int			bin_buffer_size;
+	int			cb_max_size;
+	struct config_item	*item;
+	struct module		*owner;
+	union {
+		struct configfs_attribute	*attr;
+		struct configfs_bin_attribute	*bin_attr;
+	};
 };
 
-
-/**
- *	fill_read_buffer - allocate and fill buffer from item.
- *	@dentry:	dentry pointer.
- *	@buffer:	data buffer for file.
- *
- *	Allocate @buffer->page, if it hasn't been already, then call the
- *	config_item's show() method to fill the buffer with this attribute's
- *	data.
- *	This is called only once, on the file's first read.
- */
-static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+static inline struct configfs_fragment *to_frag(struct file *file)
 {
-	struct configfs_attribute * attr = to_attr(dentry);
-	struct config_item * item = to_item(dentry->d_parent);
-	int ret = 0;
-	ssize_t count;
+	struct configfs_dirent *sd = file->f_path.dentry->d_fsdata;
+
+	return sd->s_frag;
+}
+
+static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
+{
+	struct configfs_fragment *frag = to_frag(file);
+	ssize_t count = -ENOENT;
 
 	if (!buffer->page)
 		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!buffer->page)
 		return -ENOMEM;
 
-	count = attr->show(item, buffer->page);
+	down_read(&frag->frag_sem);
+	if (!frag->frag_dead)
+		count = buffer->attr->show(buffer->item, buffer->page);
+	up_read(&frag->frag_sem);
 
-	BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
-	if (count >= 0) {
-		buffer->needs_read_fill = 0;
-		buffer->count = count;
-	} else
-		ret = count;
-	return ret;
+	if (count < 0)
+		return count;
+	if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE))
+		return -EIO;
+	buffer->needs_read_fill = 0;
+	buffer->count = count;
+	return 0;
 }
 
 /**
@@ -111,12 +101,13 @@
 static ssize_t
 configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
-	struct configfs_buffer * buffer = file->private_data;
+	struct configfs_buffer *buffer = file->private_data;
 	ssize_t retval = 0;
 
 	mutex_lock(&buffer->mutex);
 	if (buffer->needs_read_fill) {
-		if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
+		retval = fill_read_buffer(file, buffer);
+		if (retval)
 			goto out;
 	}
 	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
@@ -152,10 +143,8 @@
 configfs_read_bin_file(struct file *file, char __user *buf,
 		       size_t count, loff_t *ppos)
 {
+	struct configfs_fragment *frag = to_frag(file);
 	struct configfs_buffer *buffer = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	struct config_item *item = to_item(dentry->d_parent);
-	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
 	ssize_t retval = 0;
 	ssize_t len = min_t(size_t, count, PAGE_SIZE);
 
@@ -170,14 +159,19 @@
 
 	if (buffer->needs_read_fill) {
 		/* perform first read with buf == NULL to get extent */
-		len = bin_attr->read(item, NULL, 0);
+		down_read(&frag->frag_sem);
+		if (!frag->frag_dead)
+			len = buffer->bin_attr->read(buffer->item, NULL, 0);
+		else
+			len = -ENOENT;
+		up_read(&frag->frag_sem);
 		if (len <= 0) {
 			retval = len;
 			goto out;
 		}
 
 		/* do not exceed the maximum value */
-		if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+		if (buffer->cb_max_size && len > buffer->cb_max_size) {
 			retval = -EFBIG;
 			goto out;
 		}
@@ -190,7 +184,13 @@
 		buffer->bin_buffer_size = len;
 
 		/* perform second read to fill buffer */
-		len = bin_attr->read(item, buffer->bin_buffer, len);
+		down_read(&frag->frag_sem);
+		if (!frag->frag_dead)
+			len = buffer->bin_attr->read(buffer->item,
+						     buffer->bin_buffer, len);
+		else
+			len = -ENOENT;
+		up_read(&frag->frag_sem);
 		if (len < 0) {
 			retval = len;
 			vfree(buffer->bin_buffer);
@@ -240,25 +240,17 @@
 	return error ? -EFAULT : count;
 }
 
-
-/**
- *	flush_write_buffer - push buffer to config_item.
- *	@dentry:	dentry to the attribute
- *	@buffer:	data buffer for file.
- *	@count:		number of bytes
- *
- *	Get the correct pointers for the config_item and the attribute we're
- *	dealing with, then call the store() method for the attribute,
- *	passing the buffer that we acquired in fill_write_buffer().
- */
-
 static int
-flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count)
 {
-	struct configfs_attribute * attr = to_attr(dentry);
-	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_fragment *frag = to_frag(file);
+	int res = -ENOENT;
 
-	return attr->store(item, buffer->page, count);
+	down_read(&frag->frag_sem);
+	if (!frag->frag_dead)
+		res = buffer->attr->store(buffer->item, buffer->page, count);
+	up_read(&frag->frag_sem);
+	return res;
 }
 
 
@@ -282,13 +274,13 @@
 static ssize_t
 configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
-	struct configfs_buffer * buffer = file->private_data;
+	struct configfs_buffer *buffer = file->private_data;
 	ssize_t len;
 
 	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, len);
+		len = flush_write_buffer(file, buffer, len);
 	if (len > 0)
 		*ppos += len;
 	mutex_unlock(&buffer->mutex);
@@ -313,8 +305,6 @@
 			size_t count, loff_t *ppos)
 {
 	struct configfs_buffer *buffer = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
 	void *tbuf = NULL;
 	ssize_t len;
 
@@ -330,8 +320,8 @@
 	/* buffer grows? */
 	if (*ppos + count > buffer->bin_buffer_size) {
 
-		if (bin_attr->cb_max_size &&
-			*ppos + count > bin_attr->cb_max_size) {
+		if (buffer->cb_max_size &&
+			*ppos + count > buffer->cb_max_size) {
 			len = -EFBIG;
 			goto out;
 		}
@@ -363,31 +353,51 @@
 	return len;
 }
 
-static int check_perm(struct inode * inode, struct file * file, int type)
+static int __configfs_open_file(struct inode *inode, struct file *file, int type)
 {
-	struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
-	struct configfs_attribute * attr = to_attr(file->f_path.dentry);
-	struct configfs_bin_attribute *bin_attr = NULL;
-	struct configfs_buffer * buffer;
-	struct configfs_item_operations * ops = NULL;
-	int error = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct configfs_fragment *frag = to_frag(file);
+	struct configfs_attribute *attr;
+	struct configfs_buffer *buffer;
+	int error;
 
-	if (!item || !attr)
-		goto Einval;
+	error = -ENOMEM;
+	buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL);
+	if (!buffer)
+		goto out;
 
-	if (type & CONFIGFS_ITEM_BIN_ATTR)
-		bin_attr = to_bin_attr(file->f_path.dentry);
+	error = -ENOENT;
+	down_read(&frag->frag_sem);
+	if (unlikely(frag->frag_dead))
+		goto out_free_buffer;
 
-	/* Grab the module reference for this attribute if we have one */
-	if (!try_module_get(attr->ca_owner)) {
-		error = -ENODEV;
-		goto Done;
+	error = -EINVAL;
+	buffer->item = to_item(dentry->d_parent);
+	if (!buffer->item)
+		goto out_free_buffer;
+
+	attr = to_attr(dentry);
+	if (!attr)
+		goto out_put_item;
+
+	if (type & CONFIGFS_ITEM_BIN_ATTR) {
+		buffer->bin_attr = to_bin_attr(dentry);
+		buffer->cb_max_size = buffer->bin_attr->cb_max_size;
+	} else {
+		buffer->attr = attr;
 	}
 
-	if (item->ci_type)
-		ops = item->ci_type->ct_item_ops;
-	else
-		goto Eaccess;
+	buffer->owner = attr->ca_owner;
+	/* Grab the module reference for this attribute if we have one */
+	error = -ENODEV;
+	if (!try_module_get(buffer->owner))
+		goto out_put_item;
+
+	error = -EACCES;
+	if (!buffer->item->ci_type)
+		goto out_put_module;
+
+	buffer->ops = buffer->item->ci_type->ct_item_ops;
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok,
@@ -395,13 +405,11 @@
 	 */
 	if (file->f_mode & FMODE_WRITE) {
 		if (!(inode->i_mode & S_IWUGO))
-			goto Eaccess;
-
+			goto out_put_module;
 		if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
-			goto Eaccess;
-
-		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
-			goto Eaccess;
+			goto out_put_module;
+		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write)
+			goto out_put_module;
 	}
 
 	/* File needs read support.
@@ -410,92 +418,72 @@
 	 */
 	if (file->f_mode & FMODE_READ) {
 		if (!(inode->i_mode & S_IRUGO))
-			goto Eaccess;
-
+			goto out_put_module;
 		if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
-			goto Eaccess;
-
-		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
-			goto Eaccess;
+			goto out_put_module;
+		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read)
+			goto out_put_module;
 	}
 
-	/* No error? Great, allocate a buffer for the file, and store it
-	 * it in file->private_data for easy access.
-	 */
-	buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
-	if (!buffer) {
-		error = -ENOMEM;
-		goto Enomem;
-	}
 	mutex_init(&buffer->mutex);
 	buffer->needs_read_fill = 1;
 	buffer->read_in_progress = false;
 	buffer->write_in_progress = false;
-	buffer->ops = ops;
 	file->private_data = buffer;
-	goto Done;
+	up_read(&frag->frag_sem);
+	return 0;
 
- Einval:
-	error = -EINVAL;
-	goto Done;
- Eaccess:
-	error = -EACCES;
- Enomem:
-	module_put(attr->ca_owner);
- Done:
-	if (error && item)
-		config_item_put(item);
+out_put_module:
+	module_put(buffer->owner);
+out_put_item:
+	config_item_put(buffer->item);
+out_free_buffer:
+	up_read(&frag->frag_sem);
+	kfree(buffer);
+out:
 	return error;
 }
 
 static int configfs_release(struct inode *inode, struct file *filp)
 {
-	struct config_item * item = to_item(filp->f_path.dentry->d_parent);
-	struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
-	struct module * owner = attr->ca_owner;
-	struct configfs_buffer * buffer = filp->private_data;
+	struct configfs_buffer *buffer = filp->private_data;
 
-	if (item)
-		config_item_put(item);
-	/* After this point, attr should not be accessed. */
-	module_put(owner);
-
-	if (buffer) {
-		if (buffer->page)
-			free_page((unsigned long)buffer->page);
-		mutex_destroy(&buffer->mutex);
-		kfree(buffer);
-	}
+	module_put(buffer->owner);
+	if (buffer->page)
+		free_page((unsigned long)buffer->page);
+	mutex_destroy(&buffer->mutex);
+	kfree(buffer);
 	return 0;
 }
 
 static int configfs_open_file(struct inode *inode, struct file *filp)
 {
-	return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+	return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR);
 }
 
 static int configfs_open_bin_file(struct inode *inode, struct file *filp)
 {
-	return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+	return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
 }
 
-static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+static int configfs_release_bin_file(struct inode *inode, struct file *file)
 {
-	struct configfs_buffer *buffer = filp->private_data;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct config_item *item = to_item(dentry->d_parent);
-	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
-	ssize_t len = 0;
-	int ret;
+	struct configfs_buffer *buffer = file->private_data;
 
 	buffer->read_in_progress = false;
 
 	if (buffer->write_in_progress) {
+		struct configfs_fragment *frag = to_frag(file);
 		buffer->write_in_progress = false;
 
-		len = bin_attr->write(item, buffer->bin_buffer,
-				buffer->bin_buffer_size);
-
+		down_read(&frag->frag_sem);
+		if (!frag->frag_dead) {
+			/* result of ->release() is ignored */
+			buffer->bin_attr->write(buffer->item,
+					buffer->bin_buffer,
+					buffer->bin_buffer_size);
+		}
+		up_read(&frag->frag_sem);
 		/* vfree on NULL is safe */
 		vfree(buffer->bin_buffer);
 		buffer->bin_buffer = NULL;
@@ -503,10 +491,8 @@
 		buffer->needs_read_fill = 1;
 	}
 
-	ret = configfs_release(inode, filp);
-	if (len < 0)
-		return len;
-	return ret;
+	configfs_release(inode, file);
+	return 0;
 }
 
 
@@ -541,7 +527,7 @@
 
 	inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
 	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
-				     CONFIGFS_ITEM_ATTR);
+				     CONFIGFS_ITEM_ATTR, parent_sd->s_frag);
 	inode_unlock(d_inode(dir));
 
 	return error;
@@ -563,7 +549,7 @@
 
 	inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
 	error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
-				     CONFIGFS_ITEM_BIN_ATTR);
+				     CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag);
 	inode_unlock(dir->d_inode);
 
 	return error;
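A second theme in the file.c hunks above: the per-open configfs_buffer now caches the item, attribute and module owner at open time, so the read/write/release paths never re-derive them from a dentry whose item may already be gone. The module-pinning half of that, sketched with hypothetical foo names (THIS_MODULE stands in for the attribute's ca_owner):

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo_buffer {
	struct module *owner;	/* pinned for the lifetime of the open */
	/* ... cached item/attribute pointers would live here ... */
};

static int foo_open(struct inode *inode, struct file *file)
{
	struct foo_buffer *b = kzalloc(sizeof(*b), GFP_KERNEL);

	if (!b)
		return -ENOMEM;
	b->owner = THIS_MODULE;		/* stand-in for attr->ca_owner */
	if (!try_module_get(b->owner)) {
		kfree(b);
		return -ENODEV;
	}
	file->private_data = b;
	return 0;
}

static int foo_release(struct inode *inode, struct file *file)
{
	struct foo_buffer *b = file->private_data;

	module_put(b->owner);	/* pairs with try_module_get() at open */
	kfree(b);
	return 0;
}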
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 28ef9e5..680aba9 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * inode.c - basic inode and dentry operations.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -90,14 +76,14 @@
 	if (ia_valid & ATTR_GID)
 		sd_iattr->ia_gid = iattr->ia_gid;
 	if (ia_valid & ATTR_ATIME)
-		sd_iattr->ia_atime = timespec64_trunc(iattr->ia_atime,
-						      inode->i_sb->s_time_gran);
+		sd_iattr->ia_atime = timestamp_truncate(iattr->ia_atime,
+						      inode);
 	if (ia_valid & ATTR_MTIME)
-		sd_iattr->ia_mtime = timespec64_trunc(iattr->ia_mtime,
-						      inode->i_sb->s_time_gran);
+		sd_iattr->ia_mtime = timestamp_truncate(iattr->ia_mtime,
+						      inode);
 	if (ia_valid & ATTR_CTIME)
-		sd_iattr->ia_ctime = timespec64_trunc(iattr->ia_ctime,
-						      inode->i_sb->s_time_gran);
+		sd_iattr->ia_ctime = timestamp_truncate(iattr->ia_ctime,
+						      inode);
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = iattr->ia_mode;
 
@@ -178,41 +164,27 @@
 
 #endif /* CONFIG_LOCKDEP */
 
-int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *))
+struct inode *configfs_create(struct dentry *dentry, umode_t mode)
 {
-	int error = 0;
 	struct inode *inode = NULL;
 	struct configfs_dirent *sd;
 	struct inode *p_inode;
 
 	if (!dentry)
-		return -ENOENT;
+		return ERR_PTR(-ENOENT);
 
 	if (d_really_is_positive(dentry))
-		return -EEXIST;
+		return ERR_PTR(-EEXIST);
 
 	sd = dentry->d_fsdata;
 	inode = configfs_new_inode(mode, sd, dentry->d_sb);
 	if (!inode)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	p_inode = d_inode(dentry->d_parent);
 	p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode);
 	configfs_set_inode_lock_class(sd, inode);
-
-	init(inode);
-	if (S_ISDIR(mode) || S_ISLNK(mode)) {
-		/*
-		 * ->symlink(), ->mkdir(), configfs_register_subsystem() or
-		 * create_default_group() - already hashed.
-		 */
-		d_instantiate(dentry, inode);
-		dget(dentry);  /* pin link and directory dentries in core */
-	} else {
-		/* ->lookup() */
-		d_add(dentry, inode);
-	}
-	return error;
+	return inode;
 }
 
 /*
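configfs_create() above switches from an init-callback shape to the standard ERR_PTR factory convention: the error travels inside the returned pointer, and callers pick the inode operations themselves before hashing the dentry. The idiom, reduced to a hypothetical allocator:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/types.h>

struct foo { umode_t mode; };

static struct foo *foo_alloc(umode_t mode)
{
	struct foo *p;

	if (!S_ISREG(mode) && !S_ISDIR(mode))
		return ERR_PTR(-EINVAL);	/* errno encoded in the pointer */
	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);
	p->mode = mode;
	return p;
}

static int foo_user(umode_t mode)
{
	struct foo *p = foo_alloc(mode);

	if (IS_ERR(p))
		return PTR_ERR(p);	/* errno decoded back out */
	kfree(p);
	return 0;
}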
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 99d491c..6e0f1fc 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * item.c - library routines for handling generic config items
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on kobject:
  *	kobject is Copyright (c) 2002-2003 Patrick Mochel
  *
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index cfd9132..0c6e8cf 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * mount.c - operations for initializing and mounting configfs.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -27,6 +13,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -41,9 +28,18 @@
 struct kmem_cache *configfs_dir_cachep;
 static int configfs_mnt_count = 0;
 
+
+static void configfs_free_inode(struct inode *inode)
+{
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_link);
+	free_inode_nonrcu(inode);
+}
+
 static const struct super_operations configfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.free_inode	= configfs_free_inode,
 };
 
 static struct config_group configfs_root_group = {
@@ -66,7 +62,7 @@
 	.s_iattr	= NULL,
 };
 
-static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+static int configfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode;
 	struct dentry *root;
@@ -102,16 +98,25 @@
 	return 0;
 }
 
-static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int configfs_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, data, configfs_fill_super);
+	return get_tree_single(fc, configfs_fill_super);
+}
+
+static const struct fs_context_operations configfs_context_ops = {
+	.get_tree	= configfs_get_tree,
+};
+
+static int configfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &configfs_context_ops;
+	return 0;
 }
 
 static struct file_system_type configfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "configfs",
-	.mount		= configfs_do_mount,
+	.init_fs_context = configfs_init_fs_context,
 	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("configfs");
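The mount.c hunks above are the stock conversion from the legacy .mount entry point to the fs_context API. The same conversion for a hypothetical single-instance filesystem, using only the v5.4 helpers that appear in the patch:

#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/module.h>

static int foofs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* allocate the root inode, set sb->s_op, d_make_root(), ... */
	return 0;
}

static int foofs_get_tree(struct fs_context *fc)
{
	/* one superblock shared by all mounts of this type */
	return get_tree_single(fc, foofs_fill_super);
}

static const struct fs_context_operations foofs_context_ops = {
	.get_tree	= foofs_get_tree,
};

static int foofs_init_fs_context(struct fs_context *fc)
{
	fc->ops = &foofs_context_ops;
	return 0;
}

static struct file_system_type foofs_type = {
	.owner			= THIS_MODULE,
	.name			= "foofs",
	.init_fs_context	= foofs_init_fs_context,
	.kill_sb		= kill_litter_super,
};

get_tree_single() reuses one superblock across mounts, matching the behaviour the old mount_single() call provided.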
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index a5c54af..cb61467 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * symlink.c - operations for configfs symlinks.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Based on sysfs:
  * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
  *
@@ -69,41 +55,63 @@
 	}
 }
 
+static int configfs_get_target_path(struct config_item *item,
+		struct config_item *target, char *path)
+{
+	int depth, size;
+	char *s;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __func__, path);
+	return 0;
+}
+
 static int create_link(struct config_item *parent_item,
 		       struct config_item *item,
 		       struct dentry *dentry)
 {
 	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
-	struct configfs_symlink *sl;
+	char *body;
 	int ret;
 
-	ret = -ENOENT;
 	if (!configfs_dirent_is_ready(target_sd))
-		goto out;
-	ret = -ENOMEM;
-	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
-	if (sl) {
-		spin_lock(&configfs_dirent_lock);
-		if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
-			spin_unlock(&configfs_dirent_lock);
-			kfree(sl);
-			return -ENOENT;
-		}
-		sl->sl_target = config_item_get(item);
-		list_add(&sl->sl_list, &target_sd->s_links);
-		spin_unlock(&configfs_dirent_lock);
-		ret = configfs_create_link(sl, parent_item->ci_dentry,
-					   dentry);
-		if (ret) {
-			spin_lock(&configfs_dirent_lock);
-			list_del_init(&sl->sl_list);
-			spin_unlock(&configfs_dirent_lock);
-			config_item_put(item);
-			kfree(sl);
-		}
-	}
+		return -ENOENT;
 
-out:
+	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!body)
+		return -ENOMEM;
+
+	configfs_get(target_sd);
+	spin_lock(&configfs_dirent_lock);
+	if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+		spin_unlock(&configfs_dirent_lock);
+		configfs_put(target_sd);
+		kfree(body);
+		return -ENOENT;
+	}
+	target_sd->s_links++;
+	spin_unlock(&configfs_dirent_lock);
+	ret = configfs_get_target_path(parent_item, item, body);
+	if (!ret)
+		ret = configfs_create_link(target_sd, parent_item->ci_dentry,
+					   dentry, body);
+	if (ret) {
+		spin_lock(&configfs_dirent_lock);
+		target_sd->s_links--;
+		spin_unlock(&configfs_dirent_lock);
+		configfs_put(target_sd);
+		kfree(body);
+	}
 	return ret;
 }
 
@@ -145,9 +153,8 @@
 	 * Fake invisibility if dir belongs to a group/default groups hierarchy
 	 * being attached
 	 */
-	ret = -ENOENT;
 	if (!configfs_dirent_is_ready(sd))
-		goto out;
+		return -ENOENT;
 
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
@@ -157,11 +164,42 @@
 	    !type->ct_item_ops->allow_link)
 		goto out_put;
 
+	/*
+	 * This is really sick.  What they wanted was a hybrid of
+	 * link(2) and symlink(2) - they wanted the target resolved
+	 * at syscall time (as link(2) would've done), be a directory
+	 * (which link(2) would've refused to do) *AND* be a deep
+	 * fucking magic, making the target busy from rmdir POV.
+	 * symlink(2) is nothing of that sort, and the locking it
+	 * gets matches the normal symlink(2) semantics.  Without
+	 * attempts to resolve the target (which might very well
+	 * not even exist yet) done prior to locking the parent
+	 * directory.  This perversion, OTOH, needs to resolve
+	 * the target, which would lead to obvious deadlocks if
+	 * attempted with any directories locked.
+	 *
+	 * Unfortunately, that garbage is userland ABI and we should've
+	 * said "no" back in 2005.  Too late now, so we get to
+	 * play very ugly games with locking.
+	 *
+	 * Try *ANYTHING* of that sort in new code, and you will
+	 * really regret it.  Just ask yourself - what could a BOFH
+	 * do to me and do I want to find it out first-hand?
+	 *
+	 *  AV, a thoroughly annoyed bastard.
+	 */
+	inode_unlock(dir);
 	ret = get_target(symname, &path, &target_item, dentry->d_sb);
+	inode_lock(dir);
 	if (ret)
 		goto out_put;
 
-	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (dentry->d_inode || d_unhashed(dentry))
+		ret = -EEXIST;
+	else
+		ret = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (!ret)
+		ret = type->ct_item_ops->allow_link(parent_item, target_item);
 	if (!ret) {
 		mutex_lock(&configfs_symlink_mutex);
 		ret = create_link(parent_item, target_item, dentry);
@@ -176,15 +214,12 @@
 
 out_put:
 	config_item_put(parent_item);
-
-out:
 	return ret;
 }
 
 int configfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct configfs_dirent *sd = dentry->d_fsdata;
-	struct configfs_symlink *sl;
+	struct configfs_dirent *sd = dentry->d_fsdata, *target_sd;
 	struct config_item *parent_item;
 	const struct config_item_type *type;
 	int ret;
@@ -193,7 +228,7 @@
 	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
 		goto out;
 
-	sl = sd->s_element;
+	target_sd = sd->s_element;
 
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
@@ -207,21 +242,18 @@
 
 	/*
 	 * drop_link() must be called before
-	 * list_del_init(&sl->sl_list), so that the order of
+	 * decrementing target's ->s_links, so that the order of
 	 * drop_link(this, target) and drop_item(target) is preserved.
 	 */
 	if (type && type->ct_item_ops &&
 	    type->ct_item_ops->drop_link)
 		type->ct_item_ops->drop_link(parent_item,
-					       sl->sl_target);
+					       target_sd->s_element);
 
 	spin_lock(&configfs_dirent_lock);
-	list_del_init(&sl->sl_list);
+	target_sd->s_links--;
 	spin_unlock(&configfs_dirent_lock);
-
-	/* Put reference from create_link() */
-	config_item_put(sl->sl_target);
-	kfree(sl);
+	configfs_put(target_sd);
 
 	config_item_put(parent_item);
 
@@ -231,79 +263,8 @@
 	return ret;
 }
 
-static int configfs_get_target_path(struct config_item * item, struct config_item * target,
-				   char *path)
-{
-	char * s;
-	int depth, size;
-
-	depth = item_depth(item);
-	size = item_path_length(target) + depth * 3 - 1;
-	if (size > PATH_MAX)
-		return -ENAMETOOLONG;
-
-	pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size);
-
-	for (s = path; depth--; s += 3)
-		strcpy(s,"../");
-
-	fill_item_path(target, path, size);
-	pr_debug("%s: path = '%s'\n", __func__, path);
-
-	return 0;
-}
-
-static int configfs_getlink(struct dentry *dentry, char * path)
-{
-	struct config_item *item, *target_item;
-	int error = 0;
-
-	item = configfs_get_config_item(dentry->d_parent);
-	if (!item)
-		return -EINVAL;
-
-	target_item = configfs_get_config_item(dentry);
-	if (!target_item) {
-		config_item_put(item);
-		return -EINVAL;
-	}
-
-	down_read(&configfs_rename_sem);
-	error = configfs_get_target_path(item, target_item, path);
-	up_read(&configfs_rename_sem);
-
-	config_item_put(item);
-	config_item_put(target_item);
-	return error;
-
-}
-
-static const char *configfs_get_link(struct dentry *dentry,
-				     struct inode *inode,
-				     struct delayed_call *done)
-{
-	char *body;
-	int error;
-
-	if (!dentry)
-		return ERR_PTR(-ECHILD);
-
-	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!body)
-		return ERR_PTR(-ENOMEM);
-
-	error = configfs_getlink(dentry, body);
-	if (!error) {
-		set_delayed_call(done, kfree_link, body);
-		return body;
-	}
-
-	kfree(body);
-	return ERR_PTR(error);
-}
-
 const struct inode_operations configfs_symlink_inode_operations = {
-	.get_link = configfs_get_link,
+	.get_link = simple_get_link,
 	.setattr = configfs_setattr,
 };
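
Switching to simple_get_link() works because the symlink body computed in
create_link() is now attached to the inode at creation time, so no per-call
allocation or delayed_call teardown is needed. Its implementation in
fs/libfs.c is just:

	const char *simple_get_link(struct dentry *dentry, struct inode *inode,
				    struct delayed_call *done)
	{
		return inode->i_link;
	}
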
 
diff --git a/fs/coredump.c b/fs/coredump.c
index 1e2c87a..b1ea7df 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -7,6 +7,7 @@
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
+#include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
@@ -187,11 +188,13 @@
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(struct core_name *cn, struct coredump_params *cprm)
+static int format_corename(struct core_name *cn, struct coredump_params *cprm,
+			   size_t **argv, int *argc)
 {
 	const struct cred *cred = current_cred();
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
+	bool was_space = false;
 	int pid_in_pattern = 0;
 	int err = 0;
 
@@ -201,12 +204,35 @@
 		return -ENOMEM;
 	cn->corename[0] = '\0';
 
-	if (ispipe)
+	if (ispipe) {
+		int argvs = sizeof(core_pattern) / 2;
+		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
+		if (!(*argv))
+			return -ENOMEM;
+		(*argv)[(*argc)++] = 0;
 		++pat_ptr;
+	}
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
+		/*
+		 * Split on spaces before doing template expansion so that
+		 * %e and %E don't get split if they have spaces in them
+		 */
+		if (ispipe) {
+			if (isspace(*pat_ptr)) {
+				was_space = true;
+				pat_ptr++;
+				continue;
+			} else if (was_space) {
+				was_space = false;
+				err = cn_printf(cn, "%c", '\0');
+				if (err)
+					return err;
+				(*argv)[(*argc)++] = cn->used;
+			}
+		}
 		if (*pat_ptr != '%') {
 			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
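
The net effect of the bookkeeping above: instead of expanding the pattern into
one string and re-splitting it with argv_split() (which would split %e/%E
expansions containing spaces), the expanded pattern is emitted as NUL-separated
words while argv[] records each word's offset into cn->corename, so that
helper_argv[i] can later be computed as corename + argv[i]. A self-contained
illustration of the offset scheme (percent-expansion elided):

	#include <stdio.h>

	int main(void)
	{
		char corename[64];
		size_t argv[8];
		int argc = 0;
		const char *pattern = "/usr/bin/helper -p %p"; /* after '|' */
		size_t used = 0;
		int was_space = 0;

		argv[argc++] = 0;
		for (const char *p = pattern; *p; p++) {
			if (*p == ' ') {
				was_space = 1;
				continue;
			}
			if (was_space) {
				was_space = 0;
				corename[used++] = '\0';	/* end previous word */
				argv[argc++] = used;		/* next word starts here */
			}
			corename[used++] = *p;	/* %-expansion elided */
		}
		corename[used] = '\0';

		for (int i = 0; i < argc; i++)
			printf("argv[%d] = \"%s\"\n", i, corename + argv[i]);
		return 0;
	}
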
@@ -536,7 +562,7 @@
 	return err;
 }
 
-void do_coredump(const siginfo_t *siginfo)
+void do_coredump(const kernel_siginfo_t *siginfo)
 {
 	struct core_state core_state;
 	struct core_name cn;
@@ -546,6 +572,8 @@
 	struct cred *cred;
 	int retval = 0;
 	int ispipe;
+	size_t *argv = NULL;
+	int argc = 0;
 	struct files_struct *displaced;
 	/* require nonrelative corefile path and be extra careful */
 	bool need_suid_safe = false;
@@ -592,9 +620,10 @@
 
 	old_cred = override_creds(cred);
 
-	ispipe = format_corename(&cn, &cprm);
+	ispipe = format_corename(&cn, &cprm, &argv, &argc);
 
 	if (ispipe) {
+		int argi;
 		int dump_count;
 		char **helper_argv;
 		struct subprocess_info *sub_info;
@@ -637,12 +666,16 @@
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
+		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
+					    GFP_KERNEL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
 			goto fail_dropcount;
 		}
+		for (argi = 0; argi < argc; argi++)
+			helper_argv[argi] = cn.corename + argv[argi];
+		helper_argv[argi] = NULL;
 
 		retval = -ENOMEM;
 		sub_info = call_usermodehelper_setup(helper_argv[0],
@@ -652,7 +685,7 @@
 			retval = call_usermodehelper_exec(sub_info,
 							  UMH_WAIT_EXEC);
 
-		argv_free(helper_argv);
+		kfree(helper_argv);
 		if (retval) {
 			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
@@ -766,6 +799,7 @@
 	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
+	kfree(argv);
 	kfree(cn.corename);
 	coredump_finish(mm, core_dumped);
 	revert_creds(old_cred);
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index 5933f99..c8bebb7 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
 	select ZLIB_INFLATE
diff --git a/fs/cramfs/Makefile b/fs/cramfs/Makefile
index 92ebb46..8c3ed29 100644
--- a/fs/cramfs/Makefile
+++ b/fs/cramfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux cramfs routines.
 #
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6e00039..2f04024 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -24,6 +24,7 @@
 #include <linux/blkdev.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/super.h>
+#include <linux/fs_context.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
@@ -419,9 +420,12 @@
 		int i;
 		vma->vm_flags |= VM_MIXEDMAP;
 		for (i = 0; i < pages && !ret; i++) {
+			vm_fault_t vmf;
 			unsigned long off = i * PAGE_SIZE;
 			pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
-			ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+			vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+			if (vmf & VM_FAULT_ERROR)
+				ret = vm_fault_to_errno(vmf, 0);
 		}
 	}
 
@@ -503,18 +507,19 @@
 	kfree(sbi);
 }
 
-static int cramfs_remount(struct super_block *sb, int *flags, char *data)
+static int cramfs_reconfigure(struct fs_context *fc)
 {
-	sync_filesystem(sb);
-	*flags |= SB_RDONLY;
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_RDONLY;
 	return 0;
 }
 
-static int cramfs_read_super(struct super_block *sb,
-			     struct cramfs_super *super, int silent)
+static int cramfs_read_super(struct super_block *sb, struct fs_context *fc,
+			     struct cramfs_super *super)
 {
 	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
 	unsigned long root_offset;
+	bool silent = fc->sb_flags & SB_SILENT;
 
 	/* We don't know the real size yet */
 	sbi->size = PAGE_SIZE;
@@ -529,7 +534,7 @@
 		/* check for wrong endianness */
 		if (super->magic == CRAMFS_MAGIC_WEND) {
 			if (!silent)
-				pr_err("wrong endianness\n");
+				errorf(fc, "cramfs: wrong endianness");
 			return -EINVAL;
 		}
 
@@ -541,22 +546,22 @@
 		mutex_unlock(&read_mutex);
 		if (super->magic != CRAMFS_MAGIC) {
 			if (super->magic == CRAMFS_MAGIC_WEND && !silent)
-				pr_err("wrong endianness\n");
+				errorf(fc, "cramfs: wrong endianness");
 			else if (!silent)
-				pr_err("wrong magic\n");
+				errorf(fc, "cramfs: wrong magic");
 			return -EINVAL;
 		}
 	}
 
 	/* get feature flags first */
 	if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) {
-		pr_err("unsupported filesystem features\n");
+		errorf(fc, "cramfs: unsupported filesystem features");
 		return -EINVAL;
 	}
 
 	/* Check that the root inode is in a sane state */
 	if (!S_ISDIR(super->root.mode)) {
-		pr_err("root is not a directory\n");
+		errorf(fc, "cramfs: root is not a directory");
 		return -EINVAL;
 	}
 	/* correct strange, hard-coded permissions of mkcramfs */
@@ -575,12 +580,12 @@
 	sbi->magic = super->magic;
 	sbi->flags = super->flags;
 	if (root_offset == 0)
-		pr_info("empty filesystem");
+		infof(fc, "cramfs: empty filesystem");
 	else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
 		 ((root_offset != sizeof(struct cramfs_super)) &&
 		  (root_offset != 512 + sizeof(struct cramfs_super))))
 	{
-		pr_err("bad root offset %lu\n", root_offset);
+		errorf(fc, "cramfs: bad root offset %lu", root_offset);
 		return -EINVAL;
 	}
 
@@ -594,6 +599,8 @@
 
 	/* Set it all up.. */
 	sb->s_flags |= SB_RDONLY;
+	sb->s_time_min = 0;
+	sb->s_time_max = 0;
 	sb->s_op = &cramfs_ops;
 	root = get_cramfs_inode(sb, cramfs_root, 0);
 	if (IS_ERR(root))
@@ -604,8 +611,7 @@
 	return 0;
 }
 
-static int cramfs_blkdev_fill_super(struct super_block *sb, void *data,
-				    int silent)
+static int cramfs_blkdev_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct cramfs_sb_info *sbi;
 	struct cramfs_super super;
@@ -620,14 +626,13 @@
 	for (i = 0; i < READ_BUFFERS; i++)
 		buffer_blocknr[i] = -1;
 
-	err = cramfs_read_super(sb, &super, silent);
+	err = cramfs_read_super(sb, fc, &super);
 	if (err)
 		return err;
 	return cramfs_finalize_super(sb, &super.root);
 }
 
-static int cramfs_mtd_fill_super(struct super_block *sb, void *data,
-				 int silent)
+static int cramfs_mtd_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct cramfs_sb_info *sbi;
 	struct cramfs_super super;
@@ -649,7 +654,7 @@
 
 	pr_info("checking physical address %pap for linear cramfs image\n",
 		&sbi->linear_phys_addr);
-	err = cramfs_read_super(sb, &super, silent);
+	err = cramfs_read_super(sb, fc, &super);
 	if (err)
 		return err;
 
@@ -870,8 +875,8 @@
 			if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
 				/* See comments on earlier code. */
 				u32 prev_start = block_start;
-			       block_start = prev_start & ~CRAMFS_BLK_FLAGS;
-			       block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+				block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+				block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
 				if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
 					block_start += PAGE_SIZE;
 				} else {
@@ -944,32 +949,41 @@
 };
 
 static const struct super_operations cramfs_ops = {
-	.remount_fs	= cramfs_remount,
 	.statfs		= cramfs_statfs,
 };
 
-static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags,
-				   const char *dev_name, void *data)
+static int cramfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *ret = ERR_PTR(-ENOPROTOOPT);
+	int ret = -ENOPROTOOPT;
 
 	if (IS_ENABLED(CONFIG_CRAMFS_MTD)) {
-		ret = mount_mtd(fs_type, flags, dev_name, data,
-				cramfs_mtd_fill_super);
-		if (!IS_ERR(ret))
-			return ret;
+		ret = get_tree_mtd(fc, cramfs_mtd_fill_super);
+		if (!ret)
+			return 0;
 	}
-	if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) {
-		ret = mount_bdev(fs_type, flags, dev_name, data,
-				 cramfs_blkdev_fill_super);
-	}
+	if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV))
+		ret = get_tree_bdev(fc, cramfs_blkdev_fill_super);
 	return ret;
 }
 
+static const struct fs_context_operations cramfs_context_ops = {
+	.get_tree	= cramfs_get_tree,
+	.reconfigure	= cramfs_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int cramfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &cramfs_context_ops;
+	return 0;
+}
+
 static struct file_system_type cramfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "cramfs",
-	.mount		= cramfs_mount,
+	.init_fs_context = cramfs_init_fs_context,
 	.kill_sb	= cramfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
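
For context, this is the standard new-mount-API conversion: .mount/mount_bdev()
become .init_fs_context/get_tree_bdev(), and fill_super callbacks take a
struct fs_context in place of the old (data, silent) arguments. A minimal
sketch of the pattern for a hypothetical read-only filesystem "foofs":

	/* Sketch under the mount API; foofs_* names are hypothetical. */
	static int foofs_fill_super(struct super_block *sb,
				    struct fs_context *fc)
	{
		bool silent = fc->sb_flags & SB_SILENT;

		/* ... read and validate the on-disk superblock,
		 * reporting problems via errorf(fc, ...) unless silent ... */
		return 0;
	}

	static int foofs_get_tree(struct fs_context *fc)
	{
		return get_tree_bdev(fc, foofs_fill_super);
	}

	static const struct fs_context_operations foofs_context_ops = {
		.get_tree	= foofs_get_tree,
	};

	static int foofs_init_fs_context(struct fs_context *fc)
	{
		fc->ops = &foofs_context_ops;
		return 0;
	}
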
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 02b7d91..ff5a174 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,16 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config FS_ENCRYPTION
-	tristate "FS Encryption (Per-file encryption)"
+	bool "FS Encryption (Per-file encryption)"
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_CBC
 	select CRYPTO_ECB
 	select CRYPTO_XTS
 	select CRYPTO_CTS
-	select CRYPTO_CTR
-	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_HMAC
 	select KEYS
 	help
 	  Enable encryption of files and directories.  This
 	  feature is similar to ecryptfs, but it is more memory
 	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
+	  decrypted pages in the page cache.  Currently Ext4,
+	  F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index cb49698..232e2bb 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,4 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 
-fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
+fscrypto-y := crypto.o \
+	      fname.o \
+	      hkdf.o \
+	      hooks.o \
+	      keyring.o \
+	      keysetup.o \
+	      keysetup_v1.o \
+	      policy.o
+
 fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044..82da251 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -29,19 +29,16 @@
 static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
 {
 	struct bio_vec *bv;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, iter_all) {
 		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
-
-		if (ret) {
-			WARN_ON_ONCE(1);
+		int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
+							   bv->bv_offset);
+		if (ret)
 			SetPageError(page);
-		} else if (done) {
+		else if (done)
 			SetPageUptodate(page);
-		}
 		if (done)
 			unlock_page(page);
 	}
@@ -55,9 +52,8 @@
 
 static void completion_pages(struct work_struct *work)
 {
-	struct fscrypt_ctx *ctx =
-		container_of(work, struct fscrypt_ctx, r.work);
-	struct bio *bio = ctx->r.bio;
+	struct fscrypt_ctx *ctx = container_of(work, struct fscrypt_ctx, work);
+	struct bio *bio = ctx->bio;
 
 	__fscrypt_decrypt_bio(bio, true);
 	fscrypt_release_ctx(ctx);
@@ -66,57 +62,29 @@
 
 void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio)
 {
-	INIT_WORK(&ctx->r.work, completion_pages);
-	ctx->r.bio = bio;
-	fscrypt_enqueue_decrypt_work(&ctx->r.work);
+	INIT_WORK(&ctx->work, completion_pages);
+	ctx->bio = bio;
+	fscrypt_enqueue_decrypt_work(&ctx->work);
 }
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio);
 
-void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *bounce_page;
-
-	/* The bounce data pages are unmapped. */
-	if ((*page)->mapping)
-		return;
-
-	/* The bounce data page is unmapped. */
-	bounce_page = *page;
-	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
-
-	/* restore control page */
-	*page = ctx->w.control_page;
-
-	if (restore)
-		fscrypt_restore_control_page(bounce_page);
-}
-EXPORT_SYMBOL(fscrypt_pullback_bio_page);
-
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 				sector_t pblk, unsigned int len)
 {
-	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = NULL;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	struct page *ciphertext_page;
 	struct bio *bio;
 	int ret, err = 0;
 
-	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
-
-	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
-	if (IS_ERR(ciphertext_page)) {
-		err = PTR_ERR(ciphertext_page);
-		goto errout;
-	}
+	ciphertext_page = fscrypt_alloc_bounce_page(GFP_NOWAIT);
+	if (!ciphertext_page)
+		return -ENOMEM;
 
 	while (len--) {
-		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
-					     ZERO_PAGE(0), ciphertext_page,
-					     PAGE_SIZE, 0, GFP_NOFS);
+		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
+					  ZERO_PAGE(0), ciphertext_page,
+					  blocksize, 0, GFP_NOFS);
 		if (err)
 			goto errout;
 
@@ -126,14 +94,11 @@
 			goto errout;
 		}
 		bio_set_dev(bio, inode->i_sb->s_bdev);
-		bio->bi_iter.bi_sector =
-			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		bio->bi_iter.bi_sector = pblk << (blockbits - 9);
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-		ret = bio_add_page(bio, ciphertext_page,
-					inode->i_sb->s_blocksize, 0);
-		if (ret != inode->i_sb->s_blocksize) {
+		ret = bio_add_page(bio, ciphertext_page, blocksize, 0);
+		if (WARN_ON(ret != blocksize)) {
 			/* should never happen! */
-			WARN_ON(1);
 			bio_put(bio);
 			err = -EIO;
 			goto errout;
@@ -149,7 +114,7 @@
 	}
 	err = 0;
 errout:
-	fscrypt_release_ctx(ctx);
+	fscrypt_free_bounce_page(ciphertext_page);
 	return err;
 }
 EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 0f46cf5..32a7ad0 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This contains encryption functions for per-file encryption.
  *
@@ -58,23 +59,16 @@
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
 
 /**
- * fscrypt_release_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
+ * fscrypt_release_ctx() - Release a decryption context
+ * @ctx: The decryption context to release.
  *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
+ * If the decryption context was allocated from the pre-allocated pool, return
+ * it to that pool.  Else, free it.
  */
 void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 {
 	unsigned long flags;
 
-	if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
-		mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
-		ctx->w.bounce_page = NULL;
-	}
-	ctx->w.control_page = NULL;
 	if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
 		kmem_cache_free(fscrypt_ctx_cachep, ctx);
 	} else {
@@ -86,33 +80,21 @@
 EXPORT_SYMBOL(fscrypt_release_ctx);
 
 /**
- * fscrypt_get_ctx() - Gets an encryption context
- * @inode:       The inode for which we are doing the crypto
+ * fscrypt_get_ctx() - Get a decryption context
  * @gfp_flags:   The gfp flag for memory allocation
  *
- * Allocates and initializes an encryption context.
+ * Allocate and initialize a decryption context.
  *
- * Return: An allocated and initialized encryption context on success; error
- * value or NULL otherwise.
+ * Return: A new decryption context on success; an ERR_PTR() otherwise.
  */
-struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
+struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
 {
-	struct fscrypt_ctx *ctx = NULL;
-	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct fscrypt_ctx *ctx;
 	unsigned long flags;
 
-	if (ci == NULL)
-		return ERR_PTR(-ENOKEY);
-
 	/*
-	 * We first try getting the ctx from a free list because in
-	 * the common case the ctx will have an allocated and
-	 * initialized crypto tfm, so it's probably a worthwhile
-	 * optimization. For the bounce page, we first try getting it
-	 * from the kernel allocator because that's just about as fast
-	 * as getting it from a list and because a cache of free pages
-	 * should generally be a "last resort" option for a filesystem
-	 * to be able to do its job.
+	 * First try getting a ctx from the free list so that we don't have to
+	 * call into the slab allocator.
 	 */
 	spin_lock_irqsave(&fscrypt_ctx_lock, flags);
 	ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
@@ -128,20 +110,51 @@
 	} else {
 		ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
 	}
-	ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
 	return ctx;
 }
 EXPORT_SYMBOL(fscrypt_get_ctx);
 
-int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
-			   u64 lblk_num, struct page *src_page,
-			   struct page *dest_page, unsigned int len,
-			   unsigned int offs, gfp_t gfp_flags)
+struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 {
-	struct {
-		__le64 index;
-		u8 padding[FS_IV_SIZE - sizeof(__le64)];
-	} iv;
+	return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
+}
+
+/**
+ * fscrypt_free_bounce_page() - free a ciphertext bounce page
+ *
+ * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(),
+ * or by fscrypt_alloc_bounce_page() directly.
+ */
+void fscrypt_free_bounce_page(struct page *bounce_page)
+{
+	if (!bounce_page)
+		return;
+	set_page_private(bounce_page, (unsigned long)NULL);
+	ClearPagePrivate(bounce_page);
+	mempool_free(bounce_page, fscrypt_bounce_page_pool);
+}
+EXPORT_SYMBOL(fscrypt_free_bounce_page);
+
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+			 const struct fscrypt_info *ci)
+{
+	memset(iv, 0, ci->ci_mode->ivsize);
+	iv->lblk_num = cpu_to_le64(lblk_num);
+
+	if (fscrypt_is_direct_key_policy(&ci->ci_policy))
+		memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+	if (ci->ci_essiv_tfm != NULL)
+		crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw);
+}
+
+/* Encrypt or decrypt a single filesystem block of file contents */
+int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
+			u64 lblk_num, struct page *src_page,
+			struct page *dest_page, unsigned int len,
+			unsigned int offs, gfp_t gfp_flags)
+{
+	union fscrypt_iv iv;
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist dst, src;
@@ -149,17 +162,12 @@
 	struct crypto_skcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 
-	BUG_ON(len == 0);
+	if (WARN_ON_ONCE(len <= 0))
+		return -EINVAL;
+	if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0))
+		return -EINVAL;
 
-	BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
-	BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
-	iv.index = cpu_to_le64(lblk_num);
-	memset(iv.padding, 0, sizeof(iv.padding));
-
-	if (ci->ci_essiv_tfm != NULL) {
-		crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv,
-					  (u8 *)&iv);
-	}
+	fscrypt_generate_iv(&iv, lblk_num, ci);
 
 	req = skcipher_request_alloc(tfm, gfp_flags);
 	if (!req)
@@ -180,194 +188,214 @@
 		res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res) {
-		fscrypt_err(inode->i_sb,
-			    "%scryption failed for inode %lu, block %llu: %d",
-			    (rw == FS_DECRYPT ? "de" : "en"),
-			    inode->i_ino, lblk_num, res);
+		fscrypt_err(inode, "%scryption failed for block %llu: %d",
+			    (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
 		return res;
 	}
 	return 0;
 }
 
-struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
-				       gfp_t gfp_flags)
-{
-	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
-	if (ctx->w.bounce_page == NULL)
-		return ERR_PTR(-ENOMEM);
-	ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
-	return ctx->w.bounce_page;
-}
-
 /**
- * fscypt_encrypt_page() - Encrypts a page
- * @inode:     The inode for which the encryption should take place
- * @page:      The page to encrypt. Must be locked for bounce-page
- *             encryption.
- * @len:       Length of data to encrypt in @page and encrypted
- *             data in returned page.
- * @offs:      Offset of data within @page and returned
- *             page holding encrypted data.
- * @lblk_num:  Logical block number. This must be unique for multiple
- *             calls with same inode, except when overwriting
- *             previously written data.
- * @gfp_flags: The gfp flag for memory allocation
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page
+ * @page:      The locked pagecache page containing the block(s) to encrypt
+ * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
+ *		multiple of the filesystem's block size.
+ * @offs:      Byte offset within @page of the first block to encrypt.  Must be
+ *		a multiple of the filesystem's block size.
+ * @gfp_flags: Memory allocation flags
  *
- * Encrypts @page using the ctx encryption context. Performs encryption
- * either in-place or into a newly allocated bounce page.
- * Called on the page write path.
+ * A new bounce page is allocated, and the specified block(s) are encrypted into
+ * it.  In the bounce page, the ciphertext block(s) will be located at the same
+ * offsets at which the plaintext block(s) were located in the source page; any
+ * other parts of the bounce page will be left uninitialized.  However, normally
+ * blocksize == PAGE_SIZE and the whole page is encrypted at once.
  *
- * Bounce page allocation is the default.
- * In this case, the contents of @page are encrypted and stored in an
- * allocated bounce page. @page has to be locked and the caller must call
- * fscrypt_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
+ * This is for use by the filesystem's ->writepages() method.
  *
- * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
- * fscrypt_operations. Here, the input-page is returned with its content
- * encrypted.
- *
- * Return: A page with the encrypted content on success. Else, an
- * error value or NULL.
+ * Return: the new encrypted bounce page on success; an ERR_PTR() on failure
  */
-struct page *fscrypt_encrypt_page(const struct inode *inode,
-				struct page *page,
-				unsigned int len,
-				unsigned int offs,
-				u64 lblk_num, gfp_t gfp_flags)
+struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
+					      unsigned int len,
+					      unsigned int offs,
+					      gfp_t gfp_flags)
 
 {
-	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = page;
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	struct page *ciphertext_page;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
 	int err;
 
-	BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
+	if (WARN_ON_ONCE(!PageLocked(page)))
+		return ERR_PTR(-EINVAL);
 
-	if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
-		/* with inplace-encryption we just encrypt the page */
-		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
-					     ciphertext_page, len, offs,
-					     gfp_flags);
-		if (err)
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return ERR_PTR(-EINVAL);
+
+	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
+	if (!ciphertext_page)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
+					  page, ciphertext_page,
+					  blocksize, i, gfp_flags);
+		if (err) {
+			fscrypt_free_bounce_page(ciphertext_page);
 			return ERR_PTR(err);
-
-		return ciphertext_page;
-	}
-
-	BUG_ON(!PageLocked(page));
-
-	ctx = fscrypt_get_ctx(inode, gfp_flags);
-	if (IS_ERR(ctx))
-		return (struct page *)ctx;
-
-	/* The encryption operation will require a bounce page. */
-	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
-	if (IS_ERR(ciphertext_page))
-		goto errout;
-
-	ctx->w.control_page = page;
-	err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
-				     page, ciphertext_page, len, offs,
-				     gfp_flags);
-	if (err) {
-		ciphertext_page = ERR_PTR(err);
-		goto errout;
+		}
 	}
 	SetPagePrivate(ciphertext_page);
-	set_page_private(ciphertext_page, (unsigned long)ctx);
-	lock_page(ciphertext_page);
-	return ciphertext_page;
-
-errout:
-	fscrypt_release_ctx(ctx);
+	set_page_private(ciphertext_page, (unsigned long)page);
 	return ciphertext_page;
 }
-EXPORT_SYMBOL(fscrypt_encrypt_page);
+EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
 
 /**
- * fscrypt_decrypt_page() - Decrypts a page in-place
- * @inode:     The corresponding inode for the page to decrypt.
- * @page:      The page to decrypt. Must be locked in case
- *             it is a writeback page (FS_CFLG_OWN_PAGES unset).
- * @len:       Number of bytes in @page to be decrypted.
- * @offs:      Start of data in @page.
- * @lblk_num:  Logical block number.
+ * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place
+ * @inode:     The inode to which this block belongs
+ * @page:      The page containing the block to encrypt
+ * @len:       Size of block to encrypt.  Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to encrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ * @gfp_flags: Memory allocation flags
  *
- * Decrypts page in-place using the ctx encryption context.
+ * Encrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page.  The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
  *
- * Called from the read completion callback.
- *
- * Return: Zero on success, non-zero otherwise.
+ * Return: 0 on success; -errno on failure
  */
-int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
-			unsigned int len, unsigned int offs, u64 lblk_num)
+int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
+				  unsigned int len, unsigned int offs,
+				  u64 lblk_num, gfp_t gfp_flags)
 {
-	if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
-		BUG_ON(!PageLocked(page));
-
-	return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
-				      len, offs, GFP_NOFS);
+	return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
+				   len, offs, gfp_flags);
 }
-EXPORT_SYMBOL(fscrypt_decrypt_page);
+EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
+
+/**
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page
+ * @page:      The locked pagecache page containing the block(s) to decrypt
+ * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
+ *		multiple of the filesystem's block size.
+ * @offs:      Byte offset within @page of the first block to decrypt.  Must be
+ *		a multiple of the filesystem's block size.
+ *
+ * The specified block(s) are decrypted in-place within the pagecache page,
+ * which must still be locked and not uptodate.  Normally, blocksize ==
+ * PAGE_SIZE and the whole page is decrypted at once.
+ *
+ * This is for use by the filesystem's ->readpages() method.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
+				     unsigned int offs)
+{
+	const struct inode *inode = page->mapping->host;
+	const unsigned int blockbits = inode->i_blkbits;
+	const unsigned int blocksize = 1 << blockbits;
+	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+		       (offs >> blockbits);
+	unsigned int i;
+	int err;
+
+	if (WARN_ON_ONCE(!PageLocked(page)))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+		return -EINVAL;
+
+	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
+					  page, blocksize, i, GFP_NOFS);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
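
And the matching read-path usage, mirroring the __fscrypt_decrypt_bio() logic
in fs/crypto/bio.c earlier in this patch (sketch only; foofs_read_end_io is a
hypothetical completion handler):

	static void foofs_read_end_io(struct page *page)
	{
		if (fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0))
			SetPageError(page);
		else
			SetPageUptodate(page);
		unlock_page(page);
	}
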
+
+/**
+ * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
+ * @inode:     The inode to which this block belongs
+ * @page:      The page containing the block to decrypt
+ * @len:       Size of block to decrypt.  Doesn't need to be a multiple of the
+ *		fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @offs:      Byte offset within @page at which the block to decrypt begins
+ * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
+ *		number of the block within the file
+ *
+ * Decrypt a possibly-compressed filesystem block that is located in an
+ * arbitrary page, not necessarily in the original pagecache page.  The @inode
+ * and @lblk_num must be specified, as they can't be determined from @page.
+ *
+ * Return: 0 on success; -errno on failure
+ */
+int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
+				  unsigned int len, unsigned int offs,
+				  u64 lblk_num)
+{
+	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
+				   len, offs, GFP_NOFS);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
 
 /*
- * Validate dentries for encrypted directories to make sure we aren't
- * potentially caching stale data after a key has been added or
- * removed.
+ * Validate dentries in encrypted directories to make sure we aren't potentially
+ * caching stale dentries after a key has been added.
  */
 static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct dentry *dir;
-	int dir_has_key, cached_with_key;
+	int err;
+	int valid;
+
+	/*
+	 * Plaintext names are always valid, since fscrypt doesn't support
+	 * reverting to ciphertext names without evicting the directory's inode
+	 * -- which implies eviction of the dentries in the directory.
+	 */
+	if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+		return 1;
+
+	/*
+	 * Ciphertext name; valid if the directory's key is still unavailable.
+	 *
+	 * Although fscrypt forbids rename() on ciphertext names, we still must
+	 * use dget_parent() here rather than use ->d_parent directly.  That's
+	 * because a corrupted fs image may contain directory hard links, which
+	 * the VFS handles by moving the directory's dentry tree in the dcache
+	 * each time ->lookup() finds the directory and it already has a dentry
+	 * elsewhere.  Thus ->d_parent can be changing, and we must safely grab
+	 * a reference to some ->d_parent to prevent it from being freed.
+	 */
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	dir = dget_parent(dentry);
-	if (!IS_ENCRYPTED(d_inode(dir))) {
-		dput(dir);
-		return 0;
-	}
-
-	spin_lock(&dentry->d_lock);
-	cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
-	spin_unlock(&dentry->d_lock);
-	dir_has_key = (d_inode(dir)->i_crypt_info != NULL);
+	err = fscrypt_get_encryption_info(d_inode(dir));
+	valid = !fscrypt_has_encryption_key(d_inode(dir));
 	dput(dir);
 
-	/*
-	 * If the dentry was cached without the key, and it is a
-	 * negative dentry, it might be a valid name.  We can't check
-	 * if the key has since been made available due to locking
-	 * reasons, so we fail the validation so ext4_lookup() can do
-	 * this check.
-	 *
-	 * We also fail the validation if the dentry was created with
-	 * the key present, but we no longer have the key, or vice versa.
-	 */
-	if ((!cached_with_key && d_is_negative(dentry)) ||
-			(!cached_with_key && dir_has_key) ||
-			(cached_with_key && !dir_has_key))
-		return 0;
-	return 1;
+	if (err < 0)
+		return err;
+
+	return valid;
 }
 
 const struct dentry_operations fscrypt_d_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
 };
 
-void fscrypt_restore_control_page(struct page *page)
-{
-	struct fscrypt_ctx *ctx;
-
-	ctx = (struct fscrypt_ctx *)page_private(page);
-	set_page_private(page, (unsigned long)NULL);
-	ClearPagePrivate(page);
-	unlock_page(page);
-	fscrypt_release_ctx(ctx);
-}
-EXPORT_SYMBOL(fscrypt_restore_control_page);
-
 static void fscrypt_destroy(void)
 {
 	struct fscrypt_ctx *pos, *n;
@@ -423,7 +451,7 @@
 	return res;
 }
 
-void fscrypt_msg(struct super_block *sb, const char *level,
+void fscrypt_msg(const struct inode *inode, const char *level,
 		 const char *fmt, ...)
 {
 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,8 +465,9 @@
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	if (sb)
-		printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf);
+	if (inode)
+		printk("%sfscrypt (%s, inode %lu): %pV\n",
+		       level, inode->i_sb->s_id, inode->i_ino, &vaf);
 	else
 		printk("%sfscrypt: %pV\n", level, &vaf);
 	va_end(args);
@@ -449,6 +478,8 @@
  */
 static int __init fscrypt_init(void)
 {
+	int err = -ENOMEM;
+
 	/*
 	 * Use an unbound workqueue to allow bios to be decrypted in parallel
 	 * even when they happen to complete on the same CPU.  This sacrifices
@@ -471,31 +502,19 @@
 	if (!fscrypt_info_cachep)
 		goto fail_free_ctx;
 
+	err = fscrypt_init_keyring();
+	if (err)
+		goto fail_free_info;
+
 	return 0;
 
+fail_free_info:
+	kmem_cache_destroy(fscrypt_info_cachep);
 fail_free_ctx:
 	kmem_cache_destroy(fscrypt_ctx_cachep);
 fail_free_queue:
 	destroy_workqueue(fscrypt_read_workqueue);
 fail:
-	return -ENOMEM;
+	return err;
 }
-module_init(fscrypt_init)
-
-/**
- * fscrypt_exit() - Shutdown the fs encryption system
- */
-static void __exit fscrypt_exit(void)
-{
-	fscrypt_destroy();
-
-	if (fscrypt_read_workqueue)
-		destroy_workqueue(fscrypt_read_workqueue);
-	kmem_cache_destroy(fscrypt_ctx_cachep);
-	kmem_cache_destroy(fscrypt_info_cachep);
-
-	fscrypt_essiv_cleanup();
-}
-module_exit(fscrypt_exit);
-
-MODULE_LICENSE("GPL");
+late_initcall(fscrypt_init)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d7a0f68..3da3707 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -12,7 +12,6 @@
  */
 
 #include <linux/scatterlist.h>
-#include <linux/ratelimit.h>
 #include <crypto/skcipher.h>
 #include "fscrypt_private.h"
 
@@ -40,10 +39,11 @@
 {
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
-	struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
-	int res = 0;
-	char iv[FS_CRYPTO_BLOCK_SIZE];
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_skcipher *tfm = ci->ci_ctfm;
+	union fscrypt_iv iv;
 	struct scatterlist sg;
+	int res;
 
 	/*
 	 * Copy the filename to the output buffer for encrypting in-place and
@@ -55,7 +55,7 @@
 	memset(out + iname->len, 0, olen - iname->len);
 
 	/* Initialize the IV */
-	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+	fscrypt_generate_iv(&iv, 0, ci);
 
 	/* Set up the encryption request */
 	req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -65,15 +65,13 @@
 			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 			crypto_req_done, &wait);
 	sg_init_one(&sg, out, olen);
-	skcipher_request_set_crypt(req, &sg, &sg, olen, iv);
+	skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
 
 	/* Do the encryption */
 	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res < 0) {
-		fscrypt_err(inode->i_sb,
-			    "Filename encryption failed for inode %lu: %d",
-			    inode->i_ino, res);
+		fscrypt_err(inode, "Filename encryption failed: %d", res);
 		return res;
 	}
 
@@ -94,9 +92,10 @@
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist src_sg, dst_sg;
-	struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
-	int res = 0;
-	char iv[FS_CRYPTO_BLOCK_SIZE];
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_skcipher *tfm = ci->ci_ctfm;
+	union fscrypt_iv iv;
+	int res;
 
 	/* Allocate request */
 	req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -107,18 +106,16 @@
 		crypto_req_done, &wait);
 
 	/* Initialize IV */
-	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+	fscrypt_generate_iv(&iv, 0, ci);
 
 	/* Create decryption request */
 	sg_init_one(&src_sg, iname->name, iname->len);
 	sg_init_one(&dst_sg, oname->name, oname->len);
-	skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
 	res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res < 0) {
-		fscrypt_err(inode->i_sb,
-			    "Filename decryption failed for inode %lu: %d",
-			    inode->i_ino, res);
+		fscrypt_err(inode, "Filename decryption failed: %d", res);
 		return res;
 	}
 
@@ -126,44 +123,45 @@
 	return 0;
 }
 
-static const char *lookup_table =
+static const char lookup_table[65] =
 	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
 
 #define BASE64_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)
 
 /**
- * digest_encode() -
+ * base64_encode() - base64-encode some bytes
  *
- * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * Encodes the input string using characters from the set [A-Za-z0-9+,].
  * The encoded string is roughly 4/3 times the size of the input string.
+ *
+ * Return: length of the encoded string
  */
-static int digest_encode(const char *src, int len, char *dst)
+static int base64_encode(const u8 *src, int len, char *dst)
 {
-	int i = 0, bits = 0, ac = 0;
+	int i, bits = 0, ac = 0;
 	char *cp = dst;
 
-	while (i < len) {
-		ac += (((unsigned char) src[i]) << bits);
+	for (i = 0; i < len; i++) {
+		ac += src[i] << bits;
 		bits += 8;
 		do {
 			*cp++ = lookup_table[ac & 0x3f];
 			ac >>= 6;
 			bits -= 6;
 		} while (bits >= 6);
-		i++;
 	}
 	if (bits)
 		*cp++ = lookup_table[ac & 0x3f];
 	return cp - dst;
 }
 
-static int digest_decode(const char *src, int len, char *dst)
+static int base64_decode(const char *src, int len, u8 *dst)
 {
-	int i = 0, bits = 0, ac = 0;
+	int i, bits = 0, ac = 0;
 	const char *p;
-	char *cp = dst;
+	u8 *cp = dst;
 
-	while (i < len) {
+	for (i = 0; i < len; i++) {
 		p = strchr(lookup_table, src[i]);
 		if (p == NULL || src[i] == 0)
 			return -2;
@@ -174,7 +172,6 @@
 			ac >>= 8;
 			bits -= 8;
 		}
-		i++;
 	}
 	if (ac)
 		return -1;
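
A standalone demo of the filename base64 variant above: 6 bits at a time in
little-endian bit order, alphabet [A-Za-z0-9+,], no '=' padding, so 3 input
bytes always produce 4 output characters:

	#include <stdio.h>

	static const char lookup_table[65] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

	static int base64_encode(const unsigned char *src, int len, char *dst)
	{
		int i, bits = 0, ac = 0;
		char *cp = dst;

		for (i = 0; i < len; i++) {
			ac += src[i] << bits;
			bits += 8;
			do {
				*cp++ = lookup_table[ac & 0x3f];
				ac >>= 6;
				bits -= 6;
			} while (bits >= 6);
		}
		if (bits)
			*cp++ = lookup_table[ac & 0x3f];
		return cp - dst;
	}

	int main(void)
	{
		char out[32];
		int n = base64_encode((const unsigned char *)"abc", 3, out);

		printf("%.*s\n", n, out);	/* 4 chars for 3 input bytes */
		return 0;
	}
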
@@ -184,8 +181,9 @@
 bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
 				  u32 max_len, u32 *encrypted_len_ret)
 {
-	int padding = 4 << (inode->i_crypt_info->ci_flags &
-			    FS_POLICY_FLAGS_PAD_MASK);
+	const struct fscrypt_info *ci = inode->i_crypt_info;
+	int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+			    FSCRYPT_POLICY_FLAGS_PAD_MASK);
 	u32 encrypted_len;
 
 	if (orig_len > max_len)
@@ -267,11 +265,11 @@
 	if (iname->len < FS_CRYPTO_BLOCK_SIZE)
 		return -EUCLEAN;
 
-	if (inode->i_crypt_info)
+	if (fscrypt_has_encryption_key(inode))
 		return fname_decrypt(inode, iname, oname);
 
 	if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
-		oname->len = digest_encode(iname->name, iname->len,
+		oname->len = base64_encode(iname->name, iname->len,
 					   oname->name);
 		return 0;
 	}
@@ -286,7 +284,7 @@
 	       FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
 	       FSCRYPT_FNAME_DIGEST_SIZE);
 	oname->name[0] = '_';
-	oname->len = 1 + digest_encode((const char *)&digested_name,
+	oname->len = 1 + base64_encode((const u8 *)&digested_name,
 				       sizeof(digested_name), oname->name + 1);
 	return 0;
 }
@@ -334,7 +332,7 @@
 	if (ret)
 		return ret;
 
-	if (dir->i_crypt_info) {
+	if (fscrypt_has_encryption_key(dir)) {
 		if (!fscrypt_fname_encrypted_size(dir, iname->len,
 						  dir->i_sb->s_cop->max_namelen,
 						  &fname->crypto_buf.len))
@@ -354,6 +352,7 @@
 	}
 	if (!lookup)
 		return -ENOKEY;
+	fname->is_ciphertext_name = true;
 
 	/*
 	 * We don't have the key and we are doing a lookup; decode the
@@ -378,8 +377,8 @@
 	if (fname->crypto_buf.name == NULL)
 		return -ENOMEM;
 
-	ret = digest_decode(iname->name + digested, iname->len - digested,
-				fname->crypto_buf.name);
+	ret = base64_decode(iname->name + digested, iname->len - digested,
+			    fname->crypto_buf.name);
 	if (ret < 0) {
 		ret = -ENOENT;
 		goto errout;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 79debfc..e84efc0 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -4,43 +4,143 @@
  *
  * Copyright (C) 2015, Google, Inc.
  *
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
  */
 
 #ifndef _FSCRYPT_PRIVATE_H
 #define _FSCRYPT_PRIVATE_H
 
-#define __FS_HAS_ENCRYPTION 1
 #include <linux/fscrypt.h>
 #include <crypto/hash.h>
 
-/* Encryption parameters */
-#define FS_IV_SIZE			16
+#define CONST_STRLEN(str)	(sizeof(str) - 1)
+
 #define FS_KEY_DERIVATION_NONCE_SIZE	16
 
-/**
- * Encryption context for inode
- *
- * Protector format:
- *  1 byte: Protector format (1 = this version)
- *  1 byte: File contents encryption mode
- *  1 byte: File names encryption mode
- *  1 byte: Flags
- *  8 bytes: Master Key descriptor
- *  16 bytes: Encryption Key derivation nonce
- */
-struct fscrypt_context {
-	u8 format;
+#define FSCRYPT_MIN_KEY_SIZE		16
+
+#define FSCRYPT_CONTEXT_V1	1
+#define FSCRYPT_CONTEXT_V2	2
+
+struct fscrypt_context_v1 {
+	u8 version; /* FSCRYPT_CONTEXT_V1 */
 	u8 contents_encryption_mode;
 	u8 filenames_encryption_mode;
 	u8 flags;
-	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
 	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
-} __packed;
+};
 
-#define FS_ENCRYPTION_CONTEXT_FORMAT_V1		1
+struct fscrypt_context_v2 {
+	u8 version; /* FSCRYPT_CONTEXT_V2 */
+	u8 contents_encryption_mode;
+	u8 filenames_encryption_mode;
+	u8 flags;
+	u8 __reserved[4];
+	u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
+	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+};
+
+/**
+ * fscrypt_context - the encryption context of an inode
+ *
+ * This is the on-disk equivalent of an fscrypt_policy, stored alongside each
+ * encrypted file usually in a hidden extended attribute.  It contains the
+ * fields from the fscrypt_policy, in order to identify the encryption algorithm
+ * and key with which the file is encrypted.  It also contains a nonce that was
+ * randomly generated by fscrypt itself; this is used as KDF input or as a tweak
+ * to cause different files to be encrypted differently.
+ */
+union fscrypt_context {
+	u8 version;
+	struct fscrypt_context_v1 v1;
+	struct fscrypt_context_v2 v2;
+};
+
+/*
+ * Return the size expected for the given fscrypt_context based on its version
+ * number, or 0 if the context version is unrecognized.
+ */
+static inline int fscrypt_context_size(const union fscrypt_context *ctx)
+{
+	switch (ctx->version) {
+	case FSCRYPT_CONTEXT_V1:
+		BUILD_BUG_ON(sizeof(ctx->v1) != 28);
+		return sizeof(ctx->v1);
+	case FSCRYPT_CONTEXT_V2:
+		BUILD_BUG_ON(sizeof(ctx->v2) != 40);
+		return sizeof(ctx->v2);
+	}
+	return 0;
+}
+
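
A sketch of the intended use of fscrypt_context_size(): the filesystem hands
back the raw context blob, and the size check rejects both unknown versions
(size 0) and a context that was truncated on disk. Here get_context() is a
hypothetical stand-in for the filesystem's ->get_context() hook:

	static int check_context(const struct inode *inode)
	{
		union fscrypt_context ctx = { 0 };
		int ret = get_context(inode, &ctx, sizeof(ctx));	/* hypothetical */

		if (ret < 0)
			return ret;			/* missing xattr or I/O error */
		if (fscrypt_context_size(&ctx) == 0)
			return -EINVAL;			/* unrecognized version */
		if (ret < fscrypt_context_size(&ctx))
			return -EINVAL;			/* truncated on disk */
		return 0;
	}
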
+#undef fscrypt_policy
+union fscrypt_policy {
+	u8 version;
+	struct fscrypt_policy_v1 v1;
+	struct fscrypt_policy_v2 v2;
+};
+
+/*
+ * Return the size expected for the given fscrypt_policy based on its version
+ * number, or 0 if the policy version is unrecognized.
+ */
+static inline int fscrypt_policy_size(const union fscrypt_policy *policy)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return sizeof(policy->v1);
+	case FSCRYPT_POLICY_V2:
+		return sizeof(policy->v2);
+	}
+	return 0;
+}
+
+/* Return the contents encryption mode of a valid encryption policy */
+static inline u8
+fscrypt_policy_contents_mode(const union fscrypt_policy *policy)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return policy->v1.contents_encryption_mode;
+	case FSCRYPT_POLICY_V2:
+		return policy->v2.contents_encryption_mode;
+	}
+	BUG();
+}
+
+/* Return the filenames encryption mode of a valid encryption policy */
+static inline u8
+fscrypt_policy_fnames_mode(const union fscrypt_policy *policy)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return policy->v1.filenames_encryption_mode;
+	case FSCRYPT_POLICY_V2:
+		return policy->v2.filenames_encryption_mode;
+	}
+	BUG();
+}
+
+/* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */
+static inline u8
+fscrypt_policy_flags(const union fscrypt_policy *policy)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return policy->v1.flags;
+	case FSCRYPT_POLICY_V2:
+		return policy->v2.flags;
+	}
+	BUG();
+}
+
+static inline bool
+fscrypt_is_direct_key_policy(const union fscrypt_policy *policy)
+{
+	return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY;
+}
 
 /**
  * For encrypted symlinks, the ciphertext length is stored at the beginning
@@ -52,16 +152,56 @@
 } __packed;
 
 /*
- * A pointer to this structure is stored in the file system's in-core
- * representation of an inode.
+ * fscrypt_info - the "encryption key" for an inode
+ *
+ * When an encrypted file's key is made available, an instance of this struct is
+ * allocated and stored in ->i_crypt_info.  Once created, it remains until the
+ * inode is evicted.
  */
 struct fscrypt_info {
-	u8 ci_data_mode;
-	u8 ci_filename_mode;
-	u8 ci_flags;
+
+	/* The actual crypto transform used for encryption and decryption */
 	struct crypto_skcipher *ci_ctfm;
+
+	/*
+	 * Cipher for ESSIV IV generation.  Only set for CBC contents
+	 * encryption, otherwise is NULL.
+	 */
 	struct crypto_cipher *ci_essiv_tfm;
-	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+
+	/*
+	 * Encryption mode used for this inode.  It corresponds to either the
+	 * contents or filenames encryption mode, depending on the inode type.
+	 */
+	struct fscrypt_mode *ci_mode;
+
+	/* Back-pointer to the inode */
+	struct inode *ci_inode;
+
+	/*
+	 * The master key with which this inode was unlocked (decrypted).  This
+	 * will be NULL if the master key was found in a process-subscribed
+	 * keyring rather than in the filesystem-level keyring.
+	 */
+	struct key *ci_master_key;
+
+	/*
+	 * Link in list of inodes that were unlocked with the master key.
+	 * Only used when ->ci_master_key is set.
+	 */
+	struct list_head ci_master_key_link;
+
+	/*
+	 * If non-NULL, then encryption is done using the master key directly
+	 * and ci_ctfm will equal ci_direct_key->dk_ctfm.
+	 */
+	struct fscrypt_direct_key *ci_direct_key;
+
+	/* The encryption policy used by this inode */
+	union fscrypt_policy ci_policy;
+
+	/* This inode's nonce, copied from the fscrypt_context */
+	u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
 };
 
 typedef enum {
@@ -70,17 +210,20 @@
 } fscrypt_direction_t;
 
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
-#define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
 
 static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
 					   u32 filenames_mode)
 {
-	if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
-	    filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+	if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
+	    filenames_mode == FSCRYPT_MODE_AES_128_CTS)
 		return true;
 
-	if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
-	    filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+	if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+	    filenames_mode == FSCRYPT_MODE_AES_256_CTS)
+		return true;
+
+	if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
+	    filenames_mode == FSCRYPT_MODE_ADIANTUM)
 		return true;
 
 	return false;
@@ -89,23 +232,37 @@
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
 extern int fscrypt_initialize(unsigned int cop_flags);
-extern int fscrypt_do_page_crypto(const struct inode *inode,
-				  fscrypt_direction_t rw, u64 lblk_num,
-				  struct page *src_page,
-				  struct page *dest_page,
-				  unsigned int len, unsigned int offs,
-				  gfp_t gfp_flags);
-extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
-					      gfp_t gfp_flags);
+extern int fscrypt_crypt_block(const struct inode *inode,
+			       fscrypt_direction_t rw, u64 lblk_num,
+			       struct page *src_page, struct page *dest_page,
+			       unsigned int len, unsigned int offs,
+			       gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
 extern const struct dentry_operations fscrypt_d_ops;
 
 extern void __printf(3, 4) __cold
-fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
 
-#define fscrypt_warn(sb, fmt, ...)		\
-	fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
-#define fscrypt_err(sb, fmt, ...)		\
-	fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define fscrypt_warn(inode, fmt, ...)		\
+	fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fscrypt_err(inode, fmt, ...)		\
+	fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+
+#define FSCRYPT_MAX_IV_SIZE	32
+
+union fscrypt_iv {
+	struct {
+		/* logical block number within the file */
+		__le64 lblk_num;
+
+		/* per-file nonce; only set in DIRECT_KEY mode */
+		u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+	};
+	u8 raw[FSCRYPT_MAX_IV_SIZE];
+};
+
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+			 const struct fscrypt_info *ci);
 
 /* fname.c */
 extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
@@ -114,7 +271,212 @@
 					 u32 orig_len, u32 max_len,
 					 u32 *encrypted_len_ret);
 
-/* keyinfo.c */
-extern void __exit fscrypt_essiv_cleanup(void);
+/* hkdf.c */
+
+struct fscrypt_hkdf {
+	struct crypto_shash *hmac_tfm;
+};
+
+extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
+			     unsigned int master_key_size);
+
+/*
+ * The list of contexts in which fscrypt uses HKDF.  These values are used as
+ * the first byte of the HKDF application-specific info string to guarantee that
+ * info strings are never repeated between contexts.  This ensures that all HKDF
+ * outputs are unique and cryptographically isolated, i.e. knowledge of one
+ * output doesn't reveal another.
+ */
+#define HKDF_CONTEXT_KEY_IDENTIFIER	1
+#define HKDF_CONTEXT_PER_FILE_KEY	2
+#define HKDF_CONTEXT_PER_MODE_KEY	3
+
+extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+			       const u8 *info, unsigned int infolen,
+			       u8 *okm, unsigned int okmlen);
+
+extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+
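
A sketch of how the v2 key setup is expected to use these helpers: expand a
per-file key from the master key's HKDF context (struct fscrypt_master_key,
below), with the file's nonce as the info string so every file gets a distinct
key. The function name is hypothetical; error handling is trimmed:

	static int setup_v2_file_key_sketch(struct fscrypt_info *ci,
					    struct fscrypt_master_key *mk)
	{
		u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
		int err;

		err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
					  HKDF_CONTEXT_PER_FILE_KEY,
					  ci->ci_nonce,
					  FS_KEY_DERIVATION_NONCE_SIZE,
					  derived_key, ci->ci_mode->keysize);
		if (!err)
			err = fscrypt_set_derived_key(ci, derived_key);

		memzero_explicit(derived_key, sizeof(derived_key));
		return err;
	}
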
+/* keyring.c */
+
+/*
+ * fscrypt_master_key_secret - secret key material of an in-use master key
+ */
+struct fscrypt_master_key_secret {
+
+	/*
+	 * For v2 policy keys: HKDF context keyed by this master key.
+	 * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
+	 */
+	struct fscrypt_hkdf	hkdf;
+
+	/* Size of the raw key in bytes.  Set even if ->raw isn't set. */
+	u32			size;
+
+	/* For v1 policy keys: the raw key.  Wiped for v2 policy keys. */
+	u8			raw[FSCRYPT_MAX_KEY_SIZE];
+
+} __randomize_layout;
+
+/*
+ * fscrypt_master_key - an in-use master key
+ *
+ * This represents a master encryption key which has been added to the
+ * filesystem and can be used to "unlock" the encrypted files which were
+ * encrypted with it.
+ */
+struct fscrypt_master_key {
+
+	/*
+	 * The secret key material.  After FS_IOC_REMOVE_ENCRYPTION_KEY is
+	 * executed, this is wiped and no new inodes can be unlocked with this
+	 * key; however, there may still be inodes in ->mk_decrypted_inodes
+	 * which could not be evicted.  As long as some inodes still remain,
+	 * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
+	 * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+	 *
+	 * Locking: protected by key->sem (outer) and mk_secret_sem (inner).
+	 * The reason for two locks is that key->sem also protects modifying
+	 * mk_users, which ranks it above the semaphore for the keyring key
+	 * type, which is in turn above page faults (via keyring_read).  But
+	 * sometimes filesystems call fscrypt_get_encryption_info() from within
+	 * a transaction, which ranks it below page faults.  So we need a
+	 * separate lock which protects mk_secret but not also mk_users.
+	 */
+	struct fscrypt_master_key_secret	mk_secret;
+	struct rw_semaphore			mk_secret_sem;
+
+	/*
+	 * For v1 policy keys: an arbitrary key descriptor which was assigned by
+	 * userspace (->descriptor).
+	 *
+	 * For v2 policy keys: a cryptographic hash of this key (->identifier).
+	 */
+	struct fscrypt_key_specifier		mk_spec;
+
+	/*
+	 * Keyring which contains a key of type 'key_type_fscrypt_user' for each
+	 * user who has added this key.  Normally each key will be added by just
+	 * one user, but it's possible that multiple users share a key, and in
+	 * that case we need to keep track of those users so that one user can't
+	 * remove the key before the others want it removed too.
+	 *
+	 * This is NULL for v1 policy keys; those can only be added by root.
+	 *
+	 * Locking: in addition to this keyring's own semaphore, this is
+	 * protected by the master key's key->sem, so we can do atomic
+	 * search+insert.  It can also be searched without taking any locks, but
+	 * in that case the returned key may have already been removed.
+	 */
+	struct key		*mk_users;
+
+	/*
+	 * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
+	 * Once this goes to 0, the master key is removed from ->s_master_keys.
+	 * The 'struct fscrypt_master_key' will continue to live as long as the
+	 * 'struct key' whose payload it is, but we won't let this reference
+	 * count rise again.
+	 */
+	refcount_t		mk_refcount;
+
+	/*
+	 * List of inodes that were unlocked using this key.  This allows the
+	 * inodes to be evicted efficiently if the key is removed.
+	 */
+	struct list_head	mk_decrypted_inodes;
+	spinlock_t		mk_decrypted_inodes_lock;
+
+	/* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */
+	struct crypto_skcipher	*mk_mode_keys[__FSCRYPT_MODE_MAX + 1];
+
+} __randomize_layout;
+
+static inline bool
+is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
+{
+	/*
+	 * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
+	 * fscrypt_key_describe().  These run in atomic context, so they can't
+	 * take ->mk_secret_sem, so 'secret' can change concurrently, which
+	 * would be a data race.  But they only need to know whether the secret
+	 * *was* present at the time of the check, so READ_ONCE() suffices.
+	 */
+	return READ_ONCE(secret->size) != 0;
+}
+
+static inline const char *master_key_spec_type(
+				const struct fscrypt_key_specifier *spec)
+{
+	switch (spec->type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		return "descriptor";
+	case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+		return "identifier";
+	}
+	return "[unknown]";
+}
+
+static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
+{
+	switch (spec->type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		return FSCRYPT_KEY_DESCRIPTOR_SIZE;
+	case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+		return FSCRYPT_KEY_IDENTIFIER_SIZE;
+	}
+	return 0;
+}
+
+extern struct key *
+fscrypt_find_master_key(struct super_block *sb,
+			const struct fscrypt_key_specifier *mk_spec);
+
+extern int fscrypt_verify_key_added(struct super_block *sb,
+				    const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
+
+extern int __init fscrypt_init_keyring(void);
+
+/* keysetup.c */
+
+struct fscrypt_mode {
+	const char *friendly_name;
+	const char *cipher_str;
+	int keysize;
+	int ivsize;
+	bool logged_impl_name;
+	bool needs_essiv;
+};
+
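+/*
+ * DIRECT_KEY policies use the master key directly and instead include the
+ * file's nonce in the IVs, so they require a mode whose IVs are large enough
+ * to hold a nonce.
+ */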
+static inline bool
+fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode)
+{
+	return mode->ivsize >= offsetofend(union fscrypt_iv, nonce);
+}
+
+extern struct crypto_skcipher *
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+			  const struct inode *inode);
+
+extern int fscrypt_set_derived_key(struct fscrypt_info *ci,
+				   const u8 *derived_key);
+
+/* keysetup_v1.c */
+
+extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
+
+extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+				     const u8 *raw_master_key);
+
+extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+					struct fscrypt_info *ci);
+
+/* policy.c */
+
+extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+				   const union fscrypt_policy *policy2);
+extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+				     const struct inode *inode);
+extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
+				       const union fscrypt_context *ctx_u,
+				       int ctx_size);
 
 #endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
new file mode 100644
index 0000000..f21873e
--- /dev/null
+++ b/fs/crypto/hkdf.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
+ * Function"), aka RFC 5869.  See also the original paper (Krawczyk 2010):
+ * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
+ *
+ * This is used to derive keys from the fscrypt master keys.
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
+#include "fscrypt_private.h"
+
+/*
+ * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
+ * SHA-512 because it is reasonably secure and efficient; and since it produces
+ * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
+ * entropy from the master key and requires only one iteration of HKDF-Expand.
+ */
+#define HKDF_HMAC_ALG		"hmac(sha512)"
+#define HKDF_HASHLEN		SHA512_DIGEST_SIZE
+
+/*
+ * HKDF consists of two steps:
+ *
+ * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from
+ *    the input keying material and optional salt.
+ * 2. HKDF-Expand: expand the pseudorandom key into output keying material of
+ *    any length, parameterized by an application-specific info string.
+ *
+ * HKDF-Extract can be skipped if the input is already a pseudorandom key of
+ * length HKDF_HASHLEN bytes.  However, cipher modes other than AES-256-XTS take
+ * shorter keys, and we don't want to force users of those modes to provide
+ * unnecessarily long master keys.  Thus fscrypt still does HKDF-Extract.  No
+ * salt is used, since fscrypt master keys should already be pseudorandom and
+ * there's no way to persist a random salt per master key from kernel mode.
+ */
+
+/* HKDF-Extract (RFC 5869 section 2.2), unsalted */
+static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
+			unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
+{
+	static const u8 default_salt[HKDF_HASHLEN];
+	SHASH_DESC_ON_STACK(desc, hmac_tfm);
+	int err;
+
+	err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
+	if (err)
+		return err;
+
+	desc->tfm = hmac_tfm;
+	err = crypto_shash_digest(desc, ikm, ikmlen, prk);
+	shash_desc_zero(desc);
+	return err;
+}
+
+/*
+ * Compute HKDF-Extract using the given master key as the input keying material,
+ * and prepare an HMAC transform object keyed by the resulting pseudorandom key.
+ *
+ * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many
+ * times without having to recompute HKDF-Extract each time.
+ */
+int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
+		      unsigned int master_key_size)
+{
+	struct crypto_shash *hmac_tfm;
+	u8 prk[HKDF_HASHLEN];
+	int err;
+
+	hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0);
+	if (IS_ERR(hmac_tfm)) {
+		fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
+			    PTR_ERR(hmac_tfm));
+		return PTR_ERR(hmac_tfm);
+	}
+
+	if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
+		err = -EINVAL;
+		goto err_free_tfm;
+	}
+
+	err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk);
+	if (err)
+		goto err_free_tfm;
+
+	err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
+	if (err)
+		goto err_free_tfm;
+
+	hkdf->hmac_tfm = hmac_tfm;
+	goto out;
+
+err_free_tfm:
+	crypto_free_shash(hmac_tfm);
+out:
+	memzero_explicit(prk, sizeof(prk));
+	return err;
+}
+
+/*
+ * HKDF-Expand (RFC 5869 section 2.3).  This expands the pseudorandom key, which
+ * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen'
+ * bytes of output keying material parameterized by the application-specific
+ * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context'
+ * byte.  This is thread-safe and may be called by multiple threads in parallel.
+ *
+ * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt
+ * adds to its application-specific info strings to guarantee that it doesn't
+ * accidentally repeat an info string when using HKDF for different purposes.)
+ */
+int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+			const u8 *info, unsigned int infolen,
+			u8 *okm, unsigned int okmlen)
+{
+	SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
+	u8 prefix[9];
+	unsigned int i;
+	int err;
+	const u8 *prev = NULL;
+	u8 counter = 1;
+	u8 tmp[HKDF_HASHLEN];
+
+	if (WARN_ON(okmlen > 255 * HKDF_HASHLEN))
+		return -EINVAL;
+
+	desc->tfm = hkdf->hmac_tfm;
+
+	memcpy(prefix, "fscrypt\0", 8);
+	prefix[8] = context;
+
+	for (i = 0; i < okmlen; i += HKDF_HASHLEN) {
+
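+		/*
+		 * Compute T(i) = HMAC(PRK, T(i-1) || info' || i) per RFC 5869
+		 * section 2.3, where info' is "fscrypt\0" || context || info.
+		 */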
+		err = crypto_shash_init(desc);
+		if (err)
+			goto out;
+
+		if (prev) {
+			err = crypto_shash_update(desc, prev, HKDF_HASHLEN);
+			if (err)
+				goto out;
+		}
+
+		err = crypto_shash_update(desc, prefix, sizeof(prefix));
+		if (err)
+			goto out;
+
+		err = crypto_shash_update(desc, info, infolen);
+		if (err)
+			goto out;
+
+		BUILD_BUG_ON(sizeof(counter) != 1);
+		if (okmlen - i < HKDF_HASHLEN) {
+			err = crypto_shash_finup(desc, &counter, 1, tmp);
+			if (err)
+				goto out;
+			memcpy(&okm[i], tmp, okmlen - i);
+			memzero_explicit(tmp, sizeof(tmp));
+		} else {
+			err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
+			if (err)
+				goto out;
+		}
+		counter++;
+		prev = &okm[i];
+	}
+	err = 0;
+out:
+	if (unlikely(err))
+		memzero_explicit(okm, okmlen); /* so caller doesn't need to */
+	shash_desc_zero(desc);
+	return err;
+}
+
+void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf)
+{
+	crypto_free_shash(hkdf->hmac_tfm);
+}
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df..bb3b7fc 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/crypto/hooks.c
  *
  * Encryption hooks for higher-level filesystem operations.
  */
 
-#include <linux/ratelimit.h>
 #include "fscrypt_private.h"
 
 /**
@@ -39,9 +39,9 @@
 	dir = dget_parent(file_dentry(filp));
 	if (IS_ENCRYPTED(d_inode(dir)) &&
 	    !fscrypt_has_permitted_context(d_inode(dir), inode)) {
-		fscrypt_warn(inode->i_sb,
-			     "inconsistent encryption contexts: %lu/%lu",
-			     d_inode(dir)->i_ino, inode->i_ino);
+		fscrypt_warn(inode,
+			     "Inconsistent encryption context (parent directory: %lu)",
+			     d_inode(dir)->i_ino);
 		err = -EPERM;
 	}
 	dput(dir);
@@ -49,7 +49,8 @@
 }
 EXPORT_SYMBOL_GPL(fscrypt_file_open);
 
-int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
+int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
+			   struct dentry *dentry)
 {
 	int err;
 
@@ -57,8 +58,12 @@
 	if (err)
 		return err;
 
+	/* ... in case we looked up ciphertext name before key was added */
+	if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
+		return -ENOKEY;
+
 	if (!fscrypt_has_permitted_context(dir, inode))
-		return -EPERM;
+		return -EXDEV;
 
 	return 0;
 }
@@ -78,37 +83,42 @@
 	if (err)
 		return err;
 
+	/* ... in case we looked up ciphertext name(s) before key was added */
+	if ((old_dentry->d_flags | new_dentry->d_flags) &
+	    DCACHE_ENCRYPTED_NAME)
+		return -ENOKEY;
+
 	if (old_dir != new_dir) {
 		if (IS_ENCRYPTED(new_dir) &&
 		    !fscrypt_has_permitted_context(new_dir,
 						   d_inode(old_dentry)))
-			return -EPERM;
+			return -EXDEV;
 
 		if ((flags & RENAME_EXCHANGE) &&
 		    IS_ENCRYPTED(old_dir) &&
 		    !fscrypt_has_permitted_context(old_dir,
 						   d_inode(new_dentry)))
-			return -EPERM;
+			return -EXDEV;
 	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);
 
-int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry)
+int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
+			     struct fscrypt_name *fname)
 {
-	int err = fscrypt_get_encryption_info(dir);
+	int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname);
 
-	if (err)
+	if (err && err != -ENOENT)
 		return err;
 
-	if (fscrypt_has_encryption_key(dir)) {
+	if (fname->is_ciphertext_name) {
 		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+		dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
 		spin_unlock(&dentry->d_lock);
+		d_set_d_op(dentry, &fscrypt_d_ops);
 	}
-
-	d_set_d_op(dentry, &fscrypt_d_ops);
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
 
@@ -179,11 +189,9 @@
 	sd->len = cpu_to_le16(ciphertext_len);
 
 	err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len);
-	if (err) {
-		if (!disk_link->name)
-			kfree(sd);
-		return err;
-	}
+	if (err)
+		goto err_free_sd;
+
 	/*
 	 * Null-terminating the ciphertext doesn't make sense, but we still
 	 * count the null terminator in the length, so we might as well
@@ -191,9 +199,20 @@
 	 */
 	sd->encrypted_path[ciphertext_len] = '\0';
 
+	/* Cache the plaintext symlink target for later use by get_link() */
+	err = -ENOMEM;
+	inode->i_link = kmemdup(target, len + 1, GFP_NOFS);
+	if (!inode->i_link)
+		goto err_free_sd;
+
 	if (!disk_link->name)
 		disk_link->name = (unsigned char *)sd;
 	return 0;
+
+err_free_sd:
+	if (!disk_link->name)
+		kfree(sd);
+	return err;
 }
 EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink);
 
@@ -202,7 +221,7 @@
  * @inode: the symlink inode
  * @caddr: the on-disk contents of the symlink
  * @max_size: size of @caddr buffer
- * @done: if successful, will be set up to free the returned target
+ * @done: if successful, will be set up to free the returned target if needed
  *
  * If the symlink's encryption key is available, we decrypt its target.
  * Otherwise, we encode its target for presentation.
@@ -217,12 +236,18 @@
 {
 	const struct fscrypt_symlink_data *sd;
 	struct fscrypt_str cstr, pstr;
+	bool has_key;
 	int err;
 
 	/* This is for encrypted symlinks only */
 	if (WARN_ON(!IS_ENCRYPTED(inode)))
 		return ERR_PTR(-EINVAL);
 
+	/* If the decrypted target is already cached, just return it. */
+	pstr.name = READ_ONCE(inode->i_link);
+	if (pstr.name)
+		return pstr.name;
+
 	/*
 	 * Try to set up the symlink's encryption key, but we can continue
 	 * regardless of whether the key is available or not.
@@ -230,6 +255,7 @@
 	err = fscrypt_get_encryption_info(inode);
 	if (err)
 		return ERR_PTR(err);
+	has_key = fscrypt_has_encryption_key(inode);
 
 	/*
 	 * For historical reasons, encrypted symlink targets are prefixed with
@@ -261,7 +287,17 @@
 		goto err_kfree;
 
 	pstr.name[pstr.len] = '\0';
-	set_delayed_call(done, kfree_link, pstr.name);
+
+	/*
+	 * Cache decrypted symlink targets in i_link for later use.  Don't cache
+	 * symlink targets encoded without the key, since those become outdated
+	 * once the key is added.  This pairs with the READ_ONCE() above and in
+	 * the VFS path lookup code.
+	 */
+	if (!has_key ||
+	    cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL)
+		set_delayed_call(done, kfree_link, pstr.name);
+
 	return pstr.name;
 
 err_kfree:
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
deleted file mode 100644
index 7874c9b..0000000
--- a/fs/crypto/keyinfo.c
+++ /dev/null
@@ -1,399 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * key management facility for FS encryption support.
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-
-#include <keys/user-type.h>
-#include <linux/scatterlist.h>
-#include <linux/ratelimit.h>
-#include <crypto/aes.h>
-#include <crypto/sha.h>
-#include <crypto/skcipher.h>
-#include "fscrypt_private.h"
-
-static struct crypto_shash *essiv_hash_tfm;
-
-/*
- * Key derivation function.  This generates the derived key by encrypting the
- * master key with AES-128-ECB using the inode's nonce as the AES key.
- *
- * The master key must be at least as long as the derived key.  If the master
- * key is longer, then only the first 'derived_keysize' bytes are used.
- */
-static int derive_key_aes(const u8 *master_key,
-			  const struct fscrypt_context *ctx,
-			  u8 *derived_key, unsigned int derived_keysize)
-{
-	int res = 0;
-	struct skcipher_request *req = NULL;
-	DECLARE_CRYPTO_WAIT(wait);
-	struct scatterlist src_sg, dst_sg;
-	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
-	if (IS_ERR(tfm)) {
-		res = PTR_ERR(tfm);
-		tfm = NULL;
-		goto out;
-	}
-	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
-	req = skcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		res = -ENOMEM;
-		goto out;
-	}
-	skcipher_request_set_callback(req,
-			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-			crypto_req_done, &wait);
-	res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
-	if (res < 0)
-		goto out;
-
-	sg_init_one(&src_sg, master_key, derived_keysize);
-	sg_init_one(&dst_sg, derived_key, derived_keysize);
-	skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
-				   NULL);
-	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
-	skcipher_request_free(req);
-	crypto_free_skcipher(tfm);
-	return res;
-}
-
-/*
- * Search the current task's subscribed keyrings for a "logon" key with
- * description prefix:descriptor, and if found acquire a read lock on it and
- * return a pointer to its validated payload in *payload_ret.
- */
-static struct key *
-find_and_lock_process_key(const char *prefix,
-			  const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
-			  unsigned int min_keysize,
-			  const struct fscrypt_key **payload_ret)
-{
-	char *description;
-	struct key *key;
-	const struct user_key_payload *ukp;
-	const struct fscrypt_key *payload;
-
-	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
-				FS_KEY_DESCRIPTOR_SIZE, descriptor);
-	if (!description)
-		return ERR_PTR(-ENOMEM);
-
-	key = request_key(&key_type_logon, description, NULL);
-	kfree(description);
-	if (IS_ERR(key))
-		return key;
-
-	down_read(&key->sem);
-	ukp = user_key_payload_locked(key);
-
-	if (!ukp) /* was the key revoked before we acquired its semaphore? */
-		goto invalid;
-
-	payload = (const struct fscrypt_key *)ukp->data;
-
-	if (ukp->datalen != sizeof(struct fscrypt_key) ||
-	    payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
-		fscrypt_warn(NULL,
-			     "key with description '%s' has invalid payload",
-			     key->description);
-		goto invalid;
-	}
-
-	if (payload->size < min_keysize) {
-		fscrypt_warn(NULL,
-			     "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
-			     key->description, payload->size, min_keysize);
-		goto invalid;
-	}
-
-	*payload_ret = payload;
-	return key;
-
-invalid:
-	up_read(&key->sem);
-	key_put(key);
-	return ERR_PTR(-ENOKEY);
-}
-
-/* Find the master key, then derive the inode's actual encryption key */
-static int find_and_derive_key(const struct inode *inode,
-			       const struct fscrypt_context *ctx,
-			       u8 *derived_key, unsigned int derived_keysize)
-{
-	struct key *key;
-	const struct fscrypt_key *payload;
-	int err;
-
-	key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
-					ctx->master_key_descriptor,
-					derived_keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
-						ctx->master_key_descriptor,
-						derived_keysize, &payload);
-	}
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-	err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize);
-	up_read(&key->sem);
-	key_put(key);
-	return err;
-}
-
-static struct fscrypt_mode {
-	const char *friendly_name;
-	const char *cipher_str;
-	int keysize;
-	bool logged_impl_name;
-} available_modes[] = {
-	[FS_ENCRYPTION_MODE_AES_256_XTS] = {
-		.friendly_name = "AES-256-XTS",
-		.cipher_str = "xts(aes)",
-		.keysize = 64,
-	},
-	[FS_ENCRYPTION_MODE_AES_256_CTS] = {
-		.friendly_name = "AES-256-CTS-CBC",
-		.cipher_str = "cts(cbc(aes))",
-		.keysize = 32,
-	},
-	[FS_ENCRYPTION_MODE_AES_128_CBC] = {
-		.friendly_name = "AES-128-CBC",
-		.cipher_str = "cbc(aes)",
-		.keysize = 16,
-	},
-	[FS_ENCRYPTION_MODE_AES_128_CTS] = {
-		.friendly_name = "AES-128-CTS-CBC",
-		.cipher_str = "cts(cbc(aes))",
-		.keysize = 16,
-	},
-};
-
-static struct fscrypt_mode *
-select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
-{
-	if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
-		fscrypt_warn(inode->i_sb,
-			     "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)",
-			     inode->i_ino, ci->ci_data_mode,
-			     ci->ci_filename_mode);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (S_ISREG(inode->i_mode))
-		return &available_modes[ci->ci_data_mode];
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		return &available_modes[ci->ci_filename_mode];
-
-	WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
-		  inode->i_ino, (inode->i_mode & S_IFMT));
-	return ERR_PTR(-EINVAL);
-}
-
-static void put_crypt_info(struct fscrypt_info *ci)
-{
-	if (!ci)
-		return;
-
-	crypto_free_skcipher(ci->ci_ctfm);
-	crypto_free_cipher(ci->ci_essiv_tfm);
-	kmem_cache_free(fscrypt_info_cachep, ci);
-}
-
-static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
-{
-	struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
-
-	/* init hash transform on demand */
-	if (unlikely(!tfm)) {
-		struct crypto_shash *prev_tfm;
-
-		tfm = crypto_alloc_shash("sha256", 0, 0);
-		if (IS_ERR(tfm)) {
-			fscrypt_warn(NULL,
-				     "error allocating SHA-256 transform: %ld",
-				     PTR_ERR(tfm));
-			return PTR_ERR(tfm);
-		}
-		prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
-		if (prev_tfm) {
-			crypto_free_shash(tfm);
-			tfm = prev_tfm;
-		}
-	}
-
-	{
-		SHASH_DESC_ON_STACK(desc, tfm);
-		desc->tfm = tfm;
-		desc->flags = 0;
-
-		return crypto_shash_digest(desc, key, keysize, salt);
-	}
-}
-
-static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
-				int keysize)
-{
-	int err;
-	struct crypto_cipher *essiv_tfm;
-	u8 salt[SHA256_DIGEST_SIZE];
-
-	essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
-	if (IS_ERR(essiv_tfm))
-		return PTR_ERR(essiv_tfm);
-
-	ci->ci_essiv_tfm = essiv_tfm;
-
-	err = derive_essiv_salt(raw_key, keysize, salt);
-	if (err)
-		goto out;
-
-	/*
-	 * Using SHA256 to derive the salt/key will result in AES-256 being
-	 * used for IV generation. File contents encryption will still use the
-	 * configured keysize (AES-128) nevertheless.
-	 */
-	err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
-	if (err)
-		goto out;
-
-out:
-	memzero_explicit(salt, sizeof(salt));
-	return err;
-}
-
-void __exit fscrypt_essiv_cleanup(void)
-{
-	crypto_free_shash(essiv_hash_tfm);
-}
-
-int fscrypt_get_encryption_info(struct inode *inode)
-{
-	struct fscrypt_info *crypt_info;
-	struct fscrypt_context ctx;
-	struct crypto_skcipher *ctfm;
-	struct fscrypt_mode *mode;
-	u8 *raw_key = NULL;
-	int res;
-
-	if (inode->i_crypt_info)
-		return 0;
-
-	res = fscrypt_initialize(inode->i_sb->s_cop->flags);
-	if (res)
-		return res;
-
-	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
-	if (res < 0) {
-		if (!fscrypt_dummy_context_enabled(inode) ||
-		    IS_ENCRYPTED(inode))
-			return res;
-		/* Fake up a context for an unencrypted directory */
-		memset(&ctx, 0, sizeof(ctx));
-		ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
-		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
-		memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
-	} else if (res != sizeof(ctx)) {
-		return -EINVAL;
-	}
-
-	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
-		return -EINVAL;
-
-	if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
-		return -EINVAL;
-
-	crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
-	if (!crypt_info)
-		return -ENOMEM;
-
-	crypt_info->ci_flags = ctx.flags;
-	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
-	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
-	crypt_info->ci_ctfm = NULL;
-	crypt_info->ci_essiv_tfm = NULL;
-	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
-				sizeof(crypt_info->ci_master_key));
-
-	mode = select_encryption_mode(crypt_info, inode);
-	if (IS_ERR(mode)) {
-		res = PTR_ERR(mode);
-		goto out;
-	}
-
-	/*
-	 * This cannot be a stack buffer because it is passed to the scatterlist
-	 * crypto API as part of key derivation.
-	 */
-	res = -ENOMEM;
-	raw_key = kmalloc(mode->keysize, GFP_NOFS);
-	if (!raw_key)
-		goto out;
-
-	res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize);
-	if (res)
-		goto out;
-
-	ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
-	if (IS_ERR(ctfm)) {
-		res = PTR_ERR(ctfm);
-		fscrypt_warn(inode->i_sb,
-			     "error allocating '%s' transform for inode %lu: %d",
-			     mode->cipher_str, inode->i_ino, res);
-		goto out;
-	}
-	if (unlikely(!mode->logged_impl_name)) {
-		/*
-		 * fscrypt performance can vary greatly depending on which
-		 * crypto algorithm implementation is used.  Help people debug
-		 * performance problems by logging the ->cra_driver_name the
-		 * first time a mode is used.  Note that multiple threads can
-		 * race here, but it doesn't really matter.
-		 */
-		mode->logged_impl_name = true;
-		pr_info("fscrypt: %s using implementation \"%s\"\n",
-			mode->friendly_name,
-			crypto_skcipher_alg(ctfm)->base.cra_driver_name);
-	}
-	crypt_info->ci_ctfm = ctfm;
-	crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
-	res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize);
-	if (res)
-		goto out;
-
-	if (S_ISREG(inode->i_mode) &&
-	    crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
-		res = init_essiv_generator(crypt_info, raw_key, mode->keysize);
-		if (res) {
-			fscrypt_warn(inode->i_sb,
-				     "error initializing ESSIV generator for inode %lu: %d",
-				     inode->i_ino, res);
-			goto out;
-		}
-	}
-	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
-		crypt_info = NULL;
-out:
-	if (res == -ENOKEY)
-		res = 0;
-	put_crypt_info(crypt_info);
-	kzfree(raw_key);
-	return res;
-}
-EXPORT_SYMBOL(fscrypt_get_encryption_info);
-
-void fscrypt_put_encryption_info(struct inode *inode)
-{
-	put_crypt_info(inode->i_crypt_info);
-	inode->i_crypt_info = NULL;
-}
-EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
new file mode 100644
index 0000000..c34fa7c
--- /dev/null
+++ b/fs/crypto/keyring.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Filesystem-level keyring for fscrypt
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * This file implements management of fscrypt master keys in the
+ * filesystem-level keyring, including the ioctls:
+ *
+ * - FS_IOC_ADD_ENCRYPTION_KEY
+ * - FS_IOC_REMOVE_ENCRYPTION_KEY
+ * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS
+ * - FS_IOC_GET_ENCRYPTION_KEY_STATUS
+ *
+ * See the "User API" section of Documentation/filesystems/fscrypt.rst for more
+ * information about these ioctls.
+ */
+
+#include <crypto/skcipher.h>
+#include <linux/key-type.h>
+#include <linux/seq_file.h>
+
+#include "fscrypt_private.h"
+
+static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
+{
+	fscrypt_destroy_hkdf(&secret->hkdf);
+	memzero_explicit(secret, sizeof(*secret));
+}
+
+static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
+				   struct fscrypt_master_key_secret *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+	memzero_explicit(src, sizeof(*src));
+}
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+	size_t i;
+
+	wipe_master_key_secret(&mk->mk_secret);
+
+	for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++)
+		crypto_free_skcipher(mk->mk_mode_keys[i]);
+
+	key_put(mk->mk_users);
+	kzfree(mk);
+}
+
+static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+{
+	if (spec->__reserved)
+		return false;
+	return master_key_spec_len(spec) != 0;
+}
+
+static int fscrypt_key_instantiate(struct key *key,
+				   struct key_preparsed_payload *prep)
+{
+	key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
+	return 0;
+}
+
+static void fscrypt_key_destroy(struct key *key)
+{
+	free_master_key(key->payload.data[0]);
+}
+
+static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
+{
+	seq_puts(m, key->description);
+
+	if (key_is_positive(key)) {
+		const struct fscrypt_master_key *mk = key->payload.data[0];
+
+		if (!is_master_key_secret_present(&mk->mk_secret))
+			seq_puts(m, ": secret removed");
+	}
+}
+
+/*
+ * Type of key in ->s_master_keys.  Each key of this type represents a master
+ * key which has been added to the filesystem.  Its payload is a
+ * 'struct fscrypt_master_key'.  The "." prefix in the key type name prevents
+ * users from adding keys of this type via the keyrings syscalls rather than via
+ * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
+ */
+static struct key_type key_type_fscrypt = {
+	.name			= "._fscrypt",
+	.instantiate		= fscrypt_key_instantiate,
+	.destroy		= fscrypt_key_destroy,
+	.describe		= fscrypt_key_describe,
+};
+
+static int fscrypt_user_key_instantiate(struct key *key,
+					struct key_preparsed_payload *prep)
+{
+	/*
+	 * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for
+	 * each key, regardless of the exact key size.  The amount of memory
+	 * actually used is greater than the size of the raw key anyway.
+	 */
+	return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE);
+}
+
+static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m)
+{
+	seq_puts(m, key->description);
+}
+
+/*
+ * Type of key in ->mk_users.  Each key of this type represents a particular
+ * user who has added a particular master key.
+ *
+ * Note that the name of this key type really should be something like
+ * ".fscrypt-user" instead of simply ".fscrypt".  But the shorter name is chosen
+ * mainly for simplicity of presentation in /proc/keys when read by a non-root
+ * user.  And it is expected to be rare that a key is actually added by multiple
+ * users, since users should keep their encryption keys confidential.
+ */
+static struct key_type key_type_fscrypt_user = {
+	.name			= ".fscrypt",
+	.instantiate		= fscrypt_user_key_instantiate,
+	.describe		= fscrypt_user_key_describe,
+};
+
+/* Search ->s_master_keys or ->mk_users */
+static struct key *search_fscrypt_keyring(struct key *keyring,
+					  struct key_type *type,
+					  const char *description)
+{
+	/*
+	 * We need to mark the keyring reference as "possessed" so that we
+	 * acquire permission to search it, via the KEY_POS_SEARCH permission.
+	 */
+	key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
+
+	keyref = keyring_search(keyref, type, description, false);
+	if (IS_ERR(keyref)) {
+		if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+		    PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+			keyref = ERR_PTR(-ENOKEY);
+		return ERR_CAST(keyref);
+	}
+	return key_ref_to_ptr(keyref);
+}
+
+#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE	\
+	(CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id))
+
+#define FSCRYPT_MK_DESCRIPTION_SIZE	(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
+
+#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE	\
+	(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
+	 CONST_STRLEN("-users") + 1)
+
+#define FSCRYPT_MK_USER_DESCRIPTION_SIZE	\
+	(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
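+
+/*
+ * Illustrative examples of the resulting descriptions (hex values made up;
+ * a v1 descriptor is 16 hex characters, a v2 identifier 32):
+ *
+ *	->s_master_keys keyring:	"fscrypt-sda1"
+ *	master key:			"<key specifier in hex>"
+ *	->mk_users keyring:		"fscrypt-<identifier in hex>-users"
+ *	per-user key:			"<identifier in hex>.uid.1000"
+ */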
+
+static void format_fs_keyring_description(
+			char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
+			const struct super_block *sb)
+{
+	sprintf(description, "fscrypt-%s", sb->s_id);
+}
+
+static void format_mk_description(
+			char description[FSCRYPT_MK_DESCRIPTION_SIZE],
+			const struct fscrypt_key_specifier *mk_spec)
+{
+	sprintf(description, "%*phN",
+		master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
+}
+
+static void format_mk_users_keyring_description(
+			char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
+			const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+	sprintf(description, "fscrypt-%*phN-users",
+		FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier);
+}
+
+static void format_mk_user_description(
+			char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE],
+			const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+	sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE,
+		mk_identifier, __kuid_val(current_fsuid()));
+}
+
+/* Create ->s_master_keys if needed.  Synchronized by fscrypt_add_key_mutex. */
+static int allocate_filesystem_keyring(struct super_block *sb)
+{
+	char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
+	struct key *keyring;
+
+	if (sb->s_master_keys)
+		return 0;
+
+	format_fs_keyring_description(description, sb);
+	keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				current_cred(), KEY_POS_SEARCH |
+				  KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	/* Pairs with READ_ONCE() in fscrypt_find_master_key() */
+	smp_store_release(&sb->s_master_keys, keyring);
+	return 0;
+}
+
+void fscrypt_sb_free(struct super_block *sb)
+{
+	key_put(sb->s_master_keys);
+	sb->s_master_keys = NULL;
+}
+
+/*
+ * Find the specified master key in ->s_master_keys.
+ * Returns ERR_PTR(-ENOKEY) if not found.
+ */
+struct key *fscrypt_find_master_key(struct super_block *sb,
+				    const struct fscrypt_key_specifier *mk_spec)
+{
+	struct key *keyring;
+	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+
+	/* pairs with smp_store_release() in allocate_filesystem_keyring() */
+	keyring = READ_ONCE(sb->s_master_keys);
+	if (keyring == NULL)
+		return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
+
+	format_mk_description(description, mk_spec);
+	return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+}
+
+static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
+{
+	char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE];
+	struct key *keyring;
+
+	format_mk_users_keyring_description(description,
+					    mk->mk_spec.u.identifier);
+	keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				current_cred(), KEY_POS_SEARCH |
+				  KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	mk->mk_users = keyring;
+	return 0;
+}
+
+/*
+ * Find the current user's "key" in the master key's ->mk_users.
+ * Returns ERR_PTR(-ENOKEY) if not found.
+ */
+static struct key *find_master_key_user(struct fscrypt_master_key *mk)
+{
+	char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+
+	format_mk_user_description(description, mk->mk_spec.u.identifier);
+	return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
+				      description);
+}
+
+/*
+ * Give the current user a "key" in ->mk_users.  This charges the user's quota
+ * and marks the master key as added by the current user, so that it cannot be
+ * removed by another user with the key.  Either the master key's key->sem must
+ * be held for write, or the master key must still be undergoing initialization.
+ */
+static int add_master_key_user(struct fscrypt_master_key *mk)
+{
+	char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+	struct key *mk_user;
+	int err;
+
+	format_mk_user_description(description, mk->mk_spec.u.identifier);
+	mk_user = key_alloc(&key_type_fscrypt_user, description,
+			    current_fsuid(), current_gid(), current_cred(),
+			    KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL);
+	if (IS_ERR(mk_user))
+		return PTR_ERR(mk_user);
+
+	err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL);
+	key_put(mk_user);
+	return err;
+}
+
+/*
+ * Remove the current user's "key" from ->mk_users.
+ * The master key's key->sem must be held for write.
+ *
+ * Returns 0 if removed, -ENOKEY if not found, or another -errno code.
+ */
+static int remove_master_key_user(struct fscrypt_master_key *mk)
+{
+	struct key *mk_user;
+	int err;
+
+	mk_user = find_master_key_user(mk);
+	if (IS_ERR(mk_user))
+		return PTR_ERR(mk_user);
+	err = key_unlink(mk->mk_users, mk_user);
+	key_put(mk_user);
+	return err;
+}
+
+/*
+ * Allocate a new fscrypt_master_key which contains the given secret, set it as
+ * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
+ * into the given keyring.  Synchronized by fscrypt_add_key_mutex.
+ */
+static int add_new_master_key(struct fscrypt_master_key_secret *secret,
+			      const struct fscrypt_key_specifier *mk_spec,
+			      struct key *keyring)
+{
+	struct fscrypt_master_key *mk;
+	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+	struct key *key;
+	int err;
+
+	mk = kzalloc(sizeof(*mk), GFP_KERNEL);
+	if (!mk)
+		return -ENOMEM;
+
+	mk->mk_spec = *mk_spec;
+
+	move_master_key_secret(&mk->mk_secret, secret);
+	init_rwsem(&mk->mk_secret_sem);
+
+	refcount_set(&mk->mk_refcount, 1); /* secret is present */
+	INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
+	spin_lock_init(&mk->mk_decrypted_inodes_lock);
+
+	if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
+		err = allocate_master_key_users_keyring(mk);
+		if (err)
+			goto out_free_mk;
+		err = add_master_key_user(mk);
+		if (err)
+			goto out_free_mk;
+	}
+
+	/*
+	 * Note that we don't charge this key to anyone's quota, since when
+	 * ->mk_users is in use those keys are charged instead, and otherwise
+	 * (when ->mk_users isn't in use) only root can add these keys.
+	 */
+	format_mk_description(description, mk_spec);
+	key = key_alloc(&key_type_fscrypt, description,
+			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+			KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
+			KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto out_free_mk;
+	}
+	err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
+	key_put(key);
+	if (err)
+		goto out_free_mk;
+
+	return 0;
+
+out_free_mk:
+	free_master_key(mk);
+	return err;
+}
+
+#define KEY_DEAD	1
+
+static int add_existing_master_key(struct fscrypt_master_key *mk,
+				   struct fscrypt_master_key_secret *secret)
+{
+	struct key *mk_user;
+	bool rekey;
+	int err;
+
+	/*
+	 * If the current user is already in ->mk_users, then there's nothing to
+	 * do.  (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+	 */
+	if (mk->mk_users) {
+		mk_user = find_master_key_user(mk);
+		if (mk_user != ERR_PTR(-ENOKEY)) {
+			if (IS_ERR(mk_user))
+				return PTR_ERR(mk_user);
+			key_put(mk_user);
+			return 0;
+		}
+	}
+
+	/* If we'll be re-adding ->mk_secret, try to take the reference. */
+	rekey = !is_master_key_secret_present(&mk->mk_secret);
+	if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
+		return KEY_DEAD;
+
+	/* Add the current user to ->mk_users, if applicable. */
+	if (mk->mk_users) {
+		err = add_master_key_user(mk);
+		if (err) {
+			if (rekey && refcount_dec_and_test(&mk->mk_refcount))
+				return KEY_DEAD;
+			return err;
+		}
+	}
+
+	/* Re-add the secret if needed. */
+	if (rekey) {
+		down_write(&mk->mk_secret_sem);
+		move_master_key_secret(&mk->mk_secret, secret);
+		up_write(&mk->mk_secret_sem);
+	}
+	return 0;
+}
+
+static int add_master_key(struct super_block *sb,
+			  struct fscrypt_master_key_secret *secret,
+			  const struct fscrypt_key_specifier *mk_spec)
+{
+	static DEFINE_MUTEX(fscrypt_add_key_mutex);
+	struct key *key;
+	int err;
+
+	mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
+retry:
+	key = fscrypt_find_master_key(sb, mk_spec);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		if (err != -ENOKEY)
+			goto out_unlock;
+		/* Didn't find the key in ->s_master_keys.  Add it. */
+		err = allocate_filesystem_keyring(sb);
+		if (err)
+			goto out_unlock;
+		err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+	} else {
+		/*
+		 * Found the key in ->s_master_keys.  Re-add the secret if
+		 * needed, and add the user to ->mk_users if needed.
+		 */
+		down_write(&key->sem);
+		err = add_existing_master_key(key->payload.data[0], secret);
+		up_write(&key->sem);
+		if (err == KEY_DEAD) {
+			/* Key being removed or needs to be removed */
+			key_invalidate(key);
+			key_put(key);
+			goto retry;
+		}
+		key_put(key);
+	}
+out_unlock:
+	mutex_unlock(&fscrypt_add_key_mutex);
+	return err;
+}
+
+/*
+ * Add a master encryption key to the filesystem, causing all files which were
+ * encrypted with it to appear "unlocked" (decrypted) when accessed.
+ *
+ * When adding a key for use by v1 encryption policies, this ioctl is
+ * privileged, and userspace must provide the 'key_descriptor'.
+ *
+ * When adding a key for use by v2+ encryption policies, this ioctl is
+ * unprivileged.  This is needed, in general, to allow non-root users to use
+ * encryption without encountering the visibility problems of process-subscribed
+ * keyrings and the inability to properly remove keys.  This works by having
+ * each key identified by its cryptographically secure hash --- the
+ * 'key_identifier'.  The cryptographic hash ensures that a malicious user
+ * cannot add the wrong key for a given identifier.  Furthermore, each added key
+ * is charged to the appropriate user's quota for the keyrings service, which
+ * prevents a malicious user from adding too many keys.  Finally, we forbid a
+ * user from removing a key while other users have added it too, which prevents
+ * a user who knows another user's key from causing a denial-of-service by
+ * removing it at an inopportune time.  (We tolerate that a user who knows a key
+ * can prevent other users from removing it.)
+ *
+ * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+	struct fscrypt_add_key_arg __user *uarg = _uarg;
+	struct fscrypt_add_key_arg arg;
+	struct fscrypt_master_key_secret secret;
+	int err;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (!valid_key_spec(&arg.key_spec))
+		return -EINVAL;
+
+	if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
+	    arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+		return -EINVAL;
+
+	if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+		return -EINVAL;
+
+	memset(&secret, 0, sizeof(secret));
+	secret.size = arg.raw_size;
+	err = -EFAULT;
+	if (copy_from_user(secret.raw, uarg->raw, secret.size))
+		goto out_wipe_secret;
+
+	switch (arg.key_spec.type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		/*
+		 * Only root can add keys that are identified by an arbitrary
+		 * descriptor rather than by a cryptographic hash --- since
+		 * otherwise a malicious user could add the wrong key.
+		 */
+		err = -EACCES;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_wipe_secret;
+		break;
+	case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+		err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+		if (err)
+			goto out_wipe_secret;
+
+		/*
+		 * Now that the HKDF context is initialized, the raw key is no
+		 * longer needed.
+		 */
+		memzero_explicit(secret.raw, secret.size);
+
+		/* Calculate the key identifier and return it to userspace. */
+		err = fscrypt_hkdf_expand(&secret.hkdf,
+					  HKDF_CONTEXT_KEY_IDENTIFIER,
+					  NULL, 0, arg.key_spec.u.identifier,
+					  FSCRYPT_KEY_IDENTIFIER_SIZE);
+		if (err)
+			goto out_wipe_secret;
+		err = -EFAULT;
+		if (copy_to_user(uarg->key_spec.u.identifier,
+				 arg.key_spec.u.identifier,
+				 FSCRYPT_KEY_IDENTIFIER_SIZE))
+			goto out_wipe_secret;
+		break;
+	default:
+		WARN_ON(1);
+		err = -EINVAL;
+		goto out_wipe_secret;
+	}
+
+	err = add_master_key(sb, &secret, &arg.key_spec);
+out_wipe_secret:
+	wipe_master_key_secret(&secret);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
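+
+/*
+ * Illustrative userspace usage of the above ioctl (a sketch, not kernel code;
+ * assumes the UAPI definitions from <linux/fscrypt.h>):
+ *
+ *	struct fscrypt_add_key_arg *arg =
+ *		calloc(1, sizeof(*arg) + key_size);
+ *
+ *	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ *	arg->raw_size = key_size;
+ *	memcpy(arg->raw, key, key_size);
+ *	if (ioctl(fd, FS_IOC_ADD_ENCRYPTION_KEY, arg) == 0)
+ *		... the key identifier is now in arg->key_spec.u.identifier ...
+ */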
+
+/*
+ * Verify that the current user has added a master key with the given identifier
+ * (returns -ENOKEY if not).  This is needed to prevent a user from encrypting
+ * their files using some other user's key which they don't actually know.
+ * Cryptographically this isn't much of a problem, but the semantics of this
+ * would be a bit weird, so it's best to just forbid it.
+ *
+ * The system administrator (CAP_FOWNER) can override this, which should be
+ * enough for any use cases where encryption policies are being set using keys
+ * that were chosen ahead of time but aren't available at the moment.
+ *
+ * Note that the key may have already been removed by the time this returns, but
+ * that's okay; we just care whether the key was there at some point.
+ *
+ * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code
+ */
+int fscrypt_verify_key_added(struct super_block *sb,
+			     const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+	struct fscrypt_key_specifier mk_spec;
+	struct key *key, *mk_user;
+	struct fscrypt_master_key *mk;
+	int err;
+
+	mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+	memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
+
+	key = fscrypt_find_master_key(sb, &mk_spec);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto out;
+	}
+	mk = key->payload.data[0];
+	mk_user = find_master_key_user(mk);
+	if (IS_ERR(mk_user)) {
+		err = PTR_ERR(mk_user);
+	} else {
+		key_put(mk_user);
+		err = 0;
+	}
+	key_put(key);
+out:
+	if (err == -ENOKEY && capable(CAP_FOWNER))
+		err = 0;
+	return err;
+}
+
+/*
+ * Try to evict the inode's dentries from the dentry cache.  If the inode is a
+ * directory, then it can have at most one dentry; however, that dentry may be
+ * pinned by child dentries, so first try to evict the children too.
+ */
+static void shrink_dcache_inode(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	if (S_ISDIR(inode->i_mode)) {
+		dentry = d_find_any_alias(inode);
+		if (dentry) {
+			shrink_dcache_parent(dentry);
+			dput(dentry);
+		}
+	}
+	d_prune_aliases(inode);
+}
+
+static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
+{
+	struct fscrypt_info *ci;
+	struct inode *inode;
+	struct inode *toput_inode = NULL;
+
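+	/*
+	 * Note the toput_inode pattern: each inode stays referenced until the
+	 * lock is re-acquired on the next iteration, which keeps our position
+	 * in the list valid, and the deferred iput() runs without the spinlock
+	 * held, since iput() can sleep.
+	 */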
+	spin_lock(&mk->mk_decrypted_inodes_lock);
+
+	list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) {
+		inode = ci->ci_inode;
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&mk->mk_decrypted_inodes_lock);
+
+		shrink_dcache_inode(inode);
+		iput(toput_inode);
+		toput_inode = inode;
+
+		spin_lock(&mk->mk_decrypted_inodes_lock);
+	}
+
+	spin_unlock(&mk->mk_decrypted_inodes_lock);
+	iput(toput_inode);
+}
+
+static int check_for_busy_inodes(struct super_block *sb,
+				 struct fscrypt_master_key *mk)
+{
+	struct list_head *pos;
+	size_t busy_count = 0;
+	unsigned long ino;
+	struct dentry *dentry;
+	char _path[256];
+	char *path = NULL;
+
+	spin_lock(&mk->mk_decrypted_inodes_lock);
+
+	list_for_each(pos, &mk->mk_decrypted_inodes)
+		busy_count++;
+
+	if (busy_count == 0) {
+		spin_unlock(&mk->mk_decrypted_inodes_lock);
+		return 0;
+	}
+
+	{
+		/* select an example file to show for debugging purposes */
+		struct inode *inode =
+			list_first_entry(&mk->mk_decrypted_inodes,
+					 struct fscrypt_info,
+					 ci_master_key_link)->ci_inode;
+		ino = inode->i_ino;
+		dentry = d_find_alias(inode);
+	}
+	spin_unlock(&mk->mk_decrypted_inodes_lock);
+
+	if (dentry) {
+		path = dentry_path(dentry, _path, sizeof(_path));
+		dput(dentry);
+	}
+	if (IS_ERR_OR_NULL(path))
+		path = "(unknown)";
+
+	fscrypt_warn(NULL,
+		     "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)",
+		     sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
+		     master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
+		     ino, path);
+	return -EBUSY;
+}
+
+static int try_to_lock_encrypted_files(struct super_block *sb,
+				       struct fscrypt_master_key *mk)
+{
+	int err1;
+	int err2;
+
+	/*
+	 * An inode can't be evicted while it is dirty or has dirty pages.
+	 * Thus, we first have to clean the inodes in ->mk_decrypted_inodes.
+	 *
+	 * Just do it the easy way: call sync_filesystem().  It's overkill, but
+	 * it works, and it's more important to minimize the amount of caches we
+	 * drop than the amount of data we sync.  Also, unprivileged users can
+	 * already call sync_filesystem() via sys_syncfs() or sys_sync().
+	 */
+	down_read(&sb->s_umount);
+	err1 = sync_filesystem(sb);
+	up_read(&sb->s_umount);
+	/* If a sync error occurs, still try to evict as much as possible. */
+
+	/*
+	 * Inodes are pinned by their dentries, so we have to evict their
+	 * dentries.  shrink_dcache_sb() would suffice, but would be overkill
+	 * and inappropriate for use by unprivileged users.  So instead go
+	 * through the inodes' alias lists and try to evict each dentry.
+	 */
+	evict_dentries_for_decrypted_inodes(mk);
+
+	/*
+	 * evict_dentries_for_decrypted_inodes() already did iput() on each
+	 * inode in the list; any inodes for which that dropped the last
+	 * reference will have been evicted, due to fscrypt_drop_inode()
+	 * detecting the key removal and telling the VFS to evict the inode.
+	 * So to finish, we just need to check whether any inodes couldn't be
+	 * evicted.
+	 */
+	err2 = check_for_busy_inodes(sb, mk);
+
+	return err1 ?: err2;
+}
+
+/*
+ * Try to remove an fscrypt master encryption key.
+ *
+ * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's
+ * claim to the key, then removes the key itself if no other users have claims.
+ * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the
+ * key itself.
+ *
+ * To "remove the key itself", first we wipe the actual master key secret, so
+ * that no more inodes can be unlocked with it.  Then we try to evict all cached
+ * inodes that had been unlocked with the key.
+ *
+ * If all inodes were evicted, then we unlink the fscrypt_master_key from the
+ * keyring.  Otherwise it remains in the keyring in the "incompletely removed"
+ * state (without the actual secret key) where it tracks the list of remaining
+ * inodes.  Userspace can execute the ioctl again later to retry eviction, or
+ * alternatively can re-add the secret key again.
+ *
+ * For more details, see the "Removing keys" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+	struct fscrypt_remove_key_arg __user *uarg = _uarg;
+	struct fscrypt_remove_key_arg arg;
+	struct key *key;
+	struct fscrypt_master_key *mk;
+	u32 status_flags = 0;
+	int err;
+	bool dead;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (!valid_key_spec(&arg.key_spec))
+		return -EINVAL;
+
+	if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+		return -EINVAL;
+
+	/*
+	 * Only root can add and remove keys that are identified by an arbitrary
+	 * descriptor rather than by a cryptographic hash.
+	 */
+	if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	/* Find the key being removed. */
+	key = fscrypt_find_master_key(sb, &arg.key_spec);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+	mk = key->payload.data[0];
+
+	down_write(&key->sem);
+
+	/* If relevant, remove current user's (or all users') claim to the key */
+	if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
+		if (all_users)
+			err = keyring_clear(mk->mk_users);
+		else
+			err = remove_master_key_user(mk);
+		if (err) {
+			up_write(&key->sem);
+			goto out_put_key;
+		}
+		if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
+			/*
+			 * Other users still have the key added too.  We
+			 * removed the current user's claim to the key, but we
+			 * still can't remove the key itself.
+			 */
+			status_flags |=
+				FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
+			err = 0;
+			up_write(&key->sem);
+			goto out_put_key;
+		}
+	}
+
+	/* No user claims remaining.  Go ahead and wipe the secret. */
+	dead = false;
+	if (is_master_key_secret_present(&mk->mk_secret)) {
+		down_write(&mk->mk_secret_sem);
+		wipe_master_key_secret(&mk->mk_secret);
+		dead = refcount_dec_and_test(&mk->mk_refcount);
+		up_write(&mk->mk_secret_sem);
+	}
+	up_write(&key->sem);
+	if (dead) {
+		/*
+		 * No inodes reference the key, and we wiped the secret, so the
+		 * key object is free to be removed from the keyring.
+		 */
+		key_invalidate(key);
+		err = 0;
+	} else {
+		/* Some inodes still reference this key; try to evict them. */
+		err = try_to_lock_encrypted_files(sb, mk);
+		if (err == -EBUSY) {
+			status_flags |=
+				FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY;
+			err = 0;
+		}
+	}
+	/*
+	 * We return 0 if we successfully did something: removed a claim to the
+	 * key, wiped the secret, or tried locking the files again.  Users need
+	 * to check the informational status flags if they care whether the key
+	 * has been fully removed including all files locked.
+	 */
+out_put_key:
+	key_put(key);
+	if (err == 0)
+		err = put_user(status_flags, &uarg->removal_status_flags);
+	return err;
+}
+
+int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg)
+{
+	return do_remove_key(filp, uarg, false);
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key);
+
+int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	return do_remove_key(filp, uarg, true);
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
+
+/*
+ * Retrieve the status of an fscrypt master encryption key.
+ *
+ * We set ->status to indicate whether the key is absent, present, or
+ * incompletely removed.  "Incompletely removed" means that the master key
+ * secret has been removed, but some files which had been unlocked with it are
+ * still in use.  This field allows applications to easily determine the state
+ * of an encrypted directory without using a hack such as trying to open a
+ * regular file in it (which can confuse the "incompletely removed" state with
+ * absent or present).
+ *
+ * In addition, for v2 policy keys we allow applications to determine, via
+ * ->status_flags and ->user_count, whether the key has been added by the
+ * current user, by other users, or by both.  Most applications should not need
+ * this, since ordinarily only one user should know a given key.  However, if a
+ * secret key is shared by multiple users, applications may wish to add an
+ * already-present key to prevent other users from removing it.  This ioctl can
+ * be used to check whether that really is the case before the work is done to
+ * add the key --- which might e.g. require prompting the user for a passphrase.
+ *
+ * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
+{
+	struct super_block *sb = file_inode(filp)->i_sb;
+	struct fscrypt_get_key_status_arg arg;
+	struct key *key;
+	struct fscrypt_master_key *mk;
+	int err;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (!valid_key_spec(&arg.key_spec))
+		return -EINVAL;
+
+	if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+		return -EINVAL;
+
+	arg.status_flags = 0;
+	arg.user_count = 0;
+	memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
+
+	key = fscrypt_find_master_key(sb, &arg.key_spec);
+	if (IS_ERR(key)) {
+		if (key != ERR_PTR(-ENOKEY))
+			return PTR_ERR(key);
+		arg.status = FSCRYPT_KEY_STATUS_ABSENT;
+		err = 0;
+		goto out;
+	}
+	mk = key->payload.data[0];
+	down_read(&key->sem);
+
+	if (!is_master_key_secret_present(&mk->mk_secret)) {
+		arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+		err = 0;
+		goto out_release_key;
+	}
+
+	arg.status = FSCRYPT_KEY_STATUS_PRESENT;
+	if (mk->mk_users) {
+		struct key *mk_user;
+
+		arg.user_count = mk->mk_users->keys.nr_leaves_on_tree;
+		mk_user = find_master_key_user(mk);
+		if (!IS_ERR(mk_user)) {
+			arg.status_flags |=
+				FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF;
+			key_put(mk_user);
+		} else if (mk_user != ERR_PTR(-ENOKEY)) {
+			err = PTR_ERR(mk_user);
+			goto out_release_key;
+		}
+	}
+	err = 0;
+out_release_key:
+	up_read(&key->sem);
+	key_put(key);
+out:
+	if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
+		err = -EFAULT;
+	return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status);
+
+int __init fscrypt_init_keyring(void)
+{
+	int err;
+
+	err = register_key_type(&key_type_fscrypt);
+	if (err)
+		return err;
+
+	err = register_key_type(&key_type_fscrypt_user);
+	if (err)
+		goto err_unregister_fscrypt;
+
+	return 0;
+
+err_unregister_fscrypt:
+	unregister_key_type(&key_type_fscrypt);
+	return err;
+}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
new file mode 100644
index 0000000..d71c2d6
--- /dev/null
+++ b/fs/crypto/keysetup.c
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/sha.h>
+#include <crypto/skcipher.h>
+#include <linux/key.h>
+
+#include "fscrypt_private.h"
+
+static struct crypto_shash *essiv_hash_tfm;
+
+static struct fscrypt_mode available_modes[] = {
+	[FSCRYPT_MODE_AES_256_XTS] = {
+		.friendly_name = "AES-256-XTS",
+		.cipher_str = "xts(aes)",
+		.keysize = 64,
+		.ivsize = 16,
+	},
+	[FSCRYPT_MODE_AES_256_CTS] = {
+		.friendly_name = "AES-256-CTS-CBC",
+		.cipher_str = "cts(cbc(aes))",
+		.keysize = 32,
+		.ivsize = 16,
+	},
+	[FSCRYPT_MODE_AES_128_CBC] = {
+		.friendly_name = "AES-128-CBC",
+		.cipher_str = "cbc(aes)",
+		.keysize = 16,
+		.ivsize = 16,
+		.needs_essiv = true,
+	},
+	[FSCRYPT_MODE_AES_128_CTS] = {
+		.friendly_name = "AES-128-CTS-CBC",
+		.cipher_str = "cts(cbc(aes))",
+		.keysize = 16,
+		.ivsize = 16,
+	},
+	[FSCRYPT_MODE_ADIANTUM] = {
+		.friendly_name = "Adiantum",
+		.cipher_str = "adiantum(xchacha12,aes)",
+		.keysize = 32,
+		.ivsize = 32,
+	},
+};
+
+static struct fscrypt_mode *
+select_encryption_mode(const union fscrypt_policy *policy,
+		       const struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		return &available_modes[fscrypt_policy_contents_mode(policy)];
+
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return &available_modes[fscrypt_policy_fnames_mode(policy)];
+
+	WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+		  inode->i_ino, (inode->i_mode & S_IFMT));
+	return ERR_PTR(-EINVAL);
+}
+
+/* Create a symmetric cipher object for the given encryption mode and key */
+struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
+						  const u8 *raw_key,
+						  const struct inode *inode)
+{
+	struct crypto_skcipher *tfm;
+	int err;
+
+	tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+	if (IS_ERR(tfm)) {
+		if (PTR_ERR(tfm) == -ENOENT) {
+			fscrypt_warn(inode,
+				     "Missing crypto API support for %s (API name: \"%s\")",
+				     mode->friendly_name, mode->cipher_str);
+			return ERR_PTR(-ENOPKG);
+		}
+		fscrypt_err(inode, "Error allocating '%s' transform: %ld",
+			    mode->cipher_str, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (unlikely(!mode->logged_impl_name)) {
+		/*
+		 * fscrypt performance can vary greatly depending on which
+		 * crypto algorithm implementation is used.  Help people debug
+		 * performance problems by logging the ->cra_driver_name the
+		 * first time a mode is used.  Note that multiple threads can
+		 * race here, but it doesn't really matter.
+		 */
+		mode->logged_impl_name = true;
+		pr_info("fscrypt: %s using implementation \"%s\"\n",
+			mode->friendly_name,
+			crypto_skcipher_alg(tfm)->base.cra_driver_name);
+	}
+	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+	err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+	if (err)
+		goto err_free_tfm;
+
+	return tfm;
+
+err_free_tfm:
+	crypto_free_skcipher(tfm);
+	return ERR_PTR(err);
+}
+
+static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
+{
+	struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
+
+	/* init hash transform on demand */
+	if (unlikely(!tfm)) {
+		struct crypto_shash *prev_tfm;
+
+		tfm = crypto_alloc_shash("sha256", 0, 0);
+		if (IS_ERR(tfm)) {
+			if (PTR_ERR(tfm) == -ENOENT) {
+				fscrypt_warn(NULL,
+					     "Missing crypto API support for SHA-256");
+				return -ENOPKG;
+			}
+			fscrypt_err(NULL,
+				    "Error allocating SHA-256 transform: %ld",
+				    PTR_ERR(tfm));
+			return PTR_ERR(tfm);
+		}
+		prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
+		if (prev_tfm) {
+			crypto_free_shash(tfm);
+			tfm = prev_tfm;
+		}
+	}
+
+	{
+		SHASH_DESC_ON_STACK(desc, tfm);
+		desc->tfm = tfm;
+
+		return crypto_shash_digest(desc, key, keysize, salt);
+	}
+}
+
+static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
+				int keysize)
+{
+	int err;
+	struct crypto_cipher *essiv_tfm;
+	u8 salt[SHA256_DIGEST_SIZE];
+
+	if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE))
+		return -EINVAL;
+
+	essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(essiv_tfm))
+		return PTR_ERR(essiv_tfm);
+
+	ci->ci_essiv_tfm = essiv_tfm;
+
+	err = derive_essiv_salt(raw_key, keysize, salt);
+	if (err)
+		goto out;
+
+	/*
+	 * Since SHA-256 produces a 32-byte digest, deriving the salt/key this
+	 * way means AES-256 is used for IV generation, even though file
+	 * contents encryption still uses the configured keysize (AES-128).
+	 */
+	err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
+
+out:
+	memzero_explicit(salt, sizeof(salt));
+	return err;
+}
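+
+/*
+ * For reference, the ESSIV tfm initialized above is later used to
+ * construct IVs roughly as (a sketch, assuming zero-padding to one
+ * 16-byte AES block):
+ *
+ *	iv = AES-256-Enc(key = SHA-256(file_key), le64(lblk_num) || 0..0)
+ *
+ * i.e. each logical block's IV is the encryption of its block number,
+ * so IVs are unpredictable without the key.
+ */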
+
+/* Given the per-file key, set up the file's crypto transform object(s) */
+int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
+{
+	struct fscrypt_mode *mode = ci->ci_mode;
+	struct crypto_skcipher *ctfm;
+	int err;
+
+	ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode);
+	if (IS_ERR(ctfm))
+		return PTR_ERR(ctfm);
+
+	ci->ci_ctfm = ctfm;
+
+	if (mode->needs_essiv) {
+		err = init_essiv_generator(ci, derived_key, mode->keysize);
+		if (err) {
+			fscrypt_warn(ci->ci_inode,
+				     "Error initializing ESSIV generator: %d",
+				     err);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static int setup_per_mode_key(struct fscrypt_info *ci,
+			      struct fscrypt_master_key *mk)
+{
+	struct fscrypt_mode *mode = ci->ci_mode;
+	u8 mode_num = mode - available_modes;
+	struct crypto_skcipher *tfm, *prev_tfm;
+	u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
+	int err;
+
+	if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys)))
+		return -EINVAL;
+
+	/* pairs with cmpxchg() below */
+	tfm = READ_ONCE(mk->mk_mode_keys[mode_num]);
+	if (likely(tfm != NULL))
+		goto done;
+
+	BUILD_BUG_ON(sizeof(mode_num) != 1);
+	err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+				  HKDF_CONTEXT_PER_MODE_KEY,
+				  &mode_num, sizeof(mode_num),
+				  mode_key, mode->keysize);
+	if (err)
+		return err;
+	tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode);
+	memzero_explicit(mode_key, mode->keysize);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/* pairs with READ_ONCE() above */
+	prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm);
+	if (prev_tfm != NULL) {
+		crypto_free_skcipher(tfm);
+		tfm = prev_tfm;
+	}
+done:
+	ci->ci_ctfm = tfm;
+	return 0;
+}
+
+static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+				     struct fscrypt_master_key *mk)
+{
+	u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
+	int err;
+
+	if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
+		/*
+		 * DIRECT_KEY: instead of deriving per-file keys, the per-file
+		 * nonce will be included in all the IVs.  But unlike v1
+		 * policies, for v2 policies in this case we don't encrypt with
+		 * the master key directly but rather derive a per-mode key.
+		 * This ensures that the master key is consistently used only
+		 * for HKDF, avoiding key reuse issues.
+		 */
+		if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) {
+			fscrypt_warn(ci->ci_inode,
+				     "Direct key flag not allowed with %s",
+				     ci->ci_mode->friendly_name);
+			return -EINVAL;
+		}
+		return setup_per_mode_key(ci, mk);
+	}
+
+	err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+				  HKDF_CONTEXT_PER_FILE_KEY,
+				  ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
+				  derived_key, ci->ci_mode->keysize);
+	if (err)
+		return err;
+
+	err = fscrypt_set_derived_key(ci, derived_key);
+	memzero_explicit(derived_key, ci->ci_mode->keysize);
+	return err;
+}
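+
+/*
+ * In HKDF terms, the two derivations in this file are roughly (a sketch;
+ * fscrypt_hkdf_expand() prepends an application-specific prefix and the
+ * context byte to the info string):
+ *
+ *	per-mode key: HKDF-Expand(PRK, info = PER_MODE_KEY || mode_num)
+ *	per-file key: HKDF-Expand(PRK, info = PER_FILE_KEY || nonce)
+ *
+ * where PRK is the pseudorandom key that HKDF-Extract computed from the
+ * master key secret when the key was added.
+ */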
+
+/*
+ * Find the master key, then set up the inode's actual encryption key.
+ *
+ * If the master key is found in the filesystem-level keyring, then the
+ * corresponding 'struct key' is returned in *master_key_ret with
+ * ->mk_secret_sem read-locked.  This is needed to ensure that only one task
+ * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race
+ * to create an fscrypt_info for the same inode), and to synchronize the master
+ * key being removed with a new inode starting to use it.
+ */
+static int setup_file_encryption_key(struct fscrypt_info *ci,
+				     struct key **master_key_ret)
+{
+	struct key *key;
+	struct fscrypt_master_key *mk = NULL;
+	struct fscrypt_key_specifier mk_spec;
+	int err;
+
+	switch (ci->ci_policy.version) {
+	case FSCRYPT_POLICY_V1:
+		mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+		memcpy(mk_spec.u.descriptor,
+		       ci->ci_policy.v1.master_key_descriptor,
+		       FSCRYPT_KEY_DESCRIPTOR_SIZE);
+		break;
+	case FSCRYPT_POLICY_V2:
+		mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+		memcpy(mk_spec.u.identifier,
+		       ci->ci_policy.v2.master_key_identifier,
+		       FSCRYPT_KEY_IDENTIFIER_SIZE);
+		break;
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+	if (IS_ERR(key)) {
+		if (key != ERR_PTR(-ENOKEY) ||
+		    ci->ci_policy.version != FSCRYPT_POLICY_V1)
+			return PTR_ERR(key);
+
+		/*
+		 * As a legacy fallback for v1 policies, search for the key in
+		 * the current task's subscribed keyrings too.  Don't move this
+		 * to before the search of ->s_master_keys, since users
+		 * shouldn't be able to override filesystem-level keys.
+		 */
+		return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
+	}
+
+	mk = key->payload.data[0];
+	down_read(&mk->mk_secret_sem);
+
+	/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
+	if (!is_master_key_secret_present(&mk->mk_secret)) {
+		err = -ENOKEY;
+		goto out_release_key;
+	}
+
+	/*
+	 * Require that the master key be at least as long as the derived key.
+	 * Otherwise, the derived key cannot possibly contain as much entropy as
+	 * that required by the encryption mode it will be used for.  For v1
+	 * policies it's also required for the KDF to work at all.
+	 */
+	if (mk->mk_secret.size < ci->ci_mode->keysize) {
+		fscrypt_warn(NULL,
+			     "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+			     master_key_spec_type(&mk_spec),
+			     master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
+			     mk->mk_secret.size, ci->ci_mode->keysize);
+		err = -ENOKEY;
+		goto out_release_key;
+	}
+
+	switch (ci->ci_policy.version) {
+	case FSCRYPT_POLICY_V1:
+		err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
+		break;
+	case FSCRYPT_POLICY_V2:
+		err = fscrypt_setup_v2_file_key(ci, mk);
+		break;
+	default:
+		WARN_ON(1);
+		err = -EINVAL;
+		break;
+	}
+	if (err)
+		goto out_release_key;
+
+	*master_key_ret = key;
+	return 0;
+
+out_release_key:
+	up_read(&mk->mk_secret_sem);
+	key_put(key);
+	return err;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+	struct key *key;
+
+	if (!ci)
+		return;
+
+	if (ci->ci_direct_key) {
+		fscrypt_put_direct_key(ci->ci_direct_key);
+	} else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) &&
+		   !fscrypt_is_direct_key_policy(&ci->ci_policy)) {
+		crypto_free_skcipher(ci->ci_ctfm);
+		crypto_free_cipher(ci->ci_essiv_tfm);
+	}
+
+	key = ci->ci_master_key;
+	if (key) {
+		struct fscrypt_master_key *mk = key->payload.data[0];
+
+		/*
+		 * Remove this inode from the list of inodes that were unlocked
+		 * with the master key.
+		 *
+		 * In addition, if we're removing the last inode from a key that
+		 * already had its secret removed, invalidate the key so that it
+		 * gets removed from ->s_master_keys.
+		 */
+		spin_lock(&mk->mk_decrypted_inodes_lock);
+		list_del(&ci->ci_master_key_link);
+		spin_unlock(&mk->mk_decrypted_inodes_lock);
+		if (refcount_dec_and_test(&mk->mk_refcount))
+			key_invalidate(key);
+		key_put(key);
+	}
+	kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+	struct fscrypt_info *crypt_info;
+	union fscrypt_context ctx;
+	struct fscrypt_mode *mode;
+	struct key *master_key = NULL;
+	int res;
+
+	if (fscrypt_has_encryption_key(inode))
+		return 0;
+
+	res = fscrypt_initialize(inode->i_sb->s_cop->flags);
+	if (res)
+		return res;
+
+	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (res < 0) {
+		if (!fscrypt_dummy_context_enabled(inode) ||
+		    IS_ENCRYPTED(inode)) {
+			fscrypt_warn(inode,
+				     "Error %d getting encryption context",
+				     res);
+			return res;
+		}
+		/* Fake up a context for an unencrypted directory */
+		memset(&ctx, 0, sizeof(ctx));
+		ctx.version = FSCRYPT_CONTEXT_V1;
+		ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+		ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+		memset(ctx.v1.master_key_descriptor, 0x42,
+		       FSCRYPT_KEY_DESCRIPTOR_SIZE);
+		res = sizeof(ctx.v1);
+	}
+
+	crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+	if (!crypt_info)
+		return -ENOMEM;
+
+	crypt_info->ci_inode = inode;
+
+	res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
+	if (res) {
+		fscrypt_warn(inode,
+			     "Unrecognized or corrupt encryption context");
+		goto out;
+	}
+
+	switch (ctx.version) {
+	case FSCRYPT_CONTEXT_V1:
+		memcpy(crypt_info->ci_nonce, ctx.v1.nonce,
+		       FS_KEY_DERIVATION_NONCE_SIZE);
+		break;
+	case FSCRYPT_CONTEXT_V2:
+		memcpy(crypt_info->ci_nonce, ctx.v2.nonce,
+		       FS_KEY_DERIVATION_NONCE_SIZE);
+		break;
+	default:
+		WARN_ON(1);
+		res = -EINVAL;
+		goto out;
+	}
+
+	if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
+		res = -EINVAL;
+		goto out;
+	}
+
+	mode = select_encryption_mode(&crypt_info->ci_policy, inode);
+	if (IS_ERR(mode)) {
+		res = PTR_ERR(mode);
+		goto out;
+	}
+	WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
+	crypt_info->ci_mode = mode;
+
+	res = setup_file_encryption_key(crypt_info, &master_key);
+	if (res)
+		goto out;
+
+	if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+		if (master_key) {
+			struct fscrypt_master_key *mk =
+				master_key->payload.data[0];
+
+			refcount_inc(&mk->mk_refcount);
+			crypt_info->ci_master_key = key_get(master_key);
+			spin_lock(&mk->mk_decrypted_inodes_lock);
+			list_add(&crypt_info->ci_master_key_link,
+				 &mk->mk_decrypted_inodes);
+			spin_unlock(&mk->mk_decrypted_inodes_lock);
+		}
+		crypt_info = NULL;
+	}
+	res = 0;
+out:
+	if (master_key) {
+		struct fscrypt_master_key *mk = master_key->payload.data[0];
+
+		up_read(&mk->mk_secret_sem);
+		key_put(master_key);
+	}
+	if (res == -ENOKEY)
+		res = 0;
+	put_crypt_info(crypt_info);
+	return res;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
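+
+/*
+ * Filesystems typically call this before any operation that needs the
+ * key, e.g. a minimal sketch of an ->open() check ("foofs" is
+ * hypothetical):
+ *
+ *	static int foofs_file_open(struct inode *inode, struct file *filp)
+ *	{
+ *		int err = fscrypt_get_encryption_info(inode);
+ *
+ *		if (err)
+ *			return err;
+ *		if (!fscrypt_has_encryption_key(inode))
+ *			return -ENOKEY;
+ *		return 0;
+ *	}
+ */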
+
+/**
+ * fscrypt_put_encryption_info - free most of an inode's fscrypt data
+ *
+ * Free the inode's fscrypt_info.  Filesystems must call this when the inode is
+ * being evicted.  An RCU grace period need not have elapsed yet.
+ */
+void fscrypt_put_encryption_info(struct inode *inode)
+{
+	put_crypt_info(inode->i_crypt_info);
+	inode->i_crypt_info = NULL;
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
+
+/**
+ * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay
+ *
+ * Free the inode's cached decrypted symlink target, if any.  Filesystems must
+ * call this after an RCU grace period, just before they free the inode.
+ */
+void fscrypt_free_inode(struct inode *inode)
+{
+	if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) {
+		kfree(inode->i_link);
+		inode->i_link = NULL;
+	}
+}
+EXPORT_SYMBOL(fscrypt_free_inode);
+
+/**
+ * fscrypt_drop_inode - check whether the inode's master key has been removed
+ *
+ * Filesystems supporting fscrypt must call this from their ->drop_inode()
+ * method so that encrypted inodes are evicted as soon as they're no longer in
+ * use and their master key has been removed.
+ *
+ * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0
+ */
+int fscrypt_drop_inode(struct inode *inode)
+{
+	const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info);
+	const struct fscrypt_master_key *mk;
+
+	/*
+	 * If ci is NULL, then the inode doesn't have an encryption key set up
+	 * so it's irrelevant.  If ci_master_key is NULL, then the master key
+	 * was provided via the legacy mechanism of the process-subscribed
+	 * keyrings, so we don't know whether it's been removed or not.
+	 */
+	if (!ci || !ci->ci_master_key)
+		return 0;
+	mk = ci->ci_master_key->payload.data[0];
+
+	/*
+	 * Note: since we aren't holding ->mk_secret_sem, the result here can
+	 * immediately become outdated.  But there's no correctness problem with
+	 * unnecessarily evicting.  Nor is there a correctness problem with not
+	 * evicting while iput() is racing with the key being removed, since
+	 * then the thread removing the key will either evict the inode itself
+	 * or will correctly detect that it wasn't evicted due to the race.
+	 */
+	return !is_master_key_secret_present(&mk->mk_secret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
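+
+/*
+ * A filesystem's ->drop_inode() can then combine this with the generic
+ * check, e.g. (a sketch; "foofs" is hypothetical):
+ *
+ *	static int foofs_drop_inode(struct inode *inode)
+ *	{
+ *		return generic_drop_inode(inode) || fscrypt_drop_inode(inode);
+ *	}
+ */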
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
new file mode 100644
index 0000000..ad1a36c
--- /dev/null
+++ b/fs/crypto/keysetup_v1.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup for v1 encryption policies
+ *
+ * Copyright 2015, 2019 Google LLC
+ */
+
+/*
+ * This file implements compatibility functions for the original encryption
+ * policy version ("v1"), including:
+ *
+ * - Deriving per-file keys using the AES-128-ECB based KDF
+ *   (rather than the new method of using HKDF-SHA512)
+ *
+ * - Retrieving fscrypt master keys from process-subscribed keyrings
+ *   (rather than the new method of using a filesystem-level keyring)
+ *
+ * - Handling policies with the DIRECT_KEY flag set using a master key table
+ *   (rather than the new method of implementing DIRECT_KEY with per-mode keys
+ *    managed alongside the master keys in the filesystem-level keyring)
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+#include <linux/hashtable.h>
+#include <linux/scatterlist.h>
+
+#include "fscrypt_private.h"
+
+/* Table of keys referenced by DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
+
+/*
+ * v1 key derivation function.  This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the nonce as the AES key.  This provides a
+ * unique derived key with sufficient entropy for each inode.  However, it's
+ * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
+ * master key, and is trivially reversible: an attacker who compromises a
+ * derived key can "decrypt" it to get back to the master key, then derive any
+ * other key.  For all new code, use HKDF instead.
+ *
+ * The master key must be at least as long as the derived key.  If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
+ */
+static int derive_key_aes(const u8 *master_key,
+			  const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
+			  u8 *derived_key, unsigned int derived_keysize)
+{
+	int res = 0;
+	struct skcipher_request *req = NULL;
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+	req = skcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+	skcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			crypto_req_done, &wait);
+	res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+	if (res < 0)
+		goto out;
+
+	sg_init_one(&src_sg, master_key, derived_keysize);
+	sg_init_one(&dst_sg, derived_key, derived_keysize);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
+				   NULL);
+	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+out:
+	skcipher_request_free(req);
+	crypto_free_skcipher(tfm);
+	return res;
+}
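+
+/*
+ * Equivalently, the KDF above computes (a sketch):
+ *
+ *	derived_key = AES-128-ECB-Enc(key = nonce,
+ *				      master_key[0..derived_keysize-1])
+ *
+ * Note the role reversal: the per-file nonce is the AES key and the master
+ * key bytes are the "plaintext", which is why this construction is
+ * trivially reversible by anyone who learns a derived key.
+ */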
+
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+			  const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
+			  unsigned int min_keysize,
+			  const struct fscrypt_key **payload_ret)
+{
+	char *description;
+	struct key *key;
+	const struct user_key_payload *ukp;
+	const struct fscrypt_key *payload;
+
+	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+				FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
+	if (!description)
+		return ERR_PTR(-ENOMEM);
+
+	key = request_key(&key_type_logon, description, NULL);
+	kfree(description);
+	if (IS_ERR(key))
+		return key;
+
+	down_read(&key->sem);
+	ukp = user_key_payload_locked(key);
+
+	if (!ukp) /* was the key revoked before we acquired its semaphore? */
+		goto invalid;
+
+	payload = (const struct fscrypt_key *)ukp->data;
+
+	if (ukp->datalen != sizeof(struct fscrypt_key) ||
+	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+		fscrypt_warn(NULL,
+			     "key with description '%s' has invalid payload",
+			     key->description);
+		goto invalid;
+	}
+
+	if (payload->size < min_keysize) {
+		fscrypt_warn(NULL,
+			     "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+			     key->description, payload->size, min_keysize);
+		goto invalid;
+	}
+
+	*payload_ret = payload;
+	return key;
+
+invalid:
+	up_read(&key->sem);
+	key_put(key);
+	return ERR_PTR(-ENOKEY);
+}
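+
+/*
+ * For context, userspace conventionally provisions such a key with the
+ * keyutils tools, e.g. (illustrative; the hex string is the 8-byte key
+ * descriptor and the payload is a binary struct fscrypt_key):
+ *
+ *	keyctl add logon fscrypt:0123456789abcdef "$payload" @s
+ */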
+
+/* Master key referenced by DIRECT_KEY policy */
+struct fscrypt_direct_key {
+	struct hlist_node		dk_node;
+	refcount_t			dk_refcount;
+	const struct fscrypt_mode	*dk_mode;
+	struct crypto_skcipher		*dk_ctfm;
+	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
+};
+
+static void free_direct_key(struct fscrypt_direct_key *dk)
+{
+	if (dk) {
+		crypto_free_skcipher(dk->dk_ctfm);
+		kzfree(dk);
+	}
+}
+
+void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
+{
+	if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
+		return;
+	hash_del(&dk->dk_node);
+	spin_unlock(&fscrypt_direct_keys_lock);
+
+	free_direct_key(dk);
+}
+
+/*
+ * Find/insert the given key into the fscrypt_direct_keys table.  If found, it
+ * is returned with elevated refcount, and 'to_insert' is freed if non-NULL.  If
+ * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
+ * NULL is returned.
+ */
+static struct fscrypt_direct_key *
+find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
+			  const u8 *raw_key, const struct fscrypt_info *ci)
+{
+	unsigned long hash_key;
+	struct fscrypt_direct_key *dk;
+
+	/*
+	 * Careful: to avoid potentially leaking secret key bytes via timing
+	 * information, we must key the hash table by descriptor rather than by
+	 * raw key, and use crypto_memneq() when comparing raw keys.
+	 */
+
+	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
+	memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor,
+	       sizeof(hash_key));
+
+	spin_lock(&fscrypt_direct_keys_lock);
+	hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
+		if (memcmp(ci->ci_policy.v1.master_key_descriptor,
+			   dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
+			continue;
+		if (ci->ci_mode != dk->dk_mode)
+			continue;
+		if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
+			continue;
+		/* using existing tfm with same (descriptor, mode, raw_key) */
+		refcount_inc(&dk->dk_refcount);
+		spin_unlock(&fscrypt_direct_keys_lock);
+		free_direct_key(to_insert);
+		return dk;
+	}
+	if (to_insert)
+		hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
+	spin_unlock(&fscrypt_direct_keys_lock);
+	return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_direct_key *
+fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+{
+	struct fscrypt_direct_key *dk;
+	int err;
+
+	/* Is there already a tfm for this key? */
+	dk = find_or_insert_direct_key(NULL, raw_key, ci);
+	if (dk)
+		return dk;
+
+	/* Nope, allocate one. */
+	dk = kzalloc(sizeof(*dk), GFP_NOFS);
+	if (!dk)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&dk->dk_refcount, 1);
+	dk->dk_mode = ci->ci_mode;
+	dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
+						ci->ci_inode);
+	if (IS_ERR(dk->dk_ctfm)) {
+		err = PTR_ERR(dk->dk_ctfm);
+		dk->dk_ctfm = NULL;
+		goto err_free_dk;
+	}
+	memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor,
+	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
+	memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
+
+	return find_or_insert_direct_key(dk, raw_key, ci);
+
+err_free_dk:
+	free_direct_key(dk);
+	return ERR_PTR(err);
+}
+
+/* v1 policy, DIRECT_KEY: use the master key directly */
+static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+				    const u8 *raw_master_key)
+{
+	const struct fscrypt_mode *mode = ci->ci_mode;
+	struct fscrypt_direct_key *dk;
+
+	if (!fscrypt_mode_supports_direct_key(mode)) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with %s",
+			     mode->friendly_name);
+		return -EINVAL;
+	}
+
+	if (ci->ci_policy.v1.contents_encryption_mode !=
+	    ci->ci_policy.v1.filenames_encryption_mode) {
+		fscrypt_warn(ci->ci_inode,
+			     "Direct key mode not allowed with different contents and filenames modes");
+		return -EINVAL;
+	}
+
+	/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+	if (WARN_ON(mode->needs_essiv))
+		return -EINVAL;
+
+	dk = fscrypt_get_direct_key(ci, raw_master_key);
+	if (IS_ERR(dk))
+		return PTR_ERR(dk);
+	ci->ci_direct_key = dk;
+	ci->ci_ctfm = dk->dk_ctfm;
+	return 0;
+}
+
+/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
+static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+				     const u8 *raw_master_key)
+{
+	u8 *derived_key;
+	int err;
+
+	/*
+	 * This cannot be a stack buffer because it will be passed to the
+	 * scatterlist crypto API during derive_key_aes().
+	 */
+	derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+	if (!derived_key)
+		return -ENOMEM;
+
+	err = derive_key_aes(raw_master_key, ci->ci_nonce,
+			     derived_key, ci->ci_mode->keysize);
+	if (err)
+		goto out;
+
+	err = fscrypt_set_derived_key(ci, derived_key);
+out:
+	kzfree(derived_key);
+	return err;
+}
+
+int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+{
+	if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+		return setup_v1_file_key_direct(ci, raw_master_key);
+	return setup_v1_file_key_derived(ci, raw_master_key);
+}
+
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+{
+	struct key *key;
+	const struct fscrypt_key *payload;
+	int err;
+
+	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
+					ci->ci_policy.v1.master_key_descriptor,
+					ci->ci_mode->keysize, &payload);
+	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
+		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+						ci->ci_policy.v1.master_key_descriptor,
+						ci->ci_mode->keysize, &payload);
+	}
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	err = fscrypt_setup_v1_file_key(ci, payload->raw);
+	up_read(&key->sem);
+	key_put(key);
+	return err;
+}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index c6d431a..4072ba6 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -5,8 +5,9 @@
  * Copyright (C) 2015, Google, Inc.
  * Copyright (C) 2015, Motorola Mobility.
  *
- * Written by Michael Halcrow, 2015.
+ * Originally written by Michael Halcrow, 2015.
  * Modified by Jaegeuk Kim, 2015.
+ * Modified by Eric Biggers, 2019 for v2 policy support.
  */
 
 #include <linux/random.h>
@@ -14,84 +15,315 @@
 #include <linux/mount.h>
 #include "fscrypt_private.h"
 
-/*
- * check whether an encryption policy is consistent with an encryption context
+/**
+ * fscrypt_policies_equal - check whether two encryption policies are the same
+ *
+ * Return: %true if equal, else %false
  */
-static bool is_encryption_context_consistent_with_policy(
-				const struct fscrypt_context *ctx,
-				const struct fscrypt_policy *policy)
+bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+			    const union fscrypt_policy *policy2)
 {
-	return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
-		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(ctx->flags == policy->flags) &&
-		(ctx->contents_encryption_mode ==
-		 policy->contents_encryption_mode) &&
-		(ctx->filenames_encryption_mode ==
-		 policy->filenames_encryption_mode);
+	if (policy1->version != policy2->version)
+		return false;
+
+	return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
 }
 
-static int create_encryption_context_from_policy(struct inode *inode,
-				const struct fscrypt_policy *policy)
+/**
+ * fscrypt_supported_policy - check whether an encryption policy is supported
+ *
+ * Given an encryption policy, check whether all its encryption modes and other
+ * settings are supported by this kernel.  (But we don't currently don't check
+ * for crypto API support here, so attempting to use an algorithm not configured
+ * into the crypto API will still fail later.)
+ *
+ * Return: %true if supported, else %false
+ */
+bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+			      const struct inode *inode)
 {
-	struct fscrypt_context ctx;
+	switch (policy_u->version) {
+	case FSCRYPT_POLICY_V1: {
+		const struct fscrypt_policy_v1 *policy = &policy_u->v1;
 
-	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
-					FS_KEY_DESCRIPTOR_SIZE);
+		if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
 
-	if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
-				     policy->filenames_encryption_mode))
+		if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		return true;
+	}
+	case FSCRYPT_POLICY_V2: {
+		const struct fscrypt_policy_v2 *policy = &policy_u->v2;
+
+		if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+					     policy->filenames_encryption_mode)) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption modes (contents %d, filenames %d)",
+				     policy->contents_encryption_mode,
+				     policy->filenames_encryption_mode);
+			return false;
+		}
+
+		if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+			fscrypt_warn(inode,
+				     "Unsupported encryption flags (0x%02x)",
+				     policy->flags);
+			return false;
+		}
+
+		if (memchr_inv(policy->__reserved, 0,
+			       sizeof(policy->__reserved))) {
+			fscrypt_warn(inode,
+				     "Reserved bits set in encryption policy");
+			return false;
+		}
+
+		return true;
+	}
+	}
+	return false;
+}
+
+/**
+ * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy
+ *
+ * Create an fscrypt_context for an inode that is being assigned the given
+ * encryption policy.  A new nonce is randomly generated.
+ *
+ * Return: the size of the new context in bytes.
+ */
+static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
+					   const union fscrypt_policy *policy_u)
+{
+	memset(ctx_u, 0, sizeof(*ctx_u));
+
+	switch (policy_u->version) {
+	case FSCRYPT_POLICY_V1: {
+		const struct fscrypt_policy_v1 *policy = &policy_u->v1;
+		struct fscrypt_context_v1 *ctx = &ctx_u->v1;
+
+		ctx->version = FSCRYPT_CONTEXT_V1;
+		ctx->contents_encryption_mode =
+			policy->contents_encryption_mode;
+		ctx->filenames_encryption_mode =
+			policy->filenames_encryption_mode;
+		ctx->flags = policy->flags;
+		memcpy(ctx->master_key_descriptor,
+		       policy->master_key_descriptor,
+		       sizeof(ctx->master_key_descriptor));
+		get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+		return sizeof(*ctx);
+	}
+	case FSCRYPT_POLICY_V2: {
+		const struct fscrypt_policy_v2 *policy = &policy_u->v2;
+		struct fscrypt_context_v2 *ctx = &ctx_u->v2;
+
+		ctx->version = FSCRYPT_CONTEXT_V2;
+		ctx->contents_encryption_mode =
+			policy->contents_encryption_mode;
+		ctx->filenames_encryption_mode =
+			policy->filenames_encryption_mode;
+		ctx->flags = policy->flags;
+		memcpy(ctx->master_key_identifier,
+		       policy->master_key_identifier,
+		       sizeof(ctx->master_key_identifier));
+		get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+		return sizeof(*ctx);
+	}
+	}
+	BUG();
+}
+
+/**
+ * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy
+ *
+ * Given an fscrypt_context, build the corresponding fscrypt_policy.
+ *
+ * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized
+ * version number or size.
+ *
+ * This does *not* validate the settings within the policy itself, e.g. the
+ * modes, flags, and reserved bits.  Use fscrypt_supported_policy() for that.
+ */
+int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
+				const union fscrypt_context *ctx_u,
+				int ctx_size)
+{
+	memset(policy_u, 0, sizeof(*policy_u));
+
+	if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u))
 		return -EINVAL;
 
-	if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+	switch (ctx_u->version) {
+	case FSCRYPT_CONTEXT_V1: {
+		const struct fscrypt_context_v1 *ctx = &ctx_u->v1;
+		struct fscrypt_policy_v1 *policy = &policy_u->v1;
+
+		policy->version = FSCRYPT_POLICY_V1;
+		policy->contents_encryption_mode =
+			ctx->contents_encryption_mode;
+		policy->filenames_encryption_mode =
+			ctx->filenames_encryption_mode;
+		policy->flags = ctx->flags;
+		memcpy(policy->master_key_descriptor,
+		       ctx->master_key_descriptor,
+		       sizeof(policy->master_key_descriptor));
+		return 0;
+	}
+	case FSCRYPT_CONTEXT_V2: {
+		const struct fscrypt_context_v2 *ctx = &ctx_u->v2;
+		struct fscrypt_policy_v2 *policy = &policy_u->v2;
+
+		policy->version = FSCRYPT_POLICY_V2;
+		policy->contents_encryption_mode =
+			ctx->contents_encryption_mode;
+		policy->filenames_encryption_mode =
+			ctx->filenames_encryption_mode;
+		policy->flags = ctx->flags;
+		memcpy(policy->__reserved, ctx->__reserved,
+		       sizeof(policy->__reserved));
+		memcpy(policy->master_key_identifier,
+		       ctx->master_key_identifier,
+		       sizeof(policy->master_key_identifier));
+		return 0;
+	}
+	}
+	/* unreachable */
+	return -EINVAL;
+}
+
+/* Retrieve an inode's encryption policy */
+static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
+{
+	const struct fscrypt_info *ci;
+	union fscrypt_context ctx;
+	int ret;
+
+	ci = READ_ONCE(inode->i_crypt_info);
+	if (ci) {
+		/* key available, use the cached policy */
+		*policy = ci->ci_policy;
+		return 0;
+	}
+
+	if (!IS_ENCRYPTED(inode))
+		return -ENODATA;
+
+	ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (ret < 0)
+		return (ret == -ERANGE) ? -EINVAL : ret;
+
+	return fscrypt_policy_from_context(policy, &ctx, ret);
+}
+
+static int set_encryption_policy(struct inode *inode,
+				 const union fscrypt_policy *policy)
+{
+	union fscrypt_context ctx;
+	int ctxsize;
+	int err;
+
+	if (!fscrypt_supported_policy(policy, inode))
 		return -EINVAL;
 
-	ctx.contents_encryption_mode = policy->contents_encryption_mode;
-	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
-	ctx.flags = policy->flags;
-	BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
-	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		/*
+		 * The original encryption policy version provided no way of
+		 * verifying that the correct master key was supplied, which was
+		 * insecure in scenarios where multiple users have access to the
+		 * same encrypted files (even just read-only access).  The new
+		 * encryption policy version fixes this and also implies use of
+		 * an improved key derivation function and allows non-root users
+		 * to securely remove keys.  So as long as compatibility with
+		 * old kernels isn't required, it is recommended to use the new
+		 * policy version for all new encrypted directories.
+		 */
+		pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n",
+			     current->comm, current->pid);
+		break;
+	case FSCRYPT_POLICY_V2:
+		err = fscrypt_verify_key_added(inode->i_sb,
+					       policy->v2.master_key_identifier);
+		if (err)
+			return err;
+		break;
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
 
-	return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+	ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
+
+	return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
 }
 
 int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 {
-	struct fscrypt_policy policy;
+	union fscrypt_policy policy;
+	union fscrypt_policy existing_policy;
 	struct inode *inode = file_inode(filp);
+	u8 version;
+	int size;
 	int ret;
-	struct fscrypt_context ctx;
 
-	if (copy_from_user(&policy, arg, sizeof(policy)))
+	if (get_user(policy.version, (const u8 __user *)arg))
 		return -EFAULT;
 
+	size = fscrypt_policy_size(&policy);
+	if (size <= 0)
+		return -EINVAL;
+
+	/*
+	 * We should just copy the remaining 'size - 1' bytes here, but a
+	 * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to
+	 * think that size can be 0 here (despite the check above!) *and* that
+	 * it's a compile-time constant.  Thus it would think copy_from_user()
+	 * is passed compile-time constant ULONG_MAX, causing the compile-time
+	 * buffer overflow check to fail, breaking the build. This only occurred
+	 * when building an i386 kernel with -Os and branch profiling enabled.
+	 *
+	 * Work around it by just copying the first byte again...
+	 */
+	version = policy.version;
+	if (copy_from_user(&policy, arg, size))
+		return -EFAULT;
+	policy.version = version;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (policy.version != 0)
-		return -EINVAL;
-
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
 	inode_lock(inode);
 
-	ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	ret = fscrypt_get_policy(inode, &existing_policy);
 	if (ret == -ENODATA) {
 		if (!S_ISDIR(inode->i_mode))
 			ret = -ENOTDIR;
+		else if (IS_DEADDIR(inode))
+			ret = -ENOENT;
 		else if (!inode->i_sb->s_cop->empty_dir(inode))
 			ret = -ENOTEMPTY;
 		else
-			ret = create_encryption_context_from_policy(inode,
-								    &policy);
-	} else if (ret == sizeof(ctx) &&
-		   is_encryption_context_consistent_with_policy(&ctx,
-								&policy)) {
-		/* The file already uses the same encryption policy. */
-		ret = 0;
-	} else if (ret >= 0 || ret == -ERANGE) {
+			ret = set_encryption_policy(inode, &policy);
+	} else if (ret == -EINVAL ||
+		   (ret == 0 && !fscrypt_policies_equal(&policy,
+							&existing_policy))) {
 		/* The file already uses a different encryption policy. */
 		ret = -EEXIST;
 	}
@@ -103,37 +335,57 @@
 }
 EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
 
+/* Original ioctl version; can only get the original policy version */
 int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
 {
-	struct inode *inode = file_inode(filp);
-	struct fscrypt_context ctx;
-	struct fscrypt_policy policy;
-	int res;
+	union fscrypt_policy policy;
+	int err;
 
-	if (!IS_ENCRYPTED(inode))
-		return -ENODATA;
+	err = fscrypt_get_policy(file_inode(filp), &policy);
+	if (err)
+		return err;
 
-	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
-	if (res < 0 && res != -ERANGE)
-		return res;
-	if (res != sizeof(ctx))
-		return -EINVAL;
-	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+	if (policy.version != FSCRYPT_POLICY_V1)
 		return -EINVAL;
 
-	policy.version = 0;
-	policy.contents_encryption_mode = ctx.contents_encryption_mode;
-	policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy.flags = ctx.flags;
-	memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
-				FS_KEY_DESCRIPTOR_SIZE);
-
-	if (copy_to_user(arg, &policy, sizeof(policy)))
+	if (copy_to_user(arg, &policy, sizeof(policy.v1)))
 		return -EFAULT;
 	return 0;
 }
 EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
 
+/* Extended ioctl version; can get policies of any version */
+int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
+{
+	struct fscrypt_get_policy_ex_arg arg;
+	union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy;
+	size_t policy_size;
+	int err;
+
+	/* arg is policy_size, then policy */
+	BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0);
+	BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) !=
+		     offsetof(typeof(arg), policy));
+	BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy));
+
+	err = fscrypt_get_policy(file_inode(filp), policy);
+	if (err)
+		return err;
+	policy_size = fscrypt_policy_size(policy);
+
+	if (copy_from_user(&arg, uarg, sizeof(arg.policy_size)))
+		return -EFAULT;
+
+	if (policy_size > arg.policy_size)
+		return -EOVERFLOW;
+	arg.policy_size = policy_size;
+
+	if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);
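+
+/*
+ * Hypothetical userspace usage of the extended ioctl ('fd' is assumed to
+ * be an open file descriptor on the encrypted file or directory, and
+ * use_identifier() is a placeholder):
+ *
+ *	struct fscrypt_get_policy_ex_arg arg;
+ *
+ *	arg.policy_size = sizeof(arg.policy);
+ *	if (ioctl(fd, FS_IOC_GET_ENCRYPTION_POLICY_EX, &arg) == 0 &&
+ *	    arg.policy.version == FSCRYPT_POLICY_V2)
+ *		use_identifier(arg.policy.v2.master_key_identifier);
+ *
+ * The caller initializes policy_size to the size of its buffer; the
+ * kernel fails with EOVERFLOW if the actual policy is larger.
+ */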
+
 /**
  * fscrypt_has_permitted_context() - is a file's encryption policy permitted
  *				     within its directory?
@@ -151,15 +403,12 @@
  * malicious offline violations of this constraint, while the link and rename
  * checks are needed to prevent online violations of this constraint.
  *
- * Return: 1 if permitted, 0 if forbidden.  If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
  */
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 {
-	const struct fscrypt_operations *cops = parent->i_sb->s_cop;
-	const struct fscrypt_info *parent_ci, *child_ci;
-	struct fscrypt_context parent_ctx, child_ctx;
-	int res;
+	union fscrypt_policy parent_policy, child_policy;
+	int err;
 
 	/* No restrictions on file types which are never encrypted */
 	if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
@@ -189,40 +438,22 @@
 	 * In any case, if an unexpected error occurs, fall back to "forbidden".
 	 */
 
-	res = fscrypt_get_encryption_info(parent);
-	if (res)
+	err = fscrypt_get_encryption_info(parent);
+	if (err)
 		return 0;
-	res = fscrypt_get_encryption_info(child);
-	if (res)
-		return 0;
-	parent_ci = parent->i_crypt_info;
-	child_ci = child->i_crypt_info;
-
-	if (parent_ci && child_ci) {
-		return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
-			      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-			(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
-			(parent_ci->ci_filename_mode ==
-			 child_ci->ci_filename_mode) &&
-			(parent_ci->ci_flags == child_ci->ci_flags);
-	}
-
-	res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx));
-	if (res != sizeof(parent_ctx))
+	err = fscrypt_get_encryption_info(child);
+	if (err)
 		return 0;
 
-	res = cops->get_context(child, &child_ctx, sizeof(child_ctx));
-	if (res != sizeof(child_ctx))
+	err = fscrypt_get_policy(parent, &parent_policy);
+	if (err)
 		return 0;
 
-	return memcmp(parent_ctx.master_key_descriptor,
-		      child_ctx.master_key_descriptor,
-		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(parent_ctx.contents_encryption_mode ==
-		 child_ctx.contents_encryption_mode) &&
-		(parent_ctx.filenames_encryption_mode ==
-		 child_ctx.filenames_encryption_mode) &&
-		(parent_ctx.flags == child_ctx.flags);
+	err = fscrypt_get_policy(child, &child_policy);
+	if (err)
+		return 0;
+
+	return fscrypt_policies_equal(&parent_policy, &child_policy);
 }
 EXPORT_SYMBOL(fscrypt_has_permitted_context);
 
@@ -238,7 +469,8 @@
 int fscrypt_inherit_context(struct inode *parent, struct inode *child,
 						void *fs_data, bool preload)
 {
-	struct fscrypt_context ctx;
+	union fscrypt_context ctx;
+	int ctxsize;
 	struct fscrypt_info *ci;
 	int res;
 
@@ -246,20 +478,14 @@
 	if (res < 0)
 		return res;
 
-	ci = parent->i_crypt_info;
+	ci = READ_ONCE(parent->i_crypt_info);
 	if (ci == NULL)
 		return -ENOKEY;
 
-	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	ctx.contents_encryption_mode = ci->ci_data_mode;
-	ctx.filenames_encryption_mode = ci->ci_filename_mode;
-	ctx.flags = ci->ci_flags;
-	memcpy(ctx.master_key_descriptor, ci->ci_master_key,
-	       FS_KEY_DESCRIPTOR_SIZE);
-	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+	ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
+
 	BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
-	res = parent->i_sb->s_cop->set_context(child, &ctx,
-						sizeof(ctx), fs_data);
+	res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
 	if (res)
 		return res;
 	return preload ? fscrypt_get_encryption_info(child): 0;
diff --git a/fs/d_path.c b/fs/d_path.c
index e8fce6b..0f1fc17 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -116,8 +116,10 @@
 				vfsmnt = &mnt->mnt;
 				continue;
 			}
-			if (!error)
-				error = is_mounted(vfsmnt) ? 1 : 2;
+			if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+				error = 1;	// absolute root
+			else
+				error = 2;	// detached or not attached yet
 			break;
 		}
 		parent = dentry->d_parent;
@@ -316,7 +318,6 @@
 		end = ERR_PTR(-ENAMETOOLONG);
 	return end;
 }
-EXPORT_SYMBOL(simple_dname);
 
 /*
  * Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/dax.c b/fs/dax.c
index 3a2682a..2cc43cd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/dax.c - Direct Access filesystem code
  * Copyright (c) 2013-2014 Intel Corporation
  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/atomic.h>
@@ -33,11 +25,22 @@
 #include <linux/sizes.h>
 #include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
-#include "internal.h"
+#include <asm/pgalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/fs_dax.h>
 
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+	if (pe_size == PE_SIZE_PTE)
+		return PAGE_SHIFT - PAGE_SHIFT;
+	if (pe_size == PE_SIZE_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	if (pe_size == PE_SIZE_PUD)
+		return PUD_SHIFT - PAGE_SHIFT;
+	return ~0;
+}
+
 /* We choose 4096 entries - same as per-zone page wait tables */
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -46,6 +49,9 @@
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 #define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
 
+/* The order of a PMD entry */
+#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
+
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
 static int __init init_dax_wait_table(void)
@@ -59,63 +65,77 @@
 fs_initcall(init_dax_wait_table);
 
 /*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking.  In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages.  We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking.  In total four special bits.
  *
  * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
-#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT	(4)
+#define DAX_LOCKED	(1UL << 0)
+#define DAX_PMD		(1UL << 1)
+#define DAX_ZERO_PAGE	(1UL << 2)
+#define DAX_EMPTY	(1UL << 3)
 
-static unsigned long dax_radix_pfn(void *entry)
+static unsigned long dax_to_pfn(void *entry)
 {
-	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
-	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
 }
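+
+/*
+ * Example of the encoding (a sketch): for pfn 0x1234 with DAX_PMD set,
+ * dax_make_entry() stores the value (0x1234 << DAX_SHIFT) | DAX_PMD as an
+ * XArray value entry, and dax_to_pfn() recovers 0x1234 by shifting the
+ * flag bits back out.
+ */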
 
-static unsigned int dax_radix_order(void *entry)
+static bool dax_is_locked(void *entry)
 {
-	if ((unsigned long)entry & RADIX_DAX_PMD)
-		return PMD_SHIFT - PAGE_SHIFT;
+	return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+	if (xa_to_value(entry) & DAX_PMD)
+		return PMD_ORDER;
 	return 0;
 }
 
-static int dax_is_pmd_entry(void *entry)
+static unsigned long dax_is_pmd_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_PMD;
+	return xa_to_value(entry) & DAX_PMD;
 }
 
-static int dax_is_pte_entry(void *entry)
+static bool dax_is_pte_entry(void *entry)
 {
-	return !((unsigned long)entry & RADIX_DAX_PMD);
+	return !(xa_to_value(entry) & DAX_PMD);
 }
 
 static int dax_is_zero_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+	return xa_to_value(entry) & DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_EMPTY;
+	return xa_to_value(entry) & DAX_EMPTY;
 }
 
 /*
- * DAX radix tree locking
+ * Return true if the entry that was found is of a smaller order than the
+ * entry we were looking for.
+ */
+static bool dax_is_conflict(void *entry)
+{
+	return entry == XA_RETRY_ENTRY;
+}
+
+/*
+ * DAX page cache entry locking
  */
 struct exceptional_entry_key {
-	struct address_space *mapping;
+	struct xarray *xa;
 	pgoff_t entry_start;
 };
 
@@ -124,10 +144,11 @@
 	struct exceptional_entry_key key;
 };
 
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
-		pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+		void *entry, struct exceptional_entry_key *key)
 {
 	unsigned long hash;
+	unsigned long index = xas->xa_index;
 
 	/*
 	 * If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +157,21 @@
 	 */
 	if (dax_is_pmd_entry(entry))
 		index &= ~PG_PMD_COLOUR;
-
-	key->mapping = mapping;
+	key->xa = xas->xa;
 	key->entry_start = index;
 
-	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
 	return wait_table + hash;
 }
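+
+/*
+ * Example: with 4KiB pages and 2MiB PMDs, PG_PMD_COLOUR is 0x1ff, so a
+ * PTE-sized fault at index 0x205 within a locked PMD entry waits on the
+ * same queue as the PMD entry itself (entry_start 0x200).
+ */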
 
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
-				       int sync, void *keyp)
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+		unsigned int mode, int sync, void *keyp)
 {
 	struct exceptional_entry_key *key = keyp;
 	struct wait_exceptional_entry_queue *ewait =
 		container_of(wait, struct wait_exceptional_entry_queue, wait);
 
-	if (key->mapping != ewait->key.mapping ||
+	if (key->xa != ewait->key.xa ||
 	    key->entry_start != ewait->key.entry_start)
 		return 0;
 	return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +182,12 @@
  * The important information it's conveying is whether the entry at
  * this index used to be a PMD entry.
  */
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-		pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
 {
 	struct exceptional_entry_key key;
 	wait_queue_head_t *wq;
 
-	wq = dax_entry_waitqueue(mapping, index, entry, &key);
+	wq = dax_entry_waitqueue(xas, entry, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,58 +200,18 @@
 }
 
 /*
- * Check whether the given slot is locked.  Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = (unsigned long)
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-	return entry & RADIX_DAX_ENTRY_LOCK;
-}
-
-/*
- * Mark the given slot as locked.  Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = (unsigned long)
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
-	entry |= RADIX_DAX_ENTRY_LOCK;
-	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
-	return (void *)entry;
-}
-
-/*
- * Mark the given slot as unlocked.  Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = (unsigned long)
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
-	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
-	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
-	return (void *)entry;
-}
-
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry);
-
-/*
- * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it.  The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.  The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a dax_is_conflict entry.
  *
  * Must be called with the i_pages lock held.
  */
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
-	void *entry, **slot;
+	void *entry;
 	struct wait_exceptional_entry_queue ewait;
 	wait_queue_head_t *wq;
 
@@ -240,82 +219,83 @@
 	ewait.wait.func = wake_exceptional_entry_func;
 
 	for (;;) {
-		bool revalidate;
-
-		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
-					  &slot);
-		if (!entry ||
-		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
-		    !slot_locked(mapping, slot)) {
-			if (slotp)
-				*slotp = slot;
+		entry = xas_find_conflict(xas);
+		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 			return entry;
-		}
+		if (dax_entry_order(entry) < order)
+			return XA_RETRY_ENTRY;
+		if (!dax_is_locked(entry))
+			return entry;
 
-		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
-		xa_unlock_irq(&mapping->i_pages);
-		revalidate = wait_fn();
+		xas_unlock_irq(xas);
+		xas_reset(xas);
+		schedule();
 		finish_wait(wq, &ewait.wait);
-		xa_lock_irq(&mapping->i_pages);
-		if (revalidate) {
-			put_unlocked_mapping_entry(mapping, index, entry);
-			return ERR_PTR(-EAGAIN);
-		}
+		xas_lock_irq(xas);
 	}
 }
 
-static bool entry_wait(void)
-{
-	schedule();
-	/*
-	 * Never return an ERR_PTR() from
-	 * __get_unlocked_mapping_entry(), just keep looping.
-	 */
-	return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp)
-{
-	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	void *entry, **slot;
-
-	xa_lock_irq(&mapping->i_pages);
-	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
-			 !slot_locked(mapping, slot))) {
-		xa_unlock_irq(&mapping->i_pages);
-		return;
-	}
-	unlock_slot(mapping, slot);
-	xa_unlock_irq(&mapping->i_pages);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
-		pgoff_t index)
-{
-	unlock_mapping_entry(mapping, index);
-}
-
 /*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ * The only thing keeping the address space around is the i_pages lock
+ * (it's cycled in clear_inode() after removing the entries from i_pages).
+ * After we call xas_unlock_irq(), we cannot touch xas->xa.
  */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry)
+static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 {
-	if (!entry)
-		return;
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
 
-	/* We have to wake up next waiter for the radix tree entry lock */
-	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+	/*
+	 * Unlike get_unlocked_entry() there is no guarantee that this
+	 * path ever successfully retrieves an unlocked entry before an
+	 * inode dies. Perform a non-exclusive wait in case this path
+	 * never successfully performs its own wakeup.
+	 */
+	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+	xas_unlock_irq(xas);
+	schedule();
+	finish_wait(wq, &ewait.wait);
+}
+
+static void put_unlocked_entry(struct xa_state *xas, void *entry)
+{
+	/* If we were the only waiter woken, wake the next one */
+	if (entry && !dax_is_conflict(entry))
+		dax_wake_entry(xas, entry, false);
+}
+
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
+{
+	void *old;
+
+	BUG_ON(dax_is_locked(entry));
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	old = xas_store(xas, entry);
+	xas_unlock_irq(xas);
+	BUG_ON(!dax_is_locked(old));
+	dax_wake_entry(xas, entry, false);
+}
+
+/*
+ * Return: The entry stored at this location before it was locked.
+ */
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
 }
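The helpers above replace the old slot-locking protocol with a lock bit stored in the XArray value itself. A minimal sketch of the caller-side pattern they imply (illustrative only; DAX_LOCKED, dax_is_conflict() and the helpers are as defined elsewhere in fs/dax.c):

	/* Sketch: typical use of the new entry-locking helpers. */
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);	/* sleeps while the entry is locked */
	if (entry && !dax_is_conflict(entry)) {
		dax_lock_entry(&xas, entry);	/* stores entry | DAX_LOCKED */
		xas_unlock_irq(&xas);
		/* ... operate on the entry, protected only by its lock bit ... */
		dax_unlock_entry(&xas, entry);	/* clears the bit, wakes one waiter */
	} else {
		put_unlocked_entry(&xas, entry);	/* wake the next exclusive waiter */
		xas_unlock_irq(&xas);
	}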
 
 static unsigned long dax_entry_size(void *entry)
@@ -330,9 +310,9 @@
 		return PAGE_SIZE;
 }
 
-static unsigned long dax_radix_end_pfn(void *entry)
+static unsigned long dax_end_pfn(void *entry)
 {
-	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
 }
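A worked example of the arithmetic, assuming x86-64 sizes: for a PMD entry, dax_entry_size() is PMD_SIZE (2MiB), so dax_end_pfn() returns the start pfn plus 512, and the for_each_mapped_pfn() macro below visits each of those 512 pfns:

	/* Illustrative only: walking the pages behind a PMD entry (x86-64). */
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		/* visits dax_to_pfn(entry) .. dax_to_pfn(entry) + 511 */
		struct page *page = pfn_to_page(pfn);
		/* e.g. update page->mapping, as dax_associate_entry() does */
	}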
 
 /*
@@ -340,8 +320,8 @@
  * 'empty' and 'zero' entries.
  */
 #define for_each_mapped_pfn(entry, pfn) \
-	for (pfn = dax_radix_pfn(entry); \
-			pfn < dax_radix_end_pfn(entry); pfn++)
+	for (pfn = dax_to_pfn(entry); \
+			pfn < dax_end_pfn(entry); pfn++)
 
 /*
  * TODO: for reflink+dax we need a way to associate a single page with
@@ -398,31 +378,25 @@
 	return NULL;
 }
 
-static bool entry_wait_revalidate(void)
+/*
+ * dax_lock_page - Lock the DAX entry corresponding to a page
+ * @page: The page whose entry we want to lock
+ *
+ * Context: Process context.
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * not be locked.
+ */
+dax_entry_t dax_lock_page(struct page *page)
 {
-	rcu_read_unlock();
-	schedule();
-	rcu_read_lock();
+	XA_STATE(xas, NULL, 0);
+	void *entry;
 
-	/*
-	 * Tell __get_unlocked_mapping_entry() to take a break, we need
-	 * to revalidate page->mapping after dropping locks
-	 */
-	return true;
-}
-
-bool dax_lock_mapping_entry(struct page *page)
-{
-	pgoff_t index;
-	struct inode *inode;
-	bool did_lock = false;
-	void *entry = NULL, **slot;
-	struct address_space *mapping;
-
+	/* Ensure page->mapping isn't freed while we look at it */
 	rcu_read_lock();
 	for (;;) {
-		mapping = READ_ONCE(page->mapping);
+		struct address_space *mapping = READ_ONCE(page->mapping);
 
+		entry = NULL;
 		if (!mapping || !dax_mapping(mapping))
 			break;
 
@@ -433,100 +407,92 @@
 		 * otherwise we would not have a valid pfn_to_page()
 		 * translation.
 		 */
-		inode = mapping->host;
-		if (S_ISCHR(inode->i_mode)) {
-			did_lock = true;
+		entry = (void *)~0UL;
+		if (S_ISCHR(mapping->host->i_mode))
 			break;
-		}
 
-		xa_lock_irq(&mapping->i_pages);
+		xas.xa = &mapping->i_pages;
+		xas_lock_irq(&xas);
 		if (mapping != page->mapping) {
-			xa_unlock_irq(&mapping->i_pages);
+			xas_unlock_irq(&xas);
 			continue;
 		}
-		index = page->index;
-
-		entry = __get_unlocked_mapping_entry(mapping, index, &slot,
-				entry_wait_revalidate);
-		if (!entry) {
-			xa_unlock_irq(&mapping->i_pages);
-			break;
-		} else if (IS_ERR(entry)) {
-			xa_unlock_irq(&mapping->i_pages);
-			WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+		xas_set(&xas, page->index);
+		entry = xas_load(&xas);
+		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(&xas, entry);
+			rcu_read_lock();
 			continue;
 		}
-		lock_slot(mapping, slot);
-		did_lock = true;
-		xa_unlock_irq(&mapping->i_pages);
+		dax_lock_entry(&xas, entry);
+		xas_unlock_irq(&xas);
 		break;
 	}
 	rcu_read_unlock();
-
-	return did_lock;
+	return (dax_entry_t)entry;
 }
 
-void dax_unlock_mapping_entry(struct page *page)
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
 {
 	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
+	XA_STATE(xas, &mapping->i_pages, page->index);
 
-	if (S_ISCHR(inode->i_mode))
+	if (S_ISCHR(mapping->host->i_mode))
 		return;
 
-	unlock_mapping_entry(mapping, page->index);
+	dax_unlock_entry(&xas, (void *)cookie);
 }
 
 /*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
  *
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
  *
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries.  This happens for both PMD zero pages as
+ * well as PMD empty entries.
  *
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them.  We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them.  We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
  *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR.  Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
+ * overlap with xarray value entries.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
-		unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+		struct address_space *mapping, unsigned int order)
 {
-	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
-	void *entry, **slot;
+	unsigned long index = xas->xa_index;
+	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+	void *entry;
 
-restart:
-	xa_lock_irq(&mapping->i_pages);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-
-	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
-		entry = ERR_PTR(-EIO);
-		goto out_unlock;
-	}
+retry:
+	xas_lock_irq(xas);
+	entry = get_unlocked_entry(xas, order);
 
 	if (entry) {
-		if (size_flag & RADIX_DAX_PMD) {
-			if (dax_is_pte_entry(entry)) {
-				put_unlocked_mapping_entry(mapping, index,
-						entry);
-				entry = ERR_PTR(-EEXIST);
-				goto out_unlock;
-			}
-		} else { /* trying to grab a PTE entry */
+		if (dax_is_conflict(entry))
+			goto fallback;
+		if (!xa_is_value(entry)) {
+			xas_set_err(xas, EIO);
+			goto out_unlock;
+		}
+
+		if (order == 0) {
 			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
@@ -535,87 +501,61 @@
 		}
 	}
 
-	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry || pmd_downgrade) {
-		int err;
+	if (pmd_downgrade) {
+		/*
+		 * Make sure 'entry' remains valid while we drop
+		 * the i_pages lock.
+		 */
+		dax_lock_entry(xas, entry);
 
-		if (pmd_downgrade) {
-			/*
-			 * Make sure 'entry' remains valid while we drop
-			 * the i_pages lock.
-			 */
-			entry = lock_slot(mapping, slot);
-		}
-
-		xa_unlock_irq(&mapping->i_pages);
 		/*
 		 * Besides huge zero pages the only other thing that gets
 		 * downgraded are empty entries which don't need to be
 		 * unmapped.
 		 */
-		if (pmd_downgrade && dax_is_zero_entry(entry))
-			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
-
-		err = radix_tree_preload(
-				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err) {
-			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index);
-			return ERR_PTR(err);
-		}
-		xa_lock_irq(&mapping->i_pages);
-
-		if (!entry) {
-			/*
-			 * We needed to drop the i_pages lock while calling
-			 * radix_tree_preload() and we didn't have an entry to
-			 * lock.  See if another thread inserted an entry at
-			 * our index during this time.
-			 */
-			entry = __radix_tree_lookup(&mapping->i_pages, index,
-					NULL, &slot);
-			if (entry) {
-				radix_tree_preload_end();
-				xa_unlock_irq(&mapping->i_pages);
-				goto restart;
-			}
+		if (dax_is_zero_entry(entry)) {
+			xas_unlock_irq(xas);
+			unmap_mapping_pages(mapping,
+					xas->xa_index & ~PG_PMD_COLOUR,
+					PG_PMD_NR, false);
+			xas_reset(xas);
+			xas_lock_irq(xas);
 		}
 
-		if (pmd_downgrade) {
-			dax_disassociate_entry(entry, mapping, false);
-			radix_tree_delete(&mapping->i_pages, index);
-			mapping->nrexceptional--;
-			dax_wake_mapping_entry_waiter(mapping, index, entry,
-					true);
-		}
-
-		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
-
-		err = __radix_tree_insert(&mapping->i_pages, index,
-				dax_radix_order(entry), entry);
-		radix_tree_preload_end();
-		if (err) {
-			xa_unlock_irq(&mapping->i_pages);
-			/*
-			 * Our insertion of a DAX entry failed, most likely
-			 * because we were inserting a PMD entry and it
-			 * collided with a PTE sized entry at a different
-			 * index in the PMD range.  We haven't inserted
-			 * anything into the radix tree and have no waiters to
-			 * wake.
-			 */
-			return ERR_PTR(err);
-		}
-		/* Good, we have inserted empty locked entry into the tree. */
-		mapping->nrexceptional++;
-		xa_unlock_irq(&mapping->i_pages);
-		return entry;
+		dax_disassociate_entry(entry, mapping, false);
+		xas_store(xas, NULL);	/* undo the PMD join */
+		dax_wake_entry(xas, entry, true);
+		mapping->nrexceptional--;
+		entry = NULL;
+		xas_set(xas, index);
 	}
-	entry = lock_slot(mapping, slot);
- out_unlock:
-	xa_unlock_irq(&mapping->i_pages);
+
+	if (entry) {
+		dax_lock_entry(xas, entry);
+	} else {
+		unsigned long flags = DAX_EMPTY;
+
+		if (order > 0)
+			flags |= DAX_PMD;
+		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+		dax_lock_entry(xas, entry);
+		if (xas_error(xas))
+			goto out_unlock;
+		mapping->nrexceptional++;
+	}
+
+out_unlock:
+	xas_unlock_irq(xas);
+	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+		goto retry;
+	if (xas->xa_node == XA_ERROR(-ENOMEM))
+		return xa_mk_internal(VM_FAULT_OOM);
+	if (xas_error(xas))
+		return xa_mk_internal(VM_FAULT_SIGBUS);
 	return entry;
+fallback:
+	xas_unlock_irq(xas);
+	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
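The retry tail above follows the standard XArray idiom: attempt the store under the lock, and if the operation recorded -ENOMEM in the xa_state, drop the lock, let xas_nomem() allocate the needed nodes, and run the whole operation again. In generic form (GFP flags illustrative):

	/* Sketch: the generic xas_nomem() retry loop used by grab_mapping_entry(). */
	XA_STATE(xas, &mapping->i_pages, index);

	do {
		xas_lock_irq(&xas);
		xas_store(&xas, item);		/* may set xas_error() to -ENOMEM */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, GFP_NOIO));	/* allocates outside the lock */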
 
 /**
@@ -635,11 +575,10 @@
  */
 struct page *dax_layout_busy_page(struct address_space *mapping)
 {
-	pgoff_t	indices[PAGEVEC_SIZE];
+	XA_STATE(xas, &mapping->i_pages, 0);
+	void *entry;
+	unsigned int scanned = 0;
 	struct page *page = NULL;
-	struct pagevec pvec;
-	pgoff_t	index, end;
-	unsigned i;
 
 	/*
 	 * In the 'limited' case get_user_pages() for dax is disabled.
@@ -650,13 +589,9 @@
 	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
 		return NULL;
 
-	pagevec_init(&pvec);
-	index = 0;
-	end = -1;
-
 	/*
 	 * If we race get_user_pages_fast() here either we'll see the
-	 * elevated page count in the pagevec_lookup and wait, or
+	 * elevated page count in the iteration and wait, or
 	 * get_user_pages_fast() will see that the page it took a reference
 	 * against is no longer mapped in the page tables and bail to the
 	 * get_user_pages() slow path.  The slow path is protected by
@@ -666,96 +601,70 @@
 	 * guaranteed to either see new references or prevent new
 	 * references from being established.
 	 */
-	unmap_mapping_range(mapping, 0, 0, 1);
+	unmap_mapping_range(mapping, 0, 0, 0);
 
-	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE),
-				indices)) {
-		pgoff_t nr_pages = 1;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *pvec_ent = pvec.pages[i];
-			void *entry;
-
-			index = indices[i];
-			if (index >= end)
-				break;
-
-			if (WARN_ON_ONCE(
-			     !radix_tree_exceptional_entry(pvec_ent)))
-				continue;
-
-			xa_lock_irq(&mapping->i_pages);
-			entry = get_unlocked_mapping_entry(mapping, index, NULL);
-			if (entry) {
-				page = dax_busy_page(entry);
-				/*
-				 * Account for multi-order entries at
-				 * the end of the pagevec.
-				 */
-				if (i + 1 >= pagevec_count(&pvec))
-					nr_pages = 1UL << dax_radix_order(entry);
-			}
-			put_unlocked_mapping_entry(mapping, index, entry);
-			xa_unlock_irq(&mapping->i_pages);
-			if (page)
-				break;
-		}
-
-		/*
-		 * We don't expect normal struct page entries to exist in our
-		 * tree, but we keep these pagevec calls so that this code is
-		 * consistent with the common pattern for handling pagevecs
-		 * throughout the kernel.
-		 */
-		pagevec_remove_exceptionals(&pvec);
-		pagevec_release(&pvec);
-		index += nr_pages;
-
+	xas_lock_irq(&xas);
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		if (WARN_ON_ONCE(!xa_is_value(entry)))
+			continue;
+		if (unlikely(dax_is_locked(entry)))
+			entry = get_unlocked_entry(&xas, 0);
+		if (entry)
+			page = dax_busy_page(entry);
+		put_unlocked_entry(&xas, entry);
 		if (page)
 			break;
+		if (++scanned % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
 	}
+	xas_unlock_irq(&xas);
 	return page;
 }
 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
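Filesystems call this from their truncate and hole-punch paths to drain in-flight DMA before freeing blocks. A hedged sketch of the caller loop (the wait mechanism itself is filesystem-specific; xfs and ext4 wait for the page refcount to drop back to one and then rescan):

	/* Sketch: break DAX layouts before truncating (filesystem side). */
	struct page *page;

	while ((page = dax_layout_busy_page(inode->i_mapping)) != NULL) {
		/* a get_user_pages() reference is still live; wait for the
		 * page's refcount to return to 1, then scan again */
	}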
 
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+static int __dax_invalidate_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
+	XA_STATE(xas, &mapping->i_pages, index);
 	int ret = 0;
 	void *entry;
-	struct radix_tree_root *pages = &mapping->i_pages;
 
-	xa_lock_irq(pages);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+	xas_lock_irq(&xas);
+	entry = get_unlocked_entry(&xas, 0);
+	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
-	    (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
-	     radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
+	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 		goto out;
 	dax_disassociate_entry(entry, mapping, trunc);
-	radix_tree_delete(pages, index);
+	xas_store(&xas, NULL);
 	mapping->nrexceptional--;
 	ret = 1;
 out:
-	put_unlocked_mapping_entry(mapping, index, entry);
-	xa_unlock_irq(pages);
+	put_unlocked_entry(&xas, entry);
+	xas_unlock_irq(&xas);
 	return ret;
 }
+
 /*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping.  Wait for it
+ * to be unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
+	int ret = __dax_invalidate_entry(mapping, index, true);
 
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
-	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
-	 * caller has seen exceptional entry for this index, we better find it
+	 * page cache (usually fs-private i_mmap_sem for writing). Since the
+	 * caller has seen a DAX entry for this index, we better find it
 	 * at that index as well...
 	 */
 	WARN_ON_ONCE(!ret);
@@ -763,12 +672,12 @@
 }
 
 /*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
  */
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index)
 {
-	return __dax_invalidate_mapping_entry(mapping, index, false);
+	return __dax_invalidate_entry(mapping, index, false);
 }
 
 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -804,64 +713,57 @@
  * already in the tree, we will skip the insertion and just dirty the PMD as
  * appropriate.
  */
-static void *dax_insert_mapping_entry(struct address_space *mapping,
-				      struct vm_fault *vmf,
-				      void *entry, pfn_t pfn_t,
-				      unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+		struct address_space *mapping, struct vm_fault *vmf,
+		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
 {
-	struct radix_tree_root *pages = &mapping->i_pages;
-	unsigned long pfn = pfn_t_to_pfn(pfn_t);
-	pgoff_t index = vmf->pgoff;
-	void *new_entry;
+	void *new_entry = dax_make_entry(pfn, flags);
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+		unsigned long index = xas->xa_index;
 		/* we are replacing a zero page with block mapping */
 		if (dax_is_pmd_entry(entry))
 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
+					PG_PMD_NR, false);
 		else /* pte entry */
-			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+			unmap_mapping_pages(mapping, index, 1, false);
 	}
 
-	xa_lock_irq(pages);
-	new_entry = dax_radix_locked_entry(pfn, flags);
-	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+		void *old;
+
 		dax_disassociate_entry(entry, mapping, false);
 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
-	}
-
-	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
-		 * Only swap our new entry into the radix tree if the current
+		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
-		 * PMD entry is already in the tree, we leave it alone.  This
+		 * PMD entry is already in the cache, we leave it alone.  This
 		 * means that if we are trying to insert a PTE and the
 		 * existing entry is a PMD, we will just leave the PMD in the
 		 * tree and dirty it if necessary.
 		 */
-		struct radix_tree_node *node;
-		void **slot;
-		void *ret;
-
-		ret = __radix_tree_lookup(pages, index, &node, &slot);
-		WARN_ON_ONCE(ret != entry);
-		__radix_tree_replace(pages, node, slot,
-				     new_entry, NULL);
+		old = dax_lock_entry(xas, new_entry);
+		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+					DAX_LOCKED));
 		entry = new_entry;
+	} else {
+		xas_load(xas);	/* Walk the xa_state */
 	}
 
 	if (dirty)
-		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
 
-	xa_unlock_irq(pages);
+	xas_unlock_irq(xas);
 	return entry;
 }
 
-static inline unsigned long
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+static inline
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
 {
 	unsigned long address;
 
@@ -871,8 +773,8 @@
 }
 
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
-				      pgoff_t index, unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
+		unsigned long pfn)
 {
 	struct vm_area_struct *vma;
 	pte_t pte, *ptep = NULL;
@@ -881,7 +783,8 @@
 
 	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-		unsigned long address, start, end;
+		struct mmu_notifier_range range;
+		unsigned long address;
 
 		cond_resched();
 
@@ -891,11 +794,12 @@
 		address = pgoff_address(index, vma);
 
 		/*
-		 * Note because we provide start/end to follow_pte_pmd it will
+		 * Note because we provide range to follow_pte_pmd it will
 		 * call mmu_notifier_invalidate_range_start() on our behalf
 		 * before taking any lock.
 		 */
-		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
+		if (follow_pte_pmd(vma->vm_mm, address, &range,
+				   &ptep, &pmdp, &ptl))
 			continue;
 
 		/*
@@ -915,7 +819,7 @@
 				goto unlock_pmd;
 
 			flush_cache_page(vma, address, pfn);
-			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+			pmd = pmdp_invalidate(vma, address, pmdp);
 			pmd = pmd_wrprotect(pmd);
 			pmd = pmd_mkclean(pmd);
 			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
@@ -937,50 +841,53 @@
 			pte_unmap_unlock(ptep, ptl);
 		}
 
-		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+		mmu_notifier_invalidate_range_end(&range);
 	}
 	i_mmap_unlock_read(mapping);
 }
 
-static int dax_writeback_one(struct dax_device *dax_dev,
-		struct address_space *mapping, pgoff_t index, void *entry)
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+		struct address_space *mapping, void *entry)
 {
-	struct radix_tree_root *pages = &mapping->i_pages;
-	void *entry2, **slot;
-	unsigned long pfn;
+	unsigned long pfn, index, count;
 	long ret = 0;
-	size_t size;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
 	 * wrong.
 	 */
-	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+	if (WARN_ON(!xa_is_value(entry)))
 		return -EIO;
 
-	xa_lock_irq(pages);
-	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
-	/* Entry got punched out / reallocated? */
-	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
-		goto put_unlocked;
-	/*
-	 * Entry got reallocated elsewhere? No need to writeback. We have to
-	 * compare pfns as we must not bail out due to difference in lockbit
-	 * or entry type.
-	 */
-	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
-		goto put_unlocked;
-	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
-				dax_is_zero_entry(entry))) {
-		ret = -EIO;
-		goto put_unlocked;
+	if (unlikely(dax_is_locked(entry))) {
+		void *old_entry = entry;
+
+		entry = get_unlocked_entry(xas, 0);
+
+		/* Entry got punched out / reallocated? */
+		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+			goto put_unlocked;
+		/*
+		 * Entry got reallocated elsewhere? No need to writeback.
+		 * We have to compare pfns as we must not bail out due to
+		 * difference in lockbit or entry type.
+		 */
+		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+			goto put_unlocked;
+		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+					dax_is_zero_entry(entry))) {
+			ret = -EIO;
+			goto put_unlocked;
+		}
+
+		/* Another fsync thread may have already done this entry */
+		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+			goto put_unlocked;
 	}
 
-	/* Another fsync thread may have already written back this entry */
-	if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
-		goto put_unlocked;
 	/* Lock the entry to serialize with page faults */
-	entry = lock_slot(mapping, slot);
+	dax_lock_entry(xas, entry);
+
 	/*
 	 * We can clear the tag now but we have to be careful so that concurrent
 	 * dax_writeback_one() calls for the same index cannot finish before we
@@ -988,37 +895,39 @@
 	 * at the entry only under the i_pages lock and once they do that
 	 * they will see the entry locked and wait for it to unlock.
 	 */
-	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
-	xa_unlock_irq(pages);
+	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+	xas_unlock_irq(xas);
 
 	/*
-	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
-	 * in the middle of a PMD, the 'index' we are given will be aligned to
-	 * the start index of the PMD, as will the pfn we pull from 'entry'.
+	 * If dax_writeback_mapping_range() was given a wbc->range_start
+	 * in the middle of a PMD, the 'index' we use needs to be
+	 * aligned to the start of the PMD.
 	 * This allows us to flush for PMD_SIZE and not have to worry about
 	 * partial PMD writebacks.
 	 */
-	pfn = dax_radix_pfn(entry);
-	size = PAGE_SIZE << dax_radix_order(entry);
+	pfn = dax_to_pfn(entry);
+	count = 1UL << dax_entry_order(entry);
+	index = xas->xa_index & ~(count - 1);
 
-	dax_mapping_entry_mkclean(mapping, index, pfn);
-	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
+	dax_entry_mkclean(mapping, index, pfn);
+	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
 	 * the pfn mappings are writeprotected and fault waits for mapping
 	 * entry lock.
 	 */
-	xa_lock_irq(pages);
-	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
-	xa_unlock_irq(pages);
-	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
-	put_locked_mapping_entry(mapping, index);
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	xas_store(xas, entry);
+	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+	dax_wake_entry(xas, entry, false);
+
+	trace_dax_writeback_one(mapping->host, index, count);
 	return ret;
 
  put_unlocked:
-	put_unlocked_mapping_entry(mapping, index, entry2);
-	xa_unlock_irq(pages);
+	put_unlocked_entry(xas, entry);
 	return ret;
 }
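A worked example of the index alignment above, assuming x86-64 sizes, where dax_entry_order() is PMD_ORDER (9) for a PMD entry:

	/* count = 1UL << 9 = 512 pages
	 * index = xa_index & ~511, e.g. xa_index 0x234 -> index 0x200
	 * flush length = 512 * PAGE_SIZE = 2MiB, one aligned PMD
	 */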
 
@@ -1030,13 +939,13 @@
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc)
 {
+	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
 	struct inode *inode = mapping->host;
-	pgoff_t start_index, end_index;
-	pgoff_t indices[PAGEVEC_SIZE];
+	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
 	struct dax_device *dax_dev;
-	struct pagevec pvec;
-	bool done = false;
-	int i, ret = 0;
+	void *entry;
+	int ret = 0;
+	unsigned int scanned = 0;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
@@ -1048,41 +957,29 @@
 	if (!dax_dev)
 		return -EIO;
 
-	start_index = wbc->range_start >> PAGE_SHIFT;
-	end_index = wbc->range_end >> PAGE_SHIFT;
+	trace_dax_writeback_range(inode, xas.xa_index, end_index);
 
-	trace_dax_writeback_range(inode, start_index, end_index);
+	tag_pages_for_writeback(mapping, xas.xa_index, end_index);
 
-	tag_pages_for_writeback(mapping, start_index, end_index);
-
-	pagevec_init(&pvec);
-	while (!done) {
-		pvec.nr = find_get_entries_tag(mapping, start_index,
-				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
-				pvec.pages, indices);
-
-		if (pvec.nr == 0)
+	xas_lock_irq(&xas);
+	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
+		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
+		if (ret < 0) {
+			mapping_set_error(mapping, ret);
 			break;
-
-		for (i = 0; i < pvec.nr; i++) {
-			if (indices[i] > end_index) {
-				done = true;
-				break;
-			}
-
-			ret = dax_writeback_one(dax_dev, mapping, indices[i],
-					pvec.pages[i]);
-			if (ret < 0) {
-				mapping_set_error(mapping, ret);
-				goto out;
-			}
 		}
-		start_index = indices[pvec.nr - 1] + 1;
+		if (++scanned % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
 	}
-out:
+	xas_unlock_irq(&xas);
 	put_dax(dax_dev);
-	trace_dax_writeback_range_done(inode, start_index, end_index);
-	return (ret < 0 ? ret : 0);
+	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
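This is wired up from a filesystem's ->writepages method; with the v5.4-era signature a DAX-aware implementation reduces to roughly the following (function name illustrative):

	/* Sketch: a filesystem ->writepages handler for DAX inodes. */
	static int example_dax_writepages(struct address_space *mapping,
					  struct writeback_control *wbc)
	{
		return dax_writeback_mapping_range(mapping,
				mapping->host->i_sb->s_bdev, wbc);
	}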
 
@@ -1130,16 +1027,18 @@
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
-			 struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+		struct address_space *mapping, void **entry,
+		struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
-			false);
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+			DAX_ZERO_PAGE, false);
+
 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1302,7 +1201,7 @@
 	unsigned flags = 0;
 
 	if (iov_iter_rw(iter) == WRITE) {
-		lockdep_assert_held_exclusive(&inode->i_rwsem);
+		lockdep_assert_held_write(&inode->i_rwsem);
 		flags |= IOMAP_WRITE;
 	} else {
 		lockdep_assert_held(&inode->i_rwsem);
@@ -1326,9 +1225,7 @@
 {
 	if (error == 0)
 		return VM_FAULT_NOPAGE;
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM;
-	return VM_FAULT_SIGBUS;
+	return vmf_error(error);
 }
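vmf_error() lives in include/linux/mm.h and preserves the removed special-casing; for reference it is essentially:

	static inline vm_fault_t vmf_error(int err)
	{
		if (err == -ENOMEM)
			return VM_FAULT_OOM;
		return VM_FAULT_SIGBUS;
	}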
 
 /*
@@ -1347,6 +1244,7 @@
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1373,9 +1271,9 @@
 	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		ret = dax_fault_return(PTR_ERR(entry));
+	entry = grab_mapping_entry(&xas, mapping, 0);
+	if (xa_is_internal(entry)) {
+		ret = xa_to_internal(entry);
 		goto out;
 	}
 
@@ -1448,7 +1346,7 @@
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						 0, write && !sync);
 
 		/*
@@ -1476,7 +1374,7 @@
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(&xas, mapping, &entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1503,21 +1401,22 @@
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff);
+	dax_unlock_entry(&xas, entry);
  out:
 	trace_dax_pte_fault_done(inode, vmf, ret);
 	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+		struct iomap *iomap, void **entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = mapping->host;
+	pgtable_t pgtable = NULL;
 	struct page *zero_page;
-	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 	pfn_t pfn;
@@ -1528,8 +1427,14 @@
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+			DAX_PMD | DAX_ZERO_PAGE, false);
+
+	if (arch_needs_pgtable_deposit()) {
+		pgtable = pte_alloc_one(vma->vm_mm);
+		if (!pgtable)
+			return VM_FAULT_OOM;
+	}
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1537,15 +1442,21 @@
 		goto fallback;
 	}
 
+	if (pgtable) {
+		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+		mm_inc_nr_ptes(vma->vm_mm);
+	}
 	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -1554,6 +1465,7 @@
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;
@@ -1561,7 +1473,7 @@
 	struct inode *inode = mapping->host;
 	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
-	pgoff_t max_pgoff, pgoff;
+	pgoff_t max_pgoff;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1572,7 +1484,6 @@
 	 * supposed to hold locks serializing us with truncate / punch hole so
 	 * this is a reliable test.
 	 */
-	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1581,7 +1492,7 @@
 	 * Make sure that the faulting address's PMD offset (color) matches
 	 * the PMD offset from the start of the file.  This is necessary so
 	 * that a PMD range in the page table overlaps exactly with a PMD
-	 * range in the radix tree.
+	 * range in the page cache.
 	 */
 	if ((vmf->pgoff & PG_PMD_COLOUR) !=
 	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
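A worked example of the colour check, assuming x86-64 where PG_PMD_COLOUR is 511:

	/* vmf->pgoff = 512, vmf->address = 0x40200000 (2MiB aligned):
	 *	512 & 511 == 0 and (0x40200000 >> 12) & 511 == 0 -> colours match
	 * same pgoff mapped one page off, at 0x40201000:
	 *	(0x40201000 >> 12) & 511 == 1 != 0 -> fall back to PTE faults
	 */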
@@ -1597,24 +1508,26 @@
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff >= max_pgoff) {
+	if (xas.xa_index >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
-	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
-	 * is already in the tree, for instance), it will return -EEXIST and
-	 * we just fall back to 4k entries.
+	 * grab_mapping_entry() will make sure we get an empty PMD entry,
+	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
+	 * entry is already in the array, for instance), it will return
+	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
+	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
+	if (xa_is_internal(entry)) {
+		result = xa_to_internal(entry);
 		goto fallback;
+	}
 
 	/*
 	 * It is possible, particularly with mixed reads & writes to private
@@ -1633,7 +1546,7 @@
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
 	 */
-	pos = (loff_t)pgoff << PAGE_SHIFT;
+	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
 		goto unlock_entry;
@@ -1649,8 +1562,8 @@
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-						RADIX_DAX_PMD, write && !sync);
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
+						DAX_PMD, write && !sync);
 
 		/*
 		 * If we are doing synchronous page fault and inode needs fsync,
@@ -1667,14 +1580,13 @@
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
-					    write);
+		result = vmf_insert_pfn_pmd(vmf, pfn, write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, entry);
+		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1697,7 +1609,7 @@
 				&iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff);
+	dax_unlock_entry(&xas, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1742,54 +1654,46 @@
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
-/**
+/*
  * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
  * @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
  * @pfn: PFN to insert
+ * @order: Order of entry to insert.
  *
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
- * DAX file.  It takes care of marking corresponding radix tree entry as dirty
- * as well.
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file.  It also marks the page cache entry as dirty.
  */
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
-				  enum page_entry_size pe_size,
-				  pfn_t pfn)
+static vm_fault_t
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	void *entry, **slot;
-	pgoff_t index = vmf->pgoff;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+	void *entry;
 	vm_fault_t ret;
 
-	xa_lock_irq(&mapping->i_pages);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	xas_lock_irq(&xas);
+	entry = get_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
-	if (!entry ||
-	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
-	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
-		put_unlocked_mapping_entry(mapping, index, entry);
-		xa_unlock_irq(&mapping->i_pages);
+	if (!entry || dax_is_conflict(entry) ||
+	    (order == 0 && !dax_is_pte_entry(entry))) {
+		put_unlocked_entry(&xas, entry);
+		xas_unlock_irq(&xas);
 		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
 						      VM_FAULT_NOPAGE);
 		return VM_FAULT_NOPAGE;
 	}
-	radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
-	entry = lock_slot(mapping, slot);
-	xa_unlock_irq(&mapping->i_pages);
-	switch (pe_size) {
-	case PE_SIZE_PTE:
+	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+	dax_lock_entry(&xas, entry);
+	xas_unlock_irq(&xas);
+	if (order == 0)
 		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-		break;
 #ifdef CONFIG_FS_DAX_PMD
-	case PE_SIZE_PMD:
-		ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, true);
-		break;
+	else if (order == PMD_ORDER)
+		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
 #endif
-	default:
+	else
 		ret = VM_FAULT_FALLBACK;
-	}
-	put_locked_mapping_entry(mapping, index);
+	dax_unlock_entry(&xas, entry);
 	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
 	return ret;
 }
@@ -1809,17 +1713,12 @@
 {
 	int err;
 	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
-	size_t len = 0;
+	unsigned int order = pe_order(pe_size);
+	size_t len = PAGE_SIZE << order;
 
-	if (pe_size == PE_SIZE_PTE)
-		len = PAGE_SIZE;
-	else if (pe_size == PE_SIZE_PMD)
-		len = PMD_SIZE;
-	else
-		WARN_ON_ONCE(1);
 	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
 	if (err)
 		return VM_FAULT_SIGBUS;
-	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+	return dax_insert_pfn_mkwrite(vmf, pfn, order);
 }
 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
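With pe_order() folding the old switch, the synced length follows directly from the order (sizes assume x86-64):

	/* len = PAGE_SIZE << order:
	 *	order 0 (PTE)               -> 4KiB
	 *	order PMD_ORDER (9, x86-64) -> 2MiB
	 */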
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d8..e88cf05 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/dcache.c
  *
@@ -18,6 +19,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/fscrypt.h>
 #include <linux/fsnotify.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -26,7 +28,7 @@
 #include <linux/export.h>
 #include <linux/security.h>
 #include <linux/seqlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rculist_bl.h>
 #include <linux/list_lru.h>
@@ -119,6 +121,7 @@
 
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
+static DEFINE_PER_CPU(long, nr_dentry_negative);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 
@@ -152,11 +155,22 @@
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_negative(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_negative, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
 	dentry_stat.nr_unused = get_nr_dentry_unused();
+	dentry_stat.nr_negative = get_nr_dentry_negative();
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -257,24 +271,10 @@
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
-static void __d_free_external_name(struct rcu_head *head)
-{
-	struct external_name *name = container_of(head, struct external_name,
-						  u.head);
-
-	mod_node_page_state(page_pgdat(virt_to_page(name)),
-			    NR_INDIRECTLY_RECLAIMABLE_BYTES,
-			    -ksize(name));
-
-	kfree(name);
-}
-
 static void __d_free_external(struct rcu_head *head)
 {
 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
-
-	__d_free_external_name(&external_name(dentry)->u.head);
-
+	kfree(external_name(dentry));
 	kmem_cache_free(dentry_cache, dentry);
 }
 
@@ -286,27 +286,25 @@
 void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
+	name->name = dentry->d_name;
 	if (unlikely(dname_external(dentry))) {
-		struct external_name *p = external_name(dentry);
-		atomic_inc(&p->u.count);
-		spin_unlock(&dentry->d_lock);
-		name->name = p->name;
+		atomic_inc(&external_name(dentry)->u.count);
 	} else {
 		memcpy(name->inline_name, dentry->d_iname,
 		       dentry->d_name.len + 1);
-		spin_unlock(&dentry->d_lock);
-		name->name = name->inline_name;
+		name->name.name = name->inline_name;
 	}
+	spin_unlock(&dentry->d_lock);
 }
 EXPORT_SYMBOL(take_dentry_name_snapshot);
 
 void release_dentry_name_snapshot(struct name_snapshot *name)
 {
-	if (unlikely(name->name != name->inline_name)) {
+	if (unlikely(name->name.name != name->inline_name)) {
 		struct external_name *p;
-		p = container_of(name->name, struct external_name, name[0]);
+		p = container_of(name->name.name, struct external_name, name[0]);
 		if (unlikely(atomic_dec_and_test(&p->u.count)))
-			call_rcu(&p->u.head, __d_free_external_name);
+			kfree_rcu(p, u.head);
 	}
 }
 EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -331,6 +329,8 @@
 	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
 	WRITE_ONCE(dentry->d_flags, flags);
 	dentry->d_inode = NULL;
+	if (dentry->d_flags & DCACHE_LRU_LIST)
+		this_cpu_inc(nr_dentry_negative);
 }
 
 static void dentry_free(struct dentry *dentry)
@@ -344,7 +344,7 @@
 		}
 	}
 	/* if dentry was never visible to RCU, immediate free is OK */
-	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+	if (dentry->d_flags & DCACHE_NORCU)
 		__d_free(&dentry->d_u.d_rcu);
 	else
 		call_rcu(&dentry->d_u.d_rcu, __d_free);
@@ -385,6 +385,11 @@
  * The per-cpu "nr_dentry_unused" counters are updated with
  * the DCACHE_LRU_LIST bit.
  *
+ * The per-cpu "nr_dentry_negative" counters are only updated
+ * when deleted from or added to the per-superblock LRU list, not
+ * from/to the shrink list. That is to avoid an unneeded dec/inc
+ * pair when moving from LRU to shrink list in select_collect().
+ *
  * These helper functions make sure we always follow the
  * rules. d_lock must be held by the caller.
  */
@@ -394,6 +399,8 @@
 	D_FLAG_VERIFY(dentry, 0);
 	dentry->d_flags |= DCACHE_LRU_LIST;
 	this_cpu_inc(nr_dentry_unused);
+	if (d_is_negative(dentry))
+		this_cpu_inc(nr_dentry_negative);
 	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
@@ -402,6 +409,8 @@
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
+	if (d_is_negative(dentry))
+		this_cpu_dec(nr_dentry_negative);
 	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
@@ -432,6 +441,8 @@
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
+	if (d_is_negative(dentry))
+		this_cpu_dec(nr_dentry_negative);
 	list_lru_isolate(lru, &dentry->d_lru);
 }
 
@@ -440,6 +451,8 @@
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	if (d_is_negative(dentry))
+		this_cpu_dec(nr_dentry_negative);
 	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
@@ -848,6 +861,32 @@
 }
 EXPORT_SYMBOL(dput);
 
+static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+__must_hold(&dentry->d_lock)
+{
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		/* let the owner of the list it's on deal with it */
+		--dentry->d_lockref.count;
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!--dentry->d_lockref.count)
+			d_shrink_add(dentry, list);
+	}
+}
+
+void dput_to_list(struct dentry *dentry, struct list_head *list)
+{
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+	if (!retain_dentry(dentry))
+		__dput_to_list(dentry, list);
+	spin_unlock(&dentry->d_lock);
+}
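dput_to_list() is the batching counterpart of dput() for callers that already own a disposal list; the final kill is deferred so a whole list can be reaped at once. A hedged usage sketch:

	/* Sketch: batch the final references, then reap the list once. */
	LIST_HEAD(dispose);

	dput_to_list(dentry, &dispose);		/* may park dentry on dispose */
	/* ... further dput_to_list() calls ... */
	if (!list_empty(&dispose))
		shrink_dentry_list(&dispose);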
 
 /* This must be called with d_lock held */
 static inline void __dget_dlock(struct dentry *dentry)
@@ -1054,7 +1093,7 @@
 	return false;
 }
 
-static void shrink_dentry_list(struct list_head *list)
+void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
 		struct dentry *dentry, *parent;
@@ -1076,18 +1115,9 @@
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
+		if (parent != dentry)
+			__dput_to_list(parent, list);
 		__dentry_kill(dentry);
-		if (parent == dentry)
-			continue;
-		/*
-		 * We need to prune ancestors too. This is necessary to prevent
-		 * quadratic behavior of shrink_dcache_parent(), but is also
-		 * expected to be beneficial in reducing dentry cache
-		 * fragmentation.
-		 */
-		dentry = parent;
-		while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
-			dentry = dentry_kill(dentry);
 	}
 }
 
@@ -1202,15 +1232,11 @@
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	long freed;
-
 	do {
 		LIST_HEAD(dispose);
 
-		freed = list_lru_walk(&sb->s_dentry_lru,
+		list_lru_walk(&sb->s_dentry_lru,
 			dentry_lru_isolate_shrink, &dispose, 1024);
-
-		this_cpu_sub(nr_dentry_unused, freed);
 		shrink_dentry_list(&dispose);
 	} while (list_lru_count(&sb->s_dentry_lru) > 0);
 }
@@ -1436,8 +1462,11 @@
 
 struct select_data {
 	struct dentry *start;
+	union {
+		long found;
+		struct dentry *victim;
+	};
 	struct list_head dispose;
-	int found;
 };
 
 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
@@ -1469,6 +1498,37 @@
 	return ret;
 }
 
+static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
+
+	if (data->start == dentry)
+		goto out;
+
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		if (!dentry->d_lockref.count) {
+			rcu_read_lock();
+			data->victim = dentry;
+			return D_WALK_QUIT;
+		}
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!dentry->d_lockref.count)
+			d_shrink_add(dentry, &data->dispose);
+	}
+	/*
+	 * We can return to the caller if we have found some (this
+	 * ensures forward progress). We'll be coming back to find
+	 * the rest.
+	 */
+	if (!list_empty(&data->dispose))
+		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+out:
+	return ret;
+}
+
 /**
  * shrink_dcache_parent - prune dcache
  * @parent: parent of entries to prune
@@ -1478,12 +1538,9 @@
 void shrink_dcache_parent(struct dentry *parent)
 {
 	for (;;) {
-		struct select_data data;
+		struct select_data data = {.start = parent};
 
 		INIT_LIST_HEAD(&data.dispose);
-		data.start = parent;
-		data.found = 0;
-
 		d_walk(parent, &data, select_collect);
 
 		if (!list_empty(&data.dispose)) {
@@ -1494,6 +1551,24 @@
 		cond_resched();
 		if (!data.found)
 			break;
+		data.victim = NULL;
+		d_walk(parent, &data, select_collect2);
+		if (data.victim) {
+			struct dentry *parent;
+			spin_lock(&data.victim->d_lock);
+			if (!shrink_lock_dentry(data.victim)) {
+				spin_unlock(&data.victim->d_lock);
+				rcu_read_unlock();
+			} else {
+				rcu_read_unlock();
+				parent = data.victim->d_parent;
+				if (parent != data.victim)
+					__dput_to_list(parent, &data.dispose);
+				__dentry_kill(data.victim);
+			}
+		}
+		if (!list_empty(&data.dispose))
+			shrink_dentry_list(&data.dispose);
 	}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1606,7 +1681,6 @@
  
 struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 {
-	struct external_name *ext = NULL;
 	struct dentry *dentry;
 	char *dname;
 	int err;
@@ -1627,14 +1701,15 @@
 		dname = dentry->d_iname;
 	} else if (name->len > DNAME_INLINE_LEN-1) {
 		size_t size = offsetof(struct external_name, name[1]);
-
-		ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
-		if (!ext) {
+		struct external_name *p = kmalloc(size + name->len,
+						  GFP_KERNEL_ACCOUNT |
+						  __GFP_RECLAIMABLE);
+		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
 		}
-		atomic_set(&ext->u.count, 1);
-		dname = ext->name;
+		atomic_set(&p->u.count, 1);
+		dname = p->name;
 	} else  {
 		dname = dentry->d_iname;
 	}	
@@ -1673,12 +1748,6 @@
 		}
 	}
 
-	if (unlikely(ext)) {
-		pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
-		mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
-				    ksize(ext));
-	}
-
 	this_cpu_inc(nr_dentry);
 
 	return dentry;
@@ -1698,7 +1767,6 @@
 	struct dentry *dentry = __d_alloc(parent->d_sb, name);
 	if (!dentry)
 		return NULL;
-	dentry->d_flags |= DCACHE_RCUACCESS;
 	spin_lock(&parent->d_lock);
 	/*
 	 * don't need child lock because it is not subject
@@ -1723,7 +1791,7 @@
 {
 	struct dentry *dentry = d_alloc_anon(parent->d_sb);
 	if (dentry) {
-		dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR;
+		dentry->d_flags |= DCACHE_DENTRY_CURSOR;
 		dentry->d_parent = dget(parent);
 	}
 	return dentry;
@@ -1736,12 +1804,21 @@
  *
  * For a filesystem that just pins its dentries in memory and never
  * performs lookups at all, return an unhashed IS_ROOT dentry.
+ * This is used for pipes, sockets et al. - the stuff that should
+ * never be anyone's children or parents.  Unlike all other
+ * dentries, these will not have RCU delay between dropping the
+ * last reference and freeing them.
+ *
+ * The only user is alloc_file_pseudo() and that's what should
+ * be considered a public interface.  Don't use directly.
  */
 struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 {
-	return __d_alloc(sb, name);
+	struct dentry *dentry = __d_alloc(sb, name);
+	if (likely(dentry))
+		dentry->d_flags |= DCACHE_NORCU;
+	return dentry;
 }
-EXPORT_SYMBOL(d_alloc_pseudo);
 
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
@@ -1840,6 +1917,11 @@
 	WARN_ON(d_in_lookup(dentry));
 
 	spin_lock(&dentry->d_lock);
+	/*
+	 * Decrement negative dentry count if it was in the LRU list.
+	 */
+	if (dentry->d_flags & DCACHE_LRU_LIST)
+		this_cpu_dec(nr_dentry_negative);
 	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	raw_write_seqcount_begin(&dentry->d_seq);
 	__d_set_inode_and_type(dentry, inode, add_flags);
@@ -1903,12 +1985,10 @@
 
 	if (root_inode) {
 		res = d_alloc_anon(root_inode->i_sb);
-		if (res) {
-			res->d_flags |= DCACHE_RCUACCESS;
+		if (res)
 			d_instantiate(res, root_inode);
-		} else {
+		else
 			iput(root_inode);
-		}
 	}
 	return res;
 }
@@ -2358,7 +2438,6 @@
 void d_delete(struct dentry * dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	int isdir = d_is_dir(dentry);
 
 	spin_lock(&inode->i_lock);
 	spin_lock(&dentry->d_lock);
@@ -2373,7 +2452,6 @@
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&inode->i_lock);
 	}
-	fsnotify_nameremove(dentry, isdir);
 }
 EXPORT_SYMBOL(d_delete);
 
@@ -2707,7 +2785,7 @@
 		dentry->d_name.hash_len = target->d_name.hash_len;
 	}
 	if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
-		call_rcu(&old_name->u.head, __d_free_external_name);
+		kfree_rcu(old_name, u.head);
 }
 
 /*
@@ -2773,9 +2851,7 @@
 		copy_name(dentry, target);
 		target->d_hash.pprev = NULL;
 		dentry->d_parent->d_lockref.count++;
-		if (dentry == old_parent)
-			dentry->d_flags |= DCACHE_RCUACCESS;
-		else
+		if (dentry != old_parent) /* wasn't IS_ROOT */
 			WARN_ON(!--old_parent->d_lockref.count);
 	} else {
 		target->d_parent = old_parent;
@@ -2787,6 +2863,7 @@
 	list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
 	__d_rehash(dentry);
 	fsnotify_update_flags(dentry);
+	fscrypt_handle_d_move(dentry);
 
 	write_seqcount_end(&target->d_seq);
 	write_seqcount_end(&dentry->d_seq);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 57bc964..6eeb611 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * dcookies.c
  *
diff --git a/fs/debugfs/Makefile b/fs/debugfs/Makefile
index 840c456..9c0fe38 100644
--- a/fs/debugfs/Makefile
+++ b/fs/debugfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 debugfs-objs	:= inode.o file.o
 
 obj-$(CONFIG_DEBUG_FS)	+= debugfs.o
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4fce1da..87846aa 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/device.h>
 #include <linux/poll.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -136,6 +137,25 @@
 }
 EXPORT_SYMBOL_GPL(debugfs_file_put);
 
+/*
+ * Only permit access to world-readable files when the kernel is locked down.
+ * We also need to exclude any file that has ways to write or alter it as root
+ * can bypass the permissions check.
+ */
+static bool debugfs_is_locked_down(struct inode *inode,
+				   struct file *filp,
+				   const struct file_operations *real_fops)
+{
+	if ((inode->i_mode & 07777) == 0444 &&
+	    !(filp->f_mode & FMODE_WRITE) &&
+	    !real_fops->unlocked_ioctl &&
+	    !real_fops->compat_ioctl &&
+	    !real_fops->mmap)
+		return false;
+
+	return security_locked_down(LOCKDOWN_DEBUGFS);
+}
+
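Under this policy the outcomes, assuming lockdown is active, are:

	/* mode 0444, no ioctl/mmap, opened read-only -> allowed (returns false)
	 * mode 0644, any open                        -> gated by LOCKDOWN_DEBUGFS
	 * mode 0444 but real_fops->mmap set          -> gated by LOCKDOWN_DEBUGFS
	 */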
 static int open_proxy_open(struct inode *inode, struct file *filp)
 {
 	struct dentry *dentry = F_DENTRY(filp);
@@ -147,6 +167,11 @@
 		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
+
+	r = debugfs_is_locked_down(inode, filp, real_fops);
+	if (r)
+		goto out;
+
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not clean up after itself at exit? */
@@ -272,6 +297,11 @@
 		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
+
+	r = debugfs_is_locked_down(inode, filp, real_fops);
+	if (r)
+		goto out;
+
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not clean up after itself at exit? */
@@ -394,12 +424,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
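
From this series on, the debugfs_create_* kernel-doc promises ERR_PTR values rather than NULL on failure, and because start_creating() returns an IS_ERR parent unchanged, setup code may both ignore the result and chain calls without intermediate checks. A hedged caller sketch (hypothetical driver fields):

#include <linux/debugfs.h>

static u8 retries;
static u32 state;

static void my_debugfs_setup(void)
{
        struct dentry *dir = debugfs_create_dir("mydrv", NULL);

        /* no error checks: if dir is an ERR_PTR, the calls below hit
         * IS_ERR(parent) in start_creating() and degrade to no-ops */
        debugfs_create_u8("retries", 0644, dir, &retries);
        debugfs_create_u32("state", 0444, dir, &state);
}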
@@ -440,12 +469,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value)
@@ -486,12 +514,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
@@ -533,12 +560,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
@@ -582,12 +608,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
 				    struct dentry *parent, unsigned long *value)
@@ -850,12 +875,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				   struct dentry *parent, bool *value)
@@ -904,12 +928,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				   struct dentry *parent,
@@ -1004,24 +1027,19 @@
  * @array as data. If @mode permits it, the file can be read from.
  * Writing is not supported. Seek within the file is also not supported.
  * Once the array is created, its size cannot be changed.
- *
- * The function returns a pointer to dentry on success. If debugfs is not
- * enabled in the kernel, the value -%ENODEV will be returned.
  */
-struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
-					    struct dentry *parent,
-					    u32 *array, u32 elements)
+void debugfs_create_u32_array(const char *name, umode_t mode,
+			      struct dentry *parent, u32 *array, u32 elements)
 {
 	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
 
 	if (data == NULL)
-		return NULL;
+		return;
 
 	data->array = array;
 	data->elements = elements;
 
-	return debugfs_create_file_unsafe(name, mode, parent, data,
-					&u32_array_fops);
+	debugfs_create_file_unsafe(name, mode, parent, data, &u32_array_fops);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
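
debugfs_create_u32_array() becomes void here: no caller acted on the returned dentry, and on allocation failure the file is simply absent. A hedged usage sketch:

#include <linux/debugfs.h>
#include <linux/kernel.h>

static u32 samples[4];

static void my_array_setup(struct dentry *dir)
{
        /* nothing to check; a failed create just means no file appears */
        debugfs_create_u32_array("samples", 0444, dir, samples,
                                 ARRAY_SIZE(samples));
}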
 
@@ -1102,12 +1120,11 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * be returned.
  */
 struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
 				       struct dentry *parent,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b0135..7b975db 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -2,13 +2,16 @@
 /*
  *  inode.c - part of debugfs, a tiny little debug file system
  *
- *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
+ *  Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com>
  *  Copyright (C) 2004 IBM Inc.
+ *  Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org>
  *
  *  debugfs is for people to use instead of /proc or /sys.
  *  See ./Documentation/core-api/kernel-api.rst for more details.
  */
 
+#define pr_fmt(fmt)	"debugfs: " fmt
+
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -23,6 +26,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -32,6 +36,32 @@
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
+/*
+ * Don't allow access attributes to be changed whilst the kernel is locked down
+ * so that we can use the file mode as part of a heuristic to determine whether
+ * to lock down individual files.
+ */
+static int debugfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int ret = security_locked_down(LOCKDOWN_DEBUGFS);
+
+	if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
+		return ret;
+	return simple_setattr(dentry, ia);
+}
+
+static const struct inode_operations debugfs_file_inode_operations = {
+	.setattr	= debugfs_setattr,
+};
+static const struct inode_operations debugfs_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.setattr	= debugfs_setattr,
+};
+static const struct inode_operations debugfs_symlink_inode_operations = {
+	.get_link	= simple_get_link,
+	.setattr	= debugfs_setattr,
+};
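
debugfs_setattr() is what makes the open-time mode heuristic trustworthy: if mode, uid, or gid could still be changed on a locked-down kernel, a 0444 file could be flipped writable after creation. Other attribute updates still pass through simple_setattr(). From userspace the refusal looks like this hedged demonstration (hypothetical path; the exact errno comes from security_locked_down(), commonly EPERM):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        /* on a locked-down kernel the chmod is rejected, so the
         * 0444 heuristic cannot be defeated after file creation */
        if (chmod("/sys/kernel/debug/mydrv/state", 0666) != 0)
                perror("chmod");
        return 0;
}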
+
 static struct inode *debugfs_get_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -163,19 +193,18 @@
 	return 0;
 }
 
-static void debugfs_evict_inode(struct inode *inode)
+static void debugfs_free_inode(struct inode *inode)
 {
-	truncate_inode_pages_final(&inode->i_data);
-	clear_inode(inode);
 	if (S_ISLNK(inode->i_mode))
 		kfree(inode->i_link);
+	free_inode_nonrcu(inode);
 }
 
 static const struct super_operations debugfs_super_operations = {
 	.statfs		= simple_statfs,
 	.remount_fs	= debugfs_remount,
 	.show_options	= debugfs_show_options,
-	.evict_inode	= debugfs_evict_inode,
+	.free_inode	= debugfs_free_inode,
 };
 
 static void debugfs_release_dentry(struct dentry *dentry)
@@ -286,15 +315,17 @@
 	struct dentry *dentry;
 	int error;
 
-	pr_debug("debugfs: creating file '%s'\n",name);
+	pr_debug("creating file '%s'\n", name);
 
 	if (IS_ERR(parent))
 		return parent;
 
 	error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
 			      &debugfs_mount_count);
-	if (error)
+	if (error) {
+		pr_err("Unable to pin filesystem for file '%s'\n", name);
 		return ERR_PTR(error);
+	}
 
 	/* If the parent is not specified, we create it in the root.
 	 * We need the root dentry to do this, which is in the super
@@ -307,6 +338,12 @@
 	inode_lock(d_inode(parent));
 	dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
+		if (d_is_dir(dentry))
+			pr_err("Directory '%s' with parent '%s' already present!\n",
+			       name, parent->d_name.name);
+		else
+			pr_err("File '%s' in directory '%s' already present!\n",
+			       name, parent->d_name.name);
 		dput(dentry);
 		dentry = ERR_PTR(-EEXIST);
 	}
@@ -324,7 +361,7 @@
 	inode_unlock(d_inode(dentry->d_parent));
 	dput(dentry);
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 static struct dentry *end_creating(struct dentry *dentry)
@@ -347,15 +384,19 @@
 	dentry = start_creating(name, parent);
 
 	if (IS_ERR(dentry))
-		return NULL;
+		return dentry;
 
 	inode = debugfs_get_inode(dentry->d_sb);
-	if (unlikely(!inode))
+	if (unlikely(!inode)) {
+		pr_err("out of free dentries, can not create file '%s'\n",
+		       name);
 		return failed_creating(dentry);
+	}
 
 	inode->i_mode = mode;
 	inode->i_private = data;
 
+	inode->i_op = &debugfs_file_inode_operations;
 	inode->i_fop = proxy_fops;
 	dentry->d_fsdata = (void *)((unsigned long)real_fops |
 				DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
@@ -386,7 +427,8 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
@@ -422,8 +464,8 @@
  * debugfs core.
  *
  * It is your responsibility to protect your struct file_operations
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
  * debugfs though.
  *
  * Any struct file_operations defined by means of
@@ -464,7 +506,8 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
@@ -495,7 +538,8 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the file is
  * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
+ * you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
  *
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
@@ -506,14 +550,17 @@
 	struct inode *inode;
 
 	if (IS_ERR(dentry))
-		return NULL;
+		return dentry;
 
 	inode = debugfs_get_inode(dentry->d_sb);
-	if (unlikely(!inode))
+	if (unlikely(!inode)) {
+		pr_err("out of free dentries, can not create directory '%s'\n",
+		       name);
 		return failed_creating(dentry);
+	}
 
 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &debugfs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -545,11 +592,14 @@
 	struct inode *inode;
 
 	if (IS_ERR(dentry))
-		return NULL;
+		return dentry;
 
 	inode = debugfs_get_inode(dentry->d_sb);
-	if (unlikely(!inode))
+	if (unlikely(!inode)) {
+		pr_err("out of free dentries, can not create automount '%s'\n",
+		       name);
 		return failed_creating(dentry);
+	}
 
 	make_empty_dir_inode(inode);
 	inode->i_flags |= S_AUTOMOUNT;
@@ -581,8 +631,8 @@
  * This function will return a pointer to a dentry if it succeeds.  This
  * pointer must be passed to the debugfs_remove() function when the symbolic
  * link is to be removed (no automatic cleanup happens if your module is
- * unloaded, you are responsible here.)  If an error occurs, %NULL will be
- * returned.
+ * unloaded, you are responsible here.)  If an error occurs, %ERR_PTR(-ERROR)
+ * will be returned.
  *
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
@@ -594,34 +644,33 @@
 	struct inode *inode;
 	char *link = kstrdup(target, GFP_KERNEL);
 	if (!link)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	dentry = start_creating(name, parent);
 	if (IS_ERR(dentry)) {
 		kfree(link);
-		return NULL;
+		return dentry;
 	}
 
 	inode = debugfs_get_inode(dentry->d_sb);
 	if (unlikely(!inode)) {
+		pr_err("out of free dentries, can not create symlink '%s'\n",
+		       name);
 		kfree(link);
 		return failed_creating(dentry);
 	}
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
-	inode->i_op = &simple_symlink_inode_operations;
+	inode->i_op = &debugfs_symlink_inode_operations;
 	inode->i_link = link;
 	d_instantiate(dentry, inode);
 	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
-static void __debugfs_remove_file(struct dentry *dentry, struct dentry *parent)
+static void __debugfs_file_removed(struct dentry *dentry)
 {
 	struct debugfs_fsdata *fsd;
 
-	simple_unlink(d_inode(parent), dentry);
-	d_delete(dentry);
-
 	/*
 	 * Paired with the closing smp_mb() implied by a successful
 	 * cmpxchg() in debugfs_file_get(): either
@@ -642,16 +691,18 @@
 
 	if (simple_positive(dentry)) {
 		dget(dentry);
-		if (!d_is_reg(dentry)) {
-			if (d_is_dir(dentry))
-				ret = simple_rmdir(d_inode(parent), dentry);
-			else
-				simple_unlink(d_inode(parent), dentry);
+		if (d_is_dir(dentry)) {
+			ret = simple_rmdir(d_inode(parent), dentry);
 			if (!ret)
-				d_delete(dentry);
+				fsnotify_rmdir(d_inode(parent), dentry);
 		} else {
-			__debugfs_remove_file(dentry, parent);
+			simple_unlink(d_inode(parent), dentry);
+			fsnotify_unlink(d_inode(parent), dentry);
 		}
+		if (!ret)
+			d_delete(dentry);
+		if (d_is_reg(dentry))
+			__debugfs_file_removed(dentry);
 		dput(dentry);
 	}
 	return ret;
@@ -787,6 +838,13 @@
 	struct dentry *dentry = NULL, *trap;
 	struct name_snapshot old_name;
 
+	if (IS_ERR(old_dir))
+		return old_dir;
+	if (IS_ERR(new_dir))
+		return new_dir;
+	if (IS_ERR_OR_NULL(old_dentry))
+		return old_dentry;
+
 	trap = lock_rename(new_dir, old_dir);
 	/* Source or destination directories don't exist? */
 	if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
@@ -809,7 +867,7 @@
 		goto exit;
 	}
 	d_move(old_dentry, dentry);
-	fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name,
+	fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name,
 		d_is_dir(old_dentry),
 		NULL, old_dentry);
 	release_dentry_name_snapshot(&old_name);
@@ -820,7 +878,9 @@
 	if (dentry && !IS_ERR(dentry))
 		dput(dentry);
 	unlock_rename(new_dir, old_dir);
-	return NULL;
+	if (IS_ERR(dentry))
+		return dentry;
+	return ERR_PTR(-EINVAL);
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
 
diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile
index 236696e..66064c8 100644
--- a/fs/devpts/Makefile
+++ b/fs/devpts/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux /dev/pts virtual filesystem.
 #
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c538145..42e5a76 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- linux-c -*- --------------------------------------------------------- *
  *
  * linux/fs/devpts/inode.c
  *
  *  Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved
  *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
  * ------------------------------------------------------------------------- */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -455,6 +452,7 @@
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
 	s->s_op = &devpts_sops;
+	s->s_d_op = &simple_dentry_operations;
 	s->s_time_gran = 1;
 
 	error = -ENOMEM;
@@ -623,7 +621,8 @@
 
 	dentry->d_fsdata = NULL;
 	drop_nlink(dentry->d_inode);
-	d_delete(dentry);
+	fsnotify_unlink(d_inode(dentry->d_parent), dentry);
+	d_drop(dentry);
 	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1991460..9329ced 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/direct-io.c
  *
@@ -240,9 +241,8 @@
 	}
 }
 
-/**
+/*
  * dio_complete() - called when all DIO BIO I/O has been completed
- * @offset: the byte offset in the file of the completed operation
  *
  * This drops i_dio_count, lets interested parties know that a DIO operation
  * has completed, and calculates the resulting return code for the operation.
@@ -518,7 +518,7 @@
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
 		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
+		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
 			io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
@@ -537,9 +537,8 @@
  */
 static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
 {
-	struct bio_vec *bvec;
-	unsigned i;
 	blk_status_t err = bio->bi_status;
+	bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty;
 
 	if (err) {
 		if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
@@ -548,17 +547,10 @@
 			dio->io_error = -EIO;
 	}
 
-	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
+	if (dio->is_async && should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
-		bio_for_each_segment_all(bvec, bio, i) {
-			struct page *page = bvec->bv_page;
-
-			if (dio->op == REQ_OP_READ && !PageCompound(page) &&
-					dio->should_dirty)
-				set_page_dirty_lock(page);
-			put_page(page);
-		}
+		bio_release_pages(bio, should_dirty);
 		bio_put(bio);
 	}
 	return err;
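
bio_release_pages() subsumes the removed loop: it walks every segment, optionally marks each page dirty under the page lock, and drops the page references, with the dirty decision now computed once up front. The completion idiom after this change, as a hedged sketch:

#include <linux/bio.h>

static void finish_sync_read_bio(struct bio *bio, bool should_dirty)
{
        bio_release_pages(bio, should_dirty);   /* dirty (maybe) + put pages */
        bio_put(bio);                           /* then drop the bio itself */
}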
@@ -679,6 +671,7 @@
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	int create;
 	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+	loff_t i_size;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -708,8 +701,8 @@
 		 */
 		create = dio->op == REQ_OP_WRITE;
 		if (dio->flags & DIO_SKIP_HOLES) {
-			if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
-							i_blkbits))
+			i_size = i_size_read(dio->inode);
+			if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
 				create = 0;
 		}
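
The added i_size guard fixes direct writes into empty files on DIO_SKIP_HOLES filesystems: with i_size == 0 the old expression computed (0 - 1) >> i_blkbits == -1 as a signed loff_t, and comparing that against the unsigned sector_t fs_startblk promoted it to a huge value, so create was always forced to 0 and block allocation never happened. A small standalone demonstration of the signed/unsigned trap (plain userspace C, illustrative values):

#include <stdio.h>

int main(void)
{
        long long i_size = 0;                   /* empty file */
        unsigned long long fs_startblk = 5;     /* sector_t is unsigned */
        unsigned int i_blkbits = 9;

        /* (0 - 1) >> 9 == -1, promoted to ULLONG_MAX in the comparison */
        if (fs_startblk <= (unsigned long long)((i_size - 1) >> i_blkbits))
                printf("old check: hole assumed, create forced to 0\n");

        /* the new form short-circuits on a zero i_size */
        if (!(i_size && fs_startblk <=
              (unsigned long long)((i_size - 1) >> i_blkbits)))
                printf("new check: allocation allowed\n");
        return 0;
}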
 
@@ -1265,6 +1258,8 @@
 	} else {
 		dio->op = REQ_OP_READ;
 	}
+	if (iocb->ki_flags & IOCB_HIPRI)
+		dio->op_flags |= REQ_HIPRI;
 
 	/*
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
@@ -1313,7 +1308,7 @@
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
-	dio->should_dirty = (iter->type == ITER_IOVEC);
+	dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
 	sdio.iter = iter;
 	sdio.final_block_in_request = end >> blkbits;
 
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index e4242c3..f82a495 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on INET
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 562fa8c..283c7b9 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -292,6 +290,8 @@
 		flush_workqueue(ls->ls_callback_wq);
 }
 
+#define MAX_CB_QUEUE 25
+
 void dlm_callback_resume(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
@@ -302,15 +302,23 @@
 	if (!ls->ls_callback_wq)
 		return;
 
+more:
 	mutex_lock(&ls->ls_cb_mutex);
 	list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
 		list_del_init(&lkb->lkb_cb_list);
 		queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
 		count++;
+		if (count == MAX_CB_QUEUE)
+			break;
 	}
 	mutex_unlock(&ls->ls_cb_mutex);
 
 	if (count)
 		log_rinfo(ls, "dlm_callback_resume %d", count);
+	if (count == MAX_CB_QUEUE) {
+		count = 0;
+		cond_resched();
+		goto more;
+	}
 }
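
dlm_callback_resume() now requeues delayed callbacks in batches of MAX_CB_QUEUE (25), dropping ls_cb_mutex and calling cond_resched() between rounds, so a long ls_cb_delay list can no longer monopolize the CPU under the mutex. The shape of the pattern, reduced to a hedged sketch with a hypothetical context and helper:

#include <linux/mutex.h>
#include <linux/sched.h>

#define BATCH 25

struct my_ctx {
        struct mutex lock;
        /* ... list of delayed items ... */
};

/* hypothetical helper: requeues one delayed item, returns 0 when empty */
int requeue_one(struct my_ctx *ctx);

static void resume_all(struct my_ctx *ctx)
{
        int count;

more:
        count = 0;
        mutex_lock(&ctx->lock);
        while (count < BATCH && requeue_one(ctx))
                count++;
        mutex_unlock(&ctx->lock);

        if (count == BATCH) {
                cond_resched();         /* breathe between full batches */
                goto more;
        }
}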
 
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 757b551..181ad7d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 1270551..3b21082 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 6041eec..2b471aa 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index fa08448..d6bbccb 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -739,7 +737,7 @@
 	debugfs_remove(ls->ls_debug_toss_dentry);
 }
 
-int dlm_create_debug_file(struct dlm_ls *ls)
+void dlm_create_debug_file(struct dlm_ls *ls)
 {
 	char name[DLM_LOCKSPACE_LEN + 8];
 
@@ -750,8 +748,6 @@
 						      dlm_root,
 						      ls,
 						      &format1_fops);
-	if (!ls->ls_debug_rsb_dentry)
-		goto fail;
 
 	/* format 2 */
 
@@ -763,8 +759,6 @@
 							dlm_root,
 							ls,
 							&format2_fops);
-	if (!ls->ls_debug_locks_dentry)
-		goto fail;
 
 	/* format 3 */
 
@@ -776,8 +770,6 @@
 						      dlm_root,
 						      ls,
 						      &format3_fops);
-	if (!ls->ls_debug_all_dentry)
-		goto fail;
 
 	/* format 4 */
 
@@ -789,8 +781,6 @@
 						       dlm_root,
 						       ls,
 						       &format4_fops);
-	if (!ls->ls_debug_toss_dentry)
-		goto fail;
 
 	memset(name, 0, sizeof(name));
 	snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
@@ -800,21 +790,12 @@
 							  dlm_root,
 							  ls,
 							  &waiters_fops);
-	if (!ls->ls_debug_waiters_dentry)
-		goto fail;
-
-	return 0;
-
- fail:
-	dlm_delete_debug_file(ls);
-	return -ENOMEM;
 }
 
-int __init dlm_register_debugfs(void)
+void __init dlm_register_debugfs(void)
 {
 	mutex_init(&debug_buf_lock);
 	dlm_root = debugfs_create_dir("dlm", NULL);
-	return dlm_root ? 0 : -ENOMEM;
 }
 
 void dlm_unregister_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index d975851..10c36ae 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
index 4175063..03844d0 100644
--- a/fs/dlm/dir.h
+++ b/fs/dlm/dir.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 748e8d5..416d9de 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -721,14 +719,14 @@
 void dlm_plock_exit(void);
 
 #ifdef CONFIG_DLM_DEBUG
-int dlm_register_debugfs(void);
+void dlm_register_debugfs(void);
 void dlm_unregister_debugfs(void);
-int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_create_debug_file(struct dlm_ls *ls);
 void dlm_delete_debug_file(struct dlm_ls *ls);
 #else
-static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_register_debugfs(void) { }
 static inline void dlm_unregister_debugfs(void) { }
-static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_create_debug_file(struct dlm_ls *ls) { }
 static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
 #endif
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index cc91963..18d8159 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -1209,6 +1207,7 @@
 
 	if (rv < 0) {
 		log_error(ls, "create_lkb idr error %d", rv);
+		dlm_free_lkb(lkb);
 		return rv;
 	}
 
@@ -4179,6 +4178,7 @@
 			  (unsigned long long)lkb->lkb_recover_seq,
 			  ms->m_header.h_nodeid, ms->m_lkid);
 		error = -ENOENT;
+		dlm_put_lkb(lkb);
 		goto fail;
 	}
 
@@ -4232,6 +4232,7 @@
 			  lkb->lkb_id, lkb->lkb_remid,
 			  ms->m_header.h_nodeid, ms->m_lkid);
 		error = -ENOENT;
+		dlm_put_lkb(lkb);
 		goto fail;
 	}
 
@@ -5792,20 +5793,20 @@
 			goto out;
 		}
 	}
-
-	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
-	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
-	   lock and that lkb_astparam is the dlm_user_args structure. */
-
 	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
 			      fake_astfn, ua, fake_bastfn, &args);
-	lkb->lkb_flags |= DLM_IFL_USER;
-
 	if (error) {
+		kfree(ua->lksb.sb_lvbptr);
+		ua->lksb.sb_lvbptr = NULL;
+		kfree(ua);
 		__put_lkb(ls, lkb);
 		goto out;
 	}
 
+	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
+	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   lock and that lkb_astparam is the dlm_user_args structure. */
+	lkb->lkb_flags |= DLM_IFL_USER;
 	error = request_lock(ls, lkb, name, namelen, &args);
 
 	switch (error) {
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ed8ebd3..456c6ec 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5ba94be..afb8340 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -160,6 +158,7 @@
 	&dlm_attr_recover_nodeid.attr,
 	NULL,
 };
+ATTRIBUTE_GROUPS(dlm);
 
 static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
 			     char *buf)
@@ -189,7 +188,7 @@
 };
 
 static struct kobj_type dlm_ktype = {
-	.default_attrs = dlm_attrs,
+	.default_groups = dlm_groups,
 	.sysfs_ops     = &dlm_attr_ops,
 	.release       = lockspace_kobj_release,
 };
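
This is the tree-wide migration from kobj_type.default_attrs to .default_groups: ATTRIBUTE_GROUPS(dlm) takes the existing dlm_attrs array and generates the dlm_group attribute_group plus the NULL-terminated dlm_groups pointer array that .default_groups consumes. The macro's effect, sketched for a hypothetical 'foo' object:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobj_attribute foo_attr_a;  /* hypothetical; show/store elided */

static struct attribute *foo_attrs[] = {
        &foo_attr_a.attr,
        NULL,
};
ATTRIBUTE_GROUPS(foo);          /* emits foo_group and foo_groups[] */

static struct kobj_type foo_ktype = {
        .default_groups = foo_groups,
};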
@@ -431,7 +430,7 @@
 	int do_unreg = 0;
 	int namelen = strlen(name);
 
-	if (namelen > DLM_LOCKSPACE_LEN)
+	if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
 		return -EINVAL;
 
 	if (!lvblen || (lvblen % 8))
@@ -680,11 +679,9 @@
 	kfree(ls->ls_recover_buf);
  out_lkbidr:
 	idr_destroy(&ls->ls_lkbidr);
-	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
-		if (ls->ls_remove_names[i])
-			kfree(ls->ls_remove_names[i]);
-	}
  out_rsbtbl:
+	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
+		kfree(ls->ls_remove_names[i]);
 	vfree(ls->ls_rsbtbl);
  out_lsfree:
 	if (do_unreg)
@@ -807,6 +804,7 @@
 
 	dlm_delete_debug_file(ls);
 
+	idr_destroy(&ls->ls_recover_idr);
 	kfree(ls->ls_recover_buf);
 
 	/*
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index f879f87..a78d853 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a5e4a22..3951d39 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -674,7 +672,7 @@
 		nvec = 2;
 	}
 	len = iov[0].iov_len + iov[1].iov_len;
-	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len);
+	iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
 
 	r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
 	if (ret <= 0)
@@ -1089,12 +1087,12 @@
 	 * since O_NONBLOCK argument in connect() function does not work here,
 	 * then, we should restore the default value of this attribute.
 	 */
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
 			  sizeof(tv));
 	result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
 				   0);
 	memset(&tv, 0, sizeof(tv));
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
 			  sizeof(tv));
 
 	if (result == -EINPROGRESS)
@@ -1630,8 +1628,10 @@
 
 static void work_stop(void)
 {
-	destroy_workqueue(recv_workqueue);
-	destroy_workqueue(send_workqueue);
+	if (recv_workqueue)
+		destroy_workqueue(recv_workqueue);
+	if (send_workqueue)
+		destroy_workqueue(send_workqueue);
 }
 
 static int work_start(void)
@@ -1691,13 +1691,17 @@
 	struct hlist_node *n;
 	struct connection *con;
 
-	flush_workqueue(recv_workqueue);
-	flush_workqueue(send_workqueue);
+	if (recv_workqueue)
+		flush_workqueue(recv_workqueue);
+	if (send_workqueue)
+		flush_workqueue(send_workqueue);
 	do {
 		ok = 1;
 		foreach_conn(stop_conn);
-		flush_workqueue(recv_workqueue);
-		flush_workqueue(send_workqueue);
+		if (recv_workqueue)
+			flush_workqueue(recv_workqueue);
+		if (send_workqueue)
+			flush_workqueue(send_workqueue);
 		for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
 			hlist_for_each_entry_safe(con, n,
 						  &connection_hash[i], list) {
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 67462e5..687b289 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
index cc3e92f..09052d9 100644
--- a/fs/dlm/lvb_table.h
+++ b/fs/dlm/lvb_table.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 8e1b618..afc66a1 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -37,9 +35,7 @@
 	if (error)
 		goto out_lockspace;
 
-	error = dlm_register_debugfs();
-	if (error)
-		goto out_config;
+	dlm_register_debugfs();
 
 	error = dlm_user_init();
 	if (error)
@@ -63,7 +59,6 @@
 	dlm_user_exit();
  out_debug:
 	dlm_unregister_debugfs();
- out_config:
 	dlm_config_exit();
  out_lockspace:
 	dlm_lockspace_exit();
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 3fda383..7ad83de 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -671,7 +669,7 @@
 int dlm_ls_start(struct dlm_ls *ls)
 {
 	struct dlm_recover *rv, *rv_old;
-	struct dlm_config_node *nodes;
+	struct dlm_config_node *nodes = NULL;
 	int error, count;
 
 	rv = kzalloc(sizeof(*rv), GFP_NOFS);
@@ -680,7 +678,7 @@
 
 	error = dlm_config_nodes(ls->ls_name, &nodes, &count);
 	if (error < 0)
-		goto fail;
+		goto fail_rv;
 
 	spin_lock(&ls->ls_recover_lock);
 
@@ -712,8 +710,9 @@
 	return 0;
 
  fail:
-	kfree(rv);
 	kfree(nodes);
+ fail_rv:
+	kfree(rv);
 	return error;
 }
 
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 3deb706..433b2fa 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 7cd24bc..5918f4d 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
@@ -38,10 +36,8 @@
 
 void dlm_memory_exit(void)
 {
-	if (lkb_cache)
-		kmem_cache_destroy(lkb_cache);
-	if (rsb_cache)
-		kmem_cache_destroy(rsb_cache);
+	kmem_cache_destroy(lkb_cache);
+	kmem_cache_destroy(rsb_cache);
 }
 
 char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -86,8 +82,7 @@
 		struct dlm_user_args *ua;
 		ua = lkb->lkb_ua;
 		if (ua) {
-			if (ua->lksb.sb_lvbptr)
-				kfree(ua->lksb.sb_lvbptr);
+			kfree(ua->lksb.sb_lvbptr);
 			kfree(ua);
 		}
 	}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 177c11c..4f218ea 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f3396c6..921322d 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 95852a5..2e122e8 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 43a96c3..e7f5503 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
  */
 
 #include <net/genetlink.h>
@@ -68,6 +65,7 @@
 static const struct genl_ops dlm_nl_ops[] = {
 	{
 		.cmd	= DLM_CMD_HELLO,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit	= user_cmd,
 	},
 };
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index c7d5a2e..c38b2b8 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/fs.h>
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 70c6259..e3d9f72 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index 206723a..454d3c4 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index ce2aa54..8928e99 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
index d8c8738..235e0d2 100644
--- a/fs/dlm/recover.h
+++ b/fs/dlm/recover.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 6f4e1d4..85e2453 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
index 8856079..d1944dc 100644
--- a/fs/dlm/recoverd.h
+++ b/fs/dlm/recoverd.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 1695f1b..e89e0ff 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 10ce449..4e40346 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 2a66939..5264bac 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
  */
 
 #include <linux/miscdevice.h>
@@ -25,6 +22,7 @@
 #include "lvb_table.h"
 #include "user.h"
 #include "ast.h"
+#include "config.h"
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
@@ -404,7 +402,7 @@
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	error = dlm_new_lockspace(params->name, NULL, params->flags,
+	error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
 				  DLM_USER_LVB_LEN, NULL, NULL, NULL,
 				  &lockspace);
 	if (error)
@@ -702,7 +700,7 @@
 	result.version[0] = DLM_DEVICE_VERSION_MAJOR;
 	result.version[1] = DLM_DEVICE_VERSION_MINOR;
 	result.version[2] = DLM_DEVICE_VERSION_PATCH;
-	memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+	memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr));
 	result.user_lksb = ua->user_lksb;
 
 	/* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
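The memcpy change above deliberately copies only the leading, user-visible part of struct dlm_lksb: the members from sb_lvbptr onward include a kernel pointer, and bounding the copy with offsetof() keeps that pointer from ever reaching userspace. A minimal sketch of the pattern (struct and field names are hypothetical, not the dlm ones):

	#include <stddef.h>
	#include <string.h>

	struct ksb {
		int status;		/* user-visible */
		unsigned int flags;	/* user-visible */
		void *kptr;		/* kernel-internal pointer: must not leak */
	};

	static void copy_public_part(struct ksb *dst, const struct ksb *src)
	{
		/* copy only the members laid out before the private pointer */
		memcpy(dst, src, offsetof(struct ksb, kptr));
	}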
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 00499ab..6b9bce6 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
  */
 
 #ifndef __USER_DOT_H__
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index e36520a..cfd0d00 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
index 2b25915..cc719ca 100644
--- a/fs/dlm/util.h
+++ b/fs/dlm/util.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /******************************************************************************
 *******************************************************************************
 **
 **  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
 **
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
 **
 *******************************************************************************
 ******************************************************************************/
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 8237701..d31b6c7 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -21,8 +21,13 @@
 	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
+		/*
+		 * We must skip inodes in an unusual state. We could also skip
+		 * inodes without pages, but we deliberately don't when a
+		 * reschedule is due, so cond_resched() can avoid softlockups.
+		 */
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    (inode->i_mapping->nrpages == 0)) {
+		    (inode->i_mapping->nrpages == 0 && !need_resched())) {
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
@@ -30,6 +35,7 @@
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&sb->s_inode_list_lock);
 
+		cond_resched();
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
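The additions above are a standard defence against softlockups on very long list walks: page-less inodes are normally skipped, but not when need_resched() is set, so the walk reaches the point where both locks are dropped and cond_resched() runs. A generic sketch of the same lock-drop pattern, with hypothetical item/list names:

	#include <linux/list.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	struct item {
		struct list_head link;
		/* ... payload ... */
	};

	/* hypothetical walk over a long, lock-protected list */
	static void walk_all(spinlock_t *lock, struct list_head *head)
	{
		struct item *it;

		spin_lock(lock);
		list_for_each_entry(it, head, link) {
			/* ... cheap per-item work under the lock ... */
			if (need_resched()) {
				spin_unlock(lock);
				cond_resched();
				spin_lock(lock);
				/* a real walk must pin 'it' (as drop_caches
				 * pins toput_inode) to resume safely */
			}
		}
		spin_unlock(lock);
	}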
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 434aa31..522c35d 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support"
 	depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index 49678a6..4f2cc5b 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux eCryptfs
 #
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f..f91db24 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -6,21 +7,6 @@
  * Copyright (C) 2004-2007 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
  *   		Michael C. Thompson <mcthomps@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <crypto/hash.h>
@@ -37,6 +23,7 @@
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 #include <linux/kernel.h>
+#include <linux/xattr.h>
 #include "ecryptfs_kernel.h"
 
 #define DECRYPT		0
@@ -68,7 +55,6 @@
 	int err;
 
 	desc->tfm = tfm;
-	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 	err = crypto_shash_digest(desc, src, len, dst);
 	shash_desc_zero(desc);
 	return err;
@@ -610,7 +596,8 @@
 				full_alg_name);
 		goto out_free;
 	}
-	crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	crypto_skcipher_set_flags(crypt_stat->tfm,
+				  CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
 	rc = 0;
 out_free:
 	kfree(full_alg_name);
@@ -874,13 +861,10 @@
  * @crypt_stat: The cryptographic context
  * @page_virt: Source data to be parsed
  * @bytes_read: Updated with the number of bytes read
- *
- * Returns zero on success; non-zero if the flag set is invalid
  */
-static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
+static void ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
 				  char *page_virt, int *bytes_read)
 {
-	int rc = 0;
 	int i;
 	u32 flags;
 
@@ -893,7 +877,6 @@
 	/* Version is in top 8 bits of the 32-bit flag vector */
 	crypt_stat->file_version = ((flags >> 24) & 0xFF);
 	(*bytes_read) = 4;
-	return rc;
 }
 
 /**
@@ -1018,8 +1001,10 @@
 
 	rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
 				 inode);
-	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
-		return rc >= 0 ? -EINVAL : rc;
+	if (rc < 0)
+		return rc;
+	else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return -EINVAL;
 	rc = ecryptfs_validate_marker(marker);
 	if (!rc)
 		ecryptfs_i_size_init(file_size, inode);
@@ -1129,9 +1114,21 @@
 				 char *page_virt, size_t size)
 {
 	int rc;
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	struct inode *lower_inode = d_inode(lower_dentry);
 
-	rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode,
-			       ECRYPTFS_XATTR_NAME, page_virt, size, 0);
+	if (!(lower_inode->i_opflags & IOP_XATTR)) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+
+	inode_lock(lower_inode);
+	rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
+			    page_virt, size, 0);
+	if (!rc && ecryptfs_inode)
+		fsstack_copy_attr_all(ecryptfs_inode, lower_inode);
+	inode_unlock(lower_inode);
+out:
 	return rc;
 }
 
@@ -1305,12 +1302,7 @@
 	if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
 		ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry));
 	offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
-	rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
-				    &bytes_read);
-	if (rc) {
-		ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
-		goto out;
-	}
+	ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read);
 	if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
 		ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
 				"file version [%d] is supported by this "
@@ -1381,8 +1373,10 @@
 				     ecryptfs_inode_to_lower(inode),
 				     ECRYPTFS_XATTR_NAME, file_size,
 				     ECRYPTFS_SIZE_AND_MARKER_BYTES);
-	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
-		return rc >= 0 ? -EINVAL : rc;
+	if (rc < 0)
+		return rc;
+	else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return -EINVAL;
 	rc = ecryptfs_validate_marker(marker);
 	if (!rc)
 		ecryptfs_i_size_init(file_size, inode);
@@ -1590,7 +1584,7 @@
 		       "[%s]; rc = [%d]\n", full_alg_name, rc);
 		goto out;
 	}
-	crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
 	if (*key_size == 0)
 		*key_size = crypto_skcipher_default_keysize(*key_tfm);
 	get_random_bytes(dummy_key, *key_size);
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
index 3d2bdf5..1f65e99 100644
--- a/fs/ecryptfs/debug.c
+++ b/fs/ecryptfs/debug.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  * Functions only useful for debugging.
  *
  * Copyright (C) 2006 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include "ecryptfs_kernel.h"
@@ -97,25 +83,9 @@
  */
 void ecryptfs_dump_hex(char *data, int bytes)
 {
-	int i = 0;
-	int add_newline = 1;
-
 	if (ecryptfs_verbosity < 1)
 		return;
-	if (bytes != 0) {
-		printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
-		i++;
-	}
-	while (i < bytes) {
-		printk("0x%.2x.", (unsigned char)data[i]);
-		i++;
-		if (i % 16 == 0) {
-			printk("\n");
-			add_newline = 0;
-		} else
-			add_newline = 1;
-	}
-	if (add_newline)
-		printk("\n");
-}
 
+	print_hex_dump(KERN_DEBUG, "ecryptfs: ", DUMP_PREFIX_OFFSET, 16, 1,
+		       data, bytes, false);
+}
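The rewrite above retires a hand-rolled printk loop in favour of the kernel's print_hex_dump() helper, which handles the 16-bytes-per-line formatting, offset prefixes and final newline itself. A small hedged sketch mirroring the call shape used above:

	#include <linux/printk.h>

	static void dump_buf(const void *buf, size_t len)
	{
		/* rowsize 16, groupsize 1, offset prefix, no ASCII column --
		 * the same arguments ecryptfs_dump_hex() passes above */
		print_hex_dump(KERN_DEBUG, "ecryptfs: ", DUMP_PREFIX_OFFSET,
			       16, 1, buf, len, false);
	}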
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 63cd2c1..44606f0 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -5,21 +6,6 @@
  * Copyright (C) 2001-2003 Stony Brook University
  * Copyright (C) 2004-2006 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/dcache.h>
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e74cb2a..1c1a56b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /**
  * eCryptfs: Linux filesystem encryption layer
  * Kernel declarations.
@@ -8,21 +9,6 @@
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
  *              Trevor S. Highland <trevor.highland@gmail.com>
  *              Tyler Hicks <tyhicks@ou.edu>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #ifndef ECRYPTFS_KERNEL_H
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index b76a985..feecb57 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -6,21 +7,6 @@
  * Copyright (C) 2004-2007 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
  *   		Michael C. Thompson <mcthomps@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/file.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 49121e5..e23752d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -6,21 +7,6 @@
  * Copyright (C) 2004-2007 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
  *              Michael C. Thompsion <mcthomps@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/file.h>
@@ -142,13 +128,20 @@
 			      struct inode *inode)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
 	struct dentry *lower_dir_dentry;
+	struct inode *lower_dir_inode;
 	int rc;
 
-	dget(lower_dentry);
-	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+	lower_dir_inode = d_inode(lower_dir_dentry);
+	inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+	dget(lower_dentry);	// don't even try to make the lower negative
+	if (lower_dentry->d_parent != lower_dir_dentry)
+		rc = -EINVAL;
+	else if (d_unhashed(lower_dentry))
+		rc = -EINVAL;
+	else
+		rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
@@ -156,10 +149,11 @@
 	fsstack_copy_attr_times(dir, lower_dir_inode);
 	set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
 	inode->i_ctime = dir->i_ctime;
-	d_drop(dentry);
 out_unlock:
-	unlock_dir(lower_dir_dentry);
 	dput(lower_dentry);
+	inode_unlock(lower_dir_inode);
+	if (!rc)
+		d_drop(dentry);
 	return rc;
 }
 
@@ -325,9 +319,9 @@
 static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
 				     struct dentry *lower_dentry)
 {
-	struct inode *inode, *lower_inode = d_inode(lower_dentry);
+	struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+	struct inode *inode, *lower_inode;
 	struct ecryptfs_dentry_info *dentry_info;
-	struct vfsmount *lower_mnt;
 	int rc = 0;
 
 	dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
@@ -336,16 +330,23 @@
 		return ERR_PTR(-ENOMEM);
 	}
 
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
 	fsstack_copy_attr_atime(d_inode(dentry->d_parent),
-				d_inode(lower_dentry->d_parent));
+				d_inode(path->dentry));
 	BUG_ON(!d_count(lower_dentry));
 
 	ecryptfs_set_dentry_private(dentry, dentry_info);
-	dentry_info->lower_path.mnt = lower_mnt;
+	dentry_info->lower_path.mnt = mntget(path->mnt);
 	dentry_info->lower_path.dentry = lower_dentry;
 
-	if (d_really_is_negative(lower_dentry)) {
+	/*
+	 * negative dentry can go positive under us here - its parent is not
+	 * locked.  That's OK and that could happen just as we return from
+	 * ecryptfs_lookup() anyway.  Just need to be careful and fetch
+	 * ->d_inode only once - it's not stable here.
+	 */
+	lower_inode = READ_ONCE(lower_dentry->d_inode);
+
+	if (!lower_inode) {
 		/* We want to add because we couldn't find in lower */
 		d_add(dentry, NULL);
 		return NULL;
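The comment block above states the key rule for this lockless path: a pointer that can change under you must be loaded exactly once, otherwise a NULL test and a later dereference may observe two different values. The single-fetch idiom in isolation, with hypothetical type and function names:

	#include <linux/compiler.h>
	#include <linux/errno.h>

	struct payload;
	int consume_payload(struct payload *p);	/* hypothetical consumer */

	struct node {
		struct payload *data;	/* may flip NULL -> non-NULL concurrently */
	};

	static int node_consume(struct node *n)
	{
		/* fetch ->data once; every later use goes through the local copy */
		struct payload *p = READ_ONCE(n->data);

		if (!p)
			return -ENOENT;
		return consume_payload(p);
	}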
@@ -526,22 +527,30 @@
 {
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
+	struct inode *lower_dir_inode;
 	int rc;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	dget(dentry);
-	lower_dir_dentry = lock_parent(lower_dentry);
-	dget(lower_dentry);
-	rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
-	dput(lower_dentry);
-	if (!rc && d_really_is_positive(dentry))
+	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+	lower_dir_inode = d_inode(lower_dir_dentry);
+
+	inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+	dget(lower_dentry);	// don't even try to make the lower negative
+	if (lower_dentry->d_parent != lower_dir_dentry)
+		rc = -EINVAL;
+	else if (d_unhashed(lower_dentry))
+		rc = -EINVAL;
+	else
+		rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+	if (!rc) {
 		clear_nlink(d_inode(dentry));
-	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
-	set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
-	unlock_dir(lower_dir_dentry);
+		fsstack_copy_attr_times(dir, lower_dir_inode);
+		set_nlink(dir, lower_dir_inode->i_nlink);
+	}
+	dput(lower_dentry);
+	inode_unlock(lower_dir_inode);
 	if (!rc)
 		d_drop(dentry);
-	dput(dentry);
 	return rc;
 }
 
@@ -579,25 +588,32 @@
 	struct dentry *lower_new_dentry;
 	struct dentry *lower_old_dir_dentry;
 	struct dentry *lower_new_dir_dentry;
-	struct dentry *trap = NULL;
+	struct dentry *trap;
 	struct inode *target_inode;
 
 	if (flags)
 		return -EINVAL;
 
+	lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent);
+	lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent);
+
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
-	dget(lower_old_dentry);
-	dget(lower_new_dentry);
-	lower_old_dir_dentry = dget_parent(lower_old_dentry);
-	lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
 	target_inode = d_inode(new_dentry);
+
 	trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
-	/* source should not be ancestor of target */
-	if (trap == lower_old_dentry) {
-		rc = -EINVAL;
+	dget(lower_new_dentry);
+	rc = -EINVAL;
+	if (lower_old_dentry->d_parent != lower_old_dir_dentry)
 		goto out_lock;
-	}
+	if (lower_new_dentry->d_parent != lower_new_dir_dentry)
+		goto out_lock;
+	if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
+		goto out_lock;
+	/* source should not be ancestor of target */
+	if (trap == lower_old_dentry)
+		goto out_lock;
 	/* target should not be ancestor of source */
 	if (trap == lower_new_dentry) {
 		rc = -ENOTEMPTY;
@@ -615,11 +631,8 @@
 	if (new_dir != old_dir)
 		fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
 out_lock:
-	unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
-	dput(lower_new_dir_dentry);
-	dput(lower_old_dir_dentry);
 	dput(lower_new_dentry);
-	dput(lower_old_dentry);
+	unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
 	return rc;
 }
 
@@ -1130,7 +1143,7 @@
 	}
 }
 
-const struct xattr_handler ecryptfs_xattr_handler = {
+static const struct xattr_handler ecryptfs_xattr_handler = {
 	.prefix = "",  /* match anything */
 	.get = ecryptfs_xattr_get,
 	.set = ecryptfs_xattr_set,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e74fe84..216fbe6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  * In-kernel key management code.  Includes functions to parse and
@@ -8,21 +9,6 @@
  *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
  *              Michael C. Thompson <mcthomps@us.ibm.com>
  *              Trevor S. Highland <trevor.highland@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <crypto/hash.h>
@@ -769,7 +755,6 @@
 	}
 
 	s->hash_desc->tfm = s->hash_tfm;
-	s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
 	rc = crypto_shash_digest(s->hash_desc,
 				 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -1063,8 +1048,9 @@
 		       "rc = [%d]\n", __func__, rc);
 		goto out_free_unlock;
 	}
-	while (s->decrypted_filename[s->i] != '\0'
-	       && s->i < s->block_aligned_filename_size)
+
+	while (s->i < s->block_aligned_filename_size &&
+	       s->decrypted_filename[s->i] != '\0')
 		s->i++;
 	if (s->i == s->block_aligned_filename_size) {
 		printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
@@ -1626,9 +1612,9 @@
 	int rc = 0;
 
 	(*auth_tok_key) = request_key(&key_type_user, sig, NULL);
-	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
+	if (IS_ERR(*auth_tok_key)) {
 		(*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
-		if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
+		if (IS_ERR(*auth_tok_key)) {
 			printk(KERN_ERR "Could not find key with description: [%s]\n",
 			      sig);
 			rc = process_request_key_err(PTR_ERR(*auth_tok_key));
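The simplification above leans on the kernel's ERR_PTR convention: request_key() and ecryptfs_get_encrypted_key() encode failures as ERR_PTR(-errno) and never return NULL, so the removed !ptr tests were dead code. The idiom, sketched against a hypothetical lookup helper:

	#include <linux/err.h>

	struct token;
	struct token *lookup_token(const char *desc);	/* hypothetical: ERR_PTR on failure, never NULL */

	static int get_token(const char *desc, struct token **out)
	{
		struct token *t = lookup_token(desc);

		if (IS_ERR(t))
			return PTR_ERR(t);	/* decode the errno; no NULL check needed */
		*out = t;
		return 0;
	}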
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e00d45a..a7c903c 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/kthread.h>
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 025d66a..b8a7ce3 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -7,21 +8,6 @@
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
  *              Michael C. Thompson <mcthomps@us.ibm.com>
  *              Tyler Hicks <tyhicks@ou.edu>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/dcache.h>
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 9fdd5bc..d668e60 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2004-2008 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
  *		Tyler Hicks <tyhicks@ou.edu>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 2d1158e..742ece2 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index cdf358b..cffa0c1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  * This is where eCryptfs coordinates the symmetric encryption and
@@ -8,21 +9,6 @@
  * Copyright (C) 2001-2003 Stony Brook University
  * Copyright (C) 2004-2007 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/pagemap.h>
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index c596e7c..0438997 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2007 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 85411ce..6b1853f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * eCryptfs: Linux filesystem encryption layer
  *
@@ -6,21 +7,6 @@
  * Copyright (C) 2004-2006 International Business Machines Corp.
  *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
  *              Michael C. Thompson <mcthomps@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -67,9 +53,8 @@
 	return inode;
 }
 
-static void ecryptfs_i_callback(struct rcu_head *head)
+static void ecryptfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct ecryptfs_inode_info *inode_info;
 	inode_info = ecryptfs_inode_to_private(inode);
 
@@ -92,7 +77,6 @@
 	inode_info = ecryptfs_inode_to_private(inode);
 	BUG_ON(inode_info->lower_file);
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
-	call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
 
 /**
@@ -186,6 +170,7 @@
 const struct super_operations ecryptfs_sops = {
 	.alloc_inode = ecryptfs_alloc_inode,
 	.destroy_inode = ecryptfs_destroy_inode,
+	.free_inode = ecryptfs_free_inode,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
 	.evict_inode = ecryptfs_evict_inode,
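ecryptfs_i_callback() above becomes ecryptfs_free_inode() and the call_rcu() disappears: with the newer .free_inode super operation, the VFS itself waits out the RCU grace period and then invokes the callback, so a filesystem only releases its per-inode allocation. The shape of such a conversion, with hypothetical names:

	#include <linux/fs.h>
	#include <linux/kernel.h>
	#include <linux/slab.h>

	struct myfs_inode_info {
		/* ... fs-private fields ... */
		struct inode vfs_inode;
	};

	static struct kmem_cache *myfs_inode_cachep;

	/* invoked by the VFS after an RCU grace period -- no call_rcu() here */
	static void myfs_free_inode(struct inode *inode)
	{
		kmem_cache_free(myfs_inode_cachep,
				container_of(inode, struct myfs_inode_info,
					     vfs_inode));
	}

	static const struct super_operations myfs_sops = {
		.free_inode = myfs_free_inode,
		/* .alloc_inode, .destroy_inode (if still needed), ... */
	};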
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index c2499ef..edec8a1 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config EFIVAR_FS
 	tristate "EFI Variable filesystem"
 	depends on EFI
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
index 955d478..0b1c5e6 100644
--- a/fs/efivarfs/Makefile
+++ b/fs/efivarfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the efivarfs filesystem
 #
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8e56842..e9e27a2 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Red Hat, Inc.
  * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/efi.h>
@@ -110,16 +107,22 @@
 	return size;
 }
 
-static int
-efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+static inline unsigned int efivarfs_getflags(struct inode *inode)
 {
-	struct inode *inode = file->f_mapping->host;
 	unsigned int i_flags;
 	unsigned int flags = 0;
 
 	i_flags = inode->i_flags;
 	if (i_flags & S_IMMUTABLE)
 		flags |= FS_IMMUTABLE_FL;
+	return flags;
+}
+
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_mapping->host;
+	unsigned int flags = efivarfs_getflags(inode);
 
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
@@ -132,6 +135,7 @@
 	struct inode *inode = file->f_mapping->host;
 	unsigned int flags;
 	unsigned int i_flags = 0;
+	unsigned int oldflags = efivarfs_getflags(inode);
 	int error;
 
 	if (!inode_owner_or_capable(inode))
@@ -143,9 +147,6 @@
 	if (flags & ~FS_IMMUTABLE_FL)
 		return -EOPNOTSUPP;
 
-	if (!capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
 	if (flags & FS_IMMUTABLE_FL)
 		i_flags |= S_IMMUTABLE;
 
@@ -155,12 +156,16 @@
 		return error;
 
 	inode_lock(inode);
+
+	error = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (error)
+		goto out;
+
 	inode_set_flags(inode, i_flags, S_IMMUTABLE);
+out:
 	inode_unlock(inode);
-
 	mnt_drop_write_file(file);
-
-	return 0;
+	return error;
 }
 
 static long
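The diff above drops the open-coded capable(CAP_LINUX_IMMUTABLE) test and instead calls vfs_ioc_setflags_prepare() under the inode lock: the common helper compares old and new flags and performs the capability check only when the protected bits actually change, and doing it under the lock closes the race against a concurrent flag update. A condensed sketch of the resulting ioctl flow (function names here are illustrative):

	#include <linux/fs.h>
	#include <linux/mount.h>

	/* illustrative: map in-core S_IMMUTABLE to the FS_*_FL view,
	 * as efivarfs_getflags() does above */
	static unsigned int fsflags_of(struct inode *inode)
	{
		return (inode->i_flags & S_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
	}

	static int ioc_set_immutable(struct file *file, unsigned int flags)
	{
		struct inode *inode = file->f_mapping->host;
		unsigned int i_flags = (flags & FS_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
		int error;

		error = mnt_want_write_file(file);
		if (error)
			return error;

		inode_lock(inode);
		/* checks CAP_LINUX_IMMUTABLE iff immutable/append bits change */
		error = vfs_ioc_setflags_prepare(inode, fsflags_of(inode), flags);
		if (!error)
			inode_set_flags(inode, i_flags, S_IMMUTABLE);
		inode_unlock(inode);

		mnt_drop_write_file(file);
		return error;
	}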
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 8c6ab6c..96c0c86 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Red Hat, Inc.
  * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/efi.h>
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index b450518..30ae44c 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2012 Red Hat, Inc.
  * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef EFIVAR_FS_INTERNAL_H
 #define EFIVAR_FS_INTERNAL_H
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 5b68e42..fa4f644 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -1,15 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Red Hat, Inc.
  * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/ctype.h>
 #include <linux/efi.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/ucs2_string.h>
@@ -31,8 +29,6 @@
 	.evict_inode = efivarfs_evict_inode,
 };
 
-static struct super_block *efivarfs_sb;
-
 /*
  * Compare two efivarfs file names.
  *
@@ -191,14 +187,12 @@
 	return 0;
 }
 
-static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
+static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode = NULL;
 	struct dentry *root;
 	int err;
 
-	efivarfs_sb = sb;
-
 	sb->s_maxbytes          = MAX_LFS_FILESIZE;
 	sb->s_blocksize         = PAGE_SIZE;
 	sb->s_blocksize_bits    = PAGE_SHIFT;
@@ -226,16 +220,24 @@
 	return err;
 }
 
-static struct dentry *efivarfs_mount(struct file_system_type *fs_type,
-				    int flags, const char *dev_name, void *data)
+static int efivarfs_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, data, efivarfs_fill_super);
+	return get_tree_single(fc, efivarfs_fill_super);
+}
+
+static const struct fs_context_operations efivarfs_context_ops = {
+	.get_tree	= efivarfs_get_tree,
+};
+
+static int efivarfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &efivarfs_context_ops;
+	return 0;
 }
 
 static void efivarfs_kill_sb(struct super_block *sb)
 {
 	kill_litter_super(sb);
-	efivarfs_sb = NULL;
 
 	/* Remove all entries and destroy */
 	__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
@@ -244,7 +246,7 @@
 static struct file_system_type efivarfs_type = {
 	.owner   = THIS_MODULE,
 	.name    = "efivarfs",
-	.mount   = efivarfs_mount,
+	.init_fs_context = efivarfs_init_fs_context,
 	.kill_sb = efivarfs_kill_sb,
 };
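efivarfs here converts from the legacy .mount callback to the fs_context API: .init_fs_context installs an operations table, its .get_tree calls get_tree_single() (the fs_context counterpart of mount_single()), and the fill-super callback now takes the fs_context as its second argument. A bare-bones skeleton of the same conversion, with hypothetical names:

	#include <linux/fs.h>
	#include <linux/fs_context.h>
	#include <linux/module.h>

	static int myfs_fill_super(struct super_block *sb, struct fs_context *fc)
	{
		/* set s_maxbytes, s_blocksize, s_op, allocate the root, ... */
		return 0;
	}

	static int myfs_get_tree(struct fs_context *fc)
	{
		/* one superblock shared by every mount, like mount_single() */
		return get_tree_single(fc, myfs_fill_super);
	}

	static const struct fs_context_operations myfs_context_ops = {
		.get_tree	= myfs_get_tree,
	};

	static int myfs_init_fs_context(struct fs_context *fc)
	{
		fc->ops = &myfs_context_ops;
		return 0;
	}

	static struct file_system_type myfs_type = {
		.owner		= THIS_MODULE,
		.name		= "myfs",
		.init_fs_context = myfs_init_fs_context,
		.kill_sb	= kill_litter_super,
	};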
 
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index d020e3c..2df1bac 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config EFS_FS
 	tristate "EFS file system support (read only)"
 	depends on BLOCK
diff --git a/fs/efs/Makefile b/fs/efs/Makefile
index 963543d..85e5b88 100644
--- a/fs/efs/Makefile
+++ b/fs/efs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux efs-filesystem routines.
 #
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index cdf0872..89e73a6 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * inode.c
  *
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 6ffb7ba..4a6ebff 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -74,17 +74,11 @@
 	return &ei->vfs_inode;
 }
 
-static void efs_i_callback(struct rcu_head *head)
+static void efs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
-static void efs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, efs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
@@ -122,7 +116,7 @@
 
 static const struct super_operations efs_superblock_operations = {
 	.alloc_inode	= efs_alloc_inode,
-	.destroy_inode	= efs_destroy_inode,
+	.free_inode	= efs_free_inode,
 	.statfs		= efs_statfs,
 	.remount_fs	= efs_remount,
 };
@@ -263,6 +257,8 @@
 	if (!sb)
 		return -ENOMEM;
 	s->s_fs_info = sb;
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
  
 	s->s_magic		= EFS_SUPER_MAGIC;
 	if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 0000000..9d634d3
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config EROFS_FS
+	tristate "EROFS filesystem support"
+	depends on BLOCK
+	help
+	  EROFS (Enhanced Read-Only File System) is a lightweight
+	  read-only file system with modern designs (e.g. page-sized
+	  blocks, inline xattrs/data) for scenarios with demanding
+	  read-only performance requirements, such as Android OS on
+	  mobile phones and live CDs.
+
+	  It also supports fixed-sized output compression, which
+	  improves storage density while keeping relatively high
+	  compression ratios, making it well suited to embedded
+	  devices with limited memory.
+
+	  If unsure, say N.
+
+config EROFS_FS_DEBUG
+	bool "EROFS debugging feature"
+	depends on EROFS_FS
+	help
+	  Print debugging messages and enable more BUG_ONs that check
+	  filesystem consistency and hunt for potential issues
+	  aggressively; useful for an Android eng build, for example.
+
+	  For daily use, say N.
+
+config EROFS_FS_XATTR
+	bool "EROFS extended attributes"
+	depends on EROFS_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+	bool "EROFS Access Control Lists"
+	depends on EROFS_FS_XATTR
+	select FS_POSIX_ACL
+	default y
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+	bool "EROFS Security Labels"
+	depends on EROFS_FS_XATTR
+	default y
+	help
+	  Security labels provide an access control facility used by Linux
+	  Security Modules (LSMs) such as AppArmor, SELinux, Smack and
+	  TOMOYO Linux. This option enables an extended attribute handler
+	  for file security labels in the erofs filesystem; it therefore
+	  requires extended attribute support to be enabled as well.
+
+	  If you are not using a security module, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	select LZ4_DECOMPRESS
+	default y
+	help
+	  Enable fixed-sized output compression for EROFS.
+
+	  If you don't want the compression feature, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Sets the maximum number of pages a compressed
+	  physical cluster may span.
+
+	  For example, if the files in an image were compressed
+	  in 8k units, this hard limit must not be set below 2
+	  (two 4k pages); otherwise this kernel will refuse to
+	  mount the image.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 0000000..46f2aa4
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+EROFS_VERSION = "1.0"
+
+ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 0000000..07d279f
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_COMPRESS_H
+#define __EROFS_FS_COMPRESS_H
+
+#include "internal.h"
+
+enum {
+	Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+	Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
+
+struct z_erofs_decompress_req {
+	struct super_block *sb;
+	struct page **in, **out;
+
+	unsigned short pageofs_out;
+	unsigned int inputsize, outputsize;
+
+	/* indicate the algorithm will be used for decompression */
+	unsigned int alg;
+	bool inplace_io, partial_decoding;
+};
+
+/*
+ * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * used to distinguish temporarily allocated pages from
+ * file/cached pages and pages with a NULL mapping.
+ */
+#define Z_EROFS_MAPPING_STAGING         ((void *)0x5A110C8D)
+
+/* check if a page is marked as staging */
+static inline bool z_erofs_page_is_staging(struct page *page)
+{
+	return page->mapping == Z_EROFS_MAPPING_STAGING;
+}
+
+static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
+					   struct page *page)
+{
+	if (!z_erofs_page_is_staging(page))
+		return false;
+
+	/* staging pages should not be used by others at the same time */
+	if (page_ref_count(page) > 1)
+		put_page(page);
+	else
+		list_add(&page->lru, pagepool);
+	return true;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+		       struct list_head *pagepool);
+
+#endif
+
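Z_EROFS_MAPPING_STAGING above is a sentinel pointer: no real address_space can live at that constant, so a plain pointer comparison on page->mapping is enough to tell a temporary bounce page from a file-backed or anonymous one, without spending a page flag. The trick in miniature, with hypothetical names:

	#include <stdbool.h>

	/* hypothetical sentinel: an address no real object will ever occupy */
	#define TMP_OWNER ((void *)0x5A110C8D)

	struct buf {
		void *owner;	/* real owner, or TMP_OWNER for scratch buffers */
	};

	static bool buf_is_temporary(const struct buf *b)
	{
		return b->owner == TMP_OWNER;	/* one compare, no extra flag field */
	}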
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 0000000..fc3a8d8
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/prefetch.h>
+
+#include <trace/events/erofs.h>
+
+static void erofs_readendio(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	blk_status_t err = bio->bi_status;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		struct page *page = bvec->bv_page;
+
+		/* page is already locked */
+		DBG_BUGON(PageUptodate(page));
+
+		if (err)
+			SetPageError(page);
+		else
+			SetPageUptodate(page);
+
+		unlock_page(page);
+		/* page could be reclaimed now */
+	}
+	bio_put(bio);
+}
+
+struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
+{
+	struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+	struct page *page;
+
+	page = read_cache_page_gfp(mapping, blkaddr,
+				   mapping_gfp_constraint(mapping, ~__GFP_FS));
+	/* should already be PageUptodate */
+	if (!IS_ERR(page))
+		lock_page(page);
+	return page;
+}
+
+static int erofs_map_blocks_flatmode(struct inode *inode,
+				     struct erofs_map_blocks *map,
+				     int flags)
+{
+	int err = 0;
+	erofs_blk_t nblocks, lastblk;
+	u64 offset = map->m_la;
+	struct erofs_inode *vi = EROFS_I(inode);
+	bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+
+	trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
+
+	nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+	lastblk = nblocks - tailendpacking;
+
+	if (offset >= inode->i_size) {
+		/* leave out-of-bound access unmapped */
+		map->m_flags = 0;
+		map->m_plen = 0;
+		goto out;
+	}
+
+	/* there is no hole in flatmode */
+	map->m_flags = EROFS_MAP_MAPPED;
+
+	if (offset < blknr_to_addr(lastblk)) {
+		map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
+		map->m_plen = blknr_to_addr(lastblk) - offset;
+	} else if (tailendpacking) {
+		/* 2 - inode inline B: inode, [xattrs], inline last blk... */
+		struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+		map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
+			vi->xattr_isize + erofs_blkoff(map->m_la);
+		map->m_plen = inode->i_size - offset;
+
+		/* inline data should be located in one meta block */
+		if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+			erofs_err(inode->i_sb,
+				  "inline data cross block boundary @ nid %llu",
+				  vi->nid);
+			DBG_BUGON(1);
+			err = -EFSCORRUPTED;
+			goto err_out;
+		}
+
+		map->m_flags |= EROFS_MAP_META;
+	} else {
+		erofs_err(inode->i_sb,
+			  "internal error @ nid: %llu (size %llu), m_la 0x%llx",
+			  vi->nid, inode->i_size, map->m_la);
+		DBG_BUGON(1);
+		err = -EIO;
+		goto err_out;
+	}
+
+out:
+	map->m_llen = map->m_plen;
+
+err_out:
+	trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
+	return err;
+}
+
+int erofs_map_blocks(struct inode *inode,
+		     struct erofs_map_blocks *map, int flags)
+{
+	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
+		int err = z_erofs_map_blocks_iter(inode, map, flags);
+
+		if (map->mpage) {
+			put_page(map->mpage);
+			map->mpage = NULL;
+		}
+		return err;
+	}
+	return erofs_map_blocks_flatmode(inode, map, flags);
+}
+
+static inline struct bio *erofs_read_raw_page(struct bio *bio,
+					      struct address_space *mapping,
+					      struct page *page,
+					      erofs_off_t *last_block,
+					      unsigned int nblocks,
+					      bool ra)
+{
+	struct inode *const inode = mapping->host;
+	struct super_block *const sb = inode->i_sb;
+	erofs_off_t current_block = (erofs_off_t)page->index;
+	int err;
+
+	DBG_BUGON(!nblocks);
+
+	if (PageUptodate(page)) {
+		err = 0;
+		goto has_updated;
+	}
+
+	/* note that in the readpage case, bio is also NULL */
+	if (bio &&
+	    /* not continuous */
+	    *last_block + 1 != current_block) {
+submit_bio_retry:
+		submit_bio(bio);
+		bio = NULL;
+	}
+
+	if (!bio) {
+		struct erofs_map_blocks map = {
+			.m_la = blknr_to_addr(current_block),
+		};
+		erofs_blk_t blknr;
+		unsigned int blkoff;
+
+		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+		if (err)
+			goto err_out;
+
+		/* zero out the holed page */
+		if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+			zero_user_segment(page, 0, PAGE_SIZE);
+			SetPageUptodate(page);
+
+			/* imply err = 0, see erofs_map_blocks */
+			goto has_updated;
+		}
+
+		/* for RAW access mode, m_plen must be equal to m_llen */
+		DBG_BUGON(map.m_plen != map.m_llen);
+
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
+		/* deal with inline page */
+		if (map.m_flags & EROFS_MAP_META) {
+			void *vsrc, *vto;
+			struct page *ipage;
+
+			DBG_BUGON(map.m_plen > PAGE_SIZE);
+
+			ipage = erofs_get_meta_page(inode->i_sb, blknr);
+
+			if (IS_ERR(ipage)) {
+				err = PTR_ERR(ipage);
+				goto err_out;
+			}
+
+			vsrc = kmap_atomic(ipage);
+			vto = kmap_atomic(page);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
+			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
+			kunmap_atomic(vto);
+			kunmap_atomic(vsrc);
+			flush_dcache_page(page);
+
+			SetPageUptodate(page);
+			/* TODO: could we unlock the page earlier? */
+			unlock_page(ipage);
+			put_page(ipage);
+
+			/* imply err = 0, see erofs_map_blocks */
+			goto has_updated;
+		}
+
+		/* pa must be block-aligned for raw reading */
+		DBG_BUGON(erofs_blkoff(map.m_pa));
+
+		/* max # of continuous pages */
+		if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
+			nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
+		if (nblocks > BIO_MAX_PAGES)
+			nblocks = BIO_MAX_PAGES;
+
+		bio = bio_alloc(GFP_NOIO, nblocks);
+
+		bio->bi_end_io = erofs_readendio;
+		bio_set_dev(bio, sb->s_bdev);
+		bio->bi_iter.bi_sector = (sector_t)blknr <<
+			LOG_SECTORS_PER_BLOCK;
+		bio->bi_opf = REQ_OP_READ;
+	}
+
+	err = bio_add_page(bio, page, PAGE_SIZE, 0);
+	/* out of the extent or bio is full */
+	if (err < PAGE_SIZE)
+		goto submit_bio_retry;
+
+	*last_block = current_block;
+
+	/* submit the bio in advance in case it is followed by too many gaps */
+	if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
+		/* reset err to 0 so the submit path below returns success */
+		err = 0;
+		goto submit_bio_out;
+	}
+
+	return bio;
+
+err_out:
+	/* for sync reading, set page error immediately */
+	if (!ra) {
+		SetPageError(page);
+		ClearPageUptodate(page);
+	}
+has_updated:
+	unlock_page(page);
+
+	/* if updated manually, the continuous pages have a gap */
+	if (bio)
+submit_bio_out:
+		submit_bio(bio);
+	return err ? ERR_PTR(err) : NULL;
+}
+
+/*
+ * Since we don't have write or truncate flows, no inode
+ * locking needs to be held at the moment.
+ */
+static int erofs_raw_access_readpage(struct file *file, struct page *page)
+{
+	erofs_off_t last_block;
+	struct bio *bio;
+
+	trace_erofs_readpage(page, true);
+
+	bio = erofs_read_raw_page(NULL, page->mapping,
+				  page, &last_block, 1, false);
+
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	DBG_BUGON(bio);	/* since we have only one bio -- must be NULL */
+	return 0;
+}
+
+static int erofs_raw_access_readpages(struct file *filp,
+				      struct address_space *mapping,
+				      struct list_head *pages,
+				      unsigned int nr_pages)
+{
+	erofs_off_t last_block;
+	struct bio *bio = NULL;
+	gfp_t gfp = readahead_gfp_mask(mapping);
+	struct page *page = list_last_entry(pages, struct page, lru);
+
+	trace_erofs_readpages(mapping->host, page, nr_pages, true);
+
+	for (; nr_pages; --nr_pages) {
+		page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			bio = erofs_read_raw_page(bio, mapping, page,
+						  &last_block, nr_pages, true);
+
+			/* all page errors are ignored during readahead */
+			if (IS_ERR(bio)) {
+				pr_err("%s, readahead error at page %lu of nid %llu\n",
+				       __func__, page->index,
+				       EROFS_I(mapping->host)->nid);
+
+				bio = NULL;
+			}
+		}
+
+		/* pages could still be locked */
+		put_page(page);
+	}
+	DBG_BUGON(!list_empty(pages));
+
+	/* the rare case (ending in gaps) */
+	if (bio)
+		submit_bio(bio);
+	return 0;
+}
+
+static int erofs_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create)
+{
+	struct erofs_map_blocks map = {
+		.m_la = iblock << 9,
+	};
+	int err;
+
+	err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+	if (err)
+		return err;
+
+	if (map.m_flags & EROFS_MAP_MAPPED)
+		bh->b_blocknr = erofs_blknr(map.m_pa);
+
+	return err;
+}
+
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+
+	if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
+		erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+
+		if (block >> LOG_SECTORS_PER_BLOCK >= blks)
+			return 0;
+	}
+
+	return generic_block_bmap(mapping, block, erofs_get_block);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+	.readpage = erofs_raw_access_readpage,
+	.readpages = erofs_raw_access_readpages,
+	.bmap = erofs_bmap,
+};
+
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 0000000..19f89f9
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "compress.h"
+#include <linux/module.h>
+#include <linux/lz4.h>
+
+#ifndef LZ4_DISTANCE_MAX	/* history window size */
+#define LZ4_DISTANCE_MAX 65535	/* set to maximum value by default */
+#endif
+
+#define LZ4_MAX_DISTANCE_PAGES	(DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
+#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize)  (((srcsize) >> 8) + 32)
+#endif
+
+struct z_erofs_decompressor {
+	/*
+	 * If destpages contains holes, fill them with bounce pages.
+	 * Also check whether destpages span contiguous physical memory.
+	 */
+	int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
+				 struct list_head *pagepool);
+	int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
+	char *name;
+};
+
+static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+					 struct list_head *pagepool)
+{
+	const unsigned int nr =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
+	unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
+					   BITS_PER_LONG)] = { 0 };
+	void *kaddr = NULL;
+	unsigned int i, j, top;
+
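+	/*
+	 * walk the output slots with a circular index 'j' over the last
+	 * LZ4_MAX_DISTANCE_PAGES entries: once a bounce page falls out of
+	 * this window, LZ4 back-references can no longer reach it, so it
+	 * can be recycled from 'availables' to fill later holes.
+	 */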
+	top = 0;
+	for (i = j = 0; i < nr; ++i, ++j) {
+		struct page *const page = rq->out[i];
+		struct page *victim;
+
+		if (j >= LZ4_MAX_DISTANCE_PAGES)
+			j = 0;
+
+		/* 'valid' bounced can only be tested after a complete round */
+		if (test_bit(j, bounced)) {
+			DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
+			DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
+			availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+		}
+
+		if (page) {
+			__clear_bit(j, bounced);
+			if (kaddr) {
+				if (kaddr + PAGE_SIZE == page_address(page))
+					kaddr += PAGE_SIZE;
+				else
+					kaddr = NULL;
+			} else if (!i) {
+				kaddr = page_address(page);
+			}
+			continue;
+		}
+		kaddr = NULL;
+		__set_bit(j, bounced);
+
+		if (top) {
+			victim = availables[--top];
+			get_page(victim);
+		} else {
+			victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+			if (!victim)
+				return -ENOMEM;
+			victim->mapping = Z_EROFS_MAPPING_STAGING;
+		}
+		rq->out[i] = victim;
+	}
+	return kaddr ? 1 : 0;
+}
+
+static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
+				       u8 *src, unsigned int pageofs_in)
+{
+	/*
+	 * if in-place decompression is ongoing, the compressed data
+	 * should be copied to a temporary buffer to avoid being
+	 * overwritten by the decompressed output.
+	 */
+	struct page **in = rq->in;
+	u8 *const tmp = erofs_get_pcpubuf(0);
+	u8 *tmpp = tmp;
+	unsigned int inlen = rq->inputsize - pageofs_in;
+	unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
+
+	while (tmpp < tmp + inlen) {
+		if (!src)
+			src = kmap_atomic(*in);
+		memcpy(tmpp, src + pageofs_in, count);
+		kunmap_atomic(src);
+		src = NULL;
+		tmpp += count;
+		pageofs_in = 0;
+		count = PAGE_SIZE;
+		++in;
+	}
+	return tmp;
+}
+
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+{
+	unsigned int inputmargin, inlen;
+	u8 *src;
+	bool copied, support_0padding;
+	int ret;
+
+	if (rq->inputsize > PAGE_SIZE)
+		return -EOPNOTSUPP;
+
+	src = kmap_atomic(*rq->in);
+	inputmargin = 0;
+	support_0padding = false;
+
+	/* in-place decompression is only safe when 0padding is enabled */
+	if (EROFS_SB(rq->sb)->feature_incompat &
+	    EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
+		support_0padding = true;
+
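+		/*
+		 * with 0padding, compressed data is right-aligned within
+		 * the block, so leading zero bytes mark the unused head
+		 * margin and can safely be skipped.
+		 */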
+		while (!src[inputmargin & ~PAGE_MASK])
+			if (!(++inputmargin & ~PAGE_MASK))
+				break;
+
+		if (inputmargin >= rq->inputsize) {
+			kunmap_atomic(src);
+			return -EIO;
+		}
+	}
+
+	copied = false;
+	inlen = rq->inputsize - inputmargin;
+	if (rq->inplace_io) {
+		const uint oend = (rq->pageofs_out +
+				   rq->outputsize) & ~PAGE_MASK;
+		const uint nr = PAGE_ALIGN(rq->pageofs_out +
+					   rq->outputsize) >> PAGE_SHIFT;
+
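+		/*
+		 * decompressing in place is unsafe if decoding is partial,
+		 * 0padding is unavailable, the last output page is not the
+		 * input page itself, or the tail margin is smaller than the
+		 * recommended LZ4 in-place margin; copy the compressed data
+		 * out to a temporary buffer first in those cases.
+		 */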
+		if (rq->partial_decoding || !support_0padding ||
+		    rq->out[nr - 1] != rq->in[0] ||
+		    rq->inputsize - oend <
+		      LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
+			src = generic_copy_inplace_data(rq, src, inputmargin);
+			inputmargin = 0;
+			copied = true;
+		}
+	}
+
+	ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+					  inlen, rq->outputsize,
+					  rq->outputsize);
+	if (ret < 0) {
+		erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]",
+			  inlen, inputmargin, rq->outputsize);
+		WARN_ON(1);
+		print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
+			       16, 1, src + inputmargin, inlen, true);
+		print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
+			       16, 1, out, rq->outputsize, true);
+		ret = -EIO;
+	}
+
+	if (copied)
+		erofs_put_pcpubuf(src);
+	else
+		kunmap_atomic(src);
+	return ret;
+}
+
+static struct z_erofs_decompressor decompressors[] = {
+	[Z_EROFS_COMPRESSION_SHIFTED] = {
+		.name = "shifted"
+	},
+	[Z_EROFS_COMPRESSION_LZ4] = {
+		.prepare_destpages = z_erofs_lz4_prepare_destpages,
+		.decompress = z_erofs_lz4_decompress,
+		.name = "lz4"
+	},
+};
+
+static void copy_from_pcpubuf(struct page **out, const char *dst,
+			      unsigned short pageofs_out,
+			      unsigned int outputsize)
+{
+	const char *end = dst + outputsize;
+	const unsigned int righthalf = PAGE_SIZE - pageofs_out;
+	const char *cur = dst - pageofs_out;
+
+	while (cur < end) {
+		struct page *const page = *out++;
+
+		if (page) {
+			char *buf = kmap_atomic(page);
+
+			if (cur >= dst) {
+				memcpy(buf, cur, min_t(uint, PAGE_SIZE,
+						       end - cur));
+			} else {
+				memcpy(buf + pageofs_out, cur + pageofs_out,
+				       min_t(uint, righthalf, end - cur));
+			}
+			kunmap_atomic(buf);
+		}
+		cur += PAGE_SIZE;
+	}
+}
+
+static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+				      struct list_head *pagepool)
+{
+	const unsigned int nrpages_out =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+	unsigned int dst_maptype;
+	void *dst;
+	int ret, i;
+
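+	/*
+	 * dst_maptype: 0 - single page mapped with kmap_atomic,
+	 *              1 - physically contiguous range (page_address),
+	 *              2 - virtually contiguous range via vm_map_ram.
+	 */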
+	if (nrpages_out == 1 && !rq->inplace_io) {
+		DBG_BUGON(!*rq->out);
+		dst = kmap_atomic(*rq->out);
+		dst_maptype = 0;
+		goto dstmap_out;
+	}
+
+	/*
+	 * For small output sizes (especially much smaller than
+	 * PAGE_SIZE), it is preferable to memcpy the decompressed
+	 * data rather than the compressed data.
+	 */
+	if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+		dst = erofs_get_pcpubuf(0);
+		if (IS_ERR(dst))
+			return PTR_ERR(dst);
+
+		rq->inplace_io = false;
+		ret = alg->decompress(rq, dst);
+		if (!ret)
+			copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+					  rq->outputsize);
+
+		erofs_put_pcpubuf(dst);
+		return ret;
+	}
+
+	ret = alg->prepare_destpages(rq, pagepool);
+	if (ret < 0) {
+		return ret;
+	} else if (ret) {
+		dst = page_address(*rq->out);
+		dst_maptype = 1;
+		goto dstmap_out;
+	}
+
+	i = 0;
+	while (1) {
+		dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+
+		/* retry two more times (3 times in total) */
+		if (dst || ++i >= 3)
+			break;
+		vm_unmap_aliases();
+	}
+
+	if (!dst)
+		return -ENOMEM;
+
+	dst_maptype = 2;
+
+dstmap_out:
+	ret = alg->decompress(rq, dst + rq->pageofs_out);
+
+	if (!dst_maptype)
+		kunmap_atomic(dst);
+	else if (dst_maptype == 2)
+		vm_unmap_ram(dst, nrpages_out);
+	return ret;
+}
+
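+/*
+ * the "shifted" transform handles uncompressed data stored at a byte
+ * offset: it only moves bytes into place across at most two pages.
+ */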
+static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
+				     struct list_head *pagepool)
+{
+	const unsigned int nrpages_out =
+		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+	const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+	unsigned char *src, *dst;
+
+	if (nrpages_out > 2) {
+		DBG_BUGON(1);
+		return -EIO;
+	}
+
+	if (rq->out[0] == *rq->in) {
+		DBG_BUGON(nrpages_out != 1);
+		return 0;
+	}
+
+	src = kmap_atomic(*rq->in);
+	if (!rq->out[0]) {
+		dst = NULL;
+	} else {
+		dst = kmap_atomic(rq->out[0]);
+		memcpy(dst + rq->pageofs_out, src, righthalf);
+	}
+
+	if (rq->out[1] == *rq->in) {
+		memmove(src, src + righthalf, rq->pageofs_out);
+	} else if (nrpages_out == 2) {
+		if (dst)
+			kunmap_atomic(dst);
+		DBG_BUGON(!rq->out[1]);
+		dst = kmap_atomic(rq->out[1]);
+		memcpy(dst, src + righthalf, rq->pageofs_out);
+	}
+	if (dst)
+		kunmap_atomic(dst);
+	kunmap_atomic(src);
+	return 0;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+		       struct list_head *pagepool)
+{
+	if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
+		return z_erofs_shifted_transform(rq, pagepool);
+	return z_erofs_decompress_generic(rq, pagepool);
+}
+
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 0000000..d28c623
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+
+static void debug_one_dentry(unsigned char d_type, const char *de_name,
+			     unsigned int de_namelen)
+{
+#ifdef CONFIG_EROFS_FS_DEBUG
+	/* the on-disk name may lack a trailing '\0' */
+	unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
+
+	memcpy(dbg_namebuf, de_name, de_namelen);
+	dbg_namebuf[de_namelen] = '\0';
+
+	erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf,
+		  de_namelen, d_type);
+#endif
+}
+
+static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
+			       void *dentry_blk, unsigned int *ofs,
+			       unsigned int nameoff, unsigned int maxsize)
+{
+	struct erofs_dirent *de = dentry_blk + *ofs;
+	const struct erofs_dirent *end = dentry_blk + nameoff;
+
+	while (de < end) {
+		const char *de_name;
+		unsigned int de_namelen;
+		unsigned char d_type;
+
+		d_type = fs_ftype_to_dtype(de->file_type);
+
+		nameoff = le16_to_cpu(de->nameoff);
+		de_name = (char *)dentry_blk + nameoff;
+
+		/* the last dirent in the block? */
+		if (de + 1 >= end)
+			de_namelen = strnlen(de_name, maxsize - nameoff);
+		else
+			de_namelen = le16_to_cpu(de[1].nameoff) - nameoff;
+
+		/* a corrupted entry is found */
+		if (nameoff + de_namelen > maxsize ||
+		    de_namelen > EROFS_NAME_LEN) {
+			erofs_err(dir->i_sb, "bogus dirent @ nid %llu",
+				  EROFS_I(dir)->nid);
+			DBG_BUGON(1);
+			return -EFSCORRUPTED;
+		}
+
+		debug_one_dentry(d_type, de_name, de_namelen);
+		if (!dir_emit(ctx, de_name, de_namelen,
+			      le64_to_cpu(de->nid), d_type))
+			/* stopped for some reason */
+			return 1;
+		++de;
+		*ofs += sizeof(struct erofs_dirent);
+	}
+	*ofs = maxsize;
+	return 0;
+}
+
+static int erofs_readdir(struct file *f, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(f);
+	struct address_space *mapping = dir->i_mapping;
+	const size_t dirsize = i_size_read(dir);
+	unsigned int i = ctx->pos / EROFS_BLKSIZ;
+	unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
+	int err = 0;
+	bool initial = true;
+
+	while (ctx->pos < dirsize) {
+		struct page *dentry_page;
+		struct erofs_dirent *de;
+		unsigned int nameoff, maxsize;
+
+		dentry_page = read_mapping_page(mapping, i, NULL);
+		if (dentry_page == ERR_PTR(-ENOMEM)) {
+			err = -ENOMEM;
+			break;
+		} else if (IS_ERR(dentry_page)) {
+			erofs_err(dir->i_sb,
+				  "failed to readdir logical block %u of nid %llu",
+				  i, EROFS_I(dir)->nid);
+			err = -EFSCORRUPTED;
+			break;
+		}
+
+		de = (struct erofs_dirent *)kmap(dentry_page);
+
+		nameoff = le16_to_cpu(de->nameoff);
+
+		if (nameoff < sizeof(struct erofs_dirent) ||
+		    nameoff >= PAGE_SIZE) {
+			erofs_err(dir->i_sb,
+				  "invalid de[0].nameoff %u @ nid %llu",
+				  nameoff, EROFS_I(dir)->nid);
+			err = -EFSCORRUPTED;
+			goto skip_this;
+		}
+
+		maxsize = min_t(unsigned int,
+				dirsize - ctx->pos + ofs, PAGE_SIZE);
+
+		/* search dirents starting at an arbitrary position */
+		if (initial) {
+			initial = false;
+
+			ofs = roundup(ofs, sizeof(struct erofs_dirent));
+			if (ofs >= nameoff)
+				goto skip_this;
+		}
+
+		err = erofs_fill_dentries(dir, ctx, de, &ofs,
+					  nameoff, maxsize);
+skip_this:
+		kunmap(dentry_page);
+
+		put_page(dentry_page);
+
+		ctx->pos = blknr_to_addr(i) + ofs;
+
+		if (err)
+			break;
+		++i;
+		ofs = 0;
+	}
+	return err < 0 ? err : 0;
+}
+
+const struct file_operations erofs_dir_fops = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= erofs_readdir,
+};
+
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
new file mode 100644
index 0000000..b1ee565
--- /dev/null
+++ b/fs/erofs/erofs_fs.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
+/*
+ * EROFS (Enhanced ROM File System) on-disk format definition
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_H
+#define __EROFS_FS_H
+
+#define EROFS_SUPER_OFFSET      1024
+
+/*
+ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
+ * be incompatible with this kernel version.
+ */
+#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING	0x00000001
+#define EROFS_ALL_FEATURE_INCOMPAT		EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+
+/* 128-byte erofs on-disk super block */
+struct erofs_super_block {
+	__le32 magic;           /* file system magic number */
+	__le32 checksum;        /* crc32c(super_block) */
+	__le32 feature_compat;
+	__u8 blkszbits;         /* support block_size == PAGE_SIZE only */
+	__u8 reserved;
+
+	__le16 root_nid;	/* nid of root directory */
+	__le64 inos;            /* total valid ino # (== f_files - f_favail) */
+
+	__le64 build_time;      /* inode v1 time derivation */
+	__le32 build_time_nsec;	/* inode v1 time derivation in nano scale */
+	__le32 blocks;          /* used for statfs */
+	__le32 meta_blkaddr;	/* start block address of metadata area */
+	__le32 xattr_blkaddr;	/* start block address of shared xattr area */
+	__u8 uuid[16];          /* 128-bit uuid for volume */
+	__u8 volume_name[16];   /* volume name */
+	__le32 feature_incompat;
+
+	__u8 reserved2[44];
+};
+
+/*
+ * erofs inode datalayout (i_format in on-disk inode):
+ * 0 - inode plain without inline data A:
+ * inode, [xattrs], ... | ... | no-holed data
+ * 1 - inode VLE compression B (legacy):
+ * inode, [xattrs], extents ... | ...
+ * 2 - inode plain with inline data C:
+ * inode, [xattrs], last_inline_data, ... | ... | no-holed data
+ * 3 - inode compression D:
+ * inode, [xattrs], map_header, extents ... | ...
+ * 4~7 - reserved
+ */
+enum {
+	EROFS_INODE_FLAT_PLAIN			= 0,
+	EROFS_INODE_FLAT_COMPRESSION_LEGACY	= 1,
+	EROFS_INODE_FLAT_INLINE			= 2,
+	EROFS_INODE_FLAT_COMPRESSION		= 3,
+	EROFS_INODE_DATALAYOUT_MAX
+};
+
+static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
+{
+	return datamode == EROFS_INODE_FLAT_COMPRESSION ||
+		datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
+}
+
+/* bit definitions of inode i_advise */
+#define EROFS_I_VERSION_BITS            1
+#define EROFS_I_DATALAYOUT_BITS         3
+
+#define EROFS_I_VERSION_BIT             0
+#define EROFS_I_DATALAYOUT_BIT          1
+
+/* 32-byte reduced form of an ondisk inode */
+struct erofs_inode_compact {
+	__le16 i_format;	/* inode format hints */
+
+/* 1 header + (n-1) * 4 bytes of inline xattrs to keep continuity */
+	__le16 i_xattr_icount;
+	__le16 i_mode;
+	__le16 i_nlink;
+	__le32 i_size;
+	__le32 i_reserved;
+	union {
+		/* file total compressed blocks for data mapping 1 */
+		__le32 compressed_blocks;
+		__le32 raw_blkaddr;
+
+		/* for device files, used to indicate old/new device # */
+		__le32 rdev;
+	} i_u;
+	__le32 i_ino;           /* only used for 32-bit stat compatibility */
+	__le16 i_uid;
+	__le16 i_gid;
+	__le32 i_reserved2;
+};
+
+/* 32 bytes on-disk inode */
+#define EROFS_INODE_LAYOUT_COMPACT	0
+/* 64 bytes on-disk inode */
+#define EROFS_INODE_LAYOUT_EXTENDED	1
+
+/* 64-byte complete form of an ondisk inode */
+struct erofs_inode_extended {
+	__le16 i_format;	/* inode format hints */
+
+/* 1 header + (n-1) * 4 bytes of inline xattrs to keep continuity */
+	__le16 i_xattr_icount;
+	__le16 i_mode;
+	__le16 i_reserved;
+	__le64 i_size;
+	union {
+		/* file total compressed blocks for data mapping 1 */
+		__le32 compressed_blocks;
+		__le32 raw_blkaddr;
+
+		/* for device files, used to indicate old/new device # */
+		__le32 rdev;
+	} i_u;
+
+	/* only used for 32-bit stat compatibility */
+	__le32 i_ino;
+
+	__le32 i_uid;
+	__le32 i_gid;
+	__le64 i_ctime;
+	__le32 i_ctime_nsec;
+	__le32 i_nlink;
+	__u8   i_reserved2[16];
+};
+
+#define EROFS_MAX_SHARED_XATTRS         (128)
+/* h_shared_count values between 129 and 255 are special */
+#define EROFS_SHARED_XATTR_EXTENT       (255)
+
+/*
+ * inline xattrs (n == i_xattr_icount):
+ * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
+ *          12 bytes           /                   \
+ *                            /                     \
+ *                           /-----------------------\
+ *                           |  erofs_xattr_entries+ |
+ *                           +-----------------------+
+ * inline xattrs must start with erofs_xattr_ibody_header,
+ * for read-only fs, no need to introduce h_refcount
+ */
+struct erofs_xattr_ibody_header {
+	__le32 h_reserved;
+	__u8   h_shared_count;
+	__u8   h_reserved2[7];
+	__le32 h_shared_xattrs[0];      /* shared xattr id array */
+};
+
+/* Name indexes */
+#define EROFS_XATTR_INDEX_USER              1
+#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS  2
+#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define EROFS_XATTR_INDEX_TRUSTED           4
+#define EROFS_XATTR_INDEX_LUSTRE            5
+#define EROFS_XATTR_INDEX_SECURITY          6
+
+/* xattr entry (for both inline & shared xattrs) */
+struct erofs_xattr_entry {
+	__u8   e_name_len;      /* length of name */
+	__u8   e_name_index;    /* attribute name index */
+	__le16 e_value_size;    /* size of attribute value */
+	/* followed by e_name and e_value */
+	char   e_name[0];       /* attribute name */
+};
+
+static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
+{
+	if (!i_xattr_icount)
+		return 0;
+
+	return sizeof(struct erofs_xattr_ibody_header) +
+		sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
+}
+
+#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry))
+
+static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
+{
+	return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) +
+				 e->e_name_len + le16_to_cpu(e->e_value_size));
+}
+
+/* available compression algorithm types (for h_algorithmtype) */
+enum {
+	Z_EROFS_COMPRESSION_LZ4	= 0,
+	Z_EROFS_COMPRESSION_MAX
+};
+
+/*
+ * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
+ *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
+ *                                  (4B) + 2B + (4B) if compacted 2B is on.
+ */
+#define Z_EROFS_ADVISE_COMPACTED_2B_BIT         0
+
+#define Z_EROFS_ADVISE_COMPACTED_2B     (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+
+struct z_erofs_map_header {
+	__le32	h_reserved1;
+	__le16	h_advise;
+	/*
+	 * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
+	 * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
+	 */
+	__u8	h_algorithmtype;
+	/*
+	 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
+	 * bit 3-4 : (physical - logical) cluster bits of head 1:
+	 *       For example, if logical clustersize = 4096, 1 for 8192.
+	 * bit 5-7 : (physical - logical) cluster bits of head 2.
+	 */
+	__u8	h_clusterbits;
+};
+
+#define Z_EROFS_VLE_LEGACY_HEADER_PADDING       8
+
+/*
+ * Fixed-size output compression ondisk Logical Extent cluster type:
+ *    0 - literal (uncompressed) cluster
+ *    1 - compressed cluster (for the head logical cluster)
+ *    2 - compressed cluster (for the other logical clusters)
+ *
+ * In detail,
+ *    0 - literal (uncompressed) cluster,
+ *        di_advise = 0
+ *        di_clusterofs = the literal data offset of the cluster
+ *        di_blkaddr = the blkaddr of the literal cluster
+ *
+ *    1 - compressed cluster (for the head logical cluster)
+ *        di_advise = 1
+ *        di_clusterofs = the decompressed data offset of the cluster
+ *        di_blkaddr = the blkaddr of the compressed cluster
+ *
+ *    2 - compressed cluster (for the other logical clusters)
+ *        di_advise = 2
+ *        di_clusterofs =
+ *           the decompressed data offset in its own head cluster
+ *        di_u.delta[0] = distance to its corresponding head cluster
+ *        di_u.delta[1] = distance to its corresponding tail cluster
+ *                (di_advise could be 0, 1 or 2)
+ */
+enum {
+	Z_EROFS_VLE_CLUSTER_TYPE_PLAIN		= 0,
+	Z_EROFS_VLE_CLUSTER_TYPE_HEAD		= 1,
+	Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD	= 2,
+	Z_EROFS_VLE_CLUSTER_TYPE_RESERVED	= 3,
+	Z_EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
+#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0
+
+struct z_erofs_vle_decompressed_index {
+	__le16 di_advise;
+	/* where to decompress in the head cluster */
+	__le16 di_clusterofs;
+
+	union {
+		/* for the head cluster */
+		__le32 blkaddr;
+		/*
+		 * for the remaining clusters
+		 * (e.g. for a 4k page-sized cluster, maximum 4K*64K = 256M)
+		 * [0] - pointing to the head cluster
+		 * [1] - pointing to the tail cluster
+		 */
+		__le16 delta[2];
+	} di_u;
+};
+
+#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \
+	(round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \
+	 sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING)
+
+/* dirents are sorted in alphabetical order, so binary search can be used */
+struct erofs_dirent {
+	__le64 nid;     /* node number */
+	__le16 nameoff; /* start offset of file name */
+	__u8 file_type; /* file type */
+	__u8 reserved;  /* reserved */
+} __packed;
+
+/*
+ * EROFS file types should match generic FT_* types;
+ * there seems to be no need to add BUILD_BUG_ONs since
+ * a potential mismatch would break other fses as well...
+ */
+
+#define EROFS_NAME_LEN      255
+
+/* check the EROFS on-disk layout strictly at compile time */
+static inline void erofs_check_ondisk_layout_definitions(void)
+{
+	BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+	BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
+	BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
+	BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
+	BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
+	BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
+	BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
+	BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
+
+	BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
+		     Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+}
+
+#endif
+
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
new file mode 100644
index 0000000..3350ab6
--- /dev/null
+++ b/fs/erofs/inode.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "xattr.h"
+
+#include <trace/events/erofs.h>
+
+/* no locking */
+static int erofs_read_inode(struct inode *inode, void *data)
+{
+	struct erofs_inode *vi = EROFS_I(inode);
+	struct erofs_inode_compact *dic = data;
+	struct erofs_inode_extended *die;
+
+	const unsigned int ifmt = le16_to_cpu(dic->i_format);
+	struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+	erofs_blk_t nblks = 0;
+
+	vi->datalayout = erofs_inode_datalayout(ifmt);
+
+	if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
+		erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+			  vi->datalayout, vi->nid);
+		DBG_BUGON(1);
+		return -EOPNOTSUPP;
+	}
+
+	switch (erofs_inode_version(ifmt)) {
+	case EROFS_INODE_LAYOUT_EXTENDED:
+		die = data;
+
+		vi->inode_isize = sizeof(struct erofs_inode_extended);
+		vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
+
+		inode->i_mode = le16_to_cpu(die->i_mode);
+		switch (inode->i_mode & S_IFMT) {
+		case S_IFREG:
+		case S_IFDIR:
+		case S_IFLNK:
+			vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
+			break;
+		case S_IFCHR:
+		case S_IFBLK:
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(die->i_u.rdev));
+			break;
+		case S_IFIFO:
+		case S_IFSOCK:
+			inode->i_rdev = 0;
+			break;
+		default:
+			goto bogusimode;
+		}
+		i_uid_write(inode, le32_to_cpu(die->i_uid));
+		i_gid_write(inode, le32_to_cpu(die->i_gid));
+		set_nlink(inode, le32_to_cpu(die->i_nlink));
+
+		/* ns timestamp */
+		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
+			le64_to_cpu(die->i_ctime);
+		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
+			le32_to_cpu(die->i_ctime_nsec);
+
+		inode->i_size = le64_to_cpu(die->i_size);
+
+		/* total blocks for compressed files */
+		if (erofs_inode_is_data_compressed(vi->datalayout))
+			nblks = le32_to_cpu(die->i_u.compressed_blocks);
+		break;
+	case EROFS_INODE_LAYOUT_COMPACT:
+		vi->inode_isize = sizeof(struct erofs_inode_compact);
+		vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
+
+		inode->i_mode = le16_to_cpu(dic->i_mode);
+		switch (inode->i_mode & S_IFMT) {
+		case S_IFREG:
+		case S_IFDIR:
+		case S_IFLNK:
+			vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
+			break;
+		case S_IFCHR:
+		case S_IFBLK:
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(dic->i_u.rdev));
+			break;
+		case S_IFIFO:
+		case S_IFSOCK:
+			inode->i_rdev = 0;
+			break;
+		default:
+			goto bogusimode;
+		}
+		i_uid_write(inode, le16_to_cpu(dic->i_uid));
+		i_gid_write(inode, le16_to_cpu(dic->i_gid));
+		set_nlink(inode, le16_to_cpu(dic->i_nlink));
+
+		/* use build time to derive all file times */
+		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
+			sbi->build_time;
+		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
+			sbi->build_time_nsec;
+
+		inode->i_size = le32_to_cpu(dic->i_size);
+		if (erofs_inode_is_data_compressed(vi->datalayout))
+			nblks = le32_to_cpu(dic->i_u.compressed_blocks);
+		break;
+	default:
+		erofs_err(inode->i_sb,
+			  "unsupported on-disk inode version %u of nid %llu",
+			  erofs_inode_version(ifmt), vi->nid);
+		DBG_BUGON(1);
+		return -EOPNOTSUPP;
+	}
+
+	if (!nblks)
+		/* measure inode.i_blocks as generic filesystems do */
+		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
+	else
+		inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
+	return 0;
+
+bogusimode:
+	erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
+		  inode->i_mode, vi->nid);
+	DBG_BUGON(1);
+	return -EFSCORRUPTED;
+}
+
+static int erofs_fill_symlink(struct inode *inode, void *data,
+			      unsigned int m_pofs)
+{
+	struct erofs_inode *vi = EROFS_I(inode);
+	char *lnk;
+
+	/* if it cannot be handled with the fast symlink scheme */
+	if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+	    inode->i_size >= PAGE_SIZE) {
+		inode->i_op = &erofs_symlink_iops;
+		return 0;
+	}
+
+	lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
+	if (!lnk)
+		return -ENOMEM;
+
+	m_pofs += vi->inode_isize + vi->xattr_isize;
+	/* inline symlink data shouldn't cross a page boundary either */
+	if (m_pofs + inode->i_size > PAGE_SIZE) {
+		kfree(lnk);
+		erofs_err(inode->i_sb,
+			  "inline data cross block boundary @ nid %llu",
+			  vi->nid);
+		DBG_BUGON(1);
+		return -EFSCORRUPTED;
+	}
+
+	memcpy(lnk, data + m_pofs, inode->i_size);
+	lnk[inode->i_size] = '\0';
+
+	inode->i_link = lnk;
+	inode->i_op = &erofs_fast_symlink_iops;
+	return 0;
+}
+
+static int erofs_fill_inode(struct inode *inode, int isdir)
+{
+	struct super_block *sb = inode->i_sb;
+	struct erofs_inode *vi = EROFS_I(inode);
+	struct page *page;
+	void *data;
+	int err;
+	erofs_blk_t blkaddr;
+	unsigned int ofs;
+	erofs_off_t inode_loc;
+
+	trace_erofs_fill_inode(inode, isdir);
+	inode_loc = iloc(EROFS_SB(sb), vi->nid);
+	blkaddr = erofs_blknr(inode_loc);
+	ofs = erofs_blkoff(inode_loc);
+
+	erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
+		  __func__, vi->nid, ofs, blkaddr);
+
+	page = erofs_get_meta_page(sb, blkaddr);
+
+	if (IS_ERR(page)) {
+		erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
+			  vi->nid, PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	DBG_BUGON(!PageUptodate(page));
+	data = page_address(page);
+
+	err = erofs_read_inode(inode, data + ofs);
+	if (err)
+		goto out_unlock;
+
+	/* setup the new inode */
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &erofs_generic_iops;
+		inode->i_fop = &generic_ro_fops;
+		break;
+	case S_IFDIR:
+		inode->i_op = &erofs_dir_iops;
+		inode->i_fop = &erofs_dir_fops;
+		break;
+	case S_IFLNK:
+		err = erofs_fill_symlink(inode, data, ofs);
+		if (err)
+			goto out_unlock;
+		inode_nohighmem(inode);
+		break;
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		inode->i_op = &erofs_generic_iops;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		goto out_unlock;
+	default:
+		err = -EFSCORRUPTED;
+		goto out_unlock;
+	}
+
+	if (erofs_inode_is_data_compressed(vi->datalayout)) {
+		err = z_erofs_fill_inode(inode);
+		goto out_unlock;
+	}
+	inode->i_mapping->a_ops = &erofs_raw_access_aops;
+
+out_unlock:
+	unlock_page(page);
+	put_page(page);
+	return err;
+}
+
+/*
+ * erofs nid is 64 bits, but i_ino is 'unsigned long'; therefore
+ * extra work is needed on 32-bit platforms to find the right inode.
+ */
+static int erofs_ilookup_test_actor(struct inode *inode, void *opaque)
+{
+	const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+
+	return EROFS_I(inode)->nid == nid;
+}
+
+static int erofs_iget_set_actor(struct inode *inode, void *opaque)
+{
+	const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+
+	inode->i_ino = erofs_inode_hash(nid);
+	return 0;
+}
+
+static inline struct inode *erofs_iget_locked(struct super_block *sb,
+					      erofs_nid_t nid)
+{
+	const unsigned long hashval = erofs_inode_hash(nid);
+
+	return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+		erofs_iget_set_actor, &nid);
+}
+
+struct inode *erofs_iget(struct super_block *sb,
+			 erofs_nid_t nid,
+			 bool isdir)
+{
+	struct inode *inode = erofs_iget_locked(sb, nid);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		int err;
+		struct erofs_inode *vi = EROFS_I(inode);
+
+		vi->nid = nid;
+
+		err = erofs_fill_inode(inode, isdir);
+		if (!err)
+			unlock_new_inode(inode);
+		else {
+			iget_failed(inode);
+			inode = ERR_PTR(err);
+		}
+	}
+	return inode;
+}
+
+int erofs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int query_flags)
+{
+	struct inode *const inode = d_inode(path->dentry);
+
+	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+		stat->attributes |= STATX_ATTR_COMPRESSED;
+
+	stat->attributes |= STATX_ATTR_IMMUTABLE;
+	stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+				  STATX_ATTR_IMMUTABLE);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+const struct inode_operations erofs_generic_iops = {
+	.getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+	.listxattr = erofs_listxattr,
+#endif
+	.get_acl = erofs_get_acl,
+};
+
+const struct inode_operations erofs_symlink_iops = {
+	.get_link = page_get_link,
+	.getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+	.listxattr = erofs_listxattr,
+#endif
+	.get_acl = erofs_get_acl,
+};
+
+const struct inode_operations erofs_fast_symlink_iops = {
+	.get_link = simple_get_link,
+	.getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+	.listxattr = erofs_listxattr,
+#endif
+	.get_acl = erofs_get_acl,
+};
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
new file mode 100644
index 0000000..544a453
--- /dev/null
+++ b/fs/erofs/internal.h
@@ -0,0 +1,431 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_INTERNAL_H
+#define __EROFS_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "erofs_fs.h"
+
+/* redefine pr_fmt "erofs: " */
+#undef pr_fmt
+#define pr_fmt(fmt) "erofs: " fmt
+
+__printf(3, 4) void _erofs_err(struct super_block *sb,
+			       const char *function, const char *fmt, ...);
+#define erofs_err(sb, fmt, ...)	\
+	_erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
+__printf(3, 4) void _erofs_info(struct super_block *sb,
+			       const char *function, const char *fmt, ...);
+#define erofs_info(sb, fmt, ...) \
+	_erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+#ifdef CONFIG_EROFS_FS_DEBUG
+#define erofs_dbg(x, ...)       pr_debug(x "\n", ##__VA_ARGS__)
+#define DBG_BUGON               BUG_ON
+#else
+#define erofs_dbg(x, ...)       ((void)0)
+#define DBG_BUGON(x)            ((void)(x))
+#endif	/* !CONFIG_EROFS_FS_DEBUG */
+
+/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
+#define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
+
+typedef u64 erofs_nid_t;
+typedef u64 erofs_off_t;
+/* data type for filesystem-wide blocks number */
+typedef u32 erofs_blk_t;
+
+struct erofs_sb_info {
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+	struct mutex umount_mutex;
+
+	/* the dedicated workstation for compression */
+	struct radix_tree_root workstn_tree;
+
+	/* threshold for synchronous decompression */
+	unsigned int max_sync_decompress_pages;
+
+	unsigned int shrinker_run_no;
+
+	/* current strategy of how to use managed cache */
+	unsigned char cache_strategy;
+
+	/* pseudo inode to manage cached pages */
+	struct inode *managed_cache;
+#endif	/* CONFIG_EROFS_FS_ZIP */
+	u32 blocks;
+	u32 meta_blkaddr;
+#ifdef CONFIG_EROFS_FS_XATTR
+	u32 xattr_blkaddr;
+#endif
+
+	/* inode slot unit size in bit shift */
+	unsigned char islotbits;
+
+	u32 build_time_nsec;
+	u64 build_time;
+
+	/* what we really care about is nid, rather than ino.. */
+	erofs_nid_t root_nid;
+	/* used for statfs, f_files - f_favail */
+	u64 inos;
+
+	u8 uuid[16];                    /* 128-bit uuid for volume */
+	u8 volume_name[16];             /* volume name */
+	u32 feature_incompat;
+
+	unsigned int mount_opt;
+};
+
+#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
+#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
+
+/* Mount flags set via mount options or defaults */
+#define EROFS_MOUNT_XATTR_USER		0x00000010
+#define EROFS_MOUNT_POSIX_ACL		0x00000020
+
+#define clear_opt(sbi, option)	((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+enum {
+	EROFS_ZIP_CACHE_DISABLED,
+	EROFS_ZIP_CACHE_READAHEAD,
+	EROFS_ZIP_CACHE_READAROUND
+};
+
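+/* refcount value used to mark a workgroup as frozen (bit-locked) */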
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+#if defined(CONFIG_SMP)
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+						 int val)
+{
+	preempt_disable();
+	if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
+		preempt_enable();
+		return false;
+	}
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+					    int orig_val)
+{
+	/*
+	 * other observers should notice all modifications
+	 * in the freezing period.
+	 */
+	smp_mb();
+	atomic_set(&grp->refcount, orig_val);
+	preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+	return atomic_cond_read_relaxed(&grp->refcount,
+					VAL != EROFS_LOCKED_MAGIC);
+}
+#else
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+						 int val)
+{
+	preempt_disable();
+	/* no need to spin on UP platforms, let's just disable preemption. */
+	if (val != atomic_read(&grp->refcount)) {
+		preempt_enable();
+		return false;
+	}
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+					    int orig_val)
+{
+	preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+	int v = atomic_read(&grp->refcount);
+
+	/* workgroup is never frozen on uniprocessor systems */
+	DBG_BUGON(v == EROFS_LOCKED_MAGIC);
+	return v;
+}
+#endif	/* !CONFIG_SMP */
+
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#define EROFS_PCPUBUF_NR_PAGES          Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PCPUBUF_NR_PAGES          0
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
+/* we strictly follow PAGE_SIZE; no buffer heads are used yet */
+#define LOG_BLOCK_SIZE		PAGE_SHIFT
+
+#undef LOG_SECTORS_PER_BLOCK
+#define LOG_SECTORS_PER_BLOCK	(PAGE_SHIFT - 9)
+
+#undef SECTORS_PER_BLOCK
+#define SECTORS_PER_BLOCK	(1 << LOG_SECTORS_PER_BLOCK)
+
+#define EROFS_BLKSIZ		(1 << LOG_BLOCK_SIZE)
+
+#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
+#error erofs cannot be used in this platform
+#endif
+
+#define ROOT_NID(sb)		((sb)->root_nid)
+
+#define erofs_blknr(addr)       ((addr) / EROFS_BLKSIZ)
+#define erofs_blkoff(addr)      ((addr) % EROFS_BLKSIZ)
+#define blknr_to_addr(nr)       ((erofs_off_t)(nr) * EROFS_BLKSIZ)
+
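+/* nids are slot indexes within the metadata area, hence the shift */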
+static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+	return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
+}
+
+/* atomic flag definitions */
+#define EROFS_I_EA_INITED_BIT	0
+#define EROFS_I_Z_INITED_BIT	1
+
+/* bitlock definitions (arranged in reverse order) */
+#define EROFS_I_BL_XATTR_BIT	(BITS_PER_LONG - 1)
+#define EROFS_I_BL_Z_BIT	(BITS_PER_LONG - 2)
+
+struct erofs_inode {
+	erofs_nid_t nid;
+
+	/* atomic flags (including bitlocks) */
+	unsigned long flags;
+
+	unsigned char datalayout;
+	unsigned char inode_isize;
+	unsigned short xattr_isize;
+
+	unsigned int xattr_shared_count;
+	unsigned int *xattr_shared_xattrs;
+
+	union {
+		erofs_blk_t raw_blkaddr;
+#ifdef CONFIG_EROFS_FS_ZIP
+		struct {
+			unsigned short z_advise;
+			unsigned char  z_algorithmtype[2];
+			unsigned char  z_logical_clusterbits;
+			unsigned char  z_physical_clusterbits[2];
+		};
+#endif	/* CONFIG_EROFS_FS_ZIP */
+	};
+	/* the corresponding vfs inode */
+	struct inode vfs_inode;
+};
+
+#define EROFS_I(ptr)	\
+	container_of(ptr, struct erofs_inode, vfs_inode)
+
+static inline unsigned long erofs_inode_datablocks(struct inode *inode)
+{
+	/* since i_size cannot be changed */
+	return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+}
+
+static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit,
+					  unsigned int bits)
+{
+
+	return (value >> bit) & ((1 << bits) - 1);
+}
+
+
+static inline unsigned int erofs_inode_version(unsigned int value)
+{
+	return erofs_bitrange(value, EROFS_I_VERSION_BIT,
+			      EROFS_I_VERSION_BITS);
+}
+
+static inline unsigned int erofs_inode_datalayout(unsigned int value)
+{
+	return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT,
+			      EROFS_I_DATALAYOUT_BITS);
+}
+
+extern const struct super_operations erofs_sops;
+
+extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
+#endif
+
+/*
+ * Logical to physical block mapping, used by erofs_map_blocks()
+ *
+ * Different from other file systems, it is used for 2 access modes:
+ *
+ * 1) RAW access mode:
+ *
+ * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
+ * and get the valid m_pblk, m_pofs and the longest m_len (in bytes).
+ *
+ * Note that m_lblk in the RAW access mode refers to the number of
+ * the compressed ondisk block rather than the uncompressed
+ * in-memory block for the compressed file.
+ *
+ * m_pofs equals m_lofs except for the inline data page.
+ *
+ * 2) Normal access mode:
+ *
+ * If the inode is not compressed, there is no difference from
+ * the RAW access mode. However, if the inode is compressed,
+ * users should pass a valid (m_lblk, m_lofs) pair, and get
+ * the needed m_pblk, m_pofs, m_len to get the compressed data
+ * and the updated m_lblk, m_lofs which indicates the start
+ * of the corresponding uncompressed data in the file.
+ */
+enum {
+	BH_Zipped = BH_PrivateStart,
+	BH_FullMapped,
+};
+
+/* Has a disk mapping */
+#define EROFS_MAP_MAPPED	(1 << BH_Mapped)
+/* Located in metadata (could be copied from bd_inode) */
+#define EROFS_MAP_META		(1 << BH_Meta)
+/* The extent has been compressed */
+#define EROFS_MAP_ZIPPED	(1 << BH_Zipped)
+/* The length of extent is full */
+#define EROFS_MAP_FULL_MAPPED	(1 << BH_FullMapped)
+
+struct erofs_map_blocks {
+	erofs_off_t m_pa, m_la;
+	u64 m_plen, m_llen;
+
+	unsigned int m_flags;
+
+	struct page *mpage;
+};
+
+/* Flags used by erofs_map_blocks() */
+#define EROFS_GET_BLOCKS_RAW    0x0001
+
+/* zmap.c */
+#ifdef CONFIG_EROFS_FS_ZIP
+int z_erofs_fill_inode(struct inode *inode);
+int z_erofs_map_blocks_iter(struct inode *inode,
+			    struct erofs_map_blocks *map,
+			    int flags);
+#else
+static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; }
+static inline int z_erofs_map_blocks_iter(struct inode *inode,
+					  struct erofs_map_blocks *map,
+					  int flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
+/* data.c */
+struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+
+int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
+
+/* inode.c */
+static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
+{
+#if BITS_PER_LONG == 32
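+	/* fold the 64-bit nid into 32 bits by xoring both halves */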
+	return (nid >> 32) ^ (nid & 0xffffffff);
+#else
+	return nid;
+#endif
+}
+
+extern const struct inode_operations erofs_generic_iops;
+extern const struct inode_operations erofs_symlink_iops;
+extern const struct inode_operations erofs_fast_symlink_iops;
+
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
+int erofs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int query_flags);
+
+/* namei.c */
+extern const struct inode_operations erofs_dir_iops;
+
+int erofs_namei(struct inode *dir, struct qstr *name,
+		erofs_nid_t *nid, unsigned int *d_type);
+
+/* dir.c */
+extern const struct file_operations erofs_dir_fops;
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
+
+#if (EROFS_PCPUBUF_NR_PAGES > 0)
+void *erofs_get_pcpubuf(unsigned int pagenr);
+#define erofs_put_pcpubuf(buf) do { \
+	(void)&(buf);	\
+	preempt_enable();	\
+} while (0)
+#else
+static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+#define erofs_put_pcpubuf(buf) do {} while (0)
+#endif
+
+#ifdef CONFIG_EROFS_FS_ZIP
+int erofs_workgroup_put(struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+					     pgoff_t index, bool *tag);
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp, bool tag);
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+void erofs_shrinker_register(struct super_block *sb);
+void erofs_shrinker_unregister(struct super_block *sb);
+int __init erofs_init_shrinker(void);
+void erofs_exit_shrinker(void);
+int __init z_erofs_init_zip_subsystem(void);
+void z_erofs_exit_zip_subsystem(void);
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				       struct erofs_workgroup *egrp);
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+				  struct page *page);
+#else
+static inline void erofs_shrinker_register(struct super_block *sb) {}
+static inline void erofs_shrinker_unregister(struct super_block *sb) {}
+static inline int erofs_init_shrinker(void) { return 0; }
+static inline void erofs_exit_shrinker(void) {}
+static inline int z_erofs_init_zip_subsystem(void) { return 0; }
+static inline void z_erofs_exit_zip_subsystem(void) {}
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
+#define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
+
+#endif	/* __EROFS_INTERNAL_H */
+
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
new file mode 100644
index 0000000..3abbecb
--- /dev/null
+++ b/fs/erofs/namei.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "xattr.h"
+
+#include <trace/events/erofs.h>
+
+struct erofs_qstr {
+	const unsigned char *name;
+	const unsigned char *end;
+};
+
+/* this relies on qn->end being accurate; qn must have a trailing '\0' */
+static inline int erofs_dirnamecmp(const struct erofs_qstr *qn,
+				   const struct erofs_qstr *qd,
+				   unsigned int *matched)
+{
+	unsigned int i = *matched;
+
+	/*
+	 * on-disk error: only BUG_ON in debugging mode;
+	 * otherwise, return 1 to just skip the invalid name
+	 * and go on (in consideration of lookup performance).
+	 */
+	DBG_BUGON(qd->name > qd->end);
+
+	/* qd may lack a trailing '\0' */
+	/* however, access is absolutely safe while below qd->end */
+	while (qd->name + i < qd->end && qd->name[i] != '\0') {
+		if (qn->name[i] != qd->name[i]) {
+			*matched = i;
+			return qn->name[i] > qd->name[i] ? 1 : -1;
+		}
+		++i;
+	}
+	*matched = i;
+	/* See comments in __d_alloc on the terminating NUL character */
+	return qn->name[i] == '\0' ? 0 : 1;
+}
+
+#define nameoff_from_disk(off, sz)	(le16_to_cpu(off) & ((sz) - 1))
+
+static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
+					       u8 *data,
+					       unsigned int dirblksize,
+					       const int ndirents)
+{
+	int head, back;
+	unsigned int startprfx, endprfx;
+	struct erofs_dirent *const de = (struct erofs_dirent *)data;
+
+	/* since the 1st dirent has been evaluated previously */
+	head = 1;
+	back = ndirents - 1;
+	startprfx = endprfx = 0;
+
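+	/*
+	 * binary search with prefix tracking: every candidate shares at
+	 * least min(startprfx, endprfx) leading bytes with the target,
+	 * so each comparison can resume from that offset.
+	 */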
+	while (head <= back) {
+		const int mid = head + (back - head) / 2;
+		const int nameoff = nameoff_from_disk(de[mid].nameoff,
+						      dirblksize);
+		unsigned int matched = min(startprfx, endprfx);
+		struct erofs_qstr dname = {
+			.name = data + nameoff,
+			.end = mid >= ndirents - 1 ?
+				data + dirblksize :
+				data + nameoff_from_disk(de[mid + 1].nameoff,
+							 dirblksize)
+		};
+
+		/* string comparison skipping the already-matched prefix */
+		int ret = erofs_dirnamecmp(name, &dname, &matched);
+
+		if (!ret) {
+			return de + mid;
+		} else if (ret > 0) {
+			head = mid + 1;
+			startprfx = matched;
+		} else {
+			back = mid - 1;
+			endprfx = matched;
+		}
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct page *find_target_block_classic(struct inode *dir,
+					      struct erofs_qstr *name,
+					      int *_ndirents)
+{
+	unsigned int startprfx, endprfx;
+	int head, back;
+	struct address_space *const mapping = dir->i_mapping;
+	struct page *candidate = ERR_PTR(-ENOENT);
+
+	startprfx = endprfx = 0;
+	head = 0;
+	back = erofs_inode_datablocks(dir) - 1;
+
+	while (head <= back) {
+		const int mid = head + (back - head) / 2;
+		struct page *page = read_mapping_page(mapping, mid, NULL);
+
+		if (!IS_ERR(page)) {
+			struct erofs_dirent *de = kmap_atomic(page);
+			const int nameoff = nameoff_from_disk(de->nameoff,
+							      EROFS_BLKSIZ);
+			const int ndirents = nameoff / sizeof(*de);
+			int diff;
+			unsigned int matched;
+			struct erofs_qstr dname;
+
+			if (!ndirents) {
+				kunmap_atomic(de);
+				put_page(page);
+				erofs_err(dir->i_sb,
+					  "corrupted dir block %d @ nid %llu",
+					  mid, EROFS_I(dir)->nid);
+				DBG_BUGON(1);
+				page = ERR_PTR(-EFSCORRUPTED);
+				goto out;
+			}
+
+			matched = min(startprfx, endprfx);
+
+			dname.name = (u8 *)de + nameoff;
+			if (ndirents == 1)
+				dname.end = (u8 *)de + EROFS_BLKSIZ;
+			else
+				dname.end = (u8 *)de +
+					nameoff_from_disk(de[1].nameoff,
+							  EROFS_BLKSIZ);
+
+			/* string comparison skipping the already-matched prefix */
+			diff = erofs_dirnamecmp(name, &dname, &matched);
+			kunmap_atomic(de);
+
+			if (!diff) {
+				*_ndirents = 0;
+				goto out;
+			} else if (diff > 0) {
+				head = mid + 1;
+				startprfx = matched;
+
+				if (!IS_ERR(candidate))
+					put_page(candidate);
+				candidate = page;
+				*_ndirents = ndirents;
+			} else {
+				put_page(page);
+
+				back = mid - 1;
+				endprfx = matched;
+			}
+			continue;
+		}
+out:		/* put the candidate page if it is still valid */
+		if (!IS_ERR(candidate))
+			put_page(candidate);
+		return page;
+	}
+	return candidate;
+}
+
+int erofs_namei(struct inode *dir,
+		struct qstr *name,
+		erofs_nid_t *nid, unsigned int *d_type)
+{
+	int ndirents;
+	struct page *page;
+	void *data;
+	struct erofs_dirent *de;
+	struct erofs_qstr qn;
+
+	if (!dir->i_size)
+		return -ENOENT;
+
+	qn.name = name->name;
+	qn.end = name->name + name->len;
+
+	ndirents = 0;
+	page = find_target_block_classic(dir, &qn, &ndirents);
+
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	data = kmap_atomic(page);
+	/* the target page has been mapped */
+	if (ndirents)
+		de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
+	else
+		de = (struct erofs_dirent *)data;
+
+	if (!IS_ERR(de)) {
+		*nid = le64_to_cpu(de->nid);
+		*d_type = de->file_type;
+	}
+
+	kunmap_atomic(data);
+	put_page(page);
+
+	return PTR_ERR_OR_ZERO(de);
+}
+
+/* NOTE: i_mutex is already held by vfs */
+static struct dentry *erofs_lookup(struct inode *dir,
+				   struct dentry *dentry,
+				   unsigned int flags)
+{
+	int err;
+	erofs_nid_t nid;
+	unsigned int d_type;
+	struct inode *inode;
+
+	DBG_BUGON(!d_really_is_negative(dentry));
+	/* dentry must be unhashed in lookup, no need to worry about it */
+	DBG_BUGON(!d_unhashed(dentry));
+
+	trace_erofs_lookup(dir, dentry, flags);
+
+	/* file name exceeds fs limit */
+	if (dentry->d_name.len > EROFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	/* silence false "uninitialized" warnings on gcc 4.8.x */
+	err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
+
+	if (err == -ENOENT) {
+		/* negative dentry */
+		inode = NULL;
+	} else if (err) {
+		inode = ERR_PTR(err);
+	} else {
+		erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__,
+			  dentry->d_name.name, nid, d_type);
+		inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+const struct inode_operations erofs_dir_iops = {
+	.lookup = erofs_lookup,
+	.getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+	.listxattr = erofs_listxattr,
+#endif
+	.get_acl = erofs_get_acl,
+};
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
new file mode 100644
index 0000000..0e36949
--- /dev/null
+++ b/fs/erofs/super.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include "xattr.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/erofs.h>
+
+static struct kmem_cache *erofs_inode_cachep __read_mostly;
+
+void _erofs_err(struct super_block *sb, const char *function,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
+	va_end(args);
+}
+
+void _erofs_info(struct super_block *sb, const char *function,
+		 const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	pr_info("(device %s): %pV", sb->s_id, &vaf);
+	va_end(args);
+}
+
+static void erofs_inode_init_once(void *ptr)
+{
+	struct erofs_inode *vi = ptr;
+
+	inode_init_once(&vi->vfs_inode);
+}
+
+static struct inode *erofs_alloc_inode(struct super_block *sb)
+{
+	struct erofs_inode *vi =
+		kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+
+	if (!vi)
+		return NULL;
+
+	/* zero out everything except vfs_inode */
+	memset(vi, 0, offsetof(struct erofs_inode, vfs_inode));
+	return &vi->vfs_inode;
+}
+
+static void erofs_free_inode(struct inode *inode)
+{
+	struct erofs_inode *vi = EROFS_I(inode);
+
+	/* be careful of RCU symlink path */
+	if (inode->i_op == &erofs_fast_symlink_iops)
+		kfree(inode->i_link);
+	kfree(vi->xattr_shared_xattrs);
+
+	kmem_cache_free(erofs_inode_cachep, vi);
+}
+
+static bool check_layout_compatibility(struct super_block *sb,
+				       struct erofs_super_block *dsb)
+{
+	const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
+
+	EROFS_SB(sb)->feature_incompat = feature;
+
+	/* check if current kernel meets all mandatory requirements */
+	if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
+		erofs_err(sb,
+			  "unidentified incompatible feature %x, please upgrade kernel version",
+			   feature & ~EROFS_ALL_FEATURE_INCOMPAT);
+		return false;
+	}
+	return true;
+}
+
+static int erofs_read_superblock(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi;
+	struct page *page;
+	struct erofs_super_block *dsb;
+	unsigned int blkszbits;
+	void *data;
+	int ret;
+
+	page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
+	if (IS_ERR(page)) {
+		erofs_err(sb, "cannot read erofs superblock");
+		return PTR_ERR(page);
+	}
+
+	sbi = EROFS_SB(sb);
+
+	data = kmap_atomic(page);
+	dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
+
+	ret = -EINVAL;
+	if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
+		erofs_err(sb, "cannot find valid erofs superblock");
+		goto out;
+	}
+
+	blkszbits = dsb->blkszbits;
+	/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
+	if (blkszbits != LOG_BLOCK_SIZE) {
+		erofs_err(sb, "blksize %u isn't supported on this platform",
+			  1 << blkszbits);
+		goto out;
+	}
+
+	if (!check_layout_compatibility(sb, dsb))
+		goto out;
+
+	sbi->blocks = le32_to_cpu(dsb->blocks);
+	sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
+#ifdef CONFIG_EROFS_FS_XATTR
+	sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
+#endif
+	sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+	sbi->root_nid = le16_to_cpu(dsb->root_nid);
+	sbi->inos = le64_to_cpu(dsb->inos);
+
+	sbi->build_time = le64_to_cpu(dsb->build_time);
+	sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
+
+	memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+
+	ret = strscpy(sbi->volume_name, dsb->volume_name,
+		      sizeof(dsb->volume_name));
+	if (ret < 0) {	/* -E2BIG */
+		erofs_err(sb, "bad volume name without NIL terminator");
+		ret = -EFSCORRUPTED;
+		goto out;
+	}
+	ret = 0;
+out:
+	kunmap_atomic(data);
+	put_page(page);
+	return ret;
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+static int erofs_build_cache_strategy(struct super_block *sb,
+				      substring_t *args)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	const char *cs = match_strdup(args);
+	int err = 0;
+
+	if (!cs) {
+		erofs_err(sb, "Not enough memory to store cache strategy");
+		return -ENOMEM;
+	}
+
+	if (!strcmp(cs, "disabled")) {
+		sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED;
+	} else if (!strcmp(cs, "readahead")) {
+		sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD;
+	} else if (!strcmp(cs, "readaround")) {
+		sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+	} else {
+		erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs);
+		err = -EINVAL;
+	}
+	kfree(cs);
+	return err;
+}
+#else
+static int erofs_build_cache_strategy(struct super_block *sb,
+				      substring_t *args)
+{
+	erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored");
+	return 0;
+}
+#endif
+
+/* set up default EROFS parameters */
+static void erofs_default_options(struct erofs_sb_info *sbi)
+{
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+	sbi->max_sync_decompress_pages = 3;
+#endif
+#ifdef CONFIG_EROFS_FS_XATTR
+	set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	set_opt(sbi, POSIX_ACL);
+#endif
+}
+
+enum {
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_acl,
+	Opt_noacl,
+	Opt_cache_strategy,
+	Opt_err
+};
+
+static match_table_t erofs_tokens = {
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_cache_strategy, "cache_strategy=%s"},
+	{Opt_err, NULL}
+};
+
+static int erofs_parse_options(struct super_block *sb, char *options)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int err;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ","))) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, erofs_tokens, args);
+
+		switch (token) {
+#ifdef CONFIG_EROFS_FS_XATTR
+		case Opt_user_xattr:
+			set_opt(EROFS_SB(sb), XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt(EROFS_SB(sb), XATTR_USER);
+			break;
+#else
+		case Opt_user_xattr:
+			erofs_info(sb, "user_xattr options not supported");
+			break;
+		case Opt_nouser_xattr:
+			erofs_info(sb, "nouser_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(EROFS_SB(sb), POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(EROFS_SB(sb), POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+			erofs_info(sb, "acl options not supported");
+			break;
+		case Opt_noacl:
+			erofs_info(sb, "noacl options not supported");
+			break;
+#endif
+		case Opt_cache_strategy:
+			err = erofs_build_cache_strategy(sb, args);
+			if (err)
+				return err;
+			break;
+		default:
+			erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+static const struct address_space_operations managed_cache_aops;
+
+static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	DBG_BUGON(!PageLocked(page));
+	DBG_BUGON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = erofs_try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void erofs_managed_cache_invalidatepage(struct page *page,
+					       unsigned int offset,
+					       unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	DBG_BUGON(!PageLocked(page));
+
+	/* Check for potential overflow in debug mode */
+	DBG_BUGON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = erofs_managed_cache_releasepage,
+	.invalidatepage = erofs_managed_cache_invalidatepage,
+};
+
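+/*
+ * The managed cache is an in-memory pseudo inode whose page cache keeps
+ * cached compressed pages; its ->releasepage hook
+ * (erofs_managed_cache_releasepage above) lets the VM reclaim those
+ * pages under memory pressure.
+ */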
+static int erofs_init_managed_cache(struct super_block *sb)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	struct inode *const inode = new_inode(sb);
+
+	if (!inode)
+		return -ENOMEM;
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+			     GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+	sbi->managed_cache = inode;
+	return 0;
+}
+#else
+static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+#endif
+
+static int erofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct erofs_sb_info *sbi;
+	int err;
+
+	sb->s_magic = EROFS_SUPER_MAGIC;
+
+	if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+		erofs_err(sb, "failed to set erofs blksize");
+		return -EINVAL;
+	}
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+	err = erofs_read_superblock(sb);
+	if (err)
+		return err;
+
+	sb->s_flags |= SB_RDONLY | SB_NOATIME;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
+
+	sb->s_op = &erofs_sops;
+
+#ifdef CONFIG_EROFS_FS_XATTR
+	sb->s_xattr = erofs_xattr_handlers;
+#endif
+	/* set erofs default mount options */
+	erofs_default_options(sbi);
+
+	err = erofs_parse_options(sb, data);
+	if (err)
+		return err;
+
+	if (test_opt(sbi, POSIX_ACL))
+		sb->s_flags |= SB_POSIXACL;
+	else
+		sb->s_flags &= ~SB_POSIXACL;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+#endif
+
+	/* get the root inode */
+	inode = erofs_iget(sb, ROOT_NID(sbi), true);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (!S_ISDIR(inode->i_mode)) {
+		erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
+			  ROOT_NID(sbi), inode->i_mode);
+		iput(inode);
+		return -EINVAL;
+	}
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
+
+	erofs_shrinker_register(sb);
+	/* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
+	err = erofs_init_managed_cache(sb);
+	if (err)
+		return err;
+
+	erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.",
+		   (char *)data, ROOT_NID(sbi));
+	return 0;
+}
+
+static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags,
+				  const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super);
+}
+
+/*
+ * This can be triggered after deactivate_locked_super() is called,
+ * so it covers both umount and the error path of a failed initialization.
+ */
+static void erofs_kill_sb(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi;
+
+	WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
+
+	kill_block_super(sb);
+
+	sbi = EROFS_SB(sb);
+	if (!sbi)
+		return;
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+}
+
+/* called when ->s_root is non-NULL */
+static void erofs_put_super(struct super_block *sb)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+	DBG_BUGON(!sbi);
+
+	erofs_shrinker_unregister(sb);
+#ifdef CONFIG_EROFS_FS_ZIP
+	iput(sbi->managed_cache);
+	sbi->managed_cache = NULL;
+#endif
+}
+
+static struct file_system_type erofs_fs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "erofs",
+	.mount          = erofs_mount,
+	.kill_sb        = erofs_kill_sb,
+	.fs_flags       = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("erofs");
+
+static int __init erofs_module_init(void)
+{
+	int err;
+
+	erofs_check_ondisk_layout_definitions();
+
+	erofs_inode_cachep = kmem_cache_create("erofs_inode",
+					       sizeof(struct erofs_inode), 0,
+					       SLAB_RECLAIM_ACCOUNT,
+					       erofs_inode_init_once);
+	if (!erofs_inode_cachep) {
+		err = -ENOMEM;
+		goto icache_err;
+	}
+
+	err = erofs_init_shrinker();
+	if (err)
+		goto shrinker_err;
+
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+
+	err = register_filesystem(&erofs_fs_type);
+	if (err)
+		goto fs_err;
+
+	return 0;
+
+fs_err:
+	z_erofs_exit_zip_subsystem();
+zip_err:
+	erofs_exit_shrinker();
+shrinker_err:
+	kmem_cache_destroy(erofs_inode_cachep);
+icache_err:
+	return err;
+}
+
+static void __exit erofs_module_exit(void)
+{
+	unregister_filesystem(&erofs_fs_type);
+	z_erofs_exit_zip_subsystem();
+	erofs_exit_shrinker();
+
+	/* Ensure all RCU free inodes are safe before cache is destroyed. */
+	rcu_barrier();
+	kmem_cache_destroy(erofs_inode_cachep);
+}
+
+/* get filesystem statistics */
+static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = EROFS_BLKSIZ;
+	buf->f_blocks = sbi->blocks;
+	buf->f_bfree = buf->f_bavail = 0;
+
+	buf->f_files = ULLONG_MAX;
+	buf->f_ffree = ULLONG_MAX - sbi->inos;
+
+	buf->f_namelen = EROFS_NAME_LEN;
+
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	return 0;
+}
+
+static int erofs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
+
+#ifdef CONFIG_EROFS_FS_XATTR
+	if (test_opt(sbi, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	else
+		seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	if (test_opt(sbi, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	else
+		seq_puts(seq, ",noacl");
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP
+	if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) {
+		seq_puts(seq, ",cache_strategy=disabled");
+	} else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) {
+		seq_puts(seq, ",cache_strategy=readahead");
+	} else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
+		seq_puts(seq, ",cache_strategy=readaround");
+	} else {
+		seq_puts(seq, ",cache_strategy=(unknown)");
+		DBG_BUGON(1);
+	}
+#endif
+	return 0;
+}
+
+static int erofs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	unsigned int org_mnt_opt = sbi->mount_opt;
+	int err;
+
+	DBG_BUGON(!sb_rdonly(sb));
+	err = erofs_parse_options(sb, data);
+	if (err)
+		goto out;
+
+	if (test_opt(sbi, POSIX_ACL))
+		sb->s_flags |= SB_POSIXACL;
+	else
+		sb->s_flags &= ~SB_POSIXACL;
+
+	*flags |= SB_RDONLY;
+	return 0;
+out:
+	sbi->mount_opt = org_mnt_opt;
+	return err;
+}
+
+const struct super_operations erofs_sops = {
+	.put_super = erofs_put_super,
+	.alloc_inode = erofs_alloc_inode,
+	.free_inode = erofs_free_inode,
+	.statfs = erofs_statfs,
+	.show_options = erofs_show_options,
+	.remount_fs = erofs_remount,
+};
+
+module_init(erofs_module_init);
+module_exit(erofs_module_exit);
+
+MODULE_DESCRIPTION("Enhanced ROM File System");
+MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
new file mode 100644
index 0000000..a72897c
--- /dev/null
+++ b/fs/erofs/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_TAGPTR_H
+#define __EROFS_FS_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the names of the tagged pointer types are tagptr{1, 2, 3...}_t;
+ * avoid using the internal structs __tagptr{1, 2, 3...} directly
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* tolerate the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for tagged pointers */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void)(&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void)(&o == &n); \
+	(void)(&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* callers should wrap this with WRITE_ONCE() if an atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif	/* __EROFS_FS_TAGPTR_H */
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 0000000..d92b3e7
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/pagevec.h>
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		DBG_BUGON(page_ref_count(page) != 1);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
+	}
+	return page;
+}
+
+#if (EROFS_PCPUBUF_NR_PAGES > 0)
+static struct {
+	u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
+} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
+
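+/*
+ * Note that preemption stays disabled on return; the matching
+ * preempt_enable() is left to the caller once the buffer is no longer
+ * in use.
+ */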
+void *erofs_get_pcpubuf(unsigned int pagenr)
+{
+	preempt_disable();
+	return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
+}
+#endif
+
+#ifdef CONFIG_EROFS_FS_ZIP
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp)	atomic_dec(&(grp)->refcount)
+
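+/*
+ * Workgroup refcount lifecycle as implemented below: a refcount of 1
+ * means no active user, so the workgroup is freeable and counted in
+ * erofs_global_shrink_cnt; getting a reference again (1 -> 2) takes it
+ * back out of the shrink count, and the shrinker freezes the refcount
+ * in order to reclaim the workgroup.
+ */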
+static int erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+	int o;
+
+repeat:
+	o = erofs_wait_on_workgroup_freezed(grp);
+	if (o <= 0)
+		return -1;
+
+	if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
+		goto repeat;
+
+	/* decrease the global shrink count bumped in erofs_workgroup_put */
+	if (o == 1)
+		atomic_long_dec(&erofs_global_shrink_cnt);
+	return 0;
+}
+
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+					     pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn_tree, index);
+	if (grp) {
+		*tag = xa_pointer_tag(grp);
+		grp = xa_untag_pointer(grp);
+
+		if (erofs_workgroup_get(grp)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		DBG_BUGON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp shouldn't be broken or already in use */
+	if (atomic_read(&grp->refcount) != 1) {
+		DBG_BUGON(1);
+		return -EINVAL;
+	}
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	xa_lock(&sbi->workstn_tree);
+
+	grp = xa_tag_pointer(grp, tag);
+
+	/*
+	 * Bump up the reference count before making this workgroup
+	 * visible to other users in order to avoid a potential UAF
+	 * on lookups that are not serialized by workstn_lock.
+	 */
+	__erofs_workgroup_get(grp);
+
+	err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
+	if (err)
+		/*
+		 * it's safe to decrease since the workgroup isn't visible
+		 * and refcount >= 2 (so it cannot be frozen).
+		 */
+		__erofs_workgroup_put(grp);
+
+	xa_unlock(&sbi->workstn_tree);
+	radix_tree_preload_end();
+	return err;
+}
+
+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+	atomic_long_dec(&erofs_global_shrink_cnt);
+	erofs_workgroup_free_rcu(grp);
+}
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count)
+		__erofs_workgroup_free(grp);
+	return count;
+}
+
+static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
+{
+	erofs_workgroup_unfreeze(grp, 0);
+	__erofs_workgroup_free(grp);
+}
+
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+					   struct erofs_workgroup *grp,
+					   bool cleanup)
+{
+	/*
+	 * If managed cache is on, the refcount of workgroups
+	 * themselves could be < 0 (frozen). In other words,
+	 * there is no guarantee that all refcounts are > 0.
+	 */
+	if (!erofs_workgroup_try_to_freeze(grp, 1))
+		return false;
+
+	/*
+	 * Note that all cached pages should be detached before the
+	 * workgroup is deleted from the radix tree. Otherwise some
+	 * cached pages could still be attached to the orphaned old
+	 * workgroup when the new one becomes available in the tree.
+	 */
+	if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
+		erofs_workgroup_unfreeze(grp, 1);
+		return false;
+	}
+
+	/*
+	 * It's impossible to fail after the workgroup is frozen;
+	 * however, in order to catch unexpected race conditions, add a
+	 * DBG_BUGON to observe this in advance.
+	 */
+	DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+						     grp->index)) != grp);
+
+	/*
+	 * If managed cache is on, the last refcount should be held
+	 * by the related workstation.
+	 */
+	erofs_workgroup_unfreeze_final(grp);
+	return true;
+}
+
+static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+					      unsigned long nr_shrink,
+					      bool cleanup)
+{
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned int freed = 0;
+
+	int i, found;
+repeat:
+	xa_lock(&sbi->workstn_tree);
+
+	found = radix_tree_gang_lookup(&sbi->workstn_tree,
+				       batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
+
+		first_index = grp->index + 1;
+
+		/* try to shrink each valid workgroup */
+		if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+			continue;
+
+		++freed;
+		if (!--nr_shrink)
+			break;
+	}
+	xa_unlock(&sbi->workstn_tree);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
+}
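+
+/*
+ * The scan above walks the workstation radix tree in PAGEVEC_SIZE
+ * batches, restarting from the index right after the last visited
+ * workgroup, until either nr_shrink workgroups have been freed or the
+ * tree is exhausted.
+ */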
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+	mutex_lock(&sbi->umount_mutex);
+	erofs_shrink_workstation(sbi, ~0UL, true);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_del(&sbi->list);
+	spin_unlock(&erofs_sb_list_lock);
+	mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do {
+		run_no = ++shrinker_run_no;
+	} while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		freed += erofs_shrink_workstation(sbi, nr, false);
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
+}
+
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
+int __init erofs_init_shrinker(void)
+{
+	return register_shrinker(&erofs_shrinker_info);
+}
+
+void erofs_exit_shrinker(void)
+{
+	unregister_shrinker(&erofs_shrinker_info);
+}
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 0000000..a13a787
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include <linux/security.h>
+#include "xattr.h"
+
+struct xattr_iter {
+	struct super_block *sb;
+	struct page *page;
+	void *kaddr;
+
+	erofs_blk_t blkaddr;
+	unsigned int ofs;
+};
+
+static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
+{
+	/* the only user of kunmap() is 'init_inode_xattrs' */
+	if (!atomic)
+		kunmap(it->page);
+	else
+		kunmap_atomic(it->kaddr);
+
+	unlock_page(it->page);
+	put_page(it->page);
+}
+
+static inline void xattr_iter_end_final(struct xattr_iter *it)
+{
+	if (!it->page)
+		return;
+
+	xattr_iter_end(it, true);
+}
+
+static int init_inode_xattrs(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct xattr_iter it;
+	unsigned int i;
+	struct erofs_xattr_ibody_header *ih;
+	struct super_block *sb;
+	struct erofs_sb_info *sbi;
+	bool atomic_map;
+	int ret = 0;
+
+	/* in most cases, the xattrs of this inode have already been initialized. */
+	if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+		return 0;
+
+	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
+		return -ERESTARTSYS;
+
+	/* someone has initialized xattrs for us? */
+	if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+		goto out_unlock;
+
+	/*
+	 * bypass all xattr operations if ->xattr_isize is not greater than
+	 * sizeof(struct erofs_xattr_ibody_header). In detail:
+	 * 1) if it is too small to contain erofs_xattr_ibody_header,
+	 *    ->xattr_isize should be 0 (which means no xattrs);
+	 * 2) if it only just fits erofs_xattr_ibody_header, that layout is
+	 *    undefined on-disk right now (it may be used later with some
+	 *    new sb feature).
+	 */
+	if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
+		erofs_err(inode->i_sb,
+			  "xattr_isize %d of nid %llu is not supported yet",
+			  vi->xattr_isize, vi->nid);
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	} else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
+		if (vi->xattr_isize) {
+			erofs_err(inode->i_sb,
+				  "bogus xattr ibody @ nid %llu", vi->nid);
+			DBG_BUGON(1);
+			ret = -EFSCORRUPTED;
+			goto out_unlock;	/* xattr ondisk layout error */
+		}
+		ret = -ENOATTR;
+		goto out_unlock;
+	}
+
+	sb = inode->i_sb;
+	sbi = EROFS_SB(sb);
+	it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
+	it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
+
+	it.page = erofs_get_meta_page(sb, it.blkaddr);
+	if (IS_ERR(it.page)) {
+		ret = PTR_ERR(it.page);
+		goto out_unlock;
+	}
+
+	/* read in shared xattr array (non-atomic, see kmalloc below) */
+	it.kaddr = kmap(it.page);
+	atomic_map = false;
+
+	ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
+
+	vi->xattr_shared_count = ih->h_shared_count;
+	vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
+						sizeof(uint), GFP_KERNEL);
+	if (!vi->xattr_shared_xattrs) {
+		xattr_iter_end(&it, atomic_map);
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/* let's skip ibody header */
+	it.ofs += sizeof(struct erofs_xattr_ibody_header);
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		if (it.ofs >= EROFS_BLKSIZ) {
+			/* cannot be unaligned */
+			DBG_BUGON(it.ofs != EROFS_BLKSIZ);
+			xattr_iter_end(&it, atomic_map);
+
+			it.page = erofs_get_meta_page(sb, ++it.blkaddr);
+			if (IS_ERR(it.page)) {
+				kfree(vi->xattr_shared_xattrs);
+				vi->xattr_shared_xattrs = NULL;
+				ret = PTR_ERR(it.page);
+				goto out_unlock;
+			}
+
+			it.kaddr = kmap_atomic(it.page);
+			atomic_map = true;
+			it.ofs = 0;
+		}
+		vi->xattr_shared_xattrs[i] =
+			le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
+		it.ofs += sizeof(__le32);
+	}
+	xattr_iter_end(&it, atomic_map);
+
+	set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
+
+out_unlock:
+	clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags);
+	return ret;
+}
+
+/*
+ * the general idea of these return values is:
+ * if    0 is returned, go on processing the current xattr;
+ * if    1 (> 0) is returned, skip this round and process the next xattr;
+ * if -err (< 0) is returned, an error (e.g. ENOATTR) occurred
+ *                            and needs to be handled
+ */
+struct xattr_iter_handlers {
+	int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
+	int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
+		    unsigned int len);
+	int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
+	void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
+		      unsigned int len);
+};
+
+static inline int xattr_iter_fixup(struct xattr_iter *it)
+{
+	if (it->ofs < EROFS_BLKSIZ)
+		return 0;
+
+	xattr_iter_end(it, true);
+
+	it->blkaddr += erofs_blknr(it->ofs);
+
+	it->page = erofs_get_meta_page(it->sb, it->blkaddr);
+	if (IS_ERR(it->page)) {
+		int err = PTR_ERR(it->page);
+
+		it->page = NULL;
+		return err;
+	}
+
+	it->kaddr = kmap_atomic(it->page);
+	it->ofs = erofs_blkoff(it->ofs);
+	return 0;
+}
+
+static int inline_xattr_iter_begin(struct xattr_iter *it,
+				   struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb);
+	unsigned int xattr_header_sz, inline_xattr_ofs;
+
+	xattr_header_sz = inlinexattr_header_size(inode);
+	if (xattr_header_sz >= vi->xattr_isize) {
+		DBG_BUGON(xattr_header_sz > vi->xattr_isize);
+		return -ENOATTR;
+	}
+
+	inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
+
+	it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
+	it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
+
+	it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr);
+	if (IS_ERR(it->page))
+		return PTR_ERR(it->page);
+
+	it->kaddr = kmap_atomic(it->page);
+	return vi->xattr_isize - xattr_header_sz;
+}
+
+/*
+ * Regardless of success or failure, `xattr_foreach' will end up with
+ * `ofs' pointing to the next xattr item rather than an arbitrary position.
+ */
+static int xattr_foreach(struct xattr_iter *it,
+			 const struct xattr_iter_handlers *op,
+			 unsigned int *tlimit)
+{
+	struct erofs_xattr_entry entry;
+	unsigned int value_sz, processed, slice;
+	int err;
+
+	/* 0. fixup blkaddr, ofs, ipage */
+	err = xattr_iter_fixup(it);
+	if (err)
+		return err;
+
+	/*
+	 * 1. read the xattr entry into memory; since entries are
+	 *    EROFS_XATTR_ALIGN'ed, each one is guaranteed to reside
+	 *    within a single page
+	 */
+	entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
+	if (tlimit) {
+		unsigned int entry_sz = erofs_xattr_entry_size(&entry);
+
+		/* xattr on-disk corruption: xattr entry beyond xattr_isize */
+		if (*tlimit < entry_sz) {
+			DBG_BUGON(1);
+			return -EFSCORRUPTED;
+		}
+		*tlimit -= entry_sz;
+	}
+
+	it->ofs += sizeof(struct erofs_xattr_entry);
+	value_sz = le16_to_cpu(entry.e_value_size);
+
+	/* handle entry */
+	err = op->entry(it, &entry);
+	if (err) {
+		it->ofs += entry.e_name_len + value_sz;
+		goto out;
+	}
+
+	/* 2. handle xattr name (ofs will finally be at the end of name) */
+	processed = 0;
+
+	while (processed < entry.e_name_len) {
+		if (it->ofs >= EROFS_BLKSIZ) {
+			DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+
+			err = xattr_iter_fixup(it);
+			if (err)
+				goto out;
+			it->ofs = 0;
+		}
+
+		slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+			      entry.e_name_len - processed);
+
+		/* handle name */
+		err = op->name(it, processed, it->kaddr + it->ofs, slice);
+		if (err) {
+			it->ofs += entry.e_name_len - processed + value_sz;
+			goto out;
+		}
+
+		it->ofs += slice;
+		processed += slice;
+	}
+
+	/* 3. handle xattr value */
+	processed = 0;
+
+	if (op->alloc_buffer) {
+		err = op->alloc_buffer(it, value_sz);
+		if (err) {
+			it->ofs += value_sz;
+			goto out;
+		}
+	}
+
+	while (processed < value_sz) {
+		if (it->ofs >= EROFS_BLKSIZ) {
+			DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+
+			err = xattr_iter_fixup(it);
+			if (err)
+				goto out;
+			it->ofs = 0;
+		}
+
+		slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+			      value_sz - processed);
+		op->value(it, processed, it->kaddr + it->ofs, slice);
+		it->ofs += slice;
+		processed += slice;
+	}
+
+out:
+	/* xattrs should be 4-byte aligned (on-disk constraint) */
+	it->ofs = EROFS_XATTR_ALIGN(it->ofs);
+	return err < 0 ? err : 0;
+}
+
+struct getxattr_iter {
+	struct xattr_iter it;
+
+	char *buffer;
+	int buffer_size, index;
+	struct qstr name;
+};
+
+static int xattr_entrymatch(struct xattr_iter *_it,
+			    struct erofs_xattr_entry *entry)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	return (it->index != entry->e_name_index ||
+		it->name.len != entry->e_name_len) ? -ENOATTR : 0;
+}
+
+static int xattr_namematch(struct xattr_iter *_it,
+			   unsigned int processed, char *buf, unsigned int len)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
+}
+
+static int xattr_checkbuffer(struct xattr_iter *_it,
+			     unsigned int value_sz)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+	int err = it->buffer_size < value_sz ? -ERANGE : 0;
+
+	it->buffer_size = value_sz;
+	return !it->buffer ? 1 : err;
+}
+
+static void xattr_copyvalue(struct xattr_iter *_it,
+			    unsigned int processed,
+			    char *buf, unsigned int len)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	memcpy(it->buffer + processed, buf, len);
+}
+
+static const struct xattr_iter_handlers find_xattr_handlers = {
+	.entry = xattr_entrymatch,
+	.name = xattr_namematch,
+	.alloc_buffer = xattr_checkbuffer,
+	.value = xattr_copyvalue
+};
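+
+/*
+ * Matching pipeline for getxattr: xattr_entrymatch() filters by name
+ * index and length, xattr_namematch() compares the name chunk by chunk,
+ * xattr_checkbuffer() validates the destination size (or only reports
+ * it when no buffer is supplied), and xattr_copyvalue() copies the
+ * value out.
+ */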
+
+static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+	int ret;
+	unsigned int remaining;
+
+	ret = inline_xattr_iter_begin(&it->it, inode);
+	if (ret < 0)
+		return ret;
+
+	remaining = ret;
+	while (remaining) {
+		ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
+		if (ret != -ENOATTR)
+			break;
+	}
+	xattr_iter_end_final(&it->it);
+
+	return ret ? ret : it->buffer_size;
+}
+
+static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct super_block *const sb = inode->i_sb;
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	unsigned int i;
+	int ret = -ENOATTR;
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		erofs_blk_t blkaddr =
+			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+
+		if (!i || blkaddr != it->it.blkaddr) {
+			if (i)
+				xattr_iter_end(&it->it, true);
+
+			it->it.page = erofs_get_meta_page(sb, blkaddr);
+			if (IS_ERR(it->it.page))
+				return PTR_ERR(it->it.page);
+
+			it->it.kaddr = kmap_atomic(it->it.page);
+			it->it.blkaddr = blkaddr;
+		}
+
+		ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
+		if (ret != -ENOATTR)
+			break;
+	}
+	if (vi->xattr_shared_count)
+		xattr_iter_end_final(&it->it);
+
+	return ret ? ret : it->buffer_size;
+}
+
+static bool erofs_xattr_user_list(struct dentry *dentry)
+{
+	return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
+}
+
+static bool erofs_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+int erofs_getxattr(struct inode *inode, int index,
+		   const char *name,
+		   void *buffer, size_t buffer_size)
+{
+	int ret;
+	struct getxattr_iter it;
+
+	if (!name)
+		return -EINVAL;
+
+	ret = init_inode_xattrs(inode);
+	if (ret)
+		return ret;
+
+	it.index = index;
+
+	it.name.len = strlen(name);
+	if (it.name.len > EROFS_NAME_LEN)
+		return -ERANGE;
+	it.name.name = name;
+
+	it.buffer = buffer;
+	it.buffer_size = buffer_size;
+
+	it.it.sb = inode->i_sb;
+	ret = inline_getxattr(inode, &it);
+	if (ret == -ENOATTR)
+		ret = shared_getxattr(inode, &it);
+	return ret;
+}
+
+static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
+{
+	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+	switch (handler->flags) {
+	case EROFS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		break;
+	case EROFS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		break;
+	case EROFS_XATTR_INDEX_SECURITY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return erofs_getxattr(inode, handler->flags, name, buffer, size);
+}
+
+const struct xattr_handler erofs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_USER,
+	.list	= erofs_xattr_user_list,
+	.get	= erofs_xattr_generic_get,
+};
+
+const struct xattr_handler erofs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_TRUSTED,
+	.list	= erofs_xattr_trusted_list,
+	.get	= erofs_xattr_generic_get,
+};
+
+#ifdef CONFIG_EROFS_FS_SECURITY
+const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_SECURITY,
+	.get	= erofs_xattr_generic_get,
+};
+#endif
+
+const struct xattr_handler *erofs_xattr_handlers[] = {
+	&erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+	&erofs_xattr_security_handler,
+#endif
+	NULL,
+};
+
+struct listxattr_iter {
+	struct xattr_iter it;
+
+	struct dentry *dentry;
+	char *buffer;
+	int buffer_size, buffer_ofs;
+};
+
+static int xattr_entrylist(struct xattr_iter *_it,
+			   struct erofs_xattr_entry *entry)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+	unsigned int prefix_len;
+	const char *prefix;
+
+	const struct xattr_handler *h =
+		erofs_xattr_handler(entry->e_name_index);
+
+	if (!h || (h->list && !h->list(it->dentry)))
+		return 1;
+
+	prefix = xattr_prefix(h);
+	prefix_len = strlen(prefix);
+
+	if (!it->buffer) {
+		it->buffer_ofs += prefix_len + entry->e_name_len + 1;
+		return 1;
+	}
+
+	if (it->buffer_ofs + prefix_len
+		+ entry->e_name_len + 1 > it->buffer_size)
+		return -ERANGE;
+
+	memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
+	it->buffer_ofs += prefix_len;
+	return 0;
+}
+
+static int xattr_namelist(struct xattr_iter *_it,
+			  unsigned int processed, char *buf, unsigned int len)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+
+	memcpy(it->buffer + it->buffer_ofs, buf, len);
+	it->buffer_ofs += len;
+	return 0;
+}
+
+static int xattr_skipvalue(struct xattr_iter *_it,
+			   unsigned int value_sz)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+
+	it->buffer[it->buffer_ofs++] = '\0';
+	return 1;
+}
+
+static const struct xattr_iter_handlers list_xattr_handlers = {
+	.entry = xattr_entrylist,
+	.name = xattr_namelist,
+	.alloc_buffer = xattr_skipvalue,
+	.value = NULL
+};
+
+static int inline_listxattr(struct listxattr_iter *it)
+{
+	int ret;
+	unsigned int remaining;
+
+	ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
+	if (ret < 0)
+		return ret;
+
+	remaining = ret;
+	while (remaining) {
+		ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
+		if (ret)
+			break;
+	}
+	xattr_iter_end_final(&it->it);
+	return ret ? ret : it->buffer_ofs;
+}
+
+static int shared_listxattr(struct listxattr_iter *it)
+{
+	struct inode *const inode = d_inode(it->dentry);
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct super_block *const sb = inode->i_sb;
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		erofs_blk_t blkaddr =
+			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+		if (!i || blkaddr != it->it.blkaddr) {
+			if (i)
+				xattr_iter_end(&it->it, true);
+
+			it->it.page = erofs_get_meta_page(sb, blkaddr);
+			if (IS_ERR(it->it.page))
+				return PTR_ERR(it->it.page);
+
+			it->it.kaddr = kmap_atomic(it->it.page);
+			it->it.blkaddr = blkaddr;
+		}
+
+		ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
+		if (ret)
+			break;
+	}
+	if (vi->xattr_shared_count)
+		xattr_iter_end_final(&it->it);
+
+	return ret ? ret : it->buffer_ofs;
+}
+
+ssize_t erofs_listxattr(struct dentry *dentry,
+			char *buffer, size_t buffer_size)
+{
+	int ret;
+	struct listxattr_iter it;
+
+	ret = init_inode_xattrs(d_inode(dentry));
+	if (ret)
+		return ret;
+
+	it.dentry = dentry;
+	it.buffer = buffer;
+	it.buffer_size = buffer_size;
+	it.buffer_ofs = 0;
+
+	it.it.sb = dentry->d_sb;
+
+	ret = inline_listxattr(&it);
+	if (ret < 0 && ret != -ENOATTR)
+		return ret;
+	return shared_listxattr(&it);
+}
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	int prefix, rc;
+	char *value = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	rc = erofs_getxattr(inode, prefix, "", NULL, 0);
+	if (rc > 0) {
+		value = kmalloc(rc, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		rc = erofs_getxattr(inode, prefix, "", value, rc);
+	}
+
+	if (rc == -ENOATTR)
+		acl = NULL;
+	else if (rc < 0)
+		acl = ERR_PTR(rc);
+	else
+		acl = posix_acl_from_xattr(&init_user_ns, value, rc);
+	kfree(value);
+	return acl;
+}
+#endif
+
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
new file mode 100644
index 0000000..3585b84
--- /dev/null
+++ b/fs/erofs/xattr.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_XATTR_H
+#define __EROFS_XATTR_H
+
+#include "internal.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Attribute not found */
+#define ENOATTR         ENODATA
+
+static inline unsigned int inlinexattr_header_size(struct inode *inode)
+{
+	return sizeof(struct erofs_xattr_ibody_header) +
+		sizeof(u32) * EROFS_I(inode)->xattr_shared_count;
+}
+
+static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
+					  unsigned int xattr_id)
+{
+#ifdef CONFIG_EROFS_FS_XATTR
+	return sbi->xattr_blkaddr +
+		xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
+#else
+	return 0;
+#endif
+}
+
+static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
+					     unsigned int xattr_id)
+{
+	return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
+}
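+
+/*
+ * Worked example (assuming EROFS_BLKSIZ == 4096): shared xattr id 1025
+ * lives at byte 1025 * 4 = 4100, i.e. block xattr_blkaddr + 1, offset 4.
+ */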
+
+#ifdef CONFIG_EROFS_FS_XATTR
+extern const struct xattr_handler erofs_xattr_user_handler;
+extern const struct xattr_handler erofs_xattr_trusted_handler;
+#ifdef CONFIG_EROFS_FS_SECURITY
+extern const struct xattr_handler erofs_xattr_security_handler;
+#endif
+
+static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
+{
+static const struct xattr_handler *xattr_handler_map[] = {
+	[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+	[EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+		&posix_acl_default_xattr_handler,
+#endif
+	[EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+	[EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
+#endif
+};
+
+	return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
+		xattr_handler_map[idx] : NULL;
+}
+
+extern const struct xattr_handler *erofs_xattr_handlers[];
+
+int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
+ssize_t erofs_listxattr(struct dentry *, char *, size_t);
+#else
+static inline int erofs_getxattr(struct inode *inode, int index,
+				 const char *name, void *buffer,
+				 size_t buffer_size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline ssize_t erofs_listxattr(struct dentry *dentry,
+				      char *buffer, size_t buffer_size)
+{
+	return -EOPNOTSUPP;
+}
+#endif	/* !CONFIG_EROFS_FS_XATTR */
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type);
+#else
+#define erofs_get_acl	(NULL)
+#endif
+
+#endif
+
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
new file mode 100644
index 0000000..fad80c9
--- /dev/null
+++ b/fs/erofs/zdata.c
@@ -0,0 +1,1439 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "zdata.h"
+#include "compress.h"
+#include <linux/prefetch.h>
+
+#include <trace/events/erofs.h>
+
+/*
+ * a compressed_pages[] placeholder used to prevent the slot from
+ * being filled with file pages for in-place decompression.
+ */
+#define PAGE_UNALLOCATED     ((void *)0x5F0E4B1D)
+
+/* how to allocate cached pages for a pcluster */
+enum z_erofs_cache_alloctype {
+	DONTALLOC,	/* don't allocate any cached pages */
+	DELAYEDALLOC,	/* delayed allocation (at the time of submitting io) */
+};
+
+/*
+ * tagged pointer with 1-bit tag for all compressed pages
+ * tag 1 - the page is just found with an extra page reference
+ */
+typedef tagptr1_t compressed_page_t;
+
+#define tag_compressed_page_justfound(page) \
+	tagptr_fold(compressed_page_t, page, 1)
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *pcluster_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(pcluster_cachep);
+}
+
+static inline int z_erofs_init_workqueue(void)
+{
+	const unsigned int onlinecpus = num_possible_cpus();
+	const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
+
+	/*
+	 * no need to spawn too many threads; limiting the thread count can
+	 * minimize scheduling overhead. Perhaps per-CPU threads would be better?
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
+					    onlinecpus + onlinecpus / 4);
+	return z_erofs_workqueue ? 0 : -ENOMEM;
+}
+
+static void z_erofs_pcluster_init_once(void *ptr)
+{
+	struct z_erofs_pcluster *pcl = ptr;
+	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+	unsigned int i;
+
+	mutex_init(&cl->lock);
+	cl->nr_pages = 0;
+	cl->vcnt = 0;
+	for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
+		pcl->compressed_pages[i] = NULL;
+}
+
+static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
+{
+	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+
+	atomic_set(&pcl->obj.refcount, 1);
+
+	DBG_BUGON(cl->nr_pages);
+	DBG_BUGON(cl->vcnt);
+}
+
+int __init z_erofs_init_zip_subsystem(void)
+{
+	pcluster_cachep = kmem_cache_create("erofs_compress",
+					    Z_EROFS_WORKGROUP_SIZE, 0,
+					    SLAB_RECLAIM_ACCOUNT,
+					    z_erofs_pcluster_init_once);
+	if (pcluster_cachep) {
+		if (!z_erofs_init_workqueue())
+			return 0;
+
+		kmem_cache_destroy(pcluster_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_collectmode {
+	COLLECT_SECONDARY,
+	COLLECT_PRIMARY,
+	/*
+	 * The current collection was the tail of an existing chain, and the
+	 * previously processed chained collections have all been decided to
+	 * be hooked up to it.
+	 * A new chain will be created for the remaining collections that are
+	 * not processed yet; therefore, unlike COLLECT_PRIMARY_FOLLOWED,
+	 * the next collection cannot reuse the whole page safely in
+	 * the following scenario:
+	 *  ________________________________________________________________
+	 * |      tail (partial) page     |       head (partial) page       |
+	 * |   (belongs to the next cl)   |   (belongs to the current cl)   |
+	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+	 */
+	COLLECT_PRIMARY_HOOKED,
+	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
+	/*
+	 * The current collection has been linked with the owned chain, and
+	 * could also be linked with the remaining collections, which means
+	 * that if the processing page is the tail page of the collection,
+	 * the current collection can safely use the whole page (since
+	 * the previous collection is under control) for in-place I/O, as
+	 * illustrated below:
+	 *  ________________________________________________________________
+	 * |  tail (partial) page |          head (partial) page           |
+	 * |  (of the current cl) |      (of the previous collection)      |
+	 * |  PRIMARY_FOLLOWED or |                                        |
+	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
+	 *
+	 * [  (*) the above page can be used as inplace I/O.               ]
+	 */
+	COLLECT_PRIMARY_FOLLOWED,
+};
+
+struct z_erofs_collector {
+	struct z_erofs_pagevec_ctor vector;
+
+	struct z_erofs_pcluster *pcl, *tailpcl;
+	struct z_erofs_collection *cl;
+	struct page **compressedpages;
+	z_erofs_next_pcluster_t owned_head;
+
+	enum z_erofs_collectmode mode;
+};
+
+struct z_erofs_decompress_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_collector clt;
+	struct erofs_map_blocks map;
+
+	/* used for applying cache strategy on the fly */
+	bool backmost;
+	erofs_off_t headoffset;
+};
+
+#define COLLECTOR_INIT() { \
+	.owned_head = Z_EROFS_PCLUSTER_TAIL, \
+	.mode = COLLECT_PRIMARY_FOLLOWED }
+
+#define DECOMPRESS_FRONTEND_INIT(__i) { \
+	.inode = __i, .clt = COLLECTOR_INIT(), \
+	.backmost = true, }
+
+static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static void preload_compressed_pages(struct z_erofs_collector *clt,
+				     struct address_space *mc,
+				     enum z_erofs_cache_alloctype type,
+				     struct list_head *pagepool)
+{
+	const struct z_erofs_pcluster *pcl = clt->pcl;
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	struct page **pages = clt->compressedpages;
+	pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+	bool standalone = true;
+
+	if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+		return;
+
+	for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+		struct page *page;
+		compressed_page_t t;
+
+		/* the compressed page was loaded before */
+		if (READ_ONCE(*pages))
+			continue;
+
+		page = find_get_page(mc, index);
+
+		if (page) {
+			t = tag_compressed_page_justfound(page);
+		} else if (type == DELAYEDALLOC) {
+			t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
+		} else {	/* DONTALLOC */
+			if (standalone)
+				clt->compressedpages = pages;
+			standalone = false;
+			continue;
+		}
+
+		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
+			continue;
+
+		if (page)
+			put_page(page);
+	}
+
+	if (standalone)		/* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
+		clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				       struct erofs_workgroup *grp)
+{
+	struct z_erofs_pcluster *const pcl =
+		container_of(grp, struct z_erofs_pcluster, obj);
+	struct address_space *const mapping = MNGD_MAPPING(sbi);
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	int i;
+
+	/*
+	 * the refcount of the workgroup is now frozen at 1, therefore
+	 * there is no need to worry about in-flight decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = pcl->compressed_pages[i];
+
+		if (!page)
+			continue;
+
+		/* block other users from reclaiming or migrating the page */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		if (page->mapping != mapping)
+			continue;
+
+		/* barrier is implied in the following 'unlock_page' */
+		WRITE_ONCE(pcl->compressed_pages[i], NULL);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
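+
+/*
+ * A failed trylock above ends up returning -EBUSY, which makes the
+ * caller (erofs_try_to_release_workgroup) unfreeze the pcluster and
+ * skip it for this shrink pass instead of blocking on the page lock.
+ */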
+
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+				  struct page *page)
+{
+	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	int ret = 0;	/* 0 - busy */
+
+	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
+		unsigned int i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (pcl->compressed_pages[i] == page) {
+				WRITE_ONCE(pcl->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&pcl->obj, 1);
+
+		if (ret) {
+			ClearPagePrivate(page);
+			put_page(page);
+		}
+	}
+	return ret;
+}
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+					  struct page *page)
+{
+	struct z_erofs_pcluster *const pcl = clt->pcl;
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+
+	while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
+		if (!cmpxchg(clt->compressedpages++, NULL, page))
+			return true;
+	}
+	return false;
+}
+
+/* callers must hold the collection lock */
+static int z_erofs_attach_page(struct z_erofs_collector *clt,
+			       struct page *page,
+			       enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority to in-place I/O */
+	if (clt->mode >= COLLECT_PRIMARY &&
+	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+	    z_erofs_try_inplace_io(clt, page))
+		return 0;
+
+	ret = z_erofs_pagevec_enqueue(&clt->vector,
+				      page, type, &occupied);
+	clt->cl->vcnt += (unsigned int)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static enum z_erofs_collectmode
+try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
+		      z_erofs_next_pcluster_t *owned_head)
+{
+	/* let's claim the following types of pclusters */
+retry:
+	if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
+		/* type 1, nil pcluster */
+		if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+			    *owned_head) != Z_EROFS_PCLUSTER_NIL)
+			goto retry;
+
+		*owned_head = &pcl->next;
+		/* lucky, I am the followee :) */
+		return COLLECT_PRIMARY_FOLLOWED;
+	} else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain;
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
+		 */
+		if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+			    *owned_head) != Z_EROFS_PCLUSTER_TAIL)
+			goto retry;
+		*owned_head = Z_EROFS_PCLUSTER_TAIL;
+		return COLLECT_PRIMARY_HOOKED;
+	}
+	return COLLECT_PRIMARY;	/* :( better luck next time */
+}
+
+static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
+					   struct inode *inode,
+					   struct erofs_map_blocks *map)
+{
+	struct erofs_workgroup *grp;
+	struct z_erofs_pcluster *pcl;
+	struct z_erofs_collection *cl;
+	unsigned int length;
+	bool tag;
+
+	grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
+	if (!grp)
+		return NULL;
+
+	pcl = container_of(grp, struct z_erofs_pcluster, obj);
+	if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+		DBG_BUGON(1);
+		erofs_workgroup_put(grp);
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
+	cl = z_erofs_primarycollection(pcl);
+	if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
+		DBG_BUGON(1);
+		erofs_workgroup_put(grp);
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
+	length = READ_ONCE(pcl->length);
+	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
+		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
+			DBG_BUGON(1);
+			erofs_workgroup_put(grp);
+			return ERR_PTR(-EFSCORRUPTED);
+		}
+	} else {
+		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
+
+		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
+			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
+
+		while (llen > length &&
+		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
+			cpu_relax();
+			length = READ_ONCE(pcl->length);
+		}
+	}
+	mutex_lock(&cl->lock);
+	/* used to detect tail merging loops caused by corrupted images */
+	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+		clt->tailpcl = pcl;
+	clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
+	/* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
+	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+		clt->tailpcl = NULL;
+	clt->pcl = pcl;
+	clt->cl = cl;
+	return cl;
+}
+
+static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
+					     struct inode *inode,
+					     struct erofs_map_blocks *map)
+{
+	struct z_erofs_pcluster *pcl;
+	struct z_erofs_collection *cl;
+	int err;
+
+	/* no available workgroup, let's allocate one */
+	pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
+	if (!pcl)
+		return ERR_PTR(-ENOMEM);
+
+	z_erofs_pcluster_init_always(pcl);
+	pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+
+	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
+		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
+			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
+
+	if (map->m_flags & EROFS_MAP_ZIPPED)
+		pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
+	else
+		pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+
+	pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
+	pcl->clusterbits -= PAGE_SHIFT;
+
+	/* new pclusters should be claimed as type 1, primary and followed */
+	pcl->next = clt->owned_head;
+	clt->mode = COLLECT_PRIMARY_FOLLOWED;
+
+	cl = z_erofs_primarycollection(pcl);
+	cl->pageofs = map->m_la & ~PAGE_MASK;
+
+	/*
+	 * lock all primary-followed works before they become visible to
+	 * others; mutex_trylock *never* fails for a new pcluster.
+	 */
+	mutex_trylock(&cl->lock);
+
+	err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0);
+	if (err) {
+		mutex_unlock(&cl->lock);
+		kmem_cache_free(pcluster_cachep, pcl);
+		return ERR_PTR(-EAGAIN);
+	}
+	/* used to detect tail merging loops caused by corrupted images */
+	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+		clt->tailpcl = pcl;
+	clt->owned_head = &pcl->next;
+	clt->pcl = pcl;
+	clt->cl = cl;
+	return cl;
+}
+
+static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+				   struct inode *inode,
+				   struct erofs_map_blocks *map)
+{
+	struct z_erofs_collection *cl;
+
+	DBG_BUGON(clt->cl);
+
+	/* must be Z_EROFS_PCLUSTER_TAIL or point to a previous collection */
+	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
+	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+	if (!PAGE_ALIGNED(map->m_pa)) {
+		DBG_BUGON(1);
+		return -EINVAL;
+	}
+
+repeat:
+	cl = cllookup(clt, inode, map);
+	if (!cl) {
+		cl = clregister(clt, inode, map);
+
+		if (cl == ERR_PTR(-EAGAIN))
+			goto repeat;
+	}
+
+	if (IS_ERR(cl))
+		return PTR_ERR(cl);
+
+	z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+				  cl->pagevec, cl->vcnt);
+
+	clt->compressedpages = clt->pcl->compressed_pages;
+	if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
+		clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+	return 0;
+}
+
+/*
+ * keep in mind that referenced pclusters will be freed
+ * only after an RCU grace period.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_collection *const cl =
+		container_of(head, struct z_erofs_collection, rcu);
+
+	kmem_cache_free(pcluster_cachep,
+			container_of(cl, struct z_erofs_pcluster,
+				     primary_collection));
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_pcluster *const pcl =
+		container_of(grp, struct z_erofs_pcluster, obj);
+	struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
+
+	call_rcu(&cl->rcu, z_erofs_rcu_callback);
+}
+
+static void z_erofs_collection_put(struct z_erofs_collection *cl)
+{
+	struct z_erofs_pcluster *const pcl =
+		container_of(cl, struct z_erofs_pcluster, primary_collection);
+
+	erofs_workgroup_put(&pcl->obj);
+}
+
+static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+{
+	struct z_erofs_collection *cl = clt->cl;
+
+	if (!cl)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&clt->vector, false);
+	mutex_unlock(&cl->lock);
+
+	/*
+	 * once all pending pages have been added, don't hold the
+	 * reference any longer unless the pcluster is hosted by ourselves.
+	 */
+	if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+		z_erofs_collection_put(cl);
+
+	clt->cl = NULL;
+	return true;
+}
+
+static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
+					       gfp_t gfp)
+{
+	struct page *page = erofs_allocpage(pagepool, gfp, true);
+
+	page->mapping = Z_EROFS_MAPPING_STAGING;
+	return page;
+}
+
+static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
+				       unsigned int cachestrategy,
+				       erofs_off_t la)
+{
+	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+		return false;
+
+	if (fe->backmost)
+		return true;
+
+	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+		la < fe->headoffset;
+}
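+
+/*
+ * Summarizing the checks above: "disabled" never caches; "readahead"
+ * caches only while at the backmost extent of a request; "readaround"
+ * additionally caches extents behind the request head
+ * (la < fe->headoffset).
+ */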
+
+static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
+				struct page *page,
+				struct list_head *pagepool)
+{
+	struct inode *const inode = fe->inode;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
+	struct erofs_map_blocks *const map = &fe->map;
+	struct z_erofs_collector *const clt = &fe->clt;
+	const loff_t offset = page_offset(page);
+	bool tight = true;
+
+	enum z_erofs_cache_alloctype cache_strategy;
+	enum z_erofs_page_type page_type;
+	unsigned int cur, end, spiltted, index;
+	int err = 0;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+	    offset + cur < map->m_la + map->m_llen) {
+		/* didn't get a valid collection previously (very rare) */
+		if (!clt->cl)
+			goto restart_now;
+		goto hitted;
+	}
+
+	/* move on to the next map_blocks */
+	erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (z_erofs_collector_end(clt))
+		fe->backmost = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = z_erofs_map_blocks_iter(inode, map, 0);
+	if (err)
+		goto err_out;
+
+restart_now:
+	if (!(map->m_flags & EROFS_MAP_MAPPED))
+		goto hitted;
+
+	err = z_erofs_collector_begin(clt, inode, map);
+	if (err)
+		goto err_out;
+
+	/* preload all compressed pages (maybe downgrade role if necessary) */
+	if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
+		cache_strategy = DELAYEDALLOC;
+	else
+		cache_strategy = DONTALLOC;
+
+	preload_compressed_pages(clt, MNGD_MAPPING(sbi),
+				 cache_strategy, pagepool);
+
+hitted:
+	/*
+	 * Ensure the current partial page belongs to this submit chain rather
+	 * than other concurrent submit chains or the noio(bypass) chain since
+	 * those chains are handled asynchronously thus the page cannot be used
+	 * for inplace I/O or pagevec (should be processed in strict order).
+	 */
+	tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
+		  clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+
+	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
+	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+	if (cur)
+		tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+
+retry:
+	err = z_erofs_attach_page(clt, page, page_type);
+	/* should allocate an additional staging page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *const newpage =
+			__stagingpage_alloc(pagepool, GFP_NOFS);
+
+		err = z_erofs_attach_page(clt, newpage,
+					  Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (err)
+		goto err_out;
+
+	index = page->index - (map->m_la >> PAGE_SHIFT);
+
+	z_erofs_onlinepage_fixup(page, index, true);
+
+	/* bump up the number of split parts of a page */
+	++split;
+	/* also update nr_pages */
+	clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	end = cur;
+	if (end > 0)
+		goto repeat;
+
+out:
+	z_erofs_onlinepage_endio(page);
+
+	erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		  __func__, page, spiltted, map->m_llen);
+	return err;
+
+	/* if some error occurred while processing this page */
+err_out:
+	SetPageError(page);
+	goto out;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (!background) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&io->u.wait.lock, flags);
+		if (!atomic_add_return(bios, &io->pending_bios))
+			wake_up_locked(&io->u.wait);
+		spin_unlock_irqrestore(&io->u.wait.lock, flags);
+		return;
+	}
+
+	if (!atomic_add_return(bios, &io->pending_bios))
+		queue_work(z_erofs_workqueue, &io->u.work);
+}
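+
+/*
+ * the tagged pointer decoded above is built by jobqueueset_init(): the low
+ * tag bit of bio->bi_private marks a background queue (tag 1, completed via
+ * the workqueue) versus a foreground one (tag 0, completed by waking the
+ * waitqueue), so one pointer carries both the descriptor and the policy.
+ */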
+
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+{
+	struct erofs_sb_info *sbi = NULL;
+	blk_status_t err = bio->bi_status;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		struct page *page = bvec->bv_page;
+		bool cachemngd = false;
+
+		DBG_BUGON(PageUptodate(page));
+		DBG_BUGON(!page->mapping);
+
+		if (!sbi && !z_erofs_page_is_staging(page))
+			sbi = EROFS_SB(page->mapping->host->i_sb);
+
+		/* sbi should have been set already if the page is managed */
+		if (sbi)
+			cachemngd = erofs_page_is_managed(sbi, page);
+
+		if (err)
+			SetPageError(page);
+		else if (cachemngd)
+			SetPageUptodate(page);
+
+		if (cachemngd)
+			unlock_page(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static int z_erofs_decompress_pcluster(struct super_block *sb,
+				       struct z_erofs_pcluster *pcl,
+				       struct list_head *pagepool)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned int clusterpages = BIT(pcl->clusterbits);
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned int i, outputsize, llen, nr_pages;
+	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped, partial;
+	struct z_erofs_collection *cl;
+	int err;
+
+	might_sleep();
+	cl = z_erofs_primarycollection(pcl);
+	DBG_BUGON(!READ_ONCE(cl->nr_pages));
+
+	mutex_lock(&cl->lock);
+	nr_pages = cl->nr_pages;
+
+	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
+		pages = pages_onstack;
+	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
+		   mutex_trylock(&z_pagemap_global_lock)) {
+		pages = z_pagemap_global;
+	} else {
+		gfp_t gfp_flags = GFP_KERNEL;
+
+		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
+			gfp_flags |= __GFP_NOFAIL;
+
+		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
+				       gfp_flags);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (!pages) {
+			mutex_lock(&z_pagemap_global_lock);
+			pages = z_pagemap_global;
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	err = 0;
+	z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
+				  cl->pagevec, 0);
+
+	for (i = 0; i < cl->vcnt; ++i) {
+		unsigned int pagenr;
+
+		page = z_erofs_pagevec_dequeue(&ctor, &page_type);
+
+		/* all pages in pagevec ought to be valid */
+		DBG_BUGON(!page);
+		DBG_BUGON(!page->mapping);
+
+		if (z_erofs_put_stagingpage(pagepool, page))
+			continue;
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		DBG_BUGON(pagenr >= nr_pages);
+
+		/*
+		 * currently EROFS doesn't support multiref (dedup),
+		 * so error out here on a multiref page.
+		 */
+		if (pages[pagenr]) {
+			DBG_BUGON(1);
+			SetPageError(pages[pagenr]);
+			z_erofs_onlinepage_endio(pages[pagenr]);
+			err = -EFSCORRUPTED;
+		}
+		pages[pagenr] = page;
+	}
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = pcl->compressed_pages;
+
+	for (i = 0; i < clusterpages; ++i) {
+		unsigned int pagenr;
+
+		page = compressed_pages[i];
+
+		/* all compressed pages ought to be valid */
+		DBG_BUGON(!page);
+		DBG_BUGON(!page->mapping);
+
+		if (!z_erofs_page_is_staging(page)) {
+			if (erofs_page_is_managed(sbi, page)) {
+				if (!PageUptodate(page))
+					err = -EIO;
+				continue;
+			}
+
+			/*
+			 * only non-head pages can be selected
+			 * for inplace decompression
+			 */
+			pagenr = z_erofs_onlinepage_index(page);
+
+			DBG_BUGON(pagenr >= nr_pages);
+			if (pages[pagenr]) {
+				DBG_BUGON(1);
+				SetPageError(pages[pagenr]);
+				z_erofs_onlinepage_endio(pages[pagenr]);
+				err = -EFSCORRUPTED;
+			}
+			pages[pagenr] = page;
+
+			overlapped = true;
+		}
+
+		/* PG_error needs checking for in-place and staging pages */
+		if (PageError(page)) {
+			DBG_BUGON(PageUptodate(page));
+			err = -EIO;
+		}
+	}
+
+	if (err)
+		goto out;
+
+	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
+	if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
+		outputsize = llen;
+		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
+	} else {
+		outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
+		partial = true;
+	}
+
+	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
+					.sb = sb,
+					.in = compressed_pages,
+					.out = pages,
+					.pageofs_out = cl->pageofs,
+					.inputsize = PAGE_SIZE,
+					.outputsize = outputsize,
+					.alg = pcl->algorithmformat,
+					.inplace_io = overlapped,
+					.partial_decoding = partial
+				 }, pagepool);
+
+out:
+	/* must handle all compressed pages before ending pages */
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		if (erofs_page_is_managed(sbi, page))
+			continue;
+
+		/* recycle all individual staging pages */
+		(void)z_erofs_put_stagingpage(pagepool, page);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+		if (!page)
+			continue;
+
+		DBG_BUGON(!page->mapping);
+
+		/* recycle all individual staging pages */
+		if (z_erofs_put_stagingpage(pagepool, page))
+			continue;
+
+		if (err < 0)
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (pages != pages_onstack)
+		kvfree(pages);
+
+	cl->nr_pages = 0;
+	cl->vcnt = 0;
+
+	/* all cl locks MUST be taken before the following line */
+	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+
+	/* all cl locks SHOULD be released right now */
+	mutex_unlock(&cl->lock);
+
+	z_erofs_collection_put(cl);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_unzip_io *io,
+				  struct list_head *pagepool)
+{
+	z_erofs_next_pcluster_t owned = io->head;
+
+	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
+		struct z_erofs_pcluster *pcl;
+
+		/* it is impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
+		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
+
+		/* it is impossible that 'owned' equals NULL */
+		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
+
+		pcl = container_of(owned, struct z_erofs_pcluster, next);
+		owned = READ_ONCE(pcl->next);
+
+		z_erofs_decompress_pcluster(sb, pcl, pagepool);
+	}
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_unzip_io_sb *iosb =
+		container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
+	LIST_HEAD(pagepool);
+
+	DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
+
+	put_pages_list(&pagepool);
+	kvfree(iosb);
+}
+
+static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
+					       unsigned int nr,
+					       struct list_head *pagepool,
+					       struct address_space *mc,
+					       gfp_t gfp)
+{
+	/* determined at compile time to avoid too many #ifdefs */
+	const bool nocache = __builtin_constant_p(mc) ? !mc : false;
+	const pgoff_t index = pcl->obj.index;
+	bool tocache = false;
+
+	struct address_space *mapping;
+	struct page *oldpage, *page;
+
+	compressed_page_t t;
+	int justfound;
+
+repeat:
+	page = READ_ONCE(pcl->compressed_pages[nr]);
+	oldpage = page;
+
+	if (!page)
+		goto out_allocpage;
+
+	/*
+	 * the cached page has not been allocated yet and
+	 * a placeholder is out there, prepare it now.
+	 */
+	if (!nocache && page == PAGE_UNALLOCATED) {
+		tocache = true;
+		goto out_allocpage;
+	}
+
+	/* process the target tagged pointer */
+	t = tagptr_init(compressed_page_t, page);
+	justfound = tagptr_unfold_tags(t);
+	page = tagptr_unfold_ptr(t);
+
+	mapping = READ_ONCE(page->mapping);
+
+	/*
+	 * if managed cache is disabled, there is no way to
+	 * get such a cache-managed page.
+	 */
+	if (nocache) {
+		/* managed cache is disabled, so `justfound' can never be set */
+		DBG_BUGON(justfound);
+
+		/* and it should be locked, not uptodate, and not truncated */
+		DBG_BUGON(!PageLocked(page));
+		DBG_BUGON(PageUptodate(page));
+		DBG_BUGON(!mapping);
+		goto out;
+	}
+
+	/*
+	 * unmanaged (file) pages are all firmly locked at this point,
+	 * therefore it is impossible for `mapping' to be NULL.
+	 */
+	if (mapping && mapping != mc)
+		/* ought to be unmanaged pages */
+		goto out;
+
+	lock_page(page);
+
+	/* only true if page reclaim goes wrong, should never happen */
+	DBG_BUGON(justfound && PagePrivate(page));
+
+	/* the page is still in the managed cache */
+	if (page->mapping == mc) {
+		WRITE_ONCE(pcl->compressed_pages[nr], page);
+
+		ClearPageError(page);
+		if (!PagePrivate(page)) {
+			/*
+			 * under the current restriction, a page already
+			 * in compressed_pages[] can only be
+			 * !PagePrivate(page) if it has just been found.
+			 */
+			DBG_BUGON(!justfound);
+
+			justfound = 0;
+			set_page_private(page, (unsigned long)pcl);
+			SetPagePrivate(page);
+		}
+
+		/* no need to submit io if it is already up-to-date */
+		if (PageUptodate(page)) {
+			unlock_page(page);
+			page = NULL;
+		}
+		goto out;
+	}
+
+	/*
+	 * the managed page has been truncated, it's unsafe to
+	 * reuse this one, let's allocate a new cache-managed page.
+	 */
+	DBG_BUGON(page->mapping);
+	DBG_BUGON(!justfound);
+
+	tocache = true;
+	unlock_page(page);
+	put_page(page);
+out_allocpage:
+	page = __stagingpage_alloc(pagepool, gfp);
+	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+		list_add(&page->lru, pagepool);
+		cpu_relax();
+		goto repeat;
+	}
+	if (nocache || !tocache)
+		goto out;
+	if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+		page->mapping = Z_EROFS_MAPPING_STAGING;
+		goto out;
+	}
+
+	set_page_private(page, (unsigned long)pcl);
+	SetPagePrivate(page);
+out:	/* the only exit (for tracing and debugging) */
+	return page;
+}
+
+static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
+					      struct z_erofs_unzip_io *io,
+					      bool foreground)
+{
+	struct z_erofs_unzip_io_sb *iosb;
+
+	if (foreground) {
+		/* waitqueue available for foreground io */
+		DBG_BUGON(!io);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
+	DBG_BUGON(!iosb);
+
+	/* initialize fields in the allocated descriptor */
+	io = &iosb->io;
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+	return io;
+}
+
+/* define decompression jobqueue types */
+enum {
+	JQ_BYPASS,
+	JQ_SUBMIT,
+	NR_JOBQUEUES,
+};
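+
+/*
+ * roughly speaking, pclusters whose compressed pages are all already
+ * up-to-date in the managed cache end up on JQ_BYPASS and are decompressed
+ * without any device I/O, while JQ_SUBMIT chains the pclusters that still
+ * need bios to be issued.
+ */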
+
+static void *jobqueueset_init(struct super_block *sb,
+			      z_erofs_next_pcluster_t qtail[],
+			      struct z_erofs_unzip_io *q[],
+			      struct z_erofs_unzip_io *fgq,
+			      bool forcefg)
+{
+	/*
+	 * if managed cache is enabled, a bypass jobqueue is needed:
+	 * no device reads are issued for the pclusters in that queue.
+	 */
+	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
+	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
+
+	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
+	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
+
+	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
+}
+
+static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
+				    z_erofs_next_pcluster_t qtail[],
+				    z_erofs_next_pcluster_t owned_head)
+{
+	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
+	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
+
+	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
+		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+
+	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+	WRITE_ONCE(*submit_qtail, owned_head);
+	WRITE_ONCE(*bypass_qtail, &pcl->next);
+
+	qtail[JQ_BYPASS] = &pcl->next;
+}
+
+static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
+				       unsigned int nr_bios,
+				       bool force_fg)
+{
+	/*
+	 * although background is preferred, nothing is pending for submission.
+	 * don't kick off the decompression workqueue; drop the queue directly.
+	 */
+	if (force_fg || nr_bios)
+		return false;
+
+	kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
+	return true;
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_next_pcluster_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_unzip_io *fgq,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+	struct z_erofs_unzip_io *q[NR_JOBQUEUES];
+	struct bio *bio;
+	void *bi_private;
+	/* last_index is only used while bio != NULL, so leave it uninitialized */
+	pgoff_t uninitialized_var(last_index);
+	bool force_submit = false;
+	unsigned int nr_bios;
+
+	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
+		return false;
+
+	force_submit = false;
+	bio = NULL;
+	nr_bios = 0;
+	bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
+
+	/* by default, all need io submission */
+	q[JQ_SUBMIT]->head = owned_head;
+
+	do {
+		struct z_erofs_pcluster *pcl;
+		unsigned int clusterpages;
+		pgoff_t first_index;
+		struct page *page;
+		unsigned int i = 0, bypass = 0;
+		int err;
+
+		/* 'owned_head' can never equal either of the following */
+		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
+
+		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+
+		clusterpages = BIT(pcl->clusterbits);
+
+		/* close the main owned chain first */
+		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+		first_index = pcl->obj.index;
+		force_submit |= (first_index != last_index + 1);
+
+repeat:
+		page = pickup_page_for_submission(pcl, i, pagepool,
+						  MNGD_MAPPING(sbi),
+						  GFP_NOFS);
+		if (!page) {
+			force_submit = true;
+			++bypass;
+			goto skippage;
+		}
+
+		if (bio && force_submit) {
+submit_bio_retry:
+			submit_bio(bio);
+			bio = NULL;
+		}
+
+		if (!bio) {
+			bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+
+			bio->bi_end_io = z_erofs_vle_read_endio;
+			bio_set_dev(bio, sb->s_bdev);
+			bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
+				LOG_SECTORS_PER_BLOCK;
+			bio->bi_private = bi_private;
+			bio->bi_opf = REQ_OP_READ;
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+skippage:
+		if (++i < clusterpages)
+			goto repeat;
+
+		if (bypass < clusterpages)
+			qtail[JQ_SUBMIT] = &pcl->next;
+		else
+			move_to_bypass_jobqueue(pcl, qtail, owned_head);
+	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+
+	if (bio)
+		submit_bio(bio);
+
+	if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
+		return true;
+
+	z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct super_block *sb,
+				     struct z_erofs_collector *clt,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct z_erofs_unzip_io io[NR_JOBQUEUES];
+
+	if (!z_erofs_vle_submit_all(sb, clt->owned_head,
+				    pagepool, io, force_fg))
+		return;
+
+	/* decompress no-I/O (bypass) pclusters immediately */
+	z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[JQ_SUBMIT].u.wait,
+		   !atomic_read(&io[JQ_SUBMIT].pending_bios));
+
+	/* let's do decompression synchronously */
+	z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+					     struct page *page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+	int err;
+	LIST_HEAD(pagepool);
+
+	trace_erofs_readpage(page, false);
+
+	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_collector_end(&f.clt);
+
+	/* if some compressed clusters are ready, submit them anyway */
+	z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
+
+	if (err)
+		erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+
+	if (f.map.mpage)
+		put_page(f.map.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return err;
+}
+
+static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
+					    unsigned int nr)
+{
+	return nr <= sbi->max_sync_decompress_pages;
+}
+
+static int z_erofs_vle_normalaccess_readpages(struct file *filp,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned int nr_pages)
+{
+	struct inode *const inode = mapping->host;
+	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+	bool sync = should_decompress_synchronously(sbi, nr_pages);
+	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	trace_erofs_readpages(mapping->host, lru_to_page(pages),
+			      nr_pages, false);
+
+	f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		/*
+		 * A pure asynchronous readahead is indicated if
+		 * a PG_readahead marked page is hit first.
+		 * Let's also do asynchronous decompression for this case.
+		 */
+		sync &= !(PageReadahead(page) && !head);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err)
+			erofs_err(inode->i_sb,
+				  "readahead error at page %lu @ nid %llu",
+				  page->index, EROFS_I(inode)->nid);
+		put_page(page);
+	}
+
+	(void)z_erofs_collector_end(&f.clt);
+
+	z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
+
+	if (f.map.mpage)
+		put_page(f.map.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+const struct address_space_operations z_erofs_vle_normalaccess_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
+
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
new file mode 100644
index 0000000..faf9501
--- /dev/null
+++ b/fs/erofs/zdata.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_ZDATA_H
+#define __EROFS_FS_ZDATA_H
+
+#include "internal.h"
+#include "zpvec.h"
+
+#define Z_EROFS_NR_INLINE_PAGEVECS      3
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else;
+ *
+ * L: Field should be protected by pageset lock;
+ *
+ * A: Field should be accessed / updated in atomic for parallelized code.
+ */
+struct z_erofs_collection {
+	struct mutex lock;
+
+	/* I: page offset of start position of decompression */
+	unsigned short pageofs;
+
+	/* L: maximum relative page index in pagevec[] */
+	unsigned short nr_pages;
+
+	/* L: total number of pages in pagevec[] */
+	unsigned int vcnt;
+
+	union {
+		/* L: inline a certain number of pagevecs for bootstrap */
+		erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+
+		/* I: can be used to free the pcluster by RCU. */
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_PCLUSTER_FULL_LENGTH    0x00000001
+#define Z_EROFS_PCLUSTER_LENGTH_BIT     1
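+
+/*
+ * the length field packs the decompressed byte length together with the
+ * full-length flag in bit 0, i.e. length == (llen << 1) | full.  For
+ * example, a fully-decoded 4096-byte pcluster stores
+ * (4096 << 1) | 1 == 8193, and readers recover llen as
+ * pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT.
+ */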
+
+/*
+ * keep a dedicated type here in case another
+ * tagged pointer is introduced later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_pcluster {
+	struct erofs_workgroup obj;
+	struct z_erofs_collection primary_collection;
+
+	/* A: point to next chained pcluster or TAILs */
+	z_erofs_next_pcluster_t next;
+
+	/* A: compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+
+	/* A: lower limit of decompressed length and if full length or not */
+	unsigned int length;
+
+	/* I: compression algorithm format */
+	unsigned char algorithmformat;
+	/* I: bit shift of physical cluster size */
+	unsigned char clusterbits;
+};
+
+#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_PCLUSTER_TAIL           ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_PCLUSTER_TAIL_CLOSED    ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_PCLUSTER_NIL            (NULL)
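+
+/*
+ * pcl->next thus acts as a small state machine: Z_EROFS_PCLUSTER_NIL means
+ * the pcluster is not chained anywhere, Z_EROFS_PCLUSTER_TAIL is the open
+ * tail of a chain still collecting pages, Z_EROFS_PCLUSTER_TAIL_CLOSED is
+ * a chain whose io has been submitted; any other value points to the next
+ * chained pcluster.
+ */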
+
+#define Z_EROFS_WORKGROUP_SIZE  sizeof(struct z_erofs_pcluster)
+
+struct z_erofs_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_next_pcluster_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_unzip_io_sb {
+	struct z_erofs_unzip_io io;
+	struct super_block *sb;
+};
+
+#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+					 struct page *page)
+{
+	return page->mapping == MNGD_MAPPING(sbi);
+}
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): the number of outstanding parts that
+ * must complete before the page can be unlocked
+ * sub-index: 0 for a partial page, >= 1 for full page sub-indexes
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned int z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	DBG_BUGON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		DBG_BUGON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned int v;
+
+	DBG_BUGON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
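+
+/*
+ * a sketch of the lifecycle, assuming a page covered by one exclusive part:
+ * _init sets the atomic to 1 (the guard reference), _fixup raises it to
+ * (index << 2) | 2, the _endio at the end of z_erofs_do_read_page() drops
+ * the guard back to count 1, and the final _endio from decompression brings
+ * the low count bits to 0, which unlocks the page (and marks it up-to-date
+ * if no error was recorded).
+ */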
+
+#define Z_EROFS_VMAP_ONSTACK_PAGES	\
+	min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
+#define Z_EROFS_VMAP_GLOBAL_PAGES	2048
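+
+/*
+ * for instance, on a typical 64-bit configuration with 16KiB kernel stacks
+ * and 8-byte pointers, THREAD_SIZE / 8 / sizeof(struct page *) ==
+ * 16384 / 8 / 8 == 256, so Z_EROFS_VMAP_ONSTACK_PAGES is capped at 96
+ * entries (768 bytes of stack) before falling back to the global pagemap
+ * or a kvmalloc'ed array.
+ */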
+
+#endif
+
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 0000000..6a26c29
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018-2019 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <asm/unaligned.h>
+#include <trace/events/erofs.h>
+
+int z_erofs_fill_inode(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+
+	if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+		vi->z_advise = 0;
+		vi->z_algorithmtype[0] = 0;
+		vi->z_algorithmtype[1] = 0;
+		vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
+		vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
+		vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
+		set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+	}
+
+	inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
+	return 0;
+}
+
+static int fill_inode_lazy(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct super_block *const sb = inode->i_sb;
+	int err;
+	erofs_off_t pos;
+	struct page *page;
+	void *kaddr;
+	struct z_erofs_map_header *h;
+
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+		return 0;
+
+	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+		return -ERESTARTSYS;
+
+	err = 0;
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+		goto out_unlock;
+
+	DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+
+	pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
+		    vi->xattr_isize, 8);
+	page = erofs_get_meta_page(sb, erofs_blknr(pos));
+	if (IS_ERR(page)) {
+		err = PTR_ERR(page);
+		goto out_unlock;
+	}
+
+	kaddr = kmap_atomic(page);
+
+	h = kaddr + erofs_blkoff(pos);
+	vi->z_advise = le16_to_cpu(h->h_advise);
+	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+	if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
+		erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
+			  vi->z_algorithmtype[0], vi->nid);
+		err = -EOPNOTSUPP;
+		goto unmap_done;
+	}
+
+	vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
+	vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
+					((h->h_clusterbits >> 3) & 3);
+
+	if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
+		erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
+			  vi->z_physical_clusterbits[0], vi->nid);
+		err = -EOPNOTSUPP;
+		goto unmap_done;
+	}
+
+	vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
+					((h->h_clusterbits >> 5) & 7);
+	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+unmap_done:
+	kunmap_atomic(kaddr);
+	unlock_page(page);
+	put_page(page);
+out_unlock:
+	clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+	return err;
+}
+
+struct z_erofs_maprecorder {
+	struct inode *inode;
+	struct erofs_map_blocks *map;
+	void *kaddr;
+
+	unsigned long lcn;
+	/* compression extent information gathered */
+	u8  type;
+	u16 clusterofs;
+	u16 delta[2];
+	erofs_blk_t pblk;
+};
+
+static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
+				  erofs_blk_t eblk)
+{
+	struct super_block *const sb = m->inode->i_sb;
+	struct erofs_map_blocks *const map = m->map;
+	struct page *mpage = map->mpage;
+
+	if (mpage) {
+		if (mpage->index == eblk) {
+			if (!m->kaddr)
+				m->kaddr = kmap_atomic(mpage);
+			return 0;
+		}
+
+		if (m->kaddr) {
+			kunmap_atomic(m->kaddr);
+			m->kaddr = NULL;
+		}
+		put_page(mpage);
+	}
+
+	mpage = erofs_get_meta_page(sb, eblk);
+	if (IS_ERR(mpage)) {
+		map->mpage = NULL;
+		return PTR_ERR(mpage);
+	}
+	m->kaddr = kmap_atomic(mpage);
+	unlock_page(mpage);
+	map->mpage = mpage;
+	return 0;
+}
+
+static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+					     unsigned long lcn)
+{
+	struct inode *const inode = m->inode;
+	struct erofs_inode *const vi = EROFS_I(inode);
+	const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
+	const erofs_off_t pos =
+		Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
+					       vi->xattr_isize) +
+		lcn * sizeof(struct z_erofs_vle_decompressed_index);
+	struct z_erofs_vle_decompressed_index *di;
+	unsigned int advise, type;
+	int err;
+
+	err = z_erofs_reload_indexes(m, erofs_blknr(pos));
+	if (err)
+		return err;
+
+	m->lcn = lcn;
+	di = m->kaddr + erofs_blkoff(pos);
+
+	advise = le16_to_cpu(di->di_advise);
+	type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
+		((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
+	switch (type) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		m->clusterofs = 1 << vi->z_logical_clusterbits;
+		m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+		m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
+		break;
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		m->clusterofs = le16_to_cpu(di->di_clusterofs);
+		m->pblk = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		DBG_BUGON(1);
+		return -EOPNOTSUPP;
+	}
+	m->type = type;
+	return 0;
+}
+
+static unsigned int decode_compactedbits(unsigned int lobits,
+					 unsigned int lomask,
+					 u8 *in, unsigned int pos, u8 *type)
+{
+	const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
+	const unsigned int lo = v & lomask;
+
+	*type = (v >> lobits) & 3;
+	return lo;
+}
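+
+/*
+ * a worked example, assuming lclusterbits == 12 (lomask == 0xfff), so each
+ * encoded lcluster takes 12 + 2 == 14 bits: entry i starts at bit 14 * i,
+ * e.g. entry 3 starts at bit 42, i.e. byte 5 shifted right by 2; the low
+ * 12 bits of v are then the lo value and bits 13:12 the cluster type.
+ */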
+
+static int unpack_compacted_index(struct z_erofs_maprecorder *m,
+				  unsigned int amortizedshift,
+				  unsigned int eofs)
+{
+	struct erofs_inode *const vi = EROFS_I(m->inode);
+	const unsigned int lclusterbits = vi->z_logical_clusterbits;
+	const unsigned int lomask = (1 << lclusterbits) - 1;
+	unsigned int vcnt, base, lo, encodebits, nblk;
+	int i;
+	u8 *in, type;
+
+	if (1 << amortizedshift == 4)
+		vcnt = 2;
+	else if (1 << amortizedshift == 2 && lclusterbits == 12)
+		vcnt = 16;
+	else
+		return -EOPNOTSUPP;
+
+	encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+	base = round_down(eofs, vcnt << amortizedshift);
+	in = m->kaddr + base;
+
+	i = (eofs - base) >> amortizedshift;
+
+	lo = decode_compactedbits(lclusterbits, lomask,
+				  in, encodebits * i, &type);
+	m->type = type;
+	if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+		m->clusterofs = 1 << lclusterbits;
+		if (i + 1 != vcnt) {
+			m->delta[0] = lo;
+			return 0;
+		}
+		/*
+		 * the last lcluster in the pack is special:
+		 * its lo saves delta[1] rather than delta[0].
+		 * Hence, derive delta[0] from the previous lcluster instead.
+		 */
+		lo = decode_compactedbits(lclusterbits, lomask,
+					  in, encodebits * (i - 1), &type);
+		if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+			lo = 0;
+		m->delta[0] = lo + 1;
+		return 0;
+	}
+	m->clusterofs = lo;
+	m->delta[0] = 0;
+	/* figure out the blkaddr (pblk) for HEAD lclusters */
+	nblk = 1;
+	while (i > 0) {
+		--i;
+		lo = decode_compactedbits(lclusterbits, lomask,
+					  in, encodebits * i, &type);
+		if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+			i -= lo;
+
+		if (i >= 0)
+			++nblk;
+	}
+	in += (vcnt << amortizedshift) - sizeof(__le32);
+	m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
+	return 0;
+}
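+
+/*
+ * pack layout implied by the math above: with amortizedshift == 2 a pack is
+ * 2 << 2 == 8 bytes holding vcnt == 2 lclusters of ((2 << 2) - 4) * 8 / 2
+ * == 16 encoded bits each plus a trailing __le32 base blkaddr; with
+ * amortizedshift == 1 a pack is 16 << 1 == 32 bytes holding vcnt == 16
+ * lclusters of 14 bits each (28 bytes) plus the same trailing __le32.
+ */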
+
+static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+					    unsigned long lcn)
+{
+	struct inode *const inode = m->inode;
+	struct erofs_inode *const vi = EROFS_I(inode);
+	const unsigned int lclusterbits = vi->z_logical_clusterbits;
+	const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
+					vi->inode_isize + vi->xattr_isize, 8) +
+		sizeof(struct z_erofs_map_header);
+	const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+	unsigned int compacted_4b_initial, compacted_2b;
+	unsigned int amortizedshift;
+	erofs_off_t pos;
+	int err;
+
+	if (lclusterbits != 12)
+		return -EOPNOTSUPP;
+
+	if (lcn >= totalidx)
+		return -EINVAL;
+
+	m->lcn = lcn;
+	/* number of leading 4-byte indexes needed for 32-byte (compacted_2b) alignment */
+	compacted_4b_initial = (32 - ebase % 32) / 4;
+	if (compacted_4b_initial == 32 / 4)
+		compacted_4b_initial = 0;
+
+	if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
+		compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+	else
+		compacted_2b = 0;
+
+	pos = ebase;
+	if (lcn < compacted_4b_initial) {
+		amortizedshift = 2;
+		goto out;
+	}
+	pos += compacted_4b_initial * 4;
+	lcn -= compacted_4b_initial;
+
+	if (lcn < compacted_2b) {
+		amortizedshift = 1;
+		goto out;
+	}
+	pos += compacted_2b * 2;
+	lcn -= compacted_2b;
+	amortizedshift = 2;
+out:
+	pos += lcn * (1 << amortizedshift);
+	err = z_erofs_reload_indexes(m, erofs_blknr(pos));
+	if (err)
+		return err;
+	return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
+}
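+
+/*
+ * on-disk region layout handled above: up to (32 - ebase % 32) / 4 leading
+ * 4-byte indexes to reach 32-byte alignment, then (when
+ * Z_EROFS_ADVISE_COMPACTED_2B is set) a run of 2-byte packed indexes
+ * rounded down to a multiple of 16 lclusters, and 4-byte indexes again for
+ * whatever remains.
+ */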
+
+static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+				      unsigned int lcn)
+{
+	const unsigned int datamode = EROFS_I(m->inode)->datalayout;
+
+	if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+		return vle_legacy_load_cluster_from_disk(m, lcn);
+
+	if (datamode == EROFS_INODE_FLAT_COMPRESSION)
+		return compacted_load_cluster_from_disk(m, lcn);
+
+	return -EINVAL;
+}
+
+static int vle_extent_lookback(struct z_erofs_maprecorder *m,
+			       unsigned int lookback_distance)
+{
+	struct erofs_inode *const vi = EROFS_I(m->inode);
+	struct erofs_map_blocks *const map = m->map;
+	const unsigned int lclusterbits = vi->z_logical_clusterbits;
+	unsigned long lcn = m->lcn;
+	int err;
+
+	if (lcn < lookback_distance) {
+		erofs_err(m->inode->i_sb,
+			  "bogus lookback distance @ nid %llu", vi->nid);
+		DBG_BUGON(1);
+		return -EFSCORRUPTED;
+	}
+
+	/* load extent head logical cluster if needed */
+	lcn -= lookback_distance;
+	err = vle_load_cluster_from_disk(m, lcn);
+	if (err)
+		return err;
+
+	switch (m->type) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		if (!m->delta[0]) {
+			erofs_err(m->inode->i_sb,
+				  "invalid lookback distance 0 @ nid %llu",
+				  vi->nid);
+			DBG_BUGON(1);
+			return -EFSCORRUPTED;
+		}
+		return vle_extent_lookback(m, m->delta[0]);
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		map->m_flags &= ~EROFS_MAP_ZIPPED;
+		/* fallthrough */
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		map->m_la = (lcn << lclusterbits) | m->clusterofs;
+		break;
+	default:
+		erofs_err(m->inode->i_sb,
+			  "unknown type %u @ lcn %lu of nid %llu",
+			  m->type, lcn, vi->nid);
+		DBG_BUGON(1);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+			    struct erofs_map_blocks *map,
+			    int flags)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct z_erofs_maprecorder m = {
+		.inode = inode,
+		.map = map,
+	};
+	int err = 0;
+	unsigned int lclusterbits, endoff;
+	unsigned long long ofs, end;
+
+	trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (map->m_la >= inode->i_size) {
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	err = fill_inode_lazy(inode);
+	if (err)
+		goto out;
+
+	lclusterbits = vi->z_logical_clusterbits;
+	ofs = map->m_la;
+	m.lcn = ofs >> lclusterbits;
+	endoff = ofs & ((1 << lclusterbits) - 1);
+
+	err = vle_load_cluster_from_disk(&m, m.lcn);
+	if (err)
+		goto unmap_out;
+
+	map->m_flags = EROFS_MAP_ZIPPED;	/* by default, compressed */
+	end = (m.lcn + 1ULL) << lclusterbits;
+
+	switch (m.type) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (endoff >= m.clusterofs)
+			map->m_flags &= ~EROFS_MAP_ZIPPED;
+		/* fallthrough */
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (endoff >= m.clusterofs) {
+			map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+			break;
+		}
+		/* m.lcn should be >= 1 if endoff < m.clusterofs */
+		if (!m.lcn) {
+			erofs_err(inode->i_sb,
+				  "invalid logical cluster 0 at nid %llu",
+				  vi->nid);
+			err = -EFSCORRUPTED;
+			goto unmap_out;
+		}
+		end = (m.lcn << lclusterbits) | m.clusterofs;
+		map->m_flags |= EROFS_MAP_FULL_MAPPED;
+		m.delta[0] = 1;
+		/* fallthrough */
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		err = vle_extent_lookback(&m, m.delta[0]);
+		if (err)
+			goto unmap_out;
+		break;
+	default:
+		erofs_err(inode->i_sb,
+			  "unknown type %u @ offset %llu of nid %llu",
+			  m.type, ofs, vi->nid);
+		err = -EOPNOTSUPP;
+		goto unmap_out;
+	}
+
+	map->m_llen = end - map->m_la;
+	map->m_plen = 1 << lclusterbits;
+	map->m_pa = blknr_to_addr(m.pblk);
+	map->m_flags |= EROFS_MAP_MAPPED;
+
+unmap_out:
+	if (m.kaddr)
+		kunmap_atomic(m.kaddr);
+
+out:
+	erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		  __func__, map->m_la, map->m_pa,
+		  map->m_llen, map->m_plen, map->m_flags);
+
+	trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+
+	/* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
+	DBG_BUGON(err < 0 && err != -ENOMEM);
+	return err;
+}
+
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
new file mode 100644
index 0000000..5855690
--- /dev/null
+++ b/fs/erofs/zpvec.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_ZPVEC_H
+#define __EROFS_FS_ZPVEC_H
+
+#include "tagptr.h"
+
+/* page type in pagevec for decompress subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+					     bool atomic)
+{
+	if (!ctor->curr)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned int nr)
+{
+	unsigned int index;
+
+	/* keep away from occupied pages */
+	if (ctor->next)
+		return ctor->next;
+
+	for (index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned int tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+	DBG_BUGON(nr >= ctor->nr);
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned int nr,
+					     erofs_vtptr_t *pages,
+					     unsigned int i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
+					   struct page *page,
+					   enum z_erofs_page_type type,
+					   bool *occupied)
+{
+	*occupied = false;
+	if (!ctor->next && type)
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (ctor->index >= ctor->nr)
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* ctor->next can never be 1 or 2, so comparing it against the type is safe */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+	ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
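+
+/*
+ * in other words, an EXCLUSIVE page (type == 0) enqueued while ctor->next
+ * is still NULL becomes "occupied": it is remembered as the backing page
+ * the pagevec will page down onto once the inline slots run out, and the
+ * other type codes (1 and 2) can never collide with a real page pointer.
+ */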
+
+static inline struct page *
+z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (ctor->index >= ctor->nr) {
+		DBG_BUGON(!ctor->next);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* ctor->next can never be 1 or 2, so comparing it against the type is safe */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
+	return tagptr_unfold_ptr(t);
+}
+#endif
+
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd6..8aa0ea8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  fs/eventfd.c
  *
@@ -21,6 +22,9 @@
 #include <linux/eventfd.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/idr.h>
+
+static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
 	struct kref kref;
@@ -35,6 +39,7 @@
 	 */
 	__u64 count;
 	unsigned int flags;
+	int id;
 };
 
 /**
@@ -69,6 +74,8 @@
 
 static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
+	if (ctx->id >= 0)
+		ida_simple_remove(&eventfd_ida, ctx->id);
 	kfree(ctx);
 }
 
@@ -297,6 +304,7 @@
 	seq_printf(m, "eventfd-count: %16llx\n",
 		   (unsigned long long)ctx->count);
 	spin_unlock_irq(&ctx->wqh.lock);
+	seq_printf(m, "eventfd-id: %d\n", ctx->id);
 }
 #endif
 
@@ -400,6 +408,7 @@
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
+	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
 
 	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
 			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe68..c4159bc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  fs/eventpoll.c (Efficient event retrieval implementation)
  *  Copyright (C) 2001,...,2009	 Davide Libenzi
  *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
  *  Davide Libenzi <davidel@xmailserver.org>
- *
  */
 
 #include <linux/init.h>
@@ -50,10 +45,10 @@
  *
  * 1) epmutex (mutex)
  * 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
  *
  * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
  * from inside the poll callback, that might be triggered from
  * a wake_up() that in turn might be called from IRQ context.
  * So we can't sleep inside the poll callback and hence we need
@@ -85,7 +80,7 @@
  * of epoll file descriptors, we use the current recursion depth as
  * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +177,6 @@
  * This structure is stored inside the "private_data" member of the file
  * structure and represents the main data structure for the eventpoll
  * interface.
- *
- * Access to it is protected by the lock inside wq.
  */
 struct eventpoll {
 	/*
@@ -203,13 +196,16 @@
 	/* List of ready file descriptors */
 	struct list_head rdllist;
 
+	/* Lock which protects rdllist and ovflist */
+	rwlock_t lock;
+
 	/* RB tree root used to store monitored fd structs */
 	struct rb_root_cached rbr;
 
 	/*
 	 * This is a single linked list that chains all the "struct epitem" that
 	 * happened while transferring ready events to userspace w/out
-	 * holding ->wq.lock.
+	 * holding ->lock.
 	 */
 	struct epitem *ovflist;
 
@@ -295,7 +291,7 @@
 
 #include <linux/sysctl.h>
 
-static long zero;
+static long long_zero;
 static long long_max = LONG_MAX;
 
 struct ctl_table epoll_table[] = {
@@ -305,7 +301,7 @@
 		.maxlen		= sizeof(max_user_watches),
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &zero,
+		.extra1		= &long_zero,
 		.extra2		= &long_max,
 	},
 	{ }
@@ -381,7 +377,8 @@
  */
 static inline int ep_events_available(struct eventpoll *ep)
 {
-	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+	return !list_empty_careful(&ep->rdllist) ||
+		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -471,7 +468,6 @@
  *                  no re-entered.
  *
  * @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +476,7 @@
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
 			  int (*nproc)(void *, void *, int), void *priv,
 			  void *cookie, void *ctx)
 {
@@ -499,7 +495,7 @@
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
 		if (tncur->ctx == ctx &&
-		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
+		    (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
 			 * We abort this wake by breaking the cycle itself.
@@ -573,7 +569,7 @@
 {
 	int this_cpu = get_cpu();
 
-	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+	ep_call_nested(&poll_safewake_ncalls,
 		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
 
 	put_cpu();
@@ -697,23 +693,23 @@
 	 * because we want the "sproc" callback to be able to do it
 	 * in a lockless way.
 	 */
-	spin_lock_irq(&ep->wq.lock);
+	write_lock_irq(&ep->lock);
 	list_splice_init(&ep->rdllist, &txlist);
-	ep->ovflist = NULL;
-	spin_unlock_irq(&ep->wq.lock);
+	WRITE_ONCE(ep->ovflist, NULL);
+	write_unlock_irq(&ep->lock);
 
 	/*
 	 * Now call the callback function.
 	 */
 	res = (*sproc)(ep, &txlist, priv);
 
-	spin_lock_irq(&ep->wq.lock);
+	write_lock_irq(&ep->lock);
 	/*
 	 * During the time we spent inside the "sproc" callback, some
 	 * other events might have been queued by the poll callback.
 	 * We re-insert them inside the main ready-list here.
 	 */
-	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
 	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
 		/*
 		 * We need to check if the item is already in the list.
@@ -722,7 +718,11 @@
 		 * contain them, and the list_splice() below takes care of them.
 		 */
 		if (!ep_is_linked(epi)) {
-			list_add_tail(&epi->rdllink, &ep->rdllist);
+			/*
+			 * ->ovflist is LIFO, so we have to reverse it in order
+			 * to keep the ready list in FIFO order.
+			 */
+			list_add(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 		}
 	}
@@ -731,7 +731,7 @@
 	 * releasing the lock, events will be queued in the normal way inside
 	 * ep->rdllist.
 	 */
-	ep->ovflist = EP_UNACTIVE_PTR;
+	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
 
 	/*
 	 * Quickly re-inject items left on "txlist".
@@ -745,11 +745,11 @@
 		 * the ->poll() wait list (delayed after we release the lock).
 		 */
 		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
+			wake_up(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
-	spin_unlock_irq(&ep->wq.lock);
+	write_unlock_irq(&ep->lock);
 
 	if (!ep_locked)
 		mutex_unlock(&ep->mtx);
@@ -789,10 +789,10 @@
 
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
-	spin_lock_irq(&ep->wq.lock);
+	write_lock_irq(&ep->lock);
 	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irq(&ep->wq.lock);
+	write_unlock_irq(&ep->lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 	/*
@@ -842,7 +842,7 @@
 	 * Walks through the whole tree by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->wq.lock".
+	 * us during this operation. So we can avoid the lock on "ep->lock".
 	 * We do not need to lock ep->mtx, either, we only do it to prevent
 	 * a lockdep warning.
 	 */
@@ -1023,6 +1023,7 @@
 		goto free_uid;
 
 	mutex_init(&ep->mtx);
+	rwlock_init(&ep->lock);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1113,107 @@
 }
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ *         existing list until all changes are completed, in other words
+ *         concurrent list_add_tail_lockless() calls should be protected
+ *         with a read lock, where write lock acts as a barrier which
+ *         makes sure all list_add_tail_lockless() calls are fully
+ *         completed.
+ *
+ *        Also an element can be locklessly added to the list only in one
+ *        direction, i.e. either to the tail or to the head; otherwise
+ *        concurrent access will corrupt the list.
+ *
+ * Returns %false if the element has already been added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+					  struct list_head *head)
+{
+	struct list_head *prev;
+
+	/*
+	 * This is a simple 'new->next = head' operation, but cmpxchg()
+	 * is used in order to detect that the same element has just been
+	 * added to the list from another CPU: the winner observes
+	 * new->next == new.
+	 */
+	if (cmpxchg(&new->next, new, head) != new)
+		return false;
+
+	/*
+	 * Initially ->next of a new element must be updated with the head
+	 * (we are inserting to the tail) and only then pointers are atomically
+	 * exchanged.  XCHG guarantees memory ordering, thus ->next should be
+	 * updated before pointers are actually swapped and pointers are
+	 * swapped before prev->next is updated.
+	 */
+
+	prev = xchg(&head->prev, new);
+
+	/*
+	 * It is safe to modify prev->next and new->prev, because a new element
+	 * is added only to the tail and new->next is updated before XCHG.
+	 */
+
+	prev->next = new;
+	new->prev = prev;
+
+	return true;
+}
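+
+/*
+ * Example of the race this resolves: after list_del_init() an entry points
+ * to itself (new->next == new), so two CPUs racing to add the same entry
+ * both issue cmpxchg(&new->next, new, head); exactly one observes the
+ * self-pointer and wins, while the loser sees a changed ->next and returns
+ * false without touching the list.
+ */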
+
+/**
+ * Chains a new epi entry at the head of the LIFO ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+	struct eventpoll *ep = epi->ep;
+
+	/* Check that the same epi has not been just chained from another CPU */
+	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+		return false;
+
+	/* Atomically exchange the list head */
+	epi->next = xchg(&ep->ovflist, epi);
+
+	return true;
+}
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
  * have events to report.
+ *
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from other file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless.  Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth mentioning is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if the poll table was
+ * inited with several wait queue entries.  Plural wakeups from different
+ * CPUs of a single wait queue are serialized by wq.lock, but the case when
+ * multiple wait queues are used should be detected accordingly.  This is
+ * detected using a cmpxchg() operation.
  */
 static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	int pwake = 0;
-	unsigned long flags;
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
 	__poll_t pollflags = key_to_poll(key);
+	unsigned long flags;
 	int ewake = 0;
 
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	read_lock_irqsave(&ep->lock, flags);
 
 	ep_set_busy_poll_napi_id(epi);
 
@@ -1154,25 +1241,16 @@
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
-	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
-		if (epi->next == EP_UNACTIVE_PTR) {
-			epi->next = ep->ovflist;
-			ep->ovflist = epi;
-			if (epi->ws) {
-				/*
-				 * Activate ep->ws since epi->ws may get
-				 * deactivated at any time.
-				 */
-				__pm_stay_awake(ep->ws);
-			}
-
-		}
+	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+		if (epi->next == EP_UNACTIVE_PTR &&
+		    chain_epi_lockless(epi))
+			ep_pm_stay_awake_rcu(epi);
 		goto out_unlock;
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (!ep_is_linked(epi)) {
-		list_add_tail(&epi->rdllink, &ep->rdllist);
+	if (!ep_is_linked(epi) &&
+	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
 		ep_pm_stay_awake_rcu(epi);
 	}
 
@@ -1197,13 +1275,13 @@
 				break;
 			}
 		}
-		wake_up_locked(&ep->wq);
+		wake_up(&ep->wq);
 	}
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
 out_unlock:
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	read_unlock_irqrestore(&ep->lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1333,7 +1411,6 @@
 				}
 			} else {
 				error = ep_call_nested(&poll_loop_ncalls,
-							EP_MAX_NESTS,
 							reverse_path_check_proc,
 							child_file, child_file,
 							current);
@@ -1367,7 +1444,7 @@
 	/* let's call this for all tfiles */
 	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
 		path_count_init();
-		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+		error = ep_call_nested(&poll_loop_ncalls,
 					reverse_path_check_proc, current_file,
 					current_file, current);
 		if (error)
@@ -1382,13 +1459,13 @@
 	struct wakeup_source *ws;
 
 	if (!epi->ep->ws) {
-		epi->ep->ws = wakeup_source_register("eventpoll");
+		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
 		if (!epi->ep->ws)
 			return -ENOMEM;
 	}
 
 	name = epi->ffd.file->f_path.dentry->d_name.name;
-	ws = wakeup_source_register(name);
+	ws = wakeup_source_register(NULL, name);
 
 	if (!ws)
 		return -ENOMEM;
@@ -1489,7 +1566,7 @@
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
-	spin_lock_irq(&ep->wq.lock);
+	write_lock_irq(&ep->lock);
 
 	/* record NAPI ID of new item if present */
 	ep_set_busy_poll_napi_id(epi);
@@ -1501,12 +1578,12 @@
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
+			wake_up(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
 
-	spin_unlock_irq(&ep->wq.lock);
+	write_unlock_irq(&ep->lock);
 
 	atomic_long_inc(&ep->user->epoll_watches);
 
@@ -1532,10 +1609,10 @@
 	 * list, since that is used/cleaned only inside a section bound by "mtx".
 	 * And ep_insert() is called with "mtx" held.
 	 */
-	spin_lock_irq(&ep->wq.lock);
+	write_lock_irq(&ep->lock);
 	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irq(&ep->wq.lock);
+	write_unlock_irq(&ep->lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
@@ -1579,9 +1656,9 @@
 	 * 1) Flush epi changes above to other CPUs.  This ensures
 	 *    we do not miss events from ep_poll_callback if an
 	 *    event occurs immediately after we call f_op->poll().
-	 *    We need this because we did not take ep->wq.lock while
+	 *    We need this because we did not take ep->lock while
 	 *    changing epi above (but ep_poll_callback does take
-	 *    ep->wq.lock).
+	 *    ep->lock).
 	 *
 	 * 2) We also need to ensure we do not miss _past_ events
 	 *    when calling f_op->poll().  This barrier also
@@ -1600,18 +1677,18 @@
 	 * list, push it inside.
 	 */
 	if (ep_item_poll(epi, &pt, 1)) {
-		spin_lock_irq(&ep->wq.lock);
+		write_lock_irq(&ep->lock);
 		if (!ep_is_linked(epi)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 
 			/* Notify waiting tasks that events are available */
 			if (waitqueue_active(&ep->wq))
-				wake_up_locked(&ep->wq);
+				wake_up(&ep->wq);
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
-		spin_unlock_irq(&ep->wq.lock);
+		write_unlock_irq(&ep->lock);
 	}
 
 	/* We have to call this outside the lock */
@@ -1626,21 +1703,24 @@
 {
 	struct ep_send_events_data *esed = priv;
 	__poll_t revents;
-	struct epitem *epi;
-	struct epoll_event __user *uevent;
+	struct epitem *epi, *tmp;
+	struct epoll_event __user *uevent = esed->events;
 	struct wakeup_source *ws;
 	poll_table pt;
 
 	init_poll_funcptr(&pt, NULL);
+	esed->res = 0;
 
 	/*
 	 * We can loop without lock because we are passed a task private list.
 	 * Items cannot vanish during the loop because ep_scan_ready_list() is
 	 * holding "mtx" during this call.
 	 */
-	for (esed->res = 0, uevent = esed->events;
-	     !list_empty(head) && esed->res < esed->maxevents;) {
-		epi = list_first_entry(head, struct epitem, rdllink);
+	lockdep_assert_held(&ep->mtx);
+
+	list_for_each_entry_safe(epi, tmp, head, rdllink) {
+		if (esed->res >= esed->maxevents)
+			break;
 
 		/*
 		 * Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1740,42 @@
 
 		list_del_init(&epi->rdllink);
 
-		revents = ep_item_poll(epi, &pt, 1);
-
 		/*
 		 * If the event mask intersect the caller-requested one,
 		 * deliver the event to userspace. Again, ep_scan_ready_list()
-		 * is holding "mtx", so no operations coming from userspace
+		 * is holding ep->mtx, so no operations coming from userspace
 		 * can change the item.
 		 */
-		if (revents) {
-			if (__put_user(revents, &uevent->events) ||
-			    __put_user(epi->event.data, &uevent->data)) {
-				list_add(&epi->rdllink, head);
-				ep_pm_stay_awake(epi);
-				if (!esed->res)
-					esed->res = -EFAULT;
-				return 0;
-			}
-			esed->res++;
-			uevent++;
-			if (epi->event.events & EPOLLONESHOT)
-				epi->event.events &= EP_PRIVATE_BITS;
-			else if (!(epi->event.events & EPOLLET)) {
-				/*
-				 * If this file has been added with Level
-				 * Trigger mode, we need to insert back inside
-				 * the ready list, so that the next call to
-				 * epoll_wait() will check again the events
-				 * availability. At this point, no one can insert
-				 * into ep->rdllist besides us. The epoll_ctl()
-				 * callers are locked out by
-				 * ep_scan_ready_list() holding "mtx" and the
-				 * poll callback will queue them in ep->ovflist.
-				 */
-				list_add_tail(&epi->rdllink, &ep->rdllist);
-				ep_pm_stay_awake(epi);
-			}
+		revents = ep_item_poll(epi, &pt, 1);
+		if (!revents)
+			continue;
+
+		if (__put_user(revents, &uevent->events) ||
+		    __put_user(epi->event.data, &uevent->data)) {
+			list_add(&epi->rdllink, head);
+			ep_pm_stay_awake(epi);
+			if (!esed->res)
+				esed->res = -EFAULT;
+			return 0;
+		}
+		esed->res++;
+		uevent++;
+		if (epi->event.events & EPOLLONESHOT)
+			epi->event.events &= EP_PRIVATE_BITS;
+		else if (!(epi->event.events & EPOLLET)) {
+			/*
+			 * If this file has been added with Level
+			 * Trigger mode, we need to insert back inside
+			 * the ready list, so that the next call to
+			 * epoll_wait() will check again the events
+			 * availability. At this point, no one can insert
+			 * into ep->rdllist besides us. The epoll_ctl()
+			 * callers are locked out by
+			 * ep_scan_ready_list() holding "mtx" and the
+			 * poll callback will queue them in ep->ovflist.
+			 */
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+			ep_pm_stay_awake(epi);
 		}
 	}
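
The loop above was converted to the _safe iterator, which caches the next
entry so the body may unlink the current one (the list_del_init() above).
Reduced to a minimal, hypothetical form, the idiom is:

struct node { struct node *next, *prev; };

/* Cache ->next up front so the loop body may safely delete pos; this is
 * what list_for_each_entry_safe() does through its second cursor. */
#define for_each_node_safe(pos, tmp, head)				\
	for (pos = (head)->next, tmp = pos->next;			\
	     pos != (head);						\
	     pos = tmp, tmp = pos->next)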
 
@@ -1747,6 +1827,7 @@
 {
 	int res = 0, eavail, timed_out = 0;
 	u64 slack = 0;
+	bool waiter = false;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
@@ -1761,11 +1842,18 @@
 	} else if (timeout == 0) {
 		/*
 		 * Avoid the unnecessary trip to the wait queue loop, if the
-		 * caller specified a non blocking operation.
+		 * caller specified a non-blocking operation.  We still need
+		 * the lock here, because we could otherwise race with an epi
+		 * being added to the ready list from the irq callback, miss
+		 * it, and incorrectly return 0 back to userspace.
 		 */
 		timed_out = 1;
-		spin_lock_irq(&ep->wq.lock);
-		goto check_events;
+
+		write_lock_irq(&ep->lock);
+		eavail = ep_events_available(ep);
+		write_unlock_irq(&ep->lock);
+
+		goto send_events;
 	}
 
 fetch_events:
@@ -1773,64 +1861,66 @@
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
-	spin_lock_irq(&ep->wq.lock);
+	eavail = ep_events_available(ep);
+	if (eavail)
+		goto send_events;
 
-	if (!ep_events_available(ep)) {
-		/*
-		 * Busy poll timed out.  Drop NAPI ID for now, we can add
-		 * it back in when we have moved a socket with a valid NAPI
-		 * ID onto the ready list.
-		 */
-		ep_reset_busy_poll_napi_id(ep);
+	/*
+	 * Busy poll timed out.  Drop NAPI ID for now, we can add
+	 * it back in when we have moved a socket with a valid NAPI
+	 * ID onto the ready list.
+	 */
+	ep_reset_busy_poll_napi_id(ep);
 
-		/*
-		 * We don't have any available event to return to the caller.
-		 * We need to sleep here, and we will be wake up by
-		 * ep_poll_callback() when events will become available.
-		 */
+	/*
+	 * We don't have any available event to return to the caller.  We need
+	 * to sleep here, and we will be woken by ep_poll_callback() when events
+	 * become available.
+	 */
+	if (!waiter) {
+		waiter = true;
 		init_waitqueue_entry(&wait, current);
+
+		spin_lock_irq(&ep->wq.lock);
 		__add_wait_queue_exclusive(&ep->wq, &wait);
+		spin_unlock_irq(&ep->wq.lock);
+	}
 
-		for (;;) {
-			/*
-			 * We don't want to sleep if the ep_poll_callback() sends us
-			 * a wakeup in between. That's why we set the task state
-			 * to TASK_INTERRUPTIBLE before doing the checks.
-			 */
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Always short-circuit for fatal signals to allow
-			 * threads to make a timely exit without the chance of
-			 * finding more events available and fetching
-			 * repeatedly.
-			 */
-			if (fatal_signal_pending(current)) {
-				res = -EINTR;
-				break;
-			}
-			if (ep_events_available(ep) || timed_out)
-				break;
-			if (signal_pending(current)) {
-				res = -EINTR;
-				break;
-			}
-
-			spin_unlock_irq(&ep->wq.lock);
-			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
-				timed_out = 1;
-
-			spin_lock_irq(&ep->wq.lock);
+	for (;;) {
+		/*
+		 * We don't want to sleep if the ep_poll_callback() sends us
+		 * a wakeup in between. That's why we set the task state
+		 * to TASK_INTERRUPTIBLE before doing the checks.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		/*
+		 * Always short-circuit for fatal signals to allow
+		 * threads to make a timely exit without the chance of
+		 * finding more events available and fetching
+		 * repeatedly.
+		 */
+		if (fatal_signal_pending(current)) {
+			res = -EINTR;
+			break;
 		}
 
-		__remove_wait_queue(&ep->wq, &wait);
-		__set_current_state(TASK_RUNNING);
+		eavail = ep_events_available(ep);
+		if (eavail)
+			break;
+		if (signal_pending(current)) {
+			res = -EINTR;
+			break;
+		}
+
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+			timed_out = 1;
+			break;
+		}
 	}
-check_events:
-	/* Is it worth to try to dig for events ? */
-	eavail = ep_events_available(ep);
 
-	spin_unlock_irq(&ep->wq.lock);
+	__set_current_state(TASK_RUNNING);
 
+send_events:
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
 	 * there's still timeout left over, we go trying again in search of
@@ -1840,6 +1930,12 @@
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
+	if (waiter) {
+		spin_lock_irq(&ep->wq.lock);
+		__remove_wait_queue(&ep->wq, &wait);
+		spin_unlock_irq(&ep->wq.lock);
+	}
+
 	return res;
 }
 
@@ -1876,7 +1972,7 @@
 			ep_tovisit = epi->ffd.file->private_data;
 			if (ep_tovisit->visited)
 				continue;
-			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			error = ep_call_nested(&poll_loop_ncalls,
 					ep_loop_check_proc, epi->ffd.file,
 					ep_tovisit, current);
 			if (error != 0)
@@ -1916,7 +2012,7 @@
 	int ret;
 	struct eventpoll *ep_cur, *ep_next;
 
-	ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+	ret = ep_call_nested(&poll_loop_ncalls,
 			      ep_loop_check_proc, file, ep, current);
 	/* clear visited list */
 	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
@@ -2172,7 +2268,7 @@
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
 		return -EFAULT;
 
 	/* Get the "struct file *" for the eventpoll file */
@@ -2217,37 +2313,17 @@
 		size_t, sigsetsize)
 {
 	int error;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
-	}
+	error = set_user_sigmask(sigmask, sigsetsize);
+	if (error)
+		return error;
 
 	error = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (error == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			set_current_blocked(&sigsaved);
-	}
+	restore_saved_sigmask_unless(error == -EINTR);
 
 	return error;
 }
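
For reference, the userspace contract these helpers preserve: epoll_pwait()
installs the temporary signal mask atomically for the duration of the wait,
closing the race between a separate sigprocmask() call and epoll_wait(). A
minimal caller (the wrapper name is made up):

#include <signal.h>
#include <sys/epoll.h>

/* Block SIGINT only while sleeping in the kernel; if the wait is
 * interrupted, the original mask is restored on the way back to
 * userspace via the restore_saved_sigmask_unless() path above. */
static int wait_events(int epfd, struct epoll_event *evs, int maxevents)
{
	sigset_t mask;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	return epoll_pwait(epfd, evs, maxevents, -1, &mask);
}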
@@ -2260,37 +2336,17 @@
 			compat_size_t, sigsetsize)
 {
 	long err;
-	sigset_t ksigmask, sigsaved;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
-	}
+	err = set_compat_user_sigmask(sigmask, sigsetsize);
+	if (err)
+		return err;
 
 	err = do_epoll_wait(epfd, events, maxevents, timeout);
-
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (err == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			set_current_blocked(&sigsaved);
-	}
+	restore_saved_sigmask_unless(err == -EINTR);
 
 	return err;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 1ebf6e5..c272312 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/exec.c
  *
@@ -218,55 +219,10 @@
 	if (ret <= 0)
 		return NULL;
 
-	if (write) {
-		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
-		unsigned long ptr_size, limit;
-
-		/*
-		 * Since the stack will hold pointers to the strings, we
-		 * must account for them as well.
-		 *
-		 * The size calculation is the entire vma while each arg page is
-		 * built, so each time we get here it's calculating how far it
-		 * is currently (rather than each call being just the newly
-		 * added size from the arg page).  As a result, we need to
-		 * always add the entire size of the pointers, so that on the
-		 * last call to get_arg_page() we'll actually have the entire
-		 * correct size.
-		 */
-		ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
-		if (ptr_size > ULONG_MAX - size)
-			goto fail;
-		size += ptr_size;
-
-		acct_arg_size(bprm, size / PAGE_SIZE);
-
-		/*
-		 * We've historically supported up to 32 pages (ARG_MAX)
-		 * of argument strings even with small stacks
-		 */
-		if (size <= ARG_MAX)
-			return page;
-
-		/*
-		 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
-		 * (whichever is smaller) for the argv+env strings.
-		 * This ensures that:
-		 *  - the remaining binfmt code will not run out of stack space,
-		 *  - the program will have a reasonable amount of stack left
-		 *    to work from.
-		 */
-		limit = _STK_LIM / 4 * 3;
-		limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
-		if (size > limit)
-			goto fail;
-	}
+	if (write)
+		acct_arg_size(bprm, vma_pages(bprm->vma));
 
 	return page;
-
-fail:
-	put_page(page);
-	return NULL;
 }
 
 static void put_arg_page(struct page *page)
@@ -492,6 +448,50 @@
 	return i;
 }
 
+static int prepare_arg_pages(struct linux_binprm *bprm,
+			struct user_arg_ptr argv, struct user_arg_ptr envp)
+{
+	unsigned long limit, ptr_size;
+
+	bprm->argc = count(argv, MAX_ARG_STRINGS);
+	if (bprm->argc < 0)
+		return bprm->argc;
+
+	bprm->envc = count(envp, MAX_ARG_STRINGS);
+	if (bprm->envc < 0)
+		return bprm->envc;
+
+	/*
+	 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
+	 * (whichever is smaller) for the argv+env strings.
+	 * This ensures that:
+	 *  - the remaining binfmt code will not run out of stack space,
+	 *  - the program will have a reasonable amount of stack left
+	 *    to work from.
+	 */
+	limit = _STK_LIM / 4 * 3;
+	limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
+	/*
+	 * We've historically supported up to 32 pages (ARG_MAX)
+	 * of argument strings even with small stacks
+	 */
+	limit = max_t(unsigned long, limit, ARG_MAX);
+	/*
+	 * We must account for the size of all the argv and envp pointers to
+	 * the argv and envp strings, since they will also take up space in
+	 * the stack. They aren't stored until much later when we can't
+	 * signal to the parent that the child has run out of stack space.
+	 * Instead, calculate it here so it's possible to fail gracefully.
+	 */
+	ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+	if (limit <= ptr_size)
+		return -E2BIG;
+	limit -= ptr_size;
+
+	bprm->argmin = bprm->p - limit;
+	return 0;
+}
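
A worked example of the new bound, assuming a 64-bit kernel, 4 KiB pages
and the default 8 MiB RLIMIT_STACK (the argument counts are made up):

	limit    = _STK_LIM / 4 * 3                = 6 MiB
	limit    = min(6 MiB, 8 MiB / 4)           = 2 MiB
	limit    = max(2 MiB, ARG_MAX = 128 KiB)   = 2 MiB
	ptr_size = (500 argv + 500 envp) * 8 bytes = 8000 bytes
	limit   -= ptr_size                        ~ 2 MiB - 8 KiB

bprm->argmin therefore sits roughly 2 MiB below the initial bprm->p, and
copy_strings() now fails with -E2BIG as soon as the strings would cross
that floor.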
+
 /*
  * 'copy_strings()' copies argument/environment strings from the old
  * processes's memory to the new process's stack.  The call to get_user_pages()
@@ -527,6 +527,10 @@
 		pos = bprm->p;
 		str += len;
 		bprm->p -= len;
+#ifdef CONFIG_MMU
+		if (bprm->p < bprm->argmin)
+			goto out;
+#endif
 
 		while (len > 0) {
 			int offset, bytes_to_copy;
@@ -908,14 +912,14 @@
 		goto out;
 
 	i_size = i_size_read(file_inode(file));
-	if (max_size > 0 && i_size > max_size) {
-		ret = -EFBIG;
-		goto out;
-	}
 	if (i_size <= 0) {
 		ret = -EINVAL;
 		goto out;
 	}
+	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
+		ret = -EFBIG;
+		goto out;
+	}
 
 	if (id != READING_FIRMWARE_PREALLOC_BUFFER)
 		*buf = vmalloc(i_size);
@@ -929,7 +933,7 @@
 		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
 		if (bytes < 0) {
 			ret = bytes;
-			goto out;
+			goto out_free;
 		}
 
 		if (bytes == 0)
@@ -1011,7 +1015,7 @@
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
-	mm_release(tsk, old_mm);
+	exec_mm_release(tsk, old_mm);
 
 	if (old_mm) {
 		sync_mm_rss(old_mm);
@@ -1029,6 +1033,7 @@
 	}
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
+	membarrier_exec_mmap(mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
@@ -1084,7 +1089,7 @@
 		__set_current_state(TASK_KILLABLE);
 		spin_unlock_irq(lock);
 		schedule();
-		if (unlikely(__fatal_signal_pending(tsk)))
+		if (__fatal_signal_pending(tsk))
 			goto killed;
 		spin_lock_irq(lock);
 	}
@@ -1112,7 +1117,7 @@
 			write_unlock_irq(&tasklist_lock);
 			cgroup_threadgroup_change_end(tsk);
 			schedule();
-			if (unlikely(__fatal_signal_pending(tsk)))
+			if (__fatal_signal_pending(tsk))
 				goto killed;
 		}
 
@@ -1186,7 +1191,7 @@
 	flush_itimer_signals();
 #endif
 
-	if (atomic_read(&oldsighand->count) != 1) {
+	if (refcount_read(&oldsighand->count) != 1) {
 		struct sighand_struct *newsighand;
 		/*
 		 * This ->sighand is shared with the CLONE_SIGHAND
@@ -1196,7 +1201,7 @@
 		if (!newsighand)
 			return -ENOMEM;
 
-		atomic_set(&newsighand->count, 1);
+		refcount_set(&newsighand->count, 1);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
 
@@ -1399,7 +1404,7 @@
  * Or, if exec fails before, free_bprm() should release ->cred and
  * and unlock.
  */
-int prepare_bprm_creds(struct linux_binprm *bprm)
+static int prepare_bprm_creds(struct linux_binprm *bprm)
 {
 	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
@@ -1560,7 +1565,7 @@
 
 /*
  * Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
  *
  * This may be called multiple times for binary chains (scripts for example).
  */
@@ -1649,15 +1654,17 @@
 		if (!try_module_get(fmt->module))
 			continue;
 		read_unlock(&binfmt_lock);
+
 		bprm->recursion_depth++;
 		retval = fmt->load_binary(bprm);
+		bprm->recursion_depth--;
+
 		read_lock(&binfmt_lock);
 		put_binfmt(fmt);
-		bprm->recursion_depth--;
 		if (retval < 0 && !bprm->mm) {
 			/* we got to flush_old_exec() and failed after it */
 			read_unlock(&binfmt_lock);
-			force_sigsegv(SIGSEGV, current);
+			force_sigsegv(SIGSEGV);
 			return retval;
 		}
 		if (retval != -ENOEXEC || !bprm->file) {
@@ -1789,12 +1796,8 @@
 	if (retval)
 		goto out_unmark;
 
-	bprm->argc = count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
+	retval = prepare_arg_pages(bprm, argv, envp);
+	if (retval < 0)
 		goto out;
 
 	retval = prepare_binprm(bprm);
@@ -1823,10 +1826,9 @@
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	membarrier_execve(current);
 	rseq_execve(current);
 	acct_update_integrals(current);
-	task_numa_free(current);
+	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
 	if (filename)
@@ -1945,15 +1947,10 @@
  */
 void set_dumpable(struct mm_struct *mm, int value)
 {
-	unsigned long old, new;
-
 	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
 		return;
 
-	do {
-		old = READ_ONCE(mm->flags);
-		new = (old & ~MMF_DUMPABLE_MASK) | value;
-	} while (cmpxchg(&mm->flags, old, new) != old);
+	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
 }
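
set_mask_bits() folds the removed open-coded loop into a generic helper. A
simplified userspace sketch of its semantics (not the kernel's exact
type-generic macro):

static unsigned long set_mask_bits_sketch(unsigned long *ptr,
					  unsigned long mask,
					  unsigned long bits)
{
	unsigned long old = *ptr, new;

	/* Retry until no other CPU modified *ptr between the read and the
	 * compare-and-swap -- the same loop that was removed above. */
	do {
		new = (old & ~mask) | bits;
	} while (!__atomic_compare_exchange_n(ptr, &old, new, false,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
	return old;
}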
 
 SYSCALL_DEFINE3(execve,
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
deleted file mode 100644
index 1b2d4c6..0000000
--- a/fs/exofs/BUGS
+++ /dev/null
@@ -1,3 +0,0 @@
-- Out-of-space may cause a severe problem if the object (and directory entry)
-  were written, but the inode attributes failed. Then if the filesystem was
-  unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
deleted file mode 100644
index a364fd0..0000000
--- a/fs/exofs/Kbuild
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Kbuild for the EXOFS module
-#
-# Copyright (C) 2008 Panasas Inc.  All rights reserved.
-#
-# Authors:
-#   Boaz Harrosh <ooo@electrozaur.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2
-#
-# Kbuild - Gets included from the Kernels Makefile and build system
-#
-
-# ore module library
-libore-y := ore.o ore_raid.o
-obj-$(CONFIG_ORE) += libore.o
-
-exofs-y := inode.o file.o namei.o dir.o super.o sys.o
-obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
deleted file mode 100644
index 86194b2..0000000
--- a/fs/exofs/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config EXOFS_FS
-	tristate "exofs: OSD based file system support"
-	depends on SCSI_OSD_ULD
-	help
-	  EXOFS is a file system that uses an OSD storage device,
-	  as its backing storage.
-
-# Debugging-related stuff
-config EXOFS_DEBUG
-	bool "Enable debugging"
-	depends on EXOFS_FS
-	help
-	  This option enables EXOFS debug prints.
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
deleted file mode 100644
index 2daf232..0000000
--- a/fs/exofs/Kconfig.ore
+++ /dev/null
@@ -1,14 +0,0 @@
-# ORE - Objects Raid Engine (libore.ko)
-#
-# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
-# for every ORE user we do it like this. Any user should add itself here
-# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
-# selected here, and we default to "ON". So in effect it is like been
-# selected by any of the users.
-config ORE
-	tristate
-	depends on EXOFS_FS || PNFS_OBJLAYOUT
-	select ASYNC_XOR
-	select RAID6_PQ
-	select ASYNC_PQ
-	default SCSI_OSD_ULD
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
deleted file mode 100644
index 7d88ef5..0000000
--- a/fs/exofs/common.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * common.h - Common definitions for both Kernel and user-mode utilities
- *
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#ifndef __EXOFS_COM_H__
-#define __EXOFS_COM_H__
-
-#include <linux/types.h>
-
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-
-/****************************************************************************
- * Object ID related defines
- * NOTE: inode# = object ID - EXOFS_OBJ_OFF
- ****************************************************************************/
-#define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
-#define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
-#define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
-#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
-#define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
-
-/* exofs Application specific page/attribute */
-/* Inode attrs */
-# define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
-# define EXOFS_ATTR_INODE_DATA	1
-# define EXOFS_ATTR_INODE_FILE_LAYOUT	2
-# define EXOFS_ATTR_INODE_DIR_LAYOUT	3
-/* Partition attrs */
-# define EXOFS_APAGE_SB_DATA	(0xF0000000U + 3)
-# define EXOFS_ATTR_SB_STATS	1
-
-/*
- * The maximum number of files we can have is limited by the size of the
- * inode number.  This is the largest object ID that the file system supports.
- * Object IDs 0, 1, and 2 are always in use (see above defines).
- */
-enum {
-	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
-					(1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
-	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
-};
-
-/****************************************************************************
- * Misc.
- ****************************************************************************/
-#define EXOFS_BLKSHIFT	12
-#define EXOFS_BLKSIZE	(1UL << EXOFS_BLKSHIFT)
-
-/****************************************************************************
- * superblock-related things
- ****************************************************************************/
-#define EXOFS_SUPER_MAGIC	0x5DF5
-
-/*
- * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * This is where the in-memory superblock is stored on disk.
- */
-enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
-struct exofs_fscb {
-	__le64  s_nextid;	/* Only used after mkfs */
-	__le64  s_numfiles;	/* Only used after mkfs */
-	__le32	s_version;	/* == EXOFS_FSCB_VER */
-	__le16  s_magic;	/* Magic signature */
-	__le16  s_newfs;	/* Non-zero if this is a new fs */
-
-	/* From here on it's a static part, only written by mkexofs */
-	__le64	s_dev_table_oid;   /* Resurved, not used */
-	__le64	s_dev_table_count; /* == 0 means no dev_table */
-} __packed;
-
-/*
- * This struct is set on the FS partition's attributes.
- * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
- * with the create command, to atomically persist the sb writeable information.
- */
-struct exofs_sb_stats {
-	__le64  s_nextid;	/* Highest object ID used */
-	__le64  s_numfiles;	/* Number of files on fs */
-} __packed;
-
-/*
- * Describes the raid used in the FS. It is part of the device table.
- * This here is taken from the pNFS-objects definition. In exofs we
- * use one raid policy through-out the filesystem. (NOTE: the funny
- * alignment at beginning. We take care of it at exofs_device_table.
- */
-struct exofs_dt_data_map {
-	__le32	cb_num_comps;
-	__le64	cb_stripe_unit;
-	__le32	cb_group_width;
-	__le32	cb_group_depth;
-	__le32	cb_mirror_cnt;
-	__le32	cb_raid_algorithm;
-} __packed;
-
-/*
- * This is an osd device information descriptor. It is a single entry in
- * the exofs device table. It describes an osd target lun which
- * contains data belonging to this FS. (Same partition_id on all devices)
- */
-struct exofs_dt_device_info {
-	__le32	systemid_len;
-	u8	systemid[OSD_SYSTEMID_LEN];
-	__le64	long_name_offset;	/* If !0 then offset-in-file */
-	__le32	osdname_len;		/* */
-	u8	osdname[44];		/* Embbeded, Usually an asci uuid */
-} __packed;
-
-/*
- * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
- * It contains the raid used for this multy-device FS and an array of
- * participating devices.
- */
-struct exofs_device_table {
-	__le32				dt_version;	/* == EXOFS_DT_VER */
-	struct exofs_dt_data_map	dt_data_map;	/* Raid policy to use */
-
-	/* Resurved space For future use. Total includeing this:
-	 * (8 * sizeof(le64))
-	 */
-	__le64				__Resurved[4];
-
-	__le64				dt_num_devices;	/* Array size */
-	struct exofs_dt_device_info	dt_dev_table[];	/* Array of devices */
-} __packed;
-
-/****************************************************************************
- * inode-related things
- ****************************************************************************/
-#define EXOFS_IDATA		5
-
-/*
- * The file control block - stored in an object's attributes.  This is where
- * the in-memory inode is stored on disk.
- */
-struct exofs_fcb {
-	__le64  i_size;			/* Size of the file */
-	__le16  i_mode;         	/* File mode */
-	__le16  i_links_count;  	/* Links count */
-	__le32  i_uid;          	/* Owner Uid */
-	__le32  i_gid;          	/* Group Id */
-	__le32  i_atime;        	/* Access time */
-	__le32  i_ctime;        	/* Creation time */
-	__le32  i_mtime;        	/* Modification time */
-	__le32  i_flags;        	/* File flags (unused for now)*/
-	__le32  i_generation;   	/* File version (for NFS) */
-	__le32  i_data[EXOFS_IDATA];	/* Short symlink names and device #s */
-};
-
-#define EXOFS_INO_ATTR_SIZE	sizeof(struct exofs_fcb)
-
-/* This is the Attribute the fcb is stored in */
-static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
-	EXOFS_APAGE_FS_DATA,
-	EXOFS_ATTR_INODE_DATA,
-	EXOFS_INO_ATTR_SIZE);
-
-/****************************************************************************
- * dentry-related things
- ****************************************************************************/
-#define EXOFS_NAME_LEN	255
-
-/*
- * The on-disk directory entry
- */
-struct exofs_dir_entry {
-	__le64		inode_no;		/* inode number           */
-	__le16		rec_len;		/* directory entry length */
-	u8		name_len;		/* name length            */
-	u8		file_type;		/* umm...file type        */
-	char		name[EXOFS_NAME_LEN];	/* file name              */
-};
-
-enum {
-	EXOFS_FT_UNKNOWN,
-	EXOFS_FT_REG_FILE,
-	EXOFS_FT_DIR,
-	EXOFS_FT_CHRDEV,
-	EXOFS_FT_BLKDEV,
-	EXOFS_FT_FIFO,
-	EXOFS_FT_SOCK,
-	EXOFS_FT_SYMLINK,
-	EXOFS_FT_MAX
-};
-
-#define EXOFS_DIR_PAD			4
-#define EXOFS_DIR_ROUND			(EXOFS_DIR_PAD - 1)
-#define EXOFS_DIR_REC_LEN(name_len) \
-	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
-	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-
-/*
- * The on-disk (optional) layout structure.
- * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
- * attribute, attached to any inode, usually to a directory.
- */
-
-enum exofs_inode_layout_gen_functions {
-	LAYOUT_MOVING_WINDOW = 0,
-	LAYOUT_IMPLICT = 1,
-};
-
-struct exofs_on_disk_inode_layout {
-	__le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
-	__le16 pad;
-	union {
-		/* gen_func == LAYOUT_MOVING_WINDOW (default) */
-		struct exofs_layout_sliding_window {
-			__le32 num_devices; /* first n devices in global-table*/
-		} sliding_window __packed;
-
-		/* gen_func == LAYOUT_IMPLICT */
-		struct exofs_layout_implict_list {
-			struct exofs_dt_data_map data_map;
-			/* Variable array of size data_map.cb_num_comps. These
-			 * are device indexes of the devices in the global table
-			 */
-			__le32 dev_indexes[];
-		} implict __packed;
-	};
-} __packed;
-
-static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
-{
-	return sizeof(struct exofs_on_disk_inode_layout) +
-		max_devs * sizeof(__le32);
-}
-
-#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
deleted file mode 100644
index f013867..0000000
--- a/fs/exofs/dir.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/iversion.h>
-#include "exofs.h"
-
-static inline unsigned exofs_chunk_size(struct inode *inode)
-{
-	return inode->i_sb->s_blocksize;
-}
-
-static inline void exofs_put_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
-{
-	loff_t last_byte = inode->i_size;
-
-	last_byte -= page_nr << PAGE_SHIFT;
-	if (last_byte > PAGE_SIZE)
-		last_byte = PAGE_SIZE;
-	return last_byte;
-}
-
-static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
-	int err = 0;
-
-	inode_inc_iversion(dir);
-
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
-
-	if (pos+len > dir->i_size) {
-		i_size_write(dir, pos+len);
-		mark_inode_dirty(dir);
-	}
-	set_page_dirty(page);
-
-	if (IS_DIRSYNC(dir))
-		err = write_one_page(page);
-	else
-		unlock_page(page);
-
-	return err;
-}
-
-static bool exofs_check_page(struct page *page)
-{
-	struct inode *dir = page->mapping->host;
-	unsigned chunk_size = exofs_chunk_size(dir);
-	char *kaddr = page_address(page);
-	unsigned offs, rec_len;
-	unsigned limit = PAGE_SIZE;
-	struct exofs_dir_entry *p;
-	char *error;
-
-	/* if the page is the last one in the directory */
-	if ((dir->i_size >> PAGE_SHIFT) == page->index) {
-		limit = dir->i_size & ~PAGE_MASK;
-		if (limit & (chunk_size - 1))
-			goto Ebadsize;
-		if (!limit)
-			goto out;
-	}
-	for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
-		p = (struct exofs_dir_entry *)(kaddr + offs);
-		rec_len = le16_to_cpu(p->rec_len);
-
-		if (rec_len < EXOFS_DIR_REC_LEN(1))
-			goto Eshort;
-		if (rec_len & 3)
-			goto Ealign;
-		if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
-			goto Enamelen;
-		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
-			goto Espan;
-	}
-	if (offs != limit)
-		goto Eend;
-out:
-	SetPageChecked(page);
-	return true;
-
-Ebadsize:
-	EXOFS_ERR("ERROR [exofs_check_page]: "
-		"size of directory(0x%lx) is not a multiple of chunk size\n",
-		dir->i_ino
-	);
-	goto fail;
-Eshort:
-	error = "rec_len is smaller than minimal";
-	goto bad_entry;
-Ealign:
-	error = "unaligned directory entry";
-	goto bad_entry;
-Enamelen:
-	error = "rec_len is too small for name_len";
-	goto bad_entry;
-Espan:
-	error = "directory entry across blocks";
-	goto bad_entry;
-bad_entry:
-	EXOFS_ERR(
-		"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
-		"offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
-		dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
-		_LLU(le64_to_cpu(p->inode_no)),
-		rec_len, p->name_len);
-	goto fail;
-Eend:
-	p = (struct exofs_dir_entry *)(kaddr + offs);
-	EXOFS_ERR("ERROR [exofs_check_page]: "
-		"entry in directory(0x%lx) spans the page boundary"
-		"offset=%lu, inode=0x%llx\n",
-		dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
-		_LLU(le64_to_cpu(p->inode_no)));
-fail:
-	SetPageError(page);
-	return false;
-}
-
-static struct page *exofs_get_page(struct inode *dir, unsigned long n)
-{
-	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_mapping_page(mapping, n, NULL);
-
-	if (!IS_ERR(page)) {
-		kmap(page);
-		if (unlikely(!PageChecked(page))) {
-			if (PageError(page) || !exofs_check_page(page))
-				goto fail;
-		}
-	}
-	return page;
-
-fail:
-	exofs_put_page(page);
-	return ERR_PTR(-EIO);
-}
-
-static inline int exofs_match(int len, const unsigned char *name,
-					struct exofs_dir_entry *de)
-{
-	if (len != de->name_len)
-		return 0;
-	if (!de->inode_no)
-		return 0;
-	return !memcmp(name, de->name, len);
-}
-
-static inline
-struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
-{
-	return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
-}
-
-static inline unsigned
-exofs_validate_entry(char *base, unsigned offset, unsigned mask)
-{
-	struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
-	struct exofs_dir_entry *p =
-			(struct exofs_dir_entry *)(base + (offset&mask));
-	while ((char *)p < (char *)de) {
-		if (p->rec_len == 0)
-			break;
-		p = exofs_next_entry(p);
-	}
-	return (char *)p - base;
-}
-
-static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
-	[EXOFS_FT_UNKNOWN]	= DT_UNKNOWN,
-	[EXOFS_FT_REG_FILE]	= DT_REG,
-	[EXOFS_FT_DIR]		= DT_DIR,
-	[EXOFS_FT_CHRDEV]	= DT_CHR,
-	[EXOFS_FT_BLKDEV]	= DT_BLK,
-	[EXOFS_FT_FIFO]		= DT_FIFO,
-	[EXOFS_FT_SOCK]		= DT_SOCK,
-	[EXOFS_FT_SYMLINK]	= DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= EXOFS_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= EXOFS_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= EXOFS_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= EXOFS_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= EXOFS_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= EXOFS_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= EXOFS_FT_SYMLINK,
-};
-
-static inline
-void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
-{
-	umode_t mode = inode->i_mode;
-	de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
-}
-
-static int
-exofs_readdir(struct file *file, struct dir_context *ctx)
-{
-	loff_t pos = ctx->pos;
-	struct inode *inode = file_inode(file);
-	unsigned int offset = pos & ~PAGE_MASK;
-	unsigned long n = pos >> PAGE_SHIFT;
-	unsigned long npages = dir_pages(inode);
-	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
-
-	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
-		return 0;
-
-	for ( ; n < npages; n++, offset = 0) {
-		char *kaddr, *limit;
-		struct exofs_dir_entry *de;
-		struct page *page = exofs_get_page(inode, n);
-
-		if (IS_ERR(page)) {
-			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
-				  inode->i_ino);
-			ctx->pos += PAGE_SIZE - offset;
-			return PTR_ERR(page);
-		}
-		kaddr = page_address(page);
-		if (unlikely(need_revalidate)) {
-			if (offset) {
-				offset = exofs_validate_entry(kaddr, offset,
-								chunk_mask);
-				ctx->pos = (n<<PAGE_SHIFT) + offset;
-			}
-			file->f_version = inode_query_iversion(inode);
-			need_revalidate = false;
-		}
-		de = (struct exofs_dir_entry *)(kaddr + offset);
-		limit = kaddr + exofs_last_byte(inode, n) -
-							EXOFS_DIR_REC_LEN(1);
-		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
-			if (de->rec_len == 0) {
-				EXOFS_ERR("ERROR: "
-				     "zero-length entry in directory(0x%lx)\n",
-				     inode->i_ino);
-				exofs_put_page(page);
-				return -EIO;
-			}
-			if (de->inode_no) {
-				unsigned char t;
-
-				if (de->file_type < EXOFS_FT_MAX)
-					t = exofs_filetype_table[de->file_type];
-				else
-					t = DT_UNKNOWN;
-
-				if (!dir_emit(ctx, de->name, de->name_len,
-						le64_to_cpu(de->inode_no),
-						t)) {
-					exofs_put_page(page);
-					return 0;
-				}
-			}
-			ctx->pos += le16_to_cpu(de->rec_len);
-		}
-		exofs_put_page(page);
-	}
-	return 0;
-}
-
-struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
-			struct dentry *dentry, struct page **res_page)
-{
-	const unsigned char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
-	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
-	unsigned long start, n;
-	unsigned long npages = dir_pages(dir);
-	struct page *page = NULL;
-	struct exofs_i_info *oi = exofs_i(dir);
-	struct exofs_dir_entry *de;
-
-	if (npages == 0)
-		goto out;
-
-	*res_page = NULL;
-
-	start = oi->i_dir_start_lookup;
-	if (start >= npages)
-		start = 0;
-	n = start;
-	do {
-		char *kaddr;
-		page = exofs_get_page(dir, n);
-		if (!IS_ERR(page)) {
-			kaddr = page_address(page);
-			de = (struct exofs_dir_entry *) kaddr;
-			kaddr += exofs_last_byte(dir, n) - reclen;
-			while ((char *) de <= kaddr) {
-				if (de->rec_len == 0) {
-					EXOFS_ERR("ERROR: zero-length entry in "
-						  "directory(0x%lx)\n",
-						  dir->i_ino);
-					exofs_put_page(page);
-					goto out;
-				}
-				if (exofs_match(namelen, name, de))
-					goto found;
-				de = exofs_next_entry(de);
-			}
-			exofs_put_page(page);
-		}
-		if (++n >= npages)
-			n = 0;
-	} while (n != start);
-out:
-	return NULL;
-
-found:
-	*res_page = page;
-	oi->i_dir_start_lookup = n;
-	return de;
-}
-
-struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
-{
-	struct page *page = exofs_get_page(dir, 0);
-	struct exofs_dir_entry *de = NULL;
-
-	if (!IS_ERR(page)) {
-		de = exofs_next_entry(
-				(struct exofs_dir_entry *)page_address(page));
-		*p = page;
-	}
-	return de;
-}
-
-ino_t exofs_parent_ino(struct dentry *child)
-{
-	struct page *page;
-	struct exofs_dir_entry *de;
-	ino_t ino;
-
-	de = exofs_dotdot(d_inode(child), &page);
-	if (!de)
-		return 0;
-
-	ino = le64_to_cpu(de->inode_no);
-	exofs_put_page(page);
-	return ino;
-}
-
-ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
-{
-	ino_t res = 0;
-	struct exofs_dir_entry *de;
-	struct page *page;
-
-	de = exofs_find_entry(dir, dentry, &page);
-	if (de) {
-		res = le64_to_cpu(de->inode_no);
-		exofs_put_page(page);
-	}
-	return res;
-}
-
-int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
-			struct page *page, struct inode *inode)
-{
-	loff_t pos = page_offset(page) +
-			(char *) de - (char *) page_address(page);
-	unsigned len = le16_to_cpu(de->rec_len);
-	int err;
-
-	lock_page(page);
-	err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
-	if (err)
-		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
-			  err);
-
-	de->inode_no = cpu_to_le64(inode->i_ino);
-	exofs_set_de_type(de, inode);
-	if (likely(!err))
-		err = exofs_commit_chunk(page, pos, len);
-	exofs_put_page(page);
-	dir->i_mtime = dir->i_ctime = current_time(dir);
-	mark_inode_dirty(dir);
-	return err;
-}
-
-int exofs_add_link(struct dentry *dentry, struct inode *inode)
-{
-	struct inode *dir = d_inode(dentry->d_parent);
-	const unsigned char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
-	unsigned chunk_size = exofs_chunk_size(dir);
-	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
-	unsigned short rec_len, name_len;
-	struct page *page = NULL;
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-	struct exofs_dir_entry *de;
-	unsigned long npages = dir_pages(dir);
-	unsigned long n;
-	char *kaddr;
-	loff_t pos;
-	int err;
-
-	for (n = 0; n <= npages; n++) {
-		char *dir_end;
-
-		page = exofs_get_page(dir, n);
-		err = PTR_ERR(page);
-		if (IS_ERR(page))
-			goto out;
-		lock_page(page);
-		kaddr = page_address(page);
-		dir_end = kaddr + exofs_last_byte(dir, n);
-		de = (struct exofs_dir_entry *)kaddr;
-		kaddr += PAGE_SIZE - reclen;
-		while ((char *)de <= kaddr) {
-			if ((char *)de == dir_end) {
-				name_len = 0;
-				rec_len = chunk_size;
-				de->rec_len = cpu_to_le16(chunk_size);
-				de->inode_no = 0;
-				goto got_it;
-			}
-			if (de->rec_len == 0) {
-				EXOFS_ERR("ERROR: exofs_add_link: "
-				      "zero-length entry in directory(0x%lx)\n",
-				      inode->i_ino);
-				err = -EIO;
-				goto out_unlock;
-			}
-			err = -EEXIST;
-			if (exofs_match(namelen, name, de))
-				goto out_unlock;
-			name_len = EXOFS_DIR_REC_LEN(de->name_len);
-			rec_len = le16_to_cpu(de->rec_len);
-			if (!de->inode_no && rec_len >= reclen)
-				goto got_it;
-			if (rec_len >= name_len + reclen)
-				goto got_it;
-			de = (struct exofs_dir_entry *) ((char *) de + rec_len);
-		}
-		unlock_page(page);
-		exofs_put_page(page);
-	}
-
-	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
-		  dentry, inode->i_ino);
-	return -EINVAL;
-
-got_it:
-	pos = page_offset(page) +
-		(char *)de - (char *)page_address(page);
-	err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
-							&page, NULL);
-	if (err)
-		goto out_unlock;
-	if (de->inode_no) {
-		struct exofs_dir_entry *de1 =
-			(struct exofs_dir_entry *)((char *)de + name_len);
-		de1->rec_len = cpu_to_le16(rec_len - name_len);
-		de->rec_len = cpu_to_le16(name_len);
-		de = de1;
-	}
-	de->name_len = namelen;
-	memcpy(de->name, name, namelen);
-	de->inode_no = cpu_to_le64(inode->i_ino);
-	exofs_set_de_type(de, inode);
-	err = exofs_commit_chunk(page, pos, rec_len);
-	dir->i_mtime = dir->i_ctime = current_time(dir);
-	mark_inode_dirty(dir);
-	sbi->s_numfiles++;
-
-out_put:
-	exofs_put_page(page);
-out:
-	return err;
-out_unlock:
-	unlock_page(page);
-	goto out_put;
-}
-
-int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-	char *kaddr = page_address(page);
-	unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
-	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
-	loff_t pos;
-	struct exofs_dir_entry *pde = NULL;
-	struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
-	int err;
-
-	while (de < dir) {
-		if (de->rec_len == 0) {
-			EXOFS_ERR("ERROR: exofs_delete_entry:"
-				  "zero-length entry in directory(0x%lx)\n",
-				  inode->i_ino);
-			err = -EIO;
-			goto out;
-		}
-		pde = de;
-		de = exofs_next_entry(de);
-	}
-	if (pde)
-		from = (char *)pde - (char *)page_address(page);
-	pos = page_offset(page) + from;
-	lock_page(page);
-	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
-							&page, NULL);
-	if (err)
-		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
-			  err);
-	if (pde)
-		pde->rec_len = cpu_to_le16(to - from);
-	dir->inode_no = 0;
-	if (likely(!err))
-		err = exofs_commit_chunk(page, pos, to - from);
-	inode->i_ctime = inode->i_mtime = current_time(inode);
-	mark_inode_dirty(inode);
-	sbi->s_numfiles--;
-out:
-	exofs_put_page(page);
-	return err;
-}
-
-/* kept aligned on 4 bytes */
-#define THIS_DIR ".\0\0"
-#define PARENT_DIR "..\0"
-
-int exofs_make_empty(struct inode *inode, struct inode *parent)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
-	unsigned chunk_size = exofs_chunk_size(inode);
-	struct exofs_dir_entry *de;
-	int err;
-	void *kaddr;
-
-	if (!page)
-		return -ENOMEM;
-
-	err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
-							&page, NULL);
-	if (err) {
-		unlock_page(page);
-		goto fail;
-	}
-
-	kaddr = kmap_atomic(page);
-	de = (struct exofs_dir_entry *)kaddr;
-	de->name_len = 1;
-	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
-	memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
-	de->inode_no = cpu_to_le64(inode->i_ino);
-	exofs_set_de_type(de, inode);
-
-	de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
-	de->name_len = 2;
-	de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
-	de->inode_no = cpu_to_le64(parent->i_ino);
-	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
-	exofs_set_de_type(de, inode);
-	kunmap_atomic(kaddr);
-	err = exofs_commit_chunk(page, 0, chunk_size);
-fail:
-	put_page(page);
-	return err;
-}
-
-int exofs_empty_dir(struct inode *inode)
-{
-	struct page *page = NULL;
-	unsigned long i, npages = dir_pages(inode);
-
-	for (i = 0; i < npages; i++) {
-		char *kaddr;
-		struct exofs_dir_entry *de;
-		page = exofs_get_page(inode, i);
-
-		if (IS_ERR(page))
-			continue;
-
-		kaddr = page_address(page);
-		de = (struct exofs_dir_entry *)kaddr;
-		kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
-
-		while ((char *)de <= kaddr) {
-			if (de->rec_len == 0) {
-				EXOFS_ERR("ERROR: exofs_empty_dir: "
-					  "zero-length directory entry"
-					  "kaddr=%p, de=%p\n", kaddr, de);
-				goto not_empty;
-			}
-			if (de->inode_no != 0) {
-				/* check for . and .. */
-				if (de->name[0] != '.')
-					goto not_empty;
-				if (de->name_len > 2)
-					goto not_empty;
-				if (de->name_len < 2) {
-					if (le64_to_cpu(de->inode_no) !=
-					    inode->i_ino)
-						goto not_empty;
-				} else if (de->name[1] != '.')
-					goto not_empty;
-			}
-			de = exofs_next_entry(de);
-		}
-		exofs_put_page(page);
-	}
-	return 1;
-
-not_empty:
-	exofs_put_page(page);
-	return 0;
-}
-
-const struct file_operations exofs_dir_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.iterate_shared	= exofs_readdir,
-};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
deleted file mode 100644
index 5dc3924..0000000
--- a/fs/exofs/exofs.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/backing-dev.h>
-#include <scsi/osd_ore.h>
-
-#include "common.h"
-
-#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define EXOFS_DBGMSG(fmt, a...) \
-	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define EXOFS_DBGMSG(fmt, a...) \
-	do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-struct exofs_dev {
-	struct ore_dev ored;
-	unsigned did;
-	unsigned urilen;
-	uint8_t *uri;
-	struct kobject ed_kobj;
-};
-/*
- * our extension to the in-memory superblock
- */
-struct exofs_sb_info {
-	struct exofs_sb_stats s_ess;		/* Written often, pre-allocate*/
-	int		s_timeout;		/* timeout for OSD operations */
-	uint64_t	s_nextid;		/* highest object ID used     */
-	uint32_t	s_numfiles;		/* number of files on fs      */
-	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
-	u32		s_next_generation;	/* next gen # to use          */
-	atomic_t	s_curr_pending;		/* number of pending commands */
-
-	struct ore_layout	layout;		/* Default files layout       */
-	struct ore_comp one_comp;		/* id & cred of partition id=0*/
-	struct ore_components oc;		/* comps for the partition    */
-	struct kobject	s_kobj;			/* holds per-sbi kobject      */
-};
-
-/*
- * our extension to the in-memory inode
- */
-struct exofs_i_info {
-	struct inode   vfs_inode;          /* normal in-memory inode          */
-	wait_queue_head_t i_wq;            /* wait queue for inode            */
-	unsigned long  i_flags;            /* various atomic flags            */
-	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
-	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
-	uint64_t       i_commit_size;      /* the object's written length     */
-	struct ore_comp one_comp;	   /* same component for all devices  */
-	struct ore_components oc;	   /* inode view of the device table  */
-};
-
-static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-{
-	return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
-}
-
-/*
- * our inode flags
- */
-#define OBJ_2BCREATED	0	/* object will be created soon*/
-#define OBJ_CREATED	1	/* object has been created on the osd*/
-
-static inline int obj_2bcreated(struct exofs_i_info *oi)
-{
-	return test_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline void set_obj_2bcreated(struct exofs_i_info *oi)
-{
-	set_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline int obj_created(struct exofs_i_info *oi)
-{
-	return test_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-static inline void set_obj_created(struct exofs_i_info *oi)
-{
-	set_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi);
-static inline int wait_obj_created(struct exofs_i_info *oi)
-{
-	if (likely(obj_created(oi)))
-		return 0;
-
-	return __exofs_wait_obj_created(oi);
-}
-
-/*
- * get to our inode from the vfs inode
- */
-static inline struct exofs_i_info *exofs_i(struct inode *inode)
-{
-	return container_of(inode, struct exofs_i_info, vfs_inode);
-}
-
-/*
- * Maximum count of links to a file
- */
-#define EXOFS_LINK_MAX           32000
-
-/*************************
- * function declarations *
- *************************/
-
-/* inode.c               */
-unsigned exofs_max_io_pages(struct ore_layout *layout,
-			    unsigned expected_pages);
-int exofs_setattr(struct dentry *, struct iattr *);
-int exofs_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
-extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, umode_t);
-extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
-extern void exofs_evict_inode(struct inode *);
-
-/* dir.c:                */
-int exofs_add_link(struct dentry *, struct inode *);
-ino_t exofs_inode_by_name(struct inode *, struct dentry *);
-int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
-int exofs_make_empty(struct inode *, struct inode *);
-struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
-					 struct page **);
-int exofs_empty_dir(struct inode *);
-struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
-ino_t exofs_parent_ino(struct dentry *child);
-int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
-		    struct inode *);
-
-/* super.c               */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
-			   const struct osd_obj_id *obj);
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
-
-/* sys.c                 */
-int exofs_sysfs_init(void);
-void exofs_sysfs_uninit(void);
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
-		       struct exofs_dt_device_info *dt_dev);
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
-int exofs_sysfs_odev_add(struct exofs_dev *edev,
-			 struct exofs_sb_info *sbi);
-void exofs_sysfs_dbg_print(void);
-
-/*********************
- * operation vectors *
- *********************/
-/* dir.c:            */
-extern const struct file_operations exofs_dir_operations;
-
-/* file.c            */
-extern const struct inode_operations exofs_file_inode_operations;
-extern const struct file_operations exofs_file_operations;
-
-/* inode.c           */
-extern const struct address_space_operations exofs_aops;
-
-/* namei.c           */
-extern const struct inode_operations exofs_dir_inode_operations;
-extern const struct inode_operations exofs_special_inode_operations;
-
-/* exofs_init_comps will initialize an ore_components device array
- * pointing to a single ore_comp struct, and a round-robin view
- * of the device table.
- * The first device of each inode is [inode->ino % num_devices], and
- * the rest of the devices follow sequentially, wrapping around after
- * the last device.
- * It is assumed that the global device array at @sbi is twice as
- * big and that the device table repeats twice.
- * See: exofs_read_lookup_dev_table()
- */
-static inline void exofs_init_comps(struct ore_components *oc,
-				    struct ore_comp *one_comp,
-				    struct exofs_sb_info *sbi, osd_id oid)
-{
-	unsigned dev_mod = (unsigned)oid, first_dev;
-
-	one_comp->obj.partition = sbi->one_comp.obj.partition;
-	one_comp->obj.id = oid;
-	exofs_make_credential(one_comp->cred, &one_comp->obj);
-
-	oc->first_dev = 0;
-	oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
-							sbi->layout.group_count;
-	oc->single_comp = EC_SINGLE_COMP;
-	oc->comps = one_comp;
-
-	/* Round robin device view of the table */
-	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
-	oc->ods = &sbi->oc.ods[first_dev];
-}
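-
-/* Worked example of the round-robin view above (illustrative numbers): with
- * sbi->oc.numdevs = 8 and mirrors_p1 = 2, an inode whose oid is 7 gets
- * first_dev = (7 * 2) % 8 = 6, so oc->ods = &sbi->oc.ods[6]. Because the
- * device table at @sbi repeats twice, indexing oc->numdevs entries forward
- * from slot 6 never runs off the end of the array.
- */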
-
-#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
deleted file mode 100644
index a94594e..0000000
--- a/fs/exofs/file.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "exofs.h"
-
-static int exofs_release_file(struct inode *inode, struct file *filp)
-{
-	return 0;
-}
-
-/* exofs_file_fsync - flush the inode to disk
- *
- *   Note: in exofs all metadata is written as part of the inode, regardless
- *   of datasync. The writeout is synchronous.
- */
-static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
-			    int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	int ret;
-
-	ret = file_write_and_wait_range(filp, start, end);
-	if (ret)
-		return ret;
-
-	inode_lock(inode);
-	ret = sync_inode_metadata(filp->f_mapping->host, 1);
-	inode_unlock(inode);
-	return ret;
-}
-
-static int exofs_flush(struct file *file, fl_owner_t id)
-{
-	int ret = vfs_fsync(file, 0);
-	/* TODO: Flush the OSD target */
-	return ret;
-}
-
-const struct file_operations exofs_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
-	.mmap		= generic_file_mmap,
-	.open		= generic_file_open,
-	.release	= exofs_release_file,
-	.fsync		= exofs_file_fsync,
-	.flush		= exofs_flush,
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= iter_file_splice_write,
-};
-
-const struct inode_operations exofs_file_inode_operations = {
-	.setattr	= exofs_setattr,
-};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
deleted file mode 100644
index 5f81fcd..0000000
--- a/fs/exofs/inode.c
+++ /dev/null
@@ -1,1514 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/slab.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-unsigned exofs_max_io_pages(struct ore_layout *layout,
-			    unsigned expected_pages)
-{
-	unsigned pages = min_t(unsigned, expected_pages,
-			       layout->max_io_length / PAGE_SIZE);
-
-	return pages;
-}
-
-struct page_collect {
-	struct exofs_sb_info *sbi;
-	struct inode *inode;
-	unsigned expected_pages;
-	struct ore_io_state *ios;
-
-	struct page **pages;
-	unsigned alloc_pages;
-	unsigned nr_pages;
-	unsigned long length;
-	loff_t pg_first; /* keep 64bit also in 32-arches */
-	bool read_4_write; /* This means two things: the read is synchronous,
-			    * and the pages should not be unlocked.
-			    */
-	struct page *that_locked_page;
-};
-
-static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
-		       struct inode *inode)
-{
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-
-	pcol->sbi = sbi;
-	pcol->inode = inode;
-	pcol->expected_pages = expected_pages;
-
-	pcol->ios = NULL;
-	pcol->pages = NULL;
-	pcol->alloc_pages = 0;
-	pcol->nr_pages = 0;
-	pcol->length = 0;
-	pcol->pg_first = -1;
-	pcol->read_4_write = false;
-	pcol->that_locked_page = NULL;
-}
-
-static void _pcol_reset(struct page_collect *pcol)
-{
-	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
-
-	pcol->pages = NULL;
-	pcol->alloc_pages = 0;
-	pcol->nr_pages = 0;
-	pcol->length = 0;
-	pcol->pg_first = -1;
-	pcol->ios = NULL;
-	pcol->that_locked_page = NULL;
-
-	/* This is probably the end of the loop, but in writes
-	 * it might not end here. Don't be left with nothing.
-	 */
-	if (!pcol->expected_pages)
-		pcol->expected_pages =
-				exofs_max_io_pages(&pcol->sbi->layout, ~0);
-}
-
-static int pcol_try_alloc(struct page_collect *pcol)
-{
-	unsigned pages;
-
-	/* TODO: easily support bio chaining */
-	pages =  exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
-
-	for (; pages; pages >>= 1) {
-		pcol->pages = kmalloc_array(pages, sizeof(struct page *),
-					    GFP_KERNEL);
-		if (likely(pcol->pages)) {
-			pcol->alloc_pages = pages;
-			return 0;
-		}
-	}
-
-	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
-		  pcol->expected_pages);
-	return -ENOMEM;
-}
-
-static void pcol_free(struct page_collect *pcol)
-{
-	kfree(pcol->pages);
-	pcol->pages = NULL;
-
-	if (pcol->ios) {
-		ore_put_io_state(pcol->ios);
-		pcol->ios = NULL;
-	}
-}
-
-static int pcol_add_page(struct page_collect *pcol, struct page *page,
-			 unsigned len)
-{
-	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
-		return -ENOMEM;
-
-	pcol->pages[pcol->nr_pages++] = page;
-	pcol->length += len;
-	return 0;
-}
-
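-/* Sentinel, distinct from any -E* error, returned when an IO completed
- * without error but a page was not actually part of it; update_read_page()
- * and update_write_page() below treat it as success rather than an error.
- */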
-enum {PAGE_WAS_NOT_IN_IO = 17};
-static int update_read_page(struct page *page, int ret)
-{
-	switch (ret) {
-	case 0:
-		/* Everything is OK */
-		SetPageUptodate(page);
-		if (PageError(page))
-			ClearPageError(page);
-		break;
-	case -EFAULT:
-		/* In this case we were trying to read something that wasn't on
-		 * disk yet - return a page full of zeroes.  This should be OK,
-		 * because the object should be empty (if there was a write
-		 * before this read, the read would be waiting with the page
-		 * locked */
-		clear_highpage(page);
-
-		SetPageUptodate(page);
-		if (PageError(page))
-			ClearPageError(page);
-		EXOFS_DBGMSG("recovered read error\n");
-		/* fall through */
-	case PAGE_WAS_NOT_IN_IO:
-		ret = 0; /* recovered error */
-		break;
-	default:
-		SetPageError(page);
-	}
-	return ret;
-}
-
-static void update_write_page(struct page *page, int ret)
-{
-	if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
-		return; /* don't pass start don't collect $200 */
-
-	if (ret) {
-		mapping_set_error(page->mapping, ret);
-		SetPageError(page);
-	}
-	end_page_writeback(page);
-}
-
-/* Called at the end of reads, to optionally unlock pages and update their
- * status.
- */
-static int __readpages_done(struct page_collect *pcol)
-{
-	int i;
-	u64 good_bytes;
-	u64 length = 0;
-	int ret = ore_check_io(pcol->ios, NULL);
-
-	if (likely(!ret)) {
-		good_bytes = pcol->length;
-		ret = PAGE_WAS_NOT_IN_IO;
-	} else {
-		good_bytes = 0;
-	}
-
-	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
-		     " length=0x%lx nr_pages=%u\n",
-		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
-		     pcol->nr_pages);
-
-	for (i = 0; i < pcol->nr_pages; i++) {
-		struct page *page = pcol->pages[i];
-		struct inode *inode = page->mapping->host;
-		int page_stat;
-
-		if (inode != pcol->inode)
-			continue; /* osd might add more pages at end */
-
-		if (likely(length < good_bytes))
-			page_stat = 0;
-		else
-			page_stat = ret;
-
-		EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
-			  inode->i_ino, page->index,
-			  page_stat ? "bad_bytes" : "good_bytes");
-
-		ret = update_read_page(page, page_stat);
-		if (!pcol->read_4_write)
-			unlock_page(page);
-		length += PAGE_SIZE;
-	}
-
-	pcol_free(pcol);
-	EXOFS_DBGMSG2("readpages_done END\n");
-	return ret;
-}
-
-/* callback of async reads */
-static void readpages_done(struct ore_io_state *ios, void *p)
-{
-	struct page_collect *pcol = p;
-
-	__readpages_done(pcol);
-	atomic_dec(&pcol->sbi->s_curr_pending);
-	kfree(pcol);
-}
-
-static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
-{
-	int i;
-
-	for (i = 0; i < pcol->nr_pages; i++) {
-		struct page *page = pcol->pages[i];
-
-		if (rw == READ)
-			update_read_page(page, ret);
-		else
-			update_write_page(page, ret);
-
-		unlock_page(page);
-	}
-}
-
-static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
-	struct page_collect *pcol_src, struct page_collect *pcol)
-{
-	/* length was wrong or offset was not page aligned */
-	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
-
-	if (pcol_src->nr_pages > ios->nr_pages) {
-		struct page **src_page;
-		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
-		unsigned long len_less = pcol_src->length - ios->length;
-		unsigned i;
-		int ret;
-
-		/* This IO was trimmed */
-		pcol_src->nr_pages = ios->nr_pages;
-		pcol_src->length = ios->length;
-
-		/* Left over pages are passed to the next io */
-		pcol->expected_pages += pages_less;
-		pcol->nr_pages = pages_less;
-		pcol->length = len_less;
-		src_page = pcol_src->pages + pcol_src->nr_pages;
-		pcol->pg_first = (*src_page)->index;
-
-		ret = pcol_try_alloc(pcol);
-		if (unlikely(ret))
-			return ret;
-
-		for (i = 0; i < pages_less; ++i)
-			pcol->pages[i] = *src_page++;
-
-		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
-			"pages_less=0x%x expected_pages=0x%x "
-			"next_offset=0x%llx next_len=0x%lx\n",
-			pcol_src->nr_pages, pages_less, pcol->expected_pages,
-			pcol->pg_first * PAGE_SIZE, pcol->length);
-	}
-	return 0;
-}
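-
-/* Worked example of the trimming above (illustrative numbers): if pcol_src
- * held 16 pages but the ORE only fit ios->nr_pages = 10 into this IO, then
- * pages_less = 6. pcol_src is shrunk to the 10 pages actually in flight and
- * the remaining 6 pages seed @pcol, to be submitted as the next IO.
- */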
-
-static int read_exec(struct page_collect *pcol)
-{
-	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct ore_io_state *ios;
-	struct page_collect *pcol_copy = NULL;
-	int ret;
-
-	if (!pcol->pages)
-		return 0;
-
-	if (!pcol->ios) {
-		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
-					     pcol->pg_first << PAGE_SHIFT,
-					     pcol->length, &pcol->ios);
-
-		if (ret)
-			return ret;
-	}
-
-	ios = pcol->ios;
-	ios->pages = pcol->pages;
-
-	if (pcol->read_4_write) {
-		ore_read(pcol->ios);
-		return __readpages_done(pcol);
-	}
-
-	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
-	if (!pcol_copy) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	*pcol_copy = *pcol;
-	ios->done = readpages_done;
-	ios->private = pcol_copy;
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
-
-	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
-	if (unlikely(ret))
-		goto err;
-
-	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
-		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
-	ret = ore_read(ios);
-	if (unlikely(ret))
-		goto err;
-
-	atomic_inc(&pcol->sbi->s_curr_pending);
-
-	return 0;
-
-err:
-	if (!pcol_copy) /* Failed before ownership transfer */
-		pcol_copy = pcol;
-	_unlock_pcol_pages(pcol_copy, ret, READ);
-	pcol_free(pcol_copy);
-	kfree(pcol_copy);
-
-	return ret;
-}
-
-/* readpage_strip is called either directly from readpage() or by the VFS from
- * within read_cache_pages(), to add one more page to be read. It will try to
- * collect as many contiguous pages as possible. If a discontinuity is
- * encountered, or it runs out of resources, it will submit the previous
- * segment and start a new collection. Eventually the caller must submit the
- * last segment, if present.
- */
-static int readpage_strip(void *data, struct page *page)
-{
-	struct page_collect *pcol = data;
-	struct inode *inode = pcol->inode;
-	struct exofs_i_info *oi = exofs_i(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_SHIFT;
-	size_t len;
-	int ret;
-
-	BUG_ON(!PageLocked(page));
-
-	/* FIXME: Just for debugging, will be removed */
-	if (PageUptodate(page))
-		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
-			  page->index);
-
-	pcol->that_locked_page = page;
-
-	if (page->index < end_index)
-		len = PAGE_SIZE;
-	else if (page->index == end_index)
-		len = i_size & ~PAGE_MASK;
-	else
-		len = 0;
-
-	if (!len || !obj_created(oi)) {
-		/* This page is out of bounds, or the object doesn't exist yet.
-		 * The current page is cleared and the request is split.
-		 */
-		clear_highpage(page);
-
-		SetPageUptodate(page);
-		if (PageError(page))
-			ClearPageError(page);
-
-		if (!pcol->read_4_write)
-			unlock_page(page);
-		EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
-			     "read_4_write=%d index=0x%lx end_index=0x%lx "
-			     "splitting\n", inode->i_ino, len,
-			     pcol->read_4_write, page->index, end_index);
-
-		return read_exec(pcol);
-	}
-
-try_again:
-
-	if (unlikely(pcol->pg_first == -1)) {
-		pcol->pg_first = page->index;
-	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
-		   page->index)) {
-		/* Discontinuity detected, split the request */
-		ret = read_exec(pcol);
-		if (unlikely(ret))
-			goto fail;
-		goto try_again;
-	}
-
-	if (!pcol->pages) {
-		ret = pcol_try_alloc(pcol);
-		if (unlikely(ret))
-			goto fail;
-	}
-
-	if (len != PAGE_SIZE)
-		zero_user(page, len, PAGE_SIZE - len);
-
-	EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
-		     inode->i_ino, page->index, len);
-
-	ret = pcol_add_page(pcol, page, len);
-	if (ret) {
-		EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
-			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
-			  page, len, pcol->nr_pages, pcol->length);
-
-		/* split the request, and start again with current page */
-		ret = read_exec(pcol);
-		if (unlikely(ret))
-			goto fail;
-
-		goto try_again;
-	}
-
-	return 0;
-
-fail:
-	/* SetPageError(page); ??? */
-	unlock_page(page);
-	return ret;
-}
-
-static int exofs_readpages(struct file *file, struct address_space *mapping,
-			   struct list_head *pages, unsigned nr_pages)
-{
-	struct page_collect pcol;
-	int ret;
-
-	_pcol_init(&pcol, nr_pages, mapping->host);
-
-	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
-	if (ret) {
-		EXOFS_ERR("read_cache_pages => %d\n", ret);
-		return ret;
-	}
-
-	ret = read_exec(&pcol);
-	if (unlikely(ret))
-		return ret;
-
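-	/* If the first IO was trimmed by _maybe_not_all_in_one_io(), pcol was
-	 * reseeded with the leftover pages; submit them as a second IO.
-	 */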
-	return read_exec(&pcol);
-}
-
-static int _readpage(struct page *page, bool read_4_write)
-{
-	struct page_collect pcol;
-	int ret;
-
-	_pcol_init(&pcol, 1, page->mapping->host);
-
-	pcol.read_4_write = read_4_write;
-	ret = readpage_strip(&pcol, page);
-	if (ret) {
-		EXOFS_ERR("_readpage => %d\n", ret);
-		return ret;
-	}
-
-	return read_exec(&pcol);
-}
-
-/*
- * We don't need the file
- */
-static int exofs_readpage(struct file *file, struct page *page)
-{
-	return _readpage(page, false);
-}
-
-/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct ore_io_state *ios, void *p)
-{
-	struct page_collect *pcol = p;
-	int i;
-	u64  good_bytes;
-	u64  length = 0;
-	int ret = ore_check_io(ios, NULL);
-
-	atomic_dec(&pcol->sbi->s_curr_pending);
-
-	if (likely(!ret)) {
-		good_bytes = pcol->length;
-		ret = PAGE_WAS_NOT_IN_IO;
-	} else {
-		good_bytes = 0;
-	}
-
-	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
-		     " length=0x%lx nr_pages=%u\n",
-		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
-		     pcol->nr_pages);
-
-	for (i = 0; i < pcol->nr_pages; i++) {
-		struct page *page = pcol->pages[i];
-		struct inode *inode = page->mapping->host;
-		int page_stat;
-
-		if (inode != pcol->inode)
-			continue; /* osd might add more pages to a bio */
-
-		if (likely(length < good_bytes))
-			page_stat = 0;
-		else
-			page_stat = ret;
-
-		update_write_page(page, page_stat);
-		unlock_page(page);
-		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
-			     inode->i_ino, page->index, page_stat);
-
-		length += PAGE_SIZE;
-	}
-
-	pcol_free(pcol);
-	kfree(pcol);
-	EXOFS_DBGMSG2("writepages_done END\n");
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
-	struct page_collect *pcol = priv;
-	pgoff_t index = offset / PAGE_SIZE;
-
-	if (!pcol->that_locked_page ||
-	    (pcol->that_locked_page->index != index)) {
-		struct page *page;
-		loff_t i_size = i_size_read(pcol->inode);
-
-		if (offset >= i_size) {
-			*uptodate = true;
-			EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
-			return ZERO_PAGE(0);
-		}
-
-		page =  find_get_page(pcol->inode->i_mapping, index);
-		if (!page) {
-			page = find_or_create_page(pcol->inode->i_mapping,
-						   index, GFP_NOFS);
-			if (unlikely(!page)) {
-				EXOFS_DBGMSG("grab_cache_page Failed "
-					"index=0x%llx\n", _LLU(index));
-				return NULL;
-			}
-			unlock_page(page);
-		}
-		*uptodate = PageUptodate(page);
-		EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
-		return page;
-	} else {
-		EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
-			     pcol->that_locked_page->index);
-		*uptodate = true;
-		return pcol->that_locked_page;
-	}
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
-	struct page_collect *pcol = priv;
-
-	if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
-		EXOFS_DBGMSG2("index=0x%lx\n", page->index);
-		put_page(page);
-		return;
-	}
-	EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
-		     ZERO_PAGE(0) == page ? -1 : page->index);
-}
-
-static const struct _ore_r4w_op _r4w_op = {
-	.get_page = &__r4w_get_page,
-	.put_page = &__r4w_put_page,
-};
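-
-/* The r4w ("read-4-write") hooks above let the ORE page in anything missing
- * for a partial-stripe RAID write: __r4w_get_page() returns the page already
- * locked by the writeout, a cached (or freshly created) page, or ZERO_PAGE(0)
- * past i_size; __r4w_put_page() drops only references __r4w_get_page() took.
- */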
-
-static int write_exec(struct page_collect *pcol)
-{
-	struct exofs_i_info *oi = exofs_i(pcol->inode);
-	struct ore_io_state *ios;
-	struct page_collect *pcol_copy = NULL;
-	int ret;
-
-	if (!pcol->pages)
-		return 0;
-
-	BUG_ON(pcol->ios);
-	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
-				 pcol->pg_first << PAGE_SHIFT,
-				 pcol->length, &pcol->ios);
-	if (unlikely(ret))
-		goto err;
-
-	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
-	if (!pcol_copy) {
-		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	*pcol_copy = *pcol;
-
-	ios = pcol->ios;
-	ios->pages = pcol_copy->pages;
-	ios->done = writepages_done;
-	ios->r4w = &_r4w_op;
-	ios->private = pcol_copy;
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
-
-	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
-	if (unlikely(ret))
-		goto err;
-
-	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
-		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
-	ret = ore_write(ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: ore_write() Failed\n");
-		goto err;
-	}
-
-	atomic_inc(&pcol->sbi->s_curr_pending);
-	return 0;
-
-err:
-	if (!pcol_copy) /* Failed before ownership transfer */
-		pcol_copy = pcol;
-	_unlock_pcol_pages(pcol_copy, ret, WRITE);
-	pcol_free(pcol_copy);
-	kfree(pcol_copy);
-
-	return ret;
-}
-
-/* writepage_strip is called either directly from writepage() or by the VFS
- * from within write_cache_pages(), to add one more page to be written to
- * storage. It will try to collect as many contiguous pages as possible. If a
- * discontinuity is encountered, or it runs out of resources, it will submit
- * the previous segment and start a new collection.
- * Eventually the caller must submit the last segment, if present.
- */
-static int writepage_strip(struct page *page,
-			   struct writeback_control *wbc_unused, void *data)
-{
-	struct page_collect *pcol = data;
-	struct inode *inode = pcol->inode;
-	struct exofs_i_info *oi = exofs_i(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_SHIFT;
-	size_t len;
-	int ret;
-
-	BUG_ON(!PageLocked(page));
-
-	ret = wait_obj_created(oi);
-	if (unlikely(ret))
-		goto fail;
-
-	if (page->index < end_index)
-		/* in this case, the page is within the limits of the file */
-		len = PAGE_SIZE;
-	else {
-		len = i_size & ~PAGE_MASK;
-
-		if (page->index > end_index || !len) {
-			/* in this case, the page is outside the limits
-			 * (truncate in progress)
-			 */
-			ret = write_exec(pcol);
-			if (unlikely(ret))
-				goto fail;
-			if (PageError(page))
-				ClearPageError(page);
-			unlock_page(page);
-			EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
-				     "outside the limits\n",
-				     inode->i_ino, page->index);
-			return 0;
-		}
-	}
-
-try_again:
-
-	if (unlikely(pcol->pg_first == -1)) {
-		pcol->pg_first = page->index;
-	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
-		   page->index)) {
-		/* Discontinuity detected, split the request */
-		ret = write_exec(pcol);
-		if (unlikely(ret))
-			goto fail;
-
-		EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
-			     inode->i_ino, page->index);
-		goto try_again;
-	}
-
-	if (!pcol->pages) {
-		ret = pcol_try_alloc(pcol);
-		if (unlikely(ret))
-			goto fail;
-	}
-
-	EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
-		     inode->i_ino, page->index, len);
-
-	ret = pcol_add_page(pcol, page, len);
-	if (unlikely(ret)) {
-		EXOFS_DBGMSG2("Failed pcol_add_page "
-			     "nr_pages=%u total_length=0x%lx\n",
-			     pcol->nr_pages, pcol->length);
-
-		/* split the request, next loop will start again */
-		ret = write_exec(pcol);
-		if (unlikely(ret)) {
-			EXOFS_DBGMSG("write_exec failed => %d", ret);
-			goto fail;
-		}
-
-		goto try_again;
-	}
-
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-
-	return 0;
-
-fail:
-	EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
-		     inode->i_ino, page->index, ret);
-	mapping_set_error(page->mapping, -EIO);
-	unlock_page(page);
-	return ret;
-}
-
-static int exofs_writepages(struct address_space *mapping,
-		       struct writeback_control *wbc)
-{
-	struct page_collect pcol;
-	long start, end, expected_pages;
-	int ret;
-
-	start = wbc->range_start >> PAGE_SHIFT;
-	end = (wbc->range_end == LLONG_MAX) ?
-			start + mapping->nrpages :
-			wbc->range_end >> PAGE_SHIFT;
-
-	if (start || end)
-		expected_pages = end - start + 1;
-	else
-		expected_pages = mapping->nrpages;
-
-	if (expected_pages < 32L)
-		expected_pages = 32L;
-
-	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
-		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
-		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
-		     mapping->nrpages, start, end, expected_pages);
-
-	_pcol_init(&pcol, expected_pages, mapping->host);
-
-	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-	if (unlikely(ret)) {
-		EXOFS_ERR("write_cache_pages => %d\n", ret);
-		return ret;
-	}
-
-	ret = write_exec(&pcol);
-	if (unlikely(ret))
-		return ret;
-
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		return write_exec(&pcol); /* pump the last remainder */
-	} else if (pcol.nr_pages) {
-		/* not SYNC; let the remainder join the next writeout */
-		unsigned i;
-
-		for (i = 0; i < pcol.nr_pages; i++) {
-			struct page *page = pcol.pages[i];
-
-			end_page_writeback(page);
-			set_page_dirty(page);
-			unlock_page(page);
-		}
-	}
-	return 0;
-}
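-
-/* Worked example of the expected_pages heuristic above (illustrative
- * numbers): syncing bytes 0x0000-0x7fff gives start = 0, end = 7 and
- * expected_pages = 8, which is then raised to the 32-page floor so the
- * first pcol_try_alloc() asks for a useful minimum.
- */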
-
-/*
-static int exofs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct page_collect pcol;
-	int ret;
-
-	_pcol_init(&pcol, 1, page->mapping->host);
-
-	ret = writepage_strip(page, NULL, &pcol);
-	if (ret) {
-		EXOFS_ERR("exofs_writepage => %d\n", ret);
-		return ret;
-	}
-
-	return write_exec(&pcol);
-}
-*/
-/* i_mutex held using inode->i_size directly */
-static void _write_failed(struct inode *inode, loff_t to)
-{
-	if (to > inode->i_size)
-		truncate_pagecache(inode, inode->i_size);
-}
-
-int exofs_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata)
-{
-	int ret = 0;
-	struct page *page;
-
-	page = *pagep;
-	if (page == NULL) {
-		page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
-						   flags);
-		if (!page) {
-			EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
-			return -ENOMEM;
-		}
-		*pagep = page;
-	}
-
-	 /* read modify write */
-	if (!PageUptodate(page) && (len != PAGE_SIZE)) {
-		loff_t i_size = i_size_read(mapping->host);
-		pgoff_t end_index = i_size >> PAGE_SHIFT;
-
-		if (page->index > end_index) {
-			clear_highpage(page);
-			SetPageUptodate(page);
-		} else {
-			ret = _readpage(page, true);
-			if (ret) {
-				unlock_page(page);
-				EXOFS_DBGMSG("__readpage failed\n");
-			}
-		}
-	}
-	return ret;
-}
-
-static int exofs_write_begin_export(struct file *file,
-		struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata)
-{
-	*pagep = NULL;
-
-	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
-					fsdata);
-}
-
-static int exofs_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata)
-{
-	struct inode *inode = mapping->host;
-	loff_t last_pos = pos + copied;
-
-	if (!PageUptodate(page)) {
-		if (copied < len) {
-			_write_failed(inode, pos + len);
-			copied = 0;
-			goto out;
-		}
-		SetPageUptodate(page);
-	}
-	if (last_pos > inode->i_size) {
-		i_size_write(inode, last_pos);
-		mark_inode_dirty(inode);
-	}
-	set_page_dirty(page);
-out:
-	unlock_page(page);
-	put_page(page);
-	return copied;
-}
-
-static int exofs_releasepage(struct page *page, gfp_t gfp)
-{
-	EXOFS_DBGMSG("page 0x%lx\n", page->index);
-	WARN_ON(1);
-	return 0;
-}
-
-static void exofs_invalidatepage(struct page *page, unsigned int offset,
-				 unsigned int length)
-{
-	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
-		     page->index, offset, length);
-	WARN_ON(1);
-}
-
-
- /* TODO: Should be easy enough to do properly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	return 0;
-}
-
-const struct address_space_operations exofs_aops = {
-	.readpage	= exofs_readpage,
-	.readpages	= exofs_readpages,
-	.writepage	= NULL,
-	.writepages	= exofs_writepages,
-	.write_begin	= exofs_write_begin_export,
-	.write_end	= exofs_write_end,
-	.releasepage	= exofs_releasepage,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
-	.invalidatepage = exofs_invalidatepage,
-
-	/* Not implemented yet */
-	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
-	.direct_IO	= exofs_direct_IO,
-
-	/* For these, NULL has a special meaning or the default is not exported */
-	.migratepage	= NULL,
-	.launder_page	= NULL,
-	.is_partially_uptodate = NULL,
-	.error_remove_page = NULL,
-};
-
-/******************************************************************************
- * INODE OPERATIONS
- *****************************************************************************/
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static inline int exofs_inode_is_fast_symlink(struct inode *inode)
-{
-	struct exofs_i_info *oi = exofs_i(inode);
-
-	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
-}
-
-static int _do_truncate(struct inode *inode, loff_t newsize)
-{
-	struct exofs_i_info *oi = exofs_i(inode);
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-	int ret;
-
-	inode->i_mtime = inode->i_ctime = current_time(inode);
-
-	ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
-	if (likely(!ret))
-		truncate_setsize(inode, newsize);
-
-	EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
-		     inode->i_ino, newsize, ret);
-	return ret;
-}
-
-/*
- * Set inode attributes - update size attribute on OSD if needed,
- *                        otherwise just call generic functions.
- */
-int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct inode *inode = d_inode(dentry);
-	int error;
-
-	/* if we are about to modify an object, and it hasn't been
-	 * created yet, wait
-	 */
-	error = wait_obj_created(exofs_i(inode));
-	if (unlikely(error))
-		return error;
-
-	error = setattr_prepare(dentry, iattr);
-	if (unlikely(error))
-		return error;
-
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(inode)) {
-		error = _do_truncate(inode, iattr->ia_size);
-		if (unlikely(error))
-			return error;
-	}
-
-	setattr_copy(inode, iattr);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
-static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
-	EXOFS_APAGE_FS_DATA,
-	EXOFS_ATTR_INODE_FILE_LAYOUT,
-	0);
-static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
-	EXOFS_APAGE_FS_DATA,
-	EXOFS_ATTR_INODE_DIR_LAYOUT,
-	0);
-
-/*
- * Read the Linux inode info from the OSD, and return it as is. In exofs the
- * inode info is in an application specific page/attribute of the osd-object.
- */
-static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode)
-{
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_attr attrs[] = {
-		[0] = g_attr_inode_data,
-		[1] = g_attr_inode_file_layout,
-		[2] = g_attr_inode_dir_layout,
-	};
-	struct ore_io_state *ios;
-	struct exofs_on_disk_inode_layout *layout;
-	int ret;
-
-	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
-		return ret;
-	}
-
-	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
-	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
-
-	ios->in_attr = attrs;
-	ios->in_attr_len = ARRAY_SIZE(attrs);
-
-	ret = ore_read(ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
-			  _LLU(oi->one_comp.obj.id), ret);
-		memset(inode, 0, sizeof(*inode));
-		inode->i_mode = 0040000 | (0777 & ~022);
-		/* If the object is lost on the target we might as well enable
-		 * its delete.
-		 */
-		ret = 0;
-		goto out;
-	}
-
-	ret = extract_attr_from_ios(ios, &attrs[0]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
-		goto out;
-	}
-	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
-	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-
-	ret = extract_attr_from_ios(ios, &attrs[1]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
-		goto out;
-	}
-	if (attrs[1].len) {
-		layout = attrs[1].val_ptr;
-		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
-			EXOFS_ERR("%s: unsupported files layout %d\n",
-				__func__, layout->gen_func);
-			ret = -ENOTSUPP;
-			goto out;
-		}
-	}
-
-	ret = extract_attr_from_ios(ios, &attrs[2]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
-		goto out;
-	}
-	if (attrs[2].len) {
-		layout = attrs[2].val_ptr;
-		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
-			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
-				__func__, layout->gen_func);
-			ret = -ENOTSUPP;
-			goto out;
-		}
-	}
-
-out:
-	ore_put_io_state(ios);
-	return ret;
-}
-
-static void __oi_init(struct exofs_i_info *oi)
-{
-	init_waitqueue_head(&oi->i_wq);
-	oi->i_flags = 0;
-}
-/*
- * Fill in an inode read from the OSD and set it up for use
- */
-struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
-{
-	struct exofs_i_info *oi;
-	struct exofs_fcb fcb;
-	struct inode *inode;
-	int ret;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
-	oi = exofs_i(inode);
-	__oi_init(oi);
-	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
-			 exofs_oi_objno(oi));
-
-	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb);
-	if (ret)
-		goto bad_inode;
-
-	set_obj_created(oi);
-
-	/* copy stuff from on-disk struct to in-memory struct */
-	inode->i_mode = le16_to_cpu(fcb.i_mode);
-	i_uid_write(inode, le32_to_cpu(fcb.i_uid));
-	i_gid_write(inode, le32_to_cpu(fcb.i_gid));
-	set_nlink(inode, le16_to_cpu(fcb.i_links_count));
-	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
-	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
-	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
-	inode->i_ctime.tv_nsec =
-		inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
-	oi->i_commit_size = le64_to_cpu(fcb.i_size);
-	i_size_write(inode, oi->i_commit_size);
-	inode->i_blkbits = EXOFS_BLKSHIFT;
-	inode->i_generation = le32_to_cpu(fcb.i_generation);
-
-	oi->i_dir_start_lookup = 0;
-
-	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
-		ret = -ESTALE;
-		goto bad_inode;
-	}
-
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		if (fcb.i_data[0])
-			inode->i_rdev =
-				old_decode_dev(le32_to_cpu(fcb.i_data[0]));
-		else
-			inode->i_rdev =
-				new_decode_dev(le32_to_cpu(fcb.i_data[1]));
-	} else {
-		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
-	}
-
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &exofs_file_inode_operations;
-		inode->i_fop = &exofs_file_operations;
-		inode->i_mapping->a_ops = &exofs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &exofs_dir_inode_operations;
-		inode->i_fop = &exofs_dir_operations;
-		inode->i_mapping->a_ops = &exofs_aops;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (exofs_inode_is_fast_symlink(inode)) {
-			inode->i_op = &simple_symlink_inode_operations;
-			inode->i_link = (char *)oi->i_data;
-		} else {
-			inode->i_op = &page_symlink_inode_operations;
-			inode_nohighmem(inode);
-			inode->i_mapping->a_ops = &exofs_aops;
-		}
-	} else {
-		inode->i_op = &exofs_special_inode_operations;
-		if (fcb.i_data[0])
-			init_special_inode(inode, inode->i_mode,
-			   old_decode_dev(le32_to_cpu(fcb.i_data[0])));
-		else
-			init_special_inode(inode, inode->i_mode,
-			   new_decode_dev(le32_to_cpu(fcb.i_data[1])));
-	}
-
-	unlock_new_inode(inode);
-	return inode;
-
-bad_inode:
-	iget_failed(inode);
-	return ERR_PTR(ret);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi)
-{
-	if (!obj_created(oi)) {
-		EXOFS_DBGMSG("!obj_created\n");
-		BUG_ON(!obj_2bcreated(oi));
-		wait_event(oi->i_wq, obj_created(oi));
-		EXOFS_DBGMSG("wait_event done\n");
-	}
-	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
-}
-
-/*
- * Callback function from exofs_new_inode().  The important thing is that we
- * set the obj_created flag so that other methods know that the object exists on
- * the OSD.
- */
-static void create_done(struct ore_io_state *ios, void *p)
-{
-	struct inode *inode = p;
-	struct exofs_i_info *oi = exofs_i(inode);
-	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-	int ret;
-
-	ret = ore_check_io(ios, NULL);
-	ore_put_io_state(ios);
-
-	atomic_dec(&sbi->s_curr_pending);
-
-	if (unlikely(ret)) {
-		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
-			  _LLU(exofs_oi_objno(oi)),
-			  _LLU(oi->one_comp.obj.partition));
-		/* TODO: When the FS is corrupted, creation can fail because the
-		 * object already exists. Get rid of this asynchronous creation;
-		 * if the object exists, increment the obj counter and try the
-		 * next object until we succeed. All these dangling objects will
-		 * be made into lost files by chkfs.exofs.
-		 */
-	}
-
-	set_obj_created(oi);
-
-	wake_up(&oi->i_wq);
-}
-
-/*
- * Set up a new inode and create an object for it on the OSD
- */
-struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct inode *inode;
-	struct exofs_i_info *oi;
-	struct ore_io_state *ios;
-	int ret;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	oi = exofs_i(inode);
-	__oi_init(oi);
-
-	set_obj_2bcreated(oi);
-
-	inode_init_owner(inode, dir, mode);
-	inode->i_ino = sbi->s_nextid++;
-	inode->i_blkbits = EXOFS_BLKSHIFT;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
-	oi->i_commit_size = inode->i_size = 0;
-	spin_lock(&sbi->s_next_gen_lock);
-	inode->i_generation = sbi->s_next_generation++;
-	spin_unlock(&sbi->s_next_gen_lock);
-	insert_inode_hash(inode);
-
-	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
-			 exofs_oi_objno(oi));
-	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
-
-	mark_inode_dirty(inode);
-
-	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
-		return ERR_PTR(ret);
-	}
-
-	ios->done = create_done;
-	ios->private = inode;
-
-	ret = ore_create(ios);
-	if (ret) {
-		ore_put_io_state(ios);
-		return ERR_PTR(ret);
-	}
-	atomic_inc(&sbi->s_curr_pending);
-
-	return inode;
-}
-
-/*
- * struct to pass two arguments to update_inode's callback
- */
-struct updatei_args {
-	struct exofs_sb_info	*sbi;
-	struct exofs_fcb	fcb;
-};
-
-/*
- * Callback function from exofs_update_inode().
- */
-static void updatei_done(struct ore_io_state *ios, void *p)
-{
-	struct updatei_args *args = p;
-
-	ore_put_io_state(ios);
-
-	atomic_dec(&args->sbi->s_curr_pending);
-
-	kfree(args);
-}
-
-/*
- * Write the inode to the OSD.  Just fill up the struct, and set the attribute
- * synchronously or asynchronously depending on the do_sync flag.
- */
-static int exofs_update_inode(struct inode *inode, int do_sync)
-{
-	struct exofs_i_info *oi = exofs_i(inode);
-	struct super_block *sb = inode->i_sb;
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct ore_io_state *ios;
-	struct osd_attr attr;
-	struct exofs_fcb *fcb;
-	struct updatei_args *args;
-	int ret;
-
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args) {
-		EXOFS_DBGMSG("Failed kzalloc of args\n");
-		return -ENOMEM;
-	}
-
-	fcb = &args->fcb;
-
-	fcb->i_mode = cpu_to_le16(inode->i_mode);
-	fcb->i_uid = cpu_to_le32(i_uid_read(inode));
-	fcb->i_gid = cpu_to_le32(i_gid_read(inode));
-	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
-	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
-	oi->i_commit_size = i_size_read(inode);
-	fcb->i_size = cpu_to_le64(oi->i_commit_size);
-	fcb->i_generation = cpu_to_le32(inode->i_generation);
-
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		if (old_valid_dev(inode->i_rdev)) {
-			fcb->i_data[0] =
-				cpu_to_le32(old_encode_dev(inode->i_rdev));
-			fcb->i_data[1] = 0;
-		} else {
-			fcb->i_data[0] = 0;
-			fcb->i_data[1] =
-				cpu_to_le32(new_encode_dev(inode->i_rdev));
-			fcb->i_data[2] = 0;
-		}
-	} else
-		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-
-	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
-		goto free_args;
-	}
-
-	attr = g_attr_inode_data;
-	attr.val_ptr = fcb;
-	ios->out_attr_len = 1;
-	ios->out_attr = &attr;
-
-	wait_obj_created(oi);
-
-	if (!do_sync) {
-		args->sbi = sbi;
-		ios->done = updatei_done;
-		ios->private = args;
-	}
-
-	ret = ore_write(ios);
-	if (!do_sync && !ret) {
-		atomic_inc(&sbi->s_curr_pending);
-		goto out; /* deallocation in updatei_done */
-	}
-
-	ore_put_io_state(ios);
-free_args:
-	kfree(args);
-out:
-	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
-		     inode->i_ino, do_sync, ret);
-	return ret;
-}
-
-int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	/* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
-	return exofs_update_inode(inode, 1);
-}
-
-/*
- * Callback function from exofs_evict_inode() - we don't have much cleaning
- * up to do.
- */
-static void delete_done(struct ore_io_state *ios, void *p)
-{
-	struct exofs_sb_info *sbi = p;
-
-	ore_put_io_state(ios);
-
-	atomic_dec(&sbi->s_curr_pending);
-}
-
-/*
- * Called when the refcount of an inode reaches zero.  We remove the object
- * from the OSD here.  We make sure the object was created before we try to
- * delete it.
- */
-void exofs_evict_inode(struct inode *inode)
-{
-	struct exofs_i_info *oi = exofs_i(inode);
-	struct super_block *sb = inode->i_sb;
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct ore_io_state *ios;
-	int ret;
-
-	truncate_inode_pages_final(&inode->i_data);
-
-	/* TODO: should do better here */
-	if (inode->i_nlink || is_bad_inode(inode))
-		goto no_delete;
-
-	inode->i_size = 0;
-	clear_inode(inode);
-
-	/* if we are deleting an obj that hasn't been created yet, wait.
-	 * This also makes sure that create_done cannot be called with an
-	 * already evicted inode.
-	 */
-	wait_obj_created(oi);
-	/* ignore the error, attempt a remove anyway */
-
-	/* Now Remove the OSD objects */
-	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
-		return;
-	}
-
-	ios->done = delete_done;
-	ios->private = sbi;
-
-	ret = ore_remove(ios);
-	if (ret) {
-		EXOFS_ERR("%s: ore_remove failed\n", __func__);
-		ore_put_io_state(ios);
-		return;
-	}
-	atomic_inc(&sbi->s_curr_pending);
-
-	return;
-
-no_delete:
-	clear_inode(inode);
-}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
deleted file mode 100644
index 7295cd7..0000000
--- a/fs/exofs/namei.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "exofs.h"
-
-static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
-{
-	int err = exofs_add_link(dentry, inode);
-	if (!err) {
-		d_instantiate(dentry, inode);
-		return 0;
-	}
-	inode_dec_link_count(inode);
-	iput(inode);
-	return err;
-}
-
-static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
-				   unsigned int flags)
-{
-	struct inode *inode;
-	ino_t ino;
-
-	if (dentry->d_name.len > EXOFS_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	ino = exofs_inode_by_name(dir, dentry);
-	inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
-	return d_splice_alias(inode, dentry);
-}
-
-static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			 bool excl)
-{
-	struct inode *inode = exofs_new_inode(dir, mode);
-	int err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		inode->i_op = &exofs_file_inode_operations;
-		inode->i_fop = &exofs_file_operations;
-		inode->i_mapping->a_ops = &exofs_aops;
-		mark_inode_dirty(inode);
-		err = exofs_add_nondir(dentry, inode);
-	}
-	return err;
-}
-
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       dev_t rdev)
-{
-	struct inode *inode;
-	int err;
-
-	inode = exofs_new_inode(dir, mode);
-	err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		init_special_inode(inode, inode->i_mode, rdev);
-		mark_inode_dirty(inode);
-		err = exofs_add_nondir(dentry, inode);
-	}
-	return err;
-}
-
-static int exofs_symlink(struct inode *dir, struct dentry *dentry,
-			  const char *symname)
-{
-	struct super_block *sb = dir->i_sb;
-	int err = -ENAMETOOLONG;
-	unsigned l = strlen(symname)+1;
-	struct inode *inode;
-	struct exofs_i_info *oi;
-
-	if (l > sb->s_blocksize)
-		goto out;
-
-	inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out;
-
-	oi = exofs_i(inode);
-	if (l > sizeof(oi->i_data)) {
-		/* slow symlink */
-		inode->i_op = &page_symlink_inode_operations;
-		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &exofs_aops;
-		memset(oi->i_data, 0, sizeof(oi->i_data));
-
-		err = page_symlink(inode, symname, l);
-		if (err)
-			goto out_fail;
-	} else {
-		/* fast symlink */
-		inode->i_op = &simple_symlink_inode_operations;
-		inode->i_link = (char *)oi->i_data;
-		memcpy(oi->i_data, symname, l);
-		inode->i_size = l-1;
-	}
-	mark_inode_dirty(inode);
-
-	err = exofs_add_nondir(dentry, inode);
-out:
-	return err;
-
-out_fail:
-	inode_dec_link_count(inode);
-	iput(inode);
-	goto out;
-}
-
-static int exofs_link(struct dentry *old_dentry, struct inode *dir,
-		struct dentry *dentry)
-{
-	struct inode *inode = d_inode(old_dentry);
-
-	inode->i_ctime = current_time(inode);
-	inode_inc_link_count(inode);
-	ihold(inode);
-
-	return exofs_add_nondir(dentry, inode);
-}
-
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	struct inode *inode;
-	int err;
-
-	inode_inc_link_count(dir);
-
-	inode = exofs_new_inode(dir, S_IFDIR | mode);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out_dir;
-
-	inode->i_op = &exofs_dir_inode_operations;
-	inode->i_fop = &exofs_dir_operations;
-	inode->i_mapping->a_ops = &exofs_aops;
-
-	inode_inc_link_count(inode);
-
-	err = exofs_make_empty(inode, dir);
-	if (err)
-		goto out_fail;
-
-	err = exofs_add_link(dentry, inode);
-	if (err)
-		goto out_fail;
-
-	d_instantiate(dentry, inode);
-out:
-	return err;
-
-out_fail:
-	inode_dec_link_count(inode);
-	inode_dec_link_count(inode);
-	iput(inode);
-out_dir:
-	inode_dec_link_count(dir);
-	goto out;
-}
-
-static int exofs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-	struct exofs_dir_entry *de;
-	struct page *page;
-	int err = -ENOENT;
-
-	de = exofs_find_entry(dir, dentry, &page);
-	if (!de)
-		goto out;
-
-	err = exofs_delete_entry(de, page);
-	if (err)
-		goto out;
-
-	inode->i_ctime = dir->i_ctime;
-	inode_dec_link_count(inode);
-	err = 0;
-out:
-	return err;
-}
-
-static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-	int err = -ENOTEMPTY;
-
-	if (exofs_empty_dir(inode)) {
-		err = exofs_unlink(dir, dentry);
-		if (!err) {
-			inode->i_size = 0;
-			inode_dec_link_count(inode);
-			inode_dec_link_count(dir);
-		}
-	}
-	return err;
-}
-
-static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry,
-			unsigned int flags)
-{
-	struct inode *old_inode = d_inode(old_dentry);
-	struct inode *new_inode = d_inode(new_dentry);
-	struct page *dir_page = NULL;
-	struct exofs_dir_entry *dir_de = NULL;
-	struct page *old_page;
-	struct exofs_dir_entry *old_de;
-	int err = -ENOENT;
-
-	if (flags & ~RENAME_NOREPLACE)
-		return -EINVAL;
-
-	old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
-	if (!old_de)
-		goto out;
-
-	if (S_ISDIR(old_inode->i_mode)) {
-		err = -EIO;
-		dir_de = exofs_dotdot(old_inode, &dir_page);
-		if (!dir_de)
-			goto out_old;
-	}
-
-	if (new_inode) {
-		struct page *new_page;
-		struct exofs_dir_entry *new_de;
-
-		err = -ENOTEMPTY;
-		if (dir_de && !exofs_empty_dir(new_inode))
-			goto out_dir;
-
-		err = -ENOENT;
-		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
-		if (!new_de)
-			goto out_dir;
-		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
-		new_inode->i_ctime = current_time(new_inode);
-		if (dir_de)
-			drop_nlink(new_inode);
-		inode_dec_link_count(new_inode);
-		if (err)
-			goto out_dir;
-	} else {
-		err = exofs_add_link(new_dentry, old_inode);
-		if (err)
-			goto out_dir;
-		if (dir_de)
-			inode_inc_link_count(new_dir);
-	}
-
-	old_inode->i_ctime = current_time(old_inode);
-
-	exofs_delete_entry(old_de, old_page);
-	mark_inode_dirty(old_inode);
-
-	if (dir_de) {
-		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
-		inode_dec_link_count(old_dir);
-		if (err)
-			goto out_dir;
-	}
-	return 0;
-
-
-out_dir:
-	if (dir_de) {
-		kunmap(dir_page);
-		put_page(dir_page);
-	}
-out_old:
-	kunmap(old_page);
-	put_page(old_page);
-out:
-	return err;
-}
-
-const struct inode_operations exofs_dir_inode_operations = {
-	.create 	= exofs_create,
-	.lookup 	= exofs_lookup,
-	.link   	= exofs_link,
-	.unlink 	= exofs_unlink,
-	.symlink	= exofs_symlink,
-	.mkdir  	= exofs_mkdir,
-	.rmdir  	= exofs_rmdir,
-	.mknod  	= exofs_mknod,
-	.rename		= exofs_rename,
-	.setattr	= exofs_setattr,
-};
-
-const struct inode_operations exofs_special_inode_operations = {
-	.setattr	= exofs_setattr,
-};
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
deleted file mode 100644
index 5331a15..0000000
--- a/fs/exofs/ore.c
+++ /dev/null
@@ -1,1178 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <asm/div64.h>
-#include <linux/lcm.h>
-
-#include "ore_raid.h"
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
-
-/* ore_verify_layout does a couple of things:
- * 1. Given a minimum number of needed parameters, fixes up the rest of the
- *    members to be operational for the ORE. The needed parameters are those
- *    that are defined by the pnfs-objects layout STD.
- * 2. Checks that the current ORE code actually supports these parameters;
- *    for example, stripe_unit must be a multiple of the system PAGE_SIZE,
- *    etc.
- * 3. Caches some heavily used calculations that will be needed by users.
- */
-
-enum { BIO_MAX_PAGES_KMALLOC =
-		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-
-int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
-{
-	u64 stripe_length;
-
-	switch (layout->raid_algorithm) {
-	case PNFS_OSD_RAID_0:
-		layout->parity = 0;
-		break;
-	case PNFS_OSD_RAID_5:
-		layout->parity = 1;
-		break;
-	case PNFS_OSD_RAID_PQ:
-		layout->parity = 2;
-		break;
-	case PNFS_OSD_RAID_4:
-	default:
-		ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
-			layout->raid_algorithm);
-		return -EINVAL;
-	}
-	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
-		ORE_ERR("Stripe Unit(0x%llx)"
-			  " must be Multples of PAGE_SIZE(0x%lx)\n",
-			  _LLU(layout->stripe_unit), PAGE_SIZE);
-		return -EINVAL;
-	}
-	if (layout->group_width) {
-		if (!layout->group_depth) {
-			ORE_ERR("group_depth == 0 && group_width != 0\n");
-			return -EINVAL;
-		}
-		if (total_comps < (layout->group_width * layout->mirrors_p1)) {
-			ORE_ERR("Data Map wrong, "
-				"numdevs=%d < group_width=%d * mirrors=%d\n",
-				total_comps, layout->group_width,
-				layout->mirrors_p1);
-			return -EINVAL;
-		}
-		layout->group_count = total_comps / layout->mirrors_p1 /
-						layout->group_width;
-	} else {
-		if (layout->group_depth) {
-			printk(KERN_NOTICE "Warning: group_depth ignored "
-				"group_width == 0 && group_depth == %lld\n",
-				_LLU(layout->group_depth));
-		}
-		layout->group_width = total_comps / layout->mirrors_p1;
-		layout->group_depth = -1;
-		layout->group_count = 1;
-	}
-
-	stripe_length = (u64)layout->group_width * layout->stripe_unit;
-	if (stripe_length >= (1ULL << 32)) {
-		ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
-			_LLU(stripe_length));
-		return -EINVAL;
-	}
-
-	layout->max_io_length =
-		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
-					(layout->group_width - layout->parity);
-	if (layout->parity) {
-		unsigned stripe_length =
-				(layout->group_width - layout->parity) *
-				layout->stripe_unit;
-
-		layout->max_io_length /= stripe_length;
-		layout->max_io_length *= stripe_length;
-	}
-	ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
-
-	return 0;
-}
-EXPORT_SYMBOL(ore_verify_layout);
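-
-/* Worked example for the max_io_length math above, assuming PAGE_SIZE = 4K,
- * stripe_unit = 64K, group_width = 10 and RAID_5 (parity = 1):
- * max_io_length = (BIO_MAX_PAGES_KMALLOC * 4K - 64K) * 9, then rounded down
- * to a whole number of 9 * 64K data stripes so no IO ever ends mid-stripe.
- */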
-
-static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
-{
-	return ios->oc->comps[index & ios->oc->single_comp].cred;
-}
-
-static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
-{
-	return &ios->oc->comps[index & ios->oc->single_comp].obj;
-}
-
-static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
-{
-	ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
-		    ios->oc->first_dev, ios->oc->numdevs, index,
-		    ios->oc->ods);
-
-	return ore_comp_dev(ios->oc, index);
-}
-
-int  _ore_get_io_state(struct ore_layout *layout,
-			struct ore_components *oc, unsigned numdevs,
-			unsigned sgs_per_dev, unsigned num_par_pages,
-			struct ore_io_state **pios)
-{
-	struct ore_io_state *ios;
-	size_t size_ios, size_extra, size_total;
-	void *ios_extra;
-
-	/*
-	 * The desired layout looks like this, with the extra_allocation
-	 * items pointed at from fields within ios or per_dev:
-
-	struct __alloc_all_io_state {
-		struct ore_io_state ios;
-		struct ore_per_dev_state per_dev[numdevs];
-		union {
-			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
-			struct page *pages[num_par_pages];
-		} extra_allocation;
-	} whole_allocation;
-
-	*/
-
-	/* This should never happen, so abort early if it ever does. */
-	if (sgs_per_dev && num_par_pages) {
-		ORE_DBGMSG("Tried to use both pages and sglist\n");
-		*pios = NULL;
-		return -EINVAL;
-	}
-
-	if (numdevs > (INT_MAX - sizeof(*ios)) /
-		       sizeof(struct ore_per_dev_state))
-		return -ENOMEM;
-	size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs;
-
-	if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry))
-		return -ENOMEM;
-	if (num_par_pages > INT_MAX / sizeof(struct page *))
-		return -ENOMEM;
-	size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs),
-			 sizeof(struct page *) * num_par_pages);
-
-	size_total = size_ios + size_extra;
-
-	if (likely(size_total <= PAGE_SIZE)) {
-		ios = kzalloc(size_total, GFP_KERNEL);
-		if (unlikely(!ios)) {
-			ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total);
-			*pios = NULL;
-			return -ENOMEM;
-		}
-		ios_extra = (char *)ios + size_ios;
-	} else {
-		ios = kzalloc(size_ios, GFP_KERNEL);
-		if (unlikely(!ios)) {
-			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
-				   size_ios);
-			*pios = NULL;
-			return -ENOMEM;
-		}
-		ios_extra = kzalloc(size_extra, GFP_KERNEL);
-		if (unlikely(!ios_extra)) {
-			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
-				   size_extra);
-			kfree(ios);
-			*pios = NULL;
-			return -ENOMEM;
-		}
-
-		/* In this case the per_dev[0].sglist holds the pointer to
-		 * be freed
-		 */
-		ios->extra_part_alloc = true;
-	}
-
-	if (num_par_pages) {
-		ios->parity_pages = ios_extra;
-		ios->max_par_pages = num_par_pages;
-	}
-	if (sgs_per_dev) {
-		struct osd_sg_entry *sgilist = ios_extra;
-		unsigned d;
-
-		for (d = 0; d < numdevs; ++d) {
-			ios->per_dev[d].sglist = sgilist;
-			sgilist += sgs_per_dev;
-		}
-		ios->sgs_per_dev = sgs_per_dev;
-	}
-
-	ios->layout = layout;
-	ios->oc = oc;
-	*pios = ios;
-	return 0;
-}
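
/* Editor's sketch (not part of the original file): the co-allocation
 * pattern _ore_get_io_state() uses above -- one kzalloc carrying the
 * header, the per-device array, and the "extra" area, with pointers fixed
 * up into the trailing space. Generic userspace C with hypothetical names.
 */
#include <stdlib.h>

struct packed_hdr {
	size_t nitems;
	int *items;		/* points into the same allocation */
};

static struct packed_hdr *alloc_packed(size_t nitems)
{
	size_t size_hdr = sizeof(struct packed_hdr);
	size_t size_total = size_hdr + nitems * sizeof(int);
	struct packed_hdr *h = calloc(1, size_total);	/* one allocation */

	if (!h)
		return NULL;
	h->nitems = nitems;
	h->items = (int *)((char *)h + size_hdr);	/* trailing area */
	return h;	/* a single free(h) releases header and items */
}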
-
-/* Allocate an io_state for only a single group of devices
- *
- * If a user needs to call ore_read/write() this version must be used because
- * it allocates extra state for striping and raid.
- * The ORE might decide to IO less than @length bytes due to alignments
- * and constraints, as follows:
- * - The IO cannot cross a group boundary.
- * - In raid5/6 the end of the IO must align with the end of a stripe, e.g.
- *   (@offset + @length) % stripe_size == 0, or the complete range must lie
- *   within a single stripe.
- * - Memory conditions may only permit a shorter IO. (A user can pass
- *   @length=~0 and check the returned ios->length for max_io_size.)
- *
- * The caller must check the returned ios->length (and/or ios->nr_pages) and
- * re-issue the pages that fall outside of ios->length.
- */
-int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
-		      bool is_reading, u64 offset, u64 length,
-		      struct ore_io_state **pios)
-{
-	struct ore_io_state *ios;
-	unsigned numdevs = layout->group_width * layout->mirrors_p1;
-	unsigned sgs_per_dev = 0, max_par_pages = 0;
-	int ret;
-
-	if (layout->parity && length) {
-		unsigned data_devs = layout->group_width - layout->parity;
-		unsigned stripe_size = layout->stripe_unit * data_devs;
-		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-		u32 remainder;
-		u64 num_stripes;
-		u64 num_raid_units;
-
-		num_stripes = div_u64_rem(length, stripe_size, &remainder);
-		if (remainder)
-			++num_stripes;
-
-		num_raid_units =  num_stripes * layout->parity;
-
-		if (is_reading) {
-			/* For reads add per_dev sglist array */
-			/* TODO: For RAID-6 we need twice as many. Actually:
-			*         num_stripes / LCMdP(W,P);
-			*         if (W%P != 0) num_stripes *= parity;
-			*/
-
-			/* first/last seg is split */
-			num_raid_units += layout->group_width;
-			sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
-		} else {
-			/* For Writes add parity pages array. */
-			max_par_pages = num_raid_units * pages_in_unit *
-						sizeof(struct page *);
-		}
-	}
-
-	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
-				pios);
-	if (unlikely(ret))
-		return ret;
-
-	ios = *pios;
-	ios->reading = is_reading;
-	ios->offset = offset;
-
-	if (length) {
-		ore_calc_stripe_info(layout, offset, length, &ios->si);
-		ios->length = ios->si.length;
-		ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
-				 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
-		if (layout->parity)
-			_ore_post_alloc_raid_stuff(ios);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(ore_get_rw_state);
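
/* Editor's sketch (not part of the original file): the sizing math of
 * ore_get_rw_state() above -- how many parity units an IO of @length bytes
 * touches, and the resulting per-device sg count (reads) or parity page
 * count (writes). The kernel code additionally folds sizeof(struct page *)
 * into the write-side figure; this sketch keeps plain counts.
 */
#include <stdint.h>

static void rw_extra_sizes(uint64_t length, unsigned stripe_unit,
			   unsigned group_width, unsigned parity,
			   unsigned page_size, int is_reading,
			   uint64_t *sgs_per_dev, uint64_t *par_pages)
{
	unsigned data_devs = group_width - parity;
	uint64_t stripe_size = (uint64_t)stripe_unit * data_devs;
	uint64_t num_stripes = (length + stripe_size - 1) / stripe_size;
	uint64_t num_raid_units = num_stripes * parity;

	*sgs_per_dev = 0;
	*par_pages = 0;
	if (is_reading) {
		/* the first/last segment on a device may be split in two */
		num_raid_units += group_width;
		*sgs_per_dev = num_raid_units / data_devs + 2;
	} else {
		*par_pages = num_raid_units * (stripe_unit / page_size);
	}
}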
-
-/* Allocate an io_state for all the devices in the comps array
- *
- * This version of io_state allocation is used mostly by create/remove
- * and trunc, where we currently need all the devices. The only wasteful
- * bit is the read/write_attributes with no IO. Those sites should
- * be converted to use ore_get_rw_state() with length=0.
- */
-int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
-		      struct ore_io_state **pios)
-{
-	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
-}
-EXPORT_SYMBOL(ore_get_io_state);
-
-void ore_put_io_state(struct ore_io_state *ios)
-{
-	if (ios) {
-		unsigned i;
-
-		for (i = 0; i < ios->numdevs; i++) {
-			struct ore_per_dev_state *per_dev = &ios->per_dev[i];
-
-			if (per_dev->or)
-				osd_end_request(per_dev->or);
-			if (per_dev->bio)
-				bio_put(per_dev->bio);
-		}
-
-		_ore_free_raid_stuff(ios);
-		kfree(ios);
-	}
-}
-EXPORT_SYMBOL(ore_put_io_state);
-
-static void _sync_done(struct ore_io_state *ios, void *p)
-{
-	struct completion *waiting = p;
-
-	complete(waiting);
-}
-
-static void _last_io(struct kref *kref)
-{
-	struct ore_io_state *ios = container_of(
-					kref, struct ore_io_state, kref);
-
-	ios->done(ios, ios->private);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
-	struct ore_io_state *ios = p;
-
-	kref_put(&ios->kref, _last_io);
-}
-
-int ore_io_execute(struct ore_io_state *ios)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	bool sync = (ios->done == NULL);
-	int i, ret;
-
-	if (sync) {
-		ios->done = _sync_done;
-		ios->private = &wait;
-	}
-
-	for (i = 0; i < ios->numdevs; i++) {
-		struct osd_request *or = ios->per_dev[i].or;
-		if (unlikely(!or))
-			continue;
-
-		ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
-		if (unlikely(ret)) {
-			ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
-				     ret);
-			return ret;
-		}
-	}
-
-	kref_init(&ios->kref);
-
-	for (i = 0; i < ios->numdevs; i++) {
-		struct osd_request *or = ios->per_dev[i].or;
-		if (unlikely(!or))
-			continue;
-
-		kref_get(&ios->kref);
-		osd_execute_request_async(or, _done_io, ios);
-	}
-
-	kref_put(&ios->kref, _last_io);
-	ret = 0;
-
-	if (sync) {
-		wait_for_completion(&wait);
-		ret = ore_check_io(ios, NULL);
-	}
-	return ret;
-}
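
/* Editor's sketch (not part of the original file): ore_io_execute() above
 * uses the classic kref fan-out/fan-in pattern -- init the count to 1, take
 * one reference per submitted sub-request, then drop the initial reference,
 * so the completion callback fires exactly once after the last sub-request
 * ends. A userspace rendering with plain C11 atomics, hypothetical names:
 */
#include <stdatomic.h>
#include <stdio.h>

struct fanout {
	atomic_int refs;
	void (*done)(void);
};

static void put_ref(struct fanout *f)
{
	if (atomic_fetch_sub(&f->refs, 1) == 1)
		f->done();	/* the last put runs the completion */
}

static void all_done(void)
{
	printf("all sub-requests completed\n");
}

int main(void)
{
	struct fanout f = { .done = all_done };
	int nreqs = 3;

	atomic_init(&f.refs, 1);	/* the submission path holds one ref */
	for (int i = 0; i < nreqs; i++) {
		atomic_fetch_add(&f.refs, 1);	/* one ref per sub-request */
		put_ref(&f);	/* stands in for the async completion */
	}
	put_ref(&f);	/* drop the submission ref; may fire done() here */
	return 0;
}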
-
-static void _clear_bio(struct bio *bio)
-{
-	struct bio_vec *bv;
-	unsigned i;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		unsigned this_count = bv->bv_len;
-
-		if (likely(PAGE_SIZE == this_count))
-			clear_highpage(bv->bv_page);
-		else
-			zero_user(bv->bv_page, bv->bv_offset, this_count);
-	}
-}
-
-int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
-{
-	enum osd_err_priority acumulated_osd_err = 0;
-	int acumulated_lin_err = 0;
-	int i;
-
-	for (i = 0; i < ios->numdevs; i++) {
-		struct osd_sense_info osi;
-		struct ore_per_dev_state *per_dev = &ios->per_dev[i];
-		struct osd_request *or = per_dev->or;
-		int ret;
-
-		if (unlikely(!or))
-			continue;
-
-		ret = osd_req_decode_sense(or, &osi);
-		if (likely(!ret))
-			continue;
-
-		if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
-		    per_dev->bio) {
-			/* Start read offset passed end of file.
-			 * Note: if we do not have a bio it means read-attributes.
-			 * In this case we should return an error to the caller.
-			 */
-			_clear_bio(per_dev->bio);
-			ORE_DBGMSG("start read offset passed end of file "
-				"offset=0x%llx, length=0x%llx\n",
-				_LLU(per_dev->offset),
-				_LLU(per_dev->length));
-
-			continue; /* we recovered */
-		}
-
-		if (on_dev_error) {
-			u64 residual = ios->reading ?
-					or->in.residual : or->out.residual;
-			u64 offset = (ios->offset + ios->length) - residual;
-			unsigned dev = per_dev->dev - ios->oc->first_dev;
-			struct ore_dev *od = ios->oc->ods[dev];
-
-			on_dev_error(ios, od, dev, osi.osd_err_pri,
-				     offset, residual);
-		}
-		if (osi.osd_err_pri >= acumulated_osd_err) {
-			acumulated_osd_err = osi.osd_err_pri;
-			acumulated_lin_err = ret;
-		}
-	}
-
-	return acumulated_lin_err;
-}
-EXPORT_SYMBOL(ore_check_io);
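
/* Editor's sketch (not part of the original file): ore_check_io() above
 * keeps only the most severe per-device failure -- it tracks the highest
 * osd_err_priority seen and returns the errno that accompanied it. A
 * generic rendering of that accumulation, names illustrative:
 */
static int worst_error(const int *prio, const int *err, unsigned n)
{
	int acc_prio = 0, acc_err = 0;
	unsigned i;

	for (i = 0; i < n; i++) {
		if (prio[i] >= acc_prio) {	/* later equal wins, as above */
			acc_prio = prio[i];
			acc_err = err[i];
		}
	}
	return acc_err;	/* 0 when no device reported an error */
}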
-
-/*
- * L - logical offset into the file
- *
- * D - number of Data devices
- *	D = group_width - parity
- *
- * U - The number of bytes in a stripe within a group
- *	U =  stripe_unit * D
- *
- * T - The number of bytes striped within a group of component objects
- *     (before advancing to the next group)
- *	T = U * group_depth
- *
- * S - The number of bytes striped across all component objects
- *     before the pattern repeats
- *	S = T * group_count
- *
- * M - The "major" (i.e., across all components) cycle number
- *	M = L / S
- *
- * G - Counts the groups from the beginning of the major cycle
- *	G = (L - (M * S)) / T	[or (L % S) / T]
- *
- * H - The byte offset within the group
- *	H = (L - (M * S)) % T	[or (L % S) % T]
- *
- * N - The "minor" (i.e., across the group) stripe number
- *	N = H / U
- *
- * C - The component index corresponding to L
- *
- *	C = (H - (N * U)) / stripe_unit + G * D
- *	[or (L % U) / stripe_unit + G * D]
- *
- * O - The component offset corresponding to L
- *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
- *
- * LCMdP - Parity cycle: Lowest Common Multiple of group_width, parity
- *          divide by parity
- *	LCMdP = lcm(group_width, parity) / parity
- *
- * R - The parity Rotation stripe
- *     (Note parity cycle always starts at a group's boundary)
- *	R = N % LCMdP
- *
- * I = the first parity device index
- *	I = (group_width + group_width - R*parity - parity) % group_width
- *
- * Craid - The component index Rotated
- *	Craid = (group_width + C - R*parity) % group_width
- *      (We add group_width to avoid negative numbers in the modulo math)
- */
-void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			  u64 length, struct ore_striping_info *si)
-{
-	u32	stripe_unit = layout->stripe_unit;
-	u32	group_width = layout->group_width;
-	u64	group_depth = layout->group_depth;
-	u32	parity      = layout->parity;
-
-	u32	D = group_width - parity;
-	u32	U = D * stripe_unit;
-	u64	T = U * group_depth;
-	u64	S = T * layout->group_count;
-	u64	M = div64_u64(file_offset, S);
-
-	/*
-	G = (L - (M * S)) / T
-	H = (L - (M * S)) % T
-	*/
-	u64	LmodS = file_offset - M * S;
-	u32	G = div64_u64(LmodS, T);
-	u64	H = LmodS - G * T;
-
-	u32	N = div_u64(H, U);
-	u32	Nlast;
-
-	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
-	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-	u32 first_dev = C - C % group_width;
-
-	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-
-	si->obj_offset = si->unit_off + (N * stripe_unit) +
-				  (M * group_depth * stripe_unit);
-	si->cur_comp = C - first_dev;
-	si->cur_pg = si->unit_off / PAGE_SIZE;
-
-	if (parity) {
-		u32 LCMdP = lcm(group_width, parity) / parity;
-		/* R     = N % LCMdP; */
-		u32 RxP   = (N % LCMdP) * parity;
-
-		si->par_dev = (group_width + group_width - parity - RxP) %
-			      group_width + first_dev;
-		si->dev = (group_width + group_width + C - RxP) %
-			  group_width + first_dev;
-		si->bytes_in_stripe = U;
-		si->first_stripe_start = M * S + G * T + N * U;
-	} else {
-		/* Make the math come out right; see _prepare_one_group */
-		si->par_dev = group_width;
-		si->dev = C;
-	}
-
-	si->dev *= layout->mirrors_p1;
-	si->par_dev *= layout->mirrors_p1;
-	si->offset = file_offset;
-	si->length = T - H;
-	if (si->length > length)
-		si->length = length;
-
-	Nlast = div_u64(H + si->length + U - 1, U);
-	si->maxdevUnits = Nlast - N;
-
-	si->M = M;
-}
-EXPORT_SYMBOL(ore_calc_stripe_info);
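
/* Editor's sketch (not part of the original file): a worked instance of
 * the L/D/U/T/S mapping documented before ore_calc_stripe_info(), following
 * the comment's formulas. The layout values (64K stripe unit, 4-wide group,
 * no parity, depth 16, 2 groups) are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_unit = 64 << 10, group_width = 4, parity = 0;
	uint64_t group_depth = 16, group_count = 2;
	uint64_t L = 5 * (64 << 10) + 100;	/* some file offset */

	uint64_t D = group_width - parity;	/* data devices */
	uint64_t U = D * stripe_unit;		/* bytes per group stripe */
	uint64_t T = U * group_depth;		/* bytes per group */
	uint64_t S = T * group_count;		/* full pattern period */
	uint64_t M = L / S, G = (L % S) / T, H = (L % S) % T;
	uint64_t N = H / U;
	uint64_t C = (H % U) / stripe_unit + G * D;
	uint64_t O = L % stripe_unit + N * stripe_unit +
		     M * group_depth * stripe_unit;

	/* prints M=0 G=0 N=1 C=1 O=0x10064 for the offset above */
	printf("M=%llu G=%llu N=%llu C=%llu O=0x%llx\n",
	       (unsigned long long)M, (unsigned long long)G,
	       (unsigned long long)N, (unsigned long long)C,
	       (unsigned long long)O);
	return 0;
}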
-
-int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
-			 unsigned pgbase, struct page **pages,
-			 struct ore_per_dev_state *per_dev, int cur_len)
-{
-	unsigned pg = *cur_pg;
-	struct request_queue *q =
-			osd_request_queue(_ios_od(ios, per_dev->dev));
-	unsigned len = cur_len;
-	int ret;
-
-	if (per_dev->bio == NULL) {
-		unsigned bio_size;
-
-		if (!ios->reading) {
-			bio_size = ios->si.maxdevUnits;
-		} else {
-			bio_size = (ios->si.maxdevUnits + 1) *
-			     (ios->layout->group_width - ios->layout->parity) /
-			     ios->layout->group_width;
-		}
-		bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
-
-		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
-		if (unlikely(!per_dev->bio)) {
-			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
-				     bio_size);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	while (cur_len > 0) {
-		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
-		unsigned added_len;
-
-		cur_len -= pglen;
-
-		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
-					    pglen, pgbase);
-		if (unlikely(pglen != added_len)) {
-			/* If bi_vcnt == bi_max then this is a SW BUG */
-			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
-				   "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
-				   per_dev->bio->bi_vcnt,
-				   per_dev->bio->bi_max_vecs,
-				   BIO_MAX_PAGES_KMALLOC, cur_len);
-			ret = -ENOMEM;
-			goto out;
-		}
-		_add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
-
-		pgbase = 0;
-		++pg;
-	}
-	BUG_ON(cur_len);
-
-	per_dev->length += len;
-	*cur_pg = pg;
-	ret = 0;
-out:	/* We fail the complete unit on an error, e.g. we don't advance
-	 * per_dev->length and cur_pg. This means that we might have a bigger
-	 * bio than the CDB requested length (per_dev->length). That's fine,
-	 * only the opposite is fatal.
-	 */
-	return ret;
-}
-
-static int _add_parity_units(struct ore_io_state *ios,
-			     struct ore_striping_info *si,
-			     unsigned dev, unsigned first_dev,
-			     unsigned mirrors_p1, unsigned devs_in_group,
-			     unsigned cur_len)
-{
-	unsigned do_parity;
-	int ret = 0;
-
-	for (do_parity = ios->layout->parity; do_parity; --do_parity) {
-		struct ore_per_dev_state *per_dev;
-
-		per_dev = &ios->per_dev[dev - first_dev];
-		if (!per_dev->length && !per_dev->offset) {
-			/* Only/always the parity unit of the first
-			 * stripe will be empty. So this is a chance to
-			 * initialize the per_dev info.
-			 */
-			per_dev->dev = dev;
-			per_dev->offset = si->obj_offset - si->unit_off;
-		}
-
-		ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
-					   do_parity == 1);
-		if (unlikely(ret))
-			break;
-
-		if (do_parity != 1) {
-			dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
-			si->cur_comp = (si->cur_comp + 1) %
-						       ios->layout->group_width;
-		}
-	}
-
-	return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
-	struct ore_striping_info *si = &ios->si;
-	unsigned stripe_unit = ios->layout->stripe_unit;
-	unsigned mirrors_p1 = ios->layout->mirrors_p1;
-	unsigned group_width = ios->layout->group_width;
-	unsigned devs_in_group = group_width * mirrors_p1;
-	unsigned dev = si->dev;
-	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned cur_pg = ios->pages_consumed;
-	u64 length = ios->length;
-	int ret = 0;
-
-	if (!ios->pages) {
-		ios->numdevs = ios->layout->mirrors_p1;
-		return 0;
-	}
-
-	BUG_ON(length > si->length);
-
-	while (length) {
-		struct ore_per_dev_state *per_dev =
-						&ios->per_dev[dev - first_dev];
-		unsigned cur_len, page_off = 0;
-
-		if (!per_dev->length && !per_dev->offset) {
-			/* First time initialize the per_dev info. */
-			per_dev->dev = dev;
-			if (dev == si->dev) {
-				WARN_ON(dev == si->par_dev);
-				per_dev->offset = si->obj_offset;
-				cur_len = stripe_unit - si->unit_off;
-				page_off = si->unit_off & ~PAGE_MASK;
-				BUG_ON(page_off && (page_off != ios->pgbase));
-			} else {
-				per_dev->offset = si->obj_offset - si->unit_off;
-				cur_len = stripe_unit;
-			}
-		} else {
-			cur_len = stripe_unit;
-		}
-		if (cur_len >= length)
-			cur_len = length;
-
-		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
-					   per_dev, cur_len);
-		if (unlikely(ret))
-			goto out;
-
-		length -= cur_len;
-
-		dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
-		si->cur_comp = (si->cur_comp + 1) % group_width;
-		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
-			if (!length && ios->sp2d) {
-				/* If we are writing and this is the very last
-				 * stripe, then operate on the parity dev.
-				 */
-				dev = si->par_dev;
-				/* If last stripe operate on parity comp */
-				si->cur_comp = group_width - ios->layout->parity;
-			}
-
-			/* In writes cur_len just signals whether this is
-			 * the last one. See _ore_add_parity_unit.
-			 */
-			ret = _add_parity_units(ios, si, dev, first_dev,
-						mirrors_p1, devs_in_group,
-						ios->sp2d ? length : cur_len);
-			if (unlikely(ret))
-				goto out;
-
-			/* Rotate next par_dev backwards with wrapping */
-			si->par_dev = (devs_in_group + si->par_dev -
-				       ios->layout->parity * mirrors_p1) %
-				      devs_in_group + first_dev;
-			/* Next stripe, start fresh */
-			si->cur_comp = 0;
-			si->cur_pg = 0;
-			si->obj_offset += cur_len;
-			si->unit_off = 0;
-		}
-	}
-out:
-	ios->numdevs = devs_in_group;
-	ios->pages_consumed = cur_pg;
-	return ret;
-}
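
/* Editor's sketch (not part of the original file): the stripe loop above
 * advances the target device with
 * dev = ((dev + mirrors_p1) % devs_in_group) + first_dev, a round-robin
 * walk over one group that visits only mirror masters. A tiny demo of the
 * walk with invented layout numbers:
 */
#include <stdio.h>

int main(void)
{
	unsigned mirrors_p1 = 2, group_width = 3;
	unsigned devs_in_group = group_width * mirrors_p1;	/* 6 slots */
	unsigned first_dev = 6, dev = first_dev;		/* 2nd group */
	int unit;

	for (unit = 0; unit < 6; unit++) {
		printf("unit %d -> dev %u\n", unit, dev);
		dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
	}
	return 0;	/* prints 6 8 10 6 8 10: masters only, wrapping */
}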
-
-int ore_create(struct ore_io_state *ios)
-{
-	int i, ret;
-
-	for (i = 0; i < ios->oc->numdevs; i++) {
-		struct osd_request *or;
-
-		or = osd_start_request(_ios_od(ios, i));
-		if (unlikely(!or)) {
-			ORE_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
-
-		osd_req_create_object(or, _ios_obj(ios, i));
-	}
-	ret = ore_io_execute(ios);
-
-out:
-	return ret;
-}
-EXPORT_SYMBOL(ore_create);
-
-int ore_remove(struct ore_io_state *ios)
-{
-	int i, ret;
-
-	for (i = 0; i < ios->oc->numdevs; i++) {
-		struct osd_request *or;
-
-		or = osd_start_request(_ios_od(ios, i));
-		if (unlikely(!or)) {
-			ORE_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
-
-		osd_req_remove_object(or, _ios_obj(ios, i));
-	}
-	ret = ore_io_execute(ios);
-
-out:
-	return ret;
-}
-EXPORT_SYMBOL(ore_remove);
-
-static int _write_mirror(struct ore_io_state *ios, int cur_comp)
-{
-	struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
-	unsigned dev = ios->per_dev[cur_comp].dev;
-	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
-	int ret = 0;
-
-	if (ios->pages && !master_dev->length)
-		return 0; /* Just an empty slot */
-
-	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
-		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
-		struct osd_request *or;
-
-		or = osd_start_request(_ios_od(ios, dev));
-		if (unlikely(!or)) {
-			ORE_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		per_dev->or = or;
-
-		if (ios->pages) {
-			struct bio *bio;
-
-			if (per_dev != master_dev) {
-				bio = bio_clone_fast(master_dev->bio,
-						     GFP_KERNEL, NULL);
-				if (unlikely(!bio)) {
-					ORE_DBGMSG(
-					      "Failed to allocate BIO size=%u\n",
-					      master_dev->bio->bi_max_vecs);
-					ret = -ENOMEM;
-					goto out;
-				}
-
-				bio->bi_disk = NULL;
-				bio->bi_next = NULL;
-				per_dev->offset = master_dev->offset;
-				per_dev->length = master_dev->length;
-				per_dev->bio =  bio;
-				per_dev->dev = dev;
-			} else {
-				bio = master_dev->bio;
-				/* FIXME: bio_set_dir() */
-				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-			}
-
-			osd_req_write(or, _ios_obj(ios, cur_comp),
-				      per_dev->offset, bio, per_dev->length);
-			ORE_DBGMSG("write(0x%llx) offset=0x%llx "
-				      "length=0x%llx dev=%d\n",
-				     _LLU(_ios_obj(ios, cur_comp)->id),
-				     _LLU(per_dev->offset),
-				     _LLU(per_dev->length), dev);
-		} else if (ios->kern_buff) {
-			per_dev->offset = ios->si.obj_offset;
-			per_dev->dev = ios->si.dev + dev;
-
-			/* no cross device without page array */
-			BUG_ON((ios->layout->group_width > 1) &&
-			       (ios->si.unit_off + ios->length >
-				ios->layout->stripe_unit));
-
-			ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
-						 per_dev->offset,
-						 ios->kern_buff, ios->length);
-			if (unlikely(ret))
-				goto out;
-			ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
-				      "length=0x%llx dev=%d\n",
-				     _LLU(_ios_obj(ios, cur_comp)->id),
-				     _LLU(per_dev->offset),
-				     _LLU(ios->length), per_dev->dev);
-		} else {
-			osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
-			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-				     _LLU(_ios_obj(ios, cur_comp)->id),
-				     ios->out_attr_len, dev);
-		}
-
-		if (ios->out_attr)
-			osd_req_add_set_attr_list(or, ios->out_attr,
-						  ios->out_attr_len);
-
-		if (ios->in_attr)
-			osd_req_add_get_attr_list(or, ios->in_attr,
-						  ios->in_attr_len);
-	}
-
-out:
-	return ret;
-}
-
-int ore_write(struct ore_io_state *ios)
-{
-	int i;
-	int ret;
-
-	if (unlikely(ios->sp2d && !ios->r4w)) {
-		/* A library is attempting a RAID-write without providing
-		 * a pages lock interface.
-		 */
-		WARN_ON_ONCE(1);
-		return -ENOTSUPP;
-	}
-
-	ret = _prepare_for_striping(ios);
-	if (unlikely(ret))
-		return ret;
-
-	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
-		ret = _write_mirror(ios, i);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	ret = ore_io_execute(ios);
-	return ret;
-}
-EXPORT_SYMBOL(ore_write);
-
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
-{
-	struct osd_request *or;
-	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
-	struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
-	unsigned first_dev = (unsigned)obj->id;
-
-	if (ios->pages && !per_dev->length)
-		return 0; /* Just an empty slot */
-
-	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
-	or = osd_start_request(_ios_od(ios, first_dev));
-	if (unlikely(!or)) {
-		ORE_ERR("%s: osd_start_request failed\n", __func__);
-		return -ENOMEM;
-	}
-	per_dev->or = or;
-
-	if (ios->pages) {
-		if (per_dev->cur_sg) {
-			/* finalize the last sg_entry */
-			_ore_add_sg_seg(per_dev, 0, false);
-			if (unlikely(!per_dev->cur_sg))
-				return 0; /* Skip parity only device */
-
-			osd_req_read_sg(or, obj, per_dev->bio,
-					per_dev->sglist, per_dev->cur_sg);
-		} else {
-			/* The no raid case */
-			osd_req_read(or, obj, per_dev->offset,
-				     per_dev->bio, per_dev->length);
-		}
-
-		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-			     " dev=%d sg_len=%d\n", _LLU(obj->id),
-			     _LLU(per_dev->offset), _LLU(per_dev->length),
-			     first_dev, per_dev->cur_sg);
-	} else {
-		BUG_ON(ios->kern_buff);
-
-		osd_req_get_attributes(or, obj);
-		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
-			      _LLU(obj->id),
-			      ios->in_attr_len, first_dev);
-	}
-	if (ios->out_attr)
-		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
-
-	if (ios->in_attr)
-		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
-
-	return 0;
-}
-
-int ore_read(struct ore_io_state *ios)
-{
-	int i;
-	int ret;
-
-	ret = _prepare_for_striping(ios);
-	if (unlikely(ret))
-		return ret;
-
-	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
-		ret = _ore_read_mirror(ios, i);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	ret = ore_io_execute(ios);
-	return ret;
-}
-EXPORT_SYMBOL(ore_read);
-
-int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
-{
-	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-	void *iter = NULL;
-	int nelem;
-
-	do {
-		nelem = 1;
-		osd_req_decode_get_attr_list(ios->per_dev[0].or,
-					     &cur_attr, &nelem, &iter);
-		if ((cur_attr.attr_page == attr->attr_page) &&
-		    (cur_attr.attr_id == attr->attr_id)) {
-			attr->len = cur_attr.len;
-			attr->val_ptr = cur_attr.val_ptr;
-			return 0;
-		}
-	} while (iter);
-
-	return -EIO;
-}
-EXPORT_SYMBOL(extract_attr_from_ios);
-
-static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
-			     struct osd_attr *attr)
-{
-	int last_comp = cur_comp + ios->layout->mirrors_p1;
-
-	for (; cur_comp < last_comp; ++cur_comp) {
-		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
-		struct osd_request *or;
-
-		or = osd_start_request(_ios_od(ios, cur_comp));
-		if (unlikely(!or)) {
-			ORE_ERR("%s: osd_start_request failed\n", __func__);
-			return -ENOMEM;
-		}
-		per_dev->or = or;
-
-		osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
-		osd_req_add_set_attr_list(or, attr, 1);
-	}
-
-	return 0;
-}
-
-struct _trunc_info {
-	struct ore_striping_info si;
-	u64 prev_group_obj_off;
-	u64 next_group_obj_off;
-
-	unsigned first_group_dev;
-	unsigned nex_group_dev;
-};
-
-static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
-			     struct _trunc_info *ti)
-{
-	unsigned stripe_unit = layout->stripe_unit;
-
-	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
-
-	ti->prev_group_obj_off = ti->si.M * stripe_unit;
-	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
-
-	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
-	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-}
-
-int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
-		   u64 size)
-{
-	struct ore_io_state *ios;
-	struct exofs_trunc_attr {
-		struct osd_attr attr;
-		__be64 newsize;
-	} *size_attrs;
-	struct _trunc_info ti;
-	int i, ret;
-
-	ret = ore_get_io_state(layout, oc, &ios);
-	if (unlikely(ret))
-		return ret;
-
-	_calc_trunk_info(ios->layout, size, &ti);
-
-	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
-			     GFP_KERNEL);
-	if (unlikely(!size_attrs)) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ios->numdevs = ios->oc->numdevs;
-
-	for (i = 0; i < ios->numdevs; ++i) {
-		struct exofs_trunc_attr *size_attr = &size_attrs[i];
-		u64 obj_size;
-
-		if (i < ti.first_group_dev)
-			obj_size = ti.prev_group_obj_off;
-		else if (i >= ti.nex_group_dev)
-			obj_size = ti.next_group_obj_off;
-		else if (i < ti.si.dev) /* dev within this group */
-			obj_size = ti.si.obj_offset +
-				      ios->layout->stripe_unit - ti.si.unit_off;
-		else if (i == ti.si.dev)
-			obj_size = ti.si.obj_offset;
-		else /* i > ti.dev */
-			obj_size = ti.si.obj_offset - ti.si.unit_off;
-
-		size_attr->newsize = cpu_to_be64(obj_size);
-		size_attr->attr = g_attr_logical_length;
-		size_attr->attr.val_ptr = &size_attr->newsize;
-
-		ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
-			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
-		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
-					&size_attr->attr);
-		if (unlikely(ret))
-			goto out;
-	}
-	ret = ore_io_execute(ios);
-
-out:
-	kfree(size_attrs);
-	ore_put_io_state(ios);
-	return ret;
-}
-EXPORT_SYMBOL(ore_truncate);
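
/* Editor's sketch (not part of the original file): the per-device size
 * computation inside ore_truncate() above. Devices in groups before the one
 * holding the new EOF keep their full size; later groups are cut one group
 * stripe earlier; within the EOF group, devices before si.dev keep a whole
 * current unit, si.dev keeps a partial unit, later devices drop it.
 */
#include <stdint.h>

static uint64_t trunc_obj_size(unsigned i, unsigned first_group_dev,
			       unsigned next_group_dev, unsigned dev,
			       uint64_t prev_group_off, uint64_t next_group_off,
			       uint64_t obj_offset, uint64_t unit_off,
			       uint64_t stripe_unit)
{
	if (i < first_group_dev)
		return prev_group_off;		/* earlier groups: keep all */
	if (i >= next_group_dev)
		return next_group_off;		/* later groups: one less */
	if (i < dev)
		return obj_offset + stripe_unit - unit_off; /* full unit */
	if (i == dev)
		return obj_offset;		/* partial unit at new EOF */
	return obj_offset - unit_off;		/* unit not reached yet */
}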
-
-const struct osd_attr g_attr_logical_length = ATTR_DEF(
-	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
deleted file mode 100644
index 199590f..0000000
--- a/fs/exofs/ore_raid.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/*
- * Copyright (C) 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- *	"Free Software Foundation <info@fsf.org>"
- */
-
-#include <linux/gfp.h>
-#include <linux/async_tx.h>
-
-#include "ore_raid.h"
-
-#undef ORE_DBGMSG2
-#define ORE_DBGMSG2 ORE_DBGMSG
-
-static struct page *_raid_page_alloc(void)
-{
-	return alloc_page(GFP_KERNEL);
-}
-
-static void _raid_page_free(struct page *p)
-{
-	__free_page(p);
-}
-
-/* This struct is forward declared in ore_io_state, but is private to here.
- * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
- *
- * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
- * Ascending page index access is sp2d(p-minor, c-major). But storage is
- * sp2d[p-minor][c-major], so it can be properly presented to the async-xor
- * API.
- */
-struct __stripe_pages_2d {
-	/* Cache some hot path repeated calculations */
-	unsigned parity;
-	unsigned data_devs;
-	unsigned pages_in_unit;
-
-	bool needed;
-
-	/* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
-	struct __1_page_stripe {
-		bool alloc;
-		unsigned write_count;
-		struct async_submit_ctl submit;
-		struct dma_async_tx_descriptor *tx;
-
-		/* The size of this array is data_devs + parity */
-		struct page **pages;
-		struct page **scribble;
-		/* bool array, size of this array is data_devs */
-		char *page_is_read;
-	} _1p_stripes[];
-};
-
-/* This can get bigger than a page, so support multiple page allocations.
- * _sp2d_free should be called even if _sp2d_alloc fails (by returning
- * non-zero).
- */
-static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
-		       unsigned parity, struct __stripe_pages_2d **psp2d)
-{
-	struct __stripe_pages_2d *sp2d;
-	unsigned data_devs = group_width - parity;
-
-	/*
-	 * The desired allocation layout is shown below; when larger than
-	 * PAGE_SIZE, each struct __alloc_1p_arrays is allocated separately:
-
-	struct _alloc_all_bytes {
-		struct __alloc_stripe_pages_2d {
-			struct __stripe_pages_2d sp2d;
-			struct __1_page_stripe _1p_stripes[pages_in_unit];
-		} __asp2d;
-		struct __alloc_1p_arrays {
-			struct page *pages[group_width];
-			struct page *scribble[group_width];
-			char page_is_read[data_devs];
-		} __a1pa[pages_in_unit];
-	} *_aab;
-
-	struct __alloc_1p_arrays *__a1pa;
-	struct __alloc_1p_arrays *__a1pa_end;
-
-	*/
-
-	char *__a1pa;
-	char *__a1pa_end;
-
-	const size_t sizeof_stripe_pages_2d =
-		sizeof(struct __stripe_pages_2d) +
-		sizeof(struct __1_page_stripe) * pages_in_unit;
-	const size_t sizeof__a1pa =
-		ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs,
-		      sizeof(void *));
-	const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit;
-	const size_t alloc_total = sizeof_stripe_pages_2d +
-				   sizeof__a1pa_arrays;
-
-	unsigned num_a1pa, alloc_size, i;
-
-	/* FIXME: check these numbers in ore_verify_layout */
-	BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE);
-	BUG_ON(sizeof__a1pa > PAGE_SIZE);
-
-	/*
-	 * If alloc_total would be larger than PAGE_SIZE, only allocate
-	 * as many a1pa items as would fill the rest of the page, instead
-	 * of the full pages_in_unit count.
-	 */
-	if (alloc_total > PAGE_SIZE) {
-		num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa;
-		alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa;
-	} else {
-		num_a1pa = pages_in_unit;
-		alloc_size = alloc_total;
-	}
-
-	*psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL);
-	if (unlikely(!sp2d)) {
-		ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
-		return -ENOMEM;
-	}
-	/* From here Just call _sp2d_free */
-
-	/* Find start of a1pa area. */
-	__a1pa = (char *)sp2d + sizeof_stripe_pages_2d;
-	/* Find end of the _allocated_ a1pa area. */
-	__a1pa_end = __a1pa + alloc_size;
-
-	/* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */
-	for (i = 0; i < pages_in_unit; ++i) {
-		struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i];
-
-		if (unlikely(__a1pa >= __a1pa_end)) {
-			num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
-							pages_in_unit - i);
-			alloc_size = sizeof__a1pa * num_a1pa;
-
-			__a1pa = kzalloc(alloc_size, GFP_KERNEL);
-			if (unlikely(!__a1pa)) {
-				ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
-					   num_a1pa);
-				return -ENOMEM;
-			}
-			__a1pa_end = __a1pa + alloc_size;
-			/* First *pages is marked for kfree of the buffer */
-			stripe->alloc = true;
-		}
-
-		/*
-		 * Attach all _lp_stripes pointers to the allocation for
-		 * it which was either part of the original PAGE_SIZE
-		 * allocation or the subsequent allocation in this loop.
-		 */
-		stripe->pages = (void *)__a1pa;
-		stripe->scribble = stripe->pages + group_width;
-		stripe->page_is_read = (char *)stripe->scribble + group_width;
-		__a1pa += sizeof__a1pa;
-	}
-
-	sp2d->parity = parity;
-	sp2d->data_devs = data_devs;
-	sp2d->pages_in_unit = pages_in_unit;
-	return 0;
-}
-
-static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
-			const struct _ore_r4w_op *r4w, void *priv)
-{
-	unsigned data_devs = sp2d->data_devs;
-	unsigned group_width = data_devs + sp2d->parity;
-	int p, c;
-
-	if (!sp2d->needed)
-		return;
-
-	for (c = data_devs - 1; c >= 0; --c)
-		for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
-			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-			if (_1ps->page_is_read[c]) {
-				struct page *page = _1ps->pages[c];
-
-				r4w->put_page(priv, page);
-				_1ps->page_is_read[c] = false;
-			}
-		}
-
-	for (p = 0; p < sp2d->pages_in_unit; p++) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
-		_1ps->write_count = 0;
-		_1ps->tx = NULL;
-	}
-
-	sp2d->needed = false;
-}
-
-static void _sp2d_free(struct __stripe_pages_2d *sp2d)
-{
-	unsigned i;
-
-	if (!sp2d)
-		return;
-
-	for (i = 0; i < sp2d->pages_in_unit; ++i) {
-		if (sp2d->_1p_stripes[i].alloc)
-			kfree(sp2d->_1p_stripes[i].pages);
-	}
-
-	kfree(sp2d);
-}
-
-static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
-{
-	unsigned p;
-
-	for (p = 0; p < sp2d->pages_in_unit; p++) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		if (_1ps->write_count)
-			return p;
-	}
-
-	return ~0;
-}
-
-static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
-{
-	int p;
-
-	for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		if (_1ps->write_count)
-			return p;
-	}
-
-	return ~0;
-}
-
-static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
-{
-	unsigned p;
-	unsigned tx_flags = ASYNC_TX_ACK;
-
-	if (sp2d->parity == 1)
-		tx_flags |= ASYNC_TX_XOR_ZERO_DST;
-
-	for (p = 0; p < sp2d->pages_in_unit; p++) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		if (!_1ps->write_count)
-			continue;
-
-		init_async_submit(&_1ps->submit, tx_flags,
-			NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
-
-		if (sp2d->parity == 1)
-			_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
-						_1ps->pages, 0, sp2d->data_devs,
-						PAGE_SIZE, &_1ps->submit);
-		else /* parity == 2 */
-			_1ps->tx = async_gen_syndrome(_1ps->pages, 0,
-						sp2d->data_devs + sp2d->parity,
-						PAGE_SIZE, &_1ps->submit);
-	}
-
-	for (p = 0; p < sp2d->pages_in_unit; p++) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-		/* NOTE: We wait for HW synchronously (I don't have such HW
-		 * to test with.) Is parallelism needed with today's multi
-		 * cores?
-		 */
-		async_tx_issue_pending(_1ps->tx);
-	}
-}
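
/* Editor's sketch (not part of the original file): _gen_xor_unit() above
 * offloads parity generation to async_xor / async_gen_syndrome. For RAID-5
 * the result is a plain XOR across the data pages of one horizontal
 * page-stripe; a synchronous rendering of just that computation:
 */
#include <stddef.h>
#include <stdint.h>

static void xor_parity(uint8_t *parity, uint8_t * const *data_pages,
		       unsigned data_devs, size_t page_size)
{
	size_t b;
	unsigned d;

	for (b = 0; b < page_size; b++) {
		uint8_t x = 0;

		for (d = 0; d < data_devs; d++)
			x ^= data_pages[d][b];
		parity[b] = x;	/* P = D0 ^ D1 ^ ... ^ Dn-1 */
	}
}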
-
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
-		       struct ore_striping_info *si, struct page *page)
-{
-	struct __1_page_stripe *_1ps;
-
-	sp2d->needed = true;
-
-	_1ps = &sp2d->_1p_stripes[si->cur_pg];
-	_1ps->pages[si->cur_comp] = page;
-	++_1ps->write_count;
-
-	si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
-	/* si->cur_comp is advanced outside at main loop */
-}
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
-		     bool not_last)
-{
-	struct osd_sg_entry *sge;
-
-	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
-		     "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
-		     per_dev->dev, cur_len, not_last, per_dev->cur_sg,
-		     _LLU(per_dev->offset), per_dev->length,
-		     per_dev->last_sgs_total);
-
-	if (!per_dev->cur_sg) {
-		sge = per_dev->sglist;
-
-		/* First time we prepare two entries */
-		if (per_dev->length) {
-			++per_dev->cur_sg;
-			sge->offset = per_dev->offset;
-			sge->len = per_dev->length;
-		} else {
-			/* Here the parity is the first unit of this object.
-			 * This happens every time we reach a parity device on
-			 * the same stripe as the per_dev->offset. We need to
-			 * just skip this unit.
-			 */
-			per_dev->offset += cur_len;
-			return;
-		}
-	} else {
-		/* finalize the last one */
-		sge = &per_dev->sglist[per_dev->cur_sg - 1];
-		sge->len = per_dev->length - per_dev->last_sgs_total;
-	}
-
-	if (not_last) {
-		/* Partly prepare the next one */
-		struct osd_sg_entry *next_sge = sge + 1;
-
-		++per_dev->cur_sg;
-		next_sge->offset = sge->offset + sge->len + cur_len;
-		/* Save cur len so we know how much was added next time */
-		per_dev->last_sgs_total = per_dev->length;
-		next_sge->len = 0;
-	} else if (!sge->len) {
-		/* Optimize for when the last unit is a parity */
-		--per_dev->cur_sg;
-	}
-}
-
-static int _alloc_read_4_write(struct ore_io_state *ios)
-{
-	struct ore_layout *layout = ios->layout;
-	int ret;
-	/* We want to read only those pages not in cache, so the worst
-	 * case is a stripe populated with every other page.
-	 */
-	unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
-
-	ret = _ore_get_io_state(layout, ios->oc,
-				layout->group_width * layout->mirrors_p1,
-				sgs_per_dev, 0, &ios->ios_read_4_write);
-	return ret;
-}
-
-/* @si contains info of the to-be-inserted page. Update of @si should be
- * maintained by the caller. Specifically si->dev, si->obj_offset, ...
- */
-static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
-		       struct page *page, unsigned pg_len)
-{
-	struct request_queue *q;
-	struct ore_per_dev_state *per_dev;
-	struct ore_io_state *read_ios;
-	unsigned first_dev = si->dev - (si->dev %
-			  (ios->layout->group_width * ios->layout->mirrors_p1));
-	unsigned comp = si->dev - first_dev;
-	unsigned added_len;
-
-	if (!ios->ios_read_4_write) {
-		int ret = _alloc_read_4_write(ios);
-
-		if (unlikely(ret))
-			return ret;
-	}
-
-	read_ios = ios->ios_read_4_write;
-	read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
-
-	per_dev = &read_ios->per_dev[comp];
-	if (!per_dev->length) {
-		per_dev->bio = bio_kmalloc(GFP_KERNEL,
-					   ios->sp2d->pages_in_unit);
-		if (unlikely(!per_dev->bio)) {
-			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
-				     ios->sp2d->pages_in_unit);
-			return -ENOMEM;
-		}
-		per_dev->offset = si->obj_offset;
-		per_dev->dev = si->dev;
-	} else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
-		u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
-
-		_ore_add_sg_seg(per_dev, gap, true);
-	}
-	q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
-	added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
-				    si->obj_offset % PAGE_SIZE);
-	if (unlikely(added_len != pg_len)) {
-		ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
-			      per_dev->bio->bi_vcnt);
-		return -ENOMEM;
-	}
-
-	per_dev->length += pg_len;
-	return 0;
-}
-
-/* read the beginning of an unaligned first page */
-static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
-{
-	struct ore_striping_info si;
-	unsigned pg_len;
-
-	ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
-
-	pg_len = si.obj_offset % PAGE_SIZE;
-	si.obj_offset -= pg_len;
-
-	ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
-		   _LLU(si.obj_offset), pg_len, page->index, si.dev);
-
-	return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-/* read the end of an incomplete last page */
-static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
-{
-	struct ore_striping_info si;
-	struct page *page;
-	unsigned pg_len, p, c;
-
-	ore_calc_stripe_info(ios->layout, *offset, 0, &si);
-
-	p = si.cur_pg;
-	c = si.cur_comp;
-	page = ios->sp2d->_1p_stripes[p].pages[c];
-
-	pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
-	*offset += pg_len;
-
-	ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
-		   p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
-
-	BUG_ON(!page);
-
-	return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
-{
-	struct bio_vec *bv;
-	unsigned i, d;
-
-	/* loop on all devices all pages */
-	for (d = 0; d < ios->numdevs; d++) {
-		struct bio *bio = ios->per_dev[d].bio;
-
-		if (!bio)
-			continue;
-
-		bio_for_each_segment_all(bv, bio, i) {
-			struct page *page = bv->bv_page;
-
-			SetPageUptodate(page);
-			if (PageError(page))
-				ClearPageError(page);
-		}
-	}
-}
-
-/* read_4_write is hacked to read the start of the first stripe and/or
- * the end of the last stripe. If needed, with an sg-gap at each device/page.
- * It is assumed to be called after the to_be_written pages of the first stripe
- * are populating ios->sp2d[][]
- *
- * NOTE: We call ios->r4w->get_page for all pages needed for parity
- * calculations. These pages are held at sp2d[p].pages[c] but with
- * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are released
- * via ios->r4w->put_page(). The ios->r4w->get_page might signal that the
- * page is @uptodate=true, so we don't need to read it, only unlock it after IO.
- *
- * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger
- * than the to-be-written count, we should consider the xor-in-place mode.
- * need_to_read_pages_count is the actual number of pages not present in cache.
- * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
- * approximation? In this mode the read pages are put in the empty places of
- * ios->sp2d[p][*], xor is calculated the same way. These pages are
- * allocated/freed and don't go through cache
- */
-static int _read_4_write_first_stripe(struct ore_io_state *ios)
-{
-	struct ore_striping_info read_si;
-	struct __stripe_pages_2d *sp2d = ios->sp2d;
-	u64 offset = ios->si.first_stripe_start;
-	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
-	if (offset == ios->offset) /* Go to start collect $200 */
-		goto read_last_stripe;
-
-	min_p = _sp2d_min_pg(sp2d);
-	max_p = _sp2d_max_pg(sp2d);
-
-	ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
-		   offset, ios->offset, min_p, max_p);
-
-	for (c = 0; ; c++) {
-		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
-		read_si.obj_offset += min_p * PAGE_SIZE;
-		offset += min_p * PAGE_SIZE;
-		for (p = min_p; p <= max_p; p++) {
-			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-			struct page **pp = &_1ps->pages[c];
-			bool uptodate;
-
-			if (*pp) {
-				if (ios->offset % PAGE_SIZE)
-					/* Read the remainder of the page */
-					_add_to_r4w_first_page(ios, *pp);
-				/* to-be-written pages start here */
-				goto read_last_stripe;
-			}
-
-			*pp = ios->r4w->get_page(ios->private, offset,
-						 &uptodate);
-			if (unlikely(!*pp))
-				return -ENOMEM;
-
-			if (!uptodate)
-				_add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
-
-			/* Mark read-pages to be cache_released */
-			_1ps->page_is_read[c] = true;
-			read_si.obj_offset += PAGE_SIZE;
-			offset += PAGE_SIZE;
-		}
-		offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
-	}
-
-read_last_stripe:
-	return 0;
-}
-
-static int _read_4_write_last_stripe(struct ore_io_state *ios)
-{
-	struct ore_striping_info read_si;
-	struct __stripe_pages_2d *sp2d = ios->sp2d;
-	u64 offset;
-	u64 last_stripe_end;
-	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
-	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
-	offset = ios->offset + ios->length;
-	if (offset % PAGE_SIZE)
-		_add_to_r4w_last_page(ios, &offset);
-		/* offset will be aligned to next page */
-
-	last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
-				 * bytes_in_stripe;
-	if (offset == last_stripe_end) /* Optimize for the aligned case */
-		goto read_it;
-
-	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
-	p = read_si.cur_pg;
-	c = read_si.cur_comp;
-
-	if (min_p == sp2d->pages_in_unit) {
-		/* Didn't do it yet */
-		min_p = _sp2d_min_pg(sp2d);
-		max_p = _sp2d_max_pg(sp2d);
-	}
-
-	ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
-		   offset, last_stripe_end, min_p, max_p);
-
-	while (offset < last_stripe_end) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		if ((min_p <= p) && (p <= max_p)) {
-			struct page *page;
-			bool uptodate;
-
-			BUG_ON(_1ps->pages[c]);
-			page = ios->r4w->get_page(ios->private, offset,
-						  &uptodate);
-			if (unlikely(!page))
-				return -ENOMEM;
-
-			_1ps->pages[c] = page;
-			/* Mark read-pages to be cache_released */
-			_1ps->page_is_read[c] = true;
-			if (!uptodate)
-				_add_to_r4w(ios, &read_si, page, PAGE_SIZE);
-		}
-
-		offset += PAGE_SIZE;
-		if (p == (sp2d->pages_in_unit - 1)) {
-			++c;
-			p = 0;
-			ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
-		} else {
-			read_si.obj_offset += PAGE_SIZE;
-			++p;
-		}
-	}
-
-read_it:
-	return 0;
-}
-
-static int _read_4_write_execute(struct ore_io_state *ios)
-{
-	struct ore_io_state *ios_read;
-	unsigned i;
-	int ret;
-
-	ios_read = ios->ios_read_4_write;
-	if (!ios_read)
-		return 0;
-
-	/* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
-	 * to check for per_dev->bio
-	 */
-	ios_read->pages = ios->pages;
-
-	/* Now read these devices */
-	for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
-		ret = _ore_read_mirror(ios_read, i);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	ret = ore_io_execute(ios_read); /* Synchronous execution */
-	if (unlikely(ret)) {
-		ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
-		return ret;
-	}
-
-	_mark_read4write_pages_uptodate(ios_read, ret);
-	ore_put_io_state(ios_read);
-	ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
-	return 0;
-}
-
-/* In writes @cur_len means length left, i.e. cur_len==0 is the last parity unit */
-int _ore_add_parity_unit(struct ore_io_state *ios,
-			    struct ore_striping_info *si,
-			    struct ore_per_dev_state *per_dev,
-			    unsigned cur_len, bool do_xor)
-{
-	if (ios->reading) {
-		if (per_dev->cur_sg >= ios->sgs_per_dev) {
-			ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
-				per_dev->cur_sg, ios->sgs_per_dev);
-			return -ENOMEM;
-		}
-		_ore_add_sg_seg(per_dev, cur_len, true);
-	} else {
-		struct __stripe_pages_2d *sp2d = ios->sp2d;
-		struct page **pages = ios->parity_pages + ios->cur_par_page;
-		unsigned num_pages;
-		unsigned array_start = 0;
-		unsigned i;
-		int ret;
-
-		si->cur_pg = _sp2d_min_pg(sp2d);
-		num_pages  = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
-
-		if (!per_dev->length) {
-			per_dev->offset += si->cur_pg * PAGE_SIZE;
-			/* If first stripe, read in all read4write pages
-			 * (if needed) before we calculate the first parity.
-			 */
-			if (do_xor)
-				_read_4_write_first_stripe(ios);
-		}
-		if (!cur_len && do_xor)
-			/* If last stripe, read in the r4w pages of the last stripe */
-			_read_4_write_last_stripe(ios);
-		_read_4_write_execute(ios);
-
-		for (i = 0; i < num_pages; i++) {
-			pages[i] = _raid_page_alloc();
-			if (unlikely(!pages[i]))
-				return -ENOMEM;
-
-			++(ios->cur_par_page);
-		}
-
-		BUG_ON(si->cur_comp < sp2d->data_devs);
-		BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
-
-		ret = _ore_add_stripe_unit(ios,  &array_start, 0, pages,
-					   per_dev, num_pages * PAGE_SIZE);
-		if (unlikely(ret))
-			return ret;
-
-		if (do_xor) {
-			_gen_xor_unit(sp2d);
-			_sp2d_reset(sp2d, ios->r4w, ios->private);
-		}
-	}
-	return 0;
-}
-
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
-{
-	if (ios->parity_pages) {
-		struct ore_layout *layout = ios->layout;
-		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-
-		if (_sp2d_alloc(pages_in_unit, layout->group_width,
-				layout->parity, &ios->sp2d)) {
-			return -ENOMEM;
-		}
-	}
-	return 0;
-}
-
-void _ore_free_raid_stuff(struct ore_io_state *ios)
-{
-	if (ios->sp2d) { /* writing and raid */
-		unsigned i;
-
-		for (i = 0; i < ios->cur_par_page; i++) {
-			struct page *page = ios->parity_pages[i];
-
-			if (page)
-				_raid_page_free(page);
-		}
-		if (ios->extra_part_alloc)
-			kfree(ios->parity_pages);
-		/* If IO returned an error pages might need unlocking */
-		_sp2d_reset(ios->sp2d, ios->r4w, ios->private);
-		_sp2d_free(ios->sp2d);
-	} else {
-		/* Will only be set if raid reading && sglist is big */
-		if (ios->extra_part_alloc)
-			kfree(ios->per_dev[0].sglist);
-	}
-	if (ios->ios_read_4_write)
-		ore_put_io_state(ios->ios_read_4_write);
-}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
deleted file mode 100644
index a6e7467..0000000
--- a/fs/exofs/ore_raid.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) from 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- *	"Free Software Foundation <info@fsf.org>"
- */
-
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
-	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
-	do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
-
-/* ios_raid.c stuff needed by ios.c */
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
-void _ore_free_raid_stuff(struct ore_io_state *ios);
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
-		 bool not_last);
-int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
-		     struct ore_per_dev_state *per_dev, unsigned cur_len,
-		     bool do_xor);
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
-		       struct ore_striping_info *si, struct page *page);
-static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
-				struct ore_striping_info *si, struct page *page)
-{
-	if (!sp2d) /* Inline the fast path */
-		return; /* There is no raid stuff */
-	_ore_add_stripe_page(sp2d, si, page);
-}
-
-/* ios.c stuff needed by ios_raid.c */
-int  _ore_get_io_state(struct ore_layout *layout,
-			struct ore_components *oc, unsigned numdevs,
-			unsigned sgs_per_dev, unsigned num_par_pages,
-			struct ore_io_state **pios);
-int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
-		unsigned pgbase, struct page **pages,
-		struct ore_per_dev_state *per_dev, int cur_len);
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
-int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
deleted file mode 100644
index 7d61e3f..0000000
--- a/fs/exofs/super.c
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/string.h>
-#include <linux/parser.h>
-#include <linux/vfs.h>
-#include <linux/random.h>
-#include <linux/module.h>
-#include <linux/exportfs.h>
-#include <linux/slab.h>
-#include <linux/iversion.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-/******************************************************************************
- * MOUNT OPTIONS
- *****************************************************************************/
-
-/*
- * struct to hold what we get from mount options
- */
-struct exofs_mountopt {
-	bool is_osdname;
-	const char *dev_name;
-	uint64_t pid;
-	int timeout;
-};
-
-/*
- * exofs-specific mount-time options.
- */
-enum { Opt_name, Opt_pid, Opt_to, Opt_err };
-
-/*
- * Our mount-time options.  These should ideally be 64-bit unsigned, but the
- * kernel's parsing functions do not currently support that.  32-bit should be
- * sufficient for most applications now.
- */
-static match_table_t tokens = {
-	{Opt_name, "osdname=%s"},
-	{Opt_pid, "pid=%u"},
-	{Opt_to, "to=%u"},
-	{Opt_err, NULL}
-};
-
-/*
- * The main option parsing method.  Also makes sure that all of the mandatory
- * mount options were set.
- */
-static int parse_options(char *options, struct exofs_mountopt *opts)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	bool s_pid = false;
-
-	EXOFS_DBGMSG("parse_options %s\n", options);
-	/* defaults */
-	memset(opts, 0, sizeof(*opts));
-	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		char str[32];
-
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_name:
-			kfree(opts->dev_name);
-			opts->dev_name = match_strdup(&args[0]);
-			if (unlikely(!opts->dev_name)) {
-				EXOFS_ERR("Error allocating dev_name");
-				return -ENOMEM;
-			}
-			opts->is_osdname = true;
-			break;
-		case Opt_pid:
-			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
-				return -EINVAL;
-			opts->pid = simple_strtoull(str, NULL, 0);
-			if (opts->pid < EXOFS_MIN_PID) {
-				EXOFS_ERR("Partition ID must be >= %u",
-					  EXOFS_MIN_PID);
-				return -EINVAL;
-			}
-			s_pid = 1;
-			break;
-		case Opt_to:
-			if (match_int(&args[0], &option))
-				return -EINVAL;
-			if (option <= 0) {
-				EXOFS_ERR("Timeout must be > 0");
-				return -EINVAL;
-			}
-			opts->timeout = option * HZ;
-			break;
-		}
-	}
-
-	if (!s_pid) {
-		EXOFS_ERR("Need to specify the following options:\n");
-		EXOFS_ERR("    -o pid=pid_no_to_use\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
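
/* Editor's note (illustrative, values invented): an options string this
 * parser accepts would look like
 *
 *	mount -t exofs -o osdname=my_osd,pid=0x10000,to=30 <dev> <dir>
 *
 * osdname= selects the OSD by name, pid= sets the partition ID (the only
 * mandatory option, enforced via s_pid above), and to= sets the request
 * timeout in seconds.
 */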
-
-/******************************************************************************
- * INODE CACHE
- *****************************************************************************/
-
-/*
- * Our inode cache.  Isn't it pretty?
- */
-static struct kmem_cache *exofs_inode_cachep;
-
-/*
- * Allocate an inode in the cache
- */
-static struct inode *exofs_alloc_inode(struct super_block *sb)
-{
-	struct exofs_i_info *oi;
-
-	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
-	if (!oi)
-		return NULL;
-
-	inode_set_iversion(&oi->vfs_inode, 1);
-	return &oi->vfs_inode;
-}
-
-static void exofs_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
-}
-
-/*
- * Remove an inode from the cache
- */
-static void exofs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, exofs_i_callback);
-}
-
-/*
- * Initialize the inode
- */
-static void exofs_init_once(void *foo)
-{
-	struct exofs_i_info *oi = foo;
-
-	inode_init_once(&oi->vfs_inode);
-}
-
-/*
- * Create and initialize the inode cache
- */
-static int init_inodecache(void)
-{
-	exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache",
-				sizeof(struct exofs_i_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-				SLAB_ACCOUNT,
-				offsetof(struct exofs_i_info, i_data),
-				sizeof_field(struct exofs_i_info, i_data),
-				exofs_init_once);
-	if (exofs_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-/*
- * Destroy the inode cache
- */
-static void destroy_inodecache(void)
-{
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(exofs_inode_cachep);
-}
-
-/******************************************************************************
- * Some osd helpers
- *****************************************************************************/
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
-	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
-		    u64 offset, void *p, unsigned length)
-{
-	struct osd_request *or = osd_start_request(od);
-/*	struct osd_sense_info osi = {.key = 0};*/
-	int ret;
-
-	if (unlikely(!or)) {
-		EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
-		return -ENOMEM;
-	}
-	ret = osd_req_read_kern(or, obj, offset, p, length);
-	if (unlikely(ret)) {
-		EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
-		goto out;
-	}
-
-	ret = osd_finalize_request(or, 0, cred, NULL);
-	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
-		goto out;
-	}
-
-	ret = osd_execute_request(or);
-	if (unlikely(ret))
-		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
-	/* osd_req_decode_sense(or, ret); */
-
-out:
-	osd_end_request(or);
-	EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
-		      "length=0x%llx dev=%p ret=>%d\n",
-		      _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
-	return ret;
-}
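
The function above is the standard synchronous libosd call sequence, summarized:

/*
 *	osd_start_request()     allocate the request
 *	osd_req_read_kern()     attach a kernel buffer for a READ
 *	osd_finalize_request()  apply the capability/credential
 *	osd_execute_request()   issue it and wait for completion
 *	osd_end_request()       release it (also on the error paths)
 */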
-
-static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
-	EXOFS_APAGE_SB_DATA,
-	EXOFS_ATTR_SB_STATS,
-	sizeof(struct exofs_sb_stats));
-
-static int __sbi_read_stats(struct exofs_sb_info *sbi)
-{
-	struct osd_attr attrs[] = {
-		[0] = g_attr_sb_stats,
-	};
-	struct ore_io_state *ios;
-	int ret;
-
-	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
-		return ret;
-	}
-
-	ios->in_attr = attrs;
-	ios->in_attr_len = ARRAY_SIZE(attrs);
-
-	ret = ore_read(ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("Error reading super_block stats => %d\n", ret);
-		goto out;
-	}
-
-	ret = extract_attr_from_ios(ios, &attrs[0]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
-		goto out;
-	}
-	if (attrs[0].len) {
-		struct exofs_sb_stats *ess;
-
-		if (unlikely(attrs[0].len != sizeof(*ess))) {
-			EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
-				  "size(%d) != expected(%zd)\n",
-				  __func__, attrs[0].len, sizeof(*ess));
-			goto out;
-		}
-
-		ess = attrs[0].val_ptr;
-		sbi->s_nextid = le64_to_cpu(ess->s_nextid);
-		sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
-	}
-
-out:
-	ore_put_io_state(ios);
-	return ret;
-}
-
-static void stats_done(struct ore_io_state *ios, void *p)
-{
-	ore_put_io_state(ios);
-	/* Write completed - nothing more to do */
-}
-
-/* Asynchronously write the stats attribute */
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
-{
-	struct osd_attr attrs[] = {
-		[0] = g_attr_sb_stats,
-	};
-	struct ore_io_state *ios;
-	int ret;
-
-	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
-		return ret;
-	}
-
-	sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
-	sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
-	attrs[0].val_ptr = &sbi->s_ess;
-
-
-	ios->done = stats_done;
-	ios->private = sbi;
-	ios->out_attr = attrs;
-	ios->out_attr_len = ARRAY_SIZE(attrs);
-
-	ret = ore_write(ios);
-	if (unlikely(ret)) {
-		EXOFS_ERR("%s: ore_write failed.\n", __func__);
-		ore_put_io_state(ios);
-	}
-
-	return ret;
-}
-
-/******************************************************************************
- * SUPERBLOCK FUNCTIONS
- *****************************************************************************/
-static const struct super_operations exofs_sops;
-static const struct export_operations exofs_export_ops;
-
-/*
- * Write the superblock to the OSD
- */
-static int exofs_sync_fs(struct super_block *sb, int wait)
-{
-	struct exofs_sb_info *sbi;
-	struct exofs_fscb *fscb;
-	struct ore_comp one_comp;
-	struct ore_components oc;
-	struct ore_io_state *ios;
-	int ret = -ENOMEM;
-
-	fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
-	if (unlikely(!fscb))
-		return -ENOMEM;
-
-	sbi = sb->s_fs_info;
-
-	/* NOTE: We no longer dirty the super_block anywhere in exofs. The
-	 * reason we write the fscb here on unmount is so we can stay backwards
-	 * compatible with fscb->s_version == 1. (What we are not compatible
-	 * with is if a new version FS crashed and then we try to mount an old
-	 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
-	 * the writeable info is set in exofs_sbi_write_stats() above.
-	 */
-
-	exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
-
-	ret = ore_get_io_state(&sbi->layout, &oc, &ios);
-	if (unlikely(ret))
-		goto out;
-
-	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
-	memset(fscb, 0, ios->length);
-	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
-	fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
-	fscb->s_magic = cpu_to_le16(sb->s_magic);
-	fscb->s_newfs = 0;
-	fscb->s_version = EXOFS_FSCB_VER;
-
-	ios->offset = 0;
-	ios->kern_buff = fscb;
-
-	ret = ore_write(ios);
-	if (unlikely(ret))
-		EXOFS_ERR("%s: ore_write failed.\n", __func__);
-
-out:
-	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
-	ore_put_io_state(ios);
-	kfree(fscb);
-	return ret;
-}
-
-static void _exofs_print_device(const char *msg, const char *dev_path,
-				struct osd_dev *od, u64 pid)
-{
-	const struct osd_dev_info *odi = osduld_device_info(od);
-
-	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
-		msg, dev_path ?: "", odi->osdname, _LLU(pid));
-}
-
-static void exofs_free_sbi(struct exofs_sb_info *sbi)
-{
-	unsigned numdevs = sbi->oc.numdevs;
-
-	while (numdevs) {
-		unsigned i = --numdevs;
-		struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
-
-		if (od) {
-			ore_comp_set_dev(&sbi->oc, i, NULL);
-			osduld_put_device(od);
-		}
-	}
-	kfree(sbi->oc.ods);
-	kfree(sbi);
-}
-
-/*
- * This function is called when the vfs is freeing the superblock.  We just
- * need to free our own part.
- */
-static void exofs_put_super(struct super_block *sb)
-{
-	int num_pend;
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-
-	/* make sure there are no pending commands */
-	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
-	     num_pend = atomic_read(&sbi->s_curr_pending)) {
-		wait_queue_head_t wq;
-
-		printk(KERN_NOTICE "%s: !!Pending operations in flight. "
-		       "This is a BUG. please report to osd-dev@open-osd.org\n",
-		       __func__);
-		init_waitqueue_head(&wq);
-		wait_event_timeout(wq,
-				  (atomic_read(&sbi->s_curr_pending) == 0),
-				  msecs_to_jiffies(100));
-	}
-
-	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
-			    sbi->one_comp.obj.partition);
-
-	exofs_sysfs_sb_del(sbi);
-	exofs_free_sbi(sbi);
-	sb->s_fs_info = NULL;
-}
-
-static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
-				    struct exofs_device_table *dt)
-{
-	int ret;
-
-	sbi->layout.stripe_unit =
-				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
-	sbi->layout.group_width =
-				le32_to_cpu(dt->dt_data_map.cb_group_width);
-	sbi->layout.group_depth =
-				le32_to_cpu(dt->dt_data_map.cb_group_depth);
-	sbi->layout.mirrors_p1  =
-				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
-	sbi->layout.raid_algorithm  =
-				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-
-	ret = ore_verify_layout(numdevs, &sbi->layout);
-
-	EXOFS_DBGMSG("exofs: layout: "
-		"num_comps=%u stripe_unit=0x%x group_width=%u "
-		"group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
-		numdevs,
-		sbi->layout.stripe_unit,
-		sbi->layout.group_width,
-		_LLU(sbi->layout.group_depth),
-		sbi->layout.mirrors_p1,
-		sbi->layout.raid_algorithm);
-	return ret;
-}
-
-static unsigned __ra_pages(struct ore_layout *layout)
-{
-	const unsigned _MIN_RA = 32; /* min 128K read-ahead */
-	unsigned ra_pages = layout->group_width * layout->stripe_unit /
-				PAGE_SIZE;
-	unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
-
-	ra_pages *= 2; /* two stripes */
-	if (ra_pages < _MIN_RA)
-		ra_pages = roundup(_MIN_RA, ra_pages / 2);
-
-	if (ra_pages > max_io_pages)
-		ra_pages = max_io_pages;
-
-	return ra_pages;
-}
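
A worked example of the read-ahead sizing (assumed geometry, not from the source): with group_width = 8 and stripe_unit = 64 KiB on 4 KiB pages, one stripe is 8 * 65536 / 4096 = 128 pages, doubled to 256 pages (1 MiB), then clamped to max_io_pages. A degenerate single-device layout (group_width = 1, stripe_unit = 4 KiB) yields only 2 pages, which the roundup() branch raises back to the 32-page (128 KiB) floor.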
-
-/* @odi is valid only as long as @fscb_dev is valid */
-static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
-			     struct osd_dev_info *odi)
-{
-	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
-	if (likely(odi->systemid_len))
-		memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
-
-	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
-	odi->osdname = dt_dev->osdname;
-
-	/* FIXME support long names. Will need a _put function */
-	if (dt_dev->long_name_offset)
-		return -EINVAL;
-
-	/* Make sure osdname is printable!
-	 * mkexofs should give us space for a null-terminator else the
-	 * device-table is invalid.
-	 */
-	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
-		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
-	dt_dev->osdname[odi->osdname_len] = 0;
-
-	/* If it's all zeros something is bad we read past end-of-obj */
-	return !(odi->systemid_len || odi->osdname_len);
-}
-
-static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
-		      struct exofs_dev **peds)
-{
-	/* Twice bigger table: See exofs_init_comps() and comment at
-	 * exofs_read_lookup_dev_table()
-	 */
-	const size_t numores = numdevs * 2 - 1;
-	struct exofs_dev *eds;
-	unsigned i;
-
-	sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) +
-			      numdevs * sizeof(struct exofs_dev), GFP_KERNEL);
-	if (unlikely(!sbi->oc.ods)) {
-		EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
-			  numdevs);
-		return -ENOMEM;
-	}
-
-	/* Start of allocated struct exofs_dev entries */
-	*peds = eds = (void *)&sbi->oc.ods[numores];
-	/* Initialize pointers into struct exofs_dev */
-	for (i = 0; i < numdevs; ++i)
-		sbi->oc.ods[i] = &eds[i].ored;
-	return 0;
-}
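
Layout of the single kzalloc() above for numdevs = 2 (illustrative):

/*
 *  numores = 2*2 - 1 = 3 pointer slots, then the exofs_dev entries:
 *
 *  ods[0]     ods[1]     ods[2]     eds[0]             eds[1]
 *  +----------+----------+----------+------------------+------------------+
 *  | ore_dev* | ore_dev* | ore_dev* | struct exofs_dev | struct exofs_dev |
 *  +----------+----------+----------+------------------+------------------+
 *
 *  eds starts at &ods[numores]; ods[0..1] are wired to eds[i].ored here,
 *  and ods[2] is filled by the mirroring memcpy() in
 *  exofs_read_lookup_dev_table().
 */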
-
-static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
-				       struct osd_dev *fscb_od,
-				       unsigned table_count)
-{
-	struct ore_comp comp;
-	struct exofs_device_table *dt;
-	struct exofs_dev *eds;
-	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
-					     sizeof(*dt);
-	unsigned numdevs, i;
-	int ret;
-
-	dt = kmalloc(table_bytes, GFP_KERNEL);
-	if (unlikely(!dt)) {
-		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
-			  table_bytes);
-		return -ENOMEM;
-	}
-
-	sbi->oc.numdevs = 0;
-
-	comp.obj.partition = sbi->one_comp.obj.partition;
-	comp.obj.id = EXOFS_DEVTABLE_ID;
-	exofs_make_credential(comp.cred, &comp.obj);
-
-	ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
-			      table_bytes);
-	if (unlikely(ret)) {
-		EXOFS_ERR("ERROR: reading device table\n");
-		goto out;
-	}
-
-	numdevs = le64_to_cpu(dt->dt_num_devices);
-	if (unlikely(!numdevs)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	WARN_ON(table_count != numdevs);
-
-	ret = _read_and_match_data_map(sbi, numdevs, dt);
-	if (unlikely(ret))
-		goto out;
-
-	ret = __alloc_dev_table(sbi, numdevs, &eds);
-	if (unlikely(ret))
-		goto out;
-	/* exofs round-robins the device table view according to inode
-	 * number. We hold a: twice bigger table hence inodes can point
-	 * to any device and have a sequential view of the table
-	 * starting at this device. See exofs_init_comps()
-	 */
-	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
-		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
-
-	/* create sysfs subdir under which we put the device table
-	 * And cluster layout. A Superblock is identified by the string:
-	 *	"dev[0].osdname"_"pid"
-	 */
-	exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
-
-	for (i = 0; i < numdevs; i++) {
-		struct exofs_fscb fscb;
-		struct osd_dev_info odi;
-		struct osd_dev *od;
-
-		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
-			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
-			ret = -EINVAL;
-			goto out;
-		}
-
-		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
-		       i, odi.osdname);
-
-		/* the exofs id is currently the table index */
-		eds[i].did = i;
-
-		/* On all devices the device table is identical. The user can
-		 * specify any one of the participating devices on the command
-		 * line. We always keep them in device-table order.
-		 */
-		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			eds[i].ored.od = fscb_od;
-			++sbi->oc.numdevs;
-			fscb_od = NULL;
-			exofs_sysfs_odev_add(&eds[i], sbi);
-			continue;
-		}
-
-		od = osduld_info_lookup(&odi);
-		if (IS_ERR(od)) {
-			ret = PTR_ERR(od);
-			EXOFS_ERR("ERROR: device requested is not found "
-				  "osd_name-%s =>%d\n", odi.osdname, ret);
-			goto out;
-		}
-
-		eds[i].ored.od = od;
-		++sbi->oc.numdevs;
-
-		/* Read the fscb of the other devices to make sure the FS
-		 * partition is there.
-		 */
-		ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
-				      sizeof(fscb));
-		if (unlikely(ret)) {
-			EXOFS_ERR("ERROR: Malformed participating device "
-				  "error reading fscb osd_name-%s\n",
-				  odi.osdname);
-			goto out;
-		}
-		exofs_sysfs_odev_add(&eds[i], sbi);
-
-		/* TODO: verify other information is correct and FS-uuid
-		 *	 matches. Benny what did you say about device table
-		 *	 generation and old devices?
-		 */
-	}
-
-out:
-	kfree(dt);
-	if (unlikely(fscb_od && !ret)) {
-		EXOFS_ERR("ERROR: Bad device-table container device not present\n");
-		osduld_put_device(fscb_od);
-		return -EINVAL;
-	}
-	return ret;
-}
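
A hedged sketch of what the doubled table buys (based on the round-robin comment above; the exact exofs_init_comps() arithmetic lives elsewhere in the driver):

/*
 *	first_dev = inode_number % numdevs;
 *	inode's view = &sbi->oc.ods[first_dev], numdevs sequential slots
 *
 * Because entries numdevs..2*numdevs-2 repeat entries 0..numdevs-2, every
 * starting offset sees a full, contiguous copy of the device list with no
 * per-access modular arithmetic.
 */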
-
-/*
- * Read the superblock from the OSD and fill in the fields
- */
-static int exofs_fill_super(struct super_block *sb, void *data, int silent)
-{
-	struct inode *root;
-	struct exofs_mountopt *opts = data;
-	struct exofs_sb_info *sbi;	/*extended info                  */
-	struct osd_dev *od;		/* Master device                 */
-	struct exofs_fscb fscb;		/*on-disk superblock info        */
-	struct ore_comp comp;
-	unsigned table_count;
-	int ret;
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-
-	/* use mount options to fill superblock */
-	if (opts->is_osdname) {
-		struct osd_dev_info odi = {.systemid_len = 0};
-
-		odi.osdname_len = strlen(opts->dev_name);
-		odi.osdname = (u8 *)opts->dev_name;
-		od = osduld_info_lookup(&odi);
-		kfree(opts->dev_name);
-		opts->dev_name = NULL;
-	} else {
-		od = osduld_path_lookup(opts->dev_name);
-	}
-	if (IS_ERR(od)) {
-		ret = -EINVAL;
-		goto free_sbi;
-	}
-
-	/* Default layout in case we do not have a device-table */
-	sbi->layout.stripe_unit = PAGE_SIZE;
-	sbi->layout.mirrors_p1 = 1;
-	sbi->layout.group_width = 1;
-	sbi->layout.group_depth = -1;
-	sbi->layout.group_count = 1;
-	sbi->s_timeout = opts->timeout;
-
-	sbi->one_comp.obj.partition = opts->pid;
-	sbi->one_comp.obj.id = 0;
-	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
-	sbi->oc.single_comp = EC_SINGLE_COMP;
-	sbi->oc.comps = &sbi->one_comp;
-
-	/* fill in some other data by hand */
-	memset(sb->s_id, 0, sizeof(sb->s_id));
-	strcpy(sb->s_id, "exofs");
-	sb->s_blocksize = EXOFS_BLKSIZE;
-	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_max_links = EXOFS_LINK_MAX;
-	atomic_set(&sbi->s_curr_pending, 0);
-	sb->s_bdev = NULL;
-	sb->s_dev = 0;
-
-	comp.obj.partition = sbi->one_comp.obj.partition;
-	comp.obj.id = EXOFS_SUPER_ID;
-	exofs_make_credential(comp.cred, &comp.obj);
-
-	ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
-	if (unlikely(ret))
-		goto free_sbi;
-
-	sb->s_magic = le16_to_cpu(fscb.s_magic);
-	/* NOTE: we read below to be backward compatible with old versions */
-	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
-	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
-
-	/* make sure what we read from the object store is correct */
-	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
-		if (!silent)
-			EXOFS_ERR("ERROR: Bad magic value\n");
-		ret = -EINVAL;
-		goto free_sbi;
-	}
-	if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
-		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
-			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
-		ret = -EINVAL;
-		goto free_sbi;
-	}
-
-	/* start generation numbers from a random point */
-	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
-	spin_lock_init(&sbi->s_next_gen_lock);
-
-	table_count = le64_to_cpu(fscb.s_dev_table_count);
-	if (table_count) {
-		ret = exofs_read_lookup_dev_table(sbi, od, table_count);
-		if (unlikely(ret))
-			goto free_sbi;
-	} else {
-		struct exofs_dev *eds;
-
-		ret = __alloc_dev_table(sbi, 1, &eds);
-		if (unlikely(ret))
-			goto free_sbi;
-
-		ore_comp_set_dev(&sbi->oc, 0, od);
-		sbi->oc.numdevs = 1;
-	}
-
-	__sbi_read_stats(sbi);
-
-	/* set up operation vectors */
-	ret = super_setup_bdi(sb);
-	if (ret) {
-		EXOFS_DBGMSG("Failed to super_setup_bdi\n");
-		goto free_sbi;
-	}
-	sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
-	sb->s_fs_info = sbi;
-	sb->s_op = &exofs_sops;
-	sb->s_export_op = &exofs_export_ops;
-	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
-	if (IS_ERR(root)) {
-		EXOFS_ERR("ERROR: exofs_iget failed\n");
-		ret = PTR_ERR(root);
-		goto free_sbi;
-	}
-	sb->s_root = d_make_root(root);
-	if (!sb->s_root) {
-		EXOFS_ERR("ERROR: get root inode failed\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-
-	if (!S_ISDIR(root->i_mode)) {
-		dput(sb->s_root);
-		sb->s_root = NULL;
-		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
-		       root->i_mode);
-		ret = -EINVAL;
-		goto free_sbi;
-	}
-
-	exofs_sysfs_dbg_print();
-	_exofs_print_device("Mounting", opts->dev_name,
-			    ore_comp_dev(&sbi->oc, 0),
-			    sbi->one_comp.obj.partition);
-	return 0;
-
-free_sbi:
-	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-		  opts->dev_name, sbi->one_comp.obj.partition, ret);
-	exofs_free_sbi(sbi);
-	return ret;
-}
-
-/*
- * Set up the superblock (calls exofs_fill_super eventually)
- */
-static struct dentry *exofs_mount(struct file_system_type *type,
-			  int flags, const char *dev_name,
-			  void *data)
-{
-	struct exofs_mountopt opts;
-	int ret;
-
-	ret = parse_options(data, &opts);
-	if (ret) {
-		kfree(opts.dev_name);
-		return ERR_PTR(ret);
-	}
-
-	if (!opts.dev_name)
-		opts.dev_name = dev_name;
-	return mount_nodev(type, flags, &opts, exofs_fill_super);
-}
-
-/*
- * Return information about the file system state in the buffer.  This is used
- * by the 'df' command, for example.
- */
-static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct super_block *sb = dentry->d_sb;
-	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct ore_io_state *ios;
-	struct osd_attr attrs[] = {
-		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
-			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
-		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
-			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
-	};
-	uint64_t capacity = ULLONG_MAX;
-	uint64_t used = ULLONG_MAX;
-	int ret;
-
-	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
-	if (ret) {
-		EXOFS_DBGMSG("ore_get_io_state failed.\n");
-		return ret;
-	}
-
-	ios->in_attr = attrs;
-	ios->in_attr_len = ARRAY_SIZE(attrs);
-
-	ret = ore_read(ios);
-	if (unlikely(ret))
-		goto out;
-
-	ret = extract_attr_from_ios(ios, &attrs[0]);
-	if (likely(!ret)) {
-		capacity = get_unaligned_be64(attrs[0].val_ptr);
-		if (unlikely(!capacity))
-			capacity = ULLONG_MAX;
-	} else
-		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
-
-	ret = extract_attr_from_ios(ios, &attrs[1]);
-	if (likely(!ret))
-		used = get_unaligned_be64(attrs[1].val_ptr);
-	else
-		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
-
-	/* fill in the stats buffer */
-	buf->f_type = EXOFS_SUPER_MAGIC;
-	buf->f_bsize = EXOFS_BLKSIZE;
-	buf->f_blocks = capacity >> 9;
-	buf->f_bfree = (capacity - used) >> 9;
-	buf->f_bavail = buf->f_bfree;
-	buf->f_files = sbi->s_numfiles;
-	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
-	buf->f_namelen = EXOFS_NAME_LEN;
-
-out:
-	ore_put_io_state(ios);
-	return ret;
-}
-
-static const struct super_operations exofs_sops = {
-	.alloc_inode    = exofs_alloc_inode,
-	.destroy_inode  = exofs_destroy_inode,
-	.write_inode    = exofs_write_inode,
-	.evict_inode    = exofs_evict_inode,
-	.put_super      = exofs_put_super,
-	.sync_fs	= exofs_sync_fs,
-	.statfs         = exofs_statfs,
-};
-
-/******************************************************************************
- * EXPORT OPERATIONS
- *****************************************************************************/
-
-static struct dentry *exofs_get_parent(struct dentry *child)
-{
-	unsigned long ino = exofs_parent_ino(child);
-
-	if (!ino)
-		return ERR_PTR(-ESTALE);
-
-	return d_obtain_alias(exofs_iget(child->d_sb, ino));
-}
-
-static struct inode *exofs_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
-{
-	struct inode *inode;
-
-	inode = exofs_iget(sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	if (generation && inode->i_generation != generation) {
-		/* we didn't find the right inode.. */
-		iput(inode);
-		return ERR_PTR(-ESTALE);
-	}
-	return inode;
-}
-
-static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
-				struct fid *fid, int fh_len, int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    exofs_nfs_get_inode);
-}
-
-static struct dentry *exofs_fh_to_parent(struct super_block *sb,
-				struct fid *fid, int fh_len, int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    exofs_nfs_get_inode);
-}
-
-static const struct export_operations exofs_export_ops = {
-	.fh_to_dentry = exofs_fh_to_dentry,
-	.fh_to_parent = exofs_fh_to_parent,
-	.get_parent = exofs_get_parent,
-};
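
How knfsd exercises these hooks (illustrative flow):

/*
 *	exportfs_decode_fh()
 *	  -> exofs_fh_to_dentry() / exofs_fh_to_parent()
 *	       -> exofs_nfs_get_inode(): exofs_iget() + generation check,
 *	          so a recycled inode number yields -ESTALE, not the new file
 *	  -> exofs_get_parent() when a disconnected dentry must be
 *	     reconnected toward the dcache root
 */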
-
-/******************************************************************************
- * INSMOD/RMMOD
- *****************************************************************************/
-
-/*
- * struct that describes this file system
- */
-static struct file_system_type exofs_type = {
-	.owner          = THIS_MODULE,
-	.name           = "exofs",
-	.mount          = exofs_mount,
-	.kill_sb        = generic_shutdown_super,
-};
-MODULE_ALIAS_FS("exofs");
-
-static int __init init_exofs(void)
-{
-	int err;
-
-	err = init_inodecache();
-	if (err)
-		goto out;
-
-	err = register_filesystem(&exofs_type);
-	if (err)
-		goto out_d;
-
-	/* We don't fail if sysfs creation failed */
-	exofs_sysfs_init();
-
-	return 0;
-out_d:
-	destroy_inodecache();
-out:
-	return err;
-}
-
-static void __exit exit_exofs(void)
-{
-	exofs_sysfs_uninit();
-	unregister_filesystem(&exofs_type);
-	destroy_inodecache();
-}
-
-MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
-MODULE_DESCRIPTION("exofs");
-MODULE_LICENSE("GPL");
-
-module_init(init_exofs)
-module_exit(exit_exofs)
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
deleted file mode 100644
index 1f7d5e4..0000000
--- a/fs/exofs/sys.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (C) 2012
- * Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License 2 as published by
- * the Free Software Foundation.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the:
- *	Free Software Foundation <licensing@fsf.org>
- */
-
-#include <linux/kobject.h>
-#include <linux/device.h>
-
-#include "exofs.h"
-
-struct odev_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct exofs_dev *, char *);
-	ssize_t (*store)(struct exofs_dev *, const char *, size_t);
-};
-
-static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
-		char *buf)
-{
-	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
-	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
-	return a->show ? a->show(edp, buf) : 0;
-}
-
-static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
-		const char *buf, size_t len)
-{
-	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
-	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
-	return a->store ? a->store(edp, buf, len) : len;
-}
-
-static const struct sysfs_ops odev_attr_ops = {
-	.show  = odev_attr_show,
-	.store = odev_attr_store,
-};
-
-
-static struct kset *exofs_kset;
-
-static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
-{
-	struct osd_dev *odev = edp->ored.od;
-	const struct osd_dev_info *odi = osduld_device_info(odev);
-
-	return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
-}
-
-static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
-{
-	struct osd_dev *odev = edp->ored.od;
-	const struct osd_dev_info *odi = osduld_device_info(odev);
-
-	memcpy(buf, odi->systemid, odi->systemid_len);
-	return odi->systemid_len;
-}
-
-static ssize_t uri_show(struct exofs_dev *edp, char *buf)
-{
-	return snprintf(buf, edp->urilen, "%s", edp->uri);
-}
-
-static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
-{
-	uint8_t *new_uri;
-
-	edp->urilen = strlen(buf) + 1;
-	new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
-	if (new_uri == NULL)
-		return -ENOMEM;
-	edp->uri = new_uri;
-	strncpy(edp->uri, buf, edp->urilen);
-	return edp->urilen;
-}
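
The temporary new_uri above is the usual krealloc() idiom: assigning the result directly to edp->uri would leak the old buffer if the reallocation fails, since krealloc() returns NULL without freeing the original.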
-
-#define OSD_ATTR(name, mode, show, store) \
-	static struct odev_attr odev_attr_##name = \
-					__ATTR(name, mode, show, store)
-
-OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
-OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
-OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
-
-static struct attribute *odev_attrs[] = {
-	&odev_attr_osdname.attr,
-	&odev_attr_systemid.attr,
-	&odev_attr_uri.attr,
-	NULL,
-};
-
-static struct kobj_type odev_ktype = {
-	.default_attrs	= odev_attrs,
-	.sysfs_ops	= &odev_attr_ops,
-};
-
-static struct kobj_type uuid_ktype = {
-};
-
-void exofs_sysfs_dbg_print(void)
-{
-#ifdef CONFIG_EXOFS_DEBUG
-	struct kobject *k_name, *k_tmp;
-
-	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
-		printk(KERN_INFO "%s: name %s ref %d\n",
-			__func__, kobject_name(k_name),
-			(int)kref_read(&k_name->kref));
-	}
-#endif
-}
-/*
- * This function removes all kobjects under exofs_kset
- * At the end of it, exofs_kset kobject will have a refcount
- * of 1 which gets decremented only on exofs module unload
- */
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
-{
-	struct kobject *k_name, *k_tmp;
-	struct kobject *s_kobj = &sbi->s_kobj;
-
-	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
-		/* Remove all that are children of this SBI */
-		if (k_name->parent == s_kobj)
-			kobject_put(k_name);
-	}
-	kobject_put(s_kobj);
-}
-
-/*
- * This function creates sysfs entries to hold the current exofs cluster
- * instance (uniquely identified by osdname,pid tuple).
- * This function gets called once per exofs mount instance.
- */
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
-		       struct exofs_dt_device_info *dt_dev)
-{
-	struct kobject *s_kobj;
-	int retval = 0;
-	uint64_t pid = sbi->one_comp.obj.partition;
-
-	/* allocate new uuid dirent */
-	s_kobj = &sbi->s_kobj;
-	s_kobj->kset = exofs_kset;
-	retval = kobject_init_and_add(s_kobj, &uuid_ktype,
-			&exofs_kset->kobj,  "%s_%llx", dt_dev->osdname, pid);
-	if (retval) {
-		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
-			  "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
-{
-	struct kobject *d_kobj;
-	int retval = 0;
-
-	/* create osd device group which contains following attributes
-	 * osdname, systemid & uri
-	 */
-	d_kobj = &edev->ed_kobj;
-	d_kobj->kset = exofs_kset;
-	retval = kobject_init_and_add(d_kobj, &odev_ktype,
-			&sbi->s_kobj, "dev%u", edev->did);
-	if (retval) {
-		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
-				"device dev%u\n", edev->did);
-		return retval;
-	}
-	return 0;
-}
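
Putting the two registration functions together, the resulting sysfs tree looks like this (names taken from the format strings above):

/sys/fs/exofs/<osdname>_<pid>/
	dev0/osdname	(read-only)
	dev0/systemid	(read-only)
	dev0/uri	(read-write)
	dev1/...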
-
-int exofs_sysfs_init(void)
-{
-	exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
-	if (!exofs_kset) {
-		EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void exofs_sysfs_uninit(void)
-{
-	kset_unregister(exofs_kset);
-}
diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile
index d7c5d4d..a04a8af 100644
--- a/fs/exportfs/Makefile
+++ b/fs/exportfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the filesystem export support routines.
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 63707ab..2dd55b1 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Neil Brown 2002
  * Copyright (C) Christoph Hellwig 2007
@@ -6,7 +7,7 @@
  * and for mapping back from file handles to dentries.
  *
  * For details on why we do all the strange and hairy things in here
- * take a look at Documentation/filesystems/nfs/Exporting.
+ * take a look at Documentation/filesystems/nfs/exporting.rst.
  */
 #include <linux/exportfs.h>
 #include <linux/fs.h>
@@ -147,6 +148,7 @@
 	tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
 	if (IS_ERR(tmp)) {
 		dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+		err = PTR_ERR(tmp);
 		goto out_err;
 	}
 	if (tmp != dentry) {
@@ -517,26 +519,33 @@
 		 * inode is actually connected to the parent.
 		 */
 		err = exportfs_get_name(mnt, target_dir, nbuf, result);
-		if (!err) {
-			inode_lock(target_dir->d_inode);
-			nresult = lookup_one_len(nbuf, target_dir,
-						 strlen(nbuf));
-			inode_unlock(target_dir->d_inode);
-			if (!IS_ERR(nresult)) {
-				if (nresult->d_inode) {
-					dput(result);
-					result = nresult;
-				} else
-					dput(nresult);
-			}
+		if (err) {
+			dput(target_dir);
+			goto err_result;
 		}
 
+		inode_lock(target_dir->d_inode);
+		nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+		if (!IS_ERR(nresult)) {
+			if (unlikely(nresult->d_inode != result->d_inode)) {
+				dput(nresult);
+				nresult = ERR_PTR(-ESTALE);
+			}
+		}
+		inode_unlock(target_dir->d_inode);
 		/*
 		 * At this point we are done with the parent, but it's pinned
 		 * by the child dentry anyway.
 		 */
 		dput(target_dir);
 
+		if (IS_ERR(nresult)) {
+			err = PTR_ERR(nresult);
+			goto err_result;
+		}
+		dput(result);
+		result = nresult;
+
 		/*
 		 * And finally make sure the dentry is actually acceptable
 		 * to NFSD.
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 894e4c5..54eec91 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config EXT2_FS
 	tristate "Second extended fs support"
 	help
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 224c04a..cf4c77f 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -256,11 +256,15 @@
 	if (default_acl) {
 		error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
 		posix_acl_release(default_acl);
+	} else {
+		inode->i_default_acl = NULL;
 	}
 	if (acl) {
 		if (!error)
 			error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
 		posix_acl_release(acl);
+	} else {
+		inode->i_acl = NULL;
 	}
 	return error;
 }
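
For context on the two new else branches: i_acl and i_default_acl start out as the ACL_NOT_CACHED sentinel, so without this explicit NULL every later get_acl() would fall through to an xattr lookup; caching NULL records "no ACL here" once and skips that work.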
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 33db133..e0cc551 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -490,9 +490,7 @@
 	struct ext2_super_block * es = sbi->s_es;
 	unsigned freed = 0, group_freed;
 
-	if (block < le32_to_cpu(es->s_first_data_block) ||
-	    block + count < block ||
-	    block + count > le32_to_cpu(es->s_blocks_count)) {
+	if (!ext2_data_block_valid(sbi, block, count)) {
 		ext2_error (sb, "ext2_free_blocks",
 			    "Freeing blocks not in datazone - "
 			    "block = %lu, count = %lu", block, count);
@@ -1197,22 +1195,21 @@
 
 /*
  * Returns 1 if the passed-in block region is valid; 0 if some part overlaps
- * with filesystem metadata blocksi.
+ * with filesystem metadata blocks.
  */
 int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
 			  unsigned int count)
 {
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
-	    (start_blk + count < start_blk) ||
-	    (start_blk > le32_to_cpu(sbi->s_es->s_blocks_count)))
+	    (start_blk + count - 1 < start_blk) ||
+	    (start_blk + count - 1 >= le32_to_cpu(sbi->s_es->s_blocks_count)))
 		return 0;
 
 	/* Ensure we do not step over superblock */
 	if ((start_blk <= sbi->s_sb_block) &&
-	    (start_blk + count >= sbi->s_sb_block))
+	    (start_blk + count - 1 >= sbi->s_sb_block))
 		return 0;
 
-
 	return 1;
 }
 
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114d..13318e2 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@
 	return (char *)p - base;
 }
 
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
-	[EXT2_FT_UNKNOWN]	= DT_UNKNOWN,
-	[EXT2_FT_REG_FILE]	= DT_REG,
-	[EXT2_FT_DIR]		= DT_DIR,
-	[EXT2_FT_CHRDEV]	= DT_CHR,
-	[EXT2_FT_BLKDEV]	= DT_BLK,
-	[EXT2_FT_FIFO]		= DT_FIFO,
-	[EXT2_FT_SOCK]		= DT_SOCK,
-	[EXT2_FT_SYMLINK]	= DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= EXT2_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= EXT2_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= EXT2_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= EXT2_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= EXT2_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= EXT2_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= EXT2_FT_SYMLINK,
-};
-
 static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
 {
-	umode_t mode = inode->i_mode;
 	if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
-		de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+		de->file_type = fs_umode_to_ftype(inode->i_mode);
 	else
 		de->file_type = 0;
 }
@@ -293,14 +270,14 @@
 	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
-	unsigned char *types = NULL;
 	bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+	bool has_filetype;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
 		return 0;
 
-	if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
-		types = ext2_filetype_table;
+	has_filetype =
+		EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -335,8 +312,8 @@
 			if (de->inode) {
 				unsigned char d_type = DT_UNKNOWN;
 
-				if (types && de->file_type < EXT2_FT_MAX)
-					d_type = types[de->file_type];
+				if (has_filetype)
+					d_type = fs_ftype_to_dtype(de->file_type);
 
 				if (!dir_emit(ctx, de->name, de->name_len,
 						le32_to_cpu(de->inode),
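
fs_umode_to_ftype() and fs_ftype_to_dtype() are the shared helpers from include/linux/fs_types.h that replace the per-filesystem tables deleted above; roughly:

/*
 *	fs_umode_to_ftype(S_IFDIR | 0755)  -> FT_DIR    (on-disk tag, 2)
 *	fs_ftype_to_dtype(FT_DIR)          -> DT_DIR    (what dir_emit() wants)
 *	fs_ftype_to_dtype(FT_MAX or more)  -> DT_UNKNOWN (bounds-checked)
 */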
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 00e759f..10ab238 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -390,11 +390,7 @@
 #define EXT2_MOUNT_USRQUOTA		0x020000  /* user quota */
 #define EXT2_MOUNT_GRPQUOTA		0x040000  /* group quota */
 #define EXT2_MOUNT_RESERVATION		0x080000  /* Preallocation */
-#ifdef CONFIG_FS_DAX
 #define EXT2_MOUNT_DAX			0x100000  /* Direct Access */
-#else
-#define EXT2_MOUNT_DAX			0
-#endif
 
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
@@ -608,22 +604,6 @@
 };
 
 /*
- * Ext2 directory file types.  Only the low 3 bits are used.  The
- * other bits are reserved for now.
- */
-enum {
-	EXT2_FT_UNKNOWN		= 0,
-	EXT2_FT_REG_FILE	= 1,
-	EXT2_FT_DIR		= 2,
-	EXT2_FT_CHRDEV		= 3,
-	EXT2_FT_BLKDEV		= 4,
-	EXT2_FT_FIFO		= 5,
-	EXT2_FT_SOCK		= 6,
-	EXT2_FT_SYMLINK		= 7,
-	EXT2_FT_MAX
-};
-
-/*
  * EXT2_DIR_PAD defines the directory entries boundaries
  *
  * NOTE: It must be a multiple of 4
@@ -778,6 +758,7 @@
 extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609..39c4772 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@
 #ifdef CONFIG_EXT2_FS_XATTR
 	.listxattr	= ext2_listxattr,
 #endif
+	.getattr	= ext2_getattr,
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
 	.set_acl	= ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7..fda7d3f 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -172,9 +172,7 @@
 	struct backing_dev_info *bdi;
 
 	bdi = inode_to_bdi(inode);
-	if (bdi_read_congested(bdi))
-		return;
-	if (bdi_write_congested(bdi))
+	if (bdi_rw_congested(bdi))
 		return;
 
 	block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
@@ -222,8 +220,6 @@
 			best_desc = desc;
 		}
 	}
-	if (!best_desc)
-		return -1;
 
 	return best_group;
 }
@@ -513,6 +509,7 @@
 	/*
 	 * Scanned all blockgroups.
 	 */
+	brelse(bitmap_bh);
 	err = -ENOSPC;
 	goto fail;
 got:
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb938..7004ce5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -451,7 +451,9 @@
 /**
  *	ext2_alloc_branch - allocate and set up a chain of blocks.
  *	@inode: owner
- *	@num: depth of the chain (number of blocks to allocate)
+ *	@indirect_blks: depth of the chain (number of blocks to allocate)
+ *	@blks: number of allocated direct blocks
+ *	@goal: preferred place for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
@@ -717,7 +719,7 @@
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
 	/*
-	 * Next look up the indirect map to count the totoal number of
+	 * Next look up the indirect map to count the total number of
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1241,7 @@
 				mark_inode_dirty(inode);
 				ext2_free_branches(inode, &nr, &nr+1, 1);
 			}
+			/* fall through */
 		case EXT2_IND_BLOCK:
 			nr = i_data[EXT2_DIND_BLOCK];
 			if (nr) {
@@ -1246,6 +1249,7 @@
 				mark_inode_dirty(inode);
 				ext2_free_branches(inode, &nr, &nr+1, 2);
 			}
+			/* fall through */
 		case EXT2_DIND_BLOCK:
 			nr = i_data[EXT2_TIND_BLOCK];
 			if (nr) {
@@ -1396,7 +1400,7 @@
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 {
 	struct ext2_inode_info *ei;
-	struct buffer_head * bh;
+	struct buffer_head * bh = NULL;
 	struct ext2_inode *raw_inode;
 	struct inode *inode;
 	long ret = -EIO;
@@ -1442,7 +1446,6 @@
 	 */
 	if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
 		/* this inode is deleted */
-		brelse (bh);
 		ret = -ESTALE;
 		goto bad_inode;
 	}
@@ -1459,7 +1462,6 @@
 	    !ext2_data_block_valid(EXT2_SB(sb), ei->i_file_acl, 1)) {
 		ext2_error(sb, "ext2_iget", "bad extended attribute block %u",
 			   ei->i_file_acl);
-		brelse(bh);
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
@@ -1522,6 +1524,7 @@
 	return inode;
 	
 bad_inode:
+	brelse(bh);
 	iget_failed(inode);
 	return ERR_PTR(ret);
 }
@@ -1635,6 +1638,32 @@
 	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
+int ext2_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	unsigned int flags;
+
+	flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+	if (flags & EXT2_APPEND_FL)
+		stat->attributes |= STATX_ATTR_APPEND;
+	if (flags & EXT2_COMPR_FL)
+		stat->attributes |= STATX_ATTR_COMPRESSED;
+	if (flags & EXT2_IMMUTABLE_FL)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	if (flags & EXT2_NODUMP_FL)
+		stat->attributes |= STATX_ATTR_NODUMP;
+	stat->attributes_mask |= (STATX_ATTR_APPEND |
+			STATX_ATTR_COMPRESSED |
+			STATX_ATTR_ENCRYPTED |
+			STATX_ATTR_IMMUTABLE |
+			STATX_ATTR_NODUMP);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
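
A short user-space sketch (not part of the patch; the path is made up, and it assumes <stdio.h>, <fcntl.h> and <sys/stat.h> with glibc 2.28+) of how the new ->getattr surfaces these flags through statx(2):

	struct statx stx;

	if (statx(AT_FDCWD, "/mnt/ext2/file", 0, STATX_BASIC_STATS, &stx) == 0 &&
	    (stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE) &&
	    (stx.stx_attributes & STATX_ATTR_IMMUTABLE))
		printf("immutable\n");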
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 0367c00..1b853fb 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -60,18 +60,10 @@
 		}
 		oldflags = ei->i_flags;
 
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 *
-		 * This test looks nicer. Thanks to Pauline Middelink
-		 */
-		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				inode_unlock(inode);
-				ret = -EPERM;
-				goto setflags_out;
-			}
+		ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+		if (ret) {
+			inode_unlock(inode);
+			goto setflags_out;
 		}
 
 		flags = flags & EXT2_FL_USER_MODIFIABLE;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc..ccfbbf5 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@
 #ifdef CONFIG_EXT2_FS_XATTR
 	.listxattr	= ext2_listxattr,
 #endif
+	.getattr	= ext2_getattr,
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
 	.set_acl	= ext2_set_acl,
@@ -426,6 +427,7 @@
 #ifdef CONFIG_EXT2_FS_XATTR
 	.listxattr	= ext2_listxattr,
 #endif
+	.getattr	= ext2_getattr,
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
 	.set_acl	= ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0c38e31..30c630d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/ext2/super.c
  *
@@ -73,7 +74,7 @@
 
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT2-fs: panic from previous error\n");
-	if (test_opt(sb, ERRORS_RO)) {
+	if (!sb_rdonly(sb) && test_opt(sb, ERRORS_RO)) {
 		ext2_msg(sb, KERN_CRIT,
 			     "error: remounting filesystem read-only");
 		sb->s_flags |= SB_RDONLY;
@@ -148,10 +149,9 @@
 
 	ext2_quota_off_umount(sb);
 
-	if (sbi->s_ea_block_cache) {
-		ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
-		sbi->s_ea_block_cache = NULL;
-	}
+	ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+	sbi->s_ea_block_cache = NULL;
+
 	if (!sb_rdonly(sb)) {
 		struct ext2_super_block *es = sbi->s_es;
 
@@ -162,8 +162,7 @@
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
-		if (sbi->s_group_desc[i])
-			brelse (sbi->s_group_desc[i]);
+		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 	kfree(sbi->s_debts);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -193,17 +192,11 @@
 	return &ei->vfs_inode;
 }
 
-static void ext2_i_callback(struct rcu_head *head)
+static void ext2_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
-static void ext2_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, ext2_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
@@ -309,20 +302,17 @@
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
-#if defined(CONFIG_QUOTA)
-	if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
+	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
 
-	if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
+	if (test_opt(sb, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
-#endif
 
-#ifdef CONFIG_FS_DAX
-	if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
+	if (test_opt(sb, XIP))
 		seq_puts(seq, ",xip");
-	if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
+
+	if (test_opt(sb, DAX))
 		seq_puts(seq, ",dax");
-#endif
 
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
@@ -355,7 +345,7 @@
 
 static const struct super_operations ext2_sops = {
 	.alloc_inode	= ext2_alloc_inode,
-	.destroy_inode	= ext2_destroy_inode,
+	.free_inode	= ext2_free_in_core_inode,
 	.write_inode	= ext2_write_inode,
 	.evict_inode	= ext2_evict_inode,
 	.put_super	= ext2_put_super,
@@ -761,7 +751,8 @@
 {
 	loff_t res = EXT2_NDIR_BLOCKS;
 	int meta_blocks;
-	loff_t upper_limit;
+	unsigned int upper_limit;
+	unsigned int ppb = 1 << (bits-2);
 
 	/* This is calculated to be the largest file size for a
 	 * dense, file such that the total number of
@@ -775,24 +766,34 @@
 	/* total blocks in file system block size */
 	upper_limit >>= (bits - 9);
 
-
-	/* indirect blocks */
-	meta_blocks = 1;
-	/* double indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2));
-	/* tripple indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-	upper_limit -= meta_blocks;
-	upper_limit <<= bits;
-
+	/* Compute how many blocks we can address by block tree */
 	res += 1LL << (bits-2);
 	res += 1LL << (2*(bits-2));
 	res += 1LL << (3*(bits-2));
-	res <<= bits;
-	if (res > upper_limit)
-		res = upper_limit;
+	/* Does block tree limit file size? */
+	if (res < upper_limit)
+		goto check_lfs;
 
+	res = upper_limit;
+	/* How many metadata blocks are needed for addressing upper_limit? */
+	upper_limit -= EXT2_NDIR_BLOCKS;
+	/* indirect blocks */
+	meta_blocks = 1;
+	upper_limit -= ppb;
+	/* double indirect blocks */
+	if (upper_limit < ppb * ppb) {
+		meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+		res -= meta_blocks;
+		goto check_lfs;
+	}
+	meta_blocks += 1 + ppb;
+	upper_limit -= ppb * ppb;
+	/* triple indirect blocks for the rest */
+	meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+		DIV_ROUND_UP(upper_limit, ppb*ppb);
+	res -= meta_blocks;
+check_lfs:
+	res <<= bits;
 	if (res > MAX_LFS_FILESIZE)
 		res = MAX_LFS_FILESIZE;
 
@@ -933,8 +934,7 @@
 	sbi->s_resgid = opts.s_resgid;
 
 	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
-		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
-		 SB_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
 	sb->s_iflags |= SB_I_CGROUPWB;
 
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
@@ -965,11 +965,11 @@
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
-	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
+	if (test_opt(sb, DAX)) {
 		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext2_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
-			sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
+			clear_opt(sbi->s_mount_opt, DAX);
 		}
 	}
 
@@ -1001,6 +1001,8 @@
 
 	sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits);
 	sb->s_max_links = EXT2_LINK_MAX;
+	sb->s_time_min = S32_MIN;
+	sb->s_time_max = S32_MAX;
 
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {
 		sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE;
@@ -1028,8 +1030,6 @@
 	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 
-	if (EXT2_INODE_SIZE(sb) == 0)
-		goto cantfind_ext2;
 	sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
 		goto cantfind_ext2;
@@ -1091,12 +1091,14 @@
 					   sizeof(struct buffer_head *),
 					   GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
+		ret = -ENOMEM;
 		ext2_msg(sb, KERN_ERR, "error: not enough memory");
 		goto failed_mount;
 	}
 	bgl_lock_init(sbi->s_blockgroup_lock);
 	sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
 	if (!sbi->s_debts) {
+		ret = -ENOMEM;
 		ext2_msg(sb, KERN_ERR, "error: not enough memory");
 		goto failed_mount_group_desc;
 	}
@@ -1152,6 +1154,7 @@
 #ifdef CONFIG_EXT2_FS_XATTR
 	sbi->s_ea_block_cache = ext2_xattr_create_cache();
 	if (!sbi->s_ea_block_cache) {
+		ret = -ENOMEM;
 		ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
 		goto failed_mount3;
 	}
@@ -1201,8 +1204,7 @@
 			sb->s_id);
 	goto failed_mount;
 failed_mount3:
-	if (sbi->s_ea_block_cache)
-		ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+	ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1402,7 +1404,7 @@
 	sbi->s_resuid = new_opts.s_resuid;
 	sbi->s_resgid = new_opts.s_resgid;
 	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
-		((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
 	spin_unlock(&sbi->s_lock);
 
 	return 0;
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589dd..00cdb86 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
 
 const struct inode_operations ext2_symlink_inode_operations = {
 	.get_link	= page_get_link,
+	.getattr	= ext2_getattr,
 	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.listxattr	= ext2_listxattr,
@@ -31,6 +32,7 @@
  
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.get_link	= simple_get_link,
+	.getattr	= ext2_getattr,
 	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.listxattr	= ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index dd8f10d..0456bc9 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -134,6 +134,53 @@
 	return handler;
 }
 
+static bool
+ext2_xattr_header_valid(struct ext2_xattr_header *header)
+{
+	if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
+	    header->h_blocks != cpu_to_le32(1))
+		return false;
+
+	return true;
+}
+
+static bool
+ext2_xattr_entry_valid(struct ext2_xattr_entry *entry,
+		       char *end, size_t end_offs)
+{
+	struct ext2_xattr_entry *next;
+	size_t size;
+
+	next = EXT2_XATTR_NEXT(entry);
+	if ((char *)next >= end)
+		return false;
+
+	if (entry->e_value_block != 0)
+		return false;
+
+	size = le32_to_cpu(entry->e_value_size);
+	if (size > end_offs ||
+	    le16_to_cpu(entry->e_value_offs) + size > end_offs)
+		return false;
+
+	return true;
+}
+
+static int
+ext2_xattr_cmp_entry(int name_index, size_t name_len, const char *name,
+		     struct ext2_xattr_entry *entry)
+{
+	int cmp;
+
+	cmp = name_index - entry->e_name_index;
+	if (!cmp)
+		cmp = name_len - entry->e_name_len;
+	if (!cmp)
+		cmp = memcmp(name, entry->e_name, name_len);
+
+	return cmp;
+}
+
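
The three-key comparison above depends on ext2 keeping xattr entries sorted by (name_index, name_len, name); the lookup loops below use the sign of the result to stop early, roughly:

/*
 *	not_found = ext2_xattr_cmp_entry(name_index, name_len, name, entry);
 *	if (!not_found)		exact match - done
 *	if (not_found < 0)	walked past the insertion point - stop
 *	otherwise		keep scanning
 */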
 /*
  * ext2_xattr_get()
  *
@@ -152,7 +199,7 @@
 	struct ext2_xattr_entry *entry;
 	size_t name_len, size;
 	char *end;
-	int error;
+	int error, not_found;
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
 
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
@@ -176,9 +223,9 @@
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
-	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
-bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
+	if (!ext2_xattr_header_valid(HDR(bh))) {
+bad_block:
+		ext2_error(inode->i_sb, "ext2_xattr_get",
 			"inode %ld: bad block %d", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		error = -EIO;
@@ -188,29 +235,25 @@
 	/* find named attribute */
 	entry = FIRST_ENTRY(bh);
 	while (!IS_LAST_ENTRY(entry)) {
-		struct ext2_xattr_entry *next =
-			EXT2_XATTR_NEXT(entry);
-		if ((char *)next >= end)
+		if (!ext2_xattr_entry_valid(entry, end,
+		    inode->i_sb->s_blocksize))
 			goto bad_block;
-		if (name_index == entry->e_name_index &&
-		    name_len == entry->e_name_len &&
-		    memcmp(name, entry->e_name, name_len) == 0)
+
+		not_found = ext2_xattr_cmp_entry(name_index, name_len, name,
+						 entry);
+		if (!not_found)
 			goto found;
-		entry = next;
+		if (not_found < 0)
+			break;
+
+		entry = EXT2_XATTR_NEXT(entry);
 	}
 	if (ext2_xattr_cache_insert(ea_block_cache, bh))
 		ea_idebug(inode, "cache insert failed");
 	error = -ENODATA;
 	goto cleanup;
 found:
-	/* check the buffer size */
-	if (entry->e_value_block != 0)
-		goto bad_block;
 	size = le32_to_cpu(entry->e_value_size);
-	if (size > inode->i_sb->s_blocksize ||
-	    le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
-		goto bad_block;
-
 	if (ext2_xattr_cache_insert(ea_block_cache, bh))
 		ea_idebug(inode, "cache insert failed");
 	if (buffer) {
@@ -266,9 +309,9 @@
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
-	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
-bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
+	if (!ext2_xattr_header_valid(HDR(bh))) {
+bad_block:
+		ext2_error(inode->i_sb, "ext2_xattr_list",
 			"inode %ld: bad block %d", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		error = -EIO;
@@ -278,11 +321,10 @@
 	/* check the on-disk data structure */
 	entry = FIRST_ENTRY(bh);
 	while (!IS_LAST_ENTRY(entry)) {
-		struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry);
-
-		if ((char *)next >= end)
+		if (!ext2_xattr_entry_valid(entry, end,
+		    inode->i_sb->s_blocksize))
 			goto bad_block;
-		entry = next;
+		entry = EXT2_XATTR_NEXT(entry);
 	}
 	if (ext2_xattr_cache_insert(ea_block_cache, bh))
 		ea_idebug(inode, "cache insert failed");
@@ -342,6 +384,7 @@
 		return;
 
 	spin_lock(&EXT2_SB(sb)->s_lock);
+	ext2_update_dynamic_rev(sb);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
 	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
@@ -366,7 +409,7 @@
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bh = NULL;
 	struct ext2_xattr_header *header = NULL;
-	struct ext2_xattr_entry *here, *last;
+	struct ext2_xattr_entry *here = NULL, *last = NULL;
 	size_t name_len, free, min_offs = sb->s_blocksize;
 	int not_found = 1, error;
 	char *end;
@@ -405,47 +448,39 @@
 			le32_to_cpu(HDR(bh)->h_refcount));
 		header = HDR(bh);
 		end = bh->b_data + bh->b_size;
-		if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-		    header->h_blocks != cpu_to_le32(1)) {
-bad_block:		ext2_error(sb, "ext2_xattr_set",
+		if (!ext2_xattr_header_valid(header)) {
+bad_block:
+			ext2_error(sb, "ext2_xattr_set",
 				"inode %ld: bad block %d", inode->i_ino, 
 				   EXT2_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
-		/* Find the named attribute. */
-		here = FIRST_ENTRY(bh);
-		while (!IS_LAST_ENTRY(here)) {
-			struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
-			if ((char *)next >= end)
-				goto bad_block;
-			if (!here->e_value_block && here->e_value_size) {
-				size_t offs = le16_to_cpu(here->e_value_offs);
-				if (offs < min_offs)
-					min_offs = offs;
-			}
-			not_found = name_index - here->e_name_index;
-			if (!not_found)
-				not_found = name_len - here->e_name_len;
-			if (!not_found)
-				not_found = memcmp(name, here->e_name,name_len);
-			if (not_found <= 0)
-				break;
-			here = next;
-		}
-		last = here;
-		/* We still need to compute min_offs and last. */
+		/*
+		 * Find the named attribute. If not found, 'here' will point
+		 * to entry where the new attribute should be inserted to
+		 * maintain sorting.
+		 */
+		last = FIRST_ENTRY(bh);
 		while (!IS_LAST_ENTRY(last)) {
-			struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
-			if ((char *)next >= end)
+			if (!ext2_xattr_entry_valid(last, end, sb->s_blocksize))
 				goto bad_block;
-			if (!last->e_value_block && last->e_value_size) {
+			if (last->e_value_size) {
 				size_t offs = le16_to_cpu(last->e_value_offs);
 				if (offs < min_offs)
 					min_offs = offs;
 			}
-			last = next;
+			if (not_found > 0) {
+				not_found = ext2_xattr_cmp_entry(name_index,
+								 name_len,
+								 name, last);
+				if (not_found <= 0)
+					here = last;
+			}
+			last = EXT2_XATTR_NEXT(last);
 		}
+		if (not_found > 0)
+			here = last;
 
 		/* Check whether we have enough space left. */
 		free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
@@ -453,7 +488,6 @@
 		/* We will use a new extended attribute block. */
 		free = sb->s_blocksize -
 			sizeof(struct ext2_xattr_header) - sizeof(__u32);
-		here = last = NULL;  /* avoid gcc uninitialized warning. */
 	}
 
 	if (not_found) {
@@ -469,14 +503,7 @@
 		error = -EEXIST;
 		if (flags & XATTR_CREATE)
 			goto cleanup;
-		if (!here->e_value_block && here->e_value_size) {
-			size_t size = le32_to_cpu(here->e_value_size);
-
-			if (le16_to_cpu(here->e_value_offs) + size > 
-			    sb->s_blocksize || size > sb->s_blocksize)
-				goto bad_block;
-			free += EXT2_XATTR_SIZE(size);
-		}
+		free += EXT2_XATTR_SIZE(le32_to_cpu(here->e_value_size));
 		free += EXT2_XATTR_LEN(name_len);
 	}
 	error = -ENOSPC;
@@ -505,11 +532,10 @@
 
 			unlock_buffer(bh);
 			ea_bdebug(bh, "cloning");
-			header = kmalloc(bh->b_size, GFP_KERNEL);
+			header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
 			error = -ENOMEM;
 			if (header == NULL)
 				goto cleanup;
-			memcpy(header, HDR(bh), bh->b_size);
 			header->h_refcount = cpu_to_le32(1);
 
 			offset = (char *)here - bh->b_data;
@@ -541,7 +567,7 @@
 		here->e_name_len = name_len;
 		memcpy(here->e_name, name, name_len);
 	} else {
-		if (!here->e_value_block && here->e_value_size) {
+		if (here->e_value_size) {
 			char *first_val = (char *)header + min_offs;
 			size_t offs = le16_to_cpu(here->e_value_offs);
 			char *val = (char *)header + offs;
@@ -568,7 +594,7 @@
 			last = ENTRY(header+1);
 			while (!IS_LAST_ENTRY(last)) {
 				size_t o = le16_to_cpu(last->e_value_offs);
-				if (!last->e_value_block && o < offs)
+				if (o < offs)
 					last->e_value_offs =
 						cpu_to_le16(o + size);
 				last = EXT2_XATTR_NEXT(last);
@@ -768,7 +794,7 @@
 	if (!EXT2_I(inode)->i_file_acl)
 		goto cleanup;
 
-	if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 0)) {
+	if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 1)) {
 		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
 			"inode %ld: xattr block %d is out of data blocks range",
 			inode->i_ino, EXT2_I(inode)->i_file_acl);
@@ -783,8 +809,7 @@
 		goto cleanup;
 	}
 	ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
-	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
+	if (!ext2_xattr_header_valid(HDR(bh))) {
 		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
 			"inode %ld: bad block %d", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
@@ -835,7 +860,8 @@
 	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
 	int error;
 
-	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
+	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
+				      true);
 	if (error) {
 		if (error == -EBUSY) {
 			ea_bdebug(bh, "already in cache (%d cache entries)",
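
The cloning hunk above swaps an open-coded kmalloc() + memcpy() pair for kmemdup(), which allocates and copies in a single call and returns NULL on failure. A minimal sketch of the pattern (the function and variable names below are illustrative, not from this patch):

#include <linux/slab.h>
#include <linux/string.h>

/* Duplicate an in-memory xattr block before modifying a shared copy. */
static void *clone_xattr_block(const void *src, size_t len)
{
	/* one call replaces: dst = kmalloc(len, GFP_KERNEL); memcpy(dst, src, len); */
	return kmemdup(src, len, GFP_KERNEL);
}
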
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc8..cbb5ca8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 # Ext3 configs are here for backward compatibility with old configs which may
 # have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
 # kernels after the removal of ext3 driver.
@@ -96,23 +97,8 @@
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4_ENCRYPTION
-	bool "Ext4 Encryption"
-	depends on EXT4_FS
-	select FS_ENCRYPTION
-	help
-	  Enable encryption of ext4 files and directories.  This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
-	bool
-	default y
-	depends on EXT4_ENCRYPTION
-
 config EXT4_DEBUG
-	bool "EXT4 debugging support"
+	bool "Ext4 debugging support"
 	depends on EXT4_FS
 	help
 	  Enables run-time debugging support for the ext4 filesystem.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3..b17ddc2 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-$(CONFIG_FS_VERITY)		+= verity.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index fb50f9a..8c7bbf3 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -248,7 +248,8 @@
 		error = posix_acl_update_mode(inode, &mode, &acl);
 		if (error)
 			goto out_stop;
-		update_mode = 1;
+		if (mode != inode->i_mode)
+			update_mode = 1;
 	}
 
 	error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
@@ -284,12 +285,16 @@
 		error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
 				       default_acl, XATTR_CREATE);
 		posix_acl_release(default_acl);
+	} else {
+		inode->i_default_acl = NULL;
 	}
 	if (acl) {
 		if (!error)
 			error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
 					       acl, XATTR_CREATE);
 		posix_acl_release(acl);
+	} else {
+		inode->i_acl = NULL;
 	}
 	return error;
 }
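
The acl.c hunks lean on the calling convention of posix_acl_update_mode(): it may rewrite *mode (stripping setgid bits or folding the ACL into plain mode bits, in which case *acl comes back NULL), so the caller should only dirty the inode when the mode actually moved. A hedged sketch of that pattern (the helper name is invented):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Apply an access ACL, touching i_mode only if it really changed. */
static int apply_access_acl(struct inode *inode, struct posix_acl *acl)
{
	umode_t mode = inode->i_mode;
	int error = posix_acl_update_mode(inode, &mode, &acl);

	if (error)
		return error;
	/* acl may now be NULL: it was representable as plain mode bits */
	if (mode != inode->i_mode) {
		inode->i_mode = mode;
		inode->i_ctime = current_time(inode);
	}
	return 0;
}
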
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee6..0b202e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -603,9 +603,9 @@
 }
 
 /**
- * ext4_should_retry_alloc()
+ * ext4_should_retry_alloc() - check if a block allocation should be retried
  * @sb:			super block
- * @retries		number of attemps has been made
+ * @retries:		number of attempts that have been made
  *
  * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
  * it is profitable to retry the operation, this function will wait
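
The balloc.c hunk is purely a kernel-doc repair: scripts/kernel-doc keys on the "name() - summary" header line and on the colon after each @parameter. For reference, the shape it parses looks like the following (example_fn is hypothetical):

/**
 * example_fn() - one-line summary of what the function does
 * @sb:      super block being operated on
 * @retries: number of attempts already made
 *
 * Free-form description. Without the colon after @retries the
 * parameter would not be associated with its description.
 */
static int example_fn(struct super_block *sb, int retries)
{
	return 0;	/* placeholder body */
}
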
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 913061c..d4d4fdf 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@
 
 void ext4_exit_system_zone(void)
 {
+	rcu_barrier();
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
 
@@ -49,17 +50,26 @@
 	return 0;
 }
 
+static void release_system_zone(struct ext4_system_blocks *system_blks)
+{
+	struct ext4_system_zone	*entry, *n;
+
+	rbtree_postorder_for_each_entry_safe(entry, n,
+				&system_blks->root, node)
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+}
+
 /*
  * Mark a range of blocks as belonging to the "system zone" --- that
  * is, filesystem metadata blocks which should never be used by
  * inodes.
  */
-static int add_system_zone(struct ext4_sb_info *sbi,
+static int add_system_zone(struct ext4_system_blocks *system_blks,
 			   ext4_fsblk_t start_blk,
 			   unsigned int count)
 {
 	struct ext4_system_zone *new_entry = NULL, *entry;
-	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node **n = &system_blks->root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node = NULL;
 
 	while (*n) {
@@ -91,7 +101,7 @@
 		new_node = &new_entry->node;
 
 		rb_link_node(new_node, parent, n);
-		rb_insert_color(new_node, &sbi->system_blks);
+		rb_insert_color(new_node, &system_blks->root);
 	}
 
 	/* Can we merge to the left? */
@@ -101,7 +111,7 @@
 		if (can_merge(entry, new_entry)) {
 			new_entry->start_blk = entry->start_blk;
 			new_entry->count += entry->count;
-			rb_erase(node, &sbi->system_blks);
+			rb_erase(node, &system_blks->root);
 			kmem_cache_free(ext4_system_zone_cachep, entry);
 		}
 	}
@@ -112,7 +122,7 @@
 		entry = rb_entry(node, struct ext4_system_zone, node);
 		if (can_merge(new_entry, entry)) {
 			new_entry->count += entry->count;
-			rb_erase(node, &sbi->system_blks);
+			rb_erase(node, &system_blks->root);
 			kmem_cache_free(ext4_system_zone_cachep, entry);
 		}
 	}
@@ -126,7 +136,7 @@
 	int first = 1;
 
 	printk(KERN_INFO "System zones: ");
-	node = rb_first(&sbi->system_blks);
+	node = rb_first(&sbi->system_blks->root);
 	while (node) {
 		entry = rb_entry(node, struct ext4_system_zone, node);
 		printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -137,68 +147,18 @@
 	printk(KERN_CONT "\n");
 }
 
-int ext4_setup_system_zone(struct super_block *sb)
-{
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_desc *gdp;
-	ext4_group_t i;
-	int flex_size = ext4_flex_bg_size(sbi);
-	int ret;
-
-	if (!test_opt(sb, BLOCK_VALIDITY)) {
-		if (sbi->system_blks.rb_node)
-			ext4_release_system_zone(sb);
-		return 0;
-	}
-	if (sbi->system_blks.rb_node)
-		return 0;
-
-	for (i=0; i < ngroups; i++) {
-		if (ext4_bg_has_super(sb, i) &&
-		    ((i < 5) || ((i % flex_size) == 0)))
-			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-					ext4_bg_num_gdb(sb, i) + 1);
-		gdp = ext4_get_group_desc(sb, i, NULL);
-		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
-		if (ret)
-			return ret;
-		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
-		if (ret)
-			return ret;
-		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
-				sbi->s_itb_per_group);
-		if (ret)
-			return ret;
-	}
-
-	if (test_opt(sb, DEBUG))
-		debug_print_tree(sbi);
-	return 0;
-}
-
-/* Called when the filesystem is unmounted */
-void ext4_release_system_zone(struct super_block *sb)
-{
-	struct ext4_system_zone	*entry, *n;
-
-	rbtree_postorder_for_each_entry_safe(entry, n,
-			&EXT4_SB(sb)->system_blks, node)
-		kmem_cache_free(ext4_system_zone_cachep, entry);
-
-	EXT4_SB(sb)->system_blks = RB_ROOT;
-}
-
 /*
  * Returns 1 if the passed-in block region (start_blk,
  * start_blk+count) is valid; 0 if some part of the block region
  * overlaps with filesystem metadata blocks.
  */
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
-			  unsigned int count)
+static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
+				     struct ext4_system_blocks *system_blks,
+				     ext4_fsblk_t start_blk,
+				     unsigned int count)
 {
 	struct ext4_system_zone *entry;
-	struct rb_node *n = sbi->system_blks.rb_node;
+	struct rb_node *n;
 
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
 	    (start_blk + count < start_blk) ||
@@ -206,6 +166,11 @@
 		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
 		return 0;
 	}
+
+	if (system_blks == NULL)
+		return 1;
+
+	n = system_blks->root.rb_node;
 	while (n) {
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		if (start_blk + count - 1 < entry->start_blk)
@@ -220,6 +185,178 @@
 	return 1;
 }
 
+static int ext4_protect_reserved_inode(struct super_block *sb,
+				       struct ext4_system_blocks *system_blks,
+				       u32 ino)
+{
+	struct inode *inode;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_map_blocks map;
+	u32 i = 0, num;
+	int err = 0, n;
+
+	if ((ino < EXT4_ROOT_INO) ||
+	    (ino > le32_to_cpu(sbi->s_es->s_inodes_count)))
+		return -EINVAL;
+	inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	while (i < num) {
+		map.m_lblk = i;
+		map.m_len = num - i;
+		n = ext4_map_blocks(NULL, inode, &map, 0);
+		if (n < 0) {
+			err = n;
+			break;
+		}
+		if (n == 0) {
+			i++;
+		} else {
+			if (!ext4_data_block_valid_rcu(sbi, system_blks,
+						map.m_pblk, n)) {
+				ext4_error(sb, "blocks %llu-%llu from inode %u "
+					   "overlap system zone", map.m_pblk,
+					   map.m_pblk + map.m_len - 1, ino);
+				err = -EFSCORRUPTED;
+				break;
+			}
+			err = add_system_zone(system_blks, map.m_pblk, n);
+			if (err < 0)
+				break;
+			i += n;
+		}
+	}
+	iput(inode);
+	return err;
+}
+
+static void ext4_destroy_system_zone(struct rcu_head *rcu)
+{
+	struct ext4_system_blocks *system_blks;
+
+	system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
+	release_system_zone(system_blks);
+	kfree(system_blks);
+}
+
+/*
+ * Build system zone rbtree which is used for block validity checking.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. That's why we first build the rbtree and then
+ * swap it in place.
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_system_blocks *system_blks;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (sbi->system_blks)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (sbi->system_blks)
+		return 0;
+
+	system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
+	if (!system_blks)
+		return -ENOMEM;
+
+	for (i=0; i < ngroups; i++) {
+		cond_resched();
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0)))
+			add_system_zone(system_blks,
+					ext4_group_first_block_no(sb, i),
+					ext4_bg_num_gdb(sb, i) + 1);
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(system_blks,
+				ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			goto err;
+		ret = add_system_zone(system_blks,
+				ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			goto err;
+		ret = add_system_zone(system_blks,
+				ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			goto err;
+	}
+	if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
+		ret = ext4_protect_reserved_inode(sb, system_blks,
+				le32_to_cpu(sbi->s_es->s_journal_inum));
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * System blks rbtree complete, announce it once to prevent racing
+	 * with ext4_data_block_valid() accessing the rbtree at the same
+	 * time.
+	 */
+	rcu_assign_pointer(sbi->system_blks, system_blks);
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(sbi);
+	return 0;
+err:
+	release_system_zone(system_blks);
+	kfree(system_blks);
+	return ret;
+}
+
+/*
+ * Called when the filesystem is unmounted or when remounting it with
+ * noblock_validity specified.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. So we first clear the system_blks pointer and
+ * then free the rbtree only after RCU grace period expires.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+	struct ext4_system_blocks *system_blks;
+
+	system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+					lockdep_is_held(&sb->s_umount));
+	rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+
+	if (system_blks)
+		call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
+}
+
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_blocks *system_blks;
+	int ret;
+
+	/*
+	 * Lock the system zone to prevent it from being released
+	 * concurrently during a remount that inverts the current
+	 * "[no]block_validity" mount option.
+	 */
+	rcu_read_lock();
+	system_blks = rcu_dereference(sbi->system_blks);
+	ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
+					count);
+	rcu_read_unlock();
+	return ret;
+}
+
 int ext4_check_blockref(const char *function, unsigned int line,
 			struct inode *inode, __le32 *p, unsigned int max)
 {
@@ -227,6 +364,11 @@
 	__le32 *bref = p;
 	unsigned int blk;
 
+	if (ext4_has_feature_journal(inode->i_sb) &&
+	    (inode->i_ino ==
+	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+		return 0;
+
 	while (bref < p+max) {
 		blk = le32_to_cpu(*bref++);
 		if (blk &&
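
Taken together, the block_validity.c changes convert sbi->system_blks into an RCU-managed pointer: the replacement rbtree is built in private, published with rcu_assign_pointer(), read under rcu_read_lock(), and the old tree is freed only after a grace period via call_rcu() (with rcu_barrier() in the exit path so no callback outlives the slab cache). A stripped-down sketch of that lifecycle, with an invented struct standing in for the rbtree:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Invented stand-in for the system-zone rbtree. */
struct blob {
	int payload;
	struct rcu_head rcu;
};

static struct blob __rcu *cur_blob;

/* Update side: build 'b' fully in private, then publish atomically. */
static void blob_publish(struct blob *b)
{
	rcu_assign_pointer(cur_blob, b);
}

/* Read side: the pointer stays valid for the whole critical section. */
static int blob_read(void)
{
	struct blob *b;
	int ret = 0;

	rcu_read_lock();
	b = rcu_dereference(cur_blob);
	if (b)
		ret = b->payload;
	rcu_read_unlock();
	return ret;
}

static void blob_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct blob, rcu));
}

/* Retire side: unpublish first, free only after a grace period. */
static void blob_retire(void)
{
	struct blob *b = rcu_dereference_protected(cur_blob, 1);

	rcu_assign_pointer(cur_blob, NULL);
	if (b)
		call_rcu(&b->rcu, blob_free_rcu);
}
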
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f93f988..9fdd2b2 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -26,12 +26,16 @@
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/unicode.h>
 #include "ext4.h"
 #include "xattr.h"
 
 static int ext4_dx_readdir(struct file *, struct dir_context *);
 
 /**
+ * is_dx_dir() - check if a directory is using htree indexing
+ * @inode: directory inode
+ *
  * Check if the given dir-inode refers to an htree-indexed directory
  * (or a directory which could potentially get converted to use htree
  * indexing).
@@ -108,10 +112,9 @@
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bh = NULL;
-	int dir_has_error = 0;
 	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
 
-	if (ext4_encrypted_inode(inode)) {
+	if (IS_ENCRYPTED(inode)) {
 		err = fscrypt_get_encryption_info(inode);
 		if (err && err != -ENOKEY)
 			return err;
@@ -138,14 +141,12 @@
 			return err;
 	}
 
-	if (ext4_encrypted_inode(inode)) {
+	if (IS_ENCRYPTED(inode)) {
 		err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
 		if (err < 0)
 			return err;
 	}
 
-	offset = ctx->pos & (sb->s_blocksize - 1);
-
 	while (ctx->pos < inode->i_size) {
 		struct ext4_map_blocks map;
 
@@ -154,9 +155,18 @@
 			goto errout;
 		}
 		cond_resched();
+		offset = ctx->pos & (sb->s_blocksize - 1);
 		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
 		map.m_len = 1;
 		err = ext4_map_blocks(NULL, inode, &map, 0);
+		if (err == 0) {
+			/* m_len should never be zero but let's avoid
+			 * an infinite loop if it somehow is */
+			if (map.m_len == 0)
+				map.m_len = 1;
+			ctx->pos += map.m_len * sb->s_blocksize;
+			continue;
+		}
 		if (err > 0) {
 			pgoff_t index = map.m_pblk >>
 					(PAGE_SHIFT - inode->i_blkbits);
@@ -175,13 +185,6 @@
 		}
 
 		if (!bh) {
-			if (!dir_has_error) {
-				EXT4_ERROR_FILE(file, 0,
-						"directory contains a "
-						"hole at offset %llu",
-					   (unsigned long long) ctx->pos);
-				dir_has_error = 1;
-			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (ctx->pos > inode->i_blocks << 9)
 				break;
@@ -191,8 +194,7 @@
 
 		/* Check the checksum */
 		if (!buffer_verified(bh) &&
-		    !ext4_dirent_csum_verify(inode,
-				(struct ext4_dir_entry *)bh->b_data)) {
+		    !ext4_dirblock_csum_verify(inode, bh)) {
 			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
 					"at offset %llu",
 					(unsigned long long)ctx->pos);
@@ -245,7 +247,7 @@
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
-				if (!ext4_encrypted_inode(inode)) {
+				if (!IS_ENCRYPTED(inode)) {
 					if (!dir_emit(ctx, de->name,
 					    de->name_len,
 					    le32_to_cpu(de->inode),
@@ -283,9 +285,7 @@
 done:
 	err = 0;
 errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
 	fscrypt_fname_free_buffer(&fstr);
-#endif
 	brelse(bh);
 	return err;
 }
@@ -613,7 +613,7 @@
 
 static int ext4_dir_open(struct inode * inode, struct file * filp)
 {
-	if (ext4_encrypted_inode(inode))
+	if (IS_ENCRYPTED(inode))
 		return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
 	return 0;
 }
@@ -662,3 +662,51 @@
 	.open		= ext4_dir_open,
 	.release	= ext4_release_dir,
 };
+
+#ifdef CONFIG_UNICODE
+static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
+			  const char *str, const struct qstr *name)
+{
+	struct qstr qstr = {.name = str, .len = len };
+	struct inode *inode = dentry->d_parent->d_inode;
+
+	if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
+		if (len != name->len)
+			return -1;
+		return memcmp(str, name->name, len);
+	}
+
+	return ext4_ci_compare(inode, name, &qstr, false);
+}
+
+static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
+{
+	const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
+	const struct unicode_map *um = sbi->s_encoding;
+	unsigned char *norm;
+	int len, ret = 0;
+
+	if (!IS_CASEFOLDED(dentry->d_inode) || !um)
+		return 0;
+
+	norm = kmalloc(PATH_MAX, GFP_ATOMIC);
+	if (!norm)
+		return -ENOMEM;
+
+	len = utf8_casefold(um, str, norm, PATH_MAX);
+	if (len < 0) {
+		if (ext4_has_strict_mode(sbi))
+			ret = -EINVAL;
+		goto out;
+	}
+	str->hash = full_name_hash(dentry, norm, len);
+out:
+	kfree(norm);
+	return ret;
+}
+
+const struct dentry_operations ext4_dentry_ops = {
+	.d_hash = ext4_d_hash,
+	.d_compare = ext4_d_compare,
+};
+#endif
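
ext4_d_hash() and ext4_d_compare() only take effect once the superblock points at them. That wiring is not part of this hunk; presumably it happens in ext4's fill_super path, roughly like the sketch below (an assumption for illustration, not the actual super.c change):

#include "ext4.h"

/* Mount-time wiring for the casefolding dentry ops (illustrative). */
static void example_install_dentry_ops(struct super_block *sb)
{
#ifdef CONFIG_UNICODE
	if (EXT4_SB(sb)->s_encoding)
		sb->s_d_op = &ext4_dentry_ops;	/* inherited by new dentries */
#endif
}
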
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5cfb1e2..03db3e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,20 +40,11 @@
 #include <linux/compat.h>
 #endif
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #include <linux/compiler.h>
 
-/* Until this gets included into linux/compiler-gcc.h */
-#ifndef __nonstring
-#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
-#define __nonstring __attribute__((nonstring))
-#else
-#define __nonstring
-#endif
-#endif
-
 /*
  * The fourth extended filesystem constants/structures
  */
@@ -195,6 +186,14 @@
 };
 
 /*
+ * Block validity checking, system zone rbtree.
+ */
+struct ext4_system_blocks {
+	struct rb_root root;
+	struct rcu_head rcu;
+};
+
+/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
@@ -294,6 +293,9 @@
 				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
 #define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
 				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Fill in the low bits to get the last block of the cluster */
+#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) |				\
+				    ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
 /* Get the cluster offset */
 #define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
 				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -405,14 +407,16 @@
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL			0x00100000 /* Verity protected inode */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
 #define EXT4_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
+#define EXT4_CASEFOLD_FL		0x40000000 /* Casefolded file */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x204BC0FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x705BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x604BC0FF /* User modifiable flags */
 
 /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
 #define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
@@ -427,14 +431,18 @@
 			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
 			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
 			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
-			   EXT4_PROJINHERIT_FL)
+			   EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
+			   EXT4_PROJINHERIT_FL))
 
 /* Flags that are appropriate for non-directories/regular files. */
 #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
 
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 {
@@ -472,6 +480,7 @@
 	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
 	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
 	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_VERITY	= 20,	/* Verity protected inode */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
 	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
@@ -517,6 +526,7 @@
 	CHECK_FLAG_VALUE(TOPDIR);
 	CHECK_FLAG_VALUE(HUGE_FILE);
 	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(VERITY);
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
 	CHECK_FLAG_VALUE(INLINE_DATA);
@@ -628,6 +638,7 @@
 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE		0x0008
 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER	0x0010
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
+#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040
 
 /*
  * ioctl commands
@@ -653,6 +664,10 @@
 #define EXT4_IOC_SET_ENCRYPTION_POLICY	FS_IOC_SET_ENCRYPTION_POLICY
 #define EXT4_IOC_GET_ENCRYPTION_PWSALT	FS_IOC_GET_ENCRYPTION_PWSALT
 #define EXT4_IOC_GET_ENCRYPTION_POLICY	FS_IOC_GET_ENCRYPTION_POLICY
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
+#define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
 
 #define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
@@ -666,6 +681,16 @@
 #define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
 #define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
 
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED	0x00000001
+#define EXT4_STATE_FLAG_NEW		0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY	0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -683,6 +708,12 @@
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE		0x08000000
+
 /* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
@@ -812,31 +843,20 @@
 static inline void ext4_decode_extra_time(struct timespec64 *time,
 					  __le32 extra)
 {
-	if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
-
-#if 1
-		/* Handle legacy encoding of pre-1970 dates with epoch
-		 * bits 1,1. (This backwards compatibility may be removed
-		 * at the discretion of the ext4 developers.)
-		 */
-		u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
-		if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
-			extra_bits = 0;
-		time->tv_sec += extra_bits << 32;
-#else
+	if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
 		time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
-#endif
-	}
 	time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
 }
 
 #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)				\
 do {										\
-	(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);		\
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     {\
+		(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);	\
 		(raw_inode)->xtime ## _extra =					\
 				ext4_encode_extra_time(&(inode)->xtime);	\
 		}								\
+	else	\
+		(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX));	\
 } while (0)
 
 #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)			       \
@@ -1030,6 +1050,9 @@
 	ext4_lblk_t i_da_metadata_calc_last_lblock;
 	int i_da_metadata_calc_len;
 
+	/* pending cluster reservations for bigalloc file systems */
+	struct ext4_pending_tree i_pending_tree;
+
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
@@ -1316,7 +1339,9 @@
 	__u8	s_first_error_time_hi;
 	__u8	s_last_error_time_hi;
 	__u8	s_pad[2];
-	__le32	s_reserved[96];		/* Padding to the end of the block */
+	__le16  s_encoding;		/* Filename charset encoding */
+	__le16  s_encoding_flags;	/* Filename charset encoding flags */
+	__le32	s_reserved[95];		/* Padding to the end of the block */
 	__le32	s_checksum;		/* crc32c(superblock) */
 };
 
@@ -1331,7 +1356,7 @@
 #define EXT4_MF_FS_ABORTED		0x0002	/* Fatal error detected */
 #define EXT4_MF_TEST_DUMMY_ENCRYPTION	0x0004
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
 						EXT4_MF_TEST_DUMMY_ENCRYPTION))
 #else
@@ -1341,6 +1366,16 @@
 /* Number of quota types we support */
 #define EXT4_MAXQUOTAS 3
 
+#define EXT4_ENC_UTF8_12_1	1
+
+/*
+ * Flags for ext4_sb_info.s_encoding_flags.
+ */
+#define EXT4_ENC_STRICT_MODE_FL	(1 << 0)
+
+#define ext4_has_strict_mode(sbi) \
+	(sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1390,6 +1425,10 @@
 	struct kobject s_kobj;
 	struct completion s_kobj_unregister;
 	struct super_block *s_sb;
+#ifdef CONFIG_UNICODE
+	struct unicode_map *s_encoding;
+	__u16 s_encoding_flags;
+#endif
 
 	/* Journaling */
 	struct journal_s *s_journal;
@@ -1406,7 +1445,7 @@
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-	struct rb_root system_blks;
+	struct ext4_system_blocks __rcu *system_blks;
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
@@ -1545,6 +1584,7 @@
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
+	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -1595,9 +1635,12 @@
 #define EXT4_SB(sb)	(sb)
 #endif
 
-/*
- * Returns true if the inode is inode is encrypted
- */
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+	return IS_ENABLED(CONFIG_FS_VERITY) &&
+	       ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
 #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
 
 /*
@@ -1620,6 +1663,10 @@
 
 #define EXT4_GOOD_OLD_INODE_SIZE 128
 
+#define EXT4_EXTRA_TIMESTAMP_MAX	(((s64)1 << 34) - 1  + S32_MIN)
+#define EXT4_NON_EXTRA_TIMESTAMP_MAX	S32_MAX
+#define EXT4_TIMESTAMP_MIN		S32_MIN
+
 /*
  * Feature set definitions
  */
@@ -1650,6 +1697,7 @@
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
 #define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
 #define EXT4_FEATURE_RO_COMPAT_PROJECT		0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY		0x8000
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1666,6 +1714,9 @@
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 #define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
+#define EXT4_FEATURE_INCOMPAT_CASEFOLD		0x20000
+
+extern void ext4_update_dynamic_rev(struct super_block *sb);
 
 #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
 static inline bool ext4_has_feature_##name(struct super_block *sb) \
@@ -1675,6 +1726,7 @@
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_compat |= \
 		cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
 } \
@@ -1692,6 +1744,7 @@
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
 		cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
 } \
@@ -1709,6 +1762,7 @@
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_incompat |= \
 		cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
 } \
@@ -1738,6 +1792,7 @@
 EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,	METADATA_CSUM)
 EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,		READONLY)
 EXT4_FEATURE_RO_COMPAT_FUNCS(project,		PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity,		VERITY)
 
 EXT4_FEATURE_INCOMPAT_FUNCS(compression,	COMPRESSION)
 EXT4_FEATURE_INCOMPAT_FUNCS(filetype,		FILETYPE)
@@ -1754,6 +1809,7 @@
 EXT4_FEATURE_INCOMPAT_FUNCS(largedir,		LARGEDIR)
 EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,	INLINE_DATA)
 EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
+EXT4_FEATURE_INCOMPAT_FUNCS(casefold,		CASEFOLD)
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1781,6 +1837,7 @@
 					 EXT4_FEATURE_INCOMPAT_MMP | \
 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+					 EXT4_FEATURE_INCOMPAT_CASEFOLD | \
 					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
 					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
@@ -1793,7 +1850,8 @@
 					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
 					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
 					 EXT4_FEATURE_RO_COMPAT_QUOTA |\
-					 EXT4_FEATURE_RO_COMPAT_PROJECT)
+					 EXT4_FEATURE_RO_COMPAT_PROJECT |\
+					 EXT4_FEATURE_RO_COMPAT_VERITY)
 
 #define EXTN_FEATURE_FUNCS(ver) \
 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -2022,7 +2080,6 @@
 	BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
 
 	desc.shash.tfm = sbi->s_chksum_driver;
-	desc.shash.flags = 0;
 	*(u32 *)desc.ctx = crc;
 
 	BUG_ON(crypto_shash_update(&desc.shash, address, length));
@@ -2056,9 +2113,12 @@
 	const struct qstr *usr_fname;
 	struct fscrypt_str disk_name;
 	struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_str crypto_buf;
 #endif
+#ifdef CONFIG_UNICODE
+	struct fscrypt_str cf_name;
+#endif
 };
 
 #define fname_name(p) ((p)->disk_name.name)
@@ -2284,29 +2344,62 @@
 					      struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
-static inline bool ext4_encrypted_inode(struct inode *inode)
+#ifdef CONFIG_UNICODE
+extern void ext4_fname_setup_ci_filename(struct inode *dir,
+					 const struct qstr *iname,
+					 struct fscrypt_str *fname);
+#endif
+
+#ifdef CONFIG_FS_ENCRYPTION
+static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+						const struct fscrypt_name *src)
 {
-	return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
+	memset(dst, 0, sizeof(*dst));
+
+	dst->usr_fname = src->usr_fname;
+	dst->disk_name = src->disk_name;
+	dst->hinfo.hash = src->hash;
+	dst->hinfo.minor_hash = src->minor_hash;
+	dst->crypto_buf = src->crypto_buf;
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
 static inline int ext4_fname_setup_filename(struct inode *dir,
-			const struct qstr *iname,
-			int lookup, struct ext4_filename *fname)
+					    const struct qstr *iname,
+					    int lookup,
+					    struct ext4_filename *fname)
 {
 	struct fscrypt_name name;
 	int err;
 
-	memset(fname, 0, sizeof(struct ext4_filename));
-
 	err = fscrypt_setup_filename(dir, iname, lookup, &name);
+	if (err)
+		return err;
 
-	fname->usr_fname = name.usr_fname;
-	fname->disk_name = name.disk_name;
-	fname->hinfo.hash = name.hash;
-	fname->hinfo.minor_hash = name.minor_hash;
-	fname->crypto_buf = name.crypto_buf;
-	return err;
+	ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+	return 0;
+}
+
+static inline int ext4_fname_prepare_lookup(struct inode *dir,
+					    struct dentry *dentry,
+					    struct ext4_filename *fname)
+{
+	struct fscrypt_name name;
+	int err;
+
+	err = fscrypt_prepare_lookup(dir, dentry, &name);
+	if (err)
+		return err;
+
+	ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
+#endif
+	return 0;
 }
 
 static inline void ext4_fname_free_filename(struct ext4_filename *fname)
@@ -2319,20 +2412,44 @@
 	fname->crypto_buf.name = NULL;
 	fname->usr_fname = NULL;
 	fname->disk_name.name = NULL;
+
+#ifdef CONFIG_UNICODE
+	kfree(fname->cf_name.name);
+	fname->cf_name.name = NULL;
+#endif
 }
-#else
+#else /* !CONFIG_FS_ENCRYPTION */
 static inline int ext4_fname_setup_filename(struct inode *dir,
-		const struct qstr *iname,
-		int lookup, struct ext4_filename *fname)
+					    const struct qstr *iname,
+					    int lookup,
+					    struct ext4_filename *fname)
 {
 	fname->usr_fname = iname;
 	fname->disk_name.name = (unsigned char *) iname->name;
 	fname->disk_name.len = iname->len;
+
+#ifdef CONFIG_UNICODE
+	ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+
 	return 0;
 }
-static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
 
+static inline int ext4_fname_prepare_lookup(struct inode *dir,
+					    struct dentry *dentry,
+					    struct ext4_filename *fname)
+{
+	return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
+}
+
+static inline void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+#ifdef CONFIG_UNICODE
+	kfree(fname->cf_name.name);
+	fname->cf_name.name = NULL;
 #endif
+}
+#endif /* !CONFIG_FS_ENCRYPTION */
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -2380,8 +2497,8 @@
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
 
 /* hash.c */
-extern int ext4fs_dirhash(const char *name, int len, struct
-			  dx_hash_info *hinfo);
+extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
+			  struct dx_hash_info *hinfo);
 
 /* ialloc.c */
 extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
@@ -2459,8 +2576,19 @@
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
 
-extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
+typedef enum {
+	EXT4_IGET_NORMAL =	0,
+	EXT4_IGET_SPECIAL =	0x0001, /* OK to iget a system inode */
+	EXT4_IGET_HANDLE = 	0x0002	/* Inode # is from a handle */
+} ext4_iget_flags;
+
+extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+				 ext4_iget_flags flags, const char *function,
+				 unsigned int line);
+
+#define ext4_iget(sb, ino, flags) \
+	__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
+
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -2484,10 +2612,11 @@
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
-extern int ext4_page_mkwrite(struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_fault *vmf);
+extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
+extern void ext4_da_release_space(struct inode *inode, int to_free);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -2511,8 +2640,8 @@
 extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
-extern int ext4_dirent_csum_verify(struct inode *inode,
-				   struct ext4_dir_entry *dirent);
+extern int ext4_dirblock_csum_verify(struct inode *inode,
+				     struct buffer_head *bh);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -2542,6 +2671,8 @@
 extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
+extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
+					 sector_t block, int op_flags);
 extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
 extern int ext4_calculate_overhead(struct super_block *sb);
 extern void ext4_superblock_csum_set(struct super_block *sb);
@@ -2663,7 +2794,6 @@
 
 #endif
 
-extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -2964,6 +3094,10 @@
 /* dir.c */
 extern const struct file_operations ext4_dir_operations;
 
+#ifdef CONFIG_UNICODE
+extern const struct dentry_operations ext4_dentry_ops;
+#endif
+
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
@@ -3008,11 +3142,11 @@
 extern int ext4_read_inline_dir(struct file *filp,
 				struct dir_context *ctx,
 				int *has_inline_data);
-extern int htree_inlinedir_to_tree(struct file *dir_file,
-				   struct inode *dir, ext4_lblk_t block,
-				   struct dx_hash_info *hinfo,
-				   __u32 start_hash, __u32 start_minor_hash,
-				   int *has_inline_data);
+extern int ext4_inlinedir_to_tree(struct file *dir_file,
+				  struct inode *dir, ext4_lblk_t block,
+				  struct dx_hash_info *hinfo,
+				  __u32 start_hash, __u32 start_minor_hash,
+				  int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 					struct ext4_filename *fname,
 					struct ext4_dir_entry_2 **res_dir,
@@ -3051,11 +3185,14 @@
 				 struct ext4_dir_entry_2 *de,
 				 int blocksize, int csum_size,
 				 unsigned int parent_ino, int dotdot_real_len);
-extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-				   unsigned int blocksize);
-extern int ext4_handle_dirty_dirent_node(handle_t *handle,
-					 struct inode *inode,
-					 struct buffer_head *bh);
+extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
+					unsigned int blocksize);
+extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
+				      struct buffer_head *bh);
+extern int ext4_ci_compare(const struct inode *parent,
+			   const struct qstr *fname,
+			   const struct qstr *entry, bool quick);
+
 #define S_SHIFT 12
 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
 	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
@@ -3078,6 +3215,8 @@
 extern int ext4_mpage_readpages(struct address_space *mapping,
 				struct list_head *pages, struct page *page,
 				unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
 
 /* symlink.c */
 extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3143,13 +3282,12 @@
 					      int flags);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_range(struct inode *inode,
-				    ext4_lblk_t lblk_start,
-				    ext4_lblk_t lblk_end);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
+extern int ext4_get_es_cache(struct inode *inode,
+			     struct fiemap_extent_info *fieinfo,
+			     __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
 extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
@@ -3157,6 +3295,7 @@
 				struct inode *inode2, ext4_lblk_t lblk1,
 			     ext4_lblk_t lblk2,  ext4_lblk_t count,
 			     int mark_unwritten,int *err);
+extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3187,6 +3326,9 @@
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -3238,6 +3380,19 @@
 
 extern const struct iomap_ops ext4_iomap_ops;
 
+static inline int ext4_buffer_uptodate(struct buffer_head *bh)
+{
+	/*
+	 * If the buffer has the write error flag, we have failed
+	 * to write out data in the block.  In this case, we don't
+	 * have to read the block because we may read the old data
+	 * successfully.
+	 */
+	if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+		set_buffer_uptodate(bh);
+	return buffer_uptodate(bh);
+}
+
 #endif	/* __KERNEL__ */
 
 #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
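
The new timestamp limits above quantify what the EXT4_INODE_SET_XTIME clamp earlier in this header enforces: with room for the _extra field, two epoch bits extend the 32-bit seconds counter up to EXT4_EXTRA_TIMESTAMP_MAX = ((s64)1 << 34) - 1 + S32_MIN (roughly year 2446); without it, seconds are pinned to the classic signed 32-bit window. A small sketch of those ranges (the helper name is illustrative):

#include <linux/kernel.h>	/* clamp_t() */

/* Clamp an inode timestamp to what the on-disk format can encode. */
static inline s64 ext4_example_clamp_seconds(s64 tv_sec, bool has_extra)
{
	if (has_extra)	/* 2 epoch bits + 32-bit seconds */
		return clamp_t(s64, tv_sec, S32_MIN,
			       ((s64)1 << 34) - 1 + S32_MIN);
	return clamp_t(s64, tv_sec, S32_MIN, S32_MAX);	/* 128-byte inode */
}
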
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index adf6668..98bd0e9 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -120,6 +120,19 @@
 };
 
 /*
+ * Used to record a portion of a cluster found at the beginning or end
+ * of an extent while traversing the extent tree during space removal.
+ * A partial cluster may be removed if it does not contain blocks shared
+ * with extents that aren't being deleted (tofree state).  Otherwise,
+ * it cannot be removed (nofree state).
+ */
+struct partial_cluster {
+	ext4_fsblk_t pclu;  /* physical cluster number */
+	ext4_lblk_t lblk;   /* logical block number within logical cluster */
+	enum {initial, tofree, nofree} state;
+};
+
+/*
  * structure for external API
  */
 
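
The tri-state in struct partial_cluster replaces the old signed "long long partial_cluster" encoding used in extents.c (positive meant free it, negative meant keep it, zero meant none recorded). A rough sketch of how the removal path drives the states, simplified from the ext4_remove_blocks()/ext4_ext_rm_leaf() changes later in this patch (helper name invented; types come from the ext4 headers):

/* Simplified state driver; the real logic lives in extents.c below. */
static void track_partial(struct partial_cluster *partial,
			  ext4_fsblk_t pclu, ext4_lblk_t lblk, bool shared)
{
	if (shared) {
		partial->state = nofree;	/* also backs a surviving extent */
	} else if (partial->state == initial) {
		partial->pclu = pclu;		/* candidate: freed on a later call */
		partial->lblk = lblk;
		partial->state = tofree;
	}
}
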
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd7..ef8fcf7 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -361,20 +361,20 @@
 }
 
 static inline int ext4_jbd2_inode_add_write(handle_t *handle,
-					    struct inode *inode)
+		struct inode *inode, loff_t start_byte, loff_t length)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_inode_add_write(handle,
-						    EXT4_I(inode)->jinode);
+		return jbd2_journal_inode_ranged_write(handle,
+				EXT4_I(inode)->jinode, start_byte, length);
 	return 0;
 }
 
 static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
-					   struct inode *inode)
+		struct inode *inode, loff_t start_byte, loff_t length)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_inode_add_wait(handle,
-						   EXT4_I(inode)->jinode);
+		return jbd2_journal_inode_ranged_wait(handle,
+				EXT4_I(inode)->jinode, start_byte, length);
 	return 0;
 }
 
@@ -384,7 +384,7 @@
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (ext4_handle_valid(handle)) {
+	if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
 		ei->i_sync_tid = handle->h_transaction->t_tid;
 		if (datasync)
 			ei->i_datasync_tid = handle->h_transaction->t_tid;
@@ -411,7 +411,7 @@
 	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
 	    !test_opt(inode->i_sb, DELALLOC))) {
 		/* We do not support data journalling for encrypted data */
-		if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
 			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
 		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
 	}
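
The jbd2 helpers now take a byte range, so an ordered-mode commit only waits on (and writes back) the part of the file the transaction actually touched, rather than every dirty page of the inode. An illustrative caller (the wrapper name is invented; the real call sites are in the write paths):

static int example_journal_dirty_range(handle_t *handle, struct inode *inode,
				       loff_t pos, loff_t len)
{
	/* tag only [pos, pos + len) for ordered-mode writeout at commit */
	return ext4_jbd2_inode_add_write(handle, inode, pos, len);
}
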
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72a361d..fb0f99d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -518,10 +518,14 @@
 	}
 	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 		return bh;
-	err = __ext4_ext_check(function, line, inode,
-			       ext_block_hdr(bh), depth, pblk);
-	if (err)
-		goto errout;
+	if (!ext4_has_feature_journal(inode->i_sb) ||
+	    (inode->i_ino !=
+	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
+		err = __ext4_ext_check(function, line, inode,
+				       ext_block_hdr(bh), depth, pblk);
+		if (err)
+			goto errout;
+	}
 	set_buffer_verified(bh);
 	/*
 	 * If this is a leaf block, cache all of its entries
@@ -1035,6 +1039,7 @@
 	__le32 border;
 	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
 	int err = 0;
+	size_t ext_size = 0;
 
 	/* make decision: where to split? */
 	/* FIXME: now decision is simplest: at current extent */
@@ -1126,6 +1131,10 @@
 		le16_add_cpu(&neh->eh_entries, m);
 	}
 
+	/* zero out unused area in the extent block */
+	ext_size = sizeof(struct ext4_extent_header) +
+		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
+	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
 	ext4_extent_block_csum_set(inode, neh);
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
@@ -1205,6 +1214,11 @@
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
+		/* zero out unused area in the extent block */
+		ext_size = sizeof(struct ext4_extent_header) +
+		   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
+		memset(bh->b_data + ext_size, 0,
+			inode->i_sb->s_blocksize - ext_size);
 		ext4_extent_block_csum_set(inode, neh);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
@@ -1270,6 +1284,7 @@
 	ext4_fsblk_t newblock, goal = 0;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	int err = 0;
+	size_t ext_size = 0;
 
 	/* Try to prepend new index to old one */
 	if (ext_depth(inode))
@@ -1295,9 +1310,11 @@
 		goto out;
 	}
 
+	ext_size = sizeof(EXT4_I(inode)->i_data);
 	/* move top-level index/leaf into new block */
-	memmove(bh->b_data, EXT4_I(inode)->i_data,
-		sizeof(EXT4_I(inode)->i_data));
+	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
+	/* zero out unused area in the extent block */
+	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
 
 	/* set size of new block */
 	neh = ext_block_hdr(bh);
@@ -2298,6 +2315,52 @@
 	return err;
 }
 
+static int ext4_fill_es_cache_info(struct inode *inode,
+				   ext4_lblk_t block, ext4_lblk_t num,
+				   struct fiemap_extent_info *fieinfo)
+{
+	ext4_lblk_t next, end = block + num - 1;
+	struct extent_status es;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+	unsigned int flags;
+	int err;
+
+	while (block <= end) {
+		next = 0;
+		flags = 0;
+		if (!ext4_es_lookup_extent(inode, block, &next, &es))
+			break;
+		if (ext4_es_is_unwritten(&es))
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (ext4_es_is_delayed(&es))
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+		if (ext4_es_is_hole(&es))
+			flags |= EXT4_FIEMAP_EXTENT_HOLE;
+		if (next == 0)
+			flags |= FIEMAP_EXTENT_LAST;
+		if (flags & (FIEMAP_EXTENT_DELALLOC|
+			     EXT4_FIEMAP_EXTENT_HOLE))
+			es.es_pblk = 0;
+		else
+			es.es_pblk = ext4_es_pblock(&es);
+		err = fiemap_fill_next_extent(fieinfo,
+				(__u64)es.es_lblk << blksize_bits,
+				(__u64)es.es_pblk << blksize_bits,
+				(__u64)es.es_len << blksize_bits,
+				flags);
+		if (next == 0)
+			break;
+		block = next;
+		if (err < 0)
+			return err;
+		if (err == 1)
+			return 0;
+	}
+	return 0;
+}
+
 /*
  * ext4_ext_determine_hole - determine hole around given block
  * @inode:	inode we lookup in
@@ -2351,8 +2414,8 @@
 {
 	struct extent_status es;
 
-	ext4_es_find_delayed_extent_range(inode, hole_start,
-					  hole_start + hole_len - 1, &es);
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+				  hole_start + hole_len - 1, &es);
 	if (es.es_len) {
 		/* There's delayed extent containing lblock? */
 		if (es.es_lblk <= hole_start)
@@ -2490,106 +2553,157 @@
 	return 0;
 }
 
+/*
+ * ext4_rereserve_cluster - increment the reserved cluster count when
+ *                          freeing a cluster with a pending reservation
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in cluster to be reserved
+ *
+ * Increments the reserved cluster count and adjusts quota in a bigalloc
+ * file system when freeing a partial cluster containing at least one
+ * delayed and unwritten block.  A partial cluster meeting that
+ * requirement will have a pending reservation.  If so, the
+ * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
+ * defer reserved and allocated space accounting to a subsequent call
+ * to this function.
+ */
+static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
+
+	spin_lock(&ei->i_block_reservation_lock);
+	ei->i_reserved_data_blocks++;
+	percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
+	spin_unlock(&ei->i_block_reservation_lock);
+
+	percpu_counter_add(&sbi->s_freeclusters_counter, 1);
+	ext4_remove_pending(inode, lblk);
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      long long *partial_cluster,
+			      struct partial_cluster *partial,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
-	ext4_fsblk_t pblk;
-	int flags = get_default_free_blocks_flags(inode);
+	ext4_fsblk_t last_pblk, pblk;
+	ext4_lblk_t num;
+	int flags;
+
+	/* only extent tail removal is allowed */
+	if (from < le32_to_cpu(ex->ee_block) ||
+	    to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
+		ext4_error(sbi->s_sb,
+			   "strange request: removal(2) %u-%u from %u:%u",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
+		return 0;
+	}
+
+#ifdef EXTENTS_STATS
+	spin_lock(&sbi->s_ext_stats_lock);
+	sbi->s_ext_blocks += ee_len;
+	sbi->s_ext_extents++;
+	if (ee_len < sbi->s_ext_min)
+		sbi->s_ext_min = ee_len;
+	if (ee_len > sbi->s_ext_max)
+		sbi->s_ext_max = ee_len;
+	if (ext_depth(inode) > sbi->s_depth_max)
+		sbi->s_depth_max = ext_depth(inode);
+	spin_unlock(&sbi->s_ext_stats_lock);
+#endif
+
+	trace_ext4_remove_blocks(inode, ex, from, to, partial);
+
+	/*
+	 * if we have a partial cluster, and it's different from the
+	 * cluster of the last block in the extent, we free it
+	 */
+	last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
+
+	if (partial->state != initial &&
+	    partial->pclu != EXT4_B2C(sbi, last_pblk)) {
+		if (partial->state == tofree) {
+			flags = get_default_free_blocks_flags(inode);
+			if (ext4_is_pending(inode, partial->lblk))
+				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+			ext4_free_blocks(handle, inode, NULL,
+					 EXT4_C2B(sbi, partial->pclu),
+					 sbi->s_cluster_ratio, flags);
+			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+				ext4_rereserve_cluster(inode, partial->lblk);
+		}
+		partial->state = initial;
+	}
+
+	num = le32_to_cpu(ex->ee_block) + ee_len - from;
+	pblk = ext4_ext_pblock(ex) + ee_len - num;
+
+	/*
+	 * We free the partial cluster at the end of the extent (if any),
+	 * unless the cluster is used by another extent (partial_cluster
+	 * state is nofree).  If a partial cluster exists here, it must be
+	 * shared with the last block in the extent.
+	 */
+	flags = get_default_free_blocks_flags(inode);
+
+	/* partial, left end cluster aligned, right end unaligned */
+	if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
+	    (EXT4_LBLK_CMASK(sbi, to) >= from) &&
+	    (partial->state != nofree)) {
+		if (ext4_is_pending(inode, to))
+			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_PBLK_CMASK(sbi, last_pblk),
+				 sbi->s_cluster_ratio, flags);
+		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+			ext4_rereserve_cluster(inode, to);
+		partial->state = initial;
+		flags = get_default_free_blocks_flags(inode);
+	}
+
+	flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
-	 * at the beginning of the extent.  Instead, we make a note
-	 * that we tried freeing the cluster, and check to see if we
+	 * at the beginning of the extent.  Instead, we check to see if we
 	 * need to free it on a subsequent call to ext4_remove_blocks,
 	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
 	 */
 	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 
-	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+	/* reset the partial cluster if we've freed past it */
+	if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
+		partial->state = initial;
+
 	/*
-	 * If we have a partial cluster, and it's different from the
-	 * cluster of the last block, we need to explicitly free the
-	 * partial cluster here.
+	 * If we've freed the entire extent but the beginning is not left
+	 * cluster aligned and is not marked as ineligible for freeing, we
+	 * record the partial cluster at the beginning of the extent.  It
+	 * wasn't freed by the preceding ext4_free_blocks() call, and we
+	 * need to look farther to the left to determine if it's to be freed
+	 * (not shared with another extent). Else, reset the partial
+	 * cluster - we're either done freeing or the beginning of the
+	 * extent is left cluster aligned.
 	 */
-	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster > 0 &&
-	    *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
-		ext4_free_blocks(handle, inode, NULL,
-				 EXT4_C2B(sbi, *partial_cluster),
-				 sbi->s_cluster_ratio, flags);
-		*partial_cluster = 0;
-	}
-
-#ifdef EXTENTS_STATS
-	{
-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-		spin_lock(&sbi->s_ext_stats_lock);
-		sbi->s_ext_blocks += ee_len;
-		sbi->s_ext_extents++;
-		if (ee_len < sbi->s_ext_min)
-			sbi->s_ext_min = ee_len;
-		if (ee_len > sbi->s_ext_max)
-			sbi->s_ext_max = ee_len;
-		if (ext_depth(inode) > sbi->s_depth_max)
-			sbi->s_depth_max = ext_depth(inode);
-		spin_unlock(&sbi->s_ext_stats_lock);
-	}
-#endif
-	if (from >= le32_to_cpu(ex->ee_block)
-	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* tail removal */
-		ext4_lblk_t num;
-		long long first_cluster;
-
-		num = le32_to_cpu(ex->ee_block) + ee_len - from;
-		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		/*
-		 * Usually we want to free partial cluster at the end of the
-		 * extent, except for the situation when the cluster is still
-		 * used by any other extent (partial_cluster is negative).
-		 */
-		if (*partial_cluster < 0 &&
-		    *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
-			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
-
-		ext_debug("free last %u blocks starting %llu partial %lld\n",
-			  num, pblk, *partial_cluster);
-		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
-		/*
-		 * If the block range to be freed didn't start at the
-		 * beginning of a cluster, and we removed the entire
-		 * extent and the cluster is not used by any other extent,
-		 * save the partial cluster here, since we might need to
-		 * delete if we determine that the truncate or punch hole
-		 * operation has removed all of the blocks in the cluster.
-		 * If that cluster is used by another extent, preserve its
-		 * negative value so it isn't freed later on.
-		 *
-		 * If the whole extent wasn't freed, we've reached the
-		 * start of the truncated/punched region and have finished
-		 * removing blocks.  If there's a partial cluster here it's
-		 * shared with the remainder of the extent and is no longer
-		 * a candidate for removal.
-		 */
-		if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
-			first_cluster = (long long) EXT4_B2C(sbi, pblk);
-			if (first_cluster != -*partial_cluster)
-				*partial_cluster = first_cluster;
-		} else {
-			*partial_cluster = 0;
+	if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
+		if (partial->state == initial) {
+			partial->pclu = EXT4_B2C(sbi, pblk);
+			partial->lblk = from;
+			partial->state = tofree;
 		}
-	} else
-		ext4_error(sbi->s_sb, "strange request: removal(2) "
-			   "%u-%u from %u:%u",
-			   from, to, le32_to_cpu(ex->ee_block), ee_len);
+	} else {
+		partial->state = initial;
+	}
+
 	return 0;
 }
 
-
 /*
  * ext4_ext_rm_leaf() Removes the extents associated with the
  * blocks appearing between "start" and "end".  Both "start"
@@ -2608,7 +2722,7 @@
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		 struct ext4_ext_path *path,
-		 long long *partial_cluster,
+		 struct partial_cluster *partial,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2640,7 +2754,7 @@
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
-	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+	trace_ext4_ext_rm_leaf(inode, start, ex, partial);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
@@ -2671,8 +2785,8 @@
 			 */
 			if (sbi->s_cluster_ratio > 1) {
 				pblk = ext4_ext_pblock(ex);
-				*partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+				partial->pclu = EXT4_B2C(sbi, pblk);
+				partial->state = nofree;
 			}
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2714,8 +2828,7 @@
 		if (err)
 			goto out;
 
-		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
-					 a, b);
+		err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
 		if (err)
 			goto out;
 
@@ -2769,18 +2882,23 @@
 	 * If there's a partial cluster and at least one extent remains in
 	 * the leaf, free the partial cluster if it isn't shared with the
 	 * current extent.  If it is shared with the current extent
-	 * we zero partial_cluster because we've reached the start of the
+	 * we reset the partial cluster because we've reached the start of the
 	 * truncated/punched region and we're done removing blocks.
 	 */
-	if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+	if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
 		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
-		if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+		if (partial->pclu != EXT4_B2C(sbi, pblk)) {
+			int flags = get_default_free_blocks_flags(inode);
+
+			if (ext4_is_pending(inode, partial->lblk))
+				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
 			ext4_free_blocks(handle, inode, NULL,
-					 EXT4_C2B(sbi, *partial_cluster),
-					 sbi->s_cluster_ratio,
-					 get_default_free_blocks_flags(inode));
+					 EXT4_C2B(sbi, partial->pclu),
+					 sbi->s_cluster_ratio, flags);
+			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+				ext4_rereserve_cluster(inode, partial->lblk);
 		}
-		*partial_cluster = 0;
+		partial->state = initial;
 	}
 
 	/* if this leaf is free, then we should
@@ -2819,10 +2937,14 @@
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	long long partial_cluster = 0;
+	struct partial_cluster partial;
 	handle_t *handle;
 	int i = 0, err = 0;
 
+	partial.pclu = 0;
+	partial.lblk = 0;
+	partial.state = initial;
+
 	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
@@ -2882,8 +3004,8 @@
 			 */
 			if (sbi->s_cluster_ratio > 1) {
 				pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
-				partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+				partial.pclu = EXT4_B2C(sbi, pblk);
+				partial.state = nofree;
 			}
 
 			/*
@@ -2897,23 +3019,27 @@
 			if (err < 0)
 				goto out;
 
-		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+			   partial.state == initial) {
 			/*
-			 * If there's an extent to the right its first cluster
-			 * contains the immediate right boundary of the
-			 * truncated/punched region.  Set partial_cluster to
-			 * its negative value so it won't be freed if shared
-			 * with the current extent.  The end < ee_block case
-			 * is handled in ext4_ext_rm_leaf().
+			 * If we're punching, there's an extent to the right.
+			 * If the partial cluster hasn't been set, set it to
+			 * that extent's first cluster and its state to nofree
+			 * so it won't be freed should it contain blocks to be
+			 * removed. If it's already set (tofree/nofree), we're
+			 * retrying and keep the original partial cluster info
+			 * so a cluster marked tofree as a result of earlier
+			 * extent removal is not lost.
 			 */
 			lblk = ex_end + 1;
 			err = ext4_ext_search_right(inode, path, &lblk, &pblk,
 						    &ex);
 			if (err)
 				goto out;
-			if (pblk)
-				partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+			if (pblk) {
+				partial.pclu = EXT4_B2C(sbi, pblk);
+				partial.state = nofree;
+			}
 		}
 	}
 	/*
@@ -2948,8 +3074,7 @@
 		if (i == depth) {
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
-					       &partial_cluster, start,
-					       end);
+					       &partial, start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -3021,21 +3146,24 @@
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, end, depth,
-			partial_cluster, path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
+					 path->p_hdr->eh_entries);
 
 	/*
-	 * If we still have something in the partial cluster and we have removed
-	 * even the first extent, then we should free the blocks in the partial
-	 * cluster as well.  (This code will only run when there are no leaves
-	 * to the immediate left of the truncated/punched region.)
+	 * If there's a partial cluster and we have removed the first extent
+	 * in the file, free that partial cluster as well.
 	 */
-	if (partial_cluster > 0 && err == 0) {
-		/* don't zero partial_cluster since it's not used afterwards */
+	if (partial.state == tofree && err == 0) {
+		int flags = get_default_free_blocks_flags(inode);
+
+		if (ext4_is_pending(inode, partial.lblk))
+			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
 		ext4_free_blocks(handle, inode, NULL,
-				 EXT4_C2B(sbi, partial_cluster),
-				 sbi->s_cluster_ratio,
-				 get_default_free_blocks_flags(inode));
+				 EXT4_C2B(sbi, partial.pclu),
+				 sbi->s_cluster_ratio, flags);
+		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+			ext4_rereserve_cluster(inode, partial.lblk);
+		partial.state = initial;
 	}
 
 	/* TODO: flexible tree reduction should be here */
@@ -3569,7 +3697,7 @@
 		max_zeroout = sbi->s_extent_max_zeroout_kb >>
 			(inode->i_sb->s_blocksize_bits - 10);
 
-	if (ext4_encrypted_inode(inode))
+	if (IS_ENCRYPTED(inode))
 		max_zeroout = 0;
 
 	/*
@@ -3731,8 +3859,8 @@
 	 * illegal.
 	 */
 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
-		ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
 			     " len %u; IO logical block %llu, len %u",
 			     inode->i_ino, (unsigned long long)ee_block, ee_len,
 			     (unsigned long long)map->m_lblk, map->m_len);
@@ -3819,114 +3947,6 @@
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
-/**
- * ext4_find_delalloc_range: find delayed allocated block in the given range.
- *
- * Return 1 if there is a delalloc block in the range, otherwise 0.
- */
-int ext4_find_delalloc_range(struct inode *inode,
-			     ext4_lblk_t lblk_start,
-			     ext4_lblk_t lblk_end)
-{
-	struct extent_status es;
-
-	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
-	if (es.es_len == 0)
-		return 0; /* there is no delay extent in this tree */
-	else if (es.es_lblk <= lblk_start &&
-		 lblk_start < es.es_lblk + es.es_len)
-		return 1;
-	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
-		return 1;
-	else
-		return 0;
-}
-
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	ext4_lblk_t lblk_start, lblk_end;
-	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
-	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
-
-	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
-}
-
-/**
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- *  '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- *     are not delalloc'ed.
- *	Ex:
- *	|----c---=|====c====|====c====|===-c----|
- *	         |++++++ allocated ++++++|
- *	==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- *     marked for delayed allocation. In this case, we will exclude that
- *     cluster.
- *	Ex:
- *	|----====c========|========c========|
- *	     |++++++ allocated ++++++|
- *	==> 1 complete clusters in above example
- *
- *	Ex:
- *	|================c================|
- *            |++++++ allocated ++++++|
- *	==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
-			   unsigned int num_blks)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
-	ext4_lblk_t lblk_from, lblk_to, c_offset;
-	unsigned int allocated_clusters = 0;
-
-	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
-	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
-	/* max possible clusters for this allocation */
-	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
-	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
-	/* Check towards left side */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
-	if (c_offset) {
-		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
-		lblk_to = lblk_from + c_offset - 1;
-
-		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
-			allocated_clusters--;
-	}
-
-	/* Now check towards right. */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
-	if (allocated_clusters && c_offset) {
-		lblk_from = lblk_start + num_blks;
-		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
-		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
-			allocated_clusters--;
-	}
-
-	return allocated_clusters;
-}
-
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map,
@@ -4094,37 +4114,10 @@
 	} else
 		allocated = ret;
 	map->m_flags |= EXT4_MAP_NEW;
-	/*
-	 * if we allocated more blocks than requested
-	 * we need to make sure we unmap the extra block
-	 * allocated. The actual needed block will get
-	 * unmapped later when we find the buffer_head marked
-	 * new.
-	 */
-	if (allocated > map->m_len) {
-		clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
-				   allocated - map->m_len);
+	if (allocated > map->m_len)
 		allocated = map->m_len;
-	}
 	map->m_len = allocated;
 
-	/*
-	 * If we have done fallocate with the offset that is already
-	 * delayed allocated, we would have block reservation
-	 * and quota reservation done in the delayed write path.
-	 * But fallocate would have already updated quota and block
-	 * count for this offset. So cancel these reservation
-	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-				map->m_lblk, map->m_len);
-		if (reserved_clusters)
-			ext4_da_update_reserve_space(inode,
-						     reserved_clusters,
-						     0);
-	}
-
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
 	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4513,77 +4506,39 @@
 	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
-	 * Update reserved blocks/metadata blocks after successful
-	 * block allocation which had been deferred till now.
+	 * Reduce the reserved cluster count to reflect successful deferred
+	 * allocation of delayed allocated clusters or direct allocation of
+	 * clusters discovered to be delayed allocated.  Once allocated, a
+	 * cluster is not included in the reserved count.
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		/*
-		 * Check how many clusters we had reserved this allocated range
-		 */
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-						map->m_lblk, allocated);
-		if (!map_from_cluster) {
-			BUG_ON(allocated_clusters < reserved_clusters);
-			if (reserved_clusters < allocated_clusters) {
-				struct ext4_inode_info *ei = EXT4_I(inode);
-				int reservation = allocated_clusters -
-						  reserved_clusters;
-				/*
-				 * It seems we claimed few clusters outside of
-				 * the range of this allocation. We should give
-				 * it back to the reservation pool. This can
-				 * happen in the following case:
-				 *
-				 * * Suppose s_cluster_ratio is 4 (i.e., each
-				 *   cluster has 4 blocks. Thus, the clusters
-				 *   are [0-3],[4-7],[8-11]...
-				 * * First comes delayed allocation write for
-				 *   logical blocks 10 & 11. Since there were no
-				 *   previous delayed allocated blocks in the
-				 *   range [8-11], we would reserve 1 cluster
-				 *   for this write.
-				 * * Next comes write for logical blocks 3 to 8.
-				 *   In this case, we will reserve 2 clusters
-				 *   (for [0-3] and [4-7]; and not for [8-11] as
-				 *   that range has a delayed allocated blocks.
-				 *   Thus total reserved clusters now becomes 3.
-				 * * Now, during the delayed allocation writeout
-				 *   time, we will first write blocks [3-8] and
-				 *   allocate 3 clusters for writing these
-				 *   blocks. Also, we would claim all these
-				 *   three clusters above.
-				 * * Now when we come here to writeout the
-				 *   blocks [10-11], we would expect to claim
-				 *   the reservation of 1 cluster we had made
-				 *   (and we would claim it since there are no
-				 *   more delayed allocated blocks in the range
-				 *   [8-11]. But our reserved cluster count had
-				 *   already gone to 0.
-				 *
-				 *   Thus, at the step 4 above when we determine
-				 *   that there are still some unwritten delayed
-				 *   allocated blocks outside of our current
-				 *   block range, we should increment the
-				 *   reserved clusters count so that when the
-				 *   remaining blocks finally gets written, we
-				 *   could claim them.
-				 */
-				dquot_reserve_block(inode,
-						EXT4_C2B(sbi, reservation));
-				spin_lock(&ei->i_block_reservation_lock);
-				ei->i_reserved_data_blocks += reservation;
-				spin_unlock(&ei->i_block_reservation_lock);
-			}
+	if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 			/*
-			 * We will claim quota for all newly allocated blocks.
-			 * We're updating the reserved space *after* the
-			 * correction above so we do not accidentally free
-			 * all the metadata reservation because we might
-			 * actually need it later on.
+			 * When allocating delayed allocated clusters, simply
+			 * reduce the reserved cluster count and claim quota
 			 */
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 							1);
+		} else {
+			ext4_lblk_t lblk, len;
+			unsigned int n;
+
+			/*
+			 * When allocating non-delayed allocated clusters
+			 * (from fallocate, filemap, DIO, or clusters
+			 * allocated when delalloc has been disabled by
+			 * ext4_nonda_switch), reduce the reserved cluster
+			 * count by the number of allocated clusters that
+			 * have previously been delayed allocated.  Quota
+			 * has been claimed by ext4_mb_new_blocks() above,
+			 * so release the quota reservations made for any
+			 * previously delayed allocated clusters.
+			 */
+			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+			len = allocated_clusters << sbi->s_cluster_bits;
+			n = ext4_es_delayed_clu(inode, lblk, len);
+			if (n > 0)
+				ext4_da_update_reserve_space(inode, (int) n, 0);
 		}
 	}
 
@@ -4919,7 +4874,7 @@
 	 * leave it disabled for encrypted inodes for now.  This is a
 	 * bug we should fix....
 	 */
-	if (ext4_encrypted_inode(inode) &&
+	if (IS_ENCRYPTED(inode) &&
 	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
 		     FALLOC_FL_ZERO_RANGE)))
 		return -EOPNOTSUPP;
@@ -5075,8 +5030,10 @@
 	ext4_lblk_t block, next_del;
 
 	if (newes->es_pblk == 0) {
-		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
-				newes->es_lblk + newes->es_len - 1, &es);
+		ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+					  newes->es_lblk,
+					  newes->es_lblk + newes->es_len - 1,
+					  &es);
 
 		/*
 		 * No extent in extent-tree contains block @newes->es_pblk,
@@ -5097,7 +5054,8 @@
 	}
 
 	block = newes->es_lblk + newes->es_len;
-	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
+				  EXT_MAX_BLOCKS, &es);
 	if (es.es_len == 0)
 		next_del = EXT_MAX_BLOCKS;
 	else
@@ -5105,8 +5063,6 @@
 
 	return next_del;
 }
-/* fiemap flags we can handle specified here */
-#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
 static int ext4_xattr_fiemap(struct inode *inode,
 				struct fiemap_extent_info *fieinfo)
@@ -5143,10 +5099,16 @@
 	return (error < 0 ? error : 0);
 }
 
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len)
+static int _ext4_fiemap(struct inode *inode,
+			struct fiemap_extent_info *fieinfo,
+			__u64 start, __u64 len,
+			int (*fill)(struct inode *, ext4_lblk_t,
+				    ext4_lblk_t,
+				    struct fiemap_extent_info *))
 {
 	ext4_lblk_t start_blk;
+	u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
 	int error = 0;
 
 	if (ext4_has_inline_data(inode)) {
@@ -5163,14 +5125,18 @@
 		error = ext4_ext_precache(inode);
 		if (error)
 			return error;
+		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
 	}
 
 	/* fallback to generic here if not in extents fmt */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+	    fill == ext4_fill_fiemap_extents)
 		return generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
 
-	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+	if (fill == ext4_fill_es_cache_info)
+		ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+	if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
 		return -EBADR;
 
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5189,12 +5155,36 @@
 		 * Walk the extent tree gathering extent information
 		 * and pushing extents back to the user.
 		 */
-		error = ext4_fill_fiemap_extents(inode, start_blk,
-						 len_blks, fieinfo);
+		error = fill(inode, start_blk, len_blks, fieinfo);
 	}
 	return error;
 }
 
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	return _ext4_fiemap(inode, fieinfo, start, len,
+			    ext4_fill_fiemap_extents);
+}
+
+int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		      __u64 start, __u64 len)
+{
+	if (ext4_has_inline_data(inode)) {
+		int has_inline;
+
+		down_read(&EXT4_I(inode)->xattr_sem);
+		has_inline = ext4_has_inline_data(inode);
+		up_read(&EXT4_I(inode)->xattr_sem);
+		if (has_inline)
+			return 0;
+	}
+
+	return _ext4_fiemap(inode, fieinfo, start, len,
+			    ext4_fill_es_cache_info);
+}
+
 /*
  * ext4_access_path:
  * Function to access the path buffer for marking it dirty.
@@ -5764,8 +5754,8 @@
 }
 
 /**
- * ext4_swap_extents - Swap extents between two inodes
- *
+ * ext4_swap_extents() - Swap extents between two inodes
+ * @handle: handle for this transaction
  * @inode1:	First inode
  * @inode2:	Second inode
  * @lblk1:	Start block for first inode
@@ -5958,3 +5948,82 @@
 	}
 	return replaced_count;
 }
+
+/*
+ * ext4_clu_mapped - determine whether any block in a logical cluster has
+ *                   been mapped to a physical cluster
+ *
+ * @inode - file containing the logical cluster
+ * @lclu - logical cluster of interest
+ *
+ * Returns 1 if any block in the logical cluster is mapped, signifying
+ * that a physical cluster has been allocated for it.  Otherwise,
+ * returns 0.  Can also return negative error codes.  Derived from
+ * ext4_ext_map_blocks().
+ */
+int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_ext_path *path;
+	int depth, mapped = 0, err = 0;
+	struct ext4_extent *extent;
+	ext4_lblk_t first_lblk, first_lclu, last_lclu;
+
+	/* search for the extent closest to the first block in the cluster */
+	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(inode);
+
+	/*
+	 * A consistent leaf must not be empty.  This situation is possible,
+	 * though, _during_ tree modification, and it's why an assert can't
+	 * be put in ext4_find_extent().
+	 */
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode,
+				 "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
+				 (unsigned long) EXT4_C2B(sbi, lclu),
+				 depth, path[depth].p_block);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
+
+	extent = path[depth].p_ext;
+
+	/* can't be mapped if the extent tree is empty */
+	if (extent == NULL)
+		goto out;
+
+	first_lblk = le32_to_cpu(extent->ee_block);
+	first_lclu = EXT4_B2C(sbi, first_lblk);
+
+	/*
+	 * Three possible outcomes at this point - found extent spanning
+	 * the target cluster, to the left of the target cluster, or to the
+	 * right of the target cluster.  The first two cases are handled here.
+	 * The last case indicates the target cluster is not mapped.
+	 */
+	if (lclu >= first_lclu) {
+		last_lclu = EXT4_B2C(sbi, first_lblk +
+				     ext4_ext_get_actual_len(extent) - 1);
+		if (lclu <= last_lclu) {
+			mapped = 1;
+		} else {
+			first_lblk = ext4_ext_next_allocated_block(path);
+			first_lclu = EXT4_B2C(sbi, first_lblk);
+			if (lclu == first_lclu)
+				mapped = 1;
+		}
+	}
+
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	return err ? err : mapped;
+}
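+
+/*
+ * Illustrative use, assuming a cluster ratio of 4: for a file whose only
+ * extent maps logical blocks 6-7, ext4_clu_mapped(inode, 0) returns 0
+ * and ext4_clu_mapped(inode, 1) returns 1, since blocks 6-7 lie in
+ * logical cluster 1.
+ */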
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c4e6fb1..d996b44 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -142,13 +142,16 @@
  */
 
 static struct kmem_cache *ext4_es_cachep;
+static struct kmem_cache *ext4_pending_cachep;
 
 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
-			      ext4_lblk_t end);
+			      ext4_lblk_t end, int *reserved);
 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 		       struct ext4_inode_info *locked_ei);
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+			     ext4_lblk_t len);
 
 int __init ext4_init_es(void)
 {
@@ -233,30 +236,38 @@
 }
 
 /*
- * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
- * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_extent_range - find extent with specified status within block
+ *                             range or next extent following block range in
+ *                             extents status tree
  *
- * @inode: the inode which owns delayed extents
- * @lblk: the offset where we start to search
- * @end: the offset where we stop to search
- * @es: delayed extent that we found
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ * @es - extent found, if any
+ *
+ * Find the first extent within the block range specified by @lblk and @end
+ * in the extents status tree that satisfies @matching_fn.  If a match
+ * is found, it's returned in @es.  If not, and a matching extent is found
+ * beyond the block range, it's returned in @es.  If no match is found, an
+ * extent is returned in @es whose es_lblk, es_len, and es_pblk components
+ * are 0.
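+ *
+ * Illustrative example: with a written extent covering blocks 0-9 and a
+ * delayed extent covering blocks 20-23 in the tree, a search for a
+ * delayed extent in [12, 15] returns the extent at block 20 (the next
+ * match beyond the range), while a search in [0, 15] returns an empty
+ * extent, since the walk from the written extent stops once es_lblk
+ * exceeds the end of the range.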
  */
-void ext4_es_find_delayed_extent_range(struct inode *inode,
-				 ext4_lblk_t lblk, ext4_lblk_t end,
-				 struct extent_status *es)
+static void __es_find_extent_range(struct inode *inode,
+				   int (*matching_fn)(struct extent_status *es),
+				   ext4_lblk_t lblk, ext4_lblk_t end,
+				   struct extent_status *es)
 {
 	struct ext4_es_tree *tree = NULL;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
 
-	BUG_ON(es == NULL);
-	BUG_ON(end < lblk);
-	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+	WARN_ON(es == NULL);
+	WARN_ON(end < lblk);
 
-	read_lock(&EXT4_I(inode)->i_es_lock);
 	tree = &EXT4_I(inode)->i_es_tree;
 
-	/* find extent in cache firstly */
+	/* see if the extent has been cached */
 	es->es_lblk = es->es_len = es->es_pblk = 0;
 	if (tree->cache_es) {
 		es1 = tree->cache_es;
@@ -271,28 +282,133 @@
 	es1 = __es_tree_search(&tree->root, lblk);
 
 out:
-	if (es1 && !ext4_es_is_delayed(es1)) {
+	if (es1 && !matching_fn(es1)) {
 		while ((node = rb_next(&es1->rb_node)) != NULL) {
 			es1 = rb_entry(node, struct extent_status, rb_node);
 			if (es1->es_lblk > end) {
 				es1 = NULL;
 				break;
 			}
-			if (ext4_es_is_delayed(es1))
+			if (matching_fn(es1))
 				break;
 		}
 	}
 
-	if (es1 && ext4_es_is_delayed(es1)) {
+	if (es1 && matching_fn(es1)) {
 		tree->cache_es = es1;
 		es->es_lblk = es1->es_lblk;
 		es->es_len = es1->es_len;
 		es->es_pblk = es1->es_pblk;
 	}
 
+}
+
+/*
+ * Locking for __es_find_extent_range() for external use
+ */
+void ext4_es_find_extent_range(struct inode *inode,
+			       int (*matching_fn)(struct extent_status *es),
+			       ext4_lblk_t lblk, ext4_lblk_t end,
+			       struct extent_status *es)
+{
+	trace_ext4_es_find_extent_range_enter(inode, lblk);
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	__es_find_extent_range(inode, matching_fn, lblk, end, es);
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+	trace_ext4_es_find_extent_range_exit(inode, es);
+}
+
+/*
+ * __es_scan_range - search block range for block with specified status
+ *                   in extents status tree
+ *
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns true if at least one block in the specified block range satisfies
+ * the criterion specified by @matching_fn, and false if not.  If at least
+ * one extent in the range has the specified status, then there is at
+ * least one block in the range with that status.  Should only be called
+ * by code that has taken i_es_lock.
+ */
+static bool __es_scan_range(struct inode *inode,
+			    int (*matching_fn)(struct extent_status *es),
+			    ext4_lblk_t start, ext4_lblk_t end)
+{
+	struct extent_status es;
+
+	__es_find_extent_range(inode, matching_fn, start, end, &es);
+	if (es.es_len == 0)
+		return false;   /* no matching extent in the tree */
+	else if (es.es_lblk <= start &&
+		 start < es.es_lblk + es.es_len)
+		return true;
+	else if (start <= es.es_lblk && es.es_lblk <= end)
+		return true;
+	else
+		return false;
+}
+
+/*
+ * Locking for __es_scan_range() for external use
+ */
+bool ext4_es_scan_range(struct inode *inode,
+			int (*matching_fn)(struct extent_status *es),
+			ext4_lblk_t lblk, ext4_lblk_t end)
+{
+	bool ret;
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	ret = __es_scan_range(inode, matching_fn, lblk, end);
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	return ret;
+}
+
+/*
+ * __es_scan_clu - search cluster for block with specified status in
+ *                 extents status tree
+ *
+ * @inode - file containing the cluster
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block in cluster to be searched
+ *
+ * Returns true if at least one extent in the cluster containing @lblk
+ * satisfies the criterion specified by @matching_fn, and false if not.  If at
+ * least one extent has the specified status, then there is at least one block
+ * in the cluster with that status.  Should only be called by code that has
+ * taken i_es_lock.
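+ *
+ * For example, with a cluster ratio of 4, __es_scan_clu(inode, fn, 5)
+ * scans logical blocks 4 through 7.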
+ */
+static bool __es_scan_clu(struct inode *inode,
+			  int (*matching_fn)(struct extent_status *es),
+			  ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t lblk_start, lblk_end;
+
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+	return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
+}
+
+/*
+ * Locking for __es_scan_clu() for external use
+ */
+bool ext4_es_scan_clu(struct inode *inode,
+		      int (*matching_fn)(struct extent_status *es),
+		      ext4_lblk_t lblk)
+{
+	bool ret;
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	ret = __es_scan_clu(inode, matching_fn, lblk);
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	return ret;
 }
 
 static void ext4_es_list_add(struct inode *inode)
@@ -595,7 +711,7 @@
 			 * We don't need to check unwritten extent because
 			 * indirect-based file doesn't have it.
 			 */
-			BUG_ON(1);
+			BUG();
 		}
 	} else if (retval == 0) {
 		if (ext4_es_is_written(es)) {
@@ -664,7 +780,7 @@
 			}
 			p = &(*p)->rb_right;
 		} else {
-			BUG_ON(1);
+			BUG();
 			return -EINVAL;
 		}
 	}
@@ -694,6 +810,7 @@
 	struct extent_status newes;
 	ext4_lblk_t end = lblk + len - 1;
 	int err = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
 		 lblk, len, pblk, status, inode->i_ino);
@@ -719,7 +836,7 @@
 	ext4_es_insert_extent_check(inode, &newes);
 
 	write_lock(&EXT4_I(inode)->i_es_lock);
-	err = __es_remove_extent(inode, lblk, end);
+	err = __es_remove_extent(inode, lblk, end, NULL);
 	if (err != 0)
 		goto error;
 retry:
@@ -730,6 +847,11 @@
 	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
 		err = 0;
 
+	if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
+	    (status & EXTENT_STATUS_WRITTEN ||
+	     status & EXTENT_STATUS_UNWRITTEN))
+		__revise_pending(inode, lblk, len);
+
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
@@ -777,6 +899,7 @@
  * Return: 1 on found, 0 on not
  */
 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t *next_lblk,
 			  struct extent_status *es)
 {
 	struct ext4_es_tree *tree;
@@ -825,9 +948,18 @@
 		es->es_pblk = es1->es_pblk;
 		if (!ext4_es_is_referenced(es1))
 			ext4_es_set_referenced(es1);
-		stats->es_stats_cache_hits++;
+		percpu_counter_inc(&stats->es_stats_cache_hits);
+		if (next_lblk) {
+			node = rb_next(&es1->rb_node);
+			if (node) {
+				es1 = rb_entry(node, struct extent_status,
+					       rb_node);
+				*next_lblk = es1->es_lblk;
+			} else
+				*next_lblk = 0;
+		}
 	} else {
-		stats->es_stats_cache_misses++;
+		percpu_counter_inc(&stats->es_stats_cache_misses);
 	}
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -836,8 +968,322 @@
 	return found;
 }
 
+struct rsvd_count {
+	int ndelonly;			/* tally of delonly clusters seen */
+	bool first_do_lblk_found;	/* first delonly block located yet? */
+	ext4_lblk_t first_do_lblk;	/* first delonly block in range */
+	ext4_lblk_t last_do_lblk;	/* last delonly block seen so far */
+	struct extent_status *left_es;	/* extent left of removed range */
+	bool partial;			/* tracking a partial cluster? */
+	ext4_lblk_t lclu;		/* logical cluster being tracked */
+};
+
+/*
+ * init_rsvd - initialize reserved count data before removing block range
+ *	       in file from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @es - pointer to first extent in range
+ * @rc - pointer to reserved count data
+ *
+ * Assumes es is not NULL
+ */
+static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
+		      struct extent_status *es, struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct rb_node *node;
+
+	rc->ndelonly = 0;
+
+	/*
+	 * for bigalloc, note the first delonly block in the range has not
+	 * been found, record the extent containing the block to the left of
+	 * the region to be removed, if any, and note that there's no partial
+	 * cluster to track
+	 */
+	if (sbi->s_cluster_ratio > 1) {
+		rc->first_do_lblk_found = false;
+		if (lblk > es->es_lblk) {
+			rc->left_es = es;
+		} else {
+			node = rb_prev(&es->rb_node);
+			rc->left_es = node ? rb_entry(node,
+						      struct extent_status,
+						      rb_node) : NULL;
+		}
+		rc->partial = false;
+	}
+}
+
+/*
+ * count_rsvd - count the clusters containing delayed and not unwritten
+ *		(delonly) blocks in a range within an extent and add to
+ *	        the running tally in rsvd_count
+ *
+ * @inode - file containing extent
+ * @lblk - first block in range
+ * @len - length of range in blocks
+ * @es - pointer to extent containing clusters to be counted
+ * @rc - pointer to reserved count data
+ *
+ * Tracks partial clusters found at the beginning and end of extents so
+ * they aren't overcounted when they span adjacent extents
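+ *
+ * Worked example, assuming a cluster ratio of 4: for a delonly extent
+ * covering blocks 2-13, cluster 0 is counted since its partial (blocks
+ * 2-3) runs to a cluster boundary, clusters 1 and 2 are counted whole,
+ * and cluster 3 (blocks 12-13) is left tracked as a partial so it isn't
+ * counted twice should the next extent continue it.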
+ */
+static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
+		       struct extent_status *es, struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t i, end, nclu;
+
+	if (!ext4_es_is_delonly(es))
+		return;
+
+	WARN_ON(len <= 0);
+
+	if (sbi->s_cluster_ratio == 1) {
+		rc->ndelonly += (int) len;
+		return;
+	}
+
+	/* bigalloc */
+
+	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
+	end = lblk + (ext4_lblk_t) len - 1;
+	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
+
+	/* record the first block of the first delonly extent seen */
+	if (!rc->first_do_lblk_found) {
+		rc->first_do_lblk = i;
+		rc->first_do_lblk_found = true;
+	}
+
+	/* update the last lblk in the region seen so far */
+	rc->last_do_lblk = end;
+
+	/*
+	 * if we're tracking a partial cluster and the current extent
+	 * doesn't start with it, count it and stop tracking
+	 */
+	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
+		rc->ndelonly++;
+		rc->partial = false;
+	}
+
+	/*
+	 * if the region doesn't start on a cluster boundary but extends
+	 * to the end of that first partial cluster, count it
+	 */
+	if (EXT4_LBLK_COFF(sbi, i) != 0) {
+		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
+			rc->ndelonly++;
+			rc->partial = false;
+			i = EXT4_LBLK_CFILL(sbi, i) + 1;
+		}
+	}
+
+	/*
+	 * if the current cluster starts on a cluster boundary, count the
+	 * number of whole delonly clusters in the extent
+	 */
+	if ((i + sbi->s_cluster_ratio - 1) <= end) {
+		nclu = (end - i + 1) >> sbi->s_cluster_bits;
+		rc->ndelonly += nclu;
+		i += nclu << sbi->s_cluster_bits;
+	}
+
+	/*
+	 * start tracking a partial cluster if there's a partial at the end
+	 * of the current extent and we're not already tracking one
+	 */
+	if (!rc->partial && i <= end) {
+		rc->partial = true;
+		rc->lclu = EXT4_B2C(sbi, i);
+	}
+}
+
+/*
+ * __pr_tree_search - search for a pending cluster reservation
+ *
+ * @root - root of pending reservation tree
+ * @lclu - logical cluster to search for
+ *
+ * Returns the pending reservation for the cluster identified by @lclu
+ * if found.  If not, returns a reservation for the next cluster if any,
+ * and if not, returns NULL.
+ */
+static struct pending_reservation *__pr_tree_search(struct rb_root *root,
+						    ext4_lblk_t lclu)
+{
+	struct rb_node *node = root->rb_node;
+	struct pending_reservation *pr = NULL;
+
+	while (node) {
+		pr = rb_entry(node, struct pending_reservation, rb_node);
+		if (lclu < pr->lclu)
+			node = node->rb_left;
+		else if (lclu > pr->lclu)
+			node = node->rb_right;
+		else
+			return pr;
+	}
+	if (pr && lclu < pr->lclu)
+		return pr;
+	if (pr && lclu > pr->lclu) {
+		node = rb_next(&pr->rb_node);
+		return node ? rb_entry(node, struct pending_reservation,
+				       rb_node) : NULL;
+	}
+	return NULL;
+}
+
+/*
+ * get_rsvd - calculates and returns the number of cluster reservations to be
+ *	      released when removing a block range from the extent status tree
+ *	      and releases any pending reservations within the range
+ *
+ * @inode - file containing block range
+ * @end - last block in range
+ * @right_es - pointer to extent containing next block beyond end or NULL
+ * @rc - pointer to reserved count data
+ *
+ * The number of reservations to be released is equal to the number of
+ * clusters containing delayed and not unwritten (delonly) blocks within
+ * the range, minus the number of clusters still containing delonly blocks
+ * at the ends of the range, and minus the number of pending reservations
+ * within the range.
+ */
+static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
+			     struct extent_status *right_es,
+			     struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct pending_reservation *pr;
+	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+	struct rb_node *node;
+	ext4_lblk_t first_lclu, last_lclu;
+	bool left_delonly, right_delonly, count_pending;
+	struct extent_status *es;
+
+	if (sbi->s_cluster_ratio > 1) {
+		/* count any remaining partial cluster */
+		if (rc->partial)
+			rc->ndelonly++;
+
+		if (rc->ndelonly == 0)
+			return 0;
+
+		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
+		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
+
+		/*
+		 * decrease the delonly count by the number of clusters at the
+		 * ends of the range that still contain delonly blocks -
+		 * these clusters still need to be reserved
+		 */
+		left_delonly = right_delonly = false;
+
+		es = rc->left_es;
+		while (es && ext4_es_end(es) >=
+		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
+			if (ext4_es_is_delonly(es)) {
+				rc->ndelonly--;
+				left_delonly = true;
+				break;
+			}
+			node = rb_prev(&es->rb_node);
+			if (!node)
+				break;
+			es = rb_entry(node, struct extent_status, rb_node);
+		}
+		if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+			if (end < ext4_es_end(right_es)) {
+				es = right_es;
+			} else {
+				node = rb_next(&right_es->rb_node);
+				es = node ? rb_entry(node, struct extent_status,
+						     rb_node) : NULL;
+			}
+			while (es && es->es_lblk <=
+			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
+				if (ext4_es_is_delonly(es)) {
+					rc->ndelonly--;
+					right_delonly = true;
+					break;
+				}
+				node = rb_next(&es->rb_node);
+				if (!node)
+					break;
+				es = rb_entry(node, struct extent_status,
+					      rb_node);
+			}
+		}
+
+		/*
+		 * Determine the block range that should be searched for
+		 * pending reservations, if any.  Clusters on the ends of the
+		 * original removed range containing delonly blocks are
+		 * excluded.  They've already been accounted for and it's not
+		 * possible to determine if an associated pending reservation
+		 * should be released with the information available in the
+		 * extents status tree.
+		 */
+		if (first_lclu == last_lclu) {
+			if (left_delonly || right_delonly)
+				count_pending = false;
+			else
+				count_pending = true;
+		} else {
+			if (left_delonly)
+				first_lclu++;
+			if (right_delonly)
+				last_lclu--;
+			if (first_lclu <= last_lclu)
+				count_pending = true;
+			else
+				count_pending = false;
+		}
+
+		/*
+		 * a pending reservation found between first_lclu and last_lclu
+		 * represents an allocated cluster that contained at least one
+		 * delonly block, so the delonly total must be reduced by one
+		 * for each pending reservation found and released
+		 */
+		if (count_pending) {
+			pr = __pr_tree_search(&tree->root, first_lclu);
+			while (pr && pr->lclu <= last_lclu) {
+				rc->ndelonly--;
+				node = rb_next(&pr->rb_node);
+				rb_erase(&pr->rb_node, &tree->root);
+				kmem_cache_free(ext4_pending_cachep, pr);
+				if (!node)
+					break;
+				pr = rb_entry(node, struct pending_reservation,
+					      rb_node);
+			}
+		}
+	}
+	return rc->ndelonly;
+}
+
+/*
+ * __es_remove_extent - removes block range from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @end - last block in range
+ * @reserved - number of cluster reservations released
+ *
+ * If @reserved is not NULL and delayed allocation is enabled, counts
+ * block/cluster reservations freed by removing range and if bigalloc
+ * enabled cancels pending reservations as needed. Returns 0 on success,
+ * error code on failure.
+ */
 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
-			      ext4_lblk_t end)
+			      ext4_lblk_t end, int *reserved)
 {
 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct rb_node *node;
@@ -846,9 +1292,14 @@
 	ext4_lblk_t len1, len2;
 	ext4_fsblk_t block;
 	int err;
+	bool count_reserved = true;
+	struct rsvd_count rc;
 
+	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
+		count_reserved = false;
 retry:
 	err = 0;
+
 	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
@@ -857,6 +1308,8 @@
 
 	/* Simply invalidate cache_es. */
 	tree->cache_es = NULL;
+	if (count_reserved)
+		init_rsvd(inode, lblk, es, &rc);
 
 	orig_es.es_lblk = es->es_lblk;
 	orig_es.es_len = es->es_len;
@@ -898,10 +1351,16 @@
 				ext4_es_store_pblock(es, block);
 			}
 		}
+		if (count_reserved)
+			count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
+				   &orig_es, &rc);
 		goto out;
 	}
 
 	if (len1 > 0) {
+		if (count_reserved)
+			count_rsvd(inode, lblk, orig_es.es_len - len1,
+				   &orig_es, &rc);
 		node = rb_next(&es->rb_node);
 		if (node)
 			es = rb_entry(node, struct extent_status, rb_node);
@@ -910,6 +1369,8 @@
 	}
 
 	while (es && ext4_es_end(es) <= end) {
+		if (count_reserved)
+			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
 		node = rb_next(&es->rb_node);
 		rb_erase(&es->rb_node, &tree->root);
 		ext4_es_free_extent(inode, es);
@@ -924,6 +1385,9 @@
 		ext4_lblk_t orig_len = es->es_len;
 
 		len1 = ext4_es_end(es) - end;
+		if (count_reserved)
+			count_rsvd(inode, es->es_lblk, orig_len - len1,
+				   es, &rc);
 		es->es_lblk = end + 1;
 		es->es_len = len1;
 		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -932,20 +1396,28 @@
 		}
 	}
 
+	if (count_reserved)
+		*reserved = get_rsvd(inode, end, es, &rc);
 out:
 	return err;
 }
 
 /*
- * ext4_es_remove_extent() removes a space from a extent status tree.
+ * ext4_es_remove_extent - removes block range from extent status tree
  *
- * Return 0 on success, error code on failure.
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ * Reduces block/cluster reservation count and for bigalloc cancels pending
+ * reservations as needed. Returns 0 on success, error code on failure.
  */
 int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			  ext4_lblk_t len)
 {
 	ext4_lblk_t end;
 	int err = 0;
+	int reserved = 0;
 
 	trace_ext4_es_remove_extent(inode, lblk, len);
 	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -963,9 +1435,10 @@
 	 * is reclaimed.
 	 */
 	write_lock(&EXT4_I(inode)->i_es_lock);
-	err = __es_remove_extent(inode, lblk, end);
+	err = __es_remove_extent(inode, lblk, end, &reserved);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	ext4_es_print_tree(inode);
+	ext4_da_release_space(inode, reserved);
 	return err;
 }
 
@@ -1113,9 +1586,9 @@
 	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
 		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
 		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
-	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-		   es_stats->es_stats_cache_hits,
-		   es_stats->es_stats_cache_misses);
+	seq_printf(seq, "  %lld/%lld cache hits/misses\n",
+		   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+		   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
 	if (inode_cnt)
 		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1142,35 +1615,46 @@
 	sbi->s_es_nr_inode = 0;
 	spin_lock_init(&sbi->s_es_lock);
 	sbi->s_es_stats.es_stats_shrunk = 0;
-	sbi->s_es_stats.es_stats_cache_hits = 0;
-	sbi->s_es_stats.es_stats_cache_misses = 0;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
+				  GFP_KERNEL);
+	if (err)
+		return err;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
+				  GFP_KERNEL);
+	if (err)
+		goto err1;
 	sbi->s_es_stats.es_stats_scan_time = 0;
 	sbi->s_es_stats.es_stats_max_scan_time = 0;
 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
 	if (err)
-		return err;
+		goto err2;
 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
 	if (err)
-		goto err1;
+		goto err3;
 
 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
 	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	err = register_shrinker(&sbi->s_es_shrinker);
 	if (err)
-		goto err2;
+		goto err4;
 
 	return 0;
-
-err2:
+err4:
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+err2:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err1:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
 	return err;
 }
 
 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
 	unregister_shrinker(&sbi->s_es_shrinker);
@@ -1195,7 +1679,7 @@
 	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
 	if (!es)
 		goto out_wrap;
-	node = &es->rb_node;
+
 	while (*nr_to_scan > 0) {
 		if (es->es_lblk > end) {
 			ei->i_es_shrink_lblk = end + 1;
@@ -1252,3 +1736,437 @@
 	ei->i_es_tree.cache_es = NULL;
 	return nr_shrunk;
 }
+
+/*
+ * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
+ * discretionary entries from the extent status cache.  (Some entries
+ * must be present for proper operation.)
+ */
+void ext4_clear_inode_es(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct extent_status *es;
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+
+	write_lock(&ei->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+	tree->cache_es = NULL;
+	node = rb_first(&tree->root);
+	while (node) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(node);
+		if (!ext4_es_is_delayed(es)) {
+			rb_erase(&es->rb_node, &tree->root);
+			ext4_es_free_extent(inode, es);
+		}
+	}
+	ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+	write_unlock(&ei->i_es_lock);
+}
+
+#ifdef ES_DEBUG__
+static void ext4_print_pending_tree(struct inode *inode)
+{
+	struct ext4_pending_tree *tree;
+	struct rb_node *node;
+	struct pending_reservation *pr;
+
+	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
+	tree = &EXT4_I(inode)->i_pending_tree;
+	node = rb_first(&tree->root);
+	while (node) {
+		pr = rb_entry(node, struct pending_reservation, rb_node);
+		printk(KERN_DEBUG " %u", pr->lclu);
+		node = rb_next(node);
+	}
+	printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_print_pending_tree(inode)
+#endif
+
+int __init ext4_init_pending(void)
+{
+	ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
+					   sizeof(struct pending_reservation),
+					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
+	if (ext4_pending_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void ext4_exit_pending(void)
+{
+	kmem_cache_destroy(ext4_pending_cachep);
+}
+
+void ext4_init_pending_tree(struct ext4_pending_tree *tree)
+{
+	tree->root = RB_ROOT;
+}
+
+/*
+ * __get_pending - retrieve a pointer to a pending reservation
+ *
+ * @inode - file containing the pending cluster reservation
+ * @lclu - logical cluster of interest
+ *
+ * Returns a pointer to a pending reservation if it's a member of
+ * the set, and NULL if not.  Must be called holding i_es_lock.
+ */
+static struct pending_reservation *__get_pending(struct inode *inode,
+						 ext4_lblk_t lclu)
+{
+	struct ext4_pending_tree *tree;
+	struct rb_node *node;
+	struct pending_reservation *pr = NULL;
+
+	tree = &EXT4_I(inode)->i_pending_tree;
+	node = (&tree->root)->rb_node;
+
+	while (node) {
+		pr = rb_entry(node, struct pending_reservation, rb_node);
+		if (lclu < pr->lclu)
+			node = node->rb_left;
+		else if (lclu > pr->lclu)
+			node = node->rb_right;
+		else
+			return pr;
+	}
+	return NULL;
+}
+
+/*
+ * __insert_pending - adds a pending cluster reservation to the set of
+ *                    pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster to be added
+ *
+ * Returns 0 on successful insertion and -ENOMEM on failure.  If the
+ * pending reservation is already in the set, returns successfully.
+ */
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+	struct rb_node **p = &tree->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pending_reservation *pr;
+	ext4_lblk_t lclu;
+	int ret = 0;
+
+	lclu = EXT4_B2C(sbi, lblk);
+	/* search to find parent for insertion */
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct pending_reservation, rb_node);
+
+		if (lclu < pr->lclu) {
+			p = &(*p)->rb_left;
+		} else if (lclu > pr->lclu) {
+			p = &(*p)->rb_right;
+		} else {
+			/* pending reservation already inserted */
+			goto out;
+		}
+	}
+
+	/* atomic allocation: all callers hold the write side of i_es_lock */
+	pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+	if (pr == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	pr->lclu = lclu;
+
+	rb_link_node(&pr->rb_node, parent, p);
+	rb_insert_color(&pr->rb_node, &tree->root);
+
+out:
+	return ret;
+}
+
+/*
+ * __remove_pending - removes a pending cluster reservation from the set
+ *                    of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Does nothing if the pending reservation is not a member of the set.
+ */
+static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct pending_reservation *pr;
+	struct ext4_pending_tree *tree;
+
+	pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
+	if (pr != NULL) {
+		tree = &EXT4_I(inode)->i_pending_tree;
+		rb_erase(&pr->rb_node, &tree->root);
+		kmem_cache_free(ext4_pending_cachep, pr);
+	}
+}
+
+/*
+ * ext4_remove_pending - removes a pending cluster reservation from the set
+ *                       of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Locking for external use of __remove_pending.
+ */
+void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	write_lock(&ei->i_es_lock);
+	__remove_pending(inode, lblk);
+	write_unlock(&ei->i_es_lock);
+}
+
+/*
+ * ext4_is_pending - determine whether a cluster has a pending reservation
+ *                   on it
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster
+ *
+ * Returns true if there's a pending reservation for the cluster in the
+ * set of pending reservations, and false if not.
+ */
+bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	bool ret;
+
+	read_lock(&ei->i_es_lock);
+	ret = (__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
+	read_unlock(&ei->i_es_lock);
+
+	return ret;
+}
+
+/*
+ * ext4_es_insert_delayed_block - adds a delayed block to the extents status
+ *                                tree, adding a pending reservation where
+ *                                needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ * @allocated - indicates whether a physical cluster has been allocated for
+ *              the logical cluster that contains the block
+ *
+ * Returns 0 on success, negative error code on failure.
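+ *
+ * If @allocated is true, a pending reservation is recorded for the
+ * block's cluster in addition to marking the block delayed in the
+ * extents status tree.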
+ */
+int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+				 bool allocated)
+{
+	struct extent_status newes;
+	int err = 0;
+
+	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
+		 lblk, inode->i_ino);
+
+	newes.es_lblk = lblk;
+	newes.es_len = 1;
+	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
+	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+
+	ext4_es_insert_extent_check(inode, &newes);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+
+	err = __es_remove_extent(inode, lblk, lblk, NULL);
+	if (err != 0)
+		goto error;
+retry:
+	err = __es_insert_extent(inode, &newes);
+	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+					  128, EXT4_I(inode)))
+		goto retry;
+	if (err != 0)
+		goto error;
+
+	if (allocated)
+		__insert_pending(inode, lblk);
+
+error:
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+
+	ext4_es_print_tree(inode);
+	ext4_print_pending_tree(inode);
+
+	return err;
+}
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ *                    are delayed only
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters containing only delayed (not delayed
+ * and unwritten) blocks in the range specified by @start and @end.  Any
+ * cluster or part of a cluster within the range and containing a delayed
+ * and not unwritten block within the range is counted as a whole cluster.
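+ *
+ * For instance, with a cluster ratio of 4 and delonly extents covering
+ * blocks 2-5 and 14-15, a scan of blocks 0-15 returns 3: clusters 0 and
+ * 1 for the first extent plus cluster 3 for the second.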
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+				     ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct extent_status *es;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct rb_node *node;
+	ext4_lblk_t first_lclu, last_lclu;
+	unsigned long long last_counted_lclu;
+	unsigned int n = 0;
+
+	/* guaranteed to be unequal to any ext4_lblk_t value */
+	last_counted_lclu = ~0ULL;
+
+	es = __es_tree_search(&tree->root, start);
+
+	while (es && (es->es_lblk <= end)) {
+		if (ext4_es_is_delonly(es)) {
+			if (es->es_lblk <= start)
+				first_lclu = EXT4_B2C(sbi, start);
+			else
+				first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+			if (ext4_es_end(es) >= end)
+				last_lclu = EXT4_B2C(sbi, end);
+			else
+				last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+			if (first_lclu == last_counted_lclu)
+				n += last_lclu - first_lclu;
+			else
+				n += last_lclu - first_lclu + 1;
+			last_counted_lclu = last_lclu;
+		}
+		node = rb_next(&es->rb_node);
+		if (!node)
+			break;
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ *                       are delayed and not unwritten (delonly)
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Locking for external use of __es_delayed_clu().
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_lblk_t end;
+	unsigned int n;
+
+	if (len == 0)
+		return 0;
+
+	end = lblk + len - 1;
+	WARN_ON(end < lblk);
+
+	read_lock(&ei->i_es_lock);
+
+	n = __es_delayed_clu(inode, lblk, end);
+
+	read_unlock(&ei->i_es_lock);
+
+	return n;
+}
+
+/*
+ * __revise_pending - makes, cancels, or leaves unchanged pending cluster
+ *                    reservations for a specified block range depending
+ *                    upon the presence or absence of delayed blocks
+ *                    outside the range within clusters at the ends of the
+ *                    range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len  - length of range in blocks
+ *
+ * Used after a newly allocated extent is added to the extents status tree.
+ * Requires that the extents in the range have either written or unwritten
+ * status.  Must be called while holding i_es_lock.
+ */
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+			     ext4_lblk_t len)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t end = lblk + len - 1;
+	ext4_lblk_t first, last;
+	bool f_del = false, l_del = false;
+
+	if (len == 0)
+		return;
+
+	/*
+	 * Two cases - block range within single cluster and block range
+	 * spanning two or more clusters.  Note that a cluster belonging
+	 * to a range starting and/or ending on a cluster boundary is treated
+	 * as if it does not contain a delayed extent.  The new range may
+	 * have allocated space for previously delayed blocks out to the
+	 * cluster boundary, requiring that any pre-existing pending
+	 * reservation be canceled.  Because this code only looks at blocks
+	 * outside the range, it should revise pending reservations
+	 * correctly even if the extent represented by the range can't be
+	 * inserted in the extents status tree due to ENOSPC.
+	 */
+
+	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
+		first = EXT4_LBLK_CMASK(sbi, lblk);
+		if (first != lblk)
+			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						first, lblk - 1);
+		if (f_del) {
+			__insert_pending(inode, first);
+		} else {
+			last = EXT4_LBLK_CMASK(sbi, end) +
+			       sbi->s_cluster_ratio - 1;
+			if (last != end)
+				l_del = __es_scan_range(inode,
+							&ext4_es_is_delonly,
+							end + 1, last);
+			if (l_del)
+				__insert_pending(inode, last);
+			else
+				__remove_pending(inode, last);
+		}
+	} else {
+		first = EXT4_LBLK_CMASK(sbi, lblk);
+		if (first != lblk)
+			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						first, lblk - 1);
+		if (f_del)
+			__insert_pending(inode, first);
+		else
+			__remove_pending(inode, first);
+
+		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
+		if (last != end)
+			l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						end + 1, last);
+		if (l_del)
+			__insert_pending(inode, last);
+		else
+			__remove_pending(inode, last);
+	}
+}
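
The boundary handling in __revise_pending() reduces to mask arithmetic: EXT4_LBLK_CMASK() rounds a block down to its cluster start, and the cluster holding the range's end finishes at that mask plus the cluster ratio minus one. Partial clusters that need a delayed-only scan exist only when those rounded values differ from the range ends. A runnable sketch with an assumed ratio of 16 blocks per cluster:

#include <stdio.h>
#include <stdint.h>

#define RATIO 16U
#define CMASK(b) ((b) & ~(RATIO - 1))	/* mirrors EXT4_LBLK_CMASK() */

int main(void)
{
	uint32_t lblk = 18, len = 20, end = lblk + len - 1;	/* 18..37 */
	uint32_t first = CMASK(lblk);		/* 16: start of lblk's cluster */
	uint32_t last = CMASK(end) + RATIO - 1;	/* 47: end of end's cluster */

	/* partial clusters exist only when the range ends are unaligned */
	printf("scan head blocks %u..%u? %s\n", first, lblk - 1,
	       first != lblk ? "yes" : "no");	/* 16..17: yes */
	printf("scan tail blocks %u..%u? %s\n", end + 1, last,
	       last != end ? "yes" : "no");	/* 38..47: yes */
	return 0;
}
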
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb9..825313c 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,14 +70,59 @@
 
 struct ext4_es_stats {
 	unsigned long es_stats_shrunk;
-	unsigned long es_stats_cache_hits;
-	unsigned long es_stats_cache_misses;
+	struct percpu_counter es_stats_cache_hits;
+	struct percpu_counter es_stats_cache_misses;
 	u64 es_stats_scan_time;
 	u64 es_stats_max_scan_time;
 	struct percpu_counter es_stats_all_cnt;
 	struct percpu_counter es_stats_shk_cnt;
 };
 
+/*
+ * Pending cluster reservations for bigalloc file systems
+ *
+ * A cluster with a pending reservation is a logical cluster shared by at
+ * least one extent in the extents status tree with delayed and unwritten
+ * status and at least one other written or unwritten extent.  The
+ * reservation is said to be pending because a cluster reservation would
+ * have to be taken in the event all blocks in the cluster shared with
+ * written or unwritten extents were deleted while the delayed and
+ * unwritten blocks remained.
+ *
+ * The set of pending cluster reservations is an auxiliary data structure
+ * used with the extents status tree to implement reserved cluster/block
+ * accounting for bigalloc file systems.  The set is kept in memory and
+ * records all pending cluster reservations.
+ *
+ * Its primary function is to avoid the need to read extents from the
+ * disk when invalidating pages as a result of a truncate, punch hole, or
+ * collapse range operation.  Page invalidation requires a decrease in the
+ * reserved cluster count if it results in the removal of all delayed
+ * and unwritten extents (blocks) from a cluster that is not shared with a
+ * written or unwritten extent, and no decrease otherwise.  Determining
+ * whether the cluster is shared can be done by searching for a pending
+ * reservation on it.
+ *
+ * Secondarily, it provides a potentially faster method for determining
+ * whether the reserved cluster count should be increased when a physical
+ * cluster is deallocated as a result of a truncate, punch hole, or
+ * collapse range operation.  The necessary information is also present
+ * in the extents status tree, but might be more rapidly accessed in
+ * the pending reservation set in many cases due to smaller size.
+ *
+ * The pending cluster reservation set is implemented as a red-black tree
+ * with the goal of minimizing per page search time overhead.
+ */
+
+struct pending_reservation {
+	struct rb_node rb_node;
+	ext4_lblk_t lclu;
+};
+
+struct ext4_pending_tree {
+	struct rb_root root;
+};
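
Given the structures above, deciding whether a logical cluster has a pending reservation is a standard rb-tree walk keyed on lclu. A hedged sketch of such a lookup, written against the kernel rb-tree API but not copied from the ext4 implementation:

#include <linux/rbtree.h>

/* Hedged sketch, not the ext4 code: search the pending tree for @lclu. */
static struct pending_reservation *
pending_lookup(struct ext4_pending_tree *tree, ext4_lblk_t lclu)
{
	struct rb_node *node = tree->root.rb_node;

	while (node) {
		struct pending_reservation *pr =
			rb_entry(node, struct pending_reservation, rb_node);

		if (lclu < pr->lclu)
			node = node->rb_left;
		else if (lclu > pr->lclu)
			node = node->rb_right;
		else
			return pr;	/* reservation is pending */
	}
	return NULL;
}
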
+
 extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -90,11 +135,19 @@
 				 unsigned int status);
 extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 				 ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent_range(struct inode *inode,
-					ext4_lblk_t lblk, ext4_lblk_t end,
-					struct extent_status *es);
+extern void ext4_es_find_extent_range(struct inode *inode,
+				      int (*match_fn)(struct extent_status *es),
+				      ext4_lblk_t lblk, ext4_lblk_t end,
+				      struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t *next_lblk,
 				 struct extent_status *es);
+extern bool ext4_es_scan_range(struct inode *inode,
+			       int (*matching_fn)(struct extent_status *es),
+			       ext4_lblk_t lblk, ext4_lblk_t end);
+extern bool ext4_es_scan_clu(struct inode *inode,
+			     int (*matching_fn)(struct extent_status *es),
+			     ext4_lblk_t lblk);
 
 static inline unsigned int ext4_es_status(struct extent_status *es)
 {
@@ -126,6 +179,16 @@
 	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
 }
 
+static inline int ext4_es_is_mapped(struct extent_status *es)
+{
+	return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
+}
+
+static inline int ext4_es_is_delonly(struct extent_status *es)
+{
+	return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
+}
+
 static inline void ext4_es_set_referenced(struct extent_status *es)
 {
 	es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -175,4 +238,15 @@
 
 extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
 
+extern int __init ext4_init_pending(void);
+extern void ext4_exit_pending(void);
+extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
+extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
+extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+					bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+					ext4_lblk_t len);
+extern void ext4_clear_inode_es(struct inode *inode);
+
 #endif /* _EXT4_EXTENTS_STATUS_H */
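
The search interface above is now predicate-driven: callers pass a matching function such as ext4_es_is_delayed or ext4_es_is_delonly to select which cached extents count as a hit, as the inode.c hunks later in this patch do. An illustrative wrapper only; range_has_delayed() is not part of the patch:

/* Illustrative caller: does any cached delayed extent overlap [lblk, end]? */
static bool range_has_delayed(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	return ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk, end);
}
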
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d4..8d2bbcc 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -125,7 +125,7 @@
 	struct super_block *sb = inode->i_sb;
 	int blockmask = sb->s_blocksize - 1;
 
-	if (pos >= i_size_read(inode))
+	if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
 		return 0;
 
 	if ((pos | iov_iter_alignment(from)) & blockmask)
@@ -165,6 +165,10 @@
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0)
 		return ret;
+
+	if (unlikely(IS_IMMUTABLE(inode)))
+		return -EPERM;
+
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -226,8 +230,6 @@
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
 #endif
-	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
-		return -EOPNOTSUPP;
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
@@ -264,6 +266,13 @@
 	}
 
 	ret = __generic_file_write_iter(iocb, from);
+	/*
+	 * Unaligned direct AIO must be the only IO in flight. Otherwise
+	 * overlapping aligned IO after unaligned might result in data
+	 * corruption.
+	 */
+	if (ret == -EIOCBQUEUED && unaligned_aio)
+		ext4_unwritten_wait(inode);
 	inode_unlock(inode);
 
 	if (ret > 0)
@@ -360,15 +369,17 @@
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct dax_device *dax_dev = sbi->s_daxdev;
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	if (unlikely(ext4_forced_shutdown(sbi)))
 		return -EIO;
 
 	/*
-	 * We don't support synchronous mappings for non-DAX files. At least
-	 * until someone comes with a sensible use case.
+	 * We don't support synchronous mappings for non-DAX files and
+	 * for DAX files if underneath dax_device is not synchronous.
 	 */
-	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(file);
@@ -444,6 +455,10 @@
 	if (ret)
 		return ret;
 
+	ret = fsverity_file_open(inode, filp);
+	if (ret)
+		return ret;
+
 	/*
 	 * Set up the jbd2_inode if we are opening the inode for
 	 * writing and the journal is present
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 4b99e2d..dbccf46 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -626,7 +626,7 @@
 {
 	struct ext4_fsmap dkeys[2];	/* per-dev keys */
 	struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS];
-	struct ext4_getfsmap_info info = {0};
+	struct ext4_getfsmap_info info = { NULL };
 	int i;
 	int error = 0;
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 26a7fe5..5508baa 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -159,6 +159,9 @@
 			ret = err;
 	}
 out:
+	err = file_check_and_advance_wb_err(file);
+	if (ret == 0)
+		ret = err;
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e22dcfa..3e13379 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/unicode.h>
 #include <linux/compiler.h>
 #include <linux/bitops.h>
 #include "ext4.h"
@@ -196,7 +197,8 @@
  * represented, and whether or not the returned hash is 32 bits or 64
  * bits.  32 bit hashes will return 0 for the minor hash.
  */
-int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+static int __ext4fs_dirhash(const char *name, int len,
+			    struct dx_hash_info *hinfo)
 {
 	__u32	hash;
 	__u32	minor_hash = 0;
@@ -231,6 +233,7 @@
 		break;
 	case DX_HASH_HALF_MD4_UNSIGNED:
 		str2hashbuf = str2hashbuf_unsigned;
+		/* fall through */
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
@@ -244,6 +247,7 @@
 		break;
 	case DX_HASH_TEA_UNSIGNED:
 		str2hashbuf = str2hashbuf_unsigned;
+		/* fall through */
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
@@ -266,3 +270,33 @@
 	hinfo->minor_hash = minor_hash;
 	return 0;
 }
+
+int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
+		   struct dx_hash_info *hinfo)
+{
+#ifdef CONFIG_UNICODE
+	const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+	int r, dlen;
+	unsigned char *buff;
+	struct qstr qstr = {.name = name, .len = len };
+
+	if (len && IS_CASEFOLDED(dir) && um) {
+		buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+		if (!buff)
+			return -ENOMEM;
+
+		dlen = utf8_casefold(um, &qstr, buff, PATH_MAX);
+		if (dlen < 0) {
+			kfree(buff);
+			goto opaque_seq;
+		}
+
+		r = __ext4fs_dirhash(buff, dlen, hinfo);
+
+		kfree(buff);
+		return r;
+	}
+opaque_seq:
+#endif
+	return __ext4fs_dirhash(name, len, hinfo);
+}
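
The wrapper above casefolds a name before hashing so that, in a casefolded directory, names differing only by case land in the same htree bucket. A runnable userspace sketch of that property; tolower() and a djb2-style hash stand in for utf8_casefold() and the ext4 dirhash:

#include <ctype.h>
#include <stdio.h>

static unsigned long toy_dirhash(const char *name, int casefold)
{
	unsigned long h = 5381;

	while (*name) {
		unsigned char c = *name++;

		if (casefold)
			c = tolower(c);	/* stand-in for utf8_casefold() */
		h = h * 33 + c;
	}
	return h;
}

int main(void)
{
	/* prints 1: "Foo" and "foo" hash identically when casefolded */
	printf("%d\n", toy_dirhash("Foo", 1) == toy_dirhash("foo", 1));
	return 0;
}
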
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2addcb8..764ff4c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -455,7 +455,7 @@
 		if (qstr) {
 			hinfo.hash_version = DX_HASH_HALF_MD4;
 			hinfo.seed = sbi->s_hash_seed;
-			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+			ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
 			grp = hinfo.hash;
 		} else
 			grp = prandom_u32();
@@ -771,7 +771,7 @@
 	if (unlikely(ext4_forced_shutdown(sbi)))
 		return ERR_PTR(-EIO);
 
-	if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+	if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
 	    !(i_flags & EXT4_EA_INODE_FL)) {
 		err = fscrypt_get_encryption_info(dir);
@@ -1216,7 +1216,7 @@
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (IS_ERR(bitmap_bh))
-		return (struct inode *) bitmap_bh;
+		return ERR_CAST(bitmap_bh);
 
 	/* Having the inode bit set should be a 100% indicator that this
 	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
@@ -1225,7 +1225,7 @@
 	if (!ext4_test_bit(bit, bitmap_bh->b_data))
 		goto bad_orphan;
 
-	inode = ext4_iget(sb, ino);
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa15..36699a1 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -294,14 +294,12 @@
 }
 
 /**
- *	ext4_alloc_branch - allocate and set up a chain of blocks.
- *	@handle: handle for this transaction
- *	@inode: owner
- *	@indirect_blks: number of allocated indirect blocks
- *	@blks: number of allocated direct blocks
- *	@goal: preferred place for allocation
- *	@offsets: offsets (in the blocks) to store the pointers to next.
- *	@branch: place to store the chain in.
+ * ext4_alloc_branch() - allocate and set up a chain of blocks
+ * @handle: handle for this transaction
+ * @ar: structure describing the allocation request
+ * @indirect_blks: number of allocated indirect blocks
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
  *
  *	This function allocates blocks, zeroes out all but the last one,
  *	links them into chain and (if we are synchronous) writes them to disk.
@@ -396,15 +394,11 @@
 }
 
 /**
- * ext4_splice_branch - splice the allocated branch onto inode.
+ * ext4_splice_branch() - splice the allocated branch onto inode.
  * @handle: handle for this transaction
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *	ext4_alloc_branch)
+ * @ar: structure describing the allocation request
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
- * @blks:  number of direct blocks we are adding
  *
  * This function fills the missing link and does all housekeeping needed in
  * inode (->i_blocks, etc.). In case of success we end up with the full
@@ -1183,18 +1177,21 @@
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
 			i_data[EXT4_IND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_IND_BLOCK:
 		nr = i_data[EXT4_DIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
 			i_data[EXT4_DIND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_DIND_BLOCK:
 		nr = i_data[EXT4_TIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
 			i_data[EXT4_TIND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_TIND_BLOCK:
 		;
 	}
@@ -1219,6 +1216,7 @@
 	ext4_lblk_t offsets[4], offsets2[4];
 	Indirect chain[4], chain2[4];
 	Indirect *partial, *partial2;
+	Indirect *p = NULL, *p2 = NULL;
 	ext4_lblk_t max_block;
 	__le32 nr = 0, nr2 = 0;
 	int n = 0, n2 = 0;
@@ -1260,7 +1258,7 @@
 		}
 
 
-		partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+		partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
 		if (nr) {
 			if (partial == chain) {
 				/* Shared branch grows from the inode */
@@ -1285,13 +1283,11 @@
 				partial->p + 1,
 				(__le32 *)partial->bh->b_data+addr_per_block,
 				(chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
 			partial--;
 		}
 
 end_range:
-		partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+		partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 		if (nr2) {
 			if (partial2 == chain2) {
 				/*
@@ -1321,16 +1317,14 @@
 					   (__le32 *)partial2->bh->b_data,
 					   partial2->p,
 					   (chain2+n2-1) - partial2);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
 			partial2--;
 		}
 		goto do_indirects;
 	}
 
 	/* Punch happened within the same level (n == n2) */
-	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
-	partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+	partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
+	partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 
 	/* Free top, but only if partial2 isn't its subtree. */
 	if (nr) {
@@ -1387,11 +1381,7 @@
 					   partial->p + 1,
 					   partial2->p,
 					   (chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
-			return 0;
+			goto cleanup;
 		}
 
 		/*
@@ -1406,8 +1396,6 @@
 					   partial->p + 1,
 					   (__le32 *)partial->bh->b_data+addr_per_block,
 					   (chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
 			partial--;
 		}
 		if (partial2 > chain2 && depth2 <= depth) {
@@ -1415,11 +1403,21 @@
 					   (__le32 *)partial2->bh->b_data,
 					   partial2->p,
 					   (chain2+n2-1) - partial2);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
 			partial2--;
 		}
 	}
+
+cleanup:
+	while (p && p > chain) {
+		BUFFER_TRACE(p->bh, "call brelse");
+		brelse(p->bh);
+		p--;
+	}
+	while (p2 && p2 > chain2) {
+		BUFFER_TRACE(p2->bh, "call brelse");
+		brelse(p2->bh);
+		p2--;
+	}
 	return 0;
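
The cleanup block above replaces the scattered brelse() calls: whichever path reaches it, each chain is walked back toward its base so every buffer reference taken while resolving the chain is dropped exactly once (the base entry lives in the inode itself and has no bh). A hedged sketch of the idiom; Indirect mirrors the typedef used by this file:

/* Hedged sketch, not the patch itself: release a partially resolved
 * indirect chain.  'p' points at the deepest entry actually read. */
static void release_chain(Indirect *chain, Indirect *p)
{
	while (p && p > chain) {
		brelse(p->bh);	/* reference taken when the level was read */
		p--;
	}
}
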
 
 do_indirects:
@@ -1427,30 +1425,33 @@
 	switch (offsets[0]) {
 	default:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_IND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
 			i_data[EXT4_IND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_IND_BLOCK:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_DIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
 			i_data[EXT4_DIND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_DIND_BLOCK:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_TIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
 			i_data[EXT4_TIND_BLOCK] = 0;
 		}
+		/* fall through */
 	case EXT4_TIND_BLOCK:
 		;
 	}
-	return 0;
+	goto cleanup;
 }
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c4bac1..2fec62d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -705,8 +705,11 @@
 
 	if (!PageUptodate(page)) {
 		ret = ext4_read_inline_page(inode, page);
-		if (ret < 0)
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
 			goto out_up_read;
+		}
 	}
 
 	ret = 1;
@@ -1129,7 +1132,6 @@
 {
 	int err, csum_size = 0, header_size = 0;
 	struct ext4_dir_entry_2 *de;
-	struct ext4_dir_entry_tail *t;
 	void *target = dir_block->b_data;
 
 	/*
@@ -1155,13 +1157,11 @@
 			inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
 			inode->i_sb->s_blocksize - csum_size);
 
-	if (csum_size) {
-		t = EXT4_DIRENT_TAIL(dir_block->b_data,
-				     inode->i_sb->s_blocksize);
-		initialize_dirent_tail(t, inode->i_sb->s_blocksize);
-	}
+	if (csum_size)
+		ext4_initialize_dirent_tail(dir_block,
+					    inode->i_sb->s_blocksize);
 	set_buffer_uptodate(dir_block);
-	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+	err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
 	if (err)
 		return err;
 	set_buffer_verified(dir_block);
@@ -1324,11 +1324,11 @@
  * inlined dir.  It returns the number of directory entries loaded
  * into the tree.  If there is an error it is returned in err.
  */
-int htree_inlinedir_to_tree(struct file *dir_file,
-			    struct inode *dir, ext4_lblk_t block,
-			    struct dx_hash_info *hinfo,
-			    __u32 start_hash, __u32 start_minor_hash,
-			    int *has_inline_data)
+int ext4_inlinedir_to_tree(struct file *dir_file,
+			   struct inode *dir, ext4_lblk_t block,
+			   struct dx_hash_info *hinfo,
+			   __u32 start_hash, __u32 start_minor_hash,
+			   int *has_inline_data)
 {
 	int err = 0, count = 0;
 	unsigned int parent_ino;
@@ -1404,7 +1404,7 @@
 			}
 		}
 
-		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
 		    ((hinfo->hash == start_hash) &&
 		     (hinfo->minor_hash < start_minor_hash)))
@@ -1416,7 +1416,7 @@
 		err = ext4_htree_store_dirent(dir_file, hinfo->hash,
 					      hinfo->minor_hash, de, &tmp_str);
 		if (err) {
-			count = err;
+			ret = err;
 			goto out;
 		}
 		count++;
@@ -1887,12 +1887,12 @@
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 
-	if (physical)
-		error = fiemap_fill_next_extent(fieinfo, start, physical,
-						inline_len, flags);
 	brelse(iloc.bh);
 out:
 	up_read(&EXT4_I(inode)->xattr_sem);
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, start, physical,
+						inline_len, flags);
 	return (error < 0 ? error : 0);
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 244531d..d691d17 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -391,7 +391,7 @@
 	 * inode's preallocations.
 	 */
 	if ((ei->i_reserved_data_blocks == 0) &&
-	    (atomic_read(&inode->i_writecount) == 0))
+	    !inode_is_open_for_write(inode))
 		ext4_discard_preallocations(inode);
 }
 
@@ -399,6 +399,10 @@
 				unsigned int line,
 				struct ext4_map_blocks *map)
 {
+	if (ext4_has_feature_journal(inode->i_sb) &&
+	    (inode->i_ino ==
+	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+		return 0;
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
 				   map->m_len)) {
 		ext4_error_inode(inode, func, line, map->m_pblk,
@@ -415,7 +419,7 @@
 {
 	int ret;
 
-	if (ext4_encrypted_inode(inode))
+	if (IS_ENCRYPTED(inode))
 		return fscrypt_zeroout_range(inode, lblk, pblk, len);
 
 	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -523,7 +527,7 @@
 		return -EFSCORRUPTED;
 
 	/* Lookup extent status tree firstly */
-	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
 		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
 			map->m_pblk = ext4_es_pblock(&es) +
 					map->m_lblk - es.es_lblk;
@@ -541,7 +545,7 @@
 			map->m_len = retval;
 			retval = 0;
 		} else {
-			BUG_ON(1);
+			BUG();
 		}
 #ifdef ES_AGGRESSIVE_TEST
 		ext4_map_blocks_es_recheck(handle, inode, map,
@@ -577,8 +581,8 @@
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 		    !(status & EXTENT_STATUS_WRITTEN) &&
-		    ext4_find_delalloc_range(inode, map->m_lblk,
-					     map->m_lblk + map->m_len - 1))
+		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+				       map->m_lblk + map->m_len - 1))
 			status |= EXTENT_STATUS_DELAYED;
 		ret = ext4_es_insert_extent(inode, map->m_lblk,
 					    map->m_len, map->m_pblk, status);
@@ -678,8 +682,6 @@
 		if (flags & EXT4_GET_BLOCKS_ZERO &&
 		    map->m_flags & EXT4_MAP_MAPPED &&
 		    map->m_flags & EXT4_MAP_NEW) {
-			clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-					   map->m_len);
 			ret = ext4_issue_zeroout(inode, map->m_lblk,
 						 map->m_pblk, map->m_len);
 			if (ret) {
@@ -693,7 +695,7 @@
 		 * extent status tree.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
-		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
 			if (ext4_es_is_written(&es))
 				goto out_sem;
 		}
@@ -701,8 +703,8 @@
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 		    !(status & EXTENT_STATUS_WRITTEN) &&
-		    ext4_find_delalloc_range(inode, map->m_lblk,
-					     map->m_lblk + map->m_len - 1))
+		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+				       map->m_lblk + map->m_len - 1))
 			status |= EXTENT_STATUS_DELAYED;
 		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 					    map->m_pblk, status);
@@ -729,10 +731,16 @@
 		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
 		    !ext4_is_quota_file(inode) &&
 		    ext4_should_order_data(inode)) {
+			loff_t start_byte =
+				(loff_t)map->m_lblk << inode->i_blkbits;
+			loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+
 			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
-				ret = ext4_jbd2_inode_add_wait(handle, inode);
+				ret = ext4_jbd2_inode_add_wait(handle, inode,
+						start_byte, length);
 			else
-				ret = ext4_jbd2_inode_add_write(handle, inode);
+				ret = ext4_jbd2_inode_add_write(handle, inode,
+						start_byte, length);
 			if (ret)
 				return ret;
 		}
@@ -1016,7 +1024,7 @@
 	bh = ext4_getblk(handle, inode, block, map_flags);
 	if (IS_ERR(bh))
 		return bh;
-	if (!bh || buffer_uptodate(bh))
+	if (!bh || ext4_buffer_uptodate(bh))
 		return bh;
 	ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
 	wait_on_buffer(bh);
@@ -1043,7 +1051,7 @@
 
 	for (i = 0; i < bh_count; i++)
 		/* Note that NULL bhs[i] is valid because of holes. */
-		if (bhs[i] && !buffer_uptodate(bhs[i]))
+		if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
 			ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
 				    &bhs[i]);
 
@@ -1150,7 +1158,7 @@
 	return ret;
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
 {
@@ -1162,8 +1170,9 @@
 	int err = 0;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	unsigned bbits;
-	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-	bool decrypt = false;
+	struct buffer_head *bh, *head, *wait[2];
+	int nr_wait = 0;
+	int i;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(from > PAGE_SIZE);
@@ -1194,7 +1203,6 @@
 			if (err)
 				break;
 			if (buffer_new(bh)) {
-				clean_bdev_bh_alias(bh);
 				if (PageUptodate(page)) {
 					clear_buffer_new(bh);
 					set_buffer_uptodate(bh);
@@ -1216,24 +1224,32 @@
 		    !buffer_unwritten(bh) &&
 		    (block_start < from || block_end > to)) {
 			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-			*wait_bh++ = bh;
-			decrypt = ext4_encrypted_inode(inode) &&
-				S_ISREG(inode->i_mode);
+			wait[nr_wait++] = bh;
 		}
 	}
 	/*
 	 * If we issued read requests, let them complete.
 	 */
-	while (wait_bh > wait) {
-		wait_on_buffer(*--wait_bh);
-		if (!buffer_uptodate(*wait_bh))
+	for (i = 0; i < nr_wait; i++) {
+		wait_on_buffer(wait[i]);
+		if (!buffer_uptodate(wait[i]))
 			err = -EIO;
 	}
-	if (unlikely(err))
+	if (unlikely(err)) {
 		page_zero_new_buffers(page, from, to);
-	else if (decrypt)
-		err = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
+	} else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+		for (i = 0; i < nr_wait; i++) {
+			int err2;
+
+			err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+								bh_offset(wait[i]));
+			if (err2) {
+				clear_buffer_uptodate(wait[i]);
+				err = err2;
+			}
+		}
+	}
+
 	return err;
 }
 #endif
@@ -1303,7 +1319,7 @@
 	/* In case writeback began while the page was unlocked */
 	wait_for_stable_page(page);
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	if (ext4_should_dioread_nolock(inode))
 		ret = ext4_block_write_begin(page, pos, len,
 					     ext4_get_block_unwritten);
@@ -1324,6 +1340,9 @@
 	}
 
 	if (ret) {
+		bool extended = (pos + len > inode->i_size) &&
+				!ext4_verity_in_progress(inode);
+
 		unlock_page(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
@@ -1333,11 +1352,11 @@
 		 * Add inode to orphan list in case we crash before
 		 * truncate finishes
 		 */
-		if (pos + len > inode->i_size && ext4_can_truncate(inode))
+		if (extended && ext4_can_truncate(inode))
 			ext4_orphan_add(handle, inode);
 
 		ext4_journal_stop(handle);
-		if (pos + len > inode->i_size) {
+		if (extended) {
 			ext4_truncate_failed_write(inode);
 			/*
 			 * If truncate failed early the inode might
@@ -1390,6 +1409,7 @@
 	int ret = 0, ret2;
 	int i_size_changed = 0;
 	int inline_data = ext4_has_inline_data(inode);
+	bool verity = ext4_verity_in_progress(inode);
 
 	trace_ext4_write_end(inode, pos, len, copied);
 	if (inline_data) {
@@ -1407,12 +1427,16 @@
 	/*
 	 * it's important to update i_size while still holding page lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
+	 *
+	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+	 * blocks are being written past EOF, so skip the i_size update.
 	 */
-	i_size_changed = ext4_update_inode_size(inode, pos + copied);
+	if (!verity)
+		i_size_changed = ext4_update_inode_size(inode, pos + copied);
 	unlock_page(page);
 	put_page(page);
 
-	if (old_size < pos)
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1423,7 +1447,7 @@
 	if (i_size_changed || inline_data)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1434,7 +1458,7 @@
 	if (!ret)
 		ret = ret2;
 
-	if (pos + len > inode->i_size) {
+	if (pos + len > inode->i_size && !verity) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
@@ -1495,6 +1519,7 @@
 	unsigned from, to;
 	int size_changed = 0;
 	int inline_data = ext4_has_inline_data(inode);
+	bool verity = ext4_verity_in_progress(inode);
 
 	trace_ext4_journalled_write_end(inode, pos, len, copied);
 	from = pos & (PAGE_SIZE - 1);
@@ -1524,13 +1549,14 @@
 		if (!partial)
 			SetPageUptodate(page);
 	}
-	size_changed = ext4_update_inode_size(inode, pos + copied);
+	if (!verity)
+		size_changed = ext4_update_inode_size(inode, pos + copied);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
 	unlock_page(page);
 	put_page(page);
 
-	if (old_size < pos)
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
 
 	if (size_changed || inline_data) {
@@ -1539,7 +1565,7 @@
 			ret = ret2;
 	}
 
-	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
@@ -1550,7 +1576,7 @@
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	if (pos + len > inode->i_size) {
+	if (pos + len > inode->i_size && !verity) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
@@ -1595,7 +1621,7 @@
 	return 0;       /* success */
 }
 
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1630,64 +1656,6 @@
 	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }
 
-static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned int offset,
-					     unsigned int length)
-{
-	int to_release = 0, contiguous_blks = 0;
-	struct buffer_head *head, *bh;
-	unsigned int curr_off = 0;
-	struct inode *inode = page->mapping->host;
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	unsigned int stop = offset + length;
-	int num_clusters;
-	ext4_fsblk_t lblk;
-
-	BUG_ON(stop > PAGE_SIZE || stop < length);
-
-	head = page_buffers(page);
-	bh = head;
-	do {
-		unsigned int next_off = curr_off + bh->b_size;
-
-		if (next_off > stop)
-			break;
-
-		if ((offset <= curr_off) && (buffer_delay(bh))) {
-			to_release++;
-			contiguous_blks++;
-			clear_buffer_delay(bh);
-		} else if (contiguous_blks) {
-			lblk = page->index <<
-			       (PAGE_SHIFT - inode->i_blkbits);
-			lblk += (curr_off >> inode->i_blkbits) -
-				contiguous_blks;
-			ext4_es_remove_extent(inode, lblk, contiguous_blks);
-			contiguous_blks = 0;
-		}
-		curr_off = next_off;
-	} while ((bh = bh->b_this_page) != head);
-
-	if (contiguous_blks) {
-		lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
-		lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
-		ext4_es_remove_extent(inode, lblk, contiguous_blks);
-	}
-
-	/* If we have released all the blocks belonging to a cluster, then we
-	 * need to release the reserved space for that cluster. */
-	num_clusters = EXT4_NUM_B2C(sbi, to_release);
-	while (num_clusters > 0) {
-		lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
-			((num_clusters - 1) << sbi->s_cluster_bits);
-		if (sbi->s_cluster_ratio == 1 ||
-		    !ext4_find_delalloc_cluster(inode, lblk))
-			ext4_da_release_space(inode, 1);
-
-		num_clusters--;
-	}
-}
-
 /*
  * Delayed allocation stuff
  */
@@ -1781,6 +1749,65 @@
 }
 
 /*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ *                             tree, incrementing the reserved cluster/block
+ *                             count or making a pending reservation
+ *                             where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int ret;
+	bool allocated = false;
+
+	/*
+	 * If the cluster containing lblk is shared with a delayed,
+	 * written, or unwritten extent in a bigalloc file system, it's
+	 * already been accounted for and does not need to be reserved.
+	 * A pending reservation must be made for the cluster if it's
+	 * shared with a written or unwritten extent and doesn't already
+	 * have one.  Written and unwritten extents can be purged from the
+	 * extents status tree if the system is under memory pressure, so
+	 * it's necessary to examine the extent tree if a search of the
+	 * extents status tree doesn't get a match.
+	 */
+	if (sbi->s_cluster_ratio == 1) {
+		ret = ext4_da_reserve_space(inode);
+		if (ret != 0)   /* ENOSPC */
+			goto errout;
+	} else {   /* bigalloc */
+		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+			if (!ext4_es_scan_clu(inode,
+					      &ext4_es_is_mapped, lblk)) {
+				ret = ext4_clu_mapped(inode,
+						      EXT4_B2C(sbi, lblk));
+				if (ret < 0)
+					goto errout;
+				if (ret == 0) {
+					ret = ext4_da_reserve_space(inode);
+					if (ret != 0)   /* ENOSPC */
+						goto errout;
+				} else {
+					allocated = true;
+				}
+			} else {
+				allocated = true;
+			}
+		}
+	}
+
+	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+
+errout:
+	return ret;
+}
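
For bigalloc, the function above boils down to a three-way decision per cluster. A runnable userspace sketch encoding that decision table; the three booleans are illustrative stand-ins for the ext4_es_scan_clu() and ext4_clu_mapped() probes:

#include <stdio.h>
#include <stdbool.h>

enum action { RESERVE, MARK_ALLOCATED, NOTHING_EXTRA };

static enum action classify(bool clu_has_delonly, bool clu_has_mapped_es,
			    bool clu_mapped_on_disk)
{
	if (clu_has_delonly)
		return NOTHING_EXTRA;	/* cluster already accounted for */
	if (clu_has_mapped_es || clu_mapped_on_disk)
		return MARK_ALLOCATED;	/* insert a pending reservation */
	return RESERVE;			/* charge a new cluster */
}

int main(void)
{
	printf("%d %d %d\n",
	       classify(true, false, false),	/* 2: nothing extra */
	       classify(false, true, false),	/* 1: pending reservation */
	       classify(false, false, false));	/* 0: reserve */
	return 0;
}
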
+
+/*
  * This function grabs code from the very beginning of
  * ext4_map_blocks, but assumes that the caller is from delayed write
  * time. This function looks up the requested blocks and sets the
@@ -1808,7 +1835,7 @@
 		  (unsigned long) map->m_lblk);
 
 	/* Lookup extent status tree firstly */
-	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
 		if (ext4_es_is_hole(&es)) {
 			retval = 0;
 			down_read(&EXT4_I(inode)->i_data_sem);
@@ -1836,7 +1863,7 @@
 		else if (ext4_es_is_unwritten(&es))
 			map->m_flags |= EXT4_MAP_UNWRITTEN;
 		else
-			BUG_ON(1);
+			BUG();
 
 #ifdef ES_AGGRESSIVE_TEST
 		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@@ -1859,28 +1886,14 @@
 add_delayed:
 	if (retval == 0) {
 		int ret;
+
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
 		 */
-		/*
-		 * If the block was allocated from previously allocated cluster,
-		 * then we don't need to reserve it again. However we still need
-		 * to reserve metadata for every block we're going to write.
-		 */
-		if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-			ret = ext4_da_reserve_space(inode);
-			if (ret) {
-				/* not enough space to reserve */
-				retval = ret;
-				goto out_unlock;
-			}
-		}
 
-		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-					    ~0, EXTENT_STATUS_DELAYED);
-		if (ret) {
+		ret = ext4_insert_delayed_block(inode, map->m_lblk);
+		if (ret != 0) {
 			retval = ret;
 			goto out_unlock;
 		}
@@ -2116,7 +2129,8 @@
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
-	if (page->index == size >> PAGE_SHIFT)
+	if (page->index == size >> PAGE_SHIFT &&
+	    !ext4_verity_in_progress(inode))
 		len = size & ~PAGE_MASK;
 	else
 		len = PAGE_SIZE;
@@ -2200,7 +2214,8 @@
 	 * after page tables are updated.
 	 */
 	size = i_size_read(mpd->inode);
-	if (page->index == size >> PAGE_SHIFT)
+	if (page->index == size >> PAGE_SHIFT &&
+	    !ext4_verity_in_progress(mpd->inode))
 		len = size & ~PAGE_MASK;
 	else
 		len = PAGE_SIZE;
@@ -2299,6 +2314,9 @@
 	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
 							>> inode->i_blkbits;
 
+	if (ext4_verity_in_progress(inode))
+		blocks = EXT_MAX_BLOCKS;
+
 	do {
 		BUG_ON(buffer_locked(bh));
 
@@ -2460,10 +2478,6 @@
 	}
 
 	BUG_ON(map->m_len == 0);
-	if (map->m_flags & EXT4_MAP_NEW) {
-		clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-				   map->m_len);
-	}
 	return 0;
 }
 
@@ -2613,7 +2627,7 @@
 	long left = mpd->wbc->nr_to_write;
 	pgoff_t index = mpd->first_page;
 	pgoff_t end = mpd->last_page;
-	int tag;
+	xa_mark_t tag;
 	int i, err = 0;
 	int blkbits = mpd->inode->i_blkbits;
 	ext4_lblk_t lblk;
@@ -2743,14 +2757,6 @@
 		goto out_writepages;
 	}
 
-	if (ext4_should_dioread_nolock(inode)) {
-		/*
-		 * We may need to convert up to one extent per block in
-		 * the page and we may dirty the inode.
-		 */
-		rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
-	}
-
 	/*
 	 * If we have inline data and arrive here, it means that
 	 * we will soon create the block for the 1st page, so
@@ -2769,6 +2775,15 @@
 		ext4_journal_stop(handle);
 	}
 
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * We may need to convert up to one extent per block in
+		 * the page and we may dirty the inode.
+		 */
+		rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+						PAGE_SIZE >> inode->i_blkbits);
+	}
+
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
@@ -2805,12 +2820,12 @@
 		goto unplug;
 	}
 	ret = mpage_prepare_extent_to_map(&mpd);
+	/* Unlock pages we didn't use */
+	mpage_release_unused_pages(&mpd, false);
 	/* Submit prepared bio */
 	ext4_io_submit(&mpd.io_submit);
 	ext4_put_io_end_defer(mpd.io_submit.io_end);
 	mpd.io_submit.io_end = NULL;
-	/* Unlock pages we didn't use */
-	mpage_release_unused_pages(&mpd, false);
 	if (ret < 0)
 		goto unplug;
 
@@ -2878,10 +2893,11 @@
 			handle = NULL;
 			mpd.do_map = 0;
 		}
-		/* Submit prepared bio */
-		ext4_io_submit(&mpd.io_submit);
 		/* Unlock pages we didn't use */
 		mpage_release_unused_pages(&mpd, give_up_on_write);
+		/* Submit prepared bio */
+		ext4_io_submit(&mpd.io_submit);
+
 		/*
 		 * Drop our io_end reference we got from init. We have
 		 * to be careful and use deferred io_end finishing if
@@ -3017,8 +3033,8 @@
 
 	index = pos >> PAGE_SHIFT;
 
-	if (ext4_nonda_switch(inode->i_sb) ||
-	    S_ISLNK(inode->i_mode)) {
+	if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+	    ext4_verity_in_progress(inode)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
 		return ext4_write_begin(file, mapping, pos,
 					len, flags, pagep, fsdata);
@@ -3074,7 +3090,7 @@
 	/* In case writeback began while the page was unlocked */
 	wait_for_stable_page(page);
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ret = ext4_block_write_begin(page, pos, len,
 				     ext4_da_get_block_prep);
 #else
@@ -3183,24 +3199,6 @@
 	return ret ? ret : copied;
 }
 
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
-				   unsigned int length)
-{
-	/*
-	 * Drop reserved blocks
-	 */
-	BUG_ON(!PageLocked(page));
-	if (!page_has_buffers(page))
-		goto out;
-
-	ext4_da_page_release_reservation(page, offset, length);
-
-out:
-	ext4_invalidatepage(page, offset, length);
-
-	return;
-}
-
 /*
  * Force all delayed allocation blocks to be allocated for a given inode.
  */
@@ -3450,7 +3448,8 @@
 			ext4_lblk_t end = map.m_lblk + map.m_len - 1;
 			struct extent_status es;
 
-			ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
+			ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+						  map.m_lblk, end, &es);
 
 			if (!es.es_len || es.es_lblk > end) {
 				/* entire range is a hole */
@@ -3848,10 +3847,12 @@
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+#ifdef CONFIG_FS_ENCRYPTION
+	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
 		return 0;
 #endif
+	if (fsverity_active(inode))
+		return 0;
 
 	/*
 	 * If we are doing data journalling we don't support O_DIRECT
@@ -3940,7 +3941,7 @@
 	.write_end		= ext4_da_write_end,
 	.set_page_dirty		= ext4_set_page_dirty,
 	.bmap			= ext4_bmap,
-	.invalidatepage		= ext4_da_invalidatepage,
+	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
@@ -4033,13 +4034,11 @@
 		/* Uhhuh. Read error. Complain and punt. */
 		if (!buffer_uptodate(bh))
 			goto unlock;
-		if (S_ISREG(inode->i_mode) &&
-		    ext4_encrypted_inode(inode)) {
+		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
-			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
-						page, PAGE_SIZE, 0, page->index));
+			WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
+					page, blocksize, bh_offset(bh)));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
@@ -4057,7 +4056,8 @@
 		err = 0;
 		mark_buffer_dirty(bh);
 		if (ext4_should_order_data(inode))
-			err = ext4_jbd2_inode_add_write(handle, inode);
+			err = ext4_jbd2_inode_add_write(handle, inode, from,
+					length);
 	}
 
 unlock:
@@ -4110,7 +4110,7 @@
 	struct inode *inode = mapping->host;
 
 	/* If we are processing an encrypted inode during orphan list handling */
-	if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
 		return 0;
 
 	blocksize = inode->i_sb->s_blocksize;
@@ -4253,6 +4253,15 @@
 
 	trace_ext4_punch_hole(inode, offset, length, 0);
 
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	if (ext4_has_inline_data(inode)) {
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+		ret = ext4_convert_inline_data(inode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * Write out all dirty pages to avoid race conditions
 	 * Then release them.
@@ -4542,6 +4551,7 @@
 	struct buffer_head	*bh;
 	struct super_block	*sb = inode->i_sb;
 	ext4_fsblk_t		block;
+	struct blk_plug		plug;
 	int			inodes_per_block, inode_offset;
 
 	iloc->bh = NULL;
@@ -4630,6 +4640,7 @@
 		 * If we need to do any I/O, try to pre-readahead extra
 		 * blocks from the inode table.
 		 */
+		blk_start_plug(&plug);
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
@@ -4660,6 +4671,7 @@
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4690,7 +4702,9 @@
 		return false;
 	if (ext4_has_inline_data(inode))
 		return false;
-	if (ext4_encrypted_inode(inode))
+	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
+		return false;
+	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
 		return false;
 	return true;
 }
@@ -4714,9 +4728,13 @@
 		new_fl |= S_DAX;
 	if (flags & EXT4_ENCRYPT_FL)
 		new_fl |= S_ENCRYPTED;
+	if (flags & EXT4_CASEFOLD_FL)
+		new_fl |= S_CASEFOLD;
+	if (flags & EXT4_VERITY_FL)
+		new_fl |= S_VERITY;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
-			S_ENCRYPTED);
+			S_ENCRYPTED|S_CASEFOLD|S_VERITY);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4786,7 +4804,9 @@
 		return inode_peek_iversion(inode);
 }
 
-struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+			  ext4_iget_flags flags, const char *function,
+			  unsigned int line)
 {
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
@@ -4800,6 +4820,18 @@
 	gid_t i_gid;
 	projid_t i_projid;
 
+	if ((!(flags & EXT4_IGET_SPECIAL) &&
+	     (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+	    (ino < EXT4_ROOT_INO) ||
+	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+		if (flags & EXT4_IGET_HANDLE)
+			return ERR_PTR(-ESTALE);
+		__ext4_error(sb, function, line,
+			     "inode #%lu: comm %s: iget: illegal inode #",
+			     ino, current->comm);
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -4815,18 +4847,26 @@
 	raw_inode = ext4_raw_inode(&iloc);
 
 	if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
-		EXT4_ERROR_INODE(inode, "root inode unallocated");
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: root inode unallocated");
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
 
+	if ((flags & EXT4_IGET_HANDLE) &&
+	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 			EXT4_INODE_SIZE(inode->i_sb) ||
 		    (ei->i_extra_isize & 3)) {
-			EXT4_ERROR_INODE(inode,
-					 "bad extra_isize %u (inode size %u)",
+			ext4_error_inode(inode, function, line, 0,
+					 "iget: bad extra_isize %u "
+					 "(inode size %u)",
 					 ei->i_extra_isize,
 					 EXT4_INODE_SIZE(inode->i_sb));
 			ret = -EFSCORRUPTED;
@@ -4848,7 +4888,8 @@
 	}
 
 	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
-		EXT4_ERROR_INODE(inode, "checksum invalid");
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: checksum invalid");
 		ret = -EFSBADCRC;
 		goto bad_inode;
 	}
@@ -4905,7 +4946,8 @@
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	inode->i_size = ext4_isize(sb, raw_inode);
 	if ((size = i_size_read(inode)) < 0) {
-		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bad i_size value: %lld", size);
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
@@ -4981,7 +5023,8 @@
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bad extended attribute block %llu",
 				 ei->i_file_acl);
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
@@ -5009,12 +5052,13 @@
 	} else if (S_ISLNK(inode->i_mode)) {
 		/* VFS does not allow setting these so must be corruption */
 		if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-			EXT4_ERROR_INODE(inode,
-			  "immutable or append flags not allowed on symlinks");
+			ext4_error_inode(inode, function, line, 0,
+					 "iget: immutable or append flags "
+					 "not allowed on symlinks");
 			ret = -EFSCORRUPTED;
 			goto bad_inode;
 		}
-		if (ext4_encrypted_inode(inode)) {
+		if (IS_ENCRYPTED(inode)) {
 			inode->i_op = &ext4_encrypted_symlink_inode_operations;
 			ext4_set_aops(inode);
 		} else if (ext4_inode_is_fast_symlink(inode)) {
@@ -5040,9 +5084,13 @@
 		make_bad_inode(inode);
 	} else {
 		ret = -EFSCORRUPTED;
-		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
+	if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+		ext4_error_inode(inode, function, line, 0,
+				 "casefold flag without casefold feature");
 	brelse(iloc.bh);
 
 	unlock_new_inode(inode);
@@ -5054,13 +5102,6 @@
 	return ERR_PTR(ret);
 }
 
-struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
-{
-	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
-		return ERR_PTR(-EFSCORRUPTED);
-	return ext4_iget(sb, ino);
-}
-
 static int ext4_inode_blocks_set(handle_t *handle,
 				struct ext4_inode *raw_inode,
 				struct ext4_inode_info *ei)
@@ -5299,7 +5340,6 @@
 		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 		if (err)
 			goto out_brelse;
-		ext4_update_dynamic_rev(sb);
 		ext4_set_feature_large_file(sb);
 		ext4_handle_sync(handle);
 		err = ext4_handle_dirty_super(handle, sb);
@@ -5349,9 +5389,13 @@
 {
 	int err;
 
-	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
+	    sb_rdonly(inode->i_sb))
 		return 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (EXT4_SB(inode->i_sb)->s_journal) {
 		if (ext4_journal_current_handle()) {
 			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
@@ -5367,7 +5411,8 @@
 		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;
 
-		err = ext4_force_commit(inode->i_sb);
+		err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+						EXT4_I(inode)->i_sync_tid);
 	} else {
 		struct ext4_iloc iloc;
 
@@ -5466,6 +5511,14 @@
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
+	if (unlikely(IS_IMMUTABLE(inode)))
+		return -EPERM;
+
+	if (unlikely(IS_APPEND(inode) &&
+		     (ia_valid & (ATTR_MODE | ATTR_UID |
+				  ATTR_GID | ATTR_TIMES_SET))))
+		return -EPERM;
+
 	error = setattr_prepare(dentry, attr);
 	if (error)
 		return error;
@@ -5474,6 +5527,10 @@
 	if (error)
 		return error;
 
+	error = fsverity_prepare_setattr(dentry, attr);
+	if (error)
+		return error;
+
 	if (is_quota_modification(inode, attr)) {
 		error = dquot_initialize(inode);
 		if (error)
@@ -5517,7 +5574,7 @@
 	if (attr->ia_valid & ATTR_SIZE) {
 		handle_t *handle;
 		loff_t oldsize = inode->i_size;
-		int shrink = (attr->ia_size <= inode->i_size);
+		int shrink = (attr->ia_size < inode->i_size);
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -5531,18 +5588,33 @@
 		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
 			inode_inc_iversion(inode);
 
-		if (ext4_should_order_data(inode) &&
-		    (attr->ia_size < inode->i_size)) {
-			error = ext4_begin_ordered_truncate(inode,
+		if (shrink) {
+			if (ext4_should_order_data(inode)) {
+				error = ext4_begin_ordered_truncate(inode,
 							    attr->ia_size);
-			if (error)
-				goto err_out;
+				if (error)
+					goto err_out;
+			}
+			/*
+			 * Blocks are going to be removed from the inode. Wait
+			 * for dio in flight.
+			 */
+			inode_dio_wait(inode);
 		}
+
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+
+		rc = ext4_break_layouts(inode);
+		if (rc) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			return rc;
+		}
+
 		if (attr->ia_size != inode->i_size) {
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
-				goto err_out;
+				goto out_mmap_sem;
 			}
 			if (ext4_handle_valid(handle) && shrink) {
 				error = ext4_orphan_add(handle, inode);
@@ -5570,33 +5642,14 @@
 				i_size_write(inode, attr->ia_size);
 			up_write(&EXT4_I(inode)->i_data_sem);
 			ext4_journal_stop(handle);
-			if (error) {
-				if (orphan)
-					ext4_orphan_del(NULL, inode);
-				goto err_out;
-			}
-		}
-		if (!shrink)
-			pagecache_isize_extended(inode, oldsize, inode->i_size);
-
-		/*
-		 * Blocks are going to be removed from the inode. Wait
-		 * for dio in flight.  Temporarily disable
-		 * dioread_nolock to prevent livelock.
-		 */
-		if (orphan) {
-			if (!ext4_should_journal_data(inode)) {
-				inode_dio_wait(inode);
-			} else
+			if (error)
+				goto out_mmap_sem;
+			if (!shrink) {
+				pagecache_isize_extended(inode, oldsize,
+							 inode->i_size);
+			} else if (ext4_should_journal_data(inode)) {
 				ext4_wait_for_tail_page_commit(inode);
-		}
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-
-		rc = ext4_break_layouts(inode);
-		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
-			error = rc;
-			goto err_out;
+			}
 		}
 
 		/*
@@ -5604,11 +5657,16 @@
 		 * in data=journal mode to make pages freeable.
 		 */
 		truncate_pagecache(inode, inode->i_size);
-		if (shrink) {
+		/*
+		 * Call ext4_truncate() even if i_size didn't change to
+		 * truncate possible preallocated blocks.
+		 */
+		if (attr->ia_size <= oldsize) {
 			rc = ext4_truncate(inode);
 			if (rc)
 				error = rc;
 		}
+out_mmap_sem:
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
@@ -5854,8 +5912,23 @@
 {
 	struct ext4_inode *raw_inode;
 	struct ext4_xattr_ibody_header *header;
+	unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	int error;
 
+	/* this was checked at iget time, but double check for good measure */
+	if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+	    (ei->i_extra_isize & 3)) {
+		EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+				 ei->i_extra_isize,
+				 EXT4_INODE_SIZE(inode->i_sb));
+		return -EFSCORRUPTED;
+	}
+	if ((new_extra_isize < ei->i_extra_isize) ||
+	    (new_extra_isize < 4) ||
+	    (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+		return -EINVAL;	/* Should never happen */
+
 	raw_inode = ext4_raw_inode(iloc);
 
 	header = IHDR(inode, raw_inode);
@@ -5945,7 +6018,7 @@
 
 	ext4_write_lock_xattr(inode, &no_expand);
 
-	BUFFER_TRACE(iloc.bh, "get_write_access");
+	BUFFER_TRACE(iloc->bh, "get_write_access");
 	error = ext4_journal_get_write_access(handle, iloc->bh);
 	if (error) {
 		brelse(iloc->bh);
@@ -6032,36 +6105,6 @@
 	return;
 }
 
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early.  Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
-	struct ext4_iloc iloc;
-
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, &iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc.bh, "get_write_access");
-			err = jbd2_journal_get_write_access(handle, iloc.bh);
-			if (!err)
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-								 iloc.bh);
-			brelse(iloc.bh);
-		}
-	}
-	ext4_std_error(inode->i_sb, err);
-	return err;
-}
-#endif
-
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
 {
 	journal_t *journal;
@@ -6154,13 +6197,14 @@
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
-	int ret;
+	int err;
+	vm_fault_t ret;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
@@ -6168,13 +6212,16 @@
 	get_block_t *get_block;
 	int retries = 0;
 
+	if (unlikely(IS_IMMUTABLE(inode)))
+		return VM_FAULT_SIGBUS;
+
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
 
-	ret = ext4_convert_inline_data(inode);
-	if (ret)
+	err = ext4_convert_inline_data(inode);
+	if (err)
 		goto out_ret;
 
 	/* Delalloc case is easy... */
@@ -6182,9 +6229,9 @@
 	    !ext4_should_journal_data(inode) &&
 	    !ext4_nonda_switch(inode->i_sb)) {
 		do {
-			ret = block_page_mkwrite(vma, vmf,
+			err = block_page_mkwrite(vma, vmf,
 						   ext4_da_get_block_prep);
-		} while (ret == -ENOSPC &&
+		} while (err == -ENOSPC &&
 		       ext4_should_retry_alloc(inode->i_sb, &retries));
 		goto out_ret;
 	}
@@ -6229,8 +6276,8 @@
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
-	ret = block_page_mkwrite(vma, vmf, get_block);
-	if (!ret && ext4_should_journal_data(inode)) {
+	err = block_page_mkwrite(vma, vmf, get_block);
+	if (!err && ext4_should_journal_data(inode)) {
 		if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
 			  PAGE_SIZE, NULL, do_journal_get_write_access)) {
 			unlock_page(page);
@@ -6241,24 +6288,24 @@
 		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	}
 	ext4_journal_stop(handle);
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+	if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry_alloc;
 out_ret:
-	ret = block_page_mkwrite_return(ret);
+	ret = block_page_mkwrite_return(err);
 out:
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
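Splitting err (a kernel errno) from ret (a vm_fault_t) keeps the two value spaces apart now that vm_fault_t is a distinct type; the only crossing point is block_page_mkwrite_return() at out_ret. Its errno-to-fault-code mapping, paraphrased from include/linux/buffer_head.h of this era (a sketch from memory, not verbatim):

	if (err == 0)
		return VM_FAULT_LOCKED;		/* page locked and dirtied */
	if (err == -EFAULT || err == -EAGAIN)
		return VM_FAULT_NOPAGE;
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;			/* -ENOSPC, -EIO, ... */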
 
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
-	int err;
+	vm_fault_t ret;
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	err = filemap_fault(vmf);
+	ret = filemap_fault(vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 
-	return err;
+	return ret;
 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0edee31..0b7f316 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,18 +63,20 @@
 	loff_t isize;
 	struct ext4_inode_info *ei1;
 	struct ext4_inode_info *ei2;
+	unsigned long tmp;
 
 	ei1 = EXT4_I(inode1);
 	ei2 = EXT4_I(inode2);
 
 	swap(inode1->i_version, inode2->i_version);
-	swap(inode1->i_blocks, inode2->i_blocks);
-	swap(inode1->i_bytes, inode2->i_bytes);
 	swap(inode1->i_atime, inode2->i_atime);
 	swap(inode1->i_mtime, inode2->i_mtime);
 
 	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
-	swap(ei1->i_flags, ei2->i_flags);
+	tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+	ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+		(ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+	ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
 	swap(ei1->i_disksize, ei2->i_disksize);
 	ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
 	ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
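Rather than swapping i_flags wholesale, the hunk above exchanges only the bits covered by EXT4_FL_SHOULD_SWAP and leaves each inode's remaining flags in place. An equivalent standalone sketch of the masked swap:

	unsigned long mask = EXT4_FL_SHOULD_SWAP;
	unsigned long a = ei1->i_flags, b = ei2->i_flags;

	ei1->i_flags = (b & mask) | (a & ~mask);	/* take b's swappable bits */
	ei2->i_flags = (a & mask) | (b & ~mask);	/* take a's swappable bits */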
@@ -115,28 +117,42 @@
 	int err;
 	struct inode *inode_bl;
 	struct ext4_inode_info *ei_bl;
+	qsize_t size, size_bl, diff;
+	blkcnt_t blocks;
+	unsigned short bytes;
 
-	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
-	    IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
-	    ext4_has_inline_data(inode))
-		return -EINVAL;
-
-	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(inode_bl))
 		return PTR_ERR(inode_bl);
 	ei_bl = EXT4_I(inode_bl);
 
-	filemap_flush(inode->i_mapping);
-	filemap_flush(inode_bl->i_mapping);
-
 	/* Protect orig inodes against a truncate and make sure
 	 * that only one swap_inode_boot_loader is running. */
 	lock_two_nondirectories(inode, inode_bl);
 
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+	    IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+	    (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+	    ext4_has_inline_data(inode)) {
+		err = -EINVAL;
+		goto journal_err_out;
+	}
+
+	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto journal_err_out;
+	}
+
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (err)
+		goto err_out;
+
+	err = filemap_write_and_wait(inode_bl->i_mapping);
+	if (err)
+		goto err_out;
+
 	/* Wait for all existing dio workers */
 	inode_dio_wait(inode);
 	inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@
 	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
 	if (IS_ERR(handle)) {
 		err = -EINVAL;
-		goto journal_err_out;
+		goto err_out;
 	}
 
 	/* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@
 			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
 	}
 
+	err = dquot_initialize(inode);
+	if (err)
+		goto err_out1;
+
+	size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+	size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+	diff = size - size_bl;
 	swap_inode_data(inode, inode_bl);
 
 	inode->i_ctime = inode_bl->i_ctime = current_time(inode);
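Before the swap, each inode's space usage is captured in bytes for the later quota adjustment: i_blocks counts 512-byte units (hence the multiplication by 1 << 9) and i_bytes holds the sub-block remainder. The same computation as a helper (a sketch):

	static inline qsize_t inode_space_bytes(const struct inode *inode)
	{
		return ((qsize_t)inode->i_blocks << 9) + inode->i_bytes;
	}

The signed diff then tells the function whether to free (diff > 0) or charge (diff < 0) that much space against the file's quota, since the bootloader inode is not quota-accounted.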
@@ -183,34 +206,58 @@
 
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err < 0) {
+		/* No need to update quota information. */
 		ext4_warning(inode->i_sb,
 			"couldn't mark inode #%lu dirty (err %d)",
 			inode->i_ino, err);
 		/* Revert all changes: */
 		swap_inode_data(inode, inode_bl);
 		ext4_mark_inode_dirty(handle, inode);
-	} else {
-		err = ext4_mark_inode_dirty(handle, inode_bl);
-		if (err < 0) {
-			ext4_warning(inode_bl->i_sb,
-				"couldn't mark inode #%lu dirty (err %d)",
-				inode_bl->i_ino, err);
-			/* Revert all changes: */
-			swap_inode_data(inode, inode_bl);
-			ext4_mark_inode_dirty(handle, inode);
-			ext4_mark_inode_dirty(handle, inode_bl);
-		}
+		goto err_out1;
 	}
+
+	blocks = inode_bl->i_blocks;
+	bytes = inode_bl->i_bytes;
+	inode_bl->i_blocks = inode->i_blocks;
+	inode_bl->i_bytes = inode->i_bytes;
+	err = ext4_mark_inode_dirty(handle, inode_bl);
+	if (err < 0) {
+		/* No need to update quota information. */
+		ext4_warning(inode_bl->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode_bl->i_ino, err);
+		goto revert;
+	}
+
+	/* Bootloader inode should not be counted into quota information. */
+	if (diff > 0)
+		dquot_free_space(inode, diff);
+	else
+		err = dquot_alloc_space(inode, -1 * diff);
+
+	if (err < 0) {
+revert:
+		/* Revert all changes: */
+		inode_bl->i_blocks = blocks;
+		inode_bl->i_bytes = bytes;
+		swap_inode_data(inode, inode_bl);
+		ext4_mark_inode_dirty(handle, inode);
+		ext4_mark_inode_dirty(handle, inode_bl);
+	}
+
+err_out1:
 	ext4_journal_stop(handle);
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
+err_out:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 journal_err_out:
 	unlock_two_nondirectories(inode, inode_bl);
 	iput(inode_bl);
 	return err;
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int uuid_is_zero(__u8 u[16])
 {
 	int	i;
@@ -222,6 +269,29 @@
 }
 #endif
 
+/*
+ * If immutable is set and we are not clearing it, we're not allowed to change
+ * anything else in the inode.  Don't error out if we're only trying to set
+ * immutable on an immutable file.
+ */
+static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
+				      unsigned int flags)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int oldflags = ei->i_flags;
+
+	if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL))
+		return 0;
+
+	if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL))
+		return -EPERM;
+	if (ext4_has_feature_project(inode->i_sb) &&
+	    __kprojid_val(ei->i_projid) != new_projid)
+		return -EPERM;
+
+	return 0;
+}
+
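Concretely, ignoring project IDs, the helper above behaves as follows (hypothetical flag combinations for illustration):

	/*
	 * oldflags             requested flags        result
	 * -----------------------------------------------------------
	 * 0                    IMMUTABLE              0  (setting is allowed)
	 * IMMUTABLE            0                      0  (clearing is allowed)
	 * IMMUTABLE            IMMUTABLE              0  (no-op re-set)
	 * IMMUTABLE            IMMUTABLE | APPEND     -EPERM (other bits changed)
	 */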
 static int ext4_ioctl_setflags(struct inode *inode,
 			       unsigned int flags)
 {
@@ -231,6 +301,7 @@
 	struct ext4_iloc iloc;
 	unsigned int oldflags, mask, i;
 	unsigned int jflag;
+	struct super_block *sb = inode->i_sb;
 
 	/* Is it quota file? Do not allow user to mess with it */
 	if (ext4_is_quota_file(inode))
@@ -241,16 +312,9 @@
 	/* The JOURNAL_DATA flag is modifiable only by root */
 	jflag = flags & EXT4_JOURNAL_DATA_FL;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 *
-	 * This test looks nicer. Thanks to Pauline Middelink
-	 */
-	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			goto flags_out;
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto flags_out;
 
 	/*
 	 * The JOURNAL_DATA flag can only be changed by
@@ -275,6 +339,37 @@
 			goto flags_out;
 	}
 
+	if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
+		if (!ext4_has_feature_casefold(sb)) {
+			err = -EOPNOTSUPP;
+			goto flags_out;
+		}
+
+		if (!S_ISDIR(inode->i_mode)) {
+			err = -ENOTDIR;
+			goto flags_out;
+		}
+
+		if (!ext4_empty_dir(inode)) {
+			err = -ENOTEMPTY;
+			goto flags_out;
+		}
+	}
+
+	/*
+	 * Wait for all pending directio and then flush all the dirty pages
+	 * for this file.  The flush marks all the pages readonly, so any
+	 * subsequent attempt to write to the file (particularly mmap pages)
+	 * will come through the filesystem and fail.
+	 */
+	if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) &&
+	    (flags & EXT4_IMMUTABLE_FL)) {
+		inode_dio_wait(inode);
+		err = filemap_write_and_wait(inode->i_mapping);
+		if (err)
+			goto flags_out;
+	}
+
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
@@ -543,7 +638,7 @@
 static int ext4_ioc_getfsmap(struct super_block *sb,
 			     struct fsmap_head __user *arg)
 {
-	struct getfsmap_info info = {0};
+	struct getfsmap_info info = { NULL };
 	struct ext4_fsmap_head xhead = {0};
 	struct fsmap_head head;
 	bool aborted = false;
@@ -639,30 +734,85 @@
 	return err;
 }
 
-static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
+static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
 {
-	/*
-	 * Project Quota ID state is only allowed to change from within the init
-	 * namespace. Enforce that restriction only if we are trying to change
-	 * the quota ID state. Everything else is allowed in user namespaces.
-	 */
-	if (current_user_ns() == &init_user_ns)
-		return 0;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
+	simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
+						      EXT4_FL_USER_VISIBLE));
+
+	if (ext4_has_feature_project(inode->i_sb))
+		fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+}
+
+/* copied from fs/ioctl.c */
+static int fiemap_check_ranges(struct super_block *sb,
+			       u64 start, u64 len, u64 *new_len)
+{
+	u64 maxbytes = (u64) sb->s_maxbytes;
+
+	*new_len = len;
+
+	if (len == 0)
 		return -EINVAL;
 
-	if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
-		if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
-			return -EINVAL;
-	} else {
-		if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
-			return -EINVAL;
-	}
+	if (start > maxbytes)
+		return -EFBIG;
+
+	/*
+	 * Shrink request scope to what the fs can actually handle.
+	 */
+	if (len > maxbytes || (maxbytes - len) < start)
+		*new_len = maxbytes - start;
 
 	return 0;
 }
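A worked example of the clamp (hypothetical values): with s_maxbytes = 2^32, a request of start = 2^32 - 4096 and len = 65536 passes the start check but overshoots the end of the addressable range, so the length is shrunk rather than the call rejected:

	u64 new_len;
	int ret = fiemap_check_ranges(sb, (1ULL << 32) - 4096, 65536, &new_len);
	/* ret == 0, new_len == 4096; only len == 0 or start > maxbytes fail */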
 
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
+static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap;
+	struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	u64 len;
+	int error;
+
+	if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+		return -EFAULT;
+
+	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+		return -EINVAL;
+
+	error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+				    &len);
+	if (error)
+		return error;
+
+	fieinfo.fi_flags = fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap.fm_extent_count;
+	fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+	if (fiemap.fm_extent_count != 0 &&
+	    !access_ok(fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+		return -EFAULT;
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+	fiemap.fm_flags = fieinfo.fi_flags;
+	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+	if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+		error = -EFAULT;
+
+	return error;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -677,6 +827,8 @@
 		return ext4_ioc_getfsmap(sb, (void __user *)arg);
 	case EXT4_IOC_GETFLAGS:
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+		if (S_ISREG(inode->i_mode))
+			flags &= ~EXT4_PROJINHERIT_FL;
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
 		int err;
@@ -704,7 +856,11 @@
 			return err;
 
 		inode_lock(inode);
-		err = ext4_ioctl_setflags(inode, flags);
+		err = ext4_ioctl_check_immutable(inode,
+				from_kprojid(&init_user_ns, ei->i_projid),
+				flags);
+		if (!err)
+			err = ext4_ioctl_setflags(inode, flags);
 		inode_unlock(inode);
 		mnt_drop_write_file(filp);
 		return err;
@@ -931,7 +1087,7 @@
 		if (err == 0)
 			err = err2;
 		mnt_drop_write_file(filp);
-		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+		if (!err && (o_group < EXT4_SB(sb)->s_groups_count) &&
 		    ext4_has_group_desc_csum(sb) &&
 		    test_opt(sb, INIT_INODE_TABLE))
 			err = ext4_register_li_request(sb, o_group);
@@ -953,6 +1109,13 @@
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
 
+		/*
+		 * We haven't replayed the journal, so we cannot use our
+		 * block-bitmap-guided storage zapping commands.
+		 */
+		if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+			return -EROFS;
+
 		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
 		    sizeof(range)))
 			return -EFAULT;
@@ -978,7 +1141,7 @@
 		return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
 
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		int err, err2;
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		handle_t *handle;
@@ -1018,19 +1181,67 @@
 #endif
 	}
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
 		return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
+		return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
+		return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
+		return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
+		return fscrypt_ioctl_remove_key_all_users(filp,
+							  (void __user *)arg);
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+		if (!ext4_has_feature_encrypt(sb))
+			return -EOPNOTSUPP;
+		return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
+	case EXT4_IOC_CLEAR_ES_CACHE:
+	{
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+		ext4_clear_inode_es(inode);
+		return 0;
+	}
+
+	case EXT4_IOC_GETSTATE:
+	{
+		__u32	state = 0;
+
+		if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
+			state |= EXT4_STATE_FLAG_EXT_PRECACHED;
+		if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+			state |= EXT4_STATE_FLAG_NEW;
+		if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+			state |= EXT4_STATE_FLAG_NEWENTRY;
+		if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
+			state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
+
+		return put_user(state, (__u32 __user *) arg);
+	}
+
+	case EXT4_IOC_GET_ES_CACHE:
+		return ext4_ioctl_get_es_cache(filp, arg);
+
 	case EXT4_IOC_FSGETXATTR:
 	{
 		struct fsxattr fa;
 
-		memset(&fa, 0, sizeof(struct fsxattr));
-		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
-
-		if (ext4_has_feature_project(inode->i_sb)) {
-			fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
-				EXT4_I(inode)->i_projid);
-		}
+		ext4_fill_fsxattr(inode, &fa);
 
 		if (copy_to_user((struct fsxattr __user *)arg,
 				 &fa, sizeof(fa)))
@@ -1039,7 +1250,7 @@
 	}
 	case EXT4_IOC_FSSETXATTR:
 	{
-		struct fsxattr fa;
+		struct fsxattr fa, old_fa;
 		int err;
 
 		if (copy_from_user(&fa, (struct fsxattr __user *)arg,
@@ -1062,11 +1273,15 @@
 			return err;
 
 		inode_lock(inode);
-		err = ext4_ioctl_check_project(inode, &fa);
+		ext4_fill_fsxattr(inode, &old_fa);
+		err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
 		if (err)
 			goto out;
 		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
 			 (flags & EXT4_FL_XFLAG_VISIBLE);
+		err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
+		if (err)
+			goto out;
 		err = ext4_ioctl_setflags(inode, flags);
 		if (err)
 			goto out;
@@ -1078,6 +1293,17 @@
 	}
 	case EXT4_IOC_SHUTDOWN:
 		return ext4_shutdown(sb, arg);
+
+	case FS_IOC_ENABLE_VERITY:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+	case FS_IOC_MEASURE_VERITY:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_measure(filp, (void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -1138,8 +1364,18 @@
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
 	case EXT4_IOC_SHUTDOWN:
 	case FS_IOC_GETFSMAP:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
+	case EXT4_IOC_CLEAR_ES_CACHE:
+	case EXT4_IOC_GETSTATE:
+	case EXT4_IOC_GET_ES_CACHE:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2..a3e2767 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1539,7 +1539,7 @@
 		ex->fe_len += 1 << order;
 	}
 
-	if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
+	if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
 		/* Should never happen! (but apparently sometimes does?!?) */
 		WARN_ON(1);
 		ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
@@ -2490,6 +2490,7 @@
 	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
+		cond_resched();
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
@@ -2705,6 +2706,7 @@
 
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
+			cond_resched();
 			grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
 			kfree(grinfo->bb_bitmap);
@@ -4176,9 +4178,8 @@
 	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
 		>> bsbits;
 
-	if ((size == isize) &&
-	    !ext4_fs_is_busy(sbi) &&
-	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+	    !inode_is_open_for_write(ac->ac_inode)) {
 		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
 		return;
 	}
@@ -4258,7 +4259,7 @@
 			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
 			(unsigned) ar->lleft, (unsigned) ar->pleft,
 			(unsigned) ar->lright, (unsigned) ar->pright,
-			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+			inode_is_open_for_write(ar->inode) ? "" : "non-");
 	return 0;
 
 }
@@ -4695,8 +4696,9 @@
  * ext4_free_blocks() -- Free given blocks and update quota
  * @handle:		handle for this transaction
  * @inode:		inode
- * @block:		start physical block to free
- * @count:		number of blocks to count
+ * @bh:			optional buffer of the block to be freed
+ * @block:		starting physical block to be freed
+ * @count:		number of blocks to be freed
  * @flags:		flags used by ext4_free_blocks
  */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
@@ -4915,9 +4917,17 @@
 			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}
 
-	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
-		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
-	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+	/*
+	 * on a bigalloc file system, defer the s_freeclusters_counter
+	 * update to the caller (ext4_remove_space and friends) so they
+	 * can determine if a cluster freed here should be rereserved
+	 */
+	if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
+		if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+			dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+		percpu_counter_add(&sbi->s_freeclusters_counter,
+				   count_clusters);
+	}
 
 	ext4_mb_unload_buddy(&e4b);
 
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 61a9d19..b1e4d35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -116,9 +116,9 @@
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -145,9 +145,9 @@
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -175,9 +175,9 @@
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -224,9 +224,9 @@
 	struct buffer_head *bh;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	tmp_idata = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -254,9 +254,9 @@
 	struct buffer_head *bh;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	tmp_idata = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -382,9 +382,9 @@
 	struct ext4_extent_header *eh;
 
 	block = ext4_idx_pblock(ix);
-	bh = sb_bread(inode->i_sb, block);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, block, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	eh = (struct ext4_extent_header *)bh->b_data;
 	if (eh->eh_depth != 0) {
@@ -535,22 +535,22 @@
 	if (i_data[EXT4_IND_BLOCK]) {
 		retval = update_ind_extent_range(handle, tmp_inode,
 				le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
+		if (retval)
+			goto err_out;
 	} else
 		lb.curr_block += max_entries;
 	if (i_data[EXT4_DIND_BLOCK]) {
 		retval = update_dind_extent_range(handle, tmp_inode,
 				le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
+		if (retval)
+			goto err_out;
 	} else
 		lb.curr_block += max_entries * max_entries;
 	if (i_data[EXT4_TIND_BLOCK]) {
 		retval = update_tind_extent_range(handle, tmp_inode,
 				le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
+		if (retval)
+			goto err_out;
 	}
 	/*
 	 * Build the last extent
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2f5be02..30ce3dc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -13,11 +13,10 @@
 #include "ext4_extents.h"
 
 /**
- * get_ext_path - Find an extent path for designated logical block number.
- *
- * @inode:	an inode which is searched
+ * get_ext_path() - Find an extent path for designated logical block number.
+ * @inode:	inode to be searched
  * @lblock:	logical block number to find an extent path
- * @path:	pointer to an extent path pointer (for output)
+ * @ppath:	pointer to an extent path pointer (for output)
  *
  * ext4_find_extent wrapper. Return 0 on success, or a negative error value
  * on failure.
@@ -42,8 +41,9 @@
 }
 
 /**
- * ext4_double_down_write_data_sem - Acquire two inodes' write lock
- *                                   of i_data_sem
+ * ext4_double_down_write_data_sem() - write lock two inodes' i_data_sem
+ * @first: inode to be locked
+ * @second: inode to be locked
  *
  * Acquire write lock of i_data_sem of the two inodes
  */
@@ -390,7 +390,8 @@
 
 	/* Even in case of data=writeback it is reasonable to pin
 	 * inode to transaction, to prevent unexpected data loss */
-	*err = ext4_jbd2_inode_add_write(handle, orig_inode);
+	*err = ext4_jbd2_inode_add_write(handle, orig_inode,
+			(loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
 
 unlock_pages:
 	unlock_page(pagep[0]);
@@ -592,8 +593,7 @@
 		return -EOPNOTSUPP;
 	}
 
-	if (ext4_encrypted_inode(orig_inode) ||
-	    ext4_encrypted_inode(donor_inode)) {
+	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
 		ext4_msg(orig_inode->i_sb, KERN_ERR,
 			 "Online defrag not supported for encrypted files");
 		return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ffa2575..a427d20 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -35,6 +35,7 @@
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
 #include <linux/iversion.h>
+#include <linux/unicode.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -81,8 +82,18 @@
 static int ext4_dx_csum_verify(struct inode *inode,
 			       struct ext4_dir_entry *dirent);
 
+/*
+ * Hints to ext4_read_dirblock regarding whether we expect the directory
+ * block being read to be an index block, or a block containing
+ * directory entries (and if the latter, whether it was found via a
+ * logical block in an htree index block).  This is used to control
+ * what sort of sanity checking ext4_read_dirblock() will do on the
+ * directory block read from the storage device.  EITHER means that
+ * the caller doesn't know what kind of directory block will be read,
+ * so no specific verification will be done.
+ */
 typedef enum {
-	EITHER, INDEX, DIRENT
+	EITHER, INDEX, DIRENT, DIRENT_HTREE
 } dirblock_type_t;
 
 #define ext4_read_dirblock(inode, block, type) \
@@ -108,11 +119,14 @@
 
 		return bh;
 	}
-	if (!bh) {
+	if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
 		ext4_error_inode(inode, func, line, block,
-				 "Directory hole found");
+				 "Directory hole found for htree %s block",
+				 (type == INDEX) ? "index" : "leaf");
 		return ERR_PTR(-EFSCORRUPTED);
 	}
+	if (!bh)
+		return NULL;
 	dirent = (struct ext4_dir_entry *) bh->b_data;
 	/* Determine whether or not we have an index block */
 	if (is_dx(inode)) {
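With DIRENT_HTREE added, a directory hole is fatal only where the block must exist: htree index and leaf blocks (and, as later hunks note, block 0 of any directory). A plain DIRENT or EITHER read of a hole now returns NULL instead of -EFSCORRUPTED, so callers follow this pattern (a sketch; next_block is a placeholder label):

	bh = ext4_read_dirblock(dir, block, DIRENT);
	if (IS_ERR(bh))
		return PTR_ERR(bh);	/* I/O error or corruption */
	if (bh == NULL)
		goto next_block;	/* hole: legal for DIRENT/EITHER */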
@@ -149,7 +163,7 @@
 		}
 	}
 	if (!is_dx_block) {
-		if (ext4_dirent_csum_verify(inode, dirent))
+		if (ext4_dirblock_csum_verify(inode, bh))
 			set_buffer_verified(bh);
 		else {
 			ext4_error_inode(inode, func, line, block,
@@ -279,9 +293,11 @@
 			     struct inode *dir, struct inode *inode);
 
 /* checksumming functions */
-void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
-			    unsigned int blocksize)
+void ext4_initialize_dirent_tail(struct buffer_head *bh,
+				 unsigned int blocksize)
 {
+	struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+
 	memset(t, 0, sizeof(struct ext4_dir_entry_tail));
 	t->det_rec_len = ext4_rec_len_to_disk(
 			sizeof(struct ext4_dir_entry_tail), blocksize);
@@ -290,17 +306,17 @@
 
 /* Walk through a dirent block to find a checksum "dirent" at the tail */
 static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
-						   struct ext4_dir_entry *de)
+						   struct buffer_head *bh)
 {
 	struct ext4_dir_entry_tail *t;
 
 #ifdef PARANOID
 	struct ext4_dir_entry *d, *top;
 
-	d = de;
-	top = (struct ext4_dir_entry *)(((void *)de) +
+	d = (struct ext4_dir_entry *)bh->b_data;
+	top = (struct ext4_dir_entry *)(bh->b_data +
 		(EXT4_BLOCK_SIZE(inode->i_sb) -
-		sizeof(struct ext4_dir_entry_tail)));
+		 sizeof(struct ext4_dir_entry_tail)));
 	while (d < top && d->rec_len)
 		d = (struct ext4_dir_entry *)(((void *)d) +
 		    le16_to_cpu(d->rec_len));
@@ -310,7 +326,7 @@
 
 	t = (struct ext4_dir_entry_tail *)d;
 #else
-	t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+	t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
 #endif
 
 	if (t->det_reserved_zero1 ||
@@ -322,8 +338,7 @@
 	return t;
 }
 
-static __le32 ext4_dirent_csum(struct inode *inode,
-			       struct ext4_dir_entry *dirent, int size)
+static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -343,49 +358,49 @@
 		"No space for directory leaf checksum. Please run e2fsck -D.");
 }
 
-int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
 {
 	struct ext4_dir_entry_tail *t;
 
 	if (!ext4_has_metadata_csum(inode->i_sb))
 		return 1;
 
-	t = get_dirent_tail(inode, dirent);
+	t = get_dirent_tail(inode, bh);
 	if (!t) {
 		warn_no_space_for_csum(inode);
 		return 0;
 	}
 
-	if (t->det_checksum != ext4_dirent_csum(inode, dirent,
-						(void *)t - (void *)dirent))
+	if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data,
+						  (char *)t - bh->b_data))
 		return 0;
 
 	return 1;
 }
 
-static void ext4_dirent_csum_set(struct inode *inode,
-				 struct ext4_dir_entry *dirent)
+static void ext4_dirblock_csum_set(struct inode *inode,
+				 struct buffer_head *bh)
 {
 	struct ext4_dir_entry_tail *t;
 
 	if (!ext4_has_metadata_csum(inode->i_sb))
 		return;
 
-	t = get_dirent_tail(inode, dirent);
+	t = get_dirent_tail(inode, bh);
 	if (!t) {
 		warn_no_space_for_csum(inode);
 		return;
 	}
 
-	t->det_checksum = ext4_dirent_csum(inode, dirent,
-					   (void *)t - (void *)dirent);
+	t->det_checksum = ext4_dirblock_csum(inode, bh->b_data,
+					     (char *)t - bh->b_data);
 }
 
-int ext4_handle_dirty_dirent_node(handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *bh)
+int ext4_handle_dirty_dirblock(handle_t *handle,
+			       struct inode *inode,
+			       struct buffer_head *bh)
 {
-	ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+	ext4_dirblock_csum_set(inode, bh);
 	return ext4_handle_dirty_metadata(handle, inode, bh);
 }
 
@@ -612,7 +627,7 @@
 		{
 			if (show_names)
 			{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 				int len;
 				char *name;
 				struct fscrypt_str fname_crypto_str =
@@ -621,7 +636,7 @@
 
 				name  = de->name;
 				len = de->name_len;
-				if (ext4_encrypted_inode(dir))
+				if (IS_ENCRYPTED(dir))
 					res = fscrypt_get_encryption_info(dir);
 				if (res) {
 					printk(KERN_WARNING "Error setting up"
@@ -629,7 +644,7 @@
 				}
 				if (!fscrypt_has_encryption_key(dir)) {
 					/* Directory is not encrypted */
-					ext4fs_dirhash(de->name,
+					ext4fs_dirhash(dir, de->name,
 						de->name_len, &h);
 					printk("%.*s:(U)%x.%u ", len,
 					       name, h.hash,
@@ -662,8 +677,8 @@
 						name = fname_crypto_str.name;
 						len = fname_crypto_str.len;
 					}
-					ext4fs_dirhash(de->name, de->name_len,
-						       &h);
+					ext4fs_dirhash(dir, de->name,
+						       de->name_len, &h);
 					printk("%.*s:(E)%x.%u ", len, name,
 					       h.hash, (unsigned) ((char *) de
 								   - base));
@@ -673,7 +688,7 @@
 #else
 				int len = de->name_len;
 				char *name = de->name;
-				ext4fs_dirhash(de->name, de->name_len, &h);
+				ext4fs_dirhash(dir, de->name, de->name_len, &h);
 				printk("%.*s:%x.%u ", len, name, h.hash,
 				       (unsigned) ((char *) de - base));
 #endif
@@ -762,7 +777,7 @@
 		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	if (fname && fname_name(fname))
-		ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
+		ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -871,12 +886,15 @@
 {
 	struct dx_root_info *info;
 	int i;
+	unsigned int indirect_levels;
 
 	if (frames[0].bh == NULL)
 		return;
 
 	info = &((struct dx_root *)frames[0].bh->b_data)->info;
-	for (i = 0; i <= info->indirect_levels; i++) {
+	/* save local copy, "info" may be freed after brelse() */
+	indirect_levels = info->indirect_levels;
+	for (i = 0; i <= indirect_levels; i++) {
 		if (frames[i].bh == NULL)
 			break;
 		brelse(frames[i].bh);
@@ -976,7 +994,7 @@
 
 	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
 							(unsigned long)block));
-	bh = ext4_read_dirblock(dir, block, DIRENT);
+	bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
 
@@ -984,9 +1002,9 @@
 	top = (struct ext4_dir_entry_2 *) ((char *) de +
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	/* Check if the directory is encrypted */
-	if (ext4_encrypted_inode(dir)) {
+	if (IS_ENCRYPTED(dir)) {
 		err = fscrypt_get_encryption_info(dir);
 		if (err < 0) {
 			brelse(bh);
@@ -1008,14 +1026,14 @@
 			/* silently ignore the rest of the block */
 			break;
 		}
-		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
 		    ((hinfo->hash == start_hash) &&
 		     (hinfo->minor_hash < start_minor_hash)))
 			continue;
 		if (de->inode == 0)
 			continue;
-		if (!ext4_encrypted_inode(dir)) {
+		if (!IS_ENCRYPTED(dir)) {
 			tmp_str.name = de->name;
 			tmp_str.len = de->name_len;
 			err = ext4_htree_store_dirent(dir_file,
@@ -1047,7 +1065,7 @@
 	}
 errout:
 	brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	fscrypt_fname_free_buffer(&fname_crypto_str);
 #endif
 	return count;
@@ -1086,10 +1104,10 @@
 		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 		if (ext4_has_inline_data(dir)) {
 			int has_inline_data = 1;
-			count = htree_inlinedir_to_tree(dir_file, dir, 0,
-							&hinfo, start_hash,
-							start_minor_hash,
-							&has_inline_data);
+			count = ext4_inlinedir_to_tree(dir_file, dir, 0,
+						       &hinfo, start_hash,
+						       start_minor_hash,
+						       &has_inline_data);
 			if (has_inline_data) {
 				*next_hash = ~0;
 				return count;
@@ -1197,7 +1215,7 @@
 
 	while ((char *) de < base + blocksize) {
 		if (de->name_len && de->inode) {
-			ext4fs_dirhash(de->name, de->name_len, &h);
+			ext4fs_dirhash(dir, de->name, de->name_len, &h);
 			map_tail--;
 			map_tail->hash = h.hash;
 			map_tail->offs = ((char *) de - base)>>2;
@@ -1252,24 +1270,105 @@
 	dx_set_count(entries, count + 1);
 }
 
+#ifdef CONFIG_UNICODE
+/*
+ * Test whether a case-insensitive directory entry matches the filename
+ * being searched for.  If quick is set, assume the name being looked up
+ * is already in the casefolded form.
+ *
+ * Returns: 0 if the directory entry matches, a positive value if it
+ * doesn't match, or a negative value on error.
+ */
+int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
+		    const struct qstr *entry, bool quick)
+{
+	const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
+	const struct unicode_map *um = sbi->s_encoding;
+	int ret;
+
+	if (quick)
+		ret = utf8_strncasecmp_folded(um, name, entry);
+	else
+		ret = utf8_strncasecmp(um, name, entry);
+
+	if (ret < 0) {
+		/* Handle invalid character sequence as either an error
+		 * or as an opaque byte sequence.
+		 */
+		if (ext4_has_strict_mode(sbi))
+			return -EINVAL;
+
+		if (name->len != entry->len)
+			return 1;
+
+		return !!memcmp(name->name, entry->name, name->len);
+	}
+
+	return ret;
+}
+
+void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+				  struct fscrypt_str *cf_name)
+{
+	int len;
+
+	if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+		cf_name->name = NULL;
+		return;
+	}
+
+	cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+	if (!cf_name->name)
+		return;
+
+	len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+			    iname, cf_name->name,
+			    EXT4_NAME_LEN);
+	if (len <= 0) {
+		kfree(cf_name->name);
+		cf_name->name = NULL;
+		return;
+	}
+	cf_name->len = (unsigned) len;
+}
+#endif
+
 /*
  * Test whether a directory entry matches the filename being searched for.
  *
  * Return: %true if the directory entry matches, otherwise %false.
  */
-static inline bool ext4_match(const struct ext4_filename *fname,
+static inline bool ext4_match(const struct inode *parent,
+			      const struct ext4_filename *fname,
 			      const struct ext4_dir_entry_2 *de)
 {
 	struct fscrypt_name f;
+#ifdef CONFIG_UNICODE
+	const struct qstr entry = {.name = de->name, .len = de->name_len};
+#endif
 
 	if (!de->inode)
 		return false;
 
 	f.usr_fname = fname->usr_fname;
 	f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	f.crypto_buf = fname->crypto_buf;
 #endif
+
+#ifdef CONFIG_UNICODE
+	if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+		if (fname->cf_name.name) {
+			struct qstr cf = {.name = fname->cf_name.name,
+					  .len = fname->cf_name.len};
+			return !ext4_ci_compare(parent, &cf, &entry, true);
+		}
+		return !ext4_ci_compare(parent, fname->usr_fname, &entry,
+					false);
+	}
+#endif
+
 	return fscrypt_match_name(&f, de->name, de->name_len);
 }
 
@@ -1290,7 +1389,7 @@
 		/* this code is executed quadratically often */
 		/* do minimal checking `by hand' */
 		if ((char *) de + de->name_len <= dlimit &&
-		    ext4_match(fname, de)) {
+		    ext4_match(dir, fname, de)) {
 			/* found a match - just to be sure, do
 			 * a full check */
 			if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
@@ -1327,7 +1426,7 @@
 }
 
 /*
- *	ext4_find_entry()
+ *	__ext4_find_entry()
  *
  * finds an entry in the specified directory with the wanted name. It
  * returns the cache buffer in which the entry was found, and the entry
@@ -1337,39 +1436,32 @@
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext4_find_entry (struct inode *dir,
-					const struct qstr *d_name,
-					struct ext4_dir_entry_2 **res_dir,
-					int *inlined)
+static struct buffer_head *__ext4_find_entry(struct inode *dir,
+					     struct ext4_filename *fname,
+					     struct ext4_dir_entry_2 **res_dir,
+					     int *inlined)
 {
 	struct super_block *sb;
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
 	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block;
-	const u8 *name = d_name->name;
+	const u8 *name = fname->usr_fname->name;
 	size_t ra_max = 0;	/* Number of bh's in the readahead
 				   buffer, bh_use[] */
 	size_t ra_ptr = 0;	/* Current index into readahead
 				   buffer */
 	ext4_lblk_t  nblocks;
 	int i, namelen, retval;
-	struct ext4_filename fname;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = d_name->len;
+	namelen = fname->usr_fname->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 
-	retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
-	if (retval == -ENOENT)
-		return NULL;
-	if (retval)
-		return ERR_PTR(retval);
-
 	if (ext4_has_inline_data(dir)) {
 		int has_inline_data = 1;
-		ret = ext4_find_inline_entry(dir, &fname, res_dir,
+		ret = ext4_find_inline_entry(dir, fname, res_dir,
 					     &has_inline_data);
 		if (has_inline_data) {
 			if (inlined)
@@ -1389,7 +1481,7 @@
 		goto restart;
 	}
 	if (is_dx(dir)) {
-		ret = ext4_dx_find_entry(dir, &fname, res_dir);
+		ret = ext4_dx_find_entry(dir, fname, res_dir);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -1444,8 +1536,7 @@
 		if (!buffer_verified(bh) &&
 		    !is_dx_internal_node(dir, block,
 					 (struct ext4_dir_entry *)bh->b_data) &&
-		    !ext4_dirent_csum_verify(dir,
-				(struct ext4_dir_entry *)bh->b_data)) {
+		    !ext4_dirblock_csum_verify(dir, bh)) {
 			EXT4_ERROR_INODE(dir, "checksumming directory "
 					 "block %lu", (unsigned long)block);
 			brelse(bh);
@@ -1453,7 +1544,7 @@
 			goto cleanup_and_exit;
 		}
 		set_buffer_verified(bh);
-		i = search_dirblock(bh, dir, &fname,
+		i = search_dirblock(bh, dir, fname,
 			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1484,10 +1575,50 @@
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
-	ext4_fname_free_filename(&fname);
 	return ret;
 }
 
+static struct buffer_head *ext4_find_entry(struct inode *dir,
+					   const struct qstr *d_name,
+					   struct ext4_dir_entry_2 **res_dir,
+					   int *inlined)
+{
+	int err;
+	struct ext4_filename fname;
+	struct buffer_head *bh;
+
+	err = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+	if (err == -ENOENT)
+		return NULL;
+	if (err)
+		return ERR_PTR(err);
+
+	bh = __ext4_find_entry(dir, &fname, res_dir, inlined);
+
+	ext4_fname_free_filename(&fname);
+	return bh;
+}
+
+static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+					     struct dentry *dentry,
+					     struct ext4_dir_entry_2 **res_dir)
+{
+	int err;
+	struct ext4_filename fname;
+	struct buffer_head *bh;
+
+	err = ext4_fname_prepare_lookup(dir, dentry, &fname);
+	if (err == -ENOENT)
+		return NULL;
+	if (err)
+		return ERR_PTR(err);
+
+	bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
+
+	ext4_fname_free_filename(&fname);
+	return bh;
+}
+
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 			struct ext4_filename *fname,
 			struct ext4_dir_entry_2 **res_dir)
@@ -1498,7 +1629,7 @@
 	ext4_lblk_t block;
 	int retval;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	*res_dir = NULL;
 #endif
 	frame = dx_probe(fname, dir, NULL, frames);
@@ -1506,7 +1637,7 @@
 		return (struct buffer_head *) frame;
 	do {
 		block = dx_get_block(frame->at);
-		bh = ext4_read_dirblock(dir, block, DIRENT);
+		bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
 		if (IS_ERR(bh))
 			goto errout;
 
@@ -1546,18 +1677,13 @@
 	struct inode *inode;
 	struct ext4_dir_entry_2 *de;
 	struct buffer_head *bh;
-	int err;
-
-	err = fscrypt_prepare_lookup(dir, dentry, flags);
-	if (err)
-		return ERR_PTR(err);
 
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	bh = ext4_lookup_entry(dir, dentry, &de);
 	if (IS_ERR(bh))
-		return (struct dentry *) bh;
+		return ERR_CAST(bh);
 	inode = NULL;
 	if (bh) {
 		__u32 ino = le32_to_cpu(de->inode);
@@ -1571,14 +1697,14 @@
 					 dentry);
 			return ERR_PTR(-EFSCORRUPTED);
 		}
-		inode = ext4_iget_normal(dir->i_sb, ino);
+		inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
 		if (inode == ERR_PTR(-ESTALE)) {
 			EXT4_ERROR_INODE(dir,
 					 "deleted inode referenced: %u",
 					 ino);
 			return ERR_PTR(-EFSCORRUPTED);
 		}
-		if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+		if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
 		    (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
 		    !fscrypt_has_permitted_context(dir, inode)) {
 			ext4_warning(inode->i_sb,
@@ -1588,6 +1714,17 @@
 			return ERR_PTR(-EPERM);
 		}
 	}
+
+#ifdef CONFIG_UNICODE
+	if (!inode && IS_CASEFOLDED(dir)) {
+		/* Eventually we want to call d_add_ci(dentry, NULL)
+		 * for negative dentries in the encoding case as
+		 * well.  For now, prevent the negative dentry
+		 * from being cached.
+		 */
+		return NULL;
+	}
+#endif
 	return d_splice_alias(inode, dentry);
 }
 
@@ -1601,7 +1738,7 @@
 
 	bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
 	if (IS_ERR(bh))
-		return (struct dentry *) bh;
+		return ERR_CAST(bh);
 	if (!bh)
 		return ERR_PTR(-ENOENT);
 	ino = le32_to_cpu(de->inode);
@@ -1613,7 +1750,7 @@
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
-	return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
+	return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL));
 }
 
 /*
@@ -1683,7 +1820,6 @@
 	char *data1 = (*bh)->b_data, *data2;
 	unsigned split, move, size;
 	struct ext4_dir_entry_2 *de = NULL, *de2;
-	struct ext4_dir_entry_tail *t;
 	int	csum_size = 0;
 	int	err = 0, i;
 
@@ -1744,11 +1880,8 @@
 					    (char *) de2,
 					    blocksize);
 	if (csum_size) {
-		t = EXT4_DIRENT_TAIL(data2, blocksize);
-		initialize_dirent_tail(t, blocksize);
-
-		t = EXT4_DIRENT_TAIL(data1, blocksize);
-		initialize_dirent_tail(t, blocksize);
+		ext4_initialize_dirent_tail(*bh, blocksize);
+		ext4_initialize_dirent_tail(bh2, blocksize);
 	}
 
 	dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
@@ -1762,7 +1895,7 @@
 		de = de2;
 	}
 	dx_insert_block(frame, hash2 + continued, newblock);
-	err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+	err = ext4_handle_dirty_dirblock(handle, dir, bh2);
 	if (err)
 		goto journal_error;
 	err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
@@ -1798,7 +1931,7 @@
 		if (ext4_check_dir_entry(dir, NULL, de, bh,
 					 buf, buf_size, offset))
 			return -EFSCORRUPTED;
-		if (ext4_match(fname, de))
+		if (ext4_match(dir, fname, de))
 			return -EEXIST;
 		nlen = EXT4_DIR_REC_LEN(de->name_len);
 		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
@@ -1890,7 +2023,7 @@
 	inode_inc_iversion(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	err = ext4_handle_dirty_dirblock(handle, dir, bh);
 	if (err)
 		ext4_std_error(dir->i_sb, err);
 	return 0;
@@ -1909,8 +2042,7 @@
 	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
 	struct dx_entry *entries;
 	struct ext4_dir_entry_2	*de, *de2;
-	struct ext4_dir_entry_tail *t;
-	char		*data1, *top;
+	char		*data2, *top;
 	unsigned	len;
 	int		retval;
 	unsigned	blocksize;
@@ -1950,21 +2082,18 @@
 		return PTR_ERR(bh2);
 	}
 	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
-	data1 = bh2->b_data;
+	data2 = bh2->b_data;
 
-	memcpy (data1, de, len);
-	de = (struct ext4_dir_entry_2 *) data1;
-	top = data1 + len;
+	memcpy(data2, de, len);
+	de = (struct ext4_dir_entry_2 *) data2;
+	top = data2 + len;
 	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
 		de = de2;
-	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
-					   (char *) de,
-					   blocksize);
+	de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+					   (char *) de, blocksize);
 
-	if (csum_size) {
-		t = EXT4_DIRENT_TAIL(data1, blocksize);
-		initialize_dirent_tail(t, blocksize);
-	}
+	if (csum_size)
+		ext4_initialize_dirent_tail(bh2, blocksize);
 
 	/* Initialize the root; the dot dirents already exist */
 	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
@@ -1983,7 +2112,7 @@
 	if (fname->hinfo.hash_version <= DX_HASH_TEA)
 		fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-	ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
+	ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
 
 	memset(frames, 0, sizeof(frames));
 	frame = frames;
@@ -1994,7 +2123,7 @@
 	retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
 	if (retval)
 		goto out_frames;	
-	retval = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+	retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
 	if (retval)
 		goto out_frames;	
 
@@ -2034,8 +2163,8 @@
 	struct inode *dir = d_inode(dentry->d_parent);
 	struct buffer_head *bh = NULL;
 	struct ext4_dir_entry_2 *de;
-	struct ext4_dir_entry_tail *t;
 	struct super_block *sb;
+	struct ext4_sb_info *sbi;
 	struct ext4_filename fname;
 	int	retval;
 	int	dx_fallback=0;
@@ -2047,10 +2176,17 @@
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	sb = dir->i_sb;
+	sbi = EXT4_SB(sb);
 	blocksize = sb->s_blocksize;
 	if (!dentry->d_name.len)
 		return -EINVAL;
 
+#ifdef CONFIG_UNICODE
+	if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
+	    sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+		return -EINVAL;
+#endif
+
 	retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
 	if (retval)
 		return retval;
@@ -2076,6 +2212,11 @@
 	blocks = dir->i_size >> sb->s_blocksize_bits;
 	for (block = 0; block < blocks; block++) {
 		bh = ext4_read_dirblock(dir, block, DIRENT);
+		if (bh == NULL) {
+			bh = ext4_bread(handle, dir, block,
+					EXT4_GET_BLOCKS_CREATE);
+			goto add_to_new_block;
+		}
 		if (IS_ERR(bh)) {
 			retval = PTR_ERR(bh);
 			bh = NULL;
@@ -2096,6 +2237,7 @@
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block);
+add_to_new_block:
 	if (IS_ERR(bh)) {
 		retval = PTR_ERR(bh);
 		bh = NULL;
@@ -2105,10 +2247,8 @@
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
 
-	if (csum_size) {
-		t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
-		initialize_dirent_tail(t, blocksize);
-	}
+	if (csum_size)
+		ext4_initialize_dirent_tail(bh, blocksize);
 
 	retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
 out:
@@ -2140,7 +2280,7 @@
 		return PTR_ERR(frame);
 	entries = frame->entries;
 	at = frame->at;
-	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE);
 	if (IS_ERR(bh)) {
 		err = PTR_ERR(bh);
 		bh = NULL;
@@ -2262,7 +2402,7 @@
 			dxroot->info.indirect_levels += 1;
 			dxtrace(printk(KERN_DEBUG
 				       "Creating %d level index...\n",
-				       info->indirect_levels));
+				       dxroot->info.indirect_levels));
 			err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
 			if (err)
 				goto journal_error;
@@ -2366,7 +2506,7 @@
 		goto out;
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	err = ext4_handle_dirty_dirblock(handle, dir, bh);
 	if (unlikely(err))
 		goto out;
 
@@ -2568,7 +2708,6 @@
 {
 	struct buffer_head *dir_block = NULL;
 	struct ext4_dir_entry_2 *de;
-	struct ext4_dir_entry_tail *t;
 	ext4_lblk_t block = 0;
 	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int csum_size = 0;
@@ -2592,13 +2731,11 @@
 	de = (struct ext4_dir_entry_2 *)dir_block->b_data;
 	ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
 	set_nlink(inode, 2);
-	if (csum_size) {
-		t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
-		initialize_dirent_tail(t, blocksize);
-	}
+	if (csum_size)
+		ext4_initialize_dirent_tail(dir_block, blocksize);
 
 	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+	err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
 	if (err)
 		goto out;
 	set_buffer_verified(dir_block);
@@ -2688,7 +2825,10 @@
 		EXT4_ERROR_INODE(inode, "invalid size");
 		return true;
 	}
-	bh = ext4_read_dirblock(inode, 0, EITHER);
+	/* The first directory block must not be a hole,
+	 * so treat it as DIRENT_HTREE
+	 */
+	bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
 	if (IS_ERR(bh))
 		return true;
 
@@ -2710,6 +2850,10 @@
 			brelse(bh);
 			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
 			bh = ext4_read_dirblock(inode, lblock, EITHER);
+			if (bh == NULL) {
+				offset += sb->s_blocksize;
+				continue;
+			}
 			if (IS_ERR(bh))
 				return true;
 			de = (struct ext4_dir_entry_2 *) bh->b_data;
@@ -2975,6 +3119,17 @@
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 
+#ifdef CONFIG_UNICODE
+	/* VFS negative dentries are incompatible with Encoding and
+	 * Case-insensitiveness. Eventually we'll want to avoid
+	 * invalidating the dentries here, along with returning the
+	 * negative dentries at ext4_lookup(), once this is better
+	 * supported by the VFS for the CI case.
+	 */
+	if (IS_CASEFOLDED(dir))
+		d_invalidate(dentry);
+#endif
+
 end_rmdir:
 	brelse(bh);
 	if (handle)
@@ -3044,6 +3199,17 @@
 	inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
+#ifdef CONFIG_UNICODE
+	/* VFS negative dentries are incompatible with Encoding and
+	 * Case-insensitiveness. Eventually we'll want to avoid
+	 * invalidating the dentries here, along with returning the
+	 * negative dentries at ext4_lookup(), once this is better
+	 * supported by the VFS for the CI case.
+	 */
+	if (IS_CASEFOLDED(dir))
+		d_invalidate(dentry);
+#endif
+
 end_unlink:
 	brelse(bh);
 	if (handle)
@@ -3253,7 +3419,10 @@
 	struct buffer_head *bh;
 
 	if (!ext4_has_inline_data(inode)) {
-		bh = ext4_read_dirblock(inode, 0, EITHER);
+		/* The first directory block must not be a hole, so
+		 * treat it as DIRENT_HTREE
+		 */
+		bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
 		if (IS_ERR(bh)) {
 			*retval = PTR_ERR(bh);
 			return NULL;
@@ -3314,9 +3483,8 @@
 							   ent->inode,
 							   ent->dir_bh);
 		} else {
-			retval = ext4_handle_dirty_dirent_node(handle,
-							       ent->inode,
-							       ent->dir_bh);
+			retval = ext4_handle_dirty_dirblock(handle, ent->inode,
+							    ent->dir_bh);
 		}
 	} else {
 		retval = ext4_mark_inode_dirty(handle, ent->inode);
@@ -3346,8 +3514,7 @@
 	ext4_mark_inode_dirty(handle, ent->dir);
 	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
 	if (!ent->inlined) {
-		retval = ext4_handle_dirty_dirent_node(handle,
-						       ent->dir, ent->bh);
+		retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
 		if (unlikely(retval)) {
 			ext4_std_error(ent->dir->i_sb, retval);
 			return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db75901..12ceade 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -61,14 +61,12 @@
 
 static void ext4_finish_bio(struct bio *bio)
 {
-	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-		struct page *data_page = NULL;
-#endif
+		struct page *bounce_page = NULL;
 		struct buffer_head *bh, *head;
 		unsigned bio_start = bvec->bv_offset;
 		unsigned bio_end = bio_start + bvec->bv_len;
@@ -78,13 +76,10 @@
 		if (!page)
 			continue;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-		if (!page->mapping) {
-			/* The bounce data pages are unmapped. */
-			data_page = page;
-			fscrypt_pullback_bio_page(&page, false);
+		if (fscrypt_is_bounce_page(page)) {
+			bounce_page = page;
+			page = fscrypt_pagecache_page(bounce_page);
 		}
-#endif
 
 		if (bio->bi_status) {
 			SetPageError(page);
@@ -111,10 +106,7 @@
 		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 		local_irq_restore(flags);
 		if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-			if (data_page)
-				fscrypt_restore_control_page(data_page);
-#endif
+			fscrypt_free_bounce_page(bounce_page);
 			end_page_writeback(page);
 		}
 	}
@@ -374,13 +366,13 @@
 	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 	if (!bio)
 		return -ENOMEM;
-	wbc_init_bio(io->io_wbc, bio);
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
 	bio->bi_end_io = ext4_end_bio;
 	bio->bi_private = ext4_get_io_end(io->io_end);
 	io->io_bio = bio;
 	io->io_next_block = bh->b_blocknr;
+	wbc_init_bio(io->io_wbc, bio);
 	return 0;
 }
 
@@ -404,7 +396,7 @@
 	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
 	if (ret != bh->b_size)
 		goto submit_and_retry;
-	wbc_account_io(io->io_wbc, page, bh->b_size);
+	wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
 	io->io_next_block++;
 	return 0;
 }
@@ -415,7 +407,7 @@
 			struct writeback_control *wbc,
 			bool keep_towrite)
 {
-	struct page *data_page = NULL;
+	struct page *bounce_page = NULL;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start;
 	struct buffer_head *bh, *head;
@@ -467,25 +459,30 @@
 				ext4_io_submit(io);
 			continue;
 		}
-		if (buffer_new(bh)) {
+		if (buffer_new(bh))
 			clear_buffer_new(bh);
-			clean_bdev_bh_alias(bh);
-		}
 		set_buffer_async_write(bh);
 		nr_to_submit++;
 	} while ((bh = bh->b_this_page) != head);
 
 	bh = head = page_buffers(page);
 
-	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
-	    nr_to_submit) {
+	/*
+	 * If any blocks are being written to an encrypted file, encrypt them
+	 * into a bounce page.  For simplicity, just encrypt until the last
+	 * block which might be needed.  This may cause some unneeded blocks
+	 * (e.g. holes) to be unnecessarily encrypted, but this is rare and
+	 * can't happen in the common case of blocksize == PAGE_SIZE.
+	 */
+	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
 		gfp_t gfp_flags = GFP_NOFS;
+		unsigned int enc_bytes = round_up(len, i_blocksize(inode));
 
 	retry_encrypt:
-		data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
-						page->index, gfp_flags);
-		if (IS_ERR(data_page)) {
-			ret = PTR_ERR(data_page);
+		bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
+							       0, gfp_flags);
+		if (IS_ERR(bounce_page)) {
+			ret = PTR_ERR(bounce_page);
 			if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
 				if (io->io_bio) {
 					ext4_io_submit(io);
@@ -494,7 +491,7 @@
 				gfp_flags |= __GFP_NOFAIL;
 				goto retry_encrypt;
 			}
-			data_page = NULL;
+			bounce_page = NULL;
 			goto out;
 		}
 	}
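
Aside (not part of the patch): a minimal userspace sketch of the round_up()
arithmetic behind the new enc_bytes computation, with made-up blocksize and
length values.

/* Sketch: every block touched by a 2049-byte write on a 1k-block fs. */
#include <stdio.h>

static unsigned int round_up_pow2(unsigned int x, unsigned int align)
{
	return (x + align - 1) & ~(align - 1);	/* align must be a power of two */
}

int main(void)
{
	unsigned int blocksize = 1024;	/* hypothetical 1k-block filesystem */
	unsigned int len = 2049;	/* bytes dirtied within the page */

	/* mirrors: enc_bytes = round_up(len, i_blocksize(inode)); */
	printf("enc_bytes = %u\n", round_up_pow2(len, blocksize));	/* 3072 */
	return 0;
}
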
@@ -503,8 +500,7 @@
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		ret = io_submit_add_bh(io, inode,
-				       data_page ? data_page : page, bh);
+		ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
 		if (ret) {
 			/*
 			 * We only get here on ENOMEM.  Not much else
@@ -520,8 +516,7 @@
 	/* Error stopped previous loop? Clean up buffers... */
 	if (ret) {
 	out:
-		if (data_page)
-			fscrypt_restore_control_page(data_page);
+		fscrypt_free_bounce_page(bounce_page);
 		printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
 		redirty_page_for_writepage(wbc, page);
 		do {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f461d75..a30b203 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,13 +47,103 @@
 
 #include "ext4.h"
 
-static inline bool ext4_bio_encrypted(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS	128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+	STEP_INITIAL = 0,
+	STEP_DECRYPT,
+	STEP_VERITY,
+};
+
+struct bio_post_read_ctx {
+	struct bio *bio;
+	struct work_struct work;
+	unsigned int cur_step;
+	unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
 {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	return unlikely(bio->bi_private != NULL);
-#else
-	return false;
-#endif
+	struct page *page;
+	struct bio_vec *bv;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bv, bio, iter_all) {
+		page = bv->bv_page;
+
+		/* PG_error was set if any post_read step failed */
+		if (bio->bi_status || PageError(page)) {
+			ClearPageUptodate(page);
+			/* will re-read again later */
+			ClearPageError(page);
+		} else {
+			SetPageUptodate(page);
+		}
+		unlock_page(page);
+	}
+	if (bio->bi_private)
+		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+	bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fscrypt_decrypt_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fsverity_verify_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+	/*
+	 * We use different work queues for decryption and for verity because
+	 * verity may require reading metadata pages that need decryption, and
+	 * we shouldn't recurse to the same workqueue.
+	 */
+	switch (++ctx->cur_step) {
+	case STEP_DECRYPT:
+		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+			INIT_WORK(&ctx->work, decrypt_work);
+			fscrypt_enqueue_decrypt_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
+	case STEP_VERITY:
+		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+			INIT_WORK(&ctx->work, verity_work);
+			fsverity_enqueue_verify_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
+	default:
+		__read_end_io(ctx->bio);
+	}
+}
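
Aside: bio_post_read_processing() above walks cur_step through the
enabled_steps bitmask, re-entering once each work item completes. The same
walk, written as a loop in a standalone userspace sketch (the step handling
is made up):

#include <stdio.h>

enum { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX };

static void advance(unsigned int *cur_step, unsigned int enabled_steps)
{
	while (++*cur_step < STEP_MAX) {
		if (enabled_steps & (1U << *cur_step)) {
			printf("enqueue step %u\n", *cur_step);
			return;	/* the worker re-enters advance() when done */
		}
	}
	printf("finish bio\n");	/* stands in for __read_end_io() */
}

int main(void)
{
	unsigned int cur = STEP_INITIAL;
	unsigned int enabled = (1U << STEP_DECRYPT) | (1U << STEP_VERITY);

	advance(&cur, enabled);	/* enqueue step 1 (decrypt) */
	advance(&cur, enabled);	/* enqueue step 2 (verity) */
	advance(&cur, enabled);	/* finish bio */
	return 0;
}
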
+
+static bool bio_post_read_required(struct bio *bio)
+{
+	return bio->bi_private && !bio->bi_status;
 }
 
 /*
@@ -70,30 +160,53 @@
  */
 static void mpage_end_io(struct bio *bio)
 {
-	struct bio_vec *bv;
-	int i;
+	if (bio_post_read_required(bio)) {
+		struct bio_post_read_ctx *ctx = bio->bi_private;
 
-	if (ext4_bio_encrypted(bio)) {
-		if (bio->bi_status) {
-			fscrypt_release_ctx(bio->bi_private);
-		} else {
-			fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
-			return;
-		}
+		ctx->cur_step = STEP_INITIAL;
+		bio_post_read_processing(ctx);
+		return;
 	}
-	bio_for_each_segment_all(bv, bio, i) {
-		struct page *page = bv->bv_page;
+	__read_end_io(bio);
+}
 
-		if (!bio->bi_status) {
-			SetPageUptodate(page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-		unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+	return fsverity_active(inode) &&
+	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
+						       struct bio *bio,
+						       pgoff_t first_idx)
+{
+	unsigned int post_read_steps = 0;
+	struct bio_post_read_ctx *ctx = NULL;
+
+	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+		post_read_steps |= 1 << STEP_DECRYPT;
+
+	if (ext4_need_verity(inode, first_idx))
+		post_read_steps |= 1 << STEP_VERITY;
+
+	if (post_read_steps) {
+		ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+		if (!ctx)
+			return ERR_PTR(-ENOMEM);
+		ctx->bio = bio;
+		ctx->enabled_steps = post_read_steps;
+		bio->bi_private = ctx;
 	}
+	return ctx;
+}
 
-	bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+	if (IS_ENABLED(CONFIG_FS_VERITY) &&
+	    (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+		return inode->i_sb->s_maxbytes;
+
+	return i_size_read(inode);
 }
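
Aside: ext4_need_verity() only asks for verification of pages below i_size;
pages past EOF (the Merkle tree area) are read without it. A userspace sketch
with a made-up file size:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 10000;	/* hypothetical verity file size */
	unsigned long idx;

	/* mirrors: idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE) */
	for (idx = 0; idx < 4; idx++)
		printf("page %lu: %s\n", idx,
		       idx < DIV_ROUND_UP(i_size, PAGE_SIZE) ? "verify" : "skip");
	return 0;
}
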
 
 int ext4_mpage_readpages(struct address_space *mapping,
@@ -126,9 +239,10 @@
 		int fully_mapped = 1;
 		unsigned first_hole = blocks_per_page;
 
-		prefetchw(&page->flags);
 		if (pages) {
-			page = list_entry(pages->prev, struct page, lru);
+			page = lru_to_page(pages);
+
+			prefetchw(&page->flags);
 			list_del(&page->lru);
 			if (add_to_page_cache_lru(page, mapping, page->index,
 				  readahead_gfp_mask(mapping)))
@@ -140,7 +254,8 @@
 
 		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 		last_block = block_in_file + nr_pages * blocks_per_page;
-		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+		last_block_in_file = (ext4_readpage_limit(inode) +
+				      blocksize - 1) >> blkbits;
 		if (last_block > last_block_in_file)
 			last_block = last_block_in_file;
 		page_block = 0;
@@ -217,6 +332,9 @@
 			zero_user_segment(page, first_hole << blkbits,
 					  PAGE_SIZE);
 			if (first_hole == 0) {
+				if (ext4_need_verity(inode, page->index) &&
+				    !fsverity_verify_page(page))
+					goto set_error_page;
 				SetPageUptodate(page);
 				unlock_page(page);
 				goto next_page;
@@ -240,19 +358,16 @@
 			bio = NULL;
 		}
 		if (bio == NULL) {
-			struct fscrypt_ctx *ctx = NULL;
+			struct bio_post_read_ctx *ctx;
 
-			if (ext4_encrypted_inode(inode) &&
-			    S_ISREG(inode->i_mode)) {
-				ctx = fscrypt_get_ctx(inode, GFP_NOFS);
-				if (IS_ERR(ctx))
-					goto set_error_page;
-			}
 			bio = bio_alloc(GFP_KERNEL,
 				min_t(int, nr_pages, BIO_MAX_PAGES));
-			if (!bio) {
-				if (ctx)
-					fscrypt_release_ctx(ctx);
+			if (!bio)
+				goto set_error_page;
+			ctx = get_bio_post_read_ctx(inode, bio, page->index);
+			if (IS_ERR(ctx)) {
+				bio_put(bio);
+				bio = NULL;
 				goto set_error_page;
 			}
 			bio_set_dev(bio, bdev);
@@ -293,3 +408,29 @@
 		submit_bio(bio);
 	return 0;
 }
+
+int __init ext4_init_post_read_processing(void)
+{
+	bio_post_read_ctx_cache =
+		kmem_cache_create("ext4_bio_post_read_ctx",
+				  sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+	if (!bio_post_read_ctx_cache)
+		goto fail;
+	bio_post_read_ctx_pool =
+		mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+					 bio_post_read_ctx_cache);
+	if (!bio_post_read_ctx_pool)
+		goto fail_free_cache;
+	return 0;
+
+fail_free_cache:
+	kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+	return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+	mempool_destroy(bio_post_read_ctx_pool);
+	kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a5efee3..c0e9aef 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -127,10 +127,12 @@
 	else if (free_blocks_count < 0)
 		ext4_warning(sb, "Bad blocks count %u",
 			     input->blocks_count);
-	else if (!(bh = sb_bread(sb, end - 1)))
+	else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) {
+		err = PTR_ERR(bh);
+		bh = NULL;
 		ext4_warning(sb, "Cannot read last block (%llu)",
 			     end - 1);
-	else if (outside(input->block_bitmap, start, end))
+	} else if (outside(input->block_bitmap, start, end))
 		ext4_warning(sb, "Block bitmap not in group (block %llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (outside(input->inode_bitmap, start, end))
@@ -781,11 +783,11 @@
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
 	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
-	struct buffer_head **o_group_desc, **n_group_desc;
-	struct buffer_head *dind;
-	struct buffer_head *gdb_bh;
+	struct buffer_head **o_group_desc, **n_group_desc = NULL;
+	struct buffer_head *dind = NULL;
+	struct buffer_head *gdb_bh = NULL;
 	int gdbackups;
-	struct ext4_iloc iloc;
+	struct ext4_iloc iloc = { .bh = NULL };
 	__le32 *data;
 	int err;
 
@@ -794,21 +796,22 @@
 		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 		       gdb_num);
 
-	gdb_bh = sb_bread(sb, gdblock);
-	if (!gdb_bh)
-		return -EIO;
+	gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+	if (IS_ERR(gdb_bh))
+		return PTR_ERR(gdb_bh);
 
 	gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
 	if (gdbackups < 0) {
 		err = gdbackups;
-		goto exit_bh;
+		goto errout;
 	}
 
 	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
-		goto exit_bh;
+	dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+	if (IS_ERR(dind)) {
+		err = PTR_ERR(dind);
+		dind = NULL;
+		goto errout;
 	}
 
 	data = (__le32 *)dind->b_data;
@@ -816,18 +819,18 @@
 		ext4_warning(sb, "new group %u GDT block %llu not reserved",
 			     group, gdblock);
 		err = -EINVAL;
-		goto exit_dind;
+		goto errout;
 	}
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	BUFFER_TRACE(gdb_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, gdb_bh);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	BUFFER_TRACE(dind, "get_write_access");
 	err = ext4_journal_get_write_access(handle, dind);
@@ -837,7 +840,7 @@
 	/* ext4_reserve_inode_write() gets a reference on the iloc */
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
 				     sizeof(struct buffer_head *),
@@ -846,7 +849,7 @@
 		err = -ENOMEM;
 		ext4_warning(sb, "not enough memory for %lu groups",
 			     gdb_num + 1);
-		goto exit_inode;
+		goto errout;
 	}
 
 	/*
@@ -862,7 +865,7 @@
 	err = ext4_handle_dirty_metadata(handle, NULL, dind);
 	if (unlikely(err)) {
 		ext4_std_error(sb, err);
-		goto exit_inode;
+		goto errout;
 	}
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
 			   (9 - EXT4_SB(sb)->s_cluster_bits);
@@ -872,7 +875,7 @@
 	if (unlikely(err)) {
 		ext4_std_error(sb, err);
 		iloc.bh = NULL;
-		goto exit_inode;
+		goto errout;
 	}
 	brelse(dind);
 
@@ -888,15 +891,11 @@
 	err = ext4_handle_dirty_super(handle, sb);
 	if (err)
 		ext4_std_error(sb, err);
-
 	return err;
-
-exit_inode:
+errout:
 	kvfree(n_group_desc);
 	brelse(iloc.bh);
-exit_dind:
 	brelse(dind);
-exit_bh:
 	brelse(gdb_bh);
 
 	ext4_debug("leaving with error %d\n", err);
@@ -916,9 +915,9 @@
 
 	gdblock = ext4_meta_bg_first_block_no(sb, group) +
 		   ext4_bg_has_super(sb, group);
-	gdb_bh = sb_bread(sb, gdblock);
-	if (!gdb_bh)
-		return -EIO;
+	gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+	if (IS_ERR(gdb_bh))
+		return PTR_ERR(gdb_bh);
 	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
 				     sizeof(struct buffer_head *),
 				     GFP_NOFS);
@@ -934,11 +933,18 @@
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
 	n_group_desc[gdb_num] = gdb_bh;
+
+	BUFFER_TRACE(gdb_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (err) {
+		kvfree(n_group_desc);
+		brelse(gdb_bh);
+		return err;
+	}
+
 	EXT4_SB(sb)->s_group_desc = n_group_desc;
 	EXT4_SB(sb)->s_gdb_count++;
 	kvfree(o_group_desc);
-	BUFFER_TRACE(gdb_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gdb_bh);
 	return err;
 }
 
@@ -975,9 +981,10 @@
 		return -ENOMEM;
 
 	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
+	dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+	if (IS_ERR(dind)) {
+		err = PTR_ERR(dind);
+		dind = NULL;
 		goto exit_free;
 	}
 
@@ -996,9 +1003,10 @@
 			err = -EINVAL;
 			goto exit_bh;
 		}
-		primary[res] = sb_bread(sb, blk);
-		if (!primary[res]) {
-			err = -EIO;
+		primary[res] = ext4_sb_bread(sb, blk, 0);
+		if (IS_ERR(primary[res])) {
+			err = PTR_ERR(primary[res]);
+			primary[res] = NULL;
 			goto exit_bh;
 		}
 		gdbackups = verify_reserved_gdb(sb, group, primary[res]);
@@ -1631,13 +1639,13 @@
 	}
 
 	if (reserved_gdb || gdb_off == 0) {
-		if (ext4_has_feature_resize_inode(sb) ||
+		if (!ext4_has_feature_resize_inode(sb) ||
 		    !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
 		}
-		inode = ext4_iget(sb, EXT4_RESIZE_INO);
+		inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL);
 		if (IS_ERR(inode)) {
 			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(inode);
@@ -1753,8 +1761,6 @@
 		ext4_msg(sb, KERN_ERR,
 			 "filesystem too large to resize to %llu blocks safely",
 			 n_blocks_count);
-		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
@@ -1960,12 +1966,14 @@
 				le16_to_cpu(es->s_reserved_gdt_blocks);
 			n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
 			n_blocks_count = (ext4_fsblk_t)n_group *
-				EXT4_BLOCKS_PER_GROUP(sb);
+				EXT4_BLOCKS_PER_GROUP(sb) +
+				le32_to_cpu(es->s_first_data_block);
 			n_group--; /* set to last group number */
 		}
 
 		if (!resize_inode)
-			resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+			resize_inode = ext4_iget(sb, EXT4_RESIZE_INO,
+						 EXT4_IGET_SPECIAL);
 		if (IS_ERR(resize_inode)) {
 			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(resize_inode);
@@ -2071,6 +2079,10 @@
 		free_flex_gd(flex_gd);
 	if (resize_inode != NULL)
 		iput(resize_inode);
-	ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+	if (err)
+		ext4_warning(sb, "error (%d) occurred during "
+			     "file system resize", err);
+	ext4_msg(sb, KERN_INFO, "resized filesystem to %llu",
+		 ext4_blocks_count(es));
 	return err;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a149df..7357835 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -42,6 +42,7 @@
 #include <linux/cleancache.h>
 #include <linux/uaccess.h>
 #include <linux/iversion.h>
+#include <linux/unicode.h>
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -140,6 +141,29 @@
 MODULE_ALIAS("ext3");
 #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 
+/*
+ * This works like sb_bread() except it uses ERR_PTR for error
+ * returns.  Currently with sb_bread() it's impossible to distinguish
+ * between ENOMEM and EIO situations (since both result in a NULL
+ * return).
+ */
+struct buffer_head *
+ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+{
+	struct buffer_head *bh = sb_getblk(sb, block);
+
+	if (bh == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (buffer_uptodate(bh))
+		return bh;
+	ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	put_bh(bh);
+	return ERR_PTR(-EIO);
+}
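
Aside: callers now use the ERR_PTR convention instead of a NULL check. A
self-contained userspace sketch (the ERR_PTR/IS_ERR/PTR_ERR macros below only
approximate the kernel's, and fake_bread() is hypothetical):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((long)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

static void *fake_bread(int fail)
{
	static int buffer;

	if (fail)
		return ERR_PTR(-EIO);	/* now distinguishable from -ENOMEM */
	return &buffer;
}

int main(void)
{
	void *bh = fake_bread(1);

	if (IS_ERR(bh)) {
		printf("read failed: %ld\n", PTR_ERR(bh));	/* -5 == -EIO */
		return 1;
	}
	return 0;
}
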
+
 static int ext4_verify_csum_type(struct super_block *sb,
 				 struct ext4_super_block *es)
 {
@@ -407,6 +431,12 @@
 	spin_unlock(&sbi->s_md_lock);
 }
 
+static bool system_going_down(void)
+{
+	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+		|| system_state == SYSTEM_RESTART;
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -437,7 +467,12 @@
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
-	if (test_opt(sb, ERRORS_RO)) {
+	/*
+	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
+	 * could panic during 'reboot -f' as the underlying device got already
+	 * disabled.
+	 */
+	if (test_opt(sb, ERRORS_RO) || system_going_down()) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		/*
 		 * Make sure updated value of ->s_mount_flags will be visible
@@ -445,8 +480,7 @@
 		 */
 		smp_wmb();
 		sb->s_flags |= SB_RDONLY;
-	}
-	if (test_opt(sb, ERRORS_PANIC)) {
+	} else if (test_opt(sb, ERRORS_PANIC)) {
 		if (EXT4_SB(sb)->s_journal &&
 		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 			return;
@@ -665,7 +699,7 @@
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
 	}
-	if (test_opt(sb, ERRORS_PANIC)) {
+	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 		if (EXT4_SB(sb)->s_journal &&
 		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 			return;
@@ -1000,14 +1034,13 @@
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
-	if (sbi->s_ea_inode_cache) {
-		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
-		sbi->s_ea_inode_cache = NULL;
-	}
-	if (sbi->s_ea_block_cache) {
-		ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
-		sbi->s_ea_block_cache = NULL;
-	}
+
+	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+	sbi->s_ea_inode_cache = NULL;
+
+	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+	sbi->s_ea_block_cache = NULL;
+
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 	brelse(sbi->s_sbh);
@@ -1022,6 +1055,9 @@
 		crypto_free_shash(sbi->s_chksum_driver);
 	kfree(sbi->s_blockgroup_lock);
 	fs_put_dax(sbi->s_daxdev);
+#ifdef CONFIG_UNICODE
+	utf8_unload(sbi->s_encoding);
+#endif
 	kfree(sbi);
 }
 
@@ -1052,6 +1088,7 @@
 	ei->i_da_metadata_calc_len = 0;
 	ei->i_da_metadata_calc_last_lblock = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+	ext4_init_pending_tree(&ei->i_pending_tree);
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
@@ -1070,13 +1107,16 @@
 {
 	int drop = generic_drop_inode(inode);
 
+	if (!drop)
+		drop = fscrypt_drop_inode(inode);
+
 	trace_ext4_drop_inode(inode, drop);
 	return drop;
 }
 
-static void ext4_i_callback(struct rcu_head *head)
+static void ext4_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
+	fscrypt_free_inode(inode);
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
@@ -1091,7 +1131,6 @@
 				true);
 		dump_stack();
 	}
-	call_rcu(&inode->i_rcu, ext4_i_callback);
 }
 
 static void init_once(void *foo)
@@ -1143,6 +1182,7 @@
 		EXT4_I(inode)->jinode = NULL;
 	}
 	fscrypt_put_encryption_info(inode);
+	fsverity_cleanup_inode(inode);
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1150,20 +1190,11 @@
 {
 	struct inode *inode;
 
-	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
-		return ERR_PTR(-ESTALE);
-	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
-		return ERR_PTR(-ESTALE);
-
-	/* iget isn't really right if the inode is currently unallocated!!
-	 *
-	 * ext4_read_inode will return a bad_inode if the inode had been
-	 * deleted, so we should be safe.
-	 *
+	/*
 	 * Currently we don't know the generation for parent directory, so
 	 * a generation of 0 means "accept any"
 	 */
-	inode = ext4_iget_normal(sb, ino);
+	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (generation && inode->i_generation != generation) {
@@ -1188,6 +1219,16 @@
 				    ext4_nfs_get_inode);
 }
 
+static int ext4_nfs_commit_metadata(struct inode *inode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL
+	};
+
+	trace_ext4_nfs_commit_metadata(inode);
+	return ext4_write_inode(inode, &wbc);
+}
+
 /*
  * Try to release metadata pages (indirect blocks, directories) which are
  * mapped via the block device.  Since these pages could have journal heads
@@ -1208,7 +1249,7 @@
 	return try_to_free_buffers(page);
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
 {
 	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1368,6 +1409,7 @@
 
 static const struct super_operations ext4_sops = {
 	.alloc_inode	= ext4_alloc_inode,
+	.free_inode	= ext4_free_in_core_inode,
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
@@ -1392,6 +1434,7 @@
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
 	.get_parent = ext4_get_parent,
+	.commit_metadata = ext4_nfs_commit_metadata,
 };
 
 enum {
@@ -1715,6 +1758,36 @@
 	{Opt_err, 0, 0}
 };
 
+#ifdef CONFIG_UNICODE
+static const struct ext4_sb_encodings {
+	__u16 magic;
+	char *name;
+	char *version;
+} ext4_sb_encoding_map[] = {
+	{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
+};
+
+static int ext4_sb_read_encoding(const struct ext4_super_block *es,
+				 const struct ext4_sb_encodings **encoding,
+				 __u16 *flags)
+{
+	__u16 magic = le16_to_cpu(es->s_encoding);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
+		if (magic == ext4_sb_encoding_map[i].magic)
+			break;
+
+	if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
+		return -EINVAL;
+
+	*encoding = &ext4_sb_encoding_map[i];
+	*flags = le16_to_cpu(es->s_encoding_flags);
+
+	return 0;
+}
+#endif
+
 static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			    substring_t *args, unsigned long *journal_devnum,
 			    unsigned int *journal_ioprio, int is_remount)
@@ -1805,6 +1878,13 @@
 	} else if (token == Opt_commit) {
 		if (arg == 0)
 			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+		else if (arg > INT_MAX / HZ) {
+			ext4_msg(sb, KERN_ERR,
+				 "Invalid commit interval %d, "
+				 "must be smaller than %d",
+				 arg, INT_MAX / HZ);
+			return -1;
+		}
 		sbi->s_commit_interval = HZ * arg;
 	} else if (token == Opt_debug_want_extra_isize) {
 		sbi->s_want_extra_isize = arg;
@@ -1897,7 +1977,7 @@
 		*journal_ioprio =
 			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
 	} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
 		ext4_msg(sb, KERN_WARNING,
 			 "Test dummy encryption mode enabled");
@@ -1938,7 +2018,7 @@
 #ifdef CONFIG_FS_DAX
 		ext4_msg(sb, KERN_WARNING,
 		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-			sbi->s_mount_opt |= m->mount_opt;
+		sbi->s_mount_opt |= m->mount_opt;
 #else
 		ext4_msg(sb, KERN_INFO, "dax option not supported");
 		return -1;
@@ -2224,7 +2304,6 @@
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	ext4_update_tstamp(es, s_mtime);
-	ext4_update_dynamic_rev(sb);
 	if (sbi->s_journal)
 		ext4_set_feature_journal_needs_recovery(sb);
 
@@ -2672,13 +2751,9 @@
 	loff_t res;
 	loff_t upper_limit = MAX_LFS_FILESIZE;
 
-	/* small i_blocks in vfs inode? */
-	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
-		/*
-		 * CONFIG_LBDAF is not enabled implies the inode
-		 * i_block represent total blocks in 512 bytes
-		 * 32 == size of vfs inode i_blocks * 8
-		 */
+	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
+
+	if (!has_huge_files) {
 		upper_limit = (1LL << 32) - 1;
 
 		/* total blocks in file system block size */
@@ -2719,11 +2794,11 @@
 	 * number of 512-byte sectors of the file.
 	 */
 
-	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+	if (!has_huge_files) {
 		/*
-		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
-		 * the inode i_block field represents total file blocks in
-		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
+		 * !has_huge_files implies that the inode i_block field
+		 * represents total file blocks in 2^32 512-byte sectors ==
+		 * size of vfs inode i_blocks * 8
 		 */
 		upper_limit = (1LL << 32) - 1;
 
@@ -2846,6 +2921,15 @@
 		return 0;
 	}
 
+#ifndef CONFIG_UNICODE
+	if (ext4_has_feature_casefold(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Filesystem with casefold feature cannot be "
+			 "mounted without CONFIG_UNICODE");
+		return 0;
+	}
+#endif
+
 	if (readonly)
 		return 1;
 
@@ -2863,18 +2947,6 @@
 				~EXT4_FEATURE_RO_COMPAT_SUPP));
 		return 0;
 	}
-	/*
-	 * Large file size enabled file system can only be mounted
-	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
-	 */
-	if (ext4_has_feature_huge_file(sb)) {
-		if (sizeof(blkcnt_t) < sizeof(u64)) {
-			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
-				 "cannot be mounted RDWR without "
-				 "CONFIG_LBDAF");
-			return 0;
-		}
-	}
 	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
 		ext4_msg(sb, KERN_ERR,
 			 "Can't support bigalloc feature without "
@@ -3479,6 +3551,40 @@
 	return 0;
 }
 
+static void ext4_clamp_want_extra_isize(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	unsigned def_extra_isize = sizeof(struct ext4_inode) -
+						EXT4_GOOD_OLD_INODE_SIZE;
+
+	if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+		sbi->s_want_extra_isize = 0;
+		return;
+	}
+	if (sbi->s_want_extra_isize < 4) {
+		sbi->s_want_extra_isize = def_extra_isize;
+		if (ext4_has_feature_extra_isize(sb)) {
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_want_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_want_extra_isize);
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_min_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_min_extra_isize);
+		}
+	}
+	/* Check if enough inode space is available */
+	if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+	    (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+							sbi->s_inode_size)) {
+		sbi->s_want_extra_isize = def_extra_isize;
+		ext4_msg(sb, KERN_INFO,
+			 "required extra inode space not available");
+	}
+}
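
Aside: a userspace walk-through of the clamp with made-up superblock values;
it assumes sizeof(struct ext4_inode) is 160, so def_extra_isize is 32:

#include <stdio.h>

int main(void)
{
	unsigned inode_size = 256;	/* hypothetical s_inode_size */
	unsigned def_extra = 160 - 128;	/* sizeof(ext4_inode) - GOOD_OLD size */
	unsigned want = 2;		/* bogus on-disk s_want_extra_isize */
	unsigned sb_want = 64, sb_min = 32;	/* hypothetical es fields */

	if (want < 4) {
		want = def_extra;
		if (want < sb_want)
			want = sb_want;
		if (want < sb_min)
			want = sb_min;
	}
	if (128 + want > inode_size)
		want = def_extra;	/* fall back when it can't fit */
	printf("s_want_extra_isize = %u\n", want);	/* 64 */
	return 0;
}
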
+
 static void ext4_set_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
@@ -3705,6 +3811,43 @@
 			   &journal_ioprio, 0))
 		goto failed_mount;
 
+#ifdef CONFIG_UNICODE
+	if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
+		const struct ext4_sb_encodings *encoding_info;
+		struct unicode_map *encoding;
+		__u16 encoding_flags;
+
+		if (ext4_has_feature_encrypt(sb)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Can't mount with encoding and encryption");
+			goto failed_mount;
+		}
+
+		if (ext4_sb_read_encoding(es, &encoding_info,
+					  &encoding_flags)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Encoding requested by superblock is unknown");
+			goto failed_mount;
+		}
+
+		encoding = utf8_load(encoding_info->version);
+		if (IS_ERR(encoding)) {
+			ext4_msg(sb, KERN_ERR,
+				 "can't mount with superblock charset: %s-%s "
+				 "not supported by the kernel. flags: 0x%x.",
+				 encoding_info->name, encoding_info->version,
+				 encoding_flags);
+			goto failed_mount;
+		}
+		ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+			 "%s-%s with flags 0x%hx", encoding_info->name,
+			 encoding_info->version?:"\b", encoding_flags);
+
+		sbi->s_encoding = encoding;
+		sbi->s_encoding_flags = encoding_flags;
+	}
+#endif
+
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
 		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
 			    "with data=journal disables delayed "
@@ -3841,12 +3984,12 @@
 		if (ext4_has_feature_inline_data(sb)) {
 			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
 					" that may contain inline data");
-			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+			goto failed_mount;
 		}
 		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext4_msg(sb, KERN_ERR,
-				"DAX unsupported by block device. Turning off DAX.");
-			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+				"DAX unsupported by block device.");
+			goto failed_mount;
 		}
 	}
 
@@ -3906,8 +4049,21 @@
 			       sbi->s_inode_size);
 			goto failed_mount;
 		}
-		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
-			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
+		/*
+		 * i_atime_extra is the last extra field available for [acm]times in
+		 * struct ext4_inode. Checking for that field should suffice to ensure
+		 * we have extra space for all three.
+		 */
+		if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+			sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+			sb->s_time_gran = 1;
+			sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
+		} else {
+			sb->s_time_gran = NSEC_PER_SEC;
+			sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
+		}
+
+		sb->s_time_min = EXT4_TIMESTAMP_MIN;
 	}
 
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
@@ -4023,8 +4179,6 @@
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			 " too large to mount safely on this system");
-		if (sizeof(sector_t) < 8)
-			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
 		goto failed_mount;
 	}
 
@@ -4142,9 +4296,12 @@
 	sb->s_op = &ext4_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &ext4_cryptops;
 #endif
+#ifdef CONFIG_FS_VERITY
+	sb->s_vop = &ext4_verityops;
+#endif
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &ext4_quota_operations;
 	if (ext4_has_feature_quota(sb))
@@ -4204,7 +4361,7 @@
 				 "data=, fs mounted w/o journal");
 			goto failed_mount_wq;
 		}
-		sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
 		clear_opt(sb, JOURNAL_CHECKSUM);
 		clear_opt(sb, DATA_FLAGS);
 		sbi->s_journal = NULL;
@@ -4292,6 +4449,11 @@
 		goto failed_mount_wq;
 	}
 
+	if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
+		goto failed_mount_wq;
+	}
+
 	if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
 	    !ext4_has_feature_encrypt(sb)) {
 		ext4_set_feature_encrypt(sb);
@@ -4327,7 +4489,7 @@
 	 * so we can safely mount the rest of the filesystem now.
 	 */
 
-	root = ext4_iget(sb, EXT4_ROOT_INO);
+	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(root)) {
 		ext4_msg(sb, KERN_ERR, "get root inode failed");
 		ret = PTR_ERR(root);
@@ -4339,6 +4501,12 @@
 		iput(root);
 		goto failed_mount4;
 	}
+
+#ifdef CONFIG_UNICODE
+	if (sbi->s_encoding)
+		sb->s_d_op = &ext4_dentry_ops;
+#endif
+
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -4353,30 +4521,7 @@
 	} else if (ret)
 		goto failed_mount4a;
 
-	/* determine the minimum size of new large inodes, if present */
-	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
-	    sbi->s_want_extra_isize == 0) {
-		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-						     EXT4_GOOD_OLD_INODE_SIZE;
-		if (ext4_has_feature_extra_isize(sb)) {
-			if (sbi->s_want_extra_isize <
-			    le16_to_cpu(es->s_want_extra_isize))
-				sbi->s_want_extra_isize =
-					le16_to_cpu(es->s_want_extra_isize);
-			if (sbi->s_want_extra_isize <
-			    le16_to_cpu(es->s_min_extra_isize))
-				sbi->s_want_extra_isize =
-					le16_to_cpu(es->s_min_extra_isize);
-		}
-	}
-	/* Check if enough inode space is available */
-	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
-							sbi->s_inode_size) {
-		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-						       EXT4_GOOD_OLD_INODE_SIZE;
-		ext4_msg(sb, KERN_INFO, "required extra inode space not"
-			 "available");
-	}
+	ext4_clamp_want_extra_isize(sb);
 
 	ext4_set_resv_clusters(sb);
 
@@ -4521,14 +4666,12 @@
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
-	if (sbi->s_ea_inode_cache) {
-		ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
-		sbi->s_ea_inode_cache = NULL;
-	}
-	if (sbi->s_ea_block_cache) {
-		ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
-		sbi->s_ea_block_cache = NULL;
-	}
+	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+	sbi->s_ea_inode_cache = NULL;
+
+	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+	sbi->s_ea_block_cache = NULL;
+
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -4546,9 +4689,14 @@
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
+
+#ifdef CONFIG_UNICODE
+	utf8_unload(sbi->s_encoding);
+#endif
+
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
-		kfree(sbi->s_qf_names[i]);
+		kfree(get_qf_name(sb, sbi, i));
 #endif
 	ext4_blkdev_remove(sbi);
 	brelse(bh);
@@ -4597,7 +4745,7 @@
 	 * happen if we iget() an unused inode, as the subsequent iput()
 	 * will try to delete it.
 	 */
-	journal_inode = ext4_iget(sb, journal_inum);
+	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
 	if (IS_ERR(journal_inode)) {
 		ext4_msg(sb, KERN_ERR, "no journal found");
 		return NULL;
@@ -4879,7 +5027,7 @@
 	ext4_superblock_csum_set(sb);
 	if (sync)
 		lock_buffer(sbh);
-	if (buffer_write_io_error(sbh)) {
+	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the
 		 * superblock failed.  This could happen because the
@@ -5162,6 +5310,8 @@
 		goto restore_opts;
 	}
 
+	ext4_clamp_want_extra_isize(sb);
+
 	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
 	    test_opt(sb, JOURNAL_CHECKSUM)) {
 		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5679,7 +5829,7 @@
 	if (!qf_inums[type])
 		return -EPERM;
 
-	qf_inode = ext4_iget(sb, qf_inums[type]);
+	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
 	if (IS_ERR(qf_inode)) {
 		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
 		return PTR_ERR(qf_inode);
@@ -5689,9 +5839,9 @@
 	qf_inode->i_flags |= S_NOQUOTA;
 	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
 	err = dquot_enable(qf_inode, type, format_id, flags);
-	iput(qf_inode);
 	if (err)
 		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
+	iput(qf_inode);
 
 	return err;
 }
@@ -5978,6 +6128,14 @@
 	if (err)
 		return err;
 
+	err = ext4_init_pending();
+	if (err)
+		goto out7;
+
+	err = ext4_init_post_read_processing();
+	if (err)
+		goto out6;
+
 	err = ext4_init_pageio();
 	if (err)
 		goto out5;
@@ -6016,6 +6174,10 @@
 out4:
 	ext4_exit_pageio();
 out5:
+	ext4_exit_post_read_processing();
+out6:
+	ext4_exit_pending();
+out7:
 	ext4_exit_es();
 
 	return err;
@@ -6032,7 +6194,9 @@
 	ext4_exit_sysfs();
 	ext4_exit_system_zone();
 	ext4_exit_pageio();
+	ext4_exit_post_read_processing();
 	ext4_exit_es();
+	ext4_exit_pending();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a02..eb1efad 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@
 	attr_feature,
 	attr_pointer_ui,
 	attr_pointer_atomic,
+	attr_journal_task,
 } attr_id_t;
 
 typedef enum {
@@ -125,6 +126,14 @@
 	return count;
 }
 
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+	if (!sbi->s_journal)
+		return snprintf(buf, PAGE_SIZE, "<none>\n");
+	return snprintf(buf, PAGE_SIZE, "%d\n",
+			task_pid_vnr(sbi->s_journal->j_task));
+}
+
 #define EXT4_ATTR(_name,_mode,_id)					\
 static struct ext4_attr ext4_attr_##_name = {				\
 	.attr = {.name = __stringify(_name), .mode = _mode },		\
@@ -188,6 +197,7 @@
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,28 +227,43 @@
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(journal_task),
 	NULL,
 };
+ATTRIBUTE_GROUPS(ext4);
 
 /* Features this copy of ext4 supports */
 EXT4_ATTR_FEATURE(lazy_itable_init);
 EXT4_ATTR_FEATURE(batched_discard);
 EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 EXT4_ATTR_FEATURE(encryption);
 #endif
+#ifdef CONFIG_UNICODE
+EXT4_ATTR_FEATURE(casefold);
+#endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
 EXT4_ATTR_FEATURE(metadata_csum_seed);
 
 static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
 	ATTR_LIST(batched_discard),
 	ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ATTR_LIST(encryption),
 #endif
+#ifdef CONFIG_UNICODE
+	ATTR_LIST(casefold),
+#endif
+#ifdef CONFIG_FS_VERITY
+	ATTR_LIST(verity),
+#endif
 	ATTR_LIST(metadata_csum_seed),
 	NULL,
 };
+ATTRIBUTE_GROUPS(ext4_feat);
 
 static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
 {
@@ -304,6 +329,8 @@
 		return print_tstamp(buf, sbi->s_es, s_first_error_time);
 	case attr_last_error_time:
 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
+	case attr_journal_task:
+		return journal_task_show(sbi, buf);
 	}
 
 	return 0;
@@ -355,13 +382,13 @@
 };
 
 static struct kobj_type ext4_sb_ktype = {
-	.default_attrs	= ext4_attrs,
+	.default_groups = ext4_groups,
 	.sysfs_ops	= &ext4_attr_ops,
 	.release	= ext4_sb_release,
 };
 
 static struct kobj_type ext4_feat_ktype = {
-	.default_attrs	= ext4_feat_attrs,
+	.default_groups = ext4_feat_groups,
 	.sysfs_ops	= &ext4_attr_ops,
 	.release	= (void (*)(struct kobject *))kfree,
 };
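
Aside: the switch from .default_attrs to .default_groups relies on the
ATTRIBUTE_GROUPS() helper; per <linux/sysfs.h> it expands to roughly the
following, generating the ext4_groups array the ktype now points at:

static const struct attribute_group ext4_group = {
	.attrs = ext4_attrs,
};

static const struct attribute_group *ext4_groups[] = {
	&ext4_group,
	NULL,
};
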
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 0000000..d0d8a97
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first 64K boundary beyond i_size.  This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4.  This approach
+ * avoids having to depend on the EA_INODE feature and on re-architecting
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs.  Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+
+static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
+{
+	return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode.  __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+			  loff_t pos)
+{
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *addr;
+
+		page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+					 NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		addr = kmap_atomic(page);
+		memcpy(buf, addr + offset_in_page(pos), n);
+		kunmap_atomic(addr);
+
+		put_page(page);
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
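
Aside: a userspace sketch of the page-by-page chunking above, with a made-up
pos/count pair that straddles a page boundary:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define offset_in_page(p)	((unsigned long)(p) & (PAGE_SIZE - 1))

int main(void)
{
	unsigned long pos = 4000, count = 500;

	while (count) {
		unsigned long room = PAGE_SIZE - offset_in_page(pos);
		unsigned long n = count < room ? count : room;

		printf("copy %lu bytes at %lu\n", n, pos);	/* 96, then 404 */
		pos += n;
		count -= n;
	}
	return 0;
}
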
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+			   loff_t pos)
+{
+	if (pos + count > inode->i_sb->s_maxbytes)
+		return -EFBIG;
+
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *fsdata;
+		void *addr;
+		int res;
+
+		res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+					    &page, &fsdata);
+		if (res)
+			return res;
+
+		addr = kmap_atomic(page);
+		memcpy(addr + offset_in_page(pos), buf, n);
+		kunmap_atomic(addr);
+
+		res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+					  page, fsdata);
+		if (res < 0)
+			return res;
+		if (res != n)
+			return -EIO;
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	const int credits = 2; /* superblock and inode for ext4_orphan_add() */
+	handle_t *handle;
+	int err;
+
+	if (ext4_verity_in_progress(inode))
+		return -EBUSY;
+
+	/*
+	 * Since the file was opened readonly, we have to initialize the jbd
+	 * inode and quotas here and not rely on ->open() doing it.  This must
+	 * be done before evicting the inline data.
+	 */
+
+	err = ext4_inode_attach_jinode(inode);
+	if (err)
+		return err;
+
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
+
+	err = ext4_convert_inline_data(inode);
+	if (err)
+		return err;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ext4_warning_inode(inode,
+				   "verity is only allowed on extent-based files");
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * ext4 uses the last allocated block to find the verity descriptor, so
+	 * we must remove any other blocks past EOF which might confuse things.
+	 */
+	err = ext4_truncate(inode);
+	if (err)
+		return err;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_orphan_add(handle, inode);
+	if (err == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+	ext4_journal_stop(handle);
+	return err;
+}
+
+/*
+ * ext4 stores the verity descriptor beginning on the next filesystem block
+ * boundary after the Merkle tree.  Then, the descriptor size is stored in the
+ * last 4 bytes of the last allocated filesystem block --- which is either the
+ * block in which the descriptor ends, or the next block after that if there
+ * weren't at least 4 bytes remaining.
+ *
+ * We can't simply store the descriptor in an xattr because it *must* be
+ * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
+ * xattrs.  Also, if the descriptor includes a large signature blob it may be
+ * too large to store in an xattr without the EA_INODE feature.
+ */
+static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
+					size_t desc_size, u64 merkle_tree_size)
+{
+	const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
+				      merkle_tree_size, i_blocksize(inode));
+	const u64 desc_end = desc_pos + desc_size;
+	const __le32 desc_size_disk = cpu_to_le32(desc_size);
+	const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
+					   i_blocksize(inode)) -
+				  sizeof(desc_size_disk);
+	int err;
+
+	err = pagecache_write(inode, desc, desc_size, desc_pos);
+	if (err)
+		return err;
+
+	return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
+			       desc_size_pos);
+}
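
Aside: a worked example of the placement math with made-up sizes (4096-byte
blocks, an i_size of 1000000 bytes, a 12288-byte Merkle tree, a 200-byte
descriptor):

#include <stdio.h>

static unsigned long long roundup_ull(unsigned long long x, unsigned long long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long long i_size = 1000000, blocksize = 4096;
	unsigned long long tree_size = 12288, desc_size = 200;

	unsigned long long meta_pos = roundup_ull(i_size, 65536);
	unsigned long long desc_pos = roundup_ull(meta_pos + tree_size, blocksize);
	unsigned long long size_pos =
		roundup_ull(desc_pos + desc_size + 4, blocksize) - 4;

	/* prints: metadata @1048576, descriptor @1060864, size field @1064956 */
	printf("metadata @%llu, descriptor @%llu, size field @%llu\n",
	       meta_pos, desc_pos, size_pos);
	return 0;
}
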
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+				  size_t desc_size, u64 merkle_tree_size)
+{
+	struct inode *inode = file_inode(filp);
+	const int credits = 2; /* superblock and inode for ext4_orphan_del() */
+	handle_t *handle;
+	int err = 0;
+	int err2;
+
+	if (desc != NULL) {
+		/* Succeeded; write the verity descriptor. */
+		err = ext4_write_verity_descriptor(inode, desc, desc_size,
+						   merkle_tree_size);
+
+		/* Write all pages before clearing VERITY_IN_PROGRESS. */
+		if (!err)
+			err = filemap_write_and_wait(inode->i_mapping);
+	}
+
+	/* If we failed, truncate anything we wrote past i_size. */
+	if (desc == NULL || err)
+		ext4_truncate(inode);
+
+	/*
+	 * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+	 * deleting the inode from the orphan list, even if something failed.
+	 * If everything succeeded, we'll also set the verity bit in the same
+	 * transaction.
+	 */
+
+	ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+	if (IS_ERR(handle)) {
+		ext4_orphan_del(NULL, inode);
+		return PTR_ERR(handle);
+	}
+
+	err2 = ext4_orphan_del(handle, inode);
+	if (err2)
+		goto out_stop;
+
+	if (desc != NULL && !err) {
+		struct ext4_iloc iloc;
+
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto out_stop;
+		ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+		ext4_set_inode_flags(inode);
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	}
+out_stop:
+	ext4_journal_stop(handle);
+	return err ?: err2;
+}
+
+static int ext4_get_verity_descriptor_location(struct inode *inode,
+					       size_t *desc_size_ret,
+					       u64 *desc_pos_ret)
+{
+	struct ext4_ext_path *path;
+	struct ext4_extent *last_extent;
+	u32 end_lblk;
+	u64 desc_size_pos;
+	__le32 desc_size_disk;
+	u32 desc_size;
+	u64 desc_pos;
+	int err;
+
+	/*
+	 * Descriptor size is in last 4 bytes of last allocated block.
+	 * See ext4_write_verity_descriptor().
+	 */
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
+		return -EFSCORRUPTED;
+	}
+
+	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	last_extent = path[path->p_depth].p_ext;
+	if (!last_extent) {
+		EXT4_ERROR_INODE(inode, "verity file has no extents");
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		return -EFSCORRUPTED;
+	}
+
+	end_lblk = le32_to_cpu(last_extent->ee_block) +
+		   ext4_ext_get_actual_len(last_extent);
+	desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	if (desc_size_pos < sizeof(desc_size_disk))
+		goto bad;
+	desc_size_pos -= sizeof(desc_size_disk);
+
+	err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
+			     desc_size_pos);
+	if (err)
+		return err;
+	desc_size = le32_to_cpu(desc_size_disk);
+
+	/*
+	 * The descriptor is stored just before the desc_size_disk, but starting
+	 * on a filesystem block boundary.
+	 */
+
+	if (desc_size > INT_MAX || desc_size > desc_size_pos)
+		goto bad;
+
+	desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
+	if (desc_pos < ext4_verity_metadata_pos(inode))
+		goto bad;
+
+	*desc_size_ret = desc_size;
+	*desc_pos_ret = desc_pos;
+	return 0;
+
+bad:
+	EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
+	return -EFSCORRUPTED;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+				      size_t buf_size)
+{
+	size_t desc_size = 0;
+	u64 desc_pos = 0;
+	int err;
+
+	err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
+	if (err)
+		return err;
+
+	if (buf_size) {
+		if (desc_size > buf_size)
+			return -ERANGE;
+		err = pagecache_read(inode, buf, desc_size, desc_pos);
+		if (err)
+			return err;
+	}
+	return desc_size;
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+					       pgoff_t index)
+{
+	index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+	return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+					u64 index, int log_blocksize)
+{
+	loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+
+	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+	.begin_enable_verity	= ext4_begin_enable_verity,
+	.end_enable_verity	= ext4_end_enable_verity,
+	.get_verity_descriptor	= ext4_get_verity_descriptor,
+	.read_merkle_tree_page	= ext4_read_merkle_tree_page,
+	.write_merkle_tree_block = ext4_write_merkle_tree_block,
+};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 4380c86..491f9ee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -384,7 +384,7 @@
 	struct inode *inode;
 	int err;
 
-	inode = ext4_iget(parent->i_sb, ea_ino);
+	inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		ext4_error(parent->i_sb,
@@ -522,14 +522,13 @@
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
 
-	error = -ENODATA;
 	if (!EXT4_I(inode)->i_file_acl)
-		goto cleanup;
+		return -ENODATA;
 	ea_idebug(inode, "reading block %llu",
 		  (unsigned long long)EXT4_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh)
-		goto cleanup;
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	error = ext4_xattr_check_block(inode, bh);
@@ -696,26 +695,23 @@
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
 
-	error = 0;
 	if (!EXT4_I(inode)->i_file_acl)
-		goto cleanup;
+		return 0;
 	ea_idebug(inode, "reading block %llu",
 		  (unsigned long long)EXT4_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	error = -EIO;
-	if (!bh)
-		goto cleanup;
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	error = ext4_xattr_check_block(inode, bh);
 	if (error)
 		goto cleanup;
 	ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
-	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
+	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
+					buffer_size);
 cleanup:
 	brelse(bh);
-
 	return error;
 }
 
@@ -830,9 +826,10 @@
 	}
 
 	if (EXT4_I(inode)->i_file_acl) {
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		if (!bh) {
-			ret = -EIO;
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 
@@ -1031,10 +1028,8 @@
 	inode_lock(ea_inode);
 
 	ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
-	if (ret) {
-		iloc.bh = NULL;
+	if (ret)
 		goto out;
-	}
 
 	ref_count = ext4_xattr_inode_get_ref(ea_inode);
 	ref_count += ref_change;
@@ -1080,12 +1075,10 @@
 	}
 
 	ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
-	iloc.bh = NULL;
 	if (ret)
 		ext4_warning_inode(ea_inode,
 				   "ext4_mark_iloc_dirty() failed ret=%d", ret);
 out:
-	brelse(iloc.bh);
 	inode_unlock(ea_inode);
 	return ret;
 }
@@ -1490,7 +1483,8 @@
 	}
 
 	while (ce) {
-		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value,
+				     EXT4_IGET_NORMAL);
 		if (!IS_ERR(ea_inode) &&
 		    !is_bad_inode(ea_inode) &&
 		    (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
@@ -1702,7 +1696,7 @@
 
 	/* No failures allowed past this point. */
 
-	if (!s->not_found && here->e_value_size && here->e_value_offs) {
+	if (!s->not_found && here->e_value_size && !here->e_value_inum) {
 		/* Remove the old value. */
 		void *first_val = s->base + min_offs;
 		size_t offs = le16_to_cpu(here->e_value_offs);
@@ -1825,16 +1819,15 @@
 
 	if (EXT4_I(inode)->i_file_acl) {
 		/* The inode already has an extended attribute block. */
-		bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		if (!bs->bh)
-			goto cleanup;
+		bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bs->bh))
+			return PTR_ERR(bs->bh);
 		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		error = ext4_xattr_check_block(inode, bs->bh);
 		if (error)
-			goto cleanup;
+			return error;
 		/* Find the named attribute. */
 		bs->s.base = BHDR(bs->bh);
 		bs->s.first = BFIRST(bs->bh);
@@ -1843,13 +1836,10 @@
 		error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
 					 i->name_index, i->name, 1);
 		if (error && error != -ENODATA)
-			goto cleanup;
+			return error;
 		bs->s.not_found = error;
 	}
-	error = 0;
-
-cleanup:
-	return error;
+	return 0;
 }
 
 static int
@@ -2278,9 +2268,9 @@
 
 	if (!EXT4_I(inode)->i_file_acl)
 		return NULL;
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh)
-		return ERR_PTR(-EIO);
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return bh;
 	error = ext4_xattr_check_block(inode, bh);
 	if (error) {
 		brelse(bh);
@@ -2733,7 +2723,7 @@
 	base = IFIRST(header);
 	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
 	min_offs = end - base;
-	total_ino = sizeof(struct ext4_xattr_ibody_header);
+	total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
 
 	error = xattr_check_inode(inode, header, end);
 	if (error)
@@ -2750,10 +2740,11 @@
 	if (EXT4_I(inode)->i_file_acl) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		if (!bh)
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
 			goto cleanup;
+		}
 		error = ext4_xattr_check_block(inode, bh);
 		if (error) {
 			brelse(bh);
@@ -2907,11 +2898,13 @@
 	}
 
 	if (EXT4_I(inode)->i_file_acl) {
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		if (!bh) {
-			EXT4_ERROR_INODE(inode, "block %llu read error",
-					 EXT4_I(inode)->i_file_acl);
-			error = -EIO;
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			if (error == -EIO)
+				EXT4_ERROR_INODE(inode, "block %llu read error",
+						 EXT4_I(inode)->i_file_acl);
+			bh = NULL;
 			goto cleanup;
 		}
 		error = ext4_xattr_check_block(inode, bh);
@@ -3064,8 +3057,11 @@
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_value);
-		if (!bh) {
+		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			if (PTR_ERR(bh) == -ENOMEM)
+				return NULL;
+			bh = NULL;
 			EXT4_ERROR_INODE(inode, "block %lu read error",
 					 (unsigned long)ce->e_value);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 9a20ef4..652fd2e 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,8 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config F2FS_FS
 	tristate "F2FS filesystem support"
 	depends on BLOCK
+	select NLS
 	select CRYPTO
 	select CRYPTO_CRC32
+	select F2FS_FS_XATTR if FS_ENCRYPTION
 	help
 	  F2FS is based on Log-structured File System (LFS), which supports
 	  versatile "flash-friendly" features. The design has been focused on
@@ -58,7 +61,9 @@
 	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
 	  Linux. This option enables an extended attribute handler for file
 	  security labels in the f2fs filesystem, so that it requires enabling
-	  the extended attribute support in advance.
+	  the extended attribute support in advance. In particular you need this
+	  option if you use the setcap command to assign initial process
+	  capabilities to executables (the security.* extended attributes).
 
 	  If you are not using a security module, say N.
 
@@ -70,17 +75,6 @@
 
 	  If you want to improve the performance, say N.
 
-config F2FS_FS_ENCRYPTION
-	bool "F2FS Encryption"
-	depends on F2FS_FS
-	depends on F2FS_FS_XATTR
-	select FS_ENCRYPTION
-	help
-	  Enable encryption of f2fs files and directories.  This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
 config F2FS_IO_TRACE
 	bool "F2FS IO tracer"
 	depends on F2FS_FS
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 776c4b9..2aaecc6 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -8,3 +8,4 @@
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
 f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
+f2fs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1118241..217b290 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/acl.c
  *
@@ -7,10 +8,6 @@
  * Portions of this code from linux/fs/ext2/acl.c
  *
  * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/f2fs_fs.h>
 #include "f2fs.h"
@@ -53,6 +50,9 @@
 	struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
 	const char *end = value + size;
 
+	if (size < sizeof(struct f2fs_acl_header))
+		return ERR_PTR(-EINVAL);
+
 	if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
 		return ERR_PTR(-EINVAL);
 
@@ -160,7 +160,7 @@
 	return (void *)f2fs_acl;
 
 fail:
-	kfree(f2fs_acl);
+	kvfree(f2fs_acl);
 	return ERR_PTR(-EINVAL);
 }
 
@@ -190,7 +190,7 @@
 		acl = NULL;
 	else
 		acl = ERR_PTR(retval);
-	kfree(value);
+	kvfree(value);
 
 	return acl;
 }
@@ -240,7 +240,7 @@
 
 	error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
 
-	kfree(value);
+	kvfree(value);
 	if (!error)
 		set_cached_acl(inode, type, acl);
 
@@ -285,7 +285,7 @@
 	/* assert(atomic_read(acl->a_refcount) == 1); */
 
 	FOREACH_ACL_ENTRY(pa, acl, pe) {
-		switch(pa->e_tag) {
+		switch (pa->e_tag) {
 		case ACL_USER_OBJ:
 			pa->e_perm &= (mode >> 6) | ~S_IRWXO;
 			mode &= (pa->e_perm << 6) | ~S_IRWXU;
@@ -326,7 +326,7 @@
 	}
 
 	*mode_p = (*mode_p & ~S_IRWXUGO) | mode;
-        return not_equiv;
+	return not_equiv;
 }
 
 static int f2fs_acl_create(struct inode *dir, umode_t *mode,
@@ -352,12 +352,14 @@
 		return PTR_ERR(p);
 
 	clone = f2fs_acl_clone(p, GFP_NOFS);
-	if (!clone)
-		goto no_mem;
+	if (!clone) {
+		ret = -ENOMEM;
+		goto release_acl;
+	}
 
 	ret = f2fs_acl_create_masq(clone, mode);
 	if (ret < 0)
-		goto no_mem_clone;
+		goto release_clone;
 
 	if (ret == 0)
 		posix_acl_release(clone);
@@ -371,11 +373,11 @@
 
 	return 0;
 
-no_mem_clone:
+release_clone:
 	posix_acl_release(clone);
-no_mem:
+release_acl:
 	posix_acl_release(p);
-	return -ENOMEM;
+	return ret;
 }
 
 int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
@@ -394,12 +396,16 @@
 		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
 				       ipage);
 		posix_acl_release(default_acl);
+	} else {
+		inode->i_default_acl = NULL;
 	}
 	if (acl) {
 		if (!error)
 			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
 					       ipage);
 		posix_acl_release(acl);
+	} else {
+		inode->i_acl = NULL;
 	}
 
 	return error;
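
The f2fs/acl.c changes above combine three fixes: f2fs_acl_from_disk() now
rejects a value shorter than the ACL header before touching it, the xattr
buffers switch to kvfree() to match their kvmalloc-style allocation, and
f2fs_acl_create() propagates the real errno through the relabeled error
path instead of always returning -ENOMEM. A hedged userspace model of the
new bounds check (the struct and parser are illustrative, not the on-disk
layout):

#include <errno.h>
#include <stdint.h>
#include <string.h>

struct acl_header {		/* illustrative, not the on-disk layout */
	uint32_t a_version;
};

static int parse_acl(const void *value, size_t size)
{
	struct acl_header hdr;

	/* reject before dereferencing: (hdr + 1) is only meaningful
	 * when the whole header fits inside the buffer */
	if (!value || size < sizeof(hdr))
		return -EINVAL;
	memcpy(&hdr, value, sizeof(hdr));
	/* ... entry parsing would walk [value + sizeof(hdr), value + size) ... */
	return 0;
}

int main(void)
{
	char buf[2] = { 0 };	/* too short to hold a header */

	return parse_acl(buf, sizeof(buf)) == -EINVAL ? 0 : 1;
}
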
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 2c68518..b96823c 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/acl.h
  *
@@ -7,10 +8,6 @@
  * Portions of this code from linux/fs/ext2/acl.h
  *
  * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef __F2FS_ACL_H__
 #define __F2FS_ACL_H__
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 59d0472..a0eef95 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/checkpoint.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/bio.h>
@@ -47,7 +44,7 @@
 		cond_resched();
 		goto repeat;
 	}
-	f2fs_wait_on_page_writeback(page, META, true);
+	f2fs_wait_on_page_writeback(page, META, true, true);
 	if (!PageUptodate(page))
 		SetPageUptodate(page);
 	return page;
@@ -69,7 +66,7 @@
 		.old_blkaddr = index,
 		.new_blkaddr = index,
 		.encrypted_page = NULL,
-		.is_meta = is_meta,
+		.is_por = !is_meta,
 	};
 	int err;
 
@@ -122,11 +119,8 @@
 		if (PTR_ERR(page) == -EIO &&
 				++count <= DEFAULT_RETRY_IO_COUNT)
 			goto retry;
-
 		f2fs_stop_checkpoint(sbi, false);
-		f2fs_bug_on(sbi, 1);
 	}
-
 	return page;
 }
 
@@ -136,6 +130,30 @@
 	return __get_meta_page(sbi, index, false);
 }
 
+static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
+							int type)
+{
+	struct seg_entry *se;
+	unsigned int segno, offset;
+	bool exist;
+
+	if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ)
+		return true;
+
+	segno = GET_SEGNO(sbi, blkaddr);
+	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+	se = get_seg_entry(sbi, segno);
+
+	exist = f2fs_test_bit(offset, se->cur_valid_map);
+	if (!exist && type == DATA_GENERIC_ENHANCE) {
+		f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+			 blkaddr, exist);
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		WARN_ON(1);
+	}
+	return exist;
+}
+
 bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
@@ -157,15 +175,22 @@
 			return false;
 		break;
 	case META_POR:
-	case DATA_GENERIC:
 		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
-			blkaddr < MAIN_BLKADDR(sbi))) {
-			if (type == DATA_GENERIC) {
-				f2fs_msg(sbi->sb, KERN_WARNING,
-					"access invalid blkaddr:%u", blkaddr);
-				WARN_ON(1);
-			}
+			blkaddr < MAIN_BLKADDR(sbi)))
 			return false;
+		break;
+	case DATA_GENERIC:
+	case DATA_GENERIC_ENHANCE:
+	case DATA_GENERIC_ENHANCE_READ:
+		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
+				blkaddr < MAIN_BLKADDR(sbi))) {
+			f2fs_warn(sbi, "access invalid blkaddr:%u",
+				  blkaddr);
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+			WARN_ON(1);
+			return false;
+		} else {
+			return __is_bitmap_valid(sbi, blkaddr, type);
 		}
 		break;
 	case META_GENERIC:
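
The DATA_GENERIC_ENHANCE checks introduced above go one step beyond the
old range test: once the address is known to fall inside the main area,
__is_bitmap_valid() also looks the block up in its segment's current
validity bitmap, so a dangling pointer to a freed block is caught even
though the address is in range. A simplified model of that
segment/offset/bitmap lookup, assuming the usual 512-block f2fs segment
and eliding the internals of GET_SEGNO()/GET_BLKOFF_FROM_SEG0():

#include <stdint.h>
#include <stdio.h>

#define BLKS_PER_SEG 512u	/* 2 MiB segment of 4 KiB blocks */

static int test_bit(const uint8_t *map, unsigned int nr)
{
	return (map[nr >> 3] >> (nr & 7)) & 1;
}

int main(void)
{
	uint8_t valid_map[BLKS_PER_SEG / 8] = { 0 };
	uint32_t blkaddr = 1234, main_blkaddr = 1024;	/* made-up addresses */
	uint32_t segno  = (blkaddr - main_blkaddr) / BLKS_PER_SEG;
	uint32_t offset = (blkaddr - main_blkaddr) % BLKS_PER_SEG;

	valid_map[offset >> 3] |= 1u << (offset & 7);	/* mark allocated */
	printf("segno %u offset %u valid=%d\n",
	       segno, offset, test_bit(valid_map, offset));
	return 0;
}
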
@@ -195,7 +220,7 @@
 		.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
 		.encrypted_page = NULL,
 		.in_list = false,
-		.is_meta = (type != META_POR),
+		.is_por = (type == META_POR),
 	};
 	struct blk_plug plug;
 
@@ -282,8 +307,7 @@
 	dec_page_count(sbi, F2FS_DIRTY_META);
 
 	if (wbc->for_reclaim)
-		f2fs_submit_merged_write_cond(sbi, page->mapping->host,
-						0, page->index, META);
+		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);
 
 	unlock_page(page);
 
@@ -313,8 +337,9 @@
 		goto skip_write;
 
 	/* collect a number of dirty meta pages and write together */
-	if (wbc->for_kupdate ||
-		get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+	if (wbc->sync_mode != WB_SYNC_ALL &&
+			get_pages(sbi, F2FS_DIRTY_META) <
+					nr_pages_to_skip(sbi, META))
 		goto skip_write;
 
 	/* if locked failed, cp will flush dirty pages instead */
@@ -377,9 +402,8 @@
 				goto continue_unlock;
 			}
 
-			f2fs_wait_on_page_writeback(page, META, true);
+			f2fs_wait_on_page_writeback(page, META, true, true);
 
-			BUG_ON(PageWriteback(page));
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
@@ -413,7 +437,7 @@
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
-		SetPagePrivate(page);
+		f2fs_set_page_private(page, 0);
 		f2fs_trace_pid(page);
 		return 1;
 	}
@@ -633,9 +657,8 @@
 
 err_out:
 	set_sbi_flag(sbi, SBI_NEED_FSCK);
-	f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: orphan failed (ino=%x), run fsck to fix.",
-			__func__, ino);
+	f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.",
+		  __func__, ino);
 	return err;
 }
 
@@ -651,8 +674,13 @@
 	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
+	if (bdev_read_only(sbi->sb->s_bdev)) {
+		f2fs_info(sbi, "write access unavailable, skipping orphan cleanup");
+		return 0;
+	}
+
 	if (s_flags & SB_RDONLY) {
-		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
+		f2fs_info(sbi, "orphan cleanup on readonly fs");
 		sbi->sb->s_flags &= ~SB_RDONLY;
 	}
 
@@ -765,13 +793,27 @@
 	}
 }
 
+static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi,
+						struct f2fs_checkpoint *ckpt)
+{
+	unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset);
+	__u32 chksum;
+
+	chksum = f2fs_crc32(sbi, ckpt, chksum_ofs);
+	if (chksum_ofs < CP_CHKSUM_OFFSET) {
+		chksum_ofs += sizeof(chksum);
+		chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs,
+						F2FS_BLKSIZE - chksum_ofs);
+	}
+	return chksum;
+}
+
 static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
 		struct f2fs_checkpoint **cp_block, struct page **cp_page,
 		unsigned long long *version)
 {
-	unsigned long blk_size = sbi->blocksize;
 	size_t crc_offset = 0;
-	__u32 crc = 0;
+	__u32 crc;
 
 	*cp_page = f2fs_get_meta_page(sbi, cp_addr);
 	if (IS_ERR(*cp_page))
@@ -780,17 +822,17 @@
 	*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
 
 	crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
-	if (crc_offset > (blk_size - sizeof(__le32))) {
+	if (crc_offset < CP_MIN_CHKSUM_OFFSET ||
+			crc_offset > CP_CHKSUM_OFFSET) {
 		f2fs_put_page(*cp_page, 1);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"invalid crc_offset: %zu", crc_offset);
+		f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset);
 		return -EINVAL;
 	}
 
-	crc = cur_cp_crc(*cp_block);
-	if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
+	crc = f2fs_checkpoint_chksum(sbi, *cp_block);
+	if (crc != cur_cp_crc(*cp_block)) {
 		f2fs_put_page(*cp_page, 1);
-		f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
+		f2fs_warn(sbi, "invalid crc value");
 		return -EINVAL;
 	}
 
@@ -813,9 +855,8 @@
 
 	if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
 					sbi->blocks_per_seg) {
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"invalid cp_pack_total_block_count:%u",
-			le32_to_cpu(cp_block->cp_pack_total_block_count));
+		f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
+			  le32_to_cpu(cp_block->cp_pack_total_block_count));
 		goto invalid_cp;
 	}
 	pre_version = *version;
@@ -849,6 +890,7 @@
 	unsigned int cp_blks = 1 + __cp_payload(sbi);
 	block_t cp_blk_no;
 	int i;
+	int err;
 
 	sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks),
 				 GFP_KERNEL);
@@ -876,6 +918,7 @@
 	} else if (cp2) {
 		cur_page = cp2;
 	} else {
+		err = -EFSCORRUPTED;
 		goto fail_no_cp;
 	}
 
@@ -888,8 +931,10 @@
 		sbi->cur_cp_pack = 2;
 
 	/* Sanity checking of checkpoint */
-	if (f2fs_sanity_check_ckpt(sbi))
+	if (f2fs_sanity_check_ckpt(sbi)) {
+		err = -EFSCORRUPTED;
 		goto free_fail_no_cp;
+	}
 
 	if (cp_blks <= 1)
 		goto done;
@@ -903,8 +948,10 @@
 		unsigned char *ckpt = (unsigned char *)sbi->ckpt;
 
 		cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i);
-		if (IS_ERR(cur_page))
+		if (IS_ERR(cur_page)) {
+			err = PTR_ERR(cur_page);
 			goto free_fail_no_cp;
+		}
 		sit_bitmap_ptr = page_address(cur_page);
 		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
 		f2fs_put_page(cur_page, 1);
@@ -918,8 +965,8 @@
 	f2fs_put_page(cp1, 1);
 	f2fs_put_page(cp2, 1);
 fail_no_cp:
-	kfree(sbi->ckpt);
-	return -EINVAL;
+	kvfree(sbi->ckpt);
+	return err;
 }
 
 static void __add_dirty_inode(struct inode *inode, enum inode_type type)
@@ -964,7 +1011,7 @@
 	inode_inc_dirty_pages(inode);
 	spin_unlock(&sbi->inode_lock[type]);
 
-	SetPagePrivate(page);
+	f2fs_set_page_private(page, 0);
 	f2fs_trace_pid(page);
 }
 
@@ -1016,13 +1063,11 @@
 	if (inode) {
 		unsigned long cur_ino = inode->i_ino;
 
-		if (is_dir)
-			F2FS_I(inode)->cp_task = current;
+		F2FS_I(inode)->cp_task = current;
 
 		filemap_fdatawrite(inode->i_mapping);
 
-		if (is_dir)
-			F2FS_I(inode)->cp_task = NULL;
+		F2FS_I(inode)->cp_task = NULL;
 
 		iput(inode);
 		/* We need to give cpu to another writers. */
@@ -1086,6 +1131,28 @@
 	ckpt->next_free_nid = cpu_to_le32(last_nid);
 }
 
+static bool __need_flush_quota(struct f2fs_sb_info *sbi)
+{
+	bool ret = false;
+
+	if (!is_journalled_quota(sbi))
+		return false;
+
+	down_write(&sbi->quota_sem);
+	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
+		ret = false;
+	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
+		ret = false;
+	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) {
+		clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+		ret = true;
+	} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
+		ret = true;
+	}
+	up_write(&sbi->quota_sem);
+	return ret;
+}
+
 /*
  * Freeze all the FS-operations for checkpoint.
  */
@@ -1097,12 +1164,32 @@
 		.for_reclaim = 0,
 	};
 	struct blk_plug plug;
-	int err = 0;
+	int err = 0, cnt = 0;
 
 	blk_start_plug(&plug);
 
-retry_flush_dents:
+retry_flush_quotas:
 	f2fs_lock_all(sbi);
+	if (__need_flush_quota(sbi)) {
+		int locked;
+
+		if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
+			set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+			set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+			goto retry_flush_dents;
+		}
+		f2fs_unlock_all(sbi);
+
+		/* the trylock only fails during mount/umount/freeze/quotactl */
+		locked = down_read_trylock(&sbi->sb->s_umount);
+		f2fs_quota_sync(sbi->sb, -1);
+		if (locked)
+			up_read(&sbi->sb->s_umount);
+		cond_resched();
+		goto retry_flush_quotas;
+	}
+
+retry_flush_dents:
 	/* write all the dirty dentry pages */
 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
 		f2fs_unlock_all(sbi);
@@ -1110,7 +1197,7 @@
 		if (err)
 			goto out;
 		cond_resched();
-		goto retry_flush_dents;
+		goto retry_flush_quotas;
 	}
 
 	/*
@@ -1126,7 +1213,7 @@
 		if (err)
 			goto out;
 		cond_resched();
-		goto retry_flush_dents;
+		goto retry_flush_quotas;
 	}
 
 retry_flush_nodes:
@@ -1214,9 +1301,28 @@
 	else
 		__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
-	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
+		is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
 		__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
 
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+		__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+	else
+		__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
+		__set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+	else
+		__clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+
+	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
+		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+	else
+		__clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
+	if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
+		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
 	/* set this flag to activate crc|cp_ver for recovery */
 	__set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
 	__clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);
@@ -1239,11 +1345,11 @@
 	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
 	int err;
 
-	memcpy(page_address(page), src, PAGE_SIZE);
-	set_page_dirty(page);
+	f2fs_wait_on_page_writeback(page, META, true, true);
 
-	f2fs_wait_on_page_writeback(page, META, true);
-	f2fs_bug_on(sbi, PageWriteback(page));
+	memcpy(page_address(page), src, PAGE_SIZE);
+
+	set_page_dirty(page);
 	if (unlikely(!clear_page_dirty_for_io(page)))
 		f2fs_bug_on(sbi, 1);
 
@@ -1277,11 +1383,9 @@
 	int err;
 
 	/* Flush all the NAT/SIT pages */
-	while (get_pages(sbi, F2FS_DIRTY_META)) {
-		f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
-		if (unlikely(f2fs_cp_error(sbi)))
-			break;
-	}
+	f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+	f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+					!f2fs_cp_error(sbi));
 
 	/*
 	 * modify checkpoint
@@ -1335,7 +1439,7 @@
 	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
 
-	crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
+	crc32 = f2fs_checkpoint_chksum(sbi, ckpt);
 	*((__le32 *)((unsigned char *)ckpt +
 				le32_to_cpu(ckpt->checksum_offset)))
 				= cpu_to_le32(crc32);
@@ -1354,14 +1458,6 @@
 		for (i = 0; i < nm_i->nat_bits_blocks; i++)
 			f2fs_update_meta_page(sbi, nm_i->nat_bits +
 					(i << F2FS_BLKSIZE_BITS), blk + i);
-
-		/* Flush all the NAT BITS pages */
-		while (get_pages(sbi, F2FS_DIRTY_META)) {
-			f2fs_sync_meta_pages(sbi, META, LONG_MAX,
-							FS_CP_META_IO);
-			if (unlikely(f2fs_cp_error(sbi)))
-				break;
-		}
 	}
 
 	/* write out checkpoint buffer at block 0 */
@@ -1397,6 +1493,8 @@
 
 	/* Here, we have one bio having CP pack except cp pack 2 page */
 	f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+	f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+					!f2fs_cp_error(sbi));
 
 	/* wait for previous submitted meta pages writeback */
 	f2fs_wait_on_all_pages_writeback(sbi);
@@ -1414,7 +1512,7 @@
 	 * invalidate intermediate page cache borrowed from meta inode
 	 * which are used for migration of encrypted inode's blocks.
 	 */
-	if (f2fs_sb_has_encrypt(sbi->sb))
+	if (f2fs_sb_has_encrypt(sbi))
 		invalidate_mapping_pages(META_MAPPING(sbi),
 				MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
 
@@ -1424,6 +1522,12 @@
 
 	clear_sbi_flag(sbi, SBI_IS_DIRTY);
 	clear_sbi_flag(sbi, SBI_NEED_CP);
+	clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+
+	spin_lock(&sbi->stat_lock);
+	sbi->unusable_block_count = 0;
+	spin_unlock(&sbi->stat_lock);
+
 	__set_cp_next_pack(sbi);
 
 	/*
@@ -1448,6 +1552,14 @@
 	unsigned long long ckpt_ver;
 	int err = 0;
 
+	if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
+		return -EROFS;
+
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		if (cpc->reason != CP_PAUSE)
+			return 0;
+		f2fs_warn(sbi, "Start checkpoint disabled!");
+	}
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
@@ -1458,10 +1570,6 @@
 		err = -EIO;
 		goto out;
 	}
-	if (f2fs_readonly(sbi->sb)) {
-		err = -EROFS;
-		goto out;
-	}
 
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
 
@@ -1499,7 +1607,10 @@
 	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
-	f2fs_flush_nat_entries(sbi, cpc);
+	err = f2fs_flush_nat_entries(sbi, cpc);
+	if (err)
+		goto stop;
+
 	f2fs_flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
@@ -1508,13 +1619,12 @@
 		f2fs_release_discard_addrs(sbi);
 	else
 		f2fs_clear_prefree_segments(sbi, cpc);
-
+stop:
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
 
 	if (cpc->reason & CP_RECOVERY)
-		f2fs_msg(sbi->sb, KERN_NOTICE,
-			"checkpoint: version = %llx", ckpt_ver);
+		f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
 
 	/* do checkpoint periodically */
 	f2fs_update_time(sbi, CP_TIME);
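
Among the checkpoint.c changes above, f2fs_checkpoint_chksum() is worth
unpacking: the checksum field may sit either at the legacy mid-block
offset or at the very end of the 4 KiB checkpoint block, and the new
helper covers both layouts by hashing the block as up to two chained
spans that skip the 4-byte field itself. A sketch of that walk, with
chain() as an arbitrary stand-in for the kernel's
f2fs_crc32()/f2fs_chksum() pair:

#include <stddef.h>
#include <stdint.h>

#define BLKSIZE 4096u

/* stand-in digest; any chainable checksum illustrates the span layout */
static uint32_t chain(uint32_t acc, const uint8_t *p, size_t len)
{
	while (len--)
		acc = acc * 31 + *p++;
	return acc;
}

static uint32_t cp_chksum(const uint8_t *blk, size_t chksum_ofs)
{
	uint32_t acc = chain(0, blk, chksum_ofs);

	/* legacy layout: the checksum field sits mid-block, so the tail
	 * after the 4-byte field is hashed as a second span */
	if (chksum_ofs + sizeof(uint32_t) < BLKSIZE)
		acc = chain(acc, blk + chksum_ofs + sizeof(uint32_t),
			    BLKSIZE - chksum_ofs - sizeof(uint32_t));
	return acc;
}

int main(void)
{
	uint8_t blk[BLKSIZE] = { 0 };

	/* end-of-block layout: single span, no tail */
	return cp_chksum(blk, BLKSIZE - sizeof(uint32_t)) ? 1 : 0;
}
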
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 11f2834..5755e89 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/data.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -17,6 +14,7 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
+#include <linux/swap.h>
 #include <linux/prefetch.h>
 #include <linux/uio.h>
 #include <linux/cleancache.h>
@@ -49,16 +47,34 @@
 			inode->i_ino ==  F2FS_NODE_INO(sbi) ||
 			S_ISDIR(inode->i_mode) ||
 			(S_ISREG(inode->i_mode) &&
-			is_inode_flag_set(inode, FI_ATOMIC_FILE)) ||
+			(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
 			is_cold_data(page))
 		return true;
 	return false;
 }
 
+static enum count_type __read_io_type(struct page *page)
+{
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (mapping) {
+		struct inode *inode = mapping->host;
+		struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			return F2FS_RD_META;
+
+		if (inode->i_ino == F2FS_NODE_INO(sbi))
+			return F2FS_RD_NODE;
+	}
+	return F2FS_RD_DATA;
+}
+
 /* postprocessing steps for read bios */
 enum bio_post_read_step {
 	STEP_INITIAL = 0,
 	STEP_DECRYPT,
+	STEP_VERITY,
 };
 
 struct bio_post_read_ctx {
@@ -72,9 +88,9 @@
 {
 	struct page *page;
 	struct bio_vec *bv;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, iter_all) {
 		page = bv->bv_page;
 
 		/* PG_error was set if any post_read step failed */
@@ -85,6 +101,7 @@
 		} else {
 			SetPageUptodate(page);
 		}
+		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
 		unlock_page(page);
 	}
 	if (bio->bi_private)
@@ -104,8 +121,23 @@
 	bio_post_read_processing(ctx);
 }
 
+static void verity_work(struct work_struct *work)
+{
+	struct bio_post_read_ctx *ctx =
+		container_of(work, struct bio_post_read_ctx, work);
+
+	fsverity_verify_bio(ctx->bio);
+
+	bio_post_read_processing(ctx);
+}
+
 static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
 {
+	/*
+	 * We use different work queues for decryption and for verity because
+	 * verity may require reading metadata pages that need decryption, and
+	 * we shouldn't recurse to the same workqueue.
+	 */
 	switch (++ctx->cur_step) {
 	case STEP_DECRYPT:
 		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
@@ -115,6 +147,14 @@
 		}
 		ctx->cur_step++;
 		/* fall-through */
+	case STEP_VERITY:
+		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+			INIT_WORK(&ctx->work, verity_work);
+			fsverity_enqueue_verify_work(&ctx->work);
+			return;
+		}
+		ctx->cur_step++;
+		/* fall-through */
 	default:
 		__read_end_io(ctx->bio);
 	}
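
The post-read pipeline above threads each bio through an ordered set of
steps, decryption first and then verity, with each step bounced to its
own workqueue so that verity's metadata reads cannot recurse into the
queue that is busy decrypting. Stripped of the workqueue hop, the
dispatch is a bitmask walk over an ordered enum; a toy model of that
control flow (STEP_LAST is a sentinel added for the sketch):

#include <stdio.h>

enum step { STEP_INITIAL, STEP_DECRYPT, STEP_VERITY, STEP_LAST };

int main(void)
{
	unsigned enabled = (1u << STEP_DECRYPT) | (1u << STEP_VERITY);
	int cur = STEP_INITIAL;

	/* mirror of bio_post_read_processing(): advance cur_step and run
	 * a step only when its bit is set; with no step left, complete */
	while (++cur < STEP_LAST) {
		if (enabled & (1u << cur))
			printf("run step %d, then re-enter\n", cur);
	}
	printf("__read_end_io()\n");
	return 0;
}
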
@@ -127,8 +167,9 @@
 
 static void f2fs_read_end_io(struct bio *bio)
 {
-	if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
-		f2fs_show_injection_info(FAULT_IO);
+	if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
+						FAULT_READ_IO)) {
+		f2fs_show_injection_info(FAULT_READ_IO);
 		bio->bi_status = BLK_STS_IOERR;
 	}
 
@@ -147,9 +188,14 @@
 {
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	if (time_to_inject(sbi, FAULT_WRITE_IO)) {
+		f2fs_show_injection_info(FAULT_WRITE_IO);
+		bio->bi_status = BLK_STS_IOERR;
+	}
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 		enum count_type type = WB_DATA_TYPE(page);
 
@@ -164,7 +210,7 @@
 			continue;
 		}
 
-		fscrypt_pullback_bio_page(&page, true);
+		fscrypt_finalize_bounce_page(&page);
 
 		if (unlikely(bio->bi_status)) {
 			mapping_set_error(page->mapping, -EIO);
@@ -197,12 +243,14 @@
 	struct block_device *bdev = sbi->sb->s_bdev;
 	int i;
 
-	for (i = 0; i < sbi->s_ndevs; i++) {
-		if (FDEV(i).start_blk <= blk_addr &&
-					FDEV(i).end_blk >= blk_addr) {
-			blk_addr -= FDEV(i).start_blk;
-			bdev = FDEV(i).bdev;
-			break;
+	if (f2fs_is_multi_device(sbi)) {
+		for (i = 0; i < sbi->s_ndevs; i++) {
+			if (FDEV(i).start_blk <= blk_addr &&
+			    FDEV(i).end_blk >= blk_addr) {
+				blk_addr -= FDEV(i).start_blk;
+				bdev = FDEV(i).bdev;
+				break;
+			}
 		}
 	}
 	if (bio) {
@@ -216,6 +264,9 @@
 {
 	int i;
 
+	if (!f2fs_is_multi_device(sbi))
+		return 0;
+
 	for (i = 0; i < sbi->s_ndevs; i++)
 		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
 			return i;
@@ -232,26 +283,25 @@
 /*
  * Low-level block read/write IO operations.
  */
-static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
-				struct writeback_control *wbc,
-				int npages, bool is_read,
-				enum page_type type, enum temp_type temp)
+static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
 {
+	struct f2fs_sb_info *sbi = fio->sbi;
 	struct bio *bio;
 
 	bio = f2fs_bio_alloc(sbi, npages, true);
 
-	f2fs_target_device(sbi, blk_addr, bio);
-	if (is_read) {
+	f2fs_target_device(sbi, fio->new_blkaddr, bio);
+	if (is_read_io(fio->op)) {
 		bio->bi_end_io = f2fs_read_end_io;
 		bio->bi_private = NULL;
 	} else {
 		bio->bi_end_io = f2fs_write_end_io;
 		bio->bi_private = sbi;
-		bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp);
+		bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
+						fio->type, fio->temp);
 	}
-	if (wbc)
-		wbc_init_bio(wbc, bio);
+	if (fio->io_wbc)
+		wbc_init_bio(fio->io_wbc, bio);
 
 	return bio;
 }
@@ -268,6 +318,9 @@
 		if (test_opt(sbi, LFS) && current->plug)
 			blk_finish_plug(current->plug);
 
+		if (F2FS_IO_ALIGNED(sbi))
+			goto submit_io;
+
 		start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
 		start %= F2FS_IO_SIZE(sbi);
 
@@ -278,9 +331,10 @@
 		for (; start < F2FS_IO_SIZE(sbi); start++) {
 			struct page *page =
 				mempool_alloc(sbi->write_io_dummy,
-					GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+					      GFP_NOIO | __GFP_NOFAIL);
 			f2fs_bug_on(sbi, !page);
 
+			zero_user_segment(page, 0, PAGE_SIZE);
 			SetPagePrivate(page);
 			set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
 			lock_page(page);
@@ -320,31 +374,29 @@
 	io->bio = NULL;
 }
 
-static bool __has_merged_page(struct f2fs_bio_info *io,
-				struct inode *inode, nid_t ino, pgoff_t idx)
+static bool __has_merged_page(struct bio *bio, struct inode *inode,
+						struct page *page, nid_t ino)
 {
 	struct bio_vec *bvec;
 	struct page *target;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	if (!io->bio)
+	if (!bio)
 		return false;
 
-	if (!inode && !ino)
+	if (!inode && !page && !ino)
 		return true;
 
-	bio_for_each_segment_all(bvec, io->bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 
-		if (bvec->bv_page->mapping)
-			target = bvec->bv_page;
-		else
-			target = fscrypt_control_page(bvec->bv_page);
-
-		if (idx != target->index)
-			continue;
+		target = bvec->bv_page;
+		if (fscrypt_is_bounce_page(target))
+			target = fscrypt_pagecache_page(target);
 
 		if (inode && inode == target->mapping->host)
 			return true;
+		if (page && page == target)
+			return true;
 		if (ino && ino == ino_of_node(target))
 			return true;
 	}
@@ -352,28 +404,6 @@
 	return false;
 }
 
-static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
-				nid_t ino, pgoff_t idx, enum page_type type)
-{
-	enum page_type btype = PAGE_TYPE_OF_BIO(type);
-	enum temp_type temp;
-	struct f2fs_bio_info *io;
-	bool ret = false;
-
-	for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
-		io = sbi->write_io[btype] + temp;
-
-		down_read(&io->io_rwsem);
-		ret = __has_merged_page(io, inode, ino, idx);
-		up_read(&io->io_rwsem);
-
-		/* TODO: use HOT temp only for meta pages now. */
-		if (ret || btype == META)
-			break;
-	}
-	return ret;
-}
-
 static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
 				enum page_type type, enum temp_type temp)
 {
@@ -395,17 +425,23 @@
 }
 
 static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
-				struct inode *inode, nid_t ino, pgoff_t idx,
-				enum page_type type, bool force)
+				struct inode *inode, struct page *page,
+				nid_t ino, enum page_type type, bool force)
 {
 	enum temp_type temp;
-
-	if (!force && !has_merged_page(sbi, inode, ino, idx, type))
-		return;
+	bool ret = true;
 
 	for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
+		if (!force) {
+			enum page_type btype = PAGE_TYPE_OF_BIO(type);
+			struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
 
-		__f2fs_submit_merged_write(sbi, type, temp);
+			down_read(&io->io_rwsem);
+			ret = __has_merged_page(io->bio, inode, page, ino);
+			up_read(&io->io_rwsem);
+		}
+		if (ret)
+			__f2fs_submit_merged_write(sbi, type, temp);
 
 		/* TODO: use HOT temp only for meta pages now. */
 		if (type >= META)
@@ -415,14 +451,14 @@
 
 void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
 {
-	__submit_merged_write_cond(sbi, NULL, 0, 0, type, true);
+	__submit_merged_write_cond(sbi, NULL, NULL, 0, type, true);
 }
 
 void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
-				struct inode *inode, nid_t ino, pgoff_t idx,
-				enum page_type type)
+				struct inode *inode, struct page *page,
+				nid_t ino, enum page_type type)
 {
-	__submit_merged_write_cond(sbi, inode, ino, idx, type, false);
+	__submit_merged_write_cond(sbi, inode, page, ino, type, false);
 }
 
 void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
@@ -443,15 +479,15 @@
 			fio->encrypted_page : fio->page;
 
 	if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
-			__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
-		return -EFAULT;
+			fio->is_por ? META_POR : (__is_meta_io(fio) ?
+			META_GENERIC : DATA_GENERIC_ENHANCE)))
+		return -EFSCORRUPTED;
 
 	trace_f2fs_submit_page_bio(page, fio);
 	f2fs_trace_ios(fio, 0);
 
 	/* Allocate a new bio */
-	bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc,
-				1, is_read_io(fio->op), fio->type, fio->temp);
+	bio = __bio_alloc(fio, 1);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		bio_put(bio);
@@ -459,17 +495,108 @@
 	}
 
 	if (fio->io_wbc && !is_read_io(fio->op))
-		wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
 
 	bio_set_op_attrs(bio, fio->op, fio->op_flags);
 
-	if (!is_read_io(fio->op))
-		inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
+	inc_page_count(fio->sbi, is_read_io(fio->op) ?
+			__read_io_type(page) : WB_DATA_TYPE(fio->page));
 
 	__submit_bio(fio->sbi, bio, fio->type);
 	return 0;
 }
 
+static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
+				block_t last_blkaddr, block_t cur_blkaddr)
+{
+	if (last_blkaddr + 1 != cur_blkaddr)
+		return false;
+	return __same_bdev(sbi, cur_blkaddr, bio);
+}
+
+static bool io_type_is_mergeable(struct f2fs_bio_info *io,
+						struct f2fs_io_info *fio)
+{
+	if (io->fio.op != fio->op)
+		return false;
+	return io->fio.op_flags == fio->op_flags;
+}
+
+static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
+					struct f2fs_bio_info *io,
+					struct f2fs_io_info *fio,
+					block_t last_blkaddr,
+					block_t cur_blkaddr)
+{
+	if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) {
+		unsigned int filled_blocks =
+				F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size);
+		unsigned int io_size = F2FS_IO_SIZE(sbi);
+		unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt;
+
+		/* bio is aligned to the IO size but lacks vectors for another unit */
+		if (!(filled_blocks % io_size) && left_vecs < io_size)
+			return false;
+	}
+	if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr))
+		return false;
+	return io_type_is_mergeable(io, fio);
+}
+
+int f2fs_merge_page_bio(struct f2fs_io_info *fio)
+{
+	struct bio *bio = *fio->bio;
+	struct page *page = fio->encrypted_page ?
+			fio->encrypted_page : fio->page;
+
+	if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
+			__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
+		return -EFSCORRUPTED;
+
+	trace_f2fs_submit_page_bio(page, fio);
+	f2fs_trace_ios(fio, 0);
+
+	if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
+						fio->new_blkaddr)) {
+		__submit_bio(fio->sbi, bio, fio->type);
+		bio = NULL;
+	}
+alloc_new:
+	if (!bio) {
+		bio = __bio_alloc(fio, BIO_MAX_PAGES);
+		bio_set_op_attrs(bio, fio->op, fio->op_flags);
+	}
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		__submit_bio(fio->sbi, bio, fio->type);
+		bio = NULL;
+		goto alloc_new;
+	}
+
+	if (fio->io_wbc)
+		wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+
+	inc_page_count(fio->sbi, WB_DATA_TYPE(page));
+
+	*fio->last_block = fio->new_blkaddr;
+	*fio->bio = bio;
+
+	return 0;
+}
+
+static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio,
+							struct page *page)
+{
+	if (!bio)
+		return;
+
+	if (!__has_merged_page(*bio, NULL, page, 0))
+		return;
+
+	__submit_bio(sbi, *bio, DATA);
+	*bio = NULL;
+}
+
 void f2fs_submit_page_write(struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = fio->sbi;
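
The merge predicates above split the old inline condition in two:
page_is_mergeable() demands physical contiguity on the same device, and
io_type_is_mergeable() demands matching op and op_flags;
f2fs_merge_page_bio() then keeps appending pages to one bio until either
test fails, at which point the bio is submitted and a fresh one opened.
A toy of that append-or-submit loop over made-up block addresses:

#include <stdio.h>

int main(void)
{
	unsigned int blks[] = { 100, 101, 102, 104, 105 };	/* made up */
	unsigned int last = 0;
	int open = 0;

	for (int i = 0; i < 5; i++) {
		/* page_is_mergeable(): consecutive physical blocks only */
		if (open && blks[i] != last + 1) {
			printf("submit bio\n");
			open = 0;
		}
		if (!open) {
			printf("open bio at %u\n", blks[i]);
			open = 1;
		}
		printf("  add block %u\n", blks[i]);
		last = blks[i];
	}
	if (open)
		printf("submit bio\n");
	return 0;
}
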
@@ -493,9 +620,7 @@
 		spin_unlock(&io->io_lock);
 	}
 
-	if (__is_valid_data_blkaddr(fio->old_blkaddr))
-		verify_block_addr(fio, fio->old_blkaddr);
-	verify_block_addr(fio, fio->new_blkaddr);
+	verify_fio_blkaddr(fio);
 
 	bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 
@@ -504,21 +629,19 @@
 
 	inc_page_count(sbi, WB_DATA_TYPE(bio_page));
 
-	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
-	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
-			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
+	if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio,
+			io->last_block_in_bio, fio->new_blkaddr))
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
-		if ((fio->type == DATA || fio->type == NODE) &&
+		if (F2FS_IO_ALIGNED(sbi) &&
+				(fio->type == DATA || fio->type == NODE) &&
 				fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
 			dec_page_count(sbi, WB_DATA_TYPE(bio_page));
 			fio->retry = true;
 			goto skip;
 		}
-		io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc,
-						BIO_MAX_PAGES, false,
-						fio->type, fio->temp);
+		io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
 		io->fio = *fio;
 	}
 
@@ -528,7 +651,7 @@
 	}
 
 	if (fio->io_wbc)
-		wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);
+		wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
 
 	io->last_block_in_bio = fio->new_blkaddr;
 	f2fs_trace_ios(fio, 0);
@@ -538,20 +661,27 @@
 	if (fio->in_list)
 		goto next;
 out:
+	if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+				!f2fs_is_checkpoint_ready(sbi))
+		__submit_merged_bio(io);
 	up_write(&io->io_rwsem);
 }
 
+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+	return fsverity_active(inode) &&
+	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
 static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
-					unsigned nr_pages, unsigned op_flag)
+				      unsigned nr_pages, unsigned op_flag,
+				      pgoff_t first_idx)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct bio *bio;
 	struct bio_post_read_ctx *ctx;
 	unsigned int post_read_steps = 0;
 
-	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC))
-		return ERR_PTR(-EFAULT);
-
 	bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
@@ -561,6 +691,10 @@
 
 	if (f2fs_encrypted_file(inode))
 		post_read_steps |= 1 << STEP_DECRYPT;
+
+	if (f2fs_need_verity(inode, first_idx))
+		post_read_steps |= 1 << STEP_VERITY;
+
 	if (post_read_steps) {
 		ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
 		if (!ctx) {
@@ -570,9 +704,6 @@
 		ctx->bio = bio;
 		ctx->enabled_steps = post_read_steps;
 		bio->bi_private = ctx;
-
-		/* wait the page to be moved by cleaning */
-		f2fs_wait_on_block_writeback(sbi, blkaddr);
 	}
 
 	return bio;
@@ -582,17 +713,23 @@
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 							block_t blkaddr)
 {
-	struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct bio *bio;
 
+	bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
+	/* wait for GCed page writeback via META_MAPPING */
+	f2fs_wait_on_block_writeback(inode, blkaddr);
+
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		bio_put(bio);
 		return -EFAULT;
 	}
 	ClearPageError(page);
-	__submit_bio(F2FS_I_SB(inode), bio, DATA);
+	inc_page_count(sbi, F2FS_RD_DATA);
+	__submit_bio(sbi, bio, DATA);
 	return 0;
 }
 
@@ -618,7 +755,7 @@
  */
 void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
 {
-	f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+	f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
 	__set_data_blkaddr(dn);
 	if (set_page_dirty(dn->node_page))
 		dn->node_changed = true;
@@ -648,7 +785,7 @@
 	trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
 						dn->ofs_in_node, count);
 
-	f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+	f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
 
 	for (; count > 0; dn->ofs_in_node++) {
 		block_t blkaddr = datablock_addr(dn->inode,
@@ -720,6 +857,11 @@
 
 	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
 		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
+						DATA_GENERIC_ENHANCE_READ)) {
+			err = -EFSCORRUPTED;
+			goto put_err;
+		}
 		goto got_it;
 	}
 
@@ -733,6 +875,13 @@
 		err = -ENOENT;
 		goto put_err;
 	}
+	if (dn.data_blkaddr != NEW_ADDR &&
+			!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
+						dn.data_blkaddr,
+						DATA_GENERIC_ENHANCE)) {
+		err = -EFSCORRUPTED;
+		goto put_err;
+	}
 got_it:
 	if (PageUptodate(page)) {
 		unlock_page(page);
@@ -882,7 +1031,6 @@
 	struct f2fs_summary sum;
 	struct node_info ni;
 	block_t old_blkaddr;
-	pgoff_t fofs;
 	blkcnt_t count = 1;
 	int err;
 
@@ -895,7 +1043,7 @@
 
 	dn->data_blkaddr = datablock_addr(dn->inode,
 				dn->node_page, dn->ofs_in_node);
-	if (dn->data_blkaddr == NEW_ADDR)
+	if (dn->data_blkaddr != NULL_ADDR)
 		goto alloc;
 
 	if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
@@ -909,14 +1057,12 @@
 	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
 		invalidate_mapping_pages(META_MAPPING(sbi),
 					old_blkaddr, old_blkaddr);
-	f2fs_set_data_blkaddr(dn);
+	f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
 
-	/* update i_size */
-	fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
-							dn->ofs_in_node;
-	if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
-		f2fs_i_size_write(dn->inode,
-				((loff_t)(fofs + 1) << PAGE_SHIFT));
+	/*
+	 * i_size will be updated by direct_IO. Otherwise, we'll get stale
+	 * data from an unwritten block via dio_read.
+	 */
 	return 0;
 }
 
@@ -935,6 +1081,9 @@
 			return err;
 	}
 
+	if (direct_io && allow_outplace_dio(inode, iocb, from))
+		return 0;
+
 	if (is_inode_flag_set(inode, FI_NO_PREALLOC))
 		return 0;
 
@@ -948,10 +1097,11 @@
 	map.m_next_pgofs = NULL;
 	map.m_next_extent = NULL;
 	map.m_seg_type = NO_CHECK_TYPE;
+	map.m_may_create = true;
 
 	if (direct_io) {
 		map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
-		flag = f2fs_force_buffered_io(inode, WRITE) ?
+		flag = f2fs_force_buffered_io(inode, iocb, from) ?
 					F2FS_GET_BLOCK_PRE_AIO :
 					F2FS_GET_BLOCK_PRE_DIO;
 		goto map_blocks;
@@ -976,7 +1126,7 @@
 	return err;
 }
 
-static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
 {
 	if (flag == F2FS_GET_BLOCK_PRE_AIO) {
 		if (lock)
@@ -1006,7 +1156,7 @@
 	unsigned int maxblocks = map->m_len;
 	struct dnode_of_data dn;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	int mode = create ? ALLOC_NODE : LOOKUP_NODE;
+	int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;
 	pgoff_t pgofs, end_offset, end;
 	int err = 0, ofs = 1;
 	unsigned int ofs_in_node, last_ofs_in_node;
@@ -1026,16 +1176,25 @@
 	end = pgofs + maxblocks;
 
 	if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+		if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+							map->m_may_create)
+			goto next_dnode;
+
 		map->m_pblk = ei.blk + pgofs - ei.fofs;
 		map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
 		map->m_flags = F2FS_MAP_MAPPED;
 		if (map->m_next_extent)
 			*map->m_next_extent = pgofs + map->m_len;
+
+		/* needed for hardware encryption; also avoids potential future issues */
+		if (flag == F2FS_GET_BLOCK_DIO)
+			f2fs_wait_on_block_writeback_range(inode,
+						map->m_pblk, map->m_len);
 		goto out;
 	}
 
 next_dnode:
-	if (create)
+	if (map->m_may_create)
 		__do_map_lock(sbi, flag, true);
 
 	/* When reading holes, we need its node page */
@@ -1065,12 +1224,22 @@
 	blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
 
 	if (__is_valid_data_blkaddr(blkaddr) &&
-		!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) {
-		err = -EFAULT;
+		!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
+		err = -EFSCORRUPTED;
 		goto sync_out;
 	}
 
-	if (!is_valid_data_blkaddr(sbi, blkaddr)) {
+	if (__is_valid_data_blkaddr(blkaddr)) {
+		/* use out-of-place update for direct IO under LFS mode */
+		if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+							map->m_may_create) {
+			err = __allocate_data_block(&dn, map->m_seg_type);
+			if (err)
+				goto sync_out;
+			blkaddr = dn.data_blkaddr;
+			set_inode_flag(inode, FI_APPEND_WRITE);
+		}
+	} else {
 		if (create) {
 			if (unlikely(f2fs_cp_error(sbi))) {
 				err = -EIO;
@@ -1082,6 +1251,8 @@
 					last_ofs_in_node = dn.ofs_in_node;
 				}
 			} else {
+				WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
+					flag != F2FS_GET_BLOCK_DIO);
 				err = __allocate_data_block(&dn,
 							map->m_seg_type);
 				if (!err)
@@ -1172,13 +1343,19 @@
 
 	f2fs_put_dnode(&dn);
 
-	if (create) {
+	if (map->m_may_create) {
 		__do_map_lock(sbi, flag, false);
 		f2fs_balance_fs(sbi, dn.node_changed);
 	}
 	goto next_dnode;
 
 sync_out:
+
+	/* needed for hardware encryption; also avoids potential future issues */
+	if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+		f2fs_wait_on_block_writeback_range(inode,
+						map->m_pblk, map->m_len);
+
 	if (flag == F2FS_GET_BLOCK_PRECACHE) {
 		if (map->m_flags & F2FS_MAP_MAPPED) {
 			unsigned int ofs = start_pgofs - map->m_lblk;
@@ -1192,7 +1369,7 @@
 	}
 	f2fs_put_dnode(&dn);
 unlock_out:
-	if (create) {
+	if (map->m_may_create) {
 		__do_map_lock(sbi, flag, false);
 		f2fs_balance_fs(sbi, dn.node_changed);
 	}
@@ -1214,6 +1391,7 @@
 	map.m_next_pgofs = NULL;
 	map.m_next_extent = NULL;
 	map.m_seg_type = NO_CHECK_TYPE;
+	map.m_may_create = false;
 	last_lblk = F2FS_BLK_ALIGN(pos + len);
 
 	while (map.m_lblk < last_lblk) {
@@ -1228,7 +1406,7 @@
 
 static int __get_data_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh, int create, int flag,
-			pgoff_t *next_pgofs, int seg_type)
+			pgoff_t *next_pgofs, int seg_type, bool may_write)
 {
 	struct f2fs_map_blocks map;
 	int err;
@@ -1238,6 +1416,7 @@
 	map.m_next_pgofs = next_pgofs;
 	map.m_next_extent = NULL;
 	map.m_seg_type = seg_type;
+	map.m_may_create = may_write;
 
 	err = f2fs_map_blocks(inode, &map, create, flag);
 	if (!err) {
@@ -1254,16 +1433,25 @@
 {
 	return __get_data_block(inode, iblock, bh_result, create,
 							flag, next_pgofs,
-							NO_CHECK_TYPE);
+							NO_CHECK_TYPE, create);
+}
+
+static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return __get_data_block(inode, iblock, bh_result, create,
+				F2FS_GET_BLOCK_DIO, NULL,
+				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+				!IS_SWAPFILE(inode));
 }
 
 static int get_data_block_dio(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
 	return __get_data_block(inode, iblock, bh_result, create,
-						F2FS_GET_BLOCK_DEFAULT, NULL,
-						f2fs_rw_hint_to_seg_type(
-							inode->i_write_hint));
+				F2FS_GET_BLOCK_DIO, NULL,
+				f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+				false);
 }
 
 static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -1275,7 +1463,7 @@
 
 	return __get_data_block(inode, iblock, bh_result, create,
 						F2FS_GET_BLOCK_BMAP, NULL,
-						NO_CHECK_TYPE);
+						NO_CHECK_TYPE, create);
 }
 
 static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -1385,7 +1573,7 @@
 		goto out;
 	}
 
-	if (f2fs_has_inline_data(inode)) {
+	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
 		ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
 		if (ret != -EAGAIN)
 			goto out;
@@ -1418,7 +1606,7 @@
 	}
 
 	if (size) {
-		if (f2fs_encrypted_inode(inode))
+		if (IS_ENCRYPTED(inode))
 			flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
 
 		ret = fiemap_fill_next_extent(fieinfo, logical,
@@ -1451,6 +1639,133 @@
 	return ret;
 }
 
+static inline loff_t f2fs_readpage_limit(struct inode *inode)
+{
+	if (IS_ENABLED(CONFIG_FS_VERITY) &&
+	    (IS_VERITY(inode) || f2fs_verity_in_progress(inode)))
+		return inode->i_sb->s_maxbytes;
+
+	return i_size_read(inode);
+}
+
+static int f2fs_read_single_page(struct inode *inode, struct page *page,
+					unsigned nr_pages,
+					struct f2fs_map_blocks *map,
+					struct bio **bio_ret,
+					sector_t *last_block_in_bio,
+					bool is_readahead)
+{
+	struct bio *bio = *bio_ret;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	sector_t last_block_in_file;
+	sector_t block_nr;
+	int ret = 0;
+
+	block_in_file = (sector_t)page_index(page);
+	last_block = block_in_file + nr_pages;
+	last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >>
+							blkbits;
+	if (last_block > last_block_in_file)
+		last_block = last_block_in_file;
+
+	/* just zeroing out page which is beyond EOF */
+	if (block_in_file >= last_block)
+		goto zero_out;
+	/*
+	 * Map blocks using the previous result first.
+	 */
+	if ((map->m_flags & F2FS_MAP_MAPPED) &&
+			block_in_file > map->m_lblk &&
+			block_in_file < (map->m_lblk + map->m_len))
+		goto got_it;
+
+	/*
+	 * Then do more f2fs_map_blocks() calls until we are
+	 * done with this page.
+	 */
+	map->m_lblk = block_in_file;
+	map->m_len = last_block - block_in_file;
+
+	ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT);
+	if (ret)
+		goto out;
+got_it:
+	if ((map->m_flags & F2FS_MAP_MAPPED)) {
+		block_nr = map->m_pblk + block_in_file - map->m_lblk;
+		SetPageMappedToDisk(page);
+
+		if (!PageUptodate(page) && !PageSwapCache(page) &&
+					!cleancache_get_page(page)) {
+			SetPageUptodate(page);
+			goto confused;
+		}
+
+		if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+						DATA_GENERIC_ENHANCE_READ)) {
+			ret = -EFSCORRUPTED;
+			goto out;
+		}
+	} else {
+zero_out:
+		zero_user_segment(page, 0, PAGE_SIZE);
+		if (f2fs_need_verity(inode, page->index) &&
+		    !fsverity_verify_page(page)) {
+			ret = -EIO;
+			goto out;
+		}
+		if (!PageUptodate(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
+
+	/*
+	 * This page will go to BIO.  Do we need to send this
+	 * BIO off first?
+	 */
+	if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio,
+				*last_block_in_bio, block_nr)) {
+submit_and_realloc:
+		__submit_bio(F2FS_I_SB(inode), bio, DATA);
+		bio = NULL;
+	}
+	if (bio == NULL) {
+		bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
+				is_readahead ? REQ_RAHEAD : 0, page->index);
+		if (IS_ERR(bio)) {
+			ret = PTR_ERR(bio);
+			bio = NULL;
+			goto out;
+		}
+	}
+
+	/*
+	 * If the page is under writeback, we need to wait for
+	 * its completion to see the correct decrypted data.
+	 */
+	f2fs_wait_on_block_writeback(inode, block_nr);
+
+	if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+		goto submit_and_realloc;
+
+	inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+	ClearPageError(page);
+	*last_block_in_bio = block_nr;
+	goto out;
+confused:
+	if (bio) {
+		__submit_bio(F2FS_I_SB(inode), bio, DATA);
+		bio = NULL;
+	}
+	unlock_page(page);
+out:
+	*bio_ret = bio;
+	return ret;
+}
+
 /*
  * This function was originally taken from fs/mpage.c, and customized for f2fs.
  * Major change was from block_size == page_size in f2fs by default.
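
f2fs_read_single_page() above also changes the EOF rule:
f2fs_readpage_limit() lifts the limit from i_size to s_maxbytes for
verity files, since the Merkle tree lives past EOF and must stay
readable, while any page at or beyond the limit is zeroed rather than
mapped. A sketch of how a page index is classified under those rules
(the file sizes are illustrative):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)

/* classify a page index against the read limit, echoing the
 * last_block / zero_out logic above */
static const char *classify(uint64_t idx, uint64_t limit)
{
	uint64_t last = (limit + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return idx < last ? "map and read" : "zero out";
}

int main(void)
{
	uint64_t i_size = 10000;		/* illustrative sizes */
	uint64_t s_maxbytes = 1ull << 40;

	printf("regular file, page 3: %s\n", classify(3, i_size));
	printf("verity file,  page 3: %s\n", classify(3, s_maxbytes));
	return 0;
}
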
@@ -1467,13 +1782,8 @@
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
 	struct inode *inode = mapping->host;
-	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocksize = 1 << blkbits;
-	sector_t block_in_file;
-	sector_t last_block;
-	sector_t last_block_in_file;
-	sector_t block_nr;
 	struct f2fs_map_blocks map;
+	int ret = 0;
 
 	map.m_pblk = 0;
 	map.m_lblk = 0;
@@ -1482,6 +1792,7 @@
 	map.m_next_pgofs = NULL;
 	map.m_next_extent = NULL;
 	map.m_seg_type = NO_CHECK_TYPE;
+	map.m_may_create = false;
 
 	for (; nr_pages; nr_pages--) {
 		if (pages) {
@@ -1490,97 +1801,18 @@
 			prefetchw(&page->flags);
 			list_del(&page->lru);
 			if (add_to_page_cache_lru(page, mapping,
-						  page->index,
+						  page_index(page),
 						  readahead_gfp_mask(mapping)))
 				goto next_page;
 		}
 
-		block_in_file = (sector_t)page->index;
-		last_block = block_in_file + nr_pages;
-		last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
-								blkbits;
-		if (last_block > last_block_in_file)
-			last_block = last_block_in_file;
-
-		/*
-		 * Map blocks using the previous result first.
-		 */
-		if ((map.m_flags & F2FS_MAP_MAPPED) &&
-				block_in_file > map.m_lblk &&
-				block_in_file < (map.m_lblk + map.m_len))
-			goto got_it;
-
-		/*
-		 * Then do more f2fs_map_blocks() calls until we are
-		 * done with this page.
-		 */
-		map.m_flags = 0;
-
-		if (block_in_file < last_block) {
-			map.m_lblk = block_in_file;
-			map.m_len = last_block - block_in_file;
-
-			if (f2fs_map_blocks(inode, &map, 0,
-						F2FS_GET_BLOCK_DEFAULT))
-				goto set_error_page;
-		}
-got_it:
-		if ((map.m_flags & F2FS_MAP_MAPPED)) {
-			block_nr = map.m_pblk + block_in_file - map.m_lblk;
-			SetPageMappedToDisk(page);
-
-			if (!PageUptodate(page) && !cleancache_get_page(page)) {
-				SetPageUptodate(page);
-				goto confused;
-			}
-
-			if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
-								DATA_GENERIC))
-				goto set_error_page;
-		} else {
+		ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio,
+					&last_block_in_bio, is_readahead);
+		if (ret) {
+			SetPageError(page);
 			zero_user_segment(page, 0, PAGE_SIZE);
-			if (!PageUptodate(page))
-				SetPageUptodate(page);
 			unlock_page(page);
-			goto next_page;
 		}
-
-		/*
-		 * This page will go to BIO.  Do we need to send this
-		 * BIO off first?
-		 */
-		if (bio && (last_block_in_bio != block_nr - 1 ||
-			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
-submit_and_realloc:
-			__submit_bio(F2FS_I_SB(inode), bio, DATA);
-			bio = NULL;
-		}
-		if (bio == NULL) {
-			bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
-					is_readahead ? REQ_RAHEAD : 0);
-			if (IS_ERR(bio)) {
-				bio = NULL;
-				goto set_error_page;
-			}
-		}
-
-		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
-			goto submit_and_realloc;
-
-		ClearPageError(page);
-		last_block_in_bio = block_nr;
-		goto next_page;
-set_error_page:
-		SetPageError(page);
-		zero_user_segment(page, 0, PAGE_SIZE);
-		unlock_page(page);
-		goto next_page;
-confused:
-		if (bio) {
-			__submit_bio(F2FS_I_SB(inode), bio, DATA);
-			bio = NULL;
-		}
-		unlock_page(page);
 next_page:
 		if (pages)
 			put_page(page);
@@ -1588,12 +1820,12 @@
 	BUG_ON(pages && !list_empty(pages));
 	if (bio)
 		__submit_bio(F2FS_I_SB(inode), bio, DATA);
-	return 0;
+	return pages ? 0 : ret;
 }
 
 static int f2fs_read_data_page(struct file *file, struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	int ret = -EAGAIN;
 
 	trace_f2fs_readpage(page, DATA);
@@ -1602,7 +1834,8 @@
 	if (f2fs_has_inline_data(inode))
 		ret = f2fs_read_inline_data(inode, page);
 	if (ret == -EAGAIN)
-		ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false);
+		ret = f2fs_mpage_readpages(page_file_mapping(page),
+						NULL, page, 1, false);
 	return ret;
 }
 
@@ -1632,11 +1865,12 @@
 		return 0;
 
 	/* wait for GCed page writeback via META_MAPPING */
-	f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr);
+	f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
 
 retry_encrypt:
-	fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-			PAGE_SIZE, 0, fio->page->index, gfp_flags);
+	fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page,
+							       PAGE_SIZE, 0,
+							       gfp_flags);
 	if (IS_ERR(fio->encrypted_page)) {
 		/* flush pending IOs and wait for a while in the ENOMEM case */
 		if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
@@ -1681,7 +1915,7 @@
 	if (policy & (0x1 << F2FS_IPU_ASYNC) &&
 			fio && fio->op == REQ_OP_WRITE &&
 			!(fio->op_flags & REQ_SYNC) &&
-			!f2fs_encrypted_inode(inode))
+			!IS_ENCRYPTED(inode))
 		return true;
 
 	/* this is only set during fdatasync */
@@ -1689,6 +1923,10 @@
 			is_inode_flag_set(inode, FI_NEED_IPU))
 		return true;
 
+	if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+			!f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+		return true;
+
 	return false;
 }
 
@@ -1712,6 +1950,8 @@
 		return true;
 	if (S_ISDIR(inode->i_mode))
 		return true;
+	if (IS_NOQUOTA(inode))
+		return true;
 	if (f2fs_is_atomic_file(inode))
 		return true;
 	if (fio) {
@@ -1719,6 +1959,9 @@
 			return true;
 		if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
 			return true;
+		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+			f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+			return true;
 	}
 	return false;
 }
@@ -1749,8 +1992,8 @@
 		fio->old_blkaddr = ei.blk + page->index - ei.fofs;
 
 		if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
-							DATA_GENERIC))
-			return -EFAULT;
+						DATA_GENERIC_ENHANCE))
+			return -EFSCORRUPTED;
 
 		ipu_force = true;
 		fio->need_lock = LOCK_DONE;
@@ -1770,20 +2013,22 @@
 	/* This page is already truncated */
 	if (fio->old_blkaddr == NULL_ADDR) {
 		ClearPageUptodate(page);
+		clear_cold_data(page);
 		goto out_writepage;
 	}
 got_it:
 	if (__is_valid_data_blkaddr(fio->old_blkaddr) &&
 		!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
-							DATA_GENERIC)) {
-		err = -EFAULT;
+						DATA_GENERIC_ENHANCE)) {
+		err = -EFSCORRUPTED;
 		goto out_writepage;
 	}
 	/*
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) &&
+	if (ipu_force ||
+		(__is_valid_data_blkaddr(fio->old_blkaddr) &&
 					need_inplace_update(fio))) {
 		err = encrypt_one_page(fio);
 		if (err)
@@ -1795,8 +2040,15 @@
 		if (fio->need_lock == LOCK_REQ)
 			f2fs_unlock_op(fio->sbi);
 		err = f2fs_inplace_write_data(fio);
+		if (err) {
+			if (f2fs_encrypted_file(inode))
+				fscrypt_finalize_bounce_page(&fio->encrypted_page);
+			if (PageWriteback(page))
+				end_page_writeback(page);
+		} else {
+			set_inode_flag(inode, FI_UPDATE_WRITE);
+		}
 		trace_f2fs_do_write_data_page(fio->page, IPU);
-		set_inode_flag(inode, FI_UPDATE_WRITE);
 		return err;
 	}
 
@@ -1836,6 +2088,8 @@
 }
 
 static int __write_data_page(struct page *page, bool *submitted,
+				struct bio **bio,
+				sector_t *last_block,
 				struct writeback_control *wbc,
 				enum iostat_type io_type)
 {
@@ -1861,6 +2115,8 @@
 		.need_lock = LOCK_RETRY,
 		.io_type = io_type,
 		.io_wbc = wbc,
+		.bio = bio,
+		.last_block = last_block,
 	};
 
 	trace_f2fs_writepage(page, DATA);
@@ -1880,7 +2136,7 @@
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
-	if (page->index < end_index)
+	if (page->index < end_index || f2fs_verity_in_progress(inode))
 		goto write;
 
 	/*
@@ -1945,21 +2201,27 @@
 
 out:
 	inode_dec_dirty_pages(inode);
-	if (err)
+	if (err) {
 		ClearPageUptodate(page);
+		clear_cold_data(page);
+	}
 
 	if (wbc->for_reclaim) {
-		f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
+		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
 		clear_inode_flag(inode, FI_HOT_DATA);
 		f2fs_remove_dirty_inode(inode);
 		submitted = NULL;
 	}
 
 	unlock_page(page);
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
+					!F2FS_I(inode)->cp_task) {
+		f2fs_submit_ipu_bio(sbi, bio, page);
 		f2fs_balance_fs(sbi, need_balance_fs);
+	}
 
 	if (unlikely(f2fs_cp_error(sbi))) {
+		f2fs_submit_ipu_bio(sbi, bio, page);
 		f2fs_submit_merged_write(sbi, DATA);
 		submitted = NULL;
 	}
@@ -1986,7 +2248,7 @@
 static int f2fs_write_data_page(struct page *page,
 					struct writeback_control *wbc)
 {
-	return __write_data_page(page, NULL, wbc, FS_DATA_IO);
+	return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO);
 }
 
 /*
@@ -2002,15 +2264,17 @@
 	int done = 0;
 	struct pagevec pvec;
 	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+	struct bio *bio = NULL;
+	sector_t last_block;
 	int nr_pages;
 	pgoff_t uninitialized_var(writeback_index);
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	pgoff_t done_index;
-	pgoff_t last_idx = ULONG_MAX;
 	int cycled;
 	int range_whole = 0;
-	int tag;
+	xa_mark_t tag;
+	int nwritten = 0;
 
 	pagevec_init(&pvec);
 
@@ -2078,18 +2342,20 @@
 			}
 
 			if (PageWriteback(page)) {
-				if (wbc->sync_mode != WB_SYNC_NONE)
+				if (wbc->sync_mode != WB_SYNC_NONE) {
 					f2fs_wait_on_page_writeback(page,
-								DATA, true);
-				else
+							DATA, true, true);
+					f2fs_submit_ipu_bio(sbi, &bio, page);
+				} else {
 					goto continue_unlock;
+				}
 			}
 
-			BUG_ON(PageWriteback(page));
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			ret = __write_data_page(page, &submitted, wbc, io_type);
+			ret = __write_data_page(page, &submitted, &bio,
+					&last_block, wbc, io_type);
 			if (unlikely(ret)) {
 				/*
 				 * keep nr_to_write, since vfs uses this to
@@ -2113,7 +2379,7 @@
 				done = 1;
 				break;
 			} else if (submitted) {
-				last_idx = page->index;
+				nwritten++;
 			}
 
 			if (--wbc->nr_to_write <= 0 &&
@@ -2135,9 +2401,12 @@
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = done_index;
 
-	if (last_idx != ULONG_MAX)
+	if (nwritten)
 		f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
-						0, last_idx, DATA);
+								NULL, 0, DATA);
+	/* submit cached bio of IPU write */
+	if (bio)
+		__submit_bio(sbi, bio, DATA);
 
 	return ret;
 }
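The writeback hunks above thread a cached bio (bio, last_block) through __write_data_page() so that consecutive in-place updates of adjacent blocks share one submission; the cached bio is flushed when a page maps to a discontiguous block and, at the latest, by __submit_bio() once the scan ends. A minimal user-space sketch of the same batching idea, in which every name is illustrative and nothing is f2fs or block-layer API:

#include <stdio.h>
#include <stdlib.h>

struct cached_bio {
	unsigned long start_blk;	/* first block covered by this bio */
	unsigned long next_blk;		/* block the next page must map to */
};

static void submit_cached(struct cached_bio **bio)
{
	if (!*bio)
		return;
	printf("submit blocks [%lu..%lu)\n", (*bio)->start_blk, (*bio)->next_blk);
	free(*bio);
	*bio = NULL;
}

/* Merge a page into the cached bio when its block is contiguous;
 * otherwise flush the old bio and start a new one. */
static void write_page_ipu(struct cached_bio **bio, unsigned long blk)
{
	if (*bio && (*bio)->next_blk != blk)
		submit_cached(bio);
	if (!*bio) {
		*bio = malloc(sizeof(**bio));
		if (!*bio)
			abort();
		(*bio)->start_blk = blk;
		(*bio)->next_blk = blk;
	}
	(*bio)->next_blk++;
}

int main(void)
{
	struct cached_bio *bio = NULL;

	write_page_ipu(&bio, 100);
	write_page_ipu(&bio, 101);	/* merged into the cached bio */
	write_page_ipu(&bio, 500);	/* discontiguous: flushes [100..102) */
	submit_cached(&bio);		/* final flush, like __submit_bio() above */
	return 0;
}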
@@ -2147,6 +2416,11 @@
 {
 	if (!S_ISREG(inode->i_mode))
 		return false;
+	if (IS_NOQUOTA(inode))
+		return false;
+	/* to avoid deadlock in the data flush path */
+	if (F2FS_I(inode)->cp_task)
+		return false;
 	if (wbc->sync_mode != WB_SYNC_ALL)
 		return true;
 	if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -2176,7 +2450,8 @@
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto skip_write;
 
-	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+	if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
+			wbc->sync_mode == WB_SYNC_NONE &&
 			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
 			f2fs_available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
@@ -2236,12 +2511,14 @@
 	struct inode *inode = mapping->host;
 	loff_t i_size = i_size_read(inode);
 
-	if (to > i_size) {
+	/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
+	if (to > i_size && !f2fs_verity_in_progress(inode)) {
 		down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 		down_write(&F2FS_I(inode)->i_mmap_sem);
 
 		truncate_pagecache(inode, i_size);
-		f2fs_truncate_blocks(inode, i_size, true);
+		if (!IS_NOQUOTA(inode))
+			f2fs_truncate_blocks(inode, i_size, true);
 
 		up_write(&F2FS_I(inode)->i_mmap_sem);
 		up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2259,18 +2536,26 @@
 	bool locked = false;
 	struct extent_info ei = {0,0,0};
 	int err = 0;
+	int flag;
 
 	/*
 	 * we already allocated all the blocks, so we don't need to get
 	 * the block addresses when there is no need to fill the page.
 	 */
 	if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
-			!is_inode_flag_set(inode, FI_NO_PREALLOC))
+	    !is_inode_flag_set(inode, FI_NO_PREALLOC) &&
+	    !f2fs_verity_in_progress(inode))
 		return 0;
 
+	/* f2fs_lock_op avoids race between write CP and convert_inline_page */
+	if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
+		flag = F2FS_GET_BLOCK_DEFAULT;
+	else
+		flag = F2FS_GET_BLOCK_PRE_AIO;
+
 	if (f2fs_has_inline_data(inode) ||
 			(pos & PAGE_MASK) >= i_size_read(inode)) {
-		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+		__do_map_lock(sbi, flag, true);
 		locked = true;
 	}
 restart:
@@ -2308,6 +2593,7 @@
 				f2fs_put_dnode(&dn);
 				__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
 								true);
+				WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
 				locked = true;
 				goto restart;
 			}
@@ -2321,7 +2607,7 @@
 	f2fs_put_dnode(&dn);
 unlock_out:
 	if (locked)
-		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+		__do_map_lock(sbi, flag, false);
 	return err;
 }
 
@@ -2339,6 +2625,11 @@
 
 	trace_f2fs_write_begin(inode, pos, len, flags);
 
+	if (!f2fs_is_checkpoint_ready(sbi)) {
+		err = -ENOSPC;
+		goto fail;
+	}
+
 	if ((f2fs_is_atomic_file(inode) &&
 			!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
 			is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
@@ -2376,7 +2667,8 @@
 	if (err)
 		goto fail;
 
-	if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
+	if (need_balance && !IS_NOQUOTA(inode) &&
+			has_not_enough_free_secs(sbi, 0, 0)) {
 		unlock_page(page);
 		f2fs_balance_fs(sbi, true);
 		lock_page(page);
@@ -2387,16 +2679,13 @@
 		}
 	}
 
-	f2fs_wait_on_page_writeback(page, DATA, false);
-
-	/* wait for GCed page writeback via META_MAPPING */
-	if (f2fs_post_read_required(inode))
-		f2fs_wait_on_block_writeback(sbi, blkaddr);
+	f2fs_wait_on_page_writeback(page, DATA, false, true);
 
 	if (len == PAGE_SIZE || PageUptodate(page))
 		return 0;
 
-	if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) {
+	if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
+	    !f2fs_verity_in_progress(inode)) {
 		zero_user_segment(page, len, PAGE_SIZE);
 		return 0;
 	}
@@ -2405,6 +2694,11 @@
 		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 	} else {
+		if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
+				DATA_GENERIC_ENHANCE_READ)) {
+			err = -EFSCORRUPTED;
+			goto fail;
+		}
 		err = f2fs_submit_page_read(inode, page, blkaddr);
 		if (err)
 			goto fail;
@@ -2454,7 +2748,8 @@
 
 	set_page_dirty(page);
 
-	if (pos + copied > i_size_read(inode))
+	if (pos + copied > i_size_read(inode) &&
+	    !f2fs_verity_in_progress(inode))
 		f2fs_i_size_write(inode, pos + copied);
 unlock_out:
 	f2fs_put_page(page, 1);
@@ -2482,41 +2777,105 @@
 	return 0;
 }
 
+static void f2fs_dio_end_io(struct bio *bio)
+{
+	struct f2fs_private_dio *dio = bio->bi_private;
+
+	dec_page_count(F2FS_I_SB(dio->inode),
+			dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
+
+	bio->bi_private = dio->orig_private;
+	bio->bi_end_io = dio->orig_end_io;
+
+	kvfree(dio);
+
+	bio_endio(bio);
+}
+
+static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
+							loff_t file_offset)
+{
+	struct f2fs_private_dio *dio;
+	bool write = (bio_op(bio) == REQ_OP_WRITE);
+
+	dio = f2fs_kzalloc(F2FS_I_SB(inode),
+			sizeof(struct f2fs_private_dio), GFP_NOFS);
+	if (!dio)
+		goto out;
+
+	dio->inode = inode;
+	dio->orig_end_io = bio->bi_end_io;
+	dio->orig_private = bio->bi_private;
+	dio->write = write;
+
+	bio->bi_end_io = f2fs_dio_end_io;
+	bio->bi_private = dio;
+
+	inc_page_count(F2FS_I_SB(inode),
+			write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
+
+	submit_bio(bio);
+	return;
+out:
+	bio->bi_status = BLK_STS_IOERR;
+	bio_endio(bio);
+}
+
 static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
 	size_t count = iov_iter_count(iter);
 	loff_t offset = iocb->ki_pos;
 	int rw = iov_iter_rw(iter);
 	int err;
 	enum rw_hint hint = iocb->ki_hint;
 	int whint_mode = F2FS_OPTION(sbi).whint_mode;
+	bool do_opu;
 
 	err = check_direct_IO(inode, iter, offset);
 	if (err)
 		return err < 0 ? err : 0;
 
-	if (f2fs_force_buffered_io(inode, rw))
+	if (f2fs_force_buffered_io(inode, iocb, iter))
 		return 0;
 
+	do_opu = allow_outplace_dio(inode, iocb, iter);
+
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
 	if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
 		iocb->ki_hint = WRITE_LIFE_NOT_SET;
 
-	if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) {
-		if (iocb->ki_flags & IOCB_NOWAIT) {
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
 			iocb->ki_hint = hint;
 			err = -EAGAIN;
 			goto out;
 		}
-		down_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+		if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
+			up_read(&fi->i_gc_rwsem[rw]);
+			iocb->ki_hint = hint;
+			err = -EAGAIN;
+			goto out;
+		}
+	} else {
+		down_read(&fi->i_gc_rwsem[rw]);
+		if (do_opu)
+			down_read(&fi->i_gc_rwsem[READ]);
 	}
 
-	err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
-	up_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+	err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+			iter, rw == WRITE ? get_data_block_dio_write :
+			get_data_block_dio, NULL, f2fs_dio_submit_bio,
+			DIO_LOCKING | DIO_SKIP_HOLES);
+
+	if (do_opu)
+		up_read(&fi->i_gc_rwsem[READ]);
+
+	up_read(&fi->i_gc_rwsem[rw]);
 
 	if (rw == WRITE) {
 		if (whint_mode == WHINT_MODE_OFF)
@@ -2524,7 +2883,8 @@
 		if (err > 0) {
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
 									err);
-			set_inode_flag(inode, FI_UPDATE_WRITE);
+			if (!do_opu)
+				set_inode_flag(inode, FI_UPDATE_WRITE);
 		} else if (err < 0) {
 			f2fs_write_failed(mapping, offset + count);
 		}
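The reworked direct-IO locking above takes i_gc_rwsem[rw] first and, for out-of-place writes (do_opu), i_gc_rwsem[READ] second; in the IOCB_NOWAIT case a failed trylock on the second semaphore must release the first before returning -EAGAIN, or the lock would leak. A compilable user-space sketch of that back-out pattern, with POSIX rwlocks standing in for the kernel semaphores:

#include <errno.h>
#include <pthread.h>

static pthread_rwlock_t lock_rw = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t lock_read = PTHREAD_RWLOCK_INITIALIZER;

/* Nonblocking two-lock acquisition: if the second lock cannot be
 * taken, drop the first before bailing out, so a failed request
 * leaves no lock held. */
static int trylock_both(int need_second)
{
	if (pthread_rwlock_tryrdlock(&lock_rw))
		return EAGAIN;
	if (need_second && pthread_rwlock_tryrdlock(&lock_read)) {
		pthread_rwlock_unlock(&lock_rw);
		return EAGAIN;
	}
	return 0;	/* caller now holds one or both read locks */
}

int main(void)
{
	return trylock_both(1);
}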
@@ -2557,12 +2917,12 @@
 		}
 	}
 
-	/* This is atomic written page, keep Private */
+	clear_cold_data(page);
+
 	if (IS_ATOMIC_WRITTEN_PAGE(page))
 		return f2fs_drop_inmem_page(inode, page);
 
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
+	f2fs_clear_page_private(page);
 }
 
 int f2fs_release_page(struct page *page, gfp_t wait)
@@ -2575,20 +2935,21 @@
 	if (IS_ATOMIC_WRITTEN_PAGE(page))
 		return 0;
 
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
+	clear_cold_data(page);
+	f2fs_clear_page_private(page);
 	return 1;
 }
 
 static int f2fs_set_data_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 
 	trace_f2fs_set_page_dirty(page, DATA);
 
 	if (!PageUptodate(page))
 		SetPageUptodate(page);
+	if (PageSwapCache(page))
+		return __set_page_dirty_nobuffers(page);
 
 	if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
 		if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
@@ -2644,14 +3005,10 @@
 			return -EAGAIN;
 	}
 
-	/*
-	 * A reference is expected if PagePrivate set when move mapping,
-	 * however F2FS breaks this for maintaining dirty page counts when
-	 * truncating pages. So here adjusting the 'extra_count' make it work.
-	 */
-	extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+	/* one extra reference was held for atomic_write page */
+	extra_count = atomic_written ? 1 : 0;
 	rc = migrate_page_move_mapping(mapping, newpage,
-				page, NULL, mode, extra_count);
+				page, extra_count);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		if (atomic_written)
 			mutex_unlock(&fi->inmem_lock);
@@ -2670,9 +3027,10 @@
 		get_page(newpage);
 	}
 
-	if (PagePrivate(page))
-		SetPagePrivate(newpage);
-	set_page_private(newpage, page_private(page));
+	if (PagePrivate(page)) {
+		f2fs_set_page_private(newpage, page_private(page));
+		f2fs_clear_page_private(page);
+	}
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
 		migrate_page_copy(newpage, page);
@@ -2683,6 +3041,126 @@
 }
 #endif
 
+#ifdef CONFIG_SWAP
+/* Copied from generic_swapfile_activate() to check any holes */
+static int check_swap_activate(struct file *swap_file, unsigned int max)
+{
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned blocks_per_page;
+	unsigned long page_no;
+	unsigned blkbits;
+	sector_t probe_block;
+	sector_t last_block;
+	sector_t lowest_block = -1;
+	sector_t highest_block = 0;
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	/*
+	 * Map all the blocks into the extent list.  This code doesn't try
+	 * to be very smart.
+	 */
+	probe_block = 0;
+	page_no = 0;
+	last_block = i_size_read(inode) >> blkbits;
+	while ((probe_block + blocks_per_page) <= last_block && page_no < max) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		cond_resched();
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/*
+		 * It must be PAGE_SIZE-aligned on disk
+		 */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto reprobe;
+		}
+
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto reprobe;
+			}
+		}
+
+		first_block >>= (PAGE_SHIFT - blkbits);
+		if (page_no) {	/* exclude the header page */
+			if (first_block < lowest_block)
+				lowest_block = first_block;
+			if (first_block > highest_block)
+				highest_block = first_block;
+		}
+
+		page_no++;
+		probe_block += blocks_per_page;
+reprobe:
+		continue;
+	}
+	return 0;
+
+bad_bmap:
+	pr_err("swapon: swapfile has holes\n");
+	return -EINVAL;
+}
+
+static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
+				sector_t *span)
+{
+	struct inode *inode = file_inode(file);
+	int ret;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+		return -EROFS;
+
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
+
+	ret = check_swap_activate(file, sis->max);
+	if (ret)
+		return ret;
+
+	set_inode_flag(inode, FI_PIN_FILE);
+	f2fs_precache_extents(inode);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+	return 0;
+}
+
+static void f2fs_swap_deactivate(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	clear_inode_flag(inode, FI_PIN_FILE);
+}
+#else
+static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
+				sector_t *span)
+{
+	return -EOPNOTSUPP;
+}
+
+static void f2fs_swap_deactivate(struct file *file)
+{
+}
+#endif
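check_swap_activate() above admits a file as swap space only when every page is backed by PAGE_SIZE-aligned, physically contiguous on-disk blocks with no holes. A self-contained sketch of that probe follows; map_block() stands in for bmap(), and the four-blocks-per-page ratio is an assumption chosen for the demo:

#include <stdint.h>

#define BLOCKS_PER_PAGE 4	/* e.g. 4K page / 1K block; demo assumption */

/* file block -> disk block, 0 for a hole (mirrors bmap() semantics) */
typedef uint64_t (*map_block_fn)(uint64_t file_blk);

/* 0 if every page maps to aligned, contiguous blocks; -1 on a hole */
static int check_contiguous(map_block_fn map_block, uint64_t nr_file_blocks)
{
	uint64_t pb = 0;

	while (pb + BLOCKS_PER_PAGE <= nr_file_blocks) {
		uint64_t first = map_block(pb);
		int ok = 1;

		if (first == 0)
			return -1;			/* hole: not usable */
		if (first & (BLOCKS_PER_PAGE - 1)) {	/* not page-aligned */
			pb++;				/* reprobe next block */
			continue;
		}
		for (unsigned int i = 1; i < BLOCKS_PER_PAGE; i++) {
			uint64_t b = map_block(pb + i);

			if (b == 0)
				return -1;
			if (b != first + i) {		/* discontiguity */
				ok = 0;
				break;
			}
		}
		pb += ok ? BLOCKS_PER_PAGE : 1;
	}
	return 0;
}

static uint64_t shifted_identity(uint64_t b)
{
	return b + 8;	/* fully contiguous, page-aligned mapping */
}

int main(void)
{
	return check_contiguous(shifted_identity, 64) ? 1 : 0;
}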
+
 const struct address_space_operations f2fs_dblock_aops = {
 	.readpage	= f2fs_read_data_page,
 	.readpages	= f2fs_read_data_pages,
@@ -2695,25 +3173,29 @@
 	.releasepage	= f2fs_release_page,
 	.direct_IO	= f2fs_direct_IO,
 	.bmap		= f2fs_bmap,
+	.swap_activate  = f2fs_swap_activate,
+	.swap_deactivate = f2fs_swap_deactivate,
 #ifdef CONFIG_MIGRATION
 	.migratepage    = f2fs_migrate_page,
 #endif
 };
 
-void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	unsigned long flags;
 
 	xa_lock_irqsave(&mapping->i_pages, flags);
-	radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+	__xa_clear_mark(&mapping->i_pages, page_index(page),
 						PAGECACHE_TAG_DIRTY);
 	xa_unlock_irqrestore(&mapping->i_pages, flags);
 }
 
 int __init f2fs_init_post_read_processing(void)
 {
-	bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0);
+	bio_post_read_ctx_cache =
+		kmem_cache_create("f2fs_bio_post_read_ctx",
+				  sizeof(struct bio_post_read_ctx), 0, 0, NULL);
 	if (!bio_post_read_ctx_cache)
 		goto fail;
 	bio_post_read_ctx_pool =
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 214a968..9b0bedd 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs debugging statistics
  *
@@ -5,10 +6,6 @@
  *             http://www.samsung.com/
  * Copyright (c) 2012 Linux Foundation
  * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -30,8 +27,15 @@
 static void update_general_status(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
 	int i;
 
+	/* these will be changed if online resize is done */
+	si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
+	si->main_area_sections = le32_to_cpu(raw_super->section_count);
+	si->main_area_zones = si->main_area_sections /
+				le32_to_cpu(raw_super->secs_per_zone);
+
 	/* validation check of the segment numbers */
 	si->hit_largest = atomic64_read(&sbi->read_hit_largest);
 	si->hit_cached = atomic64_read(&sbi->read_hit_cached);
@@ -56,21 +60,26 @@
 	si->vw_cnt = atomic_read(&sbi->vw_cnt);
 	si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
 	si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
+	si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
+	si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
 	si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
 	si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
-	if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+	si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA);
+	si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE);
+	si->nr_rd_meta = get_pages(sbi, F2FS_RD_META);
+	if (SM_I(sbi)->fcc_info) {
 		si->nr_flushed =
 			atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
 		si->nr_flushing =
-			atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+			atomic_read(&SM_I(sbi)->fcc_info->queued_flush);
 		si->flush_list_empty =
 			llist_empty(&SM_I(sbi)->fcc_info->issue_list);
 	}
-	if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+	if (SM_I(sbi)->dcc_info) {
 		si->nr_discarded =
 			atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
 		si->nr_discarding =
-			atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+			atomic_read(&SM_I(sbi)->dcc_info->queued_discard);
 		si->nr_discard_cmd =
 			atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
 		si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
@@ -94,8 +103,10 @@
 	si->free_secs = free_sections(sbi);
 	si->prefree_count = prefree_segments(sbi);
 	si->dirty_count = dirty_segments(sbi);
-	si->node_pages = NODE_MAPPING(sbi)->nrpages;
-	si->meta_pages = META_MAPPING(sbi)->nrpages;
+	if (sbi->node_inode)
+		si->node_pages = NODE_MAPPING(sbi)->nrpages;
+	if (sbi->meta_inode)
+		si->meta_pages = META_MAPPING(sbi)->nrpages;
 	si->nats = NM_I(sbi)->nat_cnt;
 	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
 	si->sits = MAIN_SEGS(sbi);
@@ -104,6 +115,8 @@
 	si->avail_nids = NM_I(sbi)->available_nids;
 	si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
 	si->bg_gc = sbi->bg_gc;
+	si->io_skip_bggc = sbi->io_skip_bggc;
+	si->other_skip_bggc = sbi->other_skip_bggc;
 	si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
 	si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -121,6 +134,9 @@
 		si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
 	}
 
+	for (i = META_CP; i < META_MAX; i++)
+		si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
+
 	for (i = 0; i < 2; i++) {
 		si->segment_count[i] = sbi->segment_count[i];
 		si->block_count[i] = sbi->block_count[i];
@@ -168,7 +184,6 @@
 static void update_mem_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
-	unsigned npages;
 	int i;
 
 	if (si->base_mem)
@@ -190,10 +205,9 @@
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
-	if (f2fs_discard_en(sbi))
-		si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
 	si->base_mem += SIT_VBLOCK_MAP_SIZE;
-	if (sbi->segs_per_sec > 1)
+	if (__is_large_section(sbi))
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
 
@@ -252,10 +266,14 @@
 						sizeof(struct extent_node);
 
 	si->page_mem = 0;
-	npages = NODE_MAPPING(sbi)->nrpages;
-	si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
-	npages = META_MAPPING(sbi)->nrpages;
-	si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+	if (sbi->node_inode) {
+		unsigned npages = NODE_MAPPING(sbi)->nrpages;
+		si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+	}
+	if (sbi->meta_inode) {
+		unsigned npages = META_MAPPING(sbi)->nrpages;
+		si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+	}
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -271,7 +289,8 @@
 		seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
 			si->sbi->sb->s_bdev, i++,
 			f2fs_readonly(si->sbi->sb) ? "RO": "RW",
-			f2fs_cp_error(si->sbi) ? "Error": "Good");
+			is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
+			"Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
 		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
 			   si->sit_area_segs, si->nat_area_segs);
 		seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -333,6 +352,13 @@
 			   si->prefree_count, si->free_segs, si->free_secs);
 		seq_printf(s, "CP calls: %d (BG: %d)\n",
 				si->cp_count, si->bg_cp_count);
+		seq_printf(s, "  - cp blocks : %u\n", si->meta_count[META_CP]);
+		seq_printf(s, "  - sit blocks : %u\n",
+				si->meta_count[META_SIT]);
+		seq_printf(s, "  - nat blocks : %u\n",
+				si->meta_count[META_NAT]);
+		seq_printf(s, "  - ssa blocks : %u\n",
+				si->meta_count[META_SSA]);
 		seq_printf(s, "GC calls: %d (BG: %d)\n",
 			   si->call_count, si->bg_gc);
 		seq_printf(s, "  - data segments : %d (%d)\n",
@@ -349,6 +375,8 @@
 				si->skipped_atomic_files[BG_GC] +
 				si->skipped_atomic_files[FG_GC],
 				si->skipped_atomic_files[BG_GC]);
+		seq_printf(s, "BG skip : IO: %u, Other: %u\n",
+				si->io_skip_bggc, si->other_skip_bggc);
 		seq_puts(s, "\nExtent Cache:\n");
 		seq_printf(s, "  - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
 				si->hit_largest, si->hit_cached,
@@ -360,7 +388,11 @@
 		seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
 				si->ext_tree, si->zombie_tree, si->ext_node);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
-		seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
+		seq_printf(s, "  - DIO (R: %4d, W: %4d)\n",
+			   si->nr_dio_read, si->nr_dio_write);
+		seq_printf(s, "  - IO_R (Data: %4d, Node: %4d, Meta: %4d)\n",
+			   si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
+		seq_printf(s, "  - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
 			"Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
 			   si->nr_wb_cp_data, si->nr_wb_data,
 			   si->nr_flushing, si->nr_flushed,
@@ -428,23 +460,13 @@
 	return 0;
 }
 
-static int stat_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, stat_show, inode->i_private);
-}
-
-static const struct file_operations stat_fops = {
-	.owner = THIS_MODULE,
-	.open = stat_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(stat);
 
 int f2fs_build_stats(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
 	struct f2fs_stat_info *si;
+	int i;
 
 	si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
 	if (!si)
@@ -470,6 +492,8 @@
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
 	atomic_set(&sbi->inplace_count, 0);
+	for (i = META_CP; i < META_MAX; i++)
+		atomic_set(&sbi->meta_count[i], 0);
 
 	atomic_set(&sbi->aw_cnt, 0);
 	atomic_set(&sbi->vw_cnt, 0);
@@ -491,33 +515,19 @@
 	list_del(&si->stat_list);
 	mutex_unlock(&f2fs_stat_mutex);
 
-	kfree(si);
+	kvfree(si);
 }
 
-int __init f2fs_create_root_stats(void)
+void __init f2fs_create_root_stats(void)
 {
-	struct dentry *file;
-
 	f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
-	if (!f2fs_debugfs_root)
-		return -ENOMEM;
 
-	file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
-			NULL, &stat_fops);
-	if (!file) {
-		debugfs_remove(f2fs_debugfs_root);
-		f2fs_debugfs_root = NULL;
-		return -ENOMEM;
-	}
-
-	return 0;
+	debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+			    &stat_fops);
 }
 
 void f2fs_destroy_root_stats(void)
 {
-	if (!f2fs_debugfs_root)
-		return;
-
 	debugfs_remove_recursive(f2fs_debugfs_root);
 	f2fs_debugfs_root = NULL;
 }
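DEFINE_SHOW_ATTRIBUTE(stat) regenerates from stat_show() exactly the open helper and file_operations boilerplate deleted above. Simplified from its definition in include/linux/seq_file.h, the macro expands roughly to:

#define DEFINE_SHOW_ATTRIBUTE(__name)					\
static int __name ## _open(struct inode *inode, struct file *file)	\
{									\
	return single_open(file, __name ## _show, inode->i_private);	\
}									\
									\
static const struct file_operations __name ## _fops = {		\
	.owner		= THIS_MODULE,					\
	.open		= __name ## _open,				\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
}

Only the _show() callback has to be written by hand; the generated stat_fops name is what the debugfs_create_file() call above passes in.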
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index ecc3a4e..4033778 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -1,16 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/dir.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/sched/signal.h>
+#include <linux/unicode.h>
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
@@ -84,7 +82,8 @@
 	return bidx;
 }
 
-static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+static struct f2fs_dir_entry *find_in_block(struct inode *dir,
+				struct page *dentry_page,
 				struct fscrypt_name *fname,
 				f2fs_hash_t namehash,
 				int *max_slots,
@@ -96,7 +95,7 @@
 
 	dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
 
-	make_dentry_ptr_block(NULL, &d, dentry_blk);
+	make_dentry_ptr_block(dir, &d, dentry_blk);
 	de = f2fs_find_target_dentry(fname, namehash, max_slots, &d);
 	if (de)
 		*res_page = dentry_page;
@@ -104,14 +103,116 @@
 	return de;
 }
 
+#ifdef CONFIG_UNICODE
+/*
+ * Test whether a case-insensitive directory entry matches the filename
+ * being searched for.
+ *
+ * Returns: 0 if the directory entry matches, a positive value if it
+ * does not match, or a negative value on error.
+ */
+int f2fs_ci_compare(const struct inode *parent, const struct qstr *name,
+				const struct qstr *entry, bool quick)
+{
+	const struct f2fs_sb_info *sbi = F2FS_SB(parent->i_sb);
+	const struct unicode_map *um = sbi->s_encoding;
+	int ret;
+
+	if (quick)
+		ret = utf8_strncasecmp_folded(um, name, entry);
+	else
+		ret = utf8_strncasecmp(um, name, entry);
+
+	if (ret < 0) {
+		/* Handle an invalid character sequence as either an error
+		 * or as an opaque byte sequence.
+		 */
+		if (f2fs_has_strict_mode(sbi))
+			return -EINVAL;
+
+		if (name->len != entry->len)
+			return 1;
+
+		return !!memcmp(name->name, entry->name, name->len);
+	}
+
+	return ret;
+}
+
+static void f2fs_fname_setup_ci_filename(struct inode *dir,
+					const struct qstr *iname,
+					struct fscrypt_str *cf_name)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+	if (!IS_CASEFOLDED(dir)) {
+		cf_name->name = NULL;
+		return;
+	}
+
+	cf_name->name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS);
+	if (!cf_name->name)
+		return;
+
+	cf_name->len = utf8_casefold(sbi->s_encoding,
+					iname, cf_name->name,
+					F2FS_NAME_LEN);
+	if ((int)cf_name->len <= 0) {
+		kvfree(cf_name->name);
+		cf_name->name = NULL;
+	}
+}
+#endif
+
+static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d,
+					struct f2fs_dir_entry *de,
+					struct fscrypt_name *fname,
+					struct fscrypt_str *cf_str,
+					unsigned long bit_pos,
+					f2fs_hash_t namehash)
+{
+#ifdef CONFIG_UNICODE
+	struct inode *parent = d->inode;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(parent);
+	struct qstr entry;
+#endif
+
+	if (de->hash_code != namehash)
+		return false;
+
+#ifdef CONFIG_UNICODE
+	entry.name = d->filename[bit_pos];
+	entry.len = de->name_len;
+
+	if (sbi->s_encoding && IS_CASEFOLDED(parent)) {
+		if (cf_str->name) {
+			struct qstr cf = {.name = cf_str->name,
+					  .len = cf_str->len};
+			return !f2fs_ci_compare(parent, &cf, &entry, true);
+		}
+		return !f2fs_ci_compare(parent, fname->usr_fname, &entry,
+					false);
+	}
+#endif
+	if (fscrypt_match_name(fname, d->filename[bit_pos],
+				le16_to_cpu(de->name_len)))
+		return true;
+	return false;
+}
+
 struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
 			f2fs_hash_t namehash, int *max_slots,
 			struct f2fs_dentry_ptr *d)
 {
 	struct f2fs_dir_entry *de;
+	struct fscrypt_str cf_str = { .name = NULL, .len = 0 };
 	unsigned long bit_pos = 0;
 	int max_len = 0;
 
+#ifdef CONFIG_UNICODE
+	f2fs_fname_setup_ci_filename(d->inode, fname->usr_fname, &cf_str);
+#endif
+
 	if (max_slots)
 		*max_slots = 0;
 	while (bit_pos < d->max) {
@@ -128,9 +229,7 @@
 			continue;
 		}
 
-		if (de->hash_code == namehash &&
-		    fscrypt_match_name(fname, d->filename[bit_pos],
-				       le16_to_cpu(de->name_len)))
+		if (f2fs_match_name(d, de, fname, &cf_str, bit_pos, namehash))
 			goto found;
 
 		if (max_slots && max_len > *max_slots)
@@ -144,6 +243,10 @@
 found:
 	if (max_slots && max_len > *max_slots)
 		*max_slots = max_len;
+
+#ifdef CONFIG_UNICODE
+	kvfree(cf_str.name);
+#endif
 	return de;
 }
 
@@ -160,7 +263,7 @@
 	struct f2fs_dir_entry *de = NULL;
 	bool room = false;
 	int max_slots;
-	f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname);
+	f2fs_hash_t namehash = f2fs_dentry_hash(dir, &name, fname);
 
 	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
@@ -182,8 +285,8 @@
 			}
 		}
 
-		de = find_in_block(dentry_page, fname, namehash, &max_slots,
-								res_page);
+		de = find_in_block(dir, dentry_page, fname, namehash,
+							&max_slots, res_page);
 		if (de)
 			break;
 
@@ -221,9 +324,8 @@
 
 	max_depth = F2FS_I(dir)->i_current_depth;
 	if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
-		f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
-				"Corrupted max_depth of %lu: %u",
-				dir->i_ino, max_depth);
+		f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u",
+			  dir->i_ino, max_depth);
 		max_depth = MAX_DIR_HASH_DEPTH;
 		f2fs_i_depth_write(dir, max_depth);
 	}
@@ -254,6 +356,14 @@
 	struct fscrypt_name fname;
 	int err;
 
+#ifdef CONFIG_UNICODE
+	if (f2fs_has_strict_mode(F2FS_I_SB(dir)) && IS_CASEFOLDED(dir) &&
+			utf8_validate(F2FS_I_SB(dir)->s_encoding, child)) {
+		*res_page = ERR_PTR(-EINVAL);
+		return NULL;
+	}
+#endif
+
 	err = fscrypt_setup_filename(dir, child, 1, &fname);
 	if (err) {
 		if (err == -ENOENT)
@@ -296,7 +406,7 @@
 {
 	enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
 	lock_page(page);
-	f2fs_wait_on_page_writeback(page, type, true);
+	f2fs_wait_on_page_writeback(page, type, true, true);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode->i_mode);
 	set_page_dirty(page);
@@ -310,7 +420,7 @@
 {
 	struct f2fs_inode *ri;
 
-	f2fs_wait_on_page_writeback(ipage, NODE, true);
+	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 
 	/* copy name info. to this inode page */
 	ri = F2FS_INODE(ipage);
@@ -388,7 +498,7 @@
 		if (err)
 			goto put_error;
 
-		if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+		if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
 					f2fs_may_encrypt(inode)) {
 			err = fscrypt_inherit_context(dir, inode, page, false);
 			if (err)
@@ -402,7 +512,7 @@
 
 	if (new_name) {
 		init_dent_inode(new_name, page);
-		if (f2fs_encrypted_inode(dir))
+		if (IS_ENCRYPTED(dir))
 			file_set_enc_name(inode);
 	}
 
@@ -508,7 +618,7 @@
 
 	level = 0;
 	slots = GET_DENTRY_SLOTS(new_name->len);
-	dentry_hash = f2fs_dentry_hash(new_name, NULL);
+	dentry_hash = f2fs_dentry_hash(dir, new_name, NULL);
 
 	current_depth = F2FS_I(dir)->i_current_depth;
 	if (F2FS_I(dir)->chash == dentry_hash) {
@@ -553,7 +663,7 @@
 	++level;
 	goto start;
 add_dentry:
-	f2fs_wait_on_page_writeback(dentry_page, DATA, true);
+	f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
 
 	if (inode) {
 		down_write(&F2FS_I(inode)->i_sem);
@@ -572,6 +682,11 @@
 
 	if (inode) {
 		f2fs_i_pino_write(inode, dir->i_ino);
+
+		/* synchronize inode page's data from inode cache */
+		if (is_inode_flag_set(inode, FI_NEW_INODE))
+			f2fs_update_inode(inode, page);
+
 		f2fs_put_page(page, 1);
 	}
 
@@ -658,9 +773,9 @@
 	f2fs_put_page(page, 1);
 
 	clear_inode_flag(inode, FI_NEW_INODE);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 fail:
 	up_write(&F2FS_I(inode)->i_sem);
-	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return err;
 }
 
@@ -708,7 +823,7 @@
 		return f2fs_delete_inline_entry(dentry, page, dir, inode);
 
 	lock_page(page);
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
 
 	dentry_blk = page_address(page);
 	bit_pos = dentry - dentry_blk->dentry;
@@ -729,10 +844,11 @@
 
 	if (bit_pos == NR_DENTRY_IN_BLOCK &&
 		!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
-		f2fs_clear_radix_tree_dirty_tag(page);
+		f2fs_clear_page_cache_dirty_tag(page);
 		clear_page_dirty_for_io(page);
-		ClearPagePrivate(page);
+		f2fs_clear_page_private(page);
 		ClearPageUptodate(page);
+		clear_cold_data(page);
 		inode_dec_dirty_pages(dir);
 		f2fs_remove_dirty_inode(dir);
 	}
@@ -784,9 +900,15 @@
 	struct f2fs_dir_entry *de = NULL;
 	struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
+	struct blk_plug plug;
+	bool readdir_ra = sbi->readdir_ra == 1;
+	int err = 0;
 
 	bit_pos = ((unsigned long)ctx->pos % d->max);
 
+	if (readdir_ra)
+		blk_start_plug(&plug);
+
 	while (bit_pos < d->max) {
 		bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
 		if (bit_pos >= d->max)
@@ -796,6 +918,10 @@
 		if (de->name_len == 0) {
 			bit_pos++;
 			ctx->pos = start_pos + bit_pos;
+			printk_ratelimited(
+				"%s, invalid namelen(0), ino:%u, run fsck to fix.",
+				KERN_WARNING, le32_to_cpu(de->ino));
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
 			continue;
 		}
 
@@ -804,31 +930,45 @@
 		de_name.name = d->filename[bit_pos];
 		de_name.len = le16_to_cpu(de->name_len);
 
-		if (f2fs_encrypted_inode(d->inode)) {
+		/* check memory boundary before moving forward */
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+		if (unlikely(bit_pos > d->max ||
+				le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) {
+			f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.",
+				  __func__, le16_to_cpu(de->name_len));
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+			err = -EFSCORRUPTED;
+			goto out;
+		}
+
+		if (IS_ENCRYPTED(d->inode)) {
 			int save_len = fstr->len;
-			int err;
 
 			err = fscrypt_fname_disk_to_usr(d->inode,
-						(u32)de->hash_code, 0,
-						&de_name, fstr);
+						(u32)le32_to_cpu(de->hash_code),
+						0, &de_name, fstr);
 			if (err)
-				return err;
+				goto out;
 
 			de_name = *fstr;
 			fstr->len = save_len;
 		}
 
 		if (!dir_emit(ctx, de_name.name, de_name.len,
-					le32_to_cpu(de->ino), d_type))
-			return 1;
+					le32_to_cpu(de->ino), d_type)) {
+			err = 1;
+			goto out;
+		}
 
-		if (sbi->readdir_ra == 1)
+		if (readdir_ra)
 			f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
 
-		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 		ctx->pos = start_pos + bit_pos;
 	}
-	return 0;
+out:
+	if (readdir_ra)
+		blk_finish_plug(&plug);
+	return err;
 }
 
 static int f2fs_readdir(struct file *file, struct dir_context *ctx)
@@ -844,7 +984,7 @@
 	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
 	int err = 0;
 
-	if (f2fs_encrypted_inode(inode)) {
+	if (IS_ENCRYPTED(inode)) {
 		err = fscrypt_get_encryption_info(inode);
 		if (err && err != -ENOKEY)
 			goto out;
@@ -873,7 +1013,7 @@
 			page_cache_sync_readahead(inode->i_mapping, ra, file, n,
 				min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
 
-		dentry_page = f2fs_get_lock_data_page(inode, n, false);
+		dentry_page = f2fs_find_data_page(inode, n);
 		if (IS_ERR(dentry_page)) {
 			err = PTR_ERR(dentry_page);
 			if (err == -ENOENT) {
@@ -891,11 +1031,11 @@
 		err = f2fs_fill_dentries(ctx, &d,
 				n * NR_DENTRY_IN_BLOCK, &fstr);
 		if (err) {
-			f2fs_put_page(dentry_page, 1);
+			f2fs_put_page(dentry_page, 0);
 			break;
 		}
 
-		f2fs_put_page(dentry_page, 1);
+		f2fs_put_page(dentry_page, 0);
 	}
 out_free:
 	fscrypt_fname_free_buffer(&fstr);
@@ -906,7 +1046,7 @@
 
 static int f2fs_dir_open(struct inode *inode, struct file *filp)
 {
-	if (f2fs_encrypted_inode(inode))
+	if (IS_ENCRYPTED(inode))
 		return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
 	return 0;
 }
@@ -922,3 +1062,50 @@
 	.compat_ioctl   = f2fs_compat_ioctl,
 #endif
 };
+
+#ifdef CONFIG_UNICODE
+static int f2fs_d_compare(const struct dentry *dentry, unsigned int len,
+			  const char *str, const struct qstr *name)
+{
+	struct qstr qstr = {.name = str, .len = len };
+
+	if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+		if (len != name->len)
+			return -1;
+		return memcmp(str, name->name, len);
+	}
+
+	return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
+}
+
+static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+	const struct unicode_map *um = sbi->s_encoding;
+	unsigned char *norm;
+	int len, ret = 0;
+
+	if (!IS_CASEFOLDED(dentry->d_inode))
+		return 0;
+
+	norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC);
+	if (!norm)
+		return -ENOMEM;
+
+	len = utf8_casefold(um, str, norm, PATH_MAX);
+	if (len < 0) {
+		if (f2fs_has_strict_mode(sbi))
+			ret = -EINVAL;
+		goto out;
+	}
+	str->hash = full_name_hash(dentry, norm, len);
+out:
+	kvfree(norm);
+	return ret;
+}
+
+const struct dentry_operations f2fs_dentry_ops = {
+	.d_hash = f2fs_d_hash,
+	.d_compare = f2fs_d_compare,
+};
+#endif
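For the casefolded dcache to behave, f2fs_d_hash() and f2fs_d_compare() must agree: the hash is computed over the casefolded name, and two names compare equal exactly when their casefolded forms match, so a lookup of "Foo" can hit a cached "foo". A tiny ASCII-only model of that invariant; the kernel uses utf8_casefold()/utf8_strncasecmp() from <linux/unicode.h>, and the hash below is purely illustrative:

#include <assert.h>
#include <ctype.h>
#include <stddef.h>

static unsigned long fold_hash(const char *s, size_t len)
{
	unsigned long h = 5381;	/* djb2 seed, illustrative only */

	while (len--)
		h = h * 33 + (unsigned long)tolower((unsigned char)*s++);
	return h;
}

/* 0 when the names are equal under case folding, 1 otherwise */
static int fold_cmp(const char *a, size_t la, const char *b, size_t lb)
{
	if (la != lb)
		return 1;
	while (la--)
		if (tolower((unsigned char)*a++) != tolower((unsigned char)*b++))
			return 1;
	return 0;
}

int main(void)
{
	/* names equal under folding must also hash identically */
	assert(fold_cmp("Foo", 3, "fOO", 3) == 0);
	assert(fold_hash("Foo", 3) == fold_hash("fOO", 3));
	return 0;
}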
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index a70cd25..e600784 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs extent cache support
  *
@@ -5,10 +6,6 @@
  * Copyright (c) 2015 Samsung Electronics
  * Authors: Jaegeuk Kim <jaegeuk@kernel.org>
  *          Chao Yu <chao2.yu@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -30,10 +27,10 @@
 	return NULL;
 }
 
-static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root,
 							unsigned int ofs)
 {
-	struct rb_node *node = root->rb_node;
+	struct rb_node *node = root->rb_root.rb_node;
 	struct rb_entry *re;
 
 	while (node) {
@@ -49,7 +46,7 @@
 	return NULL;
 }
 
-struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
 				struct rb_entry *cached_re, unsigned int ofs)
 {
 	struct rb_entry *re;
@@ -62,22 +59,25 @@
 }
 
 struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
-				struct rb_root *root, struct rb_node **parent,
-				unsigned int ofs)
+				struct rb_root_cached *root,
+				struct rb_node **parent,
+				unsigned int ofs, bool *leftmost)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_entry *re;
 
 	while (*p) {
 		*parent = *p;
 		re = rb_entry(*parent, struct rb_entry, rb_node);
 
-		if (ofs < re->ofs)
+		if (ofs < re->ofs) {
 			p = &(*p)->rb_left;
-		else if (ofs >= re->ofs + re->len)
+		} else if (ofs >= re->ofs + re->len) {
 			p = &(*p)->rb_right;
-		else
+			*leftmost = false;
+		} else {
 			f2fs_bug_on(sbi, 1);
+		}
 	}
 
 	return p;
@@ -92,16 +92,16 @@
  * in order to simpfy the insertion after.
  * tree must stay unchanged between lookup and insertion.
  */
-struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
 				struct rb_entry *cached_re,
 				unsigned int ofs,
 				struct rb_entry **prev_entry,
 				struct rb_entry **next_entry,
 				struct rb_node ***insert_p,
 				struct rb_node **insert_parent,
-				bool force)
+				bool force, bool *leftmost)
 {
-	struct rb_node **pnode = &root->rb_node;
+	struct rb_node **pnode = &root->rb_root.rb_node;
 	struct rb_node *parent = NULL, *tmp_node;
 	struct rb_entry *re = cached_re;
 
@@ -110,7 +110,7 @@
 	*prev_entry = NULL;
 	*next_entry = NULL;
 
-	if (RB_EMPTY_ROOT(root))
+	if (RB_EMPTY_ROOT(&root->rb_root))
 		return NULL;
 
 	if (re) {
@@ -118,16 +118,22 @@
 			goto lookup_neighbors;
 	}
 
+	if (leftmost)
+		*leftmost = true;
+
 	while (*pnode) {
 		parent = *pnode;
 		re = rb_entry(*pnode, struct rb_entry, rb_node);
 
-		if (ofs < re->ofs)
+		if (ofs < re->ofs) {
 			pnode = &(*pnode)->rb_left;
-		else if (ofs >= re->ofs + re->len)
+		} else if (ofs >= re->ofs + re->len) {
 			pnode = &(*pnode)->rb_right;
-		else
+			if (leftmost)
+				*leftmost = false;
+		} else {
 			goto lookup_neighbors;
+		}
 	}
 
 	*insert_p = pnode;
@@ -160,10 +166,10 @@
 }
 
 bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
-						struct rb_root *root)
+						struct rb_root_cached *root)
 {
 #ifdef CONFIG_F2FS_CHECK_FS
-	struct rb_node *cur = rb_first(root), *next;
+	struct rb_node *cur = rb_first_cached(root), *next;
 	struct rb_entry *cur_re, *next_re;
 
 	if (!cur)
@@ -178,10 +184,9 @@
 		next_re = rb_entry(next, struct rb_entry, rb_node);
 
 		if (cur_re->ofs + cur_re->len > next_re->ofs) {
-			f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
-				"cur(%u, %u) next(%u, %u)",
-				cur_re->ofs, cur_re->len,
-				next_re->ofs, next_re->len);
+			f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)",
+				  cur_re->ofs, cur_re->len,
+				  next_re->ofs, next_re->len);
 			return false;
 		}
 
@@ -196,7 +201,8 @@
 
 static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_info *ei,
-				struct rb_node *parent, struct rb_node **p)
+				struct rb_node *parent, struct rb_node **p,
+				bool leftmost)
 {
 	struct extent_node *en;
 
@@ -209,7 +215,7 @@
 	en->et = et;
 
 	rb_link_node(&en->rb_node, parent, p);
-	rb_insert_color(&en->rb_node, &et->root);
+	rb_insert_color_cached(&en->rb_node, &et->root, leftmost);
 	atomic_inc(&et->node_cnt);
 	atomic_inc(&sbi->total_ext_node);
 	return en;
@@ -218,7 +224,7 @@
 static void __detach_extent_node(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_node *en)
 {
-	rb_erase(&en->rb_node, &et->root);
+	rb_erase_cached(&en->rb_node, &et->root);
 	atomic_dec(&et->node_cnt);
 	atomic_dec(&sbi->total_ext_node);
 
@@ -257,7 +263,7 @@
 		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
 		memset(et, 0, sizeof(struct extent_tree));
 		et->ino = ino;
-		et->root = RB_ROOT;
+		et->root = RB_ROOT_CACHED;
 		et->cached_en = NULL;
 		rwlock_init(&et->lock);
 		INIT_LIST_HEAD(&et->list);
@@ -278,10 +284,10 @@
 static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_info *ei)
 {
-	struct rb_node **p = &et->root.rb_node;
+	struct rb_node **p = &et->root.rb_root.rb_node;
 	struct extent_node *en;
 
-	en = __attach_extent_node(sbi, et, ei, NULL, p);
+	en = __attach_extent_node(sbi, et, ei, NULL, p, true);
 	if (!en)
 		return NULL;
 
@@ -297,7 +303,7 @@
 	struct extent_node *en;
 	unsigned int count = atomic_read(&et->node_cnt);
 
-	node = rb_first(&et->root);
+	node = rb_first_cached(&et->root);
 	while (node) {
 		next = rb_next(node);
 		en = rb_entry(node, struct extent_node, rb_node);
@@ -455,7 +461,8 @@
 static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_info *ei,
 				struct rb_node **insert_p,
-				struct rb_node *insert_parent)
+				struct rb_node *insert_parent,
+				bool leftmost)
 {
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
@@ -467,9 +474,12 @@
 		goto do_insert;
 	}
 
-	p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
+	leftmost = true;
+
+	p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent,
+						ei->fofs, &leftmost);
 do_insert:
-	en = __attach_extent_node(sbi, et, ei, parent, p);
+	en = __attach_extent_node(sbi, et, ei, parent, p, leftmost);
 	if (!en)
 		return NULL;
 
@@ -495,6 +505,7 @@
 	unsigned int end = fofs + len;
 	unsigned int pos = (unsigned int)fofs;
 	bool updated = false;
+	bool leftmost = false;
 
 	if (!et)
 		return;
@@ -522,7 +533,8 @@
 					(struct rb_entry *)et->cached_en, fofs,
 					(struct rb_entry **)&prev_en,
 					(struct rb_entry **)&next_en,
-					&insert_p, &insert_parent, false);
+					&insert_p, &insert_parent, false,
+					&leftmost);
 	if (!en)
 		en = next_en;
 
@@ -549,7 +561,7 @@
 						end - dei.fofs + dei.blk,
 						org_end - end);
 				en1 = __insert_extent_tree(sbi, et, &ei,
-							NULL, NULL);
+							NULL, NULL, true);
 				next_en = en1;
 			} else {
 				en->ei.fofs = end;
@@ -590,7 +602,7 @@
 		set_extent_info(&ei, fofs, blkaddr, len);
 		if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
 			__insert_extent_tree(sbi, et, &ei,
-						insert_p, insert_parent);
+					insert_p, insert_parent, leftmost);
 
 		/* give up extent_cache, if split and small updates happen */
 		if (dei.len >= 1 &&
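The extent tree now sits in a struct rb_root_cached, which keeps an O(1) pointer to the leftmost node; the price is that every insertion must report whether the new node stayed on the leftmost spine, which is what the added leftmost flags track. A sketch of the canonical insert shape with a hypothetical node type (rb_link_node(), rb_insert_color_cached() and rb_first_cached() are the real <linux/rbtree.h> API):

#include <linux/rbtree.h>

/* hypothetical keyed node, for illustration only */
struct sample_node {
	struct rb_node rb;
	unsigned int key;
};

static void sample_insert(struct rb_root_cached *root, struct sample_node *new)
{
	struct rb_node **p = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*p) {
		struct sample_node *cur = rb_entry(*p, struct sample_node, rb);

		parent = *p;
		if (new->key < cur->key) {
			p = &(*p)->rb_left;
		} else {
			p = &(*p)->rb_right;
			leftmost = false;	/* descended right at least once */
		}
	}
	rb_link_node(&new->rb, parent, p);
	rb_insert_color_cached(&new->rb, root, leftmost);
	/* rb_first_cached(root) is now O(1), no tree walk needed */
}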
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ecb7351..4024790 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1,16 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/f2fs.h
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef _LINUX_F2FS_H
 #define _LINUX_F2FS_H
 
+#include <linux/uio.h>
 #include <linux/types.h>
 #include <linux/page-flags.h>
 #include <linux/buffer_head.h>
@@ -26,8 +24,8 @@
 #include <linux/quotaops.h>
 #include <crypto/hash.h>
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
@@ -53,9 +51,10 @@
 	FAULT_DIR_DEPTH,
 	FAULT_EVICT_INODE,
 	FAULT_TRUNCATE,
-	FAULT_IO,
+	FAULT_READ_IO,
 	FAULT_CHECKPOINT,
 	FAULT_DISCARD,
+	FAULT_WRITE_IO,
 	FAULT_MAX,
 };
 
@@ -68,7 +67,7 @@
 	unsigned int inject_type;
 };
 
-extern char *f2fs_fault_name[FAULT_MAX];
+extern const char *f2fs_fault_name[FAULT_MAX];
 #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
 #endif
 
@@ -100,6 +99,7 @@
 #define F2FS_MOUNT_QUOTA		0x00400000
 #define F2FS_MOUNT_INLINE_XATTR_SIZE	0x00800000
 #define F2FS_MOUNT_RESERVE_ROOT		0x01000000
+#define F2FS_MOUNT_DISABLE_CHECKPOINT	0x02000000
 
 #define F2FS_OPTION(sbi)	((sbi)->mount_opt)
 #define clear_opt(sbi, option)	(F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -137,6 +137,9 @@
 	int alloc_mode;			/* segment allocation policy */
 	int fsync_mode;			/* fsync policy */
 	bool test_dummy_encryption;	/* test dummy encryption */
+	block_t unusable_cap;		/* Amount of space allowed to be
+					 * unusable when disabling checkpoint
+					 */
 };
 
 #define F2FS_FEATURE_ENCRYPT		0x0001
@@ -149,14 +152,17 @@
 #define F2FS_FEATURE_QUOTA_INO		0x0080
 #define F2FS_FEATURE_INODE_CRTIME	0x0100
 #define F2FS_FEATURE_LOST_FOUND		0x0200
-#define F2FS_FEATURE_VERITY		0x0400	/* reserved */
+#define F2FS_FEATURE_VERITY		0x0400
+#define F2FS_FEATURE_SB_CHKSUM		0x0800
+#define F2FS_FEATURE_CASEFOLD		0x1000
 
-#define F2FS_HAS_FEATURE(sb, mask)					\
-	((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
-#define F2FS_SET_FEATURE(sb, mask)					\
-	(F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask))
-#define F2FS_CLEAR_FEATURE(sb, mask)					\
-	(F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask))
+#define __F2FS_HAS_FEATURE(raw_super, mask)				\
+	((raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_HAS_FEATURE(sbi, mask)	__F2FS_HAS_FEATURE(sbi->raw_super, mask)
+#define F2FS_SET_FEATURE(sbi, mask)					\
+	(sbi->raw_super->feature |= cpu_to_le32(mask))
+#define F2FS_CLEAR_FEATURE(sbi, mask)					\
+	(sbi->raw_super->feature &= ~cpu_to_le32(mask))
 
 /*
  * Default values for user and/or group using reserved blocks
@@ -178,6 +184,7 @@
 #define	CP_RECOVERY	0x00000008
 #define	CP_DISCARD	0x00000010
 #define CP_TRIMMED	0x00000020
+#define CP_PAUSE	0x00000040
 
 #define MAX_DISCARD_BLOCKS(sbi)		BLKS_PER_SEC(sbi)
 #define DEF_MAX_DISCARD_REQUEST		8	/* issue 8 discards per round */
@@ -187,6 +194,9 @@
 #define DEF_DISCARD_URGENT_UTIL		80	/* do more discard over 80% */
 #define DEF_CP_INTERVAL			60	/* 60 secs */
 #define DEF_IDLE_INTERVAL		5	/* 5 secs */
+#define DEF_DISABLE_INTERVAL		5	/* 5 secs */
+#define DEF_DISABLE_QUICK_INTERVAL	1	/* 1 secs */
+#define DEF_UMOUNT_DISCARD_TIMEOUT	5	/* 5 secs */
 
 struct cp_control {
 	int reason;
@@ -203,8 +213,16 @@
 	META_NAT,
 	META_SIT,
 	META_SSA,
+	META_MAX,
 	META_POR,
-	DATA_GENERIC,
+	DATA_GENERIC,		/* check range only */
+	DATA_GENERIC_ENHANCE,	/* strong check on range and segment bitmap */
+	DATA_GENERIC_ENHANCE_READ,	/*
+					 * strong check on range and segment
+					 * bitmap but no warning due to race
+					 * condition of read on truncated area
+					 * by extent_cache
+					 */
 	META_GENERIC,
 };
 
@@ -249,7 +267,7 @@
 /* max discard pend list number */
 #define MAX_PLIST_NUM		512
 #define plist_idx(blk_num)	((blk_num) >= MAX_PLIST_NUM ?		\
-					(MAX_PLIST_NUM - 1) : (blk_num - 1))
+					(MAX_PLIST_NUM - 1) : ((blk_num) - 1))
 
 enum {
 	D_PREP,			/* initial */
@@ -280,7 +298,7 @@
 	struct block_device *bdev;	/* bdev */
 	unsigned short ref;		/* reference count */
 	unsigned char state;		/* state */
-	unsigned char issuing;		/* issuing discard */
+	unsigned char queued;		/* queued discard */
 	int error;			/* bio error */
 	spinlock_t lock;		/* for state/bio_ref updating */
 	unsigned short bio_ref;		/* bio reference count */
@@ -305,6 +323,7 @@
 	bool sync;			/* submit discard with REQ_SYNC flag */
 	bool ordered;			/* issue discard by lba order */
 	unsigned int granularity;	/* discard granularity */
+	int timeout;			/* discard timeout for put_super */
 };
 
 struct discard_cmd_control {
@@ -322,9 +341,9 @@
 	unsigned int undiscard_blks;		/* # of undiscard blocks */
 	unsigned int next_pos;			/* next discard position */
 	atomic_t issued_discard;		/* # of issued discard */
-	atomic_t issing_discard;		/* # of issing discard */
+	atomic_t queued_discard;		/* # of queued discard */
 	atomic_t discard_cmd_cnt;		/* # of cached cmd count */
-	struct rb_root root;			/* root of discard rb-tree */
+	struct rb_root_cached root;		/* root of discard rb-tree */
 	bool rbtree_check;			/* config for consistence check */
 };
 
@@ -398,6 +417,10 @@
 #define F2FS_IOC_SET_PIN_FILE		_IOW(F2FS_IOCTL_MAGIC, 13, __u32)
 #define F2FS_IOC_GET_PIN_FILE		_IOR(F2FS_IOCTL_MAGIC, 14, __u32)
 #define F2FS_IOC_PRECACHE_EXTENTS	_IO(F2FS_IOCTL_MAGIC, 15)
+#define F2FS_IOC_RESIZE_FS		_IOW(F2FS_IOCTL_MAGIC, 16, __u64)
+
+#define F2FS_IOC_GET_VOLUME_NAME	FS_IOC_GETFSLABEL
+#define F2FS_IOC_SET_VOLUME_NAME	FS_IOC_SETFSLABEL
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY	FS_IOC_SET_ENCRYPTION_POLICY
 #define F2FS_IOC_GET_ENCRYPTION_POLICY	FS_IOC_GET_ENCRYPTION_POLICY
@@ -412,6 +435,7 @@
 #define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
 #define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
 #define F2FS_GOING_DOWN_METAFLUSH	0x3	/* going down with meta flush */
+#define F2FS_GOING_DOWN_NEED_FSCK	0x4	/* going down to trigger fsck */
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -450,7 +474,6 @@
 
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE	1
-#define DEF_MIN_INLINE_SIZE		1
 static inline int get_extra_isize(struct inode *inode);
 static inline int get_inline_xattr_addrs(struct inode *inode);
 #define MAX_INLINE_DATA(inode)	(sizeof(__le32) *			\
@@ -462,8 +485,8 @@
 #define NR_INLINE_DENTRY(inode)	(MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \
 				((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
 				BITS_PER_BYTE + 1))
-#define INLINE_DENTRY_BITMAP_SIZE(inode)	((NR_INLINE_DENTRY(inode) + \
-					BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+#define INLINE_DENTRY_BITMAP_SIZE(inode) \
+	DIV_ROUND_UP(NR_INLINE_DENTRY(inode), BITS_PER_BYTE)
 #define INLINE_RESERVED_SIZE(inode)	(MAX_INLINE_DATA(inode) - \
 				((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
 				NR_INLINE_DENTRY(inode) + \
@@ -527,6 +550,9 @@
 
 #define DEFAULT_RETRY_IO_COUNT	8	/* maximum retry read IO count */
 
+/* maximum retry quota flush count */
+#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT		8
+
 #define F2FS_LINK_MAX	0xffffffff	/* maximum link count per file */
 
 #define MAX_DIR_RA_PAGES	4	/* maximum ra pages of dir */
@@ -550,23 +576,15 @@
 };
 
 struct extent_node {
-	struct rb_node rb_node;
-	union {
-		struct {
-			unsigned int fofs;
-			unsigned int len;
-			u32 blk;
-		};
-		struct extent_info ei;	/* extent info */
-
-	};
+	struct rb_node rb_node;		/* rb node located in rb-tree */
+	struct extent_info ei;		/* extent info */
 	struct list_head list;		/* node in global extent list of sbi */
 	struct extent_tree *et;		/* extent tree pointer */
 };
 
 struct extent_tree {
 	nid_t ino;			/* inode number */
-	struct rb_root root;		/* root of extent info rb-tree */
+	struct rb_root_cached root;	/* root of extent info rb-tree */
 	struct extent_node *cached_en;	/* recently accessed extent node */
 	struct extent_info largest;	/* largest extent info */
 	struct list_head list;		/* to be used by sbi->zombie_list */
@@ -594,6 +612,7 @@
 	pgoff_t *m_next_pgofs;		/* point next possible non-hole pgofs */
 	pgoff_t *m_next_extent;		/* point to next possible extent */
 	int m_seg_type;
+	bool m_may_create;		/* indicate it is from write path */
 };
 
 /* for flag in get_data_block */
@@ -601,6 +620,7 @@
 	F2FS_GET_BLOCK_DEFAULT,
 	F2FS_GET_BLOCK_FIEMAP,
 	F2FS_GET_BLOCK_BMAP,
+	F2FS_GET_BLOCK_DIO,
 	F2FS_GET_BLOCK_PRE_DIO,
 	F2FS_GET_BLOCK_PRE_AIO,
 	F2FS_GET_BLOCK_PRECACHE,
@@ -615,7 +635,7 @@
 #define FADVISE_ENC_NAME_BIT	0x08
 #define FADVISE_KEEP_SIZE_BIT	0x10
 #define FADVISE_HOT_BIT		0x20
-#define FADVISE_VERITY_BIT	0x40	/* reserved */
+#define FADVISE_VERITY_BIT	0x40
 
 #define FADVISE_MODIFIABLE_BITS	(FADVISE_COLD_BIT | FADVISE_HOT_BIT)
 
@@ -635,6 +655,8 @@
 #define file_is_hot(inode)	is_file(inode, FADVISE_HOT_BIT)
 #define file_set_hot(inode)	set_file(inode, FADVISE_HOT_BIT)
 #define file_clear_hot(inode)	clear_file(inode, FADVISE_HOT_BIT)
+#define file_is_verity(inode)	is_file(inode, FADVISE_VERITY_BIT)
+#define file_set_verity(inode)	set_file(inode, FADVISE_VERITY_BIT)
 
 #define DEF_DIR_LEVEL		0
 
@@ -881,7 +903,7 @@
 	struct task_struct *f2fs_issue_flush;	/* flush thread */
 	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
 	atomic_t issued_flush;			/* # of issued flushes */
-	atomic_t issing_flush;			/* # of issing flushes */
+	atomic_t queued_flush;			/* # of queued flushes */
 	struct llist_head issue_list;		/* list for command issue */
 	struct llist_node *dispatch_list;	/* list for command dispatch */
 };
@@ -945,6 +967,11 @@
 	F2FS_DIRTY_IMETA,
 	F2FS_WB_CP_DATA,
 	F2FS_WB_DATA,
+	F2FS_RD_DATA,
+	F2FS_RD_NODE,
+	F2FS_RD_META,
+	F2FS_DIO_WRITE,
+	F2FS_DIO_READ,
 	NR_COUNT_TYPE,
 };
 
@@ -1032,10 +1059,12 @@
 	bool submitted;		/* indicate IO submission */
 	int need_lock;		/* indicate we need to lock cp_rwsem */
 	bool in_list;		/* indicate fio is in io_list */
-	bool is_meta;		/* indicate borrow meta inode mapping or not */
+	bool is_por;		/* indicate IO is from recovery or not */
 	bool retry;		/* need to reallocate block address */
 	enum iostat_type io_type;	/* io type */
 	struct writeback_control *io_wbc; /* writeback control */
+	struct bio **bio;		/* bio for ipu */
+	sector_t *last_block;		/* last block number in bio */
 	unsigned char version;		/* version of the node */
 };
 
@@ -1059,8 +1088,8 @@
 	block_t start_blk;
 	block_t end_blk;
 #ifdef CONFIG_BLK_DEV_ZONED
-	unsigned int nr_blkz;			/* Total number of zones */
-	u8 *blkz_type;				/* Array of zones type */
+	unsigned int nr_blkz;		/* Total number of zones */
+	unsigned long *blkz_seq;	/* Bitmap indicating sequential zones */
 #endif
 };
 
@@ -1090,11 +1119,21 @@
 	SBI_NEED_CP,				/* need to checkpoint */
 	SBI_IS_SHUTDOWN,			/* shutdown by ioctl */
 	SBI_IS_RECOVERED,			/* recovered orphan/data */
+	SBI_CP_DISABLED,			/* CP was disabled last mount */
+	SBI_CP_DISABLED_QUICK,			/* CP was disabled quickly */
+	SBI_QUOTA_NEED_FLUSH,			/* need to flush quota info in CP */
+	SBI_QUOTA_SKIP_FLUSH,			/* skip flushing quota in current CP */
+	SBI_QUOTA_NEED_REPAIR,			/* quota file may be corrupted */
+	SBI_IS_RESIZEFS,			/* resizefs is in process */
 };
 
 enum {
 	CP_TIME,
 	REQ_TIME,
+	DISCARD_TIME,
+	GC_TIME,
+	DISABLE_TIME,
+	UMOUNT_DISCARD_TIMEOUT,
 	MAX_TIME,
 };
 
@@ -1122,7 +1161,7 @@
 	FSYNC_MODE_NOBARRIER,	/* fsync behaves nobarrier based on posix */
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define DUMMY_ENCRYPTION_ENABLED(sbi) \
 			(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
 #else
@@ -1137,6 +1176,10 @@
 	int valid_super_block;			/* valid super block no */
 	unsigned long s_flag;				/* flags for sbi */
 	struct mutex writepages;		/* mutex for writepages() */
+#ifdef CONFIG_UNICODE
+	struct unicode_map *s_encoding;
+	__u16 s_encoding_flags;
+#endif
 
 #ifdef CONFIG_BLK_DEV_ZONED
 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
@@ -1152,8 +1195,6 @@
 
 	/* for bio operations */
 	struct f2fs_bio_info *write_io[NR_PAGE_TYPE];	/* for write bios */
-	struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE];
-						/* bio ordering for NODE/DATA */
 	/* keep migration IO order for LFS mode */
 	struct rw_semaphore io_order_lock;
 	mempool_t *write_io_dummy;		/* Dummy pages */
@@ -1184,6 +1225,7 @@
 	/* for inode management */
 	struct list_head inode_list[NR_INODE_TYPE];	/* dirty inode list */
 	spinlock_t inode_lock[NR_INODE_TYPE];	/* for dirty inode list lock */
+	struct mutex flush_lock;		/* for flush exclusion */
 
 	/* for extent tree cache */
 	struct radix_tree_root extent_tree_root;/* cache extent cache entries */
@@ -1207,11 +1249,11 @@
 	unsigned int segs_per_sec;		/* segments per section */
 	unsigned int secs_per_zone;		/* sections per zone */
 	unsigned int total_sections;		/* total section count */
+	struct mutex resize_mutex;		/* for resize exclusion */
 	unsigned int total_node_count;		/* total node block count */
 	unsigned int total_valid_node_count;	/* valid node block count */
 	loff_t max_file_blocks;			/* max block index of file */
 	int dir_level;				/* directory level */
-	unsigned int trigger_ssr_threshold;	/* threshold to trigger ssr */
 	int readdir_ra;				/* readahead inode in readdir */
 
 	block_t user_block_count;		/* # of user blocks */
@@ -1221,9 +1263,11 @@
 	block_t reserved_blocks;		/* configurable reserved blocks */
 	block_t current_reserved_blocks;	/* current reserved blocks */
 
-	unsigned int nquota_files;		/* # of quota sysfile */
+	/* Additional tracking for no checkpoint mode */
+	block_t unusable_block_count;		/* # of blocks saved by last cp */
 
-	u32 s_next_generation;			/* for NFS support */
+	unsigned int nquota_files;		/* # of quota sysfile */
+	struct rw_semaphore quota_sem;		/* blocking cp for flags */
 
 	/* # of pages, see count_type */
 	atomic_t nr_pages[NR_COUNT_TYPE];
@@ -1243,6 +1287,7 @@
 	struct f2fs_gc_kthread	*gc_thread;	/* GC thread */
 	unsigned int cur_victim_sec;		/* current victim section num */
 	unsigned int gc_mode;			/* current GC state */
+	unsigned int next_victim_seg[2];	/* next segment in victim section */
 	/* for skip statistic */
 	unsigned long long skipped_atomic_files[2];	/* FG_GC and BG_GC */
 	unsigned long long skipped_gc_rwsem;		/* FG_GC only */
@@ -1252,6 +1297,8 @@
 
 	/* maximum # of trials to find a victim segment for SSR and GC */
 	unsigned int max_victim_search;
+	/* migration granularity of garbage collection, unit: segment */
+	unsigned int migration_granularity;
 
 	/*
 	 * for stat information.
@@ -1259,6 +1306,7 @@
 	 */
 #ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_stat_info *stat_info;	/* FS status information */
+	atomic_t meta_count[META_MAX];		/* # of meta blocks */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
 	atomic_t inplace_count;		/* # of inplace update */
@@ -1274,6 +1322,8 @@
 	atomic_t max_aw_cnt;			/* max # of atomic writes */
 	atomic_t max_vw_cnt;			/* max # of volatile writes */
 	int bg_gc;				/* background gc calls */
+	unsigned int io_skip_bggc;		/* skip background gc for in-flight IO */
+	unsigned int other_skip_bggc;		/* skip background gc for other reasons */
 	unsigned int ndirty_inode[NR_INODE_TYPE];	/* # of dirty inodes */
 #endif
 	spinlock_t stat_lock;			/* lock for stat operations */
@@ -1307,10 +1357,17 @@
 	__u32 s_chksum_seed;
 };
 
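+/*
+ * Per-bio bookkeeping for f2fs direct I/O: remembers the inode under I/O
+ * and the bio's original ->bi_private / ->bi_end_io so they can be
+ * restored when the I/O completes.
+ */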
+struct f2fs_private_dio {
+	struct inode *inode;
+	void *orig_private;
+	bio_end_io_t *orig_end_io;
+	bool write;
+};
+
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type)				\
-	printk("%sF2FS-fs : inject %s in %s of %pF\n",		\
-		KERN_INFO, f2fs_fault_name[type],		\
+#define f2fs_show_injection_info(type)					\
+	printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n",	\
+		KERN_INFO, f2fs_fault_name[type],			\
 		__func__, __builtin_return_address(0))
 static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
 {
@@ -1337,6 +1394,17 @@
 }
 #endif
 
+/*
+ * Test if the mounted volume is a multi-device volume.
+ *   - For a single regular disk volume, sbi->s_ndevs is 0.
+ *   - For a single zoned disk volume, sbi->s_ndevs is 1.
+ *   - For a multi-device volume, sbi->s_ndevs is always 2 or more.
+ */
+static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi)
+{
+	return sbi->s_ndevs > 1;
+}
+
 /* For write statistics. Suppose sector size is 512 bytes,
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
@@ -1346,7 +1414,15 @@
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
 {
-	sbi->last_time[type] = jiffies;
+	unsigned long now = jiffies;
+
+	sbi->last_time[type] = now;
+
+	/* DISCARD_TIME and GC_TIME are based on REQ_TIME */
+	if (type == REQ_TIME) {
+		sbi->last_time[DISCARD_TIME] = now;
+		sbi->last_time[GC_TIME] = now;
+	}
 }
 
 static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
@@ -1356,16 +1432,18 @@
 	return time_after(jiffies, sbi->last_time[type] + interval);
 }
 
-static inline bool is_idle(struct f2fs_sb_info *sbi)
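+/*
+ * Return how many milliseconds remain before the given interval timer
+ * expires, or 0 if it has already elapsed.
+ */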
+static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi,
+						int type)
 {
-	struct block_device *bdev = sbi->sb->s_bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
-	struct request_list *rl = &q->root_rl;
+	unsigned long interval = sbi->interval_time[type] * HZ;
+	unsigned int wait_ms = 0;
+	long delta;
 
-	if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
-		return false;
+	delta = (sbi->last_time[type] + interval) - jiffies;
+	if (delta > 0)
+		wait_ms = jiffies_to_msecs(delta);
 
-	return f2fs_time_over(sbi, REQ_TIME);
+	return wait_ms;
 }
 
 /*
@@ -1383,7 +1461,6 @@
 	BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
 
 	desc.shash.tfm = sbi->s_chksum_driver;
-	desc.shash.flags = 0;
 	*(u32 *)desc.ctx = crc;
 
 	err = crypto_shash_update(&desc.shash, address, length);
@@ -1432,7 +1509,7 @@
 
 static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
 {
-	return F2FS_M_SB(page->mapping);
+	return F2FS_M_SB(page_file_mapping(page));
 }
 
 static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
@@ -1574,16 +1651,23 @@
 static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
 {
 	unsigned long flags;
+	unsigned char *nat_bits;
 
-	set_sbi_flag(sbi, SBI_NEED_FSCK);
+	/*
+	 * Re-enabling nat_bits would require forcing fsck.f2fs via
+	 * set_sbi_flag(sbi, SBI_NEED_FSCK), but that is costly, so rely
+	 * on a regular fsck or an unclean shutdown instead.
+	 */
 
 	if (lock)
 		spin_lock_irqsave(&sbi->cp_lock, flags);
 	__clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
-	kfree(NM_I(sbi)->nat_bits);
+	nat_bits = NM_I(sbi)->nat_bits;
 	NM_I(sbi)->nat_bits = NULL;
 	if (lock)
 		spin_unlock_irqrestore(&sbi->cp_lock, flags);
+
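+	/*
+	 * kvfree() may sleep when the buffer was vmalloc'd, so free the
+	 * nat_bits only after cp_lock has been dropped.
+	 */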
+	kvfree(nat_bits);
 }
 
 static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
@@ -1690,7 +1774,7 @@
 	if (time_to_inject(sbi, FAULT_BLOCK)) {
 		f2fs_show_injection_info(FAULT_BLOCK);
 		release = *count;
-		goto enospc;
+		goto release_quota;
 	}
 
 	/*
@@ -1706,7 +1790,12 @@
 
 	if (!__allow_reserved_blocks(sbi, inode, true))
 		avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
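+	/*
+	 * While checkpointing is disabled, blocks freed since the last
+	 * checkpoint cannot be reused yet, so exclude them from the
+	 * available space.
+	 */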
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		if (avail_user_block_count > sbi->unusable_block_count)
+			avail_user_block_count -= sbi->unusable_block_count;
+		else
+			avail_user_block_count = 0;
+	}
 	if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
 		diff = sbi->total_valid_block_count - avail_user_block_count;
 		if (diff > *count)
@@ -1730,10 +1819,25 @@
 
 enospc:
 	percpu_counter_sub(&sbi->alloc_valid_block_count, release);
+release_quota:
 	dquot_release_reservation_block(inode, release);
 	return -ENOSPC;
 }
 
+__printf(2, 3)
+void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...);
+
+#define f2fs_err(sbi, fmt, ...)						\
+	f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn(sbi, fmt, ...)					\
+	f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_notice(sbi, fmt, ...)					\
+	f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__)
+#define f2fs_info(sbi, fmt, ...)					\
+	f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__)
+#define f2fs_debug(sbi, fmt, ...)					\
+	f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__)
+
 static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 						struct inode *inode,
 						block_t count)
@@ -1742,13 +1846,20 @@
 
 	spin_lock(&sbi->stat_lock);
 	f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
-	f2fs_bug_on(sbi, inode->i_blocks < sectors);
 	sbi->total_valid_block_count -= (block_t)count;
 	if (sbi->reserved_blocks &&
 		sbi->current_reserved_blocks < sbi->reserved_blocks)
 		sbi->current_reserved_blocks = min(sbi->reserved_blocks,
 					sbi->current_reserved_blocks + count);
 	spin_unlock(&sbi->stat_lock);
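+	/*
+	 * An i_blocks underflow means the on-disk metadata is inconsistent;
+	 * warn and request fsck instead of tripping a kernel bug.
+	 */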
+	if (unlikely(inode->i_blocks < sectors)) {
+		f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu",
+			  inode->i_ino,
+			  (unsigned long long)inode->i_blocks,
+			  (unsigned long long)sectors);
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		return;
+	}
 	f2fs_i_blocks_write(inode, count, false, true);
 }
 
@@ -1756,11 +1867,12 @@
 {
 	atomic_inc(&sbi->nr_pages[count_type]);
 
-	if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
-		count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
-		return;
-
-	set_sbi_flag(sbi, SBI_IS_DIRTY);
+	if (count_type == F2FS_DIRTY_DENTS ||
+			count_type == F2FS_DIRTY_NODES ||
+			count_type == F2FS_DIRTY_META ||
+			count_type == F2FS_DIRTY_QDATA ||
+			count_type == F2FS_DIRTY_IMETA)
+		set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1845,7 +1957,11 @@
 	if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
 		offset = (flag == SIT_BITMAP) ?
 			le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0;
-		return &ckpt->sit_nat_version_bitmap + offset;
+		/*
+		 * With the large_nat_bitmap feature, checksum protection
+		 * covers all nat/sit bitmaps, so step over the leading
+		 * checksum word here.
+		 */
+		return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
 	}
 
 	if (__cp_payload(sbi) > 0) {
@@ -1892,13 +2008,19 @@
 					struct inode *inode, bool is_inode)
 {
 	block_t	valid_block_count;
-	unsigned int valid_node_count;
-	bool quota = inode && !is_inode;
+	unsigned int valid_node_count, user_block_count;
+	int err;
 
-	if (quota) {
-		int ret = dquot_reserve_block(inode, 1);
-		if (ret)
-			return ret;
+	if (is_inode) {
+		if (inode) {
+			err = dquot_alloc_inode(inode);
+			if (err)
+				return err;
+		}
+	} else {
+		err = dquot_reserve_block(inode, 1);
+		if (err)
+			return err;
 	}
 
 	if (time_to_inject(sbi, FAULT_BLOCK)) {
@@ -1913,8 +2035,11 @@
 
 	if (!__allow_reserved_blocks(sbi, inode, false))
 		valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+	user_block_count = sbi->user_block_count;
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		user_block_count -= sbi->unusable_block_count;
 
-	if (unlikely(valid_block_count > sbi->user_block_count)) {
+	if (unlikely(valid_block_count > user_block_count)) {
 		spin_unlock(&sbi->stat_lock);
 		goto enospc;
 	}
@@ -1940,8 +2065,12 @@
 	return 0;
 
 enospc:
-	if (quota)
+	if (is_inode) {
+		if (inode)
+			dquot_free_inode(inode);
+	} else {
 		dquot_release_reservation_block(inode, 1);
+	}
 	return -ENOSPC;
 }
 
@@ -1952,7 +2081,6 @@
 
 	f2fs_bug_on(sbi, !sbi->total_valid_block_count);
 	f2fs_bug_on(sbi, !sbi->total_valid_node_count);
-	f2fs_bug_on(sbi, !is_inode && !inode->i_blocks);
 
 	sbi->total_valid_node_count--;
 	sbi->total_valid_block_count--;
@@ -1962,8 +2090,18 @@
 
 	spin_unlock(&sbi->stat_lock);
 
-	if (!is_inode)
+	if (is_inode) {
+		dquot_free_inode(inode);
+	} else {
+		if (unlikely(inode->i_blocks == 0)) {
+			f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+				  inode->i_ino,
+				  (unsigned long long)inode->i_blocks);
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+			return;
+		}
 		f2fs_i_blocks_write(inode, 1, false, true);
+	}
 }
 
 static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
@@ -2092,6 +2230,29 @@
 	return bio_alloc(GFP_KERNEL, npages);
 }
 
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
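+	/* in urgent GC mode, always treat the filesystem as idle */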
+	if (sbi->gc_mode == GC_URGENT)
+		return true;
+
+	if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
+		get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
+		get_pages(sbi, F2FS_WB_CP_DATA) ||
+		get_pages(sbi, F2FS_DIO_READ) ||
+		get_pages(sbi, F2FS_DIO_WRITE))
+		return false;
+
+	if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info &&
+			atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
+		return false;
+
+	if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
+			atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+		return false;
+
+	return f2fs_time_over(sbi, type);
+}
+
 static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
 				unsigned long index, void *item)
 {
@@ -2203,56 +2364,26 @@
 }
 
 /*
- * Inode flags
+ * On-disk inode flags (f2fs_inode::i_flags)
  */
-#define F2FS_SECRM_FL			0x00000001 /* Secure deletion */
-#define F2FS_UNRM_FL			0x00000002 /* Undelete */
-#define F2FS_COMPR_FL			0x00000004 /* Compress file */
 #define F2FS_SYNC_FL			0x00000008 /* Synchronous updates */
 #define F2FS_IMMUTABLE_FL		0x00000010 /* Immutable file */
 #define F2FS_APPEND_FL			0x00000020 /* writes to file may only append */
 #define F2FS_NODUMP_FL			0x00000040 /* do not dump file */
 #define F2FS_NOATIME_FL			0x00000080 /* do not update atime */
-/* Reserved for compression usage... */
-#define F2FS_DIRTY_FL			0x00000100
-#define F2FS_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
-#define F2FS_NOCOMPR_FL			0x00000400 /* Don't compress */
-#define F2FS_ENCRYPT_FL			0x00000800 /* encrypted file */
-/* End compression flags --- maybe not all used */
 #define F2FS_INDEX_FL			0x00001000 /* hash-indexed directory */
-#define F2FS_IMAGIC_FL			0x00002000 /* AFS directory */
-#define F2FS_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
-#define F2FS_NOTAIL_FL			0x00008000 /* file tail should not be merged */
 #define F2FS_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
-#define F2FS_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
-#define F2FS_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
-#define F2FS_EXTENTS_FL			0x00080000 /* Inode uses extents */
-#define F2FS_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
-#define F2FS_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
-#define F2FS_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
 #define F2FS_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
-#define F2FS_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
-
-#define F2FS_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */
-#define F2FS_FL_USER_MODIFIABLE		0x204BC0FF /* User modifiable flags */
-
-/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
-#define F2FS_FL_XFLAG_VISIBLE		(F2FS_SYNC_FL | \
-					 F2FS_IMMUTABLE_FL | \
-					 F2FS_APPEND_FL | \
-					 F2FS_NODUMP_FL | \
-					 F2FS_NOATIME_FL | \
-					 F2FS_PROJINHERIT_FL)
+#define F2FS_CASEFOLD_FL		0x40000000 /* Casefolded file */
 
 /* Flags that should be inherited by new inodes from their parent. */
-#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\
-			   F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\
-			   F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\
-			   F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\
-			   F2FS_PROJINHERIT_FL)
+#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
+			   F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
+			   F2FS_CASEFOLD_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define F2FS_REG_FLMASK		(~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL))
+#define F2FS_REG_FLMASK		(~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
+				F2FS_CASEFOLD_FL))
 
 /* Flags that are appropriate for non-directories/regular files. */
 #define F2FS_OTHER_FLMASK	(F2FS_NODUMP_FL | F2FS_NOATIME_FL)
@@ -2299,6 +2430,7 @@
 	FI_PROJ_INHERIT,	/* indicate file inherits projectid */
 	FI_PIN_FILE,		/* indicate file should not be gced */
 	FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+	FI_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 };
 
 static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2311,6 +2443,7 @@
 	case FI_NEW_INODE:
 		if (set)
 			return;
+		/* fall through */
 	case FI_DATA_EXIST:
 	case FI_INLINE_DOTS:
 	case FI_PIN_FILE:
@@ -2337,6 +2470,12 @@
 	__mark_inode_dirty_flag(inode, flag, false);
 }
 
+static inline bool f2fs_verity_in_progress(struct inode *inode)
+{
+	return IS_ENABLED(CONFIG_FS_VERITY) &&
+	       is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS);
+}
+
 static inline void set_acl_inode(struct inode *inode, umode_t mode)
 {
 	F2FS_I(inode)->i_acl_mode = mode;
@@ -2465,7 +2604,14 @@
 
 static inline unsigned int addrs_per_inode(struct inode *inode)
 {
-	return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode);
+	unsigned int addrs = CUR_ADDRS_PER_INODE(inode) -
+				get_inline_xattr_addrs(inode);
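+	/* ALIGN_DOWN(x, 1) == x, so this currently returns addrs unchanged */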
+	return ALIGN_DOWN(addrs, 1);
+}
+
+static inline unsigned int addrs_per_block(struct inode *inode)
+{
+	return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1);
 }
 
 static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
@@ -2478,7 +2624,9 @@
 
 static inline int inline_xattr_size(struct inode *inode)
 {
-	return get_inline_xattr_addrs(inode) * sizeof(__le32);
+	if (f2fs_has_inline_xattr(inode))
+		return get_inline_xattr_addrs(inode) * sizeof(__le32);
+	return 0;
 }
 
 static inline int f2fs_has_inline_data(struct inode *inode)
@@ -2613,22 +2761,37 @@
 
 static inline bool f2fs_may_extent_tree(struct inode *inode)
 {
-	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (!test_opt(sbi, EXTENT_CACHE) ||
 			is_inode_flag_set(inode, FI_NO_EXTENT))
 		return false;
 
+	/*
+	 * Don't create extent trees for files recovered during mount
+	 * while the shrinker is not yet registered.
+	 */
+	if (list_empty(&sbi->s_list))
+		return false;
+
 	return S_ISREG(inode->i_mode);
 }
 
 static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
 					size_t size, gfp_t flags)
 {
+	void *ret;
+
 	if (time_to_inject(sbi, FAULT_KMALLOC)) {
 		f2fs_show_injection_info(FAULT_KMALLOC);
 		return NULL;
 	}
 
-	return kmalloc(size, flags);
+	ret = kmalloc(size, flags);
+	if (ret)
+		return ret;
+
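+	/* kmalloc failed: fall back to a kvmalloc-backed allocation */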
+	return kvmalloc(size, flags);
 }
 
 static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
@@ -2674,9 +2837,9 @@
 
 #define F2FS_OLD_ATTRIBUTE_SIZE	(offsetof(struct f2fs_inode, i_addr))
 #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field)		\
-		((offsetof(typeof(*f2fs_inode), field) +	\
+		((offsetof(typeof(*(f2fs_inode)), field) +	\
 		sizeof((f2fs_inode)->field))			\
-		<= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize))	\
+		<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize)))	\
 
 static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
 {
@@ -2703,19 +2866,18 @@
 	spin_unlock(&sbi->iostat_lock);
 }
 
-#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META &&	\
-				(!is_read_io(fio->op) || fio->is_meta))
+#define __is_large_section(sbi)		((sbi)->segs_per_sec > 1)
+
+#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
 
 bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type);
-void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...);
 static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
 	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"invalid blkaddr: %u, type: %d, run fsck to fix.",
-			blkaddr, type);
+		f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.",
+			 blkaddr, type);
 		f2fs_bug_on(sbi, 1);
 	}
 }
@@ -2727,13 +2889,25 @@
 	return true;
 }
 
-static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
-						block_t blkaddr)
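+/*
+ * Attach private data to a page, taking an extra page reference that is
+ * dropped again by f2fs_clear_page_private().
+ */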
+static inline void f2fs_set_page_private(struct page *page,
+						unsigned long data)
 {
-	if (!__is_valid_data_blkaddr(blkaddr))
-		return false;
-	verify_blkaddr(sbi, blkaddr, DATA_GENERIC);
-	return true;
+	if (PagePrivate(page))
+		return;
+
+	get_page(page);
+	SetPagePrivate(page);
+	set_page_private(page, data);
+}
+
+static inline void f2fs_clear_page_private(struct page *page)
+{
+	if (!PagePrivate(page))
+		return;
+
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+	f2fs_put_page(page, 0);
 }
 
 /*
@@ -2751,6 +2925,7 @@
 int f2fs_precache_extents(struct inode *inode);
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
 int f2fs_pin_file_control(struct inode *inode, bool inc);
 
 /*
@@ -2775,6 +2950,11 @@
 							bool hot, bool set);
 struct dentry *f2fs_get_parent(struct dentry *child);
 
+extern int f2fs_ci_compare(const struct inode *parent,
+			   const struct qstr *name,
+			   const struct qstr *entry,
+			   bool quick);
+
 /*
  * dir.c
  */
@@ -2829,18 +3009,17 @@
 int f2fs_inode_dirtied(struct inode *inode, bool sync);
 void f2fs_inode_synced(struct inode *inode);
 int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
+int f2fs_quota_sync(struct super_block *sb, int type);
 void f2fs_quota_off_umount(struct super_block *sb);
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
 int f2fs_sync_fs(struct super_block *sb, int sync);
-extern __printf(3, 4)
-void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...);
 int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
 
 /*
  * hash.c
  */
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
-				struct fscrypt_name *fname);
+f2fs_hash_t f2fs_dentry_hash(const struct inode *dir,
+		const struct qstr *name_info, struct fscrypt_name *fname);
 
 /*
  * node.c
@@ -2871,7 +3050,7 @@
 void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
 struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
 struct page *f2fs_get_node_page_ra(struct page *parent, int start);
-void f2fs_move_node_page(struct page *node_page, int gc_type);
+int f2fs_move_node_page(struct page *node_page, int gc_type);
 int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
 			struct writeback_control *wbc, bool atomic,
 			unsigned int *seq_id);
@@ -2888,7 +3067,7 @@
 int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
 int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
 			unsigned int segno, struct f2fs_summary_block *sum);
-void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
 void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
 int __init f2fs_create_node_manager_caches(void);
@@ -2913,11 +3092,16 @@
 bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
 void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
 void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
 void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 					struct cp_control *cpc);
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
+block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi);
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
 void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+					unsigned int start, unsigned int end);
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -2943,8 +3127,10 @@
 			struct f2fs_summary *sum, int type,
 			struct f2fs_io_info *fio, bool add_list);
 void f2fs_wait_on_page_writeback(struct page *page,
-			enum page_type type, bool ordered);
-void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr);
+			enum page_type type, bool ordered, bool locked);
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+								block_t len);
 void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
 void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
 int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
@@ -3004,10 +3190,11 @@
 void f2fs_destroy_post_read_processing(void);
 void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
 void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
-				struct inode *inode, nid_t ino, pgoff_t idx,
-				enum page_type type);
+				struct inode *inode, struct page *page,
+				nid_t ino, enum page_type type);
 void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
 int f2fs_submit_page_bio(struct f2fs_io_info *fio);
+int f2fs_merge_page_bio(struct f2fs_io_info *fio);
 void f2fs_submit_page_write(struct f2fs_io_info *fio);
 struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
 			block_t blk_addr, struct bio *bio);
@@ -3027,6 +3214,7 @@
 struct page *f2fs_get_new_data_page(struct inode *inode,
 			struct page *ipage, pgoff_t index, bool new_i_size);
 int f2fs_do_write_data_page(struct f2fs_io_info *fio);
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
 int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 			int create, int flag);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3041,7 +3229,7 @@
 			struct page *page, enum migrate_mode mode);
 #endif
 bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct page *page);
 
 /*
  * gc.c
@@ -3052,6 +3240,7 @@
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
 			unsigned int segno);
 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
 
 /*
  * recovery.c
@@ -3079,6 +3268,9 @@
 	int free_nids, avail_nids, alloc_nids;
 	int total_count, utilization;
 	int bg_gc, nr_wb_cp_data, nr_wb_data;
+	int nr_rd_data, nr_rd_node, nr_rd_meta;
+	int nr_dio_read, nr_dio_write;
+	unsigned int io_skip_bggc, other_skip_bggc;
 	int nr_flushing, nr_flushed, flush_list_empty;
 	int nr_discarding, nr_discarded;
 	int nr_discard_cmd;
@@ -3100,6 +3292,7 @@
 	int cursec[NR_CURSEG_TYPE];
 	int curzone[NR_CURSEG_TYPE];
 
+	unsigned int meta_count[META_MAX];
 	unsigned int segment_count[2];
 	unsigned int block_count[2];
 	unsigned int inplace_count;
@@ -3115,6 +3308,8 @@
 #define stat_inc_bg_cp_count(si)	((si)->bg_cp_count++)
 #define stat_inc_call_count(si)		((si)->call_count++)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
+#define stat_io_skip_bggc_count(sbi)	((sbi)->io_skip_bggc++)
+#define stat_other_skip_bggc_count(sbi)	((sbi)->other_skip_bggc++)
 #define stat_inc_dirty_inode(sbi, type)	((sbi)->ndirty_inode[type]++)
 #define stat_dec_dirty_inode(sbi, type)	((sbi)->ndirty_inode[type]--)
 #define stat_inc_total_hit(sbi)		(atomic64_inc(&(sbi)->total_hit_ext))
@@ -3151,6 +3346,17 @@
 		if (f2fs_has_inline_dentry(inode))			\
 			(atomic_dec(&F2FS_I_SB(inode)->inline_dir));	\
 	} while (0)
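+/* classify a meta block by on-disk area: CP, then SIT, NAT and SSA */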
+#define stat_inc_meta_count(sbi, blkaddr)				\
+	do {								\
+		if (blkaddr < SIT_I(sbi)->sit_base_addr)		\
+			atomic_inc(&(sbi)->meta_count[META_CP]);	\
+		else if (blkaddr < NM_I(sbi)->nat_blkaddr)		\
+			atomic_inc(&(sbi)->meta_count[META_SIT]);	\
+		else if (blkaddr < SM_I(sbi)->ssa_blkaddr)		\
+			atomic_inc(&(sbi)->meta_count[META_NAT]);	\
+		else if (blkaddr < SM_I(sbi)->main_blkaddr)		\
+			atomic_inc(&(sbi)->meta_count[META_SSA]);	\
+	} while (0)
 #define stat_inc_seg_type(sbi, curseg)					\
 		((sbi)->segment_count[(curseg)->alloc_type]++)
 #define stat_inc_block_count(sbi, curseg)				\
@@ -3213,13 +3419,15 @@
 
 int f2fs_build_stats(struct f2fs_sb_info *sbi);
 void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
-int __init f2fs_create_root_stats(void);
+void __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
 #define stat_inc_cp_count(si)				do { } while (0)
 #define stat_inc_bg_cp_count(si)			do { } while (0)
 #define stat_inc_call_count(si)				do { } while (0)
 #define stat_inc_bggc_count(si)				do { } while (0)
+#define stat_io_skip_bggc_count(sbi)			do { } while (0)
+#define stat_other_skip_bggc_count(sbi)			do { } while (0)
 #define stat_inc_dirty_inode(sbi, type)			do { } while (0)
 #define stat_dec_dirty_inode(sbi, type)			do { } while (0)
 #define stat_inc_total_hit(sb)				do { } while (0)
@@ -3238,6 +3446,7 @@
 #define stat_inc_volatile_write(inode)			do { } while (0)
 #define stat_dec_volatile_write(inode)			do { } while (0)
 #define stat_update_max_volatile_write(inode)		do { } while (0)
+#define stat_inc_meta_count(sbi, blkaddr)		do { } while (0)
 #define stat_inc_seg_type(sbi, curseg)			do { } while (0)
 #define stat_inc_block_count(sbi, curseg)		do { } while (0)
 #define stat_inc_inplace_blocks(sbi)			do { } while (0)
@@ -3248,11 +3457,14 @@
 
 static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline int __init f2fs_create_root_stats(void) { return 0; }
+static inline void __init f2fs_create_root_stats(void) { }
 static inline void f2fs_destroy_root_stats(void) { }
 #endif
 
 extern const struct file_operations f2fs_dir_operations;
+#ifdef CONFIG_UNICODE
+extern const struct dentry_operations f2fs_dentry_ops;
+#endif
 extern const struct file_operations f2fs_file_operations;
 extern const struct inode_operations f2fs_file_inode_operations;
 extern const struct address_space_operations f2fs_dblock_aops;
@@ -3307,18 +3519,19 @@
 /*
  * extent_cache.c
  */
-struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
 				struct rb_entry *cached_re, unsigned int ofs);
 struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
-				struct rb_root *root, struct rb_node **parent,
-				unsigned int ofs);
-struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+				struct rb_root_cached *root,
+				struct rb_node **parent,
+				unsigned int ofs, bool *leftmost);
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
 		struct rb_entry *cached_re, unsigned int ofs,
 		struct rb_entry **prev_entry, struct rb_entry **next_entry,
 		struct rb_node ***insert_p, struct rb_node **insert_parent,
-		bool force);
+		bool force, bool *leftmost);
 bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
-						struct rb_root *root);
+						struct rb_root_cached *root);
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
 bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
 void f2fs_drop_extent_tree(struct inode *inode);
@@ -3341,24 +3554,22 @@
 int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
 void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
 
+/* verity.c */
+extern const struct fsverity_operations f2fs_verityops;
+
 /*
  * crypto support
  */
-static inline bool f2fs_encrypted_inode(struct inode *inode)
-{
-	return file_is_encrypt(inode);
-}
-
 static inline bool f2fs_encrypted_file(struct inode *inode)
 {
-	return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+	return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
 }
 
 static inline void f2fs_set_encrypted_inode(struct inode *inode)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	file_set_encrypt(inode);
-	inode->i_flags |= S_ENCRYPTED;
+	f2fs_set_inode_flags(inode);
 #endif
 }
 
@@ -3368,13 +3579,13 @@
  */
 static inline bool f2fs_post_read_required(struct inode *inode)
 {
-	return f2fs_encrypted_file(inode);
+	return f2fs_encrypted_file(inode) || fsverity_active(inode);
 }
 
 #define F2FS_FEATURE_FUNCS(name, flagname) \
-static inline int f2fs_sb_has_##name(struct super_block *sb) \
+static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
 { \
-	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \
+	return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
 }
 
 F2FS_FEATURE_FUNCS(encrypt, ENCRYPT);
@@ -3386,28 +3597,64 @@
 F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
 F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
 F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(verity, VERITY);
+F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
+F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
 
 #ifdef CONFIG_BLK_DEV_ZONED
-static inline int get_blkz_type(struct f2fs_sb_info *sbi,
-			struct block_device *bdev, block_t blkaddr)
+static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
+				    block_t blkaddr)
 {
 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
-	int i;
 
-	for (i = 0; i < sbi->s_ndevs; i++)
-		if (FDEV(i).bdev == bdev)
-			return FDEV(i).blkz_type[zno];
-	return -EINVAL;
+	return test_bit(zno, FDEV(devi).blkz_seq);
 }
 #endif
 
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
 {
-	struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
-
-	return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb);
+	return f2fs_sb_has_blkzoned(sbi);
 }
 
+static inline bool f2fs_bdev_support_discard(struct block_device *bdev)
+{
+	return blk_queue_discard(bdev_get_queue(bdev)) ||
+	       bdev_is_zoned(bdev);
+}
+
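+/* a multi-device volume supports discard if any member device does */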
+static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	if (!f2fs_is_multi_device(sbi))
+		return f2fs_bdev_support_discard(sbi->sb->s_bdev);
+
+	for (i = 0; i < sbi->s_ndevs; i++)
+		if (f2fs_bdev_support_discard(FDEV(i).bdev))
+			return true;
+	return false;
+}
+
+static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
+{
+	return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
+					f2fs_hw_should_discard(sbi);
+}
+
+static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	if (!f2fs_is_multi_device(sbi))
+		return bdev_read_only(sbi->sb->s_bdev);
+
+	for (i = 0; i < sbi->s_ndevs; i++)
+		if (bdev_read_only(FDEV(i).bdev))
+			return true;
+	return false;
+}
+
 static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
 {
 	clear_opt(sbi, ADAPTIVE);
@@ -3425,7 +3672,7 @@
 
 static inline bool f2fs_may_encrypt(struct inode *inode)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	umode_t mode = inode->i_mode;
 
 	return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
@@ -3434,11 +3681,54 @@
 #endif
 }
 
-static inline bool f2fs_force_buffered_io(struct inode *inode, int rw)
+static inline int block_unaligned_IO(struct inode *inode,
+				struct kiocb *iocb, struct iov_iter *iter)
 {
-	return (f2fs_post_read_required(inode) ||
-			(rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
-			F2FS_I_SB(inode)->s_ndevs);
+	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
+	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
+	loff_t offset = iocb->ki_pos;
+	unsigned long align = offset | iov_iter_alignment(iter);
+
+	return align & blocksize_mask;
+}
+
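+/*
+ * In LFS mode, only block-aligned direct writes may be allocated
+ * out-of-place; unaligned ones take the buffered path instead.
+ */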
+static inline int allow_outplace_dio(struct inode *inode,
+				struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int rw = iov_iter_rw(iter);
+
+	return (test_opt(sbi, LFS) && (rw == WRITE) &&
+				!block_unaligned_IO(inode, iocb, iter));
+}
+
+static inline bool f2fs_force_buffered_io(struct inode *inode,
+				struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int rw = iov_iter_rw(iter);
+
+	if (f2fs_post_read_required(inode))
+		return true;
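+	/* direct I/O cannot span the member devices of a multi-device volume */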
+	if (f2fs_is_multi_device(sbi))
+		return true;
+	/*
+	 * For zoned block devices, fall back from direct I/O to buffered
+	 * I/O so that all I/O is serialized by log-structured writes.
+	 */
+	if (f2fs_sb_has_blkzoned(sbi))
+		return true;
+	if (test_opt(sbi, LFS) && (rw == WRITE)) {
+		if (block_unaligned_IO(inode, iocb, iter))
+			return true;
+		if (F2FS_IO_ALIGNED(sbi))
+			return true;
+	}
+	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) &&
+					!IS_SWAPFILE(inode))
+		return true;
+
+	return false;
 }
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -3448,4 +3738,20 @@
 #define f2fs_build_fault_attr(sbi, rate, type)		do { } while (0)
 #endif
 
+static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
+{
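+	/*
+	 * Quota is treated as journalled when the quota-ino feature is on
+	 * or any quota file was specified via mount options.
+	 */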
+#ifdef CONFIG_QUOTA
+	if (f2fs_sb_has_quota_ino(sbi))
+		return true;
+	if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+		F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+		F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+		return true;
 #endif
+	return false;
+}
+
+#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
+#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
+
+#endif /* _LINUX_F2FS_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5474aaa..29bc0a5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/file.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -23,6 +20,7 @@
 #include <linux/uio.h>
 #include <linux/uuid.h>
 #include <linux/file.h>
+#include <linux/nls.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -42,6 +40,8 @@
 	ret = filemap_fault(vmf);
 	up_read(&F2FS_I(inode)->i_mmap_sem);
 
+	trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
+
 	return ret;
 }
 
@@ -50,7 +50,7 @@
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dnode_of_data dn;
+	struct dnode_of_data dn = { .node_changed = false };
 	int err;
 
 	if (unlikely(f2fs_cp_error(sbi))) {
@@ -58,23 +58,15 @@
 		goto err;
 	}
 
+	if (!f2fs_is_checkpoint_ready(sbi)) {
+		err = -ENOSPC;
+		goto err;
+	}
+
 	sb_start_pagefault(inode->i_sb);
 
 	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
 
-	/* block allocation */
-	f2fs_lock_op(sbi);
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = f2fs_reserve_block(&dn, page->index);
-	if (err) {
-		f2fs_unlock_op(sbi);
-		goto out;
-	}
-	f2fs_put_dnode(&dn);
-	f2fs_unlock_op(sbi);
-
-	f2fs_balance_fs(sbi, dn.node_changed);
-
 	file_update_time(vmf->vma->vm_file);
 	down_read(&F2FS_I(inode)->i_mmap_sem);
 	lock_page(page);
@@ -86,11 +78,28 @@
 		goto out_sem;
 	}
 
+	/* block allocation */
+	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = f2fs_get_block(&dn, page->index);
+	f2fs_put_dnode(&dn);
+	__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+	if (err) {
+		unlock_page(page);
+		goto out_sem;
+	}
+
+	/* fill the page */
+	f2fs_wait_on_page_writeback(page, DATA, false, true);
+
+	/* wait for GCed page writeback via META_MAPPING */
+	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
 	/*
 	 * check to see if the page is mapped already (no holes)
 	 */
 	if (PageMappedToDisk(page))
-		goto mapped;
+		goto out_sem;
 
 	/* page is wholly or partially inside EOF */
 	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
@@ -105,21 +114,15 @@
 		SetPageUptodate(page);
 
 	f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE);
+	f2fs_update_time(sbi, REQ_TIME);
 
 	trace_f2fs_vm_page_mkwrite(page, DATA);
-mapped:
-	/* fill the page */
-	f2fs_wait_on_page_writeback(page, DATA, false);
-
-	/* wait for GCed page writeback via META_MAPPING */
-	if (f2fs_post_read_required(inode))
-		f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr);
-
 out_sem:
 	up_read(&F2FS_I(inode)->i_mmap_sem);
-out:
+
+	f2fs_balance_fs(sbi, dn.node_changed);
+
 	sb_end_pagefault(inode->i_sb);
-	f2fs_update_time(sbi, REQ_TIME);
 err:
 	return block_page_mkwrite_return(err);
 }
@@ -215,11 +218,15 @@
 	};
 	unsigned int seq_id = 0;
 
-	if (unlikely(f2fs_readonly(inode->i_sb)))
+	if (unlikely(f2fs_readonly(inode->i_sb) ||
+				is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
 		return 0;
 
 	trace_f2fs_sync_file_enter(inode);
 
+	if (S_ISDIR(inode->i_mode))
+		goto go_write;
+
 	/* if fdatasync is triggered, let's do in-place-update */
 	if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
 		set_inode_flag(inode, FI_NEED_IPU);
@@ -357,7 +364,7 @@
 	switch (whence) {
 	case SEEK_DATA:
 		if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
-			is_valid_data_blkaddr(sbi, blkaddr))
+			__is_valid_data_blkaddr(blkaddr))
 			return true;
 		break;
 	case SEEK_HOLE:
@@ -423,7 +430,7 @@
 
 			if (__is_valid_data_blkaddr(blkaddr) &&
 				!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
-						blkaddr, DATA_GENERIC)) {
+					blkaddr, DATA_GENERIC_ENHANCE)) {
 				f2fs_put_dnode(&dn);
 				goto fail;
 			}
@@ -495,6 +502,10 @@
 	if (err)
 		return err;
 
+	err = fsverity_file_open(inode, filp);
+	if (err)
+		return err;
+
 	filp->f_mode |= FMODE_NOWAIT;
 
 	return dquot_file_open(inode, filp);
@@ -524,7 +535,8 @@
 		f2fs_set_data_blkaddr(dn);
 
 		if (__is_valid_data_blkaddr(blkaddr) &&
-			!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC))
+			!f2fs_is_valid_blkaddr(sbi, blkaddr,
+					DATA_GENERIC_ENHANCE))
 			continue;
 
 		f2fs_invalidate_blocks(sbi, blkaddr);
@@ -553,7 +565,7 @@
 
 void f2fs_truncate_data_blocks(struct dnode_of_data *dn)
 {
-	f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
+	f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode));
 }
 
 static int truncate_partial_data_page(struct inode *inode, u64 from,
@@ -579,11 +591,11 @@
 	if (IS_ERR(page))
 		return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
 truncate_out:
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
 	zero_user(page, offset, PAGE_SIZE - offset);
 
 	/* An encrypted inode should have a key and truncate the last page. */
-	f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode));
+	f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
 	if (!cache_only)
 		set_page_dirty(page);
 	f2fs_put_page(page, 1);
@@ -698,19 +710,17 @@
 	unsigned int flags;
 
 	if (f2fs_has_extra_attr(inode) &&
-			f2fs_sb_has_inode_crtime(inode->i_sb) &&
+			f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
 			F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
 		stat->result_mask |= STATX_BTIME;
 		stat->btime.tv_sec = fi->i_crtime.tv_sec;
 		stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
 	}
 
-	flags = fi->i_flags & F2FS_FL_USER_VISIBLE;
+	flags = fi->i_flags;
 	if (flags & F2FS_APPEND_FL)
 		stat->attributes |= STATX_ATTR_APPEND;
-	if (flags & F2FS_COMPR_FL)
-		stat->attributes |= STATX_ATTR_COMPRESSED;
-	if (f2fs_encrypted_inode(inode))
+	if (IS_ENCRYPTED(inode))
 		stat->attributes |= STATX_ATTR_ENCRYPTED;
 	if (flags & F2FS_IMMUTABLE_FL)
 		stat->attributes |= STATX_ATTR_IMMUTABLE;
@@ -718,7 +728,6 @@
 		stat->attributes |= STATX_ATTR_NODUMP;
 
 	stat->attributes_mask |= (STATX_ATTR_APPEND |
-				  STATX_ATTR_COMPRESSED |
 				  STATX_ATTR_ENCRYPTED |
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
@@ -742,15 +751,18 @@
 		inode->i_uid = attr->ia_uid;
 	if (ia_valid & ATTR_GID)
 		inode->i_gid = attr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		inode->i_atime = timespec64_trunc(attr->ia_atime,
-						  inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		inode->i_mtime = timespec64_trunc(attr->ia_mtime,
-						  inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		inode->i_ctime = timespec64_trunc(attr->ia_ctime,
-						  inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_ATIME) {
+		inode->i_atime = timestamp_truncate(attr->ia_atime,
+						  inode);
+	}
+	if (ia_valid & ATTR_MTIME) {
+		inode->i_mtime = timestamp_truncate(attr->ia_mtime,
+						  inode);
+	}
+	if (ia_valid & ATTR_CTIME) {
+		inode->i_ctime = timestamp_truncate(attr->ia_ctime,
+						  inode);
+	}
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 
@@ -767,7 +779,6 @@
 {
 	struct inode *inode = d_inode(dentry);
 	int err;
-	bool size_changed = false;
 
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
@@ -780,6 +791,10 @@
 	if (err)
 		return err;
 
+	err = fsverity_prepare_setattr(dentry, attr);
+	if (err)
+		return err;
+
 	if (is_quota_modification(inode, attr)) {
 		err = dquot_initialize(inode);
 		if (err)
@@ -789,20 +804,45 @@
 		!uid_eq(attr->ia_uid, inode->i_uid)) ||
 		(attr->ia_valid & ATTR_GID &&
 		!gid_eq(attr->ia_gid, inode->i_gid))) {
+		f2fs_lock_op(F2FS_I_SB(inode));
 		err = dquot_transfer(inode, attr);
-		if (err)
+		if (err) {
+			set_sbi_flag(F2FS_I_SB(inode),
+					SBI_QUOTA_NEED_REPAIR);
+			f2fs_unlock_op(F2FS_I_SB(inode));
 			return err;
+		}
+		/*
+		 * Update the uid/gid under lock_op() so that the dquot and
+		 * the inode are updated atomically.
+		 */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		f2fs_mark_inode_dirty_sync(inode, true);
+		f2fs_unlock_op(F2FS_I_SB(inode));
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		bool to_smaller = (attr->ia_size <= i_size_read(inode));
+		loff_t old_size = i_size_read(inode);
+
+		if (attr->ia_size > MAX_INLINE_DATA(inode)) {
+			/*
+			 * Convert the inline inode before i_size_write(); an
+			 * inode with the inline flag set must stay smaller
+			 * than the inline_data size.
+			 */
+			err = f2fs_convert_inline_inode(inode);
+			if (err)
+				return err;
+		}
 
 		down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 		down_write(&F2FS_I(inode)->i_mmap_sem);
 
 		truncate_setsize(inode, attr->ia_size);
 
-		if (to_smaller)
+		if (attr->ia_size <= old_size)
 			err = f2fs_truncate(inode);
 		/*
 		 * do not trim all blocks after i_size if target size is
@@ -810,25 +850,13 @@
 		 */
 		up_write(&F2FS_I(inode)->i_mmap_sem);
 		up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-
 		if (err)
 			return err;
 
-		if (!to_smaller) {
-			/* should convert inline inode here */
-			if (!f2fs_may_inline_data(inode)) {
-				err = f2fs_convert_inline_inode(inode);
-				if (err)
-					return err;
-			}
-			inode->i_mtime = inode->i_ctime = current_time(inode);
-		}
-
 		down_write(&F2FS_I(inode)->i_sem);
+		inode->i_mtime = inode->i_ctime = current_time(inode);
 		F2FS_I(inode)->last_disk_size = i_size_read(inode);
 		up_write(&F2FS_I(inode)->i_sem);
-
-		size_changed = true;
 	}
 
 	__setattr_copy(inode, attr);
@@ -842,7 +870,7 @@
 	}
 
 	/* file size may changed here */
-	f2fs_mark_inode_dirty_sync(inode, size_changed);
+	f2fs_mark_inode_dirty_sync(inode, true);
 
 	/* inode change will produce dirty node pages flushed by checkpoint */
 	f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -879,7 +907,7 @@
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
 	zero_user(page, start, len);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
@@ -995,7 +1023,8 @@
 	} else if (ret == -ENOENT) {
 		if (dn.max_level == 0)
 			return -ENOENT;
-		done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len);
+		done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node,
+									len);
 		blkaddr += done;
 		do_replace += done;
 		goto next;
@@ -1006,11 +1035,19 @@
 	for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
 		*blkaddr = datablock_addr(dn.inode,
 					dn.node_page, dn.ofs_in_node);
+
+		if (__is_valid_data_blkaddr(*blkaddr) &&
+			!f2fs_is_valid_blkaddr(sbi, *blkaddr,
+					DATA_GENERIC_ENHANCE)) {
+			f2fs_put_dnode(&dn);
+			return -EFSCORRUPTED;
+		}
+
 		if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
 
 			if (test_opt(sbi, LFS)) {
 				f2fs_put_dnode(&dn);
-				return -ENOTSUPP;
+				return -EOPNOTSUPP;
 			}
 
 			/* do not invalidate this block address */
@@ -1146,7 +1183,7 @@
 	int ret;
 
 	while (len) {
-		olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
+		olen = min((pgoff_t)4 * ADDRS_PER_BLOCK(src_inode), len);
 
 		src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode),
 					array_size(olen, sizeof(block_t)),
@@ -1191,7 +1228,7 @@
 static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+	pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 	pgoff_t start = offset >> PAGE_SHIFT;
 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
 	int ret;
@@ -1444,7 +1481,7 @@
 	pg_start = offset >> PAGE_SHIFT;
 	pg_end = (offset + len) >> PAGE_SHIFT;
 	delta = pg_end - pg_start;
-	idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+	idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	/* avoid gc operation during block exchange */
 	down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1483,7 +1520,8 @@
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
-			.m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE };
+			.m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
+			.m_may_create = true };
 	pgoff_t pg_end;
 	loff_t new_size = i_size_read(inode);
 	loff_t off_end;
@@ -1507,7 +1545,12 @@
 	if (off_end)
 		map.m_len++;
 
-	err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+	if (f2fs_is_pinned_file(inode))
+		map.m_seg_type = CURSEG_COLD_DATA;
+
+	err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
+						F2FS_GET_BLOCK_PRE_DIO :
+						F2FS_GET_BLOCK_PRE_AIO));
 	if (err) {
 		pgoff_t last_off;
 
@@ -1541,12 +1584,14 @@
 
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode)))
+		return -ENOSPC;
 
 	/* f2fs only support ->fallocate for regular file */
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 
-	if (f2fs_encrypted_inode(inode) &&
+	if (IS_ENCRYPTED(inode) &&
 		(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
 		return -EOPNOTSUPP;
 
@@ -1624,42 +1669,22 @@
 	return 0;
 }
 
-static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
-{
-	struct inode *inode = file_inode(filp);
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	unsigned int flags = fi->i_flags;
-
-	if (f2fs_encrypted_inode(inode))
-		flags |= F2FS_ENCRYPT_FL;
-	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
-		flags |= F2FS_INLINE_DATA_FL;
-
-	flags &= F2FS_FL_USER_VISIBLE;
-
-	return put_user(flags, (int __user *)arg);
-}
-
-static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags)
+static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	unsigned int oldflags;
 
 	/* Is it quota file? Do not allow user to mess with it */
 	if (IS_NOQUOTA(inode))
 		return -EPERM;
 
-	flags = f2fs_mask_flags(inode->i_mode, flags);
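+	/*
+	 * The casefold flag can only change when the feature is enabled on
+	 * the superblock, and only on an empty directory.
+	 */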
+	if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) {
+		if (!f2fs_sb_has_casefold(F2FS_I_SB(inode)))
+			return -EOPNOTSUPP;
+		if (!f2fs_empty_dir(inode))
+			return -ENOTEMPTY;
+	}
 
-	oldflags = fi->i_flags;
-
-	if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL))
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			return -EPERM;
-
-	flags = flags & F2FS_FL_USER_MODIFIABLE;
-	flags |= oldflags & ~F2FS_FL_USER_MODIFIABLE;
-	fi->i_flags = flags;
+	fi->i_flags = iflags | (fi->i_flags & ~mask);
 
 	if (fi->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
@@ -1668,30 +1693,141 @@
 
 	inode->i_ctime = current_time(inode);
 	f2fs_set_inode_flags(inode);
-	f2fs_mark_inode_dirty_sync(inode, false);
+	f2fs_mark_inode_dirty_sync(inode, true);
 	return 0;
 }
 
+/* FS_IOC_GETFLAGS and FS_IOC_SETFLAGS support */
+
+/*
+ * To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry
+ * for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to
+ * F2FS_GETTABLE_FS_FL.  To also make it settable via FS_IOC_SETFLAGS, also add
+ * its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL.
+ */
+
+static const struct {
+	u32 iflag;
+	u32 fsflag;
+} f2fs_fsflags_map[] = {
+	{ F2FS_SYNC_FL,		FS_SYNC_FL },
+	{ F2FS_IMMUTABLE_FL,	FS_IMMUTABLE_FL },
+	{ F2FS_APPEND_FL,	FS_APPEND_FL },
+	{ F2FS_NODUMP_FL,	FS_NODUMP_FL },
+	{ F2FS_NOATIME_FL,	FS_NOATIME_FL },
+	{ F2FS_INDEX_FL,	FS_INDEX_FL },
+	{ F2FS_DIRSYNC_FL,	FS_DIRSYNC_FL },
+	{ F2FS_PROJINHERIT_FL,	FS_PROJINHERIT_FL },
+	{ F2FS_CASEFOLD_FL,	FS_CASEFOLD_FL },
+};
+
+#define F2FS_GETTABLE_FS_FL (		\
+		FS_SYNC_FL |		\
+		FS_IMMUTABLE_FL |	\
+		FS_APPEND_FL |		\
+		FS_NODUMP_FL |		\
+		FS_NOATIME_FL |		\
+		FS_INDEX_FL |		\
+		FS_DIRSYNC_FL |		\
+		FS_PROJINHERIT_FL |	\
+		FS_ENCRYPT_FL |		\
+		FS_INLINE_DATA_FL |	\
+		FS_NOCOW_FL |		\
+		FS_VERITY_FL |		\
+		FS_CASEFOLD_FL)
+
+#define F2FS_SETTABLE_FS_FL (		\
+		FS_SYNC_FL |		\
+		FS_IMMUTABLE_FL |	\
+		FS_APPEND_FL |		\
+		FS_NODUMP_FL |		\
+		FS_NOATIME_FL |		\
+		FS_DIRSYNC_FL |		\
+		FS_PROJINHERIT_FL |	\
+		FS_CASEFOLD_FL)
+
+/* Convert f2fs on-disk i_flags to FS_IOC_{GET,SET}FLAGS flags */
+static inline u32 f2fs_iflags_to_fsflags(u32 iflags)
+{
+	u32 fsflags = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++)
+		if (iflags & f2fs_fsflags_map[i].iflag)
+			fsflags |= f2fs_fsflags_map[i].fsflag;
+
+	return fsflags;
+}
+
+/* Convert FS_IOC_{GET,SET}FLAGS flags to f2fs on-disk i_flags */
+static inline u32 f2fs_fsflags_to_iflags(u32 fsflags)
+{
+	u32 iflags = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++)
+		if (fsflags & f2fs_fsflags_map[i].fsflag)
+			iflags |= f2fs_fsflags_map[i].iflag;
+
+	return iflags;
+}
+
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
+
+	if (IS_ENCRYPTED(inode))
+		fsflags |= FS_ENCRYPT_FL;
+	if (IS_VERITY(inode))
+		fsflags |= FS_VERITY_FL;
+	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
+		fsflags |= FS_INLINE_DATA_FL;
+	if (is_inode_flag_set(inode, FI_PIN_FILE))
+		fsflags |= FS_NOCOW_FL;
+
+	fsflags &= F2FS_GETTABLE_FS_FL;
+
+	return put_user(fsflags, (int __user *)arg);
+}
+
 static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	unsigned int flags;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	u32 fsflags, old_fsflags;
+	u32 iflags;
 	int ret;
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (get_user(flags, (int __user *)arg))
+	if (get_user(fsflags, (int __user *)arg))
 		return -EFAULT;
 
+	if (fsflags & ~F2FS_GETTABLE_FS_FL)
+		return -EOPNOTSUPP;
+	fsflags &= F2FS_SETTABLE_FS_FL;
+
+	iflags = f2fs_fsflags_to_iflags(fsflags);
+	if (f2fs_mask_flags(inode->i_mode, iflags) != iflags)
+		return -EOPNOTSUPP;
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
 	inode_lock(inode);
 
-	ret = __f2fs_ioc_setflags(inode, flags);
+	old_fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
+	ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+	if (ret)
+		goto out;
 
+	ret = f2fs_setflags_common(inode, iflags,
+			f2fs_fsflags_to_iflags(F2FS_SETTABLE_FS_FL));
+out:
 	inode_unlock(inode);
 	mnt_drop_write_file(filp);
 	return ret;
@@ -1707,6 +1843,8 @@
 static int f2fs_ioc_start_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int ret;
 
 	if (!inode_owner_or_capable(inode))
@@ -1715,6 +1853,9 @@
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 
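+	/* atomic writes are not compatible with O_DIRECT */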
+	if (filp->f_flags & O_DIRECT)
+		return -EINVAL;
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
@@ -1733,18 +1874,25 @@
 
 	down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
-	if (!get_dirty_pages(inode))
-		goto skip_flush;
-
-	f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
-		"Unexpected flush for atomic writes: ino=%lu, npages=%u",
-					inode->i_ino, get_dirty_pages(inode));
+	/*
+	 * Wait for end_io of in-flight writes so that F2FS_WB_CP_DATA is
+	 * counted correctly once f2fs_is_atomic_file() turns true.
+	 */
+	if (get_dirty_pages(inode))
+		f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u",
+			  inode->i_ino, get_dirty_pages(inode));
 	ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
 	if (ret) {
 		up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 		goto out;
 	}
-skip_flush:
+
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (list_empty(&fi->inmem_ilist))
+		list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+	/* add the inode to inmem_ilist first, then set FI_ATOMIC_FILE */
 	set_inode_flag(inode, FI_ATOMIC_FILE);
 	clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
 	up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1786,11 +1934,8 @@
 			goto err_out;
 
 		ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
-		if (!ret) {
-			clear_inode_flag(inode, FI_ATOMIC_FILE);
-			F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
-			stat_dec_atomic_write(inode);
-		}
+		if (!ret)
+			f2fs_drop_inmem_pages(inode);
 	} else {
 		ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
 	}
@@ -1949,6 +2094,13 @@
 		f2fs_stop_checkpoint(sbi, false);
 		set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
 		break;
+	case F2FS_GOING_DOWN_NEED_FSCK:
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+		set_sbi_flag(sbi, SBI_IS_DIRTY);
+		/* do checkpoint only */
+		ret = f2fs_sync_fs(sb, 1);
+		goto out;
 	default:
 		ret = -EINVAL;
 		goto out;
@@ -1964,6 +2116,9 @@
 out:
 	if (in != F2FS_GOING_DOWN_FULLSYNC)
 		mnt_drop_write_file(filp);
+
+	trace_f2fs_shutdown(sbi, in, ret);
+
 	return ret;
 }
 
@@ -1978,7 +2133,7 @@
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!blk_queue_discard(q))
+	if (!f2fs_hw_support_discard(F2FS_SB(sb)))
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&range, (struct fstrim_range __user *)arg,
@@ -2017,7 +2172,7 @@
 {
 	struct inode *inode = file_inode(filp);
 
-	if (!f2fs_sb_has_encrypt(inode->i_sb))
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode)))
 		return -EOPNOTSUPP;
 
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -2027,7 +2182,7 @@
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
-	if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb))
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
 		return -EOPNOTSUPP;
 	return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 }
@@ -2038,7 +2193,7 @@
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int err;
 
-	if (!f2fs_sb_has_encrypt(inode->i_sb))
+	if (!f2fs_sb_has_encrypt(sbi))
 		return -EOPNOTSUPP;
 
 	err = mnt_want_write_file(filp);
@@ -2069,6 +2224,49 @@
 	return err;
 }
 
+static int f2fs_ioc_get_encryption_policy_ex(struct file *filp,
+					     unsigned long arg)
+{
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp,
+						    unsigned long arg)
+{
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_get_encryption_key_status(struct file *filp,
+					      unsigned long arg)
+{
+	if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+}
+
 static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -2123,9 +2321,9 @@
 		return -EROFS;
 
 	end = range.start + range.len;
-	if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) {
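+	/* reject ranges that wrap around or fall outside the main area */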
+	if (end < range.start || range.start < MAIN_BLKADDR(sbi) ||
+					end >= MAX_BLKADDR(sbi))
 		return -EINVAL;
-	}
 
 	ret = mnt_want_write_file(filp);
 	if (ret)
@@ -2142,7 +2340,7 @@
 	}
 
 	ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
-	range.start += sbi->blocks_per_seg;
+	range.start += BLKS_PER_SEC(sbi);
 	if (range.start <= end)
 		goto do_more;
 out:
@@ -2162,6 +2360,11 @@
 	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		f2fs_info(sbi, "Skipping Checkpoint. Checkpoints currently disabled.");
+		return -EINVAL;
+	}
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
@@ -2178,7 +2381,8 @@
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_map_blocks map = { .m_next_extent = NULL,
-					.m_seg_type = NO_CHECK_TYPE };
+					.m_seg_type = NO_CHECK_TYPE,
+					.m_may_create = false };
 	struct extent_info ei = {0, 0, 0};
 	pgoff_t pg_start, pg_end, next_pgofs;
 	unsigned int blk_per_seg = sbi->blocks_per_seg;
@@ -2243,10 +2447,12 @@
 		map.m_lblk += map.m_len;
 	}
 
-	if (!fragmented)
+	if (!fragmented) {
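+		/* no fragmented extents: report zero blocks to move */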
+		total = 0;
 		goto out;
+	}
 
-	sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
+	sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi));
 
 	/*
 	 * make sure there are enough free section for LFS allocation, this can
@@ -2274,7 +2480,7 @@
 
 		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
 			map.m_lblk = next_pgofs;
-			continue;
+			goto check;
 		}
 
 		set_inode_flag(inode, FI_DO_DEFRAG);
@@ -2298,8 +2504,8 @@
 		}
 
 		map.m_lblk = idx;
-
-		if (idx < pg_end && cnt < blk_per_seg)
+check:
+		if (map.m_lblk < pg_end && cnt < blk_per_seg)
 			goto do_map;
 
 		clear_inode_flag(inode, FI_DO_DEFRAG);
@@ -2383,7 +2589,7 @@
 	if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
 		return -EINVAL;
 
-	if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+	if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst))
 		return -EOPNOTSUPP;
 
 	if (src == dst) {
@@ -2533,16 +2739,17 @@
 	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		return -EINVAL;
+
 	if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
 							sizeof(range)))
 		return -EFAULT;
 
-	if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
-			sbi->segs_per_sec != 1) {
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"Can't flush %u in %d for segs_per_sec %u != 1\n",
-				range.dev_num, sbi->s_ndevs,
-				sbi->segs_per_sec);
+	if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num ||
+			__is_large_section(sbi)) {
+		f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1",
+			  range.dev_num, sbi->s_ndevs, sbi->segs_per_sec);
 		return -EINVAL;
 	}
 
@@ -2591,18 +2798,33 @@
 }
 
 #ifdef CONFIG_QUOTA
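+/* transfer the inode's block usage to the quota of the given project */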
+int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid)
+{
+	struct dquot *transfer_to[MAXQUOTAS] = {};
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct super_block *sb = sbi->sb;
+	int err = 0;
+
+	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+	if (!IS_ERR(transfer_to[PRJQUOTA])) {
+		err = __dquot_transfer(inode, transfer_to);
+		if (err)
+			set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+		dqput(transfer_to[PRJQUOTA]);
+	}
+	return err;
+}
+
 static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct super_block *sb = sbi->sb;
-	struct dquot *transfer_to[MAXQUOTAS] = {};
 	struct page *ipage;
 	kprojid_t kprojid;
 	int err;
 
-	if (!f2fs_sb_has_project_quota(sb)) {
+	if (!f2fs_sb_has_project_quota(sbi)) {
 		if (projid != F2FS_DEF_PROJID)
 			return -EOPNOTSUPP;
 		else
@@ -2617,53 +2839,45 @@
 	if (projid_eq(kprojid, F2FS_I(inode)->i_projid))
 		return 0;
 
-	err = mnt_want_write_file(filp);
-	if (err)
-		return err;
-
 	err = -EPERM;
-	inode_lock(inode);
-
 	/* Is it quota file? Do not allow user to mess with it */
 	if (IS_NOQUOTA(inode))
-		goto out_unlock;
+		return err;
 
 	ipage = f2fs_get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(ipage)) {
-		err = PTR_ERR(ipage);
-		goto out_unlock;
-	}
+	if (IS_ERR(ipage))
+		return PTR_ERR(ipage);
 
 	if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
 								i_projid)) {
 		err = -EOVERFLOW;
 		f2fs_put_page(ipage, 1);
-		goto out_unlock;
+		return err;
 	}
 	f2fs_put_page(ipage, 1);
 
 	err = dquot_initialize(inode);
 	if (err)
-		goto out_unlock;
+		return err;
 
-	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
-	if (!IS_ERR(transfer_to[PRJQUOTA])) {
-		err = __dquot_transfer(inode, transfer_to);
-		dqput(transfer_to[PRJQUOTA]);
-		if (err)
-			goto out_dirty;
-	}
+	f2fs_lock_op(sbi);
+	err = f2fs_transfer_project_quota(inode, kprojid);
+	if (err)
+		goto out_unlock;
 
 	F2FS_I(inode)->i_projid = kprojid;
 	inode->i_ctime = current_time(inode);
-out_dirty:
 	f2fs_mark_inode_dirty_sync(inode, true);
 out_unlock:
-	inode_unlock(inode);
-	mnt_drop_write_file(filp);
+	f2fs_unlock_op(sbi);
 	return err;
 }
 #else
+int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid)
+{
+	return 0;
+}
+
 static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
 {
 	if (projid != F2FS_DEF_PROJID)
@@ -2672,64 +2886,76 @@
 }
 #endif
 
-/* Transfer internal flags to xflags */
-static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags)
-{
-	__u32 xflags = 0;
+/* FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR support */
 
-	if (iflags & F2FS_SYNC_FL)
-		xflags |= FS_XFLAG_SYNC;
-	if (iflags & F2FS_IMMUTABLE_FL)
-		xflags |= FS_XFLAG_IMMUTABLE;
-	if (iflags & F2FS_APPEND_FL)
-		xflags |= FS_XFLAG_APPEND;
-	if (iflags & F2FS_NODUMP_FL)
-		xflags |= FS_XFLAG_NODUMP;
-	if (iflags & F2FS_NOATIME_FL)
-		xflags |= FS_XFLAG_NOATIME;
-	if (iflags & F2FS_PROJINHERIT_FL)
-		xflags |= FS_XFLAG_PROJINHERIT;
+/*
+ * To make a new on-disk f2fs i_flag gettable via FS_IOC_FSGETXATTR and settable
+ * via FS_IOC_FSSETXATTR, add an entry for it to f2fs_xflags_map[], and add its
+ * FS_XFLAG_* equivalent to F2FS_SUPPORTED_XFLAGS.
+ */
+
+static const struct {
+	u32 iflag;
+	u32 xflag;
+} f2fs_xflags_map[] = {
+	{ F2FS_SYNC_FL,		FS_XFLAG_SYNC },
+	{ F2FS_IMMUTABLE_FL,	FS_XFLAG_IMMUTABLE },
+	{ F2FS_APPEND_FL,	FS_XFLAG_APPEND },
+	{ F2FS_NODUMP_FL,	FS_XFLAG_NODUMP },
+	{ F2FS_NOATIME_FL,	FS_XFLAG_NOATIME },
+	{ F2FS_PROJINHERIT_FL,	FS_XFLAG_PROJINHERIT },
+};
+
+#define F2FS_SUPPORTED_XFLAGS (		\
+		FS_XFLAG_SYNC |		\
+		FS_XFLAG_IMMUTABLE |	\
+		FS_XFLAG_APPEND |	\
+		FS_XFLAG_NODUMP |	\
+		FS_XFLAG_NOATIME |	\
+		FS_XFLAG_PROJINHERIT)
+
+/* Convert f2fs on-disk i_flags to FS_IOC_FS{GET,SET}XATTR flags */
+static inline u32 f2fs_iflags_to_xflags(u32 iflags)
+{
+	u32 xflags = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++)
+		if (iflags & f2fs_xflags_map[i].iflag)
+			xflags |= f2fs_xflags_map[i].xflag;
+
 	return xflags;
 }
 
-#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
-				  FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
-				  FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
-
-/* Transfer xflags flags to internal */
-static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags)
+/* Convert FS_IOC_FS{GET,SET}XATTR flags to f2fs on-disk i_flags */
+static inline u32 f2fs_xflags_to_iflags(u32 xflags)
 {
-	unsigned long iflags = 0;
+	u32 iflags = 0;
+	int i;
 
-	if (xflags & FS_XFLAG_SYNC)
-		iflags |= F2FS_SYNC_FL;
-	if (xflags & FS_XFLAG_IMMUTABLE)
-		iflags |= F2FS_IMMUTABLE_FL;
-	if (xflags & FS_XFLAG_APPEND)
-		iflags |= F2FS_APPEND_FL;
-	if (xflags & FS_XFLAG_NODUMP)
-		iflags |= F2FS_NODUMP_FL;
-	if (xflags & FS_XFLAG_NOATIME)
-		iflags |= F2FS_NOATIME_FL;
-	if (xflags & FS_XFLAG_PROJINHERIT)
-		iflags |= F2FS_PROJINHERIT_FL;
+	for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++)
+		if (xflags & f2fs_xflags_map[i].xflag)
+			iflags |= f2fs_xflags_map[i].iflag;
 
 	return iflags;
 }
 
+static void f2fs_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	simple_fill_fsxattr(fa, f2fs_iflags_to_xflags(fi->i_flags));
+
+	if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)))
+		fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid);
+}
+
 static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct fsxattr fa;
 
-	memset(&fa, 0, sizeof(struct fsxattr));
-	fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags &
-				F2FS_FL_USER_VISIBLE);
-
-	if (f2fs_sb_has_project_quota(inode->i_sb))
-		fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
-							fi->i_projid);
+	f2fs_fill_fsxattr(inode, &fa);
 
 	if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa)))
 		return -EFAULT;
@@ -2739,9 +2965,8 @@
 static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct fsxattr fa;
-	unsigned int flags;
+	struct fsxattr fa, old_fa;
+	u32 iflags;
 	int err;
 
 	if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa)))
@@ -2751,11 +2976,11 @@
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS)
+	if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
 		return -EOPNOTSUPP;
 
-	flags = f2fs_xflags_to_iflags(fa.fsx_xflags);
-	if (f2fs_mask_flags(inode->i_mode, flags) != flags)
+	iflags = f2fs_xflags_to_iflags(fa.fsx_xflags);
+	if (f2fs_mask_flags(inode->i_mode, iflags) != iflags)
 		return -EOPNOTSUPP;
 
 	err = mnt_want_write_file(filp);
@@ -2763,19 +2988,22 @@
 		return err;
 
 	inode_lock(inode);
-	flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) |
-				(flags & F2FS_FL_XFLAG_VISIBLE);
-	err = __f2fs_ioc_setflags(inode, flags);
-	inode_unlock(inode);
-	mnt_drop_write_file(filp);
+
+	f2fs_fill_fsxattr(inode, &old_fa);
+	err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
 	if (err)
-		return err;
+		goto out;
+
+	err = f2fs_setflags_common(inode, iflags,
+			f2fs_xflags_to_iflags(F2FS_SUPPORTED_XFLAGS));
+	if (err)
+		goto out;
 
 	err = f2fs_ioc_setproject(filp, fa.fsx_projid);
-	if (err)
-		return err;
-
-	return 0;
+out:
+	inode_unlock(inode);
+	mnt_drop_write_file(filp);
+	return err;
 }
 
 int f2fs_pin_file_control(struct inode *inode, bool inc)
@@ -2789,10 +3017,9 @@
 				fi->i_gc_failures[GC_FAILURE_PIN] + 1);
 
 	if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) {
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: Enable GC = ino %lx after %x GC trials\n",
-			__func__, inode->i_ino,
-			fi->i_gc_failures[GC_FAILURE_PIN]);
+		f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials",
+			  __func__, inode->i_ino,
+			  fi->i_gc_failures[GC_FAILURE_PIN]);
 		clear_inode_flag(inode, FI_PIN_FILE);
 		return -EAGAIN;
 	}
@@ -2805,9 +3032,6 @@
 	__u32 pin;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(inode))
-		return -EACCES;
-
 	if (get_user(pin, (__u32 __user *)arg))
 		return -EFAULT;
 
@@ -2877,6 +3101,7 @@
 	map.m_next_pgofs = NULL;
 	map.m_next_extent = &m_next_extent;
 	map.m_seg_type = NO_CHECK_TYPE;
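+	/* lookup-only mapping: do not allocate blocks while precaching */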
+	map.m_may_create = false;
 	end = F2FS_I_SB(inode)->max_file_blocks;
 
 	while (map.m_lblk < end) {
@@ -2899,10 +3124,119 @@
 	return f2fs_precache_extents(file_inode(filp));
 }
 
+static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
+	__u64 block_count;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (f2fs_readonly(sbi->sb))
+		return -EROFS;
+
+	if (copy_from_user(&block_count, (void __user *)arg,
+			   sizeof(block_count)))
+		return -EFAULT;
+
+	ret = f2fs_resize_fs(sbi, block_count);
+
+	return ret;
+}
+
+static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+	if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) {
+		f2fs_warn(F2FS_I_SB(inode),
+			  "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n",
+			  inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	return fsverity_ioctl_enable(filp, (const void __user *)arg);
+}
+
+static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fsverity_ioctl_measure(filp, (void __user *)arg);
+}
+
+static int f2fs_get_volume_name(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	char *vbuf;
+	int count;
+	int err = 0;
+
+	vbuf = f2fs_kzalloc(sbi, MAX_VOLUME_NAME, GFP_KERNEL);
+	if (!vbuf)
+		return -ENOMEM;
+
+	down_read(&sbi->sb_lock);
+	count = utf16s_to_utf8s(sbi->raw_super->volume_name,
+			ARRAY_SIZE(sbi->raw_super->volume_name),
+			UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME);
+	up_read(&sbi->sb_lock);
+
+	if (copy_to_user((char __user *)arg, vbuf,
+				min(FSLABEL_MAX, count)))
+		err = -EFAULT;
+
+	kvfree(vbuf);
+	return err;
+}
+
+static int f2fs_set_volume_name(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	char *vbuf;
+	int err = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	vbuf = strndup_user((const char __user *)arg, FSLABEL_MAX);
+	if (IS_ERR(vbuf))
+		return PTR_ERR(vbuf);
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		goto out;
+
+	down_write(&sbi->sb_lock);
+
+	memset(sbi->raw_super->volume_name, 0,
+			sizeof(sbi->raw_super->volume_name));
+	utf8s_to_utf16s(vbuf, strlen(vbuf), UTF16_LITTLE_ENDIAN,
+			sbi->raw_super->volume_name,
+			ARRAY_SIZE(sbi->raw_super->volume_name));
+
+	err = f2fs_commit_super(sbi, false);
+
+	up_write(&sbi->sb_lock);
+
+	mnt_drop_write_file(filp);
+out:
+	kfree(vbuf);
+	return err;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
 		return -EIO;
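+	/* with checkpointing disabled and space exhausted, fail ioctls early */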
+	if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp))))
+		return -ENOSPC;
 
 	switch (cmd) {
 	case F2FS_IOC_GETFLAGS:
@@ -2931,6 +3265,16 @@
 		return f2fs_ioc_get_encryption_policy(filp, arg);
 	case F2FS_IOC_GET_ENCRYPTION_PWSALT:
 		return f2fs_ioc_get_encryption_pwsalt(filp, arg);
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+		return f2fs_ioc_get_encryption_policy_ex(filp, arg);
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+		return f2fs_ioc_add_encryption_key(filp, arg);
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+		return f2fs_ioc_remove_encryption_key(filp, arg);
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+		return f2fs_ioc_remove_encryption_key_all_users(filp, arg);
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+		return f2fs_ioc_get_encryption_key_status(filp, arg);
 	case F2FS_IOC_GARBAGE_COLLECT:
 		return f2fs_ioc_gc(filp, arg);
 	case F2FS_IOC_GARBAGE_COLLECT_RANGE:
@@ -2955,6 +3299,16 @@
 		return f2fs_ioc_set_pin_file(filp, arg);
 	case F2FS_IOC_PRECACHE_EXTENTS:
 		return f2fs_ioc_precache_extents(filp, arg);
+	case F2FS_IOC_RESIZE_FS:
+		return f2fs_ioc_resize_fs(filp, arg);
+	case FS_IOC_ENABLE_VERITY:
+		return f2fs_ioc_enable_verity(filp, arg);
+	case FS_IOC_MEASURE_VERITY:
+		return f2fs_ioc_measure_verity(filp, arg);
+	case F2FS_IOC_GET_VOLUME_NAME:
+		return f2fs_get_volume_name(filp, arg);
+	case F2FS_IOC_SET_VOLUME_NAME:
+		return f2fs_set_volume_name(filp, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -2966,15 +3320,17 @@
 	struct inode *inode = file_inode(file);
 	ssize_t ret;
 
-	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
-		return -EIO;
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
+		ret = -EIO;
+		goto out;
+	}
 
-	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
-		return -EINVAL;
-
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	} else {
 		inode_lock(inode);
 	}
 
@@ -2987,18 +3343,16 @@
 		if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
 			set_inode_flag(inode, FI_NO_PREALLOC);
 
-		if ((iocb->ki_flags & IOCB_NOWAIT) &&
-			(iocb->ki_flags & IOCB_DIRECT)) {
-				if (!f2fs_overwrite_io(inode, iocb->ki_pos,
+		if (iocb->ki_flags & IOCB_NOWAIT) {
+			if (!f2fs_overwrite_io(inode, iocb->ki_pos,
 						iov_iter_count(from)) ||
-					f2fs_has_inline_data(inode) ||
-					f2fs_force_buffered_io(inode, WRITE)) {
-						clear_inode_flag(inode,
-								FI_NO_PREALLOC);
-						inode_unlock(inode);
-						return -EAGAIN;
-				}
-
+				f2fs_has_inline_data(inode) ||
+				f2fs_force_buffered_io(inode, iocb, from)) {
+				clear_inode_flag(inode, FI_NO_PREALLOC);
+				inode_unlock(inode);
+				ret = -EAGAIN;
+				goto out;
+			}
 		} else {
 			preallocated = true;
 			target_size = iocb->ki_pos + iov_iter_count(from);
@@ -3007,7 +3361,8 @@
 			if (err) {
 				clear_inode_flag(inode, FI_NO_PREALLOC);
 				inode_unlock(inode);
-				return err;
+				ret = err;
+				goto out;
 			}
 		}
 		ret = __generic_file_write_iter(iocb, from);
@@ -3021,7 +3376,9 @@
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
 	}
 	inode_unlock(inode);
-
+out:
+	trace_f2fs_file_write_iter(inode, iocb->ki_pos,
+					iov_iter_count(from), ret);
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
 	return ret;
@@ -3049,6 +3406,11 @@
 	case F2FS_IOC_SET_ENCRYPTION_POLICY:
 	case F2FS_IOC_GET_ENCRYPTION_PWSALT:
 	case F2FS_IOC_GET_ENCRYPTION_POLICY:
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
 	case F2FS_IOC_GARBAGE_COLLECT:
 	case F2FS_IOC_GARBAGE_COLLECT_RANGE:
 	case F2FS_IOC_WRITE_CHECKPOINT:
@@ -3061,6 +3423,11 @@
 	case F2FS_IOC_GET_PIN_FILE:
 	case F2FS_IOC_SET_PIN_FILE:
 	case F2FS_IOC_PRECACHE_EXTENTS:
+	case F2FS_IOC_RESIZE_FS:
+	case FS_IOC_ENABLE_VERITY:
+	case FS_IOC_MEASURE_VERITY:
+	case F2FS_IOC_GET_VOLUME_NAME:
+	case F2FS_IOC_SET_VOLUME_NAME:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5c8d004..5877bd7 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/gc.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/module.h>
@@ -43,13 +40,16 @@
 		if (gc_th->gc_wake)
 			gc_th->gc_wake = 0;
 
-		if (try_to_freeze())
+		if (try_to_freeze()) {
+			stat_other_skip_bggc_count(sbi);
 			continue;
+		}
 		if (kthread_should_stop())
 			break;
 
 		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
 			increase_sleep_time(gc_th, &wait_ms);
+			stat_other_skip_bggc_count(sbi);
 			continue;
 		}
 
@@ -58,8 +58,10 @@
 			f2fs_stop_checkpoint(sbi, false);
 		}
 
-		if (!sb_start_write_trylock(sbi->sb))
+		if (!sb_start_write_trylock(sbi->sb)) {
+			stat_other_skip_bggc_count(sbi);
 			continue;
+		}
 
 		/*
 		 * [GC triggering condition]
@@ -80,12 +82,15 @@
 			goto do_gc;
 		}
 
-		if (!mutex_trylock(&sbi->gc_mutex))
+		if (!mutex_trylock(&sbi->gc_mutex)) {
+			stat_other_skip_bggc_count(sbi);
 			goto next;
+		}
 
-		if (!is_idle(sbi)) {
+		if (!is_idle(sbi, GC_TIME)) {
 			increase_sleep_time(gc_th, &wait_ms);
 			mutex_unlock(&sbi->gc_mutex);
+			stat_io_skip_bggc_count(sbi);
 			goto next;
 		}
 
@@ -137,7 +142,7 @@
 			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(gc_th->f2fs_gc_task)) {
 		err = PTR_ERR(gc_th->f2fs_gc_task);
-		kfree(gc_th);
+		kvfree(gc_th);
 		sbi->gc_thread = NULL;
 	}
 out:
@@ -150,7 +155,7 @@
 	if (!gc_th)
 		return;
 	kthread_stop(gc_th->f2fs_gc_task);
-	kfree(gc_th);
+	kvfree(gc_th);
 	sbi->gc_thread = NULL;
 }
 
@@ -306,10 +311,11 @@
 	struct sit_info *sm = SIT_I(sbi);
 	struct victim_sel_policy p;
 	unsigned int secno, last_victim;
-	unsigned int last_segment = MAIN_SEGS(sbi);
+	unsigned int last_segment;
 	unsigned int nsearched = 0;
 
 	mutex_lock(&dirty_i->seglist_lock);
+	last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
 
 	p.alloc_mode = alloc_mode;
 	select_policy(sbi, gc_type, type, &p);
@@ -318,8 +324,7 @@
 	p.min_cost = get_max_cost(sbi, &p);
 
 	if (*result != NULL_SEGNO) {
-		if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
-			get_valid_blocks(sbi, *result, false) &&
+		if (get_valid_blocks(sbi, *result, false) &&
 			!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
 			p.min_segno = *result;
 		goto out;
@@ -328,6 +333,22 @@
 	if (p.max_search == 0)
 		goto out;
 
+	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
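+		/* resume from the victim saved at the end of the previous pass */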
+		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
+			p.min_segno = sbi->next_victim_seg[BG_GC];
+			*result = p.min_segno;
+			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+			goto got_result;
+		}
+		if (gc_type == FG_GC &&
+				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
+			p.min_segno = sbi->next_victim_seg[FG_GC];
+			*result = p.min_segno;
+			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
+			goto got_result;
+		}
+	}
+
 	last_victim = sm->last_victim[p.gc_mode];
 	if (p.alloc_mode == LFS && gc_type == FG_GC) {
 		p.min_segno = check_bg_victims(sbi);
@@ -361,10 +382,25 @@
 			nsearched++;
 		}
 
+#ifdef CONFIG_F2FS_CHECK_FS
+		/*
+		 * skip selecting a segno that previously failed the block
+		 * validity check during GC, to avoid an endless GC loop in
+		 * such cases.
+		 */
+		if (test_bit(segno, sm->invalid_segmap))
+			goto next;
+#endif
+
 		secno = GET_SEC_FROM_SEG(sbi, segno);
 
 		if (sec_usage_check(sbi, secno))
 			goto next;
+		/* Don't touch checkpointed data */
+		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+					get_ckpt_valid_blocks(sbi, segno) &&
+					p.alloc_mode != SSR))
+			goto next;
 		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
 			goto next;
 
@@ -380,12 +416,15 @@
 				sm->last_victim[p.gc_mode] = last_victim + 1;
 			else
 				sm->last_victim[p.gc_mode] = segno + 1;
-			sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
+			sm->last_victim[p.gc_mode] %=
+				(MAIN_SECS(sbi) * sbi->segs_per_sec);
 			break;
 		}
 	}
 	if (p.min_segno != NULL_SEGNO) {
 got_it:
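+		/* round the victim down to the start of its allocation unit */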
+		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+got_result:
 		if (p.alloc_mode == LFS) {
 			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
 			if (gc_type == FG_GC)
@@ -393,13 +432,13 @@
 			else
 				set_bit(secno, dirty_i->victim_secmap);
 		}
-		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
 
+	}
+out:
+	if (p.min_segno != NULL_SEGNO)
 		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
 				sbi->cur_victim_sec,
 				prefree_segments(sbi), free_segments(sbi));
-	}
-out:
 	mutex_unlock(&dirty_i->seglist_lock);
 
 	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -464,7 +503,7 @@
  * On validity, copy that node with cold status, otherwise (invalid node)
  * ignore that.
  */
-static void gc_node_segment(struct f2fs_sb_info *sbi,
+static int gc_node_segment(struct f2fs_sb_info *sbi,
 		struct f2fs_summary *sum, unsigned int segno, int gc_type)
 {
 	struct f2fs_summary *entry;
@@ -472,6 +511,7 @@
 	int off;
 	int phase = 0;
 	bool fggc = (gc_type == FG_GC);
+	int submitted = 0;
 
 	start_addr = START_BLOCK(sbi, segno);
 
@@ -485,10 +525,11 @@
 		nid_t nid = le32_to_cpu(entry->nid);
 		struct page *node_page;
 		struct node_info ni;
+		int err;
 
 		/* stop BG_GC if there is not enough free sections. */
 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
-			return;
+			return submitted;
 
 		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
@@ -525,7 +566,9 @@
 			continue;
 		}
 
-		f2fs_move_node_page(node_page, gc_type);
+		err = f2fs_move_node_page(node_page, gc_type);
+		if (!err && gc_type == FG_GC)
+			submitted++;
 		stat_inc_node_blk_count(sbi, 1, gc_type);
 	}
 
@@ -534,6 +577,7 @@
 
 	if (fggc)
 		atomic_dec(&sbi->wb_sync_req[NODE]);
+	return submitted;
 }
 
 /*
@@ -560,7 +604,7 @@
 		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
 		bidx = node_ofs - 5 - dec;
 	}
-	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
+	return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
 }
 
 static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -584,9 +628,8 @@
 	}
 
 	if (sum->version != dni->version) {
-		f2fs_msg(sbi->sb, KERN_WARNING,
-				"%s: valid data with mismatched node version.",
-				__func__);
+		f2fs_warn(sbi, "%s: valid data with mismatched node version.",
+			  __func__);
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
 	}
 
@@ -594,8 +637,21 @@
 	source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
 	f2fs_put_page(node_page, 1);
 
-	if (source_blkaddr != blkaddr)
+	if (source_blkaddr != blkaddr) {
+#ifdef CONFIG_F2FS_CHECK_FS
+		unsigned int segno = GET_SEGNO(sbi, blkaddr);
+		unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+		if (unlikely(check_valid_map(sbi, segno, offset))) {
+			if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
+				f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n",
+						blkaddr, source_blkaddr, segno);
+				f2fs_bug_on(sbi, 1);
+			}
+		}
+#endif
 		return false;
+	}
 	return true;
 }
 
@@ -625,6 +681,11 @@
 
 	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
 		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
+						DATA_GENERIC_ENHANCE_READ))) {
+			err = -EFSCORRUPTED;
+			goto put_page;
+		}
 		goto got_it;
 	}
 
@@ -634,9 +695,13 @@
 		goto put_page;
 	f2fs_put_dnode(&dn);
 
+	if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
+		err = -ENOENT;
+		goto put_page;
+	}
 	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
-						DATA_GENERIC))) {
-		err = -EFAULT;
+						DATA_GENERIC_ENHANCE))) {
+		err = -EFSCORRUPTED;
 		goto put_page;
 	}
 got_it:
@@ -644,6 +709,14 @@
 	fio.page = page;
 	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
 
+	/*
+	 * don't cache encrypted data into the meta inode until previous dirty
+	 * data has been written back, to avoid racing between GC and flush.
+	 */
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
 	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
 					dn.data_blkaddr,
 					FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -669,7 +742,7 @@
  * Move data block via META_MAPPING while keeping locked data page.
  * This can be used to move blocks, aka LBAs, directly on disk.
  */
-static void move_data_block(struct inode *inode, block_t bidx,
+static int move_data_block(struct inode *inode, block_t bidx,
 				int gc_type, unsigned int segno, int off)
 {
 	struct f2fs_io_info fio = {
@@ -688,25 +761,29 @@
 	struct node_info ni;
 	struct page *page, *mpage;
 	block_t newaddr;
-	int err;
+	int err = 0;
 	bool lfs_mode = test_opt(fio.sbi, LFS);
 
 	/* do not read out */
 	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
 	if (!page)
-		return;
+		return -ENOMEM;
 
-	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
+		err = -ENOENT;
 		goto out;
+	}
 
 	if (f2fs_is_atomic_file(inode)) {
 		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
 		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
+		err = -EAGAIN;
 		goto out;
 	}
 
 	if (f2fs_is_pinned_file(inode)) {
 		f2fs_pin_file_control(inode, true);
+		err = -EAGAIN;
 		goto out;
 	}
 
@@ -717,6 +794,7 @@
 
 	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
 		ClearPageUptodate(page);
+		err = -ENOENT;
 		goto put_out;
 	}
 
@@ -724,7 +802,9 @@
 	 * don't cache encrypted data into meta inode until previous dirty
 	 * data were writebacked to avoid racing between GC and flush.
 	 */
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
 
 	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
 	if (err)
@@ -739,6 +819,29 @@
 	if (lfs_mode)
 		down_write(&fio.sbi->io_order_lock);
 
+	mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
+					fio.old_blkaddr, false);
+	if (!mpage)
+		goto up_out;
+
+	fio.encrypted_page = mpage;
+
+	/* read source block in mpage */
+	if (!PageUptodate(mpage)) {
+		err = f2fs_submit_page_bio(&fio);
+		if (err) {
+			f2fs_put_page(mpage, 1);
+			goto up_out;
+		}
+		lock_page(mpage);
+		if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
+						!PageUptodate(mpage))) {
+			err = -EIO;
+			f2fs_put_page(mpage, 1);
+			goto up_out;
+		}
+	}
+
 	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
 					&sum, CURSEG_COLD_DATA, NULL, false);
 
@@ -746,45 +849,19 @@
 				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
 	if (!fio.encrypted_page) {
 		err = -ENOMEM;
+		f2fs_put_page(mpage, 1);
 		goto recover_block;
 	}
 
-	mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
-					fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
-	if (mpage) {
-		bool updated = false;
+	/* write target block */
+	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
+	memcpy(page_address(fio.encrypted_page),
+				page_address(mpage), PAGE_SIZE);
+	f2fs_put_page(mpage, 1);
+	invalidate_mapping_pages(META_MAPPING(fio.sbi),
+				fio.old_blkaddr, fio.old_blkaddr);
 
-		if (PageUptodate(mpage)) {
-			memcpy(page_address(fio.encrypted_page),
-					page_address(mpage), PAGE_SIZE);
-			updated = true;
-		}
-		f2fs_put_page(mpage, 1);
-		invalidate_mapping_pages(META_MAPPING(fio.sbi),
-					fio.old_blkaddr, fio.old_blkaddr);
-		if (updated)
-			goto write_page;
-	}
-
-	err = f2fs_submit_page_bio(&fio);
-	if (err)
-		goto put_page_out;
-
-	/* write page */
-	lock_page(fio.encrypted_page);
-
-	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
-		err = -EIO;
-		goto put_page_out;
-	}
-	if (unlikely(!PageUptodate(fio.encrypted_page))) {
-		err = -EIO;
-		goto put_page_out;
-	}
-
-write_page:
 	set_page_dirty(fio.encrypted_page);
-	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
 	if (clear_page_dirty_for_io(fio.encrypted_page))
 		dec_page_count(fio.sbi, F2FS_DIRTY_META);
 
@@ -792,13 +869,14 @@
 	ClearPageError(page);
 
 	/* allocate block address */
-	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+	f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
 
 	fio.op = REQ_OP_WRITE;
 	fio.op_flags = REQ_SYNC;
 	fio.new_blkaddr = newaddr;
 	f2fs_submit_page_write(&fio);
 	if (fio.retry) {
+		err = -EAGAIN;
 		if (PageWriteback(fio.encrypted_page))
 			end_page_writeback(fio.encrypted_page);
 		goto put_page_out;
@@ -813,43 +891,52 @@
 put_page_out:
 	f2fs_put_page(fio.encrypted_page, 1);
 recover_block:
-	if (lfs_mode)
-		up_write(&fio.sbi->io_order_lock);
 	if (err)
 		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
 								true, true);
+up_out:
+	if (lfs_mode)
+		up_write(&fio.sbi->io_order_lock);
 put_out:
 	f2fs_put_dnode(&dn);
 out:
 	f2fs_put_page(page, 1);
+	return err;
 }
 
-static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
+static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
 							unsigned int segno, int off)
 {
 	struct page *page;
+	int err = 0;
 
 	page = f2fs_get_lock_data_page(inode, bidx, true);
 	if (IS_ERR(page))
-		return;
+		return PTR_ERR(page);
 
-	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
+		err = -ENOENT;
 		goto out;
+	}
 
 	if (f2fs_is_atomic_file(inode)) {
 		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
 		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
+		err = -EAGAIN;
 		goto out;
 	}
 	if (f2fs_is_pinned_file(inode)) {
 		if (gc_type == FG_GC)
 			f2fs_pin_file_control(inode, true);
+		err = -EAGAIN;
 		goto out;
 	}
 
 	if (gc_type == BG_GC) {
-		if (PageWriteback(page))
+		if (PageWriteback(page)) {
+			err = -EAGAIN;
 			goto out;
+		}
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
@@ -867,11 +954,11 @@
 			.io_type = FS_GC_DATA_IO,
 		};
 		bool is_dirty = PageDirty(page);
-		int err;
 
 retry:
+		f2fs_wait_on_page_writeback(page, DATA, true, true);
+
 		set_page_dirty(page);
-		f2fs_wait_on_page_writeback(page, DATA, true);
 		if (clear_page_dirty_for_io(page)) {
 			inode_dec_dirty_pages(inode);
 			f2fs_remove_dirty_inode(inode);
@@ -892,6 +979,7 @@
 	}
 out:
 	f2fs_put_page(page, 1);
+	return err;
 }
 
 /*
@@ -901,7 +989,7 @@
  * If the parent node is not valid or the data block address is different,
  * the victim data block is ignored.
  */
-static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
 {
 	struct super_block *sb = sbi->sb;
@@ -909,6 +997,7 @@
 	block_t start_addr;
 	int off;
 	int phase = 0;
+	int submitted = 0;
 
 	start_addr = START_BLOCK(sbi, segno);
 
@@ -925,7 +1014,7 @@
 
 		/* stop BG_GC if there is not enough free sections. */
 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
-			return;
+			return submitted;
 
 		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
@@ -997,6 +1086,7 @@
 		if (inode) {
 			struct f2fs_inode_info *fi = F2FS_I(inode);
 			bool locked = false;
+			int err;
 
 			if (S_ISREG(inode->i_mode)) {
 				if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
@@ -1016,12 +1106,16 @@
 			start_bidx = f2fs_start_bidx_of_node(nofs, inode)
 								+ ofs_in_node;
 			if (f2fs_post_read_required(inode))
-				move_data_block(inode, start_bidx, gc_type,
-								segno, off);
+				err = move_data_block(inode, start_bidx,
+							gc_type, segno, off);
 			else
-				move_data_page(inode, start_bidx, gc_type,
+				err = move_data_page(inode, start_bidx, gc_type,
 								segno, off);
 
+			if (!err && (gc_type == FG_GC ||
+					f2fs_post_read_required(inode)))
+				submitted++;
+
 			if (locked) {
 				up_write(&fi->i_gc_rwsem[WRITE]);
 				up_write(&fi->i_gc_rwsem[READ]);
@@ -1033,6 +1127,8 @@
 
 	if (++phase < 5)
 		goto next_step;
+
+	return submitted;
 }
 
 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -1057,18 +1153,34 @@
 	struct blk_plug plug;
 	unsigned int segno = start_segno;
 	unsigned int end_segno = start_segno + sbi->segs_per_sec;
-	int seg_freed = 0;
+	int seg_freed = 0, migrated = 0;
 	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
 						SUM_TYPE_DATA : SUM_TYPE_NODE;
+	int submitted = 0;
+
+	if (__is_large_section(sbi))
+		end_segno = rounddown(end_segno, sbi->segs_per_sec);
 
 	/* readahead multi ssa blocks those have contiguous address */
-	if (sbi->segs_per_sec > 1)
+	if (__is_large_section(sbi))
 		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
-					sbi->segs_per_sec, META_SSA, true);
+					end_segno - segno, META_SSA, true);
 
 	/* reference all summary page */
 	while (segno < end_segno) {
 		sum_page = f2fs_get_sum_page(sbi, segno++);
+		if (IS_ERR(sum_page)) {
+			int err = PTR_ERR(sum_page);
+
+			end_segno = segno - 1;
+			for (segno = start_segno; segno < end_segno; segno++) {
+				sum_page = find_get_page(META_MAPPING(sbi),
+						GET_SUM_BLOCK(sbi, segno));
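+				/* one put for find_get_page, one for f2fs_get_sum_page */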
+				f2fs_put_page(sum_page, 0);
+				f2fs_put_page(sum_page, 0);
+			}
+			return err;
+		}
 		unlock_page(sum_page);
 	}
 
@@ -1081,18 +1193,21 @@
 					GET_SUM_BLOCK(sbi, segno));
 		f2fs_put_page(sum_page, 0);
 
-		if (get_valid_blocks(sbi, segno, false) == 0 ||
-				!PageUptodate(sum_page) ||
-				unlikely(f2fs_cp_error(sbi)))
-			goto next;
+		if (get_valid_blocks(sbi, segno, false) == 0)
+			goto freed;
+		if (__is_large_section(sbi) &&
+				migrated >= sbi->migration_granularity)
+			goto skip;
+		if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
+			goto skip;
 
 		sum = page_address(sum_page);
 		if (type != GET_SUM_TYPE((&sum->footer))) {
-			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) "
-				"type [%d, %d] in SSA and SIT",
-				segno, type, GET_SUM_TYPE((&sum->footer)));
+			f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
+				 segno, type, GET_SUM_TYPE((&sum->footer)));
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
-			goto next;
+			f2fs_stop_checkpoint(sbi, false);
+			goto skip;
 		}
 
 		/*
@@ -1103,21 +1218,27 @@
 		 *                                  - lock_page(sum_page)
 		 */
 		if (type == SUM_TYPE_NODE)
-			gc_node_segment(sbi, sum->entries, segno, gc_type);
-		else
-			gc_data_segment(sbi, sum->entries, gc_list, segno,
+			submitted += gc_node_segment(sbi, sum->entries, segno,
 								gc_type);
+		else
+			submitted += gc_data_segment(sbi, sum->entries, gc_list,
+							segno, gc_type);
 
 		stat_inc_seg_count(sbi, type, gc_type);
 
+freed:
 		if (gc_type == FG_GC &&
 				get_valid_blocks(sbi, segno, false) == 0)
 			seg_freed++;
-next:
+		migrated++;
+
+		if (__is_large_section(sbi) && segno + 1 < end_segno)
+			sbi->next_victim_seg[gc_type] = segno + 1;
+skip:
 		f2fs_put_page(sum_page, 0);
 	}
 
-	if (gc_type == FG_GC)
+	if (submitted)
 		f2fs_submit_merged_write(sbi,
 				(type == SUM_TYPE_NODE) ? NODE : DATA);
 
@@ -1172,7 +1293,8 @@
 		 * threshold, we can make them free by checkpoint. Then, we
 		 * secure free segments which doesn't need fggc any more.
 		 */
-		if (prefree_segments(sbi)) {
+		if (prefree_segments(sbi) &&
+				!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
 			ret = f2fs_write_checkpoint(sbi, &cpc);
 			if (ret)
 				goto stop;
@@ -1204,7 +1326,7 @@
 		round++;
 	}
 
-	if (gc_type == FG_GC)
+	if (gc_type == FG_GC && seg_freed)
 		sbi->cur_victim_sec = NULL_SEGNO;
 
 	if (sync)
@@ -1224,7 +1346,7 @@
 			segno = NULL_SEGNO;
 			goto gc_more;
 		}
-		if (gc_type == FG_GC)
+		if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
 			ret = f2fs_write_checkpoint(sbi, &cpc);
 	}
 stop:
@@ -1244,7 +1366,7 @@
 
 	put_gc_inode(&gc_list);
 
-	if (sync)
+	if (sync && !ret)
 		ret = sec_freed ? 0 : -EAGAIN;
 	return ret;
 }
@@ -1256,7 +1378,180 @@
 	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
 
 	/* give warm/cold data area from slower device */
-	if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+	if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
 		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
 				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
 }
+
+static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
+							unsigned int end)
+{
+	int type;
+	unsigned int segno, next_inuse;
+	int err = 0;
+
+	/* Move out cursegs from the target range */
+	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++)
+		allocate_segment_for_resize(sbi, type, start, end);
+
+	/* do GC to move out valid blocks in the range */
+	for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
+		struct gc_inode_list gc_list = {
+			.ilist = LIST_HEAD_INIT(gc_list.ilist),
+			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
+		};
+
+		mutex_lock(&sbi->gc_mutex);
+		do_garbage_collect(sbi, segno, &gc_list, FG_GC);
+		mutex_unlock(&sbi->gc_mutex);
+		put_gc_inode(&gc_list);
+
+		if (get_valid_blocks(sbi, segno, true))
+			return -EAGAIN;
+	}
+
+	err = f2fs_sync_fs(sbi->sb, 1);
+	if (err)
+		return err;
+
+	next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
+	if (next_inuse <= end) {
+		f2fs_err(sbi, "segno %u should be free but still inuse!",
+			 next_inuse);
+		f2fs_bug_on(sbi, 1);
+	}
+	return err;
+}
+
+static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
+{
+	struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
+	int section_count = le32_to_cpu(raw_sb->section_count);
+	int segment_count = le32_to_cpu(raw_sb->segment_count);
+	int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
+	long long block_count = le64_to_cpu(raw_sb->block_count);
+	int segs = secs * sbi->segs_per_sec;
+
+	raw_sb->section_count = cpu_to_le32(section_count + secs);
+	raw_sb->segment_count = cpu_to_le32(segment_count + segs);
+	raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
+	raw_sb->block_count = cpu_to_le64(block_count +
+					(long long)segs * sbi->blocks_per_seg);
+}
+
+static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
+{
+	int segs = secs * sbi->segs_per_sec;
+	long long user_block_count =
+				le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
+
+	SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
+	MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
+	FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
+	FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
+	F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count +
+					(long long)segs * sbi->blocks_per_seg);
+}
+
+int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
+{
+	__u64 old_block_count, shrunk_blocks;
+	unsigned int secs;
+	int gc_mode, gc_type;
+	int err = 0;
+	__u32 rem;
+
+	old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
+	if (block_count > old_block_count)
+		return -EINVAL;
+
+	/* new fs size should align to section size */
+	div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
+	if (rem)
+		return -EINVAL;
+
+	if (block_count == old_block_count)
+		return 0;
+
+	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+		f2fs_err(sbi, "Should run fsck to repair first.");
+		return -EFSCORRUPTED;
+	}
+
+	if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+		f2fs_err(sbi, "Checkpoint should be enabled.");
+		return -EINVAL;
+	}
+
+	freeze_bdev(sbi->sb->s_bdev);
+
+	shrunk_blocks = old_block_count - block_count;
+	secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
+	spin_lock(&sbi->stat_lock);
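+	/* the shrunk range must consist entirely of free, unreserved blocks */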
+	if (shrunk_blocks + valid_user_blocks(sbi) +
+		sbi->current_reserved_blocks + sbi->unusable_block_count +
+		F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
+		err = -ENOSPC;
+	else
+		sbi->user_block_count -= shrunk_blocks;
+	spin_unlock(&sbi->stat_lock);
+	if (err) {
+		thaw_bdev(sbi->sb->s_bdev, sbi->sb);
+		return err;
+	}
+
+	mutex_lock(&sbi->resize_mutex);
+	set_sbi_flag(sbi, SBI_IS_RESIZEFS);
+
+	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
+
+	MAIN_SECS(sbi) -= secs;
+
+	for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
+		if (SIT_I(sbi)->last_victim[gc_mode] >=
+					MAIN_SECS(sbi) * sbi->segs_per_sec)
+			SIT_I(sbi)->last_victim[gc_mode] = 0;
+
+	for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
+		if (sbi->next_victim_seg[gc_type] >=
+					MAIN_SECS(sbi) * sbi->segs_per_sec)
+			sbi->next_victim_seg[gc_type] = NULL_SEGNO;
+
+	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
+
+	err = free_segment_range(sbi, MAIN_SECS(sbi) * sbi->segs_per_sec,
+			MAIN_SEGS(sbi) - 1);
+	if (err)
+		goto out;
+
+	update_sb_metadata(sbi, -secs);
+
+	err = f2fs_commit_super(sbi, false);
+	if (err) {
+		update_sb_metadata(sbi, secs);
+		goto out;
+	}
+
+	update_fs_metadata(sbi, -secs);
+	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+	err = f2fs_sync_fs(sbi->sb, 1);
+	if (err) {
+		update_fs_metadata(sbi, secs);
+		update_sb_metadata(sbi, secs);
+		f2fs_commit_super(sbi, false);
+	}
+out:
+	if (err) {
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
+
+		MAIN_SECS(sbi) += secs;
+		spin_lock(&sbi->stat_lock);
+		sbi->user_block_count += shrunk_blocks;
+		spin_unlock(&sbi->stat_lock);
+	}
+	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+	mutex_unlock(&sbi->resize_mutex);
+	thaw_bdev(sbi->sb->s_bdev, sbi->sb);
+	return err;
+}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index c8619e4..bbac9d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/gc.h
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #define GC_THREAD_MIN_WB_PAGES		1	/*
 						 * a threshold to determine
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index eb2e031..5bc4dcd 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/hash.c
  *
@@ -7,16 +8,13 @@
  * Portions of this code from linux/fs/ext3/hash.c
  *
  * Copyright (C) 2002 by Theodore Ts'o
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/cryptohash.h>
 #include <linux/pagemap.h>
+#include <linux/unicode.h>
 
 #include "f2fs.h"
 
@@ -70,7 +68,7 @@
 		*buf++ = pad;
 }
 
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info,
 				struct fscrypt_name *fname)
 {
 	__u32 hash;
@@ -106,3 +104,37 @@
 	f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
 	return f2fs_hash;
 }
+
+f2fs_hash_t f2fs_dentry_hash(const struct inode *dir,
+		const struct qstr *name_info, struct fscrypt_name *fname)
+{
+#ifdef CONFIG_UNICODE
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	const struct unicode_map *um = sbi->s_encoding;
+	int r, dlen;
+	unsigned char *buff;
+	struct qstr folded;
+
+	if (!name_info->len || !IS_CASEFOLDED(dir))
+		goto opaque_seq;
+
+	buff = f2fs_kzalloc(sbi, sizeof(char) * PATH_MAX, GFP_KERNEL);
+	if (!buff)
+		return -ENOMEM;
+
+	dlen = utf8_casefold(um, name_info, buff, PATH_MAX);
+	if (dlen < 0) {
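+		/* casefolding failed: fall back to hashing the raw name */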
+		kvfree(buff);
+		goto opaque_seq;
+	}
+	folded.name = buff;
+	folded.len = dlen;
+	r = __f2fs_dentry_hash(&folded, fname);
+
+	kvfree(buff);
+	return r;
+
+opaque_seq:
+#endif
+	return __f2fs_dentry_hash(name_info, fname);
+}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 115dc21..896db04 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/inline.c
  * Copyright (c) 2013, Intel Corporation
  * Authors: Huajun Li <huajun.li@intel.com>
  *          Haicheng Li <haicheng.li@intel.com>
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -74,7 +72,7 @@
 
 	addr = inline_data_addr(inode, ipage);
 
-	f2fs_wait_on_page_writeback(ipage, NODE, true);
+	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 	memset(addr + from, 0, MAX_INLINE_DATA(inode) - from);
 	set_page_dirty(ipage);
 
@@ -133,6 +131,7 @@
 
 	err = f2fs_get_node_info(fio.sbi, dn->nid, &ni);
 	if (err) {
+		f2fs_truncate_data_blocks_range(dn, 1);
 		f2fs_put_dnode(dn);
 		return err;
 	}
@@ -142,11 +141,9 @@
 	if (unlikely(dn->data_blkaddr != NEW_ADDR)) {
 		f2fs_put_dnode(dn);
 		set_sbi_flag(fio.sbi, SBI_NEED_FSCK);
-		f2fs_msg(fio.sbi->sb, KERN_WARNING,
-			"%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, "
-			"run fsck to fix.",
-			__func__, dn->inode->i_ino, dn->data_blkaddr);
-		return -EINVAL;
+		f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
+			  __func__, dn->inode->i_ino, dn->data_blkaddr);
+		return -EFSCORRUPTED;
 	}
 
 	f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
@@ -163,7 +160,7 @@
 	fio.old_blkaddr = dn->data_blkaddr;
 	set_inode_flag(dn->inode, FI_HOT_DATA);
 	f2fs_outplace_write_data(dn, &fio);
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
 	if (dirty) {
 		inode_dec_dirty_pages(dn->inode);
 		f2fs_remove_dirty_inode(dn->inode);
@@ -238,14 +235,14 @@
 
 	f2fs_bug_on(F2FS_I_SB(inode), page->index);
 
-	f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
+	f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
 	src_addr = kmap_atomic(page);
 	dst_addr = inline_data_addr(inode, dn.inode_page);
 	memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
 	kunmap_atomic(src_addr);
 	set_page_dirty(dn.inode_page);
 
-	f2fs_clear_radix_tree_dirty_tag(page);
+	f2fs_clear_page_cache_dirty_tag(page);
 
 	set_inode_flag(inode, FI_APPEND_WRITE);
 	set_inode_flag(inode, FI_DATA_EXIST);
@@ -279,7 +276,7 @@
 		ipage = f2fs_get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
 
-		f2fs_wait_on_page_writeback(ipage, NODE, true);
+		f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 
 		src_addr = inline_data_addr(inode, npage);
 		dst_addr = inline_data_addr(inode, ipage);
@@ -324,7 +321,7 @@
 		return NULL;
 	}
 
-	namehash = f2fs_dentry_hash(&name, fname);
+	namehash = f2fs_dentry_hash(dir, &name, fname);
 
 	inline_dentry = inline_data_addr(dir, ipage);
 
@@ -385,15 +382,13 @@
 	if (unlikely(dn.data_blkaddr != NEW_ADDR)) {
 		f2fs_put_dnode(&dn);
 		set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
-		f2fs_msg(F2FS_P_SB(page)->sb, KERN_WARNING,
-			"%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, "
-			"run fsck to fix.",
-			__func__, dir->i_ino, dn.data_blkaddr);
-		err = -EINVAL;
+		f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
+			  __func__, dir->i_ino, dn.data_blkaddr);
+		err = -EFSCORRUPTED;
 		goto out;
 	}
 
-	f2fs_wait_on_page_writeback(page, DATA, true);
+	f2fs_wait_on_page_writeback(page, DATA, true, true);
 
 	dentry_blk = page_address(page);
 
@@ -422,6 +417,14 @@
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(dir, FI_INLINE_DENTRY);
 
+	/*
+	 * reclaim the reserved space that was used to keep the
+	 * inline_dentry structure for backward compatibility.
+	 */
+	if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) &&
+			!f2fs_has_inline_xattr(dir))
+		F2FS_I(dir)->i_inline_xattr_size = 0;
+
 	f2fs_i_depth_write(dir, 1);
 	if (i_size_read(dir) < PAGE_SIZE)
 		f2fs_i_size_write(dir, PAGE_SIZE);
@@ -503,18 +506,27 @@
 
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(dir, FI_INLINE_DENTRY);
-	kfree(backup_dentry);
+
+	/*
+	 * reclaim the reserved space that was used to keep the
+	 * inline_dentry structure for backward compatibility.
+	 */
+	if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) &&
+			!f2fs_has_inline_xattr(dir))
+		F2FS_I(dir)->i_inline_xattr_size = 0;
+
+	kvfree(backup_dentry);
 	return 0;
 recover:
 	lock_page(ipage);
-	f2fs_wait_on_page_writeback(ipage, NODE, true);
+	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 	memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir));
 	f2fs_i_depth_write(dir, 0);
 	f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
 	set_page_dirty(ipage);
 	f2fs_put_page(ipage, 1);
 
-	kfree(backup_dentry);
+	kvfree(backup_dentry);
 	return err;
 }
 
@@ -567,9 +579,9 @@
 		}
 	}
 
-	f2fs_wait_on_page_writeback(ipage, NODE, true);
+	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 
-	name_hash = f2fs_dentry_hash(new_name, NULL);
+	name_hash = f2fs_dentry_hash(dir, new_name, NULL);
 	f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
 
 	set_page_dirty(ipage);
@@ -577,6 +589,11 @@
 	/* we don't need to mark_inode_dirty now */
 	if (inode) {
 		f2fs_i_pino_write(inode, dir->i_ino);
+
+		/* synchronize inode page's data from inode cache */
+		if (is_inode_flag_set(inode, FI_NEW_INODE))
+			f2fs_update_inode(inode, page);
+
 		f2fs_put_page(page, 1);
 	}
 
@@ -599,7 +616,7 @@
 	int i;
 
 	lock_page(page);
-	f2fs_wait_on_page_writeback(page, NODE, true);
+	f2fs_wait_on_page_writeback(page, NODE, true, true);
 
 	inline_dentry = inline_data_addr(dir, page);
 	make_dentry_ptr_inline(dir, &d, inline_dentry);
@@ -661,6 +678,12 @@
 	if (IS_ERR(ipage))
 		return PTR_ERR(ipage);
 
+	/*
+	 * f2fs_readdir is protected by inode->i_rwsem, so it is safe to
+	 * access ipage without holding the page lock.
+	 */
+	unlock_page(ipage);
+
 	inline_dentry = inline_data_addr(inode, ipage);
 
 	make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -669,7 +692,7 @@
 	if (!err)
 		ctx->pos = d.max;
 
-	f2fs_put_page(ipage, 1);
+	f2fs_put_page(ipage, 0);
 	return err < 0 ? err : 0;
 }
 
@@ -687,7 +710,13 @@
 	if (IS_ERR(ipage))
 		return PTR_ERR(ipage);
 
-	if (!f2fs_has_inline_data(inode)) {
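+	/* regular files/symlinks use inline data; directories use inline dentries */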
+	if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+				!f2fs_has_inline_data(inode)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	if (S_ISDIR(inode->i_mode) && !f2fs_has_inline_dentry(inode)) {
 		err = -EAGAIN;
 		goto out;
 	}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index dd608b8..db4fec3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/inode.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -17,6 +14,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "xattr.h"
 
 #include <trace/events/f2fs.h>
 
@@ -46,11 +44,15 @@
 		new_fl |= S_NOATIME;
 	if (flags & F2FS_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (f2fs_encrypted_inode(inode))
+	if (file_is_encrypt(inode))
 		new_fl |= S_ENCRYPTED;
+	if (file_is_verity(inode))
+		new_fl |= S_VERITY;
+	if (flags & F2FS_CASEFOLD_FL)
+		new_fl |= S_CASEFOLD;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
-			S_ENCRYPTED);
+			S_ENCRYPTED|S_VERITY|S_CASEFOLD);
 }
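
The hunk above extends the on-disk to in-core flag translation with S_VERITY and S_CASEFOLD. The idiom is worth noting: build a mask of flags derived from disk state, then apply it atomically through inode_set_flags(), which updates only the bits named in the mask and leaves the rest of inode->i_flags untouched. Condensed to two flags:

	unsigned int new_fl = 0;

	if (flags & F2FS_SYNC_FL)
		new_fl |= S_SYNC;
	if (flags & F2FS_CASEFOLD_FL)
		new_fl |= S_CASEFOLD;
	/* Only S_SYNC and S_CASEFOLD can change; other i_flags bits survive. */
	inode_set_flags(inode, new_fl, S_SYNC | S_CASEFOLD);
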
 
 static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -75,8 +77,8 @@
 
 	if (!__is_valid_data_blkaddr(addr))
 		return 1;
-	if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC))
-		return -EFAULT;
+	if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE))
+		return -EFSCORRUPTED;
 	return 0;
 }
 
@@ -106,7 +108,7 @@
 
 	while (start < end) {
 		if (*start++) {
-			f2fs_wait_on_page_writeback(ipage, NODE, true);
+			f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 
 			set_inode_flag(inode, FI_DATA_EXIST);
 			set_raw_inline(inode, F2FS_INODE(ipage));
@@ -121,7 +123,7 @@
 {
 	struct f2fs_inode *ri = &F2FS_NODE(page)->i;
 
-	if (!f2fs_sb_has_inode_chksum(sbi->sb))
+	if (!f2fs_sb_has_inode_chksum(sbi))
 		return false;
 
 	if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
@@ -178,9 +180,8 @@
 	calculated = f2fs_inode_chksum(sbi, page);
 
 	if (provided != calculated)
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"checksum invalid, ino = %x, %x vs. %x",
-			ino_of_node(page), provided, calculated);
+		f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x",
+			  page->index, ino_of_node(page), provided, calculated);
 
 	return provided == calculated;
 }
@@ -204,50 +205,53 @@
 	iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
 	if (!iblocks) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, "
-			"run fsck to fix.",
-			__func__, inode->i_ino, iblocks);
+		f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.",
+			  __func__, inode->i_ino, iblocks);
 		return false;
 	}
 
 	if (ino_of_node(node_page) != nid_of_node(node_page)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: corrupted inode footer i_ino=%lx, ino,nid: "
-			"[%u, %u] run fsck to fix.",
-			__func__, inode->i_ino,
-			ino_of_node(node_page), nid_of_node(node_page));
+		f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.",
+			  __func__, inode->i_ino,
+			  ino_of_node(node_page), nid_of_node(node_page));
 		return false;
 	}
 
-	if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)
+	if (f2fs_sb_has_flexible_inline_xattr(sbi)
 			&& !f2fs_has_extra_attr(inode)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: corrupted inode ino=%lx, run fsck to fix.",
-			__func__, inode->i_ino);
+		f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.",
+			  __func__, inode->i_ino);
 		return false;
 	}
 
 	if (f2fs_has_extra_attr(inode) &&
-			!f2fs_sb_has_extra_attr(sbi->sb)) {
+			!f2fs_sb_has_extra_attr(sbi)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: inode (ino=%lx) is with extra_attr, "
-			"but extra_attr feature is off",
-			__func__, inode->i_ino);
+		f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off",
+			  __func__, inode->i_ino);
 		return false;
 	}
 
 	if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE ||
 			fi->i_extra_isize % sizeof(__le32)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: inode (ino=%lx) has corrupted i_extra_isize: %d, "
-			"max: %zu",
-			__func__, inode->i_ino, fi->i_extra_isize,
-			F2FS_TOTAL_EXTRA_ATTR_SIZE);
+		f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu",
+			  __func__, inode->i_ino, fi->i_extra_isize,
+			  F2FS_TOTAL_EXTRA_ATTR_SIZE);
+		return false;
+	}
+
+	if (f2fs_has_extra_attr(inode) &&
+		f2fs_sb_has_flexible_inline_xattr(sbi) &&
+		f2fs_has_inline_xattr(inode) &&
+		(!fi->i_inline_xattr_size ||
+		fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
+			  __func__, inode->i_ino, fi->i_inline_xattr_size,
+			  MAX_INLINE_XATTR_SIZE);
 		return false;
 	}
 
@@ -255,15 +259,14 @@
 		struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
 
 		if (ei->len &&
-			(!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) ||
+			(!f2fs_is_valid_blkaddr(sbi, ei->blk,
+						DATA_GENERIC_ENHANCE) ||
 			!f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
-							DATA_GENERIC))) {
+						DATA_GENERIC_ENHANCE))) {
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
-			f2fs_msg(sbi->sb, KERN_WARNING,
-				"%s: inode (ino=%lx) extent info [%u, %u, %u] "
-				"is incorrect, run fsck to fix",
-				__func__, inode->i_ino,
-				ei->blk, ei->fofs, ei->len);
+			f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
+				  __func__, inode->i_ino,
+				  ei->blk, ei->fofs, ei->len);
 			return false;
 		}
 	}
@@ -271,19 +274,15 @@
 	if (f2fs_has_inline_data(inode) &&
 			(!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: inode (ino=%lx, mode=%u) should not have "
-			"inline_data, run fsck to fix",
-			__func__, inode->i_ino, inode->i_mode);
+		f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
+			  __func__, inode->i_ino, inode->i_mode);
 		return false;
 	}
 
 	if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: inode (ino=%lx, mode=%u) should not have "
-			"inline_dentry, run fsck to fix",
-			__func__, inode->i_ino, inode->i_mode);
+		f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix",
+			  __func__, inode->i_ino, inode->i_mode);
 		return false;
 	}
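
Every sanity_check_inode() branch above follows the same convention: on detecting on-disk corruption, mark the superblock for fsck, emit a single f2fs_warn() identifying the inode, and return false so the caller fails the lookup (now with -EFSCORRUPTED rather than -EINVAL). The shape of each check, condensed:

	if (fi->i_extra_isize % sizeof(__le32)) {
		set_sbi_flag(sbi, SBI_NEED_FSCK);	/* ask fsck to repair */
		f2fs_warn(sbi, "%s: inode (ino=%lx) is corrupted, run fsck to fix",
			  __func__, inode->i_ino);
		return false;				/* caller returns -EFSCORRUPTED */
	}
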
 
@@ -330,6 +329,8 @@
 					le16_to_cpu(ri->i_gc_failures);
 	fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
 	fi->i_flags = le32_to_cpu(ri->i_flags);
+	if (S_ISREG(inode->i_mode))
+		fi->i_flags &= ~F2FS_PROJINHERIT_FL;
 	fi->flags = 0;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
@@ -343,7 +344,7 @@
 	fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
 					le16_to_cpu(ri->i_extra_isize) : 0;
 
-	if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+	if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
 		fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
 	} else if (f2fs_has_inline_xattr(inode) ||
 				f2fs_has_inline_dentry(inode)) {
@@ -361,7 +362,7 @@
 
 	if (!sanity_check_inode(inode, node_page)) {
 		f2fs_put_page(node_page, 1);
-		return -EINVAL;
+		return -EFSCORRUPTED;
 	}
 
 	/* check data exist */
@@ -393,14 +394,14 @@
 	if (fi->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
 
-	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) &&
+	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) &&
 			F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
 		i_projid = (projid_t)le32_to_cpu(ri->i_projid);
 	else
 		i_projid = F2FS_DEF_PROJID;
 	fi->i_projid = make_kprojid(&init_user_ns, i_projid);
 
-	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) &&
+	if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) &&
 			F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
 		fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime);
 		fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
@@ -456,7 +457,7 @@
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
 		inode_nohighmem(inode);
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (f2fs_encrypted_inode(inode))
+		if (file_is_encrypt(inode))
 			inode->i_op = &f2fs_encrypted_symlink_inode_operations;
 		else
 			inode->i_op = &f2fs_symlink_inode_operations;
@@ -476,6 +477,7 @@
 	return inode;
 
 bad_inode:
+	f2fs_inode_synced(inode);
 	iget_failed(inode);
 	trace_f2fs_iget_exit(inode, ret);
 	return ERR_PTR(ret);
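
The added f2fs_inode_synced() on the bad_inode path clears the inode's dirty bookkeeping before iget_failed() evicts it; otherwise a failed iget could leave FI_DIRTY_INODE set on an inode that will never be written back. The overall f2fs_iget() error shape, simplified:

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* already cached and filled */

	ret = do_read_inode(inode);		/* read + sanity check from disk */
	if (ret)
		goto bad_inode;
	unlock_new_inode(inode);
	return inode;
bad_inode:
	f2fs_inode_synced(inode);		/* drop dirty state before eviction */
	iget_failed(inode);			/* unhashes, marks bad, iputs */
	return ERR_PTR(ret);
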
@@ -500,7 +502,7 @@
 	struct f2fs_inode *ri;
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 
-	f2fs_wait_on_page_writeback(node_page, NODE, true);
+	f2fs_wait_on_page_writeback(node_page, NODE, true, true);
 	set_page_dirty(node_page);
 
 	f2fs_inode_synced(inode);
@@ -545,11 +547,11 @@
 	if (f2fs_has_extra_attr(inode)) {
 		ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
 
-		if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb))
+		if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)))
 			ri->i_inline_xattr_size =
 				cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
 
-		if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
+		if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
 			F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
 								i_projid)) {
 			projid_t i_projid;
@@ -559,7 +561,7 @@
 			ri->i_projid = cpu_to_le32(i_projid);
 		}
 
-		if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) &&
+		if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
 			F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
 								i_crtime)) {
 			ri->i_crtime =
@@ -616,6 +618,9 @@
 	if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
 		return 0;
 
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
+
 	/*
 	 * We need to balance the fs here to prevent producing dirty node pages
 	 * during urgent cleaning when running out of free sections.
@@ -654,7 +659,11 @@
 	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
 
-	dquot_initialize(inode);
+	err = dquot_initialize(inode);
+	if (err) {
+		err = 0;
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	}
 
 	f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
 	f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
@@ -686,9 +695,11 @@
 		goto retry;
 	}
 
-	if (err)
+	if (err) {
 		f2fs_update_inode_page(inode);
-	dquot_free_inode(inode);
+		if (dquot_initialize_needed(inode))
+			set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	}
 	sb_end_intwrite(inode->i_sb);
 no_delete:
 	dquot_drop(inode);
@@ -697,7 +708,8 @@
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
 
-	if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)))
+	if (likely(!f2fs_cp_error(sbi) &&
+				!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
 		f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
 	else
 		f2fs_inode_synced(inode);
@@ -726,6 +738,7 @@
 	}
 out_clear:
 	fscrypt_put_encryption_info(inode);
+	fsverity_cleanup_inode(inode);
 	clear_inode(inode);
 }
 
@@ -760,8 +773,7 @@
 	err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
 	if (err) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"May loss orphan inode, run fsck to fix.");
+		f2fs_warn(sbi, "May lose orphan inode, run fsck to fix.");
 		goto out;
 	}
 
@@ -769,8 +781,7 @@
 		err = f2fs_acquire_orphan_inode(sbi);
 		if (err) {
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
-			f2fs_msg(sbi->sb, KERN_WARNING,
-				"Too many orphan inodes, run fsck to fix.");
+			f2fs_warn(sbi, "Too many orphan inodes, run fsck to fix.");
 		} else {
 			f2fs_add_orphan_inode(inode);
 		}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 1f67e38..4faf06e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1,24 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/namei.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
 #include <linux/ctype.h>
+#include <linux/random.h>
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 
 #include "f2fs.h"
 #include "node.h"
+#include "segment.h"
 #include "xattr.h"
 #include "acl.h"
 #include <trace/events/f2fs.h>
@@ -52,7 +51,7 @@
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	F2FS_I(inode)->i_crtime = inode->i_mtime;
-	inode->i_generation = sbi->s_next_generation++;
+	inode->i_generation = prandom_u32();
 
 	if (S_ISDIR(inode->i_mode))
 		F2FS_I(inode)->i_current_depth = 1;
@@ -63,7 +62,7 @@
 		goto fail;
 	}
 
-	if (f2fs_sb_has_project_quota(sbi->sb) &&
+	if (f2fs_sb_has_project_quota(sbi) &&
 		(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
 		F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
 	else
@@ -74,18 +73,14 @@
 	if (err)
 		goto fail_drop;
 
-	err = dquot_alloc_inode(inode);
-	if (err)
-		goto fail_drop;
-
 	set_inode_flag(inode, FI_NEW_INODE);
 
 	/* If the directory is encrypted, then we should encrypt the inode. */
-	if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+	if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
 				f2fs_may_encrypt(inode))
 		f2fs_set_encrypted_inode(inode);
 
-	if (f2fs_sb_has_extra_attr(sbi->sb)) {
+	if (f2fs_sb_has_extra_attr(sbi)) {
 		set_inode_flag(inode, FI_EXTRA_ATTR);
 		F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
 	}
@@ -98,7 +93,7 @@
 	if (f2fs_may_inline_dentry(inode))
 		set_inode_flag(inode, FI_INLINE_DENTRY);
 
-	if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+	if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
 		f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
 		if (f2fs_has_inline_xattr(inode))
 			xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
@@ -124,6 +119,8 @@
 	if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
 		set_inode_flag(inode, FI_PROJ_INHERIT);
 
+	f2fs_set_inode_flags(inode);
+
 	trace_f2fs_new_inode(inode, 0);
 	return inode;
 
@@ -146,7 +143,7 @@
 	return ERR_PTR(err);
 }
 
-static int is_extension_exist(const unsigned char *s, const char *sub)
+static inline int is_extension_exist(const unsigned char *s, const char *sub)
 {
 	size_t slen = strlen(s);
 	size_t sublen = strlen(sub);
@@ -184,16 +181,19 @@
 	hot_count = sbi->raw_super->hot_ext_count;
 
 	for (i = 0; i < cold_count + hot_count; i++) {
-		if (!is_extension_exist(name, extlist[i]))
-			continue;
-		if (i < cold_count)
-			file_set_cold(inode);
-		else
-			file_set_hot(inode);
-		break;
+		if (is_extension_exist(name, extlist[i]))
+			break;
 	}
 
 	up_read(&sbi->sb_lock);
+
+	if (i == cold_count + hot_count)
+		return;
+
+	if (i < cold_count)
+		file_set_cold(inode);
+	else
+		file_set_hot(inode);
 }
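
The set_file_temperature() rework narrows what happens under sb_lock: the loop now only records which extension matched, and the file_set_cold()/file_set_hot() side effects run after up_read(), so the rwsem covers nothing but the extension-list scan. The restructured pattern (match() standing in for is_extension_exist()):

	down_read(&sbi->sb_lock);
	for (i = 0; i < count; i++)
		if (match(name, extlist[i]))
			break;
	up_read(&sbi->sb_lock);		/* lock covers only the list scan */

	if (i == count)
		return;			/* no extension matched */
	if (i < cold_count)
		file_set_cold(inode);
	else
		file_set_hot(inode);
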
 
 int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
@@ -272,6 +272,8 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	err = dquot_initialize(dir);
 	if (err)
@@ -318,6 +320,8 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	err = fscrypt_prepare_link(old_dentry, dir, dentry);
 	if (err)
@@ -379,9 +383,8 @@
 	int err = 0;
 
 	if (f2fs_readonly(sbi->sb)) {
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"skip recovering inline_dots inode (ino:%lu, pino:%u) "
-			"in readonly mountpoint", dir->i_ino, pino);
+		f2fs_info(sbi, "skip recovering inline_dots inode (ino:%lu, pino:%u) in readonly mountpoint",
+			  dir->i_ino, pino);
 		return 0;
 	}
 
@@ -430,19 +433,23 @@
 	nid_t ino = -1;
 	int err = 0;
 	unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
+	struct fscrypt_name fname;
 
 	trace_f2fs_lookup_start(dir, dentry, flags);
 
-	err = fscrypt_prepare_lookup(dir, dentry, flags);
-	if (err)
-		goto out;
-
 	if (dentry->d_name.len > F2FS_NAME_LEN) {
 		err = -ENAMETOOLONG;
 		goto out;
 	}
 
-	de = f2fs_find_entry(dir, &dentry->d_name, &page);
+	err = fscrypt_prepare_lookup(dir, dentry, &fname);
+	if (err == -ENOENT)
+		goto out_splice;
+	if (err)
+		goto out;
+	de = __f2fs_find_entry(dir, &fname, &page);
+	fscrypt_free_filename(&fname);
+
 	if (!de) {
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
@@ -471,19 +478,28 @@
 		if (err)
 			goto out_iput;
 	}
-	if (f2fs_encrypted_inode(dir) &&
+	if (IS_ENCRYPTED(dir) &&
 	    (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
 	    !fscrypt_has_permitted_context(dir, inode)) {
-		f2fs_msg(inode->i_sb, KERN_WARNING,
-			 "Inconsistent encryption contexts: %lu/%lu",
-			 dir->i_ino, inode->i_ino);
+		f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %lu/%lu",
+			  dir->i_ino, inode->i_ino);
 		err = -EPERM;
 		goto out_iput;
 	}
 out_splice:
+#ifdef CONFIG_UNICODE
+	if (!inode && IS_CASEFOLDED(dir)) {
+		/* Eventually we want to call d_add_ci(dentry, NULL)
+		 * for negative dentries in the encoding case as
+		 * well.  For now, prevent the negative dentry
+		 * from being cached.
+		 */
+		trace_f2fs_lookup_end(dir, dentry, ino, err);
+		return NULL;
+	}
+#endif
 	new = d_splice_alias(inode, dentry);
-	if (IS_ERR(new))
-		err = PTR_ERR(new);
+	err = PTR_ERR_OR_ZERO(new);
 	trace_f2fs_lookup_end(dir, dentry, ino, err);
 	return new;
 out_iput:
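
PTR_ERR_OR_ZERO() above is shorthand for the removed two-line form: it yields PTR_ERR(ptr) when ptr encodes an error and 0 otherwise, which is exactly the value the trace point wants in err. Equivalent open-coded version:

	new = d_splice_alias(inode, dentry);
	if (IS_ERR(new))
		err = PTR_ERR(new);
	else
		err = 0;
	/* identical result: err = PTR_ERR_OR_ZERO(new); */
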
@@ -530,6 +546,16 @@
 		goto fail;
 	}
 	f2fs_delete_entry(de, page, dir, inode);
+#ifdef CONFIG_UNICODE
+	/* VFS negative dentries are incompatible with encoding and
+	 * case-insensitiveness. Eventually we'll want to avoid
+	 * invalidating the dentries here, along with returning
+	 * negative dentries from f2fs_lookup(), once this is better
+	 * supported by the VFS for the CI case.
+	 */
+	if (IS_CASEFOLDED(dir))
+		d_invalidate(dentry);
+#endif
 	f2fs_unlock_op(sbi);
 
 	if (IS_DIRSYNC(dir))
@@ -564,6 +590,8 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
 				      &disk_link);
@@ -627,7 +655,7 @@
 	f2fs_handle_failed_inode(inode);
 out_free_encrypted_link:
 	if (disk_link.name != (unsigned char *)symname)
-		kfree(disk_link.name);
+		kvfree(disk_link.name);
 	return err;
 }
 
@@ -693,6 +721,8 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	err = dquot_initialize(dir);
 	if (err)
@@ -791,8 +821,10 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
-	if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
+	if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
 		int err = fscrypt_get_encryption_info(dir);
 		if (err)
 			return err;
@@ -823,10 +855,12 @@
 	struct f2fs_dir_entry *old_entry;
 	struct f2fs_dir_entry *new_entry;
 	bool is_old_inline = f2fs_has_inline_dentry(old_dir);
-	int err = -ENOENT;
+	int err;
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
 			(!projid_eq(F2FS_I(new_dir)->i_projid,
@@ -847,6 +881,7 @@
 			goto out;
 	}
 
+	err = -ENOENT;
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_entry) {
 		if (IS_ERR(old_page))
@@ -983,6 +1018,8 @@
 
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		f2fs_sync_fs(sbi->sb, 1);
+
+	f2fs_update_time(sbi, REQ_TIME);
 	return 0;
 
 put_out_dir:
@@ -1012,10 +1049,12 @@
 	struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
 	struct f2fs_dir_entry *old_entry, *new_entry;
 	int old_nlink = 0, new_nlink = 0;
-	int err = -ENOENT;
+	int err;
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
 
 	if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
 			!projid_eq(F2FS_I(new_dir)->i_projid,
@@ -1033,6 +1072,7 @@
 	if (err)
 		goto out;
 
+	err = -ENOENT;
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_entry) {
 		if (IS_ERR(old_page))
@@ -1136,6 +1176,8 @@
 
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		f2fs_sync_fs(sbi->sb, 1);
+
+	f2fs_update_time(sbi, REQ_TIME);
 	return 0;
 out_new_dir:
 	if (new_dir_entry) {
@@ -1225,6 +1267,7 @@
 #ifdef CONFIG_F2FS_FS_XATTR
 	.listxattr	= f2fs_listxattr,
 #endif
+	.fiemap		= f2fs_fiemap,
 };
 
 const struct inode_operations f2fs_symlink_inode_operations = {
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 42ea42a..8b66bc4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/node.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -37,10 +34,9 @@
 {
 	if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		f2fs_msg(sbi->sb, KERN_WARNING,
-				"%s: out-of-range nid=%x, run fsck to fix.",
-				__func__, nid);
-		return -EINVAL;
+		f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
+			  __func__, nid);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -104,7 +100,7 @@
 static void clear_node_page_dirty(struct page *page)
 {
 	if (PageDirty(page)) {
-		f2fs_clear_radix_tree_dirty_tag(page);
+		f2fs_clear_page_cache_dirty_tag(page);
 		clear_page_dirty_for_io(page);
 		dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 	}
@@ -129,6 +125,8 @@
 
 	/* get current nat block page with lock */
 	src_page = get_current_nat_page(sbi, nid);
+	if (IS_ERR(src_page))
+		return src_page;
 	dst_page = f2fs_grab_meta_page(sbi, dst_off);
 	f2fs_bug_on(sbi, PageDirty(src_page));
 
@@ -455,7 +453,7 @@
 			new_blkaddr == NULL_ADDR);
 	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
 			new_blkaddr == NEW_ADDR);
-	f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) &&
+	f2fs_bug_on(sbi, __is_valid_data_blkaddr(nat_get_blkaddr(e)) &&
 			new_blkaddr == NEW_ADDR);
 
 	/* increment version no as node is removed */
@@ -466,7 +464,7 @@
 
 	/* change address */
 	nat_set_blkaddr(e, new_blkaddr);
-	if (!is_valid_data_blkaddr(sbi, new_blkaddr))
+	if (!__is_valid_data_blkaddr(new_blkaddr))
 		set_nat_flag(e, IS_CHECKPOINTED, false);
 	__set_nat_cache_dirty(nm_i, e);
 
@@ -527,6 +525,7 @@
 	struct f2fs_nat_entry ne;
 	struct nat_entry *e;
 	pgoff_t index;
+	block_t blkaddr;
 	int i;
 
 	ni->nid = nid;
@@ -570,6 +569,11 @@
 	node_info_from_raw_nat(ni, &ne);
 	f2fs_put_page(page, 1);
 cache:
+	blkaddr = le32_to_cpu(ne.block_addr);
+	if (__is_valid_data_blkaddr(blkaddr) &&
+		!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
+		return -EFAULT;
+
 	/* cache nat entry */
 	cache_nat_entry(sbi, nid, &ne);
 	return 0;
@@ -601,9 +605,9 @@
 pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
 {
 	const long direct_index = ADDRS_PER_INODE(dn->inode);
-	const long direct_blks = ADDRS_PER_BLOCK;
-	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
-	unsigned int skipped_unit = ADDRS_PER_BLOCK;
+	const long direct_blks = ADDRS_PER_BLOCK(dn->inode);
+	const long indirect_blks = ADDRS_PER_BLOCK(dn->inode) * NIDS_PER_BLOCK;
+	unsigned int skipped_unit = ADDRS_PER_BLOCK(dn->inode);
 	int cur_level = dn->cur_level;
 	int max_level = dn->max_level;
 	pgoff_t base = 0;
@@ -617,8 +621,10 @@
 	switch (dn->max_level) {
 	case 3:
 		base += 2 * indirect_blks;
+		/* fall through */
 	case 2:
 		base += 2 * direct_blks;
+		/* fall through */
 	case 1:
 		base += direct_index;
 		break;
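
The two /* fall through */ annotations are not cosmetic: with the kernel now building with -Wimplicit-fallthrough, every intentional drop into the next case must be marked. The switch accumulates, so the resulting base for each max_level works out to:

	/*
	 * max_level == 1:  base = direct_index
	 * max_level == 2:  base = 2 * direct_blks + direct_index
	 * max_level == 3:  base = 2 * indirect_blks + 2 * direct_blks
	 *                         + direct_index
	 */
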
@@ -637,9 +643,9 @@
 				int offset[4], unsigned int noffset[4])
 {
 	const long direct_index = ADDRS_PER_INODE(inode);
-	const long direct_blks = ADDRS_PER_BLOCK;
+	const long direct_blks = ADDRS_PER_BLOCK(inode);
 	const long dptrs_per_blk = NIDS_PER_BLOCK;
-	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+	const long indirect_blks = ADDRS_PER_BLOCK(inode) * NIDS_PER_BLOCK;
 	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
 	int n = 0;
 	int level = 0;
@@ -827,6 +833,7 @@
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct node_info ni;
 	int err;
+	pgoff_t index;
 
 	err = f2fs_get_node_info(sbi, dn->nid, &ni);
 	if (err)
@@ -846,10 +853,11 @@
 	clear_node_page_dirty(dn->node_page);
 	set_sbi_flag(sbi, SBI_IS_DIRTY);
 
+	index = dn->node_page->index;
 	f2fs_put_page(dn->node_page, 1);
 
 	invalidate_mapping_pages(NODE_MAPPING(sbi),
-			dn->node_page->index, dn->node_page->index);
+			index, index);
 
 	dn->node_page = NULL;
 	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
@@ -1105,7 +1113,7 @@
 				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
 			lock_page(page);
 			BUG_ON(page->mapping != NODE_MAPPING(sbi));
-			f2fs_wait_on_page_writeback(page, NODE, true);
+			f2fs_wait_on_page_writeback(page, NODE, true, true);
 			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
 			set_page_dirty(page);
 			unlock_page(page);
@@ -1178,8 +1186,12 @@
 		f2fs_put_dnode(&dn);
 		return -EIO;
 	}
-	f2fs_bug_on(F2FS_I_SB(inode),
-			inode->i_blocks != 0 && inode->i_blocks != 8);
+
+	if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
+		f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+			  inode->i_ino, (unsigned long long)inode->i_blocks);
+		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+	}
 
 	/* will put inode & node pages */
 	err = truncate_node(&dn);
@@ -1233,7 +1245,7 @@
 	new_ni.version = 0;
 	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
 
-	f2fs_wait_on_page_writeback(page, NODE, true);
+	f2fs_wait_on_page_writeback(page, NODE, true, true);
 	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
 	set_cold_node(page, S_ISDIR(dn->inode->i_mode));
 	if (!PageUptodate(page))
@@ -1274,9 +1286,10 @@
 	int err;
 
 	if (PageUptodate(page)) {
-#ifdef CONFIG_F2FS_CHECK_FS
-		f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page));
-#endif
+		if (!f2fs_inode_chksum_verify(sbi, page)) {
+			ClearPageUptodate(page);
+			return -EFSBADCRC;
+		}
 		return LOCKED_PAGE;
 	}
 
@@ -1307,9 +1320,7 @@
 	if (f2fs_check_nid_range(sbi, nid))
 		return;
 
-	rcu_read_lock();
-	apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
-	rcu_read_unlock();
+	apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
 	if (apage)
 		return;
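
The xa_load() conversion can drop the explicit rcu_read_lock()/rcu_read_unlock() pair because XArray lookups take the RCU read lock internally; i_pages has been an XArray since the 4.20 page-cache conversion, so poking it with raw radix-tree calls was a leftover. Before and after, side by side:

	/* old: caller must bracket the lookup with RCU */
	rcu_read_lock();
	apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
	rcu_read_unlock();

	/* new: xa_load() handles RCU itself */
	apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
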
 
@@ -1361,16 +1372,15 @@
 	}
 
 	if (!f2fs_inode_chksum_verify(sbi, page)) {
-		err = -EBADMSG;
+		err = -EFSBADCRC;
 		goto out_err;
 	}
 page_hit:
 	if (unlikely(nid != nid_of_node(page))) {
-		f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
-			"nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
-			nid, nid_of_node(page), ino_of_node(page),
-			ofs_of_node(page), cpver_of_node(page),
-			next_blkaddr_of_node(page));
+		f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+			  nid, nid_of_node(page), ino_of_node(page),
+			  ofs_of_node(page), cpver_of_node(page),
+			  next_blkaddr_of_node(page));
 		err = -EINVAL;
 out_err:
 		ClearPageUptodate(page);
@@ -1514,7 +1524,8 @@
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 
-	if (wbc->sync_mode == WB_SYNC_NONE &&
+	if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+			wbc->sync_mode == WB_SYNC_NONE &&
 			IS_DNODE(page) && is_cold_node(page))
 		goto redirty_out;
 
@@ -1542,7 +1553,8 @@
 	}
 
 	if (__is_valid_data_blkaddr(ni.blk_addr) &&
-		!f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) {
+		!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
+					DATA_GENERIC_ENHANCE)) {
 		up_read(&sbi->node_write);
 		goto redirty_out;
 	}
@@ -1566,8 +1578,7 @@
 	up_read(&sbi->node_write);
 
 	if (wbc->for_reclaim) {
-		f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0,
-						page->index, NODE);
+		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
 		submitted = NULL;
 	}
 
@@ -1589,8 +1600,10 @@
 	return AOP_WRITEPAGE_ACTIVATE;
 }
 
-void f2fs_move_node_page(struct page *node_page, int gc_type)
+int f2fs_move_node_page(struct page *node_page, int gc_type)
 {
+	int err = 0;
+
 	if (gc_type == FG_GC) {
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_ALL,
@@ -1598,16 +1611,20 @@
 			.for_reclaim = 0,
 		};
 
-		set_page_dirty(node_page);
-		f2fs_wait_on_page_writeback(node_page, NODE, true);
+		f2fs_wait_on_page_writeback(node_page, NODE, true, true);
 
-		f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
-		if (!clear_page_dirty_for_io(node_page))
+		set_page_dirty(node_page);
+
+		if (!clear_page_dirty_for_io(node_page)) {
+			err = -EAGAIN;
 			goto out_page;
+		}
 
 		if (__write_node_page(node_page, false, NULL,
-					&wbc, false, FS_GC_NODE_IO, NULL))
+					&wbc, false, FS_GC_NODE_IO, NULL)) {
+			err = -EAGAIN;
 			unlock_page(node_page);
+		}
 		goto release_page;
 	} else {
 		/* set page dirty and write it */
@@ -1618,6 +1635,7 @@
 	unlock_page(node_page);
 release_page:
 	f2fs_put_page(node_page, 0);
+	return err;
 }
 
 static int f2fs_write_node_page(struct page *page,
@@ -1632,13 +1650,13 @@
 			unsigned int *seq_id)
 {
 	pgoff_t index;
-	pgoff_t last_idx = ULONG_MAX;
 	struct pagevec pvec;
 	int ret = 0;
 	struct page *last_page = NULL;
 	bool marked = false;
 	nid_t ino = inode->i_ino;
 	int nr_pages;
+	int nwritten = 0;
 
 	if (atomic) {
 		last_page = last_fsync_dnode(sbi, ino);
@@ -1684,8 +1702,7 @@
 				goto continue_unlock;
 			}
 
-			f2fs_wait_on_page_writeback(page, NODE, true);
-			BUG_ON(PageWriteback(page));
+			f2fs_wait_on_page_writeback(page, NODE, true, true);
 
 			set_fsync_mark(page, 0);
 			set_dentry_mark(page, 0);
@@ -1716,7 +1733,7 @@
 				f2fs_put_page(last_page, 0);
 				break;
 			} else if (submitted) {
-				last_idx = page->index;
+				nwritten++;
 			}
 
 			if (page == last_page) {
@@ -1732,21 +1749,61 @@
 			break;
 	}
 	if (!ret && atomic && !marked) {
-		f2fs_msg(sbi->sb, KERN_DEBUG,
-			"Retry to write fsync mark: ino=%u, idx=%lx",
-					ino, last_page->index);
+		f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
+			   ino, last_page->index);
 		lock_page(last_page);
-		f2fs_wait_on_page_writeback(last_page, NODE, true);
+		f2fs_wait_on_page_writeback(last_page, NODE, true, true);
 		set_page_dirty(last_page);
 		unlock_page(last_page);
 		goto retry;
 	}
 out:
-	if (last_idx != ULONG_MAX)
-		f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE);
+	if (nwritten)
+		f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
 	return ret ? -EIO: 0;
 }
 
+static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	bool clean;
+
+	if (inode->i_ino != ino)
+		return 0;
+
+	if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
+		return 0;
+
+	spin_lock(&sbi->inode_lock[DIRTY_META]);
+	clean = list_empty(&F2FS_I(inode)->gdirty_list);
+	spin_unlock(&sbi->inode_lock[DIRTY_META]);
+
+	if (clean)
+		return 0;
+
+	inode = igrab(inode);
+	if (!inode)
+		return 0;
+	return 1;
+}
+
+static bool flush_dirty_inode(struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+	struct inode *inode;
+	nid_t ino = ino_of_node(page);
+
+	inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
+	if (!inode)
+		return false;
+
+	f2fs_update_inode(inode, page);
+	unlock_page(page);
+
+	iput(inode);
+	return true;
+}
+
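
flush_dirty_inode() is built on find_inode_nowait(), the VFS helper that walks the inode hash under inode_hash_lock and hands each candidate to the match callback; because that spinlock is held, the callback must not sleep. Returning 0 keeps scanning; returning 1 hands the inode back to the caller — here only after igrab(), so the reference survives the lock drop. The calling convention, sketched with a hypothetical my_match():

	static int my_match(struct inode *inode, unsigned long ino, void *data)
	{
		if (inode->i_ino != ino)
			return 0;		/* not it, keep scanning */
		return igrab(inode) ? 1 : 0;	/* claim it with a reference */
	}

	inode = find_inode_nowait(sb, ino, my_match, NULL);
	if (inode) {
		/* ... use inode ... */
		iput(inode);			/* balance the igrab() */
	}
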
 int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
 				struct writeback_control *wbc,
 				bool do_balance, enum iostat_type io_type)
@@ -1770,6 +1827,7 @@
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			bool submitted = false;
+			bool may_dirty = true;
 
 			/* give a priority to WB_SYNC threads */
 			if (atomic_read(&sbi->wb_sync_req[NODE]) &&
@@ -1817,9 +1875,15 @@
 				goto lock_node;
 			}
 
-			f2fs_wait_on_page_writeback(page, NODE, true);
+			/* flush dirty inode */
+			if (IS_INODE(page) && may_dirty) {
+				may_dirty = false;
+				if (flush_dirty_inode(page))
+					goto lock_node;
+			}
 
-			BUG_ON(PageWriteback(page));
+			f2fs_wait_on_page_writeback(page, NODE, true, true);
+
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
@@ -1846,7 +1910,8 @@
 	}
 
 	if (step < 2) {
-		if (wbc->sync_mode == WB_SYNC_NONE && step == 1)
+		if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+				wbc->sync_mode == WB_SYNC_NONE && step == 1)
 			goto out;
 		step++;
 		goto next_step;
@@ -1886,7 +1951,7 @@
 		get_page(page);
 		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 
-		f2fs_wait_on_page_writeback(page, NODE, true);
+		f2fs_wait_on_page_writeback(page, NODE, true, false);
 		if (TestClearPageError(page))
 			ret = -EIO;
 
@@ -1917,7 +1982,9 @@
 	f2fs_balance_fs_bg(sbi);
 
 	/* collect a number of dirty node pages and write together */
-	if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+	if (wbc->sync_mode != WB_SYNC_ALL &&
+			get_pages(sbi, F2FS_DIRTY_NODES) <
+					nr_pages_to_skip(sbi, NODE))
 		goto skip_write;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1956,7 +2023,7 @@
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
-		SetPagePrivate(page);
+		f2fs_set_page_private(page, 0);
 		f2fs_trace_pid(page);
 		return 1;
 	}
@@ -2071,6 +2138,9 @@
 	if (unlikely(nid == 0))
 		return false;
 
+	if (unlikely(f2fs_check_nid_range(sbi, nid)))
+		return false;
+
 	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
 	i->nid = nid;
 	i->state = FREE_NID;
@@ -2270,15 +2340,18 @@
 						nm_i->nat_block_bitmap)) {
 			struct page *page = get_current_nat_page(sbi, nid);
 
-			ret = scan_nat_page(sbi, page, nid);
-			f2fs_put_page(page, 1);
+			if (IS_ERR(page)) {
+				ret = PTR_ERR(page);
+			} else {
+				ret = scan_nat_page(sbi, page, nid);
+				f2fs_put_page(page, 1);
+			}
 
 			if (ret) {
 				up_read(&nm_i->nat_tree_lock);
 				f2fs_bug_on(sbi, !mount);
-				f2fs_msg(sbi->sb, KERN_ERR,
-					"NAT is corrupt, run fsck to fix it");
-				return -EINVAL;
+				f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+				return ret;
 			}
 		}
 
@@ -2355,8 +2428,9 @@
 	spin_unlock(&nm_i->nid_list_lock);
 
 	/* Let's scan NAT pages and their caches to get free nids */
-	f2fs_build_free_nids(sbi, true, false);
-	goto retry;
+	if (!f2fs_build_free_nids(sbi, true, false))
+		goto retry;
+	return false;
 }
 
 /*
@@ -2459,7 +2533,7 @@
 	src_addr = inline_xattr_addr(inode, page);
 	inline_size = inline_xattr_size(inode);
 
-	f2fs_wait_on_page_writeback(ipage, NODE, true);
+	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
 	memcpy(dst_addr, src_addr, inline_size);
 update_inode:
 	f2fs_update_inode(inode, ipage);
@@ -2553,17 +2627,17 @@
 	if (dst->i_inline & F2FS_EXTRA_ATTR) {
 		dst->i_extra_isize = src->i_extra_isize;
 
-		if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
+		if (f2fs_sb_has_flexible_inline_xattr(sbi) &&
 			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
 							i_inline_xattr_size))
 			dst->i_inline_xattr_size = src->i_inline_xattr_size;
 
-		if (f2fs_sb_has_project_quota(sbi->sb) &&
+		if (f2fs_sb_has_project_quota(sbi) &&
 			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
 								i_projid))
 			dst->i_projid = src->i_projid;
 
-		if (f2fs_sb_has_inode_crtime(sbi->sb) &&
+		if (f2fs_sb_has_inode_crtime(sbi) &&
 			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
 							i_crtime_nsec)) {
 			dst->i_crtime = src->i_crtime;
@@ -2696,7 +2770,7 @@
 		i = 1;
 	}
 	for (; i < NAT_ENTRY_PER_BLOCK; i++) {
-		if (nat_blk->entries[i].block_addr != NULL_ADDR)
+		if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
 			valid++;
 	}
 	if (valid == 0) {
@@ -2712,7 +2786,7 @@
 		__clear_bit_le(nat_index, nm_i->full_nat_bits);
 }
 
-static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 		struct nat_entry_set *set, struct cp_control *cpc)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -2736,6 +2810,9 @@
 		down_write(&curseg->journal_rwsem);
 	} else {
 		page = get_next_nat_page(sbi, start_nid);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
 		nat_blk = page_address(page);
 		f2fs_bug_on(sbi, !nat_blk);
 	}
@@ -2781,12 +2858,13 @@
 		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
 		kmem_cache_free(nat_entry_set_slab, set);
 	}
+	return 0;
 }
 
 /*
  * This function is called during the checkpointing process.
  */
-void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -2796,6 +2874,7 @@
 	unsigned int found;
 	nid_t set_idx = 0;
 	LIST_HEAD(sets);
+	int err = 0;
 
 	/* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
 	if (enabled_nat_bits(sbi, cpc)) {
@@ -2805,7 +2884,7 @@
 	}
 
 	if (!nm_i->dirty_nat_cnt)
-		return;
+		return 0;
 
 	down_write(&nm_i->nat_tree_lock);
 
@@ -2828,11 +2907,16 @@
 	}
 
 	/* flush dirty nats in nat entry set */
-	list_for_each_entry_safe(set, tmp, &sets, set_list)
-		__flush_nat_entry_set(sbi, set, cpc);
+	list_for_each_entry_safe(set, tmp, &sets, set_list) {
+		err = __flush_nat_entry_set(sbi, set, cpc);
+		if (err)
+			break;
+	}
 
 	up_write(&nm_i->nat_tree_lock);
 	/* Allow dirty nats by node block allocation in write_begin */
+
+	return err;
 }
 
 static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
@@ -2859,10 +2943,8 @@
 		struct page *page;
 
 		page = f2fs_get_meta_page(sbi, nat_bits_addr++);
-		if (IS_ERR(page)) {
-			disable_nat_bits(sbi, true);
+		if (IS_ERR(page))
 			return PTR_ERR(page);
-		}
 
 		memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
 					page_address(page), F2FS_BLKSIZE);
@@ -2878,7 +2960,7 @@
 	nm_i->full_nat_bits = nm_i->nat_bits + 8;
 	nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
 
-	f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
+	f2fs_notice(sbi, "Found nat_bits in checkpoint");
 	return 0;
 }
 
@@ -2933,7 +3015,7 @@
 
 	/* not used nids: 0, node, meta, (and root counted as valid node) */
 	nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
-				sbi->nquota_files - F2FS_RESERVED_NODE_NUM;
+						F2FS_RESERVED_NODE_NUM;
 	nm_i->nid_cnt[FREE_NID] = 0;
 	nm_i->nid_cnt[PREALLOC_NID] = 0;
 	nm_i->nat_cnt = 0;
@@ -3097,17 +3179,17 @@
 
 		for (i = 0; i < nm_i->nat_blocks; i++)
 			kvfree(nm_i->free_nid_bitmap[i]);
-		kfree(nm_i->free_nid_bitmap);
+		kvfree(nm_i->free_nid_bitmap);
 	}
 	kvfree(nm_i->free_nid_count);
 
-	kfree(nm_i->nat_bitmap);
-	kfree(nm_i->nat_bits);
+	kvfree(nm_i->nat_bitmap);
+	kvfree(nm_i->nat_bits);
 #ifdef CONFIG_F2FS_CHECK_FS
-	kfree(nm_i->nat_bitmap_mir);
+	kvfree(nm_i->nat_bitmap_mir);
 #endif
 	sbi->nm_info = NULL;
-	kfree(nm_i);
+	kvfree(nm_i);
 }
 
 int __init f2fs_create_node_manager_caches(void)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0f4db7a..e05af5d 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/node.h
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 /* start node id of a node block dedicated to the given node id */
 #define	START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
@@ -364,7 +361,7 @@
 {
 	struct f2fs_node *rn = F2FS_NODE(p);
 
-	f2fs_wait_on_page_writeback(p, NODE, true);
+	f2fs_wait_on_page_writeback(p, NODE, true, true);
 
 	if (i)
 		rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9a8579f..783773e 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/recovery.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -99,8 +96,12 @@
 	return ERR_PTR(err);
 }
 
-static void del_fsync_inode(struct fsync_inode_entry *entry)
+static void del_fsync_inode(struct fsync_inode_entry *entry, int drop)
 {
+	if (drop) {
+		/* inode should not be recovered, drop it */
+		f2fs_inode_synced(entry->inode);
+	}
 	iput(entry->inode);
 	list_del(&entry->list);
 	kmem_cache_free(fsync_entry_slab, entry);
@@ -187,10 +188,36 @@
 		name = "<encrypted>";
 	else
 		name = raw_inode->i_name;
-	f2fs_msg(inode->i_sb, KERN_NOTICE,
-			"%s: ino = %x, name = %s, dir = %lx, err = %d",
-			__func__, ino_of_node(ipage), name,
-			IS_ERR(dir) ? 0 : dir->i_ino, err);
+	f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d",
+		    __func__, ino_of_node(ipage), name,
+		    IS_ERR(dir) ? 0 : dir->i_ino, err);
+	return err;
+}
+
+static int recover_quota_data(struct inode *inode, struct page *page)
+{
+	struct f2fs_inode *raw = F2FS_INODE(page);
+	struct iattr attr;
+	uid_t i_uid = le32_to_cpu(raw->i_uid);
+	gid_t i_gid = le32_to_cpu(raw->i_gid);
+	int err;
+
+	memset(&attr, 0, sizeof(attr));
+
+	attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
+	attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+
+	if (!uid_eq(attr.ia_uid, inode->i_uid))
+		attr.ia_valid |= ATTR_UID;
+	if (!gid_eq(attr.ia_gid, inode->i_gid))
+		attr.ia_valid |= ATTR_GID;
+
+	if (!attr.ia_valid)
+		return 0;
+
+	err = dquot_transfer(inode, &attr);
+	if (err)
+		set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
 	return err;
 }
 
@@ -206,12 +233,41 @@
 		clear_inode_flag(inode, FI_DATA_EXIST);
 }
 
-static void recover_inode(struct inode *inode, struct page *page)
+static int recover_inode(struct inode *inode, struct page *page)
 {
 	struct f2fs_inode *raw = F2FS_INODE(page);
 	char *name;
+	int err;
 
 	inode->i_mode = le16_to_cpu(raw->i_mode);
+
+	err = recover_quota_data(inode, page);
+	if (err)
+		return err;
+
+	i_uid_write(inode, le32_to_cpu(raw->i_uid));
+	i_gid_write(inode, le32_to_cpu(raw->i_gid));
+
+	if (raw->i_inline & F2FS_EXTRA_ATTR) {
+		if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
+			F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize),
+								i_projid)) {
+			projid_t i_projid;
+			kprojid_t kprojid;
+
+			i_projid = (projid_t)le32_to_cpu(raw->i_projid);
+			kprojid = make_kprojid(&init_user_ns, i_projid);
+
+			if (!projid_eq(kprojid, F2FS_I(inode)->i_projid)) {
+				err = f2fs_transfer_project_quota(inode,
+								kprojid);
+				if (err)
+					return err;
+				F2FS_I(inode)->i_projid = kprojid;
+			}
+		}
+	}
+
 	f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
 	inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
 	inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
@@ -222,17 +278,22 @@
 
 	F2FS_I(inode)->i_advise = raw->i_advise;
 	F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
+	f2fs_set_inode_flags(inode);
+	F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] =
+				le16_to_cpu(raw->i_gc_failures);
 
 	recover_inline_flags(inode, raw);
 
+	f2fs_mark_inode_dirty_sync(inode, true);
+
 	if (file_enc_name(inode))
 		name = "<encrypted>";
 	else
 		name = F2FS_INODE(page)->i_name;
 
-	f2fs_msg(inode->i_sb, KERN_NOTICE,
-		"recover_inode: ino = %x, name = %s, inline = %x",
-			ino_of_node(page), name, raw->i_inline);
+	f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x",
+		    ino_of_node(page), name, raw->i_inline);
+	return 0;
 }
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
@@ -262,8 +323,10 @@
 			break;
 		}
 
-		if (!is_recoverable_dnode(page))
+		if (!is_recoverable_dnode(page)) {
+			f2fs_put_page(page, 1);
 			break;
+		}
 
 		if (!is_fsync_dnode(page))
 			goto next;
@@ -275,8 +338,10 @@
 			if (!check_only &&
 					IS_INODE(page) && is_dent_dnode(page)) {
 				err = f2fs_recover_inode_page(sbi, page);
-				if (err)
+				if (err) {
+					f2fs_put_page(page, 1);
 					break;
+				}
 				quota_inode = true;
 			}
 
@@ -292,6 +357,7 @@
 					err = 0;
 					goto next;
 				}
+				f2fs_put_page(page, 1);
 				break;
 			}
 		}
@@ -303,10 +369,10 @@
 		/* sanity check in order to detect looped node chain */
 		if (++loop_cnt >= free_blocks ||
 			blkaddr == next_blkaddr_of_node(page)) {
-			f2fs_msg(sbi->sb, KERN_NOTICE,
-				"%s: detect looped node chain, "
-				"blkaddr:%u, next:%u",
-				__func__, blkaddr, next_blkaddr_of_node(page));
+			f2fs_notice(sbi, "%s: detect looped node chain, blkaddr:%u, next:%u",
+				    __func__, blkaddr,
+				    next_blkaddr_of_node(page));
+			f2fs_put_page(page, 1);
 			err = -EINVAL;
 			break;
 		}
@@ -317,16 +383,15 @@
 
 		f2fs_ra_meta_pages_cond(sbi, blkaddr);
 	}
-	f2fs_put_page(page, 1);
 	return err;
 }
 
-static void destroy_fsync_dnodes(struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head, int drop)
 {
 	struct fsync_inode_entry *entry, *tmp;
 
 	list_for_each_entry_safe(entry, tmp, head, list)
-		del_fsync_inode(entry);
+		del_fsync_inode(entry, drop);
 }
 
 static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -359,6 +424,8 @@
 	}
 
 	sum_page = f2fs_get_sum_page(sbi, segno);
+	if (IS_ERR(sum_page))
+		return PTR_ERR(sum_page);
 	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
 	sum = sum_node->entries[blkoff];
 	f2fs_put_page(sum_page, 1);
@@ -474,14 +541,21 @@
 		goto out;
 	}
 
-	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+	f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
 
 	err = f2fs_get_node_info(sbi, dn.nid, &ni);
 	if (err)
 		goto err;
 
 	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
-	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
+
+	if (ofs_of_node(dn.node_page) != ofs_of_node(page)) {
+		f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u",
+			  inode->i_ino, ofs_of_node(dn.node_page),
+			  ofs_of_node(page));
+		err = -EFSCORRUPTED;
+		goto err;
+	}
 
 	for (; start < end; start++, dn.ofs_in_node++) {
 		block_t src, dest;
@@ -489,6 +563,18 @@
 		src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
 		dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
 
+		if (__is_valid_data_blkaddr(src) &&
+			!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
+			err = -EFSCORRUPTED;
+			goto err;
+		}
+
+		if (__is_valid_data_blkaddr(dest) &&
+			!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
+			err = -EFSCORRUPTED;
+			goto err;
+		}
+
 		/* skip recovering if dest is the same as src */
 		if (src == dest)
 			continue;
@@ -552,16 +638,14 @@
 err:
 	f2fs_put_dnode(&dn);
 out:
-	f2fs_msg(sbi->sb, KERN_NOTICE,
-		"recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
-		inode->i_ino,
-		file_keep_isize(inode) ? "keep" : "recover",
-		recovered, err);
+	f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
+		    inode->i_ino, file_keep_isize(inode) ? "keep" : "recover",
+		    recovered, err);
 	return err;
 }
 
 static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
-						struct list_head *dir_list)
+		struct list_head *tmp_inode_list, struct list_head *dir_list)
 {
 	struct curseg_info *curseg;
 	struct page *page = NULL;
@@ -599,8 +683,13 @@
 		 * In this case, we can lose the latest inode(x).
 		 * So, call recover_inode for the inode update.
 		 */
-		if (IS_INODE(page))
-			recover_inode(entry->inode, page);
+		if (IS_INODE(page)) {
+			err = recover_inode(entry->inode, page);
+			if (err) {
+				f2fs_put_page(page, 1);
+				break;
+			}
+		}
 		if (entry->last_dentry == blkaddr) {
 			err = recover_dentry(entry->inode, page, dir_list);
 			if (err) {
@@ -615,7 +704,7 @@
 		}
 
 		if (entry->blkaddr == blkaddr)
-			del_fsync_inode(entry);
+			list_move_tail(&entry->list, tmp_inode_list);
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
@@ -628,7 +717,7 @@
 
 int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 {
-	struct list_head inode_list;
+	struct list_head inode_list, tmp_inode_list;
 	struct list_head dir_list;
 	int err;
 	int ret = 0;
@@ -639,8 +728,7 @@
 #endif
 
 	if (s_flags & SB_RDONLY) {
-		f2fs_msg(sbi->sb, KERN_INFO,
-				"recover fsync data on readonly fs");
+		f2fs_info(sbi, "recover fsync data on readonly fs");
 		sbi->sb->s_flags &= ~SB_RDONLY;
 	}
 
@@ -659,6 +747,7 @@
 	}
 
 	INIT_LIST_HEAD(&inode_list);
+	INIT_LIST_HEAD(&tmp_inode_list);
 	INIT_LIST_HEAD(&dir_list);
 
 	/* prevent checkpoint */
@@ -677,11 +766,16 @@
 	need_writecp = true;
 
 	/* step #2: recover data */
-	err = recover_data(sbi, &inode_list, &dir_list);
+	err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
 	if (!err)
 		f2fs_bug_on(sbi, !list_empty(&inode_list));
+	else {
+		/* restore s_flags to let iput() trash data */
+		sbi->sb->s_flags = s_flags;
+	}
 skip:
-	destroy_fsync_dnodes(&inode_list);
+	destroy_fsync_dnodes(&inode_list, err);
+	destroy_fsync_dnodes(&tmp_inode_list, err);
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
@@ -690,13 +784,13 @@
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
 		truncate_inode_pages_final(META_MAPPING(sbi));
+	} else {
+		clear_sbi_flag(sbi, SBI_POR_DOING);
 	}
-
-	clear_sbi_flag(sbi, SBI_POR_DOING);
 	mutex_unlock(&sbi->cp_mutex);
 
 	/* let's drop all the directory inodes for a clean checkpoint */
-	destroy_fsync_dnodes(&dir_list);
+	destroy_fsync_dnodes(&dir_list, err);
 
 	if (need_writecp) {
 		set_sbi_flag(sbi, SBI_IS_RECOVERED);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 30779aa..8087095 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/segment.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -179,6 +176,8 @@
 		return false;
 	if (sbi->gc_mode == GC_URGENT)
 		return true;
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		return true;
 
 	return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
 			SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
@@ -186,14 +185,11 @@
 
 void f2fs_register_inmem_page(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct inmem_pages *new;
 
 	f2fs_trace_pid(page);
 
-	set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
-	SetPagePrivate(page);
+	f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
 
 	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
 
@@ -202,21 +198,18 @@
 	INIT_LIST_HEAD(&new->list);
 
 	/* increase reference count with clean state */
-	mutex_lock(&fi->inmem_lock);
 	get_page(page);
-	list_add_tail(&new->list, &fi->inmem_pages);
-	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
-	if (list_empty(&fi->inmem_ilist))
-		list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
-	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+	mutex_lock(&F2FS_I(inode)->inmem_lock);
+	list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages);
 	inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
-	mutex_unlock(&fi->inmem_lock);
+	mutex_unlock(&F2FS_I(inode)->inmem_lock);
 
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
 static int __revoke_inmem_pages(struct inode *inode,
-				struct list_head *head, bool drop, bool recover)
+				struct list_head *head, bool drop, bool recover,
+				bool trylock)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct inmem_pages *cur, *tmp;
@@ -228,9 +221,18 @@
 		if (drop)
 			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
 
-		lock_page(page);
+		if (trylock) {
+			/*
+			 * Use trylock to avoid a deadlock between the page
+			 * lock and inmem_lock.
+			 */
+			if (!trylock_page(page))
+				continue;
+		} else {
+			lock_page(page);
+		}
 
-		f2fs_wait_on_page_writeback(page, DATA, true);
+		f2fs_wait_on_page_writeback(page, DATA, true, true);
 
 		if (recover) {
 			struct dnode_of_data dn;
@@ -267,10 +269,11 @@
 		}
 next:
 		/* we don't need to invalidate this in the successful case */
-		if (drop || recover)
+		if (drop || recover) {
 			ClearPageUptodate(page);
-		set_page_private(page, 0);
-		ClearPagePrivate(page);
+			clear_cold_data(page);
+		}
+		f2fs_clear_page_private(page);
 		f2fs_put_page(page, 1);
 
 		list_del(&cur->list);
@@ -317,17 +320,21 @@
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
-	mutex_lock(&fi->inmem_lock);
-	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
-	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
-	if (!list_empty(&fi->inmem_ilist))
-		list_del_init(&fi->inmem_ilist);
-	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
-	mutex_unlock(&fi->inmem_lock);
+	while (!list_empty(&fi->inmem_pages)) {
+		mutex_lock(&fi->inmem_lock);
+		__revoke_inmem_pages(inode, &fi->inmem_pages,
+						true, false, true);
+		mutex_unlock(&fi->inmem_lock);
+	}
 
 	clear_inode_flag(inode, FI_ATOMIC_FILE);
 	fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
 	stat_dec_atomic_write(inode);
+
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (!list_empty(&fi->inmem_ilist))
+		list_del_init(&fi->inmem_ilist);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
 }
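
The new loop in f2fs_drop_inmem_pages() breaks a lock-order inversion: this path used to take page locks while holding inmem_lock, but f2fs_drop_inmem_page() takes inmem_lock with the page already locked — an ABBA deadlock. __revoke_inmem_pages(..., trylock = true) therefore skips any page it cannot lock immediately, and the outer loop drops inmem_lock between passes so the contending thread can finish. In skeleton form:

	while (!list_empty(&fi->inmem_pages)) {
		mutex_lock(&fi->inmem_lock);
		/* trylock_page() inside; unlockable pages stay on the list */
		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false, true);
		mutex_unlock(&fi->inmem_lock);	/* let the page-lock holder progress */
	}
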
 
 void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
@@ -353,8 +360,7 @@
 	kmem_cache_free(inmem_entry_slab, cur);
 
 	ClearPageUptodate(page);
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
+	f2fs_clear_page_private(page);
 	f2fs_put_page(page, 0);
 
 	trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
@@ -374,7 +380,7 @@
 		.io_type = FS_DATA_IO,
 	};
 	struct list_head revoke_list;
-	pgoff_t last_idx = ULONG_MAX;
+	bool submit_bio = false;
 	int err = 0;
 
 	INIT_LIST_HEAD(&revoke_list);
@@ -386,8 +392,9 @@
 		if (page->mapping == inode->i_mapping) {
 			trace_f2fs_commit_inmem_page(page, INMEM);
 
+			f2fs_wait_on_page_writeback(page, DATA, true, true);
+
 			set_page_dirty(page);
-			f2fs_wait_on_page_writeback(page, DATA, true);
 			if (clear_page_dirty_for_io(page)) {
 				inode_dec_dirty_pages(inode);
 				f2fs_remove_dirty_inode(inode);
@@ -409,14 +416,14 @@
 			}
 			/* record old blkaddr for revoking */
 			cur->old_addr = fio.old_blkaddr;
-			last_idx = page->index;
+			submit_bio = true;
 		}
 		unlock_page(page);
 		list_move_tail(&cur->list, &revoke_list);
 	}
 
-	if (last_idx != ULONG_MAX)
-		f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA);
+	if (submit_bio)
+		f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA);
 
 	if (err) {
 		/*
@@ -427,12 +434,15 @@
 		 * recovery or rewrite & commit last transaction. For other
 		 * error number, revoking was done by filesystem itself.
 		 */
-		err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		err = __revoke_inmem_pages(inode, &revoke_list,
+						false, true, false);
 
 		/* drop all uncommitted pages */
-		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+		__revoke_inmem_pages(inode, &fi->inmem_pages,
+						true, false, false);
 	} else {
-		__revoke_inmem_pages(inode, &revoke_list, false, false);
+		__revoke_inmem_pages(inode, &revoke_list,
+						false, false, false);
 	}
 
 	return err;
@@ -453,11 +463,6 @@
 
 	mutex_lock(&fi->inmem_lock);
 	err = __f2fs_commit_inmem_pages(inode);
-
-	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
-	if (!list_empty(&fi->inmem_ilist))
-		list_del_init(&fi->inmem_ilist);
-	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
 	mutex_unlock(&fi->inmem_lock);
 
 	clear_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -483,6 +488,9 @@
 	if (need && excess_cached_nats(sbi))
 		f2fs_balance_fs_bg(sbi);
 
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return;
+
 	/*
 	 * We should do GC or end up with a checkpoint if there are too many
 	 * dirty dir/node pages and not enough free segments.
@@ -511,7 +519,7 @@
 	else
 		f2fs_build_free_nids(sbi, false, false);
 
-	if (!is_idle(sbi) &&
+	if (!is_idle(sbi, REQ_TIME) &&
 		(!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi)))
 		return;
 
@@ -525,9 +533,13 @@
 		if (test_opt(sbi, DATA_FLUSH)) {
 			struct blk_plug plug;
 
+			mutex_lock(&sbi->flush_lock);
+
 			blk_start_plug(&plug);
 			f2fs_sync_dirty_inodes(sbi, FILE_INODE);
 			blk_finish_plug(&plug);
+
+			mutex_unlock(&sbi->flush_lock);
 		}
 		f2fs_sync_fs(sbi->sb, true);
 		stat_inc_bg_cp_count(sbi->stat_info);
@@ -537,9 +549,13 @@
 static int __submit_flush_wait(struct f2fs_sb_info *sbi,
 				struct block_device *bdev)
 {
-	struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
+	struct bio *bio;
 	int ret;
 
+	bio = f2fs_bio_alloc(sbi, 0, false);
+	if (!bio)
+		return -ENOMEM;
+
 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
 	bio_set_dev(bio, bdev);
 	ret = submit_bio_wait(bio);
@@ -555,7 +571,7 @@
 	int ret = 0;
 	int i;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return __submit_flush_wait(sbi, sbi->sb->s_bdev);
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
@@ -616,14 +632,17 @@
 		return 0;
 
 	if (!test_opt(sbi, FLUSH_MERGE)) {
+		atomic_inc(&fcc->queued_flush);
 		ret = submit_flush_wait(sbi, ino);
+		atomic_dec(&fcc->queued_flush);
 		atomic_inc(&fcc->issued_flush);
 		return ret;
 	}
 
-	if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) {
+	if (atomic_inc_return(&fcc->queued_flush) == 1 ||
+	    f2fs_is_multi_device(sbi)) {
 		ret = submit_flush_wait(sbi, ino);
-		atomic_dec(&fcc->issing_flush);
+		atomic_dec(&fcc->queued_flush);
 
 		atomic_inc(&fcc->issued_flush);
 		return ret;
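/*
 * [Editor's example] The issing_flush -> queued_flush rename above
 * separates two different counters: "queued" tracks requests currently in
 * flight (incremented before submission, decremented on completion, so
 * idle checks can test it), while "issued" is a monotonically growing
 * completion statistic. Hedged sketch of the pattern:
 */
static atomic_t queued = ATOMIC_INIT(0);
static atomic_t issued = ATOMIC_INIT(0);

static int account_one_flush(struct f2fs_sb_info *sbi, nid_t ino)
{
	int ret;

	atomic_inc(&queued);	/* in flight: visible to idle checks */
	ret = submit_flush_wait(sbi, ino);
	atomic_dec(&queued);
	atomic_inc(&issued);	/* lifetime count, never decremented */
	return ret;
}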
@@ -642,14 +661,14 @@
 
 	if (fcc->f2fs_issue_flush) {
 		wait_for_completion(&cmd.wait);
-		atomic_dec(&fcc->issing_flush);
+		atomic_dec(&fcc->queued_flush);
 	} else {
 		struct llist_node *list;
 
 		list = llist_del_all(&fcc->issue_list);
 		if (!list) {
 			wait_for_completion(&cmd.wait);
-			atomic_dec(&fcc->issing_flush);
+			atomic_dec(&fcc->queued_flush);
 		} else {
 			struct flush_cmd *tmp, *next;
 
@@ -658,7 +677,7 @@
 			llist_for_each_entry_safe(tmp, next, list, llnode) {
 				if (tmp == &cmd) {
 					cmd.ret = ret;
-					atomic_dec(&fcc->issing_flush);
+					atomic_dec(&fcc->queued_flush);
 					continue;
 				}
 				tmp->ret = ret;
@@ -687,7 +706,7 @@
 	if (!fcc)
 		return -ENOMEM;
 	atomic_set(&fcc->issued_flush, 0);
-	atomic_set(&fcc->issing_flush, 0);
+	atomic_set(&fcc->queued_flush, 0);
 	init_waitqueue_head(&fcc->flush_wait_queue);
 	init_llist_head(&fcc->issue_list);
 	SM_I(sbi)->fcc_info = fcc;
@@ -699,7 +718,7 @@
 				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(fcc->f2fs_issue_flush)) {
 		err = PTR_ERR(fcc->f2fs_issue_flush);
-		kfree(fcc);
+		kvfree(fcc);
 		SM_I(sbi)->fcc_info = NULL;
 		return err;
 	}
@@ -718,7 +737,7 @@
 		kthread_stop(flush_thread);
 	}
 	if (free) {
-		kfree(fcc);
+		kvfree(fcc);
 		SM_I(sbi)->fcc_info = NULL;
 	}
 }
@@ -727,7 +746,7 @@
 {
 	int ret = 0, i;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return 0;
 
 	for (i = 1; i < sbi->s_ndevs; i++) {
@@ -785,9 +804,13 @@
 		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
 			dirty_i->nr_dirty[t]--;
 
-		if (get_valid_blocks(sbi, segno, true) == 0)
+		if (get_valid_blocks(sbi, segno, true) == 0) {
 			clear_bit(GET_SEC_FROM_SEG(sbi, segno),
 						dirty_i->victim_secmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+			clear_bit(segno, SIT_I(sbi)->invalid_segmap);
+#endif
+		}
 	}
 }
 
@@ -799,7 +822,7 @@
 static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned short valid_blocks;
+	unsigned short valid_blocks, ckpt_valid_blocks;
 
 	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
 		return;
@@ -807,8 +830,10 @@
 	mutex_lock(&dirty_i->seglist_lock);
 
 	valid_blocks = get_valid_blocks(sbi, segno, false);
+	ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno);
 
-	if (valid_blocks == 0) {
+	if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
+				ckpt_valid_blocks == sbi->blocks_per_seg)) {
 		__locate_dirty_segment(sbi, segno, PRE);
 		__remove_dirty_segment(sbi, segno, DIRTY);
 	} else if (valid_blocks < sbi->blocks_per_seg) {
@@ -821,6 +846,82 @@
 	mutex_unlock(&dirty_i->seglist_lock);
 }
 
+/* This moves currently empty dirty segments to prefree. Must hold seglist_lock */
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int segno;
+
+	mutex_lock(&dirty_i->seglist_lock);
+	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+		if (get_valid_blocks(sbi, segno, false))
+			continue;
+		if (IS_CURSEG(sbi, segno))
+			continue;
+		__locate_dirty_segment(sbi, segno, PRE);
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	}
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
+{
+	int ovp_hole_segs =
+		(overprovision_segments(sbi) - reserved_segments(sbi));
+	block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	block_t holes[2] = {0, 0};	/* DATA and NODE */
+	block_t unusable;
+	struct seg_entry *se;
+	unsigned int segno;
+
+	mutex_lock(&dirty_i->seglist_lock);
+	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+		se = get_seg_entry(sbi, segno);
+		if (IS_NODESEG(se->type))
+			holes[NODE] += sbi->blocks_per_seg - se->valid_blocks;
+		else
+			holes[DATA] += sbi->blocks_per_seg - se->valid_blocks;
+	}
+	mutex_unlock(&dirty_i->seglist_lock);
+
+	unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE];
+	if (unusable > ovp_holes)
+		return unusable - ovp_holes;
+	return 0;
+}
+
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
+{
+	int ovp_hole_segs =
+		(overprovision_segments(sbi) - reserved_segments(sbi));
+	if (unusable > F2FS_OPTION(sbi).unusable_cap)
+		return -EAGAIN;
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
+		dirty_segments(sbi) > ovp_hole_segs)
+		return -EAGAIN;
+	return 0;
+}
+
+/* This is only used by SBI_CP_DISABLED */
+static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int segno = 0;
+
+	mutex_lock(&dirty_i->seglist_lock);
+	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+		if (get_valid_blocks(sbi, segno, false))
+			continue;
+		if (get_ckpt_valid_blocks(sbi, segno))
+			continue;
+		mutex_unlock(&dirty_i->seglist_lock);
+		return segno;
+	}
+	mutex_unlock(&dirty_i->seglist_lock);
+	return NULL_SEGNO;
+}
+
 static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
 		struct block_device *bdev, block_t lstart,
 		block_t start, block_t len)
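/*
 * [Editor's worked example] for f2fs_get_unusable_blocks() above, with
 * hypothetical numbers: take 60 overprovision segments, 30 reserved
 * segments and log_blocks_per_seg = 9 (512 blocks per segment). Then
 *   ovp_hole_segs = 60 - 30 = 30
 *   ovp_holes     = 30 << 9  = 15360 blocks
 * If dirty DATA segments currently hold 20000 invalid-but-unreclaimable
 * blocks and dirty NODE segments hold 12000, the worse case governs:
 *   unusable = max(20000, 12000) - 15360 = 4640 blocks
 * i.e. 4640 blocks that checkpoint=disable mode must subtract from the
 * space it reports as free.
 */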
@@ -841,7 +942,7 @@
 	dc->len = len;
 	dc->ref = 0;
 	dc->state = D_PREP;
-	dc->issuing = 0;
+	dc->queued = 0;
 	dc->error = 0;
 	init_completion(&dc->wait);
 	list_add_tail(&dc->list, pend_list);
@@ -856,7 +957,8 @@
 static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
 				struct block_device *bdev, block_t lstart,
 				block_t start, block_t len,
-				struct rb_node *parent, struct rb_node **p)
+				struct rb_node *parent, struct rb_node **p,
+				bool leftmost)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	struct discard_cmd *dc;
@@ -864,7 +966,7 @@
 	dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
 
 	rb_link_node(&dc->rb_node, parent, p);
-	rb_insert_color(&dc->rb_node, &dcc->root);
+	rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost);
 
 	return dc;
 }
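/*
 * [Editor's example] The switch to rb_insert_color_cached() above adopts
 * the kernel's cached rbtree: struct rb_root_cached keeps a pointer to
 * the leftmost node, so the smallest element is found in O(1) via
 * rb_first_cached(). The inserter must report whether the new node became
 * the leftmost. Hedged sketch with a hypothetical item type:
 */
struct item {
	struct rb_node rb;
	u64 key;
};

static void item_insert(struct rb_root_cached *root, struct item *new)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		if (new->key < rb_entry(parent, struct item, rb)->key) {
			p = &parent->rb_left;
		} else {
			p = &parent->rb_right;
			leftmost = false;	/* descended right once */
		}
	}
	rb_link_node(&new->rb, parent, p);
	rb_insert_color_cached(&new->rb, root, leftmost);
}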
@@ -873,10 +975,10 @@
 							struct discard_cmd *dc)
 {
 	if (dc->state == D_DONE)
-		atomic_sub(dc->issuing, &dcc->issing_discard);
+		atomic_sub(dc->queued, &dcc->queued_discard);
 
 	list_del(&dc->list);
-	rb_erase(&dc->rb_node, &dcc->root);
+	rb_erase_cached(&dc->rb_node, &dcc->root);
 	dcc->undiscard_blks -= dc->len;
 
 	kmem_cache_free(discard_cmd_slab, dc);
@@ -905,9 +1007,9 @@
 		dc->error = 0;
 
 	if (dc->error)
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"Issue discard(%u, %u, %u) failed, ret: %d",
-			dc->lstart, dc->start, dc->len, dc->error);
+		printk_ratelimited(
+			"%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d",
+			KERN_INFO, dc->lstart, dc->start, dc->len, dc->error);
 	__detach_discard_cmd(dcc, dc);
 }
 
@@ -967,6 +1069,7 @@
 
 	dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
 	dpolicy->io_aware_gran = MAX_PLIST_NUM;
+	dpolicy->timeout = 0;
 
 	if (discard_type == DPOLICY_BG) {
 		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -989,6 +1092,8 @@
 	} else if (discard_type == DPOLICY_UMOUNT) {
 		dpolicy->max_requests = UINT_MAX;
 		dpolicy->io_aware = false;
+		/* we need to issue all to keep CP_TRIMMED_FLAG */
+		dpolicy->granularity = 1;
 	}
 }
 
@@ -1076,12 +1181,12 @@
 		dc->bio_ref++;
 		spin_unlock_irqrestore(&dc->lock, flags);
 
-		atomic_inc(&dcc->issing_discard);
-		dc->issuing++;
+		atomic_inc(&dcc->queued_discard);
+		dc->queued++;
 		list_move_tail(&dc->list, wait_list);
 
 		/* sanity check on discard range */
-		__check_sit_bitmap(sbi, start, start + len);
+		__check_sit_bitmap(sbi, lstart, lstart + len);
 
 		bio->bi_private = dc;
 		bio->bi_end_io = f2fs_submit_discard_endio;
@@ -1113,6 +1218,7 @@
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct discard_cmd *dc = NULL;
+	bool leftmost = true;
 
 	if (insert_p && insert_parent) {
 		parent = insert_parent;
@@ -1120,9 +1226,11 @@
 		goto do_insert;
 	}
 
-	p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart);
+	p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent,
+							lstart, &leftmost);
 do_insert:
-	dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p);
+	dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent,
+								p, leftmost);
 	if (!dc)
 		return NULL;
 
@@ -1190,7 +1298,7 @@
 					NULL, lstart,
 					(struct rb_entry **)&prev_dc,
 					(struct rb_entry **)&next_dc,
-					&insert_p, &insert_parent, true);
+					&insert_p, &insert_parent, true, NULL);
 	if (dc)
 		prev_dc = dc;
 
@@ -1268,9 +1376,12 @@
 {
 	block_t lblkstart = blkstart;
 
+	if (!f2fs_bdev_support_discard(bdev))
+		return 0;
+
 	trace_f2fs_queue_discard(bdev, blkstart, blklen);
 
-	if (sbi->s_ndevs) {
+	if (f2fs_is_multi_device(sbi)) {
 		int devi = f2fs_target_device_index(sbi, blkstart);
 
 		blkstart -= FDEV(devi).start_blk;
@@ -1298,7 +1409,7 @@
 					NULL, pos,
 					(struct rb_entry **)&prev_dc,
 					(struct rb_entry **)&next_dc,
-					&insert_p, &insert_parent, true);
+					&insert_p, &insert_parent, true, NULL);
 	if (!dc)
 		dc = next_dc;
 
@@ -1311,7 +1422,7 @@
 		if (dc->state != D_PREP)
 			goto next;
 
-		if (dpolicy->io_aware && !is_idle(sbi)) {
+		if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) {
 			io_interrupted = true;
 			break;
 		}
@@ -1351,7 +1462,14 @@
 	int i, issued = 0;
 	bool io_interrupted = false;
 
+	if (dpolicy->timeout != 0)
+		f2fs_update_time(sbi, dpolicy->timeout);
+
 	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+		if (dpolicy->timeout != 0 &&
+				f2fs_time_over(sbi, dpolicy->timeout))
+			break;
+
 		if (i + 1 < dpolicy->granularity)
 			break;
 
@@ -1370,8 +1488,12 @@
 		list_for_each_entry_safe(dc, tmp, pend_list, list) {
 			f2fs_bug_on(sbi, dc->state != D_PREP);
 
+			if (dpolicy->timeout != 0 &&
+				f2fs_time_over(sbi, dpolicy->timeout))
+				break;
+
 			if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
-								!is_idle(sbi)) {
+						!is_idle(sbi, DISCARD_TIME)) {
 				io_interrupted = true;
 				break;
 			}
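/*
 * [Editor's example] f2fs_update_time()/f2fs_time_over() above wrap a
 * jiffies deadline kept per timer slot; the generic pattern they
 * implement is roughly the following (have_work()/issue_one() are
 * hypothetical placeholders):
 */
static bool have_work(void);	/* hypothetical work predicate */
static void issue_one(void);	/* hypothetical unit of work */

static void issue_within_budget(void)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(5000); /* 5s */

	while (have_work()) {
		if (time_after(jiffies, deadline))
			break;		/* budget spent; resume later */
		issue_one();
	}
}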
@@ -1538,7 +1660,7 @@
 }
 
 /* This comes from f2fs_put_super */
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	struct discard_policy dpolicy;
@@ -1546,6 +1668,7 @@
 
 	__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
 					dcc->discard_granularity);
+	dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
 	__issue_discard_cmd(sbi, &dpolicy);
 	dropped = __drop_discard_cmd(sbi);
 
@@ -1579,6 +1702,10 @@
 		if (dcc->discard_wake)
 			dcc->discard_wake = 0;
 
+		/* clean up pending candidates before going to sleep */
+		if (atomic_read(&dcc->queued_discard))
+			__wait_all_discard_cmd(sbi, NULL);
+
 		if (try_to_freeze())
 			continue;
 		if (f2fs_readonly(sbi->sb))
@@ -1600,7 +1727,9 @@
 			__wait_all_discard_cmd(sbi, &dpolicy);
 			wait_ms = dpolicy.min_interval;
 		} else if (issued == -1) {
-			wait_ms = dpolicy.mid_interval;
+			wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
+			if (!wait_ms)
+				wait_ms = dpolicy.mid_interval;
 		} else {
 			wait_ms = dpolicy.max_interval;
 		}
@@ -1619,42 +1748,34 @@
 	block_t lblkstart = blkstart;
 	int devi = 0;
 
-	if (sbi->s_ndevs) {
+	if (f2fs_is_multi_device(sbi)) {
 		devi = f2fs_target_device_index(sbi, blkstart);
+		if (blkstart < FDEV(devi).start_blk ||
+		    blkstart > FDEV(devi).end_blk) {
+			f2fs_err(sbi, "Invalid block %x", blkstart);
+			return -EIO;
+		}
 		blkstart -= FDEV(devi).start_blk;
 	}
 
-	/*
-	 * We need to know the type of the zone: for conventional zones,
-	 * use regular discard if the drive supports it. For sequential
-	 * zones, reset the zone write pointer.
-	 */
-	switch (get_blkz_type(sbi, bdev, blkstart)) {
-
-	case BLK_ZONE_TYPE_CONVENTIONAL:
-		if (!blk_queue_discard(bdev_get_queue(bdev)))
-			return 0;
-		return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
-	case BLK_ZONE_TYPE_SEQWRITE_REQ:
-	case BLK_ZONE_TYPE_SEQWRITE_PREF:
+	/* For sequential zones, reset the zone write pointer */
+	if (f2fs_blkz_is_seq(sbi, devi, blkstart)) {
 		sector = SECTOR_FROM_BLOCK(blkstart);
 		nr_sects = SECTOR_FROM_BLOCK(blklen);
 
 		if (sector & (bdev_zone_sectors(bdev) - 1) ||
 				nr_sects != bdev_zone_sectors(bdev)) {
-			f2fs_msg(sbi->sb, KERN_INFO,
-				"(%d) %s: Unaligned discard attempted (block %x + %x)",
-				devi, sbi->s_ndevs ? FDEV(devi).path: "",
-				blkstart, blklen);
+			f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)",
+				 devi, sbi->s_ndevs ? FDEV(devi).path : "",
+				 blkstart, blklen);
 			return -EIO;
 		}
 		trace_f2fs_issue_reset_zone(bdev, blkstart);
-		return blkdev_reset_zones(bdev, sector,
-					  nr_sects, GFP_NOFS);
-	default:
-		/* Unknown zone type: broken device ? */
-		return -EIO;
+		return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS);
 	}
+
+	/* For conventional zones, use regular discard if supported */
+	return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
 }
 #endif
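/*
 * [Editor's example] The unaligned-reset check above exploits that
 * bdev_zone_sectors() is guaranteed by the block layer to be a power of
 * two, so the bitwise AND is a cheap modulo. Hedged sketch:
 */
static bool covers_exactly_one_zone(struct block_device *bdev,
				    sector_t sector, sector_t nr_sects)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev); /* power of two */

	return !(sector & (zone_sectors - 1)) &&	/* zone-aligned start */
	       nr_sects == zone_sectors;		/* whole-zone length */
}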
 
@@ -1662,8 +1783,7 @@
 		struct block_device *bdev, block_t blkstart, block_t blklen)
 {
 #ifdef CONFIG_BLK_DEV_ZONED
-	if (f2fs_sb_has_blkzoned(sbi->sb) &&
-				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+	if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
 		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
 #endif
 	return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
@@ -1725,11 +1845,11 @@
 	struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
 	int i;
 
-	if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
+	if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi))
 		return false;
 
 	if (!force) {
-		if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
+		if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
 			SM_I(sbi)->dcc_info->nr_discards >=
 				SM_I(sbi)->dcc_info->max_discards)
 			return false;
@@ -1810,7 +1930,7 @@
 	unsigned int start = 0, end = -1;
 	unsigned int secno, start_segno;
 	bool force = (cpc->reason & CP_DISCARD);
-	bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1;
+	bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
 
@@ -1835,14 +1955,14 @@
 				dirty_i->nr_dirty[PRE]--;
 		}
 
-		if (!test_opt(sbi, DISCARD))
+		if (!f2fs_realtime_discard_enable(sbi))
 			continue;
 
 		if (force && start >= cpc->trim_start &&
 					(end - 1) <= cpc->trim_end)
 				continue;
 
-		if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+		if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
 			f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
 				(end - start) << sbi->log_blocks_per_seg);
 			continue;
@@ -1874,7 +1994,7 @@
 					sbi->blocks_per_seg, cur_pos);
 			len = next_pos - cur_pos;
 
-			if (f2fs_sb_has_blkzoned(sbi->sb) ||
+			if (f2fs_sb_has_blkzoned(sbi) ||
 			    (force && len < cpc->trim_minlen))
 				goto skip;
 
@@ -1922,13 +2042,13 @@
 	INIT_LIST_HEAD(&dcc->fstrim_list);
 	mutex_init(&dcc->cmd_lock);
 	atomic_set(&dcc->issued_discard, 0);
-	atomic_set(&dcc->issing_discard, 0);
+	atomic_set(&dcc->queued_discard, 0);
 	atomic_set(&dcc->discard_cmd_cnt, 0);
 	dcc->nr_discards = 0;
 	dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
 	dcc->undiscard_blks = 0;
 	dcc->next_pos = 0;
-	dcc->root = RB_ROOT;
+	dcc->root = RB_ROOT_CACHED;
 	dcc->rbtree_check = false;
 
 	init_waitqueue_head(&dcc->discard_wait_queue);
@@ -1938,7 +2058,7 @@
 				"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(dcc->f2fs_issue_discard)) {
 		err = PTR_ERR(dcc->f2fs_issue_discard);
-		kfree(dcc);
+		kvfree(dcc);
 		SM_I(sbi)->dcc_info = NULL;
 		return err;
 	}
@@ -1955,7 +2075,14 @@
 
 	f2fs_stop_discard_thread(sbi);
 
-	kfree(dcc);
+	/*
+	 * Recovery can cache discard commands, so the error path of
+	 * fill_super() needs a chance to issue them before freeing dcc.
+	 */
+	if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
+		f2fs_issue_discard_timeout(sbi);
+
+	kvfree(dcc);
 	SM_I(sbi)->dcc_info = NULL;
 }
 
@@ -2011,26 +2138,27 @@
 		mir_exist = f2fs_test_and_set_bit(offset,
 						se->cur_valid_map_mir);
 		if (unlikely(exist != mir_exist)) {
-			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error "
-				"when setting bitmap, blk:%u, old bit:%d",
-				blkaddr, exist);
+			f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d",
+				 blkaddr, exist);
 			f2fs_bug_on(sbi, 1);
 		}
 #endif
 		if (unlikely(exist)) {
-			f2fs_msg(sbi->sb, KERN_ERR,
-				"Bitmap was wrongly set, blk:%u", blkaddr);
+			f2fs_err(sbi, "Bitmap was wrongly set, blk:%u",
+				 blkaddr);
 			f2fs_bug_on(sbi, 1);
 			se->valid_blocks--;
 			del = 0;
 		}
 
-		if (f2fs_discard_en(sbi) &&
-			!f2fs_test_and_set_bit(offset, se->discard_map))
+		if (!f2fs_test_and_set_bit(offset, se->discard_map))
 			sbi->discard_blks--;
 
-		/* don't overwrite by SSR to keep node chain */
-		if (IS_NODESEG(se->type)) {
+		/*
+		 * SSR should never reuse a block which is checkpointed
+		 * or newly invalidated.
+		 */
+		if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
 			if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
 				se->ckpt_valid_blocks++;
 		}
@@ -2040,22 +2168,32 @@
 		mir_exist = f2fs_test_and_clear_bit(offset,
 						se->cur_valid_map_mir);
 		if (unlikely(exist != mir_exist)) {
-			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error "
-				"when clearing bitmap, blk:%u, old bit:%d",
-				blkaddr, exist);
+			f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d",
+				 blkaddr, exist);
 			f2fs_bug_on(sbi, 1);
 		}
 #endif
 		if (unlikely(!exist)) {
-			f2fs_msg(sbi->sb, KERN_ERR,
-				"Bitmap was wrongly cleared, blk:%u", blkaddr);
+			f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u",
+				 blkaddr);
 			f2fs_bug_on(sbi, 1);
 			se->valid_blocks++;
 			del = 0;
+		} else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+			/*
+			 * If checkpoints are off, we must not reuse data that
+			 * was used in the previous checkpoint. If it was used
+			 * before, we must track that to know how much space we
+			 * really have.
+			 */
+			if (f2fs_test_bit(offset, se->ckpt_valid_map)) {
+				spin_lock(&sbi->stat_lock);
+				sbi->unusable_block_count++;
+				spin_unlock(&sbi->stat_lock);
+			}
 		}
 
-		if (f2fs_discard_en(sbi) &&
-			f2fs_test_and_clear_bit(offset, se->discard_map))
+		if (f2fs_test_and_clear_bit(offset, se->discard_map))
 			sbi->discard_blks++;
 	}
 	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -2066,7 +2204,7 @@
 	/* update total number of valid blocks to be written in ckpt area */
 	SIT_I(sbi)->written_valid_blocks += del;
 
-	if (sbi->segs_per_sec > 1)
+	if (__is_large_section(sbi))
 		get_sec_entry(sbi, segno)->valid_blocks += del;
 }
 
@@ -2099,7 +2237,7 @@
 	struct seg_entry *se;
 	bool is_cp = false;
 
-	if (!is_valid_data_blkaddr(sbi, blkaddr))
+	if (!__is_valid_data_blkaddr(blkaddr))
 		return true;
 
 	down_read(&sit_i->sentry_lock);
@@ -2332,9 +2470,12 @@
 static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
 {
 	/* if segs_per_sec is larger than 1, we need to keep original policy. */
-	if (sbi->segs_per_sec != 1)
+	if (__is_large_section(sbi))
 		return CURSEG_I(sbi, type)->segno;
 
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		return 0;
+
 	if (test_opt(sbi, NOHEAP) &&
 		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
 		return 0;
@@ -2432,6 +2573,7 @@
 	__next_free_blkoff(sbi, curseg, 0);
 
 	sum_page = f2fs_get_sum_page(sbi, new_segno);
+	f2fs_bug_on(sbi, IS_ERR(sum_page));
 	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
 	memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
 	f2fs_put_page(sum_page, 1);
@@ -2478,6 +2620,15 @@
 			return 1;
 		}
 	}
+
+	/* find valid_blocks=0 in dirty list */
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		segno = get_free_segment(sbi);
+		if (segno != NULL_SEGNO) {
+			curseg->next_segno = segno;
+			return 1;
+		}
+	}
 	return 0;
 }
 
@@ -2495,7 +2646,8 @@
 	else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
 					type == CURSEG_WARM_NODE)
 		new_curseg(sbi, type, false);
-	else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+	else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) &&
+			likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
 		new_curseg(sbi, type, false);
 	else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type))
 		change_curseg(sbi, type);
@@ -2505,6 +2657,39 @@
 	stat_inc_seg_type(sbi, curseg);
 }
 
+void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+					unsigned int start, unsigned int end)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int segno;
+
+	down_read(&SM_I(sbi)->curseg_lock);
+	mutex_lock(&curseg->curseg_mutex);
+	down_write(&SIT_I(sbi)->sentry_lock);
+
+	segno = CURSEG_I(sbi, type)->segno;
+	if (segno < start || segno > end)
+		goto unlock;
+
+	if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type))
+		change_curseg(sbi, type);
+	else
+		new_curseg(sbi, type, true);
+
+	stat_inc_seg_type(sbi, curseg);
+
+	locate_dirty_segment(sbi, segno);
+unlock:
+	up_write(&SIT_I(sbi)->sentry_lock);
+
+	if (segno != curseg->segno)
+		f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u",
+			    type, segno, curseg->segno);
+
+	mutex_unlock(&curseg->curseg_mutex);
+	up_read(&SM_I(sbi)->curseg_lock);
+}
+
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
 {
 	struct curseg_info *curseg;
@@ -2570,7 +2755,7 @@
 					NULL, start,
 					(struct rb_entry **)&prev_dc,
 					(struct rb_entry **)&next_dc,
-					&insert_p, &insert_parent, true);
+					&insert_p, &insert_parent, true, NULL);
 	if (!dc)
 		dc = next_dc;
 
@@ -2628,7 +2813,7 @@
 	struct discard_policy dpolicy;
 	unsigned long long trimmed = 0;
 	int err = 0;
-	bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1;
+	bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
 
 	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
 		return -EINVAL;
@@ -2637,9 +2822,8 @@
 		goto out;
 
 	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
-		f2fs_msg(sbi->sb, KERN_WARNING,
-			"Found FS corruption, run fsck to fix.");
-		return -EIO;
+		f2fs_warn(sbi, "Found FS corruption, run fsck to fix.");
+		return -EFSCORRUPTED;
 	}
 
 	/* start/end segment number in main_area */
@@ -2671,7 +2855,7 @@
 	 * discard option. User configuration looks like using runtime discard
 	 * or periodic fstrim instead of it.
 	 */
-	if (test_opt(sbi, DISCARD))
+	if (f2fs_realtime_discard_enable(sbi))
 		goto out;
 
 	start_block = START_BLOCK(sbi, start_segno);
@@ -2932,12 +3116,14 @@
 		f2fs_inode_chksum_set(sbi, page);
 	}
 
+	if (F2FS_IO_ALIGNED(sbi))
+		fio->retry = false;
+
 	if (add_list) {
 		struct f2fs_bio_info *io;
 
 		INIT_LIST_HEAD(&fio->list);
 		fio->in_list = true;
-		fio->retry = false;
 		io = sbi->write_io[fio->type] + fio->temp;
 		spin_lock(&io->io_lock);
 		list_add_tail(&fio->list, &io->io_list);
@@ -2954,7 +3140,7 @@
 	struct f2fs_sb_info *sbi = fio->sbi;
 	unsigned int devidx;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return;
 
 	devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
@@ -3020,6 +3206,7 @@
 	ClearPageError(page);
 	f2fs_submit_page_write(&fio);
 
+	stat_inc_meta_count(sbi, page->index);
 	f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
 }
 
@@ -3051,21 +3238,31 @@
 {
 	int err;
 	struct f2fs_sb_info *sbi = fio->sbi;
+	unsigned int segno;
 
 	fio->new_blkaddr = fio->old_blkaddr;
 	/* i/o temperature is needed for passing down write hints */
 	__get_segment_type(fio);
 
-	f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi,
-			GET_SEGNO(sbi, fio->new_blkaddr))->type));
+	segno = GET_SEGNO(sbi, fio->new_blkaddr);
+
+	if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) {
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
+			  __func__, segno);
+		return -EFSCORRUPTED;
+	}
 
 	stat_inc_inplace_blocks(fio->sbi);
 
-	err = f2fs_submit_page_bio(fio);
-	if (!err)
+	if (fio->bio)
+		err = f2fs_merge_page_bio(fio);
+	else
+		err = f2fs_submit_page_bio(fio);
+	if (!err) {
 		update_device_state(fio);
-
-	f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+		f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+	}
 
 	return err;
 }
@@ -3177,34 +3374,48 @@
 }
 
 void f2fs_wait_on_page_writeback(struct page *page,
-				enum page_type type, bool ordered)
+				enum page_type type, bool ordered, bool locked)
 {
 	if (PageWriteback(page)) {
 		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 
-		f2fs_submit_merged_write_cond(sbi, page->mapping->host,
-						0, page->index, type);
-		if (ordered)
+		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+		if (ordered) {
 			wait_on_page_writeback(page);
-		else
+			f2fs_bug_on(sbi, locked && PageWriteback(page));
+		} else {
 			wait_for_stable_page(page);
+		}
 	}
 }
 
-void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr)
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *cpage;
 
-	if (!is_valid_data_blkaddr(sbi, blkaddr))
+	if (!f2fs_post_read_required(inode))
+		return;
+
+	if (!__is_valid_data_blkaddr(blkaddr))
 		return;
 
 	cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
 	if (cpage) {
-		f2fs_wait_on_page_writeback(cpage, DATA, true);
+		f2fs_wait_on_page_writeback(cpage, DATA, true, true);
 		f2fs_put_page(cpage, 1);
 	}
 }
 
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+								block_t len)
+{
+	block_t i;
+
+	for (i = 0; i < len; i++)
+		f2fs_wait_on_block_writeback(inode, blkaddr + i);
+}
+
 static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -3375,8 +3586,11 @@
 
 	/* sanity check for summary blocks */
 	if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
-			sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES)
+			sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) {
+		f2fs_err(sbi, "invalid journal entries nats %u sits %u\n",
+			 nats_in_cursum(nat_j), sits_in_cursum(sit_j));
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -3607,7 +3821,7 @@
 	struct f2fs_journal *journal = curseg->journal;
 	struct sit_entry_set *ses, *tmp;
 	struct list_head *head = &SM_I(sbi)->sit_entry_set;
-	bool to_journal = true;
+	bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS);
 	struct seg_entry *se;
 
 	down_write(&sit_i->sentry_lock);
@@ -3626,7 +3840,8 @@
 	 * entries, remove all entries from journal and add and account
 	 * them in sit entry set.
 	 */
-	if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
+	if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) ||
+								!to_journal)
 		remove_sits_in_journal(sbi);
 
 	/*
@@ -3723,8 +3938,8 @@
 	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
 	struct sit_info *sit_i;
 	unsigned int sit_segs, start;
-	char *src_bitmap;
-	unsigned int bitmap_size;
+	char *src_bitmap, *bitmap;
+	unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size;
 
 	/* allocate memory for SIT information */
 	sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
@@ -3740,42 +3955,44 @@
 	if (!sit_i->sentries)
 		return -ENOMEM;
 
-	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
-	sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size,
+	main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
+	sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size,
 								GFP_KERNEL);
 	if (!sit_i->dirty_sentries_bitmap)
 		return -ENOMEM;
 
+#ifdef CONFIG_F2FS_CHECK_FS
+	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4;
+#else
+	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3;
+#endif
+	sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
+	if (!sit_i->bitmap)
+		return -ENOMEM;
+
+	bitmap = sit_i->bitmap;
+
 	for (start = 0; start < MAIN_SEGS(sbi); start++) {
-		sit_i->sentries[start].cur_valid_map
-			= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
-		sit_i->sentries[start].ckpt_valid_map
-			= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
-		if (!sit_i->sentries[start].cur_valid_map ||
-				!sit_i->sentries[start].ckpt_valid_map)
-			return -ENOMEM;
+		sit_i->sentries[start].cur_valid_map = bitmap;
+		bitmap += SIT_VBLOCK_MAP_SIZE;
+
+		sit_i->sentries[start].ckpt_valid_map = bitmap;
+		bitmap += SIT_VBLOCK_MAP_SIZE;
 
 #ifdef CONFIG_F2FS_CHECK_FS
-		sit_i->sentries[start].cur_valid_map_mir
-			= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
-		if (!sit_i->sentries[start].cur_valid_map_mir)
-			return -ENOMEM;
+		sit_i->sentries[start].cur_valid_map_mir = bitmap;
+		bitmap += SIT_VBLOCK_MAP_SIZE;
 #endif
 
-		if (f2fs_discard_en(sbi)) {
-			sit_i->sentries[start].discard_map
-				= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE,
-								GFP_KERNEL);
-			if (!sit_i->sentries[start].discard_map)
-				return -ENOMEM;
-		}
+		sit_i->sentries[start].discard_map = bitmap;
+		bitmap += SIT_VBLOCK_MAP_SIZE;
 	}
 
 	sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
 	if (!sit_i->tmp_map)
 		return -ENOMEM;
 
-	if (sbi->segs_per_sec > 1) {
+	if (__is_large_section(sbi)) {
 		sit_i->sec_entries =
 			f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry),
 						      MAIN_SECS(sbi)),
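/*
 * [Editor's example] The hunk above folds three (four with
 * CONFIG_F2FS_CHECK_FS) small per-segment allocations into one kvzalloc()
 * that is sliced up afterwards: fewer allocations, one kvfree() on
 * teardown. Generic sketch (struct seg_maps is hypothetical):
 */
struct seg_maps {
	char *cur_valid_map;
	char *ckpt_valid_map;
	char *discard_map;
};

static char *alloc_all_maps(struct seg_maps *seg, unsigned int nr_segs)
{
	char *blob = kvzalloc((size_t)nr_segs * 3 * SIT_VBLOCK_MAP_SIZE,
			      GFP_KERNEL);
	char *p = blob;
	unsigned int i;

	if (!blob)
		return NULL;
	for (i = 0; i < nr_segs; i++) {
		seg[i].cur_valid_map = p;  p += SIT_VBLOCK_MAP_SIZE;
		seg[i].ckpt_valid_map = p; p += SIT_VBLOCK_MAP_SIZE;
		seg[i].discard_map = p;    p += SIT_VBLOCK_MAP_SIZE;
	}
	return blob;	/* caller stores it and kvfree()s it once */
}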
@@ -3788,17 +4005,23 @@
 	sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
 
 	/* setup SIT bitmap from ckeckpoint pack */
-	bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
+	sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
 	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
 
-	sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+	sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL);
 	if (!sit_i->sit_bitmap)
 		return -ENOMEM;
 
 #ifdef CONFIG_F2FS_CHECK_FS
-	sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+	sit_i->sit_bitmap_mir = kmemdup(src_bitmap,
+					sit_bitmap_size, GFP_KERNEL);
 	if (!sit_i->sit_bitmap_mir)
 		return -ENOMEM;
+
+	sit_i->invalid_segmap = f2fs_kvzalloc(sbi,
+					main_bitmap_size, GFP_KERNEL);
+	if (!sit_i->invalid_segmap)
+		return -ENOMEM;
 #endif
 
 	/* init SIT information */
@@ -3807,7 +4030,7 @@
 	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
 	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
 	sit_i->written_valid_blocks = 0;
-	sit_i->bitmap_size = bitmap_size;
+	sit_i->bitmap_size = sit_bitmap_size;
 	sit_i->dirty_sentries = 0;
 	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
 	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
@@ -3904,6 +4127,8 @@
 
 			se = &sit_i->sentries[start];
 			page = get_current_sit_page(sbi, start);
+			if (IS_ERR(page))
+				return PTR_ERR(page);
 			sit_blk = (struct f2fs_sit_block *)page_address(page);
 			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
 			f2fs_put_page(page, 1);
@@ -3916,21 +4141,19 @@
 				total_node_blocks += se->valid_blocks;
 
 			/* build discard map only one time */
-			if (f2fs_discard_en(sbi)) {
-				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-					memset(se->discard_map, 0xff,
-						SIT_VBLOCK_MAP_SIZE);
-				} else {
-					memcpy(se->discard_map,
-						se->cur_valid_map,
-						SIT_VBLOCK_MAP_SIZE);
-					sbi->discard_blks +=
-						sbi->blocks_per_seg -
-						se->valid_blocks;
-				}
+			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+				memset(se->discard_map, 0xff,
+					SIT_VBLOCK_MAP_SIZE);
+			} else {
+				memcpy(se->discard_map,
+					se->cur_valid_map,
+					SIT_VBLOCK_MAP_SIZE);
+				sbi->discard_blks +=
+					sbi->blocks_per_seg -
+					se->valid_blocks;
 			}
 
-			if (sbi->segs_per_sec > 1)
+			if (__is_large_section(sbi))
 				get_sec_entry(sbi, start)->valid_blocks +=
 							se->valid_blocks;
 		}
@@ -3943,11 +4166,9 @@
 
 		start = le32_to_cpu(segno_in_journal(journal, i));
 		if (start >= MAIN_SEGS(sbi)) {
-			f2fs_msg(sbi->sb, KERN_ERR,
-					"Wrong journal entry on segno %u",
-					start);
-			set_sbi_flag(sbi, SBI_NEED_FSCK);
-			err = -EINVAL;
+			f2fs_err(sbi, "Wrong journal entry on segno %u",
+				 start);
+			err = -EFSCORRUPTED;
 			break;
 		}
 
@@ -3965,19 +4186,16 @@
 		if (IS_NODESEG(se->type))
 			total_node_blocks += se->valid_blocks;
 
-		if (f2fs_discard_en(sbi)) {
-			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-				memset(se->discard_map, 0xff,
-							SIT_VBLOCK_MAP_SIZE);
-			} else {
-				memcpy(se->discard_map, se->cur_valid_map,
-							SIT_VBLOCK_MAP_SIZE);
-				sbi->discard_blks += old_valid_blocks;
-				sbi->discard_blks -= se->valid_blocks;
-			}
+		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
+		} else {
+			memcpy(se->discard_map, se->cur_valid_map,
+						SIT_VBLOCK_MAP_SIZE);
+			sbi->discard_blks += old_valid_blocks;
+			sbi->discard_blks -= se->valid_blocks;
 		}
 
-		if (sbi->segs_per_sec > 1) {
+		if (__is_large_section(sbi)) {
 			get_sec_entry(sbi, start)->valid_blocks +=
 							se->valid_blocks;
 			get_sec_entry(sbi, start)->valid_blocks -=
@@ -3987,11 +4205,9 @@
 	up_read(&curseg->journal_rwsem);
 
 	if (!err && total_node_blocks != valid_node_count(sbi)) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"SIT is corrupted node# %u vs %u",
-			total_node_blocks, valid_node_count(sbi));
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		err = -EINVAL;
+		f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
+			 total_node_blocks, valid_node_count(sbi));
+		err = -EFSCORRUPTED;
 	}
 
 	return err;
@@ -4082,6 +4298,39 @@
 	return init_victim_secmap(sbi);
 }
 
+static int sanity_check_curseg(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	/*
+	 * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr;
+	 * In LFS curseg, all blkaddr after .next_blkoff should be unused.
+	 */
+	for (i = 0; i < NO_CHECK_TYPE; i++) {
+		struct curseg_info *curseg = CURSEG_I(sbi, i);
+		struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
+		unsigned int blkofs = curseg->next_blkoff;
+
+		if (f2fs_test_bit(blkofs, se->cur_valid_map))
+			goto out;
+
+		if (curseg->alloc_type == SSR)
+			continue;
+
+		for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
+			if (!f2fs_test_bit(blkofs, se->cur_valid_map))
+				continue;
+out:
+			f2fs_err(sbi,
+				 "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
+				 i, curseg->segno, curseg->alloc_type,
+				 curseg->next_blkoff, blkofs);
+			return -EFSCORRUPTED;
+		}
+	}
+	return 0;
+}
+
 /*
  * Update min, max modified time for cost-benefit GC algorithm
  */
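/*
 * [Editor's example] The inner loop of sanity_check_curseg() above could
 * equivalently be phrased with find_next_bit(); hedged sketch, noting
 * that f2fs's cur_valid_map is a byte-based little-endian bitmap
 * (f2fs_test_bit()), whereas find_next_bit() expects unsigned-long words:
 */
static bool lfs_tail_is_free(const unsigned long *map,
			     unsigned int next_blkoff,
			     unsigned int blocks_per_seg)
{
	/* valid only if no set bit exists at or after next_blkoff */
	return find_next_bit(map, blocks_per_seg, next_blkoff) ==
	       blocks_per_seg;
}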
@@ -4177,6 +4426,10 @@
 	if (err)
 		return err;
 
+	err = sanity_check_curseg(sbi);
+	if (err)
+		return err;
+
 	init_min_max_mtime(sbi);
 	return 0;
 }
@@ -4212,7 +4465,7 @@
 
 	destroy_victim_secmap(sbi);
 	SM_I(sbi)->dirty_info = NULL;
-	kfree(dirty_i);
+	kvfree(dirty_i);
 }
 
 static void destroy_curseg(struct f2fs_sb_info *sbi)
@@ -4224,10 +4477,10 @@
 		return;
 	SM_I(sbi)->curseg_array = NULL;
 	for (i = 0; i < NR_CURSEG_TYPE; i++) {
-		kfree(array[i].sum_blk);
-		kfree(array[i].journal);
+		kvfree(array[i].sum_blk);
+		kvfree(array[i].journal);
 	}
-	kfree(array);
+	kvfree(array);
 }
 
 static void destroy_free_segmap(struct f2fs_sb_info *sbi)
@@ -4238,39 +4491,31 @@
 	SM_I(sbi)->free_info = NULL;
 	kvfree(free_i->free_segmap);
 	kvfree(free_i->free_secmap);
-	kfree(free_i);
+	kvfree(free_i);
 }
 
 static void destroy_sit_info(struct f2fs_sb_info *sbi)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
-	unsigned int start;
 
 	if (!sit_i)
 		return;
 
-	if (sit_i->sentries) {
-		for (start = 0; start < MAIN_SEGS(sbi); start++) {
-			kfree(sit_i->sentries[start].cur_valid_map);
-#ifdef CONFIG_F2FS_CHECK_FS
-			kfree(sit_i->sentries[start].cur_valid_map_mir);
-#endif
-			kfree(sit_i->sentries[start].ckpt_valid_map);
-			kfree(sit_i->sentries[start].discard_map);
-		}
-	}
-	kfree(sit_i->tmp_map);
+	if (sit_i->sentries)
+		kvfree(sit_i->bitmap);
+	kvfree(sit_i->tmp_map);
 
 	kvfree(sit_i->sentries);
 	kvfree(sit_i->sec_entries);
 	kvfree(sit_i->dirty_sentries_bitmap);
 
 	SM_I(sbi)->sit_info = NULL;
-	kfree(sit_i->sit_bitmap);
+	kvfree(sit_i->sit_bitmap);
 #ifdef CONFIG_F2FS_CHECK_FS
-	kfree(sit_i->sit_bitmap_mir);
+	kvfree(sit_i->sit_bitmap_mir);
+	kvfree(sit_i->invalid_segmap);
 #endif
-	kfree(sit_i);
+	kvfree(sit_i);
 }
 
 void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
@@ -4286,7 +4531,7 @@
 	destroy_free_segmap(sbi);
 	destroy_sit_info(sbi);
 	sbi->sm_info = NULL;
-	kfree(sm_info);
+	kvfree(sm_info);
 }
 
 int __init f2fs_create_segment_manager_caches(void)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b3d9e31..325781a 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/segment.h
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -85,7 +82,7 @@
 	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
 
 #define GET_SEGNO(sbi, blk_addr)					\
-	((!is_valid_data_blkaddr(sbi, blk_addr)) ?			\
+	((!__is_valid_data_blkaddr(blk_addr)) ?			\
 	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),			\
 		GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
 #define BLKS_PER_SEC(sbi)					\
@@ -112,7 +109,7 @@
 #define	START_SEGNO(segno)		\
 	(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
 #define SIT_BLK_CNT(sbi)			\
-	((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
+	DIV_ROUND_UP(MAIN_SEGS(sbi), SIT_ENTRY_PER_BLOCK)
 #define f2fs_bitmap_size(nr)			\
 	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
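/*
 * [Editor's note] DIV_ROUND_UP(n, d) from <linux/kernel.h> expands to
 * (((n) + (d) - 1) / (d)) -- exactly the open-coded ceiling division it
 * replaces in SIT_BLK_CNT above. With 4K pages SIT_ENTRY_PER_BLOCK is 55,
 * so e.g. DIV_ROUND_UP(55, 55) = 1 and DIV_ROUND_UP(56, 55) = (56+54)/55
 * = 2 SIT blocks.
 */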
 
@@ -229,9 +226,13 @@
 	block_t sit_base_addr;		/* start block address of SIT area */
 	block_t sit_blocks;		/* # of blocks used by SIT area */
 	block_t written_valid_blocks;	/* # of valid blocks in main area */
+	char *bitmap;			/* all bitmaps pointer */
 	char *sit_bitmap;		/* SIT bitmap pointer */
 #ifdef CONFIG_F2FS_CHECK_FS
 	char *sit_bitmap_mir;		/* SIT bitmap mirror */
+
+	/* bitmap of segments to be ignored by GC in case of errors */
+	unsigned long *invalid_segmap;
 #endif
 	unsigned int bitmap_size;	/* SIT bitmap size */
 
@@ -336,12 +337,18 @@
 	 * In order to get # of valid blocks in a section instantly from many
 	 * segments, f2fs manages two counting structures separately.
 	 */
-	if (use_section && sbi->segs_per_sec > 1)
+	if (use_section && __is_large_section(sbi))
 		return get_sec_entry(sbi, segno)->valid_blocks;
 	else
 		return get_seg_entry(sbi, segno)->valid_blocks;
 }
 
+static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
+				unsigned int segno)
+{
+	return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+}
+
 static inline void seg_info_from_raw_sit(struct seg_entry *se,
 					struct f2fs_sit_entry *rs)
 {
@@ -579,6 +586,15 @@
 		reserved_sections(sbi) + needed);
 }
 
+static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
+{
+	if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		return true;
+	if (likely(!has_not_enough_free_secs(sbi, 0, 0)))
+		return true;
+	return false;
+}
+
 static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
 {
 	return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
@@ -644,14 +660,15 @@
 	f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
 }
 
-static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr)
+static inline void verify_fio_blkaddr(struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = fio->sbi;
 
-	if (__is_meta_io(fio))
-		verify_blkaddr(sbi, blk_addr, META_GENERIC);
-	else
-		verify_blkaddr(sbi, blk_addr, DATA_GENERIC);
+	if (__is_valid_data_blkaddr(fio->old_blkaddr))
+		verify_blkaddr(sbi, fio->old_blkaddr, __is_meta_io(fio) ?
+					META_GENERIC : DATA_GENERIC);
+	verify_blkaddr(sbi, fio->new_blkaddr, __is_meta_io(fio) ?
+					META_GENERIC : DATA_GENERIC_ENHANCE);
 }
 
 /*
@@ -660,7 +677,6 @@
 static inline int check_block_count(struct f2fs_sb_info *sbi,
 		int segno, struct f2fs_sit_entry *raw_sit)
 {
-#ifdef CONFIG_F2FS_CHECK_FS
 	bool is_valid  = test_bit_le(0, raw_sit->valid_map) ? true : false;
 	int valid_blocks = 0;
 	int cur_pos = 0, next_pos;
@@ -681,21 +697,19 @@
 	} while (cur_pos < sbi->blocks_per_seg);
 
 	if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-				"Mismatch valid blocks %d vs. %d",
-					GET_SIT_VBLOCKS(raw_sit), valid_blocks);
+		f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
+			 GET_SIT_VBLOCKS(raw_sit), valid_blocks);
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		return -EINVAL;
+		return -EFSCORRUPTED;
 	}
-#endif
+
 	/* check segment usage, and check boundary of a given segment number */
 	if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
 					|| segno > TOTAL_SEGS(sbi) - 1)) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-				"Wrong valid blocks %d or segno %u",
-					GET_SIT_VBLOCKS(raw_sit), segno);
+		f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
+			 GET_SIT_VBLOCKS(raw_sit), segno);
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		return -EINVAL;
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -853,7 +867,7 @@
 		}
 	}
 	mutex_unlock(&dcc->cmd_lock);
-	if (!wakeup)
+	if (!wakeup || !is_idle(sbi, DISCARD_TIME))
 		return;
 wake_up:
 	dcc->discard_wake = 1;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 36cfd81..a467aca 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs shrinker support
  *   the basic infra was copied from fs/ubifs/shrinker.c
  *
  * Copyright (c) 2015 Motorola Mobility
  * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -138,6 +135,6 @@
 	f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
 
 	spin_lock(&f2fs_list_lock);
-	list_del(&sbi->s_list);
+	list_del_init(&sbi->s_list);
 	spin_unlock(&f2fs_list_lock);
 }
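/*
 * [Editor's example] Why list_del_init() in the shrinker hunk above:
 * plain list_del() leaves the entry's pointers poisoned, so a second
 * deletion (or a list_empty() test on the entry) is undefined behaviour;
 * list_del_init() points the entry back at itself, making both safe.
 */
static void leave_safely(struct list_head *entry, spinlock_t *lock)
{
	spin_lock(lock);
	list_del_init(entry);	/* entry is now an empty, reusable node */
	spin_unlock(lock);

	/* well-defined only because of _init above */
	WARN_ON(!list_empty(entry));
}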
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 287c9fe..1443cee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/super.c
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/module.h>
 #include <linux/init.h>
@@ -26,6 +23,7 @@
 #include <linux/f2fs_fs.h>
 #include <linux/sysfs.h>
 #include <linux/quota.h>
+#include <linux/unicode.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -41,7 +39,7 @@
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 
-char *f2fs_fault_name[FAULT_MAX] = {
+const char *f2fs_fault_name[FAULT_MAX] = {
 	[FAULT_KMALLOC]		= "kmalloc",
 	[FAULT_KVMALLOC]	= "kvmalloc",
 	[FAULT_PAGE_ALLOC]	= "page alloc",
@@ -53,9 +51,10 @@
 	[FAULT_DIR_DEPTH]	= "too big dir depth",
 	[FAULT_EVICT_INODE]	= "evict_inode fail",
 	[FAULT_TRUNCATE]	= "truncate fail",
-	[FAULT_IO]		= "IO error",
+	[FAULT_READ_IO]		= "read IO error",
 	[FAULT_CHECKPOINT]	= "checkpoint error",
 	[FAULT_DISCARD]		= "discard error",
+	[FAULT_WRITE_IO]	= "write IO error",
 };
 
 void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -138,6 +137,10 @@
 	Opt_alloc,
 	Opt_fsync,
 	Opt_test_dummy_encryption,
+	Opt_checkpoint_disable,
+	Opt_checkpoint_disable_cap,
+	Opt_checkpoint_disable_cap_perc,
+	Opt_checkpoint_enable,
 	Opt_err,
 };
 
@@ -196,44 +199,82 @@
 	{Opt_alloc, "alloc_mode=%s"},
 	{Opt_fsync, "fsync_mode=%s"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
+	{Opt_checkpoint_disable, "checkpoint=disable"},
+	{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
+	{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
+	{Opt_checkpoint_enable, "checkpoint=enable"},
 	{Opt_err, NULL},
 };
 
-void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
+	int level;
 
 	va_start(args, fmt);
-	vaf.fmt = fmt;
+
+	level = printk_get_level(fmt);
+	vaf.fmt = printk_skip_level(fmt);
 	vaf.va = &args;
-	printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+	printk("%c%cF2FS-fs (%s): %pV\n",
+	       KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+
 	va_end(args);
 }
 
+#ifdef CONFIG_UNICODE
+static const struct f2fs_sb_encodings {
+	__u16 magic;
+	char *name;
+	char *version;
+} f2fs_sb_encoding_map[] = {
+	{F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"},
+};
+
+static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb,
+				 const struct f2fs_sb_encodings **encoding,
+				 __u16 *flags)
+{
+	__u16 magic = le16_to_cpu(sb->s_encoding);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++)
+		if (magic == f2fs_sb_encoding_map[i].magic)
+			break;
+
+	if (i >= ARRAY_SIZE(f2fs_sb_encoding_map))
+		return -EINVAL;
+
+	*encoding = &f2fs_sb_encoding_map[i];
+	*flags = le16_to_cpu(sb->s_encoding_flags);
+
+	return 0;
+}
+#endif
+
 static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
 {
-	block_t limit = (sbi->user_block_count << 1) / 1000;
+	block_t limit = min((sbi->user_block_count << 1) / 1000,
+			sbi->user_block_count - sbi->reserved_blocks);
 
 	/* limit is 0.2% */
 	if (test_opt(sbi, RESERVE_ROOT) &&
 			F2FS_OPTION(sbi).root_reserved_blocks > limit) {
 		F2FS_OPTION(sbi).root_reserved_blocks = limit;
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"Reduce reserved blocks for root = %u",
-			F2FS_OPTION(sbi).root_reserved_blocks);
+		f2fs_info(sbi, "Reduce reserved blocks for root = %u",
+			  F2FS_OPTION(sbi).root_reserved_blocks);
 	}
 	if (!test_opt(sbi, RESERVE_ROOT) &&
 		(!uid_eq(F2FS_OPTION(sbi).s_resuid,
 				make_kuid(&init_user_ns, F2FS_DEF_RESUID)) ||
 		!gid_eq(F2FS_OPTION(sbi).s_resgid,
 				make_kgid(&init_user_ns, F2FS_DEF_RESGID))))
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
-				from_kuid_munged(&init_user_ns,
-					F2FS_OPTION(sbi).s_resuid),
-				from_kgid_munged(&init_user_ns,
-					F2FS_OPTION(sbi).s_resgid));
+		f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
+			  from_kuid_munged(&init_user_ns,
+					   F2FS_OPTION(sbi).s_resuid),
+			  from_kgid_munged(&init_user_ns,
+					   F2FS_OPTION(sbi).s_resgid));
 }
 
 static void init_once(void *foo)
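/*
 * [Editor's example] How the level plumbing in f2fs_printk() above works:
 * KERN_ERR etc. expand to a "\001<digit>" prefix on the format string.
 * printk_get_level() reads the digit back and printk_skip_level() returns
 * the format with the prefix stripped, so the helper can splice the level
 * into its own "F2FS-fs (...)" banner. Hedged sketch:
 */
static void printk_level_demo(void)
{
	const char *fmt = KERN_ERR "mount failed";

	char level = printk_get_level(fmt);		/* '3' for KERN_ERR */
	const char *msg = printk_skip_level(fmt);	/* "mount failed" */

	printk("%c%cF2FS-fs: %s\n", KERN_SOH_ASCII, level, msg);
}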
@@ -254,42 +295,36 @@
 	int ret = -EINVAL;
 
 	if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) {
-		f2fs_msg(sb, KERN_ERR,
-			"Cannot change journaled "
-			"quota options when quota turned on");
+		f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
 		return -EINVAL;
 	}
-	if (f2fs_sb_has_quota_ino(sb)) {
-		f2fs_msg(sb, KERN_INFO,
-			"QUOTA feature is enabled, so ignore qf_name");
+	if (f2fs_sb_has_quota_ino(sbi)) {
+		f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name");
 		return 0;
 	}
 
 	qname = match_strdup(args);
 	if (!qname) {
-		f2fs_msg(sb, KERN_ERR,
-			"Not enough memory for storing quotafile name");
-		return -EINVAL;
+		f2fs_err(sbi, "Not enough memory for storing quotafile name");
+		return -ENOMEM;
 	}
 	if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
 		if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
 			ret = 0;
 		else
-			f2fs_msg(sb, KERN_ERR,
-				 "%s quota file already specified",
+			f2fs_err(sbi, "%s quota file already specified",
 				 QTYPE2NAME(qtype));
 		goto errout;
 	}
 	if (strchr(qname, '/')) {
-		f2fs_msg(sb, KERN_ERR,
-			"quotafile must be on filesystem root");
+		f2fs_err(sbi, "quotafile must be on filesystem root");
 		goto errout;
 	}
 	F2FS_OPTION(sbi).s_qf_names[qtype] = qname;
 	set_opt(sbi, QUOTA);
 	return 0;
 errout:
-	kfree(qname);
+	kvfree(qname);
 	return ret;
 }
 
@@ -298,11 +333,10 @@
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 
 	if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
-		f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options"
-			" when quota turned on");
+		f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
 		return -EINVAL;
 	}
-	kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
+	kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
 	F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
 	return 0;
 }
@@ -314,9 +348,8 @@
 	 * 'grpquota' mount options are allowed even without quota feature
 	 * to support legacy quotas in quota files.
 	 */
-	if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) {
-		f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. "
-			 "Cannot enable project quota enforcement.");
+	if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) {
+		f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement.");
 		return -1;
 	}
 	if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
@@ -336,21 +369,18 @@
 
 		if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
 				test_opt(sbi, PRJQUOTA)) {
-			f2fs_msg(sbi->sb, KERN_ERR, "old and new quota "
-					"format mixing");
+			f2fs_err(sbi, "old and new quota format mixing");
 			return -1;
 		}
 
 		if (!F2FS_OPTION(sbi).s_jquota_fmt) {
-			f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format "
-					"not specified");
+			f2fs_err(sbi, "journaled quota format not specified");
 			return -1;
 		}
 	}
 
-	if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) {
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"QUOTA feature is enabled, so ignore jquota_fmt");
+	if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) {
+		f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt");
 		F2FS_OPTION(sbi).s_jquota_fmt = 0;
 	}
 	return 0;
@@ -360,7 +390,6 @@
 static int parse_options(struct super_block *sb, char *options)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
-	struct request_queue *q;
 	substring_t args[MAX_OPT_ARGS];
 	char *p, *name;
 	int arg = 0;
@@ -400,10 +429,10 @@
 				set_opt(sbi, BG_GC);
 				set_opt(sbi, FORCE_FG_GC);
 			} else {
-				kfree(name);
+				kvfree(name);
 				return -EINVAL;
 			}
-			kfree(name);
+			kvfree(name);
 			break;
 		case Opt_disable_roll_forward:
 			set_opt(sbi, DISABLE_ROLL_FORWARD);
@@ -415,19 +444,11 @@
 				return -EINVAL;
 			break;
 		case Opt_discard:
-			q = bdev_get_queue(sb->s_bdev);
-			if (blk_queue_discard(q)) {
-				set_opt(sbi, DISCARD);
-			} else if (!f2fs_sb_has_blkzoned(sb)) {
-				f2fs_msg(sb, KERN_WARNING,
-					"mounting with \"discard\" option, but "
-					"the device does not support discard");
-			}
+			set_opt(sbi, DISCARD);
 			break;
 		case Opt_nodiscard:
-			if (f2fs_sb_has_blkzoned(sb)) {
-				f2fs_msg(sb, KERN_WARNING,
-					"discard is required for zoned block devices");
+			if (f2fs_sb_has_blkzoned(sbi)) {
+				f2fs_warn(sbi, "discard is required for zoned block devices");
 				return -EINVAL;
 			}
 			clear_opt(sbi, DISCARD);
@@ -459,20 +480,16 @@
 			break;
 #else
 		case Opt_user_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"user_xattr options not supported");
+			f2fs_info(sbi, "user_xattr options not supported");
 			break;
 		case Opt_nouser_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"nouser_xattr options not supported");
+			f2fs_info(sbi, "nouser_xattr options not supported");
 			break;
 		case Opt_inline_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"inline_xattr options not supported");
+			f2fs_info(sbi, "inline_xattr options not supported");
 			break;
 		case Opt_noinline_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"noinline_xattr options not supported");
+			f2fs_info(sbi, "noinline_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -484,10 +501,10 @@
 			break;
 #else
 		case Opt_acl:
-			f2fs_msg(sb, KERN_INFO, "acl options not supported");
+			f2fs_info(sbi, "acl options not supported");
 			break;
 		case Opt_noacl:
-			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+			f2fs_info(sbi, "noacl options not supported");
 			break;
 #endif
 		case Opt_active_logs:
@@ -537,9 +554,8 @@
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
 			if (test_opt(sbi, RESERVE_ROOT)) {
-				f2fs_msg(sb, KERN_INFO,
-					"Preserve previous reserve_root=%u",
-					F2FS_OPTION(sbi).root_reserved_blocks);
+				f2fs_info(sbi, "Preserve previous reserve_root=%u",
+					  F2FS_OPTION(sbi).root_reserved_blocks);
 			} else {
 				F2FS_OPTION(sbi).root_reserved_blocks = arg;
 				set_opt(sbi, RESERVE_ROOT);
@@ -550,8 +566,7 @@
 				return -EINVAL;
 			uid = make_kuid(current_user_ns(), arg);
 			if (!uid_valid(uid)) {
-				f2fs_msg(sb, KERN_ERR,
-					"Invalid uid value %d", arg);
+				f2fs_err(sbi, "Invalid uid value %d", arg);
 				return -EINVAL;
 			}
 			F2FS_OPTION(sbi).s_resuid = uid;
@@ -561,8 +576,7 @@
 				return -EINVAL;
 			gid = make_kgid(current_user_ns(), arg);
 			if (!gid_valid(gid)) {
-				f2fs_msg(sb, KERN_ERR,
-					"Invalid gid value %d", arg);
+				f2fs_err(sbi, "Invalid gid value %d", arg);
 				return -EINVAL;
 			}
 			F2FS_OPTION(sbi).s_resgid = gid;
@@ -574,11 +588,9 @@
 				return -ENOMEM;
 			if (strlen(name) == 8 &&
 					!strncmp(name, "adaptive", 8)) {
-				if (f2fs_sb_has_blkzoned(sb)) {
-					f2fs_msg(sb, KERN_WARNING,
-						 "adaptive mode is not allowed with "
-						 "zoned block device feature");
-					kfree(name);
+				if (f2fs_sb_has_blkzoned(sbi)) {
+					f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature");
+					kvfree(name);
 					return -EINVAL;
 				}
 				set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
@@ -586,44 +598,44 @@
 					!strncmp(name, "lfs", 3)) {
 				set_opt_mode(sbi, F2FS_MOUNT_LFS);
 			} else {
-				kfree(name);
+				kvfree(name);
 				return -EINVAL;
 			}
-			kfree(name);
+			kvfree(name);
 			break;
 		case Opt_io_size_bits:
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
-			if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
-				f2fs_msg(sb, KERN_WARNING,
-					"Not support %d, larger than %d",
-					1 << arg, BIO_MAX_PAGES);
+			if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
+				f2fs_warn(sbi, "Not support %d, larger than %d",
+					  1 << arg, BIO_MAX_PAGES);
 				return -EINVAL;
 			}
 			F2FS_OPTION(sbi).write_io_size_bits = arg;
 			break;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
 		case Opt_fault_injection:
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
 			f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE);
 			set_opt(sbi, FAULT_INJECTION);
-#else
-			f2fs_msg(sb, KERN_INFO,
-				"FAULT_INJECTION was not selected");
-#endif
 			break;
+
 		case Opt_fault_type:
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
 			f2fs_build_fault_attr(sbi, 0, arg);
 			set_opt(sbi, FAULT_INJECTION);
-#else
-			f2fs_msg(sb, KERN_INFO,
-				"FAULT_INJECTION was not selected");
-#endif
 			break;
+#else
+		case Opt_fault_injection:
+			f2fs_info(sbi, "fault_injection options not supported");
+			break;
+
+		case Opt_fault_type:
+			f2fs_info(sbi, "fault_type options not supported");
+			break;
+#endif
 		case Opt_lazytime:
 			sb->s_flags |= SB_LAZYTIME;
 			break;
@@ -701,8 +713,7 @@
 		case Opt_jqfmt_vfsv0:
 		case Opt_jqfmt_vfsv1:
 		case Opt_noquota:
-			f2fs_msg(sb, KERN_INFO,
-					"quota operations not supported");
+			f2fs_info(sbi, "quota operations not supported");
 			break;
 #endif
 		case Opt_whint:
@@ -719,10 +730,10 @@
 					!strncmp(name, "fs-based", 8)) {
 				F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
 			} else {
-				kfree(name);
+				kvfree(name);
 				return -EINVAL;
 			}
-			kfree(name);
+			kvfree(name);
 			break;
 		case Opt_alloc:
 			name = match_strdup(&args[0]);
@@ -736,10 +747,10 @@
 					!strncmp(name, "reuse", 5)) {
 				F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
 			} else {
-				kfree(name);
+				kvfree(name);
 				return -EINVAL;
 			}
-			kfree(name);
+			kvfree(name);
 			break;
 		case Opt_fsync:
 			name = match_strdup(&args[0]);
@@ -756,30 +767,52 @@
 				F2FS_OPTION(sbi).fsync_mode =
 							FSYNC_MODE_NOBARRIER;
 			} else {
-				kfree(name);
+				kvfree(name);
 				return -EINVAL;
 			}
-			kfree(name);
+			kvfree(name);
 			break;
 		case Opt_test_dummy_encryption:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-			if (!f2fs_sb_has_encrypt(sb)) {
-				f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
+#ifdef CONFIG_FS_ENCRYPTION
+			if (!f2fs_sb_has_encrypt(sbi)) {
+				f2fs_err(sbi, "Encrypt feature is off");
 				return -EINVAL;
 			}
 
 			F2FS_OPTION(sbi).test_dummy_encryption = true;
-			f2fs_msg(sb, KERN_INFO,
-					"Test dummy encryption mode enabled");
+			f2fs_info(sbi, "Test dummy encryption mode enabled");
 #else
-			f2fs_msg(sb, KERN_INFO,
-					"Test dummy encryption mount option ignored");
+			f2fs_info(sbi, "Test dummy encryption mount option ignored");
 #endif
 			break;
+		case Opt_checkpoint_disable_cap_perc:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg < 0 || arg > 100)
+				return -EINVAL;
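+			/* translate the percentage into an absolute unusable-block budget */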
+			if (arg == 100)
+				F2FS_OPTION(sbi).unusable_cap =
+					sbi->user_block_count;
+			else
+				F2FS_OPTION(sbi).unusable_cap =
+					(sbi->user_block_count / 100) * arg;
+			set_opt(sbi, DISABLE_CHECKPOINT);
+			break;
+		case Opt_checkpoint_disable_cap:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			F2FS_OPTION(sbi).unusable_cap = arg;
+			set_opt(sbi, DISABLE_CHECKPOINT);
+			break;
+		case Opt_checkpoint_disable:
+			set_opt(sbi, DISABLE_CHECKPOINT);
+			break;
+		case Opt_checkpoint_enable:
+			clear_opt(sbi, DISABLE_CHECKPOINT);
+			break;
 		default:
-			f2fs_msg(sb, KERN_ERR,
-				"Unrecognized mount option \"%s\" or missing value",
-				p);
+			f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
+				 p);
 			return -EINVAL;
 		}
 	}
@@ -787,53 +820,58 @@
 	if (f2fs_check_quota_options(sbi))
 		return -EINVAL;
 #else
-	if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) {
-		f2fs_msg(sbi->sb, KERN_INFO,
-			 "Filesystem with quota feature cannot be mounted RDWR "
-			 "without CONFIG_QUOTA");
+	if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) {
+		f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA");
 		return -EINVAL;
 	}
-	if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) {
-		f2fs_msg(sb, KERN_ERR,
-			"Filesystem with project quota feature cannot be "
-			"mounted RDWR without CONFIG_QUOTA");
+	if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) {
+		f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+		return -EINVAL;
+	}
+#endif
+#ifndef CONFIG_UNICODE
+	if (f2fs_sb_has_casefold(sbi)) {
+		f2fs_err(sbi,
+			"Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
 		return -EINVAL;
 	}
 #endif
 
 	if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
-		f2fs_msg(sb, KERN_ERR,
-				"Should set mode=lfs with %uKB-sized IO",
-				F2FS_IO_SIZE_KB(sbi));
+		f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
+			 F2FS_IO_SIZE_KB(sbi));
 		return -EINVAL;
 	}
 
 	if (test_opt(sbi, INLINE_XATTR_SIZE)) {
-		if (!f2fs_sb_has_extra_attr(sb) ||
-			!f2fs_sb_has_flexible_inline_xattr(sb)) {
-			f2fs_msg(sb, KERN_ERR,
-					"extra_attr or flexible_inline_xattr "
-					"feature is off");
+		int min_size, max_size;
+
+		if (!f2fs_sb_has_extra_attr(sbi) ||
+			!f2fs_sb_has_flexible_inline_xattr(sbi)) {
+			f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off");
 			return -EINVAL;
 		}
 		if (!test_opt(sbi, INLINE_XATTR)) {
-			f2fs_msg(sb, KERN_ERR,
-					"inline_xattr_size option should be "
-					"set with inline_xattr option");
+			f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option");
 			return -EINVAL;
 		}
-		if (!F2FS_OPTION(sbi).inline_xattr_size ||
-			F2FS_OPTION(sbi).inline_xattr_size >=
-					DEF_ADDRS_PER_INODE -
-					F2FS_TOTAL_EXTRA_ATTR_SIZE -
-					DEF_INLINE_RESERVED_SIZE -
-					DEF_MIN_INLINE_SIZE) {
-			f2fs_msg(sb, KERN_ERR,
-					"inline xattr size is out of range");
+
+		min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32);
+		max_size = MAX_INLINE_XATTR_SIZE;
+
+		if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
+				F2FS_OPTION(sbi).inline_xattr_size > max_size) {
+			f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d",
+				 min_size, max_size);
 			return -EINVAL;
 		}
 	}
 
+	if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+		f2fs_err(sbi, "LFS not compatible with checkpoint=disable");
+		return -EINVAL;
+	}
+
 	/* Do not pass down write hints if the number of active logs is less
 	 * than NR_CURSEG_TYPE.
 	 */
@@ -873,7 +911,21 @@
 
 static int f2fs_drop_inode(struct inode *inode)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int ret;
+
+	/*
+	 * during filesystem shutdown, if checkpoint is disabled,
+	 * drop useless meta/node dirty pages.
+	 */
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+			inode->i_ino == F2FS_META_INO(sbi)) {
+			trace_f2fs_drop_inode(inode, 1);
+			return 1;
+		}
+	}
+
 	/*
 	 * This is to avoid a deadlock condition like below.
 	 * writeback_single_inode(inode)
@@ -897,6 +949,10 @@
 			sb_start_intwrite(inode->i_sb);
 			f2fs_i_size_write(inode, 0);
 
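+			/* flush merged writes and drop cached pages before truncating blocks */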
+			f2fs_submit_merged_write_cond(F2FS_I_SB(inode),
+					inode, NULL, 0, DATA);
+			truncate_inode_pages_final(inode->i_mapping);
+
 			if (F2FS_HAS_BLOCKS(inode))
 				f2fs_truncate(inode);
 
@@ -909,6 +965,8 @@
 		return 0;
 	}
 	ret = generic_drop_inode(inode);
+	if (!ret)
+		ret = fscrypt_drop_inode(inode);
 	trace_f2fs_drop_inode(inode, ret);
 	return ret;
 }
@@ -975,17 +1033,12 @@
 	f2fs_inode_dirtied(inode, false);
 }
 
-static void f2fs_i_callback(struct rcu_head *head)
+static void f2fs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
+	fscrypt_free_inode(inode);
 	kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
 }
 
-static void f2fs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, f2fs_i_callback);
-}
-
 static void destroy_percpu_info(struct f2fs_sb_info *sbi)
 {
 	percpu_counter_destroy(&sbi->alloc_valid_block_count);
@@ -999,10 +1052,10 @@
 	for (i = 0; i < sbi->s_ndevs; i++) {
 		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
 #ifdef CONFIG_BLK_DEV_ZONED
-		kfree(FDEV(i).blkz_type);
+		kvfree(FDEV(i).blkz_seq);
 #endif
 	}
-	kfree(sbi->devs);
+	kvfree(sbi->devs);
 }
 
 static void f2fs_put_super(struct super_block *sb)
@@ -1021,8 +1074,8 @@
 	 * But, the previous checkpoint was not done by umount, it needs to do
 	 * clean checkpoint again.
 	 */
-	if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
-			!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+	if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+			!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) {
 		struct cp_control cpc = {
 			.reason = CP_UMOUNT,
 		};
@@ -1030,18 +1083,16 @@
 	}
 
 	/* be sure to wait for any on-going discard commands */
-	dropped = f2fs_wait_discard_bios(sbi);
+	dropped = f2fs_issue_discard_timeout(sbi);
 
-	if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) {
+	if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
+					!sbi->discard_blks && !dropped) {
 		struct cp_control cpc = {
 			.reason = CP_UMOUNT | CP_TRIMMED,
 		};
 		f2fs_write_checkpoint(sbi, &cpc);
 	}
 
-	/* f2fs_write_checkpoint can update stat informaion */
-	f2fs_destroy_stats(sbi);
-
 	/*
 	 * normally superblock is clean, so we need to release this.
 	 * In addition, EIO will skip do checkpoint, we need this as well.
@@ -1059,31 +1110,43 @@
 	f2fs_bug_on(sbi, sbi->fsync_node_num);
 
 	iput(sbi->node_inode);
+	sbi->node_inode = NULL;
+
 	iput(sbi->meta_inode);
+	sbi->meta_inode = NULL;
+
+	/*
+	 * iput() can update stat information if f2fs_write_checkpoint()
+	 * above failed with an error.
+	 */
+	f2fs_destroy_stats(sbi);
 
 	/* destroy f2fs internal modules */
 	f2fs_destroy_node_manager(sbi);
 	f2fs_destroy_segment_manager(sbi);
 
-	kfree(sbi->ckpt);
+	kvfree(sbi->ckpt);
 
 	f2fs_unregister_sysfs(sbi);
 
 	sb->s_fs_info = NULL;
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
-	kfree(sbi->raw_super);
+	kvfree(sbi->raw_super);
 
 	destroy_device_list(sbi);
 	mempool_destroy(sbi->write_io_dummy);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
-		kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+		kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
 #endif
 	destroy_percpu_info(sbi);
 	for (i = 0; i < NR_PAGE_TYPE; i++)
-		kfree(sbi->write_io[i]);
-	kfree(sbi);
+		kvfree(sbi->write_io[i]);
+#ifdef CONFIG_UNICODE
+	utf8_unload(sbi->s_encoding);
+#endif
+	kvfree(sbi);
 }
 
 int f2fs_sync_fs(struct super_block *sb, int sync)
@@ -1093,6 +1156,8 @@
 
 	if (unlikely(f2fs_cp_error(sbi)))
 		return 0;
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+		return 0;
 
 	trace_f2fs_sync_fs(sb, sync);
 
@@ -1192,14 +1257,21 @@
 	buf->f_blocks = total_count - start_count;
 	buf->f_bfree = user_block_count - valid_user_blocks(sbi) -
 						sbi->current_reserved_blocks;
+
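+	/* blocks held unusable by checkpoint=disable are not really free */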
+	spin_lock(&sbi->stat_lock);
+	if (unlikely(buf->f_bfree <= sbi->unusable_block_count))
+		buf->f_bfree = 0;
+	else
+		buf->f_bfree -= sbi->unusable_block_count;
+	spin_unlock(&sbi->stat_lock);
+
 	if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks)
 		buf->f_bavail = buf->f_bfree -
 				F2FS_OPTION(sbi).root_reserved_blocks;
 	else
 		buf->f_bavail = 0;
 
-	avail_node_count = sbi->total_node_count - sbi->nquota_files -
-						F2FS_RESERVED_NODE_NUM;
+	avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
 
 	if (avail_node_count > user_block_count) {
 		buf->f_files = user_block_count;
@@ -1276,6 +1348,8 @@
 		seq_puts(seq, ",disable_roll_forward");
 	if (test_opt(sbi, DISCARD))
 		seq_puts(seq, ",discard");
+	else
+		seq_puts(seq, ",nodiscard");
 	if (test_opt(sbi, NOHEAP))
 		seq_puts(seq, ",no_heap");
 	else
@@ -1336,7 +1410,8 @@
 				from_kgid_munged(&init_user_ns,
 					F2FS_OPTION(sbi).s_resgid));
 	if (F2FS_IO_SIZE_BITS(sbi))
-		seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi));
+		seq_printf(seq, ",io_bits=%u",
+				F2FS_OPTION(sbi).write_io_size_bits);
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 	if (test_opt(sbi, FAULT_INJECTION)) {
 		seq_printf(seq, ",fault_injection=%u",
@@ -1360,7 +1435,7 @@
 		seq_printf(seq, ",whint_mode=%s", "user-based");
 	else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
 		seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	if (F2FS_OPTION(sbi).test_dummy_encryption)
 		seq_puts(seq, ",test_dummy_encryption");
 #endif
@@ -1370,6 +1445,9 @@
 	else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
 		seq_printf(seq, ",alloc_mode=%s", "reuse");
 
+	if (test_opt(sbi, DISABLE_CHECKPOINT))
+		seq_printf(seq, ",checkpoint=disable:%u",
+				F2FS_OPTION(sbi).unusable_cap);
 	if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
 		seq_printf(seq, ",fsync_mode=%s", "posix");
 	else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
@@ -1397,11 +1475,12 @@
 	set_opt(sbi, INLINE_DENTRY);
 	set_opt(sbi, EXTENT_CACHE);
 	set_opt(sbi, NOHEAP);
+	clear_opt(sbi, DISABLE_CHECKPOINT);
+	F2FS_OPTION(sbi).unusable_cap = 0;
 	sbi->sb->s_flags |= SB_LAZYTIME;
 	set_opt(sbi, FLUSH_MERGE);
-	if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)))
-		set_opt(sbi, DISCARD);
-	if (f2fs_sb_has_blkzoned(sbi->sb))
+	set_opt(sbi, DISCARD);
+	if (f2fs_sb_has_blkzoned(sbi))
 		set_opt_mode(sbi, F2FS_MOUNT_LFS);
 	else
 		set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
@@ -1419,6 +1498,76 @@
 #ifdef CONFIG_QUOTA
 static int f2fs_enable_quotas(struct super_block *sb);
 #endif
+
+static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
+{
+	unsigned int s_flags = sbi->sb->s_flags;
+	struct cp_control cpc;
+	int err = 0;
+	int ret;
+	block_t unusable;
+
+	if (s_flags & SB_RDONLY) {
+		f2fs_err(sbi, "checkpoint=disable on readonly fs");
+		return -EINVAL;
+	}
+	sbi->sb->s_flags |= SB_ACTIVE;
+
+	f2fs_update_time(sbi, DISABLE_TIME);
+
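+	/*
+	 * Keep running sync GC until the time budget runs out or there is
+	 * no victim segment left (-ENODATA).
+	 */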
+	while (!f2fs_time_over(sbi, DISABLE_TIME)) {
+		mutex_lock(&sbi->gc_mutex);
+		err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+		if (err == -ENODATA) {
+			err = 0;
+			break;
+		}
+		if (err && err != -EAGAIN)
+			break;
+	}
+
+	ret = sync_filesystem(sbi->sb);
+	if (ret || err) {
+		err = ret ? ret : err;
+		goto restore_flag;
+	}
+
+	unusable = f2fs_get_unusable_blocks(sbi);
+	if (f2fs_disable_cp_again(sbi, unusable)) {
+		err = -EAGAIN;
+		goto restore_flag;
+	}
+
+	mutex_lock(&sbi->gc_mutex);
+	cpc.reason = CP_PAUSE;
+	set_sbi_flag(sbi, SBI_CP_DISABLED);
+	err = f2fs_write_checkpoint(sbi, &cpc);
+	if (err)
+		goto out_unlock;
+
+	spin_lock(&sbi->stat_lock);
+	sbi->unusable_block_count = unusable;
+	spin_unlock(&sbi->stat_lock);
+
+out_unlock:
+	mutex_unlock(&sbi->gc_mutex);
+restore_flag:
+	sbi->sb->s_flags = s_flags;	/* Restore SB_RDONLY status */
+	return err;
+}
+
+static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
+{
+	mutex_lock(&sbi->gc_mutex);
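+	/* segments dirtied while checkpointing was off become prefree */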
+	f2fs_dirty_to_prefree(sbi);
+
+	clear_sbi_flag(sbi, SBI_CP_DISABLED);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
+	mutex_unlock(&sbi->gc_mutex);
+
+	f2fs_sync_fs(sbi->sb, 1);
+}
+
 static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -1428,6 +1577,9 @@
 	bool need_restart_gc = false;
 	bool need_stop_gc = false;
 	bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+	bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
+	bool no_io_align = !F2FS_IO_ALIGNED(sbi);
+	bool checkpoint_changed;
 #ifdef CONFIG_QUOTA
 	int i, j;
 #endif
@@ -1448,7 +1600,7 @@
 				GFP_KERNEL);
 			if (!org_mount_opt.s_qf_names[i]) {
 				for (j = 0; j < i; j++)
-					kfree(org_mount_opt.s_qf_names[j]);
+					kvfree(org_mount_opt.s_qf_names[j]);
 				return -ENOMEM;
 			}
 		} else {
@@ -1460,8 +1612,8 @@
 	/* recover superblocks we couldn't write due to previous RO mount */
 	if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
 		err = f2fs_commit_super(sbi, false);
-		f2fs_msg(sb, KERN_INFO,
-			"Try to recover all the superblocks, ret: %d", err);
+		f2fs_info(sbi, "Try to recover all the superblocks, ret: %d",
+			  err);
 		if (!err)
 			clear_sbi_flag(sbi, SBI_NEED_SB_WRITE);
 	}
@@ -1472,6 +1624,8 @@
 	err = parse_options(sb, data);
 	if (err)
 		goto restore_opts;
+	checkpoint_changed =
+			disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT);
 
 	/*
 	 * Previous and new state of filesystem is RO,
@@ -1485,12 +1639,12 @@
 		err = dquot_suspend(sb, -1);
 		if (err < 0)
 			goto restore_opts;
-	} else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) {
+	} else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) {
 		/* dquot_resume needs RW */
 		sb->s_flags &= ~SB_RDONLY;
 		if (sb_any_quota_suspended(sb)) {
 			dquot_resume(sb, -1);
-		} else if (f2fs_sb_has_quota_ino(sb)) {
+		} else if (f2fs_sb_has_quota_ino(sbi)) {
 			err = f2fs_enable_quotas(sb);
 			if (err)
 				goto restore_opts;
@@ -1500,8 +1654,19 @@
 	/* disallow enable/disable extent_cache dynamically */
 	if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
 		err = -EINVAL;
-		f2fs_msg(sbi->sb, KERN_WARNING,
-				"switch extent_cache option is not allowed");
+		f2fs_warn(sbi, "switch extent_cache option is not allowed");
+		goto restore_opts;
+	}
+
+	if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
+		err = -EINVAL;
+		f2fs_warn(sbi, "switch io_bits option is not allowed");
+		goto restore_opts;
+	}
+
+	if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
+		err = -EINVAL;
+		f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
 		goto restore_opts;
 	}
 
@@ -1533,6 +1698,16 @@
 		clear_sbi_flag(sbi, SBI_IS_CLOSE);
 	}
 
+	if (checkpoint_changed) {
+		if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+			err = f2fs_disable_checkpoint(sbi);
+			if (err)
+				goto restore_gc;
+		} else {
+			f2fs_enable_checkpoint(sbi);
+		}
+	}
+
 	/*
 	 * We stop issue flush thread if FS is mounted as RO
 	 * or if flush_merge is not passed in mount option.
@@ -1549,19 +1724,19 @@
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
-		kfree(org_mount_opt.s_qf_names[i]);
+		kvfree(org_mount_opt.s_qf_names[i]);
 #endif
 	/* Update the POSIXACL Flag */
 	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
 		(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
 
 	limit_reserve_root(sbi);
+	*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
 	return 0;
 restore_gc:
 	if (need_restart_gc) {
 		if (f2fs_start_gc_thread(sbi))
-			f2fs_msg(sbi->sb, KERN_WARNING,
-				"background gc thread has stopped");
+			f2fs_warn(sbi, "background gc thread has stopped");
 	} else if (need_stop_gc) {
 		f2fs_stop_gc_thread(sbi);
 	}
@@ -1569,7 +1744,7 @@
 #ifdef CONFIG_QUOTA
 	F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
-		kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+		kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
 		F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i];
 	}
 #endif
@@ -1608,6 +1783,7 @@
 				congestion_wait(BLK_RW_ASYNC, HZ/50);
 				goto repeat;
 			}
+			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
 			return PTR_ERR(page);
 		}
 
@@ -1619,6 +1795,7 @@
 		}
 		if (unlikely(!PageUptodate(page))) {
 			f2fs_put_page(page, 1);
+			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
 			return -EIO;
 		}
 
@@ -1660,6 +1837,7 @@
 				congestion_wait(BLK_RW_ASYNC, HZ/50);
 				goto retry;
 			}
+			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
 			break;
 		}
 
@@ -1696,6 +1874,11 @@
 
 static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
 {
+	if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) {
+		f2fs_err(sbi, "quota sysfile may be corrupted, skip loading it");
+		return 0;
+	}
+
 	return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type],
 					F2FS_OPTION(sbi).s_jquota_fmt, type);
 }
@@ -1705,11 +1888,10 @@
 	int enabled = 0;
 	int i, err;
 
-	if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) {
+	if (f2fs_sb_has_quota_ino(sbi) && rdonly) {
 		err = f2fs_enable_quotas(sbi->sb);
 		if (err) {
-			f2fs_msg(sbi->sb, KERN_ERR,
-					"Cannot turn on quota_ino: %d", err);
+			f2fs_err(sbi, "Cannot turn on quota_ino: %d", err);
 			return 0;
 		}
 		return 1;
@@ -1722,8 +1904,8 @@
 				enabled = 1;
 				continue;
 			}
-			f2fs_msg(sbi->sb, KERN_ERR,
-				"Cannot turn on quotas: %d on %d", err, i);
+			f2fs_err(sbi, "Cannot turn on quotas: %d on %d",
+				 err, i);
 		}
 	}
 	return enabled;
@@ -1736,7 +1918,7 @@
 	unsigned long qf_inum;
 	int err;
 
-	BUG_ON(!f2fs_sb_has_quota_ino(sb));
+	BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb)));
 
 	qf_inum = f2fs_qf_ino(sb, type);
 	if (!qf_inum)
@@ -1744,8 +1926,7 @@
 
 	qf_inode = f2fs_iget(sb, qf_inum);
 	if (IS_ERR(qf_inode)) {
-		f2fs_msg(sb, KERN_ERR,
-			"Bad quota inode %u:%lu", type, qf_inum);
+		f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum);
 		return PTR_ERR(qf_inode);
 	}
 
@@ -1758,15 +1939,22 @@
 
 static int f2fs_enable_quotas(struct super_block *sb)
 {
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	int type, err = 0;
 	unsigned long qf_inum;
 	bool quota_mopt[MAXQUOTAS] = {
-		test_opt(F2FS_SB(sb), USRQUOTA),
-		test_opt(F2FS_SB(sb), GRPQUOTA),
-		test_opt(F2FS_SB(sb), PRJQUOTA),
+		test_opt(sbi, USRQUOTA),
+		test_opt(sbi, GRPQUOTA),
+		test_opt(sbi, PRJQUOTA),
 	};
 
-	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+	if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) {
+		f2fs_err(sbi, "quota file may be corrupted, skip loading it");
+		return 0;
+	}
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+
 	for (type = 0; type < MAXQUOTAS; type++) {
 		qf_inum = f2fs_qf_ino(sb, type);
 		if (qf_inum) {
@@ -1774,12 +1962,12 @@
 				DQUOT_USAGE_ENABLED |
 				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
 			if (err) {
-				f2fs_msg(sb, KERN_ERR,
-					"Failed to enable quota tracking "
-					"(type=%d, err=%d). Please run "
-					"fsck to fix.", type, err);
+				f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.",
+					 type, err);
 				for (type--; type >= 0; type--)
 					dquot_quota_off(sb, type);
+				set_sbi_flag(F2FS_SB(sb),
+						SBI_QUOTA_NEED_REPAIR);
 				return err;
 			}
 		}
@@ -1787,35 +1975,65 @@
 	return 0;
 }
 
-static int f2fs_quota_sync(struct super_block *sb, int type)
+int f2fs_quota_sync(struct super_block *sb, int type)
 {
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int cnt;
 	int ret;
 
+	/*
+	 * Take f2fs_lock_op() before quota_sem to avoid a deadlock with
+	 * checkpoint, e.g.:
+	 *
+	 * do_quotactl
+	 *  f2fs_quota_sync
+	 *   down_read(quota_sem)
+	 *   dquot_writeback_dquots()
+	 *    f2fs_dquot_commit
+	 *                            block_operation
+	 *                             down_read(quota_sem)
+	 */
+	f2fs_lock_op(sbi);
+
+	down_read(&sbi->quota_sem);
 	ret = dquot_writeback_dquots(sb, type);
 	if (ret)
-		return ret;
+		goto out;
 
 	/*
 	 * Now when everything is written we can discard the pagecache so
 	 * that userspace sees the changes.
 	 */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		struct address_space *mapping;
+
 		if (type != -1 && cnt != type)
 			continue;
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
 
-		ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping);
+		mapping = dqopt->files[cnt]->i_mapping;
+
+		ret = filemap_fdatawrite(mapping);
 		if (ret)
-			return ret;
+			goto out;
+
+		/* if we are using journalled quota */
+		if (is_journalled_quota(sbi))
+			continue;
+
+		ret = filemap_fdatawait(mapping);
+		if (ret)
+			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
 
 		inode_lock(dqopt->files[cnt]);
 		truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
 		inode_unlock(dqopt->files[cnt]);
 	}
-	return 0;
+out:
+	if (ret)
+		set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+	up_read(&sbi->quota_sem);
+	f2fs_unlock_op(sbi);
+	return ret;
 }
 
 static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
@@ -1824,6 +2042,12 @@
 	struct inode *inode;
 	int err;
 
+	/* if quota sysfile exists, deny enabling quota with specific file */
+	if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) {
+		f2fs_err(F2FS_SB(sb), "quota sysfile already exists");
+		return -EBUSY;
+	}
+
 	err = f2fs_quota_sync(sb, type);
 	if (err)
 		return err;
@@ -1836,15 +2060,14 @@
 
 	inode_lock(inode);
 	F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL;
-	inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
-					S_NOATIME | S_IMMUTABLE);
+	f2fs_set_inode_flags(inode);
 	inode_unlock(inode);
 	f2fs_mark_inode_dirty_sync(inode, false);
 
 	return 0;
 }
 
-static int f2fs_quota_off(struct super_block *sb, int type)
+static int __f2fs_quota_off(struct super_block *sb, int type)
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	int err;
@@ -1857,12 +2080,12 @@
 		goto out_put;
 
 	err = dquot_quota_off(sb, type);
-	if (err || f2fs_sb_has_quota_ino(sb))
+	if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb)))
 		goto out_put;
 
 	inode_lock(inode);
 	F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL);
-	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+	f2fs_set_inode_flags(inode);
 	inode_unlock(inode);
 	f2fs_mark_inode_dirty_sync(inode, false);
 out_put:
@@ -1870,23 +2093,125 @@
 	return err;
 }
 
+static int f2fs_quota_off(struct super_block *sb, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int err;
+
+	err = __f2fs_quota_off(sb, type);
+
+	/*
+	 * quotactl can shut down journalled quota; subsequent updates can
+	 * then leave quota records inconsistent with fs data, so tag the
+	 * flag to let fsck be aware of it.
+	 */
+	if (is_journalled_quota(sbi))
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	return err;
+}
+
 void f2fs_quota_off_umount(struct super_block *sb)
 {
 	int type;
 	int err;
 
 	for (type = 0; type < MAXQUOTAS; type++) {
-		err = f2fs_quota_off(sb, type);
+		err = __f2fs_quota_off(sb, type);
 		if (err) {
 			int ret = dquot_quota_off(sb, type);
 
-			f2fs_msg(sb, KERN_ERR,
-				"Fail to turn off disk quota "
-				"(type: %d, err: %d, ret:%d), Please "
-				"run fsck to fix it.", type, err, ret);
-			set_sbi_flag(F2FS_SB(sb), SBI_NEED_FSCK);
+			f2fs_err(F2FS_SB(sb), "Failed to turn off disk quota (type: %d, err: %d, ret: %d). Please run fsck to fix it.",
+				 type, err, ret);
+			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
 		}
 	}
+	/*
+	 * In case of checkpoint=disable, we must flush quota blocks here.
+	 * Otherwise, end_io could dereference node_inode after put_super
+	 * has already dropped it, hitting a NULL pointer.
+	 */
+	sync_filesystem(sb);
+}
+
+static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!dqopt->files[type])
+			continue;
+		f2fs_inode_synced(dqopt->files[type]);
+	}
+}
+
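+/*
+ * The f2fs_dquot_* helpers below wrap the generic dquot operations with
+ * quota_sem, tagging SBI_QUOTA_NEED_REPAIR on failure so a later fsck
+ * can repair the quota files.
+ */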
+static int f2fs_dquot_commit(struct dquot *dquot)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
+	int ret;
+
+	down_read(&sbi->quota_sem);
+	ret = dquot_commit(dquot);
+	if (ret < 0)
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	up_read(&sbi->quota_sem);
+	return ret;
+}
+
+static int f2fs_dquot_acquire(struct dquot *dquot)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
+	int ret;
+
+	down_read(&sbi->quota_sem);
+	ret = dquot_acquire(dquot);
+	if (ret < 0)
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	up_read(&sbi->quota_sem);
+	return ret;
+}
+
+static int f2fs_dquot_release(struct dquot *dquot)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
+	int ret;
+
+	down_read(&sbi->quota_sem);
+	ret = dquot_release(dquot);
+	if (ret < 0)
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	up_read(&sbi->quota_sem);
+	return ret;
+}
+
+static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int ret;
+
+	down_read(&sbi->quota_sem);
+	ret = dquot_mark_dquot_dirty(dquot);
+
+	/* if we are using journalled quota */
+	if (is_journalled_quota(sbi))
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+
+	up_read(&sbi->quota_sem);
+	return ret;
+}
+
+static int f2fs_dquot_commit_info(struct super_block *sb, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int ret;
+
+	down_read(&sbi->quota_sem);
+	ret = dquot_commit_info(sb, type);
+	if (ret < 0)
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	up_read(&sbi->quota_sem);
+	return ret;
 }
 
 static int f2fs_get_projid(struct inode *inode, kprojid_t *projid)
@@ -1897,11 +2222,11 @@
 
 static const struct dquot_operations f2fs_quota_operations = {
 	.get_reserved_space = f2fs_get_reserved_space,
-	.write_dquot	= dquot_commit,
-	.acquire_dquot	= dquot_acquire,
-	.release_dquot	= dquot_release,
-	.mark_dirty	= dquot_mark_dquot_dirty,
-	.write_info	= dquot_commit_info,
+	.write_dquot	= f2fs_dquot_commit,
+	.acquire_dquot	= f2fs_dquot_acquire,
+	.release_dquot	= f2fs_dquot_release,
+	.mark_dirty	= f2fs_dquot_mark_dquot_dirty,
+	.write_info	= f2fs_dquot_commit_info,
 	.alloc_dquot	= dquot_alloc,
 	.destroy_dquot	= dquot_destroy,
 	.get_projid	= f2fs_get_projid,
@@ -1919,6 +2244,11 @@
 	.get_nextdqblk	= dquot_get_next_dqblk,
 };
 #else
+int f2fs_quota_sync(struct super_block *sb, int type)
+{
+	return 0;
+}
+
 void f2fs_quota_off_umount(struct super_block *sb)
 {
 }
@@ -1926,8 +2256,8 @@
 
 static const struct super_operations f2fs_sops = {
 	.alloc_inode	= f2fs_alloc_inode,
+	.free_inode	= f2fs_free_inode,
 	.drop_inode	= f2fs_drop_inode,
-	.destroy_inode	= f2fs_destroy_inode,
 	.write_inode	= f2fs_write_inode,
 	.dirty_inode	= f2fs_dirty_inode,
 	.show_options	= f2fs_show_options,
@@ -1945,7 +2275,7 @@
 	.remount_fs	= f2fs_remount,
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
 {
 	return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
@@ -1964,7 +2294,7 @@
 	 * if LOST_FOUND feature is enabled.
 	 *
 	 */
-	if (f2fs_sb_has_lost_found(sbi->sb) &&
+	if (f2fs_sb_has_lost_found(sbi) &&
 			inode->i_ino == F2FS_ROOT_INO(sbi))
 		return -EPERM;
 
@@ -2036,7 +2366,7 @@
 static loff_t max_file_blocks(void)
 {
 	loff_t result = 0;
-	loff_t leaf_count = ADDRS_PER_BLOCK;
+	loff_t leaf_count = DEF_ADDRS_PER_BLOCK;
 
 	/*
 	 * note: previously, result is equal to (DEF_ADDRS_PER_INODE -
@@ -2097,55 +2427,49 @@
 				(segment_count << log_blocks_per_seg);
 
 	if (segment0_blkaddr != cp_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Mismatch start address, segment0(%u) cp_blkaddr(%u)",
-			segment0_blkaddr, cp_blkaddr);
+		f2fs_info(sbi, "Mismatch start address, segment0(%u) cp_blkaddr(%u)",
+			  segment0_blkaddr, cp_blkaddr);
 		return true;
 	}
 
 	if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) !=
 							sit_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong CP boundary, start(%u) end(%u) blocks(%u)",
-			cp_blkaddr, sit_blkaddr,
-			segment_count_ckpt << log_blocks_per_seg);
+		f2fs_info(sbi, "Wrong CP boundary, start(%u) end(%u) blocks(%u)",
+			  cp_blkaddr, sit_blkaddr,
+			  segment_count_ckpt << log_blocks_per_seg);
 		return true;
 	}
 
 	if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) !=
 							nat_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
-			sit_blkaddr, nat_blkaddr,
-			segment_count_sit << log_blocks_per_seg);
+		f2fs_info(sbi, "Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
+			  sit_blkaddr, nat_blkaddr,
+			  segment_count_sit << log_blocks_per_seg);
 		return true;
 	}
 
 	if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) !=
 							ssa_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
-			nat_blkaddr, ssa_blkaddr,
-			segment_count_nat << log_blocks_per_seg);
+		f2fs_info(sbi, "Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
+			  nat_blkaddr, ssa_blkaddr,
+			  segment_count_nat << log_blocks_per_seg);
 		return true;
 	}
 
 	if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) !=
 							main_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
-			ssa_blkaddr, main_blkaddr,
-			segment_count_ssa << log_blocks_per_seg);
+		f2fs_info(sbi, "Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
+			  ssa_blkaddr, main_blkaddr,
+			  segment_count_ssa << log_blocks_per_seg);
 		return true;
 	}
 
 	if (main_end_blkaddr > seg_end_blkaddr) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
-			main_blkaddr,
-			segment0_blkaddr +
-				(segment_count << log_blocks_per_seg),
-			segment_count_main << log_blocks_per_seg);
+		f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
+			  main_blkaddr,
+			  segment0_blkaddr +
+			  (segment_count << log_blocks_per_seg),
+			  segment_count_main << log_blocks_per_seg);
 		return true;
 	} else if (main_end_blkaddr < seg_end_blkaddr) {
 		int err = 0;
@@ -2162,12 +2486,11 @@
 			err = __f2fs_commit_super(bh, NULL);
 			res = err ? "failed" : "done";
 		}
-		f2fs_msg(sb, KERN_INFO,
-			"Fix alignment : %s, start(%u) end(%u) block(%u)",
-			res, main_blkaddr,
-			segment0_blkaddr +
-				(segment_count << log_blocks_per_seg),
-			segment_count_main << log_blocks_per_seg);
+		f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)",
+			  res, main_blkaddr,
+			  segment0_blkaddr +
+			  (segment_count << log_blocks_per_seg),
+			  segment_count_main << log_blocks_per_seg);
 		if (err)
 			return true;
 	}
@@ -2181,39 +2504,52 @@
 	block_t total_sections, blocks_per_seg;
 	struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
 					(bh->b_data + F2FS_SUPER_OFFSET);
-	struct super_block *sb = sbi->sb;
 	unsigned int blocksize;
+	size_t crc_offset = 0;
+	__u32 crc = 0;
 
-	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
-		f2fs_msg(sb, KERN_INFO,
-			"Magic Mismatch, valid(0x%x) - read(0x%x)",
-			F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
-		return 1;
+	if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) {
+		f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)",
+			  F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
+		return -EINVAL;
+	}
+
+	/* Check checksum_offset and crc in superblock */
+	if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) {
+		crc_offset = le32_to_cpu(raw_super->checksum_offset);
+		if (crc_offset !=
+			offsetof(struct f2fs_super_block, crc)) {
+			f2fs_info(sbi, "Invalid SB checksum offset: %zu",
+				  crc_offset);
+			return -EFSCORRUPTED;
+		}
+		crc = le32_to_cpu(raw_super->crc);
+		if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) {
+			f2fs_info(sbi, "Invalid SB checksum value: %u", crc);
+			return -EFSCORRUPTED;
+		}
 	}
 
 	/* Currently, support only 4KB page cache size */
 	if (F2FS_BLKSIZE != PAGE_SIZE) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid page_cache_size (%lu), supports only 4KB\n",
-			PAGE_SIZE);
-		return 1;
+		f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB",
+			  PAGE_SIZE);
+		return -EFSCORRUPTED;
 	}
 
 	/* Currently, support only 4KB block size */
 	blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
 	if (blocksize != F2FS_BLKSIZE) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid blocksize (%u), supports only 4KB\n",
-			blocksize);
-		return 1;
+		f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB",
+			  blocksize);
+		return -EFSCORRUPTED;
 	}
 
 	/* check log blocks per segment */
 	if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid log blocks per segment (%u)\n",
-			le32_to_cpu(raw_super->log_blocks_per_seg));
-		return 1;
+		f2fs_info(sbi, "Invalid log blocks per segment (%u)",
+			  le32_to_cpu(raw_super->log_blocks_per_seg));
+		return -EFSCORRUPTED;
 	}
 
 	/* Currently, support 512/1024/2048/4096 bytes sector size */
@@ -2221,18 +2557,17 @@
 				F2FS_MAX_LOG_SECTOR_SIZE ||
 		le32_to_cpu(raw_super->log_sectorsize) <
 				F2FS_MIN_LOG_SECTOR_SIZE) {
-		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
-			le32_to_cpu(raw_super->log_sectorsize));
-		return 1;
+		f2fs_info(sbi, "Invalid log sectorsize (%u)",
+			  le32_to_cpu(raw_super->log_sectorsize));
+		return -EFSCORRUPTED;
 	}
 	if (le32_to_cpu(raw_super->log_sectors_per_block) +
 		le32_to_cpu(raw_super->log_sectorsize) !=
 			F2FS_MAX_LOG_SECTOR_SIZE) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid log sectors per block(%u) log sectorsize(%u)",
-			le32_to_cpu(raw_super->log_sectors_per_block),
-			le32_to_cpu(raw_super->log_sectorsize));
-		return 1;
+		f2fs_info(sbi, "Invalid log sectors per block(%u) log sectorsize(%u)",
+			  le32_to_cpu(raw_super->log_sectors_per_block),
+			  le32_to_cpu(raw_super->log_sectorsize));
+		return -EFSCORRUPTED;
 	}
 
 	segment_count = le32_to_cpu(raw_super->segment_count);
@@ -2245,77 +2580,68 @@
 
 	if (segment_count > F2FS_MAX_SEGMENT ||
 				segment_count < F2FS_MIN_SEGMENTS) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid segment count (%u)",
-			segment_count);
-		return 1;
+		f2fs_info(sbi, "Invalid segment count (%u)", segment_count);
+		return -EFSCORRUPTED;
 	}
 
 	if (total_sections > segment_count ||
 			total_sections < F2FS_MIN_SEGMENTS ||
 			segs_per_sec > segment_count || !segs_per_sec) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid segment/section count (%u, %u x %u)",
-			segment_count, total_sections, segs_per_sec);
-		return 1;
+		f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)",
+			  segment_count, total_sections, segs_per_sec);
+		return -EFSCORRUPTED;
 	}
 
 	if ((segment_count / segs_per_sec) < total_sections) {
-		f2fs_msg(sb, KERN_INFO,
-			"Small segment_count (%u < %u * %u)",
-			segment_count, segs_per_sec, total_sections);
-		return 1;
+		f2fs_info(sbi, "Small segment_count (%u < %u * %u)",
+			  segment_count, segs_per_sec, total_sections);
+		return -EFSCORRUPTED;
 	}
 
-	if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong segment_count / block_count (%u > %u)",
-			segment_count, le32_to_cpu(raw_super->block_count));
-		return 1;
+	if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
+		f2fs_info(sbi, "Wrong segment_count / block_count (%u > %llu)",
+			  segment_count, le64_to_cpu(raw_super->block_count));
+		return -EFSCORRUPTED;
 	}
 
 	if (secs_per_zone > total_sections || !secs_per_zone) {
-		f2fs_msg(sb, KERN_INFO,
-			"Wrong secs_per_zone / total_sections (%u, %u)",
-			secs_per_zone, total_sections);
-		return 1;
+		f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)",
+			  secs_per_zone, total_sections);
+		return -EFSCORRUPTED;
 	}
 	if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION ||
 			raw_super->hot_ext_count > F2FS_MAX_EXTENSION ||
 			(le32_to_cpu(raw_super->extension_count) +
 			raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) {
-		f2fs_msg(sb, KERN_INFO,
-			"Corrupted extension count (%u + %u > %u)",
-			le32_to_cpu(raw_super->extension_count),
-			raw_super->hot_ext_count,
-			F2FS_MAX_EXTENSION);
-		return 1;
+		f2fs_info(sbi, "Corrupted extension count (%u + %u > %u)",
+			  le32_to_cpu(raw_super->extension_count),
+			  raw_super->hot_ext_count,
+			  F2FS_MAX_EXTENSION);
+		return -EFSCORRUPTED;
 	}
 
 	if (le32_to_cpu(raw_super->cp_payload) >
 				(blocks_per_seg - F2FS_CP_PACKS)) {
-		f2fs_msg(sb, KERN_INFO,
-			"Insane cp_payload (%u > %u)",
-			le32_to_cpu(raw_super->cp_payload),
-			blocks_per_seg - F2FS_CP_PACKS);
-		return 1;
+		f2fs_info(sbi, "Insane cp_payload (%u > %u)",
+			  le32_to_cpu(raw_super->cp_payload),
+			  blocks_per_seg - F2FS_CP_PACKS);
+		return -EFSCORRUPTED;
 	}
 
 	/* check reserved ino info */
 	if (le32_to_cpu(raw_super->node_ino) != 1 ||
 		le32_to_cpu(raw_super->meta_ino) != 2 ||
 		le32_to_cpu(raw_super->root_ino) != 3) {
-		f2fs_msg(sb, KERN_INFO,
-			"Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
-			le32_to_cpu(raw_super->node_ino),
-			le32_to_cpu(raw_super->meta_ino),
-			le32_to_cpu(raw_super->root_ino));
-		return 1;
+		f2fs_info(sbi, "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
+			  le32_to_cpu(raw_super->node_ino),
+			  le32_to_cpu(raw_super->meta_ino),
+			  le32_to_cpu(raw_super->root_ino));
+		return -EFSCORRUPTED;
 	}
 
 	/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
 	if (sanity_check_area_boundary(sbi, bh))
-		return 1;
+		return -EFSCORRUPTED;
 
 	return 0;
 }
@@ -2332,8 +2658,9 @@
 	unsigned int log_blocks_per_seg;
 	unsigned int segment_count_main;
 	unsigned int cp_pack_start_sum, cp_payload;
-	block_t user_block_count;
-	int i;
+	block_t user_block_count, valid_user_blocks;
+	block_t avail_node_count, valid_node_count;
+	int i, j;
 
 	total = le32_to_cpu(raw_super->segment_count);
 	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -2352,8 +2679,7 @@
 
 	if (unlikely(fsmeta < F2FS_MIN_SEGMENTS ||
 			ovp_segments == 0 || reserved_segments == 0)) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"Wrong layout: check mkfs.f2fs version");
+		f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version");
 		return 1;
 	}
 
@@ -2362,8 +2688,23 @@
 	log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
 	if (!user_block_count || user_block_count >=
 			segment_count_main << log_blocks_per_seg) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"Wrong user_block_count: %u", user_block_count);
+		f2fs_err(sbi, "Wrong user_block_count: %u",
+			 user_block_count);
+		return 1;
+	}
+
+	valid_user_blocks = le64_to_cpu(ckpt->valid_block_count);
+	if (valid_user_blocks > user_block_count) {
+		f2fs_err(sbi, "Wrong valid_user_blocks: %u, user_block_count: %u",
+			 valid_user_blocks, user_block_count);
+		return 1;
+	}
+
+	valid_node_count = le32_to_cpu(ckpt->valid_node_count);
+	avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+	if (valid_node_count > avail_node_count) {
+		f2fs_err(sbi, "Wrong valid_node_count: %u, avail_node_count: %u",
+			 valid_node_count, avail_node_count);
 		return 1;
 	}
 
@@ -2374,11 +2715,40 @@
 		if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
 			le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg)
 			return 1;
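+		/* no two current node segments may share a segment number */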
+		for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+				le32_to_cpu(ckpt->cur_node_segno[j])) {
+				f2fs_err(sbi, "Node segments (%u, %u) have the same segno: %u",
+					 i, j,
+					 le32_to_cpu(ckpt->cur_node_segno[i]));
+				return 1;
+			}
+		}
 	}
 	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
 		if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
 			le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg)
 			return 1;
+		for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_data_segno[i]) ==
+				le32_to_cpu(ckpt->cur_data_segno[j])) {
+				f2fs_err(sbi, "Data segments (%u, %u) have the same segno: %u",
+					 i, j,
+					 le32_to_cpu(ckpt->cur_data_segno[i]));
+				return 1;
+			}
+		}
+	}
+	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
+		for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+				le32_to_cpu(ckpt->cur_data_segno[j])) {
+				f2fs_err(sbi, "Node segment (%u) and Data segment (%u) have the same segno: %u",
+					 i, j,
+					 le32_to_cpu(ckpt->cur_node_segno[i]));
+				return 1;
+			}
+		}
 	}
 
 	sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
@@ -2386,9 +2756,8 @@
 
 	if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 ||
 		nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"Wrong bitmap size: sit: %u, nat:%u",
-			sit_bitmap_size, nat_bitmap_size);
+		f2fs_err(sbi, "Wrong bitmap size: sit: %u, nat:%u",
+			 sit_bitmap_size, nat_bitmap_size);
 		return 1;
 	}
 
@@ -2397,14 +2766,22 @@
 	if (cp_pack_start_sum < cp_payload + 1 ||
 		cp_pack_start_sum > blocks_per_seg - 1 -
 			NR_CURSEG_TYPE) {
-		f2fs_msg(sbi->sb, KERN_ERR,
-			"Wrong cp_pack_start_sum: %u",
-			cp_pack_start_sum);
+		f2fs_err(sbi, "Wrong cp_pack_start_sum: %u",
+			 cp_pack_start_sum);
+		return 1;
+	}
+
+	if (__is_set_ckpt_flags(ckpt, CP_LARGE_NAT_BITMAP_FLAG) &&
+		le32_to_cpu(ckpt->checksum_offset) != CP_MIN_CHKSUM_OFFSET) {
+		f2fs_warn(sbi, "using deprecated layout of large_nat_bitmap, "
+			  "please run fsck v1.13.0 or higher to repair, chksum_offset: %u, "
+			  "fixed with patch: \"f2fs-tools: relocate chksum_offset for large_nat_bitmap feature\"",
+			  le32_to_cpu(ckpt->checksum_offset));
 		return 1;
 	}
 
 	if (unlikely(f2fs_cp_error(sbi))) {
-		f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+		f2fs_err(sbi, "A bug case: need to run fsck");
 		return 1;
 	}
 	return 0;
@@ -2413,7 +2790,7 @@
 static void init_sb_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_super_block *raw_super = sbi->raw_super;
-	int i, j;
+	int i;
 
 	sbi->log_sectors_per_block =
 		le32_to_cpu(raw_super->log_sectors_per_block);
@@ -2431,11 +2808,19 @@
 	sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
 	sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
 	sbi->cur_victim_sec = NULL_SECNO;
+	sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+	sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
 	sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
+	sbi->migration_granularity = sbi->segs_per_sec;
 
 	sbi->dir_level = DEF_DIR_LEVEL;
 	sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
 	sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
+	sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
+	sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
+	sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+	sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
+				DEF_UMOUNT_DISCARD_TIMEOUT;
 	clear_sbi_flag(sbi, SBI_NEED_FSCK);
 
 	for (i = 0; i < NR_COUNT_TYPE; i++)
@@ -2446,9 +2831,6 @@
 
 	INIT_LIST_HEAD(&sbi->s_list);
 	mutex_init(&sbi->umount_mutex);
-	for (i = 0; i < NR_PAGE_TYPE - 1; i++)
-		for (j = HOT; j < NR_TEMP_TYPE; j++)
-			mutex_init(&sbi->wio_mutex[i][j]);
 	init_rwsem(&sbi->io_order_lock);
 	spin_lock_init(&sbi->cp_lock);
 
@@ -2466,8 +2848,12 @@
 	if (err)
 		return err;
 
-	return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+	err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
 								GFP_KERNEL);
+	if (err)
+		percpu_counter_destroy(&sbi->alloc_valid_block_count);
+
+	return err;
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -2481,7 +2867,7 @@
 	unsigned int n = 0;
 	int err = -EIO;
 
-	if (!f2fs_sb_has_blkzoned(sbi->sb))
+	if (!f2fs_sb_has_blkzoned(sbi))
 		return 0;
 
 	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
@@ -2497,9 +2883,11 @@
 	if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
 		FDEV(devi).nr_blkz++;
 
-	FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz,
-								GFP_KERNEL);
-	if (!FDEV(devi).blkz_type)
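+	/* one bit per zone, set for zones that must be written sequentially */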
+	FDEV(devi).blkz_seq = f2fs_kzalloc(sbi,
+					BITS_TO_LONGS(FDEV(devi).nr_blkz)
+					* sizeof(unsigned long),
+					GFP_KERNEL);
+	if (!FDEV(devi).blkz_seq)
 		return -ENOMEM;
 
 #define F2FS_REPORT_NR_ZONES   4096
@@ -2515,9 +2903,7 @@
 	while (zones && sector < nr_sectors) {
 
 		nr_zones = F2FS_REPORT_NR_ZONES;
-		err = blkdev_report_zones(bdev, sector,
-					  zones, &nr_zones,
-					  GFP_KERNEL);
+		err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
 		if (err)
 			break;
 		if (!nr_zones) {
@@ -2526,13 +2912,14 @@
 		}
 
 		for (i = 0; i < nr_zones; i++) {
-			FDEV(devi).blkz_type[n] = zones[i].type;
+			if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL)
+				set_bit(n, FDEV(devi).blkz_seq);
 			sector += zones[i].len;
 			n++;
 		}
 	}
 
-	kfree(zones);
+	kvfree(zones);
 
 	return err;
 }
@@ -2561,18 +2948,17 @@
 	for (block = 0; block < 2; block++) {
 		bh = sb_bread(sb, block);
 		if (!bh) {
-			f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
-				block + 1);
+			f2fs_err(sbi, "Unable to read %dth superblock",
+				 block + 1);
 			err = -EIO;
 			continue;
 		}
 
 		/* sanity checking of raw super */
-		if (sanity_check_raw_super(sbi, bh)) {
-			f2fs_msg(sb, KERN_ERR,
-				"Can't find valid F2FS filesystem in %dth superblock",
-				block + 1);
-			err = -EINVAL;
+		err = sanity_check_raw_super(sbi, bh);
+		if (err) {
+			f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock",
+				 block + 1);
 			brelse(bh);
 			continue;
 		}
@@ -2592,7 +2978,7 @@
 
 	/* No valid superblock */
 	if (!*raw_super)
-		kfree(super);
+		kvfree(super);
 	else
 		err = 0;
 
@@ -2602,6 +2988,7 @@
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
 {
 	struct buffer_head *bh;
+	__u32 crc = 0;
 	int err;
 
 	if ((recover && f2fs_readonly(sbi->sb)) ||
@@ -2610,6 +2997,13 @@
 		return -EROFS;
 	}
 
+	/* we should update superblock crc here */
+	if (!recover && f2fs_sb_has_sb_chksum(sbi)) {
+		crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi),
+				offsetof(struct f2fs_super_block, crc));
+		F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc);
+	}
+
 	/* write back-up superblock first */
 	bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1);
 	if (!bh)
@@ -2692,37 +3086,80 @@
 
 #ifdef CONFIG_BLK_DEV_ZONED
 		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
-				!f2fs_sb_has_blkzoned(sbi->sb)) {
-			f2fs_msg(sbi->sb, KERN_ERR,
-				"Zoned block device feature not enabled\n");
+				!f2fs_sb_has_blkzoned(sbi)) {
+			f2fs_err(sbi, "Zoned block device feature not enabled");
 			return -EINVAL;
 		}
 		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
 			if (init_blkz_info(sbi, i)) {
-				f2fs_msg(sbi->sb, KERN_ERR,
-					"Failed to initialize F2FS blkzone information");
+				f2fs_err(sbi, "Failed to initialize F2FS blkzone information");
 				return -EINVAL;
 			}
 			if (max_devices == 1)
 				break;
-			f2fs_msg(sbi->sb, KERN_INFO,
-				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
-				i, FDEV(i).path,
-				FDEV(i).total_segments,
-				FDEV(i).start_blk, FDEV(i).end_blk,
-				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
-				"Host-aware" : "Host-managed");
+			f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+				  i, FDEV(i).path,
+				  FDEV(i).total_segments,
+				  FDEV(i).start_blk, FDEV(i).end_blk,
+				  bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+				  "Host-aware" : "Host-managed");
 			continue;
 		}
 #endif
-		f2fs_msg(sbi->sb, KERN_INFO,
-			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
-				i, FDEV(i).path,
-				FDEV(i).total_segments,
-				FDEV(i).start_blk, FDEV(i).end_blk);
+		f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x",
+			  i, FDEV(i).path,
+			  FDEV(i).total_segments,
+			  FDEV(i).start_blk, FDEV(i).end_blk);
 	}
-	f2fs_msg(sbi->sb, KERN_INFO,
-			"IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
+	f2fs_info(sbi, "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
+	return 0;
+}
+
+static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
+{
+#ifdef CONFIG_UNICODE
+	if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) {
+		const struct f2fs_sb_encodings *encoding_info;
+		struct unicode_map *encoding;
+		__u16 encoding_flags;
+
+		if (f2fs_sb_has_encrypt(sbi)) {
+			f2fs_err(sbi,
+				"Can't mount with encoding and encryption");
+			return -EINVAL;
+		}
+
+		if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info,
+					  &encoding_flags)) {
+			f2fs_err(sbi,
+				 "Encoding requested by superblock is unknown");
+			return -EINVAL;
+		}
+
+		encoding = utf8_load(encoding_info->version);
+		if (IS_ERR(encoding)) {
+			f2fs_err(sbi,
+				 "can't mount with superblock charset: %s-%s "
+				 "not supported by the kernel. flags: 0x%x.",
+				 encoding_info->name, encoding_info->version,
+				 encoding_flags);
+			return PTR_ERR(encoding);
+		}
+		f2fs_info(sbi, "Using encoding defined by superblock: "
+			 "%s-%s with flags 0x%hx", encoding_info->name,
+			 encoding_info->version ?: "\b", encoding_flags);
+
+		sbi->s_encoding = encoding;
+		sbi->s_encoding_flags = encoding_flags;
+		sbi->sb->s_d_op = &f2fs_dentry_ops;
+	}
+#else
+	if (f2fs_sb_has_casefold(sbi)) {
+		f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
+		return -EINVAL;
+	}
+#endif
 	return 0;
 }
 
@@ -2746,10 +3183,11 @@
 	struct f2fs_super_block *raw_super;
 	struct inode *root;
 	int err;
-	bool retry = true, need_fsck = false;
+	bool skip_recovery = false, need_fsck = false;
 	char *options = NULL;
 	int recovery, i, valid_super_block;
 	struct curseg_info *seg_i;
+	int retry_cnt = 1;
 
 try_onemore:
 	err = -EINVAL;
@@ -2767,7 +3205,7 @@
 	/* Load the checksum driver */
 	sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
 	if (IS_ERR(sbi->s_chksum_driver)) {
-		f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
+		f2fs_err(sbi, "Cannot load crc32 driver.");
 		err = PTR_ERR(sbi->s_chksum_driver);
 		sbi->s_chksum_driver = NULL;
 		goto free_sbi;
@@ -2775,7 +3213,7 @@
 
 	/* set a block size */
 	if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
-		f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
+		f2fs_err(sbi, "unable to set blocksize");
 		goto free_sbi;
 	}
 
@@ -2788,7 +3226,7 @@
 	sbi->raw_super = raw_super;
 
 	/* precompute checksum seed for metadata */
-	if (f2fs_sb_has_inode_chksum(sb))
+	if (f2fs_sb_has_inode_chksum(sbi))
 		sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
 						sizeof(raw_super->uuid));
 
@@ -2798,9 +3236,8 @@
 	 * devices, but mandatory for host-managed zoned block devices.
 	 */
 #ifndef CONFIG_BLK_DEV_ZONED
-	if (f2fs_sb_has_blkzoned(sb)) {
-		f2fs_msg(sb, KERN_ERR,
-			 "Zoned block device support is not enabled\n");
+	if (f2fs_sb_has_blkzoned(sbi)) {
+		f2fs_err(sbi, "Zoned block device support is not enabled");
 		err = -EOPNOTSUPP;
 		goto free_sb_buf;
 	}
@@ -2821,17 +3258,17 @@
 	sb->s_maxbytes = sbi->max_file_blocks <<
 				le32_to_cpu(raw_super->log_blocksize);
 	sb->s_max_links = F2FS_LINK_MAX;
-	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+
+	err = f2fs_setup_casefold(sbi);
+	if (err)
+		goto free_options;
 
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &f2fs_quota_operations;
-	if (f2fs_sb_has_quota_ino(sb))
-		sb->s_qcop = &dquot_quotactl_sysfile_ops;
-	else
-		sb->s_qcop = &f2fs_quotactl_ops;
+	sb->s_qcop = &f2fs_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 
-	if (f2fs_sb_has_quota_ino(sbi->sb)) {
+	if (f2fs_sb_has_quota_ino(sbi)) {
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if (f2fs_qf_ino(sbi->sb, i))
 				sbi->nquota_files++;
@@ -2840,9 +3277,12 @@
 #endif
 
 	sb->s_op = &f2fs_sops;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &f2fs_cryptops;
 #endif
+#ifdef CONFIG_FS_VERITY
+	sb->s_vop = &f2fs_verityops;
+#endif
 	sb->s_xattr = f2fs_xattr_handlers;
 	sb->s_export_op = &f2fs_export_ops;
 	sb->s_magic = F2FS_SUPER_MAGIC;
@@ -2857,6 +3297,7 @@
 	mutex_init(&sbi->gc_mutex);
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
+	mutex_init(&sbi->resize_mutex);
 	init_rwsem(&sbi->node_write);
 	init_rwsem(&sbi->node_change);
 
@@ -2879,7 +3320,7 @@
 				     GFP_KERNEL);
 		if (!sbi->write_io[i]) {
 			err = -ENOMEM;
-			goto free_options;
+			goto free_bio_info;
 		}
 
 		for (j = HOT; j < n; j++) {
@@ -2892,6 +3333,7 @@
 	}
 
 	init_rwsem(&sbi->cp_rwsem);
+	init_rwsem(&sbi->quota_sem);
 	init_waitqueue_head(&sbi->cp_wait);
 	init_sb_info(sbi);
 
@@ -2899,7 +3341,7 @@
 	if (err)
 		goto free_bio_info;
 
-	if (F2FS_IO_SIZE(sbi) > 1) {
+	if (F2FS_IO_ALIGNED(sbi)) {
 		sbi->write_io_dummy =
 			mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
 		if (!sbi->write_io_dummy) {
@@ -2911,21 +3353,31 @@
 	/* get an inode for meta space */
 	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
 	if (IS_ERR(sbi->meta_inode)) {
-		f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
+		f2fs_err(sbi, "Failed to read F2FS meta data inode");
 		err = PTR_ERR(sbi->meta_inode);
 		goto free_io_dummy;
 	}
 
 	err = f2fs_get_valid_checkpoint(sbi);
 	if (err) {
-		f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
+		f2fs_err(sbi, "Failed to get valid F2FS checkpoint");
 		goto free_meta_inode;
 	}
 
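+	/* carry persistent checkpoint flags over into in-memory sbi state */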
+	if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
+		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) {
+		set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+		sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL;
+	}
+
+	if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FSCK_FLAG))
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
+
 	/* Initialize device list */
 	err = f2fs_scan_devices(sbi);
 	if (err) {
-		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+		f2fs_err(sbi, "Failed to find devices");
 		goto free_devices;
 	}
 
@@ -2945,6 +3397,7 @@
 		INIT_LIST_HEAD(&sbi->inode_list[i]);
 		spin_lock_init(&sbi->inode_lock[i]);
 	}
+	mutex_init(&sbi->flush_lock);
 
 	f2fs_init_extent_cache_info(sbi);
 
@@ -2955,14 +3408,14 @@
 	/* setup f2fs internal modules */
 	err = f2fs_build_segment_manager(sbi);
 	if (err) {
-		f2fs_msg(sb, KERN_ERR,
-			"Failed to initialize F2FS segment manager");
+		f2fs_err(sbi, "Failed to initialize F2FS segment manager (%d)",
+			 err);
 		goto free_sm;
 	}
 	err = f2fs_build_node_manager(sbi);
 	if (err) {
-		f2fs_msg(sb, KERN_ERR,
-			"Failed to initialize F2FS node manager");
+		f2fs_err(sbi, "Failed to initialize F2FS node manager (%d)",
+			 err);
 		goto free_nm;
 	}
 
@@ -2980,36 +3433,36 @@
 
 	f2fs_build_gc_manager(sbi);
 
+	err = f2fs_build_stats(sbi);
+	if (err)
+		goto free_nm;
+
 	/* get an inode for node space */
 	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
 	if (IS_ERR(sbi->node_inode)) {
-		f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
+		f2fs_err(sbi, "Failed to read node inode");
 		err = PTR_ERR(sbi->node_inode);
-		goto free_nm;
+		goto free_stats;
 	}
 
-	err = f2fs_build_stats(sbi);
-	if (err)
-		goto free_node_inode;
-
 	/* read root inode and dentry */
 	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
 	if (IS_ERR(root)) {
-		f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
+		f2fs_err(sbi, "Failed to read root inode");
 		err = PTR_ERR(root);
-		goto free_stats;
+		goto free_node_inode;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks ||
 			!root->i_size || !root->i_nlink) {
 		iput(root);
 		err = -EINVAL;
-		goto free_stats;
+		goto free_node_inode;
 	}
 
 	sb->s_root = d_make_root(root); /* allocate root dentry */
 	if (!sb->s_root) {
 		err = -ENOMEM;
-		goto free_root_inode;
+		goto free_node_inode;
 	}
 
 	err = f2fs_register_sysfs(sbi);
@@ -3018,13 +3471,10 @@
 
 #ifdef CONFIG_QUOTA
 	/* Enable quota usage during mount */
-	if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) {
+	if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
 		err = f2fs_enable_quotas(sb);
-		if (err) {
-			f2fs_msg(sb, KERN_ERR,
-				"Cannot turn on quotas: error %d", err);
-			goto free_sysfs;
-		}
+		if (err)
+			f2fs_err(sbi, "Cannot turn on quotas: error %d", err);
 	}
 #endif
 	/* if there are any orphan inodes, free them */
@@ -3032,29 +3482,38 @@
 	if (err)
 		goto free_meta;
 
+	if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
+		goto reset_checkpoint;
+
 	/* recover fsynced data */
 	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
 		/*
 		 * mount should be failed, when device has readonly mode, and
 		 * previous checkpoint was not done by clean system shutdown.
 		 */
-		if (bdev_read_only(sb->s_bdev) &&
-				!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
-			err = -EROFS;
-			goto free_meta;
+		if (f2fs_hw_is_readonly(sbi)) {
+			if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+				err = -EROFS;
+				f2fs_err(sbi, "Need to recover fsync data, but write access unavailable");
+				goto free_meta;
+			}
+			f2fs_info(sbi, "write access unavailable, skipping recovery");
+			goto reset_checkpoint;
 		}
 
 		if (need_fsck)
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
 
-		if (!retry)
-			goto skip_recovery;
+		if (skip_recovery)
+			goto reset_checkpoint;
 
 		err = f2fs_recover_fsync_data(sbi, false);
 		if (err < 0) {
+			if (err != -ENOMEM)
+				skip_recovery = true;
 			need_fsck = true;
-			f2fs_msg(sb, KERN_ERR,
-				"Cannot recover all fsync data errno=%d", err);
+			f2fs_err(sbi, "Cannot recover all fsync data errno=%d",
+				 err);
 			goto free_meta;
 		}
 	} else {
@@ -3062,15 +3521,22 @@
 
 		if (!f2fs_readonly(sb) && err > 0) {
 			err = -EINVAL;
-			f2fs_msg(sb, KERN_ERR,
-				"Need to recover fsync data");
+			f2fs_err(sbi, "Need to recover fsync data");
 			goto free_meta;
 		}
 	}
-skip_recovery:
+reset_checkpoint:
 	/* f2fs_recover_fsync_data() cleared this already */
 	clear_sbi_flag(sbi, SBI_POR_DOING);
 
+	if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+		err = f2fs_disable_checkpoint(sbi);
+		if (err)
+			goto sync_free_meta;
+	} else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
+		f2fs_enable_checkpoint(sbi);
+	}
+
 	/*
 	 * If filesystem is not mounted as read-only then
 	 * do start the gc_thread.
@@ -3079,34 +3545,39 @@
 		/* After POR, we can run background GC thread.*/
 		err = f2fs_start_gc_thread(sbi);
 		if (err)
-			goto free_meta;
+			goto sync_free_meta;
 	}
-	kfree(options);
+	kvfree(options);
 
 	/* recover broken superblock */
 	if (recovery) {
 		err = f2fs_commit_super(sbi, true);
-		f2fs_msg(sb, KERN_INFO,
-			"Try to recover %dth superblock, ret: %d",
-			sbi->valid_super_block ? 1 : 2, err);
+		f2fs_info(sbi, "Try to recover %dth superblock, ret: %d",
+			  sbi->valid_super_block ? 1 : 2, err);
 	}
 
 	f2fs_join_shrinker(sbi);
 
 	f2fs_tuning_parameters(sbi);
 
-	f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx",
-				cur_cp_version(F2FS_CKPT(sbi)));
+	f2fs_notice(sbi, "Mounted with checkpoint version = %llx",
+		    cur_cp_version(F2FS_CKPT(sbi)));
 	f2fs_update_time(sbi, CP_TIME);
 	f2fs_update_time(sbi, REQ_TIME);
+	clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
 	return 0;
 
+sync_free_meta:
+	/* safe to flush all the data */
+	sync_filesystem(sbi->sb);
+	retry_cnt = 0;
+
 free_meta:
 #ifdef CONFIG_QUOTA
-	if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb))
+	f2fs_truncate_quota_inode_pages(sb);
+	if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb))
 		f2fs_quota_off_umount(sbi->sb);
 #endif
-	f2fs_sync_inode_meta(sbi);
 	/*
 	 * Some dirty meta pages can be produced when f2fs_recover_orphan_inodes()
 	 * fails with EIO. Then, iput(node_inode) can trigger balance_fs_bg()
@@ -3114,52 +3585,57 @@
 	 * falls into an infinite loop in f2fs_sync_meta_pages().
 	 */
 	truncate_inode_pages_final(META_MAPPING(sbi));
-#ifdef CONFIG_QUOTA
-free_sysfs:
-#endif
+	/* evict some inodes being cached by GC */
+	evict_inodes(sb);
 	f2fs_unregister_sysfs(sbi);
 free_root_inode:
 	dput(sb->s_root);
 	sb->s_root = NULL;
-free_stats:
-	f2fs_destroy_stats(sbi);
 free_node_inode:
 	f2fs_release_ino_entry(sbi, true);
 	truncate_inode_pages_final(NODE_MAPPING(sbi));
 	iput(sbi->node_inode);
+	sbi->node_inode = NULL;
+free_stats:
+	f2fs_destroy_stats(sbi);
 free_nm:
 	f2fs_destroy_node_manager(sbi);
 free_sm:
 	f2fs_destroy_segment_manager(sbi);
 free_devices:
 	destroy_device_list(sbi);
-	kfree(sbi->ckpt);
+	kvfree(sbi->ckpt);
 free_meta_inode:
 	make_bad_inode(sbi->meta_inode);
 	iput(sbi->meta_inode);
+	sbi->meta_inode = NULL;
 free_io_dummy:
 	mempool_destroy(sbi->write_io_dummy);
 free_percpu:
 	destroy_percpu_info(sbi);
 free_bio_info:
 	for (i = 0; i < NR_PAGE_TYPE; i++)
-		kfree(sbi->write_io[i]);
+		kvfree(sbi->write_io[i]);
+
+#ifdef CONFIG_UNICODE
+	utf8_unload(sbi->s_encoding);
+#endif
 free_options:
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
-		kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+		kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
 #endif
-	kfree(options);
+	kvfree(options);
 free_sb_buf:
-	kfree(raw_super);
+	kvfree(raw_super);
 free_sbi:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
-	kfree(sbi);
+	kvfree(sbi);
 
 	/* give only one more chance */
-	if (retry) {
-		retry = false;
+	if (retry_cnt > 0 && skip_recovery) {
+		retry_cnt--;
 		shrink_dcache_sb(sb);
 		goto try_onemore;
 	}
@@ -3260,9 +3736,7 @@
 	err = register_filesystem(&f2fs_fs_type);
 	if (err)
 		goto free_shrinker;
-	err = f2fs_create_root_stats();
-	if (err)
-		goto free_filesystem;
+	f2fs_create_root_stats();
 	err = f2fs_init_post_read_processing();
 	if (err)
 		goto free_root_stats;
@@ -3270,7 +3744,6 @@
 
 free_root_stats:
 	f2fs_destroy_root_stats();
-free_filesystem:
 	unregister_filesystem(&f2fs_fs_type);
 free_shrinker:
 	unregister_shrinker(&f2fs_shrinker_info);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 81c0e53..b558b64 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs sysfs interface
  *
  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  *             http://www.samsung.com/
  * Copyright (c) 2017 Chao Yu <chao@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/compiler.h>
 #include <linux/proc_fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/seq_file.h>
+#include <linux/unicode.h>
 
 #include "f2fs.h"
 #include "segment.h"
@@ -71,6 +69,33 @@
 		(unsigned long long)(dirty_segments(sbi)));
 }
 
+static ssize_t unusable_show(struct f2fs_attr *a,
+		struct f2fs_sb_info *sbi, char *buf)
+{
+	block_t unusable;
+
+	if (test_opt(sbi, DISABLE_CHECKPOINT))
+		unusable = sbi->unusable_block_count;
+	else
+		unusable = f2fs_get_unusable_blocks(sbi);
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)unusable);
+}
+
+static ssize_t encoding_show(struct f2fs_attr *a,
+		struct f2fs_sb_info *sbi, char *buf)
+{
+#ifdef CONFIG_UNICODE
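+	/* the utf8 version is packed as (major << 16) | (minor << 8) | revision */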
+	if (f2fs_sb_has_casefold(sbi))
+		return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
+			sbi->s_encoding->charset,
+			(sbi->s_encoding->version >> 16) & 0xff,
+			(sbi->s_encoding->version >> 8) & 0xff,
+			sbi->s_encoding->version & 0xff);
+#endif
+	return snprintf(buf, PAGE_SIZE, "(none)");
+}
+
 static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
 		struct f2fs_sb_info *sbi, char *buf)
 {
@@ -93,33 +118,42 @@
 	if (!sb->s_bdev->bd_part)
 		return snprintf(buf, PAGE_SIZE, "0\n");
 
-	if (f2fs_sb_has_encrypt(sb))
+	if (f2fs_sb_has_encrypt(sbi))
 		len += snprintf(buf, PAGE_SIZE - len, "%s",
 						"encryption");
-	if (f2fs_sb_has_blkzoned(sb))
+	if (f2fs_sb_has_blkzoned(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "blkzoned");
-	if (f2fs_sb_has_extra_attr(sb))
+	if (f2fs_sb_has_extra_attr(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "extra_attr");
-	if (f2fs_sb_has_project_quota(sb))
+	if (f2fs_sb_has_project_quota(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "projquota");
-	if (f2fs_sb_has_inode_chksum(sb))
+	if (f2fs_sb_has_inode_chksum(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "inode_checksum");
-	if (f2fs_sb_has_flexible_inline_xattr(sb))
+	if (f2fs_sb_has_flexible_inline_xattr(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "flexible_inline_xattr");
-	if (f2fs_sb_has_quota_ino(sb))
+	if (f2fs_sb_has_quota_ino(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "quota_ino");
-	if (f2fs_sb_has_inode_crtime(sb))
+	if (f2fs_sb_has_inode_crtime(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "inode_crtime");
-	if (f2fs_sb_has_lost_found(sb))
+	if (f2fs_sb_has_lost_found(sbi))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "lost_found");
+	if (f2fs_sb_has_verity(sbi))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "verity");
+	if (f2fs_sb_has_sb_chksum(sbi))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "sb_checksum");
+	if (f2fs_sb_has_casefold(sbi))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "casefold");
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
 	return len;
 }
@@ -222,6 +256,8 @@
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 	if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
 		return -EINVAL;
+	if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
+		return -EINVAL;
 #endif
 	if (a->struct_type == RESERVED_BLOCKS) {
 		spin_lock(&sbi->stat_lock);
@@ -246,6 +282,11 @@
 		return count;
 	}
 
+	if (!strcmp(a->attr.name, "migration_granularity")) {
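+		/* must cover at least one segment and at most one full section */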
+		if (t == 0 || t > sbi->segs_per_sec)
+			return -EINVAL;
+	}
+
 	if (!strcmp(a->attr.name, "trim_sections"))
 		return -EINVAL;
 
@@ -273,10 +314,16 @@
 		return count;
 	}
 
-	*ui = t;
 
-	if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
-		f2fs_reset_iostat(sbi);
+	if (!strcmp(a->attr.name, "iostat_enable")) {
+		sbi->iostat_enable = !!t;
+		if (!sbi->iostat_enable)
+			f2fs_reset_iostat(sbi);
+		return count;
+	}
+
+	*ui = (unsigned int)t;
+
 	return count;
 }
 
@@ -337,6 +384,9 @@
 	FEAT_QUOTA_INO,
 	FEAT_INODE_CRTIME,
 	FEAT_LOST_FOUND,
+	FEAT_VERITY,
+	FEAT_SB_CHECKSUM,
+	FEAT_CASEFOLD,
 };
 
 static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -353,6 +403,9 @@
 	case FEAT_QUOTA_INO:
 	case FEAT_INODE_CRTIME:
 	case FEAT_LOST_FOUND:
+	case FEAT_VERITY:
+	case FEAT_SB_CHECKSUM:
+	case FEAT_CASEFOLD:
 		return snprintf(buf, PAGE_SIZE, "supported\n");
 	}
 	return 0;
@@ -404,9 +457,15 @@
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
+					interval_time[DISCARD_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
+		umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -419,8 +478,10 @@
 F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
 F2FS_GENERAL_RO_ATTR(features);
 F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
+F2FS_GENERAL_RO_ATTR(unusable);
+F2FS_GENERAL_RO_ATTR(encoding);
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
 #endif
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -434,6 +495,11 @@
 F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
 F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
 F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
+#ifdef CONFIG_FS_VERITY
+F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
+#endif
+F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
+F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -454,12 +520,16 @@
 	ATTR_LIST(min_hot_blocks),
 	ATTR_LIST(min_ssr_sections),
 	ATTR_LIST(max_victim_search),
+	ATTR_LIST(migration_granularity),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
 	ATTR_LIST(ra_nid_pages),
 	ATTR_LIST(dirty_nats_ratio),
 	ATTR_LIST(cp_interval),
 	ATTR_LIST(idle_interval),
+	ATTR_LIST(discard_idle_interval),
+	ATTR_LIST(gc_idle_interval),
+	ATTR_LIST(umount_discard_timeout),
 	ATTR_LIST(iostat_enable),
 	ATTR_LIST(readdir_ra),
 	ATTR_LIST(gc_pin_file_thresh),
@@ -469,15 +539,18 @@
 	ATTR_LIST(inject_type),
 #endif
 	ATTR_LIST(dirty_segments),
+	ATTR_LIST(unusable),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(features),
 	ATTR_LIST(reserved_blocks),
 	ATTR_LIST(current_reserved_blocks),
+	ATTR_LIST(encoding),
 	NULL,
 };
+ATTRIBUTE_GROUPS(f2fs);
 
 static struct attribute *f2fs_feat_attrs[] = {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ATTR_LIST(encryption),
 #endif
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -491,8 +564,14 @@
 	ATTR_LIST(quota_ino),
 	ATTR_LIST(inode_crtime),
 	ATTR_LIST(lost_found),
+#ifdef CONFIG_FS_VERITY
+	ATTR_LIST(verity),
+#endif
+	ATTR_LIST(sb_checksum),
+	ATTR_LIST(casefold),
 	NULL,
 };
+ATTRIBUTE_GROUPS(f2fs_feat);
 
 static const struct sysfs_ops f2fs_attr_ops = {
 	.show	= f2fs_attr_show,
@@ -500,7 +579,7 @@
 };
 
 static struct kobj_type f2fs_sb_ktype = {
-	.default_attrs	= f2fs_attrs,
+	.default_groups = f2fs_groups,
 	.sysfs_ops	= &f2fs_attr_ops,
 	.release	= f2fs_sb_release,
 };
@@ -514,7 +593,7 @@
 };
 
 static struct kobj_type f2fs_feat_ktype = {
-	.default_attrs	= f2fs_feat_attrs,
+	.default_groups = f2fs_feat_groups,
 	.sysfs_ops	= &f2fs_attr_ops,
 };
 
@@ -539,8 +618,7 @@
 
 		if ((i % 10) == 0)
 			seq_printf(seq, "%-10d", i);
-		seq_printf(seq, "%d|%-3u", se->type,
-					get_valid_blocks(sbi, i, false));
+		seq_printf(seq, "%d|%-3u", se->type, se->valid_blocks);
 		if ((i % 10) == 9 || i == (total_segs - 1))
 			seq_putc(seq, '\n');
 		else
@@ -566,8 +644,7 @@
 		struct seg_entry *se = get_seg_entry(sbi, i);
 
 		seq_printf(seq, "%-10d", i);
-		seq_printf(seq, "%d|%-3u|", se->type,
-					get_valid_blocks(sbi, i, false));
+		seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks);
 		for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
 			seq_printf(seq, " %.2x", se->cur_valid_map[j]);
 		seq_putc(seq, '\n');
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index a1fcd00..d0ab533 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs IO tracer
  *
  * Copyright (c) 2014 Motorola Mobility
  * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
@@ -17,7 +14,7 @@
 #include "trace.h"
 
 static RADIX_TREE(pids, GFP_ATOMIC);
-static struct mutex pids_lock;
+static spinlock_t pids_lock;
 static struct last_io_info last_io;
 
 static inline void __print_last_io(void)
@@ -61,23 +58,29 @@
 
 	set_page_private(page, (unsigned long)pid);
 
+retry:
 	if (radix_tree_preload(GFP_NOFS))
 		return;
 
-	mutex_lock(&pids_lock);
+	spin_lock(&pids_lock);
 	p = radix_tree_lookup(&pids, pid);
 	if (p == current)
 		goto out;
 	if (p)
 		radix_tree_delete(&pids, pid);
 
-	f2fs_radix_tree_insert(&pids, pid, current);
+	if (radix_tree_insert(&pids, pid, current)) {
+		spin_unlock(&pids_lock);
+		radix_tree_preload_end();
+		cond_resched();
+		goto retry;
+	}
 
 	trace_printk("%3x:%3x %4x %-16s\n",
 			MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
 			pid, current->comm);
 out:
-	mutex_unlock(&pids_lock);
+	spin_unlock(&pids_lock);
 	radix_tree_preload_end();
 }
 
@@ -122,7 +125,7 @@
 
 void f2fs_build_trace_ios(void)
 {
-	mutex_init(&pids_lock);
+	spin_lock_init(&pids_lock);
 }
 
 #define PIDVEC_SIZE	128
@@ -150,7 +153,7 @@
 	pid_t next_pid = 0;
 	unsigned int found;
 
-	mutex_lock(&pids_lock);
+	spin_lock(&pids_lock);
 	while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
 		unsigned idx;
 
@@ -158,5 +161,5 @@
 		for (idx = 0; idx < found; idx++)
 			radix_tree_delete(&pids, pid[idx]);
 	}
-	mutex_unlock(&pids_lock);
+	spin_unlock(&pids_lock);
 }
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
index 67db24a..e8075fc 100644
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * f2fs IO tracer
  *
  * Copyright (c) 2014 Motorola Mobility
  * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef __F2FS_TRACE_H__
 #define __F2FS_TRACE_H__
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
new file mode 100644
index 0000000..a401ef7
--- /dev/null
+++ b/fs/f2fs/verity.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/f2fs/verity.c: fs-verity support for f2fs
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for f2fs.
+ *
+ * Like ext4, f2fs stores the verity metadata (Merkle tree and
+ * fsverity_descriptor) past the end of the file, starting at the first 64K
+ * boundary beyond i_size.  This approach works because (a) verity files are
+ * readonly, and (b) pages fully beyond i_size aren't visible to userspace but
+ * can be read/written internally by f2fs with only some relatively small
+ * changes to f2fs.  Extended attributes cannot be used because (a) f2fs limits
+ * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be
+ * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't
+ * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is
+ * because it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
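+ *
+ * An illustrative layout (the tree begins at round_up(i_size, 65536)):
+ *
+ *   | file data ... i_size | (hole) | Merkle tree | verity descriptor |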
+ */
+
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
+{
+	return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode.  __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+			  loff_t pos)
+{
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *addr;
+
+		page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+					 NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		addr = kmap_atomic(page);
+		memcpy(buf, addr + offset_in_page(pos), n);
+		kunmap_atomic(addr);
+
+		put_page(page);
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+			   loff_t pos)
+{
+	if (pos + count > inode->i_sb->s_maxbytes)
+		return -EFBIG;
+
+	while (count) {
+		size_t n = min_t(size_t, count,
+				 PAGE_SIZE - offset_in_page(pos));
+		struct page *page;
+		void *fsdata;
+		void *addr;
+		int res;
+
+		res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+					    &page, &fsdata);
+		if (res)
+			return res;
+
+		addr = kmap_atomic(page);
+		memcpy(addr + offset_in_page(pos), buf, n);
+		kunmap_atomic(addr);
+
+		res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+					  page, fsdata);
+		if (res < 0)
+			return res;
+		if (res != n)
+			return -EIO;
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+
+/*
+ * Format of f2fs verity xattr.  This points to the location of the verity
+ * descriptor within the file data rather than containing it directly because
+ * the verity descriptor *must* be encrypted when f2fs encryption is used.  But,
+ * f2fs encryption does not encrypt xattrs.
+ */
+struct fsverity_descriptor_location {
+	__le32 version;
+	__le32 size;
+	__le64 pos;
+};
+
+static int f2fs_begin_enable_verity(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int err;
+
+	if (f2fs_verity_in_progress(inode))
+		return -EBUSY;
+
+	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Since the file was opened readonly, we have to initialize the quotas
+	 * here and not rely on ->open() doing it.  This must be done before
+	 * evicting the inline data.
+	 */
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
+
+	err = f2fs_convert_inline_inode(inode);
+	if (err)
+		return err;
+
+	set_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+	return 0;
+}
+
+static int f2fs_end_enable_verity(struct file *filp, const void *desc,
+				  size_t desc_size, u64 merkle_tree_size)
+{
+	struct inode *inode = file_inode(filp);
+	u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
+	struct fsverity_descriptor_location dloc = {
+		.version = cpu_to_le32(1),
+		.size = cpu_to_le32(desc_size),
+		.pos = cpu_to_le64(desc_pos),
+	};
+	int err = 0;
+
+	if (desc != NULL) {
+		/* Succeeded; write the verity descriptor. */
+		err = pagecache_write(inode, desc, desc_size, desc_pos);
+
+		/* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
+		if (!err)
+			err = filemap_write_and_wait(inode->i_mapping);
+	}
+
+	/* If we failed, truncate anything we wrote past i_size. */
+	if (desc == NULL || err)
+		f2fs_truncate(inode);
+
+	clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+
+	if (desc != NULL && !err) {
+		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+				    F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+				    NULL, XATTR_CREATE);
+		if (!err) {
+			file_set_verity(inode);
+			f2fs_set_inode_flags(inode);
+			f2fs_mark_inode_dirty_sync(inode, true);
+		}
+	}
+	return err;
+}
+
+static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
+				      size_t buf_size)
+{
+	struct fsverity_descriptor_location dloc;
+	int res;
+	u32 size;
+	u64 pos;
+
+	/* Get the descriptor location */
+	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY,
+			    F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
+	if (res < 0 && res != -ERANGE)
+		return res;
+	if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
+		f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format");
+		return -EINVAL;
+	}
+	size = le32_to_cpu(dloc.size);
+	pos = le64_to_cpu(dloc.pos);
+
+	/* Get the descriptor */
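+	/* "pos + size < pos" catches u64 overflow from a corrupted xattr */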
+	if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
+	    pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) {
+		f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr");
+		return -EFSCORRUPTED;
+	}
+	if (buf_size) {
+		if (size > buf_size)
+			return -ERANGE;
+		res = pagecache_read(inode, buf, size, pos);
+		if (res)
+			return res;
+	}
+	return size;
+}
+
+static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
+					       pgoff_t index)
+{
+	index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+	return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
+					u64 index, int log_blocksize)
+{
+	loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize);
+
+	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations f2fs_verityops = {
+	.begin_enable_verity	= f2fs_begin_enable_verity,
+	.end_enable_verity	= f2fs_end_enable_verity,
+	.get_verity_descriptor	= f2fs_get_verity_descriptor,
+	.read_merkle_tree_page	= f2fs_read_merkle_tree_page,
+	.write_merkle_tree_block = f2fs_write_merkle_tree_block,
+};
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 77a010e..181900a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/xattr.c
  *
@@ -13,10 +14,6 @@
  *  suggestion of Luka Renko <luka.renko@hermes.si>.
  * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
  *  Red Hat Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/rwsem.h>
 #include <linux/f2fs_fs.h>
@@ -24,6 +21,7 @@
 #include <linux/posix_acl_xattr.h>
 #include "f2fs.h"
 #include "xattr.h"
+#include "segment.h"
 
 static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 		struct dentry *unused, struct inode *inode,
@@ -205,12 +203,17 @@
 	return handler;
 }
 
-static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
-					size_t len, const char *name)
+static struct f2fs_xattr_entry *__find_xattr(void *base_addr,
+				void *last_base_addr, int index,
+				size_t len, const char *name)
 {
 	struct f2fs_xattr_entry *entry;
 
 	list_for_each_xattr(entry, base_addr) {
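+		/* bail out if an entry would extend past the valid xattr area */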
+		if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
+			(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr)
+			return NULL;
+
 		if (entry->e_name_index != index)
 			continue;
 		if (entry->e_name_len != len)
@@ -227,11 +230,11 @@
 {
 	struct f2fs_xattr_entry *entry;
 	unsigned int inline_size = inline_xattr_size(inode);
+	void *max_addr = base_addr + inline_size;
 
 	list_for_each_xattr(entry, base_addr) {
-		if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
-			(void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
-			base_addr + inline_size) {
+		if ((void *)entry + sizeof(__u32) > max_addr ||
+			(void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
 			*last_addr = entry;
 			return NULL;
 		}
@@ -242,6 +245,13 @@
 		if (!memcmp(entry->e_name, name, len))
 			break;
 	}
+
+	/* the terminating entry must itself fit within the inline xattr area */
+	if (IS_XATTR_LAST_ENTRY(entry) &&
+		(void *)entry + sizeof(__u32) > max_addr) {
+		*last_addr = entry;
+		return NULL;
+	}
 	return entry;
 }
 
@@ -291,22 +301,24 @@
 static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 				unsigned int index, unsigned int len,
 				const char *name, struct f2fs_xattr_entry **xe,
-				void **base_addr)
+				void **base_addr, int *base_size)
 {
-	void *cur_addr, *txattr_addr, *last_addr = NULL;
+	void *cur_addr, *txattr_addr, *last_txattr_addr;
+	void *last_addr = NULL;
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
-	unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
 	unsigned int inline_size = inline_xattr_size(inode);
 	int err = 0;
 
-	if (!size && !inline_size)
+	if (!xnid && !inline_size)
 		return -ENODATA;
 
-	txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode),
-			inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS);
+	*base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE;
+	txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
 	if (!txattr_addr)
 		return -ENOMEM;
 
+	last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode);
+
 	/* read from inline xattr */
 	if (inline_size) {
 		err = read_inline_xattr(inode, ipage, txattr_addr);
@@ -315,8 +327,10 @@
 
 		*xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
 						index, len, name);
-		if (*xe)
+		if (*xe) {
+			*base_size = inline_size;
 			goto check;
+		}
 	}
 
 	/* read from xattr node block */
@@ -331,7 +345,14 @@
 	else
 		cur_addr = txattr_addr;
 
-	*xe = __find_xattr(cur_addr, index, len, name);
+	*xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name);
+	if (!*xe) {
+		f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+								inode->i_ino);
+		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
 check:
 	if (IS_XATTR_LAST_ENTRY(*xe)) {
 		err = -ENODATA;
@@ -341,7 +362,7 @@
 	*base_addr = txattr_addr;
 	return 0;
 out:
-	kzfree(txattr_addr);
+	kvfree(txattr_addr);
 	return err;
 }
 
@@ -384,7 +405,7 @@
 	*base_addr = txattr_addr;
 	return 0;
 fail:
-	kzfree(txattr_addr);
+	kvfree(txattr_addr);
 	return err;
 }
 
@@ -418,7 +439,7 @@
 		}
 
 		f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
-							NODE, true);
+							NODE, true, true);
 		/* no need to use xattr node block */
 		if (hsize <= inline_size) {
 			err = f2fs_truncate_xattr_node(inode);
@@ -442,7 +463,7 @@
 			goto in_page_out;
 		}
 		f2fs_bug_on(sbi, new_nid);
-		f2fs_wait_on_page_writeback(xpage, NODE, true);
+		f2fs_wait_on_page_writeback(xpage, NODE, true, true);
 	} else {
 		struct dnode_of_data dn;
 		set_new_dnode(&dn, inode, NULL, NULL, new_nid);
@@ -477,6 +498,7 @@
 	int error = 0;
 	unsigned int size, len;
 	void *base_addr = NULL;
+	int base_size;
 
 	if (name == NULL)
 		return -EINVAL;
@@ -487,7 +509,7 @@
 
 	down_read(&F2FS_I(inode)->i_xattr_sem);
 	error = lookup_all_xattrs(inode, ipage, index, len, name,
-				&entry, &base_addr);
+				&entry, &base_addr, &base_size);
 	up_read(&F2FS_I(inode)->i_xattr_sem);
 	if (error)
 		return error;
@@ -501,11 +523,16 @@
 
 	if (buffer) {
 		char *pval = entry->e_name + entry->e_name_len;
+
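+		/* don't let a corrupted value size read past our buffer */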
+		if (base_size - (pval - (char *)base_addr) < size) {
+			error = -ERANGE;
+			goto out;
+		}
 		memcpy(buffer, pval, size);
 	}
 	error = size;
 out:
-	kzfree(base_addr);
+	kvfree(base_addr);
 	return error;
 }
 
@@ -533,7 +560,7 @@
 		if (!handler || (handler->list && !handler->list(dentry)))
 			continue;
 
-		prefix = handler->prefix ?: handler->name;
+		prefix = xattr_prefix(handler);
 		prefix_len = strlen(prefix);
 		size = prefix_len + entry->e_name_len + 1;
 		if (buffer) {
@@ -551,7 +578,7 @@
 	}
 	error = buffer_size - rest;
 cleanup:
-	kzfree(base_addr);
+	kvfree(base_addr);
 	return error;
 }
 
@@ -569,7 +596,8 @@
 			struct page *ipage, int flags)
 {
 	struct f2fs_xattr_entry *here, *last;
-	void *base_addr;
+	void *base_addr, *last_base_addr;
+	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 	int found, newsize;
 	size_t len;
 	__u32 new_hsize;
@@ -593,8 +621,17 @@
 	if (error)
 		return error;
 
+	last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+
 	/* find entry with wanted name. */
-	here = __find_xattr(base_addr, index, len, name);
+	here = __find_xattr(base_addr, last_base_addr, index, len, name);
+	if (!here) {
+		f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+								inode->i_ino);
+		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+		error = -EFSCORRUPTED;
+		goto exit;
+	}
 
 	found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
 
@@ -682,7 +719,7 @@
 	if (!error && S_ISDIR(inode->i_mode))
 		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
 exit:
-	kzfree(base_addr);
+	kvfree(base_addr);
 	return error;
 }
 
@@ -693,6 +730,11 @@
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+	if (!f2fs_is_checkpoint_ready(sbi))
+		return -ENOSPC;
+
 	err = dquot_initialize(inode);
 	if (err)
 		return err;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index dbcd1d1..de0c600 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * fs/f2fs/xattr.h
  *
@@ -9,10 +10,6 @@
  * On-disk format of extended attributes for the ext2 filesystem.
  *
  * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef __F2FS_XATTR_H__
 #define __F2FS_XATTR_H__
@@ -37,8 +34,10 @@
 #define F2FS_XATTR_INDEX_ADVISE			7
 /* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
 #define F2FS_XATTR_INDEX_ENCRYPTION		9
+#define F2FS_XATTR_INDEX_VERITY			11
 
 #define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT	"c"
+#define F2FS_XATTR_NAME_VERITY			"v"
 
 struct f2fs_xattr_header {
 	__le32  h_magic;        /* magic number for identification */
@@ -74,6 +73,8 @@
 				entry = XATTR_NEXT_ENTRY(entry))
 #define VALID_XATTR_BLOCK_SIZE	(PAGE_SIZE - sizeof(struct node_footer))
 #define XATTR_PADDING_SIZE	(sizeof(__u32))
+#define XATTR_SIZE(x,i)		(((x) ? VALID_XATTR_BLOCK_SIZE : 0) +	\
+						(inline_xattr_size(i)))
 #define MIN_OFFSET(i)		XATTR_ALIGN(inline_xattr_size(i) +	\
 						VALID_XATTR_BLOCK_SIZE)
 
@@ -81,6 +82,12 @@
 				sizeof(struct f2fs_xattr_header) -	\
 				sizeof(struct f2fs_xattr_entry))
 
+#define MAX_INLINE_XATTR_SIZE						\
+			(DEF_ADDRS_PER_INODE -				\
+			F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) -	\
+			DEF_INLINE_RESERVED_SIZE -			\
+			MIN_INLINE_DENTRY_SIZE / sizeof(__le32))
+
 /*
  * On-disk structure of f2fs_xattr
  * We use inline xattrs space + 1 block for xattr.
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 3ff1772..718163d 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config FAT_FS
 	tristate
 	select NLS
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 78d501c..738e427 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -363,7 +363,7 @@
 
 	*phys = 0;
 	*mapped_blocks = 0;
-	if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
+	if (!is_fat32(sbi) && (inode->i_ino == MSDOS_ROOT_INO)) {
 		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
 			*phys = sector + sbi->dir_start;
 			*mapped_blocks = 1;
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7f5f369..054acd9 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/fat/dir.c
  *
@@ -57,7 +58,7 @@
 	if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1)
 		return;
 	/* root dir of FAT12/FAT16 */
-	if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO))
+	if (!is_fat32(sbi) && (dir->i_ino == MSDOS_ROOT_INO))
 		return;
 
 	bh = sb_find_get_block(sb, phys);
@@ -87,9 +88,7 @@
 	int err, offset;
 
 next:
-	if (*bh)
-		brelse(*bh);
-
+	brelse(*bh);
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
 	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
@@ -369,7 +368,9 @@
 	}
 
 	memcpy(work, de->name, sizeof(work));
-	/* see namei.c, msdos_format_name */
+	/* For an explanation of the special treatment of 0x05 in
+	 * filenames, see msdos_format_name in namei_msdos.c
+	 */
 	if (work[0] == 0x05)
 		work[0] = 0xE5;
 
@@ -803,7 +804,7 @@
 		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
-	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
+	if (!access_ok(d1, sizeof(struct __fat_dirent[2])))
 		return -EFAULT;
 	/*
 	 * Yes, we don't need this put_user() absolutely. However old
@@ -843,7 +844,7 @@
 		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
-	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
+	if (!access_ok(d1, sizeof(struct compat_dirent[2])))
 		return -EFAULT;
 	/*
 	 * Yes, we don't need this put_user() absolutely. However old
@@ -1071,7 +1072,7 @@
 		}
 	}
 
-	dir->i_mtime = dir->i_atime = current_time(dir);
+	fat_truncate_time(dir, NULL, S_ATIME|S_MTIME);
 	if (IS_DIRSYNC(dir))
 		(void)fat_sync_inode(dir);
 	else
@@ -1097,8 +1098,11 @@
 			err = -ENOMEM;
 			goto error;
 		}
+		/* Avoid race with userspace read via bdev */
+		lock_buffer(bhs[n]);
 		memset(bhs[n]->b_data, 0, sb->s_blocksize);
 		set_buffer_uptodate(bhs[n]);
+		unlock_buffer(bhs[n]);
 		mark_buffer_dirty_inode(bhs[n], dir);
 
 		n++;
@@ -1155,6 +1159,8 @@
 	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
+	/* Avoid race with userspace read via bdev */
+	lock_buffer(bhs[0]);
 	/* filling the new directory slots ("." and ".." entries) */
 	memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
 	memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1177,6 +1183,7 @@
 	de[0].size = de[1].size = 0;
 	memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
 	set_buffer_uptodate(bhs[0]);
+	unlock_buffer(bhs[0]);
 	mark_buffer_dirty_inode(bhs[0], dir);
 
 	err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1234,11 +1241,14 @@
 
 			/* fill the directory entry */
 			copy = min(size, sb->s_blocksize);
+			/* Avoid race with userspace read via bdev */
+			lock_buffer(bhs[n]);
 			memcpy(bhs[n]->b_data, slots, copy);
+			set_buffer_uptodate(bhs[n]);
+			unlock_buffer(bhs[n]);
+			mark_buffer_dirty_inode(bhs[n], dir);
 			slots += copy;
 			size -= copy;
-			set_buffer_uptodate(bhs[n]);
-			mark_buffer_dirty_inode(bhs[n], dir);
 			if (!size)
 				break;
 			n++;
@@ -1311,7 +1321,7 @@
 		}
 	}
 	if (dir->i_ino == MSDOS_ROOT_INO) {
-		if (sbi->fat_bits != 32)
+		if (!is_fat32(sbi))
 			goto error;
 	} else if (MSDOS_I(dir)->i_start == 0) {
 		fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 9d7d2d5..922a0c6 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -142,6 +142,34 @@
 	return sb->s_fs_info;
 }
 
+/*
+ * Functions that determine the variant of the FAT file system (i.e.,
+ * whether this is FAT12, FAT16 or FAT32).
+ */
+static inline bool is_fat12(const struct msdos_sb_info *sbi)
+{
+	return sbi->fat_bits == 12;
+}
+
+static inline bool is_fat16(const struct msdos_sb_info *sbi)
+{
+	return sbi->fat_bits == 16;
+}
+
+static inline bool is_fat32(const struct msdos_sb_info *sbi)
+{
+	return sbi->fat_bits == 32;
+}
+
+/* Maximum number of clusters */
+static inline u32 max_fat(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	return is_fat32(sbi) ? MAX_FAT32 :
+		is_fat16(sbi) ? MAX_FAT16 : MAX_FAT12;
+}
+
 static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 {
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
@@ -257,7 +285,7 @@
 				const struct msdos_dir_entry *de)
 {
 	int cluster = le16_to_cpu(de->start);
-	if (sbi->fat_bits == 32)
+	if (is_fat32(sbi))
 		cluster |= (le16_to_cpu(de->starthi) << 16);
 	return cluster;
 }
@@ -416,6 +444,10 @@
 			      __le16 __time, __le16 __date, u8 time_cs);
 extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
 			      __le16 *time, __le16 *date, u8 *time_cs);
+extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
+			     int flags);
+extern int fat_update_time(struct inode *inode, struct timespec64 *now,
+			   int flags);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
 int fat_cache_init(void);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f58c0ca..3647c65 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2004, OGAWA Hirofumi
- * Released under GPL v2.
  */
 
 #include <linux/blkdev.h>
@@ -290,19 +290,17 @@
 
 	mutex_init(&sbi->fat_lock);
 
-	switch (sbi->fat_bits) {
-	case 32:
+	if (is_fat32(sbi)) {
 		sbi->fatent_shift = 2;
 		sbi->fatent_ops = &fat32_ops;
-		break;
-	case 16:
+	} else if (is_fat16(sbi)) {
 		sbi->fatent_shift = 1;
 		sbi->fatent_ops = &fat16_ops;
-		break;
-	case 12:
+	} else if (is_fat12(sbi)) {
 		sbi->fatent_shift = -1;
 		sbi->fatent_ops = &fat12_ops;
-		break;
+	} else {
+		fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits);
 	}
 }
 
@@ -310,7 +308,7 @@
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	if (sb_rdonly(sb) || sbi->fat_bits != 32)
+	if (sb_rdonly(sb) || !is_fat32(sbi))
 		return;
 
 	__mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC);
@@ -327,7 +325,7 @@
 	/* Is this fatent's blocks including this entry? */
 	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
 		return 0;
-	if (sbi->fat_bits == 12) {
+	if (is_fat12(sbi)) {
 		if ((offset + 1) < sb->s_blocksize) {
 			/* This entry is on bhs[0]. */
 			if (fatent->nr_bhs == 2) {
@@ -390,8 +388,11 @@
 				err = -ENOMEM;
 				goto error;
 			}
+			/* Avoid race with userspace read via bdev */
+			lock_buffer(c_bh);
 			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 			set_buffer_uptodate(c_bh);
+			unlock_buffer(c_bh);
 			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
 			if (sb->s_flags & SB_SYNCHRONOUS)
 				err = sync_dirty_buffer(c_bh);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4f3d72f..4614c0b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/fat/file.c
  *
@@ -193,12 +194,17 @@
 int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
-	int res, err;
+	int err;
 
-	res = generic_file_fsync(filp, start, end, datasync);
+	err = __generic_file_fsync(filp, start, end, datasync);
+	if (err)
+		return err;
+
 	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+	if (err)
+		return err;
 
-	return res ? res : err;
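+	/* also flush the device's volatile write cache */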
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 }
 
 
@@ -214,6 +220,7 @@
 #endif
 	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
 	.fallocate	= fat_fallocate,
 };
 
@@ -227,7 +234,7 @@
 	if (err)
 		goto out;
 
-	inode->i_ctime = inode->i_mtime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
 	mark_inode_dirty(inode);
 	if (IS_SYNC(inode)) {
 		int err2;
@@ -330,7 +337,7 @@
 		MSDOS_I(inode)->i_logstart = 0;
 	}
 	MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
-	inode->i_ctime = inode->i_mtime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
 	if (wait) {
 		err = fat_sync_inode(inode);
 		if (err) {
@@ -542,6 +549,18 @@
 		up_write(&MSDOS_I(inode)->truncate_lock);
 	}
 
+	/*
+	 * setattr_copy can't truncate these appropriately, so we'll
+	 * copy them ourselves
+	 */
+	if (attr->ia_valid & ATTR_ATIME)
+		fat_truncate_time(inode, &attr->ia_atime, S_ATIME);
+	if (attr->ia_valid & ATTR_CTIME)
+		fat_truncate_time(inode, &attr->ia_ctime, S_CTIME);
+	if (attr->ia_valid & ATTR_MTIME)
+		fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
+	attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
+
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 out:
@@ -552,4 +571,5 @@
 const struct inode_operations fat_file_inode_operations = {
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.update_time	= fat_update_time,
 };
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d6b81e3..5f04c5c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/fat/inode.c
  *
@@ -30,6 +31,11 @@
 
 #define KB_IN_SECTORS 2
 
+/* DOS dates from 1980/1/1 through 2107/12/31 */
+#define FAT_DATE_MIN (0<<9 | 1<<5 | 1)
+#define FAT_DATE_MAX (127<<9 | 12<<5 | 31)
+#define FAT_TIME_MAX (23<<11 | 59<<5 | 29)
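+/*
+ * FAT dates pack (year - 1980) in bits 15-9, month in bits 8-5 and day in
+ * bits 4-0; times pack hours in bits 15-11, minutes in bits 10-5 and
+ * seconds / 2 in bits 4-0 (hence the 29 above, i.e. 58 seconds).
+ * For example, 2019-12-04 encodes as (39 << 9) | (12 << 5) | 4 = 0x4f84.
+ */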
+
 /*
  * A deserialized copy of the on-disk structure laid out in struct
  * fat_boot_sector.
@@ -244,7 +250,7 @@
 	if (err < len)
 		fat_write_failed(mapping, pos + len);
 	if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
-		inode->i_mtime = inode->i_ctime = current_time(inode);
+		fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
 		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
 		mark_inode_dirty(inode);
 	}
@@ -564,7 +570,7 @@
 				  de->cdate, de->ctime_cs);
 		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
 	} else
-		inode->i_ctime = inode->i_atime = inode->i_mtime;
+		fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME);
 
 	return 0;
 }
@@ -686,7 +692,7 @@
 
 	b = (struct fat_boot_sector *) bh->b_data;
 
-	if (sbi->fat_bits == 32) {
+	if (is_fat32(sbi)) {
 		if (set)
 			b->fat32.state |= FAT_STATE_DIRTY;
 		else
@@ -746,17 +752,11 @@
 	return &ei->vfs_inode;
 }
 
-static void fat_i_callback(struct rcu_head *head)
+static void fat_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
-static void fat_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, fat_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -920,7 +920,7 @@
 static int fat_show_options(struct seq_file *m, struct dentry *root);
 static const struct super_operations fat_sops = {
 	.alloc_inode	= fat_alloc_inode,
-	.destroy_inode	= fat_destroy_inode,
+	.free_inode	= fat_free_inode,
 	.write_inode	= fat_write_inode,
 	.evict_inode	= fat_evict_inode,
 	.put_super	= fat_put_super,
@@ -1396,7 +1396,7 @@
 	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
 	inode->i_op = sbi->dir_ops;
 	inode->i_fop = &fat_dir_operations;
-	if (sbi->fat_bits == 32) {
+	if (is_fat32(sbi)) {
 		MSDOS_I(inode)->i_start = sbi->root_cluster;
 		error = fat_calc_dir_size(inode);
 		if (error < 0)
@@ -1423,7 +1423,7 @@
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
 	/* Divide first to avoid overflow */
-	if (sbi->fat_bits != 12) {
+	if (!is_fat12(sbi)) {
 		unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
 		return ent_per_sec * sbi->fat_length;
 	}
@@ -1610,6 +1610,7 @@
 	int debug;
 	long error;
 	char buf[50];
+	struct timespec64 ts;
 
 	/*
 	 * GFP_KERNEL is ok here, because while we do hold the
@@ -1626,6 +1627,11 @@
 	sb->s_magic = MSDOS_SUPER_MAGIC;
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
+	/*
+	 * FAT timestamps are complex and truncated by FAT itself, so
+	 * we set s_time_gran to 1 here to keep timestamp updates fast
+	 */
+	sb->s_time_gran = 1;
 	mutex_init(&sbi->nfs_build_inode_lock);
 	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
 			     DEFAULT_RATELIMIT_BURST);
@@ -1698,6 +1704,12 @@
 	sbi->free_clus_valid = 0;
 	sbi->prev_free = FAT_START_ENT;
 	sb->s_maxbytes = 0xffffffff;
+	fat_time_fat2unix(sbi, &ts, 0, cpu_to_le16(FAT_DATE_MIN), 0);
+	sb->s_time_min = ts.tv_sec;
+
+	fat_time_fat2unix(sbi, &ts, cpu_to_le16(FAT_TIME_MAX),
+			  cpu_to_le16(FAT_DATE_MAX), 0);
+	sb->s_time_max = ts.tv_sec;
 
 	if (!sbi->fat_length && bpb.fat32_length) {
 		struct fat_boot_fsinfo *fsinfo;
@@ -1738,7 +1750,7 @@
 	}
 
 	/* interpret volume ID as a little endian 32 bit integer */
-	if (sbi->fat_bits == 32)
+	if (is_fat32(sbi))
 		sbi->vol_id = bpb.fat32_vol_id;
 	else /* fat 16 or 12 */
 		sbi->vol_id = bpb.fat16_vol_id;
@@ -1764,11 +1776,11 @@
 
 	total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
 
-	if (sbi->fat_bits != 32)
+	if (!is_fat32(sbi))
 		sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
 
 	/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
-	if (sbi->fat_bits == 32)
+	if (is_fat32(sbi))
 		sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
 	else /* fat 16 or 12 */
 		sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
@@ -1776,7 +1788,7 @@
 	/* check that FAT table does not overflow */
 	fat_clusters = calc_fat_clusters(sb);
 	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
-	if (total_clusters > MAX_FAT(sb)) {
+	if (total_clusters > max_fat(sb)) {
 		if (!silent)
 			fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
 			       total_clusters);
@@ -1798,11 +1810,15 @@
 	fat_ent_access_init(sb);
 
 	/*
-	 * The low byte of FAT's first entry must have same value with
-	 * media-field.  But in real world, too many devices is
-	 * writing wrong value.  So, removed that validity check.
+	 * The low byte of the first FAT entry must have the same value as
+	 * the media field of the boot sector. But in the real world, too many
+	 * devices write wrong values, so that validity check was removed.
 	 *
-	 * if (FAT_FIRST_ENT(sb, media) != first)
+	 * The removed check compared the first FAT entry to a value dependent
+	 * on the media field like this:
+	 * == (0x0F00 | media), for FAT12
+	 * == (0xFF00 | media), for FAT16
+	 * == (0x0FFFFF00 | media), for FAT32
 	 */
 
 	error = -EINVAL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 573836d..1e08bd5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/fat/misc.c
  *
@@ -7,6 +8,7 @@
  */
 
 #include "fat.h"
+#include <linux/iversion.h>
 
 /*
  * fat_fs_error reports a file system problem that might indicate a data
@@ -63,7 +65,7 @@
 	struct buffer_head *bh;
 	struct fat_boot_fsinfo *fsinfo;
 
-	if (sbi->fat_bits != 32)
+	if (!is_fat32(sbi))
 		return 0;
 
 	bh = sb_bread(sb, sbi->fsinfo_sector);
@@ -185,6 +187,13 @@
 	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
 
+static inline int fat_tz_offset(struct msdos_sb_info *sbi)
+{
+	return (sbi->options.tz_set ?
+	       -sbi->options.time_offset :
+	       sys_tz.tz_minuteswest) * SECS_PER_MIN;
+}
+
 /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
 void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
 		       __le16 __time, __le16 __date, u8 time_cs)
@@ -210,10 +219,7 @@
 		   + days_in_year[month] + day
 		   + DAYS_DELTA) * SECS_PER_DAY;
 
-	if (!sbi->options.tz_set)
-		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
-	else
-		second -= sbi->options.time_offset * SECS_PER_MIN;
+	second += fat_tz_offset(sbi);
 
 	if (time_cs) {
 		ts->tv_sec = second + (time_cs / 100);
@@ -229,9 +235,7 @@
 		       __le16 *time, __le16 *date, u8 *time_cs)
 {
 	struct tm tm;
-	time64_to_tm(ts->tv_sec,
-		   (sbi->options.tz_set ? sbi->options.time_offset :
-		   -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
+	time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm);
 
 	/*  FAT can only support year between 1980 to 2107 */
 	if (tm.tm_year < 1980 - 1900) {
@@ -263,6 +267,80 @@
 }
 EXPORT_SYMBOL_GPL(fat_time_unix2fat);
 
+static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
+{
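+	/* clearing the low bit of tv_sec rounds down to an even second */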
+	return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
+}
+/*
+ * truncate the various times with appropriate granularity:
+ *   root inode:
+ *     all times always 0
+ *   all other inodes:
+ *     mtime - 2 seconds
+ *     ctime
+ *       msdos - 2 seconds
+ *       vfat  - 10 milliseconds
+ *     atime - 24 hours (00:00:00 in local timezone)
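+ *
+ *   e.g. an mtime of 12:34:57.123 is stored as 12:34:56, and an atime of
+ *   2019-12-04 17:12 becomes 2019-12-04 00:00 in the local timezone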
+ */
+int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct timespec64 ts;
+
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
+	if (now == NULL) {
+		now = &ts;
+		ts = current_time(inode);
+	}
+
+	if (flags & S_ATIME) {
+		/* to localtime */
+		time64_t seconds = now->tv_sec - fat_tz_offset(sbi);
+		s32 remainder;
+
+		div_s64_rem(seconds, SECS_PER_DAY, &remainder);
+		/* to day boundary, and back to unix time */
+		seconds = seconds + fat_tz_offset(sbi) - remainder;
+
+		inode->i_atime = (struct timespec64){ seconds, 0 };
+	}
+	if (flags & S_CTIME) {
+		if (sbi->options.isvfat)
+			inode->i_ctime = timespec64_trunc(*now, 10000000);
+		else
+			inode->i_ctime = fat_timespec64_trunc_2secs(*now);
+	}
+	if (flags & S_MTIME)
+		inode->i_mtime = fat_timespec64_trunc_2secs(*now);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fat_truncate_time);
+
+int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+	int iflags = I_DIRTY_TIME;
+	bool dirty = false;
+
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
+	fat_truncate_time(inode, now, flags);
+	if (flags & S_VERSION)
+		dirty = inode_maybe_inc_iversion(inode, false);
+	if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
+	    !(inode->i_sb->s_flags & SB_LAZYTIME))
+		dirty = true;
+
+	if (dirty)
+		iflags |= I_DIRTY_SYNC;
+	__mark_inode_dirty(inode, iflags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fat_update_time);
+
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
 	int i, err = 0;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index efb8c40..9d06288 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/msdos/namei.c
  *
@@ -250,7 +251,7 @@
 	if (err)
 		return err;
 
-	dir->i_ctime = dir->i_mtime = *ts;
+	fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
 	if (IS_DIRSYNC(dir))
 		(void)fat_sync_inode(dir);
 	else
@@ -294,7 +295,7 @@
 		err = PTR_ERR(inode);
 		goto out;
 	}
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -327,7 +328,7 @@
 	drop_nlink(dir);
 
 	clear_nlink(inode);
-	inode->i_ctime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_CTIME);
 	fat_detach(inode);
 out:
 	mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -380,7 +381,7 @@
 		goto out;
 	}
 	set_nlink(inode, 2);
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -413,7 +414,7 @@
 	if (err)
 		goto out;
 	clear_nlink(inode);
-	inode->i_ctime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_CTIME);
 	fat_detach(inode);
 out:
 	mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -478,7 +479,7 @@
 				mark_inode_dirty(old_inode);
 
 			inode_inc_iversion(old_dir);
-			old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+			fat_truncate_time(old_dir, NULL, S_CTIME|S_MTIME);
 			if (IS_DIRSYNC(old_dir))
 				(void)fat_sync_inode(old_dir);
 			else
@@ -538,7 +539,7 @@
 	if (err)
 		goto error_dotdot;
 	inode_inc_iversion(old_dir);
-	old_dir->i_ctime = old_dir->i_mtime = ts;
+	fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
 	if (IS_DIRSYNC(old_dir))
 		(void)fat_sync_inode(old_dir);
 	else
@@ -548,7 +549,7 @@
 		drop_nlink(new_inode);
 		if (is_dir)
 			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
+		fat_truncate_time(new_inode, &ts, S_CTIME);
 	}
 out:
 	brelse(sinfo.bh);
@@ -637,6 +638,7 @@
 	.rename		= msdos_rename,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.update_time	= fat_update_time,
 };
 
 static void setup(struct super_block *sb)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 82cd1e6..0cdd0fb 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/vfat/namei.c
  *
@@ -678,7 +679,7 @@
 		goto cleanup;
 
 	/* update timestamp */
-	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+	fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
 	if (IS_DIRSYNC(dir))
 		(void)fat_sync_inode(dir);
 	else
@@ -779,7 +780,7 @@
 		goto out;
 	}
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -810,7 +811,7 @@
 	drop_nlink(dir);
 
 	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
 	fat_detach(inode);
 	vfat_d_version_set(dentry, inode_query_iversion(dir));
 out:
@@ -836,7 +837,7 @@
 	if (err)
 		goto out;
 	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = current_time(inode);
+	fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
 	fat_detach(inode);
 	vfat_d_version_set(dentry, inode_query_iversion(dir));
 out:
@@ -876,7 +877,7 @@
 	}
 	inode_inc_iversion(inode);
 	set_nlink(inode, 2);
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -969,7 +970,7 @@
 	if (err)
 		goto error_dotdot;
 	inode_inc_iversion(old_dir);
-	old_dir->i_ctime = old_dir->i_mtime = ts;
+	fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
 	if (IS_DIRSYNC(old_dir))
 		(void)fat_sync_inode(old_dir);
 	else
@@ -979,7 +980,7 @@
 		drop_nlink(new_inode);
 		if (is_dir)
 			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
+		fat_truncate_time(new_inode, &ts, S_CTIME);
 	}
 out:
 	brelse(sinfo.bh);
@@ -1032,6 +1033,7 @@
 	.rename		= vfat_rename,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
+	.update_time	= fat_update_time,
 };
 
 static void setup(struct super_block *sb)
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index eb19265..af19137 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -1,14 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* fs/fat/nfs.c
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
 #include <linux/exportfs.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4137d96..3d40771 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -735,7 +735,7 @@
 		return;
 
 	switch (signum) {
-		siginfo_t si;
+		kernel_siginfo_t si;
 		default:
 			/* Queue a rt signal with the appropriate fd as its
 			   value.  We use SI_SIGIO as the source, not 
@@ -769,7 +769,7 @@
 			si.si_fd    = fd;
 			if (!do_send_sig_info(signum, &si, p, type))
 				break;
-		/* fall-through: fall back on the old plain SIGIO signal */
+		/* fall-through - fall back on the old plain SIGIO signal */
 		case 0:
 			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
 	}
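
kernel_siginfo_t is the kernel-internal layout of siginfo; the userspace-visible siginfo_t keeps its larger padded ABI size, so in-kernel senders such as this SIGIO path shrink their stack footprint by switching. A hedged sketch of the fill-and-queue pattern used above (the real code also fills si_band; the sketch_ name is illustrative):

	#include <linux/sched/signal.h>

	/* Sketch: queue an rt signal carrying an fd, falling back to SIGIO. */
	static void sketch_queue_fd_signal(struct task_struct *p, int signum,
					   int fd, enum pid_type type)
	{
		kernel_siginfo_t si;

		clear_siginfo(&si);	/* don't leak kernel stack to userspace */
		si.si_signo = signum;
		si.si_code  = SI_SIGIO;
		si.si_fd    = fd;
		if (do_send_sig_info(signum, &si, p, type))
			/* queueing failed: fall back on plain SIGIO */
			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
	}
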
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 0ee7274..01263ff 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -246,7 +246,7 @@
  * sys_open_by_handle_at: Open the file handle
  * @mountdirfd: directory file descriptor
  * @handle: file handle to be opened
- * @flag: open flags.
+ * @flags: open flags.
  *
  * @mountdirfd indicates the directory file descriptor
  * of the mount point. The file handle is decoded relative
diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9..3da91a1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -158,7 +158,7 @@
 	 * or have finished their rcu_read_lock_sched() section.
 	 */
 	if (atomic_read(&files->count) > 1)
-		synchronize_sched();
+		synchronize_rcu();
 
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
@@ -457,6 +457,7 @@
 		.full_fds_bits	= init_files.full_fds_bits_init,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
 };
 
 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
@@ -640,6 +641,35 @@
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+/*
+ * Variant of __close_fd() that gets a ref on the file for later fput:
+ * the fd is detached and filp_close() run here, while *res carries an
+ * extra reference that the caller drops with fput() when done.
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_unlock;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_unlock;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__put_unused_fd(files, fd);
+	spin_unlock(&files->file_lock);
+	get_file(file);
+	*res = file;
+	return filp_close(file, files);
+
+out_unlock:
+	spin_unlock(&files->file_lock);
+	*res = NULL;
+	return -ENOENT;
+}
+
 void do_close_on_exec(struct files_struct *files)
 {
 	unsigned i;
@@ -676,7 +706,7 @@
 	spin_unlock(&files->file_lock);
 }
 
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
 {
 	struct files_struct *files = current->files;
 	struct file *file;
@@ -691,7 +721,7 @@
 		 */
 		if (file->f_mode & mask)
 			file = NULL;
-		else if (!get_file_rcu(file))
+		else if (!get_file_rcu_many(file, refs))
 			goto loop;
 	}
 	rcu_read_unlock();
@@ -699,15 +729,20 @@
 	return file;
 }
 
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+	return __fget(fd, FMODE_PATH, refs);
+}
+
 struct file *fget(unsigned int fd)
 {
-	return __fget(fd, FMODE_PATH);
+	return __fget(fd, FMODE_PATH, 1);
 }
 EXPORT_SYMBOL(fget);
 
 struct file *fget_raw(unsigned int fd)
 {
-	return __fget(fd, 0);
+	return __fget(fd, 0, 1);
 }
 EXPORT_SYMBOL(fget_raw);
 
@@ -738,7 +773,7 @@
 			return 0;
 		return (unsigned long)file;
 	} else {
-		file = __fget(fd, mask);
+		file = __fget(fd, mask, 1);
 		if (!file)
 			return 0;
 		return FDPUT_FPUT | (unsigned long)file;
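
fget_many() and __close_fd_get_file() are new entry points: the former takes @refs references to the struct file in a single atomic operation via get_file_rcu_many(), so a caller submitting a batch of operations against one fd pays for one fd-table lookup and one atomic instead of one per operation; the latter detaches an fd but hands the file back for a later fput(). A sketch of the batching use, with illustrative names (the in-tree consumer isn't part of this hunk):

	#include <linux/file.h>

	/* Sketch: one lookup, nr_ops references, each released by one fput(). */
	static int sketch_submit_batch(unsigned int fd, unsigned int nr_ops)
	{
		struct file *file = fget_many(fd, nr_ops);

		if (!file)
			return -EBADF;
		/* hand one reference to each queued operation here ... */
		/* ... each completion then drops its own with fput(file),
		 * or a single fput_many(file, n) releases n at once. */
		return 0;
	}
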
diff --git a/fs/file_table.c b/fs/file_table.c
index e49af4c..30d55c9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/file_table.c
  *
@@ -255,6 +256,7 @@
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = file->f_inode;
+	fmode_t mode = file->f_mode;
 
 	if (unlikely(!(file->f_mode & FMODE_OPENED)))
 		goto out;
@@ -277,18 +279,20 @@
 	if (file->f_op->release)
 		file->f_op->release(inode, file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
-		     !(file->f_mode & FMODE_PATH))) {
+		     !(mode & FMODE_PATH))) {
 		cdev_put(inode->i_cdev);
 	}
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
-	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_dec(inode);
-	if (file->f_mode & FMODE_WRITER) {
+	if (mode & FMODE_WRITER) {
 		put_write_access(inode);
 		__mnt_drop_write(mnt);
 	}
 	dput(dentry);
+	if (unlikely(mode & FMODE_NEED_UNMOUNT))
+		dissolve_on_fput(mnt);
 	mntput(mnt);
 out:
 	file_free(file);
@@ -323,12 +327,13 @@
 {
 	delayed_fput(NULL);
 }
+EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
 {
-	if (atomic_long_dec_and_test(&file->f_count)) {
+	if (atomic_long_sub_and_test(refs, &file->f_count)) {
 		struct task_struct *task = current;
 
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +352,11 @@
 	}
 }
 
+void fput(struct file *file)
+{
+	fput_many(file, 1);
+}
+
 /*
  * synchronous analog of fput(); for kernel threads that might be needed
  * in some umount() (and thus can't use flush_delayed_fput() without
@@ -380,10 +390,11 @@
 void __init files_maxfiles_init(void)
 {
 	unsigned long n;
-	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+	unsigned long nr_pages = totalram_pages();
+	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
 
-	memreserve = min(memreserve, totalram_pages - 1);
-	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
+	memreserve = min(memreserve, nr_pages - 1);
+	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 }
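
The file-max sizing keeps its old formula but reads the page count through totalram_pages(), since the underlying counter became an atomic behind an accessor. The formula reserves one-and-a-half times the memory currently in use, then allows one file per 10 KB of the remainder. A worked example on a hypothetical machine with 4 KiB pages, 4 GiB of RAM and 1 GiB in use:

	/*
	 * nr_pages   = totalram_pages()          = 1048576   (4 GiB)
	 * free       = nr_free_pages()           =  786432   (3 GiB)
	 * memreserve = (1048576 - 786432) * 3/2  =  393216   (1.5x used)
	 * n          = (1048576 - 393216) * (4096/1024) / 10
	 *            = 655360 * 4 / 10           =  262144
	 *
	 * files_stat.max_files = max(262144, NR_FILE),
	 * i.e. roughly one struct file per 10 KiB of non-reserved RAM.
	 */
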
diff --git a/fs/filesystems.c b/fs/filesystems.c
index b03f57b..9135646 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/fs_parser.h>
 
 /*
  * Handling of filesystem drivers list.
@@ -73,6 +74,9 @@
 	int res = 0;
 	struct file_system_type ** p;
 
+	if (fs->parameters && !fs_validate_description(fs->parameters))
+		return -EINVAL;
+
 	BUG_ON(strchr(fs->name, '.'));
 	if (fs->next)
 		return -EBUSY;
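
register_filesystem() now sanity-checks a filesystem's mount-parameter description with fs_validate_description() (defined later in this patch, in fs/fs_parser.c, under CONFIG_VALIDATE_FS_PARSER). A hedged sketch of what a converted filesystem registers; "examplefs" and its options are illustrative, not any in-tree filesystem:

	#include <linux/fs.h>
	#include <linux/fs_parser.h>

	enum { Opt_mode, Opt_quiet };

	static const struct fs_parameter_spec examplefs_param_specs[] = {
		fsparam_u32oct("mode",	Opt_mode),	/* e.g. mode=0755 */
		fsparam_flag("quiet",	Opt_quiet),	/* bare flag */
		{}
	};

	static const struct fs_parameter_description examplefs_parameters = {
		.name	= "examplefs",
		.specs	= examplefs_param_specs,
	};

	static struct file_system_type examplefs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "examplefs",
		.parameters	= &examplefs_parameters,
		/* .init_fs_context and .kill_sb omitted from the sketch */
	};

	/* register_filesystem(&examplefs_fs_type) validates the table first. */
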
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index ce49df1..c05c71d 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
diff --git a/fs/freevxfs/Makefile b/fs/freevxfs/Makefile
index 87ad097..e6ee592 100644
--- a/fs/freevxfs/Makefile
+++ b/fs/freevxfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # VxFS Makefile
 #
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 48b24bb..578a506 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -131,21 +131,14 @@
 	return &vi->vfs_inode;
 }
 
-static void vxfs_i_callback(struct rcu_head *head)
+static void vxfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
 	kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode));
 }
 
-static void vxfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, vxfs_i_callback);
-}
-
 static const struct super_operations vxfs_super_ops = {
 	.alloc_inode		= vxfs_alloc_inode,
-	.destroy_inode		= vxfs_destroy_inode,
+	.free_inode		= vxfs_free_inode,
 	.evict_inode		= vxfs_evict_inode,
 	.put_super		= vxfs_put_super,
 	.statfs			= vxfs_statfs,
@@ -236,6 +229,8 @@
 
 	sbp->s_op = &vxfs_super_ops;
 	sbp->s_fs_info = infp;
+	sbp->s_time_min = 0;
+	sbp->s_time_max = U32_MAX;
 
 	if (!vxfs_try_sb_magic(sbp, silent, 1,
 			(__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) {
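
The vxfs_i_callback()/call_rcu() boilerplate goes away because the VFS now provides ->free_inode: the core queues the RCU callback itself and invokes ->free_inode from it, so a filesystem whose teardown is nothing but freeing memory supplies only that step. The flip side is that ->free_inode runs in RCU-callback (softirq) context and must not sleep; work needing process context stays in ->destroy_inode. A sketch of the resulting pairing, with illustrative names:

	struct examplefs_inode {
		/* fs-private fields ... */
		struct inode vfs_inode;
	};
	#define EXAMPLEFS_I(i) container_of(i, struct examplefs_inode, vfs_inode)

	static struct kmem_cache *examplefs_inode_cachep;

	static struct inode *examplefs_alloc_inode(struct super_block *sb)
	{
		struct examplefs_inode *ei =
			kmem_cache_alloc(examplefs_inode_cachep, GFP_KERNEL);

		return ei ? &ei->vfs_inode : NULL;
	}

	static void examplefs_free_inode(struct inode *inode)
	{
		/* Called from the VFS's RCU callback: free memory only. */
		kmem_cache_free(examplefs_inode_cachep, EXAMPLEFS_I(inode));
	}
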
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863..335607b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/fs-writeback.c
  *
@@ -35,10 +36,6 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
 
-struct wb_completion {
-	atomic_t		cnt;
-};
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -60,19 +57,6 @@
 };
 
 /*
- * If one wants to wait for one or more wb_writeback_works, each work's
- * ->done should be set to a wb_completion defined using the following
- * macro.  Once all work items are issued with wb_queue_work(), the caller
- * can wait for the completion of all using wb_wait_for_completion().  Work
- * items which are waited upon aren't freed automatically on completion.
- */
-#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
-	struct wb_completion cmpl = {					\
-		.cnt		= ATOMIC_INIT(1),			\
-	}
-
-
-/*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
  * possible for the worst case time between when an inode has its
@@ -180,8 +164,13 @@
 
 	if (work->auto_free)
 		kfree(work);
-	if (done && atomic_dec_and_test(&done->cnt))
-		wake_up_all(&wb->bdi->wb_waitq);
+	if (done) {
+		wait_queue_head_t *waitq = done->waitq;
+
+		/* @done can't be accessed after the following dec */
+		if (atomic_dec_and_test(&done->cnt))
+			wake_up_all(waitq);
+	}
 }
 
 static void wb_queue_work(struct bdi_writeback *wb,
@@ -205,28 +194,44 @@
 
 /**
  * wb_wait_for_completion - wait for completion of bdi_writeback_works
- * @bdi: bdi work items were issued to
  * @done: target wb_completion
  *
  * Wait for one or more work items with their ->done field
- * set to @done, which should have been defined with
- * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
- * work items are completed.  Work items which are waited upon aren't freed
+ * set to @done, which should have been initialized with
+ * DEFINE_WB_COMPLETION().  This function returns after all such work items
+ * are completed.  Work items which are waited upon aren't freed
  * automatically on completion.
  */
-static void wb_wait_for_completion(struct backing_dev_info *bdi,
-				   struct wb_completion *done)
+void wb_wait_for_completion(struct wb_completion *done)
 {
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+	wait_event(*done->waitq, !atomic_read(&done->cnt));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-/* parameters for foreign inode detection, see wb_detach_inode() */
+/*
+ * Parameters for foreign inode detection; see wbc_detach_inode() for how
+ * they're used.
+ *
+ * These parameters are inherently heuristic as the detection target
+ * itself is fuzzy.  All we want to do is detach an inode from the
+ * current owner if it's being written to by some other cgroups too much.
+ *
+ * The current cgroup writeback is built on the assumption that multiple
+ * cgroups writing to the same inode concurrently is very rare and a mode
+ * of operation which isn't well supported.  As such, the goal is not
+ * taking too long when a different cgroup takes over an inode while
+ * avoiding too aggressive flip-flops from occasional foreign writes.
+ *
+ * We record, very roughly, 2s worth of IO time history and if more than
+ * half of that is foreign, trigger the switch.  The recording is quantized
+ * to 16 slots.  To avoid tiny writes from swinging the decision too much,
+ * writes smaller than 1/8 of avg size are ignored.
+ */
 #define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
 #define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
-#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
 #define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
 
 #define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
@@ -236,6 +241,7 @@
 					/* if foreign slots >= 8, switch */
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect up to 5 slots */
+#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
 
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
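
In concrete numbers, the quantization these constants set up works out as sketched below; the behavioral change in this hunk is the CUT_DIV bump, which narrows the "ignore this round" filter from anything under half the average write size to anything under an eighth, so moderate foreign writes start counting toward a switch much sooner:

	/*
	 * 1 time unit        = 1 s >> 13                  ~ 122 us
	 * WB_FRN_TIME_PERIOD = 2 * (1 << 13) units        = 2 s of IO time
	 * 16 history slots  -> each slot covers 2 s / 16  = 125 ms
	 * switch threshold   = 16 / 2 = 8 foreign slots  >= 1 s foreign IO
	 *
	 * Old filter: rounds < avg/2 ignored; new: only rounds < avg/8.
	 */
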
@@ -269,6 +275,7 @@
 	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 		wb_put(wb);
 }
+EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -331,17 +338,34 @@
 	struct work_struct	work;
 };
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	up_write(&bdi->wb_switch_rwsem);
+}
+
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
-	struct radix_tree_iter iter;
+	XA_STATE(xas, &mapping->i_pages, 0);
+	struct page *page;
 	bool switched = false;
-	void **slot;
+
+	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
 
 	/*
 	 * By the time control reaches here, RCU grace period has passed
@@ -370,30 +394,25 @@
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
 
+	trace_inode_switch_wbs(inode, old_wb, new_wb);
+
 	/*
 	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
 	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
 	 * pages actually under writeback.
 	 */
-	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
-				   PAGECACHE_TAG_DIRTY) {
-		struct page *page = radix_tree_deref_slot_protected(slot,
-						&mapping->i_pages.xa_lock);
-		if (likely(page) && PageDirty(page)) {
+	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+		if (PageDirty(page)) {
 			dec_wb_stat(old_wb, WB_RECLAIMABLE);
 			inc_wb_stat(new_wb, WB_RECLAIMABLE);
 		}
 	}
 
-	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
-				   PAGECACHE_TAG_WRITEBACK) {
-		struct page *page = radix_tree_deref_slot_protected(slot,
-						&mapping->i_pages.xa_lock);
-		if (likely(page)) {
-			WARN_ON_ONCE(!PageWriteback(page));
-			dec_wb_stat(old_wb, WB_WRITEBACK);
-			inc_wb_stat(new_wb, WB_WRITEBACK);
-		}
+	xas_set(&xas, 0);
+	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+		WARN_ON_ONCE(!PageWriteback(page));
+		dec_wb_stat(old_wb, WB_WRITEBACK);
+		inc_wb_stat(new_wb, WB_WRITEBACK);
 	}
 
 	wb_get(new_wb);
@@ -435,6 +454,8 @@
 	spin_unlock(&new_wb->list_lock);
 	spin_unlock(&old_wb->list_lock);
 
+	up_read(&bdi->wb_switch_rwsem);
+
 	if (switched) {
 		wb_wakeup(new_wb);
 		wb_put(old_wb);
@@ -475,6 +496,10 @@
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
+	/* avoid queueing a new switch if too many are already in flight */
+	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
+		return;
+
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
 		return;
@@ -502,8 +527,6 @@
 
 	isw->inode = inode;
 
-	atomic_inc(&isw_nr_in_flight);
-
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the i_page
@@ -511,6 +534,8 @@
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+
+	atomic_inc(&isw_nr_in_flight);
 	return;
 
 out_free:
@@ -551,12 +576,16 @@
 	spin_unlock(&inode->i_lock);
 
 	/*
-	 * A dying wb indicates that the memcg-blkcg mapping has changed
-	 * and a new wb is already serving the memcg.  Switch immediately.
+	 * A dying wb indicates that either the blkcg associated with the
+	 * memcg changed or the associated memcg is dying.  In the first
+	 * case, a replacement wb should already be available and we should
+	 * refresh the wb immediately.  In the second case, trying to
+	 * refresh will keep failing.
 	 */
-	if (unlikely(wb_dying(wbc->wb)))
+	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
 		inode_switch_wbs(inode, wbc->wb_id);
 }
+EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 
 /**
  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -654,6 +683,9 @@
 		if (wbc->wb_id != max_id)
 			history |= (1U << slots) - 1;
 
+		if (history)
+			trace_inode_foreign_history(inode, wbc, history);
+
 		/*
 		 * Switch if the current wb isn't the consistent winner.
 		 * If there are multiple closely competing dirtiers, the
@@ -676,9 +708,10 @@
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
+EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
- * wbc_account_io - account IO issued during writeback
+ * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
  * @wbc: writeback_control of the writeback in progress
  * @page: page being written out
  * @bytes: number of bytes being written out
@@ -687,9 +720,10 @@
  * controlled by @wbc.  Keep the book for foreign inode detection.  See
  * wbc_detach_inode().
  */
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes)
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes)
 {
+	struct cgroup_subsys_state *css;
 	int id;
 
 	/*
@@ -698,10 +732,15 @@
 	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
 	 * regular writeback instead of writing things out itself.
 	 */
-	if (!wbc->wb)
+	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
-	id = mem_cgroup_css_from_page(page)->id;
+	css = mem_cgroup_css_from_page(page);
+	/* dead cgroups shouldn't contribute to inode ownership arbitration */
+	if (!(css->flags & CSS_ONLINE))
+		return;
+
+	id = css->id;
 
 	if (id == wbc->wb_id) {
 		wbc->wb_bytes += bytes;
@@ -719,7 +758,7 @@
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
-EXPORT_SYMBOL_GPL(wbc_account_io);
+EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 
 /**
  * inode_congested - test whether an inode is congested
@@ -809,7 +848,7 @@
 restart:
 	rcu_read_lock();
 	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
-		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
@@ -856,7 +895,7 @@
 		last_wb = wb;
 
 		rcu_read_unlock();
-		wb_wait_for_completion(bdi, &fallback_work_done);
+		wb_wait_for_completion(&fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
@@ -866,6 +905,89 @@
 }
 
 /**
+ * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
+ * @bdi_id: target bdi id
+ * @memcg_id: target memcg css id
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
+ * @reason: reason why some writeback work was initiated
+ * @done: target wb_completion
+ *
+ * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
+ * with the specified parameters.
+ */
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+			   enum wb_reason reason, struct wb_completion *done)
+{
+	struct backing_dev_info *bdi;
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+	struct wb_writeback_work *work;
+	int ret;
+
+	/* lookup bdi and memcg */
+	bdi = bdi_get_by_id(bdi_id);
+	if (!bdi)
+		return -ENOENT;
+
+	rcu_read_lock();
+	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
+	if (memcg_css && !css_tryget(memcg_css))
+		memcg_css = NULL;
+	rcu_read_unlock();
+	if (!memcg_css) {
+		ret = -ENOENT;
+		goto out_bdi_put;
+	}
+
+	/*
+	 * And find the associated wb.  If the wb isn't there already,
+	 * there's nothing to flush; don't create one.
+	 */
+	wb = wb_get_lookup(bdi, memcg_css);
+	if (!wb) {
+		ret = -ENOENT;
+		goto out_css_put;
+	}
+
+	/*
+	 * If @nr is zero, the caller is attempting to write out most of
+	 * the currently dirty pages.  Let's take the current dirty page
+	 * count and inflate it by 25% which should be large enough to
+	 * flush out most dirty pages while avoiding getting livelocked by
+	 * concurrent dirtiers.
+	 */
+	if (!nr) {
+		unsigned long filepages, headroom, dirty, writeback;
+
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
+				      &writeback);
+		nr = dirty * 10 / 8;
+	}
+
+	/* issue the writeback work */
+	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+	if (work) {
+		work->nr_pages = nr;
+		work->sync_mode = WB_SYNC_NONE;
+		work->range_cyclic = 1;
+		work->reason = reason;
+		work->done = done;
+		work->auto_free = 1;
+		wb_queue_work(wb, work);
+		ret = 0;
+	} else {
+		ret = -ENOMEM;
+	}
+
+	wb_put(wb);
+out_css_put:
+	css_put(memcg_css);
+out_bdi_put:
+	bdi_put(bdi);
+	return ret;
+}
+
+/**
  * cgroup_writeback_umount - flush inode wb switches for umount
  *
  * This function is called when a super_block is about to be destroyed and
@@ -878,7 +1000,11 @@
 void cgroup_writeback_umount(void)
 {
 	if (atomic_read(&isw_nr_in_flight)) {
-		synchronize_rcu();
+		/*
+		 * Use rcu_barrier() to wait for all pending callbacks to
+		 * ensure that all in-flight wb switches are in the workqueue.
+		 */
+		rcu_barrier();
 		flush_workqueue(isw_wq);
 	}
 }
@@ -894,6 +1020,9 @@
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
 	__releases(&inode->i_lock)
@@ -2321,7 +2450,8 @@
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
 				     enum wb_reason reason, bool skip_if_busy)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb			= sb,
 		.sync_mode		= WB_SYNC_NONE,
@@ -2330,14 +2460,13 @@
 		.nr_pages		= nr,
 		.reason			= reason,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
 }
 
 /**
@@ -2399,7 +2528,8 @@
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -2409,7 +2539,6 @@
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	/*
 	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2420,8 +2549,11 @@
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
+	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
 }
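
A wb_completion now carries the wait queue it will be woken through (DEFINE_WB_COMPLETION() records it from the bdi at definition time), which is why wb_wait_for_completion() above lost its bdi argument and why completions can be handed to cgroup_writeback_by_id() by code that never sees the bdi again. A hedged sketch of the issue-then-wait pattern, assuming the WB_REASON_FOREIGN_FLUSH reason introduced alongside this interface:

	/* Sketch: flush one memcg's dirty pages on one bdi, then wait. */
	static int sketch_flush_memcg_on_bdi(struct backing_dev_info *bdi,
					     u64 bdi_id, int memcg_id)
	{
		DEFINE_WB_COMPLETION(done, bdi);
		int ret;

		/* nr == 0 requests "most of the dirty pages" (dirty * 10/8) */
		ret = cgroup_writeback_by_id(bdi_id, memcg_id, 0,
					     WB_REASON_FOREIGN_FLUSH, &done);
		if (!ret)
			wb_wait_for_completion(&done);
		return ret;
	}
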
diff --git a/fs/fs_context.c b/fs/fs_context.c
new file mode 100644
index 0000000..138b5b4
--- /dev/null
+++ b/fs/fs_context.c
@@ -0,0 +1,754 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Provide a way to create a superblock configuration context within the kernel
+ * that allows a superblock to be set up prior to mounting.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/security.h>
+#include <linux/mnt_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include <asm/sections.h>
+#include "mount.h"
+#include "internal.h"
+
+enum legacy_fs_param {
+	LEGACY_FS_UNSET_PARAMS,
+	LEGACY_FS_MONOLITHIC_PARAMS,
+	LEGACY_FS_INDIVIDUAL_PARAMS,
+};
+
+struct legacy_fs_context {
+	char			*legacy_data;	/* Data page for legacy filesystems */
+	size_t			data_size;
+	enum legacy_fs_param	param_type;
+};
+
+static int legacy_init_fs_context(struct fs_context *fc);
+
+static const struct constant_table common_set_sb_flag[] = {
+	{ "dirsync",	SB_DIRSYNC },
+	{ "lazytime",	SB_LAZYTIME },
+	{ "mand",	SB_MANDLOCK },
+	{ "posixacl",	SB_POSIXACL },
+	{ "ro",		SB_RDONLY },
+	{ "sync",	SB_SYNCHRONOUS },
+};
+
+static const struct constant_table common_clear_sb_flag[] = {
+	{ "async",	SB_SYNCHRONOUS },
+	{ "nolazytime",	SB_LAZYTIME },
+	{ "nomand",	SB_MANDLOCK },
+	{ "rw",		SB_RDONLY },
+	{ "silent",	SB_SILENT },
+};
+
+static const char *const forbidden_sb_flag[] = {
+	"bind",
+	"dev",
+	"exec",
+	"move",
+	"noatime",
+	"nodev",
+	"nodiratime",
+	"noexec",
+	"norelatime",
+	"nostrictatime",
+	"nosuid",
+	"private",
+	"rec",
+	"relatime",
+	"remount",
+	"shared",
+	"slave",
+	"strictatime",
+	"suid",
+	"unbindable",
+};
+
+/*
+ * Check for a common mount option that manipulates s_flags.
+ */
+static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
+{
+	unsigned int token;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++)
+		if (strcmp(key, forbidden_sb_flag[i]) == 0)
+			return -EINVAL;
+
+	token = lookup_constant(common_set_sb_flag, key, 0);
+	if (token) {
+		fc->sb_flags |= token;
+		fc->sb_flags_mask |= token;
+		return 0;
+	}
+
+	token = lookup_constant(common_clear_sb_flag, key, 0);
+	if (token) {
+		fc->sb_flags &= ~token;
+		fc->sb_flags_mask |= token;
+		return 0;
+	}
+
+	return -ENOPARAM;
+}
+
+/**
+ * vfs_parse_fs_param - Add a single parameter to a superblock config
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * A single mount option in string form is applied to the filesystem context
+ * being set up.  Certain standard options (for example "ro") are translated
+ * into flag bits without going to the filesystem.  The active security module
+ * is allowed to observe and poach options.  Any other options are passed over
+ * to the filesystem to parse.
+ *
+ * This may be called multiple times for a context.
+ *
+ * Returns 0 on success and a negative error code on failure.  In the event of
+ * failure, supplementary error information may have been set.
+ */
+int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	int ret;
+
+	if (!param->key)
+		return invalf(fc, "Unnamed parameter");
+
+	ret = vfs_parse_sb_flag(fc, param->key);
+	if (ret != -ENOPARAM)
+		return ret;
+
+	ret = security_fs_context_parse_param(fc, param);
+	if (ret != -ENOPARAM)
+		/* Param belongs to the LSM or is disallowed by the LSM; so
+		 * don't pass to the FS.
+		 */
+		return ret;
+
+	if (fc->ops->parse_param) {
+		ret = fc->ops->parse_param(fc, param);
+		if (ret != -ENOPARAM)
+			return ret;
+	}
+
+	/* If the filesystem doesn't take any arguments, give it the
+	 * default handling of source.
+	 */
+	if (strcmp(param->key, "source") == 0) {
+		if (param->type != fs_value_is_string)
+			return invalf(fc, "VFS: Non-string source");
+		if (fc->source)
+			return invalf(fc, "VFS: Multiple sources");
+		fc->source = param->string;
+		param->string = NULL;
+		return 0;
+	}
+
+	return invalf(fc, "%s: Unknown parameter '%s'",
+		      fc->fs_type->name, param->key);
+}
+EXPORT_SYMBOL(vfs_parse_fs_param);
+
+/**
+ * vfs_parse_fs_string - Convenience function to just parse a string.
+ * @fc: The filesystem context to modify
+ * @key: The parameter name
+ * @value: The parameter value
+ * @v_size: The size of the value (excluding the NUL terminator)
+ */
+int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+			const char *value, size_t v_size)
+{
+	int ret;
+
+	struct fs_parameter param = {
+		.key	= key,
+		.type	= fs_value_is_string,
+		.size	= v_size,
+	};
+
+	if (v_size > 0) {
+		param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+		if (!param.string)
+			return -ENOMEM;
+	}
+
+	ret = vfs_parse_fs_param(fc, &param);
+	kfree(param.string);
+	return ret;
+}
+EXPORT_SYMBOL(vfs_parse_fs_string);
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @fc: The filesystem context to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form.  This can be
+ * called from the ->parse_monolithic() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_param() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+	char *options = data, *key;
+	int ret = 0;
+
+	if (!options)
+		return 0;
+
+	ret = security_sb_eat_lsm_opts(options, &fc->security);
+	if (ret)
+		return ret;
+
+	while ((key = strsep(&options, ",")) != NULL) {
+		if (*key) {
+			size_t v_len = 0;
+			char *value = strchr(key, '=');
+
+			if (value) {
+				if (value == key)
+					continue;
+				*value++ = 0;
+				v_len = strlen(value);
+			}
+			ret = vfs_parse_fs_string(fc, key, value, v_len);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_parse_monolithic);
+
+/**
+ * alloc_fs_context - Create a filesystem context.
+ * @fs_type: The filesystem type.
+ * @reference: The dentry from which this one derives (or NULL)
+ * @sb_flags: Filesystem/superblock flags (SB_*)
+ * @sb_flags_mask: Applicable members of @sb_flags
+ * @purpose: The purpose that this configuration shall be used for.
+ *
+ * Open a filesystem and create a mount context.  The mount context is
+ * initialised with the supplied flags and, if a submount/automount from
+ * another superblock (referred to by @reference) is supplied, may have
+ * parameters such as namespaces copied across from that superblock.
+ */
+static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
+				      struct dentry *reference,
+				      unsigned int sb_flags,
+				      unsigned int sb_flags_mask,
+				      enum fs_context_purpose purpose)
+{
+	int (*init_fs_context)(struct fs_context *);
+	struct fs_context *fc;
+	int ret = -ENOMEM;
+
+	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		return ERR_PTR(-ENOMEM);
+
+	fc->purpose	= purpose;
+	fc->sb_flags	= sb_flags;
+	fc->sb_flags_mask = sb_flags_mask;
+	fc->fs_type	= get_filesystem(fs_type);
+	fc->cred	= get_current_cred();
+	fc->net_ns	= get_net(current->nsproxy->net_ns);
+
+	mutex_init(&fc->uapi_mutex);
+
+	switch (purpose) {
+	case FS_CONTEXT_FOR_MOUNT:
+		fc->user_ns = get_user_ns(fc->cred->user_ns);
+		break;
+	case FS_CONTEXT_FOR_SUBMOUNT:
+		fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+		break;
+	case FS_CONTEXT_FOR_RECONFIGURE:
+		atomic_inc(&reference->d_sb->s_active);
+		fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+		fc->root = dget(reference);
+		break;
+	}
+
+	/* TODO: Make all filesystems support this unconditionally */
+	init_fs_context = fc->fs_type->init_fs_context;
+	if (!init_fs_context)
+		init_fs_context = legacy_init_fs_context;
+
+	ret = init_fs_context(fc);
+	if (ret < 0)
+		goto err_fc;
+	fc->need_free = true;
+	return fc;
+
+err_fc:
+	put_fs_context(fc);
+	return ERR_PTR(ret);
+}
+
+struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+					unsigned int sb_flags)
+{
+	return alloc_fs_context(fs_type, NULL, sb_flags, 0,
+					FS_CONTEXT_FOR_MOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_mount);
+
+struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+					unsigned int sb_flags,
+					unsigned int sb_flags_mask)
+{
+	return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
+				sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
+}
+EXPORT_SYMBOL(fs_context_for_reconfigure);
+
+struct fs_context *fs_context_for_submount(struct file_system_type *type,
+					   struct dentry *reference)
+{
+	return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_submount);
+
+void fc_drop_locked(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	dput(fc->root);
+	fc->root = NULL;
+	deactivate_locked_super(sb);
+}
+
+static void legacy_fs_context_free(struct fs_context *fc);
+
+/**
+ * vfs_dup_fs_context - Duplicate a filesystem context.
+ * @src_fc: The context to copy.
+ */
+struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
+{
+	struct fs_context *fc;
+	int ret;
+
+	if (!src_fc->ops->dup)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&fc->uapi_mutex);
+
+	fc->fs_private	= NULL;
+	fc->s_fs_info	= NULL;
+	fc->source	= NULL;
+	fc->security	= NULL;
+	get_filesystem(fc->fs_type);
+	get_net(fc->net_ns);
+	get_user_ns(fc->user_ns);
+	get_cred(fc->cred);
+	if (fc->log)
+		refcount_inc(&fc->log->usage);
+
+	/* Can't call put until we've called ->dup */
+	ret = fc->ops->dup(fc, src_fc);
+	if (ret < 0)
+		goto err_fc;
+
+	ret = security_fs_context_dup(fc, src_fc);
+	if (ret < 0)
+		goto err_fc;
+	return fc;
+
+err_fc:
+	put_fs_context(fc);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(vfs_dup_fs_context);
+
+/**
+ * logfc - Log a message to a filesystem context
+ * @fc: The filesystem context to log to.
+ * @fmt: The format of the buffer.
+ */
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+	static const char store_failure[] = "OOM: Can't store error string";
+	struct fc_log *log = fc ? fc->log : NULL;
+	const char *p;
+	va_list va;
+	char *q;
+	u8 freeable;
+
+	va_start(va, fmt);
+	if (!strchr(fmt, '%')) {
+		p = fmt;
+		goto unformatted_string;
+	}
+	if (strcmp(fmt, "%s") == 0) {
+		p = va_arg(va, const char *);
+		goto unformatted_string;
+	}
+
+	q = kvasprintf(GFP_KERNEL, fmt, va);
+copied_string:
+	if (!q)
+		goto store_failure;
+	freeable = 1;
+	goto store_string;
+
+unformatted_string:
+	if ((unsigned long)p >= (unsigned long)__start_rodata &&
+	    (unsigned long)p <  (unsigned long)__end_rodata)
+		goto const_string;
+	if (log && within_module_core((unsigned long)p, log->owner))
+		goto const_string;
+	q = kstrdup(p, GFP_KERNEL);
+	goto copied_string;
+
+store_failure:
+	p = store_failure;
+const_string:
+	q = (char *)p;
+	freeable = 0;
+store_string:
+	if (!log) {
+		switch (fmt[0]) {
+		case 'w':
+			printk(KERN_WARNING "%s\n", q + 2);
+			break;
+		case 'e':
+			printk(KERN_ERR "%s\n", q + 2);
+			break;
+		default:
+			printk(KERN_NOTICE "%s\n", q + 2);
+			break;
+		}
+		if (freeable)
+			kfree(q);
+	} else {
+		unsigned int logsize = ARRAY_SIZE(log->buffer);
+		u8 index;
+
+		index = log->head & (logsize - 1);
+		BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) ||
+			     sizeof(log->tail) != sizeof(u8));
+		if ((u8)(log->head - log->tail) == logsize) {
+			/* The buffer is full, discard the oldest message */
+			if (log->need_free & (1 << index))
+				kfree(log->buffer[index]);
+			log->tail++;
+		}
+
+		log->buffer[index] = q;
+		log->need_free &= ~(1 << index);
+		log->need_free |= freeable << index;
+		log->head++;
+	}
+	va_end(va);
+}
+EXPORT_SYMBOL(logfc);
+
+/*
+ * Free a logging structure.
+ */
+static void put_fc_log(struct fs_context *fc)
+{
+	struct fc_log *log = fc->log;
+	int i;
+
+	if (log) {
+		if (refcount_dec_and_test(&log->usage)) {
+			fc->log = NULL;
+			for (i = 0; i <= 7; i++)
+				if (log->need_free & (1 << i))
+					kfree(log->buffer[i]);
+			kfree(log);
+		}
+	}
+}
+
+/**
+ * put_fs_context - Dispose of a superblock configuration context.
+ * @fc: The context to dispose of.
+ */
+void put_fs_context(struct fs_context *fc)
+{
+	struct super_block *sb;
+
+	if (fc->root) {
+		sb = fc->root->d_sb;
+		dput(fc->root);
+		fc->root = NULL;
+		deactivate_super(sb);
+	}
+
+	if (fc->need_free && fc->ops && fc->ops->free)
+		fc->ops->free(fc);
+
+	security_free_mnt_opts(&fc->security);
+	put_net(fc->net_ns);
+	put_user_ns(fc->user_ns);
+	put_cred(fc->cred);
+	put_fc_log(fc);
+	put_filesystem(fc->fs_type);
+	kfree(fc->source);
+	kfree(fc);
+}
+EXPORT_SYMBOL(put_fs_context);
+
+/*
+ * Free the config for a filesystem that doesn't support fs_context.
+ */
+static void legacy_fs_context_free(struct fs_context *fc)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+
+	if (ctx) {
+		if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
+			kfree(ctx->legacy_data);
+		kfree(ctx);
+	}
+}
+
+/*
+ * Duplicate a legacy config.
+ */
+static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+	struct legacy_fs_context *ctx;
+	struct legacy_fs_context *src_ctx = src_fc->fs_private;
+
+	ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
+		ctx->legacy_data = kmemdup(src_ctx->legacy_data,
+					   src_ctx->data_size, GFP_KERNEL);
+		if (!ctx->legacy_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+	}
+
+	fc->fs_private = ctx;
+	return 0;
+}
+
+/*
+ * Add a parameter to a legacy config.  We build up a comma-separated list of
+ * options.
+ */
+static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	unsigned int size = ctx->data_size;
+	size_t len = 0;
+
+	if (strcmp(param->key, "source") == 0) {
+		if (param->type != fs_value_is_string)
+			return invalf(fc, "VFS: Legacy: Non-string source");
+		if (fc->source)
+			return invalf(fc, "VFS: Legacy: Multiple sources");
+		fc->source = param->string;
+		param->string = NULL;
+		return 0;
+	}
+
+	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
+		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
+
+	switch (param->type) {
+	case fs_value_is_string:
+		len = 1 + param->size;
+		/* Fall through */
+	case fs_value_is_flag:
+		len += strlen(param->key);
+		break;
+	default:
+		return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
+			      param->key);
+	}
+
+	if (len > PAGE_SIZE - 2 - size)
+		return invalf(fc, "VFS: Legacy: Cumulative options too large");
+	if (strchr(param->key, ',') ||
+	    (param->type == fs_value_is_string &&
+	     memchr(param->string, ',', param->size)))
+		return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
+			      param->key);
+	if (!ctx->legacy_data) {
+		ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!ctx->legacy_data)
+			return -ENOMEM;
+	}
+
+	ctx->legacy_data[size++] = ',';
+	len = strlen(param->key);
+	memcpy(ctx->legacy_data + size, param->key, len);
+	size += len;
+	if (param->type == fs_value_is_string) {
+		ctx->legacy_data[size++] = '=';
+		memcpy(ctx->legacy_data + size, param->string, param->size);
+		size += param->size;
+	}
+	ctx->legacy_data[size] = '\0';
+	ctx->data_size = size;
+	ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
+	return 0;
+}
+
+/*
+ * Add monolithic mount data.
+ */
+static int legacy_parse_monolithic(struct fs_context *fc, void *data)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+
+	if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
+		pr_warn("VFS: Can't mix monolithic and individual options\n");
+		return -EINVAL;
+	}
+
+	ctx->legacy_data = data;
+	ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
+	if (!ctx->legacy_data)
+		return 0;
+
+	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
+		return 0;
+	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
+}
+
+/*
+ * Get a mountable root with the legacy mount command.
+ */
+static int legacy_get_tree(struct fs_context *fc)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	struct super_block *sb;
+	struct dentry *root;
+
+	root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
+				      fc->source, ctx->legacy_data);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	sb = root->d_sb;
+	BUG_ON(!sb);
+
+	fc->root = root;
+	return 0;
+}
+
+/*
+ * Handle remount.
+ */
+static int legacy_reconfigure(struct fs_context *fc)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	struct super_block *sb = fc->root->d_sb;
+
+	if (!sb->s_op->remount_fs)
+		return 0;
+
+	return sb->s_op->remount_fs(sb, &fc->sb_flags,
+				    ctx ? ctx->legacy_data : NULL);
+}
+
+const struct fs_context_operations legacy_fs_context_ops = {
+	.free			= legacy_fs_context_free,
+	.dup			= legacy_fs_context_dup,
+	.parse_param		= legacy_parse_param,
+	.parse_monolithic	= legacy_parse_monolithic,
+	.get_tree		= legacy_get_tree,
+	.reconfigure		= legacy_reconfigure,
+};
+
+/*
+ * Initialise a legacy context for a filesystem that doesn't support
+ * fs_context.
+ */
+static int legacy_init_fs_context(struct fs_context *fc)
+{
+	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+	if (!fc->fs_private)
+		return -ENOMEM;
+	fc->ops = &legacy_fs_context_ops;
+	return 0;
+}
+
+int parse_monolithic_mount_data(struct fs_context *fc, void *data)
+{
+	int (*monolithic_mount_data)(struct fs_context *, void *);
+
+	monolithic_mount_data = fc->ops->parse_monolithic;
+	if (!monolithic_mount_data)
+		monolithic_mount_data = generic_parse_monolithic;
+
+	return monolithic_mount_data(fc, data);
+}
+
+/*
+ * Clean up a context after performing an action on it and put it into a state
+ * from where it can be used to reconfigure a superblock.
+ *
+ * Note that here we do only the parts that can't fail; the rest is in
+ * finish_clean_context() below and in between those fs_context is marked
+ * FS_CONTEXT_AWAITING_RECONF.  The reason for splitup is that after
+ * successful mount or remount we need to report success to userland.
+ * Trying to do full reinit (for the sake of possible subsequent remount)
+ * and failing to allocate memory would've put us into a nasty situation.
+ * So here we only discard the old state and reinitialization is left
+ * until we actually try to reconfigure.
+ */
+void vfs_clean_context(struct fs_context *fc)
+{
+	if (fc->need_free && fc->ops && fc->ops->free)
+		fc->ops->free(fc);
+	fc->need_free = false;
+	fc->fs_private = NULL;
+	fc->s_fs_info = NULL;
+	fc->sb_flags = 0;
+	security_free_mnt_opts(&fc->security);
+	kfree(fc->source);
+	fc->source = NULL;
+
+	fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
+	fc->phase = FS_CONTEXT_AWAITING_RECONF;
+}
+
+int finish_clean_context(struct fs_context *fc)
+{
+	int error;
+
+	if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
+		return 0;
+
+	if (fc->fs_type->init_fs_context)
+		error = fc->fs_type->init_fs_context(fc);
+	else
+		error = legacy_init_fs_context(fc);
+	if (unlikely(error)) {
+		fc->phase = FS_CONTEXT_FAILED;
+		return error;
+	}
+	fc->need_free = true;
+	fc->phase = FS_CONTEXT_RECONF_PARAMS;
+	return 0;
+}
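
From a filesystem's point of view, the context machinery above reduces to: implement ->init_fs_context to allocate private state and install an fs_context_operations table; the core then drives ->parse_param (or ->parse_monolithic) for each option and ->get_tree to produce the root. A hedged skeleton for the illustrative examplefs from earlier; parse_param is sketched after fs/fs_parser.c below, and a real get_tree would call a helper such as get_tree_nodev():

	#include <linux/fs_context.h>
	#include <linux/slab.h>

	struct examplefs_fs_context {
		unsigned int	mode;
		bool		quiet;
	};

	static int examplefs_parse_param(struct fs_context *fc,
					 struct fs_parameter *param);

	static int examplefs_get_tree(struct fs_context *fc)
	{
		return -ENOSYS;		/* placeholder: would set fc->root */
	}

	static void examplefs_free_fc(struct fs_context *fc)
	{
		kfree(fc->fs_private);
	}

	static const struct fs_context_operations examplefs_context_ops = {
		.free		= examplefs_free_fc,
		.parse_param	= examplefs_parse_param,
		.get_tree	= examplefs_get_tree,
	};

	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		struct examplefs_fs_context *ctx;

		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return -ENOMEM;
		fc->fs_private = ctx;
		fc->ops = &examplefs_context_ops;
		return 0;
	}
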
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
new file mode 100644
index 0000000..d1930ad
--- /dev/null
+++ b/fs/fs_parser.c
@@ -0,0 +1,458 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Filesystem parameter parser.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+static const struct constant_table bool_names[] = {
+	{ "0",		false },
+	{ "1",		true },
+	{ "false",	false },
+	{ "no",		false },
+	{ "true",	true },
+	{ "yes",	true },
+};
+
+/**
+ * __lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @tbl_size: The size of the table.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int __lookup_constant(const struct constant_table *tbl, size_t tbl_size,
+		      const char *name, int not_found)
+{
+	unsigned int i;
+
+	for (i = 0; i < tbl_size; i++)
+		if (strcmp(name, tbl[i].name) == 0)
+			return tbl[i].value;
+
+	return not_found;
+}
+EXPORT_SYMBOL(__lookup_constant);
+
+static const struct fs_parameter_spec *fs_lookup_key(
+	const struct fs_parameter_description *desc,
+	const char *name)
+{
+	const struct fs_parameter_spec *p;
+
+	if (!desc->specs)
+		return NULL;
+
+	for (p = desc->specs; p->name; p++)
+		if (strcmp(p->name, name) == 0)
+			return p;
+
+	return NULL;
+}
+
+/*
+ * fs_parse - Parse a filesystem configuration parameter
+ * @fc: The filesystem context to log errors through.
+ * @desc: The parameter description to use.
+ * @param: The parameter.
+ * @result: Where to place the result of the parse
+ *
+ * Parse a filesystem configuration parameter and attempt a conversion for a
+ * simple parameter for which this is requested.  If successful, the
+ * parameter ID is returned and any converted value is placed into an
+ * appropriate member of the union in @result.
+ *
+ * The function returns the parameter number if the parameter was matched,
+ * -ENOPARAM if it wasn't matched, and -EINVAL if the parameter was matched
+ * but the value couldn't be converted.
+ */
+int fs_parse(struct fs_context *fc,
+	     const struct fs_parameter_description *desc,
+	     struct fs_parameter *param,
+	     struct fs_parse_result *result)
+{
+	const struct fs_parameter_spec *p;
+	const struct fs_parameter_enum *e;
+	int ret = -ENOPARAM, b;
+
+	result->has_value = !!param->string;
+	result->negated = false;
+	result->uint_64 = 0;
+
+	p = fs_lookup_key(desc, param->key);
+	if (!p) {
+		/* The key wasn't found directly; if it looks like "noxxx",
+		 * see if "xxx" takes the "no"-form negative - but only if
+		 * there wasn't a value.
+		 */
+		if (result->has_value)
+			goto unknown_parameter;
+		if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2])
+			goto unknown_parameter;
+
+		p = fs_lookup_key(desc, param->key + 2);
+		if (!p)
+			goto unknown_parameter;
+		if (!(p->flags & fs_param_neg_with_no))
+			goto unknown_parameter;
+		result->boolean = false;
+		result->negated = true;
+	}
+
+	if (p->flags & fs_param_deprecated)
+		warnf(fc, "%s: Deprecated parameter '%s'",
+		      desc->name, param->key);
+
+	if (result->negated)
+		goto okay;
+
+	/* Certain parameter types only take a string and convert it. */
+	switch (p->type) {
+	case __fs_param_wasnt_defined:
+		return -EINVAL;
+	case fs_param_is_u32:
+	case fs_param_is_u32_octal:
+	case fs_param_is_u32_hex:
+	case fs_param_is_s32:
+	case fs_param_is_u64:
+	case fs_param_is_enum:
+	case fs_param_is_string:
+		if (param->type != fs_value_is_string)
+			goto bad_value;
+		if (!result->has_value) {
+			if (p->flags & fs_param_v_optional)
+				goto okay;
+			goto bad_value;
+		}
+		/* Fall through */
+	default:
+		break;
+	}
+
+	/* Try to turn the type we were given into the type desired by the
+	 * parameter and give an error if we can't.
+	 */
+	switch (p->type) {
+	case fs_param_is_flag:
+		if (param->type != fs_value_is_flag &&
+		    (param->type != fs_value_is_string || result->has_value))
+			return invalf(fc, "%s: Unexpected value for '%s'",
+				      desc->name, param->key);
+		result->boolean = true;
+		goto okay;
+
+	case fs_param_is_bool:
+		switch (param->type) {
+		case fs_value_is_flag:
+			result->boolean = true;
+			goto okay;
+		case fs_value_is_string:
+			if (param->size == 0) {
+				result->boolean = true;
+				goto okay;
+			}
+			b = lookup_constant(bool_names, param->string, -1);
+			if (b == -1)
+				goto bad_value;
+			result->boolean = b;
+			goto okay;
+		default:
+			goto bad_value;
+		}
+
+	case fs_param_is_u32:
+		ret = kstrtouint(param->string, 0, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_u32_octal:
+		ret = kstrtouint(param->string, 8, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_u32_hex:
+		ret = kstrtouint(param->string, 16, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_s32:
+		ret = kstrtoint(param->string, 0, &result->int_32);
+		goto maybe_okay;
+	case fs_param_is_u64:
+		ret = kstrtoull(param->string, 0, &result->uint_64);
+		goto maybe_okay;
+
+	case fs_param_is_enum:
+		for (e = desc->enums; e->name[0]; e++) {
+			if (e->opt == p->opt &&
+			    strcmp(e->name, param->string) == 0) {
+				result->uint_32 = e->value;
+				goto okay;
+			}
+		}
+		goto bad_value;
+
+	case fs_param_is_string:
+		goto okay;
+	case fs_param_is_blob:
+		if (param->type != fs_value_is_blob)
+			goto bad_value;
+		goto okay;
+
+	case fs_param_is_fd: {
+		switch (param->type) {
+		case fs_value_is_string:
+			if (!result->has_value)
+				goto bad_value;
+
+			ret = kstrtouint(param->string, 0, &result->uint_32);
+			break;
+		case fs_value_is_file:
+			result->uint_32 = param->dirfd;
+			ret = 0;
+			break;
+		default:
+			goto bad_value;
+		}
+
+		if (result->uint_32 > INT_MAX)
+			goto bad_value;
+		goto maybe_okay;
+	}
+
+	case fs_param_is_blockdev:
+	case fs_param_is_path:
+		goto okay;
+	default:
+		BUG();
+	}
+
+maybe_okay:
+	if (ret < 0)
+		goto bad_value;
+okay:
+	return p->opt;
+
+bad_value:
+	return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key);
+unknown_parameter:
+	return -ENOPARAM;
+}
+EXPORT_SYMBOL(fs_parse);
+
+/**
+ * fs_lookup_param - Look up a path referred to by a parameter
+ * @fc: The filesystem context to log errors through.
+ * @param: The parameter.
+ * @want_bdev: T if want a blockdev
+ * @_path: The result of the lookup
+ */
+int fs_lookup_param(struct fs_context *fc,
+		    struct fs_parameter *param,
+		    bool want_bdev,
+		    struct path *_path)
+{
+	struct filename *f;
+	unsigned int flags = 0;
+	bool put_f;
+	int ret;
+
+	switch (param->type) {
+	case fs_value_is_string:
+		f = getname_kernel(param->string);
+		if (IS_ERR(f))
+			return PTR_ERR(f);
+		put_f = true;
+		break;
+	case fs_value_is_filename_empty:
+		flags = LOOKUP_EMPTY;
+		/* Fall through */
+	case fs_value_is_filename:
+		f = param->name;
+		put_f = false;
+		break;
+	default:
+		return invalf(fc, "%s: not usable as path", param->key);
+	}
+
+	f->refcnt++; /* filename_lookup() drops our ref. */
+	ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
+	if (ret < 0) {
+		errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
+		goto out;
+	}
+
+	if (want_bdev &&
+	    !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
+		path_put(_path);
+		_path->dentry = NULL;
+		_path->mnt = NULL;
+		errorf(fc, "%s: Non-blockdev passed as '%s'",
+		       param->key, f->name);
+		ret = -ENOTBLK;
+	}
+
+out:
+	if (put_f)
+		putname(f);
+	return ret;
+}
+EXPORT_SYMBOL(fs_lookup_param);
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+/**
+ * validate_constant_table - Validate a constant table
+ * @name: Name to use in reporting
+ * @tbl: The constant table to validate.
+ * @tbl_size: The size of the table.
+ * @low: The lowest permissible value.
+ * @high: The highest permissible value.
+ * @special: One special permissible value outside of the range.
+ */
+bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+			     int low, int high, int special)
+{
+	size_t i;
+	bool good = true;
+
+	if (tbl_size == 0) {
+		pr_warn("VALIDATE C-TBL: Empty\n");
+		return true;
+	}
+
+	for (i = 0; i < tbl_size; i++) {
+		if (!tbl[i].name) {
+			pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
+			good = false;
+		} else if (i > 0 && tbl[i - 1].name) {
+			int c = strcmp(tbl[i-1].name, tbl[i].name);
+
+			if (c == 0) {
+				pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
+				       i, tbl[i].name);
+				good = false;
+			}
+			if (c > 0) {
+				pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
+				       i, tbl[i-1].name, tbl[i].name);
+				good = false;
+			}
+		}
+
+		if (tbl[i].value != special &&
+		    (tbl[i].value < low || tbl[i].value > high)) {
+			pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
+			       i, tbl[i].name, tbl[i].value, low, high);
+			good = false;
+		}
+	}
+
+	return good;
+}
+
+/**
+ * fs_validate_description - Validate a parameter description
+ * @desc: The parameter description to validate.
+ */
+bool fs_validate_description(const struct fs_parameter_description *desc)
+{
+	const struct fs_parameter_spec *param, *p2;
+	const struct fs_parameter_enum *e;
+	const char *name = desc->name;
+	unsigned int nr_params = 0;
+	bool good = true, enums = false;
+
+	pr_notice("*** VALIDATE %s ***\n", name);
+
+	if (!name[0]) {
+		pr_err("VALIDATE Parser: No name\n");
+		name = "Unknown";
+		good = false;
+	}
+
+	if (desc->specs) {
+		for (param = desc->specs; param->name; param++) {
+			enum fs_parameter_type t = param->type;
+
+			/* Check that the type is in range */
+			if (t == __fs_param_wasnt_defined ||
+			    t >= nr__fs_parameter_type) {
+				pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n",
+				       name, param->name, t);
+				good = false;
+			} else if (t == fs_param_is_enum) {
+				enums = true;
+			}
+
+			/* Check for duplicate parameter names */
+			for (p2 = desc->specs; p2 < param; p2++) {
+				if (strcmp(param->name, p2->name) == 0) {
+					pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
+					       name, param->name);
+					good = false;
+				}
+			}
+		}
+
+		nr_params = param - desc->specs;
+	}
+
+	if (desc->enums) {
+		if (!nr_params) {
+			pr_err("VALIDATE %s: Enum table but no parameters\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+		if (!enums) {
+			pr_err("VALIDATE %s: Enum table but no enum-type values\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+
+		for (e = desc->enums; e->name[0]; e++) {
+			/* Check that all entries in the enum table have at
+			 * least one parameter that uses them.
+			 */
+			for (param = desc->specs; param->name; param++) {
+				if (param->opt == e->opt &&
+				    param->type != fs_param_is_enum) {
+					pr_err("VALIDATE %s: e[%tu] enum val for %s\n",
+					       name, e - desc->enums, param->name);
+					good = false;
+				}
+			}
+		}
+
+		/* Check that all enum-type parameters have at least one enum
+		 * value in the enum table.
+		 */
+		for (param = desc->specs; param->name; param++) {
+			if (param->type != fs_param_is_enum)
+				continue;
+			for (e = desc->enums; e->name[0]; e++)
+				if (e->opt == param->opt)
+					break;
+			if (!e->name[0]) {
+				pr_err("VALIDATE %s: PARAM[%s] enum with no values\n",
+				       name, param->name);
+				good = false;
+			}
+		}
+	} else {
+		if (enums) {
+			pr_err("VALIDATE %s: enum-type values, but no enum table\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+	}
+
+no_enums:
+	return good;
+}
+#endif /* CONFIG_VALIDATE_FS_PARSER */
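
Closing the loop on the illustrative examplefs: its ->parse_param just forwards to fs_parse() with the parameter description shown in the register_filesystem() sketch earlier, then switches on the returned opt; converted values arrive through the union in fs_parse_result:

	static int examplefs_parse_param(struct fs_context *fc,
					 struct fs_parameter *param)
	{
		struct examplefs_fs_context *ctx = fc->fs_private;
		struct fs_parse_result result;
		int opt;

		opt = fs_parse(fc, &examplefs_parameters, param, &result);
		if (opt < 0)
			return opt;	/* -ENOPARAM or -EINVAL, per above */

		switch (opt) {
		case Opt_mode:
			ctx->mode = result.uint_32;	/* parsed as octal */
			break;
		case Opt_quiet:
			ctx->quiet = true;	/* bare flag: presence only */
			break;
		}
		return 0;
	}
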
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index a6497cf..47ef3c7 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -19,18 +19,12 @@
 	spin_unlock_irq(&pin->wait.lock);
 }
 
-void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
-{
-	spin_lock(&pin_lock);
-	if (p)
-		hlist_add_head(&pin->s_list, p);
-	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
-	spin_unlock(&pin_lock);
-}
-
 void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 {
-	pin_insert_group(pin, m, &m->mnt_sb->s_pins);
+	spin_lock(&pin_lock);
+	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+	spin_unlock(&pin_lock);
 }
 
 void pin_kill(struct fs_pin *p)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index be02507..ca639ed 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/export.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 0000000..78365e5
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+	[FT_UNKNOWN]	= DT_UNKNOWN,
+	[FT_REG_FILE]	= DT_REG,
+	[FT_DIR]	= DT_DIR,
+	[FT_CHRDEV]	= DT_CHR,
+	[FT_BLKDEV]	= DT_BLK,
+	[FT_FIFO]	= DT_FIFO,
+	[FT_SOCK]	= DT_SOCK,
+	[FT_SYMLINK]	= DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN		- Unknown type
+ * * DT_FIFO		- FIFO
+ * * DT_CHR		- Character device
+ * * DT_DIR		- Directory
+ * * DT_BLK		- Block device
+ * * DT_REG		- Regular file
+ * * DT_LNK		- Symbolic link
+ * * DT_SOCK		- Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+	if (filetype >= FT_MAX)
+		return DT_UNKNOWN;
+
+	return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+	[DT_REG]	= FT_REG_FILE,
+	[DT_DIR]	= FT_DIR,
+	[DT_LNK]	= FT_SYMLINK,
+	[DT_CHR]	= FT_CHRDEV,
+	[DT_BLK]	= FT_BLKDEV,
+	[DT_FIFO]	= FT_FIFO,
+	[DT_SOCK]	= FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN		- Unknown type
+ * * FT_REG_FILE	- Regular file
+ * * FT_DIR		- Directory
+ * * FT_CHRDEV		- Character device
+ * * FT_BLKDEV		- Block device
+ * * FT_FIFO		- FIFO
+ * * FT_SOCK		- Local-domain socket
+ * * FT_SYMLINK		- Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+	return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN		- Unknown type
+ * * DT_FIFO		- FIFO
+ * * DT_CHR		- Character device
+ * * DT_DIR		- Directory
+ * * DT_BLK		- Block device
+ * * DT_REG		- Regular file
+ * * DT_LNK		- Symbolic link
+ * * DT_SOCK		- Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+	return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
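
As a usage note, a filesystem's readdir path might combine these helpers when emitting directory entries from cached inodes. The sketch below is hypothetical and only assumes the dir_emit() helper from <linux/fs.h>.

	static bool example_emit_dirent(struct dir_context *ctx,
					struct inode *inode,
					const char *name, int namelen)
	{
		/* Convert the inode mode to the DT_* value getdents() reports. */
		unsigned char d_type = fs_umode_to_dtype(inode->i_mode);

		return dir_emit(ctx, name, namelen, inode->i_ino, d_type);
	}
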
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 3f6dfa9..506c5e6 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index cdcb376..f78793f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache cache handling
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL CACHE
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index c550512..0ce3965 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* netfs cookie management
  *
  * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
  * See Documentation/filesystems/caching/netfs-api.txt for more information on
  * the netfs API.
  */
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index aa46e48..09ed879 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Filesystem index definition
  *
  * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL CACHE
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index 9a13e9e..4e5beea 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache latency histogram
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL THREAD
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index d620902..9616af3 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* Internal definitions for FS-Cache
  *
  * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 30ad89d..59c2494 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* General filesystem local caching manager
  *
  * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL CACHE
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index c2f6054..cce9221 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache netfs (client) registration
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 43e6e28..72ebfe5 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Global fscache object list maintainer and viewer
  *
  * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 6d9cb17..cfeba83 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache object state machine handler
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
  * See Documentation/filesystems/caching/object.txt for a description of the
  * object state machine and the in-kernel representations.
  */
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 8d26579..1a22a55 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache worker operation management routines
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
  * See Documentation/filesystems/caching/operations.txt
  */
 
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 111349f..26af6fd 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* Cache page management and data I/O routines
  *
  * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL PAGE
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 49a8c90..5523446 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache statistics viewing interface
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL OPERATION
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 00564a1..a5aa93e 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* FS-Cache statistics
  *
  * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #define FSCACHE_DEBUG_LEVEL THREAD
diff --git a/fs/fsopen.c b/fs/fsopen.c
new file mode 100644
index 0000000..043ffa8
--- /dev/null
+++ b/fs/fsopen.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Filesystem access-by-fd.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/anon_inodes.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <uapi/linux/mount.h>
+#include "internal.h"
+#include "mount.h"
+
+/*
+ * Allow the user to read back any error, warning or informational messages.
+ */
+static ssize_t fscontext_read(struct file *file,
+			      char __user *_buf, size_t len, loff_t *pos)
+{
+	struct fs_context *fc = file->private_data;
+	struct fc_log *log = fc->log;
+	unsigned int logsize = ARRAY_SIZE(log->buffer);
+	ssize_t ret;
+	char *p;
+	bool need_free;
+	int index, n;
+
+	ret = mutex_lock_interruptible(&fc->uapi_mutex);
+	if (ret < 0)
+		return ret;
+
+	if (log->head == log->tail) {
+		mutex_unlock(&fc->uapi_mutex);
+		return -ENODATA;
+	}
+
+	index = log->tail & (logsize - 1);
+	p = log->buffer[index];
+	need_free = log->need_free & (1 << index);
+	log->buffer[index] = NULL;
+	log->need_free &= ~(1 << index);
+	log->tail++;
+	mutex_unlock(&fc->uapi_mutex);
+
+	ret = -EMSGSIZE;
+	n = strlen(p);
+	if (n > len)
+		goto err_free;
+	ret = -EFAULT;
+	if (copy_to_user(_buf, p, n) != 0)
+		goto err_free;
+	ret = n;
+
+err_free:
+	if (need_free)
+		kfree(p);
+	return ret;
+}
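
From userspace this log is drained by read()ing the fsopen()/fspick() fd. A hedged sketch (assuming <stdio.h> and <unistd.h>), relying on the documented behaviour that each read() returns one message, typically prefixed "e " (error), "w " (warning) or "i " (info), with -ENODATA signalling an empty log:

	static void drain_fscontext_log(int fsfd)
	{
		char buf[1024];
		ssize_t n;

		/* One message per read(); stop on -ENODATA or any error. */
		while ((n = read(fsfd, buf, sizeof(buf))) > 0)
			fprintf(stderr, "mount log: %.*s\n", (int)n, buf);
	}
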
+
+static int fscontext_release(struct inode *inode, struct file *file)
+{
+	struct fs_context *fc = file->private_data;
+
+	if (fc) {
+		file->private_data = NULL;
+		put_fs_context(fc);
+	}
+	return 0;
+}
+
+const struct file_operations fscontext_fops = {
+	.read		= fscontext_read,
+	.release	= fscontext_release,
+	.llseek		= no_llseek,
+};
+
+/*
+ * Attach a filesystem context to a file and an fd.
+ */
+static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags)
+{
+	int fd;
+
+	fd = anon_inode_getfd("[fscontext]", &fscontext_fops, fc,
+			      O_RDWR | o_flags);
+	if (fd < 0)
+		put_fs_context(fc);
+	return fd;
+}
+
+static int fscontext_alloc_log(struct fs_context *fc)
+{
+	fc->log = kzalloc(sizeof(*fc->log), GFP_KERNEL);
+	if (!fc->log)
+		return -ENOMEM;
+	refcount_set(&fc->log->usage, 1);
+	fc->log->owner = fc->fs_type->owner;
+	return 0;
+}
+
+/*
+ * Open a filesystem by name so that it can be configured for mounting.
+ *
+ * We are allowed to specify a container in which the filesystem will be
+ * opened, thereby indicating which namespaces will be used (notably, which
+ * network namespace will be used for network filesystems).
+ */
+SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
+{
+	struct file_system_type *fs_type;
+	struct fs_context *fc;
+	const char *fs_name;
+	int ret;
+
+	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (flags & ~FSOPEN_CLOEXEC)
+		return -EINVAL;
+
+	fs_name = strndup_user(_fs_name, PAGE_SIZE);
+	if (IS_ERR(fs_name))
+		return PTR_ERR(fs_name);
+
+	fs_type = get_fs_type(fs_name);
+	kfree(fs_name);
+	if (!fs_type)
+		return -ENODEV;
+
+	fc = fs_context_for_mount(fs_type, 0);
+	put_filesystem(fs_type);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	fc->phase = FS_CONTEXT_CREATE_PARAMS;
+
+	ret = fscontext_alloc_log(fc);
+	if (ret < 0)
+		goto err_fc;
+
+	return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0);
+
+err_fc:
+	put_fs_context(fc);
+	return ret;
+}
+
+/*
+ * Pick a superblock into a context for reconfiguration.
+ */
+SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags)
+{
+	struct fs_context *fc;
+	struct path target;
+	unsigned int lookup_flags;
+	int ret;
+
+	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((flags & ~(FSPICK_CLOEXEC |
+		       FSPICK_SYMLINK_NOFOLLOW |
+		       FSPICK_NO_AUTOMOUNT |
+		       FSPICK_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	if (flags & FSPICK_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & FSPICK_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & FSPICK_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	ret = user_path_at(dfd, path, lookup_flags, &target);
+	if (ret < 0)
+		goto err;
+
+	ret = -EINVAL;
+	if (target.mnt->mnt_root != target.dentry)
+		goto err_path;
+
+	fc = fs_context_for_reconfigure(target.dentry, 0, 0);
+	if (IS_ERR(fc)) {
+		ret = PTR_ERR(fc);
+		goto err_path;
+	}
+
+	fc->phase = FS_CONTEXT_RECONF_PARAMS;
+
+	ret = fscontext_alloc_log(fc);
+	if (ret < 0)
+		goto err_fc;
+
+	path_put(&target);
+	return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0);
+
+err_fc:
+	put_fs_context(fc);
+err_path:
+	path_put(&target);
+err:
+	return ret;
+}
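
A hedged userspace sketch of the reconfiguration flow fspick() enables, using raw syscall() since libc wrappers may not exist; the __NR_* numbers come from <sys/syscall.h> with sufficiently new kernel headers, and the mount point is illustrative:

	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/mount.h>

	static int remount_ro(const char *mountpoint)
	{
		int fsfd = syscall(__NR_fspick, AT_FDCWD, mountpoint,
				   FSPICK_CLOEXEC);

		if (fsfd < 0)
			return -1;
		/* Queue the "ro" flag, then apply it to the picked superblock. */
		syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
		return syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_RECONFIGURE,
			       NULL, NULL, 0);
	}
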
+
+/*
+ * Check the state and apply the configuration.  Note that this function is
+ * allowed to 'steal' the value by setting param->xxx to NULL before returning.
+ */
+static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
+			       struct fs_parameter *param)
+{
+	struct super_block *sb;
+	int ret;
+
+	ret = finish_clean_context(fc);
+	if (ret)
+		return ret;
+	switch (cmd) {
+	case FSCONFIG_CMD_CREATE:
+		if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+			return -EBUSY;
+		if (!mount_capable(fc))
+			return -EPERM;
+		fc->phase = FS_CONTEXT_CREATING;
+		ret = vfs_get_tree(fc);
+		if (ret)
+			break;
+		sb = fc->root->d_sb;
+		ret = security_sb_kern_mount(sb);
+		if (unlikely(ret)) {
+			fc_drop_locked(fc);
+			break;
+		}
+		up_write(&sb->s_umount);
+		fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+		return 0;
+	case FSCONFIG_CMD_RECONFIGURE:
+		if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
+			return -EBUSY;
+		fc->phase = FS_CONTEXT_RECONFIGURING;
+		sb = fc->root->d_sb;
+		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			break;
+		}
+		down_write(&sb->s_umount);
+		ret = reconfigure_super(fc);
+		up_write(&sb->s_umount);
+		if (ret)
+			break;
+		vfs_clean_context(fc);
+		return 0;
+	default:
+		if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+		    fc->phase != FS_CONTEXT_RECONF_PARAMS)
+			return -EBUSY;
+
+		return vfs_parse_fs_param(fc, param);
+	}
+	fc->phase = FS_CONTEXT_FAILED;
+	return ret;
+}
+
+/**
+ * sys_fsconfig - Set parameters and trigger actions on a context
+ * @fd: The filesystem context to act upon
+ * @cmd: The action to take
+ * @_key: Where appropriate, the parameter key to set
+ * @_value: Where appropriate, the parameter value to set
+ * @aux: Additional information for the value
+ *
+ * This system call is used to set parameters on a context, including
+ * superblock settings, data source and security labelling.
+ *
+ * Actions include triggering the creation of a superblock and the
+ * reconfiguration of the superblock attached to the specified context.
+ *
+ * When setting a parameter, @cmd indicates the type of value being proposed
+ * and @_key indicates the parameter to be altered.
+ *
+ * @_value and @aux are used to specify the value, should a value be required:
+ *
+ * (*) fsconfig_set_flag: No value is specified.  The parameter must be boolean
+ *     in nature.  The key may be prefixed with "no" to invert the
+ *     setting. @_value must be NULL and @aux must be 0.
+ *
+ * (*) fsconfig_set_string: A string value is specified.  The parameter can be
+ *     expecting boolean, integer, string or take a path.  A conversion to an
+ *     appropriate type will be attempted (which may include looking up as a
+ *     path).  @_value points to a NUL-terminated string and @aux must be 0.
+ *
+ * (*) fsconfig_set_binary: A binary blob is specified.  @_value points to the
+ *     blob and @aux indicates its size.  The parameter must be expecting a
+ *     blob.
+ *
+ * (*) fsconfig_set_path: A non-empty path is specified.  The parameter must be
+ *     expecting a path object.  @_value points to a NUL-terminated string that
+ *     is the path and @aux is a file descriptor at which to start a relative
+ *     lookup or AT_FDCWD.
+ *
+ * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH
+ *     implied.
+ *
+ * (*) fsconfig_set_fd: An open file descriptor is specified.  @_value must be
+ *     NULL and @aux indicates the file descriptor.
+ */
+SYSCALL_DEFINE5(fsconfig,
+		int, fd,
+		unsigned int, cmd,
+		const char __user *, _key,
+		const void __user *, _value,
+		int, aux)
+{
+	struct fs_context *fc;
+	struct fd f;
+	int ret;
+
+	struct fs_parameter param = {
+		.type	= fs_value_is_undefined,
+	};
+
+	if (fd < 0)
+		return -EINVAL;
+
+	switch (cmd) {
+	case FSCONFIG_SET_FLAG:
+		if (!_key || _value || aux)
+			return -EINVAL;
+		break;
+	case FSCONFIG_SET_STRING:
+		if (!_key || !_value || aux)
+			return -EINVAL;
+		break;
+	case FSCONFIG_SET_BINARY:
+		if (!_key || !_value || aux <= 0 || aux > 1024 * 1024)
+			return -EINVAL;
+		break;
+	case FSCONFIG_SET_PATH:
+	case FSCONFIG_SET_PATH_EMPTY:
+		if (!_key || !_value || (aux != AT_FDCWD && aux < 0))
+			return -EINVAL;
+		break;
+	case FSCONFIG_SET_FD:
+		if (!_key || _value || aux < 0)
+			return -EINVAL;
+		break;
+	case FSCONFIG_CMD_CREATE:
+	case FSCONFIG_CMD_RECONFIGURE:
+		if (_key || _value || aux)
+			return -EINVAL;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	ret = -EINVAL;
+	if (f.file->f_op != &fscontext_fops)
+		goto out_f;
+
+	fc = f.file->private_data;
+	if (fc->ops == &legacy_fs_context_ops) {
+		switch (cmd) {
+		case FSCONFIG_SET_BINARY:
+		case FSCONFIG_SET_PATH:
+		case FSCONFIG_SET_PATH_EMPTY:
+		case FSCONFIG_SET_FD:
+			ret = -EOPNOTSUPP;
+			goto out_f;
+		}
+	}
+
+	if (_key) {
+		param.key = strndup_user(_key, 256);
+		if (IS_ERR(param.key)) {
+			ret = PTR_ERR(param.key);
+			goto out_f;
+		}
+	}
+
+	switch (cmd) {
+	case FSCONFIG_SET_FLAG:
+		param.type = fs_value_is_flag;
+		break;
+	case FSCONFIG_SET_STRING:
+		param.type = fs_value_is_string;
+		param.string = strndup_user(_value, 256);
+		if (IS_ERR(param.string)) {
+			ret = PTR_ERR(param.string);
+			goto out_key;
+		}
+		param.size = strlen(param.string);
+		break;
+	case FSCONFIG_SET_BINARY:
+		param.type = fs_value_is_blob;
+		param.size = aux;
+		param.blob = memdup_user_nul(_value, aux);
+		if (IS_ERR(param.blob)) {
+			ret = PTR_ERR(param.blob);
+			goto out_key;
+		}
+		break;
+	case FSCONFIG_SET_PATH:
+		param.type = fs_value_is_filename;
+		param.name = getname_flags(_value, 0, NULL);
+		if (IS_ERR(param.name)) {
+			ret = PTR_ERR(param.name);
+			goto out_key;
+		}
+		param.dirfd = aux;
+		param.size = strlen(param.name->name);
+		break;
+	case FSCONFIG_SET_PATH_EMPTY:
+		param.type = fs_value_is_filename_empty;
+		param.name = getname_flags(_value, LOOKUP_EMPTY, NULL);
+		if (IS_ERR(param.name)) {
+			ret = PTR_ERR(param.name);
+			goto out_key;
+		}
+		param.dirfd = aux;
+		param.size = strlen(param.name->name);
+		break;
+	case FSCONFIG_SET_FD:
+		param.type = fs_value_is_file;
+		ret = -EBADF;
+		param.file = fget(aux);
+		if (!param.file)
+			goto out_key;
+		break;
+	default:
+		break;
+	}
+
+	ret = mutex_lock_interruptible(&fc->uapi_mutex);
+	if (ret == 0) {
+		ret = vfs_fsconfig_locked(fc, cmd, &param);
+		mutex_unlock(&fc->uapi_mutex);
+	}
+
+	/* Clean up our record of any value that we obtained from userspace.
+	 * Note that the value may have been stolen by the LSM or the
+	 * filesystem, in which case the value pointer will have been cleared.
+	 */
+	switch (cmd) {
+	case FSCONFIG_SET_STRING:
+	case FSCONFIG_SET_BINARY:
+		kfree(param.string);
+		break;
+	case FSCONFIG_SET_PATH:
+	case FSCONFIG_SET_PATH_EMPTY:
+		if (param.name)
+			putname(param.name);
+		break;
+	case FSCONFIG_SET_FD:
+		if (param.file)
+			fput(param.file);
+		break;
+	default:
+		break;
+	}
+out_key:
+	kfree(param.key);
+out_f:
+	fdput(f);
+	return ret;
+}
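
Putting the pieces together, a hedged end-to-end userspace sketch of the API this file implements. It assumes headers new enough to define the __NR_fsopen family and the FSOPEN_*/FSCONFIG_* constants; the tmpfs parameters and the /mnt target are illustrative, and fsmount()/move_mount() are implemented elsewhere in the series.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/mount.h>

	int main(void)
	{
		int fsfd, mfd;

		fsfd = syscall(__NR_fsopen, "tmpfs", FSOPEN_CLOEXEC);
		if (fsfd < 0) {
			perror("fsopen");
			return 1;
		}
		/* Set parameters, then trigger superblock creation. */
		syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "16M", 0);
		syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

		/* Turn the configured context into a detached mount ... */
		mfd = syscall(__NR_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
		if (mfd < 0) {
			perror("fsmount");
			return 1;
		}
		/* ... and attach it at /mnt. */
		if (syscall(__NR_move_mount, mfd, "", AT_FDCWD, "/mnt",
			    MOVE_MOUNT_F_EMPTY_PATH) < 0)
			perror("move_mount");
		return 0;
	}
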
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 76f09ce..0635cba 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config FUSE_FS
 	tristate "FUSE (Filesystem in Userspace) support"
 	select FS_POSIX_ACL
@@ -26,3 +27,14 @@
 
 	  If you want to develop or use a userspace character device
 	  based on CUSE, answer Y or M.
+
+config VIRTIO_FS
+	tristate "Virtio Filesystem"
+	depends on FUSE_FS
+	select VIRTIO
+	help
+	  The Virtio Filesystem allows guests to mount file systems from the
+	  host.
+
+	  If you want to share files between guests or with the host, answer Y
+	  or M.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 60da84a..3e8cebf 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -1,8 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the FUSE filesystem.
 #
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
+obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
 
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+virtiofs-y += virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 0b69465..c23f6f2 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/fs_context.h>
 
 #define FUSE_CTL_SUPER_MAGIC 0x65735543
 
@@ -35,7 +36,9 @@
 {
 	struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 	if (fc) {
-		fuse_abort_conn(fc, true);
+		if (fc->abort_err)
+			fc->aborted = true;
+		fuse_abort_conn(fc);
 		fuse_conn_put(fc);
 	}
 	return count;
@@ -107,7 +110,7 @@
 	if (!fc)
 		return 0;
 
-	val = fc->max_background;
+	val = READ_ONCE(fc->max_background);
 	fuse_conn_put(fc);
 
 	return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -125,7 +128,12 @@
 	if (ret > 0) {
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (fc) {
+			spin_lock(&fc->bg_lock);
 			fc->max_background = val;
+			fc->blocked = fc->num_background >= fc->max_background;
+			if (!fc->blocked)
+				wake_up(&fc->blocked_waitq);
+			spin_unlock(&fc->bg_lock);
 			fuse_conn_put(fc);
 		}
 	}
@@ -144,7 +152,7 @@
 	if (!fc)
 		return 0;
 
-	val = fc->congestion_threshold;
+	val = READ_ONCE(fc->congestion_threshold);
 	fuse_conn_put(fc);
 
 	return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -155,18 +163,31 @@
 						    size_t count, loff_t *ppos)
 {
 	unsigned uninitialized_var(val);
+	struct fuse_conn *fc;
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
 				    max_user_congthresh);
-	if (ret > 0) {
-		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
-		if (fc) {
-			fc->congestion_threshold = val;
-			fuse_conn_put(fc);
+	if (ret <= 0)
+		goto out;
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		goto out;
+
+	spin_lock(&fc->bg_lock);
+	fc->congestion_threshold = val;
+	if (fc->sb) {
+		if (fc->num_background < fc->congestion_threshold) {
+			clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+			clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+		} else {
+			set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+			set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
 		}
 	}
-
+	spin_unlock(&fc->bg_lock);
+	fuse_conn_put(fc);
+out:
 	return ret;
 }
 
@@ -297,7 +318,7 @@
 	drop_nlink(d_inode(fuse_control_sb->s_root));
 }
 
-static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
 {
 	static const struct tree_descr empty_descr = {""};
 	struct fuse_conn *fc;
@@ -323,10 +344,19 @@
 	return 0;
 }
 
-static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
-			int flags, const char *dev_name, void *raw_data)
+static int fuse_ctl_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
+	return get_tree_single(fc, fuse_ctl_fill_super);
+}
+
+static const struct fs_context_operations fuse_ctl_context_ops = {
+	.get_tree	= fuse_ctl_get_tree,
+};
+
+static int fuse_ctl_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &fuse_ctl_context_ops;
+	return 0;
 }
 
 static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -345,7 +375,7 @@
 static struct file_system_type fuse_ctl_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fusectl",
-	.mount		= fuse_ctl_mount,
+	.init_fs_context = fuse_ctl_init_fs_context,
 	.kill_sb	= fuse_ctl_kill_sb,
 };
 MODULE_ALIAS_FS("fusectl");
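
The fusectl conversion above follows the standard mount-API pattern; a minimal hedged sketch of the same shape for a hypothetical single-superblock filesystem ("examplefs" and its empty fill_super body are assumptions):

	static int examplefs_fill_super(struct super_block *sb,
					struct fs_context *fc)
	{
		/* Populate sb->s_root and friends here. */
		return 0;
	}

	static int examplefs_get_tree(struct fs_context *fc)
	{
		/* One superblock shared by all mounts, as with fusectl. */
		return get_tree_single(fc, examplefs_fill_super);
	}

	static const struct fs_context_operations examplefs_context_ops = {
		.get_tree	= examplefs_get_tree,
	};

	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		fc->ops = &examplefs_context_ops;
		return 0;
	}

	static struct file_system_type examplefs_type = {
		.owner		 = THIS_MODULE,
		.name		 = "examplefs",
		.init_fs_context = examplefs_init_fs_context,
		.kill_sb	 = kill_anon_super,
	};
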
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8f68181..00015d8 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * CUSE: Character device in Userspace
  *
  * Copyright (C) 2008-2009  SUSE Linux Products GmbH
  * Copyright (C) 2008-2009  Tejun Heo <tj@kernel.org>
  *
- * This file is released under the GPLv2.
- *
  * CUSE enables character devices to be implemented from userland much
  * like FUSE allows filesystems.  On initialization /dev/cuse is
  * created.  By opening the file and replying to the CUSE_INIT request
@@ -33,6 +32,8 @@
  * closed.
  */
 
+#define pr_fmt(fmt) "CUSE: " fmt
+
 #include <linux/fuse.h>
 #include <linux/cdev.h>
 #include <linux/device.h>
@@ -144,7 +145,7 @@
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
-	fuse_sync_release(ff, file->f_flags);
+	fuse_sync_release(NULL, ff, file->f_flags);
 	fuse_conn_put(fc);
 
 	return 0;
@@ -224,7 +225,7 @@
 		return 0;
 
 	if (end[-1] != '\0') {
-		printk(KERN_ERR "CUSE: info not properly terminated\n");
+		pr_err("info not properly terminated\n");
 		return -EINVAL;
 	}
 
@@ -241,7 +242,7 @@
 		key = strstrip(key);
 
 	if (!strlen(key)) {
-		printk(KERN_ERR "CUSE: zero length info key specified\n");
+		pr_err("zero length info key specified\n");
 		return -EINVAL;
 	}
 
@@ -281,12 +282,11 @@
 		if (strcmp(key, "DEVNAME") == 0)
 			devinfo->name = val;
 		else
-			printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
-			       key);
+			pr_warn("unknown device info \"%s\"\n", key);
 	}
 
 	if (!devinfo->name || !strlen(devinfo->name)) {
-		printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+		pr_err("DEVNAME unspecified\n");
 		return -EINVAL;
 	}
 
@@ -298,6 +298,14 @@
 	kfree(dev);
 }
 
+struct cuse_init_args {
+	struct fuse_args_pages ap;
+	struct cuse_init_in in;
+	struct cuse_init_out out;
+	struct page *page;
+	struct fuse_page_desc desc;
+};
+
 /**
  * cuse_process_init_reply - finish initializing CUSE channel
  *
@@ -305,21 +313,22 @@
  * required data structures for it.  Please read the comment at the
  * top of this file for high level overview.
  */
-static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+static void cuse_process_init_reply(struct fuse_conn *fc,
+				    struct fuse_args *args, int error)
 {
+	struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_args_pages *ap = &ia->ap;
 	struct cuse_conn *cc = fc_to_cc(fc), *pos;
-	struct cuse_init_out *arg = req->out.args[0].value;
-	struct page *page = req->pages[0];
+	struct cuse_init_out *arg = &ia->out;
+	struct page *page = ap->pages[0];
 	struct cuse_devinfo devinfo = { };
 	struct device *dev;
 	struct cdev *cdev;
 	dev_t devt;
 	int rc, i;
 
-	if (req->out.h.error ||
-	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+	if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11)
 		goto err;
-	}
 
 	fc->minor = arg->minor;
 	fc->max_read = max_t(unsigned, arg->max_read, 4096);
@@ -328,7 +337,7 @@
 	/* parse init reply */
 	cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
 
-	rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+	rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size,
 				&devinfo);
 	if (rc)
 		goto err;
@@ -340,7 +349,7 @@
 	else
 		rc = register_chrdev_region(devt, 1, devinfo.name);
 	if (rc) {
-		printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+		pr_err("failed to register chrdev region\n");
 		goto err;
 	}
 
@@ -395,7 +404,7 @@
 	dev_set_uevent_suppress(dev, 0);
 	kobject_uevent(&dev->kobj, KOBJ_ADD);
 out:
-	kfree(arg);
+	kfree(ia);
 	__free_page(page);
 	return;
 
@@ -407,62 +416,56 @@
 err_region:
 	unregister_chrdev_region(devt, 1);
 err:
-	fuse_abort_conn(fc, false);
+	fuse_abort_conn(fc);
 	goto out;
 }
 
 static int cuse_send_init(struct cuse_conn *cc)
 {
 	int rc;
-	struct fuse_req *req;
 	struct page *page;
 	struct fuse_conn *fc = &cc->fc;
-	struct cuse_init_in *arg;
-	void *outarg;
+	struct cuse_init_args *ia;
+	struct fuse_args_pages *ap;
 
 	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
 
-	req = fuse_get_req_for_background(fc, 1);
-	if (IS_ERR(req)) {
-		rc = PTR_ERR(req);
-		goto err;
-	}
-
 	rc = -ENOMEM;
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
-		goto err_put_req;
+		goto err;
 
-	outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
-	if (!outarg)
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+	if (!ia)
 		goto err_free_page;
 
-	arg = &req->misc.cuse_init_in;
-	arg->major = FUSE_KERNEL_VERSION;
-	arg->minor = FUSE_KERNEL_MINOR_VERSION;
-	arg->flags |= CUSE_UNRESTRICTED_IOCTL;
-	req->in.h.opcode = CUSE_INIT;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct cuse_init_in);
-	req->in.args[0].value = arg;
-	req->out.numargs = 2;
-	req->out.args[0].size = sizeof(struct cuse_init_out);
-	req->out.args[0].value = outarg;
-	req->out.args[1].size = CUSE_INIT_INFO_MAX;
-	req->out.argvar = 1;
-	req->out.argpages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = req->out.args[1].size;
-	req->num_pages = 1;
-	req->end = cuse_process_init_reply;
-	fuse_request_send_background(fc, req);
+	ap = &ia->ap;
+	ia->in.major = FUSE_KERNEL_VERSION;
+	ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+	ia->in.flags |= CUSE_UNRESTRICTED_IOCTL;
+	ap->args.opcode = CUSE_INIT;
+	ap->args.in_numargs = 1;
+	ap->args.in_args[0].size = sizeof(ia->in);
+	ap->args.in_args[0].value = &ia->in;
+	ap->args.out_numargs = 2;
+	ap->args.out_args[0].size = sizeof(ia->out);
+	ap->args.out_args[0].value = &ia->out;
+	ap->args.out_args[1].size = CUSE_INIT_INFO_MAX;
+	ap->args.out_argvar = 1;
+	ap->args.out_pages = 1;
+	ap->num_pages = 1;
+	ap->pages = &ia->page;
+	ap->descs = &ia->desc;
+	ia->page = page;
+	ia->desc.length = ap->args.out_args[1].size;
+	ap->args.end = cuse_process_init_reply;
 
-	return 0;
-
+	rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+	if (rc) {
+		kfree(ia);
 err_free_page:
-	__free_page(page);
-err_put_req:
-	fuse_put_request(fc, req);
+		__free_page(page);
+	}
 err:
 	return rc;
 }
@@ -503,9 +506,9 @@
 	 * Limit the cuse channel to requests that can
 	 * be represented in file->f_cred->user_ns.
 	 */
-	fuse_conn_init(&cc->fc, file->f_cred->user_ns);
+	fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
 
-	fud = fuse_dev_alloc(&cc->fc);
+	fud = fuse_dev_alloc_install(&cc->fc);
 	if (!fud) {
 		kfree(cc);
 		return -ENOMEM;
@@ -518,6 +521,7 @@
 	rc = cuse_send_init(cc);
 	if (rc) {
 		fuse_dev_free(fud);
+		fuse_conn_put(&cc->fc);
 		return rc;
 	}
 	file->private_data = fud;
@@ -586,7 +590,7 @@
 {
 	struct cuse_conn *cc = dev_get_drvdata(dev);
 
-	fuse_abort_conn(&cc->fc, false);
+	fuse_abort_conn(&cc->fc);
 	return count;
 }
 static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
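
The cuse_send_init() rewrite above illustrates the new asynchronous request pattern: embed struct fuse_args (or fuse_args_pages) in a per-request structure, set args->end, and recover the container in the completion callback. A hedged generic sketch, with the opcode left caller-supplied and fuse_forget_in standing in for any fixed-size input payload:

	struct example_async_args {
		struct fuse_args args;
		struct fuse_forget_in inarg;	/* illustrative payload */
	};

	static void example_end(struct fuse_conn *fc, struct fuse_args *args,
				int error)
	{
		struct example_async_args *ea =
			container_of(args, struct example_async_args, args);

		/* Inspect 'error', then release the whole request bundle. */
		kfree(ea);
	}

	static int example_send_async(struct fuse_conn *fc, uint32_t opcode)
	{
		struct example_async_args *ea;
		int err;

		ea = kzalloc(sizeof(*ea), GFP_KERNEL);
		if (!ea)
			return -ENOMEM;

		ea->args.opcode = opcode;
		ea->args.in_numargs = 1;
		ea->args.in_args[0].size = sizeof(ea->inarg);
		ea->args.in_args[0].value = &ea->inarg;
		ea->args.end = example_end;	/* runs on completion */

		err = fuse_simple_background(fc, &ea->args, GFP_KERNEL);
		if (err)
			kfree(ea);	/* ->end is not called on queue failure */
		return err;
	}
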
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index bf0da03..ed1abc9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,6 +25,10 @@
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
 
+/* Ordinary requests have even IDs, while interrupt IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
 static struct kmem_cache *fuse_req_cachep;
 
 static struct fuse_dev *fuse_get_dev(struct file *file)
@@ -36,75 +40,30 @@
 	return READ_ONCE(file->private_data);
 }
 
-static void fuse_request_init(struct fuse_req *req, struct page **pages,
-			      struct fuse_page_desc *page_descs,
-			      unsigned npages)
+static void fuse_request_init(struct fuse_req *req)
 {
-	memset(req, 0, sizeof(*req));
-	memset(pages, 0, sizeof(*pages) * npages);
-	memset(page_descs, 0, sizeof(*page_descs) * npages);
 	INIT_LIST_HEAD(&req->list);
 	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	refcount_set(&req->count, 1);
-	req->pages = pages;
-	req->page_descs = page_descs;
-	req->max_pages = npages;
 	__set_bit(FR_PENDING, &req->flags);
 }
 
-static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+static struct fuse_req *fuse_request_alloc(gfp_t flags)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
-	if (req) {
-		struct page **pages;
-		struct fuse_page_desc *page_descs;
+	struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
+	if (req)
+		fuse_request_init(req);
 
-		if (npages <= FUSE_REQ_INLINE_PAGES) {
-			pages = req->inline_pages;
-			page_descs = req->inline_page_descs;
-		} else {
-			pages = kmalloc_array(npages, sizeof(struct page *),
-					      flags);
-			page_descs =
-				kmalloc_array(npages,
-					      sizeof(struct fuse_page_desc),
-					      flags);
-		}
-
-		if (!pages || !page_descs) {
-			kfree(pages);
-			kfree(page_descs);
-			kmem_cache_free(fuse_req_cachep, req);
-			return NULL;
-		}
-
-		fuse_request_init(req, pages, page_descs, npages);
-	}
 	return req;
 }
 
-struct fuse_req *fuse_request_alloc(unsigned npages)
+static void fuse_request_free(struct fuse_req *req)
 {
-	return __fuse_request_alloc(npages, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(fuse_request_alloc);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
-{
-	return __fuse_request_alloc(npages, GFP_NOFS);
-}
-
-void fuse_request_free(struct fuse_req *req)
-{
-	if (req->pages != req->inline_pages) {
-		kfree(req->pages);
-		kfree(req->page_descs);
-	}
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
-void __fuse_get_request(struct fuse_req *req)
+static void __fuse_get_request(struct fuse_req *req)
 {
 	refcount_inc(&req->count);
 }
@@ -141,8 +100,9 @@
 	}
 }
 
-static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
-				       bool for_background)
+static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
 {
 	struct fuse_req *req;
 	int err;
@@ -165,7 +125,7 @@
 	if (fc->conn_error)
 		goto out;
 
-	req = fuse_request_alloc(npages);
+	req = fuse_request_alloc(GFP_KERNEL);
 	err = -ENOMEM;
 	if (!req) {
 		if (for_background)
@@ -193,97 +153,7 @@
 	return ERR_PTR(err);
 }
 
-struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
-{
-	return __fuse_get_req(fc, npages, false);
-}
-EXPORT_SYMBOL_GPL(fuse_get_req);
-
-struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
-					     unsigned npages)
-{
-	return __fuse_get_req(fc, npages, true);
-}
-EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
-
-/*
- * Return request in fuse_file->reserved_req.  However that may
- * currently be in use.  If that is the case, wait for it to become
- * available.
- */
-static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
-					 struct file *file)
-{
-	struct fuse_req *req = NULL;
-	struct fuse_file *ff = file->private_data;
-
-	do {
-		wait_event(fc->reserved_req_waitq, ff->reserved_req);
-		spin_lock(&fc->lock);
-		if (ff->reserved_req) {
-			req = ff->reserved_req;
-			ff->reserved_req = NULL;
-			req->stolen_file = get_file(file);
-		}
-		spin_unlock(&fc->lock);
-	} while (!req);
-
-	return req;
-}
-
-/*
- * Put stolen request back into fuse_file->reserved_req
- */
-static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
-{
-	struct file *file = req->stolen_file;
-	struct fuse_file *ff = file->private_data;
-
-	spin_lock(&fc->lock);
-	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
-	BUG_ON(ff->reserved_req);
-	ff->reserved_req = req;
-	wake_up_all(&fc->reserved_req_waitq);
-	spin_unlock(&fc->lock);
-	fput(file);
-}
-
-/*
- * Gets a requests for a file operation, always succeeds
- *
- * This is used for sending the FLUSH request, which must get to
- * userspace, due to POSIX locks which may need to be unlocked.
- *
- * If allocation fails due to OOM, use the reserved request in
- * fuse_file.
- *
- * This is very unlikely to deadlock accidentally, since the
- * filesystem should not have it's own file open.  If deadlock is
- * intentional, it can still be broken by "aborting" the filesystem.
- */
-struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
-					     struct file *file)
-{
-	struct fuse_req *req;
-
-	atomic_inc(&fc->num_waiting);
-	wait_event(fc->blocked_waitq, fc->initialized);
-	/* Matches smp_wmb() in fuse_set_initialized() */
-	smp_rmb();
-	req = fuse_request_alloc(0);
-	if (!req)
-		req = get_reserved_req(fc, file);
-
-	req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
-	req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
-	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
-
-	__set_bit(FR_WAITING, &req->flags);
-	__clear_bit(FR_BACKGROUND, &req->flags);
-	return req;
-}
-
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (refcount_dec_and_test(&req->count)) {
 		if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -291,10 +161,10 @@
 			 * We get here in the unlikely case that a background
 			 * request was allocated but not sent
 			 */
-			spin_lock(&fc->lock);
+			spin_lock(&fc->bg_lock);
 			if (!fc->blocked)
 				wake_up(&fc->blocked_waitq);
-			spin_unlock(&fc->lock);
+			spin_unlock(&fc->bg_lock);
 		}
 
 		if (test_bit(FR_WAITING, &req->flags)) {
@@ -302,15 +172,11 @@
 			fuse_drop_waiting(fc);
 		}
 
-		if (req->stolen_file)
-			put_reserved_req(fc, req);
-		else
-			fuse_request_free(req);
+		fuse_request_free(req);
 	}
 }
-EXPORT_SYMBOL_GPL(fuse_put_request);
 
-static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
 {
 	unsigned nbytes = 0;
 	unsigned i;
@@ -320,19 +186,47 @@
 
 	return nbytes;
 }
+EXPORT_SYMBOL_GPL(fuse_len_args);
 
-static u64 fuse_get_unique(struct fuse_iqueue *fiq)
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
 {
-	return ++fiq->reqctr;
+	fiq->reqctr += FUSE_REQ_ID_STEP;
+	return fiq->reqctr;
+}
+EXPORT_SYMBOL_GPL(fuse_get_unique);
+
+static unsigned int fuse_req_hash(u64 unique)
+{
+	return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
 }
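
A hedged illustration of the ID scheme these two helpers implement; the function below is hypothetical, and the caller of fuse_get_unique() is assumed to hold fiq->lock, as the in-tree callers do:

	static void example_show_id_scheme(struct fuse_iqueue *fiq)
	{
		u64 unique = fuse_get_unique(fiq);	/* even: 2, 4, 6, ... */
		u64 intr = unique | FUSE_INT_REQ_BIT;	/* odd interrupt ID */

		/* Both IDs select the same processing-queue hash bucket, so
		 * a reply to either can be matched to the original request.
		 */
		WARN_ON(fuse_req_hash(unique) != fuse_req_hash(intr));
	}
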
 
-static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
+/*
+ * A new request is available; wake fiq->waitq
+ */
+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	wake_up(&fiq->waitq);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	spin_unlock(&fiq->lock);
+}
+
+const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
+	.wake_forget_and_unlock		= fuse_dev_wake_and_unlock,
+	.wake_interrupt_and_unlock	= fuse_dev_wake_and_unlock,
+	.wake_pending_and_unlock	= fuse_dev_wake_and_unlock,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
+
+static void queue_request_and_unlock(struct fuse_iqueue *fiq,
+				     struct fuse_req *req)
+__releases(fiq->lock)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
-		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+		fuse_len_args(req->args->in_numargs,
+			      (struct fuse_arg *) req->args->in_args);
 	list_add_tail(&req->list, &fiq->pending);
-	wake_up_locked(&fiq->waitq);
-	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	fiq->ops->wake_pending_and_unlock(fiq);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -343,32 +237,31 @@
 	forget->forget_one.nodeid = nodeid;
 	forget->forget_one.nlookup = nlookup;
 
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (fiq->connected) {
 		fiq->forget_list_tail->next = forget;
 		fiq->forget_list_tail = forget;
-		wake_up_locked(&fiq->waitq);
-		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+		fiq->ops->wake_forget_and_unlock(fiq);
 	} else {
 		kfree(forget);
+		spin_unlock(&fiq->lock);
 	}
-	spin_unlock(&fiq->waitq.lock);
 }
 
 static void flush_bg_queue(struct fuse_conn *fc)
 {
+	struct fuse_iqueue *fiq = &fc->iq;
+
 	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
 		struct fuse_req *req;
-		struct fuse_iqueue *fiq = &fc->iq;
 
-		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
+		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request(fiq, req);
-		spin_unlock(&fiq->waitq.lock);
+		queue_request_and_unlock(fiq, req);
 	}
 }
 
@@ -380,20 +273,29 @@
  * the 'end' callback is called if given, else the reference to the
  * request is released
  */
-static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct fuse_iqueue *fiq = &fc->iq;
+	bool async;
 
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
 
-	spin_lock(&fiq->waitq.lock);
-	list_del_init(&req->intr_entry);
-	spin_unlock(&fiq->waitq.lock);
+	async = req->args->end;
+	/*
+	 * test_and_set_bit() implies smp_mb() between bit
+	 * changing and below intr_entry check. Pairs with
+	 * smp_mb() from queue_interrupt().
+	 */
+	if (!list_empty(&req->intr_entry)) {
+		spin_lock(&fiq->lock);
+		list_del_init(&req->intr_entry);
+		spin_unlock(&fiq->lock);
+	}
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
-		spin_lock(&fc->lock);
+		spin_lock(&fc->bg_lock);
 		clear_bit(FR_BACKGROUND, &req->flags);
 		if (fc->num_background == fc->max_background) {
 			fc->blocked = 0;
@@ -416,28 +318,45 @@
 		fc->num_background--;
 		fc->active_background--;
 		flush_bg_queue(fc);
-		spin_unlock(&fc->lock);
+		spin_unlock(&fc->bg_lock);
+	} else {
+		/* Wake up waiter sleeping in request_wait_answer() */
+		wake_up(&req->waitq);
 	}
-	wake_up(&req->waitq);
-	if (req->end)
-		req->end(fc, req);
+
+	if (async)
+		req->args->end(fc, req->args, req->out.h.error);
 put_request:
 	fuse_put_request(fc, req);
 }
+EXPORT_SYMBOL_GPL(fuse_request_end);
 
-static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
-	spin_lock(&fiq->waitq.lock);
-	if (test_bit(FR_FINISHED, &req->flags)) {
-		spin_unlock(&fiq->waitq.lock);
-		return;
+	spin_lock(&fiq->lock);
+	/* Check that we've actually sent a request to interrupt this req */
+	if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
+		spin_unlock(&fiq->lock);
+		return -EINVAL;
 	}
+
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
-		wake_up_locked(&fiq->waitq);
+		/*
+		 * Pairs with smp_mb() implied by test_and_set_bit()
+		 * from request_end().
+		 */
+		smp_mb();
+		if (test_bit(FR_FINISHED, &req->flags)) {
+			list_del_init(&req->intr_entry);
+			spin_unlock(&fiq->lock);
+			return 0;
+		}
+		fiq->ops->wake_interrupt_and_unlock(fiq);
+	} else {
+		spin_unlock(&fiq->lock);
 	}
-	spin_unlock(&fiq->waitq.lock);
-	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	return 0;
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
@@ -466,16 +385,16 @@
 		if (!err)
 			return;
 
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		/* Request is not yet in userspace, bail out */
 		if (test_bit(FR_PENDING, &req->flags)) {
 			list_del(&req->list);
-			spin_unlock(&fiq->waitq.lock);
+			spin_unlock(&fiq->lock);
 			__fuse_put_request(req);
 			req->out.h.error = -EINTR;
 			return;
 		}
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 	}
 
 	/*
@@ -490,180 +409,194 @@
 	struct fuse_iqueue *fiq = &fc->iq;
 
 	BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (!fiq->connected) {
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 		req->out.h.error = -ENOTCONN;
 	} else {
 		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request(fiq, req);
 		/* acquire extra reference, since request is still needed
-		   after request_end() */
+		   after fuse_request_end() */
 		__fuse_get_request(req);
-		spin_unlock(&fiq->waitq.lock);
+		queue_request_and_unlock(fiq, req);
 
 		request_wait_answer(fc, req);
-		/* Pairs with smp_wmb() in request_end() */
+		/* Pairs with smp_wmb() in fuse_request_end() */
 		smp_rmb();
 	}
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
-{
-	__set_bit(FR_ISREPLY, &req->flags);
-	if (!test_bit(FR_WAITING, &req->flags)) {
-		__set_bit(FR_WAITING, &req->flags);
-		atomic_inc(&fc->num_waiting);
-	}
-	__fuse_request_send(fc, req);
-}
-EXPORT_SYMBOL_GPL(fuse_request_send);
-
 static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
 {
-	if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
-		args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
+	if (fc->minor < 4 && args->opcode == FUSE_STATFS)
+		args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE;
 
 	if (fc->minor < 9) {
-		switch (args->in.h.opcode) {
+		switch (args->opcode) {
 		case FUSE_LOOKUP:
 		case FUSE_CREATE:
 		case FUSE_MKNOD:
 		case FUSE_MKDIR:
 		case FUSE_SYMLINK:
 		case FUSE_LINK:
-			args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+			args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
 			break;
 		case FUSE_GETATTR:
 		case FUSE_SETATTR:
-			args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+			args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
 			break;
 		}
 	}
 	if (fc->minor < 12) {
-		switch (args->in.h.opcode) {
+		switch (args->opcode) {
 		case FUSE_CREATE:
-			args->in.args[0].size = sizeof(struct fuse_open_in);
+			args->in_args[0].size = sizeof(struct fuse_open_in);
 			break;
 		case FUSE_MKNOD:
-			args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+			args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
 			break;
 		}
 	}
 }
 
+static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+	req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+}
+
+static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
+{
+	req->in.h.opcode = args->opcode;
+	req->in.h.nodeid = args->nodeid;
+	req->args = args;
+}
+
 ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
 {
 	struct fuse_req *req;
 	ssize_t ret;
 
-	req = fuse_get_req(fc, 0);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (args->force) {
+		atomic_inc(&fc->num_waiting);
+		req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+
+		if (!args->nocreds)
+			fuse_force_creds(fc, req);
+
+		__set_bit(FR_WAITING, &req->flags);
+		__set_bit(FR_FORCE, &req->flags);
+	} else {
+		WARN_ON(args->nocreds);
+		req = fuse_get_req(fc, false);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
 
 	/* Needs to be done after fuse_get_req() so that fc->minor is valid */
 	fuse_adjust_compat(fc, args);
+	fuse_args_to_req(req, args);
 
-	req->in.h.opcode = args->in.h.opcode;
-	req->in.h.nodeid = args->in.h.nodeid;
-	req->in.numargs = args->in.numargs;
-	memcpy(req->in.args, args->in.args,
-	       args->in.numargs * sizeof(struct fuse_in_arg));
-	req->out.argvar = args->out.argvar;
-	req->out.numargs = args->out.numargs;
-	memcpy(req->out.args, args->out.args,
-	       args->out.numargs * sizeof(struct fuse_arg));
-	fuse_request_send(fc, req);
+	if (!args->noreply)
+		__set_bit(FR_ISREPLY, &req->flags);
+	__fuse_request_send(fc, req);
 	ret = req->out.h.error;
-	if (!ret && args->out.argvar) {
-		BUG_ON(args->out.numargs != 1);
-		ret = req->out.args[0].size;
+	if (!ret && args->out_argvar) {
+		BUG_ON(args->out_numargs == 0);
+		ret = args->out_args[args->out_numargs - 1].size;
 	}
 	fuse_put_request(fc, req);
 
 	return ret;
 }
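
For comparison, a hedged sketch of a synchronous caller: the fuse_args shape mirrors what fuse_simple_request() expects, though the surrounding helper and its use of FUSE_STATFS are illustrative only:

	static int example_do_statfs(struct fuse_conn *fc, u64 nodeid,
				     struct fuse_statfs_out *outarg)
	{
		FUSE_ARGS(args);

		args.opcode = FUSE_STATFS;
		args.nodeid = nodeid;
		args.out_numargs = 1;
		args.out_args[0].size = sizeof(*outarg);
		args.out_args[0].value = outarg;

		/* Blocks until the userspace daemon replies (or the
		 * connection is aborted); returns the reply's error code.
		 */
		return fuse_simple_request(fc, &args);
	}
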
 
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
-					 struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_conn *fc,
+					  struct fuse_req *req)
 {
-	BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+	bool queued = false;
+
+	WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
 	if (!test_bit(FR_WAITING, &req->flags)) {
 		__set_bit(FR_WAITING, &req->flags);
 		atomic_inc(&fc->num_waiting);
 	}
 	__set_bit(FR_ISREPLY, &req->flags);
-	fc->num_background++;
-	if (fc->num_background == fc->max_background)
-		fc->blocked = 1;
-	if (fc->num_background == fc->congestion_threshold && fc->sb) {
-		set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
-		set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+	spin_lock(&fc->bg_lock);
+	if (likely(fc->connected)) {
+		fc->num_background++;
+		if (fc->num_background == fc->max_background)
+			fc->blocked = 1;
+		if (fc->num_background == fc->congestion_threshold && fc->sb) {
+			set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+			set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+		}
+		list_add_tail(&req->list, &fc->bg_queue);
+		flush_bg_queue(fc);
+		queued = true;
 	}
-	list_add_tail(&req->list, &fc->bg_queue);
-	flush_bg_queue(fc);
+	spin_unlock(&fc->bg_lock);
+
+	return queued;
 }
 
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+			    gfp_t gfp_flags)
 {
-	BUG_ON(!req->end);
-	spin_lock(&fc->lock);
-	if (fc->connected) {
-		fuse_request_send_background_locked(fc, req);
-		spin_unlock(&fc->lock);
+	struct fuse_req *req;
+
+	if (args->force) {
+		WARN_ON(!args->nocreds);
+		req = fuse_request_alloc(gfp_flags);
+		if (!req)
+			return -ENOMEM;
+		__set_bit(FR_BACKGROUND, &req->flags);
 	} else {
-		spin_unlock(&fc->lock);
-		req->out.h.error = -ENOTCONN;
-		req->end(fc, req);
-		fuse_put_request(fc, req);
+		WARN_ON(args->nocreds);
+		req = fuse_get_req(fc, true);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
 	}
-}
-EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
-static int fuse_request_send_notify_reply(struct fuse_conn *fc,
-					  struct fuse_req *req, u64 unique)
+	fuse_args_to_req(req, args);
+
+	if (!fuse_request_queue_background(fc, req)) {
+		fuse_put_request(fc, req);
+		return -ENOTCONN;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_simple_background);
+
+static int fuse_simple_notify_reply(struct fuse_conn *fc,
+				    struct fuse_args *args, u64 unique)
 {
-	int err = -ENODEV;
+	struct fuse_req *req;
 	struct fuse_iqueue *fiq = &fc->iq;
+	int err = 0;
+
+	req = fuse_get_req(fc, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	__clear_bit(FR_ISREPLY, &req->flags);
 	req->in.h.unique = unique;
-	spin_lock(&fiq->waitq.lock);
+
+	fuse_args_to_req(req, args);
+
+	spin_lock(&fiq->lock);
 	if (fiq->connected) {
-		queue_request(fiq, req);
-		err = 0;
+		queue_request_and_unlock(fiq, req);
+	} else {
+		err = -ENODEV;
+		spin_unlock(&fiq->lock);
+		fuse_put_request(fc, req);
 	}
-	spin_unlock(&fiq->waitq.lock);
 
 	return err;
 }
 
-void fuse_force_forget(struct file *file, u64 nodeid)
-{
-	struct inode *inode = file_inode(file);
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	struct fuse_forget_in inarg;
-
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.nlookup = 1;
-	req = fuse_get_req_nofail_nopages(fc, file);
-	req->in.h.opcode = FUSE_FORGET;
-	req->in.h.nodeid = nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	__clear_bit(FR_ISREPLY, &req->flags);
-	__fuse_request_send(fc, req);
-	/* ignore errors */
-	fuse_put_request(fc, req);
-}
-
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
@@ -839,8 +772,8 @@
 	       1 << PG_lru |
 	       1 << PG_active |
 	       1 << PG_reclaim))) {
-		printk(KERN_WARNING "fuse: trying to steal weird page\n");
-		printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+		pr_warn("trying to steal weird page\n");
+		pr_warn("  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
 		return 1;
 	}
 	return 0;
@@ -1017,14 +950,15 @@
 {
 	unsigned i;
 	struct fuse_req *req = cs->req;
+	struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
 
-	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+
+	for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) {
 		int err;
-		unsigned offset = req->page_descs[i].offset;
-		unsigned count = min(nbytes, req->page_descs[i].length);
+		unsigned int offset = ap->descs[i].offset;
+		unsigned int count = min(nbytes, ap->descs[i].length);
 
-		err = fuse_copy_page(cs, &req->pages[i], offset, count,
-				     zeroing);
+		err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing);
 		if (err)
 			return err;
 
@@ -1082,12 +1016,12 @@
  * Unlike other requests this is assembled on demand, without a need
  * to allocate a separate fuse_req structure.
  *
- * Called with fiq->waitq.lock held, releases it
+ * Called with fiq->lock held, releases it
  */
 static int fuse_read_interrupt(struct fuse_iqueue *fiq,
 			       struct fuse_copy_state *cs,
 			       size_t nbytes, struct fuse_req *req)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
@@ -1095,15 +1029,14 @@
 	int err;
 
 	list_del_init(&req->intr_entry);
-	req->intr_unique = fuse_get_unique(fiq);
 	memset(&ih, 0, sizeof(ih));
 	memset(&arg, 0, sizeof(arg));
 	ih.len = reqsize;
 	ih.opcode = FUSE_INTERRUPT;
-	ih.unique = req->intr_unique;
+	ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
 	arg.unique = req->in.h.unique;
 
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	if (nbytes < reqsize)
 		return -EINVAL;
 
@@ -1115,9 +1048,9 @@
 	return err ? err : reqsize;
 }
 
-static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
-					       unsigned max,
-					       unsigned *countp)
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+					     unsigned int max,
+					     unsigned int *countp)
 {
 	struct fuse_forget_link *head = fiq->forget_list_head.next;
 	struct fuse_forget_link **newhead = &head;
@@ -1136,14 +1069,15 @@
 
 	return head;
 }
+EXPORT_SYMBOL(fuse_dequeue_forget);
 
 static int fuse_read_single_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs,
 				   size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	int err;
-	struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
+	struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL);
 	struct fuse_forget_in arg = {
 		.nlookup = forget->forget_one.nlookup,
 	};
@@ -1154,7 +1088,7 @@
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	kfree(forget);
 	if (nbytes < ih.len)
 		return -EINVAL;
@@ -1172,7 +1106,7 @@
 
 static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs, size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	int err;
 	unsigned max_forgets;
@@ -1186,13 +1120,13 @@
 	};
 
 	if (nbytes < ih.len) {
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 		return -EINVAL;
 	}
 
 	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
-	head = dequeue_forget(fiq, max_forgets, &count);
-	spin_unlock(&fiq->waitq.lock);
+	head = fuse_dequeue_forget(fiq, max_forgets, &count);
+	spin_unlock(&fiq->lock);
 
 	arg.count = count;
 	ih.len += count * sizeof(struct fuse_forget_one);
@@ -1222,7 +1156,7 @@
 static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
 			    struct fuse_copy_state *cs,
 			    size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
 		return fuse_read_single_forget(fiq, cs, nbytes);
@@ -1236,7 +1170,7 @@
  * the pending list and copies request data to userspace buffer.  If
  * no reply is needed (FORGET) or request has been aborted or there
  * was an error during the copying then it's finished by calling
- * request_end().  Otherwise add it to the processing list, and set
+ * fuse_request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
 static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
@@ -1247,23 +1181,45 @@
 	struct fuse_iqueue *fiq = &fc->iq;
 	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
-	struct fuse_in *in;
+	struct fuse_args *args;
 	unsigned reqsize;
+	unsigned int hash;
+
+	/*
+	 * Require a sane minimum read buffer - one with capacity for the fixed
+	 * part of any request header plus negotiated max_write room for data.
+	 *
+	 * Historically libfuse reserves 4K for fixed header room, but e.g.
+	 * GlusterFS reserves only 80 bytes
+	 *
+	 *	= `sizeof(fuse_in_header) + sizeof(fuse_write_in)`
+	 *
+	 * which is the absolute minimum any sane filesystem should be using
+	 * for header room.
+	 */
+	if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER,
+			   sizeof(struct fuse_in_header) +
+			   sizeof(struct fuse_write_in) +
+			   fc->max_write))
+		return -EINVAL;
 
  restart:
-	spin_lock(&fiq->waitq.lock);
-	err = -EAGAIN;
-	if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
-	    !request_pending(fiq))
-		goto err_unlock;
+	for (;;) {
+		spin_lock(&fiq->lock);
+		if (!fiq->connected || request_pending(fiq))
+			break;
+		spin_unlock(&fiq->lock);
 
-	err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+		err = wait_event_interruptible_exclusive(fiq->waitq,
 				!fiq->connected || request_pending(fiq));
-	if (err)
-		goto err_unlock;
+		if (err)
+			return err;
+	}
 
 	if (!fiq->connected) {
-		err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+		err = fc->aborted ? -ECONNABORTED : -ENODEV;
 		goto err_unlock;
 	}
 
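The buffer check above is visible to userspace: a read() from /dev/fuse
with an undersized buffer now fails with -EINVAL up front instead of
blocking and later erroring out per request. A hedged sketch of how a
daemon might size its read buffer to match (daemon_read_bufsize() is an
illustrative name; the constant and structs come from <linux/fuse.h>,
max_write from FUSE_INIT negotiation):

	#include <linux/fuse.h>
	#include <stddef.h>

	static size_t daemon_read_bufsize(size_t max_write)
	{
		size_t need = sizeof(struct fuse_in_header) +
			      sizeof(struct fuse_write_in) + max_write;

		return need > FUSE_MIN_READ_BUFFER ? need : FUSE_MIN_READ_BUFFER;
	}
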
@@ -1284,33 +1240,33 @@
 	req = list_entry(fiq->pending.next, struct fuse_req, list);
 	clear_bit(FR_PENDING, &req->flags);
 	list_del_init(&req->list);
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 
-	in = &req->in;
-	reqsize = in->h.len;
+	args = req->args;
+	reqsize = req->in.h.len;
 
 	/* If request is too large, reply with an error and restart the read */
 	if (nbytes < reqsize) {
 		req->out.h.error = -EIO;
 		/* SETXATTR is special, since it may contain too large data */
-		if (in->h.opcode == FUSE_SETXATTR)
+		if (args->opcode == FUSE_SETXATTR)
 			req->out.h.error = -E2BIG;
-		request_end(fc, req);
+		fuse_request_end(fc, req);
 		goto restart;
 	}
 	spin_lock(&fpq->lock);
 	list_add(&req->list, &fpq->io);
 	spin_unlock(&fpq->lock);
 	cs->req = req;
-	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
+	err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h));
 	if (!err)
-		err = fuse_copy_args(cs, in->numargs, in->argpages,
-				     (struct fuse_arg *) in->args, 0);
+		err = fuse_copy_args(cs, args->in_numargs, args->in_pages,
+				     (struct fuse_arg *) args->in_args, 0);
 	fuse_copy_finish(cs);
 	spin_lock(&fpq->lock);
 	clear_bit(FR_LOCKED, &req->flags);
 	if (!fpq->connected) {
-		err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+		err = fc->aborted ? -ECONNABORTED : -ENODEV;
 		goto out_end;
 	}
 	if (err) {
@@ -1321,7 +1277,8 @@
 		err = reqsize;
 		goto out_end;
 	}
-	list_move_tail(&req->list, &fpq->processing);
+	hash = fuse_req_hash(req->in.h.unique);
+	list_move_tail(&req->list, &fpq->processing[hash]);
 	__fuse_get_request(req);
 	set_bit(FR_SENT, &req->flags);
 	spin_unlock(&fpq->lock);
@@ -1337,11 +1294,11 @@
 	if (!test_bit(FR_PRIVATE, &req->flags))
 		list_del_init(&req->list);
 	spin_unlock(&fpq->lock);
-	request_end(fc, req);
+	fuse_request_end(fc, req);
 	return err;
 
  err_unlock:
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	return err;
 }
 
@@ -1660,9 +1617,19 @@
 	return err;
 }
 
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_retrieve_args {
+	struct fuse_args_pages ap;
+	struct fuse_notify_retrieve_in inarg;
+};
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+			      int error)
 {
-	release_pages(req->pages, req->num_pages);
+	struct fuse_retrieve_args *ra =
+		container_of(args, typeof(*ra), ap.args);
+
+	release_pages(ra->ap.pages, ra->ap.num_pages);
+	kfree(ra);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1670,40 +1637,49 @@
 {
 	int err;
 	struct address_space *mapping = inode->i_mapping;
-	struct fuse_req *req;
 	pgoff_t index;
 	loff_t file_size;
 	unsigned int num;
 	unsigned int offset;
 	size_t total_len = 0;
-	int num_pages;
+	unsigned int num_pages;
+	struct fuse_retrieve_args *ra;
+	size_t args_size = sizeof(*ra);
+	struct fuse_args_pages *ap;
+	struct fuse_args *args;
 
 	offset = outarg->offset & ~PAGE_MASK;
 	file_size = i_size_read(inode);
 
-	num = outarg->size;
+	num = min(outarg->size, fc->max_write);
 	if (outarg->offset > file_size)
 		num = 0;
 	else if (outarg->offset + num > file_size)
 		num = file_size - outarg->offset;
 
 	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+	num_pages = min(num_pages, fc->max_pages);
 
-	req = fuse_get_req(fc, num_pages);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));
 
-	req->in.h.opcode = FUSE_NOTIFY_REPLY;
-	req->in.h.nodeid = outarg->nodeid;
-	req->in.numargs = 2;
-	req->in.argpages = 1;
-	req->page_descs[0].offset = offset;
-	req->end = fuse_retrieve_end;
+	ra = kzalloc(args_size, GFP_KERNEL);
+	if (!ra)
+		return -ENOMEM;
+
+	ap = &ra->ap;
+	ap->pages = (void *) (ra + 1);
+	ap->descs = (void *) (ap->pages + num_pages);
+
+	args = &ap->args;
+	args->nodeid = outarg->nodeid;
+	args->opcode = FUSE_NOTIFY_REPLY;
+	args->in_numargs = 2;
+	args->in_pages = true;
+	args->end = fuse_retrieve_end;
 
 	index = outarg->offset >> PAGE_SHIFT;
 
-	while (num && req->num_pages < num_pages) {
+	while (num && ap->num_pages < num_pages) {
 		struct page *page;
 		unsigned int this_num;
 
@@ -1712,26 +1688,25 @@
 			break;
 
 		this_num = min_t(unsigned, num, PAGE_SIZE - offset);
-		req->pages[req->num_pages] = page;
-		req->page_descs[req->num_pages].length = this_num;
-		req->num_pages++;
+		ap->pages[ap->num_pages] = page;
+		ap->descs[ap->num_pages].offset = offset;
+		ap->descs[ap->num_pages].length = this_num;
+		ap->num_pages++;
 
 		offset = 0;
 		num -= this_num;
 		total_len += this_num;
 		index++;
 	}
-	req->misc.retrieve_in.offset = outarg->offset;
-	req->misc.retrieve_in.size = total_len;
-	req->in.args[0].size = sizeof(req->misc.retrieve_in);
-	req->in.args[0].value = &req->misc.retrieve_in;
-	req->in.args[1].size = total_len;
+	ra->inarg.offset = outarg->offset;
+	ra->inarg.size = total_len;
+	args->in_args[0].size = sizeof(ra->inarg);
+	args->in_args[0].value = &ra->inarg;
+	args->in_args[1].size = total_len;
 
-	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
-	if (err) {
-		fuse_retrieve_end(fc, req);
-		fuse_put_request(fc, req);
-	}
+	err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+	if (err)
+		fuse_retrieve_end(fc, args, err);
 
 	return err;
 }
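
For reference, the single kzalloc() built up in fuse_retrieve() above
lays out its pieces as follows (a sketch; counts are elements, not
bytes):

	[ struct fuse_retrieve_args | pages[num_pages] | descs[num_pages] ]
	  ^ra                        ^ap->pages = ra+1  ^ap->descs

which is why fuse_retrieve_end() can release everything with the single
kfree(ra).
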
@@ -1807,36 +1782,35 @@
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
 {
+	unsigned int hash = fuse_req_hash(unique);
 	struct fuse_req *req;
 
-	list_for_each_entry(req, &fpq->processing, list) {
-		if (req->in.h.unique == unique || req->intr_unique == unique)
+	list_for_each_entry(req, &fpq->processing[hash], list) {
+		if (req->in.h.unique == unique)
 			return req;
 	}
 	return NULL;
 }
 
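request_find() depends on two helpers introduced elsewhere in this
patch. Roughly, per fs/fuse/dev.c and fuse_i.h in this release:

	#define FUSE_INT_REQ_BIT	(1ULL << 0)

	static unsigned int fuse_req_hash(u64 unique)
	{
		return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
	}

The low bit of a reply's unique ID now says "this answers the
interrupt, not the original request", which is why both lookup and
hashing mask it off first, and why the old intr_unique field could be
dropped.
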
-static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
 			 unsigned nbytes)
 {
 	unsigned reqsize = sizeof(struct fuse_out_header);
 
-	if (out->h.error)
-		return nbytes != reqsize ? -EINVAL : 0;
+	reqsize += fuse_len_args(args->out_numargs, args->out_args);
 
-	reqsize += len_args(out->numargs, out->args);
-
-	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+	if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar))
 		return -EINVAL;
 	else if (reqsize > nbytes) {
-		struct fuse_arg *lastarg = &out->args[out->numargs-1];
+		struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1];
 		unsigned diffsize = reqsize - nbytes;
+
 		if (diffsize > lastarg->size)
 			return -EINVAL;
 		lastarg->size -= diffsize;
 	}
-	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
-			      out->page_zeroing);
+	return fuse_copy_args(cs, args->out_numargs, args->out_pages,
+			      args->out_args, args->page_zeroing);
 }
 
 /*
@@ -1844,7 +1818,7 @@
  * the write buffer.  The request is then searched on the processing
  * list by the unique ID found in the header.  If found, then remove
  * it from the list and copy the rest of the buffer to the request.
- * The request is finished by calling request_end()
+ * The request is finished by calling fuse_request_end().
  */
 static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 				 struct fuse_copy_state *cs, size_t nbytes)
@@ -1855,16 +1829,17 @@
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 
+	err = -EINVAL;
 	if (nbytes < sizeof(struct fuse_out_header))
-		return -EINVAL;
+		goto out;
 
 	err = fuse_copy_one(cs, &oh, sizeof(oh));
 	if (err)
-		goto err_finish;
+		goto copy_finish;
 
 	err = -EINVAL;
 	if (oh.len != nbytes)
-		goto err_finish;
+		goto copy_finish;
 
 	/*
 	 * Zero oh.unique indicates unsolicited notification message
@@ -1872,41 +1847,40 @@
 	 */
 	if (!oh.unique) {
 		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
-		return err ? err : nbytes;
+		goto out;
 	}
 
 	err = -EINVAL;
 	if (oh.error <= -1000 || oh.error > 0)
-		goto err_finish;
+		goto copy_finish;
 
 	spin_lock(&fpq->lock);
+	req = NULL;
+	if (fpq->connected)
+		req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
+
 	err = -ENOENT;
-	if (!fpq->connected)
-		goto err_unlock_pq;
+	if (!req) {
+		spin_unlock(&fpq->lock);
+		goto copy_finish;
+	}
 
-	req = request_find(fpq, oh.unique);
-	if (!req)
-		goto err_unlock_pq;
-
-	/* Is it an interrupt reply? */
-	if (req->intr_unique == oh.unique) {
+	/* Is it an interrupt reply ID? */
+	if (oh.unique & FUSE_INT_REQ_BIT) {
 		__fuse_get_request(req);
 		spin_unlock(&fpq->lock);
 
-		err = -EINVAL;
-		if (nbytes != sizeof(struct fuse_out_header)) {
-			fuse_put_request(fc, req);
-			goto err_finish;
-		}
-
-		if (oh.error == -ENOSYS)
+		err = 0;
+		if (nbytes != sizeof(struct fuse_out_header))
+			err = -EINVAL;
+		else if (oh.error == -ENOSYS)
 			fc->no_interrupt = 1;
 		else if (oh.error == -EAGAIN)
-			queue_interrupt(&fc->iq, req);
+			err = queue_interrupt(&fc->iq, req);
+
 		fuse_put_request(fc, req);
 
-		fuse_copy_finish(cs);
-		return nbytes;
+		goto copy_finish;
 	}
 
 	clear_bit(FR_SENT, &req->flags);
@@ -1915,10 +1889,13 @@
 	set_bit(FR_LOCKED, &req->flags);
 	spin_unlock(&fpq->lock);
 	cs->req = req;
-	if (!req->out.page_replace)
+	if (!req->args->page_replace)
 		cs->move_pages = 0;
 
-	err = copy_out_args(cs, &req->out, nbytes);
+	if (oh.error)
+		err = nbytes != sizeof(oh) ? -EINVAL : 0;
+	else
+		err = copy_out_args(cs, req->args, nbytes);
 	fuse_copy_finish(cs);
 
 	spin_lock(&fpq->lock);
@@ -1931,15 +1908,13 @@
 		list_del_init(&req->list);
 	spin_unlock(&fpq->lock);
 
-	request_end(fc, req);
-
+	fuse_request_end(fc, req);
+out:
 	return err ? err : nbytes;
 
- err_unlock_pq:
-	spin_unlock(&fpq->lock);
- err_finish:
+copy_finish:
 	fuse_copy_finish(cs);
-	return err;
+	goto out;
 }
 
 static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
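
As the comment in fuse_dev_do_write() notes, oh.unique == 0 marks an
unsolicited notification, and oh.error then carries the notification
code rather than an errno. A hedged userspace-side sketch using only
types from <linux/fuse.h> (notify_poll_wakeup() is an illustrative
name):

	#include <linux/fuse.h>
	#include <stdint.h>
	#include <sys/uio.h>
	#include <unistd.h>

	static ssize_t notify_poll_wakeup(int fuse_fd, uint64_t kh)
	{
		struct fuse_out_header oh = {
			.error  = FUSE_NOTIFY_POLL,	/* notification code */
			.unique = 0,			/* 0 => notification */
		};
		struct fuse_notify_poll_wakeup_out out = { .kh = kh };
		struct iovec iov[2] = {
			{ &oh,  sizeof(oh)  },
			{ &out, sizeof(out) },
		};

		oh.len = sizeof(oh) + sizeof(out);	/* must equal bytes written */
		return writev(fuse_fd, iov, 2);
	}
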
@@ -1989,10 +1964,8 @@
 		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
 
 	ret = -EINVAL;
-	if (rem < len) {
-		pipe_unlock(pipe);
-		goto out;
-	}
+	if (rem < len)
+		goto out_free;
 
 	rem = len;
 	while (rem) {
@@ -2010,7 +1983,9 @@
 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 		} else {
-			pipe_buf_get(pipe, ibuf);
+			if (!pipe_buf_get(pipe, ibuf))
+				goto out_free;
+
 			*obuf = *ibuf;
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 			obuf->len = rem;
@@ -2032,10 +2007,12 @@
 
 	ret = fuse_dev_do_write(fud, &cs, len);
 
+	pipe_lock(pipe);
+out_free:
 	for (idx = 0; idx < nbuf; idx++)
 		pipe_buf_release(pipe, &bufs[idx]);
+	pipe_unlock(pipe);
 
-out:
 	kvfree(bufs);
 	return ret;
 }
@@ -2052,21 +2029,17 @@
 	fiq = &fud->fc->iq;
 	poll_wait(file, &fiq->waitq, wait);
 
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (!fiq->connected)
 		mask = EPOLLERR;
 	else if (request_pending(fiq))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 
 	return mask;
 }
 
-/*
- * Abort all requests on the given list (pending or processing)
- *
- * This function releases and reacquires fc->lock
- */
+/* Abort all requests on the given list (pending or processing) */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
 {
 	while (!list_empty(head)) {
@@ -2075,7 +2048,7 @@
 		req->out.h.error = -ECONNABORTED;
 		clear_bit(FR_SENT, &req->flags);
 		list_del_init(&req->list);
-		request_end(fc, req);
+		fuse_request_end(fc, req);
 	}
 }
 
@@ -2112,7 +2085,7 @@
  * is OK, the request will in that case be removed from the list before we touch
  * it.
  */
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
+void fuse_abort_conn(struct fuse_conn *fc)
 {
 	struct fuse_iqueue *fiq = &fc->iq;
 
@@ -2121,10 +2094,13 @@
 		struct fuse_dev *fud;
 		struct fuse_req *req, *next;
 		LIST_HEAD(to_end);
+		unsigned int i;
 
+		/* Background queuing checks fc->connected under bg_lock */
+		spin_lock(&fc->bg_lock);
 		fc->connected = 0;
-		fc->blocked = 0;
-		fc->aborted = is_abort;
+		spin_unlock(&fc->bg_lock);
+
 		fuse_set_initialized(fc);
 		list_for_each_entry(fud, &fc->devices, entry) {
 			struct fuse_pqueue *fpq = &fud->pq;
@@ -2142,21 +2118,26 @@
 				}
 				spin_unlock(&req->waitq.lock);
 			}
-			list_splice_tail_init(&fpq->processing, &to_end);
+			for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+				list_splice_tail_init(&fpq->processing[i],
+						      &to_end);
 			spin_unlock(&fpq->lock);
 		}
+		spin_lock(&fc->bg_lock);
+		fc->blocked = 0;
 		fc->max_background = UINT_MAX;
 		flush_bg_queue(fc);
+		spin_unlock(&fc->bg_lock);
 
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		fiq->connected = 0;
 		list_for_each_entry(req, &fiq->pending, list)
 			clear_bit(FR_PENDING, &req->flags);
 		list_splice_tail_init(&fiq->pending, &to_end);
 		while (forget_pending(fiq))
-			kfree(dequeue_forget(fiq, 1, NULL));
-		wake_up_all_locked(&fiq->waitq);
-		spin_unlock(&fiq->waitq.lock);
+			kfree(fuse_dequeue_forget(fiq, 1, NULL));
+		wake_up_all(&fiq->waitq);
+		spin_unlock(&fiq->lock);
 		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 		end_polls(fc);
 		wake_up_all(&fc->blocked_waitq);
@@ -2184,10 +2165,12 @@
 		struct fuse_conn *fc = fud->fc;
 		struct fuse_pqueue *fpq = &fud->pq;
 		LIST_HEAD(to_end);
+		unsigned int i;
 
 		spin_lock(&fpq->lock);
 		WARN_ON(!list_empty(&fpq->io));
-		list_splice_init(&fpq->processing, &to_end);
+		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+			list_splice_init(&fpq->processing[i], &to_end);
 		spin_unlock(&fpq->lock);
 
 		end_requests(fc, &to_end);
@@ -2195,7 +2178,7 @@
 		/* Are we the last open device? */
 		if (atomic_dec_and_test(&fc->dev_count)) {
 			WARN_ON(fc->iq.fasync != NULL);
-			fuse_abort_conn(fc, false);
+			fuse_abort_conn(fc);
 		}
 		fuse_dev_free(fud);
 	}
@@ -2221,7 +2204,7 @@
 	if (new->private_data)
 		return -EINVAL;
 
-	fud = fuse_dev_alloc(fc);
+	fud = fuse_dev_alloc_install(fc);
 	if (!fud)
 		return -ENOMEM;
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 82a1322..54d638f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,24 +14,9 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/iversion.h>
 #include <linux/posix_acl.h>
 
-static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
-{
-	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_inode *fi = get_fuse_inode(dir);
-
-	if (!fc->do_readdirplus)
-		return false;
-	if (!fc->readdirplus_auto)
-		return true;
-	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
-		return true;
-	if (ctx->pos == 0)
-		return true;
-	return false;
-}
-
 static void fuse_advise_use_readdirplus(struct inode *dir)
 {
 	struct fuse_inode *fi = get_fuse_inode(dir);
@@ -39,20 +24,54 @@
 	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
+#if BITS_PER_LONG >= 64
+static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+	entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+	return (u64)entry->d_fsdata;
+}
+
+#else
 union fuse_dentry {
 	u64 time;
 	struct rcu_head rcu;
 };
 
-static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
 {
-	((union fuse_dentry *) entry->d_fsdata)->time = time;
+	((union fuse_dentry *) dentry->d_fsdata)->time = time;
 }
 
-static inline u64 fuse_dentry_time(struct dentry *entry)
+static inline u64 fuse_dentry_time(const struct dentry *entry)
 {
 	return ((union fuse_dentry *) entry->d_fsdata)->time;
 }
+#endif
+
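The 64-bit branch above packs the jiffies value straight into d_fsdata
instead of allocating a union fuse_dentry per dentry. The cast is sound
only while a pointer can hold a u64; a compile-time guard along these
lines (not part of the patch) would make the assumption explicit:

	#include <linux/build_bug.h>

	#if BITS_PER_LONG >= 64
	static_assert(sizeof(void *) >= sizeof(u64),
		      "d_fsdata must be wide enough for the entry timeout");
	#endif
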
+static void fuse_dentry_settime(struct dentry *dentry, u64 time)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
+	bool delete = !time && fc->delete_stale;
+	/*
+	 * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
+	 * Don't care about races; either way it's just an optimization.
+	 */
+	if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) ||
+	    (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) {
+		spin_lock(&dentry->d_lock);
+		if (!delete)
+			dentry->d_flags &= ~DCACHE_OP_DELETE;
+		else
+			dentry->d_flags |= DCACHE_OP_DELETE;
+		spin_unlock(&dentry->d_lock);
+	}
+
+	__fuse_dentry_settime(dentry, time);
+}
 
 /*
  * FUSE caches dentries and attributes with separate timeout.  The
@@ -80,8 +99,7 @@
  * Set dentry and possibly attribute timeouts from the lookup/mk*
  * replies
  */
-static void fuse_change_entry_timeout(struct dentry *entry,
-				      struct fuse_entry_out *o)
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o)
 {
 	fuse_dentry_settime(entry,
 		time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
@@ -92,18 +110,29 @@
 	return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
 }
 
-static u64 entry_attr_timeout(struct fuse_entry_out *o)
+u64 entry_attr_timeout(struct fuse_entry_out *o)
 {
 	return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
 }
 
+static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+{
+	set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
+}
+
 /*
  * Mark the attributes as stale, so that at the next call to
  * ->getattr() they will be fetched from userspace
  */
 void fuse_invalidate_attr(struct inode *inode)
 {
-	get_fuse_inode(inode)->i_time = 0;
+	fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS);
+}
+
+static void fuse_dir_changed(struct inode *dir)
+{
+	fuse_invalidate_attr(dir);
+	inode_maybe_inc_iversion(dir, false);
 }
 
 /**
@@ -113,7 +142,7 @@
 void fuse_invalidate_atime(struct inode *inode)
 {
 	if (!IS_RDONLY(inode))
-		fuse_invalidate_attr(inode);
+		fuse_invalidate_attr_mask(inode, STATX_ATIME);
 }
 
 /*
@@ -144,29 +173,14 @@
 			     struct fuse_entry_out *outarg)
 {
 	memset(outarg, 0, sizeof(struct fuse_entry_out));
-	args->in.h.opcode = FUSE_LOOKUP;
-	args->in.h.nodeid = nodeid;
-	args->in.numargs = 1;
-	args->in.args[0].size = name->len + 1;
-	args->in.args[0].value = name->name;
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(struct fuse_entry_out);
-	args->out.args[0].value = outarg;
-}
-
-u64 fuse_get_attr_version(struct fuse_conn *fc)
-{
-	u64 curr_version;
-
-	/*
-	 * The spin lock isn't actually needed on 64bit archs, but we
-	 * don't yet care too much about such optimizations.
-	 */
-	spin_lock(&fc->lock);
-	curr_version = fc->attr_version;
-	spin_unlock(&fc->lock);
-
-	return curr_version;
+	args->opcode = FUSE_LOOKUP;
+	args->nodeid = nodeid;
+	args->in_numargs = 1;
+	args->in_args[0].size = name->len + 1;
+	args->in_args[0].value = name->name;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(struct fuse_entry_out);
+	args->out_args[0].value = outarg;
 }
 
 /*
@@ -227,9 +241,9 @@
 				fuse_queue_forget(fc, forget, outarg.nodeid, 1);
 				goto invalid;
 			}
-			spin_lock(&fc->lock);
+			spin_lock(&fi->lock);
 			fi->nlookup++;
-			spin_unlock(&fc->lock);
+			spin_unlock(&fi->lock);
 		}
 		kfree(forget);
 		if (ret == -ENOMEM)
@@ -262,14 +276,11 @@
 	goto out;
 }
 
-static int invalid_nodeid(u64 nodeid)
-{
-	return !nodeid || nodeid == FUSE_ROOT_ID;
-}
-
+#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+				   GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
 	return dentry->d_fsdata ? 0 : -ENOMEM;
 }
@@ -279,16 +290,27 @@
 
 	kfree_rcu(fd, rcu);
 }
+#endif
+
+static int fuse_dentry_delete(const struct dentry *dentry)
+{
+	return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
+}
 
 const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
+	.d_delete	= fuse_dentry_delete,
+#if BITS_PER_LONG < 64
 	.d_init		= fuse_dentry_init,
 	.d_release	= fuse_dentry_release,
+#endif
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
 	.d_init		= fuse_dentry_init,
 	.d_release	= fuse_dentry_release,
+#endif
 };
 
 int fuse_valid_type(int m)
@@ -383,7 +405,8 @@
 	else
 		fuse_invalidate_entry_cache(entry);
 
-	fuse_advise_use_readdirplus(dir);
+	if (inode)
+		fuse_advise_use_readdirplus(dir);
 	return newent;
 
  out_iput:
@@ -410,6 +433,7 @@
 	struct fuse_create_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
+	struct fuse_inode *fi;
 	struct fuse_file *ff;
 
 	/* Userspace expects S_IFREG in create mode */
@@ -434,18 +458,18 @@
 	inarg.flags = flags;
 	inarg.mode = mode;
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_CREATE;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
-	args.out.numargs = 2;
-	args.out.args[0].size = sizeof(outentry);
-	args.out.args[0].value = &outentry;
-	args.out.args[1].size = sizeof(outopen);
-	args.out.args[1].value = &outopen;
+	args.opcode = FUSE_CREATE;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
+	args.out_numargs = 2;
+	args.out_args[0].size = sizeof(outentry);
+	args.out_args[0].value = &outentry;
+	args.out_args[1].size = sizeof(outopen);
+	args.out_args[1].value = &outopen;
 	err = fuse_simple_request(fc, &args);
 	if (err)
 		goto out_free_ff;
@@ -461,7 +485,7 @@
 			  &outentry.attr, entry_attr_timeout(&outentry), 0);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
-		fuse_sync_release(ff, flags);
+		fuse_sync_release(NULL, ff, flags);
 		fuse_queue_forget(fc, forget, outentry.nodeid, 1);
 		err = -ENOMEM;
 		goto out_err;
@@ -469,10 +493,11 @@
 	kfree(forget);
 	d_instantiate(entry, inode);
 	fuse_change_entry_timeout(entry, &outentry);
-	fuse_invalidate_attr(dir);
+	fuse_dir_changed(dir);
 	err = finish_open(file, entry, generic_file_open);
 	if (err) {
-		fuse_sync_release(ff, flags);
+		fi = get_fuse_inode(inode);
+		fuse_sync_release(fi, ff, flags);
 	} else {
 		file->private_data = ff;
 		fuse_finish_open(inode, file);
@@ -549,10 +574,10 @@
 		return -ENOMEM;
 
 	memset(&outarg, 0, sizeof(outarg));
-	args->in.h.nodeid = get_node_id(dir);
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(outarg);
-	args->out.args[0].value = &outarg;
+	args->nodeid = get_node_id(dir);
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(outarg);
+	args->out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, args);
 	if (err)
 		goto out_put_forget_req;
@@ -583,7 +608,7 @@
 	} else {
 		fuse_change_entry_timeout(entry, &outarg);
 	}
-	fuse_invalidate_attr(dir);
+	fuse_dir_changed(dir);
 	return 0;
 
  out_put_forget_req:
@@ -605,12 +630,12 @@
 	inarg.mode = mode;
 	inarg.rdev = new_encode_dev(rdev);
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_MKNOD;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
+	args.opcode = FUSE_MKNOD;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
 	return create_new_entry(fc, &args, dir, entry, mode);
 }
 
@@ -632,12 +657,12 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_MKDIR;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
+	args.opcode = FUSE_MKDIR;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
 	return create_new_entry(fc, &args, dir, entry, S_IFDIR);
 }
 
@@ -648,12 +673,12 @@
 	unsigned len = strlen(link) + 1;
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_SYMLINK;
-	args.in.numargs = 2;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
-	args.in.args[1].size = len;
-	args.in.args[1].value = link;
+	args.opcode = FUSE_SYMLINK;
+	args.in_numargs = 2;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
+	args.in_args[1].size = len;
+	args.in_args[1].value = link;
 	return create_new_entry(fc, &args, dir, entry, S_IFLNK);
 }
 
@@ -671,18 +696,18 @@
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_UNLINK;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 1;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
+	args.opcode = FUSE_UNLINK;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 1;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		struct inode *inode = d_inode(entry);
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		spin_lock(&fc->lock);
-		fi->attr_version = ++fc->attr_version;
+		spin_lock(&fi->lock);
+		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		/*
 		 * If i_nlink == 0 then unlink doesn't make sense, yet this can
 		 * happen if userspace filesystem is careless.  It would be
@@ -691,9 +716,9 @@
 		 */
 		if (inode->i_nlink > 0)
 			drop_nlink(inode);
-		spin_unlock(&fc->lock);
+		spin_unlock(&fi->lock);
 		fuse_invalidate_attr(inode);
-		fuse_invalidate_attr(dir);
+		fuse_dir_changed(dir);
 		fuse_invalidate_entry_cache(entry);
 		fuse_update_ctime(inode);
 	} else if (err == -EINTR)
@@ -707,15 +732,15 @@
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_RMDIR;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 1;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
+	args.opcode = FUSE_RMDIR;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 1;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		clear_nlink(d_inode(entry));
-		fuse_invalidate_attr(dir);
+		fuse_dir_changed(dir);
 		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
@@ -734,15 +759,15 @@
 	memset(&inarg, 0, argsize);
 	inarg.newdir = get_node_id(newdir);
 	inarg.flags = flags;
-	args.in.h.opcode = opcode;
-	args.in.h.nodeid = get_node_id(olddir);
-	args.in.numargs = 3;
-	args.in.args[0].size = argsize;
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = oldent->d_name.len + 1;
-	args.in.args[1].value = oldent->d_name.name;
-	args.in.args[2].size = newent->d_name.len + 1;
-	args.in.args[2].value = newent->d_name.name;
+	args.opcode = opcode;
+	args.nodeid = get_node_id(olddir);
+	args.in_numargs = 3;
+	args.in_args[0].size = argsize;
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = oldent->d_name.len + 1;
+	args.in_args[1].value = oldent->d_name.name;
+	args.in_args[2].size = newent->d_name.len + 1;
+	args.in_args[2].value = newent->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		/* ctime changes */
@@ -754,9 +779,9 @@
 			fuse_update_ctime(d_inode(newent));
 		}
 
-		fuse_invalidate_attr(olddir);
+		fuse_dir_changed(olddir);
 		if (olddir != newdir)
-			fuse_invalidate_attr(newdir);
+			fuse_dir_changed(newdir);
 
 		/* newent will end up negative */
 		if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
@@ -819,12 +844,12 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
-	args.in.h.opcode = FUSE_LINK;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = newent->d_name.len + 1;
-	args.in.args[1].value = newent->d_name.name;
+	args.opcode = FUSE_LINK;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = newent->d_name.len + 1;
+	args.in_args[1].value = newent->d_name.name;
 	err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
 	/* Contrary to "normal" filesystems it can happen that link
 	   makes two "logical" inodes point to the same "physical"
@@ -835,10 +860,10 @@
 	if (!err) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		spin_lock(&fc->lock);
-		fi->attr_version = ++fc->attr_version;
+		spin_lock(&fi->lock);
+		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		inc_nlink(inode);
-		spin_unlock(&fc->lock);
+		spin_unlock(&fi->lock);
 		fuse_invalidate_attr(inode);
 		fuse_update_ctime(inode);
 	} else if (err == -EINTR) {
@@ -907,14 +932,14 @@
 		inarg.getattr_flags |= FUSE_GETATTR_FH;
 		inarg.fh = ff->fh;
 	}
-	args.in.h.opcode = FUSE_GETATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_GETATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -932,7 +957,8 @@
 }
 
 static int fuse_update_get_attr(struct inode *inode, struct file *file,
-				struct kstat *stat, unsigned int flags)
+				struct kstat *stat, u32 request_mask,
+				unsigned int flags)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	int err = 0;
@@ -942,6 +968,8 @@
 		sync = true;
 	else if (flags & AT_STATX_DONT_SYNC)
 		sync = false;
+	else if (request_mask & READ_ONCE(fi->inval_mask))
+		sync = true;
 	else
 		sync = time_before64(fi->i_time, get_jiffies_64());
 
@@ -959,7 +987,9 @@
 
 int fuse_update_attributes(struct inode *inode, struct file *file)
 {
-	return fuse_update_get_attr(inode, file, NULL, 0);
+	/* Do *not* need to get atime for internal purposes */
+	return fuse_update_get_attr(inode, file, NULL,
+				    STATX_BASIC_STATS & ~STATX_ATIME, 0);
 }
 
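Taken together, the refresh decision in fuse_update_get_attr() now
reduces to the following (a sketch; fuse_attrs_stale() is an
illustrative name, its inputs are the fields consulted above):

	#include <linux/fcntl.h>
	#include <linux/jiffies.h>

	static bool fuse_attrs_stale(unsigned int flags, u32 request_mask,
				     u32 inval_mask, u64 i_time)
	{
		if (flags & AT_STATX_FORCE_SYNC)
			return true;	/* caller insists on fresh attrs */
		if (flags & AT_STATX_DONT_SYNC)
			return false;	/* caller accepts whatever is cached */
		if (request_mask & inval_mask)
			return true;	/* a requested field was invalidated */
		return time_before64(i_time, get_jiffies_64());
	}
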
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
@@ -989,7 +1019,7 @@
 	if (!entry)
 		goto unlock;
 
-	fuse_invalidate_attr(parent);
+	fuse_dir_changed(parent);
 	fuse_invalidate_entry(entry);
 
 	if (child_nodeid != 0 && d_really_is_positive(entry)) {
@@ -1074,11 +1104,11 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
-	args.in.h.opcode = FUSE_ACCESS;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_ACCESS;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_access = 1;
@@ -1124,8 +1154,10 @@
 	if (fc->default_permissions ||
 	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
+		u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID;
 
-		if (time_before64(fi->i_time, get_jiffies_64())) {
+		if (perm_mask & READ_ONCE(fi->inval_mask) ||
+		    time_before64(fi->i_time, get_jiffies_64())) {
 			refreshed = true;
 
 			err = fuse_perm_getattr(inode, mask);
@@ -1165,271 +1197,76 @@
 	return err;
 }
 
-static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
-			 struct dir_context *ctx)
+static int fuse_readlink_page(struct inode *inode, struct page *page)
 {
-	while (nbytes >= FUSE_NAME_OFFSET) {
-		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
-		size_t reclen = FUSE_DIRENT_SIZE(dirent);
-		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
-			return -EIO;
-		if (reclen > nbytes)
-			break;
-		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
-			return -EIO;
-
-		if (!dir_emit(ctx, dirent->name, dirent->namelen,
-			       dirent->ino, dirent->type))
-			break;
-
-		buf += reclen;
-		nbytes -= reclen;
-		ctx->pos = dirent->off;
-	}
-
-	return 0;
-}
-
-static int fuse_direntplus_link(struct file *file,
-				struct fuse_direntplus *direntplus,
-				u64 attr_version)
-{
-	struct fuse_entry_out *o = &direntplus->entry_out;
-	struct fuse_dirent *dirent = &direntplus->dirent;
-	struct dentry *parent = file->f_path.dentry;
-	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
-	struct dentry *dentry;
-	struct dentry *alias;
-	struct inode *dir = d_inode(parent);
-	struct fuse_conn *fc;
-	struct inode *inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-
-	if (!o->nodeid) {
-		/*
-		 * Unlike in the case of fuse_lookup, zero nodeid does not mean
-		 * ENOENT. Instead, it only means the userspace filesystem did
-		 * not want to return attributes/handle for this entry.
-		 *
-		 * So do nothing.
-		 */
-		return 0;
-	}
-
-	if (name.name[0] == '.') {
-		/*
-		 * We could potentially refresh the attributes of the directory
-		 * and its parent?
-		 */
-		if (name.len == 1)
-			return 0;
-		if (name.name[1] == '.' && name.len == 2)
-			return 0;
-	}
-
-	if (invalid_nodeid(o->nodeid))
-		return -EIO;
-	if (!fuse_valid_type(o->attr.mode))
-		return -EIO;
-
-	fc = get_fuse_conn(dir);
-
-	name.hash = full_name_hash(parent, name.name, name.len);
-	dentry = d_lookup(parent, &name);
-	if (!dentry) {
-retry:
-		dentry = d_alloc_parallel(parent, &name, &wq);
-		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
-	}
-	if (!d_in_lookup(dentry)) {
-		struct fuse_inode *fi;
-		inode = d_inode(dentry);
-		if (!inode ||
-		    get_node_id(inode) != o->nodeid ||
-		    ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
-			d_invalidate(dentry);
-			dput(dentry);
-			goto retry;
-		}
-		if (is_bad_inode(inode)) {
-			dput(dentry);
-			return -EIO;
-		}
-
-		fi = get_fuse_inode(inode);
-		spin_lock(&fc->lock);
-		fi->nlookup++;
-		spin_unlock(&fc->lock);
-
-		forget_all_cached_acls(inode);
-		fuse_change_attributes(inode, &o->attr,
-				       entry_attr_timeout(o),
-				       attr_version);
-		/*
-		 * The other branch comes via fuse_iget()
-		 * which bumps nlookup inside
-		 */
-	} else {
-		inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
-				  &o->attr, entry_attr_timeout(o),
-				  attr_version);
-		if (!inode)
-			inode = ERR_PTR(-ENOMEM);
-
-		alias = d_splice_alias(inode, dentry);
-		d_lookup_done(dentry);
-		if (alias) {
-			dput(dentry);
-			dentry = alias;
-		}
-		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
-	}
-	if (fc->readdirplus_auto)
-		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
-	fuse_change_entry_timeout(dentry, o);
-
-	dput(dentry);
-	return 0;
-}
-
-static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
-			     struct dir_context *ctx, u64 attr_version)
-{
-	struct fuse_direntplus *direntplus;
-	struct fuse_dirent *dirent;
-	size_t reclen;
-	int over = 0;
-	int ret;
-
-	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
-		direntplus = (struct fuse_direntplus *) buf;
-		dirent = &direntplus->dirent;
-		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
-
-		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
-			return -EIO;
-		if (reclen > nbytes)
-			break;
-		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
-			return -EIO;
-
-		if (!over) {
-			/* We fill entries into dstbuf only as much as
-			   it can hold. But we still continue iterating
-			   over remaining entries to link them. If not,
-			   we need to send a FORGET for each of those
-			   which we did not link.
-			*/
-			over = !dir_emit(ctx, dirent->name, dirent->namelen,
-				       dirent->ino, dirent->type);
-			if (!over)
-				ctx->pos = dirent->off;
-		}
-
-		buf += reclen;
-		nbytes -= reclen;
-
-		ret = fuse_direntplus_link(file, direntplus, attr_version);
-		if (ret)
-			fuse_force_forget(file, direntplus->entry_out.nodeid);
-	}
-
-	return 0;
-}
-
-static int fuse_readdir(struct file *file, struct dir_context *ctx)
-{
-	int plus, err;
-	size_t nbytes;
-	struct page *page;
-	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	u64 attr_version = 0;
-	bool locked;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
+	struct fuse_args_pages ap = {
+		.num_pages = 1,
+		.pages = &page,
+		.descs = &desc,
+	};
+	char *link;
+	ssize_t res;
 
-	if (is_bad_inode(inode))
+	ap.args.opcode = FUSE_READLINK;
+	ap.args.nodeid = get_node_id(inode);
+	ap.args.out_pages = true;
+	ap.args.out_argvar = true;
+	ap.args.page_zeroing = true;
+	ap.args.out_numargs = 1;
+	ap.args.out_args[0].size = desc.length;
+	res = fuse_simple_request(fc, &ap.args);
+
+	fuse_invalidate_atime(inode);
+
+	if (res < 0)
+		return res;
+
+	if (WARN_ON(res >= PAGE_SIZE))
 		return -EIO;
 
-	req = fuse_get_req(fc, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	link = page_address(page);
+	link[res] = '\0';
+
+	return 0;
+}
+
+static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
+				 struct delayed_call *callback)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct page *page;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out_err;
+
+	if (fc->cache_symlinks)
+		return page_get_link(dentry, inode, callback);
+
+	err = -ECHILD;
+	if (!dentry)
+		goto out_err;
 
 	page = alloc_page(GFP_KERNEL);
-	if (!page) {
-		fuse_put_request(fc, req);
-		return -ENOMEM;
+	err = -ENOMEM;
+	if (!page)
+		goto out_err;
+
+	err = fuse_readlink_page(inode, page);
+	if (err) {
+		__free_page(page);
+		goto out_err;
 	}
 
-	plus = fuse_use_readdirplus(inode, ctx);
-	req->out.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = PAGE_SIZE;
-	if (plus) {
-		attr_version = fuse_get_attr_version(fc);
-		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
-			       FUSE_READDIRPLUS);
-	} else {
-		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
-			       FUSE_READDIR);
-	}
-	locked = fuse_lock_inode(inode);
-	fuse_request_send(fc, req);
-	fuse_unlock_inode(inode, locked);
-	nbytes = req->out.args[0].size;
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
-	if (!err) {
-		if (plus) {
-			err = parse_dirplusfile(page_address(page), nbytes,
-						file, ctx,
-						attr_version);
-		} else {
-			err = parse_dirfile(page_address(page), nbytes, file,
-					    ctx);
-		}
-	}
+	set_delayed_call(callback, page_put_link, page);
 
-	__free_page(page);
-	fuse_invalidate_atime(inode);
-	return err;
-}
+	return page_address(page);
 
-static const char *fuse_get_link(struct dentry *dentry,
-				 struct inode *inode,
-				 struct delayed_call *done)
-{
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	FUSE_ARGS(args);
-	char *link;
-	ssize_t ret;
-
-	if (!dentry)
-		return ERR_PTR(-ECHILD);
-
-	link = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!link)
-		return ERR_PTR(-ENOMEM);
-
-	args.in.h.opcode = FUSE_READLINK;
-	args.in.h.nodeid = get_node_id(inode);
-	args.out.argvar = 1;
-	args.out.numargs = 1;
-	args.out.args[0].size = PAGE_SIZE - 1;
-	args.out.args[0].value = link;
-	ret = fuse_simple_request(fc, &args);
-	if (ret < 0) {
-		kfree(link);
-		link = ERR_PTR(ret);
-	} else {
-		link[ret] = '\0';
-		set_delayed_call(done, kfree_link, link);
-	}
-	fuse_invalidate_atime(inode);
-	return link;
+out_err:
+	return ERR_PTR(err);
 }
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
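
The page returned by fuse_get_link() stays pinned until the VFS runs
the delayed call, at which point page_put_link() drops it. A hedged
caller-side sketch of that contract (show_link_target() is an
illustrative name):

	#include <linux/delayed_call.h>
	#include <linux/err.h>
	#include <linux/fs.h>
	#include <linux/printk.h>

	static int show_link_target(struct dentry *dentry, struct inode *inode)
	{
		DEFINE_DELAYED_CALL(done);
		const char *target = inode->i_op->get_link(dentry, inode, &done);
		int err = 0;

		if (IS_ERR(target))
			err = PTR_ERR(target);
		else
			pr_debug("symlink -> %s\n", target);
		do_delayed_call(&done);	/* drops the page via page_put_link() */
		return err;
	}
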
@@ -1447,7 +1284,25 @@
 static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync)
 {
-	return fuse_fsync_common(file, start, end, datasync, 1);
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	if (fc->no_fsyncdir)
+		return 0;
+
+	inode_lock(inode);
+	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR);
+	if (err == -ENOSYS) {
+		fc->no_fsyncdir = 1;
+		err = 0;
+	}
+	inode_unlock(inode);
+
+	return err;
 }
 
 static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
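
fuse_dir_fsync() above follows fuse's usual -ENOSYS convention: the
first such reply latches a "not implemented" bit on the connection and
later calls short-circuit to success. Schematically (a sketch;
fuse_request_once() is an illustrative helper, and the real no_* flags
are bitfields on struct fuse_conn rather than a plain int):

	static int fuse_request_once(struct fuse_conn *fc,
				     struct fuse_args *args,
				     unsigned int *no_op)
	{
		int err;

		if (*no_op)
			return 0;	/* userspace already said -ENOSYS */
		err = fuse_simple_request(fc, args);
		if (err == -ENOSYS) {
			*no_op = 1;	/* remember for the connection's lifetime */
			err = 0;
		}
		return err;
	}
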
@@ -1534,15 +1389,14 @@
  */
 void fuse_set_nowrite(struct inode *inode)
 {
-	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	BUG_ON(!inode_is_locked(inode));
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	BUG_ON(fi->writectr < 0);
 	fi->writectr += FUSE_NOWRITE;
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
 }
 
@@ -1563,11 +1417,11 @@
 
 void fuse_release_nowrite(struct inode *inode)
 {
-	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	__fuse_release_nowrite(inode);
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 }
 
 static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
@@ -1575,14 +1429,14 @@
 			      struct fuse_setattr_in *inarg_p,
 			      struct fuse_attr_out *outarg_p)
 {
-	args->in.h.opcode = FUSE_SETATTR;
-	args->in.h.nodeid = get_node_id(inode);
-	args->in.numargs = 1;
-	args->in.args[0].size = sizeof(*inarg_p);
-	args->in.args[0].value = inarg_p;
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(*outarg_p);
-	args->out.args[0].value = outarg_p;
+	args->opcode = FUSE_SETATTR;
+	args->nodeid = get_node_id(inode);
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(*inarg_p);
+	args->in_args[0].value = inarg_p;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(*outarg_p);
+	args->out_args[0].value = outarg_p;
 }
 
 /*
@@ -1662,8 +1516,24 @@
 		file = NULL;
 	}
 
-	if (attr->ia_valid & ATTR_SIZE)
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (WARN_ON(!S_ISREG(inode->i_mode)))
+			return -EIO;
 		is_truncate = true;
+	}
+
+	/* Flush dirty data/metadata before non-truncate SETATTR */
+	if (is_wb && S_ISREG(inode->i_mode) &&
+	    attr->ia_valid &
+			(ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+			 ATTR_TIMES_SET)) {
+		err = write_inode_now(inode, true);
+		if (err)
+			return err;
+
+		fuse_set_nowrite(inode);
+		fuse_release_nowrite(inode);
+	}
 
 	if (is_truncate) {
 		fuse_set_nowrite(inode);
@@ -1699,7 +1569,7 @@
 		goto error;
 	}
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	/* the kernel maintains i_mtime locally */
 	if (trust_local_cmtime) {
 		if (attr->ia_valid & ATTR_MTIME)
@@ -1717,10 +1587,10 @@
 		i_size_write(inode, outarg.attr.size);
 
 	if (is_truncate) {
-		/* NOTE: this may release/reacquire fc->lock */
+		/* NOTE: this may release/reacquire fi->lock */
 		__fuse_release_nowrite(inode);
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	/*
 	 * Only call invalidate_inode_pages2() after removing
@@ -1811,7 +1681,7 @@
 	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
-	return fuse_update_get_attr(inode, NULL, stat, flags);
+	return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
 }
 
 static const struct inode_operations fuse_dir_inode_operations = {
@@ -1867,11 +1737,37 @@
 
 void fuse_init_dir(struct inode *inode)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
 	inode->i_op = &fuse_dir_inode_operations;
 	inode->i_fop = &fuse_dir_operations;
+
+	spin_lock_init(&fi->rdc.lock);
+	fi->rdc.cached = false;
+	fi->rdc.size = 0;
+	fi->rdc.pos = 0;
+	fi->rdc.version = 0;
 }
 
+static int fuse_symlink_readpage(struct file *null, struct page *page)
+{
+	int err = fuse_readlink_page(page->mapping->host, page);
+
+	if (!err)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+
+	return err;
+}
+
+static const struct address_space_operations fuse_symlink_aops = {
+	.readpage	= fuse_symlink_readpage,
+};
+
 void fuse_init_symlink(struct inode *inode)
 {
 	inode->i_op = &fuse_symlink_inode_operations;
+	inode->i_data.a_ops = &fuse_symlink_aops;
+	inode_nohighmem(inode);
 }
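
Symlink caching is opt-in: fc->cache_symlinks is only set when the
daemon advertises support during FUSE_INIT. A hedged daemon-side sketch
(FUSE_CACHE_SYMLINKS comes from <linux/fuse.h>, protocol 7.28+):

	#include <linux/fuse.h>

	static void init_reply_flags(struct fuse_init_out *out)
	{
		/* let the kernel cache readlink results in the page cache */
		out->flags |= FUSE_CACHE_SYMLINKS;
	}
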
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fbd6978..db48a5c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,7 +19,17 @@
 #include <linux/falloc.h>
 #include <linux/uio.h>
 
-static const struct file_operations fuse_direct_io_file_operations;
+static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
+				      struct fuse_page_desc **desc)
+{
+	struct page **pages;
+
+	pages = kzalloc(npages * (sizeof(struct page *) +
+				  sizeof(struct fuse_page_desc)), flags);
+	*desc = (void *) (pages + npages);
+
+	return pages;
+}
 
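fuse_pages_alloc() mirrors the fuse_retrieve() layout trick in dev.c:
one allocation backs both the page-pointer array and the matching
descriptor array, so a single kfree() releases both. A usage sketch
(example_alloc() is an illustrative name):

	#include <linux/slab.h>

	static int example_alloc(unsigned int npages)
	{
		struct fuse_page_desc *descs;
		struct page **pages = fuse_pages_alloc(npages, GFP_KERNEL, &descs);

		if (!pages)
			return -ENOMEM;
		/* pages[i] pairs with descs[i] */
		kfree(pages);
		return 0;
	}
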
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
@@ -31,48 +41,55 @@
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
 	if (!fc->atomic_o_trunc)
 		inarg.flags &= ~O_TRUNC;
-	args.in.h.opcode = opcode;
-	args.in.h.nodeid = nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(*outargp);
-	args.out.args[0].value = outargp;
+	args.opcode = opcode;
+	args.nodeid = nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(*outargp);
+	args.out_args[0].value = outargp;
 
 	return fuse_simple_request(fc, &args);
 }
 
+struct fuse_release_args {
+	struct fuse_args args;
+	struct fuse_release_in inarg;
+	struct inode *inode;
+};
+
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
 	struct fuse_file *ff;
 
-	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!ff))
 		return NULL;
 
 	ff->fc = fc;
-	ff->reserved_req = fuse_request_alloc(0);
-	if (unlikely(!ff->reserved_req)) {
+	ff->release_args = kzalloc(sizeof(*ff->release_args),
+				   GFP_KERNEL_ACCOUNT);
+	if (!ff->release_args) {
 		kfree(ff);
 		return NULL;
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
+	mutex_init(&ff->readdir.lock);
 	refcount_set(&ff->count, 1);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
-	spin_lock(&fc->lock);
-	ff->kh = ++fc->khctr;
-	spin_unlock(&fc->lock);
+	ff->kh = atomic64_inc_return(&fc->khctr);
 
 	return ff;
 }
 
 void fuse_file_free(struct fuse_file *ff)
 {
-	fuse_request_free(ff->reserved_req);
+	kfree(ff->release_args);
+	mutex_destroy(&ff->readdir.lock);
 	kfree(ff);
 }
 
@@ -82,34 +99,31 @@
 	return ff;
 }
 
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+			     int error)
 {
-	iput(req->misc.release.inode);
+	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
+
+	iput(ra->inode);
+	kfree(ra);
 }
 
 static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 {
 	if (refcount_dec_and_test(&ff->count)) {
-		struct fuse_req *req = ff->reserved_req;
+		struct fuse_args *args = &ff->release_args->args;
 
-		if (ff->fc->no_open && !isdir) {
-			/*
-			 * Drop the release request when client does not
-			 * implement 'open'
-			 */
-			__clear_bit(FR_BACKGROUND, &req->flags);
-			iput(req->misc.release.inode);
-			fuse_put_request(ff->fc, req);
+		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+			/* Do nothing when client does not implement 'open' */
+			fuse_release_end(ff->fc, args, 0);
 		} else if (sync) {
-			__set_bit(FR_FORCE, &req->flags);
-			__clear_bit(FR_BACKGROUND, &req->flags);
-			fuse_request_send(ff->fc, req);
-			iput(req->misc.release.inode);
-			fuse_put_request(ff->fc, req);
+			fuse_simple_request(ff->fc, args);
+			fuse_release_end(ff->fc, args, 0);
 		} else {
-			req->end = fuse_release_end;
-			__set_bit(FR_BACKGROUND, &req->flags);
-			fuse_request_send_background(ff->fc, req);
+			args->end = fuse_release_end;
+			if (fuse_simple_background(ff->fc, args,
+						   GFP_KERNEL | __GFP_NOFAIL))
+				fuse_release_end(ff->fc, args, -ENOTCONN);
 		}
 		kfree(ff);
 	}
@@ -126,8 +140,9 @@
 		return -ENOMEM;
 
 	ff->fh = 0;
-	ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
-	if (!fc->no_open || isdir) {
+	/* Default for no-open */
+	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+	if (isdir ? !fc->no_opendir : !fc->no_open) {
 		struct fuse_open_out outarg;
 		int err;
 
@@ -136,11 +151,14 @@
 			ff->fh = outarg.fh;
 			ff->open_flags = outarg.open_flags;
 
-		} else if (err != -ENOSYS || isdir) {
+		} else if (err != -ENOSYS) {
 			fuse_file_free(ff);
 			return err;
 		} else {
-			fc->no_open = 1;
+			if (isdir)
+				fc->no_opendir = 1;
+			else
+				fc->no_open = 1;
 		}
 	}
 
@@ -157,17 +175,16 @@
 static void fuse_link_write_file(struct file *file)
 {
 	struct inode *inode = file_inode(file);
-	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_file *ff = file->private_data;
 	/*
 	 * file may be written through mmap, so chain it onto the
 	 * inode's write_files list
 	 */
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	if (list_empty(&ff->write_entry))
 		list_add(&ff->write_entry, &fi->write_files);
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 }
 
 void fuse_finish_open(struct inode *inode, struct file *file)
@@ -175,19 +192,19 @@
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (ff->open_flags & FOPEN_DIRECT_IO)
-		file->f_op = &fuse_direct_io_file_operations;
 	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 		invalidate_inode_pages2(inode->i_mapping);
-	if (ff->open_flags & FOPEN_NONSEEKABLE)
+	if (ff->open_flags & FOPEN_STREAM)
+		stream_open(inode, file);
+	else if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		spin_lock(&fc->lock);
-		fi->attr_version = ++fc->attr_version;
+		spin_lock(&fi->lock);
+		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		i_size_write(inode, 0);
-		spin_unlock(&fc->lock);
+		spin_unlock(&fi->lock);
 		fuse_invalidate_attr(inode);
 		if (fc->writeback_cache)
 			file_update_time(file);
@@ -200,7 +217,7 @@
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
-	bool lock_inode = (file->f_flags & O_TRUNC) &&
+	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
 			  fc->atomic_o_trunc &&
 			  fc->writeback_cache;
 
@@ -208,59 +225,70 @@
 	if (err)
 		return err;
 
-	if (lock_inode)
+	if (is_wb_truncate) {
 		inode_lock(inode);
+		fuse_set_nowrite(inode);
+	}
 
 	err = fuse_do_open(fc, get_node_id(inode), file, isdir);
 
 	if (!err)
 		fuse_finish_open(inode, file);
 
-	if (lock_inode)
+	if (is_wb_truncate) {
+		fuse_release_nowrite(inode);
 		inode_unlock(inode);
+	}
 
 	return err;
 }
 
-static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+				 int flags, int opcode)
 {
 	struct fuse_conn *fc = ff->fc;
-	struct fuse_req *req = ff->reserved_req;
-	struct fuse_release_in *inarg = &req->misc.release.in;
+	struct fuse_release_args *ra = ff->release_args;
 
+	/* Inode is NULL on error path of fuse_create_open() */
+	if (likely(fi)) {
+		spin_lock(&fi->lock);
+		list_del(&ff->write_entry);
+		spin_unlock(&fi->lock);
+	}
 	spin_lock(&fc->lock);
-	list_del(&ff->write_entry);
 	if (!RB_EMPTY_NODE(&ff->polled_node))
 		rb_erase(&ff->polled_node, &fc->polled_files);
 	spin_unlock(&fc->lock);
 
 	wake_up_interruptible_all(&ff->poll_wait);
 
-	inarg->fh = ff->fh;
-	inarg->flags = flags;
-	req->in.h.opcode = opcode;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct fuse_release_in);
-	req->in.args[0].value = inarg;
+	ra->inarg.fh = ff->fh;
+	ra->inarg.flags = flags;
+	ra->args.in_numargs = 1;
+	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
+	ra->args.in_args[0].value = &ra->inarg;
+	ra->args.opcode = opcode;
+	ra->args.nodeid = ff->nodeid;
+	ra->args.force = true;
+	ra->args.nocreds = true;
 }
 
 void fuse_release_common(struct file *file, bool isdir)
 {
+	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
 	struct fuse_file *ff = file->private_data;
-	struct fuse_req *req = ff->reserved_req;
+	struct fuse_release_args *ra = ff->release_args;
 	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 
-	fuse_prepare_release(ff, file->f_flags, opcode);
+	fuse_prepare_release(fi, ff, file->f_flags, opcode);
 
 	if (ff->flock) {
-		struct fuse_release_in *inarg = &req->misc.release.in;
-		inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
-		inarg->lock_owner = fuse_lock_owner_id(ff->fc,
-						       (fl_owner_t) file);
+		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+							  (fl_owner_t) file);
 	}
 	/* Hold inode until release is finished */
-	req->misc.release.inode = igrab(file_inode(file));
+	ra->inode = igrab(file_inode(file));
 
 	/*
 	 * Normally this will send the RELEASE request, however if
@@ -271,7 +299,7 @@
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir);
+	fuse_file_put(ff, ff->fc->destroy, isdir);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -293,10 +321,10 @@
 	return 0;
 }
 
-void fuse_sync_release(struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
 {
 	WARN_ON(refcount_read(&ff->count) > 1);
-	fuse_prepare_release(ff, flags, FUSE_RELEASE);
+	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
 	/*
 	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
 	 * synchronous, we are fine with not doing igrab() here
@@ -327,6 +355,32 @@
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
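+/*
+ * An in-flight writeback request: the FUSE_WRITE arguments plus the links
+ * into fi->writepages and fi->queued_writes, and a chain of auxiliary
+ * requests covering the same pages.
+ */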
+struct fuse_writepage_args {
+	struct fuse_io_args ia;
+	struct list_head writepages_entry;
+	struct list_head queue_entry;
+	struct fuse_writepage_args *next;
+	struct inode *inode;
+};
+
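+/*
+ * Find the first in-flight writepage request overlapping the page range
+ * [idx_from, idx_to].  Caller must hold fi->lock.
+ */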
+static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
+					    pgoff_t idx_from, pgoff_t idx_to)
+{
+	struct fuse_writepage_args *wpa;
+
+	list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		WARN_ON(get_fuse_inode(wpa->inode) != fi);
+		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
+		if (idx_from < curr_index + wpa->ia.ap.num_pages &&
+		    curr_index <= idx_to) {
+			return wpa;
+		}
+	}
+	return NULL;
+}
+
 /*
  * Check if any page in a range is under writeback
  *
@@ -336,24 +390,12 @@
 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 				   pgoff_t idx_to)
 {
-	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_req *req;
-	bool found = false;
+	bool found;
 
-	spin_lock(&fc->lock);
-	list_for_each_entry(req, &fi->writepages, writepages_entry) {
-		pgoff_t curr_index;
-
-		BUG_ON(req->inode != inode);
-		curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
-		if (idx_from < curr_index + req->num_pages &&
-		    curr_index <= idx_to) {
-			found = true;
-			break;
-		}
-	}
-	spin_unlock(&fc->lock);
+	spin_lock(&fi->lock);
+	found = fuse_find_writeback(fi, idx_from, idx_to);
+	spin_unlock(&fi->lock);
 
 	return found;
 }
@@ -369,12 +411,11 @@
  * Since fuse doesn't rely on the VM writeback tracking, this has to
  * use some other means.
  */
-static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-	return 0;
 }
 
 /*
@@ -397,8 +438,8 @@
 	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
-	struct fuse_req *req;
 	struct fuse_flush_in inarg;
+	FUSE_ARGS(args);
 	int err;
 
 	if (is_bad_inode(inode))
@@ -419,19 +460,17 @@
 	if (err)
 		return err;
 
-	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.lock_owner = fuse_lock_owner_id(fc, id);
-	req->in.h.opcode = FUSE_FLUSH;
-	req->in.h.nodeid = get_node_id(inode);
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	__set_bit(FR_FORCE, &req->flags);
-	fuse_request_send(fc, req);
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
+	args.opcode = FUSE_FLUSH;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.force = true;
+
+	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_flush = 1;
 		err = 0;
@@ -440,13 +479,30 @@
 }
 
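+/* Send an fsync request; @opcode selects FUSE_FSYNC or FUSE_FSYNCDIR */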
 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
-		      int datasync, int isdir)
+		      int datasync, int opcode)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
 	FUSE_ARGS(args);
 	struct fuse_fsync_in inarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
+	args.opcode = opcode;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	return fuse_simple_request(fc, &args);
+}
+
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+		      int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
 	if (is_bad_inode(inode))
@@ -478,65 +534,49 @@
 	if (err)
 		goto out;
 
-	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+	if (fc->no_fsync)
 		goto out;
 
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.fh = ff->fh;
-	inarg.fsync_flags = datasync ? 1 : 0;
-	args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	err = fuse_simple_request(fc, &args);
+	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
 	if (err == -ENOSYS) {
-		if (isdir)
-			fc->no_fsyncdir = 1;
-		else
-			fc->no_fsync = 1;
+		fc->no_fsync = 1;
 		err = 0;
 	}
 out:
 	inode_unlock(inode);
+
 	return err;
 }
 
-static int fuse_fsync(struct file *file, loff_t start, loff_t end,
-		      int datasync)
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+			 size_t count, int opcode)
 {
-	return fuse_fsync_common(file, start, end, datasync, 0);
-}
-
-void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
-		    size_t count, int opcode)
-{
-	struct fuse_read_in *inarg = &req->misc.read.in;
 	struct fuse_file *ff = file->private_data;
+	struct fuse_args *args = &ia->ap.args;
 
-	inarg->fh = ff->fh;
-	inarg->offset = pos;
-	inarg->size = count;
-	inarg->flags = file->f_flags;
-	req->in.h.opcode = opcode;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct fuse_read_in);
-	req->in.args[0].value = inarg;
-	req->out.argvar = 1;
-	req->out.numargs = 1;
-	req->out.args[0].size = count;
+	ia->read.in.fh = ff->fh;
+	ia->read.in.offset = pos;
+	ia->read.in.size = count;
+	ia->read.in.flags = file->f_flags;
+	args->opcode = opcode;
+	args->nodeid = ff->nodeid;
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(ia->read.in);
+	args->in_args[0].value = &ia->read.in;
+	args->out_argvar = true;
+	args->out_numargs = 1;
+	args->out_args[0].size = count;
 }
 
-static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
+static void fuse_release_user_pages(struct fuse_args_pages *ap,
+				    bool should_dirty)
 {
-	unsigned i;
+	unsigned int i;
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
+	for (i = 0; i < ap->num_pages; i++) {
 		if (should_dirty)
-			set_page_dirty_lock(page);
-		put_page(page);
+			set_page_dirty_lock(ap->pages[i]);
+		put_page(ap->pages[i]);
 	}
 }
 
@@ -595,9 +635,9 @@
 			struct fuse_conn *fc = get_fuse_conn(inode);
 			struct fuse_inode *fi = get_fuse_inode(inode);
 
-			spin_lock(&fc->lock);
-			fi->attr_version = ++fc->attr_version;
-			spin_unlock(&fc->lock);
+			spin_lock(&fi->lock);
+			fi->attr_version = atomic64_inc_return(&fc->attr_version);
+			spin_unlock(&fi->lock);
 		}
 
 		io->iocb->ki_complete(io->iocb, res, 0);
@@ -606,64 +646,94 @@
 	kref_put(&io->refcnt, fuse_io_release);
 }
 
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
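+/* Allocate io args together with a page array for @npages; NULL on failure */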
+static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
+					  unsigned int npages)
 {
-	struct fuse_io_priv *io = req->io;
-	ssize_t pos = -1;
+	struct fuse_io_args *ia;
 
-	fuse_release_user_pages(req, io->should_dirty);
-
-	if (io->write) {
-		if (req->misc.write.in.size != req->misc.write.out.size)
-			pos = req->misc.write.in.offset - io->offset +
-				req->misc.write.out.size;
-	} else {
-		if (req->misc.read.in.size != req->out.args[0].size)
-			pos = req->misc.read.in.offset - io->offset +
-				req->out.args[0].size;
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+	if (ia) {
+		ia->io = io;
+		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
+						&ia->ap.descs);
+		if (!ia->ap.pages) {
+			kfree(ia);
+			ia = NULL;
+		}
 	}
-
-	fuse_aio_complete(io, req->out.h.error, pos);
+	return ia;
 }
 
-static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
-		size_t num_bytes, struct fuse_io_priv *io)
+static void fuse_io_free(struct fuse_io_args *ia)
 {
+	kfree(ia->ap.pages);
+	kfree(ia);
+}
+
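+/*
+ * Completion for async reads and writes: release the pinned user pages,
+ * work out how far the transfer got before a short count or error, and
+ * pass the result to the common AIO completion code.
+ */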
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+				  int err)
+{
+	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_io_priv *io = ia->io;
+	ssize_t pos = -1;
+
+	fuse_release_user_pages(&ia->ap, io->should_dirty);
+
+	if (err) {
+		/* Nothing */
+	} else if (io->write) {
+		if (ia->write.out.size > ia->write.in.size) {
+			err = -EIO;
+		} else if (ia->write.in.size != ia->write.out.size) {
+			pos = ia->write.in.offset - io->offset +
+				ia->write.out.size;
+		}
+	} else {
+		u32 outsize = args->out_args[0].size;
+
+		if (ia->read.in.size != outsize)
+			pos = ia->read.in.offset - io->offset + outsize;
+	}
+
+	fuse_aio_complete(io, err, pos);
+	fuse_io_free(ia);
+}
+
+static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+				   struct fuse_io_args *ia, size_t num_bytes)
+{
+	ssize_t err;
+	struct fuse_io_priv *io = ia->io;
+
 	spin_lock(&io->lock);
 	kref_get(&io->refcnt);
 	io->size += num_bytes;
 	io->reqs++;
 	spin_unlock(&io->lock);
 
-	req->io = io;
-	req->end = fuse_aio_complete_req;
+	ia->ap.args.end = fuse_aio_complete_req;
+	err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
 
-	__fuse_get_request(req);
-	fuse_request_send_background(fc, req);
-
-	return num_bytes;
+	return err ?: num_bytes;
 }
 
-static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
-			     loff_t pos, size_t count, fl_owner_t owner)
+static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
+			      fl_owner_t owner)
 {
-	struct file *file = io->iocb->ki_filp;
+	struct file *file = ia->io->iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
-	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	if (owner != NULL) {
-		struct fuse_read_in *inarg = &req->misc.read.in;
-
-		inarg->read_flags |= FUSE_READ_LOCKOWNER;
-		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
+		ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
 	}
 
-	if (io->async)
-		return fuse_async_req_send(fc, req, count, io);
+	if (ia->io->async)
+		return fuse_async_req_send(fc, ia, count);
 
-	fuse_request_send(fc, req);
-	return req->out.args[0].size;
+	return fuse_simple_request(fc, &ia->ap.args);
 }
 
 static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -672,19 +742,18 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	if (attr_ver == fi->attr_version && size < inode->i_size &&
 	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
-		fi->attr_version = ++fc->attr_version;
+		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		i_size_write(inode, size);
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 }
 
-static void fuse_short_read(struct fuse_req *req, struct inode *inode,
-			    u64 attr_ver)
+static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
+			    struct fuse_args_pages *ap)
 {
-	size_t num_read = req->out.args[0].size;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	if (fc->writeback_cache) {
@@ -697,28 +766,31 @@
 		int start_idx = num_read >> PAGE_SHIFT;
 		size_t off = num_read & (PAGE_SIZE - 1);
 
-		for (i = start_idx; i < req->num_pages; i++) {
-			zero_user_segment(req->pages[i], off, PAGE_SIZE);
+		for (i = start_idx; i < ap->num_pages; i++) {
+			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
 			off = 0;
 		}
 	} else {
-		loff_t pos = page_offset(req->pages[0]) + num_read;
+		loff_t pos = page_offset(ap->pages[0]) + num_read;
 		fuse_read_update_size(inode, pos, attr_ver);
 	}
 }
 
 static int fuse_do_readpage(struct file *file, struct page *page)
 {
-	struct kiocb iocb;
-	struct fuse_io_priv io;
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	size_t num_read;
 	loff_t pos = page_offset(page);
-	size_t count = PAGE_SIZE;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE };
+	struct fuse_io_args ia = {
+		.ap.args.page_zeroing = true,
+		.ap.args.out_pages = true,
+		.ap.num_pages = 1,
+		.ap.pages = &page,
+		.ap.descs = &desc,
+	};
+	ssize_t res;
 	u64 attr_ver;
-	int err;
 
 	/*
 	 * Page writeback can extend beyond the lifetime of the
@@ -727,35 +799,21 @@
 	 */
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	req = fuse_get_req(fc, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
 	attr_ver = fuse_get_attr_version(fc);
 
-	req->out.page_zeroing = 1;
-	req->out.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = count;
-	init_sync_kiocb(&iocb, file);
-	io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb);
-	num_read = fuse_send_read(req, &io, pos, count, NULL);
-	err = req->out.h.error;
+	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
+	res = fuse_simple_request(fc, &ia.ap.args);
+	if (res < 0)
+		return res;
+	/*
+	 * Short read means EOF.  If file size is larger, truncate it
+	 */
+	if (res < desc.length)
+		fuse_short_read(inode, attr_ver, res, &ia.ap);
 
-	if (!err) {
-		/*
-		 * Short read means EOF.  If file size is larger, truncate it
-		 */
-		if (num_read < count)
-			fuse_short_read(req, inode, attr_ver);
+	SetPageUptodate(page);
 
-		SetPageUptodate(page);
-	}
-
-	fuse_put_request(fc, req);
-
-	return err;
+	return 0;
 }
 
 static int fuse_readpage(struct file *file, struct page *page)
@@ -774,15 +832,18 @@
 	return err;
 }
 
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
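+/*
+ * Completion for readpages: handle a short read (EOF), mark each page up
+ * to date or errored, then drop the page and file references.
+ */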
+static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+			       int err)
 {
 	int i;
-	size_t count = req->misc.read.in.size;
-	size_t num_read = req->out.args[0].size;
+	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_args_pages *ap = &ia->ap;
+	size_t count = ia->read.in.size;
+	size_t num_read = args->out_args[0].size;
 	struct address_space *mapping = NULL;
 
-	for (i = 0; mapping == NULL && i < req->num_pages; i++)
-		mapping = req->pages[i]->mapping;
+	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
+		mapping = ap->pages[i]->mapping;
 
 	if (mapping) {
 		struct inode *inode = mapping->host;
@@ -790,93 +851,97 @@
 		/*
 		 * Short read means EOF. If file size is larger, truncate it
 		 */
-		if (!req->out.h.error && num_read < count)
-			fuse_short_read(req, inode, req->misc.read.attr_ver);
+		if (!err && num_read < count)
+			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
 
 		fuse_invalidate_atime(inode);
 	}
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
-		if (!req->out.h.error)
+	for (i = 0; i < ap->num_pages; i++) {
+		struct page *page = ap->pages[i];
+
+		if (!err)
 			SetPageUptodate(page);
 		else
 			SetPageError(page);
 		unlock_page(page);
 		put_page(page);
 	}
-	if (req->ff)
-		fuse_file_put(req->ff, false, false);
+	if (ia->ff)
+		fuse_file_put(ia->ff, false, false);
+
+	fuse_io_free(ia);
 }
 
-static void fuse_send_readpages(struct fuse_req *req, struct file *file)
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
-	loff_t pos = page_offset(req->pages[0]);
-	size_t count = req->num_pages << PAGE_SHIFT;
+	struct fuse_args_pages *ap = &ia->ap;
+	loff_t pos = page_offset(ap->pages[0]);
+	size_t count = ap->num_pages << PAGE_SHIFT;
+	int err;
 
-	req->out.argpages = 1;
-	req->out.page_zeroing = 1;
-	req->out.page_replace = 1;
-	fuse_read_fill(req, file, pos, count, FUSE_READ);
-	req->misc.read.attr_ver = fuse_get_attr_version(fc);
+	ap->args.out_pages = true;
+	ap->args.page_zeroing = true;
+	ap->args.page_replace = true;
+	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
+	ia->read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
-		req->ff = fuse_file_get(ff);
-		req->end = fuse_readpages_end;
-		fuse_request_send_background(fc, req);
+		ia->ff = fuse_file_get(ff);
+		ap->args.end = fuse_readpages_end;
+		err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+		if (!err)
+			return;
 	} else {
-		fuse_request_send(fc, req);
-		fuse_readpages_end(fc, req);
-		fuse_put_request(fc, req);
+		err = fuse_simple_request(fc, &ap->args);
 	}
+	fuse_readpages_end(fc, &ap->args, err);
 }
 
 struct fuse_fill_data {
-	struct fuse_req *req;
+	struct fuse_io_args *ia;
 	struct file *file;
 	struct inode *inode;
-	unsigned nr_pages;
+	unsigned int nr_pages;
+	unsigned int max_pages;
 };
 
 static int fuse_readpages_fill(void *_data, struct page *page)
 {
 	struct fuse_fill_data *data = _data;
-	struct fuse_req *req = data->req;
+	struct fuse_io_args *ia = data->ia;
+	struct fuse_args_pages *ap = &ia->ap;
 	struct inode *inode = data->inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	if (req->num_pages &&
-	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
-	     (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
-	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
-		int nr_alloc = min_t(unsigned, data->nr_pages,
-				     FUSE_MAX_PAGES_PER_REQ);
-		fuse_send_readpages(req, data->file);
-		if (fc->async_read)
-			req = fuse_get_req_for_background(fc, nr_alloc);
-		else
-			req = fuse_get_req(fc, nr_alloc);
-
-		data->req = req;
-		if (IS_ERR(req)) {
+	if (ap->num_pages &&
+	    (ap->num_pages == fc->max_pages ||
+	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
+	     ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
+		data->max_pages = min_t(unsigned int, data->nr_pages,
+					fc->max_pages);
+		fuse_send_readpages(ia, data->file);
+		data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
+		if (!ia) {
 			unlock_page(page);
-			return PTR_ERR(req);
+			return -ENOMEM;
 		}
+		ap = &ia->ap;
 	}
 
-	if (WARN_ON(req->num_pages >= req->max_pages)) {
+	if (WARN_ON(ap->num_pages >= data->max_pages)) {
 		unlock_page(page);
-		fuse_put_request(fc, req);
+		fuse_io_free(ia);
 		return -EIO;
 	}
 
 	get_page(page);
-	req->pages[req->num_pages] = page;
-	req->page_descs[req->num_pages].length = PAGE_SIZE;
-	req->num_pages++;
+	ap->pages[ap->num_pages] = page;
+	ap->descs[ap->num_pages].length = PAGE_SIZE;
+	ap->num_pages++;
 	data->nr_pages--;
 	return 0;
 }
@@ -888,7 +953,6 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_fill_data data;
 	int err;
-	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
 
 	err = -EIO;
 	if (is_bad_inode(inode))
@@ -896,27 +960,26 @@
 
 	data.file = file;
 	data.inode = inode;
-	if (fc->async_read)
-		data.req = fuse_get_req_for_background(fc, nr_alloc);
-	else
-		data.req = fuse_get_req(fc, nr_alloc);
 	data.nr_pages = nr_pages;
-	err = PTR_ERR(data.req);
-	if (IS_ERR(data.req))
+	data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
+	data.ia = fuse_io_alloc(NULL, data.max_pages);
+	err = -ENOMEM;
+	if (!data.ia)
 		goto out;
 
 	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
 	if (!err) {
-		if (data.req->num_pages)
-			fuse_send_readpages(data.req, file);
+		if (data.ia->ap.num_pages)
+			fuse_send_readpages(data.ia, file);
 		else
-			fuse_put_request(fc, data.req);
+			fuse_io_free(data.ia);
 	}
 out:
 	return err;
 }
 
-static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -937,54 +1000,65 @@
 	return generic_file_read_iter(iocb, to);
 }
 
-static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
-			    loff_t pos, size_t count)
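+/* Fill in the FUSE_WRITE request and reply arguments for @ia */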
+static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
+				 loff_t pos, size_t count)
 {
-	struct fuse_write_in *inarg = &req->misc.write.in;
-	struct fuse_write_out *outarg = &req->misc.write.out;
+	struct fuse_args *args = &ia->ap.args;
 
-	inarg->fh = ff->fh;
-	inarg->offset = pos;
-	inarg->size = count;
-	req->in.h.opcode = FUSE_WRITE;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 2;
+	ia->write.in.fh = ff->fh;
+	ia->write.in.offset = pos;
+	ia->write.in.size = count;
+	args->opcode = FUSE_WRITE;
+	args->nodeid = ff->nodeid;
+	args->in_numargs = 2;
 	if (ff->fc->minor < 9)
-		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
+		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
 	else
-		req->in.args[0].size = sizeof(struct fuse_write_in);
-	req->in.args[0].value = inarg;
-	req->in.args[1].size = count;
-	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(struct fuse_write_out);
-	req->out.args[0].value = outarg;
+		args->in_args[0].size = sizeof(ia->write.in);
+	args->in_args[0].value = &ia->write.in;
+	args->in_args[1].size = count;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(ia->write.out);
+	args->out_args[0].value = &ia->write.out;
 }
 
-static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
-			      loff_t pos, size_t count, fl_owner_t owner)
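+/* Open flags reported to the server, with per-iocb sync flags folded in */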
+static unsigned int fuse_write_flags(struct kiocb *iocb)
 {
-	struct kiocb *iocb = io->iocb;
+	unsigned int flags = iocb->ki_filp->f_flags;
+
+	if (iocb->ki_flags & IOCB_DSYNC)
+		flags |= O_DSYNC;
+	if (iocb->ki_flags & IOCB_SYNC)
+		flags |= O_SYNC;
+
+	return flags;
+}
+
+static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
+			       size_t count, fl_owner_t owner)
+{
+	struct kiocb *iocb = ia->io->iocb;
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
-	struct fuse_write_in *inarg = &req->misc.write.in;
+	struct fuse_write_in *inarg = &ia->write.in;
+	ssize_t err;
 
-	fuse_write_fill(req, ff, pos, count);
-	inarg->flags = file->f_flags;
-	if (iocb->ki_flags & IOCB_DSYNC)
-		inarg->flags |= O_DSYNC;
-	if (iocb->ki_flags & IOCB_SYNC)
-		inarg->flags |= O_SYNC;
+	fuse_write_args_fill(ia, ff, pos, count);
+	inarg->flags = fuse_write_flags(iocb);
 	if (owner != NULL) {
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
 
-	if (io->async)
-		return fuse_async_req_send(fc, req, count, io);
+	if (ia->io->async)
+		return fuse_async_req_send(fc, ia, count);
 
-	fuse_request_send(fc, req);
-	return req->misc.write.out.size;
+	err = fuse_simple_request(fc, &ia->ap.args);
+	if (!err && ia->write.out.size > count)
+		err = -EIO;
+
+	return err ?: ia->write.out.size;
 }
 
 bool fuse_write_update_size(struct inode *inode, loff_t pos)
@@ -993,37 +1067,42 @@
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	bool ret = false;
 
-	spin_lock(&fc->lock);
-	fi->attr_version = ++fc->attr_version;
+	spin_lock(&fi->lock);
+	fi->attr_version = atomic64_inc_return(&fc->attr_version);
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
 		ret = true;
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	return ret;
 }
 
-static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb,
-				    struct inode *inode, loff_t pos,
-				    size_t count)
+static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
+				     struct kiocb *iocb, struct inode *inode,
+				     loff_t pos, size_t count)
 {
-	size_t res;
-	unsigned offset;
-	unsigned i;
-	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+	struct fuse_args_pages *ap = &ia->ap;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	unsigned int offset, i;
+	int err;
 
-	for (i = 0; i < req->num_pages; i++)
-		fuse_wait_on_page_writeback(inode, req->pages[i]->index);
+	for (i = 0; i < ap->num_pages; i++)
+		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
 
-	res = fuse_send_write(req, &io, pos, count, NULL);
+	fuse_write_args_fill(ia, ff, pos, count);
+	ia->write.in.flags = fuse_write_flags(iocb);
 
-	offset = req->page_descs[0].offset;
-	count = res;
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
+	err = fuse_simple_request(fc, &ap->args);
 
-		if (!req->out.h.error && !offset && count >= PAGE_SIZE)
+	offset = ap->descs[0].offset;
+	count = ia->write.out.size;
+	for (i = 0; i < ap->num_pages; i++) {
+		struct page *page = ap->pages[i];
+
+		if (!err && !offset && count >= PAGE_SIZE)
 			SetPageUptodate(page);
 
 		if (count > PAGE_SIZE - offset)
@@ -1036,20 +1115,21 @@
 		put_page(page);
 	}
 
-	return res;
+	return err;
 }
 
-static ssize_t fuse_fill_write_pages(struct fuse_req *req,
-			       struct address_space *mapping,
-			       struct iov_iter *ii, loff_t pos)
+static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
+				     struct address_space *mapping,
+				     struct iov_iter *ii, loff_t pos,
+				     unsigned int max_pages)
 {
 	struct fuse_conn *fc = get_fuse_conn(mapping->host);
 	unsigned offset = pos & (PAGE_SIZE - 1);
 	size_t count = 0;
 	int err;
 
-	req->in.argpages = 1;
-	req->page_descs[0].offset = offset;
+	ap->args.in_pages = true;
+	ap->descs[0].offset = offset;
 
 	do {
 		size_t tmp;
@@ -1085,9 +1165,9 @@
 		}
 
 		err = 0;
-		req->pages[req->num_pages] = page;
-		req->page_descs[req->num_pages].length = tmp;
-		req->num_pages++;
+		ap->pages[ap->num_pages] = page;
+		ap->descs[ap->num_pages].length = tmp;
+		ap->num_pages++;
 
 		count += tmp;
 		pos += tmp;
@@ -1098,17 +1178,18 @@
 		if (!fc->big_writes)
 			break;
 	} while (iov_iter_count(ii) && count < fc->max_write &&
-		 req->num_pages < req->max_pages && offset == 0);
+		 ap->num_pages < max_pages && offset == 0);
 
 	return count > 0 ? count : err;
 }
 
-static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
+				     unsigned int max_pages)
 {
-	return min_t(unsigned,
+	return min_t(unsigned int,
 		     ((pos + len - 1) >> PAGE_SHIFT) -
 		     (pos >> PAGE_SHIFT) + 1,
-		     FUSE_MAX_PAGES_PER_REQ);
+		     max_pages);
 }
 
 static ssize_t fuse_perform_write(struct kiocb *iocb,
@@ -1121,33 +1202,31 @@
 	int err = 0;
 	ssize_t res = 0;
 
-	if (is_bad_inode(inode))
-		return -EIO;
-
 	if (inode->i_size < pos + iov_iter_count(ii))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	do {
-		struct fuse_req *req;
 		ssize_t count;
-		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
+		struct fuse_io_args ia = {};
+		struct fuse_args_pages *ap = &ia.ap;
+		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
+						      fc->max_pages);
 
-		req = fuse_get_req(fc, nr_pages);
-		if (IS_ERR(req)) {
-			err = PTR_ERR(req);
+		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
+		if (!ap->pages) {
+			err = -ENOMEM;
 			break;
 		}
 
-		count = fuse_fill_write_pages(req, mapping, ii, pos);
+		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
 		if (count <= 0) {
 			err = count;
 		} else {
-			size_t num_written;
-
-			num_written = fuse_send_write_pages(req, iocb, inode,
-							    pos, count);
-			err = req->out.h.error;
+			err = fuse_send_write_pages(&ia, iocb, inode,
+						    pos, count);
 			if (!err) {
+				size_t num_written = ia.write.out.size;
+
 				res += num_written;
 				pos += num_written;
 
@@ -1156,7 +1235,7 @@
 					err = -EIO;
 			}
 		}
-		fuse_put_request(fc, req);
+		kfree(ap->pages);
 	} while (!err && iov_iter_count(ii));
 
 	if (res > 0)
@@ -1168,7 +1247,7 @@
 	return res > 0 ? res : err;
 }
 
-static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -1244,14 +1323,14 @@
 	return written ? written : err;
 }
 
-static inline void fuse_page_descs_length_init(struct fuse_req *req,
-		unsigned index, unsigned nr_pages)
+static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
+					       unsigned int index,
+					       unsigned int nr_pages)
 {
 	int i;
 
 	for (i = index; i < index + nr_pages; i++)
-		req->page_descs[i].length = PAGE_SIZE -
-			req->page_descs[i].offset;
+		descs[i].length = PAGE_SIZE - descs[i].offset;
 }
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
@@ -1265,33 +1344,34 @@
 	return min(iov_iter_single_seg_count(ii), max_size);
 }
 
-static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
-			       size_t *nbytesp, int write)
+static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
+			       size_t *nbytesp, int write,
+			       unsigned int max_pages)
 {
 	size_t nbytes = 0;  /* # bytes already packed in req */
 	ssize_t ret = 0;
 
 	/* Special case for kernel I/O: can copy directly into the buffer */
-	if (ii->type & ITER_KVEC) {
+	if (iov_iter_is_kvec(ii)) {
 		unsigned long user_addr = fuse_get_user_addr(ii);
 		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
 
 		if (write)
-			req->in.args[1].value = (void *) user_addr;
+			ap->args.in_args[1].value = (void *) user_addr;
 		else
-			req->out.args[0].value = (void *) user_addr;
+			ap->args.out_args[0].value = (void *) user_addr;
 
 		iov_iter_advance(ii, frag_size);
 		*nbytesp = frag_size;
 		return 0;
 	}
 
-	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
+		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
 					*nbytesp - nbytes,
-					req->max_pages - req->num_pages,
+					max_pages - ap->num_pages,
 					&start);
 		if (ret < 0)
 			break;
@@ -1302,29 +1382,24 @@
 		ret += start;
 		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
 
-		req->page_descs[req->num_pages].offset = start;
-		fuse_page_descs_length_init(req, req->num_pages, npages);
+		ap->descs[ap->num_pages].offset = start;
+		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
 
-		req->num_pages += npages;
-		req->page_descs[req->num_pages - 1].length -=
+		ap->num_pages += npages;
+		ap->descs[ap->num_pages - 1].length -=
 			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
 	}
 
 	if (write)
-		req->in.argpages = 1;
+		ap->args.in_pages = 1;
 	else
-		req->out.argpages = 1;
+		ap->args.out_pages = 1;
 
 	*nbytesp = nbytes;
 
 	return ret < 0 ? ret : 0;
 }
 
-static inline int fuse_iter_npages(const struct iov_iter *ii_p)
-{
-	return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
-}
-
 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 		       loff_t *ppos, int flags)
 {
@@ -1340,16 +1415,16 @@
 	pgoff_t idx_from = pos >> PAGE_SHIFT;
 	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
 	ssize_t res = 0;
-	struct fuse_req *req;
 	int err = 0;
+	struct fuse_io_args *ia;
+	unsigned int max_pages;
 
-	if (io->async)
-		req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
-	else
-		req = fuse_get_req(fc, fuse_iter_npages(iter));
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	max_pages = iov_iter_npages(iter, fc->max_pages);
+	ia = fuse_io_alloc(io, max_pages);
+	if (!ia)
+		return -ENOMEM;
 
+	ia->io = io;
 	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
 		if (!write)
 			inode_lock(inode);
@@ -1360,46 +1435,49 @@
 
 	io->should_dirty = !write && iter_is_iovec(iter);
 	while (count) {
-		size_t nres;
+		ssize_t nres;
 		fl_owner_t owner = current->files;
 		size_t nbytes = min(count, nmax);
-		err = fuse_get_user_pages(req, iter, &nbytes, write);
+
+		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
+					  max_pages);
 		if (err && !nbytes)
 			break;
 
-		if (write)
-			nres = fuse_send_write(req, io, pos, nbytes, owner);
-		else
-			nres = fuse_send_read(req, io, pos, nbytes, owner);
+		if (write) {
+			if (!capable(CAP_FSETID))
+				ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;
 
-		if (!io->async)
-			fuse_release_user_pages(req, io->should_dirty);
-		if (req->out.h.error) {
-			err = req->out.h.error;
-			break;
-		} else if (nres > nbytes) {
-			res = 0;
-			err = -EIO;
+			nres = fuse_send_write(ia, pos, nbytes, owner);
+		} else {
+			nres = fuse_send_read(ia, pos, nbytes, owner);
+		}
+
+		if (!io->async || nres < 0) {
+			fuse_release_user_pages(&ia->ap, io->should_dirty);
+			fuse_io_free(ia);
+		}
+		ia = NULL;
+		if (nres < 0) {
+			err = nres;
 			break;
 		}
+		WARN_ON(nres > nbytes);
+
 		count -= nres;
 		res += nres;
 		pos += nres;
 		if (nres != nbytes)
 			break;
 		if (count) {
-			fuse_put_request(fc, req);
-			if (io->async)
-				req = fuse_get_req_for_background(fc,
-					fuse_iter_npages(iter));
-			else
-				req = fuse_get_req(fc, fuse_iter_npages(iter));
-			if (IS_ERR(req))
+			max_pages = iov_iter_npages(iter, fc->max_pages);
+			ia = fuse_io_alloc(io, max_pages);
+			if (!ia)
 				break;
 		}
 	}
-	if (!IS_ERR(req))
-		fuse_put_request(fc, req);
+	if (ia)
+		fuse_io_free(ia);
 	if (res > 0)
 		*ppos = pos;
 
@@ -1414,20 +1492,28 @@
 	ssize_t res;
 	struct inode *inode = file_inode(io->iocb->ki_filp);
 
-	if (is_bad_inode(inode))
-		return -EIO;
-
 	res = fuse_direct_io(io, iter, ppos, 0);
 
-	fuse_invalidate_attr(inode);
+	fuse_invalidate_atime(inode);
 
 	return res;
 }
 
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+
 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
-	return __fuse_direct_read(&io, to, &iocb->ki_pos);
+	ssize_t res;
+
+	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+		res = fuse_direct_IO(iocb, to);
+	} else {
+		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
+		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
+	}
+
+	return res;
 }
 
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1436,14 +1522,17 @@
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
 
-	if (is_bad_inode(inode))
-		return -EIO;
-
 	/* Don't allow parallel writes to the same file */
 	inode_lock(inode);
 	res = generic_write_checks(iocb, from);
-	if (res > 0)
-		res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+	if (res > 0) {
+		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+			res = fuse_direct_IO(iocb, from);
+		} else {
+			res = fuse_direct_io(&io, from, &iocb->ki_pos,
+					     FUSE_DIO_WRITE);
+		}
+	}
 	fuse_invalidate_attr(inode);
 	if (res > 0)
 		fuse_write_update_size(inode, iocb->ki_pos);
@@ -1452,46 +1541,81 @@
 	return res;
 }
 
-static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
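+/* Choose the cached or direct read path based on FOPEN_DIRECT_IO */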
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	int i;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
 
-	for (i = 0; i < req->num_pages; i++)
-		__free_page(req->pages[i]);
+	if (is_bad_inode(file_inode(file)))
+		return -EIO;
 
-	if (req->ff)
-		fuse_file_put(req->ff, false, false);
+	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+		return fuse_cache_read_iter(iocb, to);
+	else
+		return fuse_direct_read_iter(iocb, to);
 }
 
-static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
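+/* Choose the cached or direct write path based on FOPEN_DIRECT_IO */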
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct inode *inode = req->inode;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+
+	if (is_bad_inode(file_inode(file)))
+		return -EIO;
+
+	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+		return fuse_cache_write_iter(iocb, from);
+	else
+		return fuse_direct_write_iter(iocb, from);
+}
+
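+/* Free a writepage request: temporary pages, file reference and page array */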
+static void fuse_writepage_free(struct fuse_writepage_args *wpa)
+{
+	struct fuse_args_pages *ap = &wpa->ia.ap;
+	int i;
+
+	for (i = 0; i < ap->num_pages; i++)
+		__free_page(ap->pages[i]);
+
+	if (wpa->ia.ff)
+		fuse_file_put(wpa->ia.ff, false, false);
+
+	kfree(ap->pages);
+	kfree(wpa);
+}
+
+static void fuse_writepage_finish(struct fuse_conn *fc,
+				  struct fuse_writepage_args *wpa)
+{
+	struct fuse_args_pages *ap = &wpa->ia.ap;
+	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
-	list_del(&req->writepages_entry);
-	for (i = 0; i < req->num_pages; i++) {
+	list_del(&wpa->writepages_entry);
+	for (i = 0; i < ap->num_pages; i++) {
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
 		wb_writeout_inc(&bdi->wb);
 	}
 	wake_up(&fi->page_waitq);
 }
 
-/* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
-				loff_t size)
-__releases(fc->lock)
-__acquires(fc->lock)
+/* Called under fi->lock, may release and reacquire it */
+static void fuse_send_writepage(struct fuse_conn *fc,
+				struct fuse_writepage_args *wpa, loff_t size)
+__releases(fi->lock)
+__acquires(fi->lock)
 {
-	struct fuse_inode *fi = get_fuse_inode(req->inode);
-	struct fuse_write_in *inarg = &req->misc.write.in;
-	__u64 data_size = req->num_pages * PAGE_SIZE;
+	struct fuse_writepage_args *aux, *next;
+	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
+	struct fuse_write_in *inarg = &wpa->ia.write.in;
+	struct fuse_args *args = &wpa->ia.ap.args;
+	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
+	int err;
 
-	if (!fc->connected)
-		goto out_free;
-
+	fi->writectr++;
 	if (inarg->offset + data_size <= size) {
 		inarg->size = data_size;
 	} else if (inarg->offset < size) {
@@ -1501,55 +1625,80 @@
 		goto out_free;
 	}
 
-	req->in.args[1].size = inarg->size;
-	fi->writectr++;
-	fuse_request_send_background_locked(fc, req);
+	args->in_args[1].size = inarg->size;
+	args->force = true;
+	args->nocreds = true;
+
+	err = fuse_simple_background(fc, args, GFP_ATOMIC);
+	if (err == -ENOMEM) {
+		spin_unlock(&fi->lock);
+		err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+		spin_lock(&fi->lock);
+	}
+
+	/* Fails on broken connection only */
+	if (unlikely(err))
+		goto out_free;
+
 	return;
 
  out_free:
-	fuse_writepage_finish(fc, req);
-	spin_unlock(&fc->lock);
-	fuse_writepage_free(fc, req);
-	fuse_put_request(fc, req);
-	spin_lock(&fc->lock);
+	fi->writectr--;
+	fuse_writepage_finish(fc, wpa);
+	spin_unlock(&fi->lock);
+
+	/* After fuse_writepage_finish() aux request list is private */
+	for (aux = wpa->next; aux; aux = next) {
+		next = aux->next;
+		aux->next = NULL;
+		fuse_writepage_free(aux);
+	}
+
+	fuse_writepage_free(wpa);
+	spin_lock(&fi->lock);
 }
 
 /*
  * If fi->writectr is positive (no truncate or fsync going on) send
  * all queued writepage requests.
  *
- * Called with fc->lock
+ * Called with fi->lock
  */
 void fuse_flush_writepages(struct inode *inode)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	size_t crop = i_size_read(inode);
-	struct fuse_req *req;
+	loff_t crop = i_size_read(inode);
+	struct fuse_writepage_args *wpa;
 
 	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
-		req = list_entry(fi->queued_writes.next, struct fuse_req, list);
-		list_del_init(&req->list);
-		fuse_send_writepage(fc, req, crop);
+		wpa = list_entry(fi->queued_writes.next,
+				 struct fuse_writepage_args, queue_entry);
+		list_del_init(&wpa->queue_entry);
+		fuse_send_writepage(fc, wpa, crop);
 	}
 }
 
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+			       int error)
 {
-	struct inode *inode = req->inode;
+	struct fuse_writepage_args *wpa =
+		container_of(args, typeof(*wpa), ia.ap.args);
+	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	mapping_set_error(inode->i_mapping, req->out.h.error);
-	spin_lock(&fc->lock);
-	while (req->misc.write.next) {
+	mapping_set_error(inode->i_mapping, error);
+	spin_lock(&fi->lock);
+	while (wpa->next) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_write_in *inarg = &req->misc.write.in;
-		struct fuse_req *next = req->misc.write.next;
-		req->misc.write.next = next->misc.write.next;
-		next->misc.write.next = NULL;
-		next->ff = fuse_file_get(req->ff);
+		struct fuse_write_in *inarg = &wpa->ia.write.in;
+		struct fuse_writepage_args *next = wpa->next;
+
+		wpa->next = next->next;
+		next->next = NULL;
+		next->ia.ff = fuse_file_get(wpa->ia.ff);
 		list_add(&next->writepages_entry, &fi->writepages);
 
 		/*
@@ -1578,9 +1727,9 @@
 		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
 	}
 	fi->writectr--;
-	fuse_writepage_finish(fc, req);
-	spin_unlock(&fc->lock);
-	fuse_writepage_free(fc, req);
+	fuse_writepage_finish(fc, wpa);
+	spin_unlock(&fi->lock);
+	fuse_writepage_free(wpa);
 }
 
 static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
@@ -1588,13 +1737,13 @@
 {
 	struct fuse_file *ff = NULL;
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	if (!list_empty(&fi->write_files)) {
 		ff = list_entry(fi->write_files.next, struct fuse_file,
 				write_entry);
 		fuse_file_get(ff);
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	return ff;
 }
@@ -1622,54 +1771,73 @@
 	return err;
 }
 
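+/* Allocate a writepage request with room for a single page (GFP_NOFS) */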
+static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
+{
+	struct fuse_writepage_args *wpa;
+	struct fuse_args_pages *ap;
+
+	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
+	if (wpa) {
+		ap = &wpa->ia.ap;
+		ap->num_pages = 0;
+		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
+		if (!ap->pages) {
+			kfree(wpa);
+			wpa = NULL;
+		}
+	}
+	return wpa;
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
+	struct fuse_args_pages *ap;
 	struct page *tmp_page;
 	int error = -ENOMEM;
 
 	set_page_writeback(page);
 
-	req = fuse_request_alloc_nofs(1);
-	if (!req)
+	wpa = fuse_writepage_args_alloc();
+	if (!wpa)
 		goto err;
+	ap = &wpa->ia.ap;
 
-	/* writeback always goes to bg_queue */
-	__set_bit(FR_BACKGROUND, &req->flags);
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
 
 	error = -EIO;
-	req->ff = fuse_write_file_get(fc, fi);
-	if (!req->ff)
+	wpa->ia.ff = fuse_write_file_get(fc, fi);
+	if (!wpa->ia.ff)
 		goto err_nofile;
 
-	fuse_write_fill(req, req->ff, page_offset(page), 0);
+	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 
 	copy_highpage(tmp_page, page);
-	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
-	req->misc.write.next = NULL;
-	req->in.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = tmp_page;
-	req->page_descs[0].offset = 0;
-	req->page_descs[0].length = PAGE_SIZE;
-	req->end = fuse_writepage_end;
-	req->inode = inode;
+	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+	wpa->next = NULL;
+	ap->args.in_pages = true;
+	ap->num_pages = 1;
+	ap->pages[0] = tmp_page;
+	ap->descs[0].offset = 0;
+	ap->descs[0].length = PAGE_SIZE;
+	ap->args.end = fuse_writepage_end;
+	wpa->inode = inode;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
-	spin_lock(&fc->lock);
-	list_add(&req->writepages_entry, &fi->writepages);
-	list_add_tail(&req->list, &fi->queued_writes);
+	spin_lock(&fi->lock);
+	list_add(&wpa->writepages_entry, &fi->writepages);
+	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
 	fuse_flush_writepages(inode);
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	end_page_writeback(page);
 
@@ -1678,7 +1846,7 @@
 err_nofile:
 	__free_page(tmp_page);
 err_free:
-	fuse_request_free(req);
+	kfree(wpa);
 err:
 	mapping_set_error(page->mapping, error);
 	end_page_writeback(page);
@@ -1698,6 +1866,7 @@
 		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
 
 		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
 		return 0;
 	}
 
@@ -1708,97 +1877,123 @@
 }
 
 struct fuse_fill_wb_data {
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
 	struct fuse_file *ff;
 	struct inode *inode;
 	struct page **orig_pages;
+	unsigned int max_pages;
 };
 
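+/*
+ * Double the capacity of the page array, at least to
+ * FUSE_DEFAULT_MAX_PAGES_PER_REQ and at most fc->max_pages; returns false
+ * if the larger arrays cannot be allocated.
+ */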
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+{
+	struct fuse_args_pages *ap = &data->wpa->ia.ap;
+	struct fuse_conn *fc = get_fuse_conn(data->inode);
+	struct page **pages;
+	struct fuse_page_desc *descs;
+	unsigned int npages = min_t(unsigned int,
+				    max_t(unsigned int, data->max_pages * 2,
+					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+				    fc->max_pages);
+	WARN_ON(npages <= data->max_pages);
+
+	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
+	if (!pages)
+		return false;
+
+	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
+	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
+	kfree(ap->pages);
+	ap->pages = pages;
+	ap->descs = descs;
+	data->max_pages = npages;
+
+	return true;
+}
+
 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 {
-	struct fuse_req *req = data->req;
+	struct fuse_writepage_args *wpa = data->wpa;
 	struct inode *inode = data->inode;
-	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	int num_pages = req->num_pages;
+	int num_pages = wpa->ia.ap.num_pages;
 	int i;
 
-	req->ff = fuse_file_get(data->ff);
-	spin_lock(&fc->lock);
-	list_add_tail(&req->list, &fi->queued_writes);
+	wpa->ia.ff = fuse_file_get(data->ff);
+	spin_lock(&fi->lock);
+	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
 	fuse_flush_writepages(inode);
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	for (i = 0; i < num_pages; i++)
 		end_page_writeback(data->orig_pages[i]);
 }
 
-static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+/*
+ * First recheck under fi->lock whether the offending offset is still under
+ * writeback.  If so, iterate the auxiliary write requests to see if one was
+ * already added for a page at this offset.  If there's none, insert this new
+ * request onto the auxiliary list; otherwise reuse the existing one by copying
+ * the new page contents over to the old temporary page.
+ */
+static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
 				     struct page *page)
 {
-	struct fuse_conn *fc = get_fuse_conn(new_req->inode);
-	struct fuse_inode *fi = get_fuse_inode(new_req->inode);
-	struct fuse_req *tmp;
-	struct fuse_req *old_req;
-	bool found = false;
-	pgoff_t curr_index;
+	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
+	struct fuse_writepage_args *tmp;
+	struct fuse_writepage_args *old_wpa;
+	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
 
-	BUG_ON(new_req->num_pages != 0);
+	WARN_ON(new_ap->num_pages != 0);
 
-	spin_lock(&fc->lock);
-	list_del(&new_req->writepages_entry);
-	list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
-		BUG_ON(old_req->inode != new_req->inode);
-		curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
-		if (curr_index <= page->index &&
-		    page->index < curr_index + old_req->num_pages) {
-			found = true;
+	spin_lock(&fi->lock);
+	list_del(&new_wpa->writepages_entry);
+	old_wpa = fuse_find_writeback(fi, page->index, page->index);
+	if (!old_wpa) {
+		list_add(&new_wpa->writepages_entry, &fi->writepages);
+		spin_unlock(&fi->lock);
+		return false;
+	}
+
+	new_ap->num_pages = 1;
+	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
+		pgoff_t curr_index;
+
+		WARN_ON(tmp->inode != new_wpa->inode);
+		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
+		if (curr_index == page->index) {
+			WARN_ON(tmp->ia.ap.num_pages != 1);
+			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
 			break;
 		}
 	}
-	if (!found) {
-		list_add(&new_req->writepages_entry, &fi->writepages);
-		goto out_unlock;
+
+	if (!tmp) {
+		new_wpa->next = old_wpa->next;
+		old_wpa->next = new_wpa;
 	}
 
-	new_req->num_pages = 1;
-	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
-		BUG_ON(tmp->inode != new_req->inode);
-		curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
-		if (tmp->num_pages == 1 &&
-		    curr_index == page->index) {
-			old_req = tmp;
-		}
-	}
+	spin_unlock(&fi->lock);
 
-	if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
-		struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
-
-		copy_highpage(old_req->pages[0], page);
-		spin_unlock(&fc->lock);
+	if (tmp) {
+		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
 
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(page, NR_WRITEBACK_TEMP);
+		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
 		wb_writeout_inc(&bdi->wb);
-		fuse_writepage_free(fc, new_req);
-		fuse_request_free(new_req);
-		goto out;
-	} else {
-		new_req->misc.write.next = old_req->misc.write.next;
-		old_req->misc.write.next = new_req;
+		fuse_writepage_free(new_wpa);
 	}
-out_unlock:
-	spin_unlock(&fc->lock);
-out:
-	return found;
+
+	return true;
 }
 
 static int fuse_writepages_fill(struct page *page,
 		struct writeback_control *wbc, void *_data)
 {
 	struct fuse_fill_wb_data *data = _data;
-	struct fuse_req *req = data->req;
+	struct fuse_writepage_args *wpa = data->wpa;
+	struct fuse_args_pages *ap = &wpa->ia.ap;
 	struct inode *inode = data->inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct page *tmp_page;
 	bool is_writeback;
@@ -1806,7 +2001,7 @@
 
 	if (!data->ff) {
 		err = -EIO;
-		data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+		data->ff = fuse_write_file_get(fc, fi);
 		if (!data->ff)
 			goto out_unlock;
 	}
@@ -1819,13 +2014,19 @@
 	 */
 	is_writeback = fuse_page_is_writeback(inode, page->index);
 
-	if (req && req->num_pages &&
-	    (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
-	     (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
-	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+	if (wpa && ap->num_pages &&
+	    (is_writeback || ap->num_pages == fc->max_pages ||
+	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
+	     data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
 		fuse_writepages_send(data);
-		data->req = NULL;
+		data->wpa = NULL;
+	} else if (wpa && ap->num_pages == data->max_pages) {
+		if (!fuse_pages_realloc(data)) {
+			fuse_writepages_send(data);
+			data->wpa = NULL;
+		}
 	}
+
 	err = -ENOMEM;
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
@@ -1840,60 +2041,59 @@
 	 * This is ensured by holding the page lock in page_mkwrite() while
 	 * checking fuse_page_is_writeback().  We already hold the page lock
 	 * since clear_page_dirty_for_io() and keep it held until we add the
-	 * request to the fi->writepages list and increment req->num_pages.
+	 * request to the fi->writepages list and increment ap->num_pages.
 	 * After this fuse_page_is_writeback() will indicate that the page is
 	 * under writeback, so we can release the page lock.
 	 */
-	if (data->req == NULL) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-
+	if (data->wpa == NULL) {
 		err = -ENOMEM;
-		req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
-		if (!req) {
+		wpa = fuse_writepage_args_alloc();
+		if (!wpa) {
 			__free_page(tmp_page);
 			goto out_unlock;
 		}
+		data->max_pages = 1;
 
-		fuse_write_fill(req, data->ff, page_offset(page), 0);
-		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
-		req->misc.write.next = NULL;
-		req->in.argpages = 1;
-		__set_bit(FR_BACKGROUND, &req->flags);
-		req->num_pages = 0;
-		req->end = fuse_writepage_end;
-		req->inode = inode;
+		ap = &wpa->ia.ap;
+		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
+		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+		wpa->next = NULL;
+		ap->args.in_pages = true;
+		ap->args.end = fuse_writepage_end;
+		ap->num_pages = 0;
+		wpa->inode = inode;
 
-		spin_lock(&fc->lock);
-		list_add(&req->writepages_entry, &fi->writepages);
-		spin_unlock(&fc->lock);
+		spin_lock(&fi->lock);
+		list_add(&wpa->writepages_entry, &fi->writepages);
+		spin_unlock(&fi->lock);
 
-		data->req = req;
+		data->wpa = wpa;
 	}
 	set_page_writeback(page);
 
 	copy_highpage(tmp_page, page);
-	req->pages[req->num_pages] = tmp_page;
-	req->page_descs[req->num_pages].offset = 0;
-	req->page_descs[req->num_pages].length = PAGE_SIZE;
+	ap->pages[ap->num_pages] = tmp_page;
+	ap->descs[ap->num_pages].offset = 0;
+	ap->descs[ap->num_pages].length = PAGE_SIZE;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
-	if (is_writeback && fuse_writepage_in_flight(req, page)) {
+	if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
 		end_page_writeback(page);
-		data->req = NULL;
+		data->wpa = NULL;
 		goto out_unlock;
 	}
-	data->orig_pages[req->num_pages] = page;
+	data->orig_pages[ap->num_pages] = page;
 
 	/*
-	 * Protected by fc->lock against concurrent access by
+	 * Protected by fi->lock against concurrent access by
 	 * fuse_page_is_writeback().
 	 */
-	spin_lock(&fc->lock);
-	req->num_pages++;
-	spin_unlock(&fc->lock);
+	spin_lock(&fi->lock);
+	ap->num_pages++;
+	spin_unlock(&fi->lock);
 
 out_unlock:
 	unlock_page(page);
@@ -1905,6 +2105,7 @@
 			   struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_fill_wb_data data;
 	int err;
 
@@ -1913,20 +2114,20 @@
 		goto out;
 
 	data.inode = inode;
-	data.req = NULL;
+	data.wpa = NULL;
 	data.ff = NULL;
 
 	err = -ENOMEM;
-	data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+	data.orig_pages = kcalloc(fc->max_pages,
 				  sizeof(struct page *),
 				  GFP_NOFS);
 	if (!data.orig_pages)
 		goto out;
 
 	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
-	if (data.req) {
+	if (data.wpa) {
 		/* Ignore errors if we can write at least one page */
-		BUG_ON(!data.req->num_pages);
+		WARN_ON(!data.wpa->ia.ap.num_pages);
 		fuse_writepages_send(&data);
 		err = 0;
 	}
@@ -2076,6 +2277,18 @@
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct fuse_file *ff = file->private_data;
+
+	if (ff->open_flags & FOPEN_DIRECT_IO) {
+		/* Can't provide the coherency needed for MAP_SHARED */
+		if (vma->vm_flags & VM_MAYSHARE)
+			return -ENODEV;
+
+		invalidate_inode_pages2(file->f_mapping);
+
+		return generic_file_mmap(file, vma);
+	}
+
 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 		fuse_link_write_file(file);
 
@@ -2084,17 +2297,6 @@
 	return 0;
 }
 
-static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	/* Can't provide the coherency needed for MAP_SHARED */
-	if (vma->vm_flags & VM_MAYSHARE)
-		return -ENODEV;
-
-	invalidate_inode_pages2(file->f_mapping);
-
-	return generic_file_mmap(file, vma);
-}
-
 static int convert_fuse_file_lock(struct fuse_conn *fc,
 				  const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
@@ -2145,11 +2347,11 @@
 	inarg->lk.pid = pid;
 	if (flock)
 		inarg->lk_flags |= FUSE_LK_FLOCK;
-	args->in.h.opcode = opcode;
-	args->in.h.nodeid = get_node_id(inode);
-	args->in.numargs = 1;
-	args->in.args[0].size = sizeof(*inarg);
-	args->in.args[0].value = inarg;
+	args->opcode = opcode;
+	args->nodeid = get_node_id(inode);
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(*inarg);
+	args->in_args[0].value = inarg;
 }
 
 static int fuse_getlk(struct file *file, struct file_lock *fl)
@@ -2162,9 +2364,9 @@
 	int err;
 
 	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err)
 		err = convert_fuse_file_lock(fc, &outarg.lk, fl);
@@ -2259,14 +2461,14 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.block = block;
 	inarg.blocksize = inode->i_sb->s_blocksize;
-	args.in.h.opcode = FUSE_BMAP;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_BMAP;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS)
 		fc->no_bmap = 1;
@@ -2291,14 +2493,14 @@
 	if (fc->no_lseek)
 		goto fallback;
 
-	args.in.h.opcode = FUSE_LSEEK;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_LSEEK;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (err) {
 		if (err == -ENOSYS) {
@@ -2388,10 +2590,11 @@
 }
 
 /* Make sure iov_length() won't overflow */
-static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+				 size_t count)
 {
 	size_t n;
-	u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+	u32 max = fc->max_pages << PAGE_SHIFT;
 
 	for (n = 0; n < count; n++, iov++) {
 		if (iov->iov_len > (size_t) max)
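
A worked bound for the check above, assuming 4 KiB pages: the old fixed limit
allowed iovecs of at most 32 << 12 = 128 KiB, while a connection that
negotiates FUSE_MAX_MAX_PAGES (256) now allows 256 << 12 = 1 MiB. A
self-contained sanity check of that arithmetic:

	#include <assert.h>

	int main(void)
	{
		unsigned int page_shift = 12;			/* 4 KiB pages assumed */
		unsigned int old_max = 32u << page_shift;	/* FUSE_MAX_PAGES_PER_REQ */
		unsigned int new_max = 256u << page_shift;	/* FUSE_MAX_MAX_PAGES */

		assert(old_max == 128u * 1024);
		assert(new_max == 1024u * 1024);
		return 0;
	}
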
@@ -2495,31 +2698,38 @@
 		.flags = flags
 	};
 	struct fuse_ioctl_out outarg;
-	struct fuse_req *req = NULL;
-	struct page **pages = NULL;
 	struct iovec *iov_page = NULL;
 	struct iovec *in_iov = NULL, *out_iov = NULL;
-	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
-	size_t in_size, out_size, transferred, c;
+	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
+	size_t in_size, out_size, c;
+	ssize_t transferred;
 	int err, i;
 	struct iov_iter ii;
+	struct fuse_args_pages ap = {};
 
 #if BITS_PER_LONG == 32
 	inarg.flags |= FUSE_IOCTL_32BIT;
 #else
-	if (flags & FUSE_IOCTL_COMPAT)
+	if (flags & FUSE_IOCTL_COMPAT) {
 		inarg.flags |= FUSE_IOCTL_32BIT;
+#ifdef CONFIG_X86_X32
+		if (in_x32_syscall())
+			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
+#endif
+	}
 #endif
 
 	/* assume all the iovs returned by client always fits in a page */
 	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
 
 	err = -ENOMEM;
-	pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
+	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
 	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
-	if (!pages || !iov_page)
+	if (!ap.pages || !iov_page)
 		goto out;
 
+	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+
 	/*
 	 * If restricted, initialize IO parameters as encoded in @cmd.
 	 * RETRY from server is not allowed.
@@ -2554,58 +2764,46 @@
 
 	/* make sure there are enough buffer pages and init request with them */
 	err = -ENOMEM;
-	if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+	if (max_pages > fc->max_pages)
 		goto out;
-	while (num_pages < max_pages) {
-		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-		if (!pages[num_pages])
+	while (ap.num_pages < max_pages) {
+		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (!ap.pages[ap.num_pages])
 			goto out;
-		num_pages++;
+		ap.num_pages++;
 	}
 
-	req = fuse_get_req(fc, num_pages);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		req = NULL;
-		goto out;
-	}
-	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
-	req->num_pages = num_pages;
-	fuse_page_descs_length_init(req, 0, req->num_pages);
 
 	/* okay, let's send it to the client */
-	req->in.h.opcode = FUSE_IOCTL;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
+	ap.args.opcode = FUSE_IOCTL;
+	ap.args.nodeid = ff->nodeid;
+	ap.args.in_numargs = 1;
+	ap.args.in_args[0].size = sizeof(inarg);
+	ap.args.in_args[0].value = &inarg;
 	if (in_size) {
-		req->in.numargs++;
-		req->in.args[1].size = in_size;
-		req->in.argpages = 1;
+		ap.args.in_numargs++;
+		ap.args.in_args[1].size = in_size;
+		ap.args.in_pages = true;
 
 		err = -EFAULT;
 		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
-		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
-			c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii);
+		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
 			if (c != PAGE_SIZE && iov_iter_count(&ii))
 				goto out;
 		}
 	}
 
-	req->out.numargs = 2;
-	req->out.args[0].size = sizeof(outarg);
-	req->out.args[0].value = &outarg;
-	req->out.args[1].size = out_size;
-	req->out.argpages = 1;
-	req->out.argvar = 1;
+	ap.args.out_numargs = 2;
+	ap.args.out_args[0].size = sizeof(outarg);
+	ap.args.out_args[0].value = &outarg;
+	ap.args.out_args[1].size = out_size;
+	ap.args.out_pages = true;
+	ap.args.out_argvar = true;
 
-	fuse_request_send(fc, req);
-	err = req->out.h.error;
-	transferred = req->out.args[1].size;
-	fuse_put_request(fc, req);
-	req = NULL;
-	if (err)
+	transferred = fuse_simple_request(fc, &ap.args);
+	err = transferred;
+	if (transferred < 0)
 		goto out;
 
 	/* did it ask for retry? */
@@ -2630,7 +2828,7 @@
 		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
 			goto out;
 
-		vaddr = kmap_atomic(pages[0]);
+		vaddr = kmap_atomic(ap.pages[0]);
 		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
 					    transferred, in_iovs + out_iovs,
 					    (flags & FUSE_IOCTL_COMPAT) != 0);
@@ -2641,11 +2839,11 @@
 		in_iov = iov_page;
 		out_iov = in_iov + in_iovs;
 
-		err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+		err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
 		if (err)
 			goto out;
 
-		err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+		err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
 		if (err)
 			goto out;
 
@@ -2658,19 +2856,17 @@
 
 	err = -EFAULT;
 	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
-	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
-		c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii);
+	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
 		if (c != PAGE_SIZE && iov_iter_count(&ii))
 			goto out;
 	}
 	err = 0;
  out:
-	if (req)
-		fuse_put_request(fc, req);
 	free_page((unsigned long) iov_page);
-	while (num_pages)
-		__free_page(pages[--num_pages]);
-	kfree(pages);
+	while (ap.num_pages)
+		__free_page(ap.pages[--ap.num_pages]);
+	kfree(ap.pages);
 
 	return err ? err : outarg.result;
 }
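
The ioctl rework above leans on fuse_simple_request() returning the actual
length of a variable-size last reply argument (out_argvar) instead of the
caller reading req->out.args[1].size, so "transferred" became a ssize_t that
carries either a byte count or a negative errno. A tiny self-contained
analogue of that convention (demo code, not a kernel API):

	#include <errno.h>
	#include <stdio.h>
	#include <sys/types.h>

	static ssize_t demo_simple_request(size_t out_size)
	{
		if (out_size > 4096)
			return -EINVAL;		/* failure: negative errno */
		return (ssize_t)out_size;	/* success: bytes actually returned */
	}

	int main(void)
	{
		ssize_t transferred = demo_simple_request(512);

		if (transferred < 0) {
			fprintf(stderr, "request failed: %zd\n", transferred);
			return 1;
		}
		printf("transferred %zd bytes\n", transferred);
		return 0;
	}
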
@@ -2778,14 +2974,14 @@
 		fuse_register_polled_file(fc, ff);
 	}
 
-	args.in.h.opcode = FUSE_POLL;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_POLL;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 
 	if (!err)
@@ -2836,9 +3032,9 @@
 	fuse_do_setattr(file_dentry(file), &attr, file);
 }
 
-static inline loff_t fuse_round_up(loff_t off)
+static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
 {
-	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	return round_up(off, fc->max_pages << PAGE_SHIFT);
 }
 
 static ssize_t
@@ -2867,7 +3063,7 @@
 	if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
 		if (offset >= i_size)
 			return 0;
-		iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+		iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
 		count = iov_iter_count(iter);
 	}
 
@@ -2938,6 +3134,16 @@
 	return ret;
 }
 
+static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+{
+	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+
+	if (!err)
+		fuse_sync_writes(inode);
+
+	return err;
+}
+
 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t length)
 {
@@ -2966,23 +3172,28 @@
 		inode_lock(inode);
 		if (mode & FALLOC_FL_PUNCH_HOLE) {
 			loff_t endbyte = offset + length - 1;
-			err = filemap_write_and_wait_range(inode->i_mapping,
-							   offset, endbyte);
+
+			err = fuse_writeback_range(inode, offset, endbyte);
 			if (err)
 				goto out;
-
-			fuse_sync_writes(inode);
 		}
 	}
 
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + length > i_size_read(inode)) {
+		err = inode_newsize_ok(inode, offset + length);
+		if (err)
+			goto out;
+	}
+
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-	args.in.h.opcode = FUSE_FALLOCATE;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_FALLOCATE;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_fallocate = 1;
@@ -3014,6 +3225,111 @@
 	return err;
 }
 
+static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+				      struct file *file_out, loff_t pos_out,
+				      size_t len, unsigned int flags)
+{
+	struct fuse_file *ff_in = file_in->private_data;
+	struct fuse_file *ff_out = file_out->private_data;
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
+	struct fuse_conn *fc = ff_in->fc;
+	FUSE_ARGS(args);
+	struct fuse_copy_file_range_in inarg = {
+		.fh_in = ff_in->fh,
+		.off_in = pos_in,
+		.nodeid_out = ff_out->nodeid,
+		.fh_out = ff_out->fh,
+		.off_out = pos_out,
+		.len = len,
+		.flags = flags
+	};
+	struct fuse_write_out outarg;
+	ssize_t err;
+	/* mark unstable when write-back is not used, and file_out gets
+	 * extended */
+	bool is_unstable = (!fc->writeback_cache) &&
+			   ((pos_out + len) > inode_out->i_size);
+
+	if (fc->no_copy_file_range)
+		return -EOPNOTSUPP;
+
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+		return -EXDEV;
+
+	if (fc->writeback_cache) {
+		inode_lock(inode_in);
+		err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
+		inode_unlock(inode_in);
+		if (err)
+			return err;
+	}
+
+	inode_lock(inode_out);
+
+	err = file_modified(file_out);
+	if (err)
+		goto out;
+
+	if (fc->writeback_cache) {
+		err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
+		if (err)
+			goto out;
+	}
+
+	if (is_unstable)
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+	args.opcode = FUSE_COPY_FILE_RANGE;
+	args.nodeid = ff_in->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {
+		fc->no_copy_file_range = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (err)
+		goto out;
+
+	if (fc->writeback_cache) {
+		fuse_write_update_size(inode_out, pos_out + outarg.size);
+		file_update_time(file_out);
+	}
+
+	fuse_invalidate_attr(inode_out);
+
+	err = outarg.size;
+out:
+	if (is_unstable)
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+	inode_unlock(inode_out);
+	file_accessed(file_in);
+
+	return err;
+}
+
+static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
+				     len, flags);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+					      dst_off, len, flags);
+	return ret;
+}
+
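
The two functions above implement the new .copy_file_range hook wired into
fuse_file_operations just below: FUSE_COPY_FILE_RANGE is tried first, and the
kernel falls back to generic_copy_file_range() (a page-cache copy) when the
server returns ENOSYS or the files live on different superblocks. A runnable
userspace sketch against a hypothetical mount (paths are placeholders;
requires glibc 2.27+ for the wrapper):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int in = open("/mnt/fuse/src", O_RDONLY);
		int out = open("/mnt/fuse/dst", O_WRONLY | O_CREAT, 0644);
		ssize_t n;

		if (in < 0 || out < 0)
			return 1;
		n = copy_file_range(in, NULL, out, NULL, 65536, 0);
		if (n < 0)
			perror("copy_file_range");
		else
			printf("copied %zd bytes\n", n);
		close(in);
		close(out);
		return 0;
	}
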
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read_iter	= fuse_file_read_iter,
@@ -3026,28 +3342,12 @@
 	.lock		= fuse_file_lock,
 	.flock		= fuse_file_flock,
 	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
 	.fallocate	= fuse_file_fallocate,
-};
-
-static const struct file_operations fuse_direct_io_file_operations = {
-	.llseek		= fuse_file_llseek,
-	.read_iter	= fuse_direct_read_iter,
-	.write_iter	= fuse_direct_write_iter,
-	.mmap		= fuse_direct_mmap,
-	.open		= fuse_open,
-	.flush		= fuse_flush,
-	.release	= fuse_release,
-	.fsync		= fuse_fsync,
-	.lock		= fuse_file_lock,
-	.flock		= fuse_file_flock,
-	.unlocked_ioctl	= fuse_file_ioctl,
-	.compat_ioctl	= fuse_file_compat_ioctl,
-	.poll		= fuse_file_poll,
-	.fallocate	= fuse_file_fallocate,
-	/* no splice_read */
+	.copy_file_range = fuse_copy_file_range,
 };
 
 static const struct address_space_operations fuse_file_aops  = {
@@ -3065,6 +3365,14 @@
 
 void fuse_init_file_inode(struct inode *inode)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
 	inode->i_fop = &fuse_file_operations;
 	inode->i_data.a_ops = &fuse_file_aops;
+
+	INIT_LIST_HEAD(&fi->write_files);
+	INIT_LIST_HEAD(&fi->queued_writes);
+	fi->writectr = 0;
+	init_waitqueue_head(&fi->page_waitq);
+	INIT_LIST_HEAD(&fi->writepages);
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index cec8b8e..d148188 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -9,6 +9,10 @@
 #ifndef _FS_FUSE_I_H
 #define _FS_FUSE_I_H
 
+#ifndef pr_fmt
+# define pr_fmt(fmt) "fuse: " fmt
+#endif
+
 #include <linux/fuse.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -28,8 +32,11 @@
 #include <linux/refcount.h>
 #include <linux/user_namespace.h>
 
-/** Max number of pages that can be used in a single read request */
-#define FUSE_MAX_PAGES_PER_REQ 32
+/** Default max number of pages that can be used in a single read request */
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/** Maximum of max_pages received in init_out */
+#define FUSE_MAX_MAX_PAGES 256
 
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
@@ -40,9 +47,6 @@
 /** Number of dentries for each connection in the control filesystem */
 #define FUSE_CTL_NUM_DENTRIES 5
 
-/** Number of page pointers embedded in fuse_req */
-#define FUSE_REQ_INLINE_PAGES 1
-
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -77,6 +81,9 @@
 	/** Time in jiffies until the file attributes are valid */
 	u64 i_time;
 
+	/* Which attributes are invalid */
+	u32 inval_mask;
+
 	/** The sticky bit in inode->i_mode may have been removed, so
 	    preserve the original mode */
 	umode_t orig_i_mode;
@@ -87,27 +94,60 @@
 	/** Version of last attribute change */
 	u64 attr_version;
 
-	/** Files usable in writepage.  Protected by fc->lock */
-	struct list_head write_files;
+	union {
+		/* Write related fields (regular file only) */
+		struct {
+			/* Files usable in writepage.  Protected by fi->lock */
+			struct list_head write_files;
 
-	/** Writepages pending on truncate or fsync */
-	struct list_head queued_writes;
+			/* Writepages pending on truncate or fsync */
+			struct list_head queued_writes;
 
-	/** Number of sent writes, a negative bias (FUSE_NOWRITE)
-	 * means more writes are blocked */
-	int writectr;
+			/* Number of sent writes, a negative bias
+			 * (FUSE_NOWRITE) means more writes are blocked */
+			int writectr;
 
-	/** Waitq for writepage completion */
-	wait_queue_head_t page_waitq;
+			/* Waitq for writepage completion */
+			wait_queue_head_t page_waitq;
 
-	/** List of writepage requestst (pending or sent) */
-	struct list_head writepages;
+			/* List of writepage requestst (pending or sent) */
+			struct list_head writepages;
+		};
+
+		/* readdir cache (directory only) */
+		struct {
+			/* true if fully cached */
+			bool cached;
+
+			/* size of cache */
+			loff_t size;
+
+			/* position at end of cache (position of next entry) */
+			loff_t pos;
+
+			/* version of the cache */
+			u64 version;
+
+			/* modification time of directory when cache was
+			 * started */
+			struct timespec64 mtime;
+
+			/* iversion of directory when cache was started */
+			u64 iversion;
+
+			/* protects above fields */
+			spinlock_t lock;
+		} rdc;
+	};
 
 	/** Miscellaneous bits describing inode state */
 	unsigned long state;
 
 	/** Lock for serializing lookup and readdir for back compatibility*/
 	struct mutex mutex;
+
+	/** Lock to protect write related fields */
+	spinlock_t lock;
 };
 
 /** FUSE inode state bits */
@@ -121,14 +161,15 @@
 };
 
 struct fuse_conn;
+struct fuse_release_args;
 
 /** FUSE specific file data */
 struct fuse_file {
 	/** Fuse connection for this file */
 	struct fuse_conn *fc;
 
-	/** Request reserved for flush and release */
-	struct fuse_req *reserved_req;
+	/* Argument space reserved for release */
+	struct fuse_release_args *release_args;
 
 	/** Kernel file handle guaranteed to be unique */
 	u64 kh;
@@ -148,6 +189,25 @@
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
 
+	/* Readdir related */
+	struct {
+		/*
+		 * Protects the fields below against (crazy) parallel readdir
+		 * on the same open file.  Uncontended in the normal case.
+		 */
+		struct mutex lock;
+
+		/* Dir stream position */
+		loff_t pos;
+
+		/* Offset in cache */
+		loff_t cache_off;
+
+		/* Version of cache we are reading */
+		u64 version;
+
+	} readdir;
+
 	/** RB node to be linked on fuse_conn->polled_files */
 	struct rb_node polled_node;
 
@@ -164,57 +224,12 @@
 	const void *value;
 };
 
-/** The request input */
-struct fuse_in {
-	/** The request header */
-	struct fuse_in_header h;
-
-	/** True if the data for the last argument is in req->pages */
-	unsigned argpages:1;
-
-	/** Number of arguments */
-	unsigned numargs;
-
-	/** Array of arguments */
-	struct fuse_in_arg args[3];
-};
-
 /** One output argument of a request */
 struct fuse_arg {
 	unsigned size;
 	void *value;
 };
 
-/** The request output */
-struct fuse_out {
-	/** Header returned from userspace */
-	struct fuse_out_header h;
-
-	/*
-	 * The following bitfields are not changed during the request
-	 * processing
-	 */
-
-	/** Last argument is variable length (can be shorter than
-	    arg->size) */
-	unsigned argvar:1;
-
-	/** Last argument is a list of pages to copy data to */
-	unsigned argpages:1;
-
-	/** Zero partially or not copied pages */
-	unsigned page_zeroing:1;
-
-	/** Pages may be replaced with new ones */
-	unsigned page_replace:1;
-
-	/** Number or arguments */
-	unsigned numargs;
-
-	/** Array of arguments */
-	struct fuse_arg args[2];
-};
-
 /** FUSE page descriptor */
 struct fuse_page_desc {
 	unsigned int length;
@@ -222,20 +237,28 @@
 };
 
 struct fuse_args {
-	struct {
-		struct {
-			uint32_t opcode;
-			uint64_t nodeid;
-		} h;
-		unsigned numargs;
-		struct fuse_in_arg args[3];
+	uint64_t nodeid;
+	uint32_t opcode;
+	unsigned short in_numargs;
+	unsigned short out_numargs;
+	bool force:1;
+	bool noreply:1;
+	bool nocreds:1;
+	bool in_pages:1;
+	bool out_pages:1;
+	bool out_argvar:1;
+	bool page_zeroing:1;
+	bool page_replace:1;
+	struct fuse_in_arg in_args[3];
+	struct fuse_arg out_args[2];
+	void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+};
 
-	} in;
-	struct {
-		unsigned argvar:1;
-		unsigned numargs;
-		struct fuse_arg args[2];
-	} out;
+struct fuse_args_pages {
+	struct fuse_args args;
+	struct page **pages;
+	struct fuse_page_desc *descs;
+	unsigned int num_pages;
 };
 
 #define FUSE_ARGS(args) struct fuse_args args = {}
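
Operation-specific bundles now embed struct fuse_args (fuse_args_pages above,
fuse_io_args and fuse_init_args elsewhere in this patch), and completion
callbacks recover the bundle with container_of(). A self-contained userspace
sketch of that idiom, with stand-in types:

	#include <stddef.h>
	#include <stdio.h>

	#define demo_container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct demo_args {
		int opcode;
		void (*end)(struct demo_args *args, int error);
	};

	struct demo_init_args {
		struct demo_args args;	/* embedded, like fuse_init_args */
		int out_major;
	};

	static void demo_end(struct demo_args *args, int error)
	{
		struct demo_init_args *ia =
			demo_container_of(args, struct demo_init_args, args);

		printf("error=%d major=%d\n", error, ia->out_major);
	}

	int main(void)
	{
		struct demo_init_args ia = {
			.args = { .opcode = 26, .end = demo_end },	/* 26 = FUSE_INIT */
			.out_major = 7,
		};

		ia.args.end(&ia.args, 0);
		return 0;
	}
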
@@ -308,86 +331,79 @@
 	/** Entry on the interrupts list  */
 	struct list_head intr_entry;
 
+	/* Input/output arguments */
+	struct fuse_args *args;
+
 	/** refcount */
 	refcount_t count;
 
-	/** Unique ID for the interrupt request */
-	u64 intr_unique;
-
 	/* Request flags, updated with test/set/clear_bit() */
 	unsigned long flags;
 
-	/** The request input */
-	struct fuse_in in;
+	/* The request input header */
+	struct {
+		struct fuse_in_header h;
+	} in;
 
-	/** The request output */
-	struct fuse_out out;
+	/* The request output header */
+	struct {
+		struct fuse_out_header h;
+	} out;
 
 	/** Used to wake up the task waiting for completion of request*/
 	wait_queue_head_t waitq;
 
-	/** Data for asynchronous requests */
-	union {
-		struct {
-			struct fuse_release_in in;
-			struct inode *inode;
-		} release;
-		struct fuse_init_in init_in;
-		struct fuse_init_out init_out;
-		struct cuse_init_in cuse_init_in;
-		struct {
-			struct fuse_read_in in;
-			u64 attr_ver;
-		} read;
-		struct {
-			struct fuse_write_in in;
-			struct fuse_write_out out;
-			struct fuse_req *next;
-		} write;
-		struct fuse_notify_retrieve_in retrieve_in;
-	} misc;
-
-	/** page vector */
-	struct page **pages;
-
-	/** page-descriptor vector */
-	struct fuse_page_desc *page_descs;
-
-	/** size of the 'pages' array */
-	unsigned max_pages;
-
-	/** inline page vector */
-	struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
-
-	/** inline page-descriptor vector */
-	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
-
-	/** number of pages in vector */
-	unsigned num_pages;
-
-	/** File used in the request (or NULL) */
-	struct fuse_file *ff;
-
-	/** Inode used in the request or NULL */
-	struct inode *inode;
-
-	/** AIO control block */
-	struct fuse_io_priv *io;
-
-	/** Link on fi->writepages */
-	struct list_head writepages_entry;
-
-	/** Request completion callback */
-	void (*end)(struct fuse_conn *, struct fuse_req *);
-
-	/** Request is stolen from fuse_file->reserved_req */
-	struct file *stolen_file;
+#if IS_ENABLED(CONFIG_VIRTIO_FS)
+	/** virtio-fs's physically contiguous buffer for in and out args */
+	void *argbuf;
+#endif
 };
 
+struct fuse_iqueue;
+
+/**
+ * Input queue callbacks
+ *
+ * Input queue signalling is device-specific.  For example, the /dev/fuse file
+ * uses fiq->waitq and fasync to wake processes that are waiting on queue
+ * readiness.  These callbacks allow other device types to respond to input
+ * queue activity.
+ */
+struct fuse_iqueue_ops {
+	/**
+	 * Signal that a forget has been queued
+	 */
+	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Signal that an INTERRUPT request has been queued
+	 */
+	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Signal that a request has been queued
+	 */
+	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Clean up when fuse_iqueue is destroyed
+	 */
+	void (*release)(struct fuse_iqueue *fiq);
+};
+
+/** /dev/fuse input queue operations */
+extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
+
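
The ops table above decouples queue signalling from /dev/fuse so that
virtio-fs (and potentially other transports) can plug in their own wakeup
mechanism and keep private state in fiq->priv. A runnable userspace analogue
of the pattern, with invented demo types:

	#include <stdio.h>

	struct demo_iqueue;

	struct demo_iqueue_ops {
		void (*wake_pending)(struct demo_iqueue *fiq);
	};

	struct demo_iqueue {
		const struct demo_iqueue_ops *ops;
		void *priv;		/* device-specific state */
	};

	static void dev_wake(struct demo_iqueue *fiq)
	{
		/* /dev/fuse would wake fiq->waitq and kick fasync here */
		printf("wake reader via %s\n", (const char *)fiq->priv);
	}

	static const struct demo_iqueue_ops demo_dev_ops = {
		.wake_pending = dev_wake,
	};

	int main(void)
	{
		struct demo_iqueue fiq = {
			.ops = &demo_dev_ops,
			.priv = "waitq+fasync",
		};

		fiq.ops->wake_pending(&fiq);	/* request queued: notify transport */
		return 0;
	}
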
 struct fuse_iqueue {
 	/** Connection established */
 	unsigned connected;
 
+	/** Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
 	/** Readers of the connection are waiting on this */
 	wait_queue_head_t waitq;
 
@@ -409,8 +425,17 @@
 
 	/** O_ASYNC requests */
 	struct fasync_struct *fasync;
+
+	/** Device-specific callbacks */
+	const struct fuse_iqueue_ops *ops;
+
+	/** Device-specific state */
+	void *priv;
 };
 
+#define FUSE_PQ_HASH_BITS 8
+#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS)
+
 struct fuse_pqueue {
 	/** Connection established */
 	unsigned connected;
@@ -418,8 +443,8 @@
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
-	/** The list of requests being processed */
-	struct list_head processing;
+	/** Hash table of requests being processed */
+	struct list_head *processing;
 
 	/** The list of requests under I/O */
 	struct list_head io;
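
Replacing the single processing list with FUSE_PQ_HASH_SIZE buckets means
matching a reply to its request no longer scans every in-flight request; the
kernel buckets by hashing the request's unique ID with the low bit masked
(interrupts share their parent's ID). A runnable sketch of the bucketing,
with a plain modulo standing in for the kernel's hash_long():

	#include <stdio.h>

	#define DEMO_PQ_HASH_BITS 8
	#define DEMO_PQ_HASH_SIZE (1 << DEMO_PQ_HASH_BITS)

	static unsigned int demo_req_hash(unsigned long long unique)
	{
		return (unsigned int)((unique & ~1ULL) % DEMO_PQ_HASH_SIZE);
	}

	int main(void)
	{
		unsigned long long unique;

		for (unique = 2; unique <= 2048; unique *= 4)
			printf("unique %4llu -> bucket %u\n", unique,
			       demo_req_hash(unique));
		return 0;
	}
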
@@ -439,6 +464,30 @@
 	struct list_head entry;
 };
 
+struct fuse_fs_context {
+	int fd;
+	unsigned int rootmode;
+	kuid_t user_id;
+	kgid_t group_id;
+	bool is_bdev:1;
+	bool fd_present:1;
+	bool rootmode_present:1;
+	bool user_id_present:1;
+	bool group_id_present:1;
+	bool default_permissions:1;
+	bool allow_other:1;
+	bool destroy:1;
+	bool no_control:1;
+	bool no_force_umount:1;
+	bool no_mount_options:1;
+	unsigned int max_read;
+	unsigned int blksize;
+	const char *subtype;
+
+	/* fuse_dev pointer to fill in, should contain NULL on entry */
+	void **fudptr;
+};
+
 /**
  * A Fuse connection.
  *
@@ -476,11 +525,14 @@
 	/** Maximum write size */
 	unsigned max_write;
 
+	/** Maximum number of pages that can be used in a single request */
+	unsigned int max_pages;
+
 	/** Input queue */
 	struct fuse_iqueue iq;
 
 	/** The next unique kernel file handle */
-	u64 khctr;
+	atomic64_t khctr;
 
 	/** rbtree of fuse_files waiting for poll events indexed by ph */
 	struct rb_root polled_files;
@@ -500,6 +552,10 @@
 	/** The list of background requests set aside for later queuing */
 	struct list_head bg_queue;
 
+	/** Protects: max_background, congestion_threshold, num_background,
+	 * active_background, bg_queue, blocked */
+	spinlock_t bg_lock;
+
 	/** Flag indicating that INIT reply has been received. Allocating
 	 * any fuse request will be suspended until the flag is set */
 	int initialized;
@@ -512,9 +568,6 @@
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
-	/** waitq for reserved requests */
-	wait_queue_head_t reserved_req_waitq;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -551,6 +604,9 @@
 	/** handle fs handles killing suid/sgid/cap on write/chown/trunc */
 	unsigned handle_killpriv:1;
 
+	/** cache READLINK responses in page cache */
+	unsigned cache_symlinks:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -559,6 +615,9 @@
 	/** Is open/release not implemented by fs? */
 	unsigned no_open:1;
 
+	/** Is opendir/releasedir not implemented by fs? */
+	unsigned no_opendir:1;
+
 	/** Is fsync not implemented by fs? */
 	unsigned no_fsync:1;
 
@@ -616,6 +675,9 @@
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
+	/** Filesystem is fully responsible for page cache invalidation. */
+	unsigned explicit_inval_data:1;
+
 	/** Does the filesystem support readdirplus? */
 	unsigned do_readdirplus:1;
 
@@ -637,6 +699,24 @@
 	/** Allow other than the mounter user to access the filesystem ? */
 	unsigned allow_other:1;
 
+	/** Does the filesystem support copy_file_range? */
+	unsigned no_copy_file_range:1;
+
+	/* Send DESTROY request */
+	unsigned int destroy:1;
+
+	/* Delete dentries that have gone stale */
+	unsigned int delete_stale:1;
+
+	/** Do not create entry in fusectl fs */
+	unsigned int no_control:1;
+
+	/** Do not allow MNT_FORCE umount */
+	unsigned int no_force_umount:1;
+
+	/* Do not show mount options */
+	unsigned int no_mount_options:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -658,11 +738,8 @@
 	/** Key for lock owner ID scrambling */
 	u32 scramble_key[4];
 
-	/** Reserved request for the DESTROY message */
-	struct fuse_req *destroy_req;
-
 	/** Version counter for attribute changes */
-	u64 attr_version;
+	atomic64_t attr_version;
 
 	/** Called on final put */
 	void (*release)(struct fuse_conn *);
@@ -697,6 +774,16 @@
 	return get_fuse_inode(inode)->nodeid;
 }
 
+static inline int invalid_nodeid(u64 nodeid)
+{
+	return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
+static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+	return atomic64_read(&fc->attr_version);
+}
+
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
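
attr_version is now an atomic64_t, so the new fuse_get_attr_version() above
reads it without taking fc->lock. A self-contained analogue with C11 atomics:

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic long long attr_version = 1;

	static long long get_attr_version(void)
	{
		return atomic_load(&attr_version);	/* was: fc->lock + plain u64 */
	}

	int main(void)
	{
		atomic_fetch_add(&attr_version, 1);	/* an attribute changed */
		printf("attr_version = %lld\n", get_attr_version());
		return 0;
	}
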
 
@@ -726,14 +813,32 @@
 
 struct fuse_forget_link *fuse_alloc_forget(void);
 
-/* Used by READDIRPLUS */
-void fuse_force_forget(struct file *file, u64 nodeid);
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+					     unsigned int max,
+					     unsigned int *countp);
 
-/**
+/*
  * Initialize READ or READDIR request
  */
-void fuse_read_fill(struct fuse_req *req, struct file *file,
-		    loff_t pos, size_t count, int opcode);
+struct fuse_io_args {
+	union {
+		struct {
+			struct fuse_read_in in;
+			u64 attr_ver;
+		} read;
+		struct {
+			struct fuse_write_in in;
+			struct fuse_write_out out;
+		} write;
+	};
+	struct fuse_args_pages ap;
+	struct fuse_io_priv *io;
+	struct fuse_file *ff;
+};
+
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+			 size_t count, int opcode);
+
 
 /**
  * Send OPEN or OPENDIR request
@@ -744,7 +849,7 @@
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file);
 
-void fuse_sync_release(struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
 
 /**
  * Send RELEASE or RELEASEDIR request
@@ -755,7 +860,7 @@
  * Send FSYNC or FSYNCDIR request
  */
 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
-		      int datasync, int isdir);
+		      int datasync, int opcode);
 
 /**
  * Notify poll wakeup
@@ -806,62 +911,19 @@
 void __exit fuse_ctl_cleanup(void);
 
 /**
- * Allocate a request
- */
-struct fuse_req *fuse_request_alloc(unsigned npages);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
-
-/**
- * Free a request
- */
-void fuse_request_free(struct fuse_req *req);
-
-/**
- * Get a request, may fail with -ENOMEM,
- * caller should specify # elements in req->pages[] explicitly
- */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
-struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
-					     unsigned npages);
-
-/*
- * Increment reference count on request
- */
-void __fuse_get_request(struct fuse_req *req);
-
-/**
- * Gets a requests for a file operation, always succeeds
- */
-struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
-					     struct file *file);
-
-/**
- * Decrement reference count of a request.  If count goes to zero free
- * the request.
- */
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
-
-/**
- * Send a request (synchronous)
- */
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
-
-/**
  * Simple request sending that does request allocation and freeing
  */
 ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
+int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+			   gfp_t gfp_flags);
 
 /**
- * Send a request in the background
+ * End a finished request
  */
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-
-void fuse_request_send_background_locked(struct fuse_conn *fc,
-					 struct fuse_req *req);
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
 
 /* Abort all requests */
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+void fuse_abort_conn(struct fuse_conn *fc);
 void fuse_wait_aborted(struct fuse_conn *fc);
 
 /**
@@ -873,6 +935,9 @@
 
 void fuse_invalidate_atime(struct inode *inode);
 
+u64 entry_attr_timeout(struct fuse_entry_out *o);
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
+
 /**
  * Acquire reference to fuse_conn
  */
@@ -881,15 +946,33 @@
 /**
  * Initialize fuse_conn
  */
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+		    const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
 
 /**
  * Release reference to fuse_conn
  */
 void fuse_conn_put(struct fuse_conn *fc);
 
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc(void);
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
 void fuse_dev_free(struct fuse_dev *fud);
+void fuse_send_init(struct fuse_conn *fc);
+
+/**
+ * Fill in superblock and initialize fuse connection
+ * @sb: partially-initialized superblock to fill in
+ * @ctx: mount context
+ */
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
+
+/**
+ * Disassociate fuse connection from superblock and kill the superblock
+ *
+ * Calls kill_anon_super(), do not use with bdev mounts.
+ */
+void fuse_kill_sb_anon(struct super_block *sb);
 
 /**
  * Add connection to control filesystem
@@ -922,8 +1005,6 @@
 void fuse_set_nowrite(struct inode *inode);
 void fuse_release_nowrite(struct inode *inode);
 
-u64 fuse_get_attr_version(struct fuse_conn *fc);
-
 /**
  * File-system tells the kernel to invalidate cache for the given node id.
  */
@@ -992,4 +1073,19 @@
 struct posix_acl *fuse_get_acl(struct inode *inode, int type);
 int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 
+
+/* readdir.c */
+int fuse_readdir(struct file *file, struct dir_context *ctx);
+
+/**
+ * Return the number of bytes in an arguments list
+ */
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
+
+/**
+ * Get the next unique ID for a request
+ */
+u64 fuse_get_unique(struct fuse_iqueue *fiq);
+void fuse_free_conn(struct fuse_conn *fc);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index db9e60b..16aec32 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -15,7 +15,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
 #include <linux/sched.h>
@@ -59,83 +60,65 @@
 /** Congestion starts at 75% of maximum */
 #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
 
-struct fuse_mount_data {
-	int fd;
-	unsigned rootmode;
-	kuid_t user_id;
-	kgid_t group_id;
-	unsigned fd_present:1;
-	unsigned rootmode_present:1;
-	unsigned user_id_present:1;
-	unsigned group_id_present:1;
-	unsigned default_permissions:1;
-	unsigned allow_other:1;
-	unsigned max_read;
-	unsigned blksize;
-};
+#ifdef CONFIG_BLOCK
+static struct file_system_type fuseblk_fs_type;
+#endif
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
 {
-	struct inode *inode;
 	struct fuse_inode *fi;
 
-	inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
-	if (!inode)
+	fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+	if (!fi)
 		return NULL;
 
-	fi = get_fuse_inode(inode);
 	fi->i_time = 0;
+	fi->inval_mask = 0;
 	fi->nodeid = 0;
 	fi->nlookup = 0;
 	fi->attr_version = 0;
-	fi->writectr = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
-	INIT_LIST_HEAD(&fi->write_files);
-	INIT_LIST_HEAD(&fi->queued_writes);
-	INIT_LIST_HEAD(&fi->writepages);
-	init_waitqueue_head(&fi->page_waitq);
 	mutex_init(&fi->mutex);
+	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget) {
-		kmem_cache_free(fuse_inode_cachep, inode);
+		kmem_cache_free(fuse_inode_cachep, fi);
 		return NULL;
 	}
 
-	return inode;
+	return &fi->inode;
 }
 
-static void fuse_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(fuse_inode_cachep, inode);
-}
-
-static void fuse_destroy_inode(struct inode *inode)
+static void fuse_free_inode(struct inode *inode)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	BUG_ON(!list_empty(&fi->write_files));
-	BUG_ON(!list_empty(&fi->queued_writes));
+
 	mutex_destroy(&fi->mutex);
 	kfree(fi->forget);
-	call_rcu(&inode->i_rcu, fuse_i_callback);
+	kmem_cache_free(fuse_inode_cachep, fi);
 }
 
 static void fuse_evict_inode(struct inode *inode)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	if (inode->i_sb->s_flags & SB_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
 		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
 		fi->forget = NULL;
 	}
+	if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
+		WARN_ON(!list_empty(&fi->write_files));
+		WARN_ON(!list_empty(&fi->queued_writes));
+	}
 }
 
 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -165,8 +148,11 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	fi->attr_version = ++fc->attr_version;
+	lockdep_assert_held(&fi->lock);
+
+	fi->attr_version = atomic64_inc_return(&fc->attr_version);
 	fi->i_time = attr_valid;
+	WRITE_ONCE(fi->inval_mask, 0);
 
 	inode->i_ino     = fuse_squash_ino(attr->ino);
 	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -210,10 +196,10 @@
 	loff_t oldsize;
 	struct timespec64 old_mtime;
 
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	if ((attr_version != 0 && fi->attr_version > attr_version) ||
 	    test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
-		spin_unlock(&fc->lock);
+		spin_unlock(&fi->lock);
 		return;
 	}
 
@@ -228,14 +214,15 @@
 	 */
 	if (!is_wb || !S_ISREG(inode->i_mode))
 		i_size_write(inode, attr->size);
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 
 	if (!is_wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
 			truncate_pagecache(inode, attr->size);
-			inval = true;
+			if (!fc->explicit_inval_data)
+				inval = true;
 		} else if (fc->auto_inval_data) {
 			struct timespec64 new_mtime = {
 				.tv_sec = attr->mtime,
@@ -323,9 +310,9 @@
 	}
 
 	fi = get_fuse_inode(inode);
-	spin_lock(&fc->lock);
+	spin_lock(&fi->lock);
 	fi->nlookup++;
-	spin_unlock(&fc->lock);
+	spin_unlock(&fi->lock);
 	fuse_change_attributes(inode, attr, attr_valid, attr_version);
 
 	return inode;
@@ -377,19 +364,21 @@
 
 static void fuse_umount_begin(struct super_block *sb)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb), false);
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (!fc->no_force_umount)
+		fuse_abort_conn(fc);
 }
 
 static void fuse_send_destroy(struct fuse_conn *fc)
 {
-	struct fuse_req *req = fc->destroy_req;
-	if (req && fc->conn_init) {
-		fc->destroy_req = NULL;
-		req->in.h.opcode = FUSE_DESTROY;
-		__set_bit(FR_FORCE, &req->flags);
-		__clear_bit(FR_BACKGROUND, &req->flags);
-		fuse_request_send(fc, req);
-		fuse_put_request(fc, req);
+	if (fc->conn_init) {
+		FUSE_ARGS(args);
+
+		args.opcode = FUSE_DESTROY;
+		args.force = true;
+		args.nocreds = true;
+		fuse_simple_request(fc, &args);
 	}
 }
 
@@ -433,12 +422,12 @@
 	}
 
 	memset(&outarg, 0, sizeof(outarg));
-	args.in.numargs = 0;
-	args.in.h.opcode = FUSE_STATFS;
-	args.in.h.nodeid = get_node_id(d_inode(dentry));
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.in_numargs = 0;
+	args.opcode = FUSE_STATFS;
+	args.nodeid = get_node_id(d_inode(dentry));
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err)
 		convert_fuse_statfs(buf, &outarg.st);
@@ -446,6 +435,8 @@
 }
 
 enum {
+	OPT_SOURCE,
+	OPT_SUBTYPE,
 	OPT_FD,
 	OPT_ROOTMODE,
 	OPT_USER_ID,
@@ -457,111 +448,109 @@
 	OPT_ERR
 };
 
-static const match_table_t tokens = {
-	{OPT_FD,			"fd=%u"},
-	{OPT_ROOTMODE,			"rootmode=%o"},
-	{OPT_USER_ID,			"user_id=%u"},
-	{OPT_GROUP_ID,			"group_id=%u"},
-	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
-	{OPT_ALLOW_OTHER,		"allow_other"},
-	{OPT_MAX_READ,			"max_read=%u"},
-	{OPT_BLKSIZE,			"blksize=%u"},
-	{OPT_ERR,			NULL}
+static const struct fs_parameter_spec fuse_param_specs[] = {
+	fsparam_string	("source",		OPT_SOURCE),
+	fsparam_u32	("fd",			OPT_FD),
+	fsparam_u32oct	("rootmode",		OPT_ROOTMODE),
+	fsparam_u32	("user_id",		OPT_USER_ID),
+	fsparam_u32	("group_id",		OPT_GROUP_ID),
+	fsparam_flag	("default_permissions",	OPT_DEFAULT_PERMISSIONS),
+	fsparam_flag	("allow_other",		OPT_ALLOW_OTHER),
+	fsparam_u32	("max_read",		OPT_MAX_READ),
+	fsparam_u32	("blksize",		OPT_BLKSIZE),
+	fsparam_string	("subtype",		OPT_SUBTYPE),
+	{}
 };
 
-static int fuse_match_uint(substring_t *s, unsigned int *res)
+static const struct fs_parameter_description fuse_fs_parameters = {
+	.name		= "fuse",
+	.specs		= fuse_param_specs,
+};
+
+static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	int err = -ENOMEM;
-	char *buf = match_strdup(s);
-	if (buf) {
-		err = kstrtouint(buf, 10, res);
-		kfree(buf);
-	}
-	return err;
-}
+	struct fs_parse_result result;
+	struct fuse_fs_context *ctx = fc->fs_private;
+	int opt;
 
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
-			  struct user_namespace *user_ns)
-{
-	char *p;
-	memset(d, 0, sizeof(struct fuse_mount_data));
-	d->max_read = ~0;
-	d->blksize = FUSE_DEFAULT_BLKSIZE;
+	opt = fs_parse(fc, &fuse_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	while ((p = strsep(&opt, ",")) != NULL) {
-		int token;
-		int value;
-		unsigned uv;
-		substring_t args[MAX_OPT_ARGS];
-		if (!*p)
-			continue;
+	switch (opt) {
+	case OPT_SOURCE:
+		if (fc->source)
+			return invalf(fc, "fuse: Multiple sources specified");
+		fc->source = param->string;
+		param->string = NULL;
+		break;
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case OPT_FD:
-			if (match_int(&args[0], &value))
-				return 0;
-			d->fd = value;
-			d->fd_present = 1;
-			break;
-
-		case OPT_ROOTMODE:
-			if (match_octal(&args[0], &value))
-				return 0;
-			if (!fuse_valid_type(value))
-				return 0;
-			d->rootmode = value;
-			d->rootmode_present = 1;
-			break;
-
-		case OPT_USER_ID:
-			if (fuse_match_uint(&args[0], &uv))
-				return 0;
-			d->user_id = make_kuid(user_ns, uv);
-			if (!uid_valid(d->user_id))
-				return 0;
-			d->user_id_present = 1;
-			break;
-
-		case OPT_GROUP_ID:
-			if (fuse_match_uint(&args[0], &uv))
-				return 0;
-			d->group_id = make_kgid(user_ns, uv);
-			if (!gid_valid(d->group_id))
-				return 0;
-			d->group_id_present = 1;
-			break;
-
-		case OPT_DEFAULT_PERMISSIONS:
-			d->default_permissions = 1;
-			break;
-
-		case OPT_ALLOW_OTHER:
-			d->allow_other = 1;
-			break;
-
-		case OPT_MAX_READ:
-			if (match_int(&args[0], &value))
-				return 0;
-			d->max_read = value;
-			break;
-
-		case OPT_BLKSIZE:
-			if (!is_bdev || match_int(&args[0], &value))
-				return 0;
-			d->blksize = value;
-			break;
-
-		default:
-			return 0;
-		}
-	}
-
-	if (!d->fd_present || !d->rootmode_present ||
-	    !d->user_id_present || !d->group_id_present)
+	case OPT_SUBTYPE:
+		if (ctx->subtype)
+			return invalf(fc, "fuse: Multiple subtypes specified");
+		ctx->subtype = param->string;
+		param->string = NULL;
 		return 0;
 
-	return 1;
+	case OPT_FD:
+		ctx->fd = result.uint_32;
+		ctx->fd_present = 1;
+		break;
+
+	case OPT_ROOTMODE:
+		if (!fuse_valid_type(result.uint_32))
+			return invalf(fc, "fuse: Invalid rootmode");
+		ctx->rootmode = result.uint_32;
+		ctx->rootmode_present = 1;
+		break;
+
+	case OPT_USER_ID:
+		ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+		if (!uid_valid(ctx->user_id))
+			return invalf(fc, "fuse: Invalid user_id");
+		ctx->user_id_present = 1;
+		break;
+
+	case OPT_GROUP_ID:
+		ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+		if (!gid_valid(ctx->group_id))
+			return invalf(fc, "fuse: Invalid group_id");
+		ctx->group_id_present = 1;
+		break;
+
+	case OPT_DEFAULT_PERMISSIONS:
+		ctx->default_permissions = 1;
+		break;
+
+	case OPT_ALLOW_OTHER:
+		ctx->allow_other = 1;
+		break;
+
+	case OPT_MAX_READ:
+		ctx->max_read = result.uint_32;
+		break;
+
+	case OPT_BLKSIZE:
+		if (!ctx->is_bdev)
+			return invalf(fc, "fuse: blksize only supported for fuseblk");
+		ctx->blksize = result.uint_32;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void fuse_free_fc(struct fs_context *fc)
+{
+	struct fuse_fs_context *ctx = fc->fs_private;
+
+	if (ctx) {
+		kfree(ctx->subtype);
+		kfree(ctx);
+	}
 }
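
With option parsing moved to fs_parameter_spec above, fuse can also be set up
through the new mount API, one parameter at a time. A hedged userspace sketch
using raw syscalls (assumes kernel/libc headers new enough to define
SYS_fsopen and friends; a live FUSE server must service the fd for the mount
to become usable, and error handling is trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int main(void)
	{
		char fdstr[16];
		int devfd = open("/dev/fuse", O_RDWR);
		int fsfd = syscall(SYS_fsopen, "fuse", 0);
		int mfd;

		if (devfd < 0 || fsfd < 0)
			return 1;
		snprintf(fdstr, sizeof(fdstr), "%d", devfd);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "fd", fdstr, 0);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "rootmode", "40000", 0);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "user_id", "0", 0);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "group_id", "0", 0);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
		mfd = syscall(SYS_fsmount, fsfd, 0, 0);
		if (mfd >= 0)
			syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt",
				MOVE_MOUNT_F_EMPTY_PATH);
		return 0;
	}
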
 
 static int fuse_show_options(struct seq_file *m, struct dentry *root)
@@ -569,6 +558,9 @@
 	struct super_block *sb = root->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	if (fc->no_mount_options)
+		return 0;
+
 	seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
 	seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
 	if (fc->default_permissions)
@@ -582,58 +574,69 @@
 	return 0;
 }
 
-static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+static void fuse_iqueue_init(struct fuse_iqueue *fiq,
+			     const struct fuse_iqueue_ops *ops,
+			     void *priv)
 {
 	memset(fiq, 0, sizeof(struct fuse_iqueue));
+	spin_lock_init(&fiq->lock);
 	init_waitqueue_head(&fiq->waitq);
 	INIT_LIST_HEAD(&fiq->pending);
 	INIT_LIST_HEAD(&fiq->interrupts);
 	fiq->forget_list_tail = &fiq->forget_list_head;
 	fiq->connected = 1;
+	fiq->ops = ops;
+	fiq->priv = priv;
 }
 
 static void fuse_pqueue_init(struct fuse_pqueue *fpq)
 {
-	memset(fpq, 0, sizeof(struct fuse_pqueue));
+	unsigned int i;
+
 	spin_lock_init(&fpq->lock);
-	INIT_LIST_HEAD(&fpq->processing);
+	for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&fpq->processing[i]);
 	INIT_LIST_HEAD(&fpq->io);
 	fpq->connected = 1;
 }
 
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+		    const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
+	spin_lock_init(&fc->bg_lock);
 	init_rwsem(&fc->killsb);
 	refcount_set(&fc->count, 1);
 	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
-	init_waitqueue_head(&fc->reserved_req_waitq);
-	fuse_iqueue_init(&fc->iq);
+	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	INIT_LIST_HEAD(&fc->devices);
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
-	fc->khctr = 0;
+	atomic64_set(&fc->khctr, 0);
 	fc->polled_files = RB_ROOT;
 	fc->blocked = 0;
 	fc->initialized = 0;
 	fc->connected = 1;
-	fc->attr_version = 1;
+	atomic64_set(&fc->attr_version, 1);
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 	fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
 	fc->user_ns = get_user_ns(user_ns);
+	fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
 void fuse_conn_put(struct fuse_conn *fc)
 {
 	if (refcount_dec_and_test(&fc->count)) {
-		if (fc->destroy_req)
-			fuse_request_free(fc->destroy_req);
+		struct fuse_iqueue *fiq = &fc->iq;
+
+		if (fiq->ops->release)
+			fiq->ops->release(fiq);
 		put_pid_ns(fc->pid_ns);
 		put_user_ns(fc->user_ns);
 		fc->release(fc);
@@ -808,7 +811,7 @@
 
 static const struct super_operations fuse_super_operations = {
 	.alloc_inode    = fuse_alloc_inode,
-	.destroy_inode  = fuse_destroy_inode,
+	.free_inode     = fuse_free_inode,
 	.evict_inode	= fuse_evict_inode,
 	.write_inode	= fuse_write_inode,
 	.drop_inode	= generic_delete_inode,
@@ -821,9 +824,12 @@
 
 static void sanitize_global_limit(unsigned *limit)
 {
+	/*
+	 * The default maximum number of async requests is calculated to consume
+	 * 1/2^13 of the total memory, assuming 392 bytes per request.
+	 */
 	if (*limit == 0)
-		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
-			 sizeof(struct fuse_req);
+		*limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
 
 	if (*limit >= 1 << 16)
 		*limit = (1 << 16) - 1;
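
A worked example of the default above, assuming 8 GiB of RAM and the
hard-coded 392-byte per-request estimate that replaces
sizeof(struct fuse_req):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long total = 8ULL << 30;		/* 8 GiB, assumed */
		unsigned long limit = (total >> 13) / 392;	/* 1048576 / 392 = 2674 */

		if (limit >= 1 << 16)
			limit = (1 << 16) - 1;			/* cap at 65535 */
		printf("max_background default: %lu\n", limit);
		return 0;
	}
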
@@ -852,6 +858,7 @@
 	sanitize_global_limit(&max_user_bgreq);
 	sanitize_global_limit(&max_user_congthresh);
 
+	spin_lock(&fc->bg_lock);
 	if (arg->max_background) {
 		fc->max_background = arg->max_background;
 
@@ -865,13 +872,22 @@
 		    fc->congestion_threshold > max_user_congthresh)
 			fc->congestion_threshold = max_user_congthresh;
 	}
+	spin_unlock(&fc->bg_lock);
 }
 
-static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
-{
-	struct fuse_init_out *arg = &req->misc.init_out;
+struct fuse_init_args {
+	struct fuse_args args;
+	struct fuse_init_in in;
+	struct fuse_init_out out;
+};
 
-	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
+static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+			       int error)
+{
+	struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
+	struct fuse_init_out *arg = &ia->out;
+
+	if (error || arg->major != FUSE_KERNEL_VERSION)
 		fc->conn_error = 1;
 	else {
 		unsigned long ra_pages;
@@ -904,6 +920,8 @@
 				fc->dont_mask = 1;
 			if (arg->flags & FUSE_AUTO_INVAL_DATA)
 				fc->auto_inval_data = 1;
+			else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA)
+				fc->explicit_inval_data = 1;
 			if (arg->flags & FUSE_DO_READDIRPLUS) {
 				fc->do_readdirplus = 1;
 				if (arg->flags & FUSE_READDIRPLUS_AUTO)
@@ -924,8 +942,15 @@
 				fc->posix_acl = 1;
 				fc->sb->s_xattr = fuse_acl_xattr_handlers;
 			}
+			if (arg->flags & FUSE_CACHE_SYMLINKS)
+				fc->cache_symlinks = 1;
 			if (arg->flags & FUSE_ABORT_ERROR)
 				fc->abort_err = 1;
+			if (arg->flags & FUSE_MAX_PAGES) {
+				fc->max_pages =
+					min_t(unsigned int, FUSE_MAX_MAX_PAGES,
+					max_t(unsigned int, arg->max_pages, 1));
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -939,45 +964,57 @@
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
+	kfree(ia);
+
 	fuse_set_initialized(fc);
 	wake_up_all(&fc->blocked_waitq);
 }
 
-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_send_init(struct fuse_conn *fc)
 {
-	struct fuse_init_in *arg = &req->misc.init_in;
+	struct fuse_init_args *ia;
 
-	arg->major = FUSE_KERNEL_VERSION;
-	arg->minor = FUSE_KERNEL_MINOR_VERSION;
-	arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
+
+	ia->in.major = FUSE_KERNEL_VERSION;
+	ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+	ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+	ia->in.flags |=
+		FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
 		FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
 		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
 		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
-		FUSE_ABORT_ERROR;
-	req->in.h.opcode = FUSE_INIT;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(*arg);
-	req->in.args[0].value = arg;
-	req->out.numargs = 1;
+		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+	ia->args.opcode = FUSE_INIT;
+	ia->args.in_numargs = 1;
+	ia->args.in_args[0].size = sizeof(ia->in);
+	ia->args.in_args[0].value = &ia->in;
+	ia->args.out_numargs = 1;
 	/* Variable length argument used for backward compatibility
 	   with interface version < 7.5.  Rest of init_out is zeroed
 	   by do_get_request(), so a short reply is not a problem */
-	req->out.argvar = 1;
-	req->out.args[0].size = sizeof(struct fuse_init_out);
-	req->out.args[0].value = &req->misc.init_out;
-	req->end = process_init_reply;
-	fuse_request_send_background(fc, req);
-}
+	ia->args.out_argvar = 1;
+	ia->args.out_args[0].size = sizeof(ia->out);
+	ia->args.out_args[0].value = &ia->out;
+	ia->args.force = true;
+	ia->args.nocreds = true;
+	ia->args.end = process_init_reply;
 
-static void fuse_free_conn(struct fuse_conn *fc)
+	if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
+		process_init_reply(fc, &ia->args, -ENOTCONN);
+}
+EXPORT_SYMBOL_GPL(fuse_send_init);
+
+void fuse_free_conn(struct fuse_conn *fc)
 {
 	WARN_ON(!list_empty(&fc->devices));
 	kfree_rcu(fc, rcu);
 }
+EXPORT_SYMBOL_GPL(fuse_free_conn);
 
 static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
@@ -998,7 +1035,7 @@
 	if (err)
 		return err;
 
-	sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 	/* fuse does it's own writeback accounting */
 	sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
@@ -1019,24 +1056,50 @@
 	return 0;
 }
 
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+struct fuse_dev *fuse_dev_alloc(void)
 {
 	struct fuse_dev *fud;
+	struct list_head *pq;
 
 	fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
-	if (fud) {
-		fud->fc = fuse_conn_get(fc);
-		fuse_pqueue_init(&fud->pq);
+	if (!fud)
+		return NULL;
 
-		spin_lock(&fc->lock);
-		list_add_tail(&fud->entry, &fc->devices);
-		spin_unlock(&fc->lock);
+	pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+	if (!pq) {
+		kfree(fud);
+		return NULL;
 	}
 
+	fud->pq.processing = pq;
+	fuse_pqueue_init(&fud->pq);
+
 	return fud;
 }
 EXPORT_SYMBOL_GPL(fuse_dev_alloc);
 
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc)
+{
+	fud->fc = fuse_conn_get(fc);
+	spin_lock(&fc->lock);
+	list_add_tail(&fud->entry, &fc->devices);
+	spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_install);
+
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+
+	fud = fuse_dev_alloc();
+	if (!fud)
+		return NULL;
+
+	fuse_dev_install(fud, fc);
+	return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc_install);
+
 void fuse_dev_free(struct fuse_dev *fud)
 {
 	struct fuse_conn *fc = fud->fc;
@@ -1048,21 +1111,18 @@
 
 		fuse_conn_put(fc);
 	}
+	kfree(fud->pq.processing);
 	kfree(fud);
 }
 EXPORT_SYMBOL_GPL(fuse_dev_free);
 
-static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 {
 	struct fuse_dev *fud;
-	struct fuse_conn *fc;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct inode *root;
-	struct fuse_mount_data d;
-	struct file *file;
 	struct dentry *root_dentry;
-	struct fuse_req *init_req;
 	int err;
-	int is_bdev = sb->s_bdev != NULL;
 
 	err = -EINVAL;
 	if (sb->s_flags & SB_MANDLOCK)
@@ -1070,19 +1130,19 @@
 
 	sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
 
-	if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
-		goto err;
-
-	if (is_bdev) {
+	if (ctx->is_bdev) {
 #ifdef CONFIG_BLOCK
 		err = -EINVAL;
-		if (!sb_set_blocksize(sb, d.blksize))
+		if (!sb_set_blocksize(sb, ctx->blksize))
 			goto err;
 #endif
 	} else {
 		sb->s_blocksize = PAGE_SIZE;
 		sb->s_blocksize_bits = PAGE_SHIFT;
 	}
+
+	sb->s_subtype = ctx->subtype;
+	ctx->subtype = NULL;
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_xattr = fuse_xattr_handlers;
@@ -1093,19 +1153,6 @@
 	if (sb->s_user_ns != &init_user_ns)
 		sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
 
-	file = fget(d.fd);
-	err = -EINVAL;
-	if (!file)
-		goto err;
-
-	/*
-	 * Require mount to happen from the same user namespace which
-	 * opened /dev/fuse to prevent potential attacks.
-	 */
-	if (file->f_op != &fuse_dev_operations ||
-	    file->f_cred->user_ns != sb->s_user_ns)
-		goto err_fput;
-
 	/*
 	 * If we are not in the initial user namespace posix
 	 * acls must be translated.
@@ -1113,17 +1160,9 @@
 	if (sb->s_user_ns != &init_user_ns)
 		sb->s_xattr = fuse_no_acl_xattr_handlers;
 
-	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	err = -ENOMEM;
-	if (!fc)
-		goto err_fput;
-
-	fuse_conn_init(fc, sb->s_user_ns);
-	fc->release = fuse_free_conn;
-
-	fud = fuse_dev_alloc(fc);
+	fud = fuse_dev_alloc_install(fc);
 	if (!fud)
-		goto err_put_conn;
+		goto err;
 
 	fc->dev = sb->s_dev;
 	fc->sb = sb;
@@ -1136,17 +1175,18 @@
 		fc->dont_mask = 1;
 	sb->s_flags |= SB_POSIXACL;
 
-	fc->default_permissions = d.default_permissions;
-	fc->allow_other = d.allow_other;
-	fc->user_id = d.user_id;
-	fc->group_id = d.group_id;
-	fc->max_read = max_t(unsigned, 4096, d.max_read);
-
-	/* Used by get_root_inode() */
-	sb->s_fs_info = fc;
+	fc->default_permissions = ctx->default_permissions;
+	fc->allow_other = ctx->allow_other;
+	fc->user_id = ctx->user_id;
+	fc->group_id = ctx->group_id;
+	fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+	fc->destroy = ctx->destroy;
+	fc->no_control = ctx->no_control;
+	fc->no_force_umount = ctx->no_force_umount;
+	fc->no_mount_options = ctx->no_mount_options;
 
 	err = -ENOMEM;
-	root = fuse_get_root_inode(sb, d.rootmode);
+	root = fuse_get_root_inode(sb, ctx->rootmode);
 	sb->s_d_op = &fuse_root_dentry_operations;
 	root_dentry = d_make_root(root);
 	if (!root_dentry)
@@ -1154,20 +1194,9 @@
 	/* Root dentry doesn't have .d_revalidate */
 	sb->s_d_op = &fuse_dentry_operations;
 
-	init_req = fuse_request_alloc(0);
-	if (!init_req)
-		goto err_put_root;
-	__set_bit(FR_BACKGROUND, &init_req->flags);
-
-	if (is_bdev) {
-		fc->destroy_req = fuse_request_alloc(0);
-		if (!fc->destroy_req)
-			goto err_free_init_req;
-	}
-
 	mutex_lock(&fuse_mutex);
 	err = -EINVAL;
-	if (file->private_data)
+	if (*ctx->fudptr)
 		goto err_unlock;
 
 	err = fuse_ctl_add_conn(fc);
@@ -1176,27 +1205,62 @@
 
 	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
-	file->private_data = fud;
+	*ctx->fudptr = fud;
 	mutex_unlock(&fuse_mutex);
+	return 0;
+
+ err_unlock:
+	mutex_unlock(&fuse_mutex);
+	dput(root_dentry);
+ err_dev_free:
+	fuse_dev_free(fud);
+ err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(fuse_fill_super_common);
+
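+/*
+ * The /dev/fuse based fill_super.  ctx->fudptr tells
+ * fuse_fill_super_common() where to publish the new fuse_dev: here it
+ * points at file->private_data, while virtio-fs points it at its
+ * request queue's fud pointer.
+ */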
+static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+	struct fuse_fs_context *ctx = fsc->fs_private;
+	struct file *file;
+	int err;
+	struct fuse_conn *fc;
+
+	err = -EINVAL;
+	file = fget(ctx->fd);
+	if (!file)
+		goto err;
+
+	/*
+	 * Require mount to happen from the same user namespace which
+	 * opened /dev/fuse to prevent potential attacks.
+	 */
+	if ((file->f_op != &fuse_dev_operations) ||
+	    (file->f_cred->user_ns != sb->s_user_ns))
+		goto err_fput;
+	ctx->fudptr = &file->private_data;
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!fc)
+		goto err_fput;
+
+	fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+	fc->release = fuse_free_conn;
+	sb->s_fs_info = fc;
+
+	err = fuse_fill_super_common(sb, ctx);
+	if (err)
+		goto err_put_conn;
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
 	 * memory barrier for file->private_data to be visible on all
 	 * CPUs after this
 	 */
 	fput(file);
-
-	fuse_send_init(fc, init_req);
-
+	fuse_send_init(get_fuse_conn_super(sb));
 	return 0;
 
- err_unlock:
-	mutex_unlock(&fuse_mutex);
- err_free_init_req:
-	fuse_request_free(init_req);
- err_put_root:
-	dput(root_dentry);
- err_dev_free:
-	fuse_dev_free(fud);
  err_put_conn:
 	fuse_conn_put(fc);
 	sb->s_fs_info = NULL;
@@ -1206,11 +1270,52 @@
 	return err;
 }
 
-static struct dentry *fuse_mount(struct file_system_type *fs_type,
-		       int flags, const char *dev_name,
-		       void *raw_data)
+static int fuse_get_tree(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	struct fuse_fs_context *ctx = fc->fs_private;
+
+	if (!ctx->fd_present || !ctx->rootmode_present ||
+	    !ctx->user_id_present || !ctx->group_id_present)
+		return -EINVAL;
+
+#ifdef CONFIG_BLOCK
+	if (ctx->is_bdev)
+		return get_tree_bdev(fc, fuse_fill_super);
+#endif
+
+	return get_tree_nodev(fc, fuse_fill_super);
+}
+
+static const struct fs_context_operations fuse_context_ops = {
+	.free		= fuse_free_fc,
+	.parse_param	= fuse_parse_param,
+	.get_tree	= fuse_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int fuse_init_fs_context(struct fs_context *fc)
+{
+	struct fuse_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->max_read = ~0;
+	ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+
+#ifdef CONFIG_BLOCK
+	if (fc->fs_type == &fuseblk_fs_type) {
+		ctx->is_bdev = true;
+		ctx->destroy = true;
+	}
+#endif
+
+	fc->fs_private = ctx;
+	fc->ops = &fuse_context_ops;
+	return 0;
 }
 
 static void fuse_sb_destroy(struct super_block *sb)
@@ -1218,9 +1323,10 @@
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	if (fc) {
-		fuse_send_destroy(fc);
+		if (fc->destroy)
+			fuse_send_destroy(fc);
 
-		fuse_abort_conn(fc, false);
+		fuse_abort_conn(fc);
 		fuse_wait_aborted(fc);
 
 		down_write(&fc->killsb);
@@ -1229,29 +1335,24 @@
 	}
 }
 
-static void fuse_kill_sb_anon(struct super_block *sb)
+void fuse_kill_sb_anon(struct super_block *sb)
 {
 	fuse_sb_destroy(sb);
 	kill_anon_super(sb);
 }
+EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
 
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
 	.fs_flags	= FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
-	.mount		= fuse_mount,
+	.init_fs_context = fuse_init_fs_context,
+	.parameters	= &fuse_fs_parameters,
 	.kill_sb	= fuse_kill_sb_anon,
 };
 MODULE_ALIAS_FS("fuse");
 
 #ifdef CONFIG_BLOCK
-static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
-			   int flags, const char *dev_name,
-			   void *raw_data)
-{
-	return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
-}
-
 static void fuse_kill_sb_blk(struct super_block *sb)
 {
 	fuse_sb_destroy(sb);
@@ -1261,7 +1362,8 @@
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
-	.mount		= fuse_mount_blk,
+	.init_fs_context = fuse_init_fs_context,
+	.parameters	= &fuse_fs_parameters,
 	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
@@ -1371,8 +1473,8 @@
 {
 	int res;
 
-	printk(KERN_INFO "fuse init (API version %i.%i)\n",
-	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+	pr_info("init (API version %i.%i)\n",
+		FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
 	INIT_LIST_HEAD(&fuse_conn_list);
 	res = fuse_fs_init();
@@ -1408,7 +1510,7 @@
 
 static void __exit fuse_exit(void)
 {
-	printk(KERN_DEBUG "fuse exit\n");
+	pr_debug("exit\n");
 
 	fuse_ctl_cleanup();
 	fuse_sysfs_cleanup();
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
new file mode 100644
index 0000000..5c38b9d
--- /dev/null
+++ b/fs/fuse/readdir.c
@@ -0,0 +1,585 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2018  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+
+#include "fuse_i.h"
+#include <linux/iversion.h>
+#include <linux/posix_acl.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	if (!fc->do_readdirplus)
+		return false;
+	if (!fc->readdirplus_auto)
+		return true;
+	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+		return true;
+	if (ctx->pos == 0)
+		return true;
+	return false;
+}
+
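+/*
+ * Append one dirent to the readdir cache kept in the directory's page
+ * cache.  Entries are packed back to back; a dirent that would cross a
+ * page boundary is placed at the start of the next page instead, so
+ * each page can be parsed independently.
+ */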
+static void fuse_add_dirent_to_cache(struct file *file,
+				     struct fuse_dirent *dirent, loff_t pos)
+{
+	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+	size_t reclen = FUSE_DIRENT_SIZE(dirent);
+	pgoff_t index;
+	struct page *page;
+	loff_t size;
+	u64 version;
+	unsigned int offset;
+	void *addr;
+
+	spin_lock(&fi->rdc.lock);
+	/*
+	 * Is the cache already complete?  Or does this entry not belong at
+	 * the current end of the cache?
+	 */
+	if (fi->rdc.cached || pos != fi->rdc.pos) {
+		spin_unlock(&fi->rdc.lock);
+		return;
+	}
+	version = fi->rdc.version;
+	size = fi->rdc.size;
+	offset = size & ~PAGE_MASK;
+	index = size >> PAGE_SHIFT;
+	/* Dirent doesn't fit in current page?  Jump to next page. */
+	if (offset + reclen > PAGE_SIZE) {
+		index++;
+		offset = 0;
+	}
+	spin_unlock(&fi->rdc.lock);
+
+	if (offset) {
+		page = find_lock_page(file->f_mapping, index);
+	} else {
+		page = find_or_create_page(file->f_mapping, index,
+					   mapping_gfp_mask(file->f_mapping));
+	}
+	if (!page)
+		return;
+
+	spin_lock(&fi->rdc.lock);
+	/* Raced with another readdir */
+	if (fi->rdc.version != version || fi->rdc.size != size ||
+	    WARN_ON(fi->rdc.pos != pos))
+		goto unlock;
+
+	addr = kmap_atomic(page);
+	if (!offset)
+		clear_page(addr);
+	memcpy(addr + offset, dirent, reclen);
+	kunmap_atomic(addr);
+	fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
+	fi->rdc.pos = dirent->off;
+unlock:
+	spin_unlock(&fi->rdc.lock);
+	unlock_page(page);
+	put_page(page);
+}
+
+static void fuse_readdir_cache_end(struct file *file, loff_t pos)
+{
+	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+	loff_t end;
+
+	spin_lock(&fi->rdc.lock);
+	/* does cache end position match current position? */
+	if (fi->rdc.pos != pos) {
+		spin_unlock(&fi->rdc.lock);
+		return;
+	}
+
+	fi->rdc.cached = true;
+	end = ALIGN(fi->rdc.size, PAGE_SIZE);
+	spin_unlock(&fi->rdc.lock);
+
+	/* truncate unused tail of cache */
+	truncate_inode_pages(file->f_mapping, end);
+}
+
+static bool fuse_emit(struct file *file, struct dir_context *ctx,
+		      struct fuse_dirent *dirent)
+{
+	struct fuse_file *ff = file->private_data;
+
+	if (ff->open_flags & FOPEN_CACHE_DIR)
+		fuse_add_dirent_to_cache(file, dirent, ctx->pos);
+
+	return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
+			dirent->type);
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+			 struct dir_context *ctx)
+{
+	while (nbytes >= FUSE_NAME_OFFSET) {
+		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+		size_t reclen = FUSE_DIRENT_SIZE(dirent);
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
+
+		if (!fuse_emit(file, ctx, dirent))
+			break;
+
+		buf += reclen;
+		nbytes -= reclen;
+		ctx->pos = dirent->off;
+	}
+
+	return 0;
+}
+
+static int fuse_direntplus_link(struct file *file,
+				struct fuse_direntplus *direntplus,
+				u64 attr_version)
+{
+	struct fuse_entry_out *o = &direntplus->entry_out;
+	struct fuse_dirent *dirent = &direntplus->dirent;
+	struct dentry *parent = file->f_path.dentry;
+	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = d_inode(parent);
+	struct fuse_conn *fc;
+	struct inode *inode;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+	if (!o->nodeid) {
+		/*
+		 * Unlike in the case of fuse_lookup, zero nodeid does not mean
+		 * ENOENT. Instead, it only means the userspace filesystem did
+		 * not want to return attributes/handle for this entry.
+		 *
+		 * So do nothing.
+		 */
+		return 0;
+	}
+
+	if (name.name[0] == '.') {
+		/*
+		 * We could potentially refresh the attributes of the directory
+		 * and its parent?
+		 */
+		if (name.len == 1)
+			return 0;
+		if (name.name[1] == '.' && name.len == 2)
+			return 0;
+	}
+
+	if (invalid_nodeid(o->nodeid))
+		return -EIO;
+	if (!fuse_valid_type(o->attr.mode))
+		return -EIO;
+
+	fc = get_fuse_conn(dir);
+
+	name.hash = full_name_hash(parent, name.name, name.len);
+	dentry = d_lookup(parent, &name);
+	if (!dentry) {
+retry:
+		dentry = d_alloc_parallel(parent, &name, &wq);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+	}
+	if (!d_in_lookup(dentry)) {
+		struct fuse_inode *fi;
+		inode = d_inode(dentry);
+		if (!inode ||
+		    get_node_id(inode) != o->nodeid ||
+		    ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+			d_invalidate(dentry);
+			dput(dentry);
+			goto retry;
+		}
+		if (is_bad_inode(inode)) {
+			dput(dentry);
+			return -EIO;
+		}
+
+		fi = get_fuse_inode(inode);
+		spin_lock(&fi->lock);
+		fi->nlookup++;
+		spin_unlock(&fi->lock);
+
+		forget_all_cached_acls(inode);
+		fuse_change_attributes(inode, &o->attr,
+				       entry_attr_timeout(o),
+				       attr_version);
+		/*
+		 * The other branch comes via fuse_iget()
+		 * which bumps nlookup inside
+		 */
+	} else {
+		inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+				  &o->attr, entry_attr_timeout(o),
+				  attr_version);
+		if (!inode)
+			inode = ERR_PTR(-ENOMEM);
+
+		alias = d_splice_alias(inode, dentry);
+		d_lookup_done(dentry);
+		if (alias) {
+			dput(dentry);
+			dentry = alias;
+		}
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+	}
+	if (fc->readdirplus_auto)
+		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+	fuse_change_entry_timeout(dentry, o);
+
+	dput(dentry);
+	return 0;
+}
+
+static void fuse_force_forget(struct file *file, u64 nodeid)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_forget_in inarg;
+	FUSE_ARGS(args);
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.nlookup = 1;
+	args.opcode = FUSE_FORGET;
+	args.nodeid = nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.force = true;
+	args.noreply = true;
+
+	fuse_simple_request(fc, &args);
+	/* ignore errors */
+}
+
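+/*
+ * A READDIRPLUS reply takes a lookup reference on every entry, so an
+ * entry that could not be linked in must be countered with a FORGET,
+ * otherwise the server-side nlookup count would leak.
+ */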
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+			     struct dir_context *ctx, u64 attr_version)
+{
+	struct fuse_direntplus *direntplus;
+	struct fuse_dirent *dirent;
+	size_t reclen;
+	int over = 0;
+	int ret;
+
+	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+		direntplus = (struct fuse_direntplus *) buf;
+		dirent = &direntplus->dirent;
+		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
+
+		if (!over) {
+			/*
+			 * We fill entries into dstbuf only as much as it can
+			 * hold.  But we still continue iterating over the
+			 * remaining entries to link them; otherwise we would
+			 * need to send a FORGET for each entry we did not
+			 * link.
+			 */
+			over = !fuse_emit(file, ctx, dirent);
+			if (!over)
+				ctx->pos = dirent->off;
+		}
+
+		buf += reclen;
+		nbytes -= reclen;
+
+		ret = fuse_direntplus_link(file, direntplus, attr_version);
+		if (ret)
+			fuse_force_forget(file, direntplus->entry_out.nodeid);
+	}
+
+	return 0;
+}
+
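+/*
+ * Fetch one page worth of entries with a single FUSE_READDIR or
+ * FUSE_READDIRPLUS request.  An empty reply means end of directory;
+ * with FOPEN_CACHE_DIR that is also where the cache gets sealed via
+ * fuse_readdir_cache_end().
+ */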
+static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
+{
+	int plus;
+	ssize_t res;
+	struct page *page;
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_io_args ia = {};
+	struct fuse_args_pages *ap = &ia.ap;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE };
+	u64 attr_version = 0;
+	bool locked;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	plus = fuse_use_readdirplus(inode, ctx);
+	ap->args.out_pages = 1;
+	ap->num_pages = 1;
+	ap->pages = &page;
+	ap->descs = &desc;
+	if (plus) {
+		attr_version = fuse_get_attr_version(fc);
+		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+				    FUSE_READDIRPLUS);
+	} else {
+		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+				    FUSE_READDIR);
+	}
+	locked = fuse_lock_inode(inode);
+	res = fuse_simple_request(fc, &ap->args);
+	fuse_unlock_inode(inode, locked);
+	if (res >= 0) {
+		if (!res) {
+			struct fuse_file *ff = file->private_data;
+
+			if (ff->open_flags & FOPEN_CACHE_DIR)
+				fuse_readdir_cache_end(file, ctx->pos);
+		} else if (plus) {
+			res = parse_dirplusfile(page_address(page), res,
+						file, ctx, attr_version);
+		} else {
+			res = parse_dirfile(page_address(page), res, file,
+					    ctx);
+		}
+	}
+
+	__free_page(page);
+	fuse_invalidate_atime(inode);
+	return res;
+}
+
+enum fuse_parse_result {
+	FOUND_ERR = -1,
+	FOUND_NONE = 0,
+	FOUND_SOME,
+	FOUND_ALL,
+};
+
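+/*
+ * Parse one page of cached dirents: FOUND_NONE means the requested
+ * position was not on this page, FOUND_SOME that entries were emitted,
+ * FOUND_ALL that the caller's buffer filled up, and FOUND_ERR that the
+ * cached data failed validation.
+ */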
+static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
+					       void *addr, unsigned int size,
+					       struct dir_context *ctx)
+{
+	unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
+	enum fuse_parse_result res = FOUND_NONE;
+
+	WARN_ON(offset >= size);
+
+	for (;;) {
+		struct fuse_dirent *dirent = addr + offset;
+		unsigned int nbytes = size - offset;
+		size_t reclen;
+
+		if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
+			break;
+
+		reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */
+
+		if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
+			return FOUND_ERR;
+		if (WARN_ON(reclen > nbytes))
+			return FOUND_ERR;
+		if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL))
+			return FOUND_ERR;
+
+		if (ff->readdir.pos == ctx->pos) {
+			res = FOUND_SOME;
+			if (!dir_emit(ctx, dirent->name, dirent->namelen,
+				      dirent->ino, dirent->type))
+				return FOUND_ALL;
+			ctx->pos = dirent->off;
+		}
+		ff->readdir.pos = dirent->off;
+		ff->readdir.cache_off += reclen;
+
+		offset += reclen;
+	}
+
+	return res;
+}
+
+static void fuse_rdc_reset(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	fi->rdc.cached = false;
+	fi->rdc.version++;
+	fi->rdc.size = 0;
+	fi->rdc.pos = 0;
+}
+
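+/* Internal result: tell fuse_readdir() to fall back to the uncached path */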
+#define UNCACHED 1
+
+static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
+{
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	enum fuse_parse_result res;
+	pgoff_t index;
+	unsigned int size;
+	struct page *page;
+	void *addr;
+
+	/* Seek happened?  If so, reset the cache stream */
+	if (ff->readdir.pos != ctx->pos) {
+		ff->readdir.pos = 0;
+		ff->readdir.cache_off = 0;
+	}
+
+	/*
+	 * We're just about to start reading into the cache or reading the
+	 * cache; both cases require an up-to-date mtime value.
+	 */
+	if (!ctx->pos && fc->auto_inval_data) {
+		int err = fuse_update_attributes(inode, file);
+
+		if (err)
+			return err;
+	}
+
+retry:
+	spin_lock(&fi->rdc.lock);
+retry_locked:
+	if (!fi->rdc.cached) {
+		/* Starting cache? Set cache mtime. */
+		if (!ctx->pos && !fi->rdc.size) {
+			fi->rdc.mtime = inode->i_mtime;
+			fi->rdc.iversion = inode_query_iversion(inode);
+		}
+		spin_unlock(&fi->rdc.lock);
+		return UNCACHED;
+	}
+	/*
+	 * When at the beginning of the directory (i.e. just after opendir(3) or
+	 * rewinddir(3)), we need to check whether the directory contents have
+	 * changed, and reset the cache if so.
+	 */
+	if (!ctx->pos) {
+		if (inode_peek_iversion(inode) != fi->rdc.iversion ||
+		    !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+			fuse_rdc_reset(inode);
+			goto retry_locked;
+		}
+	}
+
+	/*
+	 * If cache version changed since the last getdents() call, then reset
+	 * the cache stream.
+	 */
+	if (ff->readdir.version != fi->rdc.version) {
+		ff->readdir.pos = 0;
+		ff->readdir.cache_off = 0;
+	}
+	/*
+	 * If at the beginning of the cache, then reset the version to the
+	 * current one.
+	 */
+	if (ff->readdir.pos == 0)
+		ff->readdir.version = fi->rdc.version;
+
+	WARN_ON(fi->rdc.size < ff->readdir.cache_off);
+
+	index = ff->readdir.cache_off >> PAGE_SHIFT;
+
+	if (index == (fi->rdc.size >> PAGE_SHIFT))
+		size = fi->rdc.size & ~PAGE_MASK;
+	else
+		size = PAGE_SIZE;
+	spin_unlock(&fi->rdc.lock);
+
+	/* EOF? */
+	if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
+		return 0;
+
+	page = find_get_page_flags(file->f_mapping, index,
+				   FGP_ACCESSED | FGP_LOCK);
+	spin_lock(&fi->rdc.lock);
+	if (!page) {
+		/*
+		 * Uh-oh: page gone missing, cache is useless
+		 */
+		if (fi->rdc.version == ff->readdir.version)
+			fuse_rdc_reset(inode);
+		goto retry_locked;
+	}
+
+	/* Make sure it's still the same version after getting the page. */
+	if (ff->readdir.version != fi->rdc.version) {
+		spin_unlock(&fi->rdc.lock);
+		unlock_page(page);
+		put_page(page);
+		goto retry;
+	}
+	spin_unlock(&fi->rdc.lock);
+
+	/*
+	 * Contents of the page are now protected against changing by holding
+	 * the page lock.
+	 */
+	addr = kmap(page);
+	res = fuse_parse_cache(ff, addr, size, ctx);
+	kunmap(page);
+	unlock_page(page);
+	put_page(page);
+
+	if (res == FOUND_ERR)
+		return -EIO;
+
+	if (res == FOUND_ALL)
+		return 0;
+
+	if (size == PAGE_SIZE) {
+		/* We hit end of page: skip to next page. */
+		ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
+		goto retry;
+	}
+
+	/*
+	 * End of cache reached.  If we found the position, then we are done;
+	 * otherwise we need to fall back to uncached, since the position we
+	 * were looking for wasn't in the cache.
+	 */
+	return res == FOUND_SOME ? 0 : UNCACHED;
+}
+
+int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	mutex_lock(&ff->readdir.lock);
+
+	err = UNCACHED;
+	if (ff->open_flags & FOPEN_CACHE_DIR)
+		err = fuse_readdir_cached(file, ctx);
+	if (err == UNCACHED)
+		err = fuse_readdir_uncached(file, ctx);
+
+	mutex_unlock(&ff->readdir.lock);
+
+	return err;
+}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
new file mode 100644
index 0000000..a5c8604
--- /dev/null
+++ b/fs/fuse/virtio_fs.c
@@ -0,0 +1,1252 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio-fs: Virtio Filesystem
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_fs.h>
+#include <linux/delay.h>
+#include <linux/fs_context.h>
+#include <linux/highmem.h>
+#include "fuse_i.h"
+
+/*
+ * List of virtio-fs device instances and a lock for the list. Also provides
+ * mutual exclusion between device removal and the mount path.
+ */
+static DEFINE_MUTEX(virtio_fs_mutex);
+static LIST_HEAD(virtio_fs_instances);
+
+enum {
+	VQ_HIPRIO,
+	VQ_REQUEST
+};
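+
+/*
+ * VQ_HIPRIO (queue 0) carries FORGET requests; queues from VQ_REQUEST
+ * on are the ordinary request queues, named "requests.%u".
+ */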
+
+/* Per-virtqueue state */
+struct virtio_fs_vq {
+	spinlock_t lock;
+	struct virtqueue *vq;     /* protected by ->lock */
+	struct work_struct done_work;
+	struct list_head queued_reqs;
+	struct list_head end_reqs;	/* End these requests */
+	struct delayed_work dispatch_work;
+	struct fuse_dev *fud;
+	bool connected;
+	long in_flight;
+	char name[24];
+} ____cacheline_aligned_in_smp;
+
+/* A virtio-fs device instance */
+struct virtio_fs {
+	struct kref refcount;
+	struct list_head list;    /* on virtio_fs_instances */
+	char *tag;
+	struct virtio_fs_vq *vqs;
+	unsigned int nvqs;               /* number of virtqueues */
+	unsigned int num_request_queues; /* number of request queues */
+};
+
+struct virtio_fs_forget {
+	struct fuse_in_header ih;
+	struct fuse_forget_in arg;
+	/* This request can be temporarily queued on the virtqueue */
+	struct list_head list;
+};
+
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+				 struct fuse_req *req, bool in_flight);
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+	struct virtio_fs *fs = vq->vdev->priv;
+
+	return &fs->vqs[vq->index];
+}
+
+static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
+{
+	return &vq_to_fsvq(vq)->fud->pq;
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+	fsvq->in_flight++;
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+	WARN_ON(fsvq->in_flight <= 0);
+	fsvq->in_flight--;
+}
+
+static void release_virtio_fs_obj(struct kref *ref)
+{
+	struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+
+	kfree(vfs->vqs);
+	kfree(vfs);
+}
+
+/* Make sure virtio_fs_mutex is held */
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+	kref_put(&fs->refcount, release_virtio_fs_obj);
+}
+
+static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
+{
+	struct virtio_fs *vfs = fiq->priv;
+
+	mutex_lock(&virtio_fs_mutex);
+	virtio_fs_put(vfs);
+	mutex_unlock(&virtio_fs_mutex);
+}
+
+static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
+{
+	WARN_ON(fsvq->in_flight < 0);
+
+	/* Wait for in-flight requests to finish. */
+	while (1) {
+		spin_lock(&fsvq->lock);
+		if (!fsvq->in_flight) {
+			spin_unlock(&fsvq->lock);
+			break;
+		}
+		spin_unlock(&fsvq->lock);
+		/* TODO use completion instead of timeout */
+		usleep_range(1000, 2000);
+	}
+
+	flush_work(&fsvq->done_work);
+	flush_delayed_work(&fsvq->dispatch_work);
+}
+
+static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		virtio_fs_drain_queue(fsvq);
+	}
+}
+
+static void virtio_fs_start_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		spin_lock(&fsvq->lock);
+		fsvq->connected = true;
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+/* Add a new instance to the list or return -EEXIST if the tag name exists */
+static int virtio_fs_add_instance(struct virtio_fs *fs)
+{
+	struct virtio_fs *fs2;
+	bool duplicate = false;
+
+	mutex_lock(&virtio_fs_mutex);
+
+	list_for_each_entry(fs2, &virtio_fs_instances, list) {
+		if (strcmp(fs->tag, fs2->tag) == 0)
+			duplicate = true;
+	}
+
+	if (!duplicate)
+		list_add_tail(&fs->list, &virtio_fs_instances);
+
+	mutex_unlock(&virtio_fs_mutex);
+
+	if (duplicate)
+		return -EEXIST;
+	return 0;
+}
+
+/* Return the virtio_fs with a given tag, or NULL */
+static struct virtio_fs *virtio_fs_find_instance(const char *tag)
+{
+	struct virtio_fs *fs;
+
+	mutex_lock(&virtio_fs_mutex);
+
+	list_for_each_entry(fs, &virtio_fs_instances, list) {
+		if (strcmp(fs->tag, tag) == 0) {
+			kref_get(&fs->refcount);
+			goto found;
+		}
+	}
+
+	fs = NULL; /* not found */
+
+found:
+	mutex_unlock(&virtio_fs_mutex);
+
+	return fs;
+}
+
+static void virtio_fs_free_devs(struct virtio_fs *fs)
+{
+	unsigned int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		if (!fsvq->fud)
+			continue;
+
+		fuse_dev_free(fsvq->fud);
+		fsvq->fud = NULL;
+	}
+}
+
+/* Read the filesystem tag from virtio config into fs->tag (devm-allocated). */
+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
+	char *end;
+	size_t len;
+
+	virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
+			   &tag_buf, sizeof(tag_buf));
+	end = memchr(tag_buf, '\0', sizeof(tag_buf));
+	if (end == tag_buf)
+		return -EINVAL; /* empty tag */
+	if (!end)
+		end = &tag_buf[sizeof(tag_buf)];
+
+	len = end - tag_buf;
+	fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
+	if (!fs->tag)
+		return -ENOMEM;
+	memcpy(fs->tag, tag_buf, len);
+	fs->tag[len] = '\0';
+	return 0;
+}
+
+/* Work function for hiprio completion */
+static void virtio_fs_hiprio_done_work(struct work_struct *work)
+{
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 done_work);
+	struct virtqueue *vq = fsvq->vq;
+
+	/* Free completed FUSE_FORGET requests */
+	spin_lock(&fsvq->lock);
+	do {
+		unsigned int len;
+		void *req;
+
+		virtqueue_disable_cb(vq);
+
+		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+			kfree(req);
+			dec_in_flight_req(fsvq);
+		}
+	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_request_dispatch_work(struct work_struct *work)
+{
+	struct fuse_req *req;
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 dispatch_work.work);
+	struct fuse_conn *fc = fsvq->fud->fc;
+	int ret;
+
+	pr_debug("virtio-fs: worker %s called.\n", __func__);
+	while (1) {
+		spin_lock(&fsvq->lock);
+		req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
+					       list);
+		if (!req) {
+			spin_unlock(&fsvq->lock);
+			break;
+		}
+
+		list_del_init(&req->list);
+		spin_unlock(&fsvq->lock);
+		fuse_request_end(fc, req);
+	}
+
+	/* Dispatch pending requests */
+	while (1) {
+		spin_lock(&fsvq->lock);
+		req = list_first_entry_or_null(&fsvq->queued_reqs,
+					       struct fuse_req, list);
+		if (!req) {
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+		list_del_init(&req->list);
+		spin_unlock(&fsvq->lock);
+
+		ret = virtio_fs_enqueue_req(fsvq, req, true);
+		if (ret < 0) {
+			if (ret == -ENOMEM || ret == -ENOSPC) {
+				spin_lock(&fsvq->lock);
+				list_add_tail(&req->list, &fsvq->queued_reqs);
+				schedule_delayed_work(&fsvq->dispatch_work,
+						      msecs_to_jiffies(1));
+				spin_unlock(&fsvq->lock);
+				return;
+			}
+			req->out.h.error = ret;
+			spin_lock(&fsvq->lock);
+			dec_in_flight_req(fsvq);
+			spin_unlock(&fsvq->lock);
+			pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
+			       ret);
+			fuse_request_end(fc, req);
+		}
+	}
+}
+
+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
+{
+	struct virtio_fs_forget *forget;
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 dispatch_work.work);
+	struct virtqueue *vq = fsvq->vq;
+	struct scatterlist sg;
+	struct scatterlist *sgs[] = {&sg};
+	bool notify;
+	int ret;
+
+	pr_debug("virtio-fs: worker %s called.\n", __func__);
+	while (1) {
+		spin_lock(&fsvq->lock);
+		forget = list_first_entry_or_null(&fsvq->queued_reqs,
+					struct virtio_fs_forget, list);
+		if (!forget) {
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+
+		list_del(&forget->list);
+		if (!fsvq->connected) {
+			dec_in_flight_req(fsvq);
+			spin_unlock(&fsvq->lock);
+			kfree(forget);
+			continue;
+		}
+
+		sg_init_one(&sg, forget, sizeof(*forget));
+
+		/* Enqueue the request */
+		dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+		ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+		if (ret < 0) {
+			if (ret == -ENOMEM || ret == -ENOSPC) {
+				pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
+					 ret);
+				list_add_tail(&forget->list,
+						&fsvq->queued_reqs);
+				schedule_delayed_work(&fsvq->dispatch_work,
+						msecs_to_jiffies(1));
+			} else {
+				pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+					 ret);
+				dec_in_flight_req(fsvq);
+				kfree(forget);
+			}
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+
+		notify = virtqueue_kick_prepare(vq);
+		spin_unlock(&fsvq->lock);
+
+		if (notify)
+			virtqueue_notify(vq);
+		pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
+			 __func__);
+	}
+}
+
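+/*
+ * Request arguments may live on a caller's stack and cannot be mapped
+ * into the scatter-gather list directly, so they are bounced through a
+ * single kernel buffer: in args are packed at the front and out args
+ * are copied back out of the tail on completion.
+ */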
+/* Allocate and copy args into req->argbuf */
+static int copy_args_to_argbuf(struct fuse_req *req)
+{
+	struct fuse_args *args = req->args;
+	unsigned int offset = 0;
+	unsigned int num_in;
+	unsigned int num_out;
+	unsigned int len;
+	unsigned int i;
+
+	num_in = args->in_numargs - args->in_pages;
+	num_out = args->out_numargs - args->out_pages;
+	len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
+	      fuse_len_args(num_out, args->out_args);
+
+	req->argbuf = kmalloc(len, GFP_ATOMIC);
+	if (!req->argbuf)
+		return -ENOMEM;
+
+	for (i = 0; i < num_in; i++) {
+		memcpy(req->argbuf + offset,
+		       args->in_args[i].value,
+		       args->in_args[i].size);
+		offset += args->in_args[i].size;
+	}
+
+	return 0;
+}
+
+/* Copy args out of and free req->argbuf */
+static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
+{
+	unsigned int remaining;
+	unsigned int offset;
+	unsigned int num_in;
+	unsigned int num_out;
+	unsigned int i;
+
+	remaining = req->out.h.len - sizeof(req->out.h);
+	num_in = args->in_numargs - args->in_pages;
+	num_out = args->out_numargs - args->out_pages;
+	offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
+
+	for (i = 0; i < num_out; i++) {
+		unsigned int argsize = args->out_args[i].size;
+
+		if (args->out_argvar &&
+		    i == args->out_numargs - 1 &&
+		    argsize > remaining) {
+			argsize = remaining;
+		}
+
+		memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
+		offset += argsize;
+
+		if (i != args->out_numargs - 1)
+			remaining -= argsize;
+	}
+
+	/* Store the actual size of the variable-length arg */
+	if (args->out_argvar)
+		args->out_args[args->out_numargs - 1].size = remaining;
+
+	kfree(req->argbuf);
+	req->argbuf = NULL;
+}
+
+/* Work function for request completion */
+static void virtio_fs_requests_done_work(struct work_struct *work)
+{
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 done_work);
+	struct fuse_pqueue *fpq = &fsvq->fud->pq;
+	struct fuse_conn *fc = fsvq->fud->fc;
+	struct virtqueue *vq = fsvq->vq;
+	struct fuse_req *req;
+	struct fuse_args_pages *ap;
+	struct fuse_req *next;
+	struct fuse_args *args;
+	unsigned int len, i, thislen;
+	struct page *page;
+	LIST_HEAD(reqs);
+
+	/* Collect completed requests off the virtqueue */
+	spin_lock(&fsvq->lock);
+	do {
+		virtqueue_disable_cb(vq);
+
+		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+			spin_lock(&fpq->lock);
+			list_move_tail(&req->list, &reqs);
+			spin_unlock(&fpq->lock);
+		}
+	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	spin_unlock(&fsvq->lock);
+
+	/* End requests */
+	list_for_each_entry_safe(req, next, &reqs, list) {
+		/*
+		 * TODO verify that server properly follows FUSE protocol
+		 * (oh.uniq, oh.len)
+		 */
+		args = req->args;
+		copy_args_from_argbuf(args, req);
+
+		if (args->out_pages && args->page_zeroing) {
+			len = args->out_args[args->out_numargs - 1].size;
+			ap = container_of(args, typeof(*ap), args);
+			for (i = 0; i < ap->num_pages; i++) {
+				thislen = ap->descs[i].length;
+				if (len < thislen) {
+					WARN_ON(ap->descs[i].offset);
+					page = ap->pages[i];
+					zero_user_segment(page, len, thislen);
+					len = 0;
+				} else {
+					len -= thislen;
+				}
+			}
+		}
+
+		spin_lock(&fpq->lock);
+		clear_bit(FR_SENT, &req->flags);
+		list_del_init(&req->list);
+		spin_unlock(&fpq->lock);
+
+		fuse_request_end(fc, req);
+		spin_lock(&fsvq->lock);
+		dec_in_flight_req(fsvq);
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+/* Virtqueue interrupt handler */
+static void virtio_fs_vq_done(struct virtqueue *vq)
+{
+	struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
+
+	dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
+
+	schedule_work(&fsvq->done_work);
+}
+
+/* Initialize virtqueues */
+static int virtio_fs_setup_vqs(struct virtio_device *vdev,
+			       struct virtio_fs *fs)
+{
+	struct virtqueue **vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	unsigned int i;
+	int ret = 0;
+
+	virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
+		     &fs->num_request_queues);
+	if (fs->num_request_queues == 0)
+		return -EINVAL;
+
+	fs->nvqs = 1 + fs->num_request_queues;
+	fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
+	if (!fs->vqs)
+		return -ENOMEM;
+
+	vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
+	callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
+					GFP_KERNEL);
+	names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
+	if (!vqs || !callbacks || !names) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+	snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
+			"hiprio");
+	names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+	INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
+	INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+	INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
+	INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
+			virtio_fs_hiprio_dispatch_work);
+	spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
+
+	/* Initialize the requests virtqueues */
+	for (i = VQ_REQUEST; i < fs->nvqs; i++) {
+		spin_lock_init(&fs->vqs[i].lock);
+		INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
+		INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
+				  virtio_fs_request_dispatch_work);
+		INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+		INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
+		snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
+			 "requests.%u", i - VQ_REQUEST);
+		callbacks[i] = virtio_fs_vq_done;
+		names[i] = fs->vqs[i].name;
+	}
+
+	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < fs->nvqs; i++)
+		fs->vqs[i].vq = vqs[i];
+
+	virtio_fs_start_all_queues(fs);
+out:
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+	if (ret)
+		kfree(fs->vqs);
+	return ret;
+}
+
+/* Free virtqueues (device must already be reset) */
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
+				  struct virtio_fs *fs)
+{
+	vdev->config->del_vqs(vdev);
+}
+
+static int virtio_fs_probe(struct virtio_device *vdev)
+{
+	struct virtio_fs *fs;
+	int ret;
+
+	fs = kzalloc(sizeof(*fs), GFP_KERNEL);
+	if (!fs)
+		return -ENOMEM;
+	kref_init(&fs->refcount);
+	vdev->priv = fs;
+
+	ret = virtio_fs_read_tag(vdev, fs);
+	if (ret < 0)
+		goto out;
+
+	ret = virtio_fs_setup_vqs(vdev, fs);
+	if (ret < 0)
+		goto out;
+
+	/* TODO vq affinity */
+
+	/* Bring the device online in case the filesystem is mounted and
+	 * requests need to be sent before we return.
+	 */
+	virtio_device_ready(vdev);
+
+	ret = virtio_fs_add_instance(fs);
+	if (ret < 0)
+		goto out_vqs;
+
+	return 0;
+
+out_vqs:
+	vdev->config->reset(vdev);
+	virtio_fs_cleanup_vqs(vdev, fs);
+
+out:
+	vdev->priv = NULL;
+	kfree(fs);
+	return ret;
+}
+
+static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		spin_lock(&fsvq->lock);
+		fsvq->connected = false;
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+static void virtio_fs_remove(struct virtio_device *vdev)
+{
+	struct virtio_fs *fs = vdev->priv;
+
+	mutex_lock(&virtio_fs_mutex);
+	/* This device is going away. No one should get a new reference. */
+	list_del_init(&fs->list);
+	virtio_fs_stop_all_queues(fs);
+	virtio_fs_drain_all_queues(fs);
+	vdev->config->reset(vdev);
+	virtio_fs_cleanup_vqs(vdev, fs);
+
+	vdev->priv = NULL;
+	/* Put device reference on virtio_fs object */
+	virtio_fs_put(fs);
+	mutex_unlock(&virtio_fs_mutex);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_fs_freeze(struct virtio_device *vdev)
+{
+	/* TODO need to save state here */
+	pr_warn("virtio-fs: suspend/resume not yet supported\n");
+	return -EOPNOTSUPP;
+}
+
+static int virtio_fs_restore(struct virtio_device *vdev)
+{
+	 /* TODO need to restore state here */
+	return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
+	{},
+};
+
+static const unsigned int feature_table[] = {};
+
+static struct virtio_driver virtio_fs_driver = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.feature_table		= feature_table,
+	.feature_table_size	= ARRAY_SIZE(feature_table),
+	.probe			= virtio_fs_probe,
+	.remove			= virtio_fs_remove,
+#ifdef CONFIG_PM_SLEEP
+	.freeze			= virtio_fs_freeze,
+	.restore		= virtio_fs_restore,
+#endif
+};
+
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	struct fuse_forget_link *link;
+	struct virtio_fs_forget *forget;
+	struct scatterlist sg;
+	struct scatterlist *sgs[] = {&sg};
+	struct virtio_fs *fs;
+	struct virtqueue *vq;
+	struct virtio_fs_vq *fsvq;
+	bool notify;
+	u64 unique;
+	int ret;
+
+	link = fuse_dequeue_forget(fiq, 1, NULL);
+	unique = fuse_get_unique(fiq);
+
+	fs = fiq->priv;
+	fsvq = &fs->vqs[VQ_HIPRIO];
+	spin_unlock(&fiq->lock);
+
+	/* Allocate a buffer for the request */
+	forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
+
+	forget->ih = (struct fuse_in_header){
+		.opcode = FUSE_FORGET,
+		.nodeid = link->forget_one.nodeid,
+		.unique = unique,
+		.len = sizeof(*forget),
+	};
+	forget->arg = (struct fuse_forget_in){
+		.nlookup = link->forget_one.nlookup,
+	};
+
+	sg_init_one(&sg, forget, sizeof(*forget));
+
+	/* Enqueue the request */
+	spin_lock(&fsvq->lock);
+
+	if (!fsvq->connected) {
+		kfree(forget);
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	vq = fsvq->vq;
+	dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+
+	ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+	if (ret < 0) {
+		if (ret == -ENOMEM || ret == -ENOSPC) {
+			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
+				 ret);
+			list_add_tail(&forget->list, &fsvq->queued_reqs);
+			schedule_delayed_work(&fsvq->dispatch_work,
+					msecs_to_jiffies(1));
+			inc_in_flight_req(fsvq);
+		} else {
+			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+				 ret);
+			kfree(forget);
+		}
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	inc_in_flight_req(fsvq);
+	notify = virtqueue_kick_prepare(vq);
+
+	spin_unlock(&fsvq->lock);
+
+	if (notify)
+		virtqueue_notify(vq);
+out:
+	kfree(link);
+}
+
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	/*
+	 * TODO interrupts.
+	 *
+	 * Normal fs operations on a local filesystem aren't interruptible.
+	 * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
+	 * with shared lock between host and guest.
+	 */
+	spin_unlock(&fiq->lock);
+}
+
+/* Return the number of scatter-gather list elements required */
+static unsigned int sg_count_fuse_req(struct fuse_req *req)
+{
+	struct fuse_args *args = req->args;
+	struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
+	unsigned int total_sgs = 1 /* fuse_in_header */;
+
+	if (args->in_numargs - args->in_pages)
+		total_sgs += 1;
+
+	if (args->in_pages)
+		total_sgs += ap->num_pages;
+
+	if (!test_bit(FR_ISREPLY, &req->flags))
+		return total_sgs;
+
+	total_sgs += 1 /* fuse_out_header */;
+
+	if (args->out_numargs - args->out_pages)
+		total_sgs += 1;
+
+	if (args->out_pages)
+		total_sgs += ap->num_pages;
+
+	return total_sgs;
+}
+
+/* Add pages to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
+				       struct page **pages,
+				       struct fuse_page_desc *page_descs,
+				       unsigned int num_pages,
+				       unsigned int total_len)
+{
+	unsigned int i;
+	unsigned int this_len;
+
+	for (i = 0; i < num_pages && total_len; i++) {
+		sg_init_table(&sg[i], 1);
+		this_len =  min(page_descs[i].length, total_len);
+		sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+		total_len -= this_len;
+	}
+
+	return i;
+}
+
+/* Add args to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_args(struct scatterlist *sg,
+				      struct fuse_req *req,
+				      struct fuse_arg *args,
+				      unsigned int numargs,
+				      bool argpages,
+				      void *argbuf,
+				      unsigned int *len_used)
+{
+	struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
+	unsigned int total_sgs = 0;
+	unsigned int len;
+
+	len = fuse_len_args(numargs - argpages, args);
+	if (len)
+		sg_init_one(&sg[total_sgs++], argbuf, len);
+
+	if (argpages)
+		total_sgs += sg_init_fuse_pages(&sg[total_sgs],
+						ap->pages, ap->descs,
+						ap->num_pages,
+						args[numargs - 1].size);
+
+	if (len_used)
+		*len_used = len;
+
+	return total_sgs;
+}
+
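+/*
+ * Per-request virtqueue layout: device-readable elements first
+ * (fuse_in_header, packed in args, in pages), then device-writable
+ * elements (fuse_out_header, out args, out pages) when a reply is
+ * expected.
+ */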
+/* Add a request to a virtqueue and kick the device */
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+				 struct fuse_req *req, bool in_flight)
+{
+	/* requests need at least 4 elements */
+	struct scatterlist *stack_sgs[6];
+	struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
+	struct scatterlist **sgs = stack_sgs;
+	struct scatterlist *sg = stack_sg;
+	struct virtqueue *vq;
+	struct fuse_args *args = req->args;
+	unsigned int argbuf_used = 0;
+	unsigned int out_sgs = 0;
+	unsigned int in_sgs = 0;
+	unsigned int total_sgs;
+	unsigned int i;
+	int ret;
+	bool notify;
+	struct fuse_pqueue *fpq;
+
+	/* Does the sglist fit on the stack? */
+	total_sgs = sg_count_fuse_req(req);
+	if (total_sgs > ARRAY_SIZE(stack_sgs)) {
+		sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
+		sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+		if (!sgs || !sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* Use a bounce buffer since stack args cannot be mapped */
+	ret = copy_args_to_argbuf(req);
+	if (ret < 0)
+		goto out;
+
+	/* Request elements */
+	sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
+	out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
+				     (struct fuse_arg *)args->in_args,
+				     args->in_numargs, args->in_pages,
+				     req->argbuf, &argbuf_used);
+
+	/* Reply elements */
+	if (test_bit(FR_ISREPLY, &req->flags)) {
+		sg_init_one(&sg[out_sgs + in_sgs++],
+			    &req->out.h, sizeof(req->out.h));
+		in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
+					    args->out_args, args->out_numargs,
+					    args->out_pages,
+					    req->argbuf + argbuf_used, NULL);
+	}
+
+	WARN_ON(out_sgs + in_sgs != total_sgs);
+
+	for (i = 0; i < total_sgs; i++)
+		sgs[i] = &sg[i];
+
+	spin_lock(&fsvq->lock);
+
+	if (!fsvq->connected) {
+		spin_unlock(&fsvq->lock);
+		ret = -ENOTCONN;
+		goto out;
+	}
+
+	vq = fsvq->vq;
+	ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
+	if (ret < 0) {
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	/* Request successfully sent. */
+	fpq = &fsvq->fud->pq;
+	spin_lock(&fpq->lock);
+	list_add_tail(&req->list, fpq->processing);
+	spin_unlock(&fpq->lock);
+	set_bit(FR_SENT, &req->flags);
+	/* matches barrier in request_wait_answer() */
+	smp_mb__after_atomic();
+
+	if (!in_flight)
+		inc_in_flight_req(fsvq);
+	notify = virtqueue_kick_prepare(vq);
+
+	spin_unlock(&fsvq->lock);
+
+	if (notify)
+		virtqueue_notify(vq);
+
+out:
+	if (ret < 0 && req->argbuf) {
+		kfree(req->argbuf);
+		req->argbuf = NULL;
+	}
+	if (sgs != stack_sgs) {
+		kfree(sgs);
+		kfree(sg);
+	}
+
+	return ret;
+}
+
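+/*
+ * fiq callback for a newly queued request: instead of waking up a
+ * /dev/fuse reader, hand the request straight to a virtqueue.  Called
+ * with fiq->lock held; the lock is dropped before enqueuing.
+ */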
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+	struct virtio_fs *fs;
+	struct fuse_req *req;
+	struct virtio_fs_vq *fsvq;
+	int ret;
+
+	WARN_ON(list_empty(&fiq->pending));
+	req = list_last_entry(&fiq->pending, struct fuse_req, list);
+	clear_bit(FR_PENDING, &req->flags);
+	list_del_init(&req->list);
+	WARN_ON(!list_empty(&fiq->pending));
+	spin_unlock(&fiq->lock);
+
+	fs = fiq->priv;
+
+	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
+		 __func__, req->in.h.opcode, req->in.h.unique,
+		 req->in.h.nodeid, req->in.h.len,
+		 fuse_len_args(req->args->out_numargs, req->args->out_args));
+
+	fsvq = &fs->vqs[queue_id];
+	ret = virtio_fs_enqueue_req(fsvq, req, false);
+	if (ret < 0) {
+		if (ret == -ENOMEM || ret == -ENOSPC) {
+			/*
+			 * Virtqueue full. Retry submission from worker
+			 * context as we might be holding fc->bg_lock.
+			 */
+			spin_lock(&fsvq->lock);
+			list_add_tail(&req->list, &fsvq->queued_reqs);
+			inc_in_flight_req(fsvq);
+			schedule_delayed_work(&fsvq->dispatch_work,
+						msecs_to_jiffies(1));
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+		req->out.h.error = ret;
+		pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
+
+		/* Can't end request in submission context. Use a worker */
+		spin_lock(&fsvq->lock);
+		list_add_tail(&req->list, &fsvq->end_reqs);
+		schedule_delayed_work(&fsvq->dispatch_work, 0);
+		spin_unlock(&fsvq->lock);
+		return;
+	}
+}
+
+static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
+	.wake_forget_and_unlock		= virtio_fs_wake_forget_and_unlock,
+	.wake_interrupt_and_unlock	= virtio_fs_wake_interrupt_and_unlock,
+	.wake_pending_and_unlock	= virtio_fs_wake_pending_and_unlock,
+	.release			= virtio_fs_fiq_release,
+};
+
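+/*
+ * There is no /dev/fuse style option string for virtio-fs, so the
+ * fuse_fs_context below is fixed (note .no_mount_options) and the root
+ * is always a directory.
+ */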
+static int virtio_fs_fill_super(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct virtio_fs *fs = fc->iq.priv;
+	unsigned int i;
+	int err;
+	struct fuse_fs_context ctx = {
+		.rootmode = S_IFDIR,
+		.default_permissions = 1,
+		.allow_other = 1,
+		.max_read = UINT_MAX,
+		.blksize = 512,
+		.destroy = true,
+		.no_control = true,
+		.no_force_umount = true,
+		.no_mount_options = true,
+	};
+
+	mutex_lock(&virtio_fs_mutex);
+
+	/* After taking the mutex, make sure the virtiofs device is still
+	 * there.  Though we are holding a reference to it, the driver's
+	 * ->remove might still have cleaned up the virtqueues. In that
+	 * case bail out.
+	 */
+	err = -EINVAL;
+	if (list_empty(&fs->list)) {
+		pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
+		goto err;
+	}
+
+	err = -ENOMEM;
+	/* Allocate fuse_dev for hiprio and notification queues */
+	for (i = 0; i < VQ_REQUEST; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		fsvq->fud = fuse_dev_alloc();
+		if (!fsvq->fud)
+			goto err_free_fuse_devs;
+	}
+
+	ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+	err = fuse_fill_super_common(sb, &ctx);
+	if (err < 0)
+		goto err_free_fuse_devs;
+
+	fc = fs->vqs[VQ_REQUEST].fud->fc;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		if (i == VQ_REQUEST)
+			continue; /* already initialized */
+		fuse_dev_install(fsvq->fud, fc);
+	}
+
+	/* Previous unmount will stop all queues. Start these again */
+	virtio_fs_start_all_queues(fs);
+	fuse_send_init(fc);
+	mutex_unlock(&virtio_fs_mutex);
+	return 0;
+
+err_free_fuse_devs:
+	virtio_fs_free_devs(fs);
+err:
+	mutex_unlock(&virtio_fs_mutex);
+	return err;
+}
+
+static void virtio_kill_sb(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct virtio_fs *vfs;
+	struct virtio_fs_vq *fsvq;
+
+	/* If mount failed, we can still be called without any fc */
+	if (!fc)
+		return fuse_kill_sb_anon(sb);
+
+	vfs = fc->iq.priv;
+	fsvq = &vfs->vqs[VQ_HIPRIO];
+
+	/* Stop the forget queue. The destroy request will be sent soon. */
+	spin_lock(&fsvq->lock);
+	fsvq->connected = false;
+	spin_unlock(&fsvq->lock);
+	virtio_fs_drain_all_queues(vfs);
+
+	fuse_kill_sb_anon(sb);
+
+	/* fuse_kill_sb_anon() must have sent the destroy request. Stop all
+	 * queues, drain one more time, and free the fuse devices. Freeing
+	 * the fuse devices drops their reference on the fuse_conn, which in
+	 * turn drops its reference on the virtio_fs object.
+	 */
+	virtio_fs_stop_all_queues(vfs);
+	virtio_fs_drain_all_queues(vfs);
+	virtio_fs_free_devs(vfs);
+}
+
+static int virtio_fs_test_super(struct super_block *sb,
+				struct fs_context *fsc)
+{
+	struct fuse_conn *fc = fsc->s_fs_info;
+
+	return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+}
+
+static int virtio_fs_set_super(struct super_block *sb,
+			       struct fs_context *fsc)
+{
+	int err;
+
+	err = get_anon_bdev(&sb->s_dev);
+	if (!err)
+		fuse_conn_get(fsc->s_fs_info);
+
+	return err;
+}
+
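+/*
+ * Mounts naming the same tag share one super block:
+ * virtio_fs_test_super() matches existing supers by the virtio_fs
+ * instance stored in fc->iq.priv.
+ */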
+static int virtio_fs_get_tree(struct fs_context *fsc)
+{
+	struct virtio_fs *fs;
+	struct super_block *sb;
+	struct fuse_conn *fc;
+	int err;
+
+	/* This gets a reference on the virtio_fs object. The pointer gets
+	 * installed in fc->iq.priv. When the fuse_conn goes away, it calls
+	 * ->release() to drop the reference to this object.
+	 */
+	fs = virtio_fs_find_instance(fsc->source);
+	if (!fs) {
+		pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
+		return -EINVAL;
+	}
+
+	fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
+	if (!fc) {
+		mutex_lock(&virtio_fs_mutex);
+		virtio_fs_put(fs);
+		mutex_unlock(&virtio_fs_mutex);
+		return -ENOMEM;
+	}
+
+	fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
+		       fs);
+	fc->release = fuse_free_conn;
+	fc->delete_stale = true;
+
+	fsc->s_fs_info = fc;
+	sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
+	fuse_conn_put(fc);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		err = virtio_fs_fill_super(sb);
+		if (err) {
+			deactivate_locked_super(sb);
+			return err;
+		}
+
+		sb->s_flags |= SB_ACTIVE;
+	}
+
+	WARN_ON(fsc->root);
+	fsc->root = dget(sb->s_root);
+	return 0;
+}
+
+static const struct fs_context_operations virtio_fs_context_ops = {
+	.get_tree	= virtio_fs_get_tree,
+};
+
+static int virtio_fs_init_fs_context(struct fs_context *fsc)
+{
+	fsc->ops = &virtio_fs_context_ops;
+	return 0;
+}
+
+static struct file_system_type virtio_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "virtiofs",
+	.init_fs_context = virtio_fs_init_fs_context,
+	.kill_sb	= virtio_kill_sb,
+};
+
+static int __init virtio_fs_init(void)
+{
+	int ret;
+
+	ret = register_virtio_driver(&virtio_fs_driver);
+	if (ret < 0)
+		return ret;
+
+	ret = register_filesystem(&virtio_fs_type);
+	if (ret < 0) {
+		unregister_virtio_driver(&virtio_fs_driver);
+		return ret;
+	}
+
+	return 0;
+}
+module_init(virtio_fs_init);
+
+static void __exit virtio_fs_exit(void)
+{
+	unregister_filesystem(&virtio_fs_type);
+	unregister_virtio_driver(&virtio_fs_driver);
+}
+module_exit(virtio_fs_exit);
+
+MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
+MODULE_DESCRIPTION("Virtio Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS(KBUILD_MODNAME);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 4337176..20d052e 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -25,15 +25,15 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
 	inarg.flags = flags;
-	args.in.h.opcode = FUSE_SETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 3;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
-	args.in.args[2].size = size;
-	args.in.args[2].value = value;
+	args.opcode = FUSE_SETXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 3;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = strlen(name) + 1;
+	args.in_args[1].value = name;
+	args.in_args[2].size = size;
+	args.in_args[2].value = value;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_setxattr = 1;
@@ -60,22 +60,22 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
-	args.in.h.opcode = FUSE_GETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
+	args.opcode = FUSE_GETXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = strlen(name) + 1;
+	args.in_args[1].value = name;
 	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
+	args.out_numargs = 1;
 	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = value;
+		args.out_argvar = true;
+		args.out_args[0].size = size;
+		args.out_args[0].value = value;
 	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
 	}
 	ret = fuse_simple_request(fc, &args);
 	if (!ret && !size)
@@ -121,20 +121,20 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
-	args.in.h.opcode = FUSE_LISTXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_LISTXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
+	args.out_numargs = 1;
 	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = list;
+		args.out_argvar = true;
+		args.out_args[0].size = size;
+		args.out_args[0].value = list;
 	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
 	}
 	ret = fuse_simple_request(fc, &args);
 	if (!ret && !size)
@@ -157,11 +157,11 @@
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	args.in.h.opcode = FUSE_REMOVEXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = strlen(name) + 1;
-	args.in.args[0].value = name;
+	args.opcode = FUSE_REMOVEXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = strlen(name) + 1;
+	args.in_args[0].value = name;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_removexattr = 1;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3ed2b08..03c9668 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on (64BIT || LBDAF)
 	select FS_POSIX_ACL
 	select CRC32
 	select LIBCRC32C
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index af5f87a..09e6be8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/sched.h>
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index f674fdd..61353a1 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __ACL_DOT_H__
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 31e8270..b9fe975 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/sched.h>
@@ -85,15 +82,11 @@
 }
 
 /**
- * gfs2_writepage_common - Common bits of writepage
- * @page: The page to be written
+ * gfs2_writepage - Write page for writeback mappings
+ * @page: The page
  * @wbc: The writeback control
- *
- * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
  */
-
-static int gfs2_writepage_common(struct page *page,
-				 struct writeback_control *wbc)
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -112,7 +105,9 @@
 		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
 		goto out;
 	}
-	return 1;
+
+	return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
+
 redirty:
 	redirty_page_for_writepage(wbc, page);
 out:
@@ -120,24 +115,6 @@
 	return 0;
 }
 
-/**
- * gfs2_writepage - Write page for writeback mappings
- * @page: The page
- * @wbc: The writeback control
- *
- */
-
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
-{
-	int ret;
-
-	ret = gfs2_writepage_common(page, wbc);
-	if (ret <= 0)
-		return ret;
-
-	return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
-}
-
 /* This is the same as calling block_write_full_page, but it also
  * writes pages outside of i_size
  */
@@ -266,7 +243,7 @@
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize);
+	unsigned nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits);
 	int i;
 	int ret;
 
@@ -366,7 +343,7 @@
 	pgoff_t done_index;
 	int cycled;
 	int range_whole = 0;
-	int tag;
+	xa_mark_t tag;
 
 	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
@@ -457,8 +434,7 @@
  *
  * Returns: errno
  */
-
-int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
 	u64 dsize = i_size_read(&ip->i_inode);
@@ -521,7 +497,7 @@
 		error = mpage_readpage(page, gfs2_block_map);
 	}
 
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		return -EIO;
 
 	return error;
@@ -576,7 +552,7 @@
                        unsigned size)
 {
 	struct address_space *mapping = ip->i_inode.i_mapping;
-	unsigned long index = *pos / PAGE_SIZE;
+	unsigned long index = *pos >> PAGE_SHIFT;
 	unsigned offset = *pos & (PAGE_SIZE - 1);
 	unsigned copied = 0;
 	unsigned amt;
@@ -638,7 +614,7 @@
 	gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		ret = -EIO;
 	return ret;
 }
@@ -649,7 +625,7 @@
  */
 void adjust_fs_space(struct inode *inode)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -657,10 +633,13 @@
 	struct buffer_head *m_bh, *l_bh;
 	u64 fs_total, new_free;
 
+	if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0)
+		return;
+
 	/* Total up the file system space, according to the latest rindex. */
 	fs_total = gfs2_ri_total(sdp);
 	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
-		return;
+		goto out;
 
 	spin_lock(&sdp->sd_statfs_spin);
 	gfs2_statfs_change_in(m_sc, m_bh->b_data +
@@ -675,52 +654,14 @@
 	gfs2_statfs_change(sdp, new_free, new_free, 0);
 
 	if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
-		goto out;
+		goto out2;
 	update_statfs(sdp, m_bh, l_bh);
 	brelse(l_bh);
-out:
+out2:
 	brelse(m_bh);
-}
-
-/**
- * gfs2_stuffed_write_end - Write end for stuffed files
- * @inode: The inode
- * @dibh: The buffer_head containing the on-disk inode
- * @pos: The file position
- * @copied: How much was actually copied by the VFS
- * @page: The page
- *
- * This copies the data from the page into the inode block after
- * the inode data structure itself.
- *
- * Returns: copied bytes or errno
- */
-int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-			   loff_t pos, unsigned copied,
-			   struct page *page)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	u64 to = pos + copied;
-	void *kaddr;
-	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
-
-	BUG_ON(pos + copied > gfs2_max_stuffed_size(ip));
-
-	kaddr = kmap_atomic(page);
-	memcpy(buf + pos, kaddr + pos, copied);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr);
-
-	WARN_ON(!PageUptodate(page));
-	unlock_page(page);
-	put_page(page);
-
-	if (copied) {
-		if (inode->i_size < to)
-			i_size_write(inode, to);
-		mark_inode_dirty(inode);
-	}
-	return copied;
+out:
+	sdp->sd_rindex_uptodate = 0;
+	gfs2_trans_end(sdp);
 }
 
 /**
@@ -756,7 +697,7 @@
 		return 0;
 
 	if (!gfs2_is_stuffed(ip))
-		dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
+		dblock = iomap_bmap(mapping, lblock, &gfs2_iomap_ops);
 
 	gfs2_glock_dq_uninit(&i_gh);
 
@@ -820,10 +761,10 @@
  * @page: the page that's being released
  * @gfp_mask: passed from Linux VFS, ignored by us
  *
- * Call try_to_free_buffers() if the buffers in this page can be
- * released.
+ * Calls try_to_free_buffers() to free the buffers and put the page if the
+ * buffers can be released.
  *
- * Returns: 0
+ * Returns: 1 if the page was put or else 0
  */
 
 int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
@@ -885,7 +826,7 @@
 	return 0;
 }
 
-static const struct address_space_operations gfs2_writeback_aops = {
+static const struct address_space_operations gfs2_aops = {
 	.writepage = gfs2_writepage,
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
@@ -899,21 +840,6 @@
 	.error_remove_page = generic_error_remove_page,
 };
 
-static const struct address_space_operations gfs2_ordered_aops = {
-	.writepage = gfs2_writepage,
-	.writepages = gfs2_writepages,
-	.readpage = gfs2_readpage,
-	.readpages = gfs2_readpages,
-	.set_page_dirty = __set_page_dirty_buffers,
-	.bmap = gfs2_bmap,
-	.invalidatepage = gfs2_invalidatepage,
-	.releasepage = gfs2_releasepage,
-	.direct_IO = noop_direct_IO,
-	.migratepage = buffer_migrate_page,
-	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
-};
-
 static const struct address_space_operations gfs2_jdata_aops = {
 	.writepage = gfs2_jdata_writepage,
 	.writepages = gfs2_jdata_writepages,
@@ -929,15 +855,8 @@
 
 void gfs2_set_aops(struct inode *inode)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (gfs2_is_writeback(ip))
-		inode->i_mapping->a_ops = &gfs2_writeback_aops;
-	else if (gfs2_is_ordered(ip))
-		inode->i_mapping->a_ops = &gfs2_ordered_aops;
-	else if (gfs2_is_jdata(ip))
+	if (gfs2_is_jdata(GFS2_I(inode)))
 		inode->i_mapping->a_ops = &gfs2_jdata_aops;
 	else
-		BUG();
+		inode->i_mapping->a_ops = &gfs2_aops;
 }
-
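One detail in the aops.c hunks above: nrblocks (and, later in this patch, the fallocate transaction size) now shifts by inode->i_blkbits instead of dividing by the superblock block size. Since blocksize == 1 << blkbits, the two forms are equivalent; a stand-alone check with assumed sizes (4 KiB pages, 1 KiB filesystem blocks):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned blkbits = 10;			/* blocksize = 1 << 10 */
	unsigned long blocksize = 1UL << blkbits;

	assert(page_size / blocksize == page_size >> blkbits);
	printf("blocks per page: %lu\n", page_size >> blkbits);	/* 4 */
	return 0;
}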
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index fa8e5d0..ff9877a 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -8,10 +8,6 @@
 
 #include "incore.h"
 
-extern int stuffed_readpage(struct gfs2_inode *ip, struct page *page);
-extern int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-				  loff_t pos, unsigned copied,
-				  struct page *page);
 extern void adjust_fs_space(struct inode *inode);
 extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 				   unsigned int from, unsigned int len);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7f8bb08..f63df54 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/spinlock.h>
@@ -14,6 +11,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/iomap.h>
+#include <linux/ktime.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -141,7 +139,7 @@
 		if (error)
 			goto out_brelse;
 		if (isdir) {
-			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
+			gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 			if (error)
 				goto out_brelse;
@@ -392,6 +390,19 @@
 	return mp->mp_aheight - x - 1;
 }
 
+static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
+{
+	sector_t factor = 1, block = 0;
+	int hgt;
+
+	for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
+		if (hgt < mp->mp_aheight)
+			block += mp->mp_list[hgt] * factor;
+		factor *= sdp->sd_inptrs;
+	}
+	return block;
+}
+
 static void release_metapath(struct metapath *mp)
 {
 	int i;
@@ -432,60 +443,84 @@
 	return ptr - first;
 }
 
-typedef const __be64 *(*gfs2_metadata_walker)(
-		struct metapath *mp,
-		const __be64 *start, const __be64 *end,
-		u64 factor, void *data);
+enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 
-#define WALK_STOP ((__be64 *)0)
-#define WALK_NEXT ((__be64 *)1)
+/*
+ * gfs2_metadata_walker - walk an indirect block
+ * @mp: Metapath to indirect block
+ * @ptrs: Number of pointers to look at
+ *
+ * When returning WALK_FOLLOW, the walker must update @mp to point at the right
+ * indirect block to follow.
+ */
+typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
+						   unsigned int ptrs);
 
-static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
-		u64 len, struct metapath *mp, gfs2_metadata_walker walker,
-		void *data)
+/*
+ * gfs2_walk_metadata - walk a tree of indirect blocks
+ * @inode: The inode
+ * @mp: Starting point of walk
+ * @max_len: Maximum number of blocks to walk
+ * @walker: Called during the walk
+ *
+ * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
+ * past the end of metadata, and a negative error code otherwise.
+ */
+
+static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
+		u64 max_len, gfs2_metadata_walker walker)
 {
-	struct metapath clone;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	const __be64 *start, *end, *ptr;
 	u64 factor = 1;
 	unsigned int hgt;
-	int ret = 0;
+	int ret;
 
-	for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
+	/*
+	 * The walk starts in the lowest allocated indirect block, which may be
+	 * before the position indicated by @mp.  Adjust @max_len accordingly
+	 * to avoid a short walk.
+	 */
+	for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
+		max_len += mp->mp_list[hgt] * factor;
+		mp->mp_list[hgt] = 0;
 		factor *= sdp->sd_inptrs;
+	}
 
 	for (;;) {
-		u64 step;
+		u16 start = mp->mp_list[hgt];
+		enum walker_status status;
+		unsigned int ptrs;
+		u64 len;
 
 		/* Walk indirect block. */
-		start = metapointer(hgt, mp);
-		end = metaend(hgt, mp);
-
-		step = (end - start) * factor;
-		if (step > len)
-			end = start + DIV_ROUND_UP_ULL(len, factor);
-
-		ptr = walker(mp, start, end, factor, data);
-		if (ptr == WALK_STOP)
+		ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
+		len = ptrs * factor;
+		if (len > max_len)
+			ptrs = DIV_ROUND_UP_ULL(max_len, factor);
+		status = walker(mp, ptrs);
+		switch (status) {
+		case WALK_STOP:
+			return 1;
+		case WALK_FOLLOW:
+			BUG_ON(mp->mp_aheight == mp->mp_fheight);
+			ptrs = mp->mp_list[hgt] - start;
+			len = ptrs * factor;
 			break;
-		if (step >= len)
+		case WALK_CONTINUE:
 			break;
-		len -= step;
-		if (ptr != WALK_NEXT) {
-			BUG_ON(!*ptr);
-			mp->mp_list[hgt] += ptr - start;
-			goto fill_up_metapath;
 		}
+		if (len >= max_len)
+			break;
+		max_len -= len;
+		if (status == WALK_FOLLOW)
+			goto fill_up_metapath;
 
 lower_metapath:
 		/* Decrease height of metapath. */
-		if (mp != &clone) {
-			clone_metapath(&clone, mp);
-			mp = &clone;
-		}
 		brelse(mp->mp_bh[hgt]);
 		mp->mp_bh[hgt] = NULL;
+		mp->mp_list[hgt] = 0;
 		if (!hgt)
 			break;
 		hgt--;
@@ -493,10 +528,7 @@
 
 		/* Advance in metadata tree. */
 		(mp->mp_list[hgt])++;
-		start = metapointer(hgt, mp);
-		end = metaend(hgt, mp);
-		if (start >= end) {
-			mp->mp_list[hgt] = 0;
+		if (mp->mp_list[hgt] >= sdp->sd_inptrs) {
 			if (!hgt)
 				break;
 			goto lower_metapath;
@@ -504,44 +536,36 @@
 
 fill_up_metapath:
 		/* Increase height of metapath. */
-		if (mp != &clone) {
-			clone_metapath(&clone, mp);
-			mp = &clone;
-		}
 		ret = fillup_metapath(ip, mp, ip->i_height - 1);
 		if (ret < 0)
-			break;
+			return ret;
 		hgt += ret;
 		for (; ret; ret--)
 			do_div(factor, sdp->sd_inptrs);
 		mp->mp_aheight = hgt + 1;
 	}
-	if (mp == &clone)
-		release_metapath(mp);
-	return ret;
+	return 0;
 }
 
-struct gfs2_hole_walker_args {
-	u64 blocks;
-};
-
-static const __be64 *gfs2_hole_walker(struct metapath *mp,
-		const __be64 *start, const __be64 *end,
-		u64 factor, void *data)
+static enum walker_status gfs2_hole_walker(struct metapath *mp,
+					   unsigned int ptrs)
 {
-	struct gfs2_hole_walker_args *args = data;
-	const __be64 *ptr;
+	const __be64 *start, *ptr, *end;
+	unsigned int hgt;
+
+	hgt = mp->mp_aheight - 1;
+	start = metapointer(hgt, mp);
+	end = start + ptrs;
 
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr) {
-			args->blocks += (ptr - start) * factor;
+			mp->mp_list[hgt] += ptr - start;
 			if (mp->mp_aheight == mp->mp_fheight)
 				return WALK_STOP;
-			return ptr;  /* increase height */
+			return WALK_FOLLOW;
 		}
 	}
-	args->blocks += (end - start) * factor;
-	return WALK_NEXT;
+	return WALK_CONTINUE;
 }
 
 /**
@@ -559,12 +583,24 @@
 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 			  struct metapath *mp, struct iomap *iomap)
 {
-	struct gfs2_hole_walker_args args = { };
-	int ret = 0;
+	struct metapath clone;
+	u64 hole_size;
+	int ret;
 
-	ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
-	if (!ret)
-		iomap->length = args.blocks << inode->i_blkbits;
+	clone_metapath(&clone, mp);
+	ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
+	if (ret < 0)
+		goto out;
+
+	if (ret == 1)
+		hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
+	else
+		hole_size = len;
+	iomap->length = hole_size << inode->i_blkbits;
+	ret = 0;
+
+out:
+	release_metapath(&clone);
 	return ret;
 }
 
@@ -597,7 +633,6 @@
  * gfs2_iomap_alloc - Build a metadata tree of the requested height
  * @inode: The GFS2 inode
  * @iomap: The iomap structure
- * @flags: iomap flags
  * @mp: The metapath, with proper height information calculated
  *
  * In this routine we may have to alloc:
@@ -624,7 +659,7 @@
  */
 
 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
-			    unsigned flags, struct metapath *mp)
+			    struct metapath *mp)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -675,7 +710,7 @@
 			goto out;
 		alloced += n;
 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
-			gfs2_trans_add_unrevoke(sdp, bn, n);
+			gfs2_trans_remove_revoke(sdp, bn, n);
 		switch (state) {
 		/* Growing height of tree */
 		case ALLOC_GROW_HEIGHT:
@@ -709,7 +744,7 @@
 			}
 			if (n == 0)
 				break;
-		/* Branching from existing tree */
+		/* fall through - To branching from existing tree */
 		case ALLOC_GROW_DEPTH:
 			if (i > 1 && i < mp->mp_fheight)
 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
@@ -720,7 +755,7 @@
 				state = ALLOC_DATA;
 			if (n == 0)
 				break;
-		/* Tree complete, adding data blocks */
+		/* fall through - To tree complete, adding data blocks */
 		case ALLOC_DATA:
 			BUG_ON(n > dblks);
 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
@@ -924,6 +959,32 @@
 	goto out;
 }
 
+/**
+ * gfs2_lblk_to_dblk - convert logical block to disk block
+ * @inode: the inode of the file we're mapping
+ * @lblock: the block relative to the start of the file
+ * @dblock: the returned dblock, if no error
+ *
+ * This function maps a single block from a file logical block (relative to
+ * the start of the file) to a file system absolute block using iomap.
+ *
+ * Returns: the absolute file system block, or an error
+ */
+int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock)
+{
+	struct iomap iomap = { };
+	struct metapath mp = { .mp_aheight = 1, };
+	loff_t pos = (loff_t)lblock << inode->i_blkbits;
+	int ret;
+
+	ret = gfs2_iomap_get(inode, pos, i_blocksize(inode), 0, &iomap, &mp);
+	release_metapath(&mp);
+	if (ret == 0)
+		*dblock = iomap.addr >> inode->i_blkbits;
+
+	return ret;
+}
+
 static int gfs2_write_lock(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -964,15 +1025,39 @@
 	gfs2_glock_dq_uninit(&ip->i_gh);
 }
 
-static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
-				unsigned copied, struct page *page,
-				struct iomap *iomap)
+static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
+				   unsigned len, struct iomap *iomap)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int blockmask = i_blocksize(inode) - 1;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int blocks;
 
-	gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
+	return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 }
 
+static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
+				 unsigned copied, struct page *page,
+				 struct iomap *iomap)
+{
+	struct gfs2_trans *tr = current->journal_info;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (page && !gfs2_is_stuffed(ip))
+		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+
+	gfs2_trans_end(sdp);
+}
+
+static const struct iomap_page_ops gfs2_iomap_page_ops = {
+	.page_prepare = gfs2_iomap_page_prepare,
+	.page_done = gfs2_iomap_page_done,
+};
+
 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 				  loff_t length, unsigned flags,
 				  struct iomap *iomap,
@@ -980,92 +1065,89 @@
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-	bool unstuff, alloc_required;
+	bool unstuff;
 	int ret;
 
-	ret = gfs2_write_lock(inode);
-	if (ret)
-		return ret;
-
 	unstuff = gfs2_is_stuffed(ip) &&
 		  pos + length > gfs2_max_stuffed_size(ip);
 
-	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
-	if (ret)
-		goto out_unlock;
+	if (unstuff || iomap->type == IOMAP_HOLE) {
+		unsigned int data_blocks, ind_blocks;
+		struct gfs2_alloc_parms ap = {};
+		unsigned int rblocks;
+		struct gfs2_trans *tr;
 
-	alloc_required = unstuff || iomap->type == IOMAP_HOLE;
-
-	if (alloc_required || gfs2_is_jdata(ip))
 		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
 				       &ind_blocks);
-
-	if (alloc_required) {
-		struct gfs2_alloc_parms ap = {
-			.target = data_blocks + ind_blocks
-		};
-
+		ap.target = data_blocks + ind_blocks;
 		ret = gfs2_quota_lock_check(ip, &ap);
 		if (ret)
-			goto out_unlock;
+			return ret;
 
 		ret = gfs2_inplace_reserve(ip, &ap);
 		if (ret)
 			goto out_qunlock;
-	}
 
-	rblocks = RES_DINODE + ind_blocks;
-	if (gfs2_is_jdata(ip))
-		rblocks += data_blocks;
-	if (ind_blocks || data_blocks)
-		rblocks += RES_STATFS + RES_QUOTA;
-	if (inode == sdp->sd_rindex)
-		rblocks += 2 * RES_STATFS;
-	if (alloc_required)
+		rblocks = RES_DINODE + ind_blocks;
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks;
+		if (ind_blocks || data_blocks)
+			rblocks += RES_STATFS + RES_QUOTA;
+		if (inode == sdp->sd_rindex)
+			rblocks += 2 * RES_STATFS;
 		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
 
-	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
-	if (ret)
-		goto out_trans_fail;
-
-	if (unstuff) {
-		ret = gfs2_unstuff_dinode(ip, NULL);
+		ret = gfs2_trans_begin(sdp, rblocks,
+				       iomap->length >> inode->i_blkbits);
 		if (ret)
-			goto out_trans_end;
-		release_metapath(mp);
-		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
-				     flags, iomap, mp);
-		if (ret)
-			goto out_trans_end;
-	}
+			goto out_trans_fail;
 
-	if (iomap->type == IOMAP_HOLE) {
-		ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
-		if (ret) {
-			gfs2_trans_end(sdp);
-			gfs2_inplace_release(ip);
-			punch_hole(ip, iomap->offset, iomap->length);
-			goto out_qunlock;
+		if (unstuff) {
+			ret = gfs2_unstuff_dinode(ip, NULL);
+			if (ret)
+				goto out_trans_end;
+			release_metapath(mp);
+			ret = gfs2_iomap_get(inode, iomap->offset,
+					     iomap->length, flags, iomap, mp);
+			if (ret)
+				goto out_trans_end;
 		}
+
+		if (iomap->type == IOMAP_HOLE) {
+			ret = gfs2_iomap_alloc(inode, iomap, mp);
+			if (ret) {
+				gfs2_trans_end(sdp);
+				gfs2_inplace_release(ip);
+				punch_hole(ip, iomap->offset, iomap->length);
+				goto out_qunlock;
+			}
+		}
+
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new)
+			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+
+		gfs2_trans_end(sdp);
 	}
-	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
-		iomap->page_done = gfs2_iomap_journaled_page_done;
+
+	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
+		iomap->page_ops = &gfs2_iomap_page_ops;
 	return 0;
 
 out_trans_end:
 	gfs2_trans_end(sdp);
 out_trans_fail:
-	if (alloc_required)
-		gfs2_inplace_release(ip);
+	gfs2_inplace_release(ip);
 out_qunlock:
-	if (alloc_required)
-		gfs2_quota_unlock(ip);
-out_unlock:
-	gfs2_write_unlock(inode);
+	gfs2_quota_unlock(ip);
 	return ret;
 }
 
+static inline bool gfs2_iomap_need_write_lock(unsigned flags)
+{
+	return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
+}
+
 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -1076,24 +1158,43 @@
 	iomap->flags |= IOMAP_F_BUFFER_HEAD;
 
 	trace_gfs2_iomap_start(ip, pos, length, flags);
-	if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
-		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
-	} else {
-		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+	if (gfs2_iomap_need_write_lock(flags)) {
+		ret = gfs2_write_lock(inode);
+		if (ret)
+			goto out;
+	}
 
-		/*
-		 * Silently fall back to buffered I/O for stuffed files or if
-		 * we've hot a hole (see gfs2_file_direct_write).
-		 */
-		if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
-		    iomap->type != IOMAP_MAPPED)
-			ret = -ENOTBLK;
+	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+	if (ret)
+		goto out_unlock;
+
+	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
+	case IOMAP_WRITE:
+		if (flags & IOMAP_DIRECT) {
+			/*
+			 * Silently fall back to buffered I/O for stuffed files
+			 * or if we've got a hole (see gfs2_file_direct_write).
+			 */
+			if (iomap->type != IOMAP_MAPPED)
+				ret = -ENOTBLK;
+			goto out_unlock;
+		}
+		break;
+	case IOMAP_ZERO:
+		if (iomap->type == IOMAP_HOLE)
+			goto out_unlock;
+		break;
+	default:
+		goto out_unlock;
 	}
-	if (!ret) {
-		get_bh(mp.mp_bh[0]);
-		iomap->private = mp.mp_bh[0];
-	}
+
+	ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
+
+out_unlock:
+	if (ret && gfs2_iomap_need_write_lock(flags))
+		gfs2_write_unlock(inode);
 	release_metapath(&mp);
+out:
 	trace_gfs2_iomap_end(ip, iomap, ret);
 	return ret;
 }
@@ -1103,27 +1204,26 @@
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_trans *tr = current->journal_info;
-	struct buffer_head *dibh = iomap->private;
 
-	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
-		goto out;
+	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
+	case IOMAP_WRITE:
+		if (flags & IOMAP_DIRECT)
+			return 0;
+		break;
+	case IOMAP_ZERO:
+		if (iomap->type == IOMAP_HOLE)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
 
-	if (iomap->type != IOMAP_INLINE) {
+	if (!gfs2_is_stuffed(ip))
 		gfs2_ordered_add_inode(ip);
 
-		if (tr->tr_num_buf_new)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		else
-			gfs2_trans_add_meta(ip->i_gl, dibh);
-	}
-
-	if (inode == sdp->sd_rindex) {
+	if (inode == sdp->sd_rindex)
 		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
 
-	gfs2_trans_end(sdp);
 	gfs2_inplace_release(ip);
 
 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
@@ -1140,11 +1240,17 @@
 
 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
 		gfs2_quota_unlock(ip);
-	gfs2_write_unlock(inode);
 
-out:
-	if (dibh)
-		brelse(dibh);
+	if (unlikely(!written))
+		goto out_unlock;
+
+	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+		mark_inode_dirty(inode);
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+
+out_unlock:
+	if (gfs2_iomap_need_write_lock(flags))
+		gfs2_write_unlock(inode);
 	return 0;
 }
 
@@ -1192,7 +1298,7 @@
 	if (create) {
 		ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
 		if (!ret && iomap.type == IOMAP_HOLE)
-			ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
+			ret = gfs2_iomap_alloc(inode, &iomap, &mp);
 		release_metapath(&mp);
 	} else {
 		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
@@ -1242,76 +1348,10 @@
 	return ret;
 }
 
-/**
- * gfs2_block_zero_range - Deal with zeroing out data
- *
- * This is partly borrowed from ext3.
- */
 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
 				 unsigned int length)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned long index = from >> PAGE_SHIFT;
-	unsigned offset = from & (PAGE_SIZE-1);
-	unsigned blocksize, iblock, pos;
-	struct buffer_head *bh;
-	struct page *page;
-	int err;
-
-	page = find_or_create_page(mapping, index, GFP_NOFS);
-	if (!page)
-		return 0;
-
-	blocksize = inode->i_sb->s_blocksize;
-	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-
-	/* Find the buffer that contains "offset" */
-	bh = page_buffers(page);
-	pos = blocksize;
-	while (offset >= pos) {
-		bh = bh->b_this_page;
-		iblock++;
-		pos += blocksize;
-	}
-
-	err = 0;
-
-	if (!buffer_mapped(bh)) {
-		gfs2_block_map(inode, iblock, bh, 0);
-		/* unmapped? It's a hole - nothing to do */
-		if (!buffer_mapped(bh))
-			goto unlock;
-	}
-
-	/* Ok, it's mapped. Make sure it's up-to-date */
-	if (PageUptodate(page))
-		set_buffer_uptodate(bh);
-
-	if (!buffer_uptodate(bh)) {
-		err = -EIO;
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-		wait_on_buffer(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
-			goto unlock;
-		err = 0;
-	}
-
-	if (gfs2_is_jdata(ip))
-		gfs2_trans_add_data(ip->i_gl, bh);
-	else
-		gfs2_ordered_add_inode(ip);
-
-	zero_user(page, offset, length);
-	mark_buffer_dirty(bh);
-unlock:
-	unlock_page(page);
-	put_page(page);
-	return err;
+	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
 }
 
 #define GFS2_JTRUNC_REVOKES 8192
@@ -1422,7 +1462,7 @@
 
 	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
 	if (!ret && iomap->type == IOMAP_HOLE)
-		ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
+		ret = gfs2_iomap_alloc(inode, iomap, &mp);
 	release_metapath(&mp);
 	return ret;
 }
@@ -1560,7 +1600,7 @@
 			continue;
 		}
 		if (bstart) {
-			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
 			(*btotal) += blen;
 			gfs2_add_inode_blocks(&ip->i_inode, -blen);
 		}
@@ -1568,7 +1608,7 @@
 		blen = 1;
 	}
 	if (bstart) {
-		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
 		(*btotal) += blen;
 		gfs2_add_inode_blocks(&ip->i_inode, -blen);
 	}
@@ -1592,6 +1632,7 @@
 			brelse(dibh);
 			up_write(&ip->i_rw_mutex);
 			gfs2_trans_end(sdp);
+			buf_in_tr = false;
 		}
 		gfs2_glock_dq_uninit(rd_gh);
 		cond_resched();
@@ -1822,9 +1863,8 @@
 			gfs2_assert_withdraw(sdp, bh);
 			if (gfs2_assert_withdraw(sdp,
 						 prev_bnr != bh->b_blocknr)) {
-				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
-				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
-				       sdp->sd_fsname,
+				fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
+					 "s_h:%u, mp_h:%u\n",
 				       (unsigned long long)ip->i_no_addr,
 				       prev_bnr, ip->i_height, strip_h, mp_h);
 			}
@@ -2083,6 +2123,8 @@
 	}
 
 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+				 (unstuff &&
+				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
 				  0 : RES_QUOTA), 0);
 	if (error)
@@ -2098,7 +2140,7 @@
 	if (error)
 		goto do_end_trans;
 
-	i_size_write(inode, size);
+	truncate_setsize(inode, size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
 	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -2248,7 +2290,9 @@
 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
 	u64 size;
 	int rc;
+	ktime_t start, end;
 
+	start = ktime_get();
 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
 	size = (lblock_stop - lblock) << shift;
 	jd->nr_extents = 0;
@@ -2268,8 +2312,9 @@
 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
 	} while(size > 0);
 
-	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
-		jd->nr_extents);
+	end = ktime_get();
+	fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
+		jd->nr_extents, ktime_ms_delta(end, start));
 	return 0;
 
 fail:
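The new metapath_to_block() helper in the bmap.c hunks above is mixed-radix arithmetic: the pointer index chosen at each height is weighted by inptrs raised to the number of levels below it. A stand-alone sketch with assumed geometry (509 pointers per indirect block is typical for 4 KiB gfs2 blocks, but the number here is only illustrative):

#include <stdio.h>

/* list[0] is the index at the root; higher indices are deeper levels. */
static unsigned long long metapath_to_block(const unsigned *list,
					    int fheight, int aheight,
					    unsigned long long inptrs)
{
	unsigned long long factor = 1, block = 0;
	int hgt;

	for (hgt = fheight - 1; hgt >= 0; hgt--) {
		if (hgt < aheight)
			block += list[hgt] * factor;
		factor *= inptrs;
	}
	return block;
}

int main(void)
{
	unsigned list[] = { 7, 3, 2 };	/* root, middle, leaf indices */

	/* 7*509*509 + 3*509 + 2 = 1815096 */
	printf("%llu\n", metapath_to_block(list, 3, 3, 509));
	return 0;
}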
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 6b18fb3..b88fd45 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __BMAP_DOT_H__
@@ -64,5 +61,6 @@
 extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
 extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
 extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 5173b98..2e215e8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/spinlock.h>
@@ -41,7 +38,7 @@
 	struct inode *inode;
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip = NULL;
-	int error;
+	int error, valid = 0;
 	int had_lock = 0;
 
 	if (flags & LOOKUP_RCU)
@@ -54,53 +51,30 @@
 
 	if (inode) {
 		if (is_bad_inode(inode))
-			goto invalid;
+			goto out;
 		ip = GFS2_I(inode);
 	}
 
-	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
-		goto valid;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) {
+		valid = 1;
+		goto out;
+	}
 
 	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
 	if (!had_lock) {
 		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
 		if (error)
-			goto fail;
-	} 
-
-	error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
-	switch (error) {
-	case 0:
-		if (!inode)
-			goto invalid_gunlock;
-		break;
-	case -ENOENT:
-		if (!inode)
-			goto valid_gunlock;
-		goto invalid_gunlock;
-	default:
-		goto fail_gunlock;
+			goto out;
 	}
 
-valid_gunlock:
+	error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
+	valid = inode ? !error : (error == -ENOENT);
+
 	if (!had_lock)
 		gfs2_glock_dq_uninit(&d_gh);
-valid:
+out:
 	dput(parent);
-	return 1;
-
-invalid_gunlock:
-	if (!had_lock)
-		gfs2_glock_dq_uninit(&d_gh);
-invalid:
-	dput(parent);
-	return 0;
-
-fail_gunlock:
-	gfs2_glock_dq_uninit(&d_gh);
-fail:
-	dput(parent);
-	return 0;
+	return valid;
 }
 
 static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
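The rewritten gfs2_drevalidate() above collapses the old goto ladder into one expression: a positive dentry stays valid when the directory entry still matches, a negative dentry when the name is still absent. A stand-alone rendering of that truth table (error stands in for the gfs2_dir_check() result):

#include <stdio.h>
#include <errno.h>

static int valid(int have_inode, int error)
{
	/* positive dentry: valid iff the lookup succeeded;
	 * negative dentry: valid iff the name is still missing */
	return have_inode ? !error : (error == -ENOENT);
}

int main(void)
{
	printf("%d %d %d %d\n", valid(1, 0), valid(0, -ENOENT),
	       valid(1, -ENOENT), valid(0, 0));
	/* prints: 1 1 0 0 */
	return 0;
}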
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index e370025..eb9c057 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 /*
@@ -506,7 +503,8 @@
  * For now the most important thing is to check that the various sizes
  * are correct.
  */
-static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+static int gfs2_check_dirent(struct gfs2_sbd *sdp,
+			     struct gfs2_dirent *dent, unsigned int offset,
 			     unsigned int size, unsigned int len, int first)
 {
 	const char *msg = "gfs2_dirent too small";
@@ -528,12 +526,12 @@
 		goto error;
 	return 0;
 error:
-	pr_warn("%s: %s (%s)\n",
+	fs_warn(sdp, "%s: %s (%s)\n",
 		__func__, msg, first ? "first in block" : "not first in block");
 	return -EIO;
 }
 
-static int gfs2_dirent_offset(const void *buf)
+static int gfs2_dirent_offset(struct gfs2_sbd *sdp, const void *buf)
 {
 	const struct gfs2_meta_header *h = buf;
 	int offset;
@@ -552,7 +550,8 @@
 	}
 	return offset;
 wrong_type:
-	pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
+	fs_warn(sdp, "%s: wrong block type %u\n", __func__,
+		be32_to_cpu(h->mh_type));
 	return -1;
 }
 
@@ -566,7 +565,7 @@
 	unsigned size;
 	int ret = 0;
 
-	ret = gfs2_dirent_offset(buf);
+	ret = gfs2_dirent_offset(GFS2_SB(inode), buf);
 	if (ret < 0)
 		goto consist_inode;
 
@@ -574,7 +573,7 @@
 	prev = NULL;
 	dent = buf + offset;
 	size = be16_to_cpu(dent->de_rec_len);
-	if (gfs2_check_dirent(dent, offset, size, len, 1))
+	if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, len, 1))
 		goto consist_inode;
 	do {
 		ret = scan(dent, name, opaque);
@@ -586,7 +585,8 @@
 		prev = dent;
 		dent = buf + offset;
 		size = be16_to_cpu(dent->de_rec_len);
-		if (gfs2_check_dirent(dent, offset, size, len, 0))
+		if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size,
+				      len, 0))
 			goto consist_inode;
 	} while(1);
 
@@ -750,7 +750,7 @@
 	struct gfs2_dirent *dent;
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 				gfs2_dirent_find_offset, name, ptr);
-	if (!dent || IS_ERR(dent))
+	if (IS_ERR_OR_NULL(dent))
 		return dent;
 	return do_init_dirent(inode, dent, name, bh,
 			      (unsigned)(ptr - (void *)dent));
@@ -854,7 +854,7 @@
 		return ERR_PTR(error);
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
 got_dent:
-	if (unlikely(dent == NULL || IS_ERR(dent))) {
+	if (IS_ERR_OR_NULL(dent)) {
 		brelse(bh);
 		bh = NULL;
 	}
@@ -880,7 +880,7 @@
 	if (!bh)
 		return NULL;
 
-	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
+	gfs2_trans_remove_revoke(GFS2_SB(inode), bn, 1);
 	gfs2_trans_add_meta(ip->i_gl, bh);
 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
 	leaf = (struct gfs2_leaf *)bh->b_data;
@@ -1043,7 +1043,7 @@
 	len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth));
 	half_len = len >> 1;
 	if (!half_len) {
-		pr_warn("i_depth %u lf_depth %u index %u\n",
+		fs_warn(GFS2_SB(inode), "i_depth %u lf_depth %u index %u\n",
 			dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
 		gfs2_consist_inode(dip);
 		error = -EIO;
@@ -1351,7 +1351,7 @@
 		if (!sdp->sd_args.ar_loccookie)
 			continue;
 		offset = (char *)(darr[i]) -
-			 (bh->b_data + gfs2_dirent_offset(bh->b_data));
+			(bh->b_data + gfs2_dirent_offset(sdp, bh->b_data));
 		offset /= GFS2_MIN_DIRENT_SIZE;
 		offset += leaf_nr * sdp->sd_max_dents_per_leaf;
 		if (offset >= GFS2_USE_HASH_FLAG ||
@@ -1463,8 +1463,7 @@
 				sort_offset : entries, copied);
 out_free:
 	for(i = 0; i < leaf; i++)
-		if (larr[i])
-			brelse(larr[i]);
+		brelse(larr[i]);
 	kvfree(larr);
 out:
 	return error;
@@ -2018,7 +2017,7 @@
 		l_blocks++;
 	}
 
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+	gfs2_rlist_alloc(&rlist);
 
 	for (x = 0; x < rlist.rl_rgrps; x++) {
 		struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
@@ -2039,6 +2038,8 @@
 	bh = leaf_bh;
 
 	for (blk = leaf_no; blk; blk = nblk) {
+		struct gfs2_rgrpd *rgd;
+
 		if (blk != leaf_no) {
 			error = get_leaf(dip, blk, &bh);
 			if (error)
@@ -2049,7 +2050,8 @@
 		if (blk != leaf_no)
 			brelse(bh);
 
-		gfs2_free_meta(dip, blk, 1);
+		rgd = gfs2_blk2rgrpd(sdp, blk, true);
+		gfs2_free_meta(dip, rgd, blk, 1);
 		gfs2_add_inode_blocks(&dip->i_inode, -1);
 	}
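Two dir.c hunks above replace open-coded '!dent || IS_ERR(dent)' tests with IS_ERR_OR_NULL(). A stand-alone approximation of the idea (simplified; the real helpers live in include/linux/err.h and encode errnos in the top 4095 pointer values):

#include <stdio.h>

#define MAX_ERRNO 4095

static int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int is_err_or_null(const void *ptr)
{
	return !ptr || is_err(ptr);
}

int main(void)
{
	printf("%d %d %d\n", is_err_or_null(NULL),
	       is_err_or_null((void *)-2L), is_err_or_null("ok"));
	/* prints: 1 1 0 */
	return 0;
}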
 
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index e1b309c..5b76480 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __DIR_DOT_H__
@@ -35,8 +32,7 @@
 			const struct gfs2_inode *ip, struct gfs2_diradd *da);
 static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
 {
-	if (da->bh)
-		brelse(da->bh);
+	brelse(da->bh);
 	da->bh = NULL;
 }
 extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index a332f3c..3f71728 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/spinlock.h>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 08369c6..997b326 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/slab.h>
@@ -139,27 +136,36 @@
 	{FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA},
 };
 
+static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
+{
+	int i;
+	u32 fsflags = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		gfsflags &= ~GFS2_DIF_JDATA;
+	else
+		gfsflags &= ~GFS2_DIF_INHERIT_JDATA;
+
+	for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++)
+		if (gfsflags & fsflag_gfs2flag[i].gfsflag)
+			fsflags |= fsflag_gfs2flag[i].fsflag;
+	return fsflags;
+}
+
 static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
-	int i, error;
-	u32 gfsflags, fsflags = 0;
+	int error;
+	u32 fsflags;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
 	error = gfs2_glock_nq(&gh);
 	if (error)
 		goto out_uninit;
 
-	gfsflags = ip->i_diskflags;
-	if (S_ISDIR(inode->i_mode))
-		gfsflags &= ~GFS2_DIF_JDATA;
-	else
-		gfsflags &= ~GFS2_DIF_INHERIT_JDATA;
-	for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++)
-		if (gfsflags & fsflag_gfs2flag[i].gfsflag)
-			fsflags |= fsflag_gfs2flag[i].fsflag;
+	fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
 
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
@@ -203,9 +209,11 @@
  * @filp: file pointer
  * @reqflags: The flags to set
  * @mask: Indicates which flags are valid
+ * @fsflags: The FS_* inode flags passed in
  *
  */
-static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
+			     const u32 fsflags)
 {
 	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -213,7 +221,7 @@
 	struct buffer_head *bh;
 	struct gfs2_holder gh;
 	int error;
-	u32 new_flags, flags;
+	u32 new_flags, flags, oldflags;
 
 	error = mnt_want_write_file(filp);
 	if (error)
@@ -223,6 +231,11 @@
 	if (error)
 		goto out_drop_write;
 
+	oldflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
+	error = vfs_ioc_setflags_prepare(inode, oldflags, fsflags);
+	if (error)
+		goto out;
+
 	error = -EACCES;
 	if (!inode_owner_or_capable(inode))
 		goto out;
@@ -311,7 +324,18 @@
 		mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
 	}
 
-	return do_gfs2_set_flags(filp, gfsflags, mask);
+	return do_gfs2_set_flags(filp, gfsflags, mask, fsflags);
+}
+
+static int gfs2_getlabel(struct file *filp, char __user *label)
+{
+	struct inode *inode = file_inode(filp);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN))
+		return -EFAULT;
+
+	return 0;
 }
 
 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -323,7 +347,10 @@
 		return gfs2_set_flags(filp, (u32 __user *)arg);
 	case FITRIM:
 		return gfs2_fitrim(filp, (void __user *)arg);
+	case FS_IOC_GETFSLABEL:
+		return gfs2_getlabel(filp, (char __user *)arg);
 	}
+
 	return -ENOTTY;
 }
 
@@ -347,36 +374,35 @@
 	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
 	int hint = min_t(size_t, INT_MAX, blks);
 
-	if (hint > atomic_read(&ip->i_res.rs_sizehint))
-		atomic_set(&ip->i_res.rs_sizehint, hint);
+	if (hint > atomic_read(&ip->i_sizehint))
+		atomic_set(&ip->i_sizehint, hint);
 }
 
 /**
- * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * gfs2_allocate_page_backing - Allocate blocks for a write fault
  * @page: The (locked) page to allocate backing for
  *
- * We try to allocate all the blocks required for the page in
- * one go. This might fail for various reasons, so we keep
- * trying until all the blocks to back this page are allocated.
- * If some of the blocks are already allocated, thats ok too.
+ * We try to allocate all the blocks required for the page in one go.  This
+ * might fail for various reasons, so we keep trying until all the blocks to
+ * back this page are allocated.  If some of the blocks are already allocated,
+ * that is ok too.
  */
-
 static int gfs2_allocate_page_backing(struct page *page)
 {
-	struct inode *inode = page->mapping->host;
-	struct buffer_head bh;
-	unsigned long size = PAGE_SIZE;
-	u64 lblock = page->index << (PAGE_SHIFT - inode->i_blkbits);
+	u64 pos = page_offset(page);
+	u64 size = PAGE_SIZE;
 
 	do {
-		bh.b_state = 0;
-		bh.b_size = size;
-		gfs2_block_map(inode, lblock, &bh, 1);
-		if (!buffer_mapped(&bh))
+		struct iomap iomap = { };
+
+		if (gfs2_iomap_get_alloc(page->mapping->host, pos, 1, &iomap))
 			return -EIO;
-		size -= bh.b_size;
-		lblock += (bh.b_size >> inode->i_blkbits);
-	} while(size > 0);
+
+		iomap.length = min(iomap.length, size);
+		size -= iomap.length;
+		pos += iomap.length;
+	} while (size > 0);
+
 	return 0;
 }
 
@@ -397,7 +423,7 @@
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned long last_index;
-	u64 pos = page->index << PAGE_SHIFT;
+	u64 pos = page_offset(page);
 	unsigned int data_blocks, ind_blocks, rblocks;
 	struct gfs2_holder gh;
 	loff_t size;
@@ -1023,7 +1049,7 @@
 			rblocks += data_blocks ? data_blocks : 1;
 
 		error = gfs2_trans_begin(sdp, rblocks,
-					 PAGE_SIZE/sdp->sd_sb.sb_bsize);
+					 PAGE_SIZE >> inode->i_blkbits);
 		if (error)
 			goto out_trans_fail;
 
@@ -1039,11 +1065,10 @@
 		gfs2_quota_unlock(ip);
 	}
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size)
 		i_size_write(inode, pos + count);
-		file_update_time(file);
-		mark_inode_dirty(inode);
-	}
+	file_update_time(file);
+	mark_inode_dirty(inode);
 
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
 		return vfs_fsync_range(file, pos, pos + count - 1,
@@ -1155,7 +1180,7 @@
 		cmd = F_SETLK;
 		fl->fl_type = F_UNLCK;
 	}
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) {
 		if (fl->fl_type == F_UNLCK)
 			locks_lock_file_wait(file, fl);
 		return -EIO;
@@ -1185,13 +1210,13 @@
 	mutex_lock(&fp->f_fl_mutex);
 
 	if (gfs2_holder_initialized(fl_gh)) {
+		struct file_lock request;
 		if (fl_gh->gh_state == state)
 			goto out;
-		locks_lock_file_wait(file,
-				     &(struct file_lock) {
-					     .fl_type = F_UNLCK,
-					     .fl_flags = FL_FLOCK
-				     });
+		locks_init_lock(&request);
+		request.fl_type = F_UNLCK;
+		request.fl_flags = FL_FLOCK;
+		locks_lock_file_wait(file, &request);
 		gfs2_glock_dq(fl_gh);
 		gfs2_holder_reinit(state, flags, fl_gh);
 	} else {
@@ -1266,6 +1291,7 @@
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
@@ -1296,6 +1322,7 @@
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
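The new gfs2_gfsflags_to_fsflags() helper above is a table-driven translation: mask off the mode-dependent jdata bit, then OR in the FS_* counterpart of every GFS2_DIF_* bit that is set. A stand-alone sketch of the loop; the two flag values below are placeholders, not the real FS_*_FL or GFS2_DIF_* constants:

#include <stdio.h>

struct flag_map { unsigned fsflag, gfsflag; };

static const struct flag_map fsflag_gfs2flag[] = {
	{ 0x0010, 0x0002 },	/* placeholder FS_*_FL <-> GFS2_DIF_* pair */
	{ 0x4000, 0x0001 },	/* placeholder pair */
};

static unsigned to_fsflags(unsigned gfsflags)
{
	unsigned fsflags = 0;
	size_t i;

	for (i = 0; i < sizeof(fsflag_gfs2flag)/sizeof(fsflag_gfs2flag[0]); i++)
		if (gfsflags & fsflag_gfs2flag[i].gfsflag)
			fsflags |= fsflag_gfs2flag[i].fsflag;
	return fsflags;
}

int main(void)
{
	printf("0x%x\n", to_fsflags(0x0003));	/* both bits set -> 0x4010 */
	return 0;
}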
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index ef606e3..ed78e5f 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __GFS2_DOT_H__
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4614ee2..0290a22 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -107,7 +104,7 @@
 
 static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
 {
-	u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+	u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
 
 	return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
 }
@@ -140,6 +137,7 @@
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
+	BUG_ON(atomic_read(&gl->gl_revokes));
 	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
 	smp_mb();
 	wake_up_glock(gl);
@@ -183,15 +181,19 @@
 
 void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
 {
+	if (!(gl->gl_ops->go_flags & GLOF_LRU))
+		return;
+
 	spin_lock(&lru_lock);
 
-	if (!list_empty(&gl->gl_lru))
-		list_del_init(&gl->gl_lru);
-	else
-		atomic_inc(&lru_count);
-
+	list_del(&gl->gl_lru);
 	list_add_tail(&gl->gl_lru, &lru_list);
-	set_bit(GLF_LRU, &gl->gl_flags);
+
+	if (!test_bit(GLF_LRU, &gl->gl_flags)) {
+		set_bit(GLF_LRU, &gl->gl_flags);
+		atomic_inc(&lru_count);
+	}
+
 	spin_unlock(&lru_lock);
 }
 
@@ -201,7 +203,7 @@
 		return;
 
 	spin_lock(&lru_lock);
-	if (!list_empty(&gl->gl_lru)) {
+	if (test_bit(GLF_LRU, &gl->gl_flags)) {
 		list_del_init(&gl->gl_lru);
 		atomic_dec(&lru_count);
 		clear_bit(GLF_LRU, &gl->gl_flags);
@@ -303,6 +305,11 @@
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
 	smp_mb__after_atomic();
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+	if (gh->gh_flags & GL_ASYNC) {
+		struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd;
+
+		wake_up(&sdp->sd_async_glock_wait);
+	}
 }
 
 /**
@@ -494,7 +501,8 @@
 			do_xmote(gl, gh, LM_ST_UNLOCKED);
 			break;
 		default: /* Everything else */
-			pr_err("wanted %u got %u\n", gl->gl_target, state);
+			fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n",
+			       gl->gl_target, state);
 			GLOCK_BUG_ON(gl, 1);
 		}
 		spin_unlock(&gl->gl_lockref.lock);
@@ -541,7 +549,7 @@
 	unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
 	int ret;
 
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) &&
 	    target != LM_ST_UNLOCKED)
 		return;
 	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
@@ -577,8 +585,8 @@
 			gfs2_glock_queue_work(gl, 0);
 		}
 		else if (ret) {
-			pr_err("lm_lock ret %d\n", ret);
-			GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+			fs_err(sdp, "lm_lock ret %d\n", ret);
+			GLOCK_BUG_ON(gl, !test_bit(SDF_WITHDRAWN,
 						   &sdp->sd_flags));
 		}
 	} else { /* lock_nolock */
@@ -678,7 +686,7 @@
 		goto out;
 
 	inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
-	if (inode && !IS_ERR(inode)) {
+	if (!IS_ERR_OR_NULL(inode)) {
 		d_prune_aliases(inode);
 		iput(inode);
 	}
@@ -928,6 +936,17 @@
 	gh->gh_ip = 0;
 }
 
+static void gfs2_glock_update_hold_time(struct gfs2_glock *gl,
+					unsigned long start_time)
+{
+	/* Have we waited longer than a second? */
+	if (time_after(jiffies, start_time + HZ)) {
+		/* Lengthen the minimum hold time. */
+		gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR,
+				       GL_GLOCK_MAX_HOLD);
+	}
+}
+
 /**
  * gfs2_glock_wait - wait on a glock acquisition
  * @gh: the glock holder
@@ -937,18 +956,99 @@
 
 int gfs2_glock_wait(struct gfs2_holder *gh)
 {
-	unsigned long time1 = jiffies;
+	unsigned long start_time = jiffies;
 
 	might_sleep();
 	wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
-	if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */
-		/* Lengthen the minimum hold time. */
-		gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
-					      GL_GLOCK_HOLD_INCR,
-					      GL_GLOCK_MAX_HOLD);
+	gfs2_glock_update_hold_time(gh->gh_gl, start_time);
 	return gh->gh_error;
 }
 
+static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	int i;
+
+	for (i = 0; i < num_gh; i++)
+		if (test_bit(HIF_WAIT, &ghs[i].gh_iflags))
+			return 1;
+	return 0;
+}
+
+/**
+ * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions
+ * @num_gh: the number of holders in the array
+ * @ghs: the glock holder array
+ *
+ * Returns: 0 on success, meaning all glocks have been granted and are held.
+ *          -ESTALE if the request timed out, meaning all glocks were released,
+ *          and the caller should retry the operation.
+ */
+
+int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd;
+	int i, ret = 0, timeout = 0;
+	unsigned long start_time = jiffies;
+	bool keep_waiting;
+
+	might_sleep();
+	/*
+	 * Total up the (minimum hold time * 2) of all glocks and use that to
+	 * determine the max amount of time we should wait.
+	 */
+	for (i = 0; i < num_gh; i++)
+		timeout += ghs[i].gh_gl->gl_hold_time << 1;
+
+wait_for_dlm:
+	if (!wait_event_timeout(sdp->sd_async_glock_wait,
+				!glocks_pending(num_gh, ghs), timeout))
+		ret = -ESTALE; /* request timed out. */
+
+	/*
+	 * If dlm granted all our requests, we need to adjust the glock
+	 * minimum hold time values according to how long we waited.
+	 *
+	 * If our request timed out, we need to repeatedly release any held
+	 * glocks we acquired thus far to allow dlm to acquire the remaining
+	 * glocks without deadlocking.  We cannot currently cancel outstanding
+	 * glock acquisitions.
+	 *
+	 * The HIF_WAIT bit tells us which requests still need a response from
+	 * dlm.
+	 *
+	 * If dlm sent us any errors, we return the first error we find.
+	 */
+	keep_waiting = false;
+	for (i = 0; i < num_gh; i++) {
+		/* Skip holders we have already dequeued below. */
+		if (!gfs2_holder_queued(&ghs[i]))
+			continue;
+		/* Skip holders with a pending DLM response. */
+		if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) {
+			keep_waiting = true;
+			continue;
+		}
+
+		if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) {
+			if (ret == -ESTALE)
+				gfs2_glock_dq(&ghs[i]);
+			else
+				gfs2_glock_update_hold_time(ghs[i].gh_gl,
+							    start_time);
+		}
+		if (!ret)
+			ret = ghs[i].gh_error;
+	}
+
+	if (keep_waiting)
+		goto wait_for_dlm;
+
+	/*
+	 * At this point, we've either acquired all locks or released them all.
+	 */
+	return ret;
+}
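/*
 * A minimal usage sketch for gfs2_glock_async_wait() above (not part of
 * the patch; holder setup is elided, and it assumes each holder was
 * queued with the GL_ASYNC flag so gfs2_glock_nq() returns without
 * waiting, as the wake-up added to gfs2_holder_wake() relies on):
 *
 *	for (i = 0; i < num_gh; i++)
 *		gfs2_glock_nq(&ghs[i]);		// async, returns immediately
 *	ret = gfs2_glock_async_wait(num_gh, ghs);
 *	if (ret == -ESTALE)
 *		goto retry;			// all glocks were dropped
 */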
+
 /**
  * handle_callback - process a demote request
  * @gl: the glock
@@ -1015,9 +1115,9 @@
 	struct gfs2_holder *gh2;
 	int try_futile = 0;
 
-	BUG_ON(gh->gh_owner_pid == NULL);
+	GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL);
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
-		BUG();
+		GLOCK_BUG_ON(gl, true);
 
 	if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
 		if (test_bit(GLF_LOCK, &gl->gl_flags))
@@ -1064,15 +1164,15 @@
 	return;
 
 trap_recursive:
-	pr_err("original: %pSR\n", (void *)gh2->gh_ip);
-	pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
-	pr_err("lock type: %d req lock state : %d\n",
+	fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip);
+	fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+	fs_err(sdp, "lock type: %d req lock state : %d\n",
 	       gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
-	pr_err("new: %pSR\n", (void *)gh->gh_ip);
-	pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
-	pr_err("lock type: %d req lock state : %d\n",
+	fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip);
+	fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid));
+	fs_err(sdp, "lock type: %d req lock state : %d\n",
 	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
-	gfs2_dump_glock(NULL, gl);
+	gfs2_dump_glock(NULL, gl, true);
 	BUG();
 }
 
@@ -1091,7 +1191,7 @@
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	int error = 0;
 
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		return -EIO;
 
 	if (test_bit(GLF_LRU, &gl->gl_flags))
@@ -1158,8 +1258,7 @@
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
-	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
-	    (glops->go_flags & GLOF_LRU))
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
 		gfs2_glock_add_to_lru(gl);
 
 	trace_gfs2_glock_queue(gh, 0);
@@ -1455,6 +1554,7 @@
 		if (!spin_trylock(&gl->gl_lockref.lock)) {
 add_back_to_lru:
 			list_add(&gl->gl_lru, &lru_list);
+			set_bit(GLF_LRU, &gl->gl_flags);
 			atomic_inc(&lru_count);
 			continue;
 		}
@@ -1462,7 +1562,6 @@
 			spin_unlock(&gl->gl_lockref.lock);
 			goto add_back_to_lru;
 		}
-		clear_bit(GLF_LRU, &gl->gl_flags);
 		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1497,6 +1596,7 @@
 		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
+			clear_bit(GLF_LRU, &gl->gl_flags);
 			freed++;
 			continue;
 		}
@@ -1607,16 +1707,16 @@
 	glock_hash_walk(thaw_glock, sdp);
 }
 
-static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid)
 {
 	spin_lock(&gl->gl_lockref.lock);
-	gfs2_dump_glock(seq, gl);
+	gfs2_dump_glock(seq, gl, fsid);
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
 static void dump_glock_func(struct gfs2_glock *gl)
 {
-	dump_glock(NULL, gl);
+	dump_glock(NULL, gl, true);
 }
 
 /**
@@ -1701,10 +1801,12 @@
  * dump_holder - print information about a glock holder
  * @seq: the seq_file struct
  * @gh: the glock holder
+ * @fs_id_buf: pointer to file system id (if requested)
  *
  */
 
-static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
+static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh,
+			const char *fs_id_buf)
 {
 	struct task_struct *gh_owner = NULL;
 	char flags_buf[32];
@@ -1712,8 +1814,8 @@
 	rcu_read_lock();
 	if (gh->gh_owner_pid)
 		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
-		       state2str(gh->gh_state),
+	gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+		       fs_id_buf, state2str(gh->gh_state),
 		       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
 		       gh->gh_error,
 		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
@@ -1763,6 +1865,7 @@
  * gfs2_dump_glock - print information about a glock
  * @seq: The seq_file struct
  * @gl: the glock
+ * @fsid: If true, also dump the file system id
  *
  * The file format is as follows:
  * One line per object, capital letters are used to indicate objects
@@ -1776,19 +1879,24 @@
  *
  */
 
-void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	unsigned long long dtime;
 	const struct gfs2_holder *gh;
 	char gflags_buf[32];
+	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
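+	/* worst case: "fsid=" (5) + sd_fsname (sizeof - 1) + ": " (2) + NUL */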
+	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
 
+	memset(fs_id_buf, 0, sizeof(fs_id_buf));
+	if (fsid && sdp) /* safety precaution */
+		sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
 	dtime = jiffies - gl->gl_demote_time;
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n",
-		  state2str(gl->gl_state),
+	gfs2_print_dbg(seq, "%sG:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d "
+		       "v:%d r:%d m:%ld\n", fs_id_buf, state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
 		  gflags2str(gflags_buf, gl),
@@ -1799,10 +1907,10 @@
 		  (int)gl->gl_lockref.count, gl->gl_hold_time);
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list)
-		dump_holder(seq, gh);
+		dump_holder(seq, gh, fs_id_buf);
 
 	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
-		glops->go_dump(seq, gl);
+		glops->go_dump(seq, gl, fs_id_buf);
 }
 
 static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -2003,7 +2111,7 @@
 
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-	dump_glock(seq, iter_ptr);
+	dump_glock(seq, iter_ptr, false);
 	return 0;
 }
 
@@ -2130,71 +2238,29 @@
 	.release = seq_release,
 };
 
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
 {
-	struct dentry *dent;
+	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
 
-	dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
-	if (IS_ERR_OR_NULL(dent))
-		goto fail;
-	sdp->debugfs_dir = dent;
+	debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+			    &gfs2_glocks_fops);
 
-	dent = debugfs_create_file("glocks",
-				   S_IFREG | S_IRUGO,
-				   sdp->debugfs_dir, sdp,
-				   &gfs2_glocks_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto fail;
-	sdp->debugfs_dentry_glocks = dent;
+	debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+			    &gfs2_glstats_fops);
 
-	dent = debugfs_create_file("glstats",
-				   S_IFREG | S_IRUGO,
-				   sdp->debugfs_dir, sdp,
-				   &gfs2_glstats_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto fail;
-	sdp->debugfs_dentry_glstats = dent;
-
-	dent = debugfs_create_file("sbstats",
-				   S_IFREG | S_IRUGO,
-				   sdp->debugfs_dir, sdp,
-				   &gfs2_sbstats_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto fail;
-	sdp->debugfs_dentry_sbstats = dent;
-
-	return 0;
-fail:
-	gfs2_delete_debugfs_file(sdp);
-	return dent ? PTR_ERR(dent) : -ENOMEM;
+	debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+			    &gfs2_sbstats_fops);
 }
 
 void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
 {
-	if (sdp->debugfs_dir) {
-		if (sdp->debugfs_dentry_glocks) {
-			debugfs_remove(sdp->debugfs_dentry_glocks);
-			sdp->debugfs_dentry_glocks = NULL;
-		}
-		if (sdp->debugfs_dentry_glstats) {
-			debugfs_remove(sdp->debugfs_dentry_glstats);
-			sdp->debugfs_dentry_glstats = NULL;
-		}
-		if (sdp->debugfs_dentry_sbstats) {
-			debugfs_remove(sdp->debugfs_dentry_sbstats);
-			sdp->debugfs_dentry_sbstats = NULL;
-		}
-		debugfs_remove(sdp->debugfs_dir);
-		sdp->debugfs_dir = NULL;
-	}
+	debugfs_remove_recursive(sdp->debugfs_dir);
+	sdp->debugfs_dir = NULL;
 }
 
-int gfs2_register_debugfs(void)
+void gfs2_register_debugfs(void)
 {
 	gfs2_root = debugfs_create_dir("gfs2", NULL);
-	if (IS_ERR(gfs2_root))
-		return PTR_ERR(gfs2_root);
-	return gfs2_root ? 0 : -ENOMEM;
 }
 
 void gfs2_unregister_debugfs(void)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 5e12220..b8adaf8 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __GLOCK_DOT_H__
@@ -193,6 +190,7 @@
 extern int gfs2_glock_nq(struct gfs2_holder *gh);
 extern int gfs2_glock_poll(struct gfs2_holder *gh);
 extern int gfs2_glock_wait(struct gfs2_holder *gh);
+extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq(struct gfs2_holder *gh);
 extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
 extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
@@ -202,8 +200,11 @@
 			     struct gfs2_holder *gh);
 extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
-#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
+extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
+			    bool fsid);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) {		\
+			gfs2_dump_glock(NULL, gl, true);	\
+			BUG(); } } while(0)
 extern __printf(2, 3)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
@@ -243,9 +244,9 @@
 extern int __init gfs2_glock_init(void);
 extern void gfs2_glock_exit(void);
 
-extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
 extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern int gfs2_register_debugfs(void);
+extern void gfs2_register_debugfs(void);
 extern void gfs2_unregister_debugfs(void);
 
 extern const struct lm_lockops gfs2_dlm_ops;
@@ -260,6 +261,11 @@
 	return gh->gh_gl;
 }
 
+static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
+{
+	return !list_empty(&gh->gh_list);
+}
+
 /**
  * glock_set_object - set the gl_object field of a glock
  * @gl: the glock
@@ -269,7 +275,7 @@
 {
 	spin_lock(&gl->gl_lockref.lock);
 	if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL))
-		gfs2_dump_glock(NULL, gl);
+		gfs2_dump_glock(NULL, gl, true);
 	gl->gl_object = object;
 	spin_unlock(&gl->gl_lockref.lock);
 }
@@ -281,7 +287,7 @@
  *
  * I'd love to similarly add this:
  *	else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object))
- *		gfs2_dump_glock(NULL, gl);
+ *		gfs2_dump_glock(NULL, gl, true);
  * Unfortunately, that's not possible because as soon as gfs2_delete_inode
  * frees the block in the rgrp, another process can reassign it for an I_NEW
  * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget.
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c63bee9..ff21369 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/spinlock.h>
@@ -28,6 +25,7 @@
 #include "util.h"
 #include "trans.h"
 #include "dir.h"
+#include "lops.h"
 
 struct workqueue_struct *gfs2_freeze_wq;
 
@@ -463,20 +461,31 @@
  * inode_go_dump - print information about an inode
  * @seq: The iterator
  * @gl: the glock
+ * @fs_id_buf: file system id (may be empty)
  *
  */
 
-static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
+			  const char *fs_id_buf)
 {
-	const struct gfs2_inode *ip = gl->gl_object;
+	struct gfs2_inode *ip = gl->gl_object;
+	struct inode *inode = &ip->i_inode;
+	unsigned long nrpages;
+
 	if (ip == NULL)
 		return;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
+
+	xa_lock_irq(&inode->i_data.i_pages);
+	nrpages = inode->i_data.nrpages;
+	xa_unlock_irq(&inode->i_data.i_pages);
+
+	gfs2_print_dbg(seq, "%s I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu "
+		       "p:%lu\n", fs_id_buf,
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
-		  (unsigned long long)i_size_read(&ip->i_inode));
+		  (unsigned long long)i_size_read(inode), nrpages);
 }
 
 /**
@@ -497,7 +506,8 @@
 		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
 		error = freeze_super(sdp->sd_vfs);
 		if (error) {
-			printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error);
+			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
+				error);
 			gfs2_assert_withdraw(sdp, 0);
 		}
 		queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
@@ -523,14 +533,14 @@
 	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
 		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
-		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+		error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
 		if (error)
 			gfs2_consist(sdp);
 		if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
 			gfs2_consist(sdp);
 
 		/*  Initialize some head of the log stuff  */
-		if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+		if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
 			sdp->sd_log_sequence = head.lh_sequence + 1;
 			gfs2_log_pointers_init(sdp, head.lh_blkno);
 		}
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 8ed1857..2dd192e 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __GLOPS_DOT_H__
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b96d39c..5f89c51 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __INCORE_DOT_H__
@@ -92,7 +89,7 @@
 	unsigned long bi_flags;
 	u32 bi_offset;
 	u32 bi_start;
-	u32 bi_len;
+	u32 bi_bytes;
 	u32 bi_blocks;
 };
 
@@ -165,7 +162,6 @@
 	u64 bd_blkno;
 
 	struct list_head bd_list;
-	const struct gfs2_log_operations *bd_ops;
 
 	struct gfs2_trans *bd_tr;
 	struct list_head bd_ail_st_list;
@@ -244,7 +240,8 @@
 	int (*go_demote_ok) (const struct gfs2_glock *gl);
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
-	void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+	void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
+			const char *fs_id_buf);
 	void (*go_callback)(struct gfs2_glock *gl, bool remote);
 	const int go_type;
 	const unsigned long go_flags;
@@ -309,10 +306,6 @@
 */
 
 struct gfs2_blkreserv {
-	/* components used during write (step 1): */
-	atomic_t rs_sizehint;         /* hint of the write size */
-
-	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
 	struct rb_node rs_node;       /* link to other block reservations */
 	struct gfs2_rbm rs_rbm;       /* Start of reservation */
 	u32 rs_free;                  /* how many blocks are still free */
@@ -417,8 +410,10 @@
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
 	struct gfs2_qadata *i_qadata; /* quota allocation data */
+	struct gfs2_holder i_rgd_gh;
 	struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
 	u64 i_goal;	/* goal block for allocations */
+	atomic_t i_sizehint;  /* hint of the write size */
 	struct rw_semaphore i_rw_mutex;
 	struct list_head i_ordered;
 	struct list_head i_trunc_list;
@@ -510,7 +505,6 @@
 	unsigned int tr_num_buf_rm;
 	unsigned int tr_num_databuf_rm;
 	unsigned int tr_num_revoke;
-	unsigned int tr_num_revoke_rm;
 
 	struct list_head tr_list;
 	struct list_head tr_databuf;
@@ -538,7 +532,7 @@
 	unsigned long jd_flags;
 #define JDF_RECOVERY 1
 	unsigned int jd_jid;
-	unsigned int jd_blocks;
+	u32 jd_blocks;
 	int jd_recover_error;
 	/* Replay stuff */
 
@@ -590,10 +584,10 @@
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
 	unsigned int ar_loccookie:1;		/* use location based readdir
 						   cookies */
-	int ar_commit;				/* Commit interval */
-	int ar_statfs_quantum;			/* The fast statfs interval */
-	int ar_quota_quantum;			/* The quota interval */
-	int ar_statfs_percent;			/* The % change to force sync */
+	s32 ar_commit;				/* Commit interval */
+	s32 ar_statfs_quantum;			/* The fast statfs interval */
+	s32 ar_quota_quantum;			/* The quota interval */
+	s32 ar_statfs_percent;			/* The % change to force sync */
 };
 
 struct gfs2_tune {
@@ -615,7 +609,7 @@
 enum {
 	SDF_JOURNAL_CHECKED	= 0,
 	SDF_JOURNAL_LIVE	= 1,
-	SDF_SHUTDOWN		= 2,
+	SDF_WITHDRAWN		= 2,
 	SDF_NOBARRIERS		= 3,
 	SDF_NORECOVERY		= 4,
 	SDF_DEMOTE		= 5,
@@ -623,6 +617,8 @@
 	SDF_RORECOVERY		= 7, /* read only recovery */
 	SDF_SKIP_DLM_UNLOCK	= 8,
 	SDF_FORCE_AIL_FLUSH     = 9,
+	SDF_AIL1_IO_ERROR	= 10,
+	SDF_FS_FROZEN           = 11,
 };
 
 enum gfs2_freeze_state {
@@ -729,6 +725,7 @@
 	struct gfs2_glock *sd_freeze_gl;
 	struct work_struct sd_freeze_work;
 	wait_queue_head_t sd_glock_wait;
+	wait_queue_head_t sd_async_glock_wait;
 	atomic_t sd_glock_disposal;
 	struct completion sd_locking_init;
 	struct completion sd_wdack;
@@ -811,8 +808,8 @@
 	atomic_t sd_log_pinned;
 	unsigned int sd_log_num_revoke;
 
-	struct list_head sd_log_le_revoke;
-	struct list_head sd_log_le_ordered;
+	struct list_head sd_log_revokes;
+	struct list_head sd_log_ordered;
 	spinlock_t sd_ordered_lock;
 
 	atomic_t sd_log_thresh1;
@@ -855,9 +852,6 @@
 
 	unsigned long sd_last_warning;
 	struct dentry *debugfs_dir;    /* debugfs directory */
-	struct dentry *debugfs_dentry_glocks;
-	struct dentry *debugfs_dentry_glstats;
-	struct dentry *debugfs_dentry_sbstats;
 };
 
 static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 648f0ca..e1e18fb 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/slab.h>
@@ -744,17 +741,19 @@
 			       the gfs2 structures. */
 	if (default_acl) {
 		error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		if (error)
+			goto fail_gunlock3;
 		posix_acl_release(default_acl);
+		default_acl = NULL;
 	}
 	if (acl) {
-		if (!error)
-			error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		if (error)
+			goto fail_gunlock3;
 		posix_acl_release(acl);
+		acl = NULL;
 	}
 
-	if (error)
-		goto fail_gunlock3;
-
 	error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
 					     &gfs2_initxattrs, NULL);
 	if (error)
@@ -789,14 +788,12 @@
 	}
 	gfs2_rsqa_delete(ip, NULL);
 fail_free_acls:
-	if (default_acl)
-		posix_acl_release(default_acl);
-	if (acl)
-		posix_acl_release(acl);
+	posix_acl_release(default_acl);
+	posix_acl_release(acl);
 fail_gunlock:
 	gfs2_dir_no_add(&da);
 	gfs2_glock_dq_uninit(ghs);
-	if (inode && !IS_ERR(inode)) {
+	if (!IS_ERR_OR_NULL(inode)) {
 		clear_nlink(inode);
 		if (!free_vfs_inode)
 			mark_inode_dirty(inode);
@@ -1351,7 +1348,7 @@
 	struct gfs2_inode *ip = GFS2_I(d_inode(odentry));
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh;
+	struct gfs2_holder ghs[4], r_gh, rd_gh;
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -1360,6 +1357,7 @@
 	int error;
 
 	gfs2_holder_mark_uninitialized(&r_gh);
+	gfs2_holder_mark_uninitialized(&rd_gh);
 	if (d_really_is_positive(ndentry)) {
 		nip = GFS2_I(d_inode(ndentry));
 		if (ip == nip)
@@ -1390,24 +1388,19 @@
 	}
 
 	num_gh = 1;
-	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs);
 	if (odip != ndip) {
-		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC,
+				 ghs + num_gh);
 		num_gh++;
 	}
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
 	num_gh++;
 
 	if (nip) {
-		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC,
+				 ghs + num_gh);
 		num_gh++;
-		/* grab the resource lock for unlink flag twiddling 
-		 * this is the case of the target file already existing
-		 * so we unlink before doing the rename
-		 */
-		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);
-		if (nrgd)
-			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
 	}
 
 	for (x = 0; x < num_gh; x++) {
@@ -1415,6 +1408,25 @@
 		if (error)
 			goto out_gunlock;
 	}
+	error = gfs2_glock_async_wait(num_gh, ghs);
+	if (error)
+		goto out_gunlock;
+
+	if (nip) {
+		/* Grab the resource group glock for unlink flag twiddling.
+		 * This is the case where the target dinode already exists
+		 * so we unlink before doing the rename.
+		 */
+		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);
+		if (!nrgd) {
+			error = -ENOENT;
+			goto out_gunlock;
+		}
+		error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+					   &rd_gh);
+		if (error)
+			goto out_gunlock;
+	}
 
 	error = -ENOENT;
 	if (ip->i_inode.i_nlink == 0)
@@ -1544,8 +1556,12 @@
 		gfs2_quota_unlock(ndip);
 out_gunlock:
 	gfs2_dir_no_add(&da);
+	if (gfs2_holder_initialized(&rd_gh))
+		gfs2_glock_dq_uninit(&rd_gh);
+
 	while (x--) {
-		gfs2_glock_dq(ghs + x);
+		if (gfs2_holder_queued(ghs + x))
+			gfs2_glock_dq(ghs + x);
 		gfs2_holder_uninit(ghs + x);
 	}
 out_gunlock_r:
@@ -1575,7 +1591,7 @@
 	struct gfs2_inode *oip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = GFS2_I(ndentry->d_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh;
+	struct gfs2_holder ghs[4], r_gh;
 	unsigned int num_gh;
 	unsigned int x;
 	umode_t old_mode = oip->i_inode.i_mode;
@@ -1609,15 +1625,16 @@
 	}
 
 	num_gh = 1;
-	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs);
 	if (odip != ndip) {
-		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC,
+				 ghs + num_gh);
 		num_gh++;
 	}
-	gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
 	num_gh++;
 
-	gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
 	num_gh++;
 
 	for (x = 0; x < num_gh; x++) {
@@ -1626,6 +1643,10 @@
 			goto out_gunlock;
 	}
 
+	error = gfs2_glock_async_wait(num_gh, ghs);
+	if (error)
+		goto out_gunlock;
+
 	error = -ENOENT;
 	if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0)
 		goto out_gunlock;
@@ -1686,7 +1707,8 @@
 	gfs2_trans_end(sdp);
 out_gunlock:
 	while (x--) {
-		gfs2_glock_dq(ghs + x);
+		if (gfs2_holder_queued(ghs + x))
+			gfs2_glock_dq(ghs + x);
 		gfs2_holder_uninit(ghs + x);
 	}
 out_gunlock_r:
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index b5b6341..580adbf 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __INODE_DOT_H__
@@ -30,16 +27,14 @@
 	return ip->i_diskflags & GFS2_DIF_JDATA;
 }
 
-static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+static inline bool gfs2_is_ordered(const struct gfs2_sbd *sdp)
 {
-	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+	return sdp->sd_args.ar_data == GFS2_DATA_ORDERED;
 }
 
-static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+static inline bool gfs2_is_writeback(const struct gfs2_sbd *sdp)
 {
-	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+	return sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK;
 }
 
 static inline int gfs2_is_dir(const struct gfs2_inode *ip)
@@ -61,8 +56,8 @@
 
 static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
 {
-	gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
-	change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+	change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+	gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
 	inode->i_blocks += change;
 }
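
The rewritten helper converts @change from filesystem blocks into the 512-byte units that inode->i_blocks counts (GFS2_BASIC_BLOCK_SHIFT is 9) before the assertion runs, so both sides of the comparison use the same units. The conversion as a standalone user-space sketch, with a hypothetical 4 KiB block size:

	#include <stdio.h>

	#define GFS2_BASIC_BLOCK_SHIFT 9	/* i_blocks counts 512-byte units */

	int main(void)
	{
		unsigned int i_blkbits = 12;	/* assumed 4 KiB filesystem blocks */
		long long change = 3;		/* three fs blocks being added */

		change <<= i_blkbits - GFS2_BASIC_BLOCK_SHIFT;	/* 3 << 3 */
		printf("3 fs blocks -> %lld basic blocks\n", change);	/* 24 */
		return 0;
	}
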
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index ac7caa2..7c71973 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright 2004-2011 Red Hat, Inc.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -31,9 +28,10 @@
  * @delta is the difference between the current rtt sample and the
  * running average srtt. We add 1/8 of that to the srtt in order to
  * update the current srtt estimate. The variance estimate is a bit
- * more complicated. We subtract the abs value of the @delta from
- * the current variance estimate and add 1/4 of that to the running
- * total.
+ * more complicated. We subtract the current variance estimate from
+ * the abs value of the @delta and add 1/4 of that to the running
+ * total.  That's equivalent to 3/4 of the current variance
+ * estimate plus 1/4 of the abs of @delta.
  *
  * Note that the index points at the array entry containing the smoothed
  * mean value, and the variance is always in the following entry
@@ -49,7 +47,7 @@
 	s64 delta = sample - s->stats[index];
 	s->stats[index] += (delta >> 3);
 	index++;
-	s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
+	s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2;
 }
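
The added (s64) cast forces the subtraction and shift to happen as signed 64-bit arithmetic. A standalone user-space toy with hypothetical samples, showing the weights the updated comment describes: the mean gains 1/8 of each error, and the variance estimate keeps 3/4 of itself plus 1/4 of |delta|:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		long long srtt = 1000, var = 100;	/* smoothed mean, variance */
		long long samples[] = { 1100, 900, 1500, 1000 };

		for (int i = 0; i < 4; i++) {
			long long delta = samples[i] - srtt;

			srtt += delta >> 3;			/* mean += delta/8 */
			var += (llabs(delta) - var) >> 2;	/* 3/4 var + |delta|/4 */
			printf("sample=%lld srtt=%lld var=%lld\n",
			       samples[i], srtt, var);
		}
		return 0;
	}
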
 
 /**
@@ -177,14 +175,14 @@
 		gfs2_glock_cb(gl, LM_ST_SHARED);
 		break;
 	default:
-		pr_err("unknown bast mode %d\n", mode);
+		fs_err(gl->gl_name.ln_sbd, "unknown bast mode %d\n", mode);
 		BUG();
 	}
 }
 
 /* convert gfs lock-state to dlm lock-mode */
 
-static int make_mode(const unsigned int lmstate)
+static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate)
 {
 	switch (lmstate) {
 	case LM_ST_UNLOCKED:
@@ -196,7 +194,7 @@
 	case LM_ST_SHARED:
 		return DLM_LOCK_PR;
 	}
-	pr_err("unknown LM state %d\n", lmstate);
+	fs_err(sdp, "unknown LM state %d\n", lmstate);
 	BUG();
 	return -1;
 }
@@ -257,7 +255,7 @@
 	u32 lkf;
 	char strname[GDLM_STRNAME_BYTES] = "";
 
-	req = make_mode(req_state);
+	req = make_mode(gl->gl_name.ln_sbd, req_state);
 	lkf = make_flags(gl, flags, req);
 	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
 	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
@@ -309,7 +307,7 @@
 	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
 			   NULL, gl);
 	if (error) {
-		pr_err("gdlm_unlock %x,%llx err=%d\n",
+		fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
 		       gl->gl_name.ln_type,
 		       (unsigned long long)gl->gl_name.ln_number, error);
 		return;
@@ -1037,12 +1035,12 @@
 	}
 
 	old_size = ls->ls_recover_size;
-
-	if (old_size >= max_jid + 1)
+	new_size = old_size;
+	while (new_size < max_jid + 1)
+		new_size += RECOVER_SIZE_INC;
+	if (new_size == old_size)
 		return 0;
 
-	new_size = old_size + RECOVER_SIZE_INC;
-
 	submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
 	result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
 	if (!submit || !result) {
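
The old sizing added a single RECOVER_SIZE_INC step, which could still leave the arrays smaller than max_jid + 1; the new loop keeps growing until they fit. A standalone check of that arithmetic (user-space C; the increment of 16 mirrors the RECOVER_SIZE_INC definition earlier in this file and is an assumption here):

	#include <stdio.h>

	#define RECOVER_SIZE_INC 16	/* assumed to match lock_dlm.c */

	int main(void)
	{
		unsigned int old_size = 16, max_jid = 70;	/* hypothetical */
		unsigned int one_bump = old_size + RECOVER_SIZE_INC;	/* old: 32 */
		unsigned int new_size = old_size;

		while (new_size < max_jid + 1)	/* new logic keeps growing */
			new_size += RECOVER_SIZE_INC;

		printf("need >= %u, old logic %u, new logic %u\n",
		       max_jid + 1, one_bump, new_size);	/* 71, 32, 80 */
		return 0;
	}
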
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ee20ea4..58e237f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/sched.h>
@@ -108,7 +105,9 @@
 		gfs2_assert(sdp, bd->bd_tr == tr);
 
 		if (!buffer_busy(bh)) {
-			if (!buffer_uptodate(bh)) {
+			if (!buffer_uptodate(bh) &&
+			    !test_and_set_bit(SDF_AIL1_IO_ERROR,
+					      &sdp->sd_flags)) {
 				gfs2_io_error_bh(sdp, bh);
 				*withdraw = true;
 			}
@@ -206,7 +205,8 @@
 		gfs2_assert(sdp, bd->bd_tr == tr);
 		if (buffer_busy(bh))
 			continue;
-		if (!buffer_uptodate(bh)) {
+		if (!buffer_uptodate(bh) &&
+		    !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
 			gfs2_io_error_bh(sdp, bh);
 			*withdraw = true;
 		}
@@ -548,9 +548,9 @@
 	LIST_HEAD(written);
 
 	spin_lock(&sdp->sd_ordered_lock);
-	list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);
-	while (!list_empty(&sdp->sd_log_le_ordered)) {
-		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+	list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp);
+	while (!list_empty(&sdp->sd_log_ordered)) {
+		ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
 		if (ip->i_inode.i_mapping->nrpages == 0) {
 			test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
 			list_del(&ip->i_ordered);
@@ -561,7 +561,7 @@
 		filemap_fdatawrite(ip->i_inode.i_mapping);
 		spin_lock(&sdp->sd_ordered_lock);
 	}
-	list_splice(&written, &sdp->sd_log_le_ordered);
+	list_splice(&written, &sdp->sd_log_ordered);
 	spin_unlock(&sdp->sd_ordered_lock);
 }
 
@@ -570,8 +570,8 @@
 	struct gfs2_inode *ip;
 
 	spin_lock(&sdp->sd_ordered_lock);
-	while (!list_empty(&sdp->sd_log_le_ordered)) {
-		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+	while (!list_empty(&sdp->sd_log_ordered)) {
+		ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
 		list_del(&ip->i_ordered);
 		WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
 		if (ip->i_inode.i_mapping->nrpages == 0)
@@ -602,11 +602,11 @@
 	bd->bd_blkno = bh->b_blocknr;
 	gfs2_remove_from_ail(bd); /* drops ref on bh */
 	bd->bd_bh = NULL;
-	bd->bd_ops = &gfs2_revoke_lops;
 	sdp->sd_log_num_revoke++;
-	atomic_inc(&gl->gl_revokes);
+	if (atomic_inc_return(&gl->gl_revokes) == 1)
+		gfs2_glock_hold(gl);
 	set_bit(GLF_LFLUSH, &gl->gl_flags);
-	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+	list_add(&bd->bd_list, &sdp->sd_log_revokes);
 }
 
 void gfs2_write_revokes(struct gfs2_sbd *sdp)
@@ -618,7 +618,7 @@
 
 	gfs2_ail1_empty(sdp);
 	spin_lock(&sdp->sd_ail_lock);
-	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+	list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
 		list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
 			if (list_empty(&bd->bd_list)) {
 				have_revokes = 1;
@@ -642,7 +642,7 @@
 	}
 	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
-	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+	list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
 		list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
 			if (max_revokes == 0)
 				goto out_of_blocks;
@@ -664,11 +664,12 @@
 }
 
 /**
- * write_log_header - Write a journal log header buffer at sd_log_flush_head
+ * gfs2_write_log_header - Write a journal log header buffer at lblock
  * @sdp: The GFS2 superblock
  * @jd: journal descriptor of the journal to which we are writing
  * @seq: sequence number
  * @tail: tail of the log
+ * @lblock: value for lh_blkno (block number relative to start of journal)
  * @flags: log header flags GFS2_LOG_HEAD_*
  * @op_flags: flags to pass to the bio
  *
@@ -676,7 +677,8 @@
  */
 
 void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
-			   u64 seq, u32 tail, u32 flags, int op_flags)
+			   u64 seq, u32 tail, u32 lblock, u32 flags,
+			   int op_flags)
 {
 	struct gfs2_log_header *lh;
 	u32 hash, crc;
@@ -684,7 +686,7 @@
 	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 	struct timespec64 tv;
 	struct super_block *sb = sdp->sd_vfs;
-	u64 addr;
+	u64 dblock;
 
 	lh = page_address(page);
 	clear_page(lh);
@@ -697,15 +699,21 @@
 	lh->lh_sequence = cpu_to_be64(seq);
 	lh->lh_flags = cpu_to_be32(flags);
 	lh->lh_tail = cpu_to_be32(tail);
-	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+	lh->lh_blkno = cpu_to_be32(lblock);
 	hash = ~crc32(~0, lh, LH_V1_SIZE);
 	lh->lh_hash = cpu_to_be32(hash);
 
 	ktime_get_coarse_real_ts64(&tv);
 	lh->lh_nsec = cpu_to_be32(tv.tv_nsec);
 	lh->lh_sec = cpu_to_be64(tv.tv_sec);
-	addr = gfs2_log_bmap(sdp);
-	lh->lh_addr = cpu_to_be64(addr);
+	if (!list_empty(&jd->extent_list))
+		dblock = gfs2_log_bmap(sdp);
+	else {
+		int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock);
+		if (gfs2_assert_withdraw(sdp, ret == 0))
+			return;
+	}
+	lh->lh_addr = cpu_to_be64(dblock);
 	lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr);
 
 	/* We may only write local statfs, quota, etc., when writing to our
@@ -730,8 +738,8 @@
 		     sb->s_blocksize - LH_V1_SIZE - 4);
 	lh->lh_crc = cpu_to_be32(crc);
 
-	gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr);
-	gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags);
+	gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock);
+	gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags);
 	log_flush_wait(sdp);
 }
 
@@ -759,7 +767,7 @@
 	}
 	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
 	gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail,
-			      flags, op_flags);
+			      sdp->sd_log_flush_head, flags, op_flags);
 
 	if (sdp->sd_log_tail != tail)
 		log_pull_tail(sdp, tail);
@@ -808,7 +816,7 @@
 
 	gfs2_ordered_write(sdp);
 	lops_before_commit(sdp, tr);
-	gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
+	gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
 
 	if (sdp->sd_log_head != sdp->sd_log_flush_head) {
 		log_flush_wait(sdp);
@@ -874,7 +882,6 @@
 	old->tr_num_buf_rm	+= new->tr_num_buf_rm;
 	old->tr_num_databuf_rm	+= new->tr_num_databuf_rm;
 	old->tr_num_revoke	+= new->tr_num_revoke;
-	old->tr_num_revoke_rm	+= new->tr_num_revoke_rm;
 
 	list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
 	list_splice_tail_init(&new->tr_buf, &old->tr_buf);
@@ -896,7 +903,7 @@
 		set_bit(TR_ATTACHED, &tr->tr_flags);
 	}
 
-	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
+	sdp->sd_log_commited_revoke += tr->tr_num_revoke;
 	reserved = calc_reserved(sdp);
 	maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
 	gfs2_assert_withdraw(sdp, maxres >= reserved);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 2024143..2315fca 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __LOG_DOT_H__
@@ -51,16 +48,15 @@
 
 static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
 {
-	struct gfs2_sbd *sdp;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 
-	if (!gfs2_is_ordered(ip))
+	if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp))
 		return;
 
-	sdp = GFS2_SB(&ip->i_inode);
 	if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
 		spin_lock(&sdp->sd_ordered_lock);
 		if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
-			list_add(&ip->i_ordered, &sdp->sd_log_le_ordered);
+			list_add(&ip->i_ordered, &sdp->sd_log_ordered);
 		spin_unlock(&sdp->sd_ordered_lock);
 	}
 }
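
The unlocked test_bit() above is only a fast path; racing inserters are arbitrated by the test_and_set_bit() under sd_ordered_lock, so an inode lands on the ordered list at most once. The same double-checked pattern in standalone form (user-space C with pthreads, a simplified model of the kernel primitives):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_bool ordered;	/* stands in for GIF_ORDERED */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static int list_entries;

	static void ordered_add(void)
	{
		if (atomic_load(&ordered))	/* cheap unlocked check */
			return;
		pthread_mutex_lock(&list_lock);
		if (!atomic_exchange(&ordered, true))
			list_entries++;		/* list_add() happens under the lock */
		pthread_mutex_unlock(&list_lock);
	}

	int main(void)
	{
		ordered_add();
		ordered_add();	/* second call takes the fast path */
		printf("entries=%d\n", list_entries);
		return 0;
	}
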
@@ -71,7 +67,8 @@
 extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
 extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
 extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
-				  u64 seq, u32 tail, u32 flags, int op_flags);
+				  u64 seq, u32 tail, u32 lblock, u32 flags,
+				  int op_flags);
 extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 			   u32 type);
 extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index f2567f9..5b17979 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/sched.h>
@@ -17,7 +14,9 @@
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/list_sort.h>
+#include <linux/blkdev.h>
 
+#include "bmap.h"
 #include "dir.h"
 #include "gfs2.h"
 #include "incore.h"
@@ -81,7 +80,7 @@
 	if (sdp->sd_args.ar_discard)
 		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
 	memcpy(bi->bi_clone + bi->bi_offset,
-	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes);
 	clear_bit(GBF_FULL, &bi->bi_flags);
 	rgd->rd_free_clone = rgd->rd_free;
 	rgd->rd_extfail_pt = rgd->rd_free;
@@ -168,7 +167,8 @@
  * that is pinned in the pagecache.
  */
 
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+				  struct bio_vec *bvec,
 				  blk_status_t error)
 {
 	struct buffer_head *bh, *next;
@@ -193,7 +193,6 @@
 /**
  * gfs2_end_log_write - end of i/o to the log
  * @bio: The bio
- * @error: Status of i/o request
  *
  * Each bio_vec contains either data from the pagecache or data
  * relating to the log itself. Here we iterate over the bio_vec
@@ -206,7 +205,7 @@
 	struct gfs2_sbd *sdp = bio->bi_private;
 	struct bio_vec *bvec;
 	struct page *page;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status) {
 		fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -214,7 +213,7 @@
 		wake_up(&sdp->sd_logd_waitq);
 	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
 			gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
@@ -228,83 +227,86 @@
 }
 
 /**
- * gfs2_log_flush_bio - Submit any pending log bio
- * @sdp: The superblock
- * @op: REQ_OP
- * @op_flags: req_flag_bits
+ * gfs2_log_submit_bio - Submit any pending log bio
+ * @biop: Address of the bio pointer
+ * @opf: REQ_OP | op_flags
  *
  * Submit any pending part-built or full bio to the block device. If
  * there is no pending bio, then this is a no-op.
  */
 
-void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags)
+void gfs2_log_submit_bio(struct bio **biop, int opf)
 {
-	if (sdp->sd_log_bio) {
+	struct bio *bio = *biop;
+	if (bio) {
+		struct gfs2_sbd *sdp = bio->bi_private;
 		atomic_inc(&sdp->sd_log_in_flight);
-		bio_set_op_attrs(sdp->sd_log_bio, op, op_flags);
-		submit_bio(sdp->sd_log_bio);
-		sdp->sd_log_bio = NULL;
+		bio->bi_opf = opf;
+		submit_bio(bio);
+		*biop = NULL;
 	}
 }
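
Taking the address of the cached bio pointer lets the helper both submit and clear the cache slot, so a caller cannot accidentally touch a bio after submission. An illustrative call (the op flags here are only an example):

	gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | REQ_SYNC);
	/* sdp->sd_log_bio is now NULL; the next log write allocates afresh */
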
 
 /**
- * gfs2_log_alloc_bio - Allocate a new bio for log writing
- * @sdp: The superblock
- * @blkno: The next device block number we want to write to
+ * gfs2_log_alloc_bio - Allocate a bio
+ * @sdp: The super block
+ * @blkno: The device block number we want to write to
+ * @end_io: The bi_end_io callback
  *
- * This should never be called when there is a cached bio in the
- * super block. When it returns, there will be a cached bio in the
- * super block which will have as many bio_vecs as the device is
- * happy to handle.
+ * Allocate a new bio, initialize it with the given parameters and return it.
  *
- * Returns: Newly allocated bio
+ * Returns: The newly allocated bio
  */
 
-static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
+static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
+				      bio_end_io_t *end_io)
 {
 	struct super_block *sb = sdp->sd_vfs;
-	struct bio *bio;
+	struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 
-	BUG_ON(sdp->sd_log_bio);
-
-	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 	bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
 	bio_set_dev(bio, sb->s_bdev);
-	bio->bi_end_io = gfs2_end_log_write;
+	bio->bi_end_io = end_io;
 	bio->bi_private = sdp;
 
-	sdp->sd_log_bio = bio;
-
 	return bio;
 }
 
 /**
  * gfs2_log_get_bio - Get cached log bio, or allocate a new one
- * @sdp: The superblock
+ * @sdp: The super block
  * @blkno: The device block number we want to write to
+ * @bio: The bio to get or allocate
+ * @op: REQ_OP
+ * @end_io: The bi_end_io callback
+ * @flush: Always flush the current bio and allocate a new one?
  *
  * If there is a cached bio, then if the next block number is sequential
  * with the previous one, return it, otherwise flush the bio to the
- * device. If there is not a cached bio, or we just flushed it, then
+ * device. If there is no cached bio, or we just flushed it, then
  * allocate a new one.
  *
  * Returns: The bio to use for log writes
  */
 
-static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
+static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
+				    struct bio **biop, int op,
+				    bio_end_io_t *end_io, bool flush)
 {
-	struct bio *bio = sdp->sd_log_bio;
-	u64 nblk;
+	struct bio *bio = *biop;
 
 	if (bio) {
+		u64 nblk;
+
 		nblk = bio_end_sector(bio);
 		nblk >>= sdp->sd_fsb2bb_shift;
-		if (blkno == nblk)
+		if (blkno == nblk && !flush)
 			return bio;
-		gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
+		gfs2_log_submit_bio(biop, op);
 	}
 
-	return gfs2_log_alloc_bio(sdp, blkno);
+	*biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
+	return *biop;
 }
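
The sequential-append test converts the cached bio's end sector back into filesystem blocks before comparing it with the next block to write. The arithmetic as a standalone user-space sketch, assuming 4 KiB filesystem blocks over 512-byte sectors (sd_fsb2bb_shift == 3):

	#include <stdio.h>

	int main(void)
	{
		unsigned int fsb2bb_shift = 3;		/* 4096 B blocks / 512 B sectors */
		unsigned long long end_sector = 80;	/* where the cached bio ends */
		unsigned long long nblk = end_sector >> fsb2bb_shift;	/* 10 */

		for (unsigned long long blkno = 9; blkno <= 11; blkno++)
			printf("blkno %llu: %s\n", blkno, blkno == nblk ?
			       "sequential, reuse bio" : "submit and reallocate");
		return 0;
	}
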
 
 /**
@@ -326,11 +328,12 @@
 	struct bio *bio;
 	int ret;
 
-	bio = gfs2_log_get_bio(sdp, blkno);
+	bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE,
+			       gfs2_end_log_write, false);
 	ret = bio_add_page(bio, page, size, offset);
 	if (ret == 0) {
-		gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
-		bio = gfs2_log_alloc_bio(sdp, blkno);
+		bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio,
+				       REQ_OP_WRITE, gfs2_end_log_write, true);
 		ret = bio_add_page(bio, page, size, offset);
 		WARN_ON(ret == 0);
 	}
@@ -370,6 +373,205 @@
 		       gfs2_log_bmap(sdp));
 }
 
+/**
+ * gfs2_end_log_read - end I/O callback for reads from the log
+ * @bio: The bio
+ *
+ * Simply unlock the pages in the bio. The main thread will wait on them and
+ * process them in order as necessary.
+ */
+
+static void gfs2_end_log_read(struct bio *bio)
+{
+	struct page *page;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		page = bvec->bv_page;
+		if (bio->bi_status) {
+			int err = blk_status_to_errno(bio->bi_status);
+
+			SetPageError(page);
+			mapping_set_error(page->mapping, err);
+		}
+		unlock_page(page);
+	}
+
+	bio_put(bio);
+}
+
+/**
+ * gfs2_jhead_pg_srch - Look for the journal head in a given page.
+ * @jd: The journal descriptor
+ * @head: The journal head to keep updated with the best candidate found
+ * @page: The page to look in
+ *
+ * Returns: true if found, false otherwise.
+ */
+
+static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
+			      struct gfs2_log_header_host *head,
+			      struct page *page)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_log_header_host uninitialized_var(lh);
+	void *kaddr = kmap_atomic(page);
+	unsigned int offset;
+	bool ret = false;
+
+	for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
+		if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
+			if (lh.lh_sequence > head->lh_sequence)
+				*head = lh;
+			else {
+				ret = true;
+				break;
+			}
+		}
+	}
+	kunmap_atomic(kaddr);
+	return ret;
+}
+
+/**
+ * gfs2_jhead_process_page - Search/cleanup a page
+ * @jd: The journal descriptor
+ * @index: Index of the page to look into
+ * @head: The journal head to keep updated if a better candidate is found
+ * @done: If set, perform only cleanup, else search and set if found.
+ *
+ * Find the page with 'index' in the journal's mapping. Search the page for
+ * the journal head if requested (*done == false). Release our references on
+ * the page so the page cache can reclaim it (two put_page() calls): we took
+ * one reference from find_or_create_page() when obtaining the page to add it
+ * to the bio, and a second from find_get_page() here to get the page to wait
+ * on while its I/O completes.
+ * This function is also used to free up a page we might have grabbed but not
+ * used: perhaps we added it to a bio but never submitted it for I/O, or we
+ * submitted the I/O but had already found the jhead, so we only need to drop
+ * our references to the page.
+ */
+
+static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
+				    struct gfs2_log_header_host *head,
+				    bool *done)
+{
+	struct page *page;
+
+	page = find_get_page(jd->jd_inode->i_mapping, index);
+	wait_on_page_locked(page);
+
+	if (PageError(page))
+		*done = true;
+
+	if (!*done)
+		*done = gfs2_jhead_pg_srch(jd, head, page);
+
+	put_page(page); /* Once for find_get_page */
+	put_page(page); /* Once more for find_or_create_page */
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: The journal descriptor
+ * @head: The log descriptor for the head of the log is returned here
+ *
+ * Do a search of a journal by reading it in large chunks using bios and find
+ * the valid log entry with the highest sequence number (i.e. the log head).
+ *
+ * Returns: 0 on success, errno otherwise
+ */
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
+		    bool keep_cache)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct address_space *mapping = jd->jd_inode->i_mapping;
+	unsigned int block = 0, blocks_submitted = 0, blocks_read = 0;
+	unsigned int bsize = sdp->sd_sb.sb_bsize;
+	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
+	unsigned int shift = PAGE_SHIFT - bsize_shift;
+	unsigned int readahead_blocks = BIO_MAX_PAGES << shift;
+	struct gfs2_journal_extent *je;
+	int sz, ret = 0;
+	struct bio *bio = NULL;
+	struct page *page = NULL;
+	bool done = false;
+	errseq_t since;
+
+	memset(head, 0, sizeof(*head));
+	if (list_empty(&jd->extent_list))
+		gfs2_map_journal_extents(sdp, jd);
+
+	since = filemap_sample_wb_err(mapping);
+	list_for_each_entry(je, &jd->extent_list, list) {
+		for (; block < je->lblock + je->blocks; block++) {
+			u64 dblock;
+
+			if (!page) {
+				page = find_or_create_page(mapping,
+						block >> shift, GFP_NOFS);
+				if (!page) {
+					ret = -ENOMEM;
+					done = true;
+					goto out;
+				}
+			}
+
+			if (bio) {
+				unsigned int off;
+
+				off = (block << bsize_shift) & ~PAGE_MASK;
+				sz = bio_add_page(bio, page, bsize, off);
+				if (sz == bsize) { /* block added */
+					if (off + bsize == PAGE_SIZE) {
+						page = NULL;
+						goto page_added;
+					}
+					continue;
+				}
+				blocks_submitted = block + 1;
+				submit_bio(bio);
+				bio = NULL;
+			}
+
+			dblock = je->dblock + (block - je->lblock);
+			bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
+			bio->bi_opf = REQ_OP_READ;
+			sz = bio_add_page(bio, page, bsize, 0);
+			gfs2_assert_warn(sdp, sz == bsize);
+			if (bsize == PAGE_SIZE)
+				page = NULL;
+
+page_added:
+			if (blocks_submitted < blocks_read + readahead_blocks) {
+				/* Keep at least one bio in flight */
+				continue;
+			}
+
+			gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
+			blocks_read += PAGE_SIZE >> bsize_shift;
+			if (done)
+				goto out;  /* found */
+		}
+	}
+
+out:
+	if (bio)
+		submit_bio(bio);
+	while (blocks_read < block) {
+		gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
+		blocks_read += PAGE_SIZE >> bsize_shift;
+	}
+
+	if (!ret)
+		ret = filemap_check_wb_err(mapping, since);
+
+	if (!keep_cache)
+		truncate_inode_pages(mapping, 0);
+
+	return ret;
+}
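
The scan is pipelined: reads are submitted until the in-flight window reaches readahead_blocks, then pages are waited on and searched one at a time, in order. A standalone model of that window (user-space C, hypothetical sizes):

	#include <stdio.h>

	int main(void)
	{
		unsigned int readahead_blocks = 32;	/* e.g. BIO_MAX_PAGES << shift */
		unsigned int blocks_per_page = 2;	/* PAGE_SIZE >> bsize_shift */
		unsigned int submitted = 0, done = 0, total = 100;

		while (done < total) {
			/* submit until a full window is in flight (or no blocks left) */
			while (submitted < total &&
			       submitted < done + readahead_blocks)
				submitted++;		/* stands in for submit_bio() */
			done += blocks_per_page;	/* wait on + search one page */
		}
		printf("submitted=%u processed=%u\n", submitted, done);
		return 0;
	}
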
+
 static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
 				      u32 ld_length, u32 ld_data1)
 {
@@ -523,7 +725,7 @@
 	jd->jd_replayed_blocks = 0;
 }
 
-static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
 				struct gfs2_log_descriptor *ld, __be64 *ptr,
 				int pass)
 {
@@ -557,9 +759,27 @@
 
 		if (gfs2_meta_check(sdp, bh_ip))
 			error = -EIO;
-		else
-			mark_buffer_dirty(bh_ip);
+		else {
+			struct gfs2_meta_header *mh =
+				(struct gfs2_meta_header *)bh_ip->b_data;
 
+			if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) {
+				struct gfs2_rgrpd *rgd;
+
+				rgd = gfs2_blk2rgrpd(sdp, blkno, false);
+				if (rgd && rgd->rd_addr == blkno &&
+				    rgd->rd_bits && rgd->rd_bits->bi_bh) {
+					fs_info(sdp, "Replaying 0x%llx but we "
+						"already have a bh!\n",
+						(unsigned long long)blkno);
+					fs_info(sdp, "busy:%d, pinned:%d\n",
+						buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
+						buffer_pinned(rgd->rd_bits->bi_bh));
+					gfs2_dump_glock(NULL, rgd->rd_gl, true);
+				}
+			}
+			mark_buffer_dirty(bh_ip);
+		}
 		brelse(bh_log);
 		brelse(bh_ip);
 
@@ -616,7 +836,7 @@
 {
 	struct gfs2_meta_header *mh;
 	unsigned int offset;
-	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct list_head *head = &sdp->sd_log_revokes;
 	struct gfs2_bufdata *bd;
 	struct page *page;
 	unsigned int length;
@@ -654,7 +874,7 @@
 
 static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
-	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct list_head *head = &sdp->sd_log_revokes;
 	struct gfs2_bufdata *bd;
 	struct gfs2_glock *gl;
 
@@ -662,8 +882,10 @@
 		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
 		list_del_init(&bd->bd_list);
 		gl = bd->bd_gl;
-		atomic_dec(&gl->gl_revokes);
-		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		if (atomic_dec_return(&gl->gl_revokes) == 0) {
+			clear_bit(GLF_LFLUSH, &gl->gl_flags);
+			gfs2_glock_queue_put(gl);
+		}
 		kmem_cache_free(gfs2_bufdata_cachep, bd);
 	}
 }
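
Together with the atomic_inc_return() change in log.c above, this forms a hold-on-first, put-on-last pairing: the glock stays pinned for as long as any revoke references it. The pattern in standalone form (user-space C11 atomics, a simplified model):

	#include <stdatomic.h>
	#include <stdio.h>

	struct obj { atomic_int revokes; int holds; };

	static void add_revoke(struct obj *o)
	{
		if (atomic_fetch_add(&o->revokes, 1) == 0)
			o->holds++;	/* first revoke: gfs2_glock_hold() */
	}

	static void del_revoke(struct obj *o)
	{
		if (atomic_fetch_sub(&o->revokes, 1) == 1)
			o->holds--;	/* last one gone: gfs2_glock_queue_put() */
	}

	int main(void)
	{
		struct obj o = { .revokes = 0, .holds = 0 };

		add_revoke(&o); add_revoke(&o);
		del_revoke(&o); del_revoke(&o);
		printf("revokes=%d holds=%d\n", atomic_load(&o.revokes), o.holds);
		return 0;
	}
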
@@ -678,7 +900,7 @@
 	jd->jd_replay_tail = head->lh_tail;
 }
 
-static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
 				   struct gfs2_log_descriptor *ld, __be64 *ptr,
 				   int pass)
 {
@@ -760,7 +982,7 @@
 	gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
 }
 
-static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
 				    struct gfs2_log_descriptor *ld,
 				    __be64 *ptr, int pass)
 {
@@ -846,7 +1068,7 @@
 }
 
 
-const struct gfs2_log_operations gfs2_buf_lops = {
+static const struct gfs2_log_operations gfs2_buf_lops = {
 	.lo_before_commit = buf_lo_before_commit,
 	.lo_after_commit = buf_lo_after_commit,
 	.lo_before_scan = buf_lo_before_scan,
@@ -855,7 +1077,7 @@
 	.lo_name = "buf",
 };
 
-const struct gfs2_log_operations gfs2_revoke_lops = {
+static const struct gfs2_log_operations gfs2_revoke_lops = {
 	.lo_before_commit = revoke_lo_before_commit,
 	.lo_after_commit = revoke_lo_after_commit,
 	.lo_before_scan = revoke_lo_before_scan,
@@ -864,7 +1086,7 @@
 	.lo_name = "revoke",
 };
 
-const struct gfs2_log_operations gfs2_databuf_lops = {
+static const struct gfs2_log_operations gfs2_databuf_lops = {
 	.lo_before_commit = databuf_lo_before_commit,
 	.lo_after_commit = databuf_lo_after_commit,
 	.lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index e494939..9c05995 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __LOPS_DOT_H__
@@ -20,18 +17,15 @@
 	((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \
 	 ~(2 * sizeof(__be64) - 1))
 
-extern const struct gfs2_log_operations gfs2_glock_lops;
-extern const struct gfs2_log_operations gfs2_buf_lops;
-extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_databuf_lops;
-
 extern const struct gfs2_log_operations *gfs2_log_ops[];
 extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp);
 extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
 			   unsigned size, unsigned offset, u64 blkno);
 extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
-extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags);
+extern void gfs2_log_submit_bio(struct bio **biop, int opf);
 extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
+			   struct gfs2_log_header_host *head, bool keep_cache);
 
 static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
 {
@@ -77,7 +71,7 @@
 			gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
 }
 
-static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+static inline int lops_scan_elements(struct gfs2_jdesc *jd, u32 start,
 				     struct gfs2_log_descriptor *ld,
 				     __be64 *ptr,
 				     unsigned int pass)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 2d55e2c..a1a295b 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -39,9 +36,11 @@
 	struct gfs2_inode *ip = foo;
 
 	inode_init_once(&ip->i_inode);
+	atomic_set(&ip->i_sizehint, 0);
 	init_rwsem(&ip->i_rw_mutex);
 	INIT_LIST_HEAD(&ip->i_trunc_list);
 	ip->i_qadata = NULL;
+	gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
 	memset(&ip->i_res, 0, sizeof(ip->i_res));
 	RB_CLEAR_NODE(&ip->i_res.rs_node);
 	ip->i_hash_cache = NULL;
@@ -176,16 +175,12 @@
 	if (!gfs2_page_pool)
 		goto fail_mempool;
 
-	error = gfs2_register_debugfs();
-	if (error)
-		goto fail_debugfs;
+	gfs2_register_debugfs();
 
 	pr_info("GFS2 installed\n");
 
 	return 0;
 
-fail_debugfs:
-	mempool_destroy(gfs2_page_pool);
 fail_mempool:
 	destroy_workqueue(gfs2_freeze_wq);
 fail_wq3:
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf..662ef36 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/sched.h>
@@ -189,9 +186,9 @@
 static void gfs2_meta_read_endio(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct buffer_head *bh = page_buffers(page);
 		unsigned int len = bvec->bv_len;
@@ -254,7 +251,7 @@
 	struct buffer_head *bh, *bhs[2];
 	int num = 0;
 
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) {
 		*bhp = NULL;
 		return -EIO;
 	}
@@ -312,7 +309,7 @@
 
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		return -EIO;
 
 	wait_on_buffer(bh);
@@ -323,7 +320,7 @@
 			gfs2_io_error_bh_wd(sdp, bh);
 		return -EIO;
 	}
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		return -EIO;
 
 	return 0;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index ffdf6aa..eafb74e 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __DIO_DOT_H__
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b041cb8..18daf49 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -24,6 +21,7 @@
 #include <linux/lockdep.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
+#include <linux/fs_parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -41,6 +39,7 @@
 #include "dir.h"
 #include "meta_io.h"
 #include "trace_gfs2.h"
+#include "lops.h"
 
 #define DO 0
 #define UNDO 1
@@ -63,6 +62,13 @@
 	gt->gt_complain_secs = 10;
 }
 
+void free_sbd(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_lkstats)
+		free_percpu(sdp->sd_lkstats);
+	kfree(sdp);
+}
+
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp;
@@ -74,16 +80,15 @@
 
 	sdp->sd_vfs = sb;
 	sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats);
-	if (!sdp->sd_lkstats) {
-		kfree(sdp);
-		return NULL;
-	}
+	if (!sdp->sd_lkstats)
+		goto fail;
 	sb->s_fs_info = sdp;
 
 	set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	gfs2_tune_init(&sdp->sd_tune);
 
 	init_waitqueue_head(&sdp->sd_glock_wait);
+	init_waitqueue_head(&sdp->sd_async_glock_wait);
 	atomic_set(&sdp->sd_glock_disposal, 0);
 	init_completion(&sdp->sd_locking_init);
 	init_completion(&sdp->sd_wdack);
@@ -117,8 +122,8 @@
 
 	spin_lock_init(&sdp->sd_log_lock);
 	atomic_set(&sdp->sd_log_pinned, 0);
-	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
-	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
+	INIT_LIST_HEAD(&sdp->sd_log_revokes);
+	INIT_LIST_HEAD(&sdp->sd_log_ordered);
 	spin_lock_init(&sdp->sd_ordered_lock);
 
 	init_waitqueue_head(&sdp->sd_log_waitq);
@@ -136,8 +141,11 @@
 	mutex_init(&sdp->sd_freeze_mutex);
 
 	return sdp;
-}
 
+fail:
+	free_sbd(sdp);
+	return NULL;
+}
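
Pulling both frees into free_sbd() lets the new fail: label share one
teardown routine with the later error paths (see the gfs2_fill_super hunk
below, where the open-coded free_percpu()+kfree() pair disappears). A
minimal userspace sketch of the same alloc/goto-fail/shared-free shape,
with invented names:

/* Sketch: one free helper serves both the constructor's failure path and
 * every later caller; free(NULL) is a no-op, much like kfree(NULL). */
#include <stdlib.h>

struct sbd {
	int *lkstats;			/* stands in for the per-cpu stats */
};

static void free_sbd(struct sbd *sdp)
{
	free(sdp->lkstats);
	free(sdp);
}

static struct sbd *init_sbd(void)
{
	struct sbd *sdp = calloc(1, sizeof(*sdp));

	if (!sdp)
		return NULL;
	sdp->lkstats = calloc(64, sizeof(*sdp->lkstats));
	if (!sdp->lkstats)
		goto fail;		/* one exit path, one free routine */
	return sdp;

fail:
	free_sbd(sdp);
	return NULL;
}

int main(void)
{
	struct sbd *sdp = init_sbd();

	if (!sdp)
		return 1;
	free_sbd(sdp);
	return 0;
}
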
 
 /**
  * gfs2_check_sb - Check superblock
@@ -570,7 +578,7 @@
 
 		INIT_WORK(&jd->jd_work, gfs2_recover_func);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
-		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+		if (IS_ERR_OR_NULL(jd->jd_inode)) {
 			if (!jd->jd_inode)
 				error = -ENOENT;
 			else
@@ -616,7 +624,7 @@
 		fs_err(sdp, "Error checking journal for spectator mount.\n");
 		goto out_unlock;
 	}
-	error = gfs2_find_jhead(jd, &head);
+	error = gfs2_find_jhead(jd, &head, false);
 	if (error) {
 		fs_err(sdp, "Error parsing journal for spectator mount.\n");
 		goto out_unlock;
@@ -998,7 +1006,7 @@
 void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 {
 	const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	if (likely(!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) &&
 	    lm->lm_unmount)
 		lm->lm_unmount(sdp);
 }
@@ -1024,16 +1032,17 @@
 }
 
 /**
- * fill_super - Read in superblock
+ * gfs2_fill_super - Read in superblock
  * @sb: The VFS superblock
- * @data: Mount options
+ * @fc: Mount options and flags (the filesystem context)
  * @silent: Don't complain if it's not a GFS2 filesystem
  *
- * Returns: errno
+ * Returns: -errno
  */
-
-static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
+static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+	struct gfs2_args *args = fc->fs_private;
+	int silent = fc->sb_flags & SB_SILENT;
 	struct gfs2_sbd *sdp;
 	struct gfs2_holder mount_gh;
 	int error;
@@ -1088,8 +1097,7 @@
 	if (error) {
 		/* In this case, we haven't initialized sysfs, so we have to
 		   manually free the sdp. */
-		free_percpu(sdp->sd_lkstats);
-		kfree(sdp);
+		free_sbd(sdp);
 		sb->s_fs_info = NULL;
 		return error;
 	}
@@ -1192,7 +1200,6 @@
 	gfs2_lm_unmount(sdp);
 fail_debug:
 	gfs2_delete_debugfs_file(sdp);
-	free_percpu(sdp->sd_lkstats);
 	/* gfs2_sys_fs_del must be the last thing we do, since it causes
 	 * sysfs to call function gfs2_sbd_release, which frees sdp. */
 	gfs2_sys_fs_del(sdp);
@@ -1200,161 +1207,418 @@
 	return error;
 }
 
-static int set_gfs2_super(struct super_block *s, void *data)
+/**
+ * gfs2_get_tree - Get the GFS2 superblock and root directory
+ * @fc: The filesystem context
+ *
+ * Returns: 0 or -errno on error
+ */
+static int gfs2_get_tree(struct fs_context *fc)
 {
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+	struct gfs2_args *args = fc->fs_private;
+	struct gfs2_sbd *sdp;
+	int error;
+
+	error = get_tree_bdev(fc, gfs2_fill_super);
+	if (error)
+		return error;
+
+	sdp = fc->root->d_sb->s_fs_info;
+	dput(fc->root);
+	if (args->ar_meta)
+		fc->root = dget(sdp->sd_master_dir);
+	else
+		fc->root = dget(sdp->sd_root_dir);
 	return 0;
 }
 
-static int test_gfs2_super(struct super_block *s, void *ptr)
+static void gfs2_fc_free(struct fs_context *fc)
 {
-	struct block_device *bdev = ptr;
-	return (bdev == s->s_bdev);
+	struct gfs2_args *args = fc->fs_private;
+
+	kfree(args);
 }
 
-/**
- * gfs2_mount - Get the GFS2 superblock
- * @fs_type: The GFS2 filesystem type
- * @flags: Mount flags
- * @dev_name: The name of the device
- * @data: The mount arguments
- *
- * Q. Why not use get_sb_bdev() ?
- * A. We need to select one of two root directories to mount, independent
- *    of whether this is the initial, or subsequent, mount of this sb
- *
- * Returns: 0 or -ve on error
- */
+enum gfs2_param {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_upgrade,
+	Opt_acl,
+	Opt_quota,
+	Opt_suiddir,
+	Opt_data,
+	Opt_meta,
+	Opt_discard,
+	Opt_commit,
+	Opt_errors,
+	Opt_statfs_quantum,
+	Opt_statfs_percent,
+	Opt_quota_quantum,
+	Opt_barrier,
+	Opt_rgrplvb,
+	Opt_loccookie,
+};
 
-static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data)
+enum opt_quota {
+	Opt_quota_unset = 0,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+};
+
+static const unsigned int opt_quota_values[] = {
+	[Opt_quota_off]     = GFS2_QUOTA_OFF,
+	[Opt_quota_account] = GFS2_QUOTA_ACCOUNT,
+	[Opt_quota_on]      = GFS2_QUOTA_ON,
+};
+
+enum opt_data {
+	Opt_data_writeback = GFS2_DATA_WRITEBACK,
+	Opt_data_ordered   = GFS2_DATA_ORDERED,
+};
+
+enum opt_errors {
+	Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW,
+	Opt_errors_panic    = GFS2_ERRORS_PANIC,
+};
+
+static const struct fs_parameter_spec gfs2_param_specs[] = {
+	fsparam_string ("lockproto",          Opt_lockproto),
+	fsparam_string ("locktable",          Opt_locktable),
+	fsparam_string ("hostdata",           Opt_hostdata),
+	fsparam_flag   ("spectator",          Opt_spectator),
+	fsparam_flag   ("norecovery",         Opt_spectator),
+	fsparam_flag   ("ignore_local_fs",    Opt_ignore_local_fs),
+	fsparam_flag   ("localflocks",        Opt_localflocks),
+	fsparam_flag   ("localcaching",       Opt_localcaching),
+	fsparam_flag_no("debug",              Opt_debug),
+	fsparam_flag   ("upgrade",            Opt_upgrade),
+	fsparam_flag_no("acl",                Opt_acl),
+	fsparam_flag_no("suiddir",            Opt_suiddir),
+	fsparam_enum   ("data",               Opt_data),
+	fsparam_flag   ("meta",               Opt_meta),
+	fsparam_flag_no("discard",            Opt_discard),
+	fsparam_s32    ("commit",             Opt_commit),
+	fsparam_enum   ("errors",             Opt_errors),
+	fsparam_s32    ("statfs_quantum",     Opt_statfs_quantum),
+	fsparam_s32    ("statfs_percent",     Opt_statfs_percent),
+	fsparam_s32    ("quota_quantum",      Opt_quota_quantum),
+	fsparam_flag_no("barrier",            Opt_barrier),
+	fsparam_flag_no("rgrplvb",            Opt_rgrplvb),
+	fsparam_flag_no("loccookie",          Opt_loccookie),
+	/* quota can be a flag or an enum, so it gets special treatment */
+	__fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional),
+	{}
+};
+
+static const struct fs_parameter_enum gfs2_param_enums[] = {
+	{ Opt_quota,    "off",        Opt_quota_off },
+	{ Opt_quota,    "account",    Opt_quota_account },
+	{ Opt_quota,    "on",         Opt_quota_on },
+	{ Opt_data,     "writeback",  Opt_data_writeback },
+	{ Opt_data,     "ordered",    Opt_data_ordered },
+	{ Opt_errors,   "withdraw",   Opt_errors_withdraw },
+	{ Opt_errors,   "panic",      Opt_errors_panic },
+	{}
+};
+
+const struct fs_parameter_description gfs2_fs_parameters = {
+	.name = "gfs2",
+	.specs = gfs2_param_specs,
+	.enums = gfs2_param_enums,
+};
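
These tables replace the match_table_t parsing deleted from super.c at the
end of this patch; the fs_parse() core validates types and enum values
before gfs2_parse_param() runs, so each case only copies an already-checked
result. A rough userspace analog of the table-driven lookup, with invented
names and a deliberately tiny type system:

/* Sketch: declarative option specs plus a single table-walking parser. */
#include <stdio.h>
#include <string.h>

enum ptype { P_FLAG, P_STRING, P_INT };

struct pspec {
	const char *name;
	int opt;
	enum ptype type;
};

enum { OPT_SPECTATOR, OPT_LOCKPROTO, OPT_COMMIT };

static const struct pspec specs[] = {
	{ "spectator",  OPT_SPECTATOR, P_FLAG },
	{ "norecovery", OPT_SPECTATOR, P_FLAG },   /* alias, as in the patch */
	{ "lockproto",  OPT_LOCKPROTO, P_STRING },
	{ "commit",     OPT_COMMIT,    P_INT },
	{ NULL, 0, 0 }
};

/* Look up "key" or "key=value"; returns the opt id or -1. */
static int parse_param(const char *arg, const char **value)
{
	size_t klen = strcspn(arg, "=");
	const struct pspec *p;

	*value = arg[klen] == '=' ? arg + klen + 1 : NULL;
	for (p = specs; p->name; p++)
		if (strlen(p->name) == klen && !strncmp(p->name, arg, klen))
			return p->opt;
	return -1;
}

int main(void)
{
	const char *val;
	int opt = parse_param("commit=30", &val);

	printf("opt=%d value=%s\n", opt, val ? val : "(none)");
	return 0;
}
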
+
+/* Parse a single mount parameter */
+static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct block_device *bdev;
-	struct super_block *s;
-	fmode_t mode = FMODE_READ | FMODE_EXCL;
-	int error;
-	struct gfs2_args args;
-	struct gfs2_sbd *sdp;
+	struct gfs2_args *args = fc->fs_private;
+	struct fs_parse_result result;
+	int o;
 
-	if (!(flags & SB_RDONLY))
-		mode |= FMODE_WRITE;
+	o = fs_parse(fc, &gfs2_fs_parameters, param, &result);
+	if (o < 0)
+		return o;
 
-	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
-	if (IS_ERR(bdev))
-		return ERR_CAST(bdev);
-
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		error = -EBUSY;
-		goto error_bdev;
+	switch (o) {
+	case Opt_lockproto:
+		strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_locktable:
+		strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_hostdata:
+		strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_spectator:
+		args->ar_spectator = 1;
+		break;
+	case Opt_ignore_local_fs:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_localflocks:
+		args->ar_localflocks = 1;
+		break;
+	case Opt_localcaching:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_debug:
+		if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC)
+			return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive");
+		args->ar_debug = result.boolean;
+		break;
+	case Opt_upgrade:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_acl:
+		args->ar_posix_acl = result.boolean;
+		break;
+	case Opt_quota:
+		/* The quota option can be a flag or an enum. A non-zero int_32
+		   result means that we have an enum index. Otherwise we have
+		   to rely on the 'negated' flag to tell us whether 'quota' or
+		   'noquota' was specified. */
+		if (result.negated)
+			args->ar_quota = GFS2_QUOTA_OFF;
+		else if (result.int_32 > 0)
+			args->ar_quota = opt_quota_values[result.int_32];
+		else
+			args->ar_quota = GFS2_QUOTA_ON;
+		break;
+	case Opt_suiddir:
+		args->ar_suiddir = result.boolean;
+		break;
+	case Opt_data:
+		/* The uint_32 result maps directly to GFS2_DATA_* */
+		args->ar_data = result.uint_32;
+		break;
+	case Opt_meta:
+		args->ar_meta = 1;
+		break;
+	case Opt_discard:
+		args->ar_discard = result.boolean;
+		break;
+	case Opt_commit:
+		if (result.int_32 <= 0)
+			return invalf(fc, "gfs2: commit mount option requires a positive numeric argument");
+		args->ar_commit = result.int_32;
+		break;
+	case Opt_statfs_quantum:
+		if (result.int_32 < 0)
+			return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument");
+		args->ar_statfs_quantum = result.int_32;
+		break;
+	case Opt_quota_quantum:
+		if (result.int_32 <= 0)
+			return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument");
+		args->ar_quota_quantum = result.int_32;
+		break;
+	case Opt_statfs_percent:
+		if (result.int_32 < 0 || result.int_32 > 100)
+			return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100");
+		args->ar_statfs_percent = result.int_32;
+		break;
+	case Opt_errors:
+		if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC)
+			return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive");
+		args->ar_errors = result.uint_32;
+		break;
+	case Opt_barrier:
+		args->ar_nobarrier = result.boolean;
+		break;
+	case Opt_rgrplvb:
+		args->ar_rgrplvb = result.boolean;
+		break;
+	case Opt_loccookie:
+		args->ar_loccookie = result.boolean;
+		break;
+	default:
+		return invalf(fc, "gfs2: invalid mount option: %s", param->key);
 	}
-	s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	error = PTR_ERR(s);
-	if (IS_ERR(s))
-		goto error_bdev;
+	return 0;
+}
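
The Opt_quota case above folds three spellings into one tri-state. A small
sketch of just that decision, assuming the same convention that enum index
0 means "no value given" (Opt_quota_unset):

/* Sketch: resolve "noquota" / "quota" / "quota=<off|account|on>". */
#include <stdio.h>

enum { QUOTA_OFF, QUOTA_ACCOUNT, QUOTA_ON };

static int resolve_quota(int negated, int enum_index /* 0 = no value */)
{
	static const int values[] = { 0, QUOTA_OFF, QUOTA_ACCOUNT, QUOTA_ON };

	if (negated)			/* "noquota" */
		return QUOTA_OFF;
	if (enum_index > 0)		/* "quota=off|account|on" */
		return values[enum_index];
	return QUOTA_ON;		/* bare "quota" */
}

int main(void)
{
	printf("noquota       -> %d\n", resolve_quota(1, 0));
	printf("quota         -> %d\n", resolve_quota(0, 0));
	printf("quota=account -> %d\n", resolve_quota(0, 2));
	return 0;
}
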
 
-	if (s->s_root) {
-		/*
-		 * s_umount nests inside bd_mutex during
-		 * __invalidate_device().  blkdev_put() acquires
-		 * bd_mutex and can't be called under s_umount.  Drop
-		 * s_umount temporarily.  This is safe as we're
-		 * holding an active reference.
-		 */
-		up_write(&s->s_umount);
-		blkdev_put(bdev, mode);
-		down_write(&s->s_umount);
-	} else {
-		/* s_mode must be set before deactivate_locked_super calls */
-		s->s_mode = mode;
-	}
+static int gfs2_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args *oldargs = &sdp->sd_args;
+	struct gfs2_args *newargs = fc->fs_private;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	int error = 0;
 
-	memset(&args, 0, sizeof(args));
-	args.ar_quota = GFS2_QUOTA_DEFAULT;
-	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 30;
-	args.ar_statfs_quantum = 30;
-	args.ar_quota_quantum = 60;
-	args.ar_errors = GFS2_ERRORS_DEFAULT;
+	sync_filesystem(sb);
 
-	error = gfs2_mount_args(&args, data);
-	if (error) {
-		pr_warn("can't parse mount arguments\n");
-		goto error_super;
-	}
-
-	if (s->s_root) {
-		error = -EBUSY;
-		if ((flags ^ s->s_flags) & SB_RDONLY)
-			goto error_super;
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0);
-		if (error)
-			goto error_super;
-		s->s_flags |= SB_ACTIVE;
-		bdev->bd_super = s;
-	}
-
-	sdp = s->s_fs_info;
-	if (args.ar_meta)
-		return dget(sdp->sd_master_dir);
+	spin_lock(&gt->gt_spin);
+	oldargs->ar_commit = gt->gt_logd_secs;
+	oldargs->ar_quota_quantum = gt->gt_quota_quantum;
+	if (gt->gt_statfs_slow)
+		oldargs->ar_statfs_quantum = 0;
 	else
-		return dget(sdp->sd_root_dir);
+		oldargs->ar_statfs_quantum = gt->gt_statfs_quantum;
+	spin_unlock(&gt->gt_spin);
 
-error_super:
-	deactivate_locked_super(s);
-	return ERR_PTR(error);
-error_bdev:
-	blkdev_put(bdev, mode);
-	return ERR_PTR(error);
+	if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) {
+		errorf(fc, "gfs2: reconfiguration of locking protocol not allowed");
+		return -EINVAL;
+	}
+	if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) {
+		errorf(fc, "gfs2: reconfiguration of lock table not allowed");
+		return -EINVAL;
+	}
+	if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) {
+		errorf(fc, "gfs2: reconfiguration of host data not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_spectator != oldargs->ar_spectator) {
+		errorf(fc, "gfs2: reconfiguration of spectator mode not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_localflocks != oldargs->ar_localflocks) {
+		errorf(fc, "gfs2: reconfiguration of localflocks not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_meta != oldargs->ar_meta) {
+		errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed");
+		return -EINVAL;
+	}
+	if (oldargs->ar_spectator)
+		fc->sb_flags |= SB_RDONLY;
+
+	if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) {
+		if (fc->sb_flags & SB_RDONLY) {
+			error = gfs2_make_fs_ro(sdp);
+			if (error)
+				errorf(fc, "gfs2: unable to remount read-only");
+		} else {
+			error = gfs2_make_fs_rw(sdp);
+			if (error)
+				errorf(fc, "gfs2: unable to remount read-write");
+		}
+	}
+	sdp->sd_args = *newargs;
+
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= SB_POSIXACL;
+	else
+		sb->s_flags &= ~SB_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	else
+		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	spin_lock(&gt->gt_spin);
+	gt->gt_logd_secs = newargs->ar_commit;
+	gt->gt_quota_quantum = newargs->ar_quota_quantum;
+	if (newargs->ar_statfs_quantum) {
+		gt->gt_statfs_slow = 0;
+		gt->gt_statfs_quantum = newargs->ar_statfs_quantum;
+	} else {
+		gt->gt_statfs_slow = 1;
+		gt->gt_statfs_quantum = 30;
+	}
+	spin_unlock(&gt->gt_spin);
+
+	gfs2_online_uevent(sdp);
+	return error;
 }
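
The read-only transition above hinges on (sb->s_flags ^ fc->sb_flags) &
SB_RDONLY: XOR-ing the old and new flag words leaves a 1 in exactly the
bits that change, so masking with one flag asks "does this remount flip
that flag?". A two-line demonstration (the flag value here is illustrative,
not taken from kernel headers):

#include <stdio.h>

#define SB_RDONLY 0x1			/* illustrative bit value */

int main(void)
{
	unsigned int old_flags = SB_RDONLY;	/* currently read-only */
	unsigned int new_flags = 0;		/* remount read-write */

	if ((old_flags ^ new_flags) & SB_RDONLY)
		printf("ro/rw state changes with this remount\n");
	else
		printf("ro/rw state unchanged\n");
	return 0;
}
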
 
-static int set_meta_super(struct super_block *s, void *ptr)
+static const struct fs_context_operations gfs2_context_ops = {
+	.free        = gfs2_fc_free,
+	.parse_param = gfs2_parse_param,
+	.get_tree    = gfs2_get_tree,
+	.reconfigure = gfs2_reconfigure,
+};
+
+/* Set up the filesystem mount context */
+static int gfs2_init_fs_context(struct fs_context *fc)
+{
+	struct gfs2_args *args;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (args == NULL)
+		return -ENOMEM;
+
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+		struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info;
+
+		*args = sdp->sd_args;
+	} else {
+		memset(args, 0, sizeof(*args));
+		args->ar_quota = GFS2_QUOTA_DEFAULT;
+		args->ar_data = GFS2_DATA_DEFAULT;
+		args->ar_commit = 30;
+		args->ar_statfs_quantum = 30;
+		args->ar_quota_quantum = 60;
+		args->ar_errors = GFS2_ERRORS_DEFAULT;
+	}
+	fc->fs_private = args;
+	fc->ops = &gfs2_context_ops;
+	return 0;
+}
+
+static int set_meta_super(struct super_block *s, struct fs_context *fc)
 {
 	return -EINVAL;
 }
 
-static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
-			int flags, const char *dev_name, void *data)
+static int test_meta_super(struct super_block *s, struct fs_context *fc)
+{
+	return (fc->sget_key == s->s_bdev);
+}
+
+static int gfs2_meta_get_tree(struct fs_context *fc)
 {
 	struct super_block *s;
 	struct gfs2_sbd *sdp;
 	struct path path;
 	int error;
 
-	if (!dev_name || !*dev_name)
-		return ERR_PTR(-EINVAL);
+	if (!fc->source || !*fc->source)
+		return -EINVAL;
 
-	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
+	error = kern_path(fc->source, LOOKUP_FOLLOW, &path);
 	if (error) {
 		pr_warn("path_lookup on %s returned error %d\n",
-			dev_name, error);
-		return ERR_PTR(error);
+		        fc->source, error);
+		return error;
 	}
-	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
-		 path.dentry->d_sb->s_bdev);
+	fc->fs_type = &gfs2_fs_type;
+	fc->sget_key = path.dentry->d_sb->s_bdev;
+	s = sget_fc(fc, test_meta_super, set_meta_super);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		pr_warn("gfs2 mount does not exist\n");
-		return ERR_CAST(s);
+		return PTR_ERR(s);
 	}
-	if ((flags ^ s->s_flags) & SB_RDONLY) {
+	if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
 		deactivate_locked_super(s);
-		return ERR_PTR(-EBUSY);
+		return -EBUSY;
 	}
 	sdp = s->s_fs_info;
-	return dget(sdp->sd_master_dir);
+	fc->root = dget(sdp->sd_master_dir);
+	return 0;
+}
+
+static const struct fs_context_operations gfs2_meta_context_ops = {
+	.free        = gfs2_fc_free,
+	.get_tree    = gfs2_meta_get_tree,
+};
+
+static int gfs2_meta_init_fs_context(struct fs_context *fc)
+{
+	int ret = gfs2_init_fs_context(fc);
+
+	if (ret)
+		return ret;
+
+	fc->ops = &gfs2_meta_context_ops;
+	return 0;
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1372,14 +1636,14 @@
 	sdp->sd_root_dir = NULL;
 	sdp->sd_master_dir = NULL;
 	shrink_dcache_sb(sb);
-	free_percpu(sdp->sd_lkstats);
 	kill_block_super(sb);
 }
 
 struct file_system_type gfs2_fs_type = {
 	.name = "gfs2",
 	.fs_flags = FS_REQUIRES_DEV,
-	.mount = gfs2_mount,
+	.init_fs_context = gfs2_init_fs_context,
+	.parameters = &gfs2_fs_parameters,
 	.kill_sb = gfs2_kill_sb,
 	.owner = THIS_MODULE,
 };
@@ -1388,7 +1652,7 @@
 struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
-	.mount = gfs2_mount_meta,
+	.init_fs_context = gfs2_meta_init_fs_context,
 	.owner = THIS_MODULE,
 };
 MODULE_ALIAS_FS("gfs2meta");
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0efae7a..7c016a0 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 /*
@@ -777,7 +774,7 @@
 	nbytes = sizeof(struct gfs2_quota);
 
 	pg_beg = loc >> PAGE_SHIFT;
-	pg_off = loc % PAGE_SIZE;
+	pg_off = offset_in_page(loc);
 
 	/* If the quota straddles a page boundary, split the write in two */
 	if ((pg_off + nbytes) > PAGE_SIZE) {
@@ -1183,7 +1180,7 @@
  *
  * Returns: 0 on success.
  *                  min_req = ap->min_target ? ap->min_target : ap->target;
- *                  quota must allow atleast min_req blks for success and
+ *                  quota must allow at least min_req blks for success and
  *                  ap->allowed is set to the number of blocks allowed
  *
  *          -EDQUOT otherwise, quota violation. ap->allowed is set to number
@@ -1478,7 +1475,7 @@
 {
 	if (error == 0 || error == -EROFS)
 		return;
-	if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+	if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
 		fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
 		sdp->sd_log_error = error;
 		wake_up(&sdp->sd_logd_waitq);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 836f294..765627d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __QUOTA_DOT_H__
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 0f501f9..c529f87 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/module.h>
@@ -120,6 +117,35 @@
 	}
 }
 
+int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
+		     unsigned int blkno, struct gfs2_log_header_host *head)
+{
+	u32 hash, crc;
+
+	if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
+	    lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
+	    (blkno && be32_to_cpu(lh->lh_blkno) != blkno))
+		return 1;
+
+	hash = crc32(~0, lh, LH_V1_SIZE - 4);
+	hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
+
+	if (be32_to_cpu(lh->lh_hash) != hash)
+		return 1;
+
+	crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
+		     sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4);
+
+	if (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)
+		return 1;
+
+	head->lh_sequence = be64_to_cpu(lh->lh_sequence);
+	head->lh_flags = be32_to_cpu(lh->lh_flags);
+	head->lh_tail = be32_to_cpu(lh->lh_tail);
+	head->lh_blkno = be32_to_cpu(lh->lh_blkno);
+
+	return 0;
+}
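
__get_log_header() centralizes validation that used to be open-coded in
get_log_header() below: magic, type, and location first, then a checksum
computed as if the stored hash field were zero. A userspace sketch of the
same check order; the struct layout is invented, zlib's crc32() stands in
for the kernel's crc32/crc32_le_shift() trick (build with -lz), and zeroing
a scratch copy replaces the kernel's no-copy shift:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define LOG_MAGIC 0x01161970u	/* GFS2's magic, used here for flavor */

struct log_header {
	uint32_t magic;
	uint32_t blkno;
	uint32_t hash;	/* crc32 of the header with this field zeroed */
};

static uint32_t header_crc(const struct log_header *lh)
{
	struct log_header tmp = *lh;

	tmp.hash = 0;
	return crc32(0L, (const Bytef *)&tmp, sizeof(tmp));
}

/* Returns 0 for a valid header, 1 otherwise, like __get_log_header(). */
static int check_header(const struct log_header *lh, uint32_t expect_blkno)
{
	if (lh->magic != LOG_MAGIC || lh->blkno != expect_blkno)
		return 1;		/* not a header for this block */
	if (lh->hash != header_crc(lh))
		return 1;		/* corrupt or torn write */
	return 0;
}

int main(void)
{
	struct log_header lh = { .magic = LOG_MAGIC, .blkno = 7 };

	lh.hash = header_crc(&lh);
	printf("valid=%d\n", check_header(&lh, 7) == 0);
	return 0;
}
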
 /**
  * get_log_header - read the log header for a given segment
  * @jd: the journal
@@ -137,159 +163,18 @@
 static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 			  struct gfs2_log_header_host *head)
 {
-	struct gfs2_log_header *lh;
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 	struct buffer_head *bh;
-	u32 hash, crc;
 	int error;
 
 	error = gfs2_replay_read_block(jd, blk, &bh);
 	if (error)
 		return error;
-	lh = (void *)bh->b_data;
 
-	hash = crc32(~0, lh, LH_V1_SIZE - 4);
-	hash = ~crc32_le_shift(hash, 4);  /* assume lh_hash is zero */
-
-	crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
-		     bh->b_size - LH_V1_SIZE - 4);
-
-	error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
-		lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
-		be32_to_cpu(lh->lh_blkno) != blk ||
-		be32_to_cpu(lh->lh_hash) != hash ||
-		(lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc);
-
+	error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data,
+				 blk, head);
 	brelse(bh);
 
-	if (!error) {
-		head->lh_sequence = be64_to_cpu(lh->lh_sequence);
-		head->lh_flags = be32_to_cpu(lh->lh_flags);
-		head->lh_tail = be32_to_cpu(lh->lh_tail);
-		head->lh_blkno = be32_to_cpu(lh->lh_blkno);
-	}
-	return error;
-}
-
-/**
- * find_good_lh - find a good log header
- * @jd: the journal
- * @blk: the segment to start searching from
- * @lh: the log header to fill in
- * @forward: if true search forward in the log, else search backward
- *
- * Call get_log_header() to get a log header for a segment, but if the
- * segment is bad, either scan forward or backward until we find a good one.
- *
- * Returns: errno
- */
-
-static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
-			struct gfs2_log_header_host *head)
-{
-	unsigned int orig_blk = *blk;
-	int error;
-
-	for (;;) {
-		error = get_log_header(jd, *blk, head);
-		if (error <= 0)
-			return error;
-
-		if (++*blk == jd->jd_blocks)
-			*blk = 0;
-
-		if (*blk == orig_blk) {
-			gfs2_consist_inode(GFS2_I(jd->jd_inode));
-			return -EIO;
-		}
-	}
-}
-
-/**
- * jhead_scan - make sure we've found the head of the log
- * @jd: the journal
- * @head: this is filled in with the log descriptor of the head
- *
- * At this point, seg and lh should be either the head of the log or just
- * before.  Scan forward until we find the head.
- *
- * Returns: errno
- */
-
-static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
-{
-	unsigned int blk = head->lh_blkno;
-	struct gfs2_log_header_host lh;
-	int error;
-
-	for (;;) {
-		if (++blk == jd->jd_blocks)
-			blk = 0;
-
-		error = get_log_header(jd, blk, &lh);
-		if (error < 0)
-			return error;
-		if (error == 1)
-			continue;
-
-		if (lh.lh_sequence == head->lh_sequence) {
-			gfs2_consist_inode(GFS2_I(jd->jd_inode));
-			return -EIO;
-		}
-		if (lh.lh_sequence < head->lh_sequence)
-			break;
-
-		*head = lh;
-	}
-
-	return 0;
-}
-
-/**
- * gfs2_find_jhead - find the head of a log
- * @jd: the journal
- * @head: the log descriptor for the head of the log is returned here
- *
- * Do a binary search of a journal and find the valid log entry with the
- * highest sequence number.  (i.e. the log head)
- *
- * Returns: errno
- */
-
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
-{
-	struct gfs2_log_header_host lh_1, lh_m;
-	u32 blk_1, blk_2, blk_m;
-	int error;
-
-	blk_1 = 0;
-	blk_2 = jd->jd_blocks - 1;
-
-	for (;;) {
-		blk_m = (blk_1 + blk_2) / 2;
-
-		error = find_good_lh(jd, &blk_1, &lh_1);
-		if (error)
-			return error;
-
-		error = find_good_lh(jd, &blk_m, &lh_m);
-		if (error)
-			return error;
-
-		if (blk_1 == blk_m || blk_m == blk_2)
-			break;
-
-		if (lh_1.lh_sequence <= lh_m.lh_sequence)
-			blk_1 = blk_m;
-		else
-			blk_2 = blk_m;
-	}
-
-	error = jhead_scan(jd, &lh_1);
-	if (error)
-		return error;
-
-	*head = lh_1;
-
 	return error;
 }
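
The deleted find_good_lh()/jhead_scan()/gfs2_find_jhead() trio implemented a
bisection over the circular journal: sequence numbers increase monotonically
until they wrap at the head, so comparing seq[lo] with seq[mid] tells which
half holds the wrap point. (The replacement gfs2_find_jhead() takes an
extra flag and is declared elsewhere, per the new #include "lops.h" in
ops_fstype.c above and the extern dropped from recovery.h below.) A compact
sketch of the bisection idea, without the bad-header skipping of the
original:

#include <stdio.h>

/* Find the index of the largest sequence number in a circular log. */
static unsigned int find_head(const unsigned long long *seq, unsigned int n)
{
	unsigned int lo = 0, hi = n - 1;

	while (lo + 1 < hi) {
		unsigned int mid = (lo + hi) / 2;

		if (seq[lo] <= seq[mid])
			lo = mid;	/* head is at mid or beyond */
		else
			hi = mid;	/* wrap already happened before mid */
	}
	return seq[lo] >= seq[hi] ? lo : hi;
}

int main(void)
{
	/* sequence numbers around a circular log; the head is at index 3 */
	unsigned long long seq[] = { 8, 9, 10, 11, 4, 5, 6, 7 };

	printf("head at block %u\n", find_head(seq, 8));
	return 0;
}
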
 
@@ -305,7 +190,7 @@
  * Returns: errno
  */
 
-static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
+static int foreach_descriptor(struct gfs2_jdesc *jd, u32 start,
 			      unsigned int end, int pass)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
@@ -375,10 +260,12 @@
 			  struct gfs2_log_header_host *head)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u32 lblock = head->lh_blkno;
 
-	sdp->sd_log_flush_head = head->lh_blkno;
-	gfs2_replay_incr_blk(jd, &sdp->sd_log_flush_head);
-	gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0,
+	gfs2_replay_incr_blk(jd, &lblock);
+	if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
+		sdp->sd_log_flush_head = lblock;
+	gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock,
 			      GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY,
 			      REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC);
 }
@@ -456,10 +343,12 @@
 	if (error)
 		goto fail_gunlock_ji;
 
-	error = gfs2_find_jhead(jd, &head);
+	error = gfs2_find_jhead(jd, &head, true);
 	if (error)
 		goto fail_gunlock_ji;
 	t_jhd = ktime_get();
+	fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid,
+		ktime_ms_delta(t_jhd, t_jlck));
 
 	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
 		fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
@@ -499,7 +388,8 @@
 		}
 
 		t_tlck = ktime_get();
-		fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
+		fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n",
+			jd->jd_jid, head.lh_tail, head.lh_blkno);
 
 		for (pass = 0; pass < 2; pass++) {
 			lops_before_scan(jd, &head, pass);
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 11fdfab..0d30f8e 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __RECOVERY_DOT_H__
@@ -14,7 +11,7 @@
 
 extern struct workqueue_struct *gfs_recovery_wq;
 
-static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, unsigned int *blk)
+static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk)
 {
 	if (++*blk == jd->jd_blocks)
 	        *blk = 0;
@@ -27,10 +24,11 @@
 extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
 extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
 
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
-		    struct gfs2_log_header_host *head);
 extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
 extern void gfs2_recover_func(struct work_struct *work);
+extern int __get_log_header(struct gfs2_sbd *sdp,
+			    const struct gfs2_log_header *lh, unsigned int blkno,
+			    struct gfs2_log_header_host *head);
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 449d0cb..2466bb4 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -90,7 +87,7 @@
 {
 	unsigned char *byte1, *byte2, *end, cur_state;
 	struct gfs2_bitmap *bi = rbm_bi(rbm);
-	unsigned int buflen = bi->bi_len;
+	unsigned int buflen = bi->bi_bytes;
 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
@@ -101,12 +98,16 @@
 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
 
 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
-		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
+		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
+
+		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
 			rbm->offset, cur_state, new_state);
-		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
-			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
-		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
-			bi->bi_offset, bi->bi_len);
+		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
+			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
+			(unsigned long long)bi->bi_bh->b_blocknr);
+		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
+			bi->bi_offset, bi->bi_bytes,
+			(unsigned long long)gfs2_rbm_to_block(rbm));
 		dump_stack();
 		gfs2_consist_rgrpd(rbm->rgd);
 		return;
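
gfs2_setbit() addresses two-bit block states packed four to a byte, which
is why the rename to bi_bytes matters: bytes and blocks differ by a factor
of GFS2_NBBY. A self-contained sketch of that addressing, reusing the
kernel's constants and the same XOR update (*byte ^= (old ^ new) << bit),
minus the clone bitmap and the consistency checking:

#include <stdio.h>

#define GFS2_NBBY	4	/* blocks per byte */
#define GFS2_BIT_SIZE	2
#define GFS2_BIT_MASK	3
#define GFS2_BLKST_FREE	0
#define GFS2_BLKST_USED	1

static unsigned char getbit(const unsigned char *buf, unsigned int blk)
{
	unsigned int bit = (blk % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (buf[blk / GFS2_NBBY] >> bit) & GFS2_BIT_MASK;
}

static void setbit(unsigned char *buf, unsigned int blk, unsigned char state)
{
	unsigned int bit = (blk % GFS2_NBBY) * GFS2_BIT_SIZE;

	/* XOR-ing the changed bits clears the old state and sets the new */
	buf[blk / GFS2_NBBY] ^= (getbit(buf, blk) ^ state) << bit;
}

int main(void)
{
	unsigned char bitmap[2] = { 0 };	/* 8 blocks */

	setbit(bitmap, 5, GFS2_BLKST_USED);
	printf("block 5 state = %u\n", getbit(bitmap, 5));
	return 0;
}
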
@@ -269,15 +270,10 @@
 
 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 {
-	u64 rblock = block - rbm->rgd->rd_data0;
-
-	if (WARN_ON_ONCE(rblock > UINT_MAX))
-		return -EINVAL;
-	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
+	if (!rgrp_contains_block(rbm->rgd, block))
 		return -E2BIG;
-
 	rbm->bii = 0;
-	rbm->offset = (u32)(rblock);
+	rbm->offset = block - rbm->rgd->rd_data0;
 	/* Check if the block is within the first block */
 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
 		return 0;
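
gfs2_rbm_from_block() now leans on rgrp_contains_block() and computes the
in-rgrp offset directly: a resource group covers [rd_data0, rd_data0 +
rd_data), and once containment is established the offset is just the
difference. A sketch of the mapping, with only the two field names taken
from the patch:

#include <stdint.h>
#include <stdio.h>

struct rgrp {
	uint64_t rd_data0;	/* first data block */
	uint32_t rd_data;	/* number of data blocks */
};

static int rgrp_contains_block(const struct rgrp *rgd, uint64_t block)
{
	return block >= rgd->rd_data0 &&
	       block < rgd->rd_data0 + rgd->rd_data;
}

int main(void)
{
	struct rgrp rgd = { .rd_data0 = 1000, .rd_data = 500 };
	uint64_t block = 1234;

	if (rgrp_contains_block(&rgd, block))
		printf("offset in rgrp = %llu\n",
		       (unsigned long long)(block - rgd.rd_data0));
	return 0;
}
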
@@ -382,7 +378,7 @@
 		if (bi->bi_clone)
 			start = bi->bi_clone;
 		start += bi->bi_offset;
-		end = start + bi->bi_len;
+		end = start + bi->bi_bytes;
 		BUG_ON(rbm.offset & 3);
 		start += (rbm.offset / GFS2_NBBY);
 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
@@ -467,7 +463,7 @@
 			count[x] += gfs2_bitcount(rgd,
 						  bi->bi_bh->b_data +
 						  bi->bi_offset,
-						  bi->bi_len, x);
+						  bi->bi_bytes, x);
 	}
 
 	if (count[0] != rgd->rd_free) {
@@ -614,11 +610,12 @@
 	return gfs2_qa_alloc(ip);
 }
 
-static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
+static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
+		    const char *fs_id_buf)
 {
 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
 
-	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
+	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
 		       (unsigned long long)ip->i_no_addr,
 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
 		       rs->rs_rbm.offset, rs->rs_free);
@@ -642,7 +639,10 @@
 	RB_CLEAR_NODE(&rs->rs_node);
 
 	if (rs->rs_free) {
-		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
+				 rs->rs_free - 1;
+		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
+		struct gfs2_bitmap *start, *last;
 
 		/* return reserved blocks to the rgrp */
 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
@@ -653,7 +653,13 @@
 		   it will force the number to be recalculated later. */
 		rgd->rd_extfail_pt += rs->rs_free;
 		rs->rs_free = 0;
-		clear_bit(GBF_FULL, &bi->bi_flags);
+		if (gfs2_rbm_from_block(&last_rbm, last_block))
+			return;
+		start = rbm_bi(&rs->rs_rbm);
+		last = rbm_bi(&last_rbm);
+		do
+			clear_bit(GBF_FULL, &start->bi_flags);
+		while (start++ != last);
 	}
 }
 
@@ -739,11 +745,13 @@
 
 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
 {
-	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
-	pr_info("ri_length = %u\n", rgd->rd_length);
-	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
-	pr_info("ri_data = %u\n", rgd->rd_data);
-	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+	fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
+	fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
+	fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
+	fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
+	fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
 }
 
 /**
@@ -781,21 +789,21 @@
 			bytes = bytes_left;
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
-			bi->bi_len = bytes;
+			bi->bi_bytes = bytes;
 			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* header block */
 		} else if (x == 0) {
 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
-			bi->bi_len = bytes;
+			bi->bi_bytes = bytes;
 			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* last block */
 		} else if (x + 1 == length) {
 			bytes = bytes_left;
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
-			bi->bi_len = bytes;
+			bi->bi_bytes = bytes;
 			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* other blocks */
 		} else {
@@ -803,7 +811,7 @@
 				sizeof(struct gfs2_meta_header);
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
-			bi->bi_len = bytes;
+			bi->bi_bytes = bytes;
 			bi->bi_blocks = bytes * GFS2_NBBY;
 		}
 
@@ -815,11 +823,11 @@
 		return -EIO;
 	}
 	bi = rgd->rd_bits + (length - 1);
-	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
+	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
 		if (gfs2_consist_rgrpd(rgd)) {
 			gfs2_rindex_print(rgd);
 			fs_err(sdp, "start=%u len=%u offset=%u\n",
-			       bi->bi_start, bi->bi_len, bi->bi_offset);
+			       bi->bi_start, bi->bi_bytes, bi->bi_offset);
 		}
 		return -EIO;
 	}
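
The bi_len to bi_bytes rename makes the units explicit: bitmap bytes, each
describing GFS2_NBBY (four) blocks, as opposed to bi_blocks. A sketch of
the arithmetic behind the consistency check above, with made-up block and
header sizes:

#include <stdio.h>

#define GFS2_NBBY 4	/* 2 bits per block: 4 block states per byte */

int main(void)
{
	unsigned int bsize = 4096;		/* made-up block size */
	unsigned int header = 128;		/* made-up rgrp header size */
	unsigned int bi_start = 0;		/* bitmap starts at byte 0 */
	unsigned int bi_bytes = bsize - header;	/* bitmap bytes in block 0 */
	unsigned int bi_blocks = bi_bytes * GFS2_NBBY;
	unsigned int rd_data = bi_blocks;	/* single-bitmap rgrp */

	printf("%u bitmap bytes describe %u blocks\n", bi_bytes, bi_blocks);

	/* the sanity check from the hunk above: the last bitmap must end
	   exactly at rd_data blocks */
	if ((bi_start + bi_bytes) * GFS2_NBBY != rd_data)
		printf("inconsistent rindex entry\n");
	else
		printf("layout consistent\n");
	return 0;
}
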
@@ -1104,12 +1112,36 @@
 {
 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int valid = 1;
 
-	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
-	    rgl->rl_dinodes != str->rg_dinodes ||
-	    rgl->rl_igeneration != str->rg_igeneration)
-		return 0;
-	return 1;
+	if (rgl->rl_flags != str->rg_flags) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
+		valid = 0;
+	}
+	if (rgl->rl_free != str->rg_free) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
+		valid = 0;
+	}
+	if (rgl->rl_dinodes != str->rg_dinodes) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+			be32_to_cpu(rgl->rl_dinodes),
+			be32_to_cpu(str->rg_dinodes));
+		valid = 0;
+	}
+	if (rgl->rl_igeneration != str->rg_igeneration) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
+			(unsigned long long)rgd->rd_addr,
+			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
+			(unsigned long long)be64_to_cpu(str->rg_igeneration));
+		valid = 0;
+	}
+	return valid;
 }
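
Rather than short-circuiting on the first mismatch, the rewritten check
compares every LVB field, warns per mismatch, and returns the aggregate, so
a single pass reports all inconsistencies. The shape of the pattern, with
invented struct and field names:

#include <stdio.h>

struct summary { unsigned int free, dinodes; };

static int summary_valid(const struct summary *a, const struct summary *b)
{
	int valid = 1;

	if (a->free != b->free) {
		fprintf(stderr, "free mismatch %u/%u\n", a->free, b->free);
		valid = 0;	/* keep going: report every field */
	}
	if (a->dinodes != b->dinodes) {
		fprintf(stderr, "dinode mismatch %u/%u\n",
			a->dinodes, b->dinodes);
		valid = 0;
	}
	return valid;
}

int main(void)
{
	struct summary cached = { 10, 3 }, ondisk = { 9, 2 };

	printf("valid=%d\n", summary_valid(&cached, &ondisk));
	return 0;
}
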
 
 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
@@ -1123,8 +1155,8 @@
 		goal = 0;
 		buffer = bi->bi_bh->b_data + bi->bi_offset;
 		WARN_ON(!buffer_uptodate(bi->bi_bh));
-		while (goal < bi->bi_len * GFS2_NBBY) {
-			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
+		while (goal < bi->bi_blocks) {
+			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
 					   GFS2_BLKST_UNLINKED);
 			if (goal == BFITNOENT)
 				break;
@@ -1227,7 +1259,7 @@
 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
 	rl_flags &= ~GFS2_RDF_MASK;
 	rgd->rd_flags &= GFS2_RDF_MASK;
-	rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
 	if (rgd->rd_rgl->rl_unlinked == 0)
 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
@@ -1296,7 +1328,7 @@
 	u32 trimmed = 0;
 	u8 diff;
 
-	for (x = 0; x < bi->bi_len; x++) {
+	for (x = 0; x < bi->bi_bytes; x++) {
 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
 		clone += bi->bi_offset;
 		clone += x;
@@ -1542,8 +1574,8 @@
 	if (S_ISDIR(inode->i_mode))
 		extlen = 1;
 	else {
-		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
-		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
+		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
 	}
 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
 		return;
@@ -1696,25 +1728,22 @@
 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 			 const struct gfs2_inode *ip, bool nowrap)
 {
+	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
 	struct buffer_head *bh;
-	int initial_bii;
-	u32 initial_offset;
-	int first_bii = rbm->bii;
-	u32 first_offset = rbm->offset;
+	int last_bii;
 	u32 offset;
 	u8 *buffer;
-	int n = 0;
-	int iters = rbm->rgd->rd_length;
+	bool wrapped = false;
 	int ret;
 	struct gfs2_bitmap *bi;
 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
 
-	/* If we are not starting at the beginning of a bitmap, then we
-	 * need to add one to the bitmap count to ensure that we search
-	 * the starting bitmap twice.
+	/*
+	 * Determine the last bitmap to search.  If we're not starting at the
+	 * beginning of a bitmap, we need to search that bitmap twice to scan
+	 * the entire resource group.
 	 */
-	if (rbm->offset != 0)
-		iters++;
+	last_bii = rbm->bii - (rbm->offset == 0);
 
 	while(1) {
 		bi = rbm_bi(rbm);
@@ -1728,47 +1757,46 @@
 		WARN_ON(!buffer_uptodate(bh));
 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
-		initial_offset = rbm->offset;
-		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
-		if (offset == BFITNOENT)
-			goto bitmap_full;
+		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
+		if (offset == BFITNOENT) {
+			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
+				set_bit(GBF_FULL, &bi->bi_flags);
+			goto next_bitmap;
+		}
 		rbm->offset = offset;
 		if (ip == NULL)
 			return 0;
 
-		initial_bii = rbm->bii;
 		ret = gfs2_reservation_check_and_update(rbm, ip,
 							minext ? *minext : 0,
 							&maxext);
 		if (ret == 0)
 			return 0;
-		if (ret > 0) {
-			n += (rbm->bii - initial_bii);
+		if (ret > 0)
 			goto next_iter;
-		}
 		if (ret == -E2BIG) {
 			rbm->bii = 0;
 			rbm->offset = 0;
-			n += (rbm->bii - initial_bii);
 			goto res_covered_end_of_rgrp;
 		}
 		return ret;
 
-bitmap_full:	/* Mark bitmap as full and fall through */
-		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
-			set_bit(GBF_FULL, &bi->bi_flags);
-
 next_bitmap:	/* Find next bitmap in the rgrp */
 		rbm->offset = 0;
 		rbm->bii++;
 		if (rbm->bii == rbm->rgd->rd_length)
 			rbm->bii = 0;
 res_covered_end_of_rgrp:
-		if ((rbm->bii == 0) && nowrap)
-			break;
-		n++;
+		if (rbm->bii == 0) {
+			if (wrapped)
+				break;
+			wrapped = true;
+			if (nowrap)
+				break;
+		}
 next_iter:
-		if (n >= iters)
+		/* Have we scanned the entire resource group? */
+		if (wrapped && rbm->bii > last_bii)
 			break;
 	}
 
@@ -1778,8 +1806,8 @@
 	/* If the extent was too small, and it's smaller than the smallest
 	   to have failed before, remember for future reference that it's
 	   useless to search this rgrp again for this amount or more. */
-	if ((first_offset == 0) && (first_bii == 0) &&
-	    (*minext < rbm->rgd->rd_extfail_pt))
+	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
+	    *minext < rbm->rgd->rd_extfail_pt)
 		rbm->rgd->rd_extfail_pt = *minext;
 
 	/* If the maximum extent we found is big enough to fulfill the
@@ -2000,7 +2028,7 @@
  * We try our best to find an rgrp that has at least ap->target blocks
  * available. After a couple of passes (loops == 2), the prospects of finding
  * such an rgrp diminish. At this stage, we return the first rgrp that has
- * atleast ap->min_target blocks available. Either way, we set ap->allowed to
+ * at least ap->min_target blocks available. Either way, we set ap->allowed to
  * the number of blocks available in the chosen rgrp.
  *
  * Returns: 0 on success,
@@ -2054,7 +2082,7 @@
 			}
 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
 						   LM_ST_EXCLUSIVE, flags,
-						   &rs->rs_rgd_gh);
+						   &ip->i_rgd_gh);
 			if (unlikely(error))
 				return error;
 			if (!gfs2_rs_active(rs) && (loops < 2) &&
@@ -2063,13 +2091,13 @@
 			if (sdp->sd_args.ar_rgrplvb) {
 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
 				if (unlikely(error)) {
-					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
 					return error;
 				}
 			}
 		}
 
-		/* Skip unuseable resource groups */
+		/* Skip unusable resource groups */
 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
 						 GFS2_RDF_ERROR)) ||
 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
@@ -2106,7 +2134,7 @@
 
 		/* Unlock rgrp if required */
 		if (!rg_locked)
-			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
 next_rgrp:
 		/* Find the next rgrp, and continue looking */
 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
@@ -2143,10 +2171,8 @@
 
 void gfs2_inplace_release(struct gfs2_inode *ip)
 {
-	struct gfs2_blkreserv *rs = &ip->i_res;
-
-	if (gfs2_holder_initialized(&rs->rs_rgd_gh))
-		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+	if (gfs2_holder_initialized(&ip->i_rgd_gh))
+		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
 }
 
 /**
@@ -2185,27 +2211,21 @@
 /**
  * rgblk_free - Change alloc state of given block(s)
  * @sdp: the filesystem
+ * @rgd: the resource group the blocks are in
  * @bstart: the start of a run of blocks to free
  * @blen: the length of the block run (all must lie within ONE RG!)
  * @new_state: GFS2_BLKST_XXX the after-allocation block state
- *
- * Returns:  Resource group containing the block(s)
  */
 
-static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
-				     u32 blen, unsigned char new_state)
+static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
+		       u64 bstart, u32 blen, unsigned char new_state)
 {
 	struct gfs2_rbm rbm;
 	struct gfs2_bitmap *bi, *bi_prev = NULL;
 
-	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
-	if (!rbm.rgd) {
-		if (gfs2_consist(sdp))
-			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
-		return NULL;
-	}
-
-	gfs2_rbm_from_block(&rbm, bstart);
+	rbm.rgd = rgd;
+	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
+		return;
 	while (blen--) {
 		bi = rbm_bi(&rbm);
 		if (bi != bi_prev) {
@@ -2214,7 +2234,7 @@
 						      GFP_NOFS | __GFP_NOFAIL);
 				memcpy(bi->bi_clone + bi->bi_offset,
 				       bi->bi_bh->b_data + bi->bi_offset,
-				       bi->bi_len);
+				       bi->bi_bytes);
 			}
 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
 			bi_prev = bi;
@@ -2222,18 +2242,18 @@
 		gfs2_setbit(&rbm, false, new_state);
 		gfs2_rbm_incr(&rbm);
 	}
-
-	return rbm.rgd;
 }
 
 /**
  * gfs2_rgrp_dump - print out an rgrp
  * @seq: The iterator
  * @gl: The glock in question
+ * @fs_id_buf: pointer to file system id (if requested)
  *
  */
 
-void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
+		    const char *fs_id_buf)
 {
 	struct gfs2_rgrpd *rgd = gl->gl_object;
 	struct gfs2_blkreserv *trs;
@@ -2241,14 +2261,23 @@
 
 	if (rgd == NULL)
 		return;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
+	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
+		       fs_id_buf,
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
 		       rgd->rd_reserved, rgd->rd_extfail_pt);
+	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
+		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
+
+		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
+			       be32_to_cpu(rgl->rl_flags),
+			       be32_to_cpu(rgl->rl_free),
+			       be32_to_cpu(rgl->rl_dinodes));
+	}
 	spin_lock(&rgd->rd_rsspin);
 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
-		dump_rs(seq, trs);
+		dump_rs(seq, trs, fs_id_buf);
 	}
 	spin_unlock(&rgd->rd_rsspin);
 }
@@ -2256,10 +2285,13 @@
 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
+
 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
 		(unsigned long long)rgd->rd_addr);
 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
-	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
+	gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
 	rgd->rd_flags |= GFS2_RDF_ERROR;
 }
 
@@ -2296,7 +2328,7 @@
 				goto out;
 			/* We used up our block reservation, so we should
 			   reserve more blocks next time. */
-			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
+			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
 		}
 		__rs_deltree(rs);
 	}
@@ -2330,7 +2362,10 @@
 	else
 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
 
-	gfs2_rbm_from_block(rbm, goal);
+	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
+		rbm->bii = 0;
+		rbm->offset = 0;
+	}
 }
 
 /**
@@ -2393,7 +2428,7 @@
 		}
 	}
 	if (rbm.rgd->rd_free < *nblocks) {
-		pr_warn("nblocks=%u\n", *nblocks);
+		fs_warn(sdp, "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
 	}
 
@@ -2410,7 +2445,7 @@
 
 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
 	if (dinode)
-		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
+		gfs2_trans_remove_revoke(sdp, block, *nblocks);
 
 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
 
@@ -2428,20 +2463,19 @@
 /**
  * __gfs2_free_blocks - free a contiguous run of block(s)
  * @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
  * @bstart: first block of a run of contiguous blocks
  * @blen: the length of the block run
  * @meta: 1 if the blocks represent metadata
  *
  */
 
-void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+			u64 bstart, u32 blen, int meta)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd;
 
-	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
-	if (!rgd)
-		return;
+	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
@@ -2456,16 +2490,18 @@
 /**
  * gfs2_free_meta - free a contiguous run of data block(s)
  * @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
  * @bstart: first block of a run of contiguous blocks
  * @blen: the length of the block run
  *
  */
 
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+		    u64 bstart, u32 blen)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 
-	__gfs2_free_blocks(ip, bstart, blen, 1);
+	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
 	gfs2_statfs_change(sdp, 0, +blen, 0);
 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
 }
@@ -2477,9 +2513,10 @@
 	struct gfs2_rgrpd *rgd;
 	u64 blkno = ip->i_no_addr;
 
-	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
 	if (!rgd)
 		return;
+	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -2489,13 +2526,8 @@
 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
-	struct gfs2_rgrpd *tmp_rgd;
 
-	tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE);
-	if (!tmp_rgd)
-		return;
-	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
-
+	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
 	if (!rgd->rd_dinodes)
 		gfs2_consist_rgrpd(rgd);
 	rgd->rd_dinodes--;
@@ -2539,7 +2571,8 @@
 
 	rbm.rgd = rgd;
 	error = gfs2_rbm_from_block(&rbm, no_addr);
-	WARN_ON_ONCE(error != 0);
+	if (WARN_ON_ONCE(error))
+		goto fail;
 
 	if (gfs2_testbit(&rbm, false) != type)
 		error = -ESTALE;
@@ -2625,13 +2658,12 @@
  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
  *      and initialize an array of glock holders for them
  * @rlist: the list of resource groups
- * @state: the lock state to acquire the RG lock in
  *
  * FIXME: Don't use NOFAIL
  *
  */
 
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
 {
 	unsigned int x;
 
@@ -2640,7 +2672,7 @@
 				      GFP_NOFS | __GFP_NOFAIL);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
-				state, 0,
+				LM_ST_EXCLUSIVE, 0,
 				&rlist->rl_ghs[x]);
 }
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index e90478e..c14a673 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __RGRP_DOT_H__
@@ -18,8 +15,7 @@
  * By reserving 32 blocks at a time, we can optimize / shortcut how we search
  * through the bitmaps by looking a word at a time.
  */
-#define RGRP_RSRV_MINBYTES 8
-#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+#define RGRP_RSRV_MINBLKS 32
 #define RGRP_RSRV_ADDBLKS 64
 
 struct gfs2_rgrpd;
@@ -52,8 +48,10 @@
 extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
 extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
-extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
-extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+			       u64 bstart, u32 blen, int meta);
+extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+			   u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
 extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
@@ -68,10 +66,11 @@
 
 extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
 			   u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
 extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
 extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
+			   const char *fs_id_buf);
 extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   struct buffer_head *bh,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c212893..5fa1eec 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -45,258 +42,7 @@
 #include "util.h"
 #include "sys.h"
 #include "xattr.h"
-
-#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
-
-enum {
-	Opt_lockproto,
-	Opt_locktable,
-	Opt_hostdata,
-	Opt_spectator,
-	Opt_ignore_local_fs,
-	Opt_localflocks,
-	Opt_localcaching,
-	Opt_debug,
-	Opt_nodebug,
-	Opt_upgrade,
-	Opt_acl,
-	Opt_noacl,
-	Opt_quota_off,
-	Opt_quota_account,
-	Opt_quota_on,
-	Opt_quota,
-	Opt_noquota,
-	Opt_suiddir,
-	Opt_nosuiddir,
-	Opt_data_writeback,
-	Opt_data_ordered,
-	Opt_meta,
-	Opt_discard,
-	Opt_nodiscard,
-	Opt_commit,
-	Opt_err_withdraw,
-	Opt_err_panic,
-	Opt_statfs_quantum,
-	Opt_statfs_percent,
-	Opt_quota_quantum,
-	Opt_barrier,
-	Opt_nobarrier,
-	Opt_rgrplvb,
-	Opt_norgrplvb,
-	Opt_loccookie,
-	Opt_noloccookie,
-	Opt_error,
-};
-
-static const match_table_t tokens = {
-	{Opt_lockproto, "lockproto=%s"},
-	{Opt_locktable, "locktable=%s"},
-	{Opt_hostdata, "hostdata=%s"},
-	{Opt_spectator, "spectator"},
-	{Opt_spectator, "norecovery"},
-	{Opt_ignore_local_fs, "ignore_local_fs"},
-	{Opt_localflocks, "localflocks"},
-	{Opt_localcaching, "localcaching"},
-	{Opt_debug, "debug"},
-	{Opt_nodebug, "nodebug"},
-	{Opt_upgrade, "upgrade"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_quota_off, "quota=off"},
-	{Opt_quota_account, "quota=account"},
-	{Opt_quota_on, "quota=on"},
-	{Opt_quota, "quota"},
-	{Opt_noquota, "noquota"},
-	{Opt_suiddir, "suiddir"},
-	{Opt_nosuiddir, "nosuiddir"},
-	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_ordered, "data=ordered"},
-	{Opt_meta, "meta"},
-	{Opt_discard, "discard"},
-	{Opt_nodiscard, "nodiscard"},
-	{Opt_commit, "commit=%d"},
-	{Opt_err_withdraw, "errors=withdraw"},
-	{Opt_err_panic, "errors=panic"},
-	{Opt_statfs_quantum, "statfs_quantum=%d"},
-	{Opt_statfs_percent, "statfs_percent=%d"},
-	{Opt_quota_quantum, "quota_quantum=%d"},
-	{Opt_barrier, "barrier"},
-	{Opt_nobarrier, "nobarrier"},
-	{Opt_rgrplvb, "rgrplvb"},
-	{Opt_norgrplvb, "norgrplvb"},
-	{Opt_loccookie, "loccookie"},
-	{Opt_noloccookie, "noloccookie"},
-	{Opt_error, NULL}
-};
-
-/**
- * gfs2_mount_args - Parse mount options
- * @args: The structure into which the parsed options will be written
- * @options: The options to parse
- *
- * Return: errno
- */
-
-int gfs2_mount_args(struct gfs2_args *args, char *options)
-{
-	char *o;
-	int token;
-	substring_t tmp[MAX_OPT_ARGS];
-	int rv;
-
-	/* Split the options into tokens with the "," character and
-	   process them */
-
-	while (1) {
-		o = strsep(&options, ",");
-		if (o == NULL)
-			break;
-		if (*o == '\0')
-			continue;
-
-		token = match_token(o, tokens, tmp);
-		switch (token) {
-		case Opt_lockproto:
-			match_strlcpy(args->ar_lockproto, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_locktable:
-			match_strlcpy(args->ar_locktable, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_hostdata:
-			match_strlcpy(args->ar_hostdata, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_spectator:
-			args->ar_spectator = 1;
-			break;
-		case Opt_ignore_local_fs:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_localflocks:
-			args->ar_localflocks = 1;
-			break;
-		case Opt_localcaching:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_debug:
-			if (args->ar_errors == GFS2_ERRORS_PANIC) {
-				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
-				return -EINVAL;
-			}
-			args->ar_debug = 1;
-			break;
-		case Opt_nodebug:
-			args->ar_debug = 0;
-			break;
-		case Opt_upgrade:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_acl:
-			args->ar_posix_acl = 1;
-			break;
-		case Opt_noacl:
-			args->ar_posix_acl = 0;
-			break;
-		case Opt_quota_off:
-		case Opt_noquota:
-			args->ar_quota = GFS2_QUOTA_OFF;
-			break;
-		case Opt_quota_account:
-			args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			break;
-		case Opt_quota_on:
-		case Opt_quota:
-			args->ar_quota = GFS2_QUOTA_ON;
-			break;
-		case Opt_suiddir:
-			args->ar_suiddir = 1;
-			break;
-		case Opt_nosuiddir:
-			args->ar_suiddir = 0;
-			break;
-		case Opt_data_writeback:
-			args->ar_data = GFS2_DATA_WRITEBACK;
-			break;
-		case Opt_data_ordered:
-			args->ar_data = GFS2_DATA_ORDERED;
-			break;
-		case Opt_meta:
-			args->ar_meta = 1;
-			break;
-		case Opt_discard:
-			args->ar_discard = 1;
-			break;
-		case Opt_nodiscard:
-			args->ar_discard = 0;
-			break;
-		case Opt_commit:
-			rv = match_int(&tmp[0], &args->ar_commit);
-			if (rv || args->ar_commit <= 0) {
-				pr_warn("commit mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_statfs_quantum:
-			rv = match_int(&tmp[0], &args->ar_statfs_quantum);
-			if (rv || args->ar_statfs_quantum < 0) {
-				pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_quota_quantum:
-			rv = match_int(&tmp[0], &args->ar_quota_quantum);
-			if (rv || args->ar_quota_quantum <= 0) {
-				pr_warn("quota_quantum mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_statfs_percent:
-			rv = match_int(&tmp[0], &args->ar_statfs_percent);
-			if (rv || args->ar_statfs_percent < 0 ||
-			    args->ar_statfs_percent > 100) {
-				pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_err_withdraw:
-			args->ar_errors = GFS2_ERRORS_WITHDRAW;
-			break;
-		case Opt_err_panic:
-			if (args->ar_debug) {
-				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
-				return -EINVAL;
-			}
-			args->ar_errors = GFS2_ERRORS_PANIC;
-			break;
-		case Opt_barrier:
-			args->ar_nobarrier = 0;
-			break;
-		case Opt_nobarrier:
-			args->ar_nobarrier = 1;
-			break;
-		case Opt_rgrplvb:
-			args->ar_rgrplvb = 1;
-			break;
-		case Opt_norgrplvb:
-			args->ar_rgrplvb = 0;
-			break;
-		case Opt_loccookie:
-			args->ar_loccookie = 1;
-			break;
-		case Opt_noloccookie:
-			args->ar_loccookie = 0;
-			break;
-		case Opt_error:
-		default:
-			pr_warn("invalid mount option: %s\n", o);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
+#include "lops.h"
 
 /**
  * gfs2_jindex_free - Clear all the journal index information
@@ -396,6 +142,7 @@
 
 fail:
 	kthread_stop(sdp->sd_logd_process);
+	sdp->sd_logd_process = NULL;
 	return error;
 }
 
@@ -425,7 +172,7 @@
 
 	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
-	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+	error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
 	if (error)
 		goto fail;
 
@@ -453,8 +200,12 @@
 	freeze_gh.gh_flags |= GL_NOCACHE;
 	gfs2_glock_dq_uninit(&freeze_gh);
 fail_threads:
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
+	if (sdp->sd_quotad_process)
+		kthread_stop(sdp->sd_quotad_process);
+	sdp->sd_quotad_process = NULL;
+	if (sdp->sd_logd_process)
+		kthread_stop(sdp->sd_logd_process);
+	sdp->sd_logd_process = NULL;
 	return error;
 }
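
The error path above now tolerates being reached with only one of the two gfs2 daemons running: each kthread_stop() is guarded by a NULL check and the task pointer is cleared afterwards, so a second teardown from a later error path (gfs2_make_fs_ro() below uses the same idiom) is a harmless no-op. A minimal userspace sketch of that idempotent-teardown pattern, using pthreads in place of kthreads (all names here are invented for illustration):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct worker {
	pthread_t tid;
	int started;	/* plays the role of the non-NULL task pointer */
};

static void *worker_fn(void *arg)
{
	(void)arg;
	sleep(1);	/* stand-in for the daemon's main loop */
	return NULL;
}

/* Safe to call any number of times, from any error path. */
static void stop_worker(struct worker *w)
{
	if (!w->started)		/* analogous to: if (sdp->sd_logd_process) */
		return;
	pthread_join(w->tid, NULL);	/* analogous to kthread_stop() */
	w->started = 0;			/* analogous to: sdp->sd_logd_process = NULL */
}

int main(void)
{
	struct worker logd = { 0 };

	if (pthread_create(&logd.tid, NULL, worker_fn, NULL) == 0)
		logd.started = 1;
	stop_worker(&logd);	/* normal teardown */
	stop_worker(&logd);	/* duplicate call from an error path: no-op */
	puts("done");
	return 0;
}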
 
@@ -680,7 +431,7 @@
 		error = gfs2_jdesc_check(jd);
 		if (error)
 			break;
-		error = gfs2_find_jhead(jd, &lh);
+		error = gfs2_find_jhead(jd, &lh, false);
 		if (error)
 			break;
 		if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
@@ -802,7 +553,7 @@
 
 	if (!(flags & I_DIRTY_INODE))
 		return;
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)))
 		return;
 	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
 		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -844,20 +595,24 @@
  * Returns: errno
  */
 
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 {
 	struct gfs2_holder freeze_gh;
 	int error;
 
 	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
 				   &freeze_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+	if (error && !test_bit(SDF_WITHDRAWN, &sdp->sd_flags))
 		return error;
 
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-
 	flush_workqueue(gfs2_delete_workqueue);
+	if (sdp->sd_quotad_process)
+		kthread_stop(sdp->sd_quotad_process);
+	sdp->sd_quotad_process = NULL;
+	if (sdp->sd_logd_process)
+		kthread_stop(sdp->sd_logd_process);
+	sdp->sd_logd_process = NULL;
+
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
@@ -971,15 +726,14 @@
 	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
 				   &freeze_gh);
 	if (error) {
-		printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+		fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error);
 		gfs2_assert_withdraw(sdp, 0);
-	}
-	else {
+	} else {
 		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
 		error = thaw_super(sb);
 		if (error) {
-			printk(KERN_INFO "GFS2: couldn't thaw filesystem: %d\n",
-			       error);
+			fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n",
+				error);
 			gfs2_assert_withdraw(sdp, 0);
 		}
 		if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
@@ -987,6 +741,8 @@
 		gfs2_glock_dq_uninit(&freeze_gh);
 	}
 	deactivate_super(sb);
+	clear_bit_unlock(SDF_FS_FROZEN, &sdp->sd_flags);
+	wake_up_bit(&sdp->sd_flags, SDF_FS_FROZEN);
 	return;
 }
 
@@ -1005,7 +761,7 @@
 	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
 		goto out;
 
-	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+	if (test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
 		error = -EINVAL;
 		goto out;
 	}
@@ -1015,20 +771,15 @@
 		if (!error)
 			break;
 
-		switch (error) {
-		case -EBUSY:
+		if (error == -EBUSY)
 			fs_err(sdp, "waiting for recovery before freeze\n");
-			break;
-
-		default:
+		else
 			fs_err(sdp, "error freezing FS: %d\n", error);
-			break;
-		}
 
 		fs_err(sdp, "retrying...\n");
 		msleep(1000);
 	}
-	error = 0;
+	set_bit(SDF_FS_FROZEN, &sdp->sd_flags);
 out:
 	mutex_unlock(&sdp->sd_freeze_mutex);
 	return error;
@@ -1053,7 +804,7 @@
 
 	gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
 	mutex_unlock(&sdp->sd_freeze_mutex);
-	return 0;
+	return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE);
 }
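
Taken together with the earlier hunks, this closes a freeze/thaw race: gfs2_freeze() sets SDF_FS_FROZEN, the deferred gfs2_freeze_func() clears it with clear_bit_unlock() and calls wake_up_bit(), and gfs2_unfreeze() now returns only after wait_on_bit() observes the bit clear, so a thaw can no longer complete while the freeze work is still running. A hedged userspace analogue of this flag-plus-wakeup handshake, with a mutex and condition variable standing in for the kernel's bit-wait API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int fs_frozen;	/* plays the role of SDF_FS_FROZEN */

static void *freeze_func(void *arg)
{
	(void)arg;
	/* ... the deferred freeze work would finish here ... */
	pthread_mutex_lock(&lock);
	fs_frozen = 0;			/* like clear_bit_unlock() */
	pthread_cond_broadcast(&cond);	/* like wake_up_bit() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void unfreeze(void)
{
	pthread_mutex_lock(&lock);
	while (fs_frozen)		/* like wait_on_bit() */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	fs_frozen = 1;			/* set_bit() in the freeze path */
	pthread_create(&t, NULL, freeze_func, NULL);
	unfreeze();			/* blocks until freeze_func() is done */
	pthread_join(t, NULL);
	puts("thawed");
	return 0;
}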
 
 /**
@@ -1224,86 +975,6 @@
 }
 
 /**
- * gfs2_remount_fs - called when the FS is remounted
- * @sb:  the filesystem
- * @flags:  the remount flags
- * @data:  extra data passed in (not used right now)
- *
- * Returns: errno
- */
-
-static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
-	struct gfs2_tune *gt = &sdp->sd_tune;
-	int error;
-
-	sync_filesystem(sb);
-
-	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_logd_secs;
-	args.ar_quota_quantum = gt->gt_quota_quantum;
-	if (gt->gt_statfs_slow)
-		args.ar_statfs_quantum = 0;
-	else
-		args.ar_statfs_quantum = gt->gt_statfs_quantum;
-	spin_unlock(&gt->gt_spin);
-	error = gfs2_mount_args(&args, data);
-	if (error)
-		return error;
-
-	/* Not allowed to change locking details */
-	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
-	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
-	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
-		return -EINVAL;
-
-	/* Some flags must not be changed */
-	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, meta))
-		return -EINVAL;
-
-	if (sdp->sd_args.ar_spectator)
-		*flags |= SB_RDONLY;
-
-	if ((sb->s_flags ^ *flags) & SB_RDONLY) {
-		if (*flags & SB_RDONLY)
-			error = gfs2_make_fs_ro(sdp);
-		else
-			error = gfs2_make_fs_rw(sdp);
-		if (error)
-			return error;
-	}
-
-	sdp->sd_args = args;
-	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= SB_POSIXACL;
-	else
-		sb->s_flags &= ~SB_POSIXACL;
-	if (sdp->sd_args.ar_nobarrier)
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-	else
-		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-	spin_lock(&gt->gt_spin);
-	gt->gt_logd_secs = args.ar_commit;
-	gt->gt_quota_quantum = args.ar_quota_quantum;
-	if (args.ar_statfs_quantum) {
-		gt->gt_statfs_slow = 0;
-		gt->gt_statfs_quantum = args.ar_statfs_quantum;
-	}
-	else {
-		gt->gt_statfs_slow = 1;
-		gt->gt_statfs_quantum = 30;
-	}
-	spin_unlock(&gt->gt_spin);
-
-	gfs2_online_uevent(sdp);
-	return 0;
-}
-
-/**
  * gfs2_drop_inode - Drop an inode (test for remote unlink)
  * @inode: The inode to drop
  *
@@ -1630,8 +1301,6 @@
 			goto out_truncate;
 	}
 
-	/* Case 1 starts here */
-
 	if (S_ISDIR(inode->i_mode) &&
 	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
 		error = gfs2_dir_exhash_dealloc(ip);
@@ -1670,7 +1339,6 @@
 	write_inode_now(inode, 1);
 	gfs2_ail_flush(ip->i_gl, 0);
 
-	/* Case 2 starts here */
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 	if (error)
 		goto out_unlock;
@@ -1680,7 +1348,6 @@
 	gfs2_trans_end(sdp);
 
 out_unlock:
-	/* Error path for case 1 */
 	if (gfs2_rs_active(&ip->i_res))
 		gfs2_rs_deltree(&ip->i_res);
 
@@ -1699,7 +1366,6 @@
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
 		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
 out:
-	/* Case 3 starts here */
 	truncate_inode_pages_final(&inode->i_data);
 	gfs2_rsqa_delete(ip, NULL);
 	gfs2_ordered_del_inode(ip);
@@ -1726,30 +1392,24 @@
 	struct gfs2_inode *ip;
 
 	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
-	if (ip) {
-		ip->i_flags = 0;
-		ip->i_gl = NULL;
-		memset(&ip->i_res, 0, sizeof(ip->i_res));
-		RB_CLEAR_NODE(&ip->i_res.rs_node);
-		ip->i_rahead = 0;
-	}
+	if (!ip)
+		return NULL;
+	ip->i_flags = 0;
+	ip->i_gl = NULL;
+	memset(&ip->i_res, 0, sizeof(ip->i_res));
+	RB_CLEAR_NODE(&ip->i_res.rs_node);
+	ip->i_rahead = 0;
 	return &ip->i_inode;
 }
 
-static void gfs2_i_callback(struct rcu_head *head)
+static void gfs2_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(gfs2_inode_cachep, inode);
-}
-
-static void gfs2_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, gfs2_i_callback);
+	kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
 }
 
 const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
-	.destroy_inode		= gfs2_destroy_inode,
+	.free_inode		= gfs2_free_inode,
 	.write_inode		= gfs2_write_inode,
 	.dirty_inode		= gfs2_dirty_inode,
 	.evict_inode		= gfs2_evict_inode,
@@ -1758,7 +1418,6 @@
 	.freeze_super		= gfs2_freeze,
 	.thaw_super		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
-	.remount_fs		= gfs2_remount_fs,
 	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
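
The gfs2_mount_args() parser deleted above is the classic match_table_t idiom: split the option string on commas with strsep(), classify each token against a table of patterns, and extract the %s/%d/%o arguments. In v5.4 gfs2 moved to the new fs_context mount API, which is also why .remount_fs disappears from gfs2_super_ops — remounts now flow through the same parameter parser as the initial mount. A small standalone sketch of the old pattern, with an invented two-entry table:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

enum { Opt_quota, Opt_debug, Opt_error };

/* Analogue of the deleted match_table_t (patterns invented). */
static const struct { int token; const char *pattern; } tokens[] = {
	{ Opt_quota, "quota=" },	/* takes a string argument */
	{ Opt_debug, "debug" },		/* bare flag */
	{ Opt_error, NULL },
};

static int match_token(const char *o, const char **arg)
{
	int i;

	for (i = 0; tokens[i].pattern; i++) {
		size_t n = strlen(tokens[i].pattern);

		if (strncmp(o, tokens[i].pattern, n))
			continue;
		/* "quota=" style patterns leave the argument after the '=' */
		*arg = tokens[i].pattern[n - 1] == '=' ? o + n : NULL;
		if (*arg || o[n] == '\0')
			return tokens[i].token;
	}
	return Opt_error;
}

int main(void)
{
	char options[] = "quota=on,,debug,bogus";
	char *state = options, *o;
	const char *arg = NULL;

	while ((o = strsep(&state, ",")) != NULL) {
		if (*o == '\0')
			continue;	/* skip empty tokens, as the original did */
		switch (match_token(o, &arg)) {
		case Opt_quota:
			printf("quota -> %s\n", arg);
			break;
		case Opt_debug:
			printf("debug on\n");
			break;
		default:
			fprintf(stderr, "invalid mount option: %s\n", o);
			return 1;
		}
	}
	return 0;
}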
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 73c97dc..b8bf811 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __SUPER_DOT_H__
@@ -27,8 +24,6 @@
 
 extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
-extern int gfs2_mount_args(struct gfs2_args *args, char *data);
-
 extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
@@ -36,6 +31,7 @@
 				     struct gfs2_inode **ipp);
 
 extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
@@ -47,6 +43,8 @@
 extern int gfs2_statfs_sync(struct super_block *sb, int type);
 extern void gfs2_freeze_func(struct work_struct *work);
 
+extern void free_sbd(struct gfs2_sbd *sdp);
+
 extern struct file_system_type gfs2_fs_type;
 extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 1787d29..dd15b8e 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -121,7 +118,7 @@
 
 static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
 {
-	unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
+	unsigned int b = test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
 	return snprintf(buf, PAGE_SIZE, "%u\n", b);
 }
 
@@ -299,17 +296,18 @@
 	&gfs2_attr_demote_rq.attr,
 	NULL,
 };
+ATTRIBUTE_GROUPS(gfs2);
 
 static void gfs2_sbd_release(struct kobject *kobj)
 {
 	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
 
-	kfree(sdp);
+	free_sbd(sdp);
 }
 
 static struct kobj_type gfs2_ktype = {
 	.release = gfs2_sbd_release,
-	.default_attrs = gfs2_attrs,
+	.default_groups = gfs2_groups,
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
@@ -650,7 +648,6 @@
 	char ro[20];
 	char spectator[20];
 	char *envp[] = { ro, spectator, NULL };
-	int sysfs_frees_sdp = 0;
 
 	sprintf(ro, "RDONLY=%d", sb_rdonly(sb));
 	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
@@ -661,8 +658,6 @@
 	if (error)
 		goto fail_reg;
 
-	sysfs_frees_sdp = 1; /* Freeing sdp is now done by sysfs calling
-				function gfs2_sbd_release. */
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
 		goto fail_reg;
@@ -685,12 +680,8 @@
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_reg:
-	free_percpu(sdp->sd_lkstats);
 	fs_err(sdp, "error %d adding sysfs files\n", error);
-	if (sysfs_frees_sdp)
-		kobject_put(&sdp->sd_kobj);
-	else
-		kfree(sdp);
+	kobject_put(&sdp->sd_kobj);
 	sb->s_fs_info = NULL;
 	return error;
 }
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 79182d6..f8dacf2 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __SYS_DOT_H__
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 064c9a0..35e3059 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -74,16 +71,16 @@
 	return error;
 }
 
-static void gfs2_print_trans(const struct gfs2_trans *tr)
+static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
 {
-	pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
-	pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
+	fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip);
+	fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n",
 		tr->tr_blocks, tr->tr_revokes, tr->tr_reserved,
 		test_bit(TR_TOUCHED, &tr->tr_flags));
-	pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+	fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n",
 		tr->tr_num_buf_new, tr->tr_num_buf_rm,
 		tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
-		tr->tr_num_revoke, tr->tr_num_revoke_rm);
+		tr->tr_num_revoke);
 }
 
 void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -109,7 +106,7 @@
 
 	if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) &&
 				       (tr->tr_num_revoke <= tr->tr_revokes)))
-		gfs2_print_trans(tr);
+		gfs2_print_trans(sdp, tr);
 
 	gfs2_log_commit(sdp, tr);
 	if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags))
@@ -124,15 +121,13 @@
 }
 
 static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
-					       struct buffer_head *bh,
-					       const struct gfs2_log_operations *lops)
+					       struct buffer_head *bh)
 {
 	struct gfs2_bufdata *bd;
 
 	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
 	bd->bd_bh = bh;
 	bd->bd_gl = gl;
-	bd->bd_ops = lops;
 	INIT_LIST_HEAD(&bd->bd_list);
 	bh->b_private = bd;
 	return bd;
@@ -169,7 +164,7 @@
 		gfs2_log_unlock(sdp);
 		unlock_buffer(bh);
 		if (bh->b_private == NULL)
-			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+			bd = gfs2_alloc_bufdata(gl, bh);
 		else
 			bd = bh->b_private;
 		lock_buffer(bh);
@@ -210,7 +205,7 @@
 		unlock_buffer(bh);
 		lock_page(bh->b_page);
 		if (bh->b_private == NULL)
-			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+			bd = gfs2_alloc_bufdata(gl, bh);
 		else
 			bd = bh->b_private;
 		unlock_page(bh->b_page);
@@ -225,12 +220,13 @@
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
 	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
-		pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+		fs_err(sdp, "Attempting to add uninitialised block to "
+		       "journal (inplace block=%lld)\n",
 		       (unsigned long long)bd->bd_bh->b_blocknr);
 		BUG();
 	}
 	if (unlikely(state == SFS_FROZEN)) {
-		printk(KERN_INFO "GFS2:adding buf while frozen\n");
+		fs_info(sdp, "GFS2:adding buf while frozen\n");
 		gfs2_assert_withdraw(sdp, 0);
 	}
 	gfs2_pin(sdp, bd->bd_bh);
@@ -254,20 +250,20 @@
 	tr->tr_num_revoke++;
 }
 
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
+void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
 {
 	struct gfs2_bufdata *bd, *tmp;
 	struct gfs2_trans *tr = current->journal_info;
 	unsigned int n = len;
 
 	gfs2_log_lock(sdp);
-	list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) {
+	list_for_each_entry_safe(bd, tmp, &sdp->sd_log_revokes, bd_list) {
 		if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
 			list_del_init(&bd->bd_list);
 			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
 			sdp->sd_log_num_revoke--;
 			kmem_cache_free(gfs2_bufdata_cachep, bd);
-			tr->tr_num_revoke_rm++;
+			tr->tr_num_revoke--;
 			if (--n == 0)
 				break;
 		}
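
gfs2_trans_remove_revoke() — the renamed gfs2_trans_add_unrevoke() — walks sd_log_revokes with list_for_each_entry_safe(), which reads the next entry before the loop body runs so the current one can be unlinked and freed mid-iteration; it also decrements tr_num_revoke directly instead of counting removals in the dropped tr_num_revoke_rm field. A self-contained userspace sketch of the delete-while-iterating idiom over a plain singly linked list (structure invented for the example):

#include <stdio.h>
#include <stdlib.h>

struct revoke {
	unsigned long long blkno;
	struct revoke *next;
};

/*
 * Remove every entry whose block falls in [blkno, blkno + len), freeing as
 * we go.  The next pointer is saved before the current node is freed -- the
 * same reason the kernel loop uses list_for_each_entry_safe().
 */
static void remove_revokes(struct revoke **head, unsigned long long blkno,
			   unsigned int len)
{
	struct revoke **pp = head, *bd, *tmp;

	while ((bd = *pp) != NULL) {
		tmp = bd->next;
		if (bd->blkno >= blkno && bd->blkno < blkno + len) {
			*pp = tmp;	/* unlink, like list_del_init() */
			free(bd);	/* like kmem_cache_free() */
		} else {
			pp = &bd->next;
		}
	}
}

static void push(struct revoke **head, unsigned long long blkno)
{
	struct revoke *bd = malloc(sizeof(*bd));

	bd->blkno = blkno;
	bd->next = *head;
	*head = bd;
}

int main(void)
{
	struct revoke *head = NULL, *bd;

	push(&head, 10); push(&head, 11); push(&head, 42);
	remove_revokes(&head, 10, 2);	/* drops 10 and 11, keeps 42 */
	for (bd = head; bd; bd = bd->next)
		printf("remaining: %llu\n", bd->blkno);
	return 0;
}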
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index ad70087..6071334 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __TRANS_DOT_H__
@@ -44,6 +41,6 @@
 extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
 extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
 extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
+extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
 
 #endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 59c811d..c451591 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -19,6 +16,7 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "glock.h"
+#include "rgrp.h"
 #include "util.h"
 
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
@@ -43,7 +41,7 @@
 	struct va_format vaf;
 
 	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
-	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+	    test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags))
 		return 0;
 
 	if (fmt) {
@@ -180,7 +178,11 @@
 			 const char *function, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
 	int rv;
+
+	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
+	gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
 	rv = gfs2_lm_withdraw(sdp,
 			      "fatal: filesystem consistency error\n"
 			      "  RG = %llu\n"
@@ -256,12 +258,13 @@
 			const char *function, char *file, unsigned int line,
 			bool withdraw)
 {
-	fs_err(sdp,
-	       "fatal: I/O error\n"
-	       "  block = %llu\n"
-	       "  function = %s, file = %s, line = %u\n",
-	       (unsigned long long)bh->b_blocknr,
-	       function, file, line);
+	if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags))
+		fs_err(sdp,
+		       "fatal: I/O error\n"
+		       "  block = %llu\n"
+		       "  function = %s, file = %s, line = %u\n",
+		       (unsigned long long)bh->b_blocknr,
+		       function, file, line);
 	if (withdraw)
 		gfs2_lm_withdraw(sdp, NULL);
 }
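
The fs_id_buf added to gfs2_consist_rgrpd_i() above is sized as sizeof(sd_fsname) + 7: five bytes for "fsid=", two for ": ", with the terminating NUL already covered because sizeof() of the fixed-size name array includes room for its own NUL. A small sketch of the same worst-case buffer arithmetic (array length invented):

#include <stdio.h>

#define FSNAME_LEN 16	/* stand-in for sizeof(sdp->sd_fsname) */

int main(void)
{
	char sd_fsname[FSNAME_LEN] = "cluster1:vol0";
	/*
	 * "fsid=" (5) + name (at most FSNAME_LEN - 1) + ": " (2) + NUL (1)
	 * == FSNAME_LEN + 7, matching the kernel's sizeof(...) + 7.
	 */
	char fs_id_buf[sizeof(sd_fsname) + 7];

	snprintf(fs_id_buf, sizeof(fs_id_buf), "fsid=%s: ", sd_fsname);
	printf("[%s]\n", fs_id_buf);
	return 0;
}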
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 96ac4ab..4b68b2c 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __UTIL_DOT_H__
@@ -86,7 +83,7 @@
 	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
 	u32 magic = be32_to_cpu(mh->mh_magic);
 	if (unlikely(magic != GFS2_MAGIC)) {
-		pr_err("Magic number missing at %llu\n",
+		fs_err(sdp, "Magic number missing at %llu\n",
 		       (unsigned long long)bh->b_blocknr);
 		return -EIO;
 	}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 3851598..bbe593d 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #include <linux/slab.h>
@@ -283,7 +280,7 @@
 			blen++;
 		else {
 			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
+				gfs2_free_meta(ip, rgd, bstart, blen);
 			bstart = bn;
 			blen = 1;
 		}
@@ -292,7 +289,7 @@
 		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
 	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
+		gfs2_free_meta(ip, rgd, bstart, blen);
 
 	if (prev && !leave) {
 		u32 len;
@@ -631,7 +628,7 @@
 	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 	if (error)
 		return error;
-	gfs2_trans_add_unrevoke(sdp, block, 1);
+	gfs2_trans_remove_revoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_meta(ip->i_gl, *bhp);
 	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
@@ -693,7 +690,7 @@
 			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 			if (error)
 				return error;
-			gfs2_trans_add_unrevoke(sdp, block, 1);
+			gfs2_trans_remove_revoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_meta(ip->i_gl, bh);
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
@@ -997,7 +994,7 @@
 		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
 		if (error)
 			return error;
-		gfs2_trans_add_unrevoke(sdp, blk, 1);
+		gfs2_trans_remove_revoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_meta(ip->i_gl, indbh);
 		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
@@ -1250,6 +1247,7 @@
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrp_list rlist;
+	struct gfs2_rgrpd *rgd;
 	struct buffer_head *indbh, *dibh;
 	__be64 *eablk, *end;
 	unsigned int rg_blocks = 0;
@@ -1299,11 +1297,10 @@
 	else
 		goto out;
 
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+	gfs2_rlist_alloc(&rlist);
 
 	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
-
+		rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
 		rg_blocks += rgd->rd_length;
 	}
 
@@ -1320,6 +1317,7 @@
 
 	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
 	bstart = 0;
+	rgd = NULL;
 	blen = 0;
 
 	for (; eablk < end; eablk++) {
@@ -1333,8 +1331,9 @@
 			blen++;
 		else {
 			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
+				gfs2_free_meta(ip, rgd, bstart, blen);
 			bstart = bn;
+			rgd = gfs2_blk2rgrpd(sdp, bstart, true);
 			blen = 1;
 		}
 
@@ -1342,7 +1341,7 @@
 		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
 	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
+		gfs2_free_meta(ip, rgd, bstart, blen);
 
 	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
 
@@ -1391,7 +1390,7 @@
 	if (error)
 		goto out_gunlock;
 
-	gfs2_free_meta(ip, ip->i_eattr, 1);
+	gfs2_free_meta(ip, rgd, ip->i_eattr, 1);
 
 	ip->i_eattr = 0;
 	gfs2_add_inode_blocks(&ip->i_inode, -1);
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 2d887c8..2aed9d7 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
  */
 
 #ifndef __EATTR_DOT_H__
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index 998e3a6..44f6e89 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config HFS_FS
 	tristate "Apple Macintosh file system support"
 	depends on BLOCK
diff --git a/fs/hfs/Makefile b/fs/hfs/Makefile
index c41f5a8..b65459b 100644
--- a/fs/hfs/Makefile
+++ b/fs/hfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux hfs filesystem routines.
 #
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index da25c49..8963965 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -445,6 +445,7 @@
 			/* restore search_key */
 			hfs_bnode_read_key(node, fd->search_key, 14);
 		}
+		new_node = NULL;
 	}
 
 	if (!rec && node->parent)
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9bdff5e..19017d2 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -220,6 +220,30 @@
 	return node;
 }
 
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
+{
+	struct inode *inode = tree->inode;
+	u32 count;
+	int res;
+
+	while (tree->free_nodes < rsvd_nodes) {
+		res = hfs_extend_file(inode);
+		if (res)
+			return res;
+		HFS_I(inode)->phys_size = inode->i_size =
+				(loff_t)HFS_I(inode)->alloc_blocks *
+				HFS_SB(tree->sb)->alloc_blksz;
+		HFS_I(inode)->fs_blocks = inode->i_size >>
+					  tree->sb->s_blocksize_bits;
+		inode_set_bytes(inode, inode->i_size);
+		count = inode->i_size >> tree->node_size_shift;
+		tree->free_nodes += count - tree->node_count;
+		tree->node_count = count;
+	}
+	return 0;
+}
+
 struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 {
 	struct hfs_bnode *node, *next_node;
@@ -229,26 +253,11 @@
 	u16 off16;
 	u16 len;
 	u8 *data, byte, m;
-	int i;
+	int i, res;
 
-	while (!tree->free_nodes) {
-		struct inode *inode = tree->inode;
-		u32 count;
-		int res;
-
-		res = hfs_extend_file(inode);
-		if (res)
-			return ERR_PTR(res);
-		HFS_I(inode)->phys_size = inode->i_size =
-				(loff_t)HFS_I(inode)->alloc_blocks *
-				HFS_SB(tree->sb)->alloc_blksz;
-		HFS_I(inode)->fs_blocks = inode->i_size >>
-					  tree->sb->s_blocksize_bits;
-		inode_set_bytes(inode, inode->i_size);
-		count = inode->i_size >> tree->node_size_shift;
-		tree->free_nodes = count - tree->node_count;
-		tree->node_count = count;
-	}
+	res = hfs_bmap_reserve(tree, 1);
+	if (res)
+		return ERR_PTR(res);
 
 	nidx = 0;
 	node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index c8b252d..dcc2aab 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -82,6 +82,7 @@
 extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
 extern void hfs_btree_close(struct hfs_btree *);
 extern void hfs_btree_write(struct hfs_btree *);
+extern int hfs_bmap_reserve(struct hfs_btree *, int);
 extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
 extern void hfs_bmap_free(struct hfs_bnode *node);
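
hfs_bmap_reserve() turns the old allocate-and-hope loop into an up-front reservation: callers estimate the worst-case number of b-tree map nodes an operation can consume (the catalog.c hunks below reserve 2 * tree->depth for an insert, since a split can propagate through every level and add a new root) and the tree file is extended before any record is touched, so ENOSPC surfaces before the structure is half-modified. A toy model of that reserve-then-consume contract (all names and sizes invented):

#include <stdio.h>

struct node_pool {
	int node_count;
	int free_nodes;
};

/* Grow the pool until it can satisfy rsvd_nodes, like hfs_bmap_reserve(). */
static int reserve_nodes(struct node_pool *p, int rsvd_nodes, int max_nodes)
{
	while (p->free_nodes < rsvd_nodes) {
		if (p->node_count >= max_nodes)
			return -1;	/* the moral equivalent of -ENOSPC */
		p->node_count += 8;	/* one "file extension" step */
		p->free_nodes += 8;
	}
	return 0;
}

static int insert_record(struct node_pool *p, int depth)
{
	/* Worst case: a split at every level plus a new root. */
	if (reserve_nodes(p, 2 * depth, 64))
		return -1;	/* fail early, nothing modified yet */
	/* ... the actual b-tree insert would run here, space guaranteed ... */
	p->free_nodes -= 2;	/* say it really consumed two nodes */
	return 0;
}

int main(void)
{
	struct node_pool pool = { .node_count = 8, .free_nodes = 1 };

	printf("insert: %s\n", insert_record(&pool, 3) ? "ENOSPC" : "ok");
	printf("nodes=%d free=%d\n", pool.node_count, pool.free_nodes);
	return 0;
}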
 
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8a66405..d365bf0 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -97,6 +97,14 @@
 	if (err)
 		return err;
 
+	/*
+	 * Fail early and avoid ENOSPC during the btree operations. We may
+	 * have to split the root node at most once.
+	 */
+	err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+	if (err)
+		goto err2;
+
 	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
 			HFS_CDR_THD : HFS_CDR_FTH,
@@ -295,6 +303,14 @@
 		return err;
 	dst_fd = src_fd;
 
+	/*
+	 * Fail early and avoid ENOSPC during the btree operations. We may
+	 * have to split the root node at most once.
+	 */
+	err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
+	if (err)
+		goto out;
+
 	/* find the old dir entry and read the data */
 	hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
 	err = hfs_brec_find(&src_fd);
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 5d01826..263d502 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -117,6 +117,10 @@
 	if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
 			return res;
+		/* Fail early and avoid ENOSPC during the btree operation */
+		res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+		if (res)
+			return res;
 		hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
 		HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
 	} else {
@@ -300,7 +304,7 @@
 		return 0;
 
 	blocks = 0;
-	for (i = 0; i < 3; extent++, i++)
+	for (i = 0; i < 3; i++)
 		blocks += be16_to_cpu(extent[i].count);
 
 	res = hfs_free_extents(sb, extent, blocks, blocks);
@@ -341,7 +345,9 @@
 	ablock = (u32)block / HFS_SB(sb)->fs_div;
 
 	if (block >= HFS_I(inode)->fs_blocks) {
-		if (block > HFS_I(inode)->fs_blocks || !create)
+		if (!create)
+			return 0;
+		if (block > HFS_I(inode)->fs_blocks)
 			return -EIO;
 		if (ablock >= HFS_I(inode)->alloc_blocks) {
 			res = hfs_extend_file(inode);
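
The hfs_free_fork() loop fixed earlier in this file advanced both the cursor and the index: with "extent++, i++" the body's extent[i].count reads elements 0, 2 and 4 of a three-entry extent record, summing past the end instead of entries 0..2. A minimal reproduction of the stride bug, with plain ints standing in for hfs_extent_rec:

#include <stdio.h>

int main(void)
{
	int count[6] = { 4, 8, 2, 99, 99, 99 };	/* only [0..2] are real */
	int *extent, blocks, i;

	/* Old loop: pointer and index both advance, so the stride is 2. */
	blocks = 0;
	for (extent = count, i = 0; i < 3; extent++, i++)
		blocks += extent[i];		/* reads [0], [2], [4]! */
	printf("buggy sum:   %d\n", blocks);	/* 4 + 2 + 99 = 105 */

	/* Fixed loop, as in the patch: only the index advances. */
	blocks = 0;
	for (extent = count, i = 0; i < 3; i++)
		blocks += extent[i];		/* reads [0], [1], [2] */
	printf("correct sum: %d\n", blocks);	/* 4 + 8 + 2 = 14 */
	return 0;
}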
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a2dfa1b..da243c8 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -642,6 +642,8 @@
 
 		truncate_setsize(inode, attr->ia_size);
 		hfs_file_truncate(inode);
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+						  current_time(inode);
 	}
 
 	setattr_copy(inode, attr);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1738767..c333246 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,20 +167,14 @@
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfs_i_callback(struct rcu_head *head)
+static void hfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
 
-static void hfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, hfs_i_callback);
-}
-
 static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
-	.destroy_inode	= hfs_destroy_inode,
+	.free_inode	= hfs_free_inode,
 	.write_inode	= hfs_write_inode,
 	.evict_inode	= hfs_evict_inode,
 	.put_super	= hfs_put_super,
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index a633718..7d4229a 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config HFSPLUS_FS
 	tristate "Apple Extended HFS file system support"
 	depends on BLOCK
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 2bab6b3..e6d5544 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -217,6 +217,11 @@
 	if (err)
 		goto failed_init_create_attr;
 
+	/* Fail early and avoid ENOSPC during the btree operation */
+	err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1);
+	if (err)
+		goto failed_create_attr;
+
 	if (name) {
 		err = hfsplus_attr_build_key(sb, fd.search_key,
 						inode->i_ino, name);
@@ -313,6 +318,11 @@
 	if (err)
 		return err;
 
+	/* Fail early and avoid ENOSPC during the btree operation */
+	err = hfs_bmap_reserve(fd.tree, fd.tree->depth);
+	if (err)
+		goto out;
+
 	if (name) {
 		err = hfsplus_attr_build_key(sb, fd.search_key,
 						inode->i_ino, name);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index aa17a39..1918544 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -449,6 +449,7 @@
 			/* restore search_key */
 			hfs_bnode_read_key(node, fd->search_key, 14);
 		}
+		new_node = NULL;
 	}
 
 	if (!rec && node->parent)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 3de3bc4..66774f4 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -342,6 +342,34 @@
 	return node;
 }
 
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
+{
+	struct inode *inode = tree->inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	u32 count;
+	int res;
+
+	if (rsvd_nodes <= 0)
+		return 0;
+
+	while (tree->free_nodes < rsvd_nodes) {
+		res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
+		if (res)
+			return res;
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
+		inode_set_bytes(inode, inode->i_size);
+		count = inode->i_size >> tree->node_size_shift;
+		tree->free_nodes += count - tree->node_count;
+		tree->node_count = count;
+	}
+	return 0;
+}
+
 struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 {
 	struct hfs_bnode *node, *next_node;
@@ -351,27 +379,11 @@
 	u16 off16;
 	u16 len;
 	u8 *data, byte, m;
-	int i;
+	int i, res;
 
-	while (!tree->free_nodes) {
-		struct inode *inode = tree->inode;
-		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
-		u32 count;
-		int res;
-
-		res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
-		if (res)
-			return ERR_PTR(res);
-		hip->phys_size = inode->i_size =
-			(loff_t)hip->alloc_blocks <<
-				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
-		hip->fs_blocks =
-			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
-		inode_set_bytes(inode, inode->i_size);
-		count = inode->i_size >> tree->node_size_shift;
-		tree->free_nodes = count - tree->node_count;
-		tree->node_count = count;
-	}
+	res = hfs_bmap_reserve(tree, 1);
+	if (res)
+		return ERR_PTR(res);
 
 	nidx = 0;
 	node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a196369..35472cb 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -265,6 +265,14 @@
 	if (err)
 		return err;
 
+	/*
+	 * Fail early and avoid ENOSPC during the btree operations. We may
+	 * have to split the root node at most once.
+	 */
+	err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+	if (err)
+		goto err2;
+
 	hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry,
 		S_ISDIR(inode->i_mode) ?
@@ -333,6 +341,14 @@
 	if (err)
 		return err;
 
+	/*
+	 * Fail early and avoid ENOSPC during the btree operations. We may
+	 * have to split the root node at most once.
+	 */
+	err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2);
+	if (err)
+		goto out;
+
 	if (!str) {
 		int len;
 
@@ -433,6 +449,14 @@
 		return err;
 	dst_fd = src_fd;
 
+	/*
+	 * Fail early and avoid ENOSPC during the btree operations. We may
+	 * have to split the root node at most twice.
+	 */
+	err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1);
+	if (err)
+		goto out;
+
 	/* find the old dir entry and read the data */
 	err = hfsplus_cat_build_key(sb, src_fd.search_key,
 			src_dir->i_ino, src_name);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f376626..29a9dcf 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -565,6 +565,7 @@
 	.symlink		= hfsplus_symlink,
 	.mknod			= hfsplus_mknod,
 	.rename			= hfsplus_rename,
+	.getattr		= hfsplus_getattr,
 	.listxattr		= hfsplus_listxattr,
 };
 
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 8e0f597..a930ddd 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -100,6 +100,10 @@
 	if (hip->extent_state & HFSPLUS_EXT_NEW) {
 		if (res != -ENOENT)
 			return res;
+		/* Fail early and avoid ENOSPC during the btree operation */
+		res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+		if (res)
+			return res;
 		hfs_brec_insert(fd, hip->cached_extents,
 				sizeof(hfsplus_extent_rec));
 		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
@@ -233,7 +237,9 @@
 	ablock = iblock >> sbi->fs_shift;
 
 	if (iblock >= hip->fs_blocks) {
-		if (iblock > hip->fs_blocks || !create)
+		if (!create)
+			return 0;
+		if (iblock > hip->fs_blocks)
 			return -EIO;
 		if (ablock >= hip->alloc_blocks) {
 			res = hfsplus_file_extend(inode, false);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 8e03943..b8471bf 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -311,6 +311,7 @@
 #define hfs_btree_open hfsplus_btree_open
 #define hfs_btree_close hfsplus_btree_close
 #define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_reserve hfsplus_bmap_reserve
 #define hfs_bmap_alloc hfsplus_bmap_alloc
 #define hfs_bmap_free hfsplus_bmap_free
 #define hfs_bnode_read hfsplus_bnode_read
@@ -395,6 +396,7 @@
 struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
 void hfs_btree_close(struct hfs_btree *tree);
 int hfs_btree_write(struct hfs_btree *tree);
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes);
 struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
 void hfs_bmap_free(struct hfs_bnode *node);
 
@@ -486,6 +488,8 @@
 			      struct hfsplus_fork_raw *fork);
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
 int hfsplus_cat_write_inode(struct inode *inode);
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+		    u32 request_mask, unsigned int query_flags);
 int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8e9427a..d131c8e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -261,6 +261,7 @@
 		}
 		truncate_setsize(inode, attr->ia_size);
 		hfsplus_file_truncate(inode);
+		inode->i_mtime = inode->i_ctime = current_time(inode);
 	}
 
 	setattr_copy(inode, attr);
@@ -269,6 +270,26 @@
 	return 0;
 }
 
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+		    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+
+	if (inode->i_flags & S_APPEND)
+		stat->attributes |= STATX_ATTR_APPEND;
+	if (inode->i_flags & S_IMMUTABLE)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	if (hip->userflags & HFSPLUS_FLG_NODUMP)
+		stat->attributes |= STATX_ATTR_NODUMP;
+
+	stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
+				 STATX_ATTR_NODUMP;
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
 int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync)
 {
@@ -328,6 +349,7 @@
 
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.setattr	= hfsplus_setattr,
+	.getattr	= hfsplus_getattr,
 	.listxattr	= hfsplus_listxattr,
 };
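
hfsplus_getattr() maps the on-disk append/immutable/nodump flags into statx attribute bits and advertises them in attributes_mask, letting userspace distinguish "flag clear" from "flag unsupported". A quick way to observe this from userspace, assuming glibc 2.28+ for the statx() wrapper:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>	/* AT_FDCWD */
#include <sys/stat.h>	/* statx(), STATX_* (glibc >= 2.28) */

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <path on an hfsplus mount>\n", argv[0]);
		return 1;
	}
	if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx)) {
		perror("statx");
		return 1;
	}
	/* Only trust bits the filesystem says it implements. */
	if (stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE)
		printf("immutable: %d\n", !!(stx.stx_attributes & STATX_ATTR_IMMUTABLE));
	if (stx.stx_attributes_mask & STATX_ATTR_APPEND)
		printf("append:    %d\n", !!(stx.stx_attributes & STATX_ATTR_APPEND));
	if (stx.stx_attributes_mask & STATX_ATTR_NODUMP)
		printf("nodump:    %d\n", !!(stx.stx_attributes & STATX_ATTR_NODUMP));
	return 0;
}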
 
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5e6502e..ce15b94 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -57,9 +57,8 @@
 	return 0;
 }
 
-static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
+static inline unsigned int hfsplus_getflags(struct inode *inode)
 {
-	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags = 0;
 
@@ -69,6 +68,13 @@
 		flags |= FS_APPEND_FL;
 	if (hip->userflags & HFSPLUS_FLG_NODUMP)
 		flags |= FS_NODUMP_FL;
+	return flags;
+}
+
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file_inode(file);
+	unsigned int flags = hfsplus_getflags(inode);
 
 	return put_user(flags, user_flags);
 }
@@ -78,6 +84,7 @@
 	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags, new_fl = 0;
+	unsigned int oldflags = hfsplus_getflags(inode);
 	int err = 0;
 
 	err = mnt_want_write_file(file);
@@ -96,13 +103,9 @@
 
 	inode_lock(inode);
 
-	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
-	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			err = -EPERM;
-			goto out_unlock_inode;
-		}
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto out_unlock_inode;
 
 	/* don't silently ignore unsupported ext2 flags */
 	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb4535e..2b9e574 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/hfsplus/super.c
  *
@@ -18,7 +19,7 @@
 #include <linux/nls.h>
 
 static struct inode *hfsplus_alloc_inode(struct super_block *sb);
-static void hfsplus_destroy_inode(struct inode *inode);
+static void hfsplus_free_inode(struct inode *inode);
 
 #include "hfsplus_fs.h"
 #include "xattr.h"
@@ -361,7 +362,7 @@
 
 static const struct super_operations hfsplus_sops = {
 	.alloc_inode	= hfsplus_alloc_inode,
-	.destroy_inode	= hfsplus_destroy_inode,
+	.free_inode	= hfsplus_free_inode,
 	.write_inode	= hfsplus_write_inode,
 	.evict_inode	= hfsplus_evict_inode,
 	.put_super	= hfsplus_put_super,
@@ -628,18 +629,11 @@
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfsplus_i_callback(struct rcu_head *head)
+static void hfsplus_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
 	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
-static void hfsplus_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, hfsplus_i_callback);
-}
-
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
 static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index d5403b4..bb0b27d 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -407,7 +407,7 @@
 	int offset = 0;
 
 	if (!is_known_namespace(xattr_name)) {
-		strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+		memcpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
 		offset += XATTR_MAC_OSX_PREFIX_LEN;
 		len += XATTR_MAC_OSX_PREFIX_LEN;
 	}
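
The strncpy() replaced above copied exactly XATTR_MAC_OSX_PREFIX_LEN bytes into the middle of a larger buffer — deliberately without a NUL terminator at that point, which is exactly the case modern compilers flag with -Wstringop-truncation — so memcpy() states the intent precisely. A standalone illustration of building such a prefixed name (buffer size and sample name invented; the prefix mirrors the hfsplus constant):

#include <stdio.h>
#include <string.h>

#define XATTR_MAC_OSX_PREFIX	"osx."
#define XATTR_MAC_OSX_PREFIX_LEN (sizeof(XATTR_MAC_OSX_PREFIX) - 1)

int main(void)
{
	const char *xattr_name = "com.apple.FinderInfo";
	char buffer[64];
	int offset = 0;

	/* Fixed-length copy into a larger buffer: no NUL wanted yet,
	 * so memcpy() is the honest primitive, not strncpy(). */
	memcpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
	offset += XATTR_MAC_OSX_PREFIX_LEN;

	/* The rest of the name, *with* its terminator this time. */
	memcpy(buffer + offset, xattr_name, strlen(xattr_name) + 1);

	printf("%s\n", buffer);	/* osx.com.apple.FinderInfo */
	return 0;
}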
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 33b8423..f4295aa 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -87,7 +87,7 @@
 extern int hostfs_do_rmdir(const char *file);
 extern int do_mknod(const char *file, int mode, unsigned int major,
 		    unsigned int minor);
-extern int link_file(const char *from, const char *to);
+extern int link_file(const char *to, const char *from);
 extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
 extern int rename2_file(char *from, char *to, unsigned int flags);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 444c7b1..5a7eb0c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -243,17 +243,11 @@
 	}
 }
 
-static void hostfs_i_callback(struct rcu_head *head)
+static void hostfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kfree(HOSTFS_I(inode));
 }
 
-static void hostfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, hostfs_i_callback);
-}
-
 static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
 {
 	const char *root_path = root->d_sb->s_fs_info;
@@ -270,7 +264,7 @@
 
 static const struct super_operations hostfs_sbops = {
 	.alloc_inode	= hostfs_alloc_inode,
-	.destroy_inode	= hostfs_destroy_inode,
+	.free_inode	= hostfs_free_inode,
 	.evict_inode	= hostfs_evict_inode,
 	.statfs		= hostfs_statfs,
 	.show_options	= hostfs_show_options,
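
hostfs is one of several filesystems in this patch (gfs2, hfs, hfsplus, hpfs and hugetlbfs follow the same shape) converted from ->destroy_inode plus a private call_rcu() callback to the ->free_inode method added in v5.2: the VFS core now defers ->free_inode across the RCU grace period itself, so the method shrinks to the bare free. Filesystems with synchronous cleanup keep both hooks — hugetlbfs below retains .destroy_inode for mpol_free_shared_policy() and adds .free_inode for the memory. A hedged userspace sketch of that two-hook contract, with the grace period collapsed to a direct call (all names invented):

#include <stdio.h>
#include <stdlib.h>

struct inode { int ino; };

/* Filesystem-supplied hooks, in the spirit of super_operations. */
struct sops {
	void (*destroy_inode)(struct inode *);	/* runs synchronously at evict */
	void (*free_inode)(struct inode *);	/* runs "after the grace period" */
};

static void my_free_inode(struct inode *i)
{
	printf("free_inode(%d): just release the memory\n", i->ino);
	free(i);
}

static struct sops my_sops = {
	.destroy_inode = NULL,	/* nothing synchronous left to do */
	.free_inode = my_free_inode,
};

/* The "VFS": it, not the filesystem, schedules the deferred free. */
static void vfs_evict(struct sops *s, struct inode *i)
{
	if (s->destroy_inode)
		s->destroy_inode(i);
	/* In the kernel this point is call_rcu(&i->i_rcu, ...); here we
	 * simply pretend the grace period has already elapsed. */
	if (s->free_inode)
		s->free_inode(i);
	else
		free(i);
}

int main(void)
{
	struct inode *i = malloc(sizeof(*i));

	i->ino = 42;
	vfs_evict(&my_sops, i);
	return 0;
}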
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c..56aa033 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
 	depends on BLOCK
diff --git a/fs/hpfs/Makefile b/fs/hpfs/Makefile
index 57b786f..153c173 100644
--- a/fs/hpfs/Makefile
+++ b/fs/hpfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux hpfs filesystem routines.
 #
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 823a328..302f451 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -120,11 +120,11 @@
   u8 bad_sector: 1;			/* bad sector, corrupted disk (???) */
   u8 bad_bitmap: 1;			/* bad bitmap */
   u8 fast: 1;				/* partition was fast formatted */
-  u8 old_wrote: 1;			/* old version wrote to partion */
-  u8 old_wrote_1: 1;			/* old version wrote to partion (?) */
+  u8 old_wrote: 1;			/* old version wrote to partition */
+  u8 old_wrote_1: 1;			/* old version wrote to partition (?) */
 #else
-  u8 old_wrote_1: 1;			/* old version wrote to partion (?) */
-  u8 old_wrote: 1;			/* old version wrote to partion */
+  u8 old_wrote_1: 1;			/* old version wrote to partition (?) */
+  u8 old_wrote: 1;			/* old version wrote to partition */
   u8 fast: 1;				/* partition was fast formatted */
   u8 bad_bitmap: 1;			/* bad bitmap */
   u8 bad_sector: 1;			/* bad sector, corrupted disk (???) */
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index ab2e7cc..1cca832 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -334,7 +334,7 @@
  * local time (HPFS) to GMT (Unix)
  */
 
-static inline time64_t local_to_gmt(struct super_block *s, time32_t t)
+static inline time64_t local_to_gmt(struct super_block *s, time64_t t)
 {
 	extern struct timezone sys_tz;
 	return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift;
@@ -343,9 +343,7 @@
 static inline time32_t gmt_to_local(struct super_block *s, time64_t t)
 {
 	extern struct timezone sys_tz;
-	t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
-
-	return clamp_t(time64_t, t, 0, U32_MAX);
+	return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
 }
 
 static inline time32_t local_get_seconds(struct super_block *s)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f2c3ebc..0a677a9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/hpfs/super.c
  *
@@ -238,17 +239,11 @@
 	return &ei->vfs_inode;
 }
 
-static void hpfs_i_callback(struct rcu_head *head)
+static void hpfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
-static void hpfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, hpfs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -532,7 +527,7 @@
 static const struct super_operations hpfs_sops =
 {
 	.alloc_inode	= hpfs_alloc_inode,
-	.destroy_inode	= hpfs_destroy_inode,
+	.free_inode	= hpfs_free_inode,
 	.evict_inode	= hpfs_evict_inode,
 	.put_super	= hpfs_put_super,
 	.statfs		= hpfs_statfs,
@@ -619,6 +614,8 @@
 	s->s_magic = HPFS_SUPER_MAGIC;
 	s->s_op = &hpfs_sops;
 	s->s_d_op = &hpfs_dentry_operations;
+	s->s_time_min =  local_to_gmt(s, 0);
+	s->s_time_max =  local_to_gmt(s, U32_MAX);
 
 	sbi->sb_root = le32_to_cpu(superblock->root);
 	sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors);
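
HPFS stores timestamps as unsigned 32-bit local-time seconds, so instead of clamping in gmt_to_local() (the clamp_t removed in hpfs_fn.h above, alongside widening local_to_gmt()'s parameter so U32_MAX is not truncated), the superblock now advertises its representable window: s_time_min/s_time_max are the GMT equivalents of local times 0 and U32_MAX, and from v5.4 the VFS enforces those limits centrally. A sketch of the arithmetic, with an arbitrarily chosen timezone offset:

#include <stdio.h>
#include <stdint.h>

/* local HPFS time -> GMT, mirroring local_to_gmt() */
static int64_t local_to_gmt(int64_t t, int tz_minuteswest, int timeshift)
{
	return t + tz_minuteswest * 60 + timeshift;
}

int main(void)
{
	int tz_minuteswest = 300;	/* e.g. UTC-5: minutes west of GMT */
	int timeshift = 0;		/* hpfs "timeshift" mount option */

	/* The window the superblock now reports to the VFS: */
	int64_t s_time_min = local_to_gmt(0, tz_minuteswest, timeshift);
	int64_t s_time_max = local_to_gmt(UINT32_MAX, tz_minuteswest, timeshift);

	printf("s_time_min = %lld\n", (long long)s_time_min);
	printf("s_time_max = %lld\n", (long long)s_time_max);
	return 0;
}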
diff --git a/fs/hugetlbfs/Makefile b/fs/hugetlbfs/Makefile
index 6adf870..d876ecf 100644
--- a/fs/hugetlbfs/Makefile
+++ b/fs/hugetlbfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux ramfs routines.
 #
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a1..a478df0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -27,7 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/hugetlb.h>
 #include <linux/pagevec.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
@@ -45,11 +45,17 @@
 static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
-struct hugetlbfs_config {
+enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+struct hugetlbfs_fs_context {
 	struct hstate		*hstate;
+	unsigned long long	max_size_opt;
+	unsigned long long	min_size_opt;
 	long			max_hpages;
 	long			nr_inodes;
 	long			min_hpages;
+	enum hugetlbfs_size_type max_val_type;
+	enum hugetlbfs_size_type min_val_type;
 	kuid_t			uid;
 	kgid_t			gid;
 	umode_t			mode;
@@ -57,22 +63,30 @@
 
 int sysctl_hugetlb_shm_group;
 
-enum {
-	Opt_size, Opt_nr_inodes,
-	Opt_mode, Opt_uid, Opt_gid,
-	Opt_pagesize, Opt_min_size,
-	Opt_err,
+enum hugetlb_param {
+	Opt_gid,
+	Opt_min_size,
+	Opt_mode,
+	Opt_nr_inodes,
+	Opt_pagesize,
+	Opt_size,
+	Opt_uid,
 };
 
-static const match_table_t tokens = {
-	{Opt_size,	"size=%s"},
-	{Opt_nr_inodes,	"nr_inodes=%s"},
-	{Opt_mode,	"mode=%o"},
-	{Opt_uid,	"uid=%u"},
-	{Opt_gid,	"gid=%u"},
-	{Opt_pagesize,	"pagesize=%s"},
-	{Opt_min_size,	"min_size=%s"},
-	{Opt_err,	NULL},
+static const struct fs_parameter_spec hugetlb_param_specs[] = {
+	fsparam_u32   ("gid",		Opt_gid),
+	fsparam_string("min_size",	Opt_min_size),
+	fsparam_u32   ("mode",		Opt_mode),
+	fsparam_string("nr_inodes",	Opt_nr_inodes),
+	fsparam_string("pagesize",	Opt_pagesize),
+	fsparam_string("size",		Opt_size),
+	fsparam_u32   ("uid",		Opt_uid),
+	{}
+};
+
+static const struct fs_parameter_description hugetlb_fs_parameters = {
+	.name		= "hugetlbfs",
+	.specs		= hugetlb_param_specs,
 };
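
The match_table here gives way to the typed fs_parameter_spec table of the new mount API: fsparam_u32() entries arrive in hugetlbfs_parse_param() below already converted in result.uint_32, while fsparam_string() entries keep the raw string so memparse()-style K/M/G suffixes can be handled by the filesystem. A hedged userspace sketch of such a typed parameter table, where the dispatcher converts values once and handlers see parsed data (all names invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum ptype { P_U32, P_STRING };
enum { Opt_uid, Opt_size };

static const struct { const char *name; enum ptype type; int opt; } specs[] = {
	{ "uid",  P_U32,    Opt_uid  },	/* like fsparam_u32("uid", Opt_uid) */
	{ "size", P_STRING, Opt_size },	/* like fsparam_string("size", Opt_size) */
};

static int parse_param(const char *key, const char *val)
{
	size_t i;

	for (i = 0; i < sizeof(specs) / sizeof(specs[0]); i++) {
		if (strcmp(key, specs[i].name))
			continue;
		if (specs[i].type == P_U32) {
			/* conversion happens here, once, like fs_parse() */
			unsigned long u = strtoul(val, NULL, 10);
			printf("%s -> u32 %lu\n", key, u);
		} else {
			/* string params stay raw for suffix handling */
			printf("%s -> string \"%s\"\n", key, val);
		}
		return specs[i].opt;
	}
	fprintf(stderr, "unknown parameter: %s\n", key);
	return -1;
}

int main(void)
{
	parse_param("uid", "1000");
	parse_param("size", "512M");
	return 0;
}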
 
 #ifdef CONFIG_NUMA
@@ -426,9 +440,7 @@
 			u32 hash;
 
 			index = page->index;
-			hash = hugetlb_fault_mutex_hash(h, current->mm,
-							&pseudo_vma,
-							mapping, index, 0);
+			hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
@@ -485,8 +497,15 @@
 	struct resv_map *resv_map;
 
 	remove_inode_hugepages(inode, 0, LLONG_MAX);
-	resv_map = (struct resv_map *)inode->i_mapping->private_data;
-	/* root inode doesn't have the resv_map, so we should check it */
+
+	/*
+	 * Get the resv_map from the address space embedded in the inode.
+	 * This is the address space which points to any resv_map allocated
+	 * at inode creation time.  If this is a device special inode,
+	 * i_mapping may not point to the original address space.
+	 */
+	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+	/* Only regular and link inodes have associated reserve maps */
 	if (resv_map)
 		resv_map_release(&resv_map->refs);
 	clear_inode(inode);
@@ -530,7 +549,7 @@
 		inode_lock(inode);
 
 		/* protected by i_mutex */
-		if (info->seals & F_SEAL_WRITE) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 			inode_unlock(inode);
 			return -EPERM;
 		}
@@ -625,8 +644,7 @@
 		addr = index * hpage_size;
 
 		/* mutex taken here, fault path and hole punch */
-		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-						index, addr);
+		hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
@@ -708,16 +726,16 @@
 }
 
 static struct inode *hugetlbfs_get_root(struct super_block *sb,
-					struct hugetlbfs_config *config)
+					struct hugetlbfs_fs_context *ctx)
 {
 	struct inode *inode;
 
 	inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_mode = S_IFDIR | config->mode;
-		inode->i_uid = config->uid;
-		inode->i_gid = config->gid;
+		inode->i_mode = S_IFDIR | ctx->mode;
+		inode->i_uid = ctx->uid;
+		inode->i_gid = ctx->gid;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inode->i_op = &hugetlbfs_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
@@ -741,11 +759,17 @@
 					umode_t mode, dev_t dev)
 {
 	struct inode *inode;
-	struct resv_map *resv_map;
+	struct resv_map *resv_map = NULL;
 
-	resv_map = resv_map_alloc();
-	if (!resv_map)
-		return NULL;
+	/*
+	 * Reserve maps are only needed for inodes that can have associated
+	 * page allocations.
+	 */
+	if (S_ISREG(mode) || S_ISLNK(mode)) {
+		resv_map = resv_map_alloc();
+		if (!resv_map)
+			return NULL;
+	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -780,8 +804,10 @@
 			break;
 		}
 		lockdep_annotate_inode_mutex_key(inode);
-	} else
-		kref_put(&resv_map->refs, resv_map_release);
+	} else {
+		if (resv_map)
+			kref_put(&resv_map->refs, resv_map_release);
+	}
 
 	return inode;
 }
@@ -859,6 +885,18 @@
 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
+
+	/*
+	 * page_private is subpool pointer in hugetlb pages.  Transfer to
+	 * new page.  PagePrivate is not associated with page_private for
+	 * hugetlb pages and can not be set here as only page_huge_active
+	 * pages can be migrated.
+	 */
+	if (page_private(page)) {
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+	}
+
 	if (mode != MIGRATE_SYNC_NO_COPY)
 		migrate_page_copy(newpage, page);
 	else
@@ -1017,9 +1055,8 @@
 	return &p->vfs_inode;
 }
 
-static void hugetlbfs_i_callback(struct rcu_head *head)
+static void hugetlbfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
@@ -1027,7 +1064,6 @@
 {
 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
-	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
@@ -1074,6 +1110,7 @@
 
 static const struct super_operations hugetlbfs_ops = {
 	.alloc_inode    = hugetlbfs_alloc_inode,
+	.free_inode     = hugetlbfs_free_inode,
 	.destroy_inode  = hugetlbfs_destroy_inode,
 	.evict_inode	= hugetlbfs_evict_inode,
 	.statfs		= hugetlbfs_statfs,
@@ -1081,8 +1118,6 @@
 	.show_options	= hugetlbfs_show_options,
 };
 
-enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
-
 /*
  * Convert size option passed from command line to number of huge pages
  * in the pool specified by hstate.  Size option could be in bytes
@@ -1105,170 +1140,151 @@
 	return size_opt;
 }
 
-static int
-hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
+/*
+ * Parse one mount parameter.
+ */
+static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p, *rest;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	unsigned long long max_size_opt = 0, min_size_opt = 0;
-	enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE;
+	struct hugetlbfs_fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	char *rest;
+	unsigned long ps;
+	int opt;
 
-	if (!options)
+	opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_uid:
+		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(ctx->uid))
+			goto bad_val;
 		return 0;
 
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
+	case Opt_gid:
+		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(ctx->gid))
+			goto bad_val;
+		return 0;
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_uid:
-			if (match_int(&args[0], &option))
- 				goto bad_val;
-			pconfig->uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(pconfig->uid))
-				goto bad_val;
-			break;
+	case Opt_mode:
+		ctx->mode = result.uint_32 & 01777U;
+		return 0;
 
-		case Opt_gid:
-			if (match_int(&args[0], &option))
- 				goto bad_val;
-			pconfig->gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(pconfig->gid))
-				goto bad_val;
-			break;
+	case Opt_size:
+		/* memparse() will accept a K/M/G without a digit */
+		if (!isdigit(param->string[0]))
+			goto bad_val;
+		ctx->max_size_opt = memparse(param->string, &rest);
+		ctx->max_val_type = SIZE_STD;
+		if (*rest == '%')
+			ctx->max_val_type = SIZE_PERCENT;
+		return 0;
 
-		case Opt_mode:
-			if (match_octal(&args[0], &option))
- 				goto bad_val;
-			pconfig->mode = option & 01777U;
-			break;
+	case Opt_nr_inodes:
+		/* memparse() will accept a K/M/G without a digit */
+		if (!isdigit(param->string[0]))
+			goto bad_val;
+		ctx->nr_inodes = memparse(param->string, &rest);
+		return 0;
 
-		case Opt_size: {
-			/* memparse() will accept a K/M/G without a digit */
-			if (!isdigit(*args[0].from))
-				goto bad_val;
-			max_size_opt = memparse(args[0].from, &rest);
-			max_val_type = SIZE_STD;
-			if (*rest == '%')
-				max_val_type = SIZE_PERCENT;
-			break;
-		}
-
-		case Opt_nr_inodes:
-			/* memparse() will accept a K/M/G without a digit */
-			if (!isdigit(*args[0].from))
-				goto bad_val;
-			pconfig->nr_inodes = memparse(args[0].from, &rest);
-			break;
-
-		case Opt_pagesize: {
-			unsigned long ps;
-			ps = memparse(args[0].from, &rest);
-			pconfig->hstate = size_to_hstate(ps);
-			if (!pconfig->hstate) {
-				pr_err("Unsupported page size %lu MB\n",
-					ps >> 20);
-				return -EINVAL;
-			}
-			break;
-		}
-
-		case Opt_min_size: {
-			/* memparse() will accept a K/M/G without a digit */
-			if (!isdigit(*args[0].from))
-				goto bad_val;
-			min_size_opt = memparse(args[0].from, &rest);
-			min_val_type = SIZE_STD;
-			if (*rest == '%')
-				min_val_type = SIZE_PERCENT;
-			break;
-		}
-
-		default:
-			pr_err("Bad mount option: \"%s\"\n", p);
+	case Opt_pagesize:
+		ps = memparse(param->string, &rest);
+		ctx->hstate = size_to_hstate(ps);
+		if (!ctx->hstate) {
+			pr_err("Unsupported page size %lu MB\n", ps >> 20);
 			return -EINVAL;
-			break;
 		}
+		return 0;
+
+	case Opt_min_size:
+		/* memparse() will accept a K/M/G without a digit */
+		if (!isdigit(param->string[0]))
+			goto bad_val;
+		ctx->min_size_opt = memparse(param->string, &rest);
+		ctx->min_val_type = SIZE_STD;
+		if (*rest == '%')
+			ctx->min_val_type = SIZE_PERCENT;
+		return 0;
+
+	default:
+		return -EINVAL;
 	}
 
+bad_val:
+	return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n",
+		      param->string, param->key);
+}
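For reference, the fs_parse() call above is driven by a parameter table. The
definition of hugetlb_fs_parameters falls outside the hunks shown here; with
the v5.4 mount API it would plausibly look like the following sketch, where
the option names line up with the cases handled above:

/* Plausible sketch of the table assumed by hugetlbfs_parse_param(); the
 * real definition lives earlier in the file, outside these hunks. */
static const struct fs_parameter_spec hugetlb_param_specs[] = {
	fsparam_u32   ("gid",		Opt_gid),
	fsparam_string("min_size",	Opt_min_size),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("pagesize",	Opt_pagesize),
	fsparam_string("size",		Opt_size),
	fsparam_u32   ("uid",		Opt_uid),
	{}
};

static const struct fs_parameter_description hugetlb_fs_parameters = {
	.name	= "hugetlbfs",
	.specs	= hugetlb_param_specs,
};

The size-like options stay strings because the parser feeds them to
memparse() itself rather than letting fs_parse() convert them.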
+
+/*
+ * Validate the parsed options.
+ */
+static int hugetlbfs_validate(struct fs_context *fc)
+{
+	struct hugetlbfs_fs_context *ctx = fc->fs_private;
+
 	/*
 	 * Use huge page pool size (in hstate) to convert the size
 	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
 	 */
-	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
-						max_size_opt, max_val_type);
-	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
-						min_size_opt, min_val_type);
+	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+						   ctx->max_size_opt,
+						   ctx->max_val_type);
+	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+						   ctx->min_size_opt,
+						   ctx->min_val_type);
 
 	/*
 	 * If max_size was specified, then min_size must be smaller
 	 */
-	if (max_val_type > NO_SIZE &&
-	    pconfig->min_hpages > pconfig->max_hpages) {
-		pr_err("minimum size can not be greater than maximum size\n");
+	if (ctx->max_val_type > NO_SIZE &&
+	    ctx->min_hpages > ctx->max_hpages) {
+		pr_err("Minimum size can not be greater than maximum size\n");
 		return -EINVAL;
 	}
 
 	return 0;
-
-bad_val:
-	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
- 	return -EINVAL;
 }
 
 static int
-hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	int ret;
-	struct hugetlbfs_config config;
+	struct hugetlbfs_fs_context *ctx = fc->fs_private;
 	struct hugetlbfs_sb_info *sbinfo;
 
-	config.max_hpages = -1; /* No limit on size by default */
-	config.nr_inodes = -1; /* No limit on number of inodes by default */
-	config.uid = current_fsuid();
-	config.gid = current_fsgid();
-	config.mode = 0755;
-	config.hstate = &default_hstate;
-	config.min_hpages = -1; /* No default minimum size */
-	ret = hugetlbfs_parse_options(data, &config);
-	if (ret)
-		return ret;
-
 	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
 	if (!sbinfo)
 		return -ENOMEM;
 	sb->s_fs_info = sbinfo;
-	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->max_inodes = config.nr_inodes;
-	sbinfo->free_inodes = config.nr_inodes;
-	sbinfo->spool = NULL;
-	sbinfo->uid = config.uid;
-	sbinfo->gid = config.gid;
-	sbinfo->mode = config.mode;
+	sbinfo->hstate		= ctx->hstate;
+	sbinfo->max_inodes	= ctx->nr_inodes;
+	sbinfo->free_inodes	= ctx->nr_inodes;
+	sbinfo->spool		= NULL;
+	sbinfo->uid		= ctx->uid;
+	sbinfo->gid		= ctx->gid;
+	sbinfo->mode		= ctx->mode;
 
 	/*
 	 * Allocate and initialize subpool if maximum or minimum size is
 	 * specified.  Any needed reservations (for minimum size) are taken
 	 * when the subpool is created.
 	 */
-	if (config.max_hpages != -1 || config.min_hpages != -1) {
-		sbinfo->spool = hugepage_new_subpool(config.hstate,
-							config.max_hpages,
-							config.min_hpages);
+	if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
+		sbinfo->spool = hugepage_new_subpool(ctx->hstate,
+						     ctx->max_hpages,
+						     ctx->min_hpages);
 		if (!sbinfo->spool)
 			goto out_free;
 	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = huge_page_size(config.hstate);
-	sb->s_blocksize_bits = huge_page_shift(config.hstate);
+	sb->s_blocksize = huge_page_size(ctx->hstate);
+	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
 	sb->s_magic = HUGETLBFS_MAGIC;
 	sb->s_op = &hugetlbfs_ops;
 	sb->s_time_gran = 1;
-	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
 	if (!sb->s_root)
 		goto out_free;
 	return 0;
@@ -1278,16 +1294,52 @@
 	return -ENOMEM;
 }
 
-static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_tree(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+	int err = hugetlbfs_validate(fc);
+	if (err)
+		return err;
+	return get_tree_nodev(fc, hugetlbfs_fill_super);
+}
+
+static void hugetlbfs_fs_context_free(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations hugetlbfs_fs_context_ops = {
+	.free		= hugetlbfs_fs_context_free,
+	.parse_param	= hugetlbfs_parse_param,
+	.get_tree	= hugetlbfs_get_tree,
+};
+
+static int hugetlbfs_init_fs_context(struct fs_context *fc)
+{
+	struct hugetlbfs_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->max_hpages	= -1; /* No limit on size by default */
+	ctx->nr_inodes	= -1; /* No limit on number of inodes by default */
+	ctx->uid	= current_fsuid();
+	ctx->gid	= current_fsgid();
+	ctx->mode	= 0755;
+	ctx->hstate	= &default_hstate;
+	ctx->min_hpages	= -1; /* No default minimum size */
+	ctx->max_val_type = NO_SIZE;
+	ctx->min_val_type = NO_SIZE;
+	fc->fs_private = ctx;
+	fc->ops	= &hugetlbfs_fs_context_ops;
+	return 0;
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
-	.name		= "hugetlbfs",
-	.mount		= hugetlbfs_mount,
-	.kill_sb	= kill_litter_super,
+	.name			= "hugetlbfs",
+	.init_fs_context	= hugetlbfs_init_fs_context,
+	.parameters		= &hugetlb_fs_parameters,
+	.kill_sb		= kill_litter_super,
 };
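Since the conversion preserves the option grammar, existing userspace keeps
working unchanged. As a hypothetical example (the mount point and option
values are illustrative):

/* Mount a 1 GiB hugetlbfs instance backed by 2 MiB pages. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("none", "/mnt/huge", "hugetlbfs", 0,
		  "pagesize=2M,size=1G,mode=1777") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}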
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1372,8 +1424,29 @@
 	return file;
 }
 
+static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
+{
+	struct fs_context *fc;
+	struct vfsmount *mnt;
+
+	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
+	if (IS_ERR(fc)) {
+		mnt = ERR_CAST(fc);
+	} else {
+		struct hugetlbfs_fs_context *ctx = fc->fs_private;
+		ctx->hstate = h;
+		mnt = fc_mount(fc);
+		put_fs_context(fc);
+	}
+	if (IS_ERR(mnt))
+		pr_err("Cannot mount internal hugetlbfs for page size %uK",
+		       1U << (h->order + PAGE_SHIFT - 10));
+	return mnt;
+}
+
 static int __init init_hugetlbfs_fs(void)
 {
+	struct vfsmount *mnt;
 	struct hstate *h;
 	int error;
 	int i;
@@ -1396,24 +1469,16 @@
 
 	i = 0;
 	for_each_hstate(h) {
-		char buf[50];
-		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
-
-		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
-		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
-							buf);
-
-		if (IS_ERR(hugetlbfs_vfsmount[i])) {
-			pr_err("Cannot mount internal hugetlbfs for "
-				"page size %uK", ps_kb);
-			error = PTR_ERR(hugetlbfs_vfsmount[i]);
-			hugetlbfs_vfsmount[i] = NULL;
+		mnt = mount_one_hugetlbfs(h);
+		if (IS_ERR(mnt) && i == 0) {
+			error = PTR_ERR(mnt);
+			goto out;
 		}
+		hugetlbfs_vfsmount[i] = mnt;
 		i++;
 	}
-	/* Non default hstates are optional */
-	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
-		return 0;
+
+	return 0;
 
  out:
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
diff --git a/fs/inode.c b/fs/inode.c
index 65ae154..fef457a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * (C) 1997 Linus Torvalds
  * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
@@ -10,7 +11,7 @@
 #include <linux/swap.h>
 #include <linux/security.h>
 #include <linux/cdev.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/posix_acl.h>
@@ -180,6 +181,9 @@
 	mapping->flags = 0;
 	mapping->wb_err = 0;
 	atomic_set(&mapping->i_mmap_writable, 0);
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	atomic_set(&mapping->nr_thps, 0);
+#endif
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->private_data = NULL;
 	mapping->writeback_index = 0;
@@ -202,12 +206,28 @@
 }
 EXPORT_SYMBOL(inode_init_always);
 
+void free_inode_nonrcu(struct inode *inode)
+{
+	kmem_cache_free(inode_cachep, inode);
+}
+EXPORT_SYMBOL(free_inode_nonrcu);
+
+static void i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	if (inode->free_inode)
+		inode->free_inode(inode);
+	else
+		free_inode_nonrcu(inode);
+}
+
 static struct inode *alloc_inode(struct super_block *sb)
 {
+	const struct super_operations *ops = sb->s_op;
 	struct inode *inode;
 
-	if (sb->s_op->alloc_inode)
-		inode = sb->s_op->alloc_inode(sb);
+	if (ops->alloc_inode)
+		inode = ops->alloc_inode(sb);
 	else
 		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
@@ -215,22 +235,19 @@
 		return NULL;
 
 	if (unlikely(inode_init_always(sb, inode))) {
-		if (inode->i_sb->s_op->destroy_inode)
-			inode->i_sb->s_op->destroy_inode(inode);
-		else
-			kmem_cache_free(inode_cachep, inode);
+		if (ops->destroy_inode) {
+			ops->destroy_inode(inode);
+			if (!ops->free_inode)
+				return NULL;
+		}
+		inode->free_inode = ops->free_inode;
+		i_callback(&inode->i_rcu);
 		return NULL;
 	}
 
 	return inode;
 }
 
-void free_inode_nonrcu(struct inode *inode)
-{
-	kmem_cache_free(inode_cachep, inode);
-}
-EXPORT_SYMBOL(free_inode_nonrcu);
-
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
@@ -253,20 +270,19 @@
 }
 EXPORT_SYMBOL(__destroy_inode);
 
-static void i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(inode_cachep, inode);
-}
-
 static void destroy_inode(struct inode *inode)
 {
+	const struct super_operations *ops = inode->i_sb->s_op;
+
 	BUG_ON(!list_empty(&inode->i_lru));
 	__destroy_inode(inode);
-	if (inode->i_sb->s_op->destroy_inode)
-		inode->i_sb->s_op->destroy_inode(inode);
-	else
-		call_rcu(&inode->i_rcu, i_callback);
+	if (ops->destroy_inode) {
+		ops->destroy_inode(inode);
+		if (!ops->free_inode)
+			return;
+	}
+	inode->free_inode = ops->free_inode;
+	call_rcu(&inode->i_rcu, i_callback);
 }
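The new contract for filesystems is: ->destroy_inode (if present) runs
synchronously, while ->free_inode (if present) is called from RCU context to
release the memory, as the hugetlbfs conversion above demonstrates. A minimal
hypothetical adopter, with foo_* names standing in for a real filesystem,
might look like this sketch:

/* Sketch only: foo_inode_cachep, struct foo_inode_info and FOO_I()
 * (a container_of() wrapper) are assumed to exist. */
static struct inode *foo_alloc_inode(struct super_block *sb)
{
	struct foo_inode_info *fi;

	fi = kmem_cache_alloc(foo_inode_cachep, GFP_KERNEL);
	if (!fi)
		return NULL;
	return &fi->vfs_inode;
}

/* Runs from the RCU callback; must only free memory. */
static void foo_free_inode(struct inode *inode)
{
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static const struct super_operations foo_sops = {
	.alloc_inode	= foo_alloc_inode,
	.free_inode	= foo_free_inode,	/* replaces open-coded call_rcu() */
};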
 
 /**
@@ -349,7 +365,7 @@
 
 static void __address_space_init_once(struct address_space *mapping)
 {
-	INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
+	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
@@ -730,11 +746,8 @@
 		return LRU_REMOVED;
 	}
 
-	/*
-	 * Recently referenced inodes and inodes with many attached pages
-	 * get one more pass.
-	 */
-	if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
 		inode->i_state &= ~I_REFERENCED;
 		spin_unlock(&inode->i_lock);
 		return LRU_ROTATE;
@@ -1604,7 +1617,7 @@
  * passed since the last atime update.
  */
 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
-			     struct timespec now)
+			     struct timespec64 now)
 {
 
 	if (!(mnt->mnt_flags & MNT_RELATIME))
@@ -1705,7 +1718,7 @@
 
 	now = current_time(inode);
 
-	if (!relatime_need_update(mnt, inode, timespec64_to_timespec(now)))
+	if (!relatime_need_update(mnt, inode, now))
 		return false;
 
 	if (timespec64_equal(&inode->i_atime, &now))
@@ -1820,8 +1833,13 @@
 	int kill;
 	int error = 0;
 
-	/* Fast path for nothing security related */
-	if (IS_NOSEC(inode))
+	/*
+	 * Fast path for nothing security related.
+	 * As well for non-regular files, e.g. blkdev inodes.
+	 * For example, blkdev_write_iter() might get here
+	 * trying to remove privs which it is not allowed to.
+	 */
+	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
 		return 0;
 
 	kill = dentry_needs_remove_privs(dentry);
@@ -1884,6 +1902,26 @@
 }
 EXPORT_SYMBOL(file_update_time);
 
+/* Caller must hold the file's inode lock */
+int file_modified(struct file *file)
+{
+	int err;
+
+	/*
+	 * Clear the security bits if the process is not being run by root.
+	 * This keeps people from modifying setuid and setgid binaries.
+	 */
+	err = file_remove_privs(file);
+	if (err)
+		return err;
+
+	if (unlikely(file->f_mode & FMODE_NOCMTIME))
+		return 0;
+
+	return file_update_time(file);
+}
+EXPORT_SYMBOL(file_modified);
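file_modified() bundles the two pieces of bookkeeping a write path owes the
VFS: dropping setuid/setgid privileges and updating the timestamps. A hedged
sketch of a caller, with foo_file_write_iter() as a hypothetical ->write_iter
(ki_pos bookkeeping and short-write handling elided):

static ssize_t foo_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);	/* file_modified() requires the inode lock */
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	ret = file_modified(iocb->ki_filp);
	if (ret)
		goto out;

	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
out:
	inode_unlock(inode);
	return ret;
}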
+
 int inode_needs_sync(struct inode *inode)
 {
 	if (IS_SYNC(inode))
@@ -2096,14 +2134,8 @@
 void inode_set_flags(struct inode *inode, unsigned int flags,
 		     unsigned int mask)
 {
-	unsigned int old_flags, new_flags;
-
 	WARN_ON_ONCE(flags & ~mask);
-	do {
-		old_flags = READ_ONCE(inode->i_flags);
-		new_flags = (old_flags & ~mask) | flags;
-	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
-				  new_flags) != old_flags));
+	set_mask_bits(&inode->i_flags, mask, flags);
 }
 EXPORT_SYMBOL(inode_set_flags);
 
@@ -2138,6 +2170,37 @@
 EXPORT_SYMBOL(timespec64_trunc);
 
 /**
+ * timestamp_truncate - Truncate timespec to a granularity
+ * @t: Timespec
+ * @inode: inode being updated
+ *
+ * Truncate a timespec to the granularity supported by the fs
+ * containing the inode. Always rounds down. gran must
+ * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
+ */
+struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned int gran = sb->s_time_gran;
+
+	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
+	if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
+		t.tv_nsec = 0;
+
+	/* Avoid division in the common cases 1 ns and 1 s. */
+	if (gran == 1)
+		; /* nothing */
+	else if (gran == NSEC_PER_SEC)
+		t.tv_nsec = 0;
+	else if (gran > 1 && gran < NSEC_PER_SEC)
+		t.tv_nsec -= t.tv_nsec % gran;
+	else
+		WARN(1, "invalid file time granularity: %u", gran);
+	return t;
+}
+EXPORT_SYMBOL(timestamp_truncate);
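As a concrete illustration of the rounding (a standalone userspace sketch,
not kernel code), the nanosecond field is simply rounded down to a multiple
of the granularity:

#include <stdio.h>

/* Mirrors the tv_nsec handling above; gran is s_time_gran in ns. */
static long truncate_nsec(long nsec, unsigned int gran)
{
	if (gran == 1)
		return nsec;			/* 1 ns resolution */
	if (gran == 1000000000U)
		return 0;			/* whole-second resolution */
	return nsec - nsec % gran;		/* general case */
}

int main(void)
{
	printf("%ld\n", truncate_nsec(123456789, 1));		/* 123456789 */
	printf("%ld\n", truncate_nsec(123456789, 1000));	/* 123456000 */
	printf("%ld\n", truncate_nsec(123456789, 1000000000));	/* 0 */
	return 0;
}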
+
+/**
  * current_time - Return FS time
  * @inode: inode.
  *
@@ -2149,13 +2212,101 @@
  */
 struct timespec64 current_time(struct inode *inode)
 {
-	struct timespec64 now = current_kernel_time64();
+	struct timespec64 now;
+
+	ktime_get_coarse_real_ts64(&now);
 
 	if (unlikely(!inode->i_sb)) {
 		WARN(1, "current_time() called with uninitialized super_block in the inode");
 		return now;
 	}
 
-	return timespec64_trunc(now, inode->i_sb->s_time_gran);
+	return timestamp_truncate(now, inode);
 }
 EXPORT_SYMBOL(current_time);
+
+/*
+ * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
+ * configurations.
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that they have
+ * exclusive access to the inode structure.
+ */
+int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
+			     unsigned int flags)
+{
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 *
+	 * This test looks nicer. Thanks to Pauline Middelink
+	 */
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	return 0;
+}
+EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
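This helper centralizes the CAP_LINUX_IMMUTABLE check that filesystems used
to open-code in their FS_IOC_SETFLAGS handlers. A hedged sketch of a caller,
where the foo_* conversion helpers are assumptions about the surrounding
filesystem:

static int foo_ioc_setflags(struct inode *inode, unsigned int fsflags)
{
	unsigned int oldflags;
	int ret;

	inode_lock(inode);
	oldflags = foo_iflags_to_fsflags(inode);	/* assumed helper */
	ret = vfs_ioc_setflags_prepare(inode, oldflags, fsflags);
	if (ret)
		goto out;

	foo_apply_fsflags(inode, fsflags);		/* assumed helper */
	inode->i_ctime = current_time(inode);
	mark_inode_dirty(inode);
out:
	inode_unlock(inode);
	return ret;
}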
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
+ * configurations.
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that they have
+ * exclusive access to the inode structure.
+ */
+int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
+			     struct fsxattr *fa)
+{
+	/*
+	 * Can't modify an immutable/append-only file unless we have
+	 * appropriate permission.
+	 */
+	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
+			(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() != &init_user_ns) {
+		if (old_fa->fsx_projid != fa->fsx_projid)
+			return -EINVAL;
+		if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
+				FS_XFLAG_PROJINHERIT)
+			return -EINVAL;
+	}
+
+	/* Check extent size hints. */
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+			!S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	/*
+	 * It is only valid to set the DAX flag on regular files and
+	 * directories.
+	 */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+		return -EINVAL;
+
+	/* Extent size hints of zero turn off the flags. */
+	if (fa->fsx_extsize == 0)
+		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+	if (fa->fsx_cowextsize == 0)
+		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/fs/internal.h b/fs/internal.h
index d410186..315fcd8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* fs/ internal definitions
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 struct super_block;
@@ -17,6 +13,8 @@
 struct path;
 struct mount;
 struct shrink_control;
+struct fs_context;
+struct user_namespace;
 
 /*
  * block_dev.c
@@ -43,8 +41,6 @@
 extern void guard_bio_eod(int rw, struct bio *bio);
 extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block, struct iomap *iomap);
-int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
-		struct page *page);
 
 /*
  * char_dev.c
@@ -52,8 +48,19 @@
 extern void __init chrdev_init(void);
 
 /*
+ * fs_context.c
+ */
+extern const struct fs_context_operations legacy_fs_context_ops;
+extern int parse_monolithic_mount_data(struct fs_context *, void *);
+extern void fc_drop_locked(struct fs_context *);
+extern void vfs_clean_context(struct fs_context *fc);
+extern int finish_clean_context(struct fs_context *fc);
+
+/*
  * namei.c
  */
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+			   struct path *path, struct path *root);
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
@@ -80,11 +87,10 @@
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);
 
+extern void dissolve_on_fput(struct vfsmount *);
 /*
  * fs_struct.c
  */
@@ -99,11 +105,10 @@
 /*
  * super.c
  */
-extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int reconfigure_super(struct fs_context *);
 extern bool trylock_super(struct super_block *sb);
-extern struct dentry *mount_fs(struct file_system_type *,
-			       int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
+extern bool mount_capable(struct fs_context *);
 
 /*
  * open.c
@@ -150,6 +155,10 @@
 extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 extern struct dentry *d_alloc_cursor(struct dentry *);
+extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
+extern char *simple_dname(struct dentry *, char *, int);
+extern void dput_to_list(struct dentry *, struct list_head *);
+extern void shrink_dentry_list(struct list_head *);
 
 /*
  * read_write.c
@@ -178,15 +187,5 @@
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 
-/*
- * iomap support:
- */
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
-loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, const struct iomap_ops *ops, void *data,
-		iomap_actor_t actor);
-
 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 0000000..cbe8dab
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,4040 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side.
+ *
+ * After the application reads the CQ ring tail, it must use an
+ * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
+ * before writing the tail (using smp_load_acquire to read the tail will
+ * do). It also needs an smp_mb() before updating CQ head (ordering the
+ * entry load(s) with the head store), pairing with an implicit barrier
+ * through a control-dependency in io_get_cqring (smp_store_release to
+ * store head will do). Failure to do so could lead to reading invalid
+ * CQ entries.
+ *
+ * Likewise, the application must use an appropriate smp_wmb() before
+ * writing the SQ tail (ordering SQ entry stores with the tail store),
+ * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
+ * to store the tail will do). And it needs a barrier ordering the SQ
+ * head load before writing new SQ entries (smp_load_acquire to read
+ * head will do).
+ *
+ * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
+ * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
+ * updating the SQ tail; a full memory barrier smp_mb() is needed
+ * between the two.
+ *
+ * Also see the examples in the liburing library:
+ *
+ *	git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * from data shared between the kernel and application. This is done both
+ * for ordering purposes and to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
+ */
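On the application side, the CQ half of this protocol can be sketched with
C11 atomics. This is a hypothetical userspace fragment, assuming the head,
tail, mask and cqes pointers were derived from the IORING_OFF_CQ_RING mmap
via io_cqring_offsets:

#include <stdatomic.h>
#include <stdint.h>
#include <linux/io_uring.h>

struct app_cq {
	_Atomic uint32_t *khead;	/* written by the application */
	_Atomic uint32_t *ktail;	/* written by the kernel */
	uint32_t ring_mask;
	struct io_uring_cqe *cqes;
};

static int reap_one_cqe(struct app_cq *cq, struct io_uring_cqe *out)
{
	uint32_t head = atomic_load_explicit(cq->khead, memory_order_relaxed);

	/* acquire pairs with the kernel's release store of the tail */
	if (head == atomic_load_explicit(cq->ktail, memory_order_acquire))
		return 0;		/* ring empty */

	*out = cq->cqes[head & cq->ring_mask];

	/* release orders the entry load before publishing the new head */
	atomic_store_explicit(cq->khead, head + 1, memory_order_release);
	return 1;
}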
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/bvec.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES	32768
+#define IORING_MAX_FIXED_FILES	1024
+
+struct io_uring {
+	u32 head ____cacheline_aligned_in_smp;
+	u32 tail ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
+struct io_rings {
+	/*
+	 * Head and tail offsets into the ring; the offsets need to be
+	 * masked to get valid indices.
+	 *
+	 * The kernel controls head of the sq ring and the tail of the cq ring,
+	 * and the application controls tail of the sq ring and the head of the
+	 * cq ring.
+	 */
+	struct io_uring		sq, cq;
+	/*
+	 * Bitmasks to apply to head and tail offsets (constant, equals
+	 * ring_entries - 1)
+	 */
+	u32			sq_ring_mask, cq_ring_mask;
+	/* Ring sizes (constant, power of 2) */
+	u32			sq_ring_entries, cq_ring_entries;
+	/*
+	 * Number of invalid entries dropped by the kernel due to
+	 * invalid index stored in array
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * After a new SQ head value was read by the application this
+	 * counter includes all submissions that were dropped reaching
+	 * the new SQ head (and possibly more).
+	 */
+	u32			sq_dropped;
+	/*
+	 * Runtime flags
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application.
+	 *
+	 * The application needs a full memory barrier before checking
+	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+	 */
+	u32			sq_flags;
+	/*
+	 * Number of completion events lost because the queue was full;
+	 * this should be avoided by the application by making sure
+	 * there are not more requests pending than there is space in
+	 * the completion queue.
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * As completion events come in out of order this counter is not
+	 * ordered with any other data.
+	 */
+	u32			cq_overflow;
+	/*
+	 * Ring buffer of completion events.
+	 *
+	 * The kernel writes completion events fresh every time they are
+	 * produced, so the application is allowed to modify pending
+	 * entries.
+	 */
+	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
+};
+
+struct io_mapped_ubuf {
+	u64		ubuf;
+	size_t		len;
+	struct		bio_vec *bvec;
+	unsigned int	nr_bvecs;
+};
+
+struct async_list {
+	spinlock_t		lock;
+	atomic_t		cnt;
+	struct list_head	list;
+
+	struct file		*file;
+	off_t			io_start;
+	size_t			io_len;
+};
+
+struct io_ring_ctx {
+	struct {
+		struct percpu_ref	refs;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned int		flags;
+		bool			compat;
+		bool			account_mem;
+
+		/*
+		 * Ring buffer of indices into array of io_uring_sqe, which is
+		 * mmapped by the application using the IORING_OFF_SQES offset.
+		 *
+		 * This indirection could e.g. be used to assign fixed
+		 * io_uring_sqe entries to operations and only submit them to
+		 * the queue when needed.
+		 *
+		 * The kernel modifies neither the indices array nor the entries
+		 * array.
+		 */
+		u32			*sq_array;
+		unsigned		cached_sq_head;
+		unsigned		sq_entries;
+		unsigned		sq_mask;
+		unsigned		sq_thread_idle;
+		unsigned		cached_sq_dropped;
+		struct io_uring_sqe	*sq_sqes;
+
+		struct list_head	defer_list;
+		struct list_head	timeout_list;
+	} ____cacheline_aligned_in_smp;
+
+	/* IO offload */
+	struct workqueue_struct	*sqo_wq[2];
+	struct task_struct	*sqo_thread;	/* if using sq thread polling */
+	struct mm_struct	*sqo_mm;
+	wait_queue_head_t	sqo_wait;
+	struct completion	sqo_thread_started;
+
+	struct {
+		unsigned		cached_cq_tail;
+		atomic_t		cached_cq_overflow;
+		unsigned		cq_entries;
+		unsigned		cq_mask;
+		struct wait_queue_head	cq_wait;
+		struct fasync_struct	*cq_fasync;
+		struct eventfd_ctx	*cq_ev_fd;
+		atomic_t		cq_timeouts;
+	} ____cacheline_aligned_in_smp;
+
+	struct io_rings	*rings;
+
+	/*
+	 * If used, fixed file set. Writers must ensure that ->refs is dead,
+	 * readers must ensure that ->refs is alive as long as the file* is
+	 * used. Only updated through io_uring_register(2).
+	 */
+	struct file		**user_files;
+	unsigned		nr_user_files;
+
+	/* if used, fixed mapped user buffers */
+	unsigned		nr_user_bufs;
+	struct io_mapped_ubuf	*user_bufs;
+
+	struct user_struct	*user;
+
+	struct cred		*creds;
+
+	struct completion	ctx_done;
+
+	struct {
+		struct mutex		uring_lock;
+		wait_queue_head_t	wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t		completion_lock;
+		bool			poll_multi_file;
+		/*
+		 * ->poll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		struct list_head	poll_list;
+		struct list_head	cancel_list;
+	} ____cacheline_aligned_in_smp;
+
+	struct async_list	pending_async[2];
+
+#if defined(CONFIG_UNIX)
+	struct socket		*ring_sock;
+#endif
+};
+
+struct sqe_submit {
+	const struct io_uring_sqe	*sqe;
+	unsigned short			index;
+	u32				sequence;
+	bool				has_user;
+	bool				needs_lock;
+	bool				needs_fixed_file;
+};
+
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				done;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
+struct io_timeout {
+	struct file			*file;
+	struct hrtimer			timer;
+};
+
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
+struct io_kiocb {
+	union {
+		struct file		*file;
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+		struct io_timeout	timeout;
+	};
+
+	struct sqe_submit	submit;
+
+	struct io_ring_ctx	*ctx;
+	struct list_head	list;
+	struct list_head	link_list;
+	unsigned int		flags;
+	refcount_t		refs;
+#define REQ_F_NOWAIT		1	/* must not punt to workers */
+#define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
+#define REQ_F_FIXED_FILE	4	/* ctx owns file */
+#define REQ_F_SEQ_PREV		8	/* sequential with previous */
+#define REQ_F_IO_DRAIN		16	/* drain existing IO first */
+#define REQ_F_IO_DRAINED	32	/* drain done */
+#define REQ_F_LINK		64	/* linked sqes */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
+#define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_TIMEOUT		1024	/* timeout request */
+#define REQ_F_ISREG		2048	/* regular file */
+#define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
+#define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
+	u64			user_data;
+	u32			result;
+	u32			sequence;
+
+	struct work_struct	work;
+};
+
+#define IO_PLUG_THRESHOLD		2
+#define IO_IOPOLL_BATCH			8
+
+struct io_submit_state {
+	struct blk_plug		plug;
+
+	/*
+	 * io_kiocb alloc cache
+	 */
+	void			*reqs[IO_IOPOLL_BATCH];
+	unsigned int		free_reqs;
+	unsigned int		cur_req;
+
+	/*
+	 * File reference cache
+	 */
+	struct file		*file;
+	unsigned int		fd;
+	unsigned int		has_refs;
+	unsigned int		used_refs;
+	unsigned int		ios_left;
+};
+
+static void io_sq_wq_submit_work(struct work_struct *work);
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res);
+static void __io_free_req(struct io_kiocb *req);
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+	if (file->f_op == &io_uring_fops) {
+		struct io_ring_ctx *ctx = file->private_data;
+
+		return ctx->ring_sock->sk;
+	}
+#endif
+	return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+	complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+	struct io_ring_ctx *ctx;
+	int i;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
+			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	ctx->flags = p->flags;
+	init_waitqueue_head(&ctx->cq_wait);
+	init_completion(&ctx->ctx_done);
+	init_completion(&ctx->sqo_thread_started);
+	mutex_init(&ctx->uring_lock);
+	init_waitqueue_head(&ctx->wait);
+	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
+		spin_lock_init(&ctx->pending_async[i].lock);
+		INIT_LIST_HEAD(&ctx->pending_async[i].list);
+		atomic_set(&ctx->pending_async[i].cnt, 0);
+	}
+	spin_lock_init(&ctx->completion_lock);
+	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
+	INIT_LIST_HEAD(&ctx->defer_list);
+	INIT_LIST_HEAD(&ctx->timeout_list);
+	return ctx;
+}
+
+static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
+				       struct io_kiocb *req)
+{
+	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
+					+ atomic_read(&ctx->cached_cq_overflow);
+}
+
+static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
+				     struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+		return false;
+
+	return __io_sequence_defer(ctx, req);
+}
+
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
+	if (req && !io_sequence_defer(ctx, req)) {
+		list_del_init(&req->list);
+		return req;
+	}
+
+	return NULL;
+}
+
+static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
+	if (req) {
+		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+			return NULL;
+		if (!__io_sequence_defer(ctx, req)) {
+			list_del_init(&req->list);
+			return req;
+		}
+	}
+
+	return NULL;
+}
+
+static void __io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+
+	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
+		/* order cqe stores with ring update */
+		smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
+
+		if (wq_has_sleeper(&ctx->cq_wait)) {
+			wake_up_interruptible(&ctx->cq_wait);
+			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+		}
+	}
+}
+
+static inline void io_queue_async_work(struct io_ring_ctx *ctx,
+				       struct io_kiocb *req)
+{
+	int rw = 0;
+
+	if (req->submit.sqe) {
+		switch (req->submit.sqe->opcode) {
+		case IORING_OP_WRITEV:
+		case IORING_OP_WRITE_FIXED:
+			rw = !(req->rw.ki_flags & IOCB_DIRECT);
+			break;
+		}
+	}
+
+	queue_work(ctx->sqo_wq[rw], &req->work);
+}
+
+static void io_kill_timeout(struct io_kiocb *req)
+{
+	int ret;
+
+	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	if (ret != -1) {
+		atomic_inc(&req->ctx->cq_timeouts);
+		list_del(&req->list);
+		io_cqring_fill_event(req->ctx, req->user_data, 0);
+		__io_free_req(req);
+	}
+}
+
+static void io_kill_timeouts(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req, *tmp;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+		io_kill_timeout(req);
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	while ((req = io_get_timeout_req(ctx)) != NULL)
+		io_kill_timeout(req);
+
+	__io_commit_cqring(ctx);
+
+	while ((req = io_get_deferred_req(ctx)) != NULL) {
+		if (req->flags & REQ_F_SHADOW_DRAIN) {
+			/* Just for drain, free it. */
+			__io_free_req(req);
+			continue;
+		}
+		req->flags |= REQ_F_IO_DRAINED;
+		io_queue_async_work(ctx, req);
+	}
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+	unsigned tail;
+
+	tail = ctx->cached_cq_tail;
+	/*
+	 * writes to the cq entry need to come after reading head; the
+	 * control dependency is enough as we're using WRITE_ONCE to
+	 * fill the cq entry
+	 */
+	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
+		return NULL;
+
+	ctx->cached_cq_tail++;
+	return &rings->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res)
+{
+	struct io_uring_cqe *cqe;
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqring(ctx);
+	if (cqe) {
+		WRITE_ONCE(cqe->user_data, ki_user_data);
+		WRITE_ONCE(cqe->res, res);
+		WRITE_ONCE(cqe->flags, 0);
+	} else {
+		WRITE_ONCE(ctx->rings->cq_overflow,
+				atomic_inc_return(&ctx->cached_cq_overflow));
+	}
+}
+
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+	if (ctx->cq_ev_fd)
+		eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
+				long res)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	io_cqring_fill_event(ctx, user_data, res);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
+				   struct io_submit_state *state)
+{
+	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+	struct io_kiocb *req;
+
+	if (!percpu_ref_tryget(&ctx->refs))
+		return NULL;
+
+	if (!state) {
+		req = kmem_cache_alloc(req_cachep, gfp);
+		if (unlikely(!req))
+			goto out;
+	} else if (!state->free_reqs) {
+		size_t sz;
+		int ret;
+
+		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
+		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		if (unlikely(ret <= 0)) {
+			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+			if (!state->reqs[0])
+				goto out;
+			ret = 1;
+		}
+		state->free_reqs = ret - 1;
+		state->cur_req = 1;
+		req = state->reqs[0];
+	} else {
+		req = state->reqs[state->cur_req];
+		state->free_reqs--;
+		state->cur_req++;
+	}
+
+	req->file = NULL;
+	req->ctx = ctx;
+	req->flags = 0;
+	/* one is dropped after submission, the other at completion */
+	refcount_set(&req->refs, 2);
+	req->result = 0;
+	return req;
+out:
+	percpu_ref_put(&ctx->refs);
+	return NULL;
+}
+
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+	if (*nr) {
+		kmem_cache_free_bulk(req_cachep, *nr, reqs);
+		percpu_ref_put_many(&ctx->refs, *nr);
+		*nr = 0;
+	}
+}
+
+static void __io_free_req(struct io_kiocb *req)
+{
+	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+		fput(req->file);
+	percpu_ref_put(&req->ctx->refs);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void io_req_link_next(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt;
+
+	/*
+	 * The list should never be empty when we are called here. But it
+	 * could potentially happen if the chain is messed up, so check to
+	 * be on the safe side.
+	 */
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	if (nxt) {
+		list_del(&nxt->list);
+		if (!list_empty(&req->link_list)) {
+			INIT_LIST_HEAD(&nxt->link_list);
+			list_splice(&req->link_list, &nxt->link_list);
+			nxt->flags |= REQ_F_LINK;
+		}
+
+		nxt->flags |= REQ_F_LINK_DONE;
+		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		io_queue_async_work(req->ctx, nxt);
+	}
+}
+
+/*
+ * Called if REQ_F_LINK is set, and we fail the head request
+ */
+static void io_fail_links(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	while (!list_empty(&req->link_list)) {
+		link = list_first_entry(&req->link_list, struct io_kiocb, list);
+		list_del(&link->list);
+
+		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
+		__io_free_req(link);
+	}
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	/*
+	 * If LINK is set, we have dependent requests in this chain. If we
+	 * didn't fail this request, queue the first one up, moving any other
+	 * dependencies to the next request. In case of failure, fail the rest
+	 * of the chain.
+	 */
+	if (req->flags & REQ_F_LINK) {
+		if (req->flags & REQ_F_FAIL_LINK)
+			io_fail_links(req);
+		else
+			io_req_link_next(req);
+	}
+
+	__io_free_req(req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req);
+}
+
+static unsigned io_cqring_events(struct io_rings *rings)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+}
+
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+
+	/* make sure SQ entry isn't read before tail */
+	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			       struct list_head *done)
+{
+	void *reqs[IO_IOPOLL_BATCH];
+	struct io_kiocb *req;
+	int to_free;
+
+	to_free = 0;
+	while (!list_empty(done)) {
+		req = list_first_entry(done, struct io_kiocb, list);
+		list_del(&req->list);
+
+		io_cqring_fill_event(ctx, req->user_data, req->result);
+		(*nr_events)++;
+
+		if (refcount_dec_and_test(&req->refs)) {
+			/* If we're not using fixed files, we have to pair the
+			 * completion part with the file put. Use regular
+			 * completions for those, only batch free for fixed
+			 * file and non-linked commands.
+			 */
+			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) {
+				reqs[to_free++] = req;
+				if (to_free == ARRAY_SIZE(reqs))
+					io_free_req_many(ctx, reqs, &to_free);
+			} else {
+				io_free_req(req);
+			}
+		}
+	}
+
+	io_commit_cqring(ctx);
+	io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			long min)
+{
+	struct io_kiocb *req, *tmp;
+	LIST_HEAD(done);
+	bool spin;
+	int ret;
+
+	/*
+	 * Only spin for completions if we don't have multiple devices hanging
+	 * off our complete list, and we're under the requested amount.
+	 */
+	spin = !ctx->poll_multi_file && *nr_events < min;
+
+	ret = 0;
+	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+		struct kiocb *kiocb = &req->rw;
+
+		/*
+		 * Move completed entries to our local list. If we find a
+		 * request that requires polling, break out and complete
+		 * the done list first, if we have entries there.
+		 */
+		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+			list_move_tail(&req->list, &done);
+			continue;
+		}
+		if (!list_empty(&done))
+			break;
+
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+		if (ret < 0)
+			break;
+
+		if (ret && spin)
+			spin = false;
+		ret = 0;
+	}
+
+	if (!list_empty(&done))
+		io_iopoll_complete(ctx, nr_events, &done);
+
+	return ret;
+}
+
+/*
+ * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+				long min)
+{
+	while (!list_empty(&ctx->poll_list) && !need_resched()) {
+		int ret;
+
+		ret = io_do_iopoll(ctx, nr_events, min);
+		if (ret < 0)
+			return ret;
+		if (!min || *nr_events >= min)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+	if (!(ctx->flags & IORING_SETUP_IOPOLL))
+		return;
+
+	mutex_lock(&ctx->uring_lock);
+	while (!list_empty(&ctx->poll_list)) {
+		unsigned int nr_events = 0;
+
+		io_iopoll_getevents(ctx, &nr_events, 1);
+
+		/*
+		 * Ensure we allow local-to-the-cpu processing to take place,
+		 * in this case we need to ensure that we reap all events.
+		 */
+		cond_resched();
+	}
+	mutex_unlock(&ctx->uring_lock);
+}
+
+static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			    long min)
+{
+	int iters = 0, ret = 0;
+
+	do {
+		int tmin = 0;
+
+		/*
+		 * Don't enter poll loop if we already have events pending.
+		 * If we do, we can potentially be spinning for commands that
+		 * already triggered a CQE (eg in error).
+		 */
+		if (io_cqring_events(ctx->rings))
+			break;
+
+		/*
+		 * If a submit got punted to a workqueue, we can have the
+		 * application entering polling for a command before it gets
+		 * issued. That app will hold the uring_lock for the duration
+		 * of the poll right here, so we need to take a breather every
+		 * now and then to ensure that the issue has a chance to add
+		 * the poll to the issued list. Otherwise we can spin here
+		 * forever, while the workqueue is stuck trying to acquire the
+		 * very same mutex.
+		 */
+		if (!(++iters & 7)) {
+			mutex_unlock(&ctx->uring_lock);
+			mutex_lock(&ctx->uring_lock);
+		}
+
+		if (*nr_events < min)
+			tmin = min - *nr_events;
+
+		ret = io_iopoll_getevents(ctx, nr_events, tmin);
+		if (ret <= 0)
+			break;
+		ret = 0;
+	} while (min && !*nr_events && !need_resched());
+
+	return ret;
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			   long min)
+{
+	int ret;
+
+	/*
+	 * We disallow the app entering submit/complete with polling, but we
+	 * still need to lock the ring to prevent racing with polled issue
+	 * that got punted to a workqueue.
+	 */
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_iopoll_check(ctx, nr_events, min);
+	mutex_unlock(&ctx->uring_lock);
+	return ret;
+}
+
+static void kiocb_end_write(struct io_kiocb *req)
+{
+	/*
+	 * Tell lockdep we inherited freeze protection from submission
+	 * thread.
+	 */
+	if (req->flags & REQ_F_ISREG) {
+		struct inode *inode = file_inode(req->file);
+
+		__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+	}
+	file_end_write(req->file);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	if (kiocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
+
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req->ctx, req->user_data, res);
+	io_put_req(req);
+}
+
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	if (kiocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
+
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
+	req->result = res;
+	if (res != -EAGAIN)
+		req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from an io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	/*
+	 * Track whether we have multiple files in our lists. This will impact
+	 * how we do polling eventually, not spinning if we're on potentially
+	 * different devices.
+	 */
+	if (list_empty(&ctx->poll_list)) {
+		ctx->poll_multi_file = false;
+	} else if (!ctx->poll_multi_file) {
+		struct io_kiocb *list_req;
+
+		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+						list);
+		if (list_req->rw.ki_filp != req->rw.ki_filp)
+			ctx->poll_multi_file = true;
+	}
+
+	/*
+	 * For fast devices, IO may have already completed. If it has, add
+	 * it to the front so we find it first.
+	 */
+	if (req->flags & REQ_F_IOPOLL_COMPLETED)
+		list_add(&req->list, &ctx->poll_list);
+	else
+		list_add_tail(&req->list, &ctx->poll_list);
+}
+
+static void io_file_put(struct io_submit_state *state)
+{
+	if (state->file) {
+		int diff = state->has_refs - state->used_refs;
+
+		if (diff)
+			fput_many(state->file, diff);
+		state->file = NULL;
+	}
+}
+
+/*
+ * Get as many references to a file as we have IOs left in this submission,
+ * assuming most submissions are for one file, or at least that each file
+ * has more than one submission.
+ */
+static struct file *io_file_get(struct io_submit_state *state, int fd)
+{
+	if (!state)
+		return fget(fd);
+
+	if (state->file) {
+		if (state->fd == fd) {
+			state->used_refs++;
+			state->ios_left--;
+			return state->file;
+		}
+		io_file_put(state);
+	}
+	state->file = fget_many(fd, state->ios_left);
+	if (!state->file)
+		return NULL;
+
+	state->fd = fd;
+	state->has_refs = state->ios_left;
+	state->used_refs = 1;
+	state->ios_left--;
+	return state->file;
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+	umode_t mode = file_inode(file)->i_mode;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		return true;
+	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+		return true;
+
+	return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
+		      bool force_nonblock)
+{
+	const struct io_uring_sqe *sqe = s->sqe;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct kiocb *kiocb = &req->rw;
+	unsigned ioprio;
+	int ret;
+
+	if (!req->file)
+		return -EBADF;
+
+	if (S_ISREG(file_inode(req->file)->i_mode))
+		req->flags |= REQ_F_ISREG;
+
+	/*
+	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+	 * we know to async punt it even if it was opened O_NONBLOCK
+	 */
+	if (force_nonblock && !io_file_supports_async(req->file)) {
+		req->flags |= REQ_F_MUST_PUNT;
+		return -EAGAIN;
+	}
+
+	kiocb->ki_pos = READ_ONCE(sqe->off);
+	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio) {
+		ret = ioprio_check_cap(ioprio);
+		if (ret)
+			return ret;
+
+		kiocb->ki_ioprio = ioprio;
+	} else
+		kiocb->ki_ioprio = get_current_ioprio();
+
+	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	if (unlikely(ret))
+		return ret;
+
+	/* don't allow async punt if RWF_NOWAIT was requested */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+	    (req->file->f_flags & O_NONBLOCK))
+		req->flags |= REQ_F_NOWAIT;
+
+	if (force_nonblock)
+		kiocb->ki_flags |= IOCB_NOWAIT;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+		    !kiocb->ki_filp->f_op->iopoll)
+			return -EOPNOTSUPP;
+
+		kiocb->ki_flags |= IOCB_HIPRI;
+		kiocb->ki_complete = io_complete_rw_iopoll;
+		req->result = 0;
+	} else {
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
+		kiocb->ki_complete = io_complete_rw;
+	}
+	return 0;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+	switch (ret) {
+	case -EIOCBQUEUED:
+		break;
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/*
+		 * We can't just restart the syscall, since previously
+		 * submitted sqes may already be in progress. Just fail this
+		 * IO with EINTR.
+		 */
+		ret = -EINTR;
+		/* fall through */
+	default:
+		kiocb->ki_complete(kiocb, ret, 0);
+	}
+}
+
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+			   const struct io_uring_sqe *sqe,
+			   struct iov_iter *iter)
+{
+	size_t len = READ_ONCE(sqe->len);
+	struct io_mapped_ubuf *imu;
+	unsigned index, buf_index;
+	size_t offset;
+	u64 buf_addr;
+
+	/* attempt to use fixed buffers without having provided iovecs */
+	if (unlikely(!ctx->user_bufs))
+		return -EFAULT;
+
+	buf_index = READ_ONCE(sqe->buf_index);
+	if (unlikely(buf_index >= ctx->nr_user_bufs))
+		return -EFAULT;
+
+	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+	imu = &ctx->user_bufs[index];
+	buf_addr = READ_ONCE(sqe->addr);
+
+	/* overflow */
+	if (buf_addr + len < buf_addr)
+		return -EFAULT;
+	/* not inside the mapped region */
+	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+		return -EFAULT;
+
+	/*
+	 * May not be the start of the buffer; set size appropriately
+	 * and advance us to the beginning.
+	 */
+	offset = buf_addr - imu->ubuf;
+	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec), just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= bvec->bv_len + offset;
+			iter->iov_offset = offset & ~PAGE_MASK;
+		}
+	}
+
+	return len;
+}
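+
+/*
+ * Worked example of the segment skip above (a sketch assuming
+ * PAGE_SIZE == 4096 and a page-aligned registered buffer, so every
+ * bvec spans a full page): for buf_addr == imu->ubuf + 10000 and
+ * len == 4096, offset (10000) exceeds the first bv_len (4096), so
+ * offset becomes 5904, seg_skip == 1 + (5904 >> 12) == 2 and
+ * iov_offset == 5904 & 4095 == 1808. 4096 + 4096 + 1808 == 10000
+ * recovers the original byte offset, and iter->count ends up as
+ * exactly len.
+ */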
+
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			       const struct sqe_submit *s, struct iovec **iovec,
+			       struct iov_iter *iter)
+{
+	const struct io_uring_sqe *sqe = s->sqe;
+	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	size_t sqe_len = READ_ONCE(sqe->len);
+	u8 opcode;
+
+	/*
+	 * We're reading ->opcode for the second time, but the first read
+	 * doesn't care whether it's _FIXED or not, so it doesn't matter
+	 * whether ->opcode changes concurrently. The first read does care
+	 * about whether it is a READ or a WRITE, so we don't trust this read
+	 * for that purpose and instead let the caller pass in the read/write
+	 * flag.
+	 */
+	opcode = READ_ONCE(sqe->opcode);
+	if (opcode == IORING_OP_READ_FIXED ||
+	    opcode == IORING_OP_WRITE_FIXED) {
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		*iovec = NULL;
+		return ret;
+	}
+
+	if (!s->has_user)
+		return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat)
+		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+						iovec, iter);
+#endif
+
+	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
+{
+	if (al->file == kiocb->ki_filp) {
+		off_t start, end;
+
+		/*
+		 * Allow merging if we're anywhere in the range of the same
+		 * page. Generally this happens for sub-page reads or writes,
+		 * and it's beneficial to let the first worker bring the page
+		 * in, so the piggybacked work can then operate on the cached
+		 * page.
+		 */
+		start = al->io_start & PAGE_MASK;
+		end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
+		if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
+			return true;
+	}
+
+	al->file = NULL;
+	return false;
+}
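+
+/*
+ * Worked example of the merge window above (a sketch assuming
+ * PAGE_SIZE == 4096): for a previous sub-page request with
+ * io_start == 4000 and io_len == 200, start rounds down to 0 and end
+ * rounds up to 8192, so any new request on the same file with ki_pos
+ * in [0, 8192] is considered mergeable.
+ */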
+
+/*
+ * Make a note of the last file/offset/direction we punted to async
+ * context. We'll use this information to see if we can piggyback a
+ * sequential request onto the previous one, if it still hasn't been
+ * completed by the async worker.
+ */
+static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
+{
+	struct async_list *async_list = &req->ctx->pending_async[rw];
+	struct kiocb *kiocb = &req->rw;
+	struct file *filp = kiocb->ki_filp;
+
+	if (io_should_merge(async_list, kiocb)) {
+		unsigned long max_bytes;
+
+		/* Use 8x RA size as a decent limiter for both reads/writes */
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If the max len is exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
+			req->flags |= REQ_F_SEQ_PREV;
+			async_list->io_len += len;
+		} else {
+			async_list->file = NULL;
+		}
+	}
+
+	/* New file? Reset state. */
+	if (async_list->file != filp) {
+		async_list->io_start = kiocb->ki_pos;
+		async_list->io_len = len;
+		async_list->file = filp;
+	}
+}
+
+/*
+ * For files that don't have ->read_iter() and ->write_iter(), handle them
+ * by looping over ->read() or ->write() manually.
+ */
+static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
+			   struct iov_iter *iter)
+{
+	ssize_t ret = 0;
+
+	/*
+	 * Don't support polled IO through this interface, and we can't
+	 * support non-blocking either. For the latter, this just causes
+	 * the kiocb to be handled from an async context.
+	 */
+	if (kiocb->ki_flags & IOCB_HIPRI)
+		return -EOPNOTSUPP;
+	if (kiocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+
+	while (iov_iter_count(iter)) {
+		struct iovec iovec = iov_iter_iovec(iter);
+		ssize_t nr;
+
+		if (rw == READ) {
+			nr = file->f_op->read(file, iovec.iov_base,
+					      iovec.iov_len, &kiocb->ki_pos);
+		} else {
+			nr = file->f_op->write(file, iovec.iov_base,
+					       iovec.iov_len, &kiocb->ki_pos);
+		}
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
+		if (nr != iovec.iov_len)
+			break;
+		iov_iter_advance(iter, nr);
+	}
+
+	return ret;
+}
+
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		   bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	size_t iov_count;
+	ssize_t read_size, ret;
+
+	ret = io_prep_rw(req, s, force_nonblock);
+	if (ret)
+		return ret;
+	file = kiocb->ki_filp;
+
+	if (unlikely(!(file->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+	if (ret < 0)
+		return ret;
+
+	read_size = ret;
+	if (req->flags & REQ_F_LINK)
+		req->result = read_size;
+
+	iov_count = iov_iter_count(&iter);
+	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
+	if (!ret) {
+		ssize_t ret2;
+
+		if (file->f_op->read_iter)
+			ret2 = call_read_iter(file, kiocb, &iter);
+		else
+			ret2 = loop_rw_iter(READ, file, kiocb, &iter);
+
+		/*
+		 * In case of a short read, punt to async. This can happen
+		 * if we have data partially cached. Alternatively we can
+		 * return the short read, in which case the application will
+		 * need to issue another SQE and wait for it. That SQE will
+		 * need async punt anyway, so it's more efficient to do it
+		 * here.
+		 */
+		if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
+		    (req->flags & REQ_F_ISREG) &&
+		    ret2 > 0 && ret2 < read_size)
+			ret2 = -EAGAIN;
+		/* Catch -EAGAIN return for forced non-blocking submission */
+		if (!force_nonblock || ret2 != -EAGAIN) {
+			io_rw_done(kiocb, ret2);
+		} else {
+			/*
+			 * If ->needs_lock is true, we're already in async
+			 * context.
+			 */
+			if (!s->needs_lock)
+				io_async_list_note(READ, req, iov_count);
+			ret = -EAGAIN;
+		}
+	}
+	kfree(iovec);
+	return ret;
+}
+
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+		    bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	size_t iov_count;
+	ssize_t ret;
+
+	ret = io_prep_rw(req, s, force_nonblock);
+	if (ret)
+		return ret;
+
+	file = kiocb->ki_filp;
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+	if (ret < 0)
+		return ret;
+
+	if (req->flags & REQ_F_LINK)
+		req->result = ret;
+
+	iov_count = iov_iter_count(&iter);
+
+	ret = -EAGAIN;
+	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
+		/* If ->needs_lock is true, we're already in async context. */
+		if (!s->needs_lock)
+			io_async_list_note(WRITE, req, iov_count);
+		goto out_free;
+	}
+
+	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
+	if (!ret) {
+		ssize_t ret2;
+
+		/*
+		 * Open-code file_start_write here to grab freeze protection,
+		 * which will be released by another thread in
+		 * io_complete_rw().  Fool lockdep by telling it the lock got
+		 * released so that it doesn't complain about the held lock when
+		 * we return to userspace.
+		 */
+		if (req->flags & REQ_F_ISREG) {
+			__sb_start_write(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE, true);
+			__sb_writers_release(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE);
+		}
+		kiocb->ki_flags |= IOCB_WRITE;
+
+		if (file->f_op->write_iter)
+			ret2 = call_write_iter(file, kiocb, &iter);
+		else
+			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
+		if (!force_nonblock || ret2 != -EAGAIN) {
+			io_rw_done(kiocb, ret2);
+		} else {
+			/*
+			 * If ->needs_lock is true, we're already in async
+			 * context.
+			 */
+			if (!s->needs_lock)
+				io_async_list_note(WRITE, req, iov_count);
+			ret = -EAGAIN;
+		}
+	}
+out_free:
+	kfree(iovec);
+	return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	long err = 0;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	io_cqring_add_event(ctx, user_data, err);
+	io_put_req(req);
+	return 0;
+}
+
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!req->file)
+		return -EBADF;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		    bool force_nonblock)
+{
+	loff_t sqe_off = READ_ONCE(sqe->off);
+	loff_t sqe_len = READ_ONCE(sqe->len);
+	loff_t end = sqe_off + sqe_len;
+	unsigned fsync_flags;
+	int ret;
+
+	fsync_flags = READ_ONCE(sqe->fsync_flags);
+	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+		return -EINVAL;
+
+	ret = io_prep_fsync(req, sqe);
+	if (ret)
+		return ret;
+
+	/* fsync always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+				end > 0 ? end : LLONG_MAX,
+				fsync_flags & IORING_FSYNC_DATASYNC);
+
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
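+
+/*
+ * Example (illustrative): an IORING_OP_FSYNC SQE with off == 0 and
+ * len == 0 syncs the whole file, since end resolves to LLONG_MAX,
+ * while fsync_flags == IORING_FSYNC_DATASYNC gives fdatasync()
+ * semantics for the range.
+ */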
+
+static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret = 0;
+
+	if (!req->file)
+		return -EBADF;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+		return -EINVAL;
+
+	return ret;
+}
+
+static int io_sync_file_range(struct io_kiocb *req,
+			      const struct io_uring_sqe *sqe,
+			      bool force_nonblock)
+{
+	loff_t sqe_off;
+	loff_t sqe_len;
+	unsigned flags;
+	int ret;
+
+	ret = io_prep_sfr(req, sqe);
+	if (ret)
+		return ret;
+
+	/* sync_file_range always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	sqe_off = READ_ONCE(sqe->off);
+	sqe_len = READ_ONCE(sqe->len);
+	flags = READ_ONCE(sqe->sync_range_flags);
+
+	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
+
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
+
+#if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   bool force_nonblock,
+		   long (*fn)(struct socket *, struct user_msghdr __user *,
+				unsigned int))
+{
+	struct socket *sock;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	sock = sock_from_file(req->file, &ret);
+	if (sock) {
+		struct user_msghdr __user *msg;
+		unsigned flags;
+
+		flags = READ_ONCE(sqe->msg_flags);
+		if (flags & MSG_DONTWAIT)
+			req->flags |= REQ_F_NOWAIT;
+		else if (force_nonblock)
+			flags |= MSG_DONTWAIT;
+
+		msg = (struct user_msghdr __user *) (unsigned long)
+			READ_ONCE(sqe->addr);
+
+		ret = fn(sock, msg, flags);
+		if (force_nonblock && ret == -EAGAIN)
+			return ret;
+	}
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
+#endif
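+
+/*
+ * Rough sketch of the flag interplay above: if userspace itself set
+ * MSG_DONTWAIT, the request is marked REQ_F_NOWAIT and an -EAGAIN
+ * result is completed back to the application, whereas MSG_DONTWAIT
+ * injected only for force_nonblock turns -EAGAIN into an async punt
+ * instead.
+ */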
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		io_queue_async_work(req->ctx, req);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,
+						list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
+{
+	req->poll.done = true;
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
+	io_commit_cqring(ctx);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	const struct cred *old_cred;
+	__poll_t mask = 0;
+
+	old_cred = override_creds(ctx->creds);
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
+	 * synchronize with them.  In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		goto out;
+	}
+	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	io_put_req(req);
+out:
+	revert_creds(old_cred);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+	unsigned long flags;
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & poll->events))
+		return 0;
+
+	list_del_init(&poll->wait.entry);
+
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
+	} else {
+		io_queue_async_work(ctx, req);
+	}
+
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	bool cancel = false;
+	__poll_t mask;
+	u16 events;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+	if (!poll->file)
+		return -EBADF;
+
+	req->submit.sqe = NULL;
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	poll->head = NULL;
+	poll->done = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialize the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	INIT_LIST_HEAD(&req->list);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (likely(poll->head)) {
+		spin_lock(&poll->head->lock);
+		if (unlikely(list_empty(&poll->wait.entry))) {
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we've stolen it */
+		ipt.error = 0;
+		io_poll_complete(ctx, req, mask);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	if (mask) {
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
+	}
+	return ipt.error;
+}
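+
+/*
+ * Userspace sketch (illustrative, 'sockfd' is a placeholder): a
+ * one-shot poll is armed with
+ *
+ *	sqe->opcode = IORING_OP_POLL_ADD;
+ *	sqe->fd = sockfd;
+ *	sqe->poll_events = POLLIN;
+ *
+ * and the CQE res then carries the ready mask. IORING_OP_POLL_REMOVE
+ * with sqe->addr set to the original user_data cancels a pending poll.
+ */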
+
+static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
+{
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req, *prev;
+	unsigned long flags;
+
+	req = container_of(timer, struct io_kiocb, timeout.timer);
+	ctx = req->ctx;
+	atomic_inc(&ctx->cq_timeouts);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	/*
+	 * Adjust the sequence of requests before the current one, because
+	 * this request will consume a slot in the cq_ring and the cq_tail
+	 * pointer will be increased; otherwise other timeout reqs may
+	 * return early without waiting for enough wait_nr.
+	 */
+	prev = req;
+	list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+		prev->sequence++;
+	list_del(&req->list);
+
+	io_cqring_fill_event(ctx, req->user_data, -ETIME);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	io_cqring_ev_posted(ctx);
+
+	io_put_req(req);
+	return HRTIMER_NORESTART;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	unsigned count;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct list_head *entry;
+	struct timespec64 ts;
+	unsigned span = 0;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
+	    sqe->len != 1)
+		return -EINVAL;
+
+	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+		return -EFAULT;
+
+	req->flags |= REQ_F_TIMEOUT;
+
+	/*
+	 * sqe->off holds how many events need to occur for this timeout
+	 * event to be satisfied. If it isn't set, then this is a pure
+	 * timeout request and the sequence isn't used.
+	 */
+	count = READ_ONCE(sqe->off);
+	if (!count) {
+		req->flags |= REQ_F_TIMEOUT_NOSEQ;
+		spin_lock_irq(&ctx->completion_lock);
+		entry = ctx->timeout_list.prev;
+		goto add;
+	}
+
+	req->sequence = ctx->cached_sq_head + count - 1;
+	/* reuse it to store the count */
+	req->submit.sequence = count;
+
+	/*
+	 * Insertion sort, ensuring the first entry in the list is always
+	 * the one we need first.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_prev(entry, &ctx->timeout_list) {
+		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+		unsigned nxt_sq_head;
+		long long tmp, tmp_nxt;
+
+		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+			continue;
+
+		/*
+		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * long to store it.
+		 */
+		tmp = (long long)ctx->cached_sq_head + count - 1;
+		nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
+		tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+
+		/*
+		 * cached_sq_head may overflow, and it will never overflow twice
+		 * while some timeout req is still valid.
+		 */
+		if (ctx->cached_sq_head < nxt_sq_head)
+			tmp += UINT_MAX;
+
+		if (tmp > tmp_nxt)
+			break;
+
+		/*
+		 * The sequence of the inserted req, and of the reqs after it,
+		 * must be adjusted because each timeout req consumes a slot.
+		 */
+		span++;
+		nxt->sequence++;
+	}
+	req->sequence -= span;
+add:
+	list_add(&req->list, entry);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	req->timeout.timer.function = io_timeout_fn;
+	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
+			HRTIMER_MODE_REL);
+	return 0;
+}
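+
+/*
+ * Worked example of the overflow handling above (illustrative values):
+ * an existing timeout queued at cached_sq_head == UINT_MAX - 15 with
+ * count == 3 has tmp_nxt == UINT_MAX - 13. A new timeout queued after
+ * the head has wrapped to 2, also with count == 3, yields tmp == 4;
+ * since cached_sq_head (2) is below nxt_sq_head, UINT_MAX is added and
+ * tmp becomes UINT_MAX + 4, correctly sorting the new request after
+ * the older one.
+ */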
+
+static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	struct io_uring_sqe *sqe_copy;
+
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+		return 0;
+
+	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+	if (!sqe_copy)
+		return -EAGAIN;
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		kfree(sqe_copy);
+		return 0;
+	}
+
+	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	req->submit.sqe = sqe_copy;
+
+	INIT_WORK(&req->work, io_sq_wq_submit_work);
+	list_add_tail(&req->list, &ctx->defer_list);
+	spin_unlock_irq(&ctx->completion_lock);
+	return -EIOCBQUEUED;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			   const struct sqe_submit *s, bool force_nonblock)
+{
+	int ret, opcode;
+
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
+	if (unlikely(s->index >= ctx->sq_entries))
+		return -EINVAL;
+
+	opcode = READ_ONCE(s->sqe->opcode);
+	switch (opcode) {
+	case IORING_OP_NOP:
+		ret = io_nop(req, req->user_data);
+		break;
+	case IORING_OP_READV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
+		ret = io_read(req, s, force_nonblock);
+		break;
+	case IORING_OP_WRITEV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
+		ret = io_write(req, s, force_nonblock);
+		break;
+	case IORING_OP_READ_FIXED:
+		ret = io_read(req, s, force_nonblock);
+		break;
+	case IORING_OP_WRITE_FIXED:
+		ret = io_write(req, s, force_nonblock);
+		break;
+	case IORING_OP_FSYNC:
+		ret = io_fsync(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
+	case IORING_OP_SYNC_FILE_RANGE:
+		ret = io_sync_file_range(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_TIMEOUT:
+		ret = io_timeout(req, s->sqe);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		return ret;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (req->result == -EAGAIN)
+			return -EAGAIN;
+
+		/* workqueue context doesn't hold uring_lock, grab it now */
+		if (s->needs_lock)
+			mutex_lock(&ctx->uring_lock);
+		io_iopoll_req_issued(req);
+		if (s->needs_lock)
+			mutex_unlock(&ctx->uring_lock);
+	}
+
+	return 0;
+}
+
+static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
+						 const struct io_uring_sqe *sqe)
+{
+	switch (sqe->opcode) {
+	case IORING_OP_READV:
+	case IORING_OP_READ_FIXED:
+		return &ctx->pending_async[READ];
+	case IORING_OP_WRITEV:
+	case IORING_OP_WRITE_FIXED:
+		return &ctx->pending_async[WRITE];
+	default:
+		return NULL;
+	}
+}
+
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+	u8 opcode = READ_ONCE(sqe->opcode);
+
+	return !(opcode == IORING_OP_READ_FIXED ||
+		 opcode == IORING_OP_WRITE_FIXED);
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct mm_struct *cur_mm = NULL;
+	struct async_list *async_list;
+	const struct cred *old_cred;
+	LIST_HEAD(req_list);
+	mm_segment_t old_fs;
+	int ret;
+
+	old_cred = override_creds(ctx->creds);
+	async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
+restart:
+	do {
+		struct sqe_submit *s = &req->submit;
+		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
+
+		/* Ensure we clear previously set non-block flag */
+		req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+		ret = 0;
+		if (io_sqe_needs_user(sqe) && !cur_mm) {
+			if (!mmget_not_zero(ctx->sqo_mm)) {
+				ret = -EFAULT;
+			} else {
+				cur_mm = ctx->sqo_mm;
+				use_mm(cur_mm);
+				old_fs = get_fs();
+				set_fs(USER_DS);
+			}
+		}
+
+		if (!ret) {
+			s->has_user = cur_mm != NULL;
+			s->needs_lock = true;
+			do {
+				ret = __io_submit_sqe(ctx, req, s, false);
+				/*
+				 * We can get EAGAIN for polled IO even though
+				 * we're forcing a sync submission from here,
+				 * since we can't wait for request slots on the
+				 * block side.
+				 */
+				if (ret != -EAGAIN)
+					break;
+				cond_resched();
+			} while (1);
+		}
+
+		/* drop submission reference */
+		io_put_req(req);
+
+		if (ret) {
+			io_cqring_add_event(ctx, sqe->user_data, ret);
+			io_put_req(req);
+		}
+
+		/* async context always use a copy of the sqe */
+		kfree(sqe);
+
+		/* reqs from defer and link lists needn't decrease async cnt */
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
+		if (!async_list)
+			break;
+		if (!list_empty(&req_list)) {
+			req = list_first_entry(&req_list, struct io_kiocb,
+						list);
+			list_del(&req->list);
+			continue;
+		}
+		if (list_empty(&async_list->list))
+			break;
+
+		req = NULL;
+		spin_lock(&async_list->lock);
+		if (list_empty(&async_list->list)) {
+			spin_unlock(&async_list->lock);
+			break;
+		}
+		list_splice_init(&async_list->list, &req_list);
+		spin_unlock(&async_list->lock);
+
+		req = list_first_entry(&req_list, struct io_kiocb, list);
+		list_del(&req->list);
+	} while (req);
+
+	/*
+	 * Rare case of racing with a submitter. If we find the count has
+	 * dropped to zero AND we have pending work items, then restart
+	 * the processing. This is a tiny race window.
+	 */
+	if (async_list) {
+		ret = atomic_dec_return(&async_list->cnt);
+		while (!ret && !list_empty(&async_list->list)) {
+			spin_lock(&async_list->lock);
+			atomic_inc(&async_list->cnt);
+			list_splice_init(&async_list->list, &req_list);
+			spin_unlock(&async_list->lock);
+
+			if (!list_empty(&req_list)) {
+				req = list_first_entry(&req_list,
+							struct io_kiocb, list);
+				list_del(&req->list);
+				goto restart;
+			}
+			ret = atomic_dec_return(&async_list->cnt);
+		}
+	}
+
+out:
+	if (cur_mm) {
+		set_fs(old_fs);
+		unuse_mm(cur_mm);
+		mmput(cur_mm);
+	}
+	revert_creds(old_cred);
+}
+
+/*
+ * See if we can piggy back onto previously submitted work, that is still
+ * running. We currently only allow this if the new request is sequential
+ * to the previous one we punted.
+ */
+static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
+{
+	bool ret;
+
+	if (!list)
+		return false;
+	if (!(req->flags & REQ_F_SEQ_PREV))
+		return false;
+	if (!atomic_read(&list->cnt))
+		return false;
+
+	ret = true;
+	spin_lock(&list->lock);
+	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
+	if (!atomic_read(&list->cnt)) {
+		list_del_init(&req->list);
+		ret = false;
+	}
+	spin_unlock(&list->lock);
+	return ret;
+}
+
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+	int op = READ_ONCE(sqe->opcode);
+
+	switch (op) {
+	case IORING_OP_NOP:
+	case IORING_OP_POLL_REMOVE:
+	case IORING_OP_TIMEOUT:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+			   struct io_submit_state *state, struct io_kiocb *req)
+{
+	unsigned flags;
+	int fd;
+
+	flags = READ_ONCE(s->sqe->flags);
+	fd = READ_ONCE(s->sqe->fd);
+
+	if (flags & IOSQE_IO_DRAIN)
+		req->flags |= REQ_F_IO_DRAIN;
+	/*
+	 * All IO needs to record the previous position; for LINK and DRAIN,
+	 * it can be used to mark the position of the first IO in the
+	 * link list.
+	 */
+	req->sequence = s->sequence;
+
+	if (!io_op_needs_file(s->sqe))
+		return 0;
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
+		req->file = io_file_get(state, fd);
+		if (unlikely(!req->file))
+			return -EBADF;
+	}
+
+	return 0;
+}
+
+static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			struct sqe_submit *s)
+{
+	int ret;
+
+	ret = __io_submit_sqe(ctx, req, s, true);
+
+	/*
+	 * We async punt it if the file wasn't marked NOWAIT, or if the file
+	 * doesn't support non-blocking read/write attempts
+	 */
+	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
+	    (req->flags & REQ_F_MUST_PUNT))) {
+		struct io_uring_sqe *sqe_copy;
+
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		if (sqe_copy) {
+			struct async_list *list;
+
+			s->sqe = sqe_copy;
+			memcpy(&req->submit, s, sizeof(*s));
+			list = io_async_list_from_sqe(ctx, s->sqe);
+			if (!io_add_to_prev_work(list, req)) {
+				if (list)
+					atomic_inc(&list->cnt);
+				INIT_WORK(&req->work, io_sq_wq_submit_work);
+				io_queue_async_work(ctx, req);
+			}
+
+			/*
+			 * Queued up for async execution, worker will release
+			 * submit reference when the iocb is actually submitted.
+			 */
+			return 0;
+		}
+	}
+
+	/* drop submission reference */
+	io_put_req(req);
+
+	/* and drop final reference, if we failed */
+	if (ret) {
+		io_cqring_add_event(ctx, req->user_data, ret);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
+		io_put_req(req);
+	}
+
+	return ret;
+}
+
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			struct sqe_submit *s)
+{
+	int ret;
+
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		}
+		return 0;
+	}
+
+	return __io_queue_sqe(ctx, req, s);
+}
+
+static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			      struct sqe_submit *s, struct io_kiocb *shadow)
+{
+	int ret;
+	int need_submit = false;
+
+	if (!shadow)
+		return io_queue_sqe(ctx, req, s);
+
+	/*
+	 * Mark the first IO in the link list as DRAIN, so all following
+	 * IOs enter the defer list. All prior IO needs to complete before
+	 * the link list runs.
+	 */
+	req->flags |= REQ_F_IO_DRAIN;
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			__io_free_req(shadow);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+			return 0;
+		}
+	} else {
+		/*
+		 * ret == 0 means that all IOs in front of the link IO have
+		 * completed, so queue the link head.
+		 */
+		need_submit = true;
+	}
+
+	/* Insert shadow req to defer_list, blocking next IOs */
+	spin_lock_irq(&ctx->completion_lock);
+	list_add_tail(&shadow->list, &ctx->defer_list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	if (need_submit)
+		return __io_queue_sqe(ctx, req, s);
+
+	return 0;
+}
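+
+/*
+ * Rough sketch of the shadow mechanism above: when a drain is requested
+ * inside a link chain, the chain head is marked REQ_F_IO_DRAIN so it
+ * waits for all previously submitted IO, while the shadow request parks
+ * in the defer list so that IO submitted after the chain is held back
+ * until the whole chain has completed.
+ */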
+
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+			  struct io_submit_state *state, struct io_kiocb **link)
+{
+	struct io_uring_sqe *sqe_copy;
+	struct io_kiocb *req;
+	int ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	req = io_get_req(ctx, state);
+	if (unlikely(!req)) {
+		ret = -EAGAIN;
+		goto err;
+	}
+
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret)) {
+err_req:
+		io_free_req(req);
+err:
+		io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		return;
+	}
+
+	req->user_data = s->sqe->user_data;
+
+	/*
+	 * If we already have a head request, queue this one for async
+	 * submittal once the head completes. If we don't have a head but
+	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
+	 * submitted sync once the chain is complete. If none of those
+	 * conditions are true (normal request), then just queue it.
+	 */
+	if (*link) {
+		struct io_kiocb *prev = *link;
+
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		if (!sqe_copy) {
+			ret = -EAGAIN;
+			goto err_req;
+		}
+
+		s->sqe = sqe_copy;
+		memcpy(&req->submit, s, sizeof(*s));
+		list_add_tail(&req->list, &prev->link_list);
+	} else if (s->sqe->flags & IOSQE_IO_LINK) {
+		req->flags |= REQ_F_LINK;
+
+		memcpy(&req->submit, s, sizeof(*s));
+		INIT_LIST_HEAD(&req->link_list);
+		*link = req;
+	} else {
+		io_queue_sqe(ctx, req, s);
+	}
+}
+
+/*
+ * Batched submission is done, ensure local IO is flushed out.
+ */
+static void io_submit_state_end(struct io_submit_state *state)
+{
+	blk_finish_plug(&state->plug);
+	io_file_put(state);
+	if (state->free_reqs)
+		kmem_cache_free_bulk(req_cachep, state->free_reqs,
+					&state->reqs[state->cur_req]);
+}
+
+/*
+ * Start submission side cache.
+ */
+static void io_submit_state_start(struct io_submit_state *state,
+				  struct io_ring_ctx *ctx, unsigned max_ios)
+{
+	blk_start_plug(&state->plug);
+	state->free_reqs = 0;
+	state->file = NULL;
+	state->ios_left = max_ios;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+
+	if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
+		/*
+		 * Ensure any loads from the SQEs are done at this point,
+		 * since once we write the new head, the application could
+		 * write new data to them.
+		 */
+		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
+	}
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+	struct io_rings *rings = ctx->rings;
+	u32 *sq_array = ctx->sq_array;
+	unsigned head;
+
+	/*
+	 * The cached sq head (or cq tail) serves two purposes:
+	 *
+	 * 1) allows us to batch the cost of updating the user visible
+	 *    head.
+	 * 2) allows the kernel side to track the head on its own, even
+	 *    though the application is the one updating it.
+	 */
+	head = ctx->cached_sq_head;
+	/* make sure SQ entry isn't read before tail */
+	if (head == smp_load_acquire(&rings->sq.tail))
+		return false;
+
+	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
+	if (head < ctx->sq_entries) {
+		s->index = head;
+		s->sqe = &ctx->sq_sqes[head];
+		s->sequence = ctx->cached_sq_head;
+		ctx->cached_sq_head++;
+		return true;
+	}
+
+	/* drop invalid entries */
+	ctx->cached_sq_head++;
+	ctx->cached_sq_dropped++;
+	WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
+	return false;
+}
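+
+/*
+ * Illustrative indexing (assuming sq_entries == 8, so sq_mask == 7):
+ * with cached_sq_head == 10, sq_array[10 & 7] == sq_array[2] holds the
+ * index of the SQE to consume from ctx->sq_sqes; userspace fills that
+ * SQE, stores its index in the array, and then advances rings->sq.tail.
+ */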
+
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+			  bool has_user, bool mm_fault)
+{
+	struct io_submit_state state, *statep = NULL;
+	struct io_kiocb *link = NULL;
+	struct io_kiocb *shadow_req = NULL;
+	bool prev_was_link = false;
+	int i, submitted = 0;
+
+	if (nr > IO_PLUG_THRESHOLD) {
+		io_submit_state_start(&state, ctx, nr);
+		statep = &state;
+	}
+
+	for (i = 0; i < nr; i++) {
+		struct sqe_submit s;
+
+		if (!io_get_sqring(ctx, &s))
+			break;
+
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_link_head(ctx, link, &link->submit, shadow_req);
+			link = NULL;
+			shadow_req = NULL;
+		}
+		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
+		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
+			if (!shadow_req) {
+				shadow_req = io_get_req(ctx, NULL);
+				if (unlikely(!shadow_req))
+					goto out;
+				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
+				refcount_dec(&shadow_req->refs);
+			}
+			shadow_req->sequence = s.sequence;
+		}
+
+out:
+		if (unlikely(mm_fault)) {
+			io_cqring_add_event(ctx, s.sqe->user_data,
+						-EFAULT);
+		} else {
+			s.has_user = has_user;
+			s.needs_lock = true;
+			s.needs_fixed_file = true;
+			io_submit_sqe(ctx, &s, statep, &link);
+			submitted++;
+		}
+	}
+
+	if (link)
+		io_queue_link_head(ctx, link, &link->submit, shadow_req);
+	if (statep)
+		io_submit_state_end(&state);
+
+	return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+	struct io_ring_ctx *ctx = data;
+	struct mm_struct *cur_mm = NULL;
+	const struct cred *old_cred;
+	mm_segment_t old_fs;
+	DEFINE_WAIT(wait);
+	unsigned inflight;
+	unsigned long timeout;
+
+	complete(&ctx->sqo_thread_started);
+
+	old_fs = get_fs();
+	set_fs(USER_DS);
+	old_cred = override_creds(ctx->creds);
+
+	timeout = inflight = 0;
+	while (!kthread_should_park()) {
+		bool mm_fault = false;
+		unsigned int to_submit;
+
+		if (inflight) {
+			unsigned nr_events = 0;
+
+			if (ctx->flags & IORING_SETUP_IOPOLL) {
+				/*
+				 * inflight is the count of the maximum possible
+				 * entries we submitted, but it can be smaller
+				 * if we dropped some of them. If we don't have
+				 * poll entries available, then we know that we
+				 * have nothing left to poll for. Reset the
+				 * inflight count to zero in that case.
+				 */
+				mutex_lock(&ctx->uring_lock);
+				if (!list_empty(&ctx->poll_list))
+					__io_iopoll_check(ctx, &nr_events, 0);
+				else
+					inflight = 0;
+				mutex_unlock(&ctx->uring_lock);
+			} else {
+				/*
+				 * Normal IO, just pretend everything completed.
+				 * We don't have to poll completions for that.
+				 */
+				nr_events = inflight;
+			}
+
+			inflight -= nr_events;
+			if (!inflight)
+				timeout = jiffies + ctx->sq_thread_idle;
+		}
+
+		to_submit = io_sqring_entries(ctx);
+		if (!to_submit) {
+			/*
+			 * We're polling. If we're within the defined idle
+			 * period, then let us spin without work before going
+			 * to sleep.
+			 */
+			if (inflight || !time_after(jiffies, timeout)) {
+				cond_resched();
+				continue;
+			}
+
+			/*
+			 * Drop cur_mm before scheduling, we can't hold it for
+			 * long periods (or over schedule()). Do this before
+			 * adding ourselves to the waitqueue, as the unuse/drop
+			 * may sleep.
+			 */
+			if (cur_mm) {
+				unuse_mm(cur_mm);
+				mmput(cur_mm);
+				cur_mm = NULL;
+			}
+
+			prepare_to_wait(&ctx->sqo_wait, &wait,
+						TASK_INTERRUPTIBLE);
+
+			/* Tell userspace we may need a wakeup call */
+			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+			/* make sure to read SQ tail after writing flags */
+			smp_mb();
+
+			to_submit = io_sqring_entries(ctx);
+			if (!to_submit) {
+				if (kthread_should_park()) {
+					finish_wait(&ctx->sqo_wait, &wait);
+					break;
+				}
+				if (signal_pending(current))
+					flush_signals(current);
+				schedule();
+				finish_wait(&ctx->sqo_wait, &wait);
+
+				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+				continue;
+			}
+			finish_wait(&ctx->sqo_wait, &wait);
+
+			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+		}
+
+		/* Unless all new commands are FIXED regions, grab mm */
+		if (!cur_mm) {
+			mm_fault = !mmget_not_zero(ctx->sqo_mm);
+			if (!mm_fault) {
+				use_mm(ctx->sqo_mm);
+				cur_mm = ctx->sqo_mm;
+			}
+		}
+
+		to_submit = min(to_submit, ctx->sq_entries);
+		inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
+					   mm_fault);
+
+		/* Commit SQ ring head once we've consumed all SQEs */
+		io_commit_sqring(ctx);
+	}
+
+	set_fs(old_fs);
+	if (cur_mm) {
+		unuse_mm(cur_mm);
+		mmput(cur_mm);
+	}
+	revert_creds(old_cred);
+
+	kthread_parkme();
+
+	return 0;
+}
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+	struct io_submit_state state, *statep = NULL;
+	struct io_kiocb *link = NULL;
+	struct io_kiocb *shadow_req = NULL;
+	bool prev_was_link = false;
+	int i, submit = 0;
+
+	if (to_submit > IO_PLUG_THRESHOLD) {
+		io_submit_state_start(&state, ctx, to_submit);
+		statep = &state;
+	}
+
+	for (i = 0; i < to_submit; i++) {
+		struct sqe_submit s;
+
+		if (!io_get_sqring(ctx, &s))
+			break;
+
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_link_head(ctx, link, &link->submit, shadow_req);
+			link = NULL;
+			shadow_req = NULL;
+		}
+		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
+		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
+			if (!shadow_req) {
+				shadow_req = io_get_req(ctx, NULL);
+				if (unlikely(!shadow_req))
+					goto out;
+				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
+				refcount_dec(&shadow_req->refs);
+			}
+			shadow_req->sequence = s.sequence;
+		}
+
+out:
+		s.has_user = true;
+		s.needs_lock = false;
+		s.needs_fixed_file = false;
+		submit++;
+		io_submit_sqe(ctx, &s, statep, &link);
+	}
+
+	if (link)
+		io_queue_link_head(ctx, link, &link->submit, shadow_req);
+	if (statep)
+		io_submit_state_end(statep);
+
+	io_commit_sqring(ctx);
+
+	return submit;
+}
+
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (!io_should_wake(iowq))
+		return -1;
+
+	return autoremove_wake_function(curr, mode, wake_flags, key);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+			  const sigset_t __user *sig, size_t sigsz)
+{
+	struct io_wait_queue iowq = {
+		.wq = {
+			.private	= current,
+			.func		= io_wake_function,
+			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
+	struct io_rings *rings = ctx->rings;
+	int ret;
+
+	if (io_cqring_events(rings) >= min_events)
+		return 0;
+
+	if (sig) {
+#ifdef CONFIG_COMPAT
+		if (in_compat_syscall())
+			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+						      sigsz);
+		else
+#endif
+			ret = set_user_sigmask(sig, sigsz);
+
+		if (ret)
+			return ret;
+	}
+
+	ret = 0;
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	do {
+		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+						TASK_INTERRUPTIBLE);
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
+	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
+
+	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+}
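+
+/*
+ * Userspace reaches this wait via io_uring_enter(); a minimal sketch,
+ * assuming a raw syscall wrapper and no error handling:
+ *
+ *	io_uring_enter(ring_fd, 0, min_complete,
+ *		       IORING_ENTER_GETEVENTS, sigmask, _NSIG / 8);
+ *
+ * after which CQEs are read from the ring between cq.head and cq.tail.
+ */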
+
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock) {
+		struct sock *sock = ctx->ring_sock->sk;
+		struct sk_buff *skb;
+
+		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+			kfree_skb(skb);
+	}
+#else
+	int i;
+
+	for (i = 0; i < ctx->nr_user_files; i++)
+		fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+	if (!ctx->user_files)
+		return -ENXIO;
+
+	__io_sqe_files_unregister(ctx);
+	kfree(ctx->user_files);
+	ctx->user_files = NULL;
+	ctx->nr_user_files = 0;
+	return 0;
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_thread) {
+		wait_for_completion(&ctx->sqo_thread_started);
+		/*
+		 * The park is a bit of a work-around, without it we get
+		 * warning spews on shutdown with SQPOLL set and affinity
+		 * set to a single CPU.
+		 */
+		kthread_park(ctx->sqo_thread);
+		kthread_stop(ctx->sqo_thread);
+		ctx->sqo_thread = NULL;
+	}
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+	int i;
+
+	io_sq_thread_stop(ctx);
+
+	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
+		if (ctx->sqo_wq[i]) {
+			destroy_workqueue(ctx->sqo_wq[i]);
+			ctx->sqo_wq[i] = NULL;
+		}
+	}
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
+		if (ctx->sqo_wq[i])
+			flush_workqueue(ctx->sqo_wq[i]);
+
+	unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+	struct sock *sk = ctx->ring_sock->sk;
+	struct scm_fp_list *fpl;
+	struct sk_buff *skb;
+	int i;
+
+	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+		unsigned long inflight = ctx->user->unix_inflight + nr;
+
+		if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+			return -EMFILE;
+	}
+
+	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+	if (!fpl)
+		return -ENOMEM;
+
+	skb = alloc_skb(0, GFP_KERNEL);
+	if (!skb) {
+		kfree(fpl);
+		return -ENOMEM;
+	}
+
+	skb->sk = sk;
+	skb->destructor = io_destruct_skb;
+
+	fpl->user = get_uid(ctx->user);
+	for (i = 0; i < nr; i++) {
+		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+		unix_inflight(fpl->user, fpl->fp[i]);
+	}
+
+	fpl->max = fpl->count = nr;
+	UNIXCB(skb).fp = fpl;
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	for (i = 0; i < nr; i++)
+		fput(fpl->fp[i]);
+
+	return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can cause a reference cycle which
+ * causes regular reference counting to break down. We rely on the UNIX
+ * garbage collection to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	unsigned left, total;
+	int ret = 0;
+
+	total = 0;
+	left = ctx->nr_user_files;
+	while (left) {
+		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+
+		ret = __io_sqe_files_scm(ctx, this_files, total);
+		if (ret)
+			break;
+		left -= this_files;
+		total += this_files;
+	}
+
+	if (!ret)
+		return 0;
+
+	while (total < ctx->nr_user_files) {
+		fput(ctx->user_files[total]);
+		total++;
+	}
+
+	return ret;
+}
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+				 unsigned nr_args)
+{
+	__s32 __user *fds = (__s32 __user *) arg;
+	int fd, ret = 0;
+	unsigned i;
+
+	if (ctx->user_files)
+		return -EBUSY;
+	if (!nr_args)
+		return -EINVAL;
+	if (nr_args > IORING_MAX_FIXED_FILES)
+		return -EMFILE;
+
+	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+	if (!ctx->user_files)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		ret = -EFAULT;
+		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+			break;
+
+		ctx->user_files[i] = fget(fd);
+
+		ret = -EBADF;
+		if (!ctx->user_files[i])
+			break;
+		/*
+		 * Don't allow io_uring instances to be registered. If UNIX
+		 * isn't enabled, then this causes a reference cycle and this
+		 * instance can never get freed. If UNIX is enabled we'll
+		 * handle it just fine, but there's still no point in allowing
+		 * a ring fd as it doesn't support regular read/write anyway.
+		 */
+		if (ctx->user_files[i]->f_op == &io_uring_fops) {
+			fput(ctx->user_files[i]);
+			break;
+		}
+		ctx->nr_user_files++;
+		ret = 0;
+	}
+
+	if (ret) {
+		for (i = 0; i < ctx->nr_user_files; i++)
+			fput(ctx->user_files[i]);
+
+		kfree(ctx->user_files);
+		ctx->user_files = NULL;
+		ctx->nr_user_files = 0;
+		return ret;
+	}
+
+	ret = io_sqe_files_scm(ctx);
+	if (ret)
+		io_sqe_files_unregister(ctx);
+
+	return ret;
+}
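+
+/*
+ * Userspace sketch (illustrative): fixed files are registered up
+ * front, e.g.
+ *
+ *	int fds[2] = { fd0, fd1 };
+ *
+ *	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 2);
+ *
+ * after which an SQE sets IOSQE_FIXED_FILE and passes the array index
+ * in sqe->fd instead of a real file descriptor.
+ */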
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+			       struct io_uring_params *p)
+{
+	int ret;
+
+	init_waitqueue_head(&ctx->sqo_wait);
+	mmgrab(current->mm);
+	ctx->sqo_mm = current->mm;
+
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		ret = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto err;
+
+		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+		if (!ctx->sq_thread_idle)
+			ctx->sq_thread_idle = HZ;
+
+		if (p->flags & IORING_SETUP_SQ_AFF) {
+			int cpu = p->sq_thread_cpu;
+
+			ret = -EINVAL;
+			if (cpu >= nr_cpu_ids)
+				goto err;
+			if (!cpu_online(cpu))
+				goto err;
+
+			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+							ctx, cpu,
+							"io_uring-sq");
+		} else {
+			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+							"io_uring-sq");
+		}
+		if (IS_ERR(ctx->sqo_thread)) {
+			ret = PTR_ERR(ctx->sqo_thread);
+			ctx->sqo_thread = NULL;
+			goto err;
+		}
+		wake_up_process(ctx->sqo_thread);
+	} else if (p->flags & IORING_SETUP_SQ_AFF) {
+		/* Can't have SQ_AFF without SQPOLL */
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Do QD, or 2 * CPUS, whichever is smaller */
+	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
+			WQ_UNBOUND | WQ_FREEZABLE,
+			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+	if (!ctx->sqo_wq[0]) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * This is for buffered writes, where we want to limit the parallelism
+	 * due to file locking in file systems. As "normal" buffered writes
+	 * should parallelize on writeout quite nicely, limit us to having 2
+	 * pending. This avoids massive contention on the inode when doing
+	 * buffered async writes.
+	 */
+	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
+						WQ_UNBOUND | WQ_FREEZABLE, 2);
+	if (!ctx->sqo_wq[1]) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return 0;
+err:
+	io_finish_async(ctx);
+	mmdrop(ctx->sqo_mm);
+	ctx->sqo_mm = NULL;
+	return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	unsigned long page_limit, cur_pages, new_pages;
+
+	/* Don't allow more pages than we can safely lock */
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit)
+			return -ENOMEM;
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+					new_pages) != cur_pages);
+
+	return 0;
+}
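+
+/*
+ * Example of the accounting above (illustrative numbers): with
+ * RLIMIT_MEMLOCK == 64KB and 4KB pages, page_limit == 16; a user
+ * already holding 10 accounted pages may account 6 more, while a
+ * request for 7 fails with -ENOMEM without modifying locked_vm.
+ */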
+
+static void io_mem_free(void *ptr)
+{
+	struct page *page;
+
+	if (!ptr)
+		return;
+
+	page = virt_to_head_page(ptr);
+	if (put_page_testzero(page))
+		free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+				__GFP_NORETRY;
+
+	return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
+				size_t *sq_offset)
+{
+	struct io_rings *rings;
+	size_t off, sq_array_size;
+
+	off = struct_size(rings, cqes, cq_entries);
+	if (off == SIZE_MAX)
+		return SIZE_MAX;
+
+#ifdef CONFIG_SMP
+	off = ALIGN(off, SMP_CACHE_BYTES);
+	if (off == 0)
+		return SIZE_MAX;
+#endif
+
+	sq_array_size = array_size(sizeof(u32), sq_entries);
+	if (sq_array_size == SIZE_MAX)
+		return SIZE_MAX;
+
+	if (check_add_overflow(off, sq_array_size, &off))
+		return SIZE_MAX;
+
+	if (sq_offset)
+		*sq_offset = off;
+
+	return off;
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+	size_t pages;
+
+	pages = (size_t)1 << get_order(
+		rings_size(sq_entries, cq_entries, NULL));
+	pages += (size_t)1 << get_order(
+		array_size(sizeof(struct io_uring_sqe), sq_entries));
+
+	return pages;
+}
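+
+/*
+ * Rough sizing sketch (CQEs are 16 bytes and SQEs 64 bytes in this
+ * ABI): for sq_entries == cq_entries == 128, the rings allocation
+ * holds the io_rings header, 128 * 16 bytes of CQEs and a cache-line
+ * aligned 128 * 4 byte sq_array, while the separate SQE array adds
+ * 128 * 64 bytes; each allocation is rounded up to a power-of-two
+ * page order.
+ */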
+
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+	int i, j;
+
+	if (!ctx->user_bufs)
+		return -ENXIO;
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+		for (j = 0; j < imu->nr_bvecs; j++)
+			put_user_page(imu->bvec[j].bv_page);
+
+		if (ctx->account_mem)
+			io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		kvfree(imu->bvec);
+		imu->nr_bvecs = 0;
+	}
+
+	kfree(ctx->user_bufs);
+	ctx->user_bufs = NULL;
+	ctx->nr_user_bufs = 0;
+	return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+		       void __user *arg, unsigned index)
+{
+	struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		struct compat_iovec __user *ciovs;
+		struct compat_iovec ciov;
+
+		ciovs = (struct compat_iovec __user *) arg;
+		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+			return -EFAULT;
+
+		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+		dst->iov_len = ciov.iov_len;
+		return 0;
+	}
+#endif
+	src = (struct iovec __user *) arg;
+	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+		return -EFAULT;
+	return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+				  unsigned nr_args)
+{
+	struct vm_area_struct **vmas = NULL;
+	struct page **pages = NULL;
+	int i, j, got_pages = 0;
+	int ret = -EINVAL;
+
+	if (ctx->user_bufs)
+		return -EBUSY;
+	if (!nr_args || nr_args > UIO_MAXIOV)
+		return -EINVAL;
+
+	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+					GFP_KERNEL);
+	if (!ctx->user_bufs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+		unsigned long off, start, end, ubuf;
+		int pret, nr_pages;
+		struct iovec iov;
+		size_t size;
+
+		ret = io_copy_iov(ctx, &iov, arg, i);
+		if (ret)
+			goto err;
+
+		/*
+		 * Don't impose further limits on the size and buffer
+		 * constraints here, we'll -EINVAL later when IO is
+		 * submitted if they are wrong.
+		 */
+		ret = -EFAULT;
+		if (!iov.iov_base || !iov.iov_len)
+			goto err;
+
+		/* arbitrary limit, but we need something */
+		if (iov.iov_len > SZ_1G)
+			goto err;
+
+		ubuf = (unsigned long) iov.iov_base;
+		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		start = ubuf >> PAGE_SHIFT;
+		nr_pages = end - start;
+
+		if (ctx->account_mem) {
+			ret = io_account_mem(ctx->user, nr_pages);
+			if (ret)
+				goto err;
+		}
+
+		ret = 0;
+		if (!pages || nr_pages > got_pages) {
+			kfree(vmas);
+			kfree(pages);
+			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
+						GFP_KERNEL);
+			vmas = kvmalloc_array(nr_pages,
+					sizeof(struct vm_area_struct *),
+					GFP_KERNEL);
+			if (!pages || !vmas) {
+				ret = -ENOMEM;
+				if (ctx->account_mem)
+					io_unaccount_mem(ctx->user, nr_pages);
+				goto err;
+			}
+			got_pages = nr_pages;
+		}
+
+		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
+						GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!imu->bvec) {
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		ret = 0;
+		down_read(&current->mm->mmap_sem);
+		pret = get_user_pages(ubuf, nr_pages,
+				      FOLL_WRITE | FOLL_LONGTERM,
+				      pages, vmas);
+		if (pret == nr_pages) {
+			/* we don't support file-backed memory */
+			for (j = 0; j < nr_pages; j++) {
+				struct vm_area_struct *vma = vmas[j];
+
+				if (vma->vm_file &&
+				    !is_file_hugepages(vma->vm_file)) {
+					ret = -EOPNOTSUPP;
+					break;
+				}
+			}
+		} else {
+			ret = pret < 0 ? pret : -EFAULT;
+		}
+		up_read(&current->mm->mmap_sem);
+		if (ret) {
+			/*
+			 * If we did a partial map, or found file-backed
+			 * vmas, release any pages we did get.
+			 */
+			if (pret > 0)
+				put_user_pages(pages, pret);
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			kvfree(imu->bvec);
+			goto err;
+		}
+
+		off = ubuf & ~PAGE_MASK;
+		size = iov.iov_len;
+		for (j = 0; j < nr_pages; j++) {
+			size_t vec_len;
+
+			vec_len = min_t(size_t, size, PAGE_SIZE - off);
+			imu->bvec[j].bv_page = pages[j];
+			imu->bvec[j].bv_len = vec_len;
+			imu->bvec[j].bv_offset = off;
+			off = 0;
+			size -= vec_len;
+		}
+		/* store original address for later verification */
+		imu->ubuf = ubuf;
+		imu->len = iov.iov_len;
+		imu->nr_bvecs = nr_pages;
+
+		ctx->nr_user_bufs++;
+	}
+	kvfree(pages);
+	kvfree(vmas);
+	return 0;
+err:
+	kvfree(pages);
+	kvfree(vmas);
+	io_sqe_buffer_unregister(ctx);
+	return ret;
+}
+
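+/*
+ * Attach an eventfd that gets signalled on completion events; only a
+ * single eventfd may be registered per ring at any time.
+ */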
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+{
+	__s32 __user *fds = arg;
+	int fd;
+
+	if (ctx->cq_ev_fd)
+		return -EBUSY;
+
+	if (copy_from_user(&fd, fds, sizeof(*fds)))
+		return -EFAULT;
+
+	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(ctx->cq_ev_fd)) {
+		int ret = PTR_ERR(ctx->cq_ev_fd);
+		ctx->cq_ev_fd = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+static int io_eventfd_unregister(struct io_ring_ctx *ctx)
+{
+	if (ctx->cq_ev_fd) {
+		eventfd_ctx_put(ctx->cq_ev_fd);
+		ctx->cq_ev_fd = NULL;
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+	io_finish_async(ctx);
+	if (ctx->sqo_mm)
+		mmdrop(ctx->sqo_mm);
+
+	io_iopoll_reap_events(ctx);
+	io_sqe_buffer_unregister(ctx);
+	io_sqe_files_unregister(ctx);
+	io_eventfd_unregister(ctx);
+
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock) {
+		ctx->ring_sock->file = NULL; /* so that iput() is called */
+		sock_release(ctx->ring_sock);
+	}
+#endif
+
+	io_mem_free(ctx->rings);
+	io_mem_free(ctx->sq_sqes);
+
+	percpu_ref_exit(&ctx->refs);
+	if (ctx->account_mem)
+		io_unaccount_mem(ctx->user,
+				ring_pages(ctx->sq_entries, ctx->cq_entries));
+	free_uid(ctx->user);
+	if (ctx->creds)
+		put_cred(ctx->creds);
+	kfree(ctx);
+}
+
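+/*
+ * Poll semantics: EPOLLOUT means there is room in the SQ ring for more
+ * submissions, EPOLLIN means completions are waiting to be reaped.
+ */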
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &ctx->cq_wait, wait);
+	/*
+	 * synchronizes with barrier from wq_has_sleeper call in
+	 * io_commit_cqring
+	 */
+	smp_rmb();
+	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
+	    ctx->rings->sq_ring_entries)
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+	mutex_lock(&ctx->uring_lock);
+	percpu_ref_kill(&ctx->refs);
+	mutex_unlock(&ctx->uring_lock);
+
+	io_kill_timeouts(ctx);
+	io_poll_remove_all(ctx);
+	io_iopoll_reap_events(ctx);
+	wait_for_completion(&ctx->ctx_done);
+	io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	file->private_data = NULL;
+	io_ring_ctx_wait_and_kill(ctx);
+	return 0;
+}
+
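+/*
+ * The application mmaps the rings and the sqe array through magic
+ * offsets (IORING_OFF_*) on the ring fd; the offset selects which
+ * kernel allocation is exposed, and the requested length must fit
+ * within it.
+ */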
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long sz = vma->vm_end - vma->vm_start;
+	struct io_ring_ctx *ctx = file->private_data;
+	unsigned long pfn;
+	struct page *page;
+	void *ptr;
+
+	switch (offset) {
+	case IORING_OFF_SQ_RING:
+	case IORING_OFF_CQ_RING:
+		ptr = ctx->rings;
+		break;
+	case IORING_OFF_SQES:
+		ptr = ctx->sq_sqes;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	page = virt_to_head_page(ptr);
+	if (sz > page_size(page))
+		return -EINVAL;
+
+	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
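+/*
+ * Submit sqes and/or wait for completions. With SQPOLL the kernel
+ * thread owns submission, so we only wake it up if asked to;
+ * otherwise submit up to sq_entries requests under uring_lock. If
+ * IORING_ENTER_GETEVENTS is set, also reap (IOPOLL) or sleep for at
+ * least min_complete completions.
+ */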
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		size_t, sigsz)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	int submitted = 0;
+	struct fd f;
+
+	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ret = -ENXIO;
+	ctx = f.file->private_data;
+	if (!percpu_ref_tryget(&ctx->refs))
+		goto out_fput;
+
+	/*
+	 * For SQ polling, the thread will do all submissions and completions.
+	 * Just return the requested submit count, and wake the thread if
+	 * we were asked to.
+	 */
+	ret = 0;
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (flags & IORING_ENTER_SQ_WAKEUP)
+			wake_up(&ctx->sqo_wait);
+		submitted = to_submit;
+	} else if (to_submit) {
+		to_submit = min(to_submit, ctx->sq_entries);
+
+		mutex_lock(&ctx->uring_lock);
+		submitted = io_ring_submit(ctx, to_submit);
+		mutex_unlock(&ctx->uring_lock);
+	}
+	if (flags & IORING_ENTER_GETEVENTS) {
+		unsigned nr_events = 0;
+
+		min_complete = min(min_complete, ctx->cq_entries);
+
+		if (ctx->flags & IORING_SETUP_IOPOLL) {
+			ret = io_iopoll_check(ctx, &nr_events, min_complete);
+		} else {
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+		}
+	}
+
+	percpu_ref_put(&ctx->refs);
+out_fput:
+	fdput(f);
+	return submitted ? submitted : ret;
+}
+
+static const struct file_operations io_uring_fops = {
+	.release	= io_uring_release,
+	.mmap		= io_uring_mmap,
+	.poll		= io_uring_poll,
+	.fasync		= io_uring_fasync,
+};
+
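+/*
+ * Allocate the shared SQ/CQ ring area and the sqe array, and mirror
+ * the ring geometry both into the shared region (for the application)
+ * and into the ctx (for kernel-side fast paths).
+ */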
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+				  struct io_uring_params *p)
+{
+	struct io_rings *rings;
+	size_t size, sq_array_offset;
+
+	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	rings = io_mem_alloc(size);
+	if (!rings)
+		return -ENOMEM;
+
+	ctx->rings = rings;
+	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+	rings->sq_ring_mask = p->sq_entries - 1;
+	rings->cq_ring_mask = p->cq_entries - 1;
+	rings->sq_ring_entries = p->sq_entries;
+	rings->cq_ring_entries = p->cq_entries;
+	ctx->sq_mask = rings->sq_ring_mask;
+	ctx->cq_mask = rings->cq_ring_mask;
+	ctx->sq_entries = rings->sq_ring_entries;
+	ctx->cq_entries = rings->cq_ring_entries;
+
+	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	ctx->sq_sqes = io_mem_alloc(size);
+	if (!ctx->sq_sqes)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Allocate an anonymous fd; this is what constitutes the
+ * application-visible backing of an io_uring instance. The application
+ * mmaps this fd to gain access to the SQ/CQ ring details. If UNIX
+ * sockets are enabled, we have to tie this fd to a socket for file
+ * garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+	struct file *file;
+	int ret;
+
+#if defined(CONFIG_UNIX)
+	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+				&ctx->ring_sock);
+	if (ret)
+		return ret;
+#endif
+
+	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		goto err;
+
+	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+					O_RDWR | O_CLOEXEC);
+	if (IS_ERR(file)) {
+		put_unused_fd(ret);
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+#if defined(CONFIG_UNIX)
+	ctx->ring_sock->file = file;
+	ctx->ring_sock->sk->sk_user_data = ctx;
+#endif
+	fd_install(ret, file);
+	return ret;
+err:
+#if defined(CONFIG_UNIX)
+	sock_release(ctx->ring_sock);
+	ctx->ring_sock = NULL;
+#endif
+	return ret;
+}
+
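+/*
+ * Core setup path: size the rings, charge them against the locked
+ * memory limit unless the caller has CAP_IPC_LOCK, allocate the ctx
+ * and ring memory, start the submission offload machinery, and report
+ * the ring layout back through the params structure.
+ */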
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+	struct user_struct *user = NULL;
+	struct io_ring_ctx *ctx;
+	bool account_mem;
+	int ret;
+
+	if (!entries || entries > IORING_MAX_ENTRIES)
+		return -EINVAL;
+
+	/*
+	 * Use twice as many entries for the CQ ring. It's possible for the
+	 * application to drive a higher depth than the size of the SQ ring,
+	 * since the sqes are only used at submission time. This allows for
+	 * some flexibility in overcommitting.
+	 */
+	p->sq_entries = roundup_pow_of_two(entries);
+	p->cq_entries = 2 * p->sq_entries;
+
+	user = get_uid(current_user());
+	account_mem = !capable(CAP_IPC_LOCK);
+
+	if (account_mem) {
+		ret = io_account_mem(user,
+				ring_pages(p->sq_entries, p->cq_entries));
+		if (ret) {
+			free_uid(user);
+			return ret;
+		}
+	}
+
+	ctx = io_ring_ctx_alloc(p);
+	if (!ctx) {
+		if (account_mem)
+			io_unaccount_mem(user, ring_pages(p->sq_entries,
+								p->cq_entries));
+		free_uid(user);
+		return -ENOMEM;
+	}
+	ctx->compat = in_compat_syscall();
+	ctx->account_mem = account_mem;
+	ctx->user = user;
+
+	ctx->creds = prepare_creds();
+	if (!ctx->creds) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = io_allocate_scq_urings(ctx, p);
+	if (ret)
+		goto err;
+
+	ret = io_sq_offload_start(ctx, p);
+	if (ret)
+		goto err;
+
+	memset(&p->sq_off, 0, sizeof(p->sq_off));
+	p->sq_off.head = offsetof(struct io_rings, sq.head);
+	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
+	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
+	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
+	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
+	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
+	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+
+	memset(&p->cq_off, 0, sizeof(p->cq_off));
+	p->cq_off.head = offsetof(struct io_rings, cq.head);
+	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
+	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
+	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
+	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
+	p->cq_off.cqes = offsetof(struct io_rings, cqes);
+
+	/*
+	 * Install ring fd as the very last thing, so we don't risk someone
+	 * having closed it before we finish setup
+	 */
+	ret = io_uring_get_fd(ctx);
+	if (ret < 0)
+		goto err;
+
+	p->features = IORING_FEAT_SINGLE_MMAP;
+	return ret;
+err:
+	io_ring_ctx_wait_and_kill(ctx);
+	return ret;
+}
+
+/*
+ * Sets up an io_uring context and returns the fd. The application asks
+ * for a ring size; we return the actual sq/cq ring sizes (among other
+ * things) in the params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+	struct io_uring_params p;
+	long ret;
+	int i;
+
+	if (copy_from_user(&p, params, sizeof(p)))
+		return -EFAULT;
+	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+		if (p.resv[i])
+			return -EINVAL;
+	}
+
+	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+			IORING_SETUP_SQ_AFF))
+		return -EINVAL;
+
+	ret = io_uring_create(entries, &p);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(params, &p, sizeof(p)))
+		return -EFAULT;
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+		struct io_uring_params __user *, params)
+{
+	return io_uring_setup(entries, params);
+}
+
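+/*
+ * All registration opcodes operate on a fully quiesced ring: the
+ * percpu ref is killed and drained before the opcode runs, and revived
+ * once it completes.
+ */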
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+			       void __user *arg, unsigned nr_args)
+	__releases(ctx->uring_lock)
+	__acquires(ctx->uring_lock)
+{
+	int ret;
+
+	/*
+	 * We're inside the ring mutex, if the ref is already dying, then
+	 * someone else killed the ctx or is already going through
+	 * io_uring_register().
+	 */
+	if (percpu_ref_is_dying(&ctx->refs))
+		return -ENXIO;
+
+	percpu_ref_kill(&ctx->refs);
+
+	/*
+	 * Drop uring mutex before waiting for references to exit. If another
+	 * thread is currently inside io_uring_enter() it might need to grab
+	 * the uring_lock to make progress. If we hold it here across the drain
+	 * wait, then we can deadlock. It's safe to drop the mutex here, since
+	 * no new references will come in after we've killed the percpu ref.
+	 */
+	mutex_unlock(&ctx->uring_lock);
+	wait_for_completion(&ctx->ctx_done);
+	mutex_lock(&ctx->uring_lock);
+
+	switch (opcode) {
+	case IORING_REGISTER_BUFFERS:
+		ret = io_sqe_buffer_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_BUFFERS:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_buffer_unregister(ctx);
+		break;
+	case IORING_REGISTER_FILES:
+		ret = io_sqe_files_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_FILES:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_files_unregister(ctx);
+		break;
+	case IORING_REGISTER_EVENTFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_eventfd_register(ctx, arg);
+		break;
+	case IORING_UNREGISTER_EVENTFD:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_eventfd_unregister(ctx);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	/* bring the ctx back to life */
+	reinit_completion(&ctx->ctx_done);
+	percpu_ref_reinit(&ctx->refs);
+	return ret;
+}
+
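+/*
+ * Resolve the ring fd and run the requested registration opcode with
+ * uring_lock held.
+ */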
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+		void __user *, arg, unsigned int, nr_args)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ctx = f.file->private_data;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_uring_register(ctx, opcode, arg, nr_args);
+	mutex_unlock(&ctx->uring_lock);
+out_fput:
+	fdput(f);
+	return ret;
+}
+
+static int __init io_uring_init(void)
+{
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	return 0;
+}
+__initcall(io_uring_init);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0400297..fef3a6b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -203,7 +203,7 @@
 	fieinfo.fi_extents_start = ufiemap->fm_extents;
 
 	if (fiemap.fm_extent_count != 0 &&
-	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+	    !access_ok(fieinfo.fi_extents_start,
 		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
 		return -EFAULT;
 
@@ -223,6 +223,7 @@
 			     u64 off, u64 olen, u64 destoff)
 {
 	struct fd src_file = fdget(srcfd);
+	loff_t cloned;
 	int ret;
 
 	if (!src_file.file)
@@ -230,7 +231,14 @@
 	ret = -EXDEV;
 	if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
 		goto fdput;
-	ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+	cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+				      olen, 0);
+	if (cloned < 0)
+		ret = cloned;
+	else if (olen && cloned != olen)
+		ret = -EINVAL;
+	else
+		ret = 0;
 fdput:
 	fdput(src_file);
 	return ret;
diff --git a/fs/iomap.c b/fs/iomap.c
deleted file mode 100644
index ec15cf2..0000000
--- a/fs/iomap.c
+++ /dev/null
@@ -1,2110 +0,0 @@
-/*
- * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/iomap.h>
-#include <linux/uaccess.h>
-#include <linux/gfp.h>
-#include <linux/migrate.h>
-#include <linux/mm.h>
-#include <linux/mm_inline.h>
-#include <linux/swap.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/file.h>
-#include <linux/uio.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/dax.h>
-#include <linux/sched/signal.h>
-#include <linux/swap.h>
-
-#include "internal.h"
-
-/*
- * Execute an iomap write on a segment of the mapping that spans a
- * contiguous range of pages that have identical block mapping state.
- *
- * This avoids the need to map pages individually, to do individual
- * allocations for each page and, most importantly, the need for
- * filesystem-specific locking per page. Instead, all the operations are
- * amortised over the entire
- * range of pages. It is assumed that the filesystems will lock whatever
- * resources they require in the iomap_begin call, and release them in the
- * iomap_end call.
- */
-loff_t
-iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
-{
-	struct iomap iomap = { 0 };
-	loff_t written = 0, ret;
-
-	/*
-	 * Need to map a range from start position for length bytes. This can
-	 * span multiple pages - it is only guaranteed to return a range of a
-	 * single type of pages (e.g. all into a hole, all mapped or all
-	 * unwritten). Failure at this point has nothing to undo.
-	 *
-	 * If allocation is required for this range, reserve the space now so
-	 * that the allocation is guaranteed to succeed later on. Once we copy
-	 * the data into the page cache pages, then we cannot fail otherwise we
-	 * expose transient stale data. If the reserve fails, we can safely
-	 * back out at this point as there is nothing to undo.
-	 */
-	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
-	if (ret)
-		return ret;
-	if (WARN_ON(iomap.offset > pos))
-		return -EIO;
-	if (WARN_ON(iomap.length == 0))
-		return -EIO;
-
-	/*
-	 * Cut down the length to the one actually provided by the filesystem,
-	 * as it might not be able to give us the whole size that we requested.
-	 */
-	if (iomap.offset + iomap.length < pos + length)
-		length = iomap.offset + iomap.length - pos;
-
-	/*
- * Now that we have guaranteed that the space allocation will succeed,
-	 * we can do the copy-in page by page without having to worry about
-	 * failures exposing transient data.
-	 */
-	written = actor(inode, pos, length, data, &iomap);
-
-	/*
-	 * Now the data has been copied, commit the range we've copied.  This
-	 * should not fail unless the filesystem has had a fatal error.
-	 */
-	if (ops->iomap_end) {
-		ret = ops->iomap_end(inode, pos, length,
-				     written > 0 ? written : 0,
-				     flags, &iomap);
-	}
-
-	return written ? written : ret;
-}
-
-static sector_t
-iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
-}
-
-static struct iomap_page *
-iomap_page_create(struct inode *inode, struct page *page)
-{
-	struct iomap_page *iop = to_iomap_page(page);
-
-	if (iop || i_blocksize(inode) == PAGE_SIZE)
-		return iop;
-
-	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
-	atomic_set(&iop->read_count, 0);
-	atomic_set(&iop->write_count, 0);
-	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
-	set_page_private(page, (unsigned long)iop);
-	SetPagePrivate(page);
-	return iop;
-}
-
-static void
-iomap_page_release(struct page *page)
-{
-	struct iomap_page *iop = to_iomap_page(page);
-
-	if (!iop)
-		return;
-	WARN_ON_ONCE(atomic_read(&iop->read_count));
-	WARN_ON_ONCE(atomic_read(&iop->write_count));
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	kfree(iop);
-}
-
-/*
- * Calculate the range inside the page that we actually need to read.
- */
-static void
-iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
-		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
-{
-	unsigned block_bits = inode->i_blkbits;
-	unsigned block_size = (1 << block_bits);
-	unsigned poff = offset_in_page(*pos);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
-	unsigned first = poff >> block_bits;
-	unsigned last = (poff + plen - 1) >> block_bits;
-	unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
-
-	/*
-	 * If the block size is smaller than the page size we need to check the
-	 * per-block uptodate status and adjust the offset and length if needed
-	 * to avoid reading in already uptodate ranges.
-	 */
-	if (iop) {
-		unsigned int i;
-
-		/* move forward for each leading block marked uptodate */
-		for (i = first; i <= last; i++) {
-			if (!test_bit(i, iop->uptodate))
-				break;
-			*pos += block_size;
-			poff += block_size;
-			plen -= block_size;
-			first++;
-		}
-
-		/* truncate len if we find any trailing uptodate block(s) */
-		for ( ; i <= last; i++) {
-			if (test_bit(i, iop->uptodate)) {
-				plen -= (last - i + 1) * block_size;
-				last = i - 1;
-				break;
-			}
-		}
-	}
-
-	/*
-	 * If the extent spans the block that contains the i_size we need to
-	 * handle both halves separately so that we properly zero data in the
-	 * page cache for blocks that are entirely outside of i_size.
-	 */
-	if (first <= end && last > end)
-		plen -= (last - end) * block_size;
-
-	*offp = poff;
-	*lenp = plen;
-}
-
-static void
-iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
-{
-	struct iomap_page *iop = to_iomap_page(page);
-	struct inode *inode = page->mapping->host;
-	unsigned first = off >> inode->i_blkbits;
-	unsigned last = (off + len - 1) >> inode->i_blkbits;
-	unsigned int i;
-	bool uptodate = true;
-
-	if (iop) {
-		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
-			if (i >= first && i <= last)
-				set_bit(i, iop->uptodate);
-			else if (!test_bit(i, iop->uptodate))
-				uptodate = false;
-		}
-	}
-
-	if (uptodate && !PageError(page))
-		SetPageUptodate(page);
-}
-
-static void
-iomap_read_finish(struct iomap_page *iop, struct page *page)
-{
-	if (!iop || atomic_dec_and_test(&iop->read_count))
-		unlock_page(page);
-}
-
-static void
-iomap_read_page_end_io(struct bio_vec *bvec, int error)
-{
-	struct page *page = bvec->bv_page;
-	struct iomap_page *iop = to_iomap_page(page);
-
-	if (unlikely(error)) {
-		ClearPageUptodate(page);
-		SetPageError(page);
-	} else {
-		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
-	}
-
-	iomap_read_finish(iop, page);
-}
-
-static void
-iomap_read_inline_data(struct inode *inode, struct page *page,
-		struct iomap *iomap)
-{
-	size_t size = i_size_read(inode);
-	void *addr;
-
-	if (PageUptodate(page))
-		return;
-
-	BUG_ON(page->index);
-	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
-	addr = kmap_atomic(page);
-	memcpy(addr, iomap->inline_data, size);
-	memset(addr + size, 0, PAGE_SIZE - size);
-	kunmap_atomic(addr);
-	SetPageUptodate(page);
-}
-
-static void
-iomap_read_end_io(struct bio *bio)
-{
-	int error = blk_status_to_errno(bio->bi_status);
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i)
-		iomap_read_page_end_io(bvec, error);
-	bio_put(bio);
-}
-
-struct iomap_readpage_ctx {
-	struct page		*cur_page;
-	bool			cur_page_in_bio;
-	bool			is_readahead;
-	struct bio		*bio;
-	struct list_head	*pages;
-};
-
-static loff_t
-iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
-{
-	struct iomap_readpage_ctx *ctx = data;
-	struct page *page = ctx->cur_page;
-	struct iomap_page *iop = iomap_page_create(inode, page);
-	bool is_contig = false;
-	loff_t orig_pos = pos;
-	unsigned poff, plen;
-	sector_t sector;
-
-	if (iomap->type == IOMAP_INLINE) {
-		WARN_ON_ONCE(pos);
-		iomap_read_inline_data(inode, page, iomap);
-		return PAGE_SIZE;
-	}
-
-	/* zero post-eof blocks as the page may be mapped */
-	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
-	if (plen == 0)
-		goto done;
-
-	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
-		zero_user(page, poff, plen);
-		iomap_set_range_uptodate(page, poff, plen);
-		goto done;
-	}
-
-	ctx->cur_page_in_bio = true;
-
-	/*
-	 * Try to merge into a previous segment if we can.
-	 */
-	sector = iomap_sector(iomap, pos);
-	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff))
-			goto done;
-		is_contig = true;
-	}
-
-	/*
-	 * If we start a new segment we need to increase the read count, and we
-	 * need to do so before submitting any previous full bio to make sure
-	 * that we don't prematurely unlock the page.
-	 */
-	if (iop)
-		atomic_inc(&iop->read_count);
-
-	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
-		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
-		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-		if (ctx->bio)
-			submit_bio(ctx->bio);
-
-		if (ctx->is_readahead) /* same as readahead_gfp_mask */
-			gfp |= __GFP_NORETRY | __GFP_NOWARN;
-		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
-		ctx->bio->bi_opf = REQ_OP_READ;
-		if (ctx->is_readahead)
-			ctx->bio->bi_opf |= REQ_RAHEAD;
-		ctx->bio->bi_iter.bi_sector = sector;
-		bio_set_dev(ctx->bio, iomap->bdev);
-		ctx->bio->bi_end_io = iomap_read_end_io;
-	}
-
-	__bio_add_page(ctx->bio, page, plen, poff);
-done:
-	/*
-	 * Move the caller beyond our range so that it keeps making progress.
-	 * For that we have to include any leading non-uptodate ranges, but
-	 * we can skip trailing ones as they will be handled in the next
-	 * iteration.
-	 */
-	return pos - orig_pos + plen;
-}
-
-int
-iomap_readpage(struct page *page, const struct iomap_ops *ops)
-{
-	struct iomap_readpage_ctx ctx = { .cur_page = page };
-	struct inode *inode = page->mapping->host;
-	unsigned poff;
-	loff_t ret;
-
-	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
-		ret = iomap_apply(inode, page_offset(page) + poff,
-				PAGE_SIZE - poff, 0, ops, &ctx,
-				iomap_readpage_actor);
-		if (ret <= 0) {
-			WARN_ON_ONCE(ret == 0);
-			SetPageError(page);
-			break;
-		}
-	}
-
-	if (ctx.bio) {
-		submit_bio(ctx.bio);
-		WARN_ON_ONCE(!ctx.cur_page_in_bio);
-	} else {
-		WARN_ON_ONCE(ctx.cur_page_in_bio);
-		unlock_page(page);
-	}
-
-	/*
-	 * Just like mpage_readpages and block_read_full_page, we always
-	 * return 0 and just mark the page as PageError on errors.  This
-	 * should be cleaned up all through the stack eventually.
-	 */
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iomap_readpage);
-
-static struct page *
-iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
-		loff_t length, loff_t *done)
-{
-	while (!list_empty(pages)) {
-		struct page *page = lru_to_page(pages);
-
-		if (page_offset(page) >= (u64)pos + length)
-			break;
-
-		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
-				GFP_NOFS))
-			return page;
-
-		/*
-		 * If we already have a page in the page cache at index we are
-		 * done.  Upper layers don't care if it is uptodate after the
-		 * readpages call itself as every page gets checked again once
-		 * actually needed.
-		 */
-		*done += PAGE_SIZE;
-		put_page(page);
-	}
-
-	return NULL;
-}
-
-static loff_t
-iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	struct iomap_readpage_ctx *ctx = data;
-	loff_t done, ret;
-
-	for (done = 0; done < length; done += ret) {
-		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
-			if (!ctx->cur_page_in_bio)
-				unlock_page(ctx->cur_page);
-			put_page(ctx->cur_page);
-			ctx->cur_page = NULL;
-		}
-		if (!ctx->cur_page) {
-			ctx->cur_page = iomap_next_page(inode, ctx->pages,
-					pos, length, &done);
-			if (!ctx->cur_page)
-				break;
-			ctx->cur_page_in_bio = false;
-		}
-		ret = iomap_readpage_actor(inode, pos + done, length - done,
-				ctx, iomap);
-	}
-
-	return done;
-}
-
-int
-iomap_readpages(struct address_space *mapping, struct list_head *pages,
-		unsigned nr_pages, const struct iomap_ops *ops)
-{
-	struct iomap_readpage_ctx ctx = {
-		.pages		= pages,
-		.is_readahead	= true,
-	};
-	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
-	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
-	loff_t length = last - pos + PAGE_SIZE, ret = 0;
-
-	while (length > 0) {
-		ret = iomap_apply(mapping->host, pos, length, 0, ops,
-				&ctx, iomap_readpages_actor);
-		if (ret <= 0) {
-			WARN_ON_ONCE(ret == 0);
-			goto done;
-		}
-		pos += ret;
-		length -= ret;
-	}
-	ret = 0;
-done:
-	if (ctx.bio)
-		submit_bio(ctx.bio);
-	if (ctx.cur_page) {
-		if (!ctx.cur_page_in_bio)
-			unlock_page(ctx.cur_page);
-		put_page(ctx.cur_page);
-	}
-
-	/*
-	 * Check that we didn't lose a page due to the arcane calling
-	 * conventions.
-	 */
-	WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iomap_readpages);
-
-int
-iomap_is_partially_uptodate(struct page *page, unsigned long from,
-		unsigned long count)
-{
-	struct iomap_page *iop = to_iomap_page(page);
-	struct inode *inode = page->mapping->host;
-	unsigned first = from >> inode->i_blkbits;
-	unsigned last = (from + count - 1) >> inode->i_blkbits;
-	unsigned i;
-
-	if (iop) {
-		for (i = first; i <= last; i++)
-			if (!test_bit(i, iop->uptodate))
-				return 0;
-		return 1;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
-
-int
-iomap_releasepage(struct page *page, gfp_t gfp_mask)
-{
-	/*
-	 * mm accommodates an old ext3 case where clean pages might not have had
-	 * the dirty bit cleared. Thus, it can send actual dirty pages to
-	 * ->releasepage() via shrink_active_list(); skip those here.
-	 */
-	if (PageDirty(page) || PageWriteback(page))
-		return 0;
-	iomap_page_release(page);
-	return 1;
-}
-EXPORT_SYMBOL_GPL(iomap_releasepage);
-
-void
-iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
-{
-	/*
-	 * If we are invalidating the entire page, clear the dirty state from it
-	 * and release it to avoid unnecessary buildup of the LRU.
-	 */
-	if (offset == 0 && len == PAGE_SIZE) {
-		WARN_ON_ONCE(PageWriteback(page));
-		cancel_dirty_page(page);
-		iomap_page_release(page);
-	}
-}
-EXPORT_SYMBOL_GPL(iomap_invalidatepage);
-
-#ifdef CONFIG_MIGRATION
-int
-iomap_migrate_page(struct address_space *mapping, struct page *newpage,
-		struct page *page, enum migrate_mode mode)
-{
-	int ret;
-
-	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
-	if (ret != MIGRATEPAGE_SUCCESS)
-		return ret;
-
-	if (page_has_private(page)) {
-		ClearPagePrivate(page);
-		set_page_private(newpage, page_private(page));
-		set_page_private(page, 0);
-		SetPagePrivate(newpage);
-	}
-
-	if (mode != MIGRATE_SYNC_NO_COPY)
-		migrate_page_copy(newpage, page);
-	else
-		migrate_page_states(newpage, page);
-	return MIGRATEPAGE_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(iomap_migrate_page);
-#endif /* CONFIG_MIGRATION */
-
-static void
-iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
-{
-	loff_t i_size = i_size_read(inode);
-
-	/*
-	 * Only truncate newly allocated pages beyond EOF, even if the
-	 * write started inside the existing inode size.
-	 */
-	if (pos + len > i_size)
-		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
-}
-
-static int
-iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
-		unsigned poff, unsigned plen, unsigned from, unsigned to,
-		struct iomap *iomap)
-{
-	struct bio_vec bvec;
-	struct bio bio;
-
-	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
-		zero_user_segments(page, poff, from, to, poff + plen);
-		iomap_set_range_uptodate(page, poff, plen);
-		return 0;
-	}
-
-	bio_init(&bio, &bvec, 1);
-	bio.bi_opf = REQ_OP_READ;
-	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
-	bio_set_dev(&bio, iomap->bdev);
-	__bio_add_page(&bio, page, plen, poff);
-	return submit_bio_wait(&bio);
-}
-
-static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
-		struct page *page, struct iomap *iomap)
-{
-	struct iomap_page *iop = iomap_page_create(inode, page);
-	loff_t block_size = i_blocksize(inode);
-	loff_t block_start = pos & ~(block_size - 1);
-	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned from = offset_in_page(pos), to = from + len, poff, plen;
-	int status = 0;
-
-	if (PageUptodate(page))
-		return 0;
-
-	do {
-		iomap_adjust_read_range(inode, iop, &block_start,
-				block_end - block_start, &poff, &plen);
-		if (plen == 0)
-			break;
-
-		if ((from > poff && from < poff + plen) ||
-		    (to > poff && to < poff + plen)) {
-			status = iomap_read_page_sync(inode, block_start, page,
-					poff, plen, from, to, iomap);
-			if (status)
-				break;
-		}
-
-	} while ((block_start += plen) < block_end);
-
-	return status;
-}
-
-static int
-iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, struct iomap *iomap)
-{
-	pgoff_t index = pos >> PAGE_SHIFT;
-	struct page *page;
-	int status = 0;
-
-	BUG_ON(pos + len > iomap->offset + iomap->length);
-
-	if (fatal_signal_pending(current))
-		return -EINTR;
-
-	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-
-	if (iomap->type == IOMAP_INLINE)
-		iomap_read_inline_data(inode, page, iomap);
-	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
-		status = __block_write_begin_int(page, pos, len, NULL, iomap);
-	else
-		status = __iomap_write_begin(inode, pos, len, page, iomap);
-	if (unlikely(status)) {
-		unlock_page(page);
-		put_page(page);
-		page = NULL;
-
-		iomap_write_failed(inode, pos, len);
-	}
-
-	*pagep = page;
-	return status;
-}
-
-int
-iomap_set_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	int newly_dirty;
-
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	/*
-	 * Lock out page->mem_cgroup migration to keep PageDirty
-	 * synchronized with per-memcg dirty page counters.
-	 */
-	lock_page_memcg(page);
-	newly_dirty = !TestSetPageDirty(page);
-	if (newly_dirty)
-		__set_page_dirty(page, mapping, 0);
-	unlock_page_memcg(page);
-
-	if (newly_dirty)
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return newly_dirty;
-}
-EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
-
-static int
-__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page, struct iomap *iomap)
-{
-	flush_dcache_page(page);
-
-	/*
-	 * The blocks that were entirely written will now be uptodate, so we
-	 * don't have to worry about a readpage reading them and overwriting a
-	 * partial write.  However if we have encountered a short write and only
-	 * partially written into a block, it will not be marked uptodate, so a
-	 * readpage might come in and destroy our partial write.
-	 *
-	 * Do the simplest thing, and just treat any short write to a non-
-	 * uptodate page as a zero-length write, and force the caller to redo
-	 * the whole thing.
-	 */
-	if (unlikely(copied < len && !PageUptodate(page))) {
-		copied = 0;
-	} else {
-		iomap_set_range_uptodate(page, offset_in_page(pos), len);
-		iomap_set_page_dirty(page);
-	}
-	return __generic_write_end(inode, pos, copied, page);
-}
-
-static int
-iomap_write_end_inline(struct inode *inode, struct page *page,
-		struct iomap *iomap, loff_t pos, unsigned copied)
-{
-	void *addr;
-
-	WARN_ON_ONCE(!PageUptodate(page));
-	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
-	addr = kmap_atomic(page);
-	memcpy(iomap->inline_data + pos, addr + pos, copied);
-	kunmap_atomic(addr);
-
-	mark_inode_dirty(inode);
-	__generic_write_end(inode, pos, copied, page);
-	return copied;
-}
-
-static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page, struct iomap *iomap)
-{
-	int ret;
-
-	if (iomap->type == IOMAP_INLINE) {
-		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
-	} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
-		ret = generic_write_end(NULL, inode->i_mapping, pos, len,
-				copied, page, NULL);
-	} else {
-		ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
-	}
-
-	if (iomap->page_done)
-		iomap->page_done(inode, pos, copied, page, iomap);
-
-	if (ret < len)
-		iomap_write_failed(inode, pos, len);
-	return ret;
-}
-
-static loff_t
-iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
-{
-	struct iov_iter *i = data;
-	long status = 0;
-	ssize_t written = 0;
-	unsigned int flags = AOP_FLAG_NOFS;
-
-	do {
-		struct page *page;
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = offset_in_page(pos);
-		bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_count(i));
-again:
-		if (bytes > length)
-			bytes = length;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid when atomic
-		 * usercopies are used below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		status = iomap_write_begin(inode, pos, bytes, flags, &page,
-				iomap);
-		if (unlikely(status))
-			break;
-
-		if (mapping_writably_mapped(inode->i_mapping))
-			flush_dcache_page(page);
-
-		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
-
-		flush_dcache_page(page);
-
-		status = iomap_write_end(inode, pos, bytes, copied, page,
-				iomap);
-		if (unlikely(status < 0))
-			break;
-		copied = status;
-
-		cond_resched();
-
-		iov_iter_advance(i, copied);
-		if (unlikely(copied == 0)) {
-			/*
-			 * If we were unable to copy any data at all, we must
-			 * fall back to a single segment length write.
-			 *
-			 * If we didn't fall back here, we could livelock
-			 * because not all segments in the iov can be copied at
-			 * once without a pagefault.
-			 */
-			bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_single_seg_count(i));
-			goto again;
-		}
-		pos += copied;
-		written += copied;
-		length -= copied;
-
-		balance_dirty_pages_ratelimited(inode->i_mapping);
-	} while (iov_iter_count(i) && length);
-
-	return written ? written : status;
-}
-
-ssize_t
-iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops)
-{
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
-
-	while (iov_iter_count(iter)) {
-		ret = iomap_apply(inode, pos, iov_iter_count(iter),
-				IOMAP_WRITE, ops, iter, iomap_write_actor);
-		if (ret <= 0)
-			break;
-		pos += ret;
-		written += ret;
-	}
-
-	return written ? written : ret;
-}
-EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-
-static struct page *
-__iomap_read_page(struct inode *inode, loff_t offset)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page;
-
-	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
-	if (IS_ERR(page))
-		return page;
-	if (!PageUptodate(page)) {
-		put_page(page);
-		return ERR_PTR(-EIO);
-	}
-	return page;
-}
-
-static loff_t
-iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
-{
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *page, *rpage;
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-
-		offset = offset_in_page(pos);
-		bytes = min_t(loff_t, PAGE_SIZE - offset, length);
-
-		rpage = __iomap_read_page(inode, pos);
-		if (IS_ERR(rpage))
-			return PTR_ERR(rpage);
-
-		status = iomap_write_begin(inode, pos, bytes,
-					   AOP_FLAG_NOFS, &page, iomap);
-		put_page(rpage);
-		if (unlikely(status))
-			return status;
-
-		WARN_ON_ONCE(!PageUptodate(page));
-
-		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
-		if (unlikely(status <= 0)) {
-			if (WARN_ON_ONCE(status == 0))
-				return -EIO;
-			return status;
-		}
-
-		cond_resched();
-
-		pos += status;
-		written += status;
-		length -= status;
-
-		balance_dirty_pages_ratelimited(inode->i_mapping);
-	} while (length);
-
-	return written;
-}
-
-int
-iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-		const struct iomap_ops *ops)
-{
-	loff_t ret;
-
-	while (len) {
-		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
-				iomap_dirty_actor);
-		if (ret <= 0)
-			return ret;
-		pos += ret;
-		len -= ret;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iomap_file_dirty);
-
-static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
-		unsigned bytes, struct iomap *iomap)
-{
-	struct page *page;
-	int status;
-
-	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
-				   iomap);
-	if (status)
-		return status;
-
-	zero_user(page, offset, bytes);
-	mark_page_accessed(page);
-
-	return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
-}
-
-static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
-		struct iomap *iomap)
-{
-	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
-			iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
-}
-
-static loff_t
-iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
-		void *data, struct iomap *iomap)
-{
-	bool *did_zero = data;
-	loff_t written = 0;
-	int status;
-
-	/* already zeroed?  we're done. */
-	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
-		return count;
-
-	do {
-		unsigned offset, bytes;
-
-		offset = offset_in_page(pos);
-		bytes = min_t(loff_t, PAGE_SIZE - offset, count);
-
-		if (IS_DAX(inode))
-			status = iomap_dax_zero(pos, offset, bytes, iomap);
-		else
-			status = iomap_zero(inode, pos, offset, bytes, iomap);
-		if (status < 0)
-			return status;
-
-		pos += bytes;
-		count -= bytes;
-		written += bytes;
-		if (did_zero)
-			*did_zero = true;
-	} while (count > 0);
-
-	return written;
-}
-
-int
-iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		const struct iomap_ops *ops)
-{
-	loff_t ret;
-
-	while (len > 0) {
-		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
-				ops, did_zero, iomap_zero_range_actor);
-		if (ret <= 0)
-			return ret;
-
-		pos += ret;
-		len -= ret;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iomap_zero_range);
-
-int
-iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops)
-{
-	unsigned int blocksize = i_blocksize(inode);
-	unsigned int off = pos & (blocksize - 1);
-
-	/* Block boundary? Nothing to do */
-	if (!off)
-		return 0;
-	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
-}
-EXPORT_SYMBOL_GPL(iomap_truncate_page);
-
-static loff_t
-iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	struct page *page = data;
-	int ret;
-
-	if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
-		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
-		if (ret)
-			return ret;
-		block_commit_write(page, 0, length);
-	} else {
-		WARN_ON_ONCE(!PageUptodate(page));
-		iomap_page_create(inode, page);
-		set_page_dirty(page);
-	}
-
-	return length;
-}
-
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
-{
-	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	unsigned long length;
-	loff_t offset, size;
-	ssize_t ret;
-
-	lock_page(page);
-	size = i_size_read(inode);
-	if ((page->mapping != inode->i_mapping) ||
-	    (page_offset(page) > size)) {
-		/* We overload EFAULT to mean page got truncated */
-		ret = -EFAULT;
-		goto out_unlock;
-	}
-
-	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_SHIFT) > size)
-		length = offset_in_page(size);
-	else
-		length = PAGE_SIZE;
-
-	offset = page_offset(page);
-	while (length > 0) {
-		ret = iomap_apply(inode, offset, length,
-				IOMAP_WRITE | IOMAP_FAULT, ops, page,
-				iomap_page_mkwrite_actor);
-		if (unlikely(ret <= 0))
-			goto out_unlock;
-		offset += ret;
-		length -= ret;
-	}
-
-	wait_for_stable_page(page);
-	return VM_FAULT_LOCKED;
-out_unlock:
-	unlock_page(page);
-	return block_page_mkwrite_return(ret);
-}
-EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-
-struct fiemap_ctx {
-	struct fiemap_extent_info *fi;
-	struct iomap prev;
-};
-
-static int iomap_to_fiemap(struct fiemap_extent_info *fi,
-		struct iomap *iomap, u32 flags)
-{
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		/* skip holes */
-		return 0;
-	case IOMAP_DELALLOC:
-		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
-		break;
-	case IOMAP_MAPPED:
-		break;
-	case IOMAP_UNWRITTEN:
-		flags |= FIEMAP_EXTENT_UNWRITTEN;
-		break;
-	case IOMAP_INLINE:
-		flags |= FIEMAP_EXTENT_DATA_INLINE;
-		break;
-	}
-
-	if (iomap->flags & IOMAP_F_MERGED)
-		flags |= FIEMAP_EXTENT_MERGED;
-	if (iomap->flags & IOMAP_F_SHARED)
-		flags |= FIEMAP_EXTENT_SHARED;
-
-	return fiemap_fill_next_extent(fi, iomap->offset,
-			iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
-			iomap->length, flags);
-}
-
-static loff_t
-iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
-{
-	struct fiemap_ctx *ctx = data;
-	loff_t ret = length;
-
-	if (iomap->type == IOMAP_HOLE)
-		return length;
-
-	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
-	ctx->prev = *iomap;
-	switch (ret) {
-	case 0:		/* success */
-		return length;
-	case 1:		/* extent array full */
-		return 0;
-	default:
-		return ret;
-	}
-}
-
-int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
-		loff_t start, loff_t len, const struct iomap_ops *ops)
-{
-	struct fiemap_ctx ctx;
-	loff_t ret;
-
-	memset(&ctx, 0, sizeof(ctx));
-	ctx.fi = fi;
-	ctx.prev.type = IOMAP_HOLE;
-
-	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
-	if (ret)
-		return ret;
-
-	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
-		ret = filemap_write_and_wait(inode->i_mapping);
-		if (ret)
-			return ret;
-	}
-
-	while (len > 0) {
-		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
-				iomap_fiemap_actor);
-		/* inode with no (attribute) mapping will give ENOENT */
-		if (ret == -ENOENT)
-			break;
-		if (ret < 0)
-			return ret;
-		if (ret == 0)
-			break;
-
-		start += ret;
-		len -= ret;
-	}
-
-	if (ctx.prev.type != IOMAP_HOLE) {
-		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
-		if (ret < 0)
-			return ret;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iomap_fiemap);
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- * Returns true if found and updates @lastoff to the offset in file.
- */
-static bool
-page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
-		int whence)
-{
-	const struct address_space_operations *ops = inode->i_mapping->a_ops;
-	unsigned int bsize = i_blocksize(inode), off;
-	bool seek_data = whence == SEEK_DATA;
-	loff_t poff = page_offset(page);
-
-	if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
-		return false;
-
-	if (*lastoff < poff) {
-		/*
-		 * Last offset smaller than the start of the page means we found
-		 * a hole:
-		 */
-		if (whence == SEEK_HOLE)
-			return true;
-		*lastoff = poff;
-	}
-
-	/*
-	 * Just check the page unless we can and should check block ranges:
-	 */
-	if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
-		return PageUptodate(page) == seek_data;
-
-	lock_page(page);
-	if (unlikely(page->mapping != inode->i_mapping))
-		goto out_unlock_not_found;
-
-	for (off = 0; off < PAGE_SIZE; off += bsize) {
-		if (offset_in_page(*lastoff) >= off + bsize)
-			continue;
-		if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
-			unlock_page(page);
-			return true;
-		}
-		*lastoff = poff + off + bsize;
-	}
-
-out_unlock_not_found:
-	unlock_page(page);
-	return false;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: uptodate buffer heads count as data; everything else
- * counts as a hole.
- *
- * Returns the resulting offset on success, and -ENOENT otherwise.
- */
-static loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
-		int whence)
-{
-	pgoff_t index = offset >> PAGE_SHIFT;
-	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	loff_t lastoff = offset;
-	struct pagevec pvec;
-
-	if (length <= 0)
-		return -ENOENT;
-
-	pagevec_init(&pvec);
-
-	do {
-		unsigned nr_pages, i;
-
-		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
-						end - 1);
-		if (nr_pages == 0)
-			break;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			if (page_seek_hole_data(inode, page, &lastoff, whence))
-				goto check_range;
-			lastoff = page_offset(page) + PAGE_SIZE;
-		}
-		pagevec_release(&pvec);
-	} while (index < end);
-
-	/* If there is no page at lastoff and we are not done, we found a hole. */
-	if (whence != SEEK_HOLE)
-		goto not_found;
-
-check_range:
-	if (lastoff < offset + length)
-		goto out;
-not_found:
-	lastoff = -ENOENT;
-out:
-	pagevec_release(&pvec);
-	return lastoff;
-}
-
-
-static loff_t
-iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
-		      void *data, struct iomap *iomap)
-{
-	switch (iomap->type) {
-	case IOMAP_UNWRITTEN:
-		offset = page_cache_seek_hole_data(inode, offset, length,
-						   SEEK_HOLE);
-		if (offset < 0)
-			return length;
-		/* fall through */
-	case IOMAP_HOLE:
-		*(loff_t *)data = offset;
-		return 0;
-	default:
-		return length;
-	}
-}
-
-loff_t
-iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
-{
-	loff_t size = i_size_read(inode);
-	loff_t length = size - offset;
-	loff_t ret;
-
-	/* Nothing to be found before or beyond the end of the file. */
-	if (offset < 0 || offset >= size)
-		return -ENXIO;
-
-	while (length > 0) {
-		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
-				  &offset, iomap_seek_hole_actor);
-		if (ret < 0)
-			return ret;
-		if (ret == 0)
-			break;
-
-		offset += ret;
-		length -= ret;
-	}
-
-	return offset;
-}
-EXPORT_SYMBOL_GPL(iomap_seek_hole);
-
-static loff_t
-iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
-		      void *data, struct iomap *iomap)
-{
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		return length;
-	case IOMAP_UNWRITTEN:
-		offset = page_cache_seek_hole_data(inode, offset, length,
-						   SEEK_DATA);
-		if (offset < 0)
-			return length;
-		/* fall through */
-	default:
-		*(loff_t *)data = offset;
-		return 0;
-	}
-}
-
-loff_t
-iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
-{
-	loff_t size = i_size_read(inode);
-	loff_t length = size - offset;
-	loff_t ret;
-
-	/* Nothing to be found before or beyond the end of the file. */
-	if (offset < 0 || offset >= size)
-		return -ENXIO;
-
-	while (length > 0) {
-		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
-				  &offset, iomap_seek_data_actor);
-		if (ret < 0)
-			return ret;
-		if (ret == 0)
-			break;
-
-		offset += ret;
-		length -= ret;
-	}
-
-	if (length <= 0)
-		return -ENXIO;
-	return offset;
-}
-EXPORT_SYMBOL_GPL(iomap_seek_data);
-
-/*
- * Private flags for iomap_dio; these must not overlap with the public ones in
- * iomap.h:
- */
-#define IOMAP_DIO_WRITE_FUA	(1 << 28)
-#define IOMAP_DIO_NEED_SYNC	(1 << 29)
-#define IOMAP_DIO_WRITE		(1 << 30)
-#define IOMAP_DIO_DIRTY		(1 << 31)
-
-struct iomap_dio {
-	struct kiocb		*iocb;
-	iomap_dio_end_io_t	*end_io;
-	loff_t			i_size;
-	loff_t			size;
-	atomic_t		ref;
-	unsigned		flags;
-	int			error;
-	bool			wait_for_completion;
-
-	union {
-		/* used during submission and for synchronous completion: */
-		struct {
-			struct iov_iter		*iter;
-			struct task_struct	*waiter;
-			struct request_queue	*last_queue;
-			blk_qc_t		cookie;
-		} submit;
-
-		/* used for aio completion: */
-		struct {
-			struct work_struct	work;
-		} aio;
-	};
-};
-
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
-{
-	struct kiocb *iocb = dio->iocb;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-
-	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
-				dio->flags);
-	} else {
-		ret = dio->error;
-	}
-
-	if (likely(!ret)) {
-		ret = dio->size;
-		/* check for short read */
-		if (offset + ret > dio->i_size &&
-		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - offset;
-		iocb->ki_pos += ret;
-	}
-
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing.  Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
-	 * this invalidation fails, tough, the write still worked...
-	 *
-	 * And this page cache invalidation has to be after dio->end_io(), as
-	 * some filesystems convert unwritten extents to real allocations in
-	 * end_io() when necessary, otherwise a racing buffer read would cache
-	 * zeros from unwritten extents.
-	 */
-	if (!dio->error &&
-	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-		int err;
-		err = invalidate_inode_pages2_range(inode->i_mapping,
-				offset >> PAGE_SHIFT,
-				(offset + dio->size - 1) >> PAGE_SHIFT);
-		if (err)
-			dio_warn_stale_pagecache(iocb->ki_filp);
-	}
-
-	/*
-	 * If this is a DSYNC write, make sure we push it to stable storage now
-	 * that we've written data.
-	 */
-	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
-		ret = generic_write_sync(iocb, ret);
-
-	inode_dio_end(file_inode(iocb->ki_filp));
-	kfree(dio);
-
-	return ret;
-}
-
-static void iomap_dio_complete_work(struct work_struct *work)
-{
-	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
-	struct kiocb *iocb = dio->iocb;
-
-	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
-}
-
-/*
- * Set an error in the dio if none is set yet.  We have to use cmpxchg
- * as the submission context and the completion context(s) can race to
- * update the error.
- */
-static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
-{
-	cmpxchg(&dio->error, 0, ret);
-}
-
-static void iomap_dio_bio_end_io(struct bio *bio)
-{
-	struct iomap_dio *dio = bio->bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
-
-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-	if (atomic_dec_and_test(&dio->ref)) {
-		if (dio->wait_for_completion) {
-			struct task_struct *waiter = dio->submit.waiter;
-			WRITE_ONCE(dio->submit.waiter, NULL);
-			wake_up_process(waiter);
-		} else if (dio->flags & IOMAP_DIO_WRITE) {
-			struct inode *inode = file_inode(dio->iocb->ki_filp);
-
-			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
-			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
-		} else {
-			iomap_dio_complete_work(&dio->aio.work);
-		}
-	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(bio);
-	} else {
-		struct bio_vec *bvec;
-		int i;
-
-		bio_for_each_segment_all(bvec, bio, i)
-			put_page(bvec->bv_page);
-		bio_put(bio);
-	}
-}
-
-static blk_qc_t
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
-		unsigned len)
-{
-	struct page *page = ZERO_PAGE(0);
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_KERNEL, 1);
-	bio_set_dev(bio, iomap->bdev);
-	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-	bio->bi_private = dio;
-	bio->bi_end_io = iomap_dio_bio_end_io;
-
-	get_page(page);
-	__bio_add_page(bio, page, len, 0);
-	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
-
-	atomic_inc(&dio->ref);
-	return submit_bio(bio);
-}
-
-static loff_t
-iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-	unsigned int fs_block_size = i_blocksize(inode), pad;
-	unsigned int align = iov_iter_alignment(dio->submit.iter);
-	struct iov_iter iter;
-	struct bio *bio;
-	bool need_zeroout = false;
-	bool use_fua = false;
-	int nr_pages, ret;
-	size_t copied = 0;
-
-	if ((pos | length | align) & ((1 << blkbits) - 1))
-		return -EINVAL;
-
-	if (iomap->type == IOMAP_UNWRITTEN) {
-		dio->flags |= IOMAP_DIO_UNWRITTEN;
-		need_zeroout = true;
-	}
-
-	if (iomap->flags & IOMAP_F_SHARED)
-		dio->flags |= IOMAP_DIO_COW;
-
-	if (iomap->flags & IOMAP_F_NEW) {
-		need_zeroout = true;
-	} else {
-		/*
-		 * Use a FUA write if we need datasync semantics, this
-		 * is a pure data IO that doesn't require any metadata
-		 * updates and the underlying device supports FUA. This
-		 * allows us to avoid cache flushes on IO completion.
-		 */
-		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
-		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
-		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
-			use_fua = true;
-	}
-
-	/*
-	 * Operate on a partial iter trimmed to the extent we were called for.
-	 * We'll update the iter in the dio once we're done with this extent.
-	 */
-	iter = *dio->submit.iter;
-	iov_iter_truncate(&iter, length);
-
-	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-	if (nr_pages <= 0)
-		return nr_pages;
-
-	if (need_zeroout) {
-		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos - pad, pad);
-	}
-
-	do {
-		size_t n;
-		if (dio->error) {
-			iov_iter_revert(dio->submit.iter, copied);
-			return 0;
-		}
-
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
-		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-		bio->bi_write_hint = dio->iocb->ki_hint;
-		bio->bi_ioprio = dio->iocb->ki_ioprio;
-		bio->bi_private = dio;
-		bio->bi_end_io = iomap_dio_bio_end_io;
-
-		ret = bio_iov_iter_get_pages(bio, &iter);
-		if (unlikely(ret)) {
-			bio_put(bio);
-			return copied ? copied : ret;
-		}
-
-		n = bio->bi_iter.bi_size;
-		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-			if (use_fua)
-				bio->bi_opf |= REQ_FUA;
-			else
-				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
-			task_io_account_write(n);
-		} else {
-			bio->bi_opf = REQ_OP_READ;
-			if (dio->flags & IOMAP_DIO_DIRTY)
-				bio_set_pages_dirty(bio);
-		}
-
-		iov_iter_advance(dio->submit.iter, n);
-
-		dio->size += n;
-		pos += n;
-		copied += n;
-
-		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
-		atomic_inc(&dio->ref);
-
-		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-		dio->submit.cookie = submit_bio(bio);
-	} while (nr_pages);
-
-	if (need_zeroout) {
-		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
-		if (pad)
-			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
-	}
-	return copied;
-}
-
-static loff_t
-iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
-{
-	length = iov_iter_zero(length, dio->submit.iter);
-	dio->size += length;
-	return length;
-}
-
-static loff_t
-iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
-		struct iomap_dio *dio, struct iomap *iomap)
-{
-	struct iov_iter *iter = dio->submit.iter;
-	size_t copied;
-
-	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
-	if (dio->flags & IOMAP_DIO_WRITE) {
-		loff_t size = inode->i_size;
-
-		if (pos > size)
-			memset(iomap->inline_data + size, 0, pos - size);
-		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
-		if (copied) {
-			if (pos + copied > size)
-				i_size_write(inode, pos + copied);
-			mark_inode_dirty(inode);
-		}
-	} else {
-		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
-	}
-	dio->size += copied;
-	return copied;
-}
-
-static loff_t
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	struct iomap_dio *dio = data;
-
-	switch (iomap->type) {
-	case IOMAP_HOLE:
-		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
-			return -EIO;
-		return iomap_dio_hole_actor(length, dio);
-	case IOMAP_UNWRITTEN:
-		if (!(dio->flags & IOMAP_DIO_WRITE))
-			return iomap_dio_hole_actor(length, dio);
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_MAPPED:
-		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
-	case IOMAP_INLINE:
-		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
-	default:
-		WARN_ON_ONCE(1);
-		return -EIO;
-	}
-}
-
-/*
- * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
- * is being issued as AIO or not.  This allows us to optimise pure data writes
- * to use REQ_FUA rather than requiring generic_write_sync() to issue a
- * REQ_FLUSH post write. This is slightly tricky because a single request here
- * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
- * may be pure data writes. In that case, we still need to do a full data sync
- * completion.
- */
-ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = file_inode(iocb->ki_filp);
-	size_t count = iov_iter_count(iter);
-	loff_t pos = iocb->ki_pos, start = pos;
-	loff_t end = iocb->ki_pos + count - 1, ret = 0;
-	unsigned int flags = IOMAP_DIRECT;
-	struct blk_plug plug;
-	struct iomap_dio *dio;
-
-	lockdep_assert_held(&inode->i_rwsem);
-
-	if (!count)
-		return 0;
-
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
-
-	dio->iocb = iocb;
-	atomic_set(&dio->ref, 1);
-	dio->size = 0;
-	dio->i_size = i_size_read(inode);
-	dio->end_io = end_io;
-	dio->error = 0;
-	dio->flags = 0;
-	dio->wait_for_completion = is_sync_kiocb(iocb);
-
-	dio->submit.iter = iter;
-	dio->submit.waiter = current;
-	dio->submit.cookie = BLK_QC_T_NONE;
-	dio->submit.last_queue = NULL;
-
-	if (iov_iter_rw(iter) == READ) {
-		if (pos >= dio->i_size)
-			goto out_free_dio;
-
-		if (iter->type == ITER_IOVEC)
-			dio->flags |= IOMAP_DIO_DIRTY;
-	} else {
-		flags |= IOMAP_WRITE;
-		dio->flags |= IOMAP_DIO_WRITE;
-
-		/* for data sync or sync, we need sync completion processing */
-		if (iocb->ki_flags & IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_NEED_SYNC;
-
-		/*
-		 * For datasync only writes, we optimistically try using FUA for
-		 * this IO.  Any non-FUA write that occurs will clear this flag,
-		 * hence we know before completion whether a cache flush is
-		 * necessary.
-		 */
-		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
-			dio->flags |= IOMAP_DIO_WRITE_FUA;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (filemap_range_has_page(mapping, start, end)) {
-			ret = -EAGAIN;
-			goto out_free_dio;
-		}
-		flags |= IOMAP_NOWAIT;
-	}
-
-	ret = filemap_write_and_wait_range(mapping, start, end);
-	if (ret)
-		goto out_free_dio;
-
-	/*
-	 * Try to invalidate cache pages for the range we're direct
-	 * writing.  If this invalidation fails, tough, the write will
-	 * still work, but racing two incompatible write paths is a
-	 * pretty crazy thing to do, so we don't support it 100%.
-	 */
-	ret = invalidate_inode_pages2_range(mapping,
-			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-	if (ret)
-		dio_warn_stale_pagecache(iocb->ki_filp);
-	ret = 0;
-
-	if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
-		ret = sb_init_dio_done_wq(inode->i_sb);
-		if (ret < 0)
-			goto out_free_dio;
-	}
-
-	inode_dio_begin(inode);
-
-	blk_start_plug(&plug);
-	do {
-		ret = iomap_apply(inode, pos, count, flags, ops, dio,
-				iomap_dio_actor);
-		if (ret <= 0) {
-			/* magic error code to fall back to buffered I/O */
-			if (ret == -ENOTBLK) {
-				dio->wait_for_completion = true;
-				ret = 0;
-			}
-			break;
-		}
-		pos += ret;
-
-		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
-			break;
-	} while ((count = iov_iter_count(iter)) > 0);
-	blk_finish_plug(&plug);
-
-	if (ret < 0)
-		iomap_dio_set_error(dio, ret);
-
-	/*
-	 * If all the writes we issued were FUA, we don't need to flush the
-	 * cache on IO completion. Clear the sync flag for this case.
-	 */
-	if (dio->flags & IOMAP_DIO_WRITE_FUA)
-		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
-
-	if (!atomic_dec_and_test(&dio->ref)) {
-		if (!dio->wait_for_completion)
-			return -EIOCBQUEUED;
-
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!READ_ONCE(dio->submit.waiter))
-				break;
-
-			if (!(iocb->ki_flags & IOCB_HIPRI) ||
-			    !dio->submit.last_queue ||
-			    !blk_poll(dio->submit.last_queue,
-					 dio->submit.cookie))
-				io_schedule();
-		}
-		__set_current_state(TASK_RUNNING);
-	}
-
-	ret = iomap_dio_complete(dio);
-
-	return ret;
-
-out_free_dio:
-	kfree(dio);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iomap_dio_rw);
-
-/* Swapfile activation */
-
-#ifdef CONFIG_SWAP
-struct iomap_swapfile_info {
-	struct iomap iomap;		/* accumulated iomap */
-	struct swap_info_struct *sis;
-	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
-	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
-	unsigned long nr_pages;		/* number of pages collected */
-	int nr_extents;			/* extent count */
-};
-
-/*
- * Collect physical extents for this swap file.  Physical extents reported to
- * the swap code must be trimmed to align to a page boundary.  The logical
- * offset within the file is irrelevant since the swapfile code maps logical
- * page numbers of the swap device to the physical page-aligned extents.
- */
-static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
-{
-	struct iomap *iomap = &isi->iomap;
-	unsigned long nr_pages;
-	uint64_t first_ppage;
-	uint64_t first_ppage_reported;
-	uint64_t next_ppage;
-	int error;
-
-	/*
-	 * Round the start up and the end down so that the physical
-	 * extent aligns to a page boundary.
-	 */
-	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
-	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
-			PAGE_SHIFT;
-
-	/* Skip too-short physical extents. */
-	if (first_ppage >= next_ppage)
-		return 0;
-	nr_pages = next_ppage - first_ppage;
-
-	/*
-	 * Calculate how much swap space we're adding; the first page contains
-	 * the swap header and doesn't count.  The mm still wants that first
-	 * page fed to add_swap_extent, however.
-	 */
-	first_ppage_reported = first_ppage;
-	if (iomap->offset == 0)
-		first_ppage_reported++;
-	if (isi->lowest_ppage > first_ppage_reported)
-		isi->lowest_ppage = first_ppage_reported;
-	if (isi->highest_ppage < (next_ppage - 1))
-		isi->highest_ppage = next_ppage - 1;
-
-	/* Add extent, set up for the next call. */
-	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
-	if (error < 0)
-		return error;
-	isi->nr_extents += error;
-	isi->nr_pages += nr_pages;
-	return 0;
-}
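
The page-alignment above is easiest to follow with concrete numbers. This standalone sketch (hypothetical extent values, local reimplementations of the ALIGN helpers, not part of the patch) shows how a physical extent that is not page-aligned drops its partial head and tail pages:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE	4096ULL
	#define PAGE_SHIFT	12
	#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
	#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

	int main(void)
	{
		/* hypothetical extent: starts and ends mid-page */
		uint64_t addr = 0x11200, length = 0x3000;
		uint64_t first = ALIGN_UP(addr, PAGE_SIZE) >> PAGE_SHIFT;
		uint64_t next = ALIGN_DOWN(addr + length, PAGE_SIZE) >> PAGE_SHIFT;

		/* prints "pages 18..19 (2 pages)" */
		printf("pages %llu..%llu (%llu pages)\n",
		       (unsigned long long)first,
		       (unsigned long long)(next - 1),
		       (unsigned long long)(next - first));
		return 0;
	}
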
-
-/*
- * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
- * swap only cares about contiguous page-aligned physical extents and makes no
- * distinction between written and unwritten extents.
- */
-static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
-		loff_t count, void *data, struct iomap *iomap)
-{
-	struct iomap_swapfile_info *isi = data;
-	int error;
-
-	switch (iomap->type) {
-	case IOMAP_MAPPED:
-	case IOMAP_UNWRITTEN:
-		/* Only real or unwritten extents. */
-		break;
-	case IOMAP_INLINE:
-		/* No inline data. */
-		pr_err("swapon: file is inline\n");
-		return -EINVAL;
-	default:
-		pr_err("swapon: file has unallocated extents\n");
-		return -EINVAL;
-	}
-
-	/* No uncommitted metadata or shared blocks. */
-	if (iomap->flags & IOMAP_F_DIRTY) {
-		pr_err("swapon: file is not committed\n");
-		return -EINVAL;
-	}
-	if (iomap->flags & IOMAP_F_SHARED) {
-		pr_err("swapon: file has shared extents\n");
-		return -EINVAL;
-	}
-
-	/* Only one bdev per swap file. */
-	if (iomap->bdev != isi->sis->bdev) {
-		pr_err("swapon: file is on multiple devices\n");
-		return -EINVAL;
-	}
-
-	if (isi->iomap.length == 0) {
-		/* No accumulated extent, so just store it. */
-		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
-	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
-		/* Append this to the accumulated extent. */
-		isi->iomap.length += iomap->length;
-	} else {
-		/* Otherwise, add the retained iomap and store this one. */
-		error = iomap_swapfile_add_extent(isi);
-		if (error)
-			return error;
-		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
-	}
-	return count;
-}
-
-/*
- * Iterate a swap file's iomaps to construct physical extents that can be
- * passed to the swapfile subsystem.
- */
-int iomap_swapfile_activate(struct swap_info_struct *sis,
-		struct file *swap_file, sector_t *pagespan,
-		const struct iomap_ops *ops)
-{
-	struct iomap_swapfile_info isi = {
-		.sis = sis,
-		.lowest_ppage = (sector_t)-1ULL,
-	};
-	struct address_space *mapping = swap_file->f_mapping;
-	struct inode *inode = mapping->host;
-	loff_t pos = 0;
-	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
-	loff_t ret;
-
-	/*
-	 * Persist all file mapping metadata so that we won't have any
-	 * IOMAP_F_DIRTY iomaps.
-	 */
-	ret = vfs_fsync(swap_file, 1);
-	if (ret)
-		return ret;
-
-	while (len > 0) {
-		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
-				ops, &isi, iomap_swapfile_activate_actor);
-		if (ret <= 0)
-			return ret;
-
-		pos += ret;
-		len -= ret;
-	}
-
-	if (isi.iomap.length) {
-		ret = iomap_swapfile_add_extent(&isi);
-		if (ret)
-			return ret;
-	}
-
-	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
-	sis->max = isi.nr_pages;
-	sis->pages = isi.nr_pages - 1;
-	sis->highest_bit = isi.nr_pages - 1;
-	return isi.nr_extents;
-}
-EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
-#endif /* CONFIG_SWAP */
-
-static loff_t
-iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
-{
-	sector_t *bno = data, addr;
-
-	if (iomap->type == IOMAP_MAPPED) {
-		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
-		if (addr > INT_MAX)
-			WARN(1, "would truncate bmap result\n");
-		else
-			*bno = addr;
-	}
-	return 0;
-}
-
-/* legacy ->bmap interface.  0 is the error return (!) */
-sector_t
-iomap_bmap(struct address_space *mapping, sector_t bno,
-		const struct iomap_ops *ops)
-{
-	struct inode *inode = mapping->host;
-	loff_t pos = bno << inode->i_blkbits;
-	unsigned blocksize = i_blocksize(inode);
-
-	if (filemap_write_and_wait(mapping))
-		return 0;
-
-	bno = 0;
-	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
-	return bno;
-}
-EXPORT_SYMBOL_GPL(iomap_bmap);
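
As a usage note: the most common way this path is exercised from userspace is the legacy FIBMAP ioctl, which ends up in ->bmap. A minimal sketch (requires CAP_SYS_RAWIO; error handling trimmed) that also shows the awkward "0 means hole or error" convention:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FIBMAP */

	int main(int argc, char **argv)
	{
		int fd, block = 0;	/* logical block in, physical block out */

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FIBMAP, &block) < 0) {
			perror("FIBMAP");
			return 1;
		}
		printf("physical block: %d (0 = hole or error)\n", block);
		return 0;
	}
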
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
new file mode 100644
index 0000000..93cd119
--- /dev/null
+++ b/fs/iomap/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (c) 2019 Oracle.
+# All Rights Reserved.
+#
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o
+
+iomap-y				+= \
+					apply.o \
+					buffered-io.o \
+					direct-io.o \
+					fiemap.o \
+					seek.o
+
+iomap-$(CONFIG_SWAP)		+= swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
new file mode 100644
index 0000000..54c02ae
--- /dev/null
+++ b/fs/iomap/apply.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+	struct iomap iomap = { 0 };
+	loff_t written = 0, ret;
+
+	/*
+	 * Need to map a range from start position for length bytes. This can
+	 * span multiple pages - it is only guaranteed to return a range of a
+	 * single type of pages (e.g. all into a hole, all mapped or all
+	 * unwritten). Failure at this point has nothing to undo.
+	 *
+	 * If allocation is required for this range, reserve the space now so
+	 * that the allocation is guaranteed to succeed later on. Once we copy
+	 * the data into the page cache pages, then we cannot fail otherwise we
+	 * expose transient stale data. If the reserve fails, we can safely
+	 * back out at this point as there is nothing to undo.
+	 */
+	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+	if (ret)
+		return ret;
+	if (WARN_ON(iomap.offset > pos))
+		return -EIO;
+	if (WARN_ON(iomap.length == 0))
+		return -EIO;
+
+	/*
+	 * Cut down the length to the one actually provided by the filesystem,
+	 * as it might not be able to give us the whole size that we requested.
+	 */
+	if (iomap.offset + iomap.length < pos + length)
+		length = iomap.offset + iomap.length - pos;
+
+	/*
+	 * Now that we have guaranteed that the space allocation will succeed,
+	 * we can do the copy-in page by page without having to worry about
+	 * failures exposing transient data.
+	 */
+	written = actor(inode, pos, length, data, &iomap);
+
+	/*
+	 * Now the data has been copied, commit the range we've copied.  This
+	 * should not fail unless the filesystem has had a fatal error.
+	 */
+	if (ops->iomap_end) {
+		ret = ops->iomap_end(inode, pos, length,
+				     written > 0 ? written : 0,
+				     flags, &iomap);
+	}
+
+	return written ? written : ret;
+}
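
To make the contract above concrete, here is a hypothetical caller (names such as myfs_iomap_ops are assumed, not part of this patch): the actor returns how many bytes of the mapping it consumed, and the caller loops until the requested range is exhausted, exactly like the loops elsewhere in this series:

	static loff_t
	myfs_count_actor(struct inode *inode, loff_t pos, loff_t length,
			void *data, struct iomap *iomap)
	{
		loff_t *total = data;

		*total += length;	/* consume the whole mapping */
		return length;
	}

	static loff_t myfs_count_bytes(struct inode *inode, loff_t pos, loff_t len)
	{
		loff_t total = 0, ret;

		while (len > 0) {
			ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
					&myfs_iomap_ops, &total, myfs_count_actor);
			if (ret <= 0)
				return ret;
			pos += ret;
			len -= ret;
		}
		return total;
	}
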
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
new file mode 100644
index 0000000..e25901a
--- /dev/null
+++ b/fs/iomap/buffered-io.c
@@ -0,0 +1,1073 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/sched/signal.h>
+#include <linux/migrate.h>
+
+#include "../internal.h"
+
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+	/*
+	 * migrate_page_move_mapping() assumes that pages with private data have
+	 * their count elevated by 1.
+	 */
+	get_page(page);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+static void
+iomap_page_release(struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (!iop)
+		return;
+	WARN_ON_ONCE(atomic_read(&iop->read_count));
+	WARN_ON_ONCE(atomic_read(&iop->write_count));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	put_page(page);
+	kfree(iop);
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	loff_t orig_pos = *pos;
+	loff_t isize = i_size_read(inode);
+	unsigned block_bits = inode->i_blkbits;
+	unsigned block_size = (1 << block_bits);
+	unsigned poff = offset_in_page(*pos);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	unsigned first = poff >> block_bits;
+	unsigned last = (poff + plen - 1) >> block_bits;
+
+	/*
+	 * If the block size is smaller than the page size we need to check the
+	 * per-block uptodate status and adjust the offset and length if needed
+	 * to avoid reading in already uptodate ranges.
+	 */
+	if (iop) {
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+			first++;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				last = i - 1;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * If the extent spans the block that contains the i_size we need to
+	 * handle both halves separately so that we properly zero data in the
+	 * page cache for blocks that are entirely outside of i_size.
+	 */
+	if (orig_pos <= isize && orig_pos + length > isize) {
+		unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+		if (first <= end && last > end)
+			plen -= (last - end) * block_size;
+	}
+
+	*offp = poff;
+	*lenp = plen;
+}
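
A worked example of the trimming above (hypothetical numbers): a 4096-byte page with 1024-byte blocks where blocks 0-1 are already uptodate and the caller asks for the whole page. The leading loop skips the first two blocks, so only 2048 bytes are actually read:

	#include <stdio.h>

	int main(void)
	{
		unsigned block_size = 1024, poff = 0, plen = 4096;
		int uptodate[4] = { 1, 1, 0, 0 };	/* per-block bits */
		unsigned i, first = 0, last = 3;

		/* skip leading uptodate blocks, as in iomap_adjust_read_range() */
		for (i = first; i <= last; i++) {
			if (!uptodate[i])
				break;
			poff += block_size;
			plen -= block_size;
		}
		/* prints "read 2048 bytes at page offset 2048" */
		printf("read %u bytes at page offset %u\n", plen, poff);
		return 0;
	}
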
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
+static void
+iomap_read_end_io(struct bio *bio)
+{
+	int error = blk_status_to_errno(bio->bi_status);
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+
+	bio_for_each_segment_all(bvec, bio, iter_all)
+		iomap_read_page_end_io(bvec, error);
+	bio_put(bio);
+}
+
+struct iomap_readpage_ctx {
+	struct page		*cur_page;
+	bool			cur_page_in_bio;
+	bool			is_readahead;
+	struct bio		*bio;
+	struct list_head	*pages;
+};
+
+static void
+iomap_read_inline_data(struct inode *inode, struct page *page,
+		struct iomap *iomap)
+{
+	size_t size = i_size_read(inode);
+	void *addr;
+
+	if (PageUptodate(page))
+		return;
+
+	BUG_ON(page->index);
+	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	addr = kmap_atomic(page);
+	memcpy(addr, iomap->inline_data, size);
+	memset(addr + size, 0, PAGE_SIZE - size);
+	kunmap_atomic(addr);
+	SetPageUptodate(page);
+}
+
+static loff_t
+iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	struct page *page = ctx->cur_page;
+	struct iomap_page *iop = iomap_page_create(inode, page);
+	bool same_page = false, is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
+	sector_t sector;
+
+	if (iomap->type == IOMAP_INLINE) {
+		WARN_ON_ONCE(pos);
+		iomap_read_inline_data(inode, page, iomap);
+		return PAGE_SIZE;
+	}
+
+	/* zero post-eof blocks as the page may be mapped */
+	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
+
+	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+		zero_user(page, poff, plen);
+		iomap_set_range_uptodate(page, poff, plen);
+		goto done;
+	}
+
+	ctx->cur_page_in_bio = true;
+
+	/*
+	 * Try to merge into a previous segment if we can.
+	 */
+	sector = iomap_sector(iomap, pos);
+	if (ctx->bio && bio_end_sector(ctx->bio) == sector)
+		is_contig = true;
+
+	if (is_contig &&
+	    __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
+		if (!same_page && iop)
+			atomic_inc(&iop->read_count);
+		goto done;
+	}
+
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
+	if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
+		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+		if (ctx->bio)
+			submit_bio(ctx->bio);
+
+		if (ctx->is_readahead) /* same as readahead_gfp_mask */
+			gfp |= __GFP_NORETRY | __GFP_NOWARN;
+		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+		ctx->bio->bi_opf = REQ_OP_READ;
+		if (ctx->is_readahead)
+			ctx->bio->bi_opf |= REQ_RAHEAD;
+		ctx->bio->bi_iter.bi_sector = sector;
+		bio_set_dev(ctx->bio, iomap->bdev);
+		ctx->bio->bi_end_io = iomap_read_end_io;
+	}
+
+	bio_add_page(ctx->bio, page, plen, poff);
+done:
+	/*
+	 * Move the caller beyond our range so that it keeps making progress.
+	 * For that we have to include any leading non-uptodate ranges, but
+	 * we can skip trailing ones as they will be handled in the next
+	 * iteration.
+	 */
+	return pos - orig_pos + plen;
+}
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = { .cur_page = page };
+	struct inode *inode = page->mapping->host;
+	unsigned poff;
+	loff_t ret;
+
+	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
+		ret = iomap_apply(inode, page_offset(page) + poff,
+				PAGE_SIZE - poff, 0, ops, &ctx,
+				iomap_readpage_actor);
+		if (ret <= 0) {
+			WARN_ON_ONCE(ret == 0);
+			SetPageError(page);
+			break;
+		}
+	}
+
+	if (ctx.bio) {
+		submit_bio(ctx.bio);
+		WARN_ON_ONCE(!ctx.cur_page_in_bio);
+	} else {
+		WARN_ON_ONCE(ctx.cur_page_in_bio);
+		unlock_page(page);
+	}
+
+	/*
+	 * Just like mpage_readpages and block_read_full_page we always
+	 * return 0 and just mark the page as PageError on errors.  This
+	 * should be cleaned up all through the stack eventually.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_readpage);
+
+static struct page *
+iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
+		loff_t length, loff_t *done)
+{
+	while (!list_empty(pages)) {
+		struct page *page = lru_to_page(pages);
+
+		if (page_offset(page) >= (u64)pos + length)
+			break;
+
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
+				GFP_NOFS))
+			return page;
+
+		/*
+		 * If we already have a page in the page cache at index we are
+		 * done.  Upper layers don't care if it is uptodate after the
+		 * readpages call itself as every page gets checked again once
+		 * actually needed.
+		 */
+		*done += PAGE_SIZE;
+		put_page(page);
+	}
+
+	return NULL;
+}
+
+static loff_t
+iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	loff_t done, ret;
+
+	for (done = 0; done < length; done += ret) {
+		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
+			if (!ctx->cur_page_in_bio)
+				unlock_page(ctx->cur_page);
+			put_page(ctx->cur_page);
+			ctx->cur_page = NULL;
+		}
+		if (!ctx->cur_page) {
+			ctx->cur_page = iomap_next_page(inode, ctx->pages,
+					pos, length, &done);
+			if (!ctx->cur_page)
+				break;
+			ctx->cur_page_in_bio = false;
+		}
+		ret = iomap_readpage_actor(inode, pos + done, length - done,
+				ctx, iomap);
+	}
+
+	return done;
+}
+
+int
+iomap_readpages(struct address_space *mapping, struct list_head *pages,
+		unsigned nr_pages, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = {
+		.pages		= pages,
+		.is_readahead	= true,
+	};
+	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
+	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
+	loff_t length = last - pos + PAGE_SIZE, ret = 0;
+
+	while (length > 0) {
+		ret = iomap_apply(mapping->host, pos, length, 0, ops,
+				&ctx, iomap_readpages_actor);
+		if (ret <= 0) {
+			WARN_ON_ONCE(ret == 0);
+			goto done;
+		}
+		pos += ret;
+		length -= ret;
+	}
+	ret = 0;
+done:
+	if (ctx.bio)
+		submit_bio(ctx.bio);
+	if (ctx.cur_page) {
+		if (!ctx.cur_page_in_bio)
+			unlock_page(ctx.cur_page);
+		put_page(ctx.cur_page);
+	}
+
+	/*
+	 * Check that we didn't lose a page due to the arcane calling
+	 * conventions.
+	 */
+	WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_readpages);
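
For context, filesystems wire these helpers up through their address_space_operations. A hypothetical sketch of the typical glue (myfs_iomap_ops and the myfs_* names are assumed):

	static int myfs_readpage(struct file *unused, struct page *page)
	{
		return iomap_readpage(page, &myfs_iomap_ops);
	}

	static int myfs_readpages(struct file *unused,
			struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
	{
		return iomap_readpages(mapping, pages, nr_pages, &myfs_iomap_ops);
	}

	static const struct address_space_operations myfs_aops = {
		.readpage		= myfs_readpage,
		.readpages		= myfs_readpages,
		.releasepage		= iomap_releasepage,
		.invalidatepage		= iomap_invalidatepage,
		.is_partially_uptodate	= iomap_is_partially_uptodate,
	};
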
+
+/*
+ * iomap_is_partially_uptodate checks whether blocks within a page are
+ * uptodate or not.
+ *
+ * Returns true if all blocks which correspond to a file portion
+ * we want to read within the page are uptodate.
+ */
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned len, first, last;
+	unsigned i;
+
+	/* Limit range to one page */
+	len = min_t(unsigned, PAGE_SIZE - from, count);
+
+	/* First and last blocks in range within page */
+	first = from >> inode->i_blkbits;
+	last = (from + len - 1) >> inode->i_blkbits;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(); skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	iomap_page_release(page);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		WARN_ON_ONCE(PageWriteback(page));
+		cancel_dirty_page(page);
+		iomap_page_release(page);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		get_page(newpage);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		put_page(page);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+	loff_t i_size = i_size_read(inode);
+
+	/*
+	 * Only truncate newly allocated pages beyond EOF, even if the
+	 * write started inside the existing inode size.
+	 */
+	if (pos + len > i_size)
+		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
+		unsigned poff, unsigned plen, unsigned from, unsigned to,
+		struct iomap *iomap)
+{
+	struct bio_vec bvec;
+	struct bio bio;
+
+	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
+		zero_user_segments(page, poff, from, to, poff + plen);
+		iomap_set_range_uptodate(page, poff, plen);
+		return 0;
+	}
+
+	bio_init(&bio, &bvec, 1);
+	bio.bi_opf = REQ_OP_READ;
+	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+	bio_set_dev(&bio, iomap->bdev);
+	__bio_add_page(&bio, page, plen, poff);
+	return submit_bio_wait(&bio);
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+		struct page *page, struct iomap *iomap)
+{
+	struct iomap_page *iop = iomap_page_create(inode, page);
+	loff_t block_size = i_blocksize(inode);
+	loff_t block_start = pos & ~(block_size - 1);
+	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+	unsigned from = offset_in_page(pos), to = from + len, poff, plen;
+	int status = 0;
+
+	if (PageUptodate(page))
+		return 0;
+
+	do {
+		iomap_adjust_read_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
+
+		if ((from > poff && from < poff + plen) ||
+		    (to > poff && to < poff + plen)) {
+			status = iomap_read_page_sync(inode, block_start, page,
+					poff, plen, from, to, iomap);
+			if (status)
+				break;
+		}
+
+	} while ((block_start += plen) < block_end);
+
+	return status;
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, struct iomap *iomap)
+{
+	const struct iomap_page_ops *page_ops = iomap->page_ops;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct page *page;
+	int status = 0;
+
+	BUG_ON(pos + len > iomap->offset + iomap->length);
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	if (page_ops && page_ops->page_prepare) {
+		status = page_ops->page_prepare(inode, pos, len, iomap);
+		if (status)
+			return status;
+	}
+
+	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+	if (!page) {
+		status = -ENOMEM;
+		goto out_no_page;
+	}
+
+	if (iomap->type == IOMAP_INLINE)
+		iomap_read_inline_data(inode, page, iomap);
+	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
+		status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	else
+		status = __iomap_write_begin(inode, pos, len, page, iomap);
+
+	if (unlikely(status))
+		goto out_unlock;
+
+	*pagep = page;
+	return 0;
+
+out_unlock:
+	unlock_page(page);
+	put_page(page);
+	iomap_write_failed(inode, pos, len);
+
+out_no_page:
+	if (page_ops && page_ops->page_done)
+		page_ops->page_done(inode, pos, 0, NULL, iomap);
+	return status;
+}
+
+int
+iomap_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	/*
+	 * Lock out page->mem_cgroup migration to keep PageDirty
+	 * synchronized with per-memcg dirty page counters.
+	 */
+	lock_page_memcg(page);
+	newly_dirty = !TestSetPageDirty(page);
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 0);
+	unlock_page_memcg(page);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	return newly_dirty;
+}
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page, struct iomap *iomap)
+{
+	flush_dcache_page(page);
+
+	/*
+	 * The blocks that were entirely written will now be uptodate, so we
+	 * don't have to worry about a readpage reading them and overwriting a
+	 * partial write.  However if we have encountered a short write and only
+	 * partially written into a block, it will not be marked uptodate, so a
+	 * readpage might come in and destroy our partial write.
+	 *
+	 * Do the simplest thing, and just treat any short write to a
+	 * non-uptodate page as a zero-length write, and force the caller to redo
+	 * the whole thing.
+	 */
+	if (unlikely(copied < len && !PageUptodate(page)))
+		return 0;
+	iomap_set_range_uptodate(page, offset_in_page(pos), len);
+	iomap_set_page_dirty(page);
+	return copied;
+}
+
+static int
+iomap_write_end_inline(struct inode *inode, struct page *page,
+		struct iomap *iomap, loff_t pos, unsigned copied)
+{
+	void *addr;
+
+	WARN_ON_ONCE(!PageUptodate(page));
+	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	addr = kmap_atomic(page);
+	memcpy(iomap->inline_data + pos, addr + pos, copied);
+	kunmap_atomic(addr);
+
+	mark_inode_dirty(inode);
+	return copied;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page, struct iomap *iomap)
+{
+	const struct iomap_page_ops *page_ops = iomap->page_ops;
+	loff_t old_size = inode->i_size;
+	int ret;
+
+	if (iomap->type == IOMAP_INLINE) {
+		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
+	} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+		ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
+				page, NULL);
+	} else {
+		ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+	}
+
+	/*
+	 * Update the in-memory inode size after copying the data into the page
+	 * cache.  It's up to the file system to write the updated size to disk,
+	 * preferably after I/O completion so that no stale data is exposed.
+	 */
+	if (pos + ret > old_size) {
+		i_size_write(inode, pos + ret);
+		iomap->flags |= IOMAP_F_SIZE_CHANGED;
+	}
+	unlock_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	if (page_ops && page_ops->page_done)
+		page_ops->page_done(inode, pos, ret, page, iomap);
+	put_page(page);
+
+	if (ret < len)
+		iomap_write_failed(inode, pos, len);
+	return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *i = data;
+	long status = 0;
+	ssize_t written = 0;
+	unsigned int flags = AOP_FLAG_NOFS;
+
+	do {
+		struct page *page;
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+
+		offset = offset_in_page(pos);
+		bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_count(i));
+again:
+		if (bytes > length)
+			bytes = length;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = iomap_write_begin(inode, pos, bytes, flags, &page,
+				iomap);
+		if (unlikely(status))
+			break;
+
+		if (mapping_writably_mapped(inode->i_mapping))
+			flush_dcache_page(page);
+
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+
+		flush_dcache_page(page);
+
+		status = iomap_write_end(inode, pos, bytes, copied, page,
+				iomap);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		iov_iter_advance(i, copied);
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		pos += copied;
+		written += copied;
+		length -= copied;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (iov_iter_count(i) && length);
+
+	return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter),
+				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		written += ret;
+	}
+
+	return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
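
A hypothetical write-path caller (locking and checks mirror what real users of this helper do; the names are assumed). Note that the helper does not advance iocb->ki_pos, so the caller must:

	static ssize_t myfs_buffered_write(struct kiocb *iocb, struct iov_iter *from)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock(inode);
		ret = generic_write_checks(iocb, from);
		if (ret > 0)
			ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
		inode_unlock(inode);

		if (ret > 0) {
			iocb->ki_pos += ret;
			ret = generic_write_sync(iocb, ret);
		}
		return ret;
	}
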
+
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	return page;
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page, *rpage;
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+
+		offset = offset_in_page(pos);
+		bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+
+		rpage = __iomap_read_page(inode, pos);
+		if (IS_ERR(rpage))
+			return PTR_ERR(rpage);
+
+		status = iomap_write_begin(inode, pos, bytes,
+					   AOP_FLAG_NOFS, &page, iomap);
+		put_page(rpage);
+		if (unlikely(status))
+			return status;
+
+		WARN_ON_ONCE(!PageUptodate(page));
+
+		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+		if (unlikely(status <= 0)) {
+			if (WARN_ON_ONCE(status == 0))
+				return -EIO;
+			return status;
+		}
+
+		cond_resched();
+
+		pos += status;
+		written += status;
+		length -= status;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (length);
+
+	return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+		const struct iomap_ops *ops)
+{
+	loff_t ret;
+
+	while (len) {
+		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+				iomap_dirty_actor);
+		if (ret <= 0)
+			return ret;
+		pos += ret;
+		len -= ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_file_dirty);
+
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+		unsigned bytes, struct iomap *iomap)
+{
+	struct page *page;
+	int status;
+
+	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
+				   iomap);
+	if (status)
+		return status;
+
+	zero_user(page, offset, bytes);
+	mark_page_accessed(page);
+
+	return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+}
+
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+		struct iomap *iomap)
+{
+	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+			iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+		void *data, struct iomap *iomap)
+{
+	bool *did_zero = data;
+	loff_t written = 0;
+	int status;
+
+	/* already zeroed?  we're done. */
+	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+		return count;
+
+	do {
+		unsigned offset, bytes;
+
+		offset = offset_in_page(pos);
+		bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+
+		if (IS_DAX(inode))
+			status = iomap_dax_zero(pos, offset, bytes, iomap);
+		else
+			status = iomap_zero(inode, pos, offset, bytes, iomap);
+		if (status < 0)
+			return status;
+
+		pos += bytes;
+		count -= bytes;
+		written += bytes;
+		if (did_zero)
+			*did_zero = true;
+	} while (count > 0);
+
+	return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+		const struct iomap_ops *ops)
+{
+	loff_t ret;
+
+	while (len > 0) {
+		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+				ops, did_zero, iomap_zero_range_actor);
+		if (ret <= 0)
+			return ret;
+
+		pos += ret;
+		len -= ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		const struct iomap_ops *ops)
+{
+	unsigned int blocksize = i_blocksize(inode);
+	unsigned int off = pos & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!off)
+		return 0;
+	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
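
A worked example of the arithmetic (hypothetical numbers): truncating to 6000 bytes with a 4096-byte block size leaves offset 6000 mid-block, so the tail of that block is zeroed:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long blocksize = 4096, pos = 6000;
		unsigned long long off = pos & (blocksize - 1);	/* 1904 */

		/* prints "zero 2192 bytes at 6000 (through 8191)" */
		printf("zero %llu bytes at %llu (through %llu)\n",
		       blocksize - off, pos, pos + (blocksize - off) - 1);
		return 0;
	}
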
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct page *page = data;
+	int ret;
+
+	if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
+		if (ret)
+			return ret;
+		block_commit_write(page, 0, length);
+	} else {
+		WARN_ON_ONCE(!PageUptodate(page));
+		iomap_page_create(inode, page);
+		set_page_dirty(page);
+	}
+
+	return length;
+}
+
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	unsigned long length;
+	loff_t offset, size;
+	ssize_t ret;
+
+	lock_page(page);
+	size = i_size_read(inode);
+	if ((page->mapping != inode->i_mapping) ||
+	    (page_offset(page) > size)) {
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_SHIFT) > size)
+		length = offset_in_page(size);
+	else
+		length = PAGE_SIZE;
+
+	offset = page_offset(page);
+	while (length > 0) {
+		ret = iomap_apply(inode, offset, length,
+				IOMAP_WRITE | IOMAP_FAULT, ops, page,
+				iomap_page_mkwrite_actor);
+		if (unlikely(ret <= 0))
+			goto out_unlock;
+		offset += ret;
+		length -= ret;
+	}
+
+	wait_for_stable_page(page);
+	return VM_FAULT_LOCKED;
+out_unlock:
+	unlock_page(page);
+	return block_page_mkwrite_return(ret);
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
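
A hypothetical mmap wiring for this helper (names assumed; real callers also take freeze protection and update the file time around it):

	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vmf->vma->vm_file);
		vm_fault_t ret;

		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
		ret = iomap_page_mkwrite(vmf, &myfs_iomap_ops);
		sb_end_pagefault(inode->i_sb);
		return ret;
	}

	static const struct vm_operations_struct myfs_file_vm_ops = {
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,
		.page_mkwrite	= myfs_page_mkwrite,
	};
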
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
new file mode 100644
index 0000000..1fc28c2
--- /dev/null
+++ b/fs/iomap/direct-io.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/backing-dev.h>
+#include <linux/uio.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "../internal.h"
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE_FUA	(1 << 28)
+#define IOMAP_DIO_NEED_SYNC	(1 << 29)
+#define IOMAP_DIO_WRITE		(1 << 30)
+#define IOMAP_DIO_DIRTY		(1 << 31)
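
A sketch of a compile-time guard for the non-overlap requirement (not in the patch; assumes the public IOMAP_DIO_UNWRITTEN and IOMAP_DIO_COW flags occupy the low bits in iomap.h, as they do in this kernel):

	static inline void iomap_dio_flags_check(void)
	{
		BUILD_BUG_ON((IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW) &
			     (IOMAP_DIO_WRITE_FUA | IOMAP_DIO_NEED_SYNC |
			      IOMAP_DIO_WRITE | IOMAP_DIO_DIRTY));
	}
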
+
+struct iomap_dio {
+	struct kiocb		*iocb;
+	const struct iomap_dio_ops *dops;
+	loff_t			i_size;
+	loff_t			size;
+	atomic_t		ref;
+	unsigned		flags;
+	int			error;
+	bool			wait_for_completion;
+
+	union {
+		/* used during submission and for synchronous completion: */
+		struct {
+			struct iov_iter		*iter;
+			struct task_struct	*waiter;
+			struct request_queue	*last_queue;
+			blk_qc_t		cookie;
+		} submit;
+
+		/* used for aio completion: */
+		struct {
+			struct work_struct	work;
+		} aio;
+	};
+};
+
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+	struct request_queue *q = READ_ONCE(kiocb->private);
+
+	if (!q)
+		return 0;
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+		struct bio *bio)
+{
+	atomic_inc(&dio->ref);
+
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(bio, dio->iocb);
+
+	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+	dio->submit.cookie = submit_bio(bio);
+}
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+	const struct iomap_dio_ops *dops = dio->dops;
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
+	ssize_t ret = dio->error;
+
+	if (dops && dops->end_io)
+		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
+
+	if (likely(!ret)) {
+		ret = dio->size;
+		/* check for short read */
+		if (offset + ret > dio->i_size &&
+		    !(dio->flags & IOMAP_DIO_WRITE))
+			ret = dio->i_size - offset;
+		iocb->ki_pos += ret;
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after ->end_io(), as some
+	 * filesystems convert unwritten extents to real allocations in
+	 * ->end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		int err;
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+		if (err)
+			dio_warn_stale_pagecache(iocb->ki_filp);
+	}
+
+	/*
+	 * If this is a DSYNC write, make sure we push it to stable storage now
+	 * that we've written data.
+	 */
+	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+		ret = generic_write_sync(iocb, ret);
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+	kfree(dio);
+
+	return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+	struct kiocb *iocb = dio->iocb;
+
+	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet.  We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+	cmpxchg(&dio->error, 0, ret);
+}
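
The "first error wins" behaviour is easy to model in userspace (illustrative only; GCC's __sync_val_compare_and_swap stands in for the kernel's cmpxchg()):

	#include <stdio.h>

	static int error;	/* 0 == no error recorded yet */

	static void set_error(int ret)
	{
		__sync_val_compare_and_swap(&error, 0, ret);
	}

	int main(void)
	{
		set_error(-5);	/* -EIO from one bio completion */
		set_error(-12);	/* -ENOMEM from another: ignored */

		/* prints "error = -5" */
		printf("error = %d\n", error);
		return 0;
	}
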
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		if (dio->wait_for_completion) {
+			struct task_struct *waiter = dio->submit.waiter;
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			blk_wake_io_task(waiter);
+		} else if (dio->flags & IOMAP_DIO_WRITE) {
+			struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		} else {
+			iomap_dio_complete_work(&dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+}
+
+static void
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+		unsigned len)
+{
+	struct page *page = ZERO_PAGE(0);
+	int flags = REQ_SYNC | REQ_IDLE;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio_set_dev(bio, iomap->bdev);
+	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	get_page(page);
+	__bio_add_page(bio, page, len, 0);
+	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
+	iomap_dio_submit_bio(dio, iomap, bio);
+}
+
+static loff_t
+iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int align = iov_iter_alignment(dio->submit.iter);
+	struct iov_iter iter;
+	struct bio *bio;
+	bool need_zeroout = false;
+	bool use_fua = false;
+	int nr_pages, ret = 0;
+	size_t copied = 0;
+
+	if ((pos | length | align) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	if (iomap->type == IOMAP_UNWRITTEN) {
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		need_zeroout = true;
+	}
+
+	if (iomap->flags & IOMAP_F_SHARED)
+		dio->flags |= IOMAP_DIO_COW;
+
+	if (iomap->flags & IOMAP_F_NEW) {
+		need_zeroout = true;
+	} else if (iomap->type == IOMAP_MAPPED) {
+		/*
+		 * Use a FUA write if we need datasync semantics: this is a pure
+		 * data IO that doesn't require any metadata updates (including
+		 * after IO completion such as unwritten extent conversion) and
+		 * the underlying device supports FUA. This allows us to avoid
+		 * cache flushes on IO completion.
+		 */
+		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
+			use_fua = true;
+	}
+
+	/*
+	 * Operate on a partial iter trimmed to the extent we were called for.
+	 * We'll update the iter in the dio once we're done with this extent.
+	 */
+	iter = *dio->submit.iter;
+	iov_iter_truncate(&iter, length);
+
+	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+	if (nr_pages <= 0)
+		return nr_pages;
+
+	if (need_zeroout) {
+		/* zero out from the start of the block to the write offset */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos - pad, pad);
+	}
+
+	do {
+		size_t n;
+		if (dio->error) {
+			iov_iter_revert(dio->submit.iter, copied);
+			return 0;
+		}
+
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio_set_dev(bio, iomap->bdev);
+		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_write_hint = dio->iocb->ki_hint;
+		bio->bi_ioprio = dio->iocb->ki_ioprio;
+		bio->bi_private = dio;
+		bio->bi_end_io = iomap_dio_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, &iter);
+		if (unlikely(ret)) {
+			/*
+			 * We have to stop part way through an IO. We must fall
+			 * through to the sub-block tail zeroing here, otherwise
+			 * this short IO may expose stale data in the tail of
+			 * the block we haven't written data to.
+			 */
+			bio_put(bio);
+			goto zero_tail;
+		}
+
+		n = bio->bi_iter.bi_size;
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+			if (use_fua)
+				bio->bi_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+			task_io_account_write(n);
+		} else {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
+		iov_iter_advance(dio->submit.iter, n);
+
+		dio->size += n;
+		pos += n;
+		copied += n;
+
+		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+		iomap_dio_submit_bio(dio, iomap, bio);
+	} while (nr_pages);
+
+	/*
+	 * We need to zeroout the tail of a sub-block write if the extent type
+	 * requires zeroing or the write extends beyond EOF. If we don't zero
+	 * the block tail in the latter case, we can expose stale data via mmap
+	 * reads of the EOF block.
+	 */
+zero_tail:
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
+		/* zero out from the end of the write to the end of the block */
+		pad = pos & (fs_block_size - 1);
+		if (pad)
+			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+	}
+	return copied ? copied : ret;
+}
+
+static loff_t
+iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
+{
+	length = iov_iter_zero(length, dio->submit.iter);
+	dio->size += length;
+	return length;
+}
+
+static loff_t
+iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
+		struct iomap_dio *dio, struct iomap *iomap)
+{
+	struct iov_iter *iter = dio->submit.iter;
+	size_t copied;
+
+	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	if (dio->flags & IOMAP_DIO_WRITE) {
+		loff_t size = inode->i_size;
+
+		if (pos > size)
+			memset(iomap->inline_data + size, 0, pos - size);
+		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+		if (copied) {
+			if (pos + copied > size)
+				i_size_write(inode, pos + copied);
+			mark_inode_dirty(inode);
+		}
+	} else {
+		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+	}
+	dio->size += copied;
+	return copied;
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_dio *dio = data;
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+			return -EIO;
+		return iomap_dio_hole_actor(length, dio);
+	case IOMAP_UNWRITTEN:
+		if (!(dio->flags & IOMAP_DIO_WRITE))
+			return iomap_dio_hole_actor(length, dio);
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_MAPPED:
+		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+	case IOMAP_INLINE:
+		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+}
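
A hypothetical caller showing the new iomap_dio_ops hook that this file introduces in place of the old end_io function-pointer argument (myfs_iomap_ops and myfs_convert_unwritten are assumed names):

	static int myfs_dio_end_io(struct kiocb *iocb, ssize_t size, int error,
			unsigned int flags)
	{
		if (error)
			return error;
		if (flags & IOMAP_DIO_UNWRITTEN)
			return myfs_convert_unwritten(file_inode(iocb->ki_filp),
						      iocb->ki_pos, size);
		return 0;
	}

	static const struct iomap_dio_ops myfs_dio_ops = {
		.end_io = myfs_dio_end_io,
	};

	static ssize_t myfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
	{
		/* caller contract: inode->i_rwsem must be held */
		return iomap_dio_rw(iocb, from, &myfs_iomap_ops, &myfs_dio_ops);
	}
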
+
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.  This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos, start = pos;
+	loff_t end = iocb->ki_pos + count - 1, ret = 0;
+	unsigned int flags = IOMAP_DIRECT;
+	bool wait_for_completion = is_sync_kiocb(iocb);
+	struct blk_plug plug;
+	struct iomap_dio *dio;
+
+	lockdep_assert_held(&inode->i_rwsem);
+
+	if (!count)
+		return 0;
+
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->size = 0;
+	dio->i_size = i_size_read(inode);
+	dio->dops = dops;
+	dio->error = 0;
+	dio->flags = 0;
+
+	dio->submit.iter = iter;
+	dio->submit.waiter = current;
+	dio->submit.cookie = BLK_QC_T_NONE;
+	dio->submit.last_queue = NULL;
+
+	if (iov_iter_rw(iter) == READ) {
+		if (pos >= dio->i_size)
+			goto out_free_dio;
+
+		if (iter_is_iovec(iter))
+			dio->flags |= IOMAP_DIO_DIRTY;
+	} else {
+		flags |= IOMAP_WRITE;
+		dio->flags |= IOMAP_DIO_WRITE;
+
+		/* for data sync or sync, we need sync completion processing */
+		if (iocb->ki_flags & IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+		/*
+		 * For datasync only writes, we optimistically try using FUA for
+		 * this IO.  Any non-FUA write that occurs will clear this flag,
+		 * hence we know before completion whether a cache flush is
+		 * necessary.
+		 */
+		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_WRITE_FUA;
+	}
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (filemap_range_has_page(mapping, start, end)) {
+			ret = -EAGAIN;
+			goto out_free_dio;
+		}
+		flags |= IOMAP_NOWAIT;
+	}
+
+	ret = filemap_write_and_wait_range(mapping, start, end);
+	if (ret)
+		goto out_free_dio;
+
+	/*
+	 * Try to invalidate cache pages for the range we're direct
+	 * writing.  If this invalidation fails, tough, the write will
+	 * still work, but racing two incompatible write paths is a
+	 * pretty crazy thing to do, so we don't support it 100%.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	if (ret)
+		dio_warn_stale_pagecache(iocb->ki_filp);
+	ret = 0;
+
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
+	    !inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			goto out_free_dio;
+	}
+
+	inode_dio_begin(inode);
+
+	blk_start_plug(&plug);
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+				iomap_dio_actor);
+		if (ret <= 0) {
+			/* magic error code to fall back to buffered I/O */
+			if (ret == -ENOTBLK) {
+				wait_for_completion = true;
+				ret = 0;
+			}
+			break;
+		}
+		pos += ret;
+
+		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+			break;
+	} while ((count = iov_iter_count(iter)) > 0);
+	blk_finish_plug(&plug);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	/*
+	 * If all the writes we issued were FUA, we don't need to flush the
+	 * cache on IO completion. Clear the sync flag for this case.
+	 */
+	if (dio->flags & IOMAP_DIO_WRITE_FUA)
+		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
+	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+	WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+	/*
+	 * We are about to drop our additional submission reference, which
+	 * might be the last reference to the dio.  There are three
+	 * different ways we can progress here:
+	 *
+	 *  (a) If this is the last reference we will always complete and free
+	 *	the dio ourselves.
+	 *  (b) If this is not the last reference, and we serve an asynchronous
+	 *	iocb, we must never touch the dio after the decrement, the
+	 *	I/O completion handler will complete and free it.
+	 *  (c) If this is not the last reference, but we serve a synchronous
+	 *	iocb, the I/O completion handler will wake us up on the drop
+	 *	of the final reference, and we will complete and free it here
+	 *	after we got woken by the I/O completion handler.
+	 */
+	dio->wait_for_completion = wait_for_completion;
+	if (!atomic_dec_and_test(&dio->ref)) {
+		if (!wait_for_completion)
+			return -EIOCBQUEUED;
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			if (!(iocb->ki_flags & IOCB_HIPRI) ||
+			    !dio->submit.last_queue ||
+			    !blk_poll(dio->submit.last_queue,
+					 dio->submit.cookie, true))
+				io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	return iomap_dio_complete(dio);
+
+out_free_dio:
+	kfree(dio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
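
How a filesystem consumes this entry point is easiest to see from the
caller side. The sketch below is illustrative, not part of the patch:
myfs_iomap_ops and myfs_dio_write_ops stand in for a filesystem's own
iomap_ops/iomap_dio_ops tables, and the locking is reduced to the bare
i_rwsem requirement that iomap_dio_rw() asserts via lockdep.

/* Hypothetical caller; the myfs_* names are assumptions, not in this patch. */
static ssize_t myfs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);	/* iomap_dio_rw() lockdep-asserts i_rwsem */
	ret = iomap_dio_rw(iocb, from, &myfs_iomap_ops, &myfs_dio_write_ops);
	inode_unlock(inode);

	return ret;
}

A synchronous kiocb makes iomap_dio_rw() wait and return the byte count
directly; an AIO kiocb typically comes back as -EIOCBQUEUED and completes
through the dio reference counting described above.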
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
new file mode 100644
index 0000000..f26fdd3
--- /dev/null
+++ b/fs/iomap/fiemap.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016-2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+
+struct fiemap_ctx {
+	struct fiemap_extent_info *fi;
+	struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+		struct iomap *iomap, u32 flags)
+{
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/* skip holes */
+		return 0;
+	case IOMAP_DELALLOC:
+		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+		break;
+	case IOMAP_MAPPED:
+		break;
+	case IOMAP_UNWRITTEN:
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+		break;
+	case IOMAP_INLINE:
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+		break;
+	}
+
+	if (iomap->flags & IOMAP_F_MERGED)
+		flags |= FIEMAP_EXTENT_MERGED;
+	if (iomap->flags & IOMAP_F_SHARED)
+		flags |= FIEMAP_EXTENT_SHARED;
+
+	return fiemap_fill_next_extent(fi, iomap->offset,
+			iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
+			iomap->length, flags);
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct fiemap_ctx *ctx = data;
+	loff_t ret = length;
+
+	if (iomap->type == IOMAP_HOLE)
+		return length;
+
+	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+	ctx->prev = *iomap;
+	switch (ret) {
+	case 0:		/* success */
+		return length;
+	case 1:		/* extent array full */
+		return 0;
+	default:
+		return ret;
+	}
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+		loff_t start, loff_t len, const struct iomap_ops *ops)
+{
+	struct fiemap_ctx ctx;
+	loff_t ret;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.fi = fi;
+	ctx.prev.type = IOMAP_HOLE;
+
+	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
+		ret = filemap_write_and_wait(inode->i_mapping);
+		if (ret)
+			return ret;
+	}
+
+	while (len > 0) {
+		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
+				iomap_fiemap_actor);
+		/* inode with no (attribute) mapping will give ENOENT */
+		if (ret == -ENOENT)
+			break;
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			break;
+
+		start += ret;
+		len -= ret;
+	}
+
+	if (ctx.prev.type != IOMAP_HOLE) {
+		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	sector_t *bno = data, addr;
+
+	if (iomap->type == IOMAP_MAPPED) {
+		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+		if (addr > INT_MAX)
+			WARN(1, "would truncate bmap result\n");
+		else
+			*bno = addr;
+	}
+	return 0;
+}
+
+/* legacy ->bmap interface.  0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+		const struct iomap_ops *ops)
+{
+	struct inode *inode = mapping->host;
+	loff_t pos = bno << inode->i_blkbits;
+	unsigned blocksize = i_blocksize(inode);
+
+	if (filemap_write_and_wait(mapping))
+		return 0;
+
+	bno = 0;
+	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+	return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
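
Both exports are thin enough that the expected call sites are one-liners.
A hedged sketch of the wiring, with myfs_iomap_ops standing in for a real
ops table:

/* Hypothetical wiring; myfs_iomap_ops is an assumption. */
static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &myfs_iomap_ops);
}

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return iomap_bmap(mapping, block, &myfs_iomap_ops);
}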
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
new file mode 100644
index 0000000..c04bad4
--- /dev/null
+++ b/fs/iomap/seek.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Red Hat, Inc.
+ * Copyright (c) 2018 Christoph Hellwig.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ * Returns true if found and updates @lastoff to the offset in file.
+ */
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+		int whence)
+{
+	const struct address_space_operations *ops = inode->i_mapping->a_ops;
+	unsigned int bsize = i_blocksize(inode), off;
+	bool seek_data = whence == SEEK_DATA;
+	loff_t poff = page_offset(page);
+
+	if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+		return false;
+
+	if (*lastoff < poff) {
+		/*
+		 * Last offset smaller than the start of the page means we found
+		 * a hole:
+		 */
+		if (whence == SEEK_HOLE)
+			return true;
+		*lastoff = poff;
+	}
+
+	/*
+	 * Just check the page unless we can and should check block ranges:
+	 */
+	if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+		return PageUptodate(page) == seek_data;
+
+	lock_page(page);
+	if (unlikely(page->mapping != inode->i_mapping))
+		goto out_unlock_not_found;
+
+	for (off = 0; off < PAGE_SIZE; off += bsize) {
+		if (offset_in_page(*lastoff) >= off + bsize)
+			continue;
+		if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+			unlock_page(page);
+			return true;
+		}
+		*lastoff = poff + off + bsize;
+	}
+
+out_unlock_not_found:
+	unlock_page(page);
+	return false;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
+ *
+ * Returns the resulting offset on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+		int whence)
+{
+	pgoff_t index = offset >> PAGE_SHIFT;
+	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+	loff_t lastoff = offset;
+	struct pagevec pvec;
+
+	if (length <= 0)
+		return -ENOENT;
+
+	pagevec_init(&pvec);
+
+	do {
+		unsigned nr_pages, i;
+
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+						end - 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page_seek_hole_data(inode, page, &lastoff, whence))
+				goto check_range;
+			lastoff = page_offset(page) + PAGE_SIZE;
+		}
+		pagevec_release(&pvec);
+	} while (index < end);
+
+	/* When no page at lastoff and we are not done, we found a hole. */
+	if (whence != SEEK_HOLE)
+		goto not_found;
+
+check_range:
+	if (lastoff < offset + length)
+		goto out;
+not_found:
+	lastoff = -ENOENT;
+out:
+	pagevec_release(&pvec);
+	return lastoff;
+}
+
+static loff_t
+iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
+		      void *data, struct iomap *iomap)
+{
+	switch (iomap->type) {
+	case IOMAP_UNWRITTEN:
+		offset = page_cache_seek_hole_data(inode, offset, length,
+						   SEEK_HOLE);
+		if (offset < 0)
+			return length;
+		/* fall through */
+	case IOMAP_HOLE:
+		*(loff_t *)data = offset;
+		return 0;
+	default:
+		return length;
+	}
+}
+
+loff_t
+iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+{
+	loff_t size = i_size_read(inode);
+	loff_t length = size - offset;
+	loff_t ret;
+
+	/* Nothing to be found before or beyond the end of the file. */
+	if (offset < 0 || offset >= size)
+		return -ENXIO;
+
+	while (length > 0) {
+		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
+				  &offset, iomap_seek_hole_actor);
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			break;
+
+		offset += ret;
+		length -= ret;
+	}
+
+	return offset;
+}
+EXPORT_SYMBOL_GPL(iomap_seek_hole);
+
+static loff_t
+iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
+		      void *data, struct iomap *iomap)
+{
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		return length;
+	case IOMAP_UNWRITTEN:
+		offset = page_cache_seek_hole_data(inode, offset, length,
+						   SEEK_DATA);
+		if (offset < 0)
+			return length;
+		/*FALLTHRU*/
+	default:
+		*(loff_t *)data = offset;
+		return 0;
+	}
+}
+
+loff_t
+iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+{
+	loff_t size = i_size_read(inode);
+	loff_t length = size - offset;
+	loff_t ret;
+
+	/* Nothing to be found before or beyond the end of the file. */
+	if (offset < 0 || offset >= size)
+		return -ENXIO;
+
+	while (length > 0) {
+		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
+				  &offset, iomap_seek_data_actor);
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			break;
+
+		offset += ret;
+		length -= ret;
+	}
+
+	if (length <= 0)
+		return -ENXIO;
+	return offset;
+}
+EXPORT_SYMBOL_GPL(iomap_seek_data);
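
A caller is expected to dispatch only the SEEK_HOLE/SEEK_DATA cases to
these helpers and leave everything else to the generic code. A sketch
under that assumption (myfs_iomap_ops is again illustrative):

/* Hypothetical ->llseek; myfs_iomap_ops is an assumption. */
static loff_t myfs_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &myfs_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &myfs_iomap_ops);
		break;
	default:
		return generic_file_llseek(file, offset, whence);
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}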
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
new file mode 100644
index 0000000..152a230
--- /dev/null
+++ b/fs/iomap/swapfile.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/swap.h>
+
+/* Swapfile activation */
+
+struct iomap_swapfile_info {
+	struct iomap iomap;		/* accumulated iomap */
+	struct swap_info_struct *sis;
+	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
+	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
+	unsigned long nr_pages;		/* number of pages collected */
+	int nr_extents;			/* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file.  Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary.  The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+	struct iomap *iomap = &isi->iomap;
+	unsigned long nr_pages;
+	uint64_t first_ppage;
+	uint64_t first_ppage_reported;
+	uint64_t next_ppage;
+	int error;
+
+	/*
+	 * Round the start up and the end down so that the physical
+	 * extent aligns to a page boundary.
+	 */
+	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+			PAGE_SHIFT;
+
+	/* Skip too-short physical extents. */
+	if (first_ppage >= next_ppage)
+		return 0;
+	nr_pages = next_ppage - first_ppage;
+
+	/*
+	 * Calculate how much swap space we're adding; the first page contains
+	 * the swap header and doesn't count.  The mm still wants that first
+	 * page fed to add_swap_extent, however.
+	 */
+	first_ppage_reported = first_ppage;
+	if (iomap->offset == 0)
+		first_ppage_reported++;
+	if (isi->lowest_ppage > first_ppage_reported)
+		isi->lowest_ppage = first_ppage_reported;
+	if (isi->highest_ppage < (next_ppage - 1))
+		isi->highest_ppage = next_ppage - 1;
+
+	/* Add extent, set up for the next call. */
+	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+	if (error < 0)
+		return error;
+	isi->nr_extents += error;
+	isi->nr_pages += nr_pages;
+	return 0;
+}
+
+/*
+ * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+		loff_t count, void *data, struct iomap *iomap)
+{
+	struct iomap_swapfile_info *isi = data;
+	int error;
+
+	switch (iomap->type) {
+	case IOMAP_MAPPED:
+	case IOMAP_UNWRITTEN:
+		/* Only real or unwritten extents. */
+		break;
+	case IOMAP_INLINE:
+		/* No inline data. */
+		pr_err("swapon: file is inline\n");
+		return -EINVAL;
+	default:
+		pr_err("swapon: file has unallocated extents\n");
+		return -EINVAL;
+	}
+
+	/* No uncommitted metadata or shared blocks. */
+	if (iomap->flags & IOMAP_F_DIRTY) {
+		pr_err("swapon: file is not committed\n");
+		return -EINVAL;
+	}
+	if (iomap->flags & IOMAP_F_SHARED) {
+		pr_err("swapon: file has shared extents\n");
+		return -EINVAL;
+	}
+
+	/* Only one bdev per swap file. */
+	if (iomap->bdev != isi->sis->bdev) {
+		pr_err("swapon: file is on multiple devices\n");
+		return -EINVAL;
+	}
+
+	if (isi->iomap.length == 0) {
+		/* No accumulated extent, so just store it. */
+		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+		/* Append this to the accumulated extent. */
+		isi->iomap.length += iomap->length;
+	} else {
+		/* Otherwise, add the retained iomap and store this one. */
+		error = iomap_swapfile_add_extent(isi);
+		if (error)
+			return error;
+		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+	}
+	return count;
+}
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+		struct file *swap_file, sector_t *pagespan,
+		const struct iomap_ops *ops)
+{
+	struct iomap_swapfile_info isi = {
+		.sis = sis,
+		.lowest_ppage = (sector_t)-1ULL,
+	};
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = 0;
+	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+	loff_t ret;
+
+	/*
+	 * Persist all file mapping metadata so that we won't have any
+	 * IOMAP_F_DIRTY iomaps.
+	 */
+	ret = vfs_fsync(swap_file, 1);
+	if (ret)
+		return ret;
+
+	while (len > 0) {
+		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+				ops, &isi, iomap_swapfile_activate_actor);
+		if (ret <= 0)
+			return ret;
+
+		pos += ret;
+		len -= ret;
+	}
+
+	if (isi.iomap.length) {
+		ret = iomap_swapfile_add_extent(&isi);
+		if (ret)
+			return ret;
+	}
+
+	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+	sis->max = isi.nr_pages;
+	sis->pages = isi.nr_pages - 1;
+	sis->highest_bit = isi.nr_pages - 1;
+	return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
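
On the consumer side this slots straight into address_space_operations,
since iomap_swapfile_activate() already returns what ->swap_activate is
expected to return: the number of extents added, or a negative errno. A
minimal sketch, assuming a myfs_iomap_ops table:

/* Hypothetical ->swap_activate; myfs_iomap_ops is an assumption. */
static int myfs_swap_activate(struct swap_info_struct *sis,
		struct file *swap_file, sector_t *span)
{
	return iomap_swapfile_activate(sis, swap_file, span, &myfs_iomap_ops);
}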
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
index 8ab9878..5e74195 100644
--- a/fs/isofs/Kconfig
+++ b/fs/isofs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config ISO9660_FS
 	tristate "ISO 9660 CDROM file system support"
 	help
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 10205ec..bc12ac7 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- linux-c -*- ------------------------------------------------------- *
  *   
  *   Copyright 2001 H. Peter Anvin - All Rights Reserved
  *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- *   USA; either version 2 of the License, or (at your option) any later
- *   version; incorporated herein by reference.
- *
  * ----------------------------------------------------------------------- */
 
 /*
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 947ce22..f0fe641 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -46,7 +46,7 @@
 	return i;
 }
 
-/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
+/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */
 int get_acorn_filename(struct iso_directory_record *de,
 			    char *retname, struct inode *inode)
 {
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 85a9093..35768a6 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -10,7 +10,7 @@
  *
  * The following files are helpful:
  *
- *     Documentation/filesystems/nfs/Exporting
+ *     Documentation/filesystems/nfs/exporting.rst
  *     fs/exportfs/expfs.c.
  */
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 488a9e7..62c0462 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/isofs/inode.c
  *
@@ -29,6 +30,9 @@
 #include "isofs.h"
 #include "zisofs.h"
 
+/* max tz offset is 13 hours */
+#define MAX_TZ_OFFSET (52*15*60)
+
 #define BEQUIET
 
 static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
@@ -72,17 +76,11 @@
 	return &ei->vfs_inode;
 }
 
-static void isofs_i_callback(struct rcu_head *head)
+static void isofs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
-static void isofs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, isofs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct iso_inode_info *ei = foo;
@@ -122,7 +120,7 @@
 
 static const struct super_operations isofs_sops = {
 	.alloc_inode	= isofs_alloc_inode,
-	.destroy_inode	= isofs_destroy_inode,
+	.free_inode	= isofs_free_inode,
 	.put_super	= isofs_put_super,
 	.statfs		= isofs_statfs,
 	.remount_fs	= isofs_remount,
@@ -806,6 +804,10 @@
 	 */
 	s->s_maxbytes = 0x80000000000LL;
 
+	/* ECMA-119 timestamp from 1900/1/1 with tz offset */
+	s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET;
+	s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET;
+
 	/* Set this for reference. Its not currently used except on write
 	   which we don't have .. */
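
For reference, the bounds follow directly from the on-disk format: the
ISO 9660 recording date stores the year as one byte of years since 1900
(0..255, i.e. 1900..2155, hence mktime64(U8_MAX + 1900, 12, 31, ...)),
and the timezone field counts 15-minute intervals, at most 52 of them,
so MAX_TZ_OFFSET = 52 * 15 * 60 = 46800 seconds (13 hours) pads each end
of the representable range.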
 
diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h
index 2737957..70f1a21 100644
--- a/fs/isofs/zisofs.h
+++ b/fs/isofs/zisofs.h
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* ----------------------------------------------------------------------- *
  *   
  *   Copyright 2001 H. Peter Anvin - All Rights Reserved
  *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- *   USA; either version 2 of the License, or (at your option) any later
- *   version; incorporated herein by reference.
- *
  * ----------------------------------------------------------------------- */
 
 /*
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 5a9f553..4ad2c67 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config JBD2
 	tristate
 	select CRC32
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index 802a341..126b4da 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux journaling routines.
 #
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 26f8d7e..a190906 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -113,7 +113,7 @@
 	nblocks = jbd2_space_needed(journal);
 	while (jbd2_log_space_left(journal) < nblocks) {
 		write_unlock(&journal->j_state_lock);
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 
 		/*
 		 * Test again, another process may have checkpointed while we
@@ -132,7 +132,6 @@
 			return;
 		}
 		spin_lock(&journal->j_list_lock);
-		nblocks = jbd2_space_needed(journal);
 		space_left = jbd2_log_space_left(journal);
 		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
@@ -276,9 +275,22 @@
 		"JBD2: %s: Waiting for Godot: block %llu\n",
 		journal->j_devname, (unsigned long long) bh->b_blocknr);
 
+			if (batch_count)
+				__flush_batch(journal, &batch_count);
 			jbd2_log_start_commit(journal, tid);
+			/*
+			 * jbd2_journal_commit_transaction() may want
+			 * to take the checkpoint_mutex if JBD2_FLUSHED
+			 * is set; jbd2_update_log_tail(), called by
+			 * jbd2_journal_commit_transaction(), may also take
+			 * the checkpoint_mutex.  So we need to temporarily
+			 * drop it.
+			 */
+			mutex_unlock(&journal->j_checkpoint_mutex);
 			jbd2_log_wait_commit(journal, tid);
-			goto retry;
+			mutex_lock_io(&journal->j_checkpoint_mutex);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
 		if (!buffer_dirty(bh)) {
 			if (unlikely(buffer_write_io_error(bh)) && !result)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 150cc03..132fb92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -184,17 +184,18 @@
 /*
  * write the filemap data using writepage() address_space_operations.
  * We don't do block allocation here even for delalloc. We don't
- * use writepages() because with dealyed allocation we may be doing
+ * use writepages() because with delayed allocation we may be doing
  * block allocation in writepages().
  */
-static int journal_submit_inode_data_buffers(struct address_space *mapping)
+static int journal_submit_inode_data_buffers(struct address_space *mapping,
+		loff_t dirty_start, loff_t dirty_end)
 {
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode =  WB_SYNC_ALL,
 		.nr_to_write = mapping->nrpages * 2,
-		.range_start = 0,
-		.range_end = i_size_read(mapping->host),
+		.range_start = dirty_start,
+		.range_end = dirty_end,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
@@ -218,6 +219,9 @@
 
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		loff_t dirty_start = jinode->i_dirty_start;
+		loff_t dirty_end = jinode->i_dirty_end;
+
 		if (!(jinode->i_flags & JI_WRITE_DATA))
 			continue;
 		mapping = jinode->i_vfs_inode->i_mapping;
@@ -230,7 +234,8 @@
 		 * only allocated blocks here.
 		 */
 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
-		err = journal_submit_inode_data_buffers(mapping);
+		err = journal_submit_inode_data_buffers(mapping, dirty_start,
+				dirty_end);
 		if (!ret)
 			ret = err;
 		spin_lock(&journal->j_list_lock);
@@ -257,12 +262,16 @@
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		loff_t dirty_start = jinode->i_dirty_start;
+		loff_t dirty_end = jinode->i_dirty_end;
+
 		if (!(jinode->i_flags & JI_WAIT_DATA))
 			continue;
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
-		err = filemap_fdatawait_keep_errors(
-				jinode->i_vfs_inode->i_mapping);
+		err = filemap_fdatawait_range_keep_errors(
+				jinode->i_vfs_inode->i_mapping, dirty_start,
+				dirty_end);
 		if (!ret)
 			ret = err;
 		spin_lock(&journal->j_list_lock);
@@ -282,6 +291,8 @@
 				&jinode->i_transaction->t_inode_list);
 		} else {
 			jinode->i_transaction = NULL;
+			jinode->i_dirty_start = 0;
+			jinode->i_dirty_end = 0;
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -439,6 +450,8 @@
 		finish_wait(&journal->j_wait_updates, &wait);
 	}
 	spin_unlock(&commit_transaction->t_handle_lock);
+	commit_transaction->t_state = T_SWITCH;
+	write_unlock(&journal->j_state_lock);
 
 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 			journal->j_max_transaction_buffers);
@@ -505,6 +518,7 @@
 	atomic_sub(atomic_read(&journal->j_reserved_credits),
 		   &commit_transaction->t_outstanding_credits);
 
+	write_lock(&journal->j_state_lock);
 	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.run.rs_flushing = jiffies;
 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -691,9 +705,11 @@
                            the last tag we set up. */
 
 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
-			jbd2_descriptor_block_csum_set(journal, descriptor);
 start_journal_io:
+			if (descriptor)
+				jbd2_descriptor_block_csum_set(journal,
+							descriptor);
+
 			for (i = 0; i < bufs; i++) {
 				struct buffer_head *bh = wbuf[i];
 				/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6d..1c58859 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -66,9 +66,6 @@
 EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_forget);
-#if 0
-EXPORT_SYMBOL(journal_sync_buffer);
-#endif
 EXPORT_SYMBOL(jbd2_journal_flush);
 EXPORT_SYMBOL(jbd2_journal_revoke);
 
@@ -92,8 +89,8 @@
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_inode_add_write);
-EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -142,22 +139,6 @@
 	return cpu_to_be32(csum);
 }
 
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
-	if (!jbd2_journal_has_csum_v2or3(j))
-		return 1;
-
-	return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
-	if (!jbd2_journal_has_csum_v2or3(j))
-		return;
-
-	sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
 /*
  * Helper function used to manage commit timeouts
  */
@@ -219,7 +200,7 @@
 	if (journal->j_flags & JBD2_UNMOUNT)
 		goto end_loop;
 
-	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
 		journal->j_commit_sequence, journal->j_commit_request);
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
@@ -340,7 +321,7 @@
  * IO is in progress. do_get_write_access() handles this.
  *
  * The function returns a pointer to the buffer_head to be used for IO.
- * 
+ *
  *
  * Return value:
  *  <0: Error
@@ -516,7 +497,7 @@
 		 */
 
 		journal->j_commit_request = target;
-		jbd_debug(1, "JBD2: requesting commit %d/%d\n",
+		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
 		journal->j_running_transaction->t_requested = jiffies;
@@ -529,7 +510,7 @@
 		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence,
-			  target, journal->j_running_transaction ? 
+			  target, journal->j_running_transaction ?
 			  journal->j_running_transaction->t_tid : 0);
 	return 0;
 }
@@ -714,12 +695,12 @@
 #ifdef CONFIG_JBD2_DEBUG
 	if (!tid_geq(journal->j_commit_request, tid)) {
 		printk(KERN_ERR
-		       "%s: error: j_commit_request=%d, tid=%d\n",
+		       "%s: error: j_commit_request=%u, tid=%u\n",
 		       __func__, journal->j_commit_request, tid);
 	}
 #endif
 	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
+		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
 				  tid, journal->j_commit_sequence);
 		read_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
@@ -960,7 +941,7 @@
 
 	trace_jbd2_update_log_tail(journal, tid, block, freed);
 	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
+		  "Cleaning journal tail from %u to %u (offset %lu), "
 		  "freeing %lu\n",
 		  journal->j_tail_sequence, tid, block, freed);
 
@@ -1334,7 +1315,7 @@
 	 */
 	if (sb->s_start == 0) {
 		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
-			"(start %ld, seq %d, errno %d)\n",
+			"(start %ld, seq %u, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		journal->j_flags |= JBD2_FLUSHED;
@@ -1356,16 +1337,23 @@
 	return jbd2_journal_start_thread(journal);
 }
 
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked.
+ */
 static int jbd2_write_superblock(journal_t *journal, int write_flags)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
 	journal_superblock_t *sb = journal->j_superblock;
 	int ret;
 
+	/* Buffer got discarded which means block device got invalidated */
+	if (!buffer_mapped(bh))
+		return -EIO;
+
 	trace_jbd2_write_superblock(journal, write_flags);
 	if (!(journal->j_flags & JBD2_BARRIER))
 		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
-	lock_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the journal
@@ -1381,7 +1369,8 @@
 		clear_buffer_write_io_error(bh);
 		set_buffer_uptodate(bh);
 	}
-	jbd2_superblock_csum_set(journal, sb);
+	if (jbd2_journal_has_csum_v2or3(journal))
+		sb->s_checksum = jbd2_superblock_csum(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -1424,6 +1413,7 @@
 	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
 		  tail_block, tail_tid);
 
+	lock_buffer(journal->j_sb_buffer);
 	sb->s_sequence = cpu_to_be32(tail_tid);
 	sb->s_start    = cpu_to_be32(tail_block);
 
@@ -1454,18 +1444,17 @@
 	journal_superblock_t *sb = journal->j_superblock;
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	read_lock(&journal->j_state_lock);
-	/* Is it already empty? */
-	if (sb->s_start == 0) {
-		read_unlock(&journal->j_state_lock);
+	lock_buffer(journal->j_sb_buffer);
+	if (sb->s_start == 0) {		/* Is it already empty? */
+		unlock_buffer(journal->j_sb_buffer);
 		return;
 	}
-	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+
+	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
 		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(0);
-	read_unlock(&journal->j_state_lock);
 
 	jbd2_write_superblock(journal, write_op);
 
@@ -1488,9 +1477,8 @@
 	journal_superblock_t *sb = journal->j_superblock;
 	int errcode;
 
-	read_lock(&journal->j_state_lock);
+	lock_buffer(journal->j_sb_buffer);
 	errcode = journal->j_errno;
-	read_unlock(&journal->j_state_lock);
 	if (errcode == -ESHUTDOWN)
 		errcode = 0;
 	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1595,17 +1583,18 @@
 		}
 	}
 
-	/* Check superblock checksum */
-	if (!jbd2_superblock_csum_verify(journal, sb)) {
-		printk(KERN_ERR "JBD2: journal checksum error\n");
-		err = -EFSBADCRC;
-		goto out;
-	}
+	if (jbd2_journal_has_csum_v2or3(journal)) {
+		/* Check superblock checksum */
+		if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+			printk(KERN_ERR "JBD2: journal checksum error\n");
+			err = -EFSBADCRC;
+			goto out;
+		}
 
-	/* Precompute checksum seed for all metadata */
-	if (jbd2_journal_has_csum_v2or3(journal))
+		/* Precompute checksum seed for all metadata */
 		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
 						   sizeof(sb->s_uuid));
+	}
 
 	set_buffer_verified(bh);
 
@@ -1894,28 +1883,27 @@
 
 	sb = journal->j_superblock;
 
+	/* Load the checksum driver if necessary */
+	if ((journal->j_chksum_driver == NULL) &&
+	    INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+		if (IS_ERR(journal->j_chksum_driver)) {
+			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+			journal->j_chksum_driver = NULL;
+			return 0;
+		}
+		/* Precompute checksum seed for all metadata */
+		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+						   sizeof(sb->s_uuid));
+	}
+
+	lock_buffer(journal->j_sb_buffer);
+
 	/* If enabling v3 checksums, update superblock */
 	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
 		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
 		sb->s_feature_compat &=
 			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
-		/* Load the checksum driver */
-		if (journal->j_chksum_driver == NULL) {
-			journal->j_chksum_driver = crypto_alloc_shash("crc32c",
-								      0, 0);
-			if (IS_ERR(journal->j_chksum_driver)) {
-				printk(KERN_ERR "JBD2: Cannot load crc32c "
-				       "driver.\n");
-				journal->j_chksum_driver = NULL;
-				return 0;
-			}
-
-			/* Precompute checksum seed for all metadata */
-			journal->j_csum_seed = jbd2_chksum(journal, ~0,
-							   sb->s_uuid,
-							   sizeof(sb->s_uuid));
-		}
 	}
 
 	/* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1915,7 @@
 	sb->s_feature_compat    |= cpu_to_be32(compat);
 	sb->s_feature_ro_compat |= cpu_to_be32(ro);
 	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+	unlock_buffer(journal->j_sb_buffer);
 
 	return 1;
 #undef COMPAT_FEATURE_ON
@@ -2067,7 +2056,7 @@
 	err = jbd2_journal_skip_recovery(journal);
 	if (write) {
 		/* Lock to make assertions happy... */
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
@@ -2383,22 +2372,19 @@
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
 
-static int jbd2_journal_init_journal_head_cache(void)
+static int __init jbd2_journal_init_journal_head_cache(void)
 {
-	int retval;
-
-	J_ASSERT(jbd2_journal_head_cache == NULL);
+	J_ASSERT(!jbd2_journal_head_cache);
 	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
 				sizeof(struct journal_head),
 				0,		/* offset */
 				SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
 				NULL);		/* ctor */
-	retval = 0;
 	if (!jbd2_journal_head_cache) {
-		retval = -ENOMEM;
 		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
+		return -ENOMEM;
 	}
-	return retval;
+	return 0;
 }
 
 static void jbd2_journal_destroy_journal_head_cache(void)
@@ -2585,6 +2571,8 @@
 	jinode->i_next_transaction = NULL;
 	jinode->i_vfs_inode = inode;
 	jinode->i_flags = 0;
+	jinode->i_dirty_start = 0;
+	jinode->i_dirty_end = 0;
 	INIT_LIST_HEAD(&jinode->i_list);
 }
 
@@ -2644,28 +2632,38 @@
 
 struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
 
-static int __init jbd2_journal_init_handle_cache(void)
+static int __init jbd2_journal_init_inode_cache(void)
 {
-	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
-	if (jbd2_handle_cache == NULL) {
-		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
-		return -ENOMEM;
-	}
+	J_ASSERT(!jbd2_inode_cache);
 	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
-	if (jbd2_inode_cache == NULL) {
-		printk(KERN_EMERG "JBD2: failed to create inode cache\n");
-		kmem_cache_destroy(jbd2_handle_cache);
+	if (!jbd2_inode_cache) {
+		pr_emerg("JBD2: failed to create inode cache\n");
 		return -ENOMEM;
 	}
 	return 0;
 }
 
+static int __init jbd2_journal_init_handle_cache(void)
+{
+	J_ASSERT(!jbd2_handle_cache);
+	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
+	if (!jbd2_handle_cache) {
+		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void jbd2_journal_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(jbd2_inode_cache);
+	jbd2_inode_cache = NULL;
+}
+
 static void jbd2_journal_destroy_handle_cache(void)
 {
 	kmem_cache_destroy(jbd2_handle_cache);
 	jbd2_handle_cache = NULL;
-	kmem_cache_destroy(jbd2_inode_cache);
-	jbd2_inode_cache = NULL;
 }
 
 /*
@@ -2676,21 +2674,27 @@
 {
 	int ret;
 
-	ret = jbd2_journal_init_revoke_caches();
+	ret = jbd2_journal_init_revoke_record_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_revoke_table_cache();
 	if (ret == 0)
 		ret = jbd2_journal_init_journal_head_cache();
 	if (ret == 0)
 		ret = jbd2_journal_init_handle_cache();
 	if (ret == 0)
+		ret = jbd2_journal_init_inode_cache();
+	if (ret == 0)
 		ret = jbd2_journal_init_transaction_cache();
 	return ret;
 }
 
 static void jbd2_journal_destroy_caches(void)
 {
-	jbd2_journal_destroy_revoke_caches();
+	jbd2_journal_destroy_revoke_record_cache();
+	jbd2_journal_destroy_revoke_table_cache();
 	jbd2_journal_destroy_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_inode_cache();
 	jbd2_journal_destroy_transaction_cache();
 	jbd2_journal_destroy_slabs();
 }
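
The recurring pattern behind these journal.c hunks is the new
jbd2_write_superblock() convention: callers serialise superblock updates
with the buffer lock rather than j_state_lock, and the submission path
(end_buffer_write_sync()) is what eventually unlocks the buffer. Reduced
to a caller-side sketch, with the field updates left abstract:

	lock_buffer(journal->j_sb_buffer);
	/* ... update journal->j_superblock fields under the buffer lock ... */
	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
	/* no unlock_buffer() here: the write path now owns the lock */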
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a1143e5..f08073d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -178,33 +178,41 @@
 	return NULL;
 }
 
-void jbd2_journal_destroy_revoke_caches(void)
+void jbd2_journal_destroy_revoke_record_cache(void)
 {
 	kmem_cache_destroy(jbd2_revoke_record_cache);
 	jbd2_revoke_record_cache = NULL;
+}
+
+void jbd2_journal_destroy_revoke_table_cache(void)
+{
 	kmem_cache_destroy(jbd2_revoke_table_cache);
 	jbd2_revoke_table_cache = NULL;
 }
 
-int __init jbd2_journal_init_revoke_caches(void)
+int __init jbd2_journal_init_revoke_record_cache(void)
 {
 	J_ASSERT(!jbd2_revoke_record_cache);
-	J_ASSERT(!jbd2_revoke_table_cache);
-
 	jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
 					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
-	if (!jbd2_revoke_record_cache)
-		goto record_cache_failure;
 
+	if (!jbd2_revoke_record_cache) {
+		pr_emerg("JBD2: failed to create revoke_record cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int __init jbd2_journal_init_revoke_table_cache(void)
+{
+	J_ASSERT(!jbd2_revoke_table_cache);
 	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
 					     SLAB_TEMPORARY);
-	if (!jbd2_revoke_table_cache)
-		goto table_cache_failure;
-	return 0;
-table_cache_failure:
-	jbd2_journal_destroy_revoke_caches();
-record_cache_failure:
+	if (!jbd2_revoke_table_cache) {
+		pr_emerg("JBD2: failed to create revoke_table cache\n");
 		return -ENOMEM;
+	}
+	return 0;
 }
 
 static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
@@ -630,10 +638,8 @@
 {
 	jbd2_journal_revoke_header_t *header;
 
-	if (is_journal_aborted(journal)) {
-		put_bh(descriptor);
+	if (is_journal_aborted(journal))
 		return;
-	}
 
 	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
 	header->r_count = cpu_to_be32(offset);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c0b66a7..bee8498 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -42,9 +42,11 @@
 					0,
 					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
 					NULL);
-	if (transaction_cache)
-		return 0;
-	return -ENOMEM;
+	if (!transaction_cache) {
+		pr_emerg("JBD2: failed to create transaction cache\n");
+		return -ENOMEM;
+	}
+	return 0;
 }
 
 void jbd2_journal_destroy_transaction_cache(void)
@@ -63,7 +65,7 @@
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
- * Simply allocate and initialise a new transaction.  Create it in
+ * Simply initialise a new transaction.  Create it in
  * RUNNING state and add it to the current journal (which should not
  * have an existing running transaction: we only make a new transaction
  * once we have started to commit the old one).
@@ -75,8 +77,8 @@
  *
  */
 
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+				transaction_t *transaction)
 {
 	transaction->t_journal = journal;
 	transaction->t_state = T_RUNNING;
@@ -100,8 +102,6 @@
 	transaction->t_max_wait = 0;
 	transaction->t_start = jiffies;
 	transaction->t_requested = 0;
-
-	return transaction;
 }
 
 /*
@@ -138,9 +138,9 @@
 }
 
 /*
- * Wait until running transaction passes T_LOCKED state. Also starts the commit
- * if needed. The function expects running transaction to exist and releases
- * j_state_lock.
+ * Wait until the running transaction reaches T_FLUSH state and a new
+ * transaction can thus be started. Also starts the commit if needed. The
+ * function expects the running transaction to exist and releases j_state_lock.
  */
 static void wait_transaction_locked(journal_t *journal)
 	__releases(journal->j_state_lock)
@@ -160,6 +160,32 @@
 	finish_wait(&journal->j_wait_transaction_locked, &wait);
 }
 
+/*
+ * Wait until the running transaction transitions from T_SWITCH to T_FLUSH
+ * state and a new transaction can thus be started. The function releases
+ * j_state_lock.
+ */
+static void wait_transaction_switching(journal_t *journal)
+	__releases(journal->j_state_lock)
+{
+	DEFINE_WAIT(wait);
+
+	if (WARN_ON(!journal->j_running_transaction ||
+		    journal->j_running_transaction->t_state != T_SWITCH))
+		return;
+	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+			TASK_UNINTERRUPTIBLE);
+	read_unlock(&journal->j_state_lock);
+	/*
+	 * We don't call jbd2_might_wait_for_commit() here as there's no
+	 * waiting for outstanding handles happening anymore in T_SWITCH state
+	 * and handling of reserved handles actually relies on that for
+	 * correctness.
+	 */
+	schedule();
+	finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
 static void sub_reserved_credits(journal_t *journal, int blocks)
 {
 	atomic_sub(blocks, &journal->j_reserved_credits);
@@ -183,7 +209,8 @@
 	 * If the current transaction is locked down for commit, wait
 	 * for the lock to be released.
 	 */
-	if (t->t_state == T_LOCKED) {
+	if (t->t_state != T_RUNNING) {
+		WARN_ON_ONCE(t->t_state >= T_FLUSH);
 		wait_transaction_locked(journal);
 		return 1;
 	}
@@ -360,8 +387,14 @@
 		/*
 		 * We have handle reserved so we are allowed to join T_LOCKED
 		 * transaction and we don't have to check for transaction size
-		 * and journal space.
+		 * and journal space. But we still have to wait while running
+		 * transaction is being switched to a committing one as it
+		 * won't wait for any handles anymore.
 		 */
+		if (transaction->t_state == T_SWITCH) {
+			wait_transaction_switching(journal);
+			goto repeat;
+		}
 		sub_reserved_credits(journal, blocks);
 		handle->h_reserved = 0;
 	}
@@ -536,6 +569,9 @@
 	}
 	handle->h_type = type;
 	handle->h_line_no = line_no;
+	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+				handle->h_transaction->t_tid, type,
+				line_no, handle->h_buffer_credits);
 	return 0;
 }
 EXPORT_SYMBOL(jbd2_journal_start_reserved);
@@ -910,7 +946,7 @@
 	 * this is the first time this transaction is touching this buffer,
 	 * reset the modified flag
 	 */
-       jh->b_modified = 0;
+	jh->b_modified = 0;
 
 	/*
 	 * If the buffer is not journaled right now, we need to make sure it
@@ -1219,11 +1255,12 @@
 	struct journal_head *jh;
 	char *committed_data = NULL;
 
-	JBUFFER_TRACE(jh, "entry");
 	if (jbd2_write_access_granted(handle, bh, true))
 		return 0;
 
 	jh = jbd2_journal_add_journal_head(bh);
+	JBUFFER_TRACE(jh, "entry");
+
 	/*
 	 * Do this first --- it can drop the journal lock, so we want to
 	 * make sure that obtaining the committed_data is done
@@ -1334,15 +1371,17 @@
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
-	if (!buffer_jbd(bh)) {
-		ret = -EUCLEAN;
-		goto out;
-	}
+	if (!buffer_jbd(bh))
+		return -EUCLEAN;
+
 	/*
 	 * We don't grab jh reference here since the buffer must be part
 	 * of the running transaction.
 	 */
 	jh = bh2jh(bh);
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
+
 	/*
 	 * This and the following assertions are unreliable since we may see jh
 	 * in inconsistent state unless we grab bh_state lock. But this is
@@ -1376,9 +1415,6 @@
 	}
 
 	journal = transaction->t_journal;
-	jbd_debug(5, "journal_head %p\n", jh);
-	JBUFFER_TRACE(jh, "entry");
-
 	jbd_lock_bh_state(bh);
 
 	if (jh->b_modified == 0) {
@@ -1564,9 +1600,7 @@
 			__jbd2_journal_unfile_buffer(jh);
 			if (!buffer_jbd(bh)) {
 				spin_unlock(&journal->j_list_lock);
-				jbd_unlock_bh_state(bh);
-				__bforget(bh);
-				goto drop;
+				goto not_jbd;
 			}
 		}
 		spin_unlock(&journal->j_list_lock);
@@ -1576,14 +1610,21 @@
 		/* However, if the buffer is still owned by a prior
 		 * (committing) transaction, we can't drop it yet... */
 		JBUFFER_TRACE(jh, "belongs to older transaction");
-		/* ... but we CAN drop it from the new transaction if we
-		 * have also modified it since the original commit. */
+		/* ... but we CAN drop it from the new transaction by
+		 * marking the buffer as freed and setting j_next_transaction
+		 * to the new transaction, so that not only does the commit
+		 * code know it should clear dirty bits when it is done with
+		 * the buffer, but also the buffer can be checkpointed only
+		 * after the new transaction commits. */
 
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction == transaction);
+		set_buffer_freed(bh);
+
+		if (!jh->b_next_transaction) {
 			spin_lock(&journal->j_list_lock);
-			jh->b_next_transaction = NULL;
+			jh->b_next_transaction = transaction;
 			spin_unlock(&journal->j_list_lock);
+		} else {
+			J_ASSERT(jh->b_next_transaction == transaction);
 
 			/*
 			 * only drop a reference if this transaction modified
@@ -1592,9 +1633,40 @@
 			if (was_modified)
 				drop_reserve = 1;
 		}
+	} else {
+		/*
+		 * Finally, if the buffer does not belong to any
+		 * transaction, we can just drop it now if it has no
+		 * checkpoint.
+		 */
+		spin_lock(&journal->j_list_lock);
+		if (!jh->b_cp_transaction) {
+			JBUFFER_TRACE(jh, "belongs to none transaction");
+			spin_unlock(&journal->j_list_lock);
+			goto not_jbd;
+		}
+
+		/*
+		 * Otherwise, if the buffer has been written to disk,
+		 * it is safe to remove the checkpoint and drop it.
+		 */
+		if (!buffer_dirty(bh)) {
+			__jbd2_journal_remove_checkpoint(jh);
+			spin_unlock(&journal->j_list_lock);
+			goto not_jbd;
+		}
+
+		/*
+		 * The buffer has not yet been written to disk, so we should
+		 * attach this buffer to the current transaction so that the
+		 * buffer can be checkpointed only after the current
+		 * transaction commits.
+		 */
+		clear_buffer_dirty(bh);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		spin_unlock(&journal->j_list_lock);
 	}
 
-not_jbd:
 	jbd_unlock_bh_state(bh);
 	__brelse(bh);
 drop:
@@ -1603,6 +1675,11 @@
 		handle->h_buffer_credits++;
 	}
 	return err;
+
+not_jbd:
+	jbd_unlock_bh_state(bh);
+	__bforget(bh);
+	goto drop;
 }
 
 /**
@@ -2491,7 +2568,7 @@
  * File inode in the inode list of the handle's transaction
  */
 static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
-				   unsigned long flags)
+		unsigned long flags, loff_t start_byte, loff_t end_byte)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
@@ -2503,26 +2580,17 @@
 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
 
-	/*
-	 * First check whether inode isn't already on the transaction's
-	 * lists without taking the lock. Note that this check is safe
-	 * without the lock as we cannot race with somebody removing inode
-	 * from the transaction. The reason is that we remove inode from the
-	 * transaction only in journal_release_jbd_inode() and when we commit
-	 * the transaction. We are guarded from the first case by holding
-	 * a reference to the inode. We are safe against the second case
-	 * because if jinode->i_transaction == transaction, commit code
-	 * cannot touch the transaction because we hold reference to it,
-	 * and if jinode->i_next_transaction == transaction, commit code
-	 * will only file the inode where we want it.
-	 */
-	if ((jinode->i_transaction == transaction ||
-	    jinode->i_next_transaction == transaction) &&
-	    (jinode->i_flags & flags) == flags)
-		return 0;
-
 	spin_lock(&journal->j_list_lock);
 	jinode->i_flags |= flags;
+
+	if (jinode->i_dirty_end) {
+		jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
+		jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
+	} else {
+		jinode->i_dirty_start = start_byte;
+		jinode->i_dirty_end = end_byte;
+	}
+
 	/* Is inode already attached where we need it? */
 	if (jinode->i_transaction == transaction ||
 	    jinode->i_next_transaction == transaction)
@@ -2554,15 +2622,19 @@
 	return 0;
 }
 
-int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+int jbd2_journal_inode_ranged_write(handle_t *handle,
+		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
 {
 	return jbd2_journal_file_inode(handle, jinode,
-				       JI_WRITE_DATA | JI_WAIT_DATA);
+			JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
+			start_byte + length - 1);
 }
 
-int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
+		loff_t start_byte, loff_t length)
 {
-	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
+			start_byte, start_byte + length - 1);
 }
 
 /*
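
The effect on callers of the two renamed exports is mechanical; a sketch
of the conversion (the converted call sites live in ext4, outside this
section):

	/* before: tag the whole inode for commit-time writeback */
	err = jbd2_journal_inode_add_write(handle, jinode);

	/* after: tag only the byte range this handle actually dirtied */
	err = jbd2_journal_inode_ranged_write(handle, jinode, pos, len);

Commit then writes back and waits on only [i_dirty_start, i_dirty_end],
as the commit.c hunks above show, instead of the whole mapping.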
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
index ad850c5..7c96bc1 100644
--- a/fs/jffs2/Kconfig
+++ b/fs/jffs2/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config JFFS2_FS
 	tristate "Journalling Flash File System v2 (JFFS2) support"
 	select CRC32
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 453a6a1..2b4d501 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -125,7 +125,7 @@
 			if (try_to_freeze())
 				goto again;
 
-			signr = kernel_dequeue_signal(NULL);
+			signr = kernel_dequeue_signal();
 
 			switch(signr) {
 			case SIGSTOP:
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 7d8654a..f8fb89b 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -109,9 +109,9 @@
 	return ret;
 }
 
-int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
+int jffs2_do_readpage_unlock(void *data, struct page *pg)
 {
-	int ret = jffs2_do_readpage_nolock(inode, pg);
+	int ret = jffs2_do_readpage_nolock(data, pg);
 	unlock_page(pg);
 	return ret;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index eab04ec..ab8cdd9 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/cred.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/list.h>
 #include <linux/mtd/mtd.h>
 #include <linux/pagemap.h>
@@ -184,7 +185,7 @@
 	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
 		truncate_setsize(inode, iattr->ia_size);
 		inode->i_blocks = (inode->i_size + 511) >> 9;
-	}	
+	}
 
 	return 0;
 }
@@ -340,6 +341,7 @@
 			rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
 		else
 			rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
+		/* fall through */
 
 	case S_IFSOCK:
 	case S_IFIFO:
@@ -390,7 +392,7 @@
 	jffs2_do_setattr(inode, &iattr);
 }
 
-int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
+int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
@@ -408,10 +410,10 @@
 		mutex_unlock(&c->alloc_sem);
 	}
 
-	if (!(*flags & SB_RDONLY))
+	if (!(fc->sb_flags & SB_RDONLY))
 		jffs2_start_garbage_collect_thread(c);
 
-	*flags |= SB_NOATIME;
+	fc->sb_flags |= SB_NOATIME;
 	return 0;
 }
 
@@ -508,7 +510,7 @@
 	return hashsize;
 }
 
-int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
+int jffs2_do_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct jffs2_sb_info *c;
 	struct inode *root_i;
@@ -523,11 +525,11 @@
 
 #ifndef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (c->mtd->type == MTD_NANDFLASH) {
-		pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n");
+		errorf(fc, "Cannot operate on NAND flash unless jffs2 NAND support is compiled in");
 		return -EINVAL;
 	}
 	if (c->mtd->type == MTD_DATAFLASH) {
-		pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n");
+		errorf(fc, "Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in");
 		return -EINVAL;
 	}
 #endif
@@ -541,12 +543,12 @@
 	 */
 	if ((c->sector_size * blocks) != c->flash_size) {
 		c->flash_size = c->sector_size * blocks;
-		pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n",
-			c->flash_size / 1024);
+		infof(fc, "Flash size not aligned to erasesize, reducing to %dKiB",
+		      c->flash_size / 1024);
 	}
 
 	if (c->flash_size < 5*c->sector_size) {
-		pr_err("Too few erase blocks (%d)\n",
+		errorf(fc, "Too few erase blocks (%d)",
 		       c->flash_size / c->sector_size);
 		return -EINVAL;
 	}
@@ -589,6 +591,9 @@
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = JFFS2_SUPER_MAGIC;
+	sb->s_time_min = 0;
+	sb->s_time_max = U32_MAX;
+
 	if (!sb_rdonly(sb))
 		jffs2_start_garbage_collect_thread(c);
 	return 0;
@@ -677,33 +682,6 @@
 	return JFFS2_INODE_INFO(inode);
 }
 
-unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
-				   struct jffs2_inode_info *f,
-				   unsigned long offset,
-				   unsigned long *priv)
-{
-	struct inode *inode = OFNI_EDONI_2SFFJ(f);
-	struct page *pg;
-
-	pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT,
-			     (void *)jffs2_do_readpage_unlock, inode);
-	if (IS_ERR(pg))
-		return (void *)pg;
-
-	*priv = (unsigned long)pg;
-	return kmap(pg);
-}
-
-void jffs2_gc_release_page(struct jffs2_sb_info *c,
-			   unsigned char *ptr,
-			   unsigned long *priv)
-{
-	struct page *pg = (void *)*priv;
-
-	kunmap(pg);
-	put_page(pg);
-}
-
 static int jffs2_flash_setup(struct jffs2_sb_info *c) {
 	int ret = 0;
 
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 9ed0f26..373b3b7 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1165,12 +1165,13 @@
 				       struct jffs2_inode_info *f, struct jffs2_full_dnode *fn,
 				       uint32_t start, uint32_t end)
 {
+	struct inode *inode = OFNI_EDONI_2SFFJ(f);
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
 	uint32_t alloclen, offset, orig_end, orig_start;
 	int ret = 0;
 	unsigned char *comprbuf = NULL, *writebuf;
-	unsigned long pg;
+	struct page *page;
 	unsigned char *pg_ptr;
 
 	memset(&ri, 0, sizeof(ri));
@@ -1325,15 +1326,18 @@
 	 * end up here trying to GC the *same* page that jffs2_write_begin() is
 	 * trying to write out, read_cache_page() will not deadlock. */
 	mutex_unlock(&f->sem);
-	pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
-	mutex_lock(&f->sem);
-
-	if (IS_ERR(pg_ptr)) {
+	page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT,
+			       jffs2_do_readpage_unlock, inode);
+	if (IS_ERR(page)) {
 		pr_warn("read_cache_page() returned error: %ld\n",
-			PTR_ERR(pg_ptr));
-		return PTR_ERR(pg_ptr);
+			PTR_ERR(page));
+		mutex_lock(&f->sem);
+		return PTR_ERR(page);
 	}
 
+	pg_ptr = kmap(page);
+	mutex_lock(&f->sem);
+
 	offset = start;
 	while(offset < orig_end) {
 		uint32_t datalen;
@@ -1396,6 +1400,7 @@
 		}
 	}
 
-	jffs2_gc_release_page(c, pg_ptr, &pg);
+	kunmap(page);
+	put_page(page);
 	return ret;
 }
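With jffs2_gc_fetch_page()/jffs2_gc_release_page() gone, the GC path open-codes the standard page-cache pattern: read_cache_page() returns a referenced, up-to-date page (or an ERR_PTR with no reference held), kmap() gives a temporary kernel mapping, and the pair is undone with kunmap()/put_page(). A hedged kernel-context sketch; read_one_page() is a hypothetical helper:

static int read_one_page(struct inode *inode, pgoff_t index, unsigned char *dst)
{
	struct page *page;
	unsigned char *kaddr;

	page = read_cache_page(inode->i_mapping, index,
			       jffs2_do_readpage_unlock, inode);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* no reference held on error */

	kaddr = kmap(page);		/* temporary mapping (relevant on HIGHMEM) */
	memcpy(dst, kaddr, PAGE_SIZE);
	kunmap(page);			/* must pair with kmap() */
	put_page(page);			/* drop read_cache_page()'s reference */
	return 0;
}

Note the mutex choreography in the hunk above: f->sem is retaken only after the page read, so a concurrent jffs2_write_begin() on the same page cannot deadlock against the GC thread.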
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a2dbbb3..ef1cfa6 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -155,7 +155,7 @@
 extern const struct inode_operations jffs2_file_inode_operations;
 extern const struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, loff_t, loff_t, int);
-int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
+int jffs2_do_readpage_unlock(void *data, struct page *pg);
 
 /* ioctl.c */
 long jffs2_ioctl(struct file *, unsigned int, unsigned long);
@@ -172,8 +172,8 @@
 struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-int jffs2_do_remount_fs(struct super_block *, int *, char *);
-int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
+int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc);
+int jffs2_do_fill_super(struct super_block *sb, struct fs_context *fc);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
 			    struct jffs2_inode_info *f);
 struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c,
@@ -183,9 +183,6 @@
 				   struct jffs2_inode_info *f,
 				   unsigned long offset,
 				   unsigned long *priv);
-void jffs2_gc_release_page(struct jffs2_sb_info *c,
-			   unsigned char *pg,
-			   unsigned long *priv);
 void jffs2_flash_cleanup(struct jffs2_sb_info *c);
 
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 389ea53..bccfc40 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1414,11 +1414,6 @@
 
 	jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL);
 
-	if (f->target) {
-		kfree(f->target);
-		f->target = NULL;
-	}
-
 	fds = f->dents;
 	while(fds) {
 		fd = fds;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 90431dd..5f7e284 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -527,8 +527,11 @@
 					err = jffs2_fill_scan_buf(c, sumptr, 
 								  jeb->offset + c->sector_size - sumlen,
 								  sumlen - buf_len);				
-					if (err)
+					if (err) {
+						if (sumlen > buf_size)
+							kfree(sumptr);
 						return err;
+					}
 				}
 			}
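The scan.c change plugs a leak on the error path. sumptr only owns a separate allocation when the erase-block summary is too large for the existing scan buffer (sumlen > buf_size); otherwise it aliases the tail of that buffer and must not be freed. A schematic of the ownership rule, with the surrounding logic simplified:

/* kernel-context sketch of jffs2_scan_eraseblock()'s summary handling */
unsigned char *sumptr = buf + buf_size - sumlen;	/* borrowed: tail of buf */

if (sumlen > buf_size) {
	sumptr = kmalloc(sumlen, GFP_KERNEL);		/* owned: must be freed */
	if (!sumptr)
		return -ENOMEM;
}
err = jffs2_fill_scan_buf(c, sumptr,
			  jeb->offset + c->sector_size - sumlen,
			  sumlen - buf_len);
if (err) {
	if (sumlen > buf_size)				/* free only the owned case */
		kfree(sumptr);
	return err;
}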
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 902a7dd..0e6406c 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -19,7 +19,8 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/mount.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/jffs2.h>
 #include <linux/pagemap.h>
 #include <linux/mtd/super.h>
@@ -44,15 +45,12 @@
 	return &f->vfs_inode;
 }
 
-static void jffs2_i_callback(struct rcu_head *head)
+static void jffs2_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
-}
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 
-static void jffs2_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, jffs2_i_callback);
+	kfree(f->target);
+	kmem_cache_free(jffs2_inode_cachep, f);
 }
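Switching from .destroy_inode plus a private call_rcu() to .free_inode moves the RCU deferral into the VFS: the filesystem is called back after the grace period purely to release memory. That is also why kfree(f->target) lands here while the equivalent block disappears from readinode.c above; freeing the symlink target only after the grace period keeps RCU-walk path lookup from dereferencing freed memory. An illustrative reduction of the VFS side (not a verbatim copy of fs/inode.c):

static void i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	if (inode->i_sb->s_op->free_inode)
		inode->i_sb->s_op->free_inode(inode);	/* e.g. jffs2_free_inode() */
}

/* on the destroy path, the VFS itself now does: */
call_rcu(&inode->i_rcu, i_callback);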
 
 static void jffs2_i_init_once(void *foo)
@@ -101,7 +99,8 @@
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-	cancel_delayed_work_sync(&c->wbuf_dwork);
+	if (jffs2_is_writebuffered(c))
+		cancel_delayed_work_sync(&c->wbuf_dwork);
 #endif
 
 	mutex_lock(&c->alloc_sem);
@@ -159,105 +158,83 @@
 /*
  * JFFS2 mount options.
  *
+ * Opt_source: The source device
  * Opt_override_compr: override default compressor
  * Opt_rp_size: size of reserved pool in KiB
- * Opt_err: just end of array marker
  */
 enum {
 	Opt_override_compr,
 	Opt_rp_size,
-	Opt_err,
 };
 
-static const match_table_t tokens = {
-	{Opt_override_compr, "compr=%s"},
-	{Opt_rp_size, "rp_size=%u"},
-	{Opt_err, NULL},
+static const struct fs_parameter_spec jffs2_param_specs[] = {
+	fsparam_enum	("compr",	Opt_override_compr),
+	fsparam_u32	("rp_size",	Opt_rp_size),
+	{}
 };
 
-static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
-{
-	substring_t args[MAX_OPT_ARGS];
-	char *p, *name;
-	unsigned int opt;
-
-	if (!data)
-		return 0;
-
-	while ((p = strsep(&data, ","))) {
-		int token;
-
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_override_compr:
-			name = match_strdup(&args[0]);
-
-			if (!name)
-				return -ENOMEM;
-			if (!strcmp(name, "none"))
-				c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
+static const struct fs_parameter_enum jffs2_param_enums[] = {
+	{ Opt_override_compr,	"none",	JFFS2_COMPR_MODE_NONE },
 #ifdef CONFIG_JFFS2_LZO
-			else if (!strcmp(name, "lzo"))
-				c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
+	{ Opt_override_compr,	"lzo",	JFFS2_COMPR_MODE_FORCELZO },
 #endif
 #ifdef CONFIG_JFFS2_ZLIB
-			else if (!strcmp(name, "zlib"))
-				c->mount_opts.compr =
-						JFFS2_COMPR_MODE_FORCEZLIB;
+	{ Opt_override_compr,	"zlib",	JFFS2_COMPR_MODE_FORCEZLIB },
 #endif
-			else {
-				pr_err("Error: unknown compressor \"%s\"\n",
-				       name);
-				kfree(name);
-				return -EINVAL;
-			}
-			kfree(name);
-			c->mount_opts.override_compr = true;
-			break;
-		case Opt_rp_size:
-			if (match_int(&args[0], &opt))
-				return -EINVAL;
-			opt *= 1024;
-			if (opt > c->mtd->size) {
-				pr_warn("Too large reserve pool specified, max "
-					"is %llu KB\n", c->mtd->size / 1024);
-				return -EINVAL;
-			}
-			c->mount_opts.rp_size = opt;
-			break;
-		default:
-			pr_err("Error: unrecognized mount option '%s' or missing value\n",
-			       p);
-			return -EINVAL;
-		}
+	{}
+};
+
+const struct fs_parameter_description jffs2_fs_parameters = {
+	.name		= "jffs2",
+	.specs		= jffs2_param_specs,
+	.enums		= jffs2_param_enums,
+};
+
+static int jffs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct fs_parse_result result;
+	struct jffs2_sb_info *c = fc->s_fs_info;
+	int opt;
+
+	opt = fs_parse(fc, &jffs2_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_override_compr:
+		c->mount_opts.compr = result.uint_32;
+		c->mount_opts.override_compr = true;
+		break;
+	case Opt_rp_size:
+		if (result.uint_32 > UINT_MAX / 1024)
+			return invalf(fc, "jffs2: rp_size unrepresentable");
+		opt = result.uint_32 * 1024;
+		if (opt > c->mtd->size)
+			return invalf(fc, "jffs2: Too large reserve pool specified, max is %llu KB",
+				      c->mtd->size / 1024);
+		c->mount_opts.rp_size = opt;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	return 0;
 }
 
-static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
+static int jffs2_reconfigure(struct fs_context *fc)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	int err;
+	struct super_block *sb = fc->root->d_sb;
 
 	sync_filesystem(sb);
-	err = jffs2_parse_options(c, data);
-	if (err)
-		return -EINVAL;
-
-	return jffs2_do_remount_fs(sb, flags, data);
+	return jffs2_do_remount_fs(sb, fc);
 }
 
 static const struct super_operations jffs2_super_operations =
 {
 	.alloc_inode =	jffs2_alloc_inode,
-	.destroy_inode =jffs2_destroy_inode,
+	.free_inode =	jffs2_free_inode,
 	.put_super =	jffs2_put_super,
 	.statfs =	jffs2_statfs,
-	.remount_fs =	jffs2_remount_fs,
 	.evict_inode =	jffs2_evict_inode,
 	.dirty_inode =	jffs2_dirty_inode,
 	.show_options =	jffs2_show_options,
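With .remount_fs dropped from super_operations, remounts reach jffs2_reconfigure() through the fs_context .reconfigure hook, whether initiated by classic mount(2) with MS_REMOUNT or by the new mount-API syscalls. A hypothetical userspace sketch of the latter path (raw syscalls, since libc wrappers may be absent; error handling omitted):

#include <fcntl.h>		/* AT_FDCWD */
#include <linux/mount.h>	/* FSPICK_*, FSCONFIG_* */
#include <sys/syscall.h>
#include <unistd.h>

static int remount_ro(const char *mountpoint)
{
	int fd = syscall(SYS_fspick, AT_FDCWD, mountpoint, FSPICK_CLOEXEC);

	syscall(SYS_fsconfig, fd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
	syscall(SYS_fsconfig, fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0);
	return close(fd);
}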
@@ -267,26 +244,16 @@
 /*
  * fill in the superblock
  */
-static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
+static int jffs2_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct jffs2_sb_info *c;
-	int ret;
+	struct jffs2_sb_info *c = sb->s_fs_info;
 
 	jffs2_dbg(1, "jffs2_get_sb_mtd():"
 		  " New superblock for device %d (\"%s\")\n",
 		  sb->s_mtd->index, sb->s_mtd->name);
 
-	c = kzalloc(sizeof(*c), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
-
 	c->mtd = sb->s_mtd;
 	c->os_priv = sb;
-	sb->s_fs_info = c;
-
-	ret = jffs2_parse_options(c, data);
-	if (ret)
-		return -EINVAL;
 
 	/* Initialize JFFS2 superblock locks, the further initialization will
 	 * be done later */
@@ -304,15 +271,37 @@
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 	sb->s_flags |= SB_POSIXACL;
 #endif
-	ret = jffs2_do_fill_super(sb, data, silent);
-	return ret;
+	return jffs2_do_fill_super(sb, fc);
 }
 
-static struct dentry *jffs2_mount(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data)
+static int jffs2_get_tree(struct fs_context *fc)
 {
-	return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
+	return get_tree_mtd(fc, jffs2_fill_super);
+}
+
+static void jffs2_free_fc(struct fs_context *fc)
+{
+	kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations jffs2_context_ops = {
+	.free		= jffs2_free_fc,
+	.parse_param	= jffs2_parse_param,
+	.get_tree	= jffs2_get_tree,
+	.reconfigure	= jffs2_reconfigure,
+};
+
+static int jffs2_init_fs_context(struct fs_context *fc)
+{
+	struct jffs2_sb_info *ctx;
+
+	ctx = kzalloc(sizeof(struct jffs2_sb_info), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	fc->s_fs_info = ctx;
+	fc->ops = &jffs2_context_ops;
+	return 0;
 }
 
 static void jffs2_put_super (struct super_block *sb)
@@ -349,7 +338,8 @@
 static struct file_system_type jffs2_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"jffs2",
-	.mount =	jffs2_mount,
+	.init_fs_context = jffs2_init_fs_context,
+	.parameters =	&jffs2_fs_parameters,
 	.kill_sb =	jffs2_kill_sb,
 };
 MODULE_ALIAS_FS("jffs2");
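End to end, the conversion lets jffs2 be mounted through fsopen()/fsconfig()/fsmount(), with the errorf()/infof() messages above delivered per-operation instead of via dmesg. A hypothetical userspace sketch; /dev/mtd0 and /mnt/flash are placeholder paths and error handling is omitted:

#include <fcntl.h>		/* AT_FDCWD */
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

static int mount_jffs2(void)
{
	int fsfd = syscall(SYS_fsopen, "jffs2", FSOPEN_CLOEXEC);
	int mfd;

	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/mtd0", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "compr", "lzo", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "rp_size", "128", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt/flash",
		MOVE_MOUNT_F_EMPTY_PATH);
	close(mfd);
	return close(fsfd);
}

Note the parse side above: fsparam_enum() means "compr=lzo" arrives in jffs2_parse_param() already translated to JFFS2_COMPR_MODE_FORCELZO in result.uint_32, with no strcmp() chain.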
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 851de78..05cb0e8 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config JFS_FS
 	tristate "JFS filesystem support"
 	select NLS
 	select CRC32
 	help
 	  This is a port of IBM's Journaled Filesystem.  More information is
-	  available in the file <file:Documentation/filesystems/jfs.txt>.
+	  available in the file <file:Documentation/admin-guide/jfs.rst>.
 
 	  If you do not intend to use the JFS filesystem, say N.
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 2e71b6e..92cc0ac 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines  Corp., 2002-2004
  *   Copyright (C) Andreas Gruenbacher, 2001
  *   Copyright (C) Linus Torvalds, 1991, 1992
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/sched.h>
@@ -117,7 +104,8 @@
 		rc = posix_acl_update_mode(inode, &mode, &acl);
 		if (rc)
 			goto end_tx;
-		update_mode = 1;
+		if (mode != inode->i_mode)
+			update_mode = 1;
 	}
 	rc = __jfs_set_acl(tid, inode, type, acl);
 	if (!rc) {
@@ -146,12 +134,16 @@
 	if (default_acl) {
 		rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
 		posix_acl_release(default_acl);
+	} else {
+		inode->i_default_acl = NULL;
 	}
 
 	if (acl) {
 		if (!rc)
 			rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
 		posix_acl_release(acl);
+	} else {
+		inode->i_acl = NULL;
 	}
 
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
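Two behavioural fixes in this file: jfs_set_acl() now marks the mode for writeback only when posix_acl_update_mode() actually changed it, and jfs_init_acl() caches the absence of an ACL by storing NULL in i_acl/i_default_acl, so later lookups short-circuit instead of hitting the ACL_NOT_CACHED sentinel and re-reading disk every time. An illustrative reduction of the VFS fast path that consumes the cache (refcounting and the cmpxchg dance omitted):

static struct posix_acl *cached_acl_sketch(struct inode *inode, int type)
{
	struct posix_acl *acl = (type == ACL_TYPE_ACCESS) ?
				inode->i_acl : inode->i_default_acl;

	if (acl != ACL_NOT_CACHED)	/* NULL here means "known to have none" */
		return acl;		/* no disk I/O */
	return inode->i_op->get_acl(inode, type);	/* slow path */
}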
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 36665fd..930d270 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/mm.h>
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 054cc76..9486afc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
@@ -31,6 +18,7 @@
 #include "jfs_extent.h"
 #include "jfs_unicode.h"
 #include "jfs_debug.h"
+#include "jfs_dmap.h"
 
 
 struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
@@ -150,6 +138,8 @@
 
 void jfs_evict_inode(struct inode *inode)
 {
+	struct jfs_inode_info *ji = JFS_IP(inode);
+
 	jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
 
 	if (!inode->i_nlink && !is_bad_inode(inode)) {
@@ -166,7 +156,6 @@
 			/*
 			 * Free the inode from the quota allocation.
 			 */
-			dquot_initialize(inode);
 			dquot_free_inode(inode);
 		}
 	} else {
@@ -174,6 +163,16 @@
 	}
 	clear_inode(inode);
 	dquot_drop(inode);
+
+	BUG_ON(!list_empty(&ji->anon_inode_list));
+
+	spin_lock_irq(&ji->ag_lock);
+	if (ji->active_ag != -1) {
+		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+		atomic_dec(&bmap->db_active[ji->active_ag]);
+		ji->active_ag = -1;
+	}
+	spin_unlock_irq(&ji->ag_lock);
 }
 
 void jfs_dirty_inode(struct inode *inode, int flags)
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index ba34dae..10ee0ec 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -98,24 +98,16 @@
 		/* Lock against other parallel changes of flags */
 		inode_lock(inode);
 
-		oldflags = jfs_inode->mode2;
-
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 */
-		if ((oldflags & JFS_IMMUTABLE_FL) ||
-			((flags ^ oldflags) &
-			(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				inode_unlock(inode);
-				err = -EPERM;
-				goto setflags_out;
-			}
+		oldflags = jfs_map_ext2(jfs_inode->mode2 & JFS_FL_USER_VISIBLE,
+					0);
+		err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+		if (err) {
+			inode_unlock(inode);
+			goto setflags_out;
 		}
 
 		flags = flags & JFS_FL_USER_MODIFIABLE;
-		flags |= oldflags & ~JFS_FL_USER_MODIFIABLE;
+		flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE;
 		jfs_inode->mode2 = flags;
 
 		jfs_set_inode_flags(inode);
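vfs_ioc_setflags_prepare() centralizes the policy jfs used to open-code: toggling the IMMUTABLE or APPEND flags requires CAP_LINUX_IMMUTABLE, checked against the old flags. Since the helper speaks the common ext2-style FS_*_FL namespace, the old mode2 bits are first translated through jfs_map_ext2(). The calling pattern, sketched in kernel context:

inode_lock(inode);	/* flag changes must be serialized */
err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
if (err) {
	inode_unlock(inode);
	return err;	/* e.g. -EPERM without CAP_LINUX_IMMUTABLE */
}
/* commit: map flags back to the on-disk format, mark the inode dirty */
inode_unlock(inode);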
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 489f993b..9f8f92d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines  Corp., 2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_ACL
 #define _H_JFS_ACL
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h
index 79c6180..ce055ef 100644
--- a/fs/jfs/jfs_btree.h
+++ b/fs/jfs/jfs_btree.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_BTREE
 #define _H_JFS_BTREE
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 35a5b2a..888cdd6 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 0d9e35d..48e2150 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DEBUG
 #define _H_JFS_DEBUG
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 1682a87..5fa9fd5 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2001
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DINODE
 #define _H_JFS_DINODE
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
index f76ff0a..575cb2b 100644
--- a/fs/jfs/jfs_discard.c
+++ b/fs/jfs/jfs_discard.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) Tino Reichardt, 2012
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_discard.h b/fs/jfs/jfs_discard.h
index 40d1ee6..05215fa 100644
--- a/fs/jfs/jfs_discard.h
+++ b/fs/jfs/jfs_discard.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) Tino Reichardt, 2012
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DISCARD
 #define _H_JFS_DISCARD
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 49263e2..caade18 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Tino Reichardt, 2012
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 562b9a7..29891fa 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_DMAP
 #define _H_JFS_DMAP
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 52bae3f..3acc954 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index fd4169e..1758289 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DTREE
 #define	_H_JFS_DTREE
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 2ae7d59..f65bd6b 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index b567e12..dd635a8 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2001
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_EXTENT
 #define _H_JFS_EXTENT
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index b67d646..1e89929 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2003
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_FILSYS
 #define _H_JFS_FILSYS
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 93e8c59..937ca07 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
index 610a0e9..dd7409f 100644
--- a/fs/jfs/jfs_imap.h
+++ b/fs/jfs/jfs_imap.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_IMAP
 #define _H_JFS_IMAP
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 912a3af..a466ec4 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_INCORE
 #define _H_JFS_INCORE
@@ -23,6 +10,8 @@
 #include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <linux/bitops.h>
+#include <linux/uuid.h>
+
 #include "jfs_types.h"
 #include "jfs_xtree.h"
 #include "jfs_dtree.h"
@@ -178,8 +167,8 @@
 	pxd_t		logpxd;		/* pxd describing log	*/
 	pxd_t		fsckpxd;	/* pxd describing fsck wkspc */
 	pxd_t		ait2;		/* pxd describing AIT copy	*/
-	char		uuid[16];	/* 128-bit uuid for volume	*/
-	char		loguuid[16];	/* 128-bit uuid for log	*/
+	uuid_t		uuid;		/* 128-bit uuid for volume	*/
+	uuid_t		loguuid;	/* 128-bit uuid for log	*/
 	/*
 	 * commit_state is used for synchronization of the jfs_commit
 	 * threads.  It is protected by LAZY_LOCK().
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 4572b7c..4cef170 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 7b0b3a4..70a0d12 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2001
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_INODE
 #define _H_JFS_INODE
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index ecf0488..feb37dd 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2001
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_LOCK
 #define _H_JFS_LOCK
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 6b68df3..9330eff 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
@@ -1092,8 +1079,7 @@
 	mutex_lock(&jfs_log_mutex);
 	list_for_each_entry(log, &jfs_external_logs, journal_list) {
 		if (log->bdev->bd_dev == sbi->logdev) {
-			if (memcmp(log->uuid, sbi->loguuid,
-				   sizeof(log->uuid))) {
+			if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
 				jfs_warn("wrong uuid on JFS journal");
 				mutex_unlock(&jfs_log_mutex);
 				return -EINVAL;
@@ -1130,7 +1116,7 @@
 	}
 
 	log->bdev = bdev;
-	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
+	uuid_copy(&log->uuid, &sbi->loguuid);
 
 	/*
 	 * initialize log:
@@ -1336,7 +1322,7 @@
 			jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x",
 				 log, (unsigned long long)log->base, log->size);
 		} else {
-			if (memcmp(logsuper->uuid, log->uuid, 16)) {
+			if (!uuid_equal(&logsuper->uuid, &log->uuid)) {
 				jfs_warn("wrong uuid on JFS log device");
 				goto errout20;
 			}
@@ -1732,7 +1718,7 @@
 	int i;
 	struct logsuper *logsuper;
 	struct lbuf *bpsuper;
-	char *uuid = sbi->uuid;
+	uuid_t *uuid = &sbi->uuid;
 
 	/*
 	 * insert/remove file system device to log active file system list.
@@ -1743,8 +1729,8 @@
 	logsuper = (struct logsuper *) bpsuper->l_ldata;
 	if (activate) {
 		for (i = 0; i < MAX_ACTIVE; i++)
-			if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
-				memcpy(logsuper->active[i].uuid, uuid, 16);
+			if (uuid_is_null(&logsuper->active[i].uuid)) {
+				uuid_copy(&logsuper->active[i].uuid, uuid);
 				sbi->aggregate = i;
 				break;
 			}
@@ -1755,8 +1741,9 @@
 		}
 	} else {
 		for (i = 0; i < MAX_ACTIVE; i++)
-			if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
-				memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
+			if (uuid_equal(&logsuper->active[i].uuid, uuid)) {
+				uuid_copy(&logsuper->active[i].uuid,
+					  &uuid_null);
 				break;
 			}
 		if (i == MAX_ACTIVE) {
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index e38c215..7fd125c 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -1,24 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_LOGMGR
 #define _H_JFS_LOGMGR
 
+#include <linux/uuid.h>
+
 #include "jfs_filsys.h"
 #include "jfs_lock.h"
 
@@ -73,15 +62,13 @@
 	__le32 state;		/* 4: state - see below */
 
 	__le32 end;		/* 4: addr of last log record set by logredo */
-	char uuid[16];		/* 16: 128-bit journal uuid */
+	uuid_t uuid;		/* 16: 128-bit journal uuid */
 	char label[16];		/* 16: journal label */
 	struct {
-		char uuid[16];
+		uuid_t uuid;
 	} active[MAX_ACTIVE];	/* 2048: active file systems list */
 };
 
-#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
-
 /* log flag: commit option (see jfs_filsys.h) */
 
 /* log state */
@@ -410,7 +397,7 @@
 	spinlock_t synclock;	/* 4: synclist lock */
 	struct lbuf *wqueue;	/* 4: log pageout queue */
 	int count;		/* 4: count */
-	char uuid[16];		/* 16: 128-bit uuid of log device */
+	uuid_t uuid;		/* 16: 128-bit uuid of log device */
 
 	int no_integrity;	/* 3: flag to disable journaling to disk */
 };
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index fa2c682..a2f5338 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2005
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index 8b0ee51..4179f9d 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_METAPAGE
 #define _H_JFS_METAPAGE
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index d865860..eb8b9e2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
@@ -389,8 +376,8 @@
 		sbi->logpxd = j_sb->s_logpxd;
 	else {
 		sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev));
-		memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid));
-		memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid));
+		uuid_copy(&sbi->uuid, &j_sb->s_uuid);
+		uuid_copy(&sbi->loguuid, &j_sb->s_loguuid);
 	}
 	sbi->fsckpxd = j_sb->s_fsckpxd;
 	sbi->ait2 = j_sb->s_ait2;
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 04847b8..93402c4 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -1,23 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2003
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_SUPERBLOCK
 #define _H_JFS_SUPERBLOCK
 
+#include <linux/uuid.h>
+
 /*
  * make the magic number something a human could read
  */
@@ -98,11 +87,9 @@
 	__le64 s_xsize;		/* 8: extendfs s_size */
 	pxd_t s_xfsckpxd;	/* 8: extendfs fsckpxd */
 	pxd_t s_xlogpxd;	/* 8: extendfs logpxd */
-	/* - 128 byte boundary - */
-
-	char s_uuid[16];	/* 16: 128-bit uuid for volume */
+	uuid_t s_uuid;		/* 16: 128-bit uuid for volume */
 	char s_label[16];	/* 16: volume label */
-	char s_loguuid[16];	/* 16: 128-bit uuid for log device */
+	uuid_t s_loguuid;	/* 16: 128-bit uuid for log device */
 
 };
 
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index a5663cb..c8ce7f1 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2005
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
@@ -1928,8 +1915,7 @@
 	 * header ?
 	 */
 	if (tlck->type & tlckTRUNCATE) {
-		/* This odd declaration suppresses a bogus gcc warning */
-		pxd_t pxd = pxd;	/* truncated extent of xad */
+		pxd_t pxd;	/* truncated extent of xad */
 		int twm;
 
 		/*
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index ab72889..ba71eb5 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_TXNMGR
 #define _H_JFS_TXNMGR
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 8f602dc..3ff9f26 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_TYPES
 #define	_H_JFS_TYPES
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37..3e8b13e 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 /*
diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c
index 0148e2e..0c1e902 100644
--- a/fs/jfs/jfs_unicode.c
+++ b/fs/jfs/jfs_unicode.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 8f0f02c..9db62d0 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_UNICODE
 #define _H_JFS_UNICODE
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c
index cfe5066..d0b18c7 100644
--- a/fs/jfs/jfs_uniupr.c
+++ b/fs/jfs/jfs_uniupr.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 561f6af..f0558b3 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #ifndef H_JFS_XATTR
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 2c200b5..16ad920 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2005
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 /*
  *	jfs_xtree.c: extent allocation descriptor B+-tree manager
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 1e09879..5f51be8 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -1,19 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_XTREE
 #define _H_JFS_XTREE
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 14528c0..7a55d14 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
@@ -203,7 +190,7 @@
  * RETURN:	Errors from subroutines
  *
  * note:
- * EACCESS: user needs search+write permission on the parent directory
+ * EACCES: user needs search+write permission on the parent directory
  */
 static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
 {
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7ddcb44..66acea9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines  Corp., 2000-2004
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 09da5cf..b2dc4d1 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines Corp., 2000-2004
  *   Portions Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
@@ -124,27 +111,9 @@
 	return &jfs_inode->vfs_inode;
 }
 
-static void jfs_i_callback(struct rcu_head *head)
+static void jfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	struct jfs_inode_info *ji = JFS_IP(inode);
-	kmem_cache_free(jfs_inode_cachep, ji);
-}
-
-static void jfs_destroy_inode(struct inode *inode)
-{
-	struct jfs_inode_info *ji = JFS_IP(inode);
-
-	BUG_ON(!list_empty(&ji->anon_inode_list));
-
-	spin_lock_irq(&ji->ag_lock);
-	if (ji->active_ag != -1) {
-		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
-		atomic_dec(&bmap->db_active[ji->active_ag]);
-		ji->active_ag = -1;
-	}
-	spin_unlock_irq(&ji->ag_lock);
-	call_rcu(&inode->i_rcu, jfs_i_callback);
+	kmem_cache_free(jfs_inode_cachep, JFS_IP(inode));
 }
 
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -174,9 +143,11 @@
 	buf->f_files = maxinodes;
 	buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
 				    atomic_read(&imap->im_numfree));
-	buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2);
-	buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2,
-					sizeof(sbi->uuid)/2);
+	buf->f_fsid.val[0] = crc32_le(0, (char *)&sbi->uuid,
+				      sizeof(sbi->uuid)/2);
+	buf->f_fsid.val[1] = crc32_le(0,
+				      (char *)&sbi->uuid + sizeof(sbi->uuid)/2,
+				      sizeof(sbi->uuid)/2);
 
 	buf->f_namelen = JFS_NAME_MAX;
 	return 0;
@@ -247,7 +218,7 @@
 	{Opt_resize_nosize, "resize"},
 	{Opt_errors, "errors=%s"},
 	{Opt_ignore, "noquota"},
-	{Opt_ignore, "quota"},
+	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_grpquota, "grpquota"},
 	{Opt_uid, "uid=%u"},
@@ -532,6 +503,8 @@
 
 	sb->s_fs_info = sbi;
 	sb->s_max_links = JFS_LINK_MAX;
+	sb->s_time_min = 0;
+	sb->s_time_max = U32_MAX;
 	sbi->sb = sb;
 	sbi->uid = INVALID_UID;
 	sbi->gid = INVALID_GID;
@@ -912,7 +885,7 @@
 
 static const struct super_operations jfs_super_operations = {
 	.alloc_inode	= jfs_alloc_inode,
-	.destroy_inode	= jfs_destroy_inode,
+	.free_inode	= jfs_free_inode,
 	.dirty_inode	= jfs_dirty_inode,
 	.write_inode	= jfs_write_inode,
 	.evict_inode	= jfs_evict_inode,
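
Note on the jfs hunk above: the jfs_destroy_inode()/jfs_i_callback() pair collapses into ->free_inode because, since the 5.2 VFS rework, the core invokes ->free_inode from its own RCU callback once the inode is unreachable, so filesystems no longer open-code call_rcu(); teardown that needs process context (here the active_ag accounting) belongs in ->evict_inode instead. A minimal sketch of the contract, with hypothetical foofs names:

    /* Sketch only: the ->alloc_inode / ->free_inode split.  The VFS,
     * not the filesystem, defers ->free_inode behind an RCU grace
     * period. */
    struct foofs_inode_info {
    	/* fs-private fields ... */
    	struct inode	vfs_inode;
    };
    #define FOOFS_I(i) container_of(i, struct foofs_inode_info, vfs_inode)

    static struct kmem_cache *foofs_inode_cachep;

    static struct inode *foofs_alloc_inode(struct super_block *sb)
    {
    	struct foofs_inode_info *fi =
    		kmem_cache_alloc(foofs_inode_cachep, GFP_KERNEL);

    	return fi ? &fi->vfs_inode : NULL;
    }

    static void foofs_free_inode(struct inode *inode)
    {
    	/* called from the VFS RCU callback; only free memory here */
    	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
    }

    static const struct super_operations foofs_sops = {
    	.alloc_inode	= foofs_alloc_inode,
    	.free_inode	= foofs_free_inode,
    };
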
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 3832060..a040719 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) Christoph Hellwig, 2001-2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index a679798..db41e78 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *   Copyright (C) International Business Machines  Corp., 2000-2004
  *   Copyright (C) Christoph Hellwig, 2002
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #include <linux/capability.h>
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
index 397b5f7..e7f0910 100644
--- a/fs/kernfs/Kconfig
+++ b/fs/kernfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # KERNFS should be selected by its users
 #
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
index 674337c..4ca54ff 100644
--- a/fs/kernfs/Makefile
+++ b/fs/kernfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the kernfs pseudo filesystem
 #
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c..6ebae6b 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/kernfs/dir.c - kernfs directory implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  */
 
 #include <linux/sched.h>
@@ -138,6 +137,9 @@
 	if (kn_from == kn_to)
 		return strlcpy(buf, "/", buflen);
 
+	if (!buf)
+		return -EINVAL;
+
 	common = kernfs_common_ancestor(kn_from, kn_to);
 	if (WARN_ON(!common))
 		return -EINVAL;
@@ -145,8 +147,7 @@
 	depth_to = kernfs_depth(common, kn_to);
 	depth_from = kernfs_depth(common, kn_from);
 
-	if (buf)
-		buf[0] = '\0';
+	buf[0] = '\0';
 
 	for (i = 0; i < depth_from; i++)
 		len += strlcpy(buf + len, parent_str,
@@ -431,7 +432,6 @@
  */
 void kernfs_put_active(struct kernfs_node *kn)
 {
-	struct kernfs_root *root = kernfs_root(kn);
 	int v;
 
 	if (unlikely(!kn))
@@ -443,7 +443,7 @@
 	if (likely(v != KN_DEACTIVATED_BIAS))
 		return;
 
-	wake_up_all(&root->deactivate_waitq);
+	wake_up_all(&kernfs_root(kn)->deactivate_waitq);
 }
 
 /**
@@ -532,12 +532,9 @@
 	kfree_const(kn->name);
 
 	if (kn->iattr) {
-		if (kn->iattr->ia_secdata)
-			security_release_secctx(kn->iattr->ia_secdata,
-						kn->iattr->ia_secdata_len);
 		simple_xattrs_free(&kn->iattr->xattrs);
+		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
 	}
-	kfree(kn->iattr);
 	spin_lock(&kernfs_idr_lock);
 	idr_remove(&root->ino_idr, kn->id.ino);
 	spin_unlock(&kernfs_idr_lock);
@@ -618,6 +615,7 @@
 }
 
 static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
+					     struct kernfs_node *parent,
 					     const char *name, umode_t mode,
 					     kuid_t uid, kgid_t gid,
 					     unsigned flags)
@@ -650,11 +648,10 @@
 	kn->id.generation = gen;
 
 	/*
-	 * set ino first. This barrier is paired with atomic_inc_not_zero in
+	 * set ino first. This RELEASE is paired with atomic_inc_not_zero in
 	 * kernfs_find_and_get_node_by_ino
 	 */
-	smp_mb__before_atomic();
-	atomic_set(&kn->count, 1);
+	atomic_set_release(&kn->count, 1);
 	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
 	RB_CLEAR_NODE(&kn->rb);
 
@@ -674,6 +671,12 @@
 			goto err_out3;
 	}
 
+	if (parent) {
+		ret = security_kernfs_init_security(parent, kn);
+		if (ret)
+			goto err_out3;
+	}
+
 	return kn;
 
  err_out3:
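
Two hunks up, atomic_set_release() replaces the smp_mb__before_atomic()/atomic_set() pair: a single RELEASE store orders the preceding write of kn->id.ino against publication of the refcount, and it pairs with the atomic_inc_not_zero() in kernfs_find_and_get_node_by_ino(), which is fully ordered when it succeeds. The publish/try-get pattern in generic form (names hypothetical):

    /* Sketch of release-publication paired with inc-not-zero. */
    struct pub_obj {
    	u64		ino;	/* payload, written before going live */
    	atomic_t	count;	/* 0 until published */
    };

    static void pub_obj_publish(struct pub_obj *o, u64 ino)
    {
    	o->ino = ino;
    	/* RELEASE: the ->ino store cannot be reordered past this */
    	atomic_set_release(&o->count, 1);
    }

    static struct pub_obj *pub_obj_tryget(struct pub_obj *o)
    {
    	/* fails while count is still 0; a success orders later loads,
    	 * so the caller is guaranteed to see the published ->ino */
    	return atomic_inc_not_zero(&o->count) ? o : NULL;
    }
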
@@ -692,7 +695,7 @@
 {
 	struct kernfs_node *kn;
 
-	kn = __kernfs_new_node(kernfs_root(parent),
+	kn = __kernfs_new_node(kernfs_root(parent), parent,
 			       name, mode, uid, gid, flags);
 	if (kn) {
 		kernfs_get(parent);
@@ -795,9 +798,8 @@
 	/* Update timestamps on the parent */
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
-		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
-		ktime_get_real_ts64(&ps_iattrs->ia_ctime);
-		ps_iattrs->ia_mtime = ps_iattrs->ia_ctime;
+		ktime_get_real_ts64(&ps_iattr->ia_ctime);
+		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
 
 	mutex_unlock(&kernfs_mutex);
@@ -962,7 +964,7 @@
 	INIT_LIST_HEAD(&root->supers);
 	root->next_generation = 1;
 
-	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+	kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
 			       GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 			       KERNFS_DIR);
 	if (!kn) {
@@ -1329,9 +1331,8 @@
 
 			/* update timestamps on the parent */
 			if (ps_iattr) {
-				ktime_get_real_ts64(&ps_iattr->ia_iattr.ia_ctime);
-				ps_iattr->ia_iattr.ia_mtime =
-					ps_iattr->ia_iattr.ia_ctime;
+				ktime_get_real_ts64(&ps_iattr->ia_ctime);
+				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 			}
 
 			kernfs_put(pos);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf5bc2..e8c792b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/kernfs/file.c - kernfs file implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  */
 
 #include <linux/fs.h>
@@ -832,32 +831,40 @@
  * to see if it supports poll (Neither 'poll' nor 'select' return
  * an appropriate error code).  When in doubt, set a suitable timeout value.
  */
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+	struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+	struct kernfs_open_node *on = kn->attr.open;
+
+	poll_wait(of->file, &on->poll, wait);
+
+	if (of->event != atomic_read(&on->event))
+		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+	return DEFAULT_POLLMASK;
+}
+
 static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
 {
 	struct kernfs_open_file *of = kernfs_of(filp);
 	struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
-	struct kernfs_open_node *on = kn->attr.open;
+	__poll_t ret;
 
 	if (!kernfs_get_active(kn))
-		goto trigger;
+		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
 
-	poll_wait(filp, &on->poll, wait);
+	if (kn->attr.ops->poll)
+		ret = kn->attr.ops->poll(of, wait);
+	else
+		ret = kernfs_generic_poll(of, wait);
 
 	kernfs_put_active(kn);
-
-	if (of->event != atomic_read(&on->event))
-		goto trigger;
-
-	return DEFAULT_POLLMASK;
-
- trigger:
-	return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+	return ret;
 }
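
Factoring the old poll body out as kernfs_generic_poll() lets kernfs users install their own ->poll in kernfs_ops (this was added for the cgroup2 PSI pressure files) while keeping the open-node event check as the default. A hedged sketch of a custom hook layered on the generic behaviour (the foo_* names are hypothetical):

    static __poll_t foo_kernfs_poll(struct kernfs_open_file *of,
    				poll_table *wait)
    {
    	/* subsystem-specific waitqueue registration would go here */

    	/* then reuse kernfs's event-counter notification */
    	return kernfs_generic_poll(of, wait);
    }

    static struct kernfs_ops foo_ops = {
    	.seq_show	= foo_seq_show,	/* assumed defined elsewhere */
    	.poll		= foo_kernfs_poll,
    };
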
 
 static void kernfs_notify_workfn(struct work_struct *work)
 {
 	struct kernfs_node *kn;
-	struct kernfs_open_node *on;
 	struct kernfs_super_info *info;
 repeat:
 	/* pop one off the notify_list */
@@ -871,23 +878,13 @@
 	kn->attr.notify_next = NULL;
 	spin_unlock_irq(&kernfs_notify_lock);
 
-	/* kick poll */
-	spin_lock_irq(&kernfs_open_node_lock);
-
-	on = kn->attr.open;
-	if (on) {
-		atomic_inc(&on->event);
-		wake_up_interruptible(&on->poll);
-	}
-
-	spin_unlock_irq(&kernfs_open_node_lock);
-
 	/* kick fsnotify */
 	mutex_lock(&kernfs_mutex);
 
 	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct kernfs_node *parent;
 		struct inode *inode;
+		struct qstr name;
 
 		/*
 		 * We want fsnotify_modify() on @kn but as the
@@ -899,6 +896,7 @@
 		if (!inode)
 			continue;
 
+		name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
 		parent = kernfs_get_parent(kn);
 		if (parent) {
 			struct inode *p_inode;
@@ -906,7 +904,7 @@
 			p_inode = ilookup(info->sb, parent->id.ino);
 			if (p_inode) {
 				fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
-					 inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
+					 inode, FSNOTIFY_EVENT_INODE, &name, 0);
 				iput(p_inode);
 			}
 
@@ -914,7 +912,7 @@
 		}
 
 		fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
-			 kn->name, 0);
+			 &name, 0);
 		iput(inode);
 	}
 
@@ -934,10 +932,21 @@
 {
 	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
 	unsigned long flags;
+	struct kernfs_open_node *on;
 
 	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
 		return;
 
+	/* kick poll immediately */
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->event);
+		wake_up_interruptible(&on->poll);
+	}
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+
+	/* schedule work to kick fsnotify */
 	spin_lock_irqsave(&kernfs_notify_lock, flags);
 	if (!kn->attr.notify_next) {
 		kernfs_get(kn);
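
Note the split in the two hunks above: waking ->poll waiters needs only a spinlock and is safe from the (possibly atomic) context of the kernfs_notify() caller, so it now runs immediately, while fsnotify delivery, which must sleep on kernfs_mutex, stays behind the workqueue. The shape of that division, sketched with hypothetical names:

    /* Sketch: atomic-safe wakeup inline, sleeping work deferred. */
    struct event_source {
    	spinlock_t		lock;
    	atomic_t		event;
    	wait_queue_head_t	poll_wq;
    	struct work_struct	slow_work;	/* may take mutexes */
    };

    static void event_source_notify(struct event_source *src)
    {
    	unsigned long flags;

    	spin_lock_irqsave(&src->lock, flags);
    	atomic_inc(&src->event);
    	wake_up_interruptible(&src->poll_wq);	/* never sleeps */
    	spin_unlock_irqrestore(&src->lock, flags);

    	schedule_work(&src->slow_work);
    }
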
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 80cebcd..f3eaa88 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/kernfs/inode.c - kernfs inode implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  */
 
 #include <linux/pagemap.h>
@@ -31,30 +30,27 @@
 	.listxattr	= kernfs_iop_listxattr,
 };
 
-static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
 {
 	static DEFINE_MUTEX(iattr_mutex);
 	struct kernfs_iattrs *ret;
-	struct iattr *iattrs;
 
 	mutex_lock(&iattr_mutex);
 
-	if (kn->iattr)
+	if (kn->iattr || !alloc)
 		goto out_unlock;
 
-	kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+	kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
 	if (!kn->iattr)
 		goto out_unlock;
-	iattrs = &kn->iattr->ia_iattr;
 
 	/* assign default attributes */
-	iattrs->ia_mode = kn->mode;
-	iattrs->ia_uid = GLOBAL_ROOT_UID;
-	iattrs->ia_gid = GLOBAL_ROOT_GID;
+	kn->iattr->ia_uid = GLOBAL_ROOT_UID;
+	kn->iattr->ia_gid = GLOBAL_ROOT_GID;
 
-	ktime_get_real_ts64(&iattrs->ia_atime);
-	iattrs->ia_mtime = iattrs->ia_atime;
-	iattrs->ia_ctime = iattrs->ia_atime;
+	ktime_get_real_ts64(&kn->iattr->ia_atime);
+	kn->iattr->ia_mtime = kn->iattr->ia_atime;
+	kn->iattr->ia_ctime = kn->iattr->ia_atime;
 
 	simple_xattrs_init(&kn->iattr->xattrs);
 out_unlock:
@@ -63,32 +59,37 @@
 	return ret;
 }
 
+static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+{
+	return __kernfs_iattrs(kn, 1);
+}
+
+static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
+{
+	return __kernfs_iattrs(kn, 0);
+}
+
 int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
 	struct kernfs_iattrs *attrs;
-	struct iattr *iattrs;
 	unsigned int ia_valid = iattr->ia_valid;
 
 	attrs = kernfs_iattrs(kn);
 	if (!attrs)
 		return -ENOMEM;
 
-	iattrs = &attrs->ia_iattr;
-
 	if (ia_valid & ATTR_UID)
-		iattrs->ia_uid = iattr->ia_uid;
+		attrs->ia_uid = iattr->ia_uid;
 	if (ia_valid & ATTR_GID)
-		iattrs->ia_gid = iattr->ia_gid;
+		attrs->ia_gid = iattr->ia_gid;
 	if (ia_valid & ATTR_ATIME)
-		iattrs->ia_atime = iattr->ia_atime;
+		attrs->ia_atime = iattr->ia_atime;
 	if (ia_valid & ATTR_MTIME)
-		iattrs->ia_mtime = iattr->ia_mtime;
+		attrs->ia_mtime = iattr->ia_mtime;
 	if (ia_valid & ATTR_CTIME)
-		iattrs->ia_ctime = iattr->ia_ctime;
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-		iattrs->ia_mode = kn->mode = mode;
-	}
+		attrs->ia_ctime = iattr->ia_ctime;
+	if (ia_valid & ATTR_MODE)
+		kn->mode = iattr->ia_mode;
 	return 0;
 }
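
__kernfs_iattrs() keeps one implementation behind two wrappers: kernfs_iattrs() allocates the attribute block on demand for setters, while kernfs_iattrs_noalloc() lets read paths such as kernfs_xattr_get() (further down) return -ENODATA rather than allocate memory just to report an empty set. The wrapper pattern in miniature, locking elided (names hypothetical):

    /* One worker, two entry points (sketch). */
    static struct foo_attrs *__foo_attrs(struct foo_node *n, bool alloc)
    {
    	if (!n->attrs && alloc)
    		n->attrs = kzalloc(sizeof(*n->attrs), GFP_KERNEL);
    	return n->attrs;	/* NULL if absent and !alloc, or on OOM */
    }

    static struct foo_attrs *foo_attrs(struct foo_node *n)
    {
    	return __foo_attrs(n, true);	/* write path: allocate */
    }

    static struct foo_attrs *foo_attrs_noalloc(struct foo_node *n)
    {
    	return __foo_attrs(n, false);	/* read path: never allocate */
    }
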
 
@@ -135,23 +136,6 @@
 	return error;
 }
 
-static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata,
-				  u32 *secdata_len)
-{
-	void *old_secdata;
-	size_t old_secdata_len;
-
-	old_secdata = attrs->ia_secdata;
-	old_secdata_len = attrs->ia_secdata_len;
-
-	attrs->ia_secdata = *secdata;
-	attrs->ia_secdata_len = *secdata_len;
-
-	*secdata = old_secdata;
-	*secdata_len = old_secdata_len;
-	return 0;
-}
-
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
 {
 	struct kernfs_node *kn = kernfs_dentry_node(dentry);
@@ -171,14 +155,14 @@
 		inode->i_ctime = current_time(inode);
 }
 
-static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
+static inline void set_inode_attr(struct inode *inode,
+				  struct kernfs_iattrs *attrs)
 {
-	struct super_block *sb = inode->i_sb;
-	inode->i_uid = iattr->ia_uid;
-	inode->i_gid = iattr->ia_gid;
-	inode->i_atime = timespec64_trunc(iattr->ia_atime, sb->s_time_gran);
-	inode->i_mtime = timespec64_trunc(iattr->ia_mtime, sb->s_time_gran);
-	inode->i_ctime = timespec64_trunc(iattr->ia_ctime, sb->s_time_gran);
+	inode->i_uid = attrs->ia_uid;
+	inode->i_gid = attrs->ia_gid;
+	inode->i_atime = timestamp_truncate(attrs->ia_atime, inode);
+	inode->i_mtime = timestamp_truncate(attrs->ia_mtime, inode);
+	inode->i_ctime = timestamp_truncate(attrs->ia_ctime, inode);
 }
 
 static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
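
timespec64_trunc() only rounded to the superblock's s_time_gran; its replacement timestamp_truncate() additionally clamps to the [s_time_min, s_time_max] range that filesystems now declare (the jfs hunk earlier in this patch sets 0..U32_MAX). Roughly, and not the exact fs/inode.c body:

    /* Approximate semantics of timestamp_truncate() (sketch). */
    static struct timespec64 timestamp_truncate_sketch(struct timespec64 t,
    						   struct inode *inode)
    {
    	struct super_block *sb = inode->i_sb;

    	/* clamp to the filesystem's representable range ... */
    	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);

    	/* ... then round down to the timestamp granularity */
    	if (sb->s_time_gran > 1)
    		t.tv_nsec -= t.tv_nsec % sb->s_time_gran;
    	return t;
    }
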
@@ -186,15 +170,12 @@
 	struct kernfs_iattrs *attrs = kn->iattr;
 
 	inode->i_mode = kn->mode;
-	if (attrs) {
+	if (attrs)
 		/*
 		 * kernfs_node has non-default attributes get them from
 		 * persistent copy in kernfs_node.
 		 */
-		set_inode_attr(inode, &attrs->ia_iattr);
-		security_inode_notifysecctx(inode, attrs->ia_secdata,
-					    attrs->ia_secdata_len);
-	}
+		set_inode_attr(inode, attrs);
 
 	if (kernfs_type(kn) == KERNFS_DIR)
 		set_nlink(inode, kn->dir.subdirs + 2);
@@ -305,78 +286,57 @@
 	return generic_permission(inode, mask);
 }
 
-static int kernfs_xattr_get(const struct xattr_handler *handler,
-			    struct dentry *unused, struct inode *inode,
-			    const char *suffix, void *value, size_t size)
+int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
+		     void *value, size_t size)
 {
-	const char *name = xattr_full_name(handler, suffix);
-	struct kernfs_node *kn = inode->i_private;
-	struct kernfs_iattrs *attrs;
-
-	attrs = kernfs_iattrs(kn);
+	struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
 	if (!attrs)
-		return -ENOMEM;
+		return -ENODATA;
 
 	return simple_xattr_get(&attrs->xattrs, name, value, size);
 }
 
-static int kernfs_xattr_set(const struct xattr_handler *handler,
-			    struct dentry *unused, struct inode *inode,
-			    const char *suffix, const void *value,
-			    size_t size, int flags)
+int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
+		     const void *value, size_t size, int flags)
 {
-	const char *name = xattr_full_name(handler, suffix);
-	struct kernfs_node *kn = inode->i_private;
-	struct kernfs_iattrs *attrs;
-
-	attrs = kernfs_iattrs(kn);
+	struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
 	if (!attrs)
 		return -ENOMEM;
 
 	return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
 }
 
+static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
+				struct dentry *unused, struct inode *inode,
+				const char *suffix, void *value, size_t size)
+{
+	const char *name = xattr_full_name(handler, suffix);
+	struct kernfs_node *kn = inode->i_private;
+
+	return kernfs_xattr_get(kn, name, value, size);
+}
+
+static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
+				struct dentry *unused, struct inode *inode,
+				const char *suffix, const void *value,
+				size_t size, int flags)
+{
+	const char *name = xattr_full_name(handler, suffix);
+	struct kernfs_node *kn = inode->i_private;
+
+	return kernfs_xattr_set(kn, name, value, size, flags);
+}
+
 static const struct xattr_handler kernfs_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
-	.get = kernfs_xattr_get,
-	.set = kernfs_xattr_set,
+	.get = kernfs_vfs_xattr_get,
+	.set = kernfs_vfs_xattr_set,
 };
 
-static int kernfs_security_xattr_set(const struct xattr_handler *handler,
-				     struct dentry *unused, struct inode *inode,
-				     const char *suffix, const void *value,
-				     size_t size, int flags)
-{
-	struct kernfs_node *kn = inode->i_private;
-	struct kernfs_iattrs *attrs;
-	void *secdata;
-	u32 secdata_len = 0;
-	int error;
-
-	attrs = kernfs_iattrs(kn);
-	if (!attrs)
-		return -ENOMEM;
-
-	error = security_inode_setsecurity(inode, suffix, value, size, flags);
-	if (error)
-		return error;
-	error = security_inode_getsecctx(inode, &secdata, &secdata_len);
-	if (error)
-		return error;
-
-	mutex_lock(&kernfs_mutex);
-	error = kernfs_node_setsecdata(attrs, &secdata, &secdata_len);
-	mutex_unlock(&kernfs_mutex);
-
-	if (secdata)
-		security_release_secctx(secdata, secdata_len);
-	return error;
-}
-
 static const struct xattr_handler kernfs_security_xattr_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
-	.get = kernfs_xattr_get,
-	.set = kernfs_security_xattr_set,
+	.get = kernfs_vfs_xattr_get,
+	.set = kernfs_vfs_xattr_set,
 };
 
 const struct xattr_handler *kernfs_xattr_handlers[] = {
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b11..02ce570 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -1,11 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * fs/kernfs/kernfs-internal.h - kernfs internal header file
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
  */
 
 #ifndef __KERNFS_INTERNAL_H
@@ -17,11 +16,14 @@
 #include <linux/xattr.h>
 
 #include <linux/kernfs.h>
+#include <linux/fs_context.h>
 
 struct kernfs_iattrs {
-	struct iattr		ia_iattr;
-	void			*ia_secdata;
-	u32			ia_secdata_len;
+	kuid_t			ia_uid;
+	kgid_t			ia_gid;
+	struct timespec64	ia_atime;
+	struct timespec64	ia_mtime;
+	struct timespec64	ia_ctime;
 
 	struct simple_xattrs	xattrs;
 };
@@ -78,7 +80,7 @@
 }
 
 extern const struct super_operations kernfs_sops;
-extern struct kmem_cache *kernfs_node_cache;
+extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
 
 /*
  * inode.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index ff2716f..6c12fac 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/kernfs/mount.c - kernfs mount implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  */
 
 #include <linux/fs.h>
@@ -20,17 +19,7 @@
 
 #include "kernfs-internal.h"
 
-struct kmem_cache *kernfs_node_cache;
-
-static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct kernfs_root *root = kernfs_info(sb)->root;
-	struct kernfs_syscall_ops *scops = root->syscall_ops;
-
-	if (scops && scops->remount_fs)
-		return scops->remount_fs(root, flags, data);
-	return 0;
-}
+struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
 
 static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 {
@@ -60,7 +49,6 @@
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= kernfs_evict_inode,
 
-	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
 	.show_path	= kernfs_sop_show_path,
 };
@@ -196,8 +184,10 @@
 		return dentry;
 
 	knparent = find_next_ancestor(kn, NULL);
-	if (WARN_ON(!knparent))
+	if (WARN_ON(!knparent)) {
+		dput(dentry);
 		return ERR_PTR(-EINVAL);
+	}
 
 	do {
 		struct dentry *dtmp;
@@ -206,8 +196,10 @@
 		if (kn == knparent)
 			return dentry;
 		kntmp = find_next_ancestor(kn, knparent);
-		if (WARN_ON(!kntmp))
+		if (WARN_ON(!kntmp)) {
+			dput(dentry);
 			return ERR_PTR(-EINVAL);
+		}
 		dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
 					       strlen(kntmp->name));
 		dput(dentry);
@@ -218,7 +210,7 @@
 	} while (true);
 }
 
-static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct inode *inode;
@@ -229,13 +221,16 @@
 	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
-	sb->s_magic = magic;
+	sb->s_magic = kfc->magic;
 	sb->s_op = &kernfs_sops;
 	sb->s_xattr = kernfs_xattr_handlers;
 	if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
 		sb->s_export_op = &kernfs_export_ops;
 	sb->s_time_gran = 1;
 
+	/* sysfs dentries and inodes don't require IO to create */
+	sb->s_shrink.seeks = 0;
+
 	/* get root inode, initialize and unlock it */
 	mutex_lock(&kernfs_mutex);
 	inode = kernfs_get_inode(sb, info->root->kn);
@@ -256,21 +251,20 @@
 	return 0;
 }
 
-static int kernfs_test_super(struct super_block *sb, void *data)
+static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct kernfs_super_info *sb_info = kernfs_info(sb);
-	struct kernfs_super_info *info = data;
+	struct kernfs_super_info *info = fc->s_fs_info;
 
 	return sb_info->root == info->root && sb_info->ns == info->ns;
 }
 
-static int kernfs_set_super(struct super_block *sb, void *data)
+static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
 {
-	int error;
-	error = set_anon_super(sb, data);
-	if (!error)
-		sb->s_fs_info = data;
-	return error;
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	kfc->ns_tag = NULL;
+	return set_anon_super_fc(sb, fc);
 }
 
 /**
@@ -287,63 +281,60 @@
 }
 
 /**
- * kernfs_mount_ns - kernfs mount helper
- * @fs_type: file_system_type of the fs being mounted
- * @flags: mount flags specified for the mount
- * @root: kernfs_root of the hierarchy being mounted
- * @magic: file system specific magic number
- * @new_sb_created: tell the caller if we allocated a new superblock
- * @ns: optional namespace tag of the mount
+ * kernfs_get_tree - kernfs filesystem access/retrieval helper
+ * @fc: The filesystem context.
  *
- * This is to be called from each kernfs user's file_system_type->mount()
- * implementation, which should pass through the specified @fs_type and
- * @flags, and specify the hierarchy and namespace tag to mount via @root
- * and @ns, respectively.
- *
- * The return value can be passed to the vfs layer verbatim.
+ * This is to be called from each kernfs user's fs_context->ops->get_tree()
+ * implementation, which should set the specified ->@fs_type and ->@flags, and
+ * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
+ * respectively.
  */
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-				struct kernfs_root *root, unsigned long magic,
-				bool *new_sb_created, const void *ns)
+int kernfs_get_tree(struct fs_context *fc)
 {
+	struct kernfs_fs_context *kfc = fc->fs_private;
 	struct super_block *sb;
 	struct kernfs_super_info *info;
 	int error;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	info->root = root;
-	info->ns = ns;
+	info->root = kfc->root;
+	info->ns = kfc->ns_tag;
 	INIT_LIST_HEAD(&info->node);
 
-	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
-			 &init_user_ns, info);
-	if (IS_ERR(sb) || sb->s_fs_info != info)
-		kfree(info);
+	fc->s_fs_info = info;
+	sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
 	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (new_sb_created)
-		*new_sb_created = !sb->s_root;
+		return PTR_ERR(sb);
 
 	if (!sb->s_root) {
 		struct kernfs_super_info *info = kernfs_info(sb);
 
-		error = kernfs_fill_super(sb, magic);
+		kfc->new_sb_created = true;
+
+		error = kernfs_fill_super(sb, kfc);
 		if (error) {
 			deactivate_locked_super(sb);
-			return ERR_PTR(error);
+			return error;
 		}
 		sb->s_flags |= SB_ACTIVE;
 
 		mutex_lock(&kernfs_mutex);
-		list_add(&info->node, &root->supers);
+		list_add(&info->node, &info->root->supers);
 		mutex_unlock(&kernfs_mutex);
 	}
 
-	return dget(sb->s_root);
+	fc->root = dget(sb->s_root);
+	return 0;
+}
+
+void kernfs_free_fs_context(struct fs_context *fc)
+{
+	/* Note that we don't deal with kfc->ns_tag here. */
+	kfree(fc->s_fs_info);
+	fc->s_fs_info = NULL;
 }
 
 /**
@@ -370,36 +361,6 @@
 	kfree(info);
 }
 
-/**
- * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
- * @kernfs_root: the kernfs_root in question
- * @ns: the namespace tag
- *
- * Pin the superblock so the superblock won't be destroyed in subsequent
- * operations.  This can be used to block ->kill_sb() which may be useful
- * for kernfs users which dynamically manage superblocks.
- *
- * Returns NULL if there's no superblock associated to this kernfs_root, or
- * -EINVAL if the superblock is being freed.
- */
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
-{
-	struct kernfs_super_info *info;
-	struct super_block *sb = NULL;
-
-	mutex_lock(&kernfs_mutex);
-	list_for_each_entry(info, &root->supers, node) {
-		if (info->ns == ns) {
-			sb = info->sb;
-			if (!atomic_inc_not_zero(&info->sb->s_active))
-				sb = ERR_PTR(-EINVAL);
-			break;
-		}
-	}
-	mutex_unlock(&kernfs_mutex);
-	return sb;
-}
-
 void __init kernfs_init(void)
 {
 
@@ -414,4 +375,9 @@
 					      0,
 					      SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
 					      NULL);
+
+	/* Creates slab cache for kernfs inode attributes */
+	kernfs_iattrs_cache  = kmem_cache_create("kernfs_iattrs_cache",
+					      sizeof(struct kernfs_iattrs),
+					      0, SLAB_PANIC, NULL);
 }
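
The mount.c rework above moves kernfs from the mount_ns()-era helpers onto the new mount API: users allocate a kernfs_fs_context in their ->init_fs_context, and kernfs_get_tree() plus sget_fc() replace kernfs_mount_ns() and sget_userns(). A hedged sketch of the caller side (the foofs names and FOOFS_MAGIC are hypothetical; sysfs and cgroup follow this shape):

    static void foofs_fc_free(struct fs_context *fc)
    {
    	kernfs_free_fs_context(fc);	/* drops fc->s_fs_info */
    	kfree(fc->fs_private);		/* our kernfs_fs_context */
    }

    static const struct fs_context_operations foofs_fc_ops = {
    	.get_tree	= kernfs_get_tree,
    	.free		= foofs_fc_free,
    };

    static int foofs_init_fs_context(struct fs_context *fc)
    {
    	struct kernfs_fs_context *kfc;

    	kfc = kzalloc(sizeof(*kfc), GFP_KERNEL);
    	if (!kfc)
    		return -ENOMEM;
    	kfc->root = foofs_root;		/* assumed global kernfs_root */
    	kfc->magic = FOOFS_MAGIC;
    	fc->fs_private = kfc;
    	fc->ops = &foofs_fc_ops;
    	return 0;
    }
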
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 305b220..5432883 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/kernfs/symlink.c - kernfs symlink implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  */
 
 #include <linux/fs.h>
@@ -33,8 +32,8 @@
 	kgid_t gid = GLOBAL_ROOT_GID;
 
 	if (target->iattr) {
-		uid = target->iattr->ia_iattr.ia_uid;
-		gid = target->iattr->ia_iattr.ia_gid;
+		uid = target->iattr->ia_uid;
+		gid = target->iattr->ia_gid;
 	}
 
 	kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid,
@@ -72,6 +71,9 @@
 		if (base == kn)
 			break;
 
+		if ((s - path) + 3 >= PATH_MAX)
+			return -ENAMETOOLONG;
+
 		strcpy(s, "../");
 		s += 3;
 		base = base->parent;
@@ -88,7 +90,7 @@
 	if (len < 2)
 		return -EINVAL;
 	len--;
-	if ((s - path) + len > PATH_MAX)
+	if ((s - path) + len >= PATH_MAX)
 		return -ENAMETOOLONG;
 
 	/* reverse fillup of target string from target to base */
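
Both bounds checks in this function now reserve a byte for the terminating NUL: writing up to index PATH_MAX - 1 must still leave room for '\0', so the test is `>=`, and the `../` loop gains a check it previously lacked. The invariant as a tiny helper (sketch):

    /* Append frag (plus its NUL) into a PATH_MAX-sized buffer. */
    static int append_frag(char *buf, size_t *used, const char *frag)
    {
    	size_t n = strlen(frag);

    	if (*used + n >= PATH_MAX)	/* reserve room for '\0' */
    		return -ENAMETOOLONG;
    	memcpy(buf + *used, frag, n + 1);
    	*used += n;
    	return 0;
    }
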
diff --git a/fs/libfs.c b/fs/libfs.c
index 0fb590d..1463b03 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *	fs/libfs.c
  *	Library for filesystems writers.
@@ -16,6 +17,8 @@
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
+#include <linux/fs_context.h>
+#include <linux/pseudo_fs.h>
 
 #include <linux/uaccess.h>
 
@@ -86,58 +89,45 @@
 EXPORT_SYMBOL(dcache_dir_close);
 
 /* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
-				    struct list_head *from,
-				    int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+					struct list_head *p,
+					loff_t count,
+					struct dentry *last)
 {
-	unsigned *seq = &parent->d_inode->i_dir_seq, n;
-	struct dentry *res;
-	struct list_head *p;
-	bool skipped;
-	int i;
+	struct dentry *dentry = cursor->d_parent, *found = NULL;
 
-retry:
-	i = count;
-	skipped = false;
-	n = smp_load_acquire(seq) & ~1;
-	res = NULL;
-	rcu_read_lock();
-	for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+	spin_lock(&dentry->d_lock);
+	while ((p = p->next) != &dentry->d_subdirs) {
 		struct dentry *d = list_entry(p, struct dentry, d_child);
-		if (!simple_positive(d)) {
-			skipped = true;
-		} else if (!--i) {
-			res = d;
-			break;
+		// we must at least skip cursors, to avoid livelocks
+		if (d->d_flags & DCACHE_DENTRY_CURSOR)
+			continue;
+		if (simple_positive(d) && !--count) {
+			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+			if (simple_positive(d))
+				found = dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			if (likely(found))
+				break;
+			count = 1;
+		}
+		if (need_resched()) {
+			list_move(&cursor->d_child, p);
+			p = &cursor->d_child;
+			spin_unlock(&dentry->d_lock);
+			cond_resched();
+			spin_lock(&dentry->d_lock);
 		}
 	}
-	rcu_read_unlock();
-	if (skipped) {
-		smp_rmb();
-		if (unlikely(*seq != n))
-			goto retry;
-	}
-	return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
-	struct dentry *parent = cursor->d_parent;
-	unsigned n, *seq = &parent->d_inode->i_dir_seq;
-	spin_lock(&parent->d_lock);
-	for (;;) {
-		n = *seq;
-		if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
-			break;
-		cpu_relax();
-	}
-	__list_del(cursor->d_child.prev, cursor->d_child.next);
-	if (after)
-		list_add(&cursor->d_child, after);
-	else
-		list_add_tail(&cursor->d_child, &parent->d_subdirs);
-	smp_store_release(seq, n + 2);
-	spin_unlock(&parent->d_lock);
+	spin_unlock(&dentry->d_lock);
+	dput(last);
+	return found;
 }
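
The lockless next_positive()/move_cursor() scheme built on i_dir_seq retries is gone; scan_positives() walks the child list under d_lock, skips other readers' cursors to avoid livelock, and when need_resched() fires it parks its own cursor at the current position, drops the lock, reschedules, and resumes from the cursor. That parking trick in generic form (hypothetical types; visit() must not sleep):

    /* Sketch: resumable scan of a spinlock-protected list. */
    static void scan_list(struct list_head *head, struct list_head *cursor,
    		      spinlock_t *lock, void (*visit)(struct list_head *))
    {
    	struct list_head *p = head;

    	spin_lock(lock);
    	while ((p = p->next) != head) {
    		if (p == cursor)	/* skip our own cursor node */
    			continue;
    		visit(p);
    		if (need_resched()) {
    			/* park the cursor so we can resume here */
    			list_move(cursor, p);
    			p = cursor;
    			spin_unlock(lock);
    			cond_resched();
    			spin_lock(lock);
    		}
    	}
    	spin_unlock(lock);
    }
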
 
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -146,24 +136,34 @@
 	switch (whence) {
 		case 1:
 			offset += file->f_pos;
+			/* fall through */
 		case 0:
 			if (offset >= 0)
 				break;
+			/* fall through */
 		default:
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		if (file->f_pos >= 2) {
-			struct dentry *cursor = file->private_data;
-			struct dentry *to;
-			loff_t n = file->f_pos - 2;
+		struct dentry *cursor = file->private_data;
+		struct dentry *to = NULL;
 
-			inode_lock_shared(dentry->d_inode);
-			to = next_positive(dentry, &dentry->d_subdirs, n);
-			move_cursor(cursor, to ? &to->d_child : NULL);
-			inode_unlock_shared(dentry->d_inode);
-		}
+		inode_lock_shared(dentry->d_inode);
+
+		if (offset > 2)
+			to = scan_positives(cursor, &dentry->d_subdirs,
+					    offset - 2, NULL);
+		spin_lock(&dentry->d_lock);
+		if (to)
+			list_move(&cursor->d_child, &to->d_child);
+		else
+			list_del_init(&cursor->d_child);
+		spin_unlock(&dentry->d_lock);
+		dput(to);
+
+		file->f_pos = offset;
+
+		inode_unlock_shared(dentry->d_inode);
 	}
 	return offset;
 }
@@ -185,25 +185,35 @@
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct dentry *cursor = file->private_data;
-	struct list_head *p = &cursor->d_child;
-	struct dentry *next;
-	bool moved = false;
+	struct list_head *anchor = &dentry->d_subdirs;
+	struct dentry *next = NULL;
+	struct list_head *p;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
 	if (ctx->pos == 2)
-		p = &dentry->d_subdirs;
-	while ((next = next_positive(dentry, p, 1)) != NULL) {
+		p = anchor;
+	else if (!list_empty(&cursor->d_child))
+		p = &cursor->d_child;
+	else
+		return 0;
+
+	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
 		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
 			      d_inode(next)->i_ino, dt_type(d_inode(next))))
 			break;
-		moved = true;
-		p = &next->d_child;
 		ctx->pos++;
+		p = &next->d_child;
 	}
-	if (moved)
-		move_cursor(cursor, p);
+	spin_lock(&dentry->d_lock);
+	if (next)
+		list_move_tail(&cursor->d_child, &next->d_child);
+	else
+		list_del_init(&cursor->d_child);
+	spin_unlock(&dentry->d_lock);
+	dput(next);
+
 	return 0;
 }
 EXPORT_SYMBOL(dcache_readdir);
@@ -233,34 +243,22 @@
 	.statfs		= simple_statfs,
 };
 
-/*
- * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
- * will never be mountable)
- */
-struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
-	const struct super_operations *ops, const struct xattr_handler **xattr,
-	const struct dentry_operations *dops, unsigned long magic)
+static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 {
-	struct super_block *s;
-	struct dentry *dentry;
+	struct pseudo_fs_context *ctx = fc->fs_private;
 	struct inode *root;
-	struct qstr d_name = QSTR_INIT(name, strlen(name));
-
-	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
-			&init_user_ns, NULL);
-	if (IS_ERR(s))
-		return ERR_CAST(s);
 
 	s->s_maxbytes = MAX_LFS_FILESIZE;
 	s->s_blocksize = PAGE_SIZE;
 	s->s_blocksize_bits = PAGE_SHIFT;
-	s->s_magic = magic;
-	s->s_op = ops ? ops : &simple_super_operations;
-	s->s_xattr = xattr;
+	s->s_magic = ctx->magic;
+	s->s_op = ctx->ops ?: &simple_super_operations;
+	s->s_xattr = ctx->xattr;
 	s->s_time_gran = 1;
 	root = new_inode(s);
 	if (!root)
-		goto Enomem;
+		return -ENOMEM;
+
 	/*
 	 * since this is the first inode, make it number 1. New inodes created
 	 * after this must take care not to collide with it (by passing
@@ -269,22 +267,48 @@
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
 	root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
-	dentry = __d_alloc(s, &d_name);
-	if (!dentry) {
-		iput(root);
-		goto Enomem;
-	}
-	d_instantiate(dentry, root);
-	s->s_root = dentry;
-	s->s_d_op = dops;
-	s->s_flags |= SB_ACTIVE;
-	return dget(s->s_root);
-
-Enomem:
-	deactivate_locked_super(s);
-	return ERR_PTR(-ENOMEM);
+	s->s_root = d_make_root(root);
+	if (!s->s_root)
+		return -ENOMEM;
+	s->s_d_op = ctx->dops;
+	return 0;
 }
-EXPORT_SYMBOL(mount_pseudo_xattr);
+
+static int pseudo_fs_get_tree(struct fs_context *fc)
+{
+	return get_tree_nodev(fc, pseudo_fs_fill_super);
+}
+
+static void pseudo_fs_free(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations pseudo_fs_context_ops = {
+	.free		= pseudo_fs_free,
+	.get_tree	= pseudo_fs_get_tree,
+};
+
+/*
+ * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
+ * will never be mountable)
+ */
+struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
+					unsigned long magic)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
+	if (likely(ctx)) {
+		ctx->magic = magic;
+		fc->fs_private = ctx;
+		fc->ops = &pseudo_fs_context_ops;
+		fc->sb_flags |= SB_NOUSER;
+		fc->global = true;
+	}
+	return ctx;
+}
+EXPORT_SYMBOL(init_pseudo);
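
Callers of the old mount_pseudo_xattr() now provide an ->init_fs_context that calls init_pseudo() and then overrides ops, dops or xattr on the returned context; d_make_root() likewise replaces the hand-rolled __d_alloc()/d_instantiate() sequence above. Typical caller shape (the mydrv names and MYDRV_MAGIC are hypothetical; sockfs, pipefs and friends do this):

    static int mydrv_init_fs_context(struct fs_context *fc)
    {
    	struct pseudo_fs_context *ctx = init_pseudo(fc, MYDRV_MAGIC);

    	if (!ctx)
    		return -ENOMEM;
    	ctx->ops = &mydrv_sops;		/* optional overrides, */
    	ctx->dops = &mydrv_dops;	/* assumed defined elsewhere */
    	return 0;
    }

    static struct file_system_type mydrv_fs_type = {
    	.name		 = "mydrv",
    	.init_fs_context = mydrv_init_fs_context,
    	.kill_sb	 = kill_anon_super,
    };
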
 
 int simple_open(struct inode *inode, struct file *file)
 {
@@ -449,8 +473,7 @@
 
 /**
  * simple_write_end - .write_end helper for non-block-device FSes
- * @available: See .write_end of address_space_operations
- * @file: 		"
+ * @file: See .write_end of address_space_operations
  * @mapping: 		"
  * @pos: 		"
  * @len: 		"
@@ -1167,6 +1190,20 @@
 }
 EXPORT_SYMBOL(simple_nosetlease);
 
+/**
+ * simple_get_link - generic helper to get the target of "fast" symlinks
+ * @dentry: not used here
+ * @inode: the symlink inode
+ * @done: not used here
+ *
+ * Generic helper for filesystems to use for symlink inodes where a pointer to
+ * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
+ * since as an optimization the path lookup code uses any non-NULL ->i_link
+ * directly, without calling ->get_link().  But ->get_link() still must be set,
+ * to mark the inode_operations as being for a symlink.
+ *
+ * Return: the symlink target
+ */
 const char *simple_get_link(struct dentry *dentry, struct inode *inode,
 			    struct delayed_call *done)
 {
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 00d5ef5..7df6324 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -75,17 +75,6 @@
 }
 
 /*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
-	dprintk("lockd: %s prematurely hit the end of our receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
-}
-
-
-/*
  * Encode/decode NLMv4 basic data types
  *
  * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
@@ -128,24 +117,14 @@
 static int decode_netobj(struct xdr_stream *xdr,
 			 struct xdr_netobj *obj)
 {
-	u32 length;
-	__be32 *p;
+	ssize_t ret;
 
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
-	length = be32_to_cpup(p++);
-	if (unlikely(length > XDR_MAX_NETOBJ))
-		goto out_size;
-	obj->len = length;
-	obj->data = (u8 *)p;
+	ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+						XDR_MAX_NETOBJ);
+	if (unlikely(ret < 0))
+		return -EIO;
+	obj->len = ret;
 	return 0;
-out_size:
-	dprintk("NFS: returned netobj was too long: %u\n", length);
-	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
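
xdr_stream_decode_opaque_inline() subsumes the open-coded length read, the XDR_MAX_NETOBJ bound check and the overflow handling: it returns the opaque's length and points the supplied pointer at the in-buffer data, or a negative value when the receive buffer is short or the object too large, which is why the out_size/out_overflow labels and print_overflow_msg() disappear. The resulting call pattern, as used above:

    /* Sketch: decode a bounded XDR opaque via the stream helper. */
    static int decode_bounded_netobj(struct xdr_stream *xdr,
    				 struct xdr_netobj *obj)
    {
    	ssize_t len = xdr_stream_decode_opaque_inline(xdr,
    				(void *)&obj->data, XDR_MAX_NETOBJ);

    	if (unlikely(len < 0))
    		return -EIO;	/* truncated or oversized netobj */
    	obj->len = len;
    	return 0;
    }
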
 
 /*
@@ -186,7 +165,6 @@
 	dprintk("NFS: returned cookie was too long: %u\n", length);
 	return -EIO;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -246,7 +224,6 @@
 			__func__, be32_to_cpup(p));
 	return -EIO;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -319,7 +296,6 @@
 out:
 	return error;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index c2a1286..a5bb3f7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/lockd/clntlock.c
  *
@@ -56,14 +57,14 @@
 	u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
 	int status;
 
-	status = lockd_up(nlm_init->net);
+	status = lockd_up(nlm_init->net, nlm_init->cred);
 	if (status < 0)
 		return ERR_PTR(status);
 
 	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
 				   nlm_init->protocol, nlm_version,
 				   nlm_init->hostname, nlm_init->noresvport,
-				   nlm_init->net);
+				   nlm_init->net, nlm_init->cred);
 	if (host == NULL)
 		goto out_nohost;
 	if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
@@ -241,7 +242,7 @@
 	allow_signal(SIGKILL);
 
 	down_write(&host->h_rwsem);
-	lockd_up(net);	/* note: this cannot fail as lockd is already running */
+	lockd_up(net, NULL);	/* note: this cannot fail as lockd is already running */
 
 	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index d20b92f..b11f2af 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/lockd/clntproc.c
  *
@@ -46,13 +47,14 @@
 	c->len=4;
 }
 
-static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
+static struct nlm_lockowner *
+nlmclnt_get_lockowner(struct nlm_lockowner *lockowner)
 {
 	refcount_inc(&lockowner->count);
 	return lockowner;
 }
 
-static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
+static void nlmclnt_put_lockowner(struct nlm_lockowner *lockowner)
 {
 	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
 		return;
@@ -81,28 +83,28 @@
 	return res;
 }
 
-static struct nlm_lockowner *__nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner)
+static struct nlm_lockowner *__nlmclnt_find_lockowner(struct nlm_host *host, fl_owner_t owner)
 {
 	struct nlm_lockowner *lockowner;
 	list_for_each_entry(lockowner, &host->h_lockowners, list) {
 		if (lockowner->owner != owner)
 			continue;
-		return nlm_get_lockowner(lockowner);
+		return nlmclnt_get_lockowner(lockowner);
 	}
 	return NULL;
 }
 
-static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner)
+static struct nlm_lockowner *nlmclnt_find_lockowner(struct nlm_host *host, fl_owner_t owner)
 {
 	struct nlm_lockowner *res, *new = NULL;
 
 	spin_lock(&host->h_lock);
-	res = __nlm_find_lockowner(host, owner);
+	res = __nlmclnt_find_lockowner(host, owner);
 	if (res == NULL) {
 		spin_unlock(&host->h_lock);
 		new = kmalloc(sizeof(*new), GFP_KERNEL);
 		spin_lock(&host->h_lock);
-		res = __nlm_find_lockowner(host, owner);
+		res = __nlmclnt_find_lockowner(host, owner);
 		if (res == NULL && new != NULL) {
 			res = new;
 			refcount_set(&new->count, 1);
@@ -256,7 +258,7 @@
  * Generic NLM call
  */
 static int
-nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
+nlmclnt_call(const struct cred *cred, struct nlm_rqst *req, u32 proc)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
@@ -401,7 +403,7 @@
  *      completion in order to be able to correctly track the lock
  *      state.
  */
-static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+static int nlmclnt_async_call(const struct cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct rpc_message msg = {
 		.rpc_argp	= &req->a_args,
@@ -442,7 +444,7 @@
 			fl->fl_start = req->a_res.lock.fl.fl_start;
 			fl->fl_end = req->a_res.lock.fl.fl_end;
 			fl->fl_type = req->a_res.lock.fl.fl_type;
-			fl->fl_pid = 0;
+			fl->fl_pid = -req->a_res.lock.fl.fl_pid;
 			break;
 		default:
 			status = nlm_stat_to_errno(req->a_res.status);
@@ -456,7 +458,7 @@
 {
 	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
-	new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
+	new->fl_u.nfs_fl.owner = nlmclnt_get_lockowner(fl->fl_u.nfs_fl.owner);
 	list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
 	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 }
@@ -466,7 +468,7 @@
 	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	list_del(&fl->fl_u.nfs_fl.list);
 	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
-	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
+	nlmclnt_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
 static const struct file_lock_operations nlmclnt_lock_ops = {
@@ -477,7 +479,7 @@
 static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
 {
 	fl->fl_u.nfs_fl.state = 0;
-	fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
+	fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner);
 	INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
 	fl->fl_ops = &nlmclnt_lock_ops;
 }
@@ -510,7 +512,7 @@
 static int
 nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 {
-	struct rpc_cred *cred = nfs_file_cred(fl->fl_file);
+	const struct cred *cred = nfs_file_cred(fl->fl_file);
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	struct nlm_wait *block = NULL;
@@ -715,7 +717,7 @@
 	struct nlm_rqst	*req = data;
 	u32 status = ntohl(req->a_res.status);
 
-	if (RPC_ASSASSINATED(task))
+	if (RPC_SIGNALLED(task))
 		goto die;
 
 	if (task->tk_status < 0) {
@@ -783,7 +785,7 @@
 	struct nlm_rqst	*req = data;
 	u32 status = ntohl(req->a_res.status);
 
-	if (RPC_ASSASSINATED(task))
+	if (RPC_SIGNALLED(task))
 		goto die;
 
 	if (task->tk_status < 0) {
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 2c61763..4df62f6 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -71,17 +71,6 @@
 }
 
 /*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
-	dprintk("lockd: %s prematurely hit the end of our receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
-}
-
-
-/*
  * Encode/decode NLMv3 basic data types
  *
  * Basic NLMv3 data types are not defined in an IETF standards
@@ -125,24 +114,14 @@
 static int decode_netobj(struct xdr_stream *xdr,
 			 struct xdr_netobj *obj)
 {
-	u32 length;
-	__be32 *p;
+	ssize_t ret;
 
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
-	length = be32_to_cpup(p++);
-	if (unlikely(length > XDR_MAX_NETOBJ))
-		goto out_size;
-	obj->len = length;
-	obj->data = (u8 *)p;
+	ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+			XDR_MAX_NETOBJ);
+	if (unlikely(ret < 0))
+		return -EIO;
+	obj->len = ret;
 	return 0;
-out_size:
-	dprintk("NFS: returned netobj was too long: %u\n", length);
-	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -183,7 +162,6 @@
 	dprintk("NFS: returned cookie was too long: %u\n", length);
 	return -EIO;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -241,7 +219,6 @@
 		__func__, be32_to_cpup(p));
 	return -EIO;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -313,7 +290,6 @@
 out:
 	return error;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 93fb7cf..7d46faf 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -60,6 +60,7 @@
 	const size_t		hostname_len;	/* it's length */
 	const int		noresvport;	/* use non-priv port */
 	struct net		*net;		/* network namespace to bind */
+	const struct cred	*cred;
 };
 
 /*
@@ -162,6 +163,7 @@
 	host->h_nsmhandle  = nsm;
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	host->net	   = ni->net;
+	host->h_cred	   = get_cred(ni->cred);
 	strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
 
 out:
@@ -188,6 +190,7 @@
 	clnt = host->h_rpcclnt;
 	if (clnt != NULL)
 		rpc_shutdown_client(clnt);
+	put_cred(host->h_cred);
 	kfree(host);
 
 	ln->nrhosts--;
@@ -202,6 +205,8 @@
  * @version: NLM protocol version
  * @hostname: '\0'-terminated hostname of server
  * @noresvport: 1 if non-privileged port should be used
+ * @net: pointer to net namespace
+ * @cred: pointer to cred
  *
  * Returns an nlm_host structure that matches the passed-in
  * [server address, transport protocol, NLM version, server hostname].
@@ -214,7 +219,8 @@
 				     const u32 version,
 				     const char *hostname,
 				     int noresvport,
-				     struct net *net)
+				     struct net *net,
+				     const struct cred *cred)
 {
 	struct nlm_lookup_host_info ni = {
 		.server		= 0,
@@ -226,6 +232,7 @@
 		.hostname_len	= strlen(hostname),
 		.noresvport	= noresvport,
 		.net		= net,
+		.cred		= cred,
 	};
 	struct hlist_head *chain;
 	struct nlm_host	*host;
@@ -290,12 +297,11 @@
 
 	WARN_ON_ONCE(host->h_server);
 
-	if (refcount_dec_and_test(&host->h_count)) {
+	if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
 		WARN_ON_ONCE(!list_empty(&host->h_lockowners));
 		WARN_ON_ONCE(!list_empty(&host->h_granted));
 		WARN_ON_ONCE(!list_empty(&host->h_reclaim));
 
-		mutex_lock(&nlm_host_mutex);
 		nlm_destroy_host_locked(host);
 		mutex_unlock(&nlm_host_mutex);
 	}
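
The old sequence decremented h_count and only then took nlm_host_mutex, leaving a window in which the host lookup path could find and revive a zero-count host. refcount_dec_and_mutex_lock() closes it: the mutex is acquired atomically with the final decrement, so no lookup can intervene before the host is unlinked. The pattern reduced to essentials (hypothetical names):

    static DEFINE_MUTEX(table_mutex);	/* protects the lookup table */

    struct cached_obj {
    	refcount_t	count;
    	/* ... */
    };

    static void cached_obj_put(struct cached_obj *o)
    {
    	/* true, with table_mutex held, only when count hit zero */
    	if (refcount_dec_and_mutex_lock(&o->count, &table_mutex)) {
    		table_unlink(o);	/* hypothetical */
    		mutex_unlock(&table_mutex);
    		kfree(o);
    	}
    }
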
@@ -459,6 +465,7 @@
 			.authflavor	= RPC_AUTH_UNIX,
 			.flags		= (RPC_CLNT_CREATE_NOPING |
 					   RPC_CLNT_CREATE_AUTOBIND),
+			.cred		= host->h_cred,
 		};
 
 		/*
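A brief aside on the nlm_release_host() change above: refcount_dec_and_mutex_lock() only takes the mutex when the count actually drops to zero, so the final decrement and the teardown become atomic with respect to lookups serialized by the same mutex, closing the window the old decrement-then-lock sequence left open. A hedged sketch of the put-side pattern, with hypothetical types:

/* Kernel-context sketch; 'struct obj' and the lookup table are
 * hypothetical.  When the call returns true, the count is zero and
 * obj_mutex is held, so no lookup can revive the object before it is
 * unhashed. */
struct obj {
	refcount_t	ref;
	/* ... hashed in a table guarded by obj_mutex ... */
};

static DEFINE_MUTEX(obj_mutex);

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_mutex_lock(&o->ref, &obj_mutex)) {
		/* unhash o here; new lookups can no longer find it */
		mutex_unlock(&obj_mutex);
		kfree(o);
	}
}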
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 654594e..1eabd91 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -82,6 +82,7 @@
 		.version		= NSM_VERSION,
 		.authflavor		= RPC_AUTH_NULL,
 		.flags			= RPC_CLNT_CREATE_NOPING,
+		.cred			= current_cred(),
 	};
 
 	return rpc_create(&args);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 346ed16..1a639e3 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/lockd/svc.c
  *
@@ -188,28 +189,31 @@
 
 static int create_lockd_listener(struct svc_serv *serv, const char *name,
 				 struct net *net, const int family,
-				 const unsigned short port)
+				 const unsigned short port,
+				 const struct cred *cred)
 {
 	struct svc_xprt *xprt;
 
 	xprt = svc_find_xprt(serv, name, net, family, 0);
 	if (xprt == NULL)
 		return svc_create_xprt(serv, name, net, family, port,
-						SVC_SOCK_DEFAULTS);
+						SVC_SOCK_DEFAULTS, cred);
 	svc_xprt_put(xprt);
 	return 0;
 }
 
 static int create_lockd_family(struct svc_serv *serv, struct net *net,
-			       const int family)
+			       const int family, const struct cred *cred)
 {
 	int err;
 
-	err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);
+	err = create_lockd_listener(serv, "udp", net, family, nlm_udpport,
+			cred);
 	if (err < 0)
 		return err;
 
-	return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);
+	return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport,
+			cred);
 }
 
 /*
@@ -222,16 +226,17 @@
  * Returns zero if all listeners are available; otherwise a
  * negative errno value is returned.
  */
-static int make_socks(struct svc_serv *serv, struct net *net)
+static int make_socks(struct svc_serv *serv, struct net *net,
+		const struct cred *cred)
 {
 	static int warned;
 	int err;
 
-	err = create_lockd_family(serv, net, PF_INET);
+	err = create_lockd_family(serv, net, PF_INET, cred);
 	if (err < 0)
 		goto out_err;
 
-	err = create_lockd_family(serv, net, PF_INET6);
+	err = create_lockd_family(serv, net, PF_INET6, cred);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_err;
 
@@ -246,7 +251,8 @@
 	return err;
 }
 
-static int lockd_up_net(struct svc_serv *serv, struct net *net)
+static int lockd_up_net(struct svc_serv *serv, struct net *net,
+		const struct cred *cred)
 {
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
 	int error;
@@ -258,7 +264,7 @@
 	if (error)
 		goto err_bind;
 
-	error = make_socks(serv, net);
+	error = make_socks(serv, net, cred);
 	if (error < 0)
 		goto err_bind;
 	set_grace_period(net);
@@ -461,7 +467,7 @@
 /*
  * Bring up the lockd process if it's not already up.
  */
-int lockd_up(struct net *net)
+int lockd_up(struct net *net, const struct cred *cred)
 {
 	struct svc_serv *serv;
 	int error;
@@ -474,7 +480,7 @@
 		goto err_create;
 	}
 
-	error = lockd_up_net(serv, net);
+	error = lockd_up_net(serv, net, cred);
 	if (error < 0) {
 		lockd_unregister_notifiers();
 		goto err_put;
@@ -807,5 +813,7 @@
 	.pg_name		= "lockd",		/* service name */
 	.pg_class		= "nfsd",		/* share authentication with nfsd */
 	.pg_stats		= &nlmsvc_stats,	/* stats table */
-	.pg_authenticate = &lockd_authenticate	/* export authentication */
+	.pg_authenticate	= &lockd_authenticate,	/* export authentication */
+	.pg_init_request	= svc_generic_init_request,
+	.pg_rpcbind_set		= svc_generic_rpcbind_set,
 };
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1bddf70..e4d3f78 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -46,8 +46,14 @@
 
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file;
-		lock->fl.fl_owner = (fl_owner_t) host;
+		lock->fl.fl_pid = current->tgid;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+		if (!lock->fl.fl_owner) {
+			/* lockowner allocation has failed */
+			nlmsvc_release_host(host);
+			return nlm_lck_denied_nolocks;
+		}
 	}
 
 	return 0;
@@ -94,6 +100,7 @@
 	else
 		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -142,6 +149,7 @@
 	else
 		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -178,6 +186,7 @@
 	resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock);
 
 	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -217,6 +226,7 @@
 	resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock);
 
 	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -365,6 +375,7 @@
 	resp->status = nlmsvc_share_file(host, file, argp);
 
 	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -399,6 +410,7 @@
 	resp->status = nlmsvc_unshare_file(host, file, argp);
 
 	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 74330da..61d3cc2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
+	status = locks_delete_block(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -332,6 +332,93 @@
 	mutex_unlock(&file->f_mutex);
 }
 
+static struct nlm_lockowner *
+nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
+{
+	refcount_inc(&lockowner->count);
+	return lockowner;
+}
+
+static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+{
+	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+		return;
+	list_del(&lockowner->list);
+	spin_unlock(&lockowner->host->h_lock);
+	nlmsvc_release_host(lockowner->host);
+	kfree(lockowner);
+}
+
+static struct nlm_lockowner *__nlmsvc_find_lockowner(struct nlm_host *host, pid_t pid)
+{
+	struct nlm_lockowner *lockowner;
+	list_for_each_entry(lockowner, &host->h_lockowners, list) {
+		if (lockowner->pid != pid)
+			continue;
+		return nlmsvc_get_lockowner(lockowner);
+	}
+	return NULL;
+}
+
+static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t pid)
+{
+	struct nlm_lockowner *res, *new = NULL;
+
+	spin_lock(&host->h_lock);
+	res = __nlmsvc_find_lockowner(host, pid);
+
+	if (res == NULL) {
+		spin_unlock(&host->h_lock);
+		new = kmalloc(sizeof(*res), GFP_KERNEL);
+		spin_lock(&host->h_lock);
+		res = __nlmsvc_find_lockowner(host, pid);
+		if (res == NULL && new != NULL) {
+			res = new;
+			/* fs/locks.c will manage the refcount through lock_ops */
+			refcount_set(&new->count, 1);
+			new->pid = pid;
+			new->host = nlm_get_host(host);
+			list_add(&new->list, &host->h_lockowners);
+			new = NULL;
+		}
+	}
+
+	spin_unlock(&host->h_lock);
+	kfree(new);
+	return res;
+}
+
+void
+nlmsvc_release_lockowner(struct nlm_lock *lock)
+{
+	if (lock->fl.fl_owner)
+		nlmsvc_put_lockowner(lock->fl.fl_owner);
+}
+
+static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner;
+	new->fl_owner = nlmsvc_get_lockowner(nlm_lo);
+}
+
+static void nlmsvc_locks_release_private(struct file_lock *fl)
+{
+	nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner);
+}
+
+static const struct file_lock_operations nlmsvc_lock_ops = {
+	.fl_copy_lock = nlmsvc_locks_copy_lock,
+	.fl_release_private = nlmsvc_locks_release_private,
+};
+
+void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
+						pid_t pid)
+{
+	fl->fl_owner = nlmsvc_find_lockowner(host, pid);
+	if (fl->fl_owner != NULL)
+		fl->fl_ops = &nlmsvc_lock_ops;
+}
+
 /*
  * Initialize arguments for GRANTED call. The nlm_rqst structure
  * has been cleared already.
@@ -345,7 +432,7 @@
 
 	/* set default data area */
 	call->a_args.lock.oh.data = call->a_owner;
-	call->a_args.lock.svid = lock->fl.fl_pid;
+	call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
 
 	if (lock->oh.len > NLMCLNT_OHSIZE) {
 		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
@@ -509,6 +596,7 @@
 {
 	int			error;
 	__be32			ret;
+	struct nlm_lockowner	*test_owner;
 
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				locks_inode(file->f_file)->i_sb->s_id,
@@ -522,6 +610,9 @@
 		goto out;
 	}
 
+	/* If there's a conflicting lock, remember to clean up the test lock */
+	test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
+
 	error = vfs_test_lock(file->f_file, &lock->fl);
 	if (error) {
 		/* We can't currently deal with deferred test requests */
@@ -543,11 +634,16 @@
 	conflock->caller = "somehost";	/* FIXME */
 	conflock->len = strlen(conflock->caller);
 	conflock->oh.len = 0;		/* don't return OH info */
-	conflock->svid = lock->fl.fl_pid;
+	conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
 	conflock->fl.fl_type = lock->fl.fl_type;
 	conflock->fl.fl_start = lock->fl.fl_start;
 	conflock->fl.fl_end = lock->fl.fl_end;
 	locks_release_private(&lock->fl);
+
+	/* Clean up the test lock */
+	lock->fl.fl_owner = NULL;
+	nlmsvc_put_lockowner(test_owner);
+
 	ret = nlm_lck_denied;
 out:
 	return ret;
@@ -692,25 +788,7 @@
 	printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
 
-static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
-{
-	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
-}
-
-/*
- * Since NLM uses two "keys" for tracking locks, we need to hash them down
- * to one for the blocked_hash. Here, we're just xor'ing the host address
- * with the pid in order to create a key value for picking a hash bucket.
- */
-static unsigned long
-nlmsvc_owner_key(struct file_lock *fl)
-{
-	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
-}
-
 const struct lock_manager_operations nlmsvc_lock_operations = {
-	.lm_compare_owner = nlmsvc_same_owner,
-	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
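An aside on the lockowner code added above: nlmsvc_find_lockowner() uses the classic find-or-create idiom, dropping the spinlock around kmalloc() (which may sleep) and re-running the lookup afterwards in case another thread created the entry in the gap; each NLM procedure then pairs nlmsvc_locks_init_private() with nlmsvc_release_lockowner() to drop the reference it took. A hedged sketch of the idiom with hypothetical types:

/* Kernel-context sketch; 'struct host', 'struct owner' and
 * __owner_find() (which takes a reference) are stand-ins. */
struct owner {
	struct list_head	list;
	refcount_t		count;
	pid_t			pid;
};

static struct owner *owner_find_or_create(struct host *h, pid_t pid)
{
	struct owner *res, *new = NULL;

	spin_lock(&h->lock);
	res = __owner_find(h, pid);
	if (res == NULL) {
		spin_unlock(&h->lock);
		new = kmalloc(sizeof(*new), GFP_KERNEL);
		spin_lock(&h->lock);
		res = __owner_find(h, pid);	/* re-check after the gap */
		if (res == NULL && new != NULL) {
			res = new;
			refcount_set(&new->count, 1);
			new->pid = pid;
			list_add(&new->list, &h->owners);
			new = NULL;
		}
	}
	spin_unlock(&h->lock);
	kfree(new);		/* NULL-safe; frees the loser of the race */
	return res;
}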
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index ea77c66..d0bb7a6 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -76,8 +76,14 @@
 
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file;
-		lock->fl.fl_owner = (fl_owner_t) host;
+		lock->fl.fl_pid = current->tgid;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
+		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+		if (!lock->fl.fl_owner) {
+			/* lockowner allocation has failed */
+			nlmsvc_release_host(host);
+			return nlm_lck_denied_nolocks;
+		}
 	}
 
 	return 0;
@@ -125,6 +131,7 @@
 		dprintk("lockd: TEST          status %d vers %d\n",
 			ntohl(resp->status), rqstp->rq_vers);
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -173,6 +180,7 @@
 	else
 		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
@@ -210,6 +218,7 @@
 	resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock));
 
 	dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -250,6 +259,7 @@
 	resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock));
 
 	dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -408,6 +418,7 @@
 	resp->status = cast_status(nlmsvc_share_file(host, file, argp));
 
 	dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
@@ -442,6 +453,7 @@
 	resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
 
 	dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
+	nlmsvc_release_lockowner(&argp->lock);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 899360b..028fc15 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/lockd/svcsubs.c
  *
@@ -179,7 +180,7 @@
 		/* update current lock count */
 		file->f_locks++;
 
-		lockhost = (struct nlm_host *) fl->fl_owner;
+		lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 7147e4a..982629f 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -126,8 +126,6 @@
 	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
-	fl->fl_owner = current->files;
-	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	start = ntohl(*p++);
@@ -269,7 +267,6 @@
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 7ed9edf..5fa9f48 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -118,8 +118,6 @@
 	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
-	fl->fl_owner = current->files;
-	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	p = xdr_decode_hyper(p, &start);
@@ -266,7 +264,6 @@
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/locks.c b/fs/locks.c
index 2ecb4db..6970f55 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/locks.c
  *
@@ -11,11 +12,11 @@
  *
  *  Miscellaneous edits, and a total rewrite of posix_lock_file() code.
  *  Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *  
+ *
  *  Converted file_lock_table to a linked list from an array, which eliminates
  *  the limits on how many active file locks are open.
  *  Chad Page (pageone@netcom.com), November 27, 1994
- * 
+ *
  *  Removed dependency on file descriptors. dup()'ed file descriptors now
  *  get the same locks as the original file descriptors, and a close() on
  *  any file descriptor removes ALL the locks on the file for the current
@@ -41,7 +42,7 @@
  *  with a file pointer (filp). As a result they can be shared by a parent
  *  process and its children after a fork(). They are removed when the last
  *  file descriptor referring to the file pointer is closed (unless explicitly
- *  unlocked). 
+ *  unlocked).
  *
  *  FL_FLOCK locks never deadlock, an existing lock is always removed before
  *  upgrading from shared to exclusive (or vice versa). When this happens
@@ -50,7 +51,7 @@
  *  Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
  *
  *  Removed some race conditions in flock_lock_file(), marked other possible
- *  races. Just grep for FIXME to see them. 
+ *  races. Just grep for FIXME to see them.
  *  Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
  *
  *  Addressed Dmitry's concerns. Deadlock checking no longer recursive.
@@ -112,6 +113,46 @@
  *  Leases and LOCK_MAND
  *  Matthew Wilcox <willy@debian.org>, June, 2000.
  *  Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
+ *
+ * Locking conflicts and dependencies:
+ * If multiple threads attempt to lock the same byte (or flock the same file)
+ * only one can be granted the lock, and the others must wait their turn.
+ * The first lock has been "applied" or "granted", the others are "waiting"
+ * and are "blocked" by the "applied" lock.
+ *
+ * Waiting and applied locks are all kept in trees whose properties are:
+ *
+ *	- the root of a tree may be an applied or waiting lock.
+ *	- every other node in the tree is a waiting lock that
+ *	  conflicts with every ancestor of that node.
+ *
+ * Every such tree begins life as a waiting singleton which obviously
+ * satisfies the above properties.
+ *
+ * The only ways we modify trees preserve these properties:
+ *
+ *	1. We may add a new leaf node, but only after first verifying that it
+ *	   conflicts with all of its ancestors.
+ *	2. We may remove the root of a tree, creating a new singleton
+ *	   tree from the root and N new trees rooted in the immediate
+ *	   children.
+ *	3. If the root of a tree is not currently an applied lock, we may
+ *	   apply it (if possible).
+ *	4. We may upgrade the root of the tree (either extend its range,
+ *	   or upgrade its entire range from read to write).
+ *
+ * When an applied lock is modified in a way that reduces or downgrades any
+ * part of its range, we remove all its children (2 above).  This particularly
+ * happens when a lock is unlocked.
+ *
+ * For each of those child trees we "wake up" the thread which is
+ * waiting for the lock so it can continue handling as follows: if the
+ * root of the tree applies, we do so (3).  If it doesn't, it must
+ * conflict with some applied lock.  We remove (wake up) all of its children
+ * (2), and add it as a new leaf to the tree rooted in the applied
+ * lock (1).  We then repeat the process recursively with those
+ * children.
+ *
  */
 
 #include <linux/capability.h>
@@ -171,6 +212,7 @@
 static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
 DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
 
+
 /*
  * The blocked_hash is used to find POSIX lock loops for deadlock detection.
  * It is protected by blocked_lock_lock.
@@ -189,9 +231,9 @@
  * This lock protects the blocked_hash. Generally, if you're accessing it, you
  * want to be holding this lock.
  *
- * In addition, it also protects the fl->fl_block list, and the fl->fl_next
- * pointer for file_lock structures that are acting as lock requests (in
- * contrast to those that are acting as records of acquired locks).
+ * In addition, it also protects the fl->fl_blocked_requests list, and the
+ * fl->fl_blocker pointer for file_lock structures that are acting as lock
+ * requests (in contrast to those that are acting as records of acquired locks).
  *
  * Note that when we acquire this lock in order to change the above fields,
  * we often hold the flc_lock as well. In certain cases, when reading the fields
@@ -293,7 +335,8 @@
 {
 	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_list);
-	INIT_LIST_HEAD(&fl->fl_block);
+	INIT_LIST_HEAD(&fl->fl_blocked_requests);
+	INIT_LIST_HEAD(&fl->fl_blocked_member);
 	init_waitqueue_head(&fl->fl_wait);
 }
 
@@ -311,6 +354,12 @@
 
 void locks_release_private(struct file_lock *fl)
 {
+	BUG_ON(waitqueue_active(&fl->fl_wait));
+	BUG_ON(!list_empty(&fl->fl_list));
+	BUG_ON(!list_empty(&fl->fl_blocked_requests));
+	BUG_ON(!list_empty(&fl->fl_blocked_member));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
+
 	if (fl->fl_ops) {
 		if (fl->fl_ops->fl_release_private)
 			fl->fl_ops->fl_release_private(fl);
@@ -330,11 +379,6 @@
 /* Free a lock which is not in use. */
 void locks_free_lock(struct file_lock *fl)
 {
-	BUG_ON(waitqueue_active(&fl->fl_wait));
-	BUG_ON(!list_empty(&fl->fl_list));
-	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!hlist_unhashed(&fl->fl_link));
-
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
 }
@@ -357,7 +401,6 @@
 	memset(fl, 0, sizeof(struct file_lock));
 	locks_init_lock_heads(fl);
 }
-
 EXPORT_SYMBOL(locks_init_lock);
 
 /*
@@ -397,9 +440,26 @@
 			fl->fl_ops->fl_copy_lock(new, fl);
 	}
 }
-
 EXPORT_SYMBOL(locks_copy_lock);
 
+static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
+{
+	struct file_lock *f;
+
+	/*
+	 * As ctx->flc_lock is held, new requests cannot be added to
+	 * ->fl_blocked_requests, so we don't need a lock to check if it
+	 * is empty.
+	 */
+	if (list_empty(&fl->fl_blocked_requests))
+		return;
+	spin_lock(&blocked_lock_lock);
+	list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
+	list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
+		f->fl_blocker = new;
+	spin_unlock(&blocked_lock_lock);
+}
+
 static inline int flock_translate_cmd(int cmd) {
 	if (cmd & LOCK_MAND)
 		return cmd & (LOCK_MAND | LOCK_RW);
@@ -416,17 +476,20 @@
 
 /* Fill in a file_lock structure with an appropriate FLOCK lock. */
 static struct file_lock *
-flock_make_lock(struct file *filp, unsigned int cmd)
+flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
 {
-	struct file_lock *fl;
 	int type = flock_translate_cmd(cmd);
 
 	if (type < 0)
 		return ERR_PTR(type);
-	
-	fl = locks_alloc_lock();
-	if (fl == NULL)
-		return ERR_PTR(-ENOMEM);
+
+	if (fl == NULL) {
+		fl = locks_alloc_lock();
+		if (fl == NULL)
+			return ERR_PTR(-ENOMEM);
+	} else {
+		locks_init_lock(fl);
+	}
 
 	fl->fl_file = filp;
 	fl->fl_owner = filp;
@@ -434,7 +497,7 @@
 	fl->fl_flags = FL_FLOCK;
 	fl->fl_type = type;
 	fl->fl_end = OFFSET_MAX;
-	
+
 	return fl;
 }
 
@@ -596,9 +659,6 @@
  */
 static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 {
-	if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner)
-		return fl2->fl_lmops == fl1->fl_lmops &&
-			fl1->fl_lmops->lm_compare_owner(fl1, fl2);
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
@@ -639,8 +699,6 @@
 static unsigned long
 posix_owner_key(struct file_lock *fl)
 {
-	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
-		return fl->fl_lmops->lm_owner_key(fl);
 	return (unsigned long)fl->fl_owner;
 }
 
@@ -666,16 +724,58 @@
 static void __locks_delete_block(struct file_lock *waiter)
 {
 	locks_delete_global_blocked(waiter);
-	list_del_init(&waiter->fl_block);
-	waiter->fl_next = NULL;
+	list_del_init(&waiter->fl_blocked_member);
+	waiter->fl_blocker = NULL;
 }
 
-static void locks_delete_block(struct file_lock *waiter)
+static void __locks_wake_up_blocks(struct file_lock *blocker)
 {
+	while (!list_empty(&blocker->fl_blocked_requests)) {
+		struct file_lock *waiter;
+
+		waiter = list_first_entry(&blocker->fl_blocked_requests,
+					  struct file_lock, fl_blocked_member);
+		__locks_delete_block(waiter);
+		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
+			waiter->fl_lmops->lm_notify(waiter);
+		else
+			wake_up(&waiter->fl_wait);
+	}
+}
+
+/**
+ *	locks_delete_block - stop waiting for a file lock
+ *	@waiter: the lock which was waiting
+ *
+ *	lockd/nfsd need to disconnect the lock while working on it.
+ */
+int locks_delete_block(struct file_lock *waiter)
+{
+	int status = -ENOENT;
+
+	/*
+	 * If fl_blocker is NULL, it won't be set again as this thread
+	 * "owns" the lock and is the only one that might try to claim
+	 * the lock.  So it is safe to test fl_blocker locklessly.
+	 * Also if fl_blocker is NULL, this waiter is not listed on
+	 * fl_blocked_requests for some lock, so no other request can
+	 * be added to the list of fl_blocked_requests for this
+	 * request.  So if fl_blocker is NULL, it is safe to
+	 * locklessly check if fl_blocked_requests is empty.  If both
+	 * of these checks succeed, there is no need to take the lock.
+	 */
+	if (waiter->fl_blocker == NULL &&
+	    list_empty(&waiter->fl_blocked_requests))
+		return status;
 	spin_lock(&blocked_lock_lock);
+	if (waiter->fl_blocker)
+		status = 0;
+	__locks_wake_up_blocks(waiter);
 	__locks_delete_block(waiter);
 	spin_unlock(&blocked_lock_lock);
+	return status;
 }
+EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
@@ -683,26 +783,49 @@
  * it seems like the reasonable thing to do.
  *
  * Must be called with both the flc_lock and blocked_lock_lock held. The
- * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
- * that the flc_lock is also held on insertions we can avoid taking the
- * blocked_lock_lock in some cases when we see that the fl_block list is empty.
+ * fl_blocked_requests list itself is protected by the blocked_lock_lock,
+ * but by ensuring that the flc_lock is also held on insertions we can avoid
+ * taking the blocked_lock_lock in some cases when we see that the
+ * fl_blocked_requests list is empty.
+ *
+ * Rather than just adding to the list, we check for conflicts with any existing
+ * waiters, and add beneath any waiter that blocks the new waiter.
+ * Thus wakeups don't happen until needed.
  */
 static void __locks_insert_block(struct file_lock *blocker,
-					struct file_lock *waiter)
+				 struct file_lock *waiter,
+				 bool conflict(struct file_lock *,
+					       struct file_lock *))
 {
-	BUG_ON(!list_empty(&waiter->fl_block));
-	waiter->fl_next = blocker;
-	list_add_tail(&waiter->fl_block, &blocker->fl_block);
+	struct file_lock *fl;
+	BUG_ON(!list_empty(&waiter->fl_blocked_member));
+
+new_blocker:
+	list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
+		if (conflict(fl, waiter)) {
+			blocker =  fl;
+			goto new_blocker;
+		}
+	waiter->fl_blocker = blocker;
+	list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
 	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
 		locks_insert_global_blocked(waiter);
+
+	/* The requests in waiter->fl_blocked_requests are known to conflict
+	 * with waiter, but might not conflict with blocker, or the requests
+	 * and locks which block it.  So they all need to be woken.
+	 */
+	__locks_wake_up_blocks(waiter);
 }
 
 /* Must be called with flc_lock held. */
 static void locks_insert_block(struct file_lock *blocker,
-					struct file_lock *waiter)
+			       struct file_lock *waiter,
+			       bool conflict(struct file_lock *,
+					     struct file_lock *))
 {
 	spin_lock(&blocked_lock_lock);
-	__locks_insert_block(blocker, waiter);
+	__locks_insert_block(blocker, waiter, conflict);
 	spin_unlock(&blocked_lock_lock);
 }
 
@@ -716,25 +839,15 @@
 	/*
 	 * Avoid taking global lock if list is empty. This is safe since new
 	 * blocked requests are only added to the list under the flc_lock, and
-	 * the flc_lock is always held here. Note that removal from the fl_block
-	 * list does not require the flc_lock, so we must recheck list_empty()
-	 * after acquiring the blocked_lock_lock.
+	 * the flc_lock is always held here. Note that removal from the
+	 * fl_blocked_requests list does not require the flc_lock, so we must
+	 * recheck list_empty() after acquiring the blocked_lock_lock.
 	 */
-	if (list_empty(&blocker->fl_block))
+	if (list_empty(&blocker->fl_blocked_requests))
 		return;
 
 	spin_lock(&blocked_lock_lock);
-	while (!list_empty(&blocker->fl_block)) {
-		struct file_lock *waiter;
-
-		waiter = list_first_entry(&blocker->fl_block,
-				struct file_lock, fl_block);
-		__locks_delete_block(waiter);
-		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
-			waiter->fl_lmops->lm_notify(waiter);
-		else
-			wake_up(&waiter->fl_wait);
-	}
+	__locks_wake_up_blocks(blocker);
 	spin_unlock(&blocked_lock_lock);
 }
 
@@ -766,47 +879,50 @@
 /* Determine if lock sys_fl blocks lock caller_fl. Common functionality
  * checks for shared/exclusive status of overlapping locks.
  */
-static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool locks_conflict(struct file_lock *caller_fl,
+			   struct file_lock *sys_fl)
 {
 	if (sys_fl->fl_type == F_WRLCK)
-		return 1;
+		return true;
 	if (caller_fl->fl_type == F_WRLCK)
-		return 1;
-	return 0;
+		return true;
+	return false;
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
  * checking before calling the locks_conflict().
  */
-static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool posix_locks_conflict(struct file_lock *caller_fl,
+				 struct file_lock *sys_fl)
 {
 	/* POSIX locks owned by the same process do not conflict with
 	 * each other.
 	 */
 	if (posix_same_owner(caller_fl, sys_fl))
-		return (0);
+		return false;
 
 	/* Check whether they overlap */
 	if (!locks_overlap(caller_fl, sys_fl))
-		return 0;
+		return false;
 
-	return (locks_conflict(caller_fl, sys_fl));
+	return locks_conflict(caller_fl, sys_fl);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
  * checking before calling the locks_conflict().
  */
-static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool flock_locks_conflict(struct file_lock *caller_fl,
+				 struct file_lock *sys_fl)
 {
 	/* FLOCK locks referring to the same filp do not conflict with
 	 * each other.
 	 */
 	if (caller_fl->fl_file == sys_fl->fl_file)
-		return (0);
+		return false;
 	if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
-		return 0;
+		return false;
 
-	return (locks_conflict(caller_fl, sys_fl));
+	return locks_conflict(caller_fl, sys_fl);
 }
 
 void
@@ -877,8 +993,11 @@
 	struct file_lock *fl;
 
 	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
-		if (posix_same_owner(fl, block_fl))
-			return fl->fl_next;
+		if (posix_same_owner(fl, block_fl)) {
+			while (fl->fl_blocker)
+				fl = fl->fl_blocker;
+			return fl;
+		}
 	}
 	return NULL;
 }
@@ -936,7 +1055,7 @@
 			return -ENOMEM;
 	}
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
@@ -965,19 +1084,20 @@
 		if (!(request->fl_flags & FL_SLEEP))
 			goto out;
 		error = FILE_LOCK_DEFERRED;
-		locks_insert_block(fl, request);
+		locks_insert_block(fl, request, flock_locks_conflict);
 		goto out;
 	}
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
 	locks_copy_lock(new_fl, request);
+	locks_move_blocks(new_fl, request);
 	locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
 	new_fl = NULL;
 	error = 0;
 
 out:
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	locks_dispose_list(&dispose);
@@ -1015,7 +1135,7 @@
 		new_fl2 = locks_alloc_lock();
 	}
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	/*
 	 * New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1037,14 +1157,20 @@
 			 */
 			error = -EDEADLK;
 			spin_lock(&blocked_lock_lock);
+			/*
+			 * Ensure that we don't find any locks blocked on this
+			 * request during deadlock detection.
+			 */
+			__locks_wake_up_blocks(request);
 			if (likely(!posix_locks_deadlock(request, fl))) {
 				error = FILE_LOCK_DEFERRED;
-				__locks_insert_block(fl, request);
+				__locks_insert_block(fl, request,
+						     posix_locks_conflict);
 			}
 			spin_unlock(&blocked_lock_lock);
 			goto out;
-  		}
-  	}
+		}
+	}
 
 	/* If we're just looking for a conflict, we're done. */
 	error = 0;
@@ -1164,6 +1290,7 @@
 			goto out;
 		}
 		locks_copy_lock(new_fl, request);
+		locks_move_blocks(new_fl, request);
 		locks_insert_lock_ctx(new_fl, &fl->fl_list);
 		fl = new_fl;
 		new_fl = NULL;
@@ -1187,7 +1314,7 @@
 	}
  out:
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1237,13 +1364,11 @@
 		error = posix_lock_inode(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
-		if (!error)
-			continue;
-
-		locks_delete_block(fl);
-		break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+		if (error)
+			break;
 	}
+	locks_delete_block(fl);
 	return error;
 }
 
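The wait loops reworked above (here and in locks_mandatory_area(), flock_lock_file_wait() and do_lock_file_wait() below) all converge on one convention: locks_delete_block() is called exactly once, after the loop, whether the request was granted, woken, or interrupted, and its lockless fl_blocker / fl_blocked_requests fast path makes that cheap when the request was never queued. A hedged sketch of the caller pattern, with a hypothetical try_lock() returning FILE_LOCK_DEFERRED when the request was queued as a waiter:

/* Kernel-context sketch of the new caller convention. */
static int lock_and_wait(struct file_lock *fl)
{
	int error;

	for (;;) {
		error = try_lock(fl);		/* hypothetical */
		if (error != FILE_LOCK_DEFERRED)
			break;
		error = wait_event_interruptible(fl->fl_wait,
						 !fl->fl_blocker);
		if (error)
			break;			/* signal: fall through */
	}
	/* Tear down the block entry unconditionally; cheap when fl
	 * was never (or is no longer) blocked. */
	locks_delete_block(fl);
	return error;
}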
@@ -1324,7 +1449,7 @@
 		error = posix_lock_inode(inode, &fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
+		error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
 		if (!error) {
 			/*
 			 * If we've been sleeping someone might have
@@ -1334,13 +1459,12 @@
 				continue;
 		}
 
-		locks_delete_block(&fl);
 		break;
 	}
+	locks_delete_block(&fl);
 
 	return error;
 }
-
 EXPORT_SYMBOL(locks_mandatory_area);
 #endif /* CONFIG_MANDATORY_FILE_LOCKING */
 
@@ -1349,7 +1473,7 @@
 	switch (arg) {
 	case F_UNLCK:
 		fl->fl_flags &= ~FL_UNLOCK_PENDING;
-		/* fall through: */
+		/* fall through */
 	case F_RDLCK:
 		fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
 	}
@@ -1406,11 +1530,21 @@
 
 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 {
-	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
-		return false;
-	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
-		return false;
-	return locks_conflict(breaker, lease);
+	bool rc;
+
+	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
+		rc = false;
+		goto trace;
+	}
+	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) {
+		rc = false;
+		goto trace;
+	}
+
+	rc = locks_conflict(breaker, lease);
+trace:
+	trace_leases_conflict(rc, lease, breaker);
+	return rc;
 }
 
 static bool
@@ -1459,10 +1593,10 @@
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		WARN_ON_ONCE(1);
-		return error;
+		goto free_lock;
 	}
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 
 	time_out_leases(inode, &dispose);
@@ -1511,16 +1645,16 @@
 		break_time -= jiffies;
 	if (break_time == 0)
 		break_time++;
-	locks_insert_block(fl, new_fl);
+	locks_insert_block(fl, new_fl, leases_conflict);
 	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 
 	locks_dispose_list(&dispose);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
-						!new_fl->fl_next, break_time);
+						!new_fl->fl_blocker, break_time);
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	trace_break_lease_unblock(inode, new_fl);
 	locks_delete_block(new_fl);
@@ -1537,12 +1671,12 @@
 	}
 out:
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
+free_lock:
 	locks_free_lock(new_fl);
 	return error;
 }
-
 EXPORT_SYMBOL(__break_lease);
 
 /**
@@ -1573,7 +1707,6 @@
 	if (has_lease)
 		*time = current_time(inode);
 }
-
 EXPORT_SYMBOL(lease_get_mtime);
 
 /**
@@ -1609,7 +1742,7 @@
 
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
-		percpu_down_read_preempt_disable(&file_rwsem);
+		percpu_down_read(&file_rwsem);
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(inode, &dispose);
 		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1619,7 +1752,7 @@
 			break;
 		}
 		spin_unlock(&ctx->flc_lock);
-		percpu_up_read_preempt_enable(&file_rwsem);
+		percpu_up_read(&file_rwsem);
 
 		locks_dispose_list(&dispose);
 	}
@@ -1627,10 +1760,10 @@
 }
 
 /**
- * check_conflicting_open - see if the given dentry points to a file that has
- * 			    an existing open that would conflict with the
- * 			    desired lease.
- * @dentry:	dentry to check
+ * check_conflicting_open - see if the given file points to an inode that has
+ *			    an existing open that would conflict with the
+ *			    desired lease.
+ * @filp:	file to check
  * @arg:	type of lease that we're trying to acquire
  * @flags:	current lock flags
  *
@@ -1638,30 +1771,42 @@
  * conflict with the lease we're trying to set.
  */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
+check_conflicting_open(struct file *filp, const long arg, int flags)
 {
-	int ret = 0;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
+	int self_wcount = 0, self_rcount = 0;
 
 	if (flags & FL_LAYOUT)
 		return 0;
 
-	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+	if (arg == F_RDLCK)
+		return inode_is_open_for_write(inode) ? -EAGAIN : 0;
+	else if (arg != F_WRLCK)
+		return 0;
+
+	/*
+	 * Make sure that only read/write count is from lease requestor.
+	 * Note that this will result in denying write leases when i_writecount
+	 * is negative, which is what we want.  (We shouldn't grant write leases
+	 * on files open for execution.)
+	 */
+	if (filp->f_mode & FMODE_WRITE)
+		self_wcount = 1;
+	else if (filp->f_mode & FMODE_READ)
+		self_rcount = 1;
+
+	if (atomic_read(&inode->i_writecount) != self_wcount ||
+	    atomic_read(&inode->i_readcount) != self_rcount)
 		return -EAGAIN;
 
-	if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
-	    (atomic_read(&inode->i_count) > 1)))
-		ret = -EAGAIN;
-
-	return ret;
+	return 0;
 }
 
 static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, *my_fl = NULL, *lease;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
@@ -1693,10 +1838,10 @@
 		return -EINVAL;
 	}
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error)
 		goto out;
 
@@ -1753,7 +1898,7 @@
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(dentry, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->fl_flags);
 	if (error) {
 		locks_unlink_lock_ctx(lease);
 		goto out;
@@ -1764,7 +1909,7 @@
 		lease->fl_lmops->lm_setup(lease, priv);
 out:
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	if (is_deleg)
 		inode_unlock(inode);
@@ -1787,7 +1932,7 @@
 		return error;
 	}
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
 		if (fl->fl_file == filp &&
@@ -1800,7 +1945,7 @@
 	if (victim)
 		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	return error;
 }
@@ -1847,13 +1992,71 @@
 }
 EXPORT_SYMBOL(generic_setlease);
 
+#if IS_ENABLED(CONFIG_SRCU)
+/*
+ * Kernel subsystems can register to be notified on any attempt to set
+ * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
+ * to close files that it may have cached when there is an attempt to set a
+ * conflicting lease.
+ */
+static struct srcu_notifier_head lease_notifier_chain;
+
+static inline void
+lease_notifier_chain_init(void)
+{
+	srcu_init_notifier_head(&lease_notifier_chain);
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+	if (arg != F_UNLCK)
+		srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return srcu_notifier_chain_register(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+	srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#else /* !IS_ENABLED(CONFIG_SRCU) */
+static inline void
+lease_notifier_chain_init(void)
+{
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#endif /* IS_ENABLED(CONFIG_SRCU) */
+
 /**
  * vfs_setlease        -       sets a lease on an open file
  * @filp:	file pointer
  * @arg:	type of lease to obtain
  * @lease:	file_lock to use when adding a lease
  * @priv:	private info for lm_setup when adding a lease (may be
- * 		NULL if lm_setup doesn't require it)
+ *		NULL if lm_setup doesn't require it)
  *
  * Call this to establish a lease on the file. The "lease" argument is not
  * used for F_UNLCK requests and may be NULL. For commands that set or alter
@@ -1867,6 +2070,8 @@
 int
 vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
+	if (lease)
+		setlease_notifier(arg, *lease);
 	if (filp->f_op->setlease)
 		return filp->f_op->setlease(filp, arg, lease, priv);
 	else
@@ -1931,13 +2136,11 @@
 		error = flock_lock_inode(inode, fl);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
-		if (!error)
-			continue;
-
-		locks_delete_block(fl);
-		break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+		if (error)
+			break;
 	}
+	locks_delete_block(fl);
 	return error;
 }
 
@@ -2001,7 +2204,7 @@
 	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
-	lock = flock_make_lock(f.file, cmd);
+	lock = flock_make_lock(f.file, cmd, NULL);
 	if (IS_ERR(lock)) {
 		error = PTR_ERR(lock);
 		goto out_putf;
@@ -2143,7 +2346,7 @@
 	error = vfs_test_lock(filp, fl);
 	if (error)
 		goto out;
- 
+
 	flock->l_type = fl->fl_type;
 	if (fl->fl_type != F_UNLCK) {
 		error = posix_lock_to_flock(flock, fl);
@@ -2210,13 +2413,11 @@
 		error = vfs_lock_file(filp, cmd, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
-		if (!error)
-			continue;
-
-		locks_delete_block(fl);
-		break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+		if (error)
+			break;
 	}
+	locks_delete_block(fl);
 
 	return error;
 }
@@ -2476,6 +2677,7 @@
 	if (!ctx || list_empty(&ctx->flc_posix))
 		return;
 
+	locks_init_lock(&lock);
 	lock.fl_type = F_UNLCK;
 	lock.fl_flags = FL_POSIX | FL_CLOSE;
 	lock.fl_start = 0;
@@ -2492,26 +2694,21 @@
 		lock.fl_ops->fl_release_private(&lock);
 	trace_locks_remove_posix(inode, &lock, error);
 }
-
 EXPORT_SYMBOL(locks_remove_posix);
 
 /* The i_flctx must be valid when calling into here */
 static void
 locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 {
-	struct file_lock fl = {
-		.fl_owner = filp,
-		.fl_pid = current->tgid,
-		.fl_file = filp,
-		.fl_flags = FL_FLOCK | FL_CLOSE,
-		.fl_type = F_UNLCK,
-		.fl_end = OFFSET_MAX,
-	};
+	struct file_lock fl;
 	struct inode *inode = locks_inode(filp);
 
 	if (list_empty(&flctx->flc_flock))
 		return;
 
+	flock_make_lock(filp, LOCK_UN, &fl);
+	fl.fl_flags |= FL_CLOSE;
+
 	if (filp->f_op->flock)
 		filp->f_op->flock(filp, F_SETLKW, &fl);
 	else
@@ -2531,13 +2728,13 @@
 	if (list_empty(&ctx->flc_lease))
 		return;
 
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
 		if (filp == fl->fl_file)
 			lease_modify(fl, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 
 	locks_dispose_list(&dispose);
 }
@@ -2570,27 +2767,6 @@
 }
 
 /**
- *	posix_unblock_lock - stop waiting for a file lock
- *	@waiter: the lock which was waiting
- *
- *	lockd needs to block waiting for locks.
- */
-int
-posix_unblock_lock(struct file_lock *waiter)
-{
-	int status = 0;
-
-	spin_lock(&blocked_lock_lock);
-	if (waiter->fl_next)
-		__locks_delete_block(waiter);
-	else
-		status = -ENOENT;
-	spin_unlock(&blocked_lock_lock);
-	return status;
-}
-EXPORT_SYMBOL(posix_unblock_lock);
-
-/**
  * vfs_cancel_lock - file byte range unblock lock
  * @filp: The file to apply the unblock to
  * @fl: The lock to be unblocked
@@ -2603,7 +2779,6 @@
 		return filp->f_op->lock(filp, F_CANCELLK, fl);
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 
 #ifdef CONFIG_PROC_FS
@@ -2671,10 +2846,10 @@
 			       ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
 			       : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
 	} else {
-		seq_printf(f, "%s ",
-			       (lease_breaking(fl))
-			       ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
-			       : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
+		int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+
+		seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+				     (type == F_RDLCK) ? "READ" : "UNLCK");
 	}
 	if (inode) {
 		/* userspace relies on this representation of dev_t */
@@ -2707,7 +2882,7 @@
 
 	lock_get_status(f, fl, iter->li_pos, "");
 
-	list_for_each_entry(bfl, &fl->fl_block, fl_block)
+	list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member)
 		lock_get_status(f, bfl, iter->li_pos, " ->");
 
 	return 0;
@@ -2803,7 +2978,6 @@
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
-
 	for_each_possible_cpu(i) {
 		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
 
@@ -2811,7 +2985,7 @@
 		INIT_HLIST_HEAD(&fll->hlist);
 	}
 
+	lease_notifier_chain_init();
 	return 0;
 }
-
 core_initcall(filelock_init);
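A usage aside on the lease notifier chain added above: a subsystem that caches open files (the comment names nfsd as the intended user) can register a notifier_block and get called on every attempt to set a lease, with the requested lease type as the action and the struct file_lock as the data pointer. A hedged sketch of a hypothetical module using the new hooks:

/* Hypothetical consumer of lease_register_notifier(); sketch only. */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/notifier.h>

static int my_lease_watch(struct notifier_block *nb, unsigned long arg,
			  void *data)
{
	struct file_lock *lease = data;

	pr_info("lease attempt, type %lu on %pD\n", arg, lease->fl_file);
	return NOTIFY_DONE;
}

static struct notifier_block my_lease_nb = {
	.notifier_call	= my_lease_watch,
};

static int __init my_init(void)
{
	return lease_register_notifier(&my_lease_nb);
}

static void __exit my_exit(void)
{
	lease_unregister_notifier(&my_lease_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");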
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 081ccf0..97c54d3 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/list.h>
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index bcd53a7..de20039 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config MINIX_FS
 	tristate "Minix file system support"
 	depends on BLOCK
diff --git a/fs/minix/Makefile b/fs/minix/Makefile
index 3063015..a2d3ab5 100644
--- a/fs/minix/Makefile
+++ b/fs/minix/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux minix filesystem routines.
 #
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 72e308c..7cb5fd3 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/minix/inode.c
  *
@@ -68,17 +69,11 @@
 	return &ei->vfs_inode;
 }
 
-static void minix_i_callback(struct rcu_head *head)
+static void minix_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
-static void minix_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, minix_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
@@ -110,7 +105,7 @@
 
 static const struct super_operations minix_sops = {
 	.alloc_inode	= minix_alloc_inode,
-	.destroy_inode	= minix_destroy_inode,
+	.free_inode	= minix_free_in_core_inode,
 	.write_inode	= minix_write_inode,
 	.evict_inode	= minix_evict_inode,
 	.put_super	= minix_put_super,
@@ -282,6 +277,8 @@
 
 	/* set up enough so that it can read an inode */
 	s->s_op = &minix_sops;
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
 	root_inode = minix_iget(s, MINIX_ROOT_INO);
 	if (IS_ERR(root_inode)) {
 		ret = PTR_ERR(root_inode);
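An aside on the minix conversion above: with the newer ->free_inode hook the VFS itself defers the call past an RCU grace period, so the filesystem no longer open-codes call_rcu() plus a callback. A hedged sketch of the general shape of this conversion for any filesystem, with hypothetical names:

/* Kernel-context sketch; myfs_* names are stand-ins. */
static void myfs_free_inode(struct inode *inode)
{
	/* Runs after the RCU grace period; just free the container. */
	kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
}

static const struct super_operations myfs_sops = {
	.alloc_inode	= myfs_alloc_inode,
	.free_inode	= myfs_free_inode,	/* replaces .destroy_inode */
	/* ... */
};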
diff --git a/fs/mount.h b/fs/mount.h
index f39bc9d..711a409 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
-	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	union {
+		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+		struct hlist_node mnt_umount;
+	};
 	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
@@ -68,8 +71,7 @@
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
-	struct fs_pin mnt_umount;
-	struct dentry *mnt_ex_mountpoint;
+	struct hlist_head mnt_stuck_children;
 } __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -146,3 +148,8 @@
 
 	return __is_local_mountpoint(dentry);
 }
+
+static inline bool is_anon_ns(struct mnt_namespace *ns)
+{
+	return ns->seq == 0;
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index c820dc9..a63620c 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -47,9 +47,9 @@
 static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
-	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, iter_all) {
 		struct page *page = bv->bv_page;
 		page_endio(page, bio_op(bio),
 			   blk_status_to_errno(bio->bi_status));
@@ -647,7 +647,7 @@
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
-	wbc_account_io(wbc, page, PAGE_SIZE);
+	wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
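An aside on the mpage_end_io() change above: bio_for_each_segment_all() no longer takes an integer index but an on-stack struct bvec_iter_all that the macro advances, which is what the iter_all conversion reflects. A hedged sketch of the updated idiom in a hypothetical completion handler:

/* Kernel-context sketch of the new iteration signature. */
static void my_end_io(struct bio *bio)
{
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bv, bio, iter_all) {
		struct page *page = bv->bv_page;
		/* per-page completion work here */
	}
	bio_put(bio);
}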
diff --git a/fs/namei.c b/fs/namei.c
index 914178c..671c3c1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
 #include <linux/bitops.h>
 #include <linux/init_task.h>
 #include <linux/uaccess.h>
-#include <linux/build_bug.h>
 
 #include "internal.h"
 #include "mount.h"
@@ -131,7 +130,6 @@
 	struct filename *result;
 	char *kname;
 	int len;
-	BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
 
 	result = audit_reusename(filename);
 	if (result)
@@ -598,14 +596,12 @@
 		path_put(&nd->path);
 		for (i = 0; i < nd->depth; i++)
 			path_put(&nd->stack[i].link);
-		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		if (nd->flags & LOOKUP_ROOT_GRABBED) {
 			path_put(&nd->root);
-			nd->root.mnt = NULL;
+			nd->flags &= ~LOOKUP_ROOT_GRABBED;
 		}
 	} else {
 		nd->flags &= ~LOOKUP_RCU;
-		if (!(nd->flags & LOOKUP_ROOT))
-			nd->root.mnt = NULL;
 		rcu_read_unlock();
 	}
 	nd->depth = 0;
@@ -643,6 +639,14 @@
 	return true;
 }
 
+static bool legitimize_root(struct nameidata *nd)
+{
+	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
+		return true;
+	nd->flags |= LOOKUP_ROOT_GRABBED;
+	return legitimize_path(nd, &nd->root, nd->root_seq);
+}
+
 /*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
@@ -673,23 +677,18 @@
 
 	nd->flags &= ~LOOKUP_RCU;
 	if (unlikely(!legitimize_links(nd)))
-		goto out2;
-	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
 		goto out1;
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
-			goto out;
-	}
+	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
+		goto out;
+	if (unlikely(!legitimize_root(nd)))
+		goto out;
 	rcu_read_unlock();
 	BUG_ON(nd->inode != parent->d_inode);
 	return 0;
 
-out2:
+out1:
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
-out1:
-	if (!(nd->flags & LOOKUP_ROOT))
-		nd->root.mnt = NULL;
 out:
 	rcu_read_unlock();
 	return -ECHILD;
@@ -729,23 +728,14 @@
 	 */
 	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
 		goto out;
-	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
-		rcu_read_unlock();
-		dput(dentry);
-		goto drop_root_mnt;
-	}
+	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
+		goto out_dput;
 	/*
 	 * Sequence counts matched. Now make sure that the root is
 	 * still valid and get it if required.
 	 */
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
-			rcu_read_unlock();
-			dput(dentry);
-			return -ECHILD;
-		}
-	}
-
+	if (unlikely(!legitimize_root(nd)))
+		goto out_dput;
 	rcu_read_unlock();
 	return 0;
 
@@ -755,9 +745,10 @@
 	nd->path.dentry = NULL;
 out:
 	rcu_read_unlock();
-drop_root_mnt:
-	if (!(nd->flags & LOOKUP_ROOT))
-		nd->root.mnt = NULL;
+	return -ECHILD;
+out_dput:
+	rcu_read_unlock();
+	dput(dentry);
 	return -ECHILD;
 }
 
@@ -821,6 +812,7 @@
 		} while (read_seqcount_retry(&fs->seq, seq));
 	} else {
 		get_fs_root(fs, &nd->root);
+		nd->flags |= LOOKUP_ROOT_GRABBED;
 	}
 }
 
@@ -1068,7 +1060,7 @@
 		return ERR_PTR(error);
 
 	nd->last_type = LAST_BIND;
-	res = inode->i_link;
+	res = READ_ONCE(inode->i_link);
 	if (!res) {
 		const char * (*get)(struct dentry *, struct inode *,
 				struct delayed_call *);
@@ -1737,8 +1729,6 @@
 				nd->flags &= ~LOOKUP_RCU;
 				nd->path.mnt = NULL;
 				nd->path.dentry = NULL;
-				if (!(nd->flags & LOOKUP_ROOT))
-					nd->root.mnt = NULL;
 				rcu_read_unlock();
 			} else if (likely(unlazy_walk(nd)) == 0)
 				error = nd_alloc_stack(nd);
@@ -2333,8 +2323,8 @@
 	return err;
 }
 
-static int filename_lookup(int dfd, struct filename *name, unsigned flags,
-			   struct path *path, struct path *root)
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+		    struct path *path, struct path *root)
 {
 	int retval;
 	struct nameidata nd;
@@ -2352,7 +2342,7 @@
 		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
 
 	if (likely(!retval))
-		audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
+		audit_inode(name, path->dentry, 0);
 	restore_nameidata();
 	putname(name);
 	return retval;
@@ -2393,7 +2383,7 @@
 	if (likely(!retval)) {
 		*last = nd.last;
 		*type = nd.last_type;
-		audit_inode(name, parent->dentry, LOOKUP_PARENT);
+		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
 	} else {
 		putname(name);
 		name = ERR_PTR(retval);
@@ -2720,7 +2710,7 @@
 	if (unlikely(error == -ESTALE))
 		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
 	if (likely(!error))
-		audit_inode(name, path->dentry, 0);
+		audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
 	restore_nameidata();
 	putname(name);
 	return error;
@@ -3301,7 +3291,7 @@
 		if (error)
 			return error;
 
-		audit_inode(nd->name, dir, LOOKUP_PARENT);
+		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */
 		if (unlikely(nd->last.name[nd->last.len]))
 			return -EISDIR;
@@ -3462,6 +3452,7 @@
 		inode->i_state |= I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
+	ima_post_create_tmpfile(inode);
 	return child;
 
 out_err:
@@ -3884,6 +3875,7 @@
 	dentry->d_inode->i_flags |= S_DEAD;
 	dont_mount(dentry);
 	detach_mounts(dentry);
+	fsnotify_rmdir(dir, dentry);
 
 out:
 	inode_unlock(dentry->d_inode);
@@ -4000,6 +3992,7 @@
 			if (!error) {
 				dont_mount(dentry);
 				detach_mounts(dentry);
+				fsnotify_unlink(dir, dentry);
 			}
 		}
 	}
@@ -4499,10 +4492,10 @@
 		inode_unlock(target);
 	dput(new_dentry);
 	if (!error) {
-		fsnotify_move(old_dir, new_dir, old_name.name, is_dir,
+		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
 			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
 		if (flags & RENAME_EXCHANGE) {
-			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
 				      new_is_dir, NULL, new_dentry);
 		}
 	}
@@ -4730,7 +4723,7 @@
 		spin_unlock(&inode->i_lock);
 	}
 
-	link = inode->i_link;
+	link = READ_ONCE(inode->i_link);
 	if (!link) {
 		link = inode->i_op->get_link(dentry, inode, &done);
 		if (IS_ERR(link))
diff --git a/fs/namespace.c b/fs/namespace.c
index 1fce41b..2adfe7b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/namespace.c
  *
  * (C) Copyright Al Viro 2000, 2001
- *	Released under GPL v2.
  *
  * Based on code from fs/super.c, copyright Linus Torvalds and others.
  * Heavily rewritten.
@@ -20,12 +20,16 @@
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
+#include <uapi/linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/shmem_fs.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -66,6 +70,8 @@
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static DECLARE_RWSEM(namespace_sem);
+static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -166,14 +172,6 @@
 #endif
 }
 
-static void drop_mountpoint(struct fs_pin *p)
-{
-	struct mount *m = container_of(p, struct mount, mnt_umount);
-	dput(m->mnt_ex_mountpoint);
-	pin_remove(p);
-	mntput(&m->mnt);
-}
-
 static struct mount *alloc_vfsmnt(const char *name)
 {
 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -211,7 +209,7 @@
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
-		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 	}
 	return mnt;
 
@@ -245,13 +243,9 @@
  * mnt_want/drop_write() will _keep_ the filesystem
  * r/w.
  */
-int __mnt_is_readonly(struct vfsmount *mnt)
+bool __mnt_is_readonly(struct vfsmount *mnt)
 {
-	if (mnt->mnt_flags & MNT_READONLY)
-		return 1;
-	if (sb_rdonly(mnt->mnt_sb))
-		return 1;
-	return 0;
+	return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
@@ -507,11 +501,12 @@
 	return ret;
 }
 
-static void __mnt_unmake_readonly(struct mount *mnt)
+static int __mnt_unmake_readonly(struct mount *mnt)
 {
 	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
 	unlock_mount_hash();
+	return 0;
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -739,7 +734,7 @@
 
 	/* Add the new mountpoint to the hash table */
 	read_seqlock_excl(&mount_lock);
-	new->m_dentry = dentry;
+	new->m_dentry = dget(dentry);
 	new->m_count = 1;
 	hlist_add_head(&new->m_hash, mp_hash(dentry));
 	INIT_HLIST_HEAD(&new->m_list);
@@ -752,7 +747,11 @@
 	return mp;
 }
 
-static void put_mountpoint(struct mountpoint *mp)
+/*
+ * vfsmount lock must be held.  Additionally, the caller is responsible
+ * for serializing calls for given disposal list.
+ */
+static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
 {
 	if (!--mp->m_count) {
 		struct dentry *dentry = mp->m_dentry;
@@ -760,11 +759,18 @@
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_MOUNTED;
 		spin_unlock(&dentry->d_lock);
+		dput_to_list(dentry, list);
 		hlist_del(&mp->m_hash);
 		kfree(mp);
 	}
 }
 
+/* called with namespace_lock and vfsmount lock */
+static void put_mountpoint(struct mountpoint *mp)
+{
+	__put_mountpoint(mp, &ex_mountpoints);
+}
+
 static inline int check_mnt(struct mount *mnt)
 {
 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
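
The __put_mountpoint()/put_mountpoint() split above defers the final dput() of the mountpoint dentry onto a caller-supplied disposal list instead of dropping it under the mount hash lock. A minimal sketch of the pattern (illustrative only — __put_mountpoint() is static here, and dput_to_list()/shrink_dentry_list() are the fs/internal.h helpers this patch relies on):

	/* sketch: collect dentries while locks are held, release them later */
	static void drop_mountpoint_example(struct mountpoint *mp)
	{
		LIST_HEAD(disposal);

		lock_mount_hash();
		__put_mountpoint(mp, &disposal); /* dput_to_list() parks the dentry */
		unlock_mount_hash();

		shrink_dentry_list(&disposal);   /* safe: no spinlocks held here */
	}
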
@@ -795,25 +801,17 @@
 /*
  * vfsmount lock must be held for write
  */
-static void unhash_mnt(struct mount *mnt)
+static struct mountpoint *unhash_mnt(struct mount *mnt)
 {
+	struct mountpoint *mp;
 	mnt->mnt_parent = mnt;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
 	hlist_del_init_rcu(&mnt->mnt_hash);
 	hlist_del_init(&mnt->mnt_mp_list);
-	put_mountpoint(mnt->mnt_mp);
+	mp = mnt->mnt_mp;
 	mnt->mnt_mp = NULL;
-}
-
-/*
- * vfsmount lock must be held for write
- */
-static void detach_mnt(struct mount *mnt, struct path *old_path)
-{
-	old_path->dentry = mnt->mnt_mountpoint;
-	old_path->mnt = &mnt->mnt_parent->mnt;
-	unhash_mnt(mnt);
+	return mp;
 }
 
 /*
@@ -821,9 +819,7 @@
  */
 static void umount_mnt(struct mount *mnt)
 {
-	/* old mountpoint will be dropped when we can do that */
-	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
-	unhash_mnt(mnt);
+	put_mountpoint(unhash_mnt(mnt));
 }
 
 /*
@@ -835,7 +831,7 @@
 {
 	mp->m_count++;
 	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
-	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
+	child_mnt->mnt_mountpoint = mp->m_dentry;
 	child_mnt->mnt_parent = mnt;
 	child_mnt->mnt_mp = mp;
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
@@ -862,7 +858,6 @@
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
 	struct mountpoint *old_mp = mnt->mnt_mp;
-	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
 	struct mount *old_parent = mnt->mnt_parent;
 
 	list_del_init(&mnt->mnt_child);
@@ -872,22 +867,6 @@
 	attach_mnt(mnt, parent, mp);
 
 	put_mountpoint(old_mp);
-
-	/*
-	 * Safely avoid even the suggestion this code might sleep or
-	 * lock the mount hash by taking advantage of the knowledge that
-	 * mnt_change_mountpoint will not release the final reference
-	 * to a mountpoint.
-	 *
-	 * During mounting, the mount passed in as the parent mount will
-	 * continue to use the old mountpoint and during unmounting, the
-	 * old mountpoint will continue to exist until namespace_unlock,
-	 * which happens well after mnt_change_mountpoint.
-	 */
-	spin_lock(&old_mountpoint->d_lock);
-	old_mountpoint->d_lockref.count--;
-	spin_unlock(&old_mountpoint->d_lock);
-
 	mnt_add_count(old_parent, -1);
 }
 
@@ -942,38 +921,81 @@
 	return p;
 }
 
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_create_mount - Create a mount for a configured superblock
+ * @fc: The configuration context with the superblock attached
+ *
+ * Create a mount to an already configured superblock.  If necessary, the
+ * caller should invoke vfs_get_tree() before calling this.
+ *
+ * Note that this does not attach the mount to anything.
+ */
+struct vfsmount *vfs_create_mount(struct fs_context *fc)
 {
 	struct mount *mnt;
-	struct dentry *root;
 
-	if (!type)
-		return ERR_PTR(-ENODEV);
+	if (!fc->root)
+		return ERR_PTR(-EINVAL);
 
-	mnt = alloc_vfsmnt(name);
+	mnt = alloc_vfsmnt(fc->source ?: "none");
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);
 
-	if (flags & SB_KERNMOUNT)
+	if (fc->sb_flags & SB_KERNMOUNT)
 		mnt->mnt.mnt_flags = MNT_INTERNAL;
 
-	root = mount_fs(type, flags, name, data);
-	if (IS_ERR(root)) {
-		mnt_free_id(mnt);
-		free_vfsmnt(mnt);
-		return ERR_CAST(root);
-	}
+	atomic_inc(&fc->root->d_sb->s_active);
+	mnt->mnt.mnt_sb		= fc->root->d_sb;
+	mnt->mnt.mnt_root	= dget(fc->root);
+	mnt->mnt_mountpoint	= mnt->mnt.mnt_root;
+	mnt->mnt_parent		= mnt;
 
-	mnt->mnt.mnt_root = root;
-	mnt->mnt.mnt_sb = root->d_sb;
-	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
-	mnt->mnt_parent = mnt;
 	lock_mount_hash();
-	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
 	unlock_mount_hash();
 	return &mnt->mnt;
 }
+EXPORT_SYMBOL(vfs_create_mount);
+
+struct vfsmount *fc_mount(struct fs_context *fc)
+{
+	int err = vfs_get_tree(fc);
+	if (!err) {
+		up_write(&fc->root->d_sb->s_umount);
+		return vfs_create_mount(fc);
+	}
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fc_mount);
+
+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+				int flags, const char *name,
+				void *data)
+{
+	struct fs_context *fc;
+	struct vfsmount *mnt;
+	int ret = 0;
+
+	if (!type)
+		return ERR_PTR(-EINVAL);
+
+	fc = fs_context_for_mount(type, flags);
+	if (IS_ERR(fc))
+		return ERR_CAST(fc);
+
+	if (name)
+		ret = vfs_parse_fs_string(fc, "source",
+					  name, strlen(name));
+	if (!ret)
+		ret = parse_monolithic_mount_data(fc, data);
+	if (!ret)
+		mnt = fc_mount(fc);
+	else
+		mnt = ERR_PTR(ret);
+
+	put_fs_context(fc);
+	return mnt;
+}
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
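
With this change vfs_kern_mount() becomes a thin legacy wrapper: it builds an fs_context, feeds it the source string and the monolithic data blob, and lets fc_mount() do vfs_get_tree() plus vfs_create_mount(). A kernel caller written against the new API directly would look roughly like the sketch below (error handling trimmed; it mirrors the wrapper above rather than adding anything new):

	static struct vfsmount *example_kern_mount(struct file_system_type *type)
	{
		struct fs_context *fc = fs_context_for_mount(type, SB_KERNMOUNT);
		struct vfsmount *mnt;

		if (IS_ERR(fc))
			return ERR_CAST(fc);
		/* fs parameters would be set here via vfs_parse_fs_string() */
		mnt = fc_mount(fc);	/* vfs_get_tree() + vfs_create_mount() */
		put_fs_context(fc);
		return mnt;
	}
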
 
 struct vfsmount *
@@ -1015,27 +1037,6 @@
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
 	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
-	/* Don't allow unprivileged users to change mount flags */
-	if (flag & CL_UNPRIVILEGED) {
-		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
-
-		if (mnt->mnt.mnt_flags & MNT_READONLY)
-			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
-
-		if (mnt->mnt.mnt_flags & MNT_NODEV)
-			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
-
-		if (mnt->mnt.mnt_flags & MNT_NOSUID)
-			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
-
-		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
-			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
-	}
-
-	/* Don't allow unprivileged users to reveal what is under a mount */
-	if ((flag & CL_UNPRIVILEGED) &&
-	    (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
-		mnt->mnt.mnt_flags |= MNT_LOCKED;
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
@@ -1080,19 +1081,22 @@
 
 static void cleanup_mnt(struct mount *mnt)
 {
+	struct hlist_node *p;
+	struct mount *m;
 	/*
-	 * This probably indicates that somebody messed
-	 * up a mnt_want/drop_write() pair.  If this
-	 * happens, the filesystem was probably unable
-	 * to make r/w->r/o transitions.
-	 */
-	/*
+	 * The warning here probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this happens, the
+	 * filesystem was probably unable to make r/w->r/o transitions.
 	 * The locking used to deal with mnt_count decrement provides barriers,
 	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
 	if (unlikely(mnt->mnt_pins.first))
 		mnt_pin_kill(mnt);
+	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
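
The WARN_ON(mnt_get_writers(mnt)) above fires when a mount is torn down with outstanding write references, i.e. an unbalanced mnt_want_write()/mnt_drop_write() pair. For reference, the expected pairing looks like this (hypothetical caller, not code from this patch):

	static int modify_something(struct path *path)
	{
		int err = mnt_want_write(path->mnt); /* pins the r/w state */

		if (err)
			return err;
		/* ... write to the filesystem ... */
		mnt_drop_write(path->mnt); /* must balance, or cleanup_mnt() warns */
		return 0;
	}
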
@@ -1118,6 +1122,8 @@
 
 static void mntput_no_expire(struct mount *mnt)
 {
+	LIST_HEAD(list);
+
 	rcu_read_lock();
 	if (likely(READ_ONCE(mnt->mnt_ns))) {
 		/*
@@ -1158,10 +1164,12 @@
 	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
 		struct mount *p, *tmp;
 		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
-			umount_mnt(p);
+			__put_mountpoint(unhash_mnt(p), &list);
+			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
 		}
 	}
 	unlock_mount_hash();
+	shrink_dentry_list(&list);
 
 	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
 		struct task_struct *task = current;
@@ -1347,22 +1355,29 @@
 
 EXPORT_SYMBOL(may_umount);
 
-static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
-
 static void namespace_unlock(void)
 {
 	struct hlist_head head;
+	struct hlist_node *p;
+	struct mount *m;
+	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
+	list_splice_init(&ex_mountpoints, &list);
 
 	up_write(&namespace_sem);
 
+	shrink_dentry_list(&list);
+
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu();
+	synchronize_rcu_expedited();
 
-	group_pin_kill(&head);
+	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
 }
 
 static inline void namespace_lock(void)
@@ -1448,9 +1463,6 @@
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 
 		disconnect = disconnect_mount(p, how);
-
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
 		if (mnt_has_parent(p)) {
 			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
@@ -1461,11 +1473,36 @@
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
+		if (disconnect)
+			hlist_add_head(&p->mnt_umount, &unmounted);
 	}
 }
 
 static void shrink_submounts(struct mount *mnt);
 
+static int do_umount_root(struct super_block *sb)
+{
+	int ret = 0;
+
+	down_write(&sb->s_umount);
+	if (!sb_rdonly(sb)) {
+		struct fs_context *fc;
+
+		fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
+						SB_RDONLY);
+		if (IS_ERR(fc)) {
+			ret = PTR_ERR(fc);
+		} else {
+			ret = parse_monolithic_mount_data(fc, NULL);
+			if (!ret)
+				ret = reconfigure_super(fc);
+			put_fs_context(fc);
+		}
+	}
+	up_write(&sb->s_umount);
+	return ret;
+}
+
 static int do_umount(struct mount *mnt, int flags)
 {
 	struct super_block *sb = mnt->mnt.mnt_sb;
@@ -1531,11 +1568,7 @@
 		 */
 		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 			return -EPERM;
-		down_write(&sb->s_umount);
-		if (!sb_rdonly(sb))
-			retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
-		up_write(&sb->s_umount);
-		return retval;
+		return do_umount_root(sb);
 	}
 
 	namespace_lock();
@@ -1584,15 +1617,15 @@
 	namespace_lock();
 	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
-	if (IS_ERR_OR_NULL(mp))
+	if (!mp)
 		goto out_unlock;
 
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
-			hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
 			umount_mnt(mnt);
+			hlist_add_head(&mnt->mnt_umount, &unmounted);
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
@@ -1610,13 +1643,18 @@
 	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
 }
 
+#ifdef	CONFIG_MANDATORY_FILE_LOCKING
 static inline bool may_mandlock(void)
 {
-#ifndef	CONFIG_MANDATORY_FILE_LOCKING
-	return false;
-#endif
 	return capable(CAP_SYS_ADMIN);
 }
+#else
+static inline bool may_mandlock(void)
+{
+	pr_warn("VFS: \"mand\" mount option not supported");
+	return false;
+}
+#endif
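
Behaviour is unchanged — do_mount() still rejects MS_MANDLOCK with -EPERM when may_mandlock() returns false — but kernels built without CONFIG_MANDATORY_FILE_LOCKING now log why. An illustrative userspace call (not from this patch):

	/* fails with EPERM on !CONFIG_MANDATORY_FILE_LOCKING, with a warning logged */
	if (mount("/dev/sda1", "/mnt", "ext4", MS_MANDLOCK, NULL) == -1)
		perror("mount with mand");
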
 
 /*
  * Now umount can handle mount points as well as block devices.
@@ -1790,6 +1828,27 @@
 	return &tree->mnt;
 }
 
+static void free_mnt_ns(struct mnt_namespace *);
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+
+void dissolve_on_fput(struct vfsmount *mnt)
+{
+	struct mnt_namespace *ns;
+	namespace_lock();
+	lock_mount_hash();
+	ns = real_mount(mnt)->mnt_ns;
+	if (ns) {
+		if (is_anon_ns(ns))
+			umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
+		else
+			ns = NULL;
+	}
+	unlock_mount_hash();
+	namespace_unlock();
+	if (ns)
+		free_mnt_ns(ns);
+}
+
 void drop_collected_mounts(struct vfsmount *mnt)
 {
 	namespace_lock();
@@ -1839,6 +1898,33 @@
 	return 0;
 }
 
+static void lock_mnt_tree(struct mount *mnt)
+{
+	struct mount *p;
+
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		int flags = p->mnt.mnt_flags;
+		/* Don't allow unprivileged users to change mount flags */
+		flags |= MNT_LOCK_ATIME;
+
+		if (flags & MNT_READONLY)
+			flags |= MNT_LOCK_READONLY;
+
+		if (flags & MNT_NODEV)
+			flags |= MNT_LOCK_NODEV;
+
+		if (flags & MNT_NOSUID)
+			flags |= MNT_LOCK_NOSUID;
+
+		if (flags & MNT_NOEXEC)
+			flags |= MNT_LOCK_NOEXEC;
+		/* Don't allow unprivileged users to reveal what is under a mount */
+		if (list_empty(&p->mnt_expire))
+			flags |= MNT_LOCKED;
+		p->mnt.mnt_flags = flags;
+	}
+}
+
 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
 {
 	struct mount *p;
@@ -1954,8 +2040,9 @@
 static int attach_recursive_mnt(struct mount *source_mnt,
 			struct mount *dest_mnt,
 			struct mountpoint *dest_mp,
-			struct path *parent_path)
+			bool moving)
 {
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
 	struct mountpoint *smp;
@@ -1971,7 +2058,7 @@
 		return PTR_ERR(smp);
 
 	/* Is there space to add these mounts to the mount namespace? */
-	if (!parent_path) {
+	if (!moving) {
 		err = count_mounts(ns, source_mnt);
 		if (err)
 			goto out;
@@ -1990,11 +2077,15 @@
 	} else {
 		lock_mount_hash();
 	}
-	if (parent_path) {
-		detach_mnt(source_mnt, parent_path);
+	if (moving) {
+		unhash_mnt(source_mnt);
 		attach_mnt(source_mnt, dest_mnt, dest_mp);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
+		if (source_mnt->mnt_ns) {
+			/* move from anon - the caller will destroy */
+			list_del_init(&source_mnt->mnt_ns->list);
+		}
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		commit_tree(source_mnt);
 	}
@@ -2006,6 +2097,10 @@
 				 child->mnt_mountpoint);
 		if (q)
 			mnt_change_mountpoint(child, smp, q);
+		/* Notice when we are propagating across user namespaces */
+		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
+			lock_mnt_tree(child);
+		child->mnt.mnt_flags &= ~MNT_LOCKED;
 		commit_tree(child);
 	}
 	put_mountpoint(smp);
@@ -2081,7 +2176,7 @@
 	      d_is_dir(mnt->mnt.mnt_root))
 		return -ENOTDIR;
 
-	return attach_recursive_mnt(mnt, p, mp, NULL);
+	return attach_recursive_mnt(mnt, p, mp, false);
 }
 
 /*
@@ -2149,6 +2244,30 @@
 	return false;
 }
 
+static struct mount *__do_loopback(struct path *old_path, int recurse)
+{
+	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+
+	if (IS_MNT_UNBINDABLE(old))
+		return mnt;
+
+	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+		return mnt;
+
+	if (!recurse && has_locked_children(old, old_path->dentry))
+		return mnt;
+
+	if (recurse)
+		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+	else
+		mnt = clone_mnt(old, old_path->dentry, 0);
+
+	if (!IS_ERR(mnt))
+		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+	return mnt;
+}
+
 /*
  * do loopback mount.
  */
@@ -2156,7 +2275,7 @@
 				int recurse)
 {
 	struct path old_path;
-	struct mount *mnt = NULL, *old, *parent;
+	struct mount *mnt = NULL, *parent;
 	struct mountpoint *mp;
 	int err;
 	if (!old_name || !*old_name)
@@ -2170,38 +2289,21 @@
 		goto out;
 
 	mp = lock_mount(path);
-	err = PTR_ERR(mp);
-	if (IS_ERR(mp))
+	if (IS_ERR(mp)) {
+		err = PTR_ERR(mp);
 		goto out;
+	}
 
-	old = real_mount(old_path.mnt);
 	parent = real_mount(path->mnt);
-
-	err = -EINVAL;
-	if (IS_MNT_UNBINDABLE(old))
-		goto out2;
-
 	if (!check_mnt(parent))
 		goto out2;
 
-	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
-		goto out2;
-
-	if (!recurse && has_locked_children(old, old_path.dentry))
-		goto out2;
-
-	if (recurse)
-		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
-	else
-		mnt = clone_mnt(old, old_path.dentry, 0);
-
+	mnt = __do_loopback(&old_path, recurse);
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
 		goto out2;
 	}
 
-	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
 		lock_mount_hash();
@@ -2215,21 +2317,206 @@
 	return err;
 }
 
-static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+static struct file *open_detached_copy(struct path *path, bool recursive)
 {
-	int error = 0;
-	int readonly_request = 0;
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+	struct mount *mnt, *p;
+	struct file *file;
 
-	if (ms_flags & MS_RDONLY)
-		readonly_request = 1;
-	if (readonly_request == __mnt_is_readonly(mnt))
+	if (IS_ERR(ns))
+		return ERR_CAST(ns);
+
+	namespace_lock();
+	mnt = __do_loopback(path, recursive);
+	if (IS_ERR(mnt)) {
+		namespace_unlock();
+		free_mnt_ns(ns);
+		return ERR_CAST(mnt);
+	}
+
+	lock_mount_hash();
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		p->mnt_ns = ns;
+		ns->mounts++;
+	}
+	ns->root = mnt;
+	list_add_tail(&ns->list, &mnt->mnt_list);
+	mntget(&mnt->mnt);
+	unlock_mount_hash();
+	namespace_unlock();
+
+	mntput(path->mnt);
+	path->mnt = &mnt->mnt;
+	file = dentry_open(path, O_PATH, current_cred());
+	if (IS_ERR(file))
+		dissolve_on_fput(path->mnt);
+	else
+		file->f_mode |= FMODE_NEED_UNMOUNT;
+	return file;
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+	struct file *file;
+	struct path path;
+	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+	bool detached = flags & OPEN_TREE_CLONE;
+	int error;
+	int fd;
+
+	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+
+	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
+		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
+		      OPEN_TREE_CLOEXEC))
+		return -EINVAL;
+
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+		return -EINVAL;
+
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	if (detached && !may_mount())
+		return -EPERM;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (unlikely(error)) {
+		file = ERR_PTR(error);
+	} else {
+		if (detached)
+			file = open_detached_copy(&path, flags & AT_RECURSIVE);
+		else
+			file = dentry_open(&path, O_PATH, current_cred());
+		path_put(&path);
+	}
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+	fd_install(fd, file);
+	return fd;
+}
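
open_tree() with OPEN_TREE_CLONE returns an O_PATH-style fd holding a detached copy of a mount subtree; the copy lives in an anonymous mount namespace until the fd is attached or closed (closing it unattached dissolves the copy via dissolve_on_fput()). A hedged userspace sketch — it assumes uapi headers new enough to provide SYS_open_tree, AT_RECURSIVE and the OPEN_TREE_* constants, and omits error handling:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	/* detached recursive copy of the mount tree at /mnt/src */
	int tree_fd = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
			      OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
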
+
+/*
+ * Don't allow locked mount flags to be cleared.
+ *
+ * No locks need to be held here while testing the various MNT_LOCK
+ * flags because those flags can never be cleared once they are set.
+ */
+static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
+{
+	unsigned int fl = mnt->mnt.mnt_flags;
+
+	if ((fl & MNT_LOCK_READONLY) &&
+	    !(mnt_flags & MNT_READONLY))
+		return false;
+
+	if ((fl & MNT_LOCK_NODEV) &&
+	    !(mnt_flags & MNT_NODEV))
+		return false;
+
+	if ((fl & MNT_LOCK_NOSUID) &&
+	    !(mnt_flags & MNT_NOSUID))
+		return false;
+
+	if ((fl & MNT_LOCK_NOEXEC) &&
+	    !(mnt_flags & MNT_NOEXEC))
+		return false;
+
+	if ((fl & MNT_LOCK_ATIME) &&
+	    ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
+		return false;
+
+	return true;
+}
+
+static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
+{
+	bool readonly_request = (mnt_flags & MNT_READONLY);
+
+	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
 		return 0;
 
 	if (readonly_request)
-		error = mnt_make_readonly(real_mount(mnt));
-	else
-		__mnt_unmake_readonly(real_mount(mnt));
-	return error;
+		return mnt_make_readonly(mnt);
+
+	return __mnt_unmake_readonly(mnt);
+}
+
+/*
+ * Update the user-settable attributes on a mount.  The caller must hold
+ * sb->s_umount for writing.
+ */
+static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
+{
+	lock_mount_hash();
+	mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
+	mnt->mnt.mnt_flags = mnt_flags;
+	touch_mnt_namespace(mnt->mnt_ns);
+	unlock_mount_hash();
+}
+
+static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
+{
+	struct super_block *sb = mnt->mnt_sb;
+
+	if (!__mnt_is_readonly(mnt) &&
+	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
+		char *buf = (char *)__get_free_page(GFP_KERNEL);
+		char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
+		struct tm tm;
+
+		time64_to_tm(sb->s_time_max, 0, &tm);
+
+		pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
+			sb->s_type->name,
+			is_mounted(mnt) ? "remounted" : "mounted",
+			mntpath,
+			tm.tm_year+1900, (unsigned long long)sb->s_time_max);
+
+		free_page((unsigned long)buf);
+	}
+}
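
As a concrete case: a filesystem carrying 32-bit signed timestamps reports sb->s_time_max = 0x7fffffff (19 January 2038). Assuming TIME_UPTIME_SEC_MAX is the roughly 30-year uptime allowance defined in linux/time64.h, ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX already exceeds that s_time_max, so such a filesystem mounted read/write triggers this warning at mount time rather than failing quietly in 2038.
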
+
+/*
+ * Handle reconfiguration of the mountpoint only without alteration of the
+ * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
+ * to mount(2).
+ */
+static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
+{
+	struct super_block *sb = path->mnt->mnt_sb;
+	struct mount *mnt = real_mount(path->mnt);
+	int ret;
+
+	if (!check_mnt(mnt))
+		return -EINVAL;
+
+	if (path->dentry != mnt->mnt.mnt_root)
+		return -EINVAL;
+
+	if (!can_change_locked_flags(mnt, mnt_flags))
+		return -EPERM;
+
+	down_write(&sb->s_umount);
+	ret = change_mount_ro_state(mnt, mnt_flags);
+	if (ret == 0)
+		set_mount_attributes(mnt, mnt_flags);
+	up_write(&sb->s_umount);
+
+	mnt_warn_timestamp_expiry(path, &mnt->mnt);
+
+	return ret;
 }
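
MS_REMOUNT|MS_BIND now routes here and touches only the vfsmount flags, never the superblock, so other mounts of the same sb are unaffected. The classic userspace use (illustrative):

	#include <sys/mount.h>

	/* make this one mountpoint read-only; other mounts of the sb stay r/w */
	if (mount(NULL, "/mnt/data", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) == -1)
		perror("bind-remount");
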
 
 /*
@@ -2243,6 +2530,7 @@
 	int err;
 	struct super_block *sb = path->mnt->mnt_sb;
 	struct mount *mnt = real_mount(path->mnt);
+	struct fs_context *fc;
 
 	if (!check_mnt(mnt))
 		return -EINVAL;
@@ -2250,52 +2538,28 @@
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
-	/* Don't allow changing of locked mnt flags.
-	 *
-	 * No locks need to be held here while testing the various
-	 * MNT_LOCK flags because those flags can never be cleared
-	 * once they are set.
-	 */
-	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
-	    !(mnt_flags & MNT_READONLY)) {
+	if (!can_change_locked_flags(mnt, mnt_flags))
 		return -EPERM;
-	}
-	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
-	    !(mnt_flags & MNT_NODEV)) {
-		return -EPERM;
-	}
-	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
-	    !(mnt_flags & MNT_NOSUID)) {
-		return -EPERM;
-	}
-	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
-	    !(mnt_flags & MNT_NOEXEC)) {
-		return -EPERM;
-	}
-	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
-	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
-		return -EPERM;
-	}
 
-	err = security_sb_remount(sb, data);
-	if (err)
-		return err;
+	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
 
-	down_write(&sb->s_umount);
-	if (ms_flags & MS_BIND)
-		err = change_mount_flags(path->mnt, ms_flags);
-	else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
-		err = -EPERM;
-	else
-		err = do_remount_sb(sb, sb_flags, data, 0);
+	err = parse_monolithic_mount_data(fc, data);
 	if (!err) {
-		lock_mount_hash();
-		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
-		mnt->mnt.mnt_flags = mnt_flags;
-		touch_mnt_namespace(mnt->mnt_ns);
-		unlock_mount_hash();
+		down_write(&sb->s_umount);
+		err = -EPERM;
+		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+			err = reconfigure_super(fc);
+			if (!err)
+				set_mount_attributes(mnt, mnt_flags);
+		}
+		up_write(&sb->s_umount);
 	}
-	up_write(&sb->s_umount);
+
+	mnt_warn_timestamp_expiry(path, &mnt->mnt);
+
+	put_fs_context(fc);
 	return err;
 }
 
@@ -2309,97 +2573,125 @@
 	return 0;
 }
 
-static int do_move_mount(struct path *path, const char *old_name)
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree.  Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
 {
-	struct path old_path, parent_path;
+	struct mount *p;
+	bool ret = false;
+
+	lock_mount_hash();
+	for (p = subtree; p; p = next_mnt(p, subtree))
+		if (mnt_ns_loop(p->mnt.mnt_root))
+			goto out;
+
+	ret = true;
+out:
+	unlock_mount_hash();
+	return ret;
+}
+
+static int do_move_mount(struct path *old_path, struct path *new_path)
+{
+	struct mnt_namespace *ns;
 	struct mount *p;
 	struct mount *old;
-	struct mountpoint *mp;
+	struct mount *parent;
+	struct mountpoint *mp, *old_mp;
 	int err;
-	if (!old_name || !*old_name)
-		return -EINVAL;
-	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
-	if (err)
-		return err;
+	bool attached;
 
-	mp = lock_mount(path);
-	err = PTR_ERR(mp);
+	mp = lock_mount(new_path);
 	if (IS_ERR(mp))
+		return PTR_ERR(mp);
+
+	old = real_mount(old_path->mnt);
+	p = real_mount(new_path->mnt);
+	parent = old->mnt_parent;
+	attached = mnt_has_parent(old);
+	old_mp = old->mnt_mp;
+	ns = old->mnt_ns;
+
+	err = -EINVAL;
+	/* The mountpoint must be in our namespace. */
+	if (!check_mnt(p))
 		goto out;
 
-	old = real_mount(old_path.mnt);
-	p = real_mount(path->mnt);
+	/* The thing moved must be mounted... */
+	if (!is_mounted(&old->mnt))
+		goto out;
 
-	err = -EINVAL;
-	if (!check_mnt(p) || !check_mnt(old))
-		goto out1;
+	/* ... and either ours or the root of anon namespace */
+	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
+		goto out;
 
 	if (old->mnt.mnt_flags & MNT_LOCKED)
-		goto out1;
+		goto out;
 
-	err = -EINVAL;
-	if (old_path.dentry != old_path.mnt->mnt_root)
-		goto out1;
+	if (old_path->dentry != old_path->mnt->mnt_root)
+		goto out;
 
-	if (!mnt_has_parent(old))
-		goto out1;
-
-	if (d_is_dir(path->dentry) !=
-	      d_is_dir(old_path.dentry))
-		goto out1;
+	if (d_is_dir(new_path->dentry) !=
+	    d_is_dir(old_path->dentry))
+		goto out;
 	/*
 	 * Don't move a mount residing in a shared parent.
 	 */
-	if (IS_MNT_SHARED(old->mnt_parent))
-		goto out1;
+	if (attached && IS_MNT_SHARED(parent))
+		goto out;
 	/*
 	 * Don't move a mount tree containing unbindable mounts to a destination
 	 * mount which is shared.
 	 */
 	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
-		goto out1;
+		goto out;
 	err = -ELOOP;
+	if (!check_for_nsfs_mounts(old))
+		goto out;
 	for (; mnt_has_parent(p); p = p->mnt_parent)
 		if (p == old)
-			goto out1;
+			goto out;
 
-	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
+	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
+				   attached);
 	if (err)
-		goto out1;
+		goto out;
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
 	list_del_init(&old->mnt_expire);
-out1:
-	unlock_mount(mp);
+	if (attached)
+		put_mountpoint(old_mp);
 out:
-	if (!err)
-		path_put(&parent_path);
-	path_put(&old_path);
+	unlock_mount(mp);
+	if (!err) {
+		if (attached)
+			mntput_no_expire(parent);
+		else
+			free_mnt_ns(ns);
+	}
 	return err;
 }
 
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
+static int do_move_mount_old(struct path *path, const char *old_name)
 {
+	struct path old_path;
 	int err;
-	const char *subtype = strchr(fstype, '.');
-	if (subtype) {
-		subtype++;
-		err = -EINVAL;
-		if (!subtype[0])
-			goto err;
-	} else
-		subtype = "";
 
-	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!mnt->mnt_sb->s_subtype)
-		goto err;
-	return mnt;
+	if (!old_name || !*old_name)
+		return -EINVAL;
 
- err:
-	mntput(mnt);
-	return ERR_PTR(err);
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+	if (err)
+		return err;
+
+	err = do_move_mount(&old_path, path);
+	path_put(&old_path);
+	return err;
 }
 
 /*
@@ -2446,7 +2738,41 @@
 	return err;
 }
 
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
+
+/*
+ * Create a new mount using a superblock configuration and request it
+ * be added to the namespace tree.
+ */
+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+			   unsigned int mnt_flags)
+{
+	struct vfsmount *mnt;
+	struct super_block *sb = fc->root->d_sb;
+	int error;
+
+	error = security_sb_kern_mount(sb);
+	if (!error && mount_too_revealing(sb, &mnt_flags))
+		error = -EPERM;
+
+	if (unlikely(error)) {
+		fc_drop_locked(fc);
+		return error;
+	}
+
+	up_write(&sb->s_umount);
+
+	mnt = vfs_create_mount(fc);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	mnt_warn_timestamp_expiry(mountpoint, mnt);
+
+	error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+	if (error < 0)
+		mntput(mnt);
+	return error;
+}
 
 /*
  * create a new mount for userspace and request it to be added into the
@@ -2456,8 +2782,9 @@
 			int mnt_flags, const char *name, void *data)
 {
 	struct file_system_type *type;
-	struct vfsmount *mnt;
-	int err;
+	struct fs_context *fc;
+	const char *subtype = NULL;
+	int err = 0;
 
 	if (!fstype)
 		return -EINVAL;
@@ -2466,23 +2793,37 @@
 	if (!type)
 		return -ENODEV;
 
-	mnt = vfs_kern_mount(type, sb_flags, name, data);
-	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
-	    !mnt->mnt_sb->s_subtype)
-		mnt = fs_set_subtype(mnt, fstype);
-
-	put_filesystem(type);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
-
-	if (mount_too_revealing(mnt, &mnt_flags)) {
-		mntput(mnt);
-		return -EPERM;
+	if (type->fs_flags & FS_HAS_SUBTYPE) {
+		subtype = strchr(fstype, '.');
+		if (subtype) {
+			subtype++;
+			if (!*subtype) {
+				put_filesystem(type);
+				return -EINVAL;
+			}
+		}
 	}
 
-	err = do_add_mount(real_mount(mnt), path, mnt_flags);
-	if (err)
-		mntput(mnt);
+	fc = fs_context_for_mount(type, sb_flags);
+	put_filesystem(type);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	if (subtype)
+		err = vfs_parse_fs_string(fc, "subtype",
+					  subtype, strlen(subtype));
+	if (!err && name)
+		err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+	if (!err)
+		err = parse_monolithic_mount_data(fc, data);
+	if (!err && !mount_capable(fc))
+		err = -EPERM;
+	if (!err)
+		err = vfs_get_tree(fc);
+	if (!err)
+		err = do_new_mount_fc(fc, path, mnt_flags);
+
+	put_fs_context(fc);
 	return err;
 }
 
@@ -2651,7 +2992,7 @@
 	const char __user *f = from;
 	char c;
 
-	if (!access_ok(VERIFY_READ, from, n))
+	if (!access_ok(from, n))
 		return n;
 
 	while (n) {
@@ -2684,7 +3025,7 @@
 	 * the remainder of the page.
 	 */
 	/* copy_from_user cannot cross TASK_SIZE ! */
-	size = TASK_SIZE - (unsigned long)data;
+	size = TASK_SIZE - (unsigned long)untagged_addr(data);
 	if (size > PAGE_SIZE)
 		size = PAGE_SIZE;
 
@@ -2700,7 +3041,7 @@
 
 char *copy_mount_string(const void __user *data)
 {
-	return data ? strndup_user(data, PAGE_SIZE) : NULL;
+	return data ? strndup_user(data, PATH_MAX) : NULL;
 }
 
 /*
@@ -2736,7 +3077,7 @@
 		return -EINVAL;
 
 	/* ... and get the mountpoint */
-	retval = user_path(dir_name, &path);
+	retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
 	if (retval)
 		return retval;
 
@@ -2786,7 +3127,9 @@
 			    SB_LAZYTIME |
 			    SB_I_VERSION);
 
-	if (flags & MS_REMOUNT)
+	if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
+		retval = do_reconfigure_mnt(&path, mnt_flags);
+	else if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags, sb_flags, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
@@ -2794,7 +3137,7 @@
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
-		retval = do_move_mount(&path, dev_name);
+		retval = do_move_mount_old(&path, dev_name);
 	else
 		retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
 				      dev_name, data_page);
@@ -2815,7 +3158,8 @@
 
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
-	ns_free_inum(&ns->ns);
+	if (!is_anon_ns(ns))
+		ns_free_inum(&ns->ns);
 	dec_mnt_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	kfree(ns);
@@ -2830,7 +3174,7 @@
  */
 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 
-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
 {
 	struct mnt_namespace *new_ns;
 	struct ucounts *ucounts;
@@ -2840,28 +3184,27 @@
 	if (!ucounts)
 		return ERR_PTR(-ENOSPC);
 
-	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns) {
 		dec_mnt_namespaces(ucounts);
 		return ERR_PTR(-ENOMEM);
 	}
-	ret = ns_alloc_inum(&new_ns->ns);
-	if (ret) {
-		kfree(new_ns);
-		dec_mnt_namespaces(ucounts);
-		return ERR_PTR(ret);
+	if (!anon) {
+		ret = ns_alloc_inum(&new_ns->ns);
+		if (ret) {
+			kfree(new_ns);
+			dec_mnt_namespaces(ucounts);
+			return ERR_PTR(ret);
+		}
 	}
 	new_ns->ns.ops = &mntns_operations;
-	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+	if (!anon)
+		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	atomic_set(&new_ns->count, 1);
-	new_ns->root = NULL;
 	INIT_LIST_HEAD(&new_ns->list);
 	init_waitqueue_head(&new_ns->poll);
-	new_ns->event = 0;
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
-	new_ns->mounts = 0;
-	new_ns->pending_mounts = 0;
 	return new_ns;
 }
 
@@ -2885,7 +3228,7 @@
 
 	old = ns->root;
 
-	new_ns = alloc_mnt_ns(user_ns);
+	new_ns = alloc_mnt_ns(user_ns, false);
 	if (IS_ERR(new_ns))
 		return new_ns;
 
@@ -2893,13 +3236,18 @@
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
-		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+		copy_flags |= CL_SHARED_TO_SLAVE;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		namespace_unlock();
 		free_mnt_ns(new_ns);
 		return ERR_CAST(new);
 	}
+	if (user_ns != ns->user_ns) {
+		lock_mount_hash();
+		lock_mnt_tree(new);
+		unlock_mount_hash();
+	}
 	new_ns->root = new;
 	list_add_tail(&new_ns->list, &new->mnt_list);
 
@@ -2940,37 +3288,25 @@
 	return new_ns;
 }
 
-/**
- * create_mnt_ns - creates a private namespace and adds a root filesystem
- * @mnt: pointer to the new root filesystem mountpoint
- */
-static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
+struct dentry *mount_subtree(struct vfsmount *m, const char *name)
 {
-	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
-	if (!IS_ERR(new_ns)) {
-		struct mount *mnt = real_mount(m);
-		mnt->mnt_ns = new_ns;
-		new_ns->root = mnt;
-		new_ns->mounts++;
-		list_add(&mnt->mnt_list, &new_ns->list);
-	} else {
-		mntput(m);
-	}
-	return new_ns;
-}
-
-struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
-{
+	struct mount *mnt = real_mount(m);
 	struct mnt_namespace *ns;
 	struct super_block *s;
 	struct path path;
 	int err;
 
-	ns = create_mnt_ns(mnt);
-	if (IS_ERR(ns))
+	ns = alloc_mnt_ns(&init_user_ns, true);
+	if (IS_ERR(ns)) {
+		mntput(m);
 		return ERR_CAST(ns);
+	}
+	mnt->mnt_ns = ns;
+	ns->root = mnt;
+	ns->mounts++;
+	list_add(&mnt->mnt_list, &ns->list);
 
-	err = vfs_path_lookup(mnt->mnt_root, mnt,
+	err = vfs_path_lookup(m->mnt_root, m,
 			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
 
 	put_mnt_ns(ns);
@@ -2989,8 +3325,8 @@
 }
 EXPORT_SYMBOL(mount_subtree);
 
-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
-	       unsigned long flags, void __user *data)
+int ksys_mount(const char __user *dev_name, const char __user *dir_name,
+	       const char __user *type, unsigned long flags, void __user *data)
 {
 	int ret;
 	char *kernel_type;
@@ -3030,6 +3366,204 @@
 }
 
 /*
+ * Create a kernel mount representation for a new, prepared superblock
+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
+ */
+SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
+		unsigned int, attr_flags)
+{
+	struct mnt_namespace *ns;
+	struct fs_context *fc;
+	struct file *file;
+	struct path newmount;
+	struct mount *mnt;
+	struct fd f;
+	unsigned int mnt_flags = 0;
+	long ret;
+
+	if (!may_mount())
+		return -EPERM;
+
+	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
+		return -EINVAL;
+
+	if (attr_flags & ~(MOUNT_ATTR_RDONLY |
+			   MOUNT_ATTR_NOSUID |
+			   MOUNT_ATTR_NODEV |
+			   MOUNT_ATTR_NOEXEC |
+			   MOUNT_ATTR__ATIME |
+			   MOUNT_ATTR_NODIRATIME))
+		return -EINVAL;
+
+	if (attr_flags & MOUNT_ATTR_RDONLY)
+		mnt_flags |= MNT_READONLY;
+	if (attr_flags & MOUNT_ATTR_NOSUID)
+		mnt_flags |= MNT_NOSUID;
+	if (attr_flags & MOUNT_ATTR_NODEV)
+		mnt_flags |= MNT_NODEV;
+	if (attr_flags & MOUNT_ATTR_NOEXEC)
+		mnt_flags |= MNT_NOEXEC;
+	if (attr_flags & MOUNT_ATTR_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+
+	switch (attr_flags & MOUNT_ATTR__ATIME) {
+	case MOUNT_ATTR_STRICTATIME:
+		break;
+	case MOUNT_ATTR_NOATIME:
+		mnt_flags |= MNT_NOATIME;
+		break;
+	case MOUNT_ATTR_RELATIME:
+		mnt_flags |= MNT_RELATIME;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	f = fdget(fs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EINVAL;
+	if (f.file->f_op != &fscontext_fops)
+		goto err_fsfd;
+
+	fc = f.file->private_data;
+
+	ret = mutex_lock_interruptible(&fc->uapi_mutex);
+	if (ret < 0)
+		goto err_fsfd;
+
+	/* There must be a valid superblock or we can't mount it */
+	ret = -EINVAL;
+	if (!fc->root)
+		goto err_unlock;
+
+	ret = -EPERM;
+	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
+		pr_warn("VFS: Mount too revealing\n");
+		goto err_unlock;
+	}
+
+	ret = -EBUSY;
+	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
+		goto err_unlock;
+
+	ret = -EPERM;
+	if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
+		goto err_unlock;
+
+	newmount.mnt = vfs_create_mount(fc);
+	if (IS_ERR(newmount.mnt)) {
+		ret = PTR_ERR(newmount.mnt);
+		goto err_unlock;
+	}
+	newmount.dentry = dget(fc->root);
+	newmount.mnt->mnt_flags = mnt_flags;
+
+	/* We've done the mount bit - now move the file context into more or
+	 * less the same state as if we'd done an fspick().  We don't want to
+	 * do any memory allocation or anything like that at this point as we
+	 * don't want to have to handle any errors incurred.
+	 */
+	vfs_clean_context(fc);
+
+	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
+	if (IS_ERR(ns)) {
+		ret = PTR_ERR(ns);
+		goto err_path;
+	}
+	mnt = real_mount(newmount.mnt);
+	mnt->mnt_ns = ns;
+	ns->root = mnt;
+	ns->mounts = 1;
+	list_add(&mnt->mnt_list, &ns->list);
+	mntget(newmount.mnt);
+
+	/* Attach to an apparent O_PATH fd with a note that we need to unmount
+	 * it, not just simply put it.
+	 */
+	file = dentry_open(&newmount, O_PATH, fc->cred);
+	if (IS_ERR(file)) {
+		dissolve_on_fput(newmount.mnt);
+		ret = PTR_ERR(file);
+		goto err_path;
+	}
+	file->f_mode |= FMODE_NEED_UNMOUNT;
+
+	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
+	if (ret >= 0)
+		fd_install(ret, file);
+	else
+		fput(file);
+
+err_path:
+	path_put(&newmount);
+err_unlock:
+	mutex_unlock(&fc->uapi_mutex);
+err_fsfd:
+	fdput(f);
+	return ret;
+}
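
Together with fsopen()/fsconfig() (implemented elsewhere in this series), the intended userspace sequence is roughly the sketch below — raw syscalls with no error handling, assuming uapi headers that provide the SYS_fs* numbers and the FSOPEN_*/FSCONFIG_*/FSMOUNT_*/MOUNT_ATTR_* constants:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int fsfd = syscall(SYS_fsopen, "ext4", FSOPEN_CLOEXEC);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	/* sb now sits in FS_CONTEXT_AWAITING_MOUNT; turn it into a mount fd */
	int mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_RDONLY);

The new mount fd is then attached somewhere with move_mount(), as shown after the next function.
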
+
+/*
+ * Move a mount from one place to another.  In combination with
+ * fsopen()/fsmount() this is used to install a new mount and in combination
+ * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
+ * a mount subtree.
+ *
+ * Note the flags value is a combination of MOVE_MOUNT_* flags.
+ */
+SYSCALL_DEFINE5(move_mount,
+		int, from_dfd, const char __user *, from_pathname,
+		int, to_dfd, const char __user *, to_pathname,
+		unsigned int, flags)
+{
+	struct path from_path, to_path;
+	unsigned int lflags;
+	int ret = 0;
+
+	if (!may_mount())
+		return -EPERM;
+
+	if (flags & ~MOVE_MOUNT__MASK)
+		return -EINVAL;
+
+	/* If someone gives a pathname, they aren't permitted to move
+	 * from an fd that requires unmount as we can't get at the flag
+	 * to clear it afterwards.
+	 */
+	lflags = 0;
+	if (flags & MOVE_MOUNT_F_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
+	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
+	if (flags & MOVE_MOUNT_F_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
+	if (ret < 0)
+		return ret;
+
+	lflags = 0;
+	if (flags & MOVE_MOUNT_T_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
+	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
+	if (flags & MOVE_MOUNT_T_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
+	if (ret < 0)
+		goto out_from;
+
+	ret = security_move_mount(&from_path, &to_path);
+	if (ret < 0)
+		goto out_to;
+
+	ret = do_move_mount(&from_path, &to_path);
+
+out_to:
+	path_put(&to_path);
+out_from:
+	path_put(&from_path);
+	return ret;
+}
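
Continuing the fsmount()/open_tree() sketches above, MOVE_MOUNT_F_EMPTY_PATH is what lets such a detached fd be grafted in — the "copy a mount subtree" case from the comment (illustrative, error handling omitted):

	/* attach the detached mount in mfd (or an OPEN_TREE_CLONE fd) at /mnt/dst */
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt/dst",
		MOVE_MOUNT_F_EMPTY_PATH);
	close(mfd);	/* the attached tree stays mounted */
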
+
+/*
  * Return true if path is reachable from root
  *
  * namespace_sem or mount_lock is held
@@ -3082,19 +3616,21 @@
 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		const char __user *, put_old)
 {
-	struct path new, old, parent_path, root_parent, root;
-	struct mount *new_mnt, *root_mnt, *old_mnt;
+	struct path new, old, root;
+	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
 	struct mountpoint *old_mp, *root_mp;
 	int error;
 
 	if (!may_mount())
 		return -EPERM;
 
-	error = user_path_dir(new_root, &new);
+	error = user_path_at(AT_FDCWD, new_root,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
 	if (error)
 		goto out0;
 
-	error = user_path_dir(put_old, &old);
+	error = user_path_at(AT_FDCWD, put_old,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
 	if (error)
 		goto out1;
 
@@ -3112,9 +3648,11 @@
 	new_mnt = real_mount(new.mnt);
 	root_mnt = real_mount(root.mnt);
 	old_mnt = real_mount(old.mnt);
+	ex_parent = new_mnt->mnt_parent;
+	root_parent = root_mnt->mnt_parent;
 	if (IS_MNT_SHARED(old_mnt) ||
-		IS_MNT_SHARED(new_mnt->mnt_parent) ||
-		IS_MNT_SHARED(root_mnt->mnt_parent))
+		IS_MNT_SHARED(ex_parent) ||
+		IS_MNT_SHARED(root_parent))
 		goto out4;
 	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
 		goto out4;
@@ -3131,7 +3669,6 @@
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
 		goto out4; /* not attached */
-	root_mp = root_mnt->mnt_mp;
 	if (new.mnt->mnt_root != new.dentry)
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
@@ -3142,10 +3679,9 @@
 	/* make certain new is below the root */
 	if (!is_path_reachable(new_mnt, new.dentry, &root))
 		goto out4;
-	root_mp->m_count++; /* pin it so it won't go away */
 	lock_mount_hash();
-	detach_mnt(new_mnt, &parent_path);
-	detach_mnt(root_mnt, &root_parent);
+	umount_mnt(new_mnt);
+	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
 		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
 		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
@@ -3153,7 +3689,8 @@
 	/* mount old root on put_old */
 	attach_mnt(root_mnt, old_mnt, old_mp);
 	/* mount new_root on / */
-	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
+	attach_mnt(new_mnt, root_parent, root_mp);
+	mnt_add_count(root_parent, -1);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */
 	list_del_init(&new_mnt->mnt_expire);
@@ -3163,10 +3700,8 @@
 	error = 0;
 out4:
 	unlock_mount(old_mp);
-	if (!error) {
-		path_put(&root_parent);
-		path_put(&parent_path);
-	}
+	if (!error)
+		mntput_no_expire(ex_parent);
 out3:
 	path_put(&root);
 out2:
@@ -3180,22 +3715,22 @@
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
+	struct mount *m;
 	struct mnt_namespace *ns;
 	struct path root;
-	struct file_system_type *type;
 
-	type = get_fs_type("rootfs");
-	if (!type)
-		panic("Can't find rootfs type");
-	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
-	put_filesystem(type);
+	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
-	ns = create_mnt_ns(mnt);
+	ns = alloc_mnt_ns(&init_user_ns, false);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
-
+	m = real_mount(mnt);
+	m->mnt_ns = ns;
+	ns->root = m;
+	ns->mounts = 1;
+	list_add(&m->mnt_list, &ns->list);
 	init_task.nsproxy->mnt_ns = ns;
 	get_mnt_ns(ns);
 
@@ -3237,6 +3772,7 @@
 	fs_kobj = kobject_create_and_add("fs", NULL);
 	if (!fs_kobj)
 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+	shmem_init();
 	init_rootfs();
 	init_mount_tree();
 }
@@ -3249,10 +3785,10 @@
 	free_mnt_ns(ns);
 }
 
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+struct vfsmount *kern_mount(struct file_system_type *type)
 {
 	struct vfsmount *mnt;
-	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
+	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
 	if (!IS_ERR(mnt)) {
 		/*
 		 * it is a longterm mount, don't release mnt until
@@ -3262,7 +3798,7 @@
 	}
 	return mnt;
 }
-EXPORT_SYMBOL_GPL(kern_mount_data);
+EXPORT_SYMBOL_GPL(kern_mount);
 
 void kern_unmount(struct vfsmount *mnt)
 {
@@ -3304,7 +3840,8 @@
 	return chrooted;
 }
 
-static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+static bool mnt_already_visible(struct mnt_namespace *ns,
+				const struct super_block *sb,
 				int *new_mnt_flags)
 {
 	int new_flags = *new_mnt_flags;
@@ -3316,7 +3853,7 @@
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
+		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
 			continue;
 
 		/* This mount is not fully visible if it's root directory
@@ -3367,7 +3904,7 @@
 	return visible;
 }
 
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
 {
 	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
@@ -3377,7 +3914,7 @@
 		return false;
 
 	/* Can this filesystem be too revealing? */
-	s_iflags = mnt->mnt_sb->s_iflags;
+	s_iflags = sb->s_iflags;
 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
 		return false;
 
@@ -3387,7 +3924,7 @@
 		return true;
 	}
 
-	return !mnt_already_visible(ns, mnt, new_mnt_flags);
+	return !mnt_already_visible(ns, sb, new_mnt_flags);
 }
 
 bool mnt_may_suid(struct vfsmount *mnt)
@@ -3436,6 +3973,9 @@
 	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (is_anon_ns(mnt_ns))
+		return -EINVAL;
+
 	if (fs->users != 1)
 		return -EINVAL;
 
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 5f93cfa..295a7a2 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config NFS_FS
 	tristate "NFS client support"
 	depends on INET && FILE_LOCKING && MULTIUSER
@@ -121,7 +122,6 @@
 config PNFS_BLOCK
 	tristate
 	depends on NFS_V4_1 && BLK_DEV_DM
-	depends on 64BIT || LBDAF
 	default NFS_V4
 
 config PNFS_FLEXFILE_LAYOUT
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index c587e3c..34cdeae 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -8,7 +8,8 @@
 CFLAGS_nfstrace.o += -I$(src)
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
 			   io.o direct.o pagelist.o read.o symlink.o unlink.o \
-			   write.o namespace.o mount_clnt.o nfstrace.o export.o
+			   write.o namespace.o mount_clnt.o nfstrace.o \
+			   export.o sysfs.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_SYSCTL)	+= sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 3ca14c3..7668a1b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the pNFS block layout driver kernel module
 #
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 06cb0c1..6902217 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -584,7 +584,7 @@
 
 static struct nfs4_deviceid_node *
 bl_find_get_deviceid(struct nfs_server *server,
-		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		const struct nfs4_deviceid *id, const struct cred *cred,
 		gfp_t gfp_mask)
 {
 	struct nfs4_deviceid_node *node;
@@ -896,7 +896,7 @@
 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 	if (end != inode->i_mapping->nrpages) {
 		rcu_read_lock();
-		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+		end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
 		rcu_read_unlock();
 	}
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 509dc5a..7817ad9 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -41,11 +41,13 @@
 
 static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
 {
+	const struct cred *cred = current_cred();
 	int ret;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 
 	ret = svc_create_xprt(serv, "tcp", net, PF_INET,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+				cred);
 	if (ret <= 0)
 		goto out_err;
 	nn->nfs_callback_tcpport = ret;
@@ -53,10 +55,11 @@
 		nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
 
 	ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+				cred);
 	if (ret > 0) {
 		nn->nfs_callback_tcpport6 = ret;
-		dprintk("NFS: Callback listener port = %u (af %u, net %x\n",
+		dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
 			nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum);
 	} else if (ret != -EAFNOSUPPORT)
 		goto out_err;
@@ -206,11 +209,13 @@
 		goto err_bind;
 	}
 
-	ret = -EPROTONOSUPPORT;
+	ret = 0;
 	if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0)
 		ret = nfs4_callback_up_net(serv, net);
-	else if (xprt->ops->bc_up)
-		ret = xprt->ops->bc_up(serv, net);
+	else if (xprt->ops->bc_setup)
+		set_bc_enabled(serv);
+	else
+		ret = -EPROTONOSUPPORT;
 
 	if (ret < 0) {
 		printk(KERN_ERR "NFS: callback service start failed\n");
@@ -455,4 +460,6 @@
 	.pg_class = "nfs",				/* authentication class */
 	.pg_stats = &nfs4_callback_stats,
 	.pg_authenticate = nfs_callback_authenticate,
+	.pg_init_request = svc_generic_init_request,
+	.pg_rpcbind_set	= svc_generic_rpcbind_set,
 };
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 3159673..f39924b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -414,27 +414,39 @@
 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
 		const struct cb_sequenceargs * args)
 {
+	__be32 ret;
+
+	ret = cpu_to_be32(NFS4ERR_BADSLOT);
 	if (args->csa_slotid > tbl->server_highest_slotid)
-		return htonl(NFS4ERR_BADSLOT);
+		goto out_err;
 
 	/* Replay */
 	if (args->csa_sequenceid == slot->seq_nr) {
+		ret = cpu_to_be32(NFS4ERR_DELAY);
 		if (nfs4_test_locked_slot(tbl, slot->slot_nr))
-			return htonl(NFS4ERR_DELAY);
+			goto out_err;
+
 		/* Signal process_op to set this error on next op */
+		ret = cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP);
 		if (args->csa_cachethis == 0)
-			return htonl(NFS4ERR_RETRY_UNCACHED_REP);
+			goto out_err;
 
 		/* Liar! We never allowed you to set csa_cachethis != 0 */
-		return htonl(NFS4ERR_SEQ_FALSE_RETRY);
+		ret = cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY);
+		goto out_err;
 	}
 
 	/* Note: wraparound relies on seq_nr being of type u32 */
-	if (likely(args->csa_sequenceid == slot->seq_nr + 1))
-		return htonl(NFS4_OK);
-
 	/* Misordered request */
-	return htonl(NFS4ERR_SEQ_MISORDERED);
+	ret = cpu_to_be32(NFS4ERR_SEQ_MISORDERED);
+	if (args->csa_sequenceid != slot->seq_nr + 1)
+		goto out_err;
+
+	return cpu_to_be32(NFS4_OK);
+
+out_err:
+	trace_nfs4_cb_seqid_err(args, ret);
+	return ret;
 }
 
 /*
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a87a562..73a5a5e 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -72,16 +72,6 @@
 	return xdr_ressize_check(rqstp, p);
 }
 
-static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
-{
-	__be32 *p;
-
-	p = xdr_inline_decode(xdr, nbytes);
-	if (unlikely(p == NULL))
-		printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
-	return p;
-}
-
 static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
 		const char **str, size_t maxlen)
 {
@@ -98,13 +88,13 @@
 {
 	__be32 *p;
 
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	fh->size = ntohl(*p);
 	if (fh->size > NFS4_FHSIZE)
 		return htonl(NFS4ERR_BADHANDLE);
-	p = read_buf(xdr, fh->size);
+	p = xdr_inline_decode(xdr, fh->size);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	memcpy(&fh->data[0], p, fh->size);
@@ -117,11 +107,11 @@
 	__be32 *p;
 	unsigned int attrlen;
 
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	attrlen = ntohl(*p);
-	p = read_buf(xdr, attrlen << 2);
+	p = xdr_inline_decode(xdr, attrlen << 2);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	if (likely(attrlen > 0))
@@ -135,7 +125,7 @@
 {
 	__be32 *p;
 
-	p = read_buf(xdr, NFS4_STATEID_SIZE);
+	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
@@ -156,7 +146,7 @@
 	status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
 	if (unlikely(status != 0))
 		return status;
-	p = read_buf(xdr, 12);
+	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	hdr->minorversion = ntohl(*p++);
@@ -176,7 +166,7 @@
 static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
 {
 	__be32 *p;
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE_HDR);
 	*op = ntohl(*p);
@@ -205,7 +195,7 @@
 	status = decode_delegation_stateid(xdr, &args->stateid);
 	if (unlikely(status != 0))
 		return status;
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	args->truncate = ntohl(*p);
@@ -227,7 +217,7 @@
 	__be32 status = 0;
 	uint32_t iomode;
 
-	p = read_buf(xdr, 4 * sizeof(uint32_t));
+	p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
 
@@ -245,14 +235,14 @@
 		if (unlikely(status != 0))
 			return status;
 
-		p = read_buf(xdr, 2 * sizeof(uint64_t));
+		p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
 		if (unlikely(p == NULL))
 			return htonl(NFS4ERR_BADXDR);
 		p = xdr_decode_hyper(p, &args->cbl_range.offset);
 		p = xdr_decode_hyper(p, &args->cbl_range.length);
 		return decode_layout_stateid(xdr, &args->cbl_stateid);
 	} else if (args->cbl_recall_type == RETURN_FSID) {
-		p = read_buf(xdr, 2 * sizeof(uint64_t));
+		p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
 		if (unlikely(p == NULL))
 			return htonl(NFS4ERR_BADXDR);
 		p = xdr_decode_hyper(p, &args->cbl_fsid.major);
@@ -275,7 +265,7 @@
 	args->ndevs = 0;
 
 	/* Num of device notifications */
-	p = read_buf(xdr, sizeof(uint32_t));
+	p = xdr_inline_decode(xdr, sizeof(uint32_t));
 	if (unlikely(p == NULL)) {
 		status = htonl(NFS4ERR_BADXDR);
 		goto out;
@@ -298,7 +288,8 @@
 	for (i = 0; i < n; i++) {
 		struct cb_devicenotifyitem *dev = &args->devs[i];
 
-		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+				      NFS4_DEVICEID4_SIZE);
 		if (unlikely(p == NULL)) {
 			status = htonl(NFS4ERR_BADXDR);
 			goto err;
@@ -329,7 +320,7 @@
 		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
 
 		if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
-			p = read_buf(xdr, sizeof(uint32_t));
+			p = xdr_inline_decode(xdr, sizeof(uint32_t));
 			if (unlikely(p == NULL)) {
 				status = htonl(NFS4ERR_BADXDR);
 				goto err;
@@ -359,7 +350,7 @@
 {
 	__be32 *p;
 
-	p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 
@@ -379,13 +370,13 @@
 		goto out;
 
 	status = htonl(NFS4ERR_RESOURCE);
-	p = read_buf(xdr, sizeof(uint32_t));
+	p = xdr_inline_decode(xdr, sizeof(uint32_t));
 	if (unlikely(p == NULL))
 		goto out;
 
 	rc_list->rcl_nrefcalls = ntohl(*p++);
 	if (rc_list->rcl_nrefcalls) {
-		p = read_buf(xdr,
+		p = xdr_inline_decode(xdr,
 			     rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
 		if (unlikely(p == NULL))
 			goto out;
@@ -418,7 +409,7 @@
 	if (status)
 		return status;
 
-	p = read_buf(xdr, 5 * sizeof(uint32_t));
+	p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 
@@ -461,7 +452,7 @@
 	uint32_t bitmap[2];
 	__be32 *p, status;
 
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
 	args->craa_objs_to_keep = ntohl(*p++);
@@ -480,7 +471,7 @@
 	struct cb_recallslotargs *args = argp;
 	__be32 *p;
 
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
 	args->crsa_target_highest_slotid = ntohl(*p++);
@@ -492,14 +483,14 @@
 	__be32		*p;
 	unsigned int	len;
 
-	p = read_buf(xdr, 12);
+	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
 
 	p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
 	len = be32_to_cpu(*p);
 
-	p = read_buf(xdr, len);
+	p = xdr_inline_decode(xdr, len);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
 
@@ -537,7 +528,7 @@
 	__be32 *p;
 
 	/* skip the always zero field */
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out;
 	p++;
@@ -577,7 +568,7 @@
 		return status;
 
 	/* decode status */
-	p = read_buf(xdr, 4);
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out;
 	args->error = ntohl(*p++);
@@ -943,10 +934,11 @@
 	};
 	unsigned int nops = 0;
 
-	xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+	xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+			rqstp->rq_arg.head[0].iov_base, NULL);
 
 	p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
-	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+	xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
 
 	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
 	if (status == htonl(NFS4ERR_RESOURCE))
@@ -991,7 +983,7 @@
 
 out_invalidcred:
 	pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
-	return rpc_autherr_badcred;
+	return svc_return_autherr(rqstp, rpc_autherr_badcred);
 }
 
 /*
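
The callback_xdr.c hunks above remove the local read_buf() wrapper — its only
addition over xdr_inline_decode() was an unconditional printk on every short
decode — and call xdr_inline_decode() directly, with each caller mapping a
NULL return onto the status appropriate to its operation (NFS4ERR_RESOURCE,
NFS4ERR_BADXDR, ...). A minimal userspace sketch of that calling convention;
the cursor type and helper name are hypothetical, not the kernel API:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	struct xdr_cursor {
		const uint8_t *p;	/* next unread byte */
		const uint8_t *end;	/* one past the last valid byte */
	};

	/* Return a pointer to the next nbytes, or NULL when the buffer is
	 * exhausted; the caller decides which error status that becomes. */
	static const uint8_t *cur_inline_decode(struct xdr_cursor *c, size_t nbytes)
	{
		const uint8_t *ret = c->p;

		if ((size_t)(c->end - c->p) < nbytes)
			return NULL;
		c->p += nbytes;
		return ret;
	}

	int main(void)
	{
		uint8_t buf[8] = { 0, 0, 0, 42, 0, 0, 0, 7 };
		struct xdr_cursor c = { buf, buf + sizeof(buf) };
		const uint8_t *p = cur_inline_decode(&c, 4);

		printf("word 0 low byte: %u\n", p ? p[3] : 0);
		p = cur_inline_decode(&c, 8);	/* overruns the buffer */
		printf("overrun returns %s\n", p ? "data" : "NULL");
		return 0;
	}
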
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 96d5f81..3083830 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* client.c: NFS client sharing and management code
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 
@@ -53,6 +49,7 @@
 #include "pnfs.h"
 #include "nfs.h"
 #include "netns.h"
+#include "sysfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
@@ -151,7 +148,6 @@
 struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 {
 	struct nfs_client *clp;
-	struct rpc_cred *cred;
 	int err = -ENOMEM;
 
 	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
@@ -180,11 +176,10 @@
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
 	clp->cl_proto = cl_init->proto;
+	clp->cl_nconnect = cl_init->nconnect;
 	clp->cl_net = get_net(cl_init->net);
 
-	cred = rpc_lookup_machine_cred("*");
-	if (!IS_ERR(cred))
-		clp->cl_machine_cred = cred;
+	clp->cl_principal = "*";
 	nfs_fscache_get_client_cookie(clp);
 
 	return clp;
@@ -199,7 +194,7 @@
 EXPORT_SYMBOL_GPL(nfs_alloc_client);
 
 #if IS_ENABLED(CONFIG_NFS_V4)
-void nfs_cleanup_cb_ident_idr(struct net *net)
+static void nfs_cleanup_cb_ident_idr(struct net *net)
 {
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 
@@ -221,7 +216,7 @@
 }
 
 #else
-void nfs_cleanup_cb_ident_idr(struct net *net)
+static void nfs_cleanup_cb_ident_idr(struct net *net)
 {
 }
 
@@ -246,9 +241,6 @@
 	if (!IS_ERR(clp->cl_rpcclient))
 		rpc_shutdown_client(clp->cl_rpcclient);
 
-	if (clp->cl_machine_cred != NULL)
-		put_rpccred(clp->cl_machine_cred);
-
 	put_net(clp->cl_net);
 	put_nfs_version(clp->cl_nfs_mod);
 	kfree(clp->cl_hostname);
@@ -290,6 +282,7 @@
 	struct nfs_client *clp;
 	const struct sockaddr *sap = data->addr;
 	struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+	int error;
 
 again:
 	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
@@ -302,9 +295,11 @@
 		if (clp->cl_cons_state > NFS_CS_READY) {
 			refcount_inc(&clp->cl_count);
 			spin_unlock(&nn->nfs_client_lock);
-			nfs_wait_client_init_complete(clp);
+			error = nfs_wait_client_init_complete(clp);
 			nfs_put_client(clp);
 			spin_lock(&nn->nfs_client_lock);
+			if (error < 0)
+				return ERR_PTR(error);
 			goto again;
 		}
 
@@ -415,6 +410,8 @@
 			spin_unlock(&nn->nfs_client_lock);
 			if (new)
 				new->rpc_ops->free_client(new);
+			if (IS_ERR(clp))
+				return clp;
 			return nfs_found_client(cl_init, clp);
 		}
 		if (new) {
@@ -459,7 +456,7 @@
 	case XPRT_TRANSPORT_RDMA:
 		if (retrans == NFS_UNSPEC_RETRANS)
 			to->to_retries = NFS_DEF_TCP_RETRANS;
-		if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
+		if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
 			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
 			to->to_initval = NFS_MAX_TCP_TIMEOUT;
@@ -498,6 +495,7 @@
 	struct rpc_create_args args = {
 		.net		= clp->cl_net,
 		.protocol	= clp->cl_proto,
+		.nconnect	= clp->cl_nconnect,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
 		.timeout	= cl_init->timeparms,
@@ -506,6 +504,7 @@
 		.program	= &nfs_program,
 		.version	= clp->rpc_ops->version,
 		.authflavor	= flavor,
+		.cred		= cl_init->cred,
 	};
 
 	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
@@ -527,6 +526,7 @@
 		return PTR_ERR(clnt);
 	}
 
+	clnt->cl_principal = clp->cl_principal;
 	clp->cl_rpcclient = clnt;
 	return 0;
 }
@@ -557,6 +557,7 @@
 					1 : 0,
 		.net		= clp->cl_net,
 		.nlmclnt_ops 	= clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
+		.cred		= current_cred(),
 	};
 
 	if (nlm_init.nfs_version > 3)
@@ -603,6 +604,8 @@
 			sizeof(server->client->cl_timeout_default));
 	server->client->cl_timeout = &server->client->cl_timeout_default;
 	server->client->cl_softrtry = 0;
+	if (server->flags & NFS_MOUNT_SOFTERR)
+		server->client->cl_softerr = 1;
 	if (server->flags & NFS_MOUNT_SOFT)
 		server->client->cl_softrtry = 1;
 
@@ -657,6 +660,8 @@
 		.proto = data->nfs_server.protocol,
 		.net = data->net,
 		.timeparms = &timeparms,
+		.cred = server->cred,
+		.nconnect = data->nfs_server.nconnect,
 	};
 	struct nfs_client *clp;
 	int error;
@@ -925,6 +930,7 @@
 	ida_destroy(&server->lockowner_id);
 	ida_destroy(&server->openowner_id);
 	nfs_free_iostats(server->io_stats);
+	put_cred(server->cred);
 	kfree(server);
 	nfs_release_automount_timer();
 }
@@ -945,6 +951,8 @@
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	server->cred = get_cred(current_cred());
+
 	error = -ENOMEM;
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
@@ -1011,6 +1019,8 @@
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	server->cred = get_cred(source->cred);
+
 	error = -ENOMEM;
 	fattr_fsinfo = nfs_alloc_fattr();
 	if (fattr_fsinfo == NULL)
@@ -1066,6 +1076,18 @@
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
 	nn->boot_time = ktime_get_real();
+
+	nfs_netns_sysfs_setup(nn, net);
+}
+
+void nfs_clients_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nfs_netns_sysfs_destroy(nn);
+	nfs_cleanup_cb_ident_idr(net);
+	WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
+	WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
 }
 
 #ifdef CONFIG_PROC_FS
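
Two fixes in the client.c hunks above are easy to miss: the client lookup
loop now propagates a failed (interrupted) wait for client initialisation
instead of retrying forever, and the TCP/RDMA timeout setup re-derives the
default initial timeout when to_initval is zero — the old test keyed off
to_retries, so a zero initial timeout could survive and trigger immediate
retransmits. A worked sketch of the corrected clamp; the constants mirror
the kernel defaults and HZ is assumed to be 100:

	#include <stdio.h>

	#define HZ			100
	#define NFS_UNSPEC_TIMEO	(-1L)
	#define NFS_DEF_TCP_TIMEO	600	/* tenths of a second */
	#define NFS_MAX_TCP_TIMEOUT	(600UL * HZ)

	/* Fall back to the default when the caller gave no timeout OR the
	 * current initval is zero, then clamp to the protocol maximum. */
	static unsigned long tcp_initval(long timeo, unsigned long initval)
	{
		if (timeo == NFS_UNSPEC_TIMEO || initval == 0)
			initval = NFS_DEF_TCP_TIMEO * HZ / 10;
		if (initval > NFS_MAX_TCP_TIMEOUT)
			initval = NFS_MAX_TCP_TIMEOUT;
		return initval;
	}

	int main(void)
	{
		printf("%lu\n", tcp_initval(5, 0));	/* 6000: default applied */
		printf("%lu\n", tcp_initval(5, 70000));	/* 60000: clamped */
		printf("%lu\n", tcp_initval(5, 12345));	/* 12345: kept */
		return 0;
	}
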
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 75fe92e..af549d7 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/delegation.c
  *
@@ -26,10 +27,8 @@
 
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
-	if (delegation->cred) {
-		put_rpccred(delegation->cred);
-		delegation->cred = NULL;
-	}
+	put_cred(delegation->cred);
+	delegation->cred = NULL;
 	kfree_rcu(delegation, rcu);
 }
 
@@ -54,6 +53,16 @@
 	return false;
 }
 
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (nfs4_is_valid_delegation(delegation, 0))
+		return delegation;
+	return NULL;
+}
+
 static int
 nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
@@ -93,7 +102,7 @@
 	return nfs4_do_check_delegation(inode, flags, false);
 }
 
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
 {
 	struct inode *inode = state->inode;
 	struct file_lock *fl;
@@ -108,7 +117,7 @@
 	spin_lock(&flctx->flc_lock);
 restart:
 	list_for_each_entry(fl, list, fl_list) {
-		if (nfs_file_open_context(fl->fl_file) != ctx)
+		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
 		spin_unlock(&flctx->flc_lock);
 		status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -136,8 +145,8 @@
 	int err;
 
 again:
-	spin_lock(&inode->i_lock);
-	list_for_each_entry(ctx, &nfsi->open_files, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
 		state = ctx->state;
 		if (state == NULL)
 			continue;
@@ -147,15 +156,16 @@
 			continue;
 		if (!nfs4_stateid_match(&state->stateid, stateid))
 			continue;
-		get_nfs_open_context(ctx);
-		spin_unlock(&inode->i_lock);
+		if (!get_nfs_open_context(ctx))
+			continue;
+		rcu_read_unlock();
 		sp = state->owner;
 		/* Block nfs4_proc_unlck */
 		mutex_lock(&sp->so_delegreturn_mutex);
 		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
-		err = nfs4_open_delegation_recall(ctx, state, stateid, type);
+		err = nfs4_open_delegation_recall(ctx, state, stateid);
 		if (!err)
-			err = nfs_delegation_claim_locks(ctx, state, stateid);
+			err = nfs_delegation_claim_locks(state, stateid);
 		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
 			err = -EAGAIN;
 		mutex_unlock(&sp->so_delegreturn_mutex);
@@ -164,7 +174,7 @@
 			return err;
 		goto again;
 	}
-	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -177,13 +187,13 @@
  * @pagemod_limit: write delegation "space_limit"
  *
  */
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
 				  fmode_t type,
 				  const nfs4_stateid *stateid,
 				  unsigned long pagemod_limit)
 {
 	struct nfs_delegation *delegation;
-	struct rpc_cred *oldcred = NULL;
+	const struct cred *oldcred = NULL;
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
@@ -194,12 +204,12 @@
 			delegation->type = type;
 			delegation->pagemod_limit = pagemod_limit;
 			oldcred = delegation->cred;
-			delegation->cred = get_rpccred(cred);
+			delegation->cred = get_cred(cred);
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
 				  &delegation->flags);
 			spin_unlock(&delegation->lock);
 			rcu_read_unlock();
-			put_rpccred(oldcred);
+			put_cred(oldcred);
 			trace_nfs4_reclaim_delegation(inode, type);
 			return;
 		}
@@ -230,6 +240,8 @@
 	spin_lock(&delegation->lock);
 	if (delegation->inode != NULL)
 		inode = igrab(delegation->inode);
+	if (!inode)
+		set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
 	spin_unlock(&delegation->lock);
 	return inode;
 }
@@ -340,7 +352,7 @@
  *
  * Returns zero on success, or a negative errno value.
  */
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
 				  fmode_t type,
 				  const nfs4_stateid *stateid,
 				  unsigned long pagemod_limit)
@@ -359,7 +371,7 @@
 	delegation->type = type;
 	delegation->pagemod_limit = pagemod_limit;
 	delegation->change_attr = inode_peek_iversion_raw(inode);
-	delegation->cred = get_rpccred(cred);
+	delegation->cred = get_cred(cred);
 	delegation->inode = inode;
 	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
 	spin_lock_init(&delegation->lock);
@@ -682,7 +694,7 @@
 
 /**
  * nfs_super_return_all_delegations - return delegations for one superblock
- * @sb: sb to process
+ * @server: pointer to nfs_server to process
  *
  */
 void nfs_server_return_all_delegations(struct nfs_server *server)
@@ -945,10 +957,11 @@
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry_rcu(delegation, &server->delegations,
 								super_list) {
-			if (test_bit(NFS_DELEGATION_RETURNING,
-						&delegation->flags))
-				continue;
-			if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+			if (test_bit(NFS_DELEGATION_INODE_FREEING,
+						&delegation->flags) ||
+			    test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags) ||
+			    test_bit(NFS_DELEGATION_NEED_RECLAIM,
 						&delegation->flags) == 0)
 				continue;
 			if (!nfs_sb_active(server->super))
@@ -1032,6 +1045,34 @@
 }
 
 /**
+ * nfs_test_expired_all_delegations - test all delegations for a client
+ * @clp: nfs_client to process
+ *
+ * Helper for handling "recallable state revoked" status from server.
+ */
+void nfs_test_expired_all_delegations(struct nfs_client *clp)
+{
+	nfs_mark_test_expired_all_delegations(clp);
+	nfs4_schedule_state_manager(clp);
+}
+
+static void
+nfs_delegation_test_free_expired(struct inode *inode,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+	int status;
+
+	if (!cred)
+		return;
+	status = ops->test_and_free_expired(server, stateid, cred);
+	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+		nfs_remove_bad_delegation(inode, stateid);
+}
+
+/**
  * nfs_reap_expired_delegations - reap expired delegations
  * @clp: nfs_client to process
  *
@@ -1042,11 +1083,10 @@
  */
 void nfs_reap_expired_delegations(struct nfs_client *clp)
 {
-	const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
 	struct nfs_delegation *delegation;
 	struct nfs_server *server;
 	struct inode *inode;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	nfs4_stateid stateid;
 
 restart:
@@ -1054,10 +1094,11 @@
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry_rcu(delegation, &server->delegations,
 								super_list) {
-			if (test_bit(NFS_DELEGATION_RETURNING,
-						&delegation->flags))
-				continue;
-			if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+			if (test_bit(NFS_DELEGATION_INODE_FREEING,
+						&delegation->flags) ||
+			    test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags) ||
+			    test_bit(NFS_DELEGATION_TEST_EXPIRED,
 						&delegation->flags) == 0)
 				continue;
 			if (!nfs_sb_active(server->super))
@@ -1068,16 +1109,12 @@
 				nfs_sb_deactive(server->super);
 				goto restart;
 			}
-			cred = get_rpccred_rcu(delegation->cred);
+			cred = get_cred_rcu(delegation->cred);
 			nfs4_stateid_copy(&stateid, &delegation->stateid);
 			clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
 			rcu_read_unlock();
-			if (cred != NULL &&
-			    ops->test_and_free_expired(server, &stateid, cred) < 0) {
-				nfs_revoke_delegation(inode, &stateid);
-				nfs_inode_find_state_and_recover(inode, &stateid);
-			}
-			put_rpccred(cred);
+			nfs_delegation_test_free_expired(inode, &stateid, cred);
+			put_cred(cred);
 			if (nfs4_server_rebooted(clp)) {
 				nfs_inode_mark_test_expired_delegation(server,inode);
 				iput(inode);
@@ -1154,7 +1191,7 @@
 	if (delegation != NULL &&
 	    nfs4_stateid_match_other(dst, &delegation->stateid)) {
 		dst->seqid = delegation->stateid.seqid;
-		return ret;
+		ret = true;
 	}
 	rcu_read_unlock();
 out:
@@ -1172,7 +1209,7 @@
  * otherwise "false" is returned.
  */
 bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
-		nfs4_stateid *dst, struct rpc_cred **cred)
+		nfs4_stateid *dst, const struct cred **cred)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
@@ -1186,7 +1223,7 @@
 		nfs4_stateid_copy(dst, &delegation->stateid);
 		nfs_mark_delegation_referenced(delegation);
 		if (cred)
-			*cred = get_rpccred(delegation->cred);
+			*cred = get_cred(delegation->cred);
 	}
 	rcu_read_unlock();
 	return ret;
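
In the delegation.c hunks above, the new NFS_DELEGATION_INODE_FREEING bit —
set when igrab() fails in the grab-inode helper — lets the reclaim and reap
walks skip delegations whose inode is already being torn down, alongside the
existing RETURNING and TEST_EXPIRED checks. A toy model of the skip
predicate, with C11 atomics standing in for the kernel's bitops:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum { DLG_RETURNING, DLG_TEST_EXPIRED, DLG_INODE_FREEING };

	struct delegation { atomic_ulong flags; };

	static bool dlg_test(struct delegation *d, int bit)
	{
		return atomic_load(&d->flags) & (1UL << bit);
	}

	static void dlg_set(struct delegation *d, int bit)
	{
		atomic_fetch_or(&d->flags, 1UL << bit);
	}

	/* Skip if the inode is going away, a return is already in flight,
	 * or the delegation was never marked for the expiry test. */
	static bool should_skip(struct delegation *d)
	{
		return dlg_test(d, DLG_INODE_FREEING) ||
		       dlg_test(d, DLG_RETURNING) ||
		       !dlg_test(d, DLG_TEST_EXPIRED);
	}

	int main(void)
	{
		struct delegation d = { 0 };

		dlg_set(&d, DLG_TEST_EXPIRED);
		printf("testable: %d\n", !should_skip(&d));	/* 1 */
		dlg_set(&d, DLG_INODE_FREEING);
		printf("testable: %d\n", !should_skip(&d));	/* 0 */
		return 0;
	}
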
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index bb1ef8c..8b14d44 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -15,7 +15,7 @@
  */
 struct nfs_delegation {
 	struct list_head super_list;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	struct inode *inode;
 	nfs4_stateid stateid;
 	fmode_t type;
@@ -34,11 +34,12 @@
 	NFS_DELEGATION_RETURNING,
 	NFS_DELEGATION_REVOKED,
 	NFS_DELEGATION_TEST_EXPIRED,
+	NFS_DELEGATION_INODE_FREEING,
 };
 
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
 		fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
 		fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
 int nfs4_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
@@ -57,15 +58,17 @@
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
 void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_test_expired_all_delegations(struct nfs_client *clp);
 void nfs_reap_expired_delegations(struct nfs_client *clp);
 
 /* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
 bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
 
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
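
These prototypes complete the migration from struct rpc_cred to the generic
const struct cred: whoever stores a cred takes a reference with get_cred()
and releases it with put_cred(), and in this kernel series both helpers
tolerate NULL — which is why the NULL checks around the old put_rpccred()
calls disappear in delegation.c and flexfilelayout.c below. A self-contained
sketch of that ownership rule (names suffixed _sketch to mark them as toys):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct cred_sketch { atomic_int usage; };

	static struct cred_sketch *get_cred_sketch(struct cred_sketch *c)
	{
		if (c)
			atomic_fetch_add(&c->usage, 1);
		return c;
	}

	static void put_cred_sketch(struct cred_sketch *c)
	{
		/* free on the last put; NULL is a tolerated no-op */
		if (c && atomic_fetch_sub(&c->usage, 1) == 1)
			free(c);
	}

	int main(void)
	{
		struct cred_sketch *c = calloc(1, sizeof(*c));

		atomic_store(&c->usage, 1);	/* creator's reference */
		get_cred_sketch(c);		/* delegation stores it */
		put_cred_sketch(c);		/* delegation released */
		put_cred_sketch(c);		/* creator done: freed */
		put_cred_sketch(NULL);		/* no-op */
		printf("ok\n");
		return 0;
	}
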
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8bfaa65..e180033 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/nfs/dir.c
  *
@@ -67,7 +68,7 @@
 	.freepage = nfs_readdir_clear_array,
 };
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
@@ -77,8 +78,12 @@
 		ctx->attr_gencount = nfsi->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
-		ctx->cred = get_rpccred(cred);
+		ctx->cred = get_cred(cred);
 		spin_lock(&dir->i_lock);
+		if (list_empty(&nfsi->open_files) &&
+		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+				NFS_INO_REVAL_FORCED;
 		list_add(&ctx->list, &nfsi->open_files);
 		spin_unlock(&dir->i_lock);
 		return ctx;
@@ -91,7 +96,7 @@
 	spin_lock(&dir->i_lock);
 	list_del(&ctx->list);
 	spin_unlock(&dir->i_lock);
-	put_rpccred(ctx->cred);
+	put_cred(ctx->cred);
 	kfree(ctx);
 }
 
@@ -103,23 +108,18 @@
 {
 	int res = 0;
 	struct nfs_open_dir_context *ctx;
-	struct rpc_cred *cred;
 
 	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_dir_context(inode, cred);
+	ctx = alloc_nfs_open_dir_context(inode, current_cred());
 	if (IS_ERR(ctx)) {
 		res = PTR_ERR(ctx);
 		goto out;
 	}
 	filp->private_data = ctx;
 out:
-	put_rpccred(cred);
 	return res;
 }
 
@@ -334,7 +334,7 @@
 			struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
 	struct nfs_open_dir_context *ctx = file->private_data;
-	struct rpc_cred	*cred = ctx->cred;
+	const struct cred *cred = ctx->cred;
 	unsigned long	timestamp, gencount;
 	int		error;
 
@@ -579,8 +579,8 @@
 }
 
 /*
- * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_pagearray
+ * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
+ * to nfs_readdir_free_pages()
  */
 static
 int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
@@ -664,8 +664,9 @@
  * We only need to convert from xdr once so future lookups are much simpler
  */
 static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
+int nfs_readdir_filler(void *data, struct page* page)
 {
+	nfs_readdir_descriptor_t *desc = data;
 	struct inode	*inode = file_inode(desc->file);
 	int ret;
 
@@ -697,8 +698,8 @@
 static
 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
 {
-	return read_cache_page(desc->file->f_mapping,
-			desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+	return read_cache_page(desc->file->f_mapping, desc->page_index,
+			nfs_readdir_filler, desc);
 }
 
 /*
@@ -950,7 +951,7 @@
 
 /**
  * nfs_force_lookup_revalidate - Mark the directory as having changed
- * @dir - pointer to directory inode
+ * @dir: pointer to directory inode
  *
  * This forces the revalidation code in nfs_lookup_revalidate() to do a
  * full lookup on all child dentries of 'dir' whenever a change occurs
@@ -1072,6 +1073,100 @@
 	return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+			   struct inode *inode, int error)
+{
+	switch (error) {
+	case 1:
+		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+			__func__, dentry);
+		return 1;
+	case 0:
+		nfs_mark_for_revalidate(dir);
+		if (inode && S_ISDIR(inode->i_mode)) {
+			/* Purge readdir caches. */
+			nfs_zap_caches(inode);
+			/*
+			 * We can't d_drop the root of a disconnected tree:
+			 * its d_hash is on the s_anon list and d_drop() would hide
+			 * it from shrink_dcache_for_unmount(), leading to busy
+			 * inodes on unmount and further oopses.
+			 */
+			if (IS_ROOT(dentry))
+				return 1;
+		}
+		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+				__func__, dentry);
+		return 0;
+	}
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+				__func__, dentry, error);
+	return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+			       unsigned int flags)
+{
+	int ret = 1;
+	if (nfs_neg_need_reval(dir, dentry, flags)) {
+		if (flags & LOOKUP_RCU)
+			return -ECHILD;
+		ret = 0;
+	}
+	return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+				struct inode *inode)
+{
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct nfs_fh *fhandle;
+	struct nfs_fattr *fattr;
+	struct nfs4_label *label;
+	int ret;
+
+	ret = -ENOMEM;
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+		goto out;
+
+	ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+	if (ret < 0) {
+		if (ret == -ESTALE || ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+	ret = 0;
+	if (nfs_compare_fh(NFS_FH(inode), fhandle))
+		goto out;
+	if (nfs_refresh_inode(inode, fattr) < 0)
+		goto out;
+
+	nfs_setsecurity(inode, fattr, label);
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+	/* set a readdirplus hint that we had a cache miss */
+	nfs_force_use_readdirplus(dir);
+	ret = 1;
+out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+	return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
 /*
  * This is called every time the dcache has a lookup hit,
  * and we should check whether we can really trust that
@@ -1083,58 +1178,36 @@
  * If the parent directory is seen to have changed, we throw out the
  * cached dentry and do a new lookup.
  */
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+			 unsigned int flags)
 {
-	struct inode *dir;
 	struct inode *inode;
-	struct dentry *parent;
-	struct nfs_fh *fhandle = NULL;
-	struct nfs_fattr *fattr = NULL;
-	struct nfs4_label *label = NULL;
 	int error;
 
-	if (flags & LOOKUP_RCU) {
-		parent = READ_ONCE(dentry->d_parent);
-		dir = d_inode_rcu(parent);
-		if (!dir)
-			return -ECHILD;
-	} else {
-		parent = dget_parent(dentry);
-		dir = d_inode(parent);
-	}
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = d_inode(dentry);
 
-	if (!inode) {
-		if (nfs_neg_need_reval(dir, dentry, flags)) {
-			if (flags & LOOKUP_RCU)
-				return -ECHILD;
-			goto out_bad;
-		}
-		goto out_valid;
-	}
+	if (!inode)
+		return nfs_lookup_revalidate_negative(dir, dentry, flags);
 
 	if (is_bad_inode(inode)) {
-		if (flags & LOOKUP_RCU)
-			return -ECHILD;
 		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
 				__func__, dentry);
 		goto out_bad;
 	}
 
 	if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
-		goto out_set_verifier;
+		return nfs_lookup_revalidate_delegated(dir, dentry, inode);
 
 	/* Force a full look up iff the parent directory has changed */
 	if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) &&
 	    nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
 		error = nfs_lookup_verify_inode(inode, flags);
 		if (error) {
-			if (flags & LOOKUP_RCU)
-				return -ECHILD;
 			if (error == -ESTALE)
-				goto out_zap_parent;
-			goto out_error;
+				nfs_zap_caches(dir);
+			goto out_bad;
 		}
 		nfs_advise_use_readdirplus(dir);
 		goto out_valid;
@@ -1146,81 +1219,45 @@
 	if (NFS_STALE(inode))
 		goto out_bad;
 
-	error = -ENOMEM;
-	fhandle = nfs_alloc_fhandle();
-	fattr = nfs_alloc_fattr();
-	if (fhandle == NULL || fattr == NULL)
-		goto out_error;
-
-	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
-	if (IS_ERR(label))
-		goto out_error;
-
 	trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
-	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+	error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
 	trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
-	if (error == -ESTALE || error == -ENOENT)
-		goto out_bad;
-	if (error)
-		goto out_error;
-	if (nfs_compare_fh(NFS_FH(inode), fhandle))
-		goto out_bad;
-	if ((error = nfs_refresh_inode(inode, fattr)) != 0)
-		goto out_bad;
+	return error;
+out_valid:
+	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+	return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
 
-	nfs_setsecurity(inode, fattr, label);
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+			int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+	struct dentry *parent;
+	struct inode *dir;
+	int ret;
 
-	nfs_free_fattr(fattr);
-	nfs_free_fhandle(fhandle);
-	nfs4_label_free(label);
-
-	/* set a readdirplus hint that we had a cache miss */
-	nfs_force_use_readdirplus(dir);
-
-out_set_verifier:
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- out_valid:
 	if (flags & LOOKUP_RCU) {
+		parent = READ_ONCE(dentry->d_parent);
+		dir = d_inode_rcu(parent);
+		if (!dir)
+			return -ECHILD;
+		ret = reval(dir, dentry, flags);
 		if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
-	} else
+	} else {
+		parent = dget_parent(dentry);
+		ret = reval(d_inode(parent), dentry, flags);
 		dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
-			__func__, dentry);
-	return 1;
-out_zap_parent:
-	nfs_zap_caches(dir);
- out_bad:
-	WARN_ON(flags & LOOKUP_RCU);
-	nfs_free_fattr(fattr);
-	nfs_free_fhandle(fhandle);
-	nfs4_label_free(label);
-	nfs_mark_for_revalidate(dir);
-	if (inode && S_ISDIR(inode->i_mode)) {
-		/* Purge readdir caches. */
-		nfs_zap_caches(inode);
-		/*
-		 * We can't d_drop the root of a disconnected tree:
-		 * its d_hash is on the s_anon list and d_drop() would hide
-		 * it from shrink_dcache_for_unmount(), leading to busy
-		 * inodes on unmount and further oopses.
-		 */
-		if (IS_ROOT(dentry))
-			goto out_valid;
 	}
-	dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
-			__func__, dentry);
-	return 0;
-out_error:
-	WARN_ON(flags & LOOKUP_RCU);
-	nfs_free_fattr(fattr);
-	nfs_free_fhandle(fhandle);
-	nfs4_label_free(label);
-	dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
-			__func__, dentry, error);
-	return error;
+	return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
 }
 
 /*
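
The revalidate rewrite above funnels both dcache entry points through
__nfs_lookup_revalidate(), which now owns the LOOKUP_RCU bookkeeping: under
RCU walk it reads the parent pointer optimistically and re-checks it after
the callback returns, otherwise it pins the parent with dget_parent()/dput().
A schematic of that wrapper-plus-callback shape; the recheck-and-retry below
is only a stand-in for real RCU, and ECHILD_SKETCH plays the role of -ECHILD:

	#include <stdatomic.h>
	#include <stdio.h>

	struct dentry;
	typedef int (*reval_fn)(struct dentry *parent, struct dentry *d);

	struct dentry {
		_Atomic(struct dentry *) d_parent;
		int pinned;		/* toy stand-in for a refcount */
	};

	#define ECHILD_SKETCH 10	/* "retry outside the lockless path" */

	static int do_revalidate(struct dentry *d, int rcu_walk, reval_fn fn)
	{
		struct dentry *parent;
		int ret;

		if (rcu_walk) {
			parent = atomic_load(&d->d_parent);
			if (!parent)
				return -ECHILD_SKETCH;
			ret = fn(parent, d);
			/* parent may have changed under us: bail out */
			if (parent != atomic_load(&d->d_parent))
				return -ECHILD_SKETCH;
		} else {
			parent = atomic_load(&d->d_parent);
			parent->pinned++;	/* dget_parent() */
			ret = fn(parent, d);
			parent->pinned--;	/* dput() */
		}
		return ret;
	}

	static int always_valid(struct dentry *p, struct dentry *d)
	{
		(void)p; (void)d;
		return 1;
	}

	int main(void)
	{
		struct dentry parent = { 0 }, child = { 0 };

		atomic_store(&child.d_parent, &parent);
		printf("%d\n", do_revalidate(&child, 1, always_valid));
		return 0;
	}
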
@@ -1450,7 +1487,7 @@
 	if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
 		nfs_file_set_open_context(file, ctx);
 	else
-		err = -ESTALE;
+		err = -EOPENSTALE;
 out:
 	return err;
 }
@@ -1579,70 +1616,61 @@
 }
 EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags)
 {
 	struct inode *inode;
-	int ret = 0;
 
 	if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
-		goto no_open;
+		goto full_reval;
 	if (d_mountpoint(dentry))
-		goto no_open;
-	if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
-		goto no_open;
+		goto full_reval;
 
 	inode = d_inode(dentry);
 
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
-	if (inode == NULL) {
-		struct dentry *parent;
-		struct inode *dir;
+	if (inode == NULL)
+		goto full_reval;
 
-		if (flags & LOOKUP_RCU) {
-			parent = READ_ONCE(dentry->d_parent);
-			dir = d_inode_rcu(parent);
-			if (!dir)
-				return -ECHILD;
-		} else {
-			parent = dget_parent(dentry);
-			dir = d_inode(parent);
-		}
-		if (!nfs_neg_need_reval(dir, dentry, flags))
-			ret = 1;
-		else if (flags & LOOKUP_RCU)
-			ret = -ECHILD;
-		if (!(flags & LOOKUP_RCU))
-			dput(parent);
-		else if (parent != READ_ONCE(dentry->d_parent))
-			return -ECHILD;
-		goto out;
-	}
+	if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+		return nfs_lookup_revalidate_delegated(dir, dentry, inode);
 
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
-		goto no_open;
+		goto full_reval;
+
 	/* We cannot do exclusive creation on a positive dentry */
-	if (flags & LOOKUP_EXCL)
-		goto no_open;
+	if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+		goto reval_dentry;
+
+	/* Check if the directory changed */
+	if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+		goto reval_dentry;
 
 	/* Let f_op->open() actually open (and revalidate) the file */
-	ret = 1;
+	return 1;
+reval_dentry:
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+	return nfs_lookup_revalidate_dentry(dir, dentry, inode);
 
-out:
-	return ret;
+full_reval:
+	return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
 
-no_open:
-	return nfs_lookup_revalidate(dentry, flags);
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return __nfs_lookup_revalidate(dentry, flags,
+			nfs4_do_lookup_revalidate);
 }
 
 #endif /* CONFIG_NFSV4 */
 
-/*
- * Code common to create, mkdir, and mknod.
- */
-int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr,
 				struct nfs4_label *label)
 {
@@ -1650,13 +1678,10 @@
 	struct inode *dir = d_inode(parent);
 	struct inode *inode;
 	struct dentry *d;
-	int error = -EACCES;
+	int error;
 
 	d_drop(dentry);
 
-	/* We may have been initialized further down */
-	if (d_really_is_positive(dentry))
-		goto out;
 	if (fhandle->size == 0) {
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
 		if (error)
@@ -1672,18 +1697,32 @@
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
 	d = d_splice_alias(inode, dentry);
-	if (IS_ERR(d)) {
-		error = PTR_ERR(d);
-		goto out_error;
-	}
-	dput(d);
 out:
 	dput(parent);
-	return 0;
+	return d;
 out_error:
 	nfs_mark_for_revalidate(dir);
-	dput(parent);
-	return error;
+	d = ERR_PTR(error);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr,
+				struct nfs4_label *label)
+{
+	struct dentry *d;
+
+	d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+
+	/* Callers don't care about the returned dentry reference */
+	dput(d);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_instantiate);
 
@@ -2110,7 +2149,7 @@
 
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
-	put_rpccred(entry->cred);
+	put_cred(entry->cred);
 	kfree_rcu(entry, rcu_head);
 	smp_mb__before_atomic();
 	atomic_long_dec(&nfs_access_nr_entries);
@@ -2236,17 +2275,18 @@
 }
 EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
 
-static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred)
 {
 	struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
-	struct nfs_access_entry *entry;
 
 	while (n != NULL) {
-		entry = rb_entry(n, struct nfs_access_entry, rb_node);
+		struct nfs_access_entry *entry =
+			rb_entry(n, struct nfs_access_entry, rb_node);
+		int cmp = cred_fscmp(cred, entry->cred);
 
-		if (cred < entry->cred)
+		if (cmp < 0)
 			n = n->rb_left;
-		else if (cred > entry->cred)
+		else if (cmp > 0)
 			n = n->rb_right;
 		else
 			return entry;
@@ -2254,7 +2294,7 @@
 	return NULL;
 }
 
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
+static int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_access_entry *cache;
@@ -2297,7 +2337,7 @@
 	return -ENOENT;
 }
 
-static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res)
 {
 	/* Only check the most recently returned cache entry,
 	 * but do it without locking.
@@ -2334,15 +2374,17 @@
 	struct rb_node **p = &root_node->rb_node;
 	struct rb_node *parent = NULL;
 	struct nfs_access_entry *entry;
+	int cmp;
 
 	spin_lock(&inode->i_lock);
 	while (*p != NULL) {
 		parent = *p;
 		entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+		cmp = cred_fscmp(set->cred, entry->cred);
 
-		if (set->cred < entry->cred)
+		if (cmp < 0)
 			p = &parent->rb_left;
-		else if (set->cred > entry->cred)
+		else if (cmp > 0)
 			p = &parent->rb_right;
 		else
 			goto found;
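
The access-cache hunks above replace raw pointer comparison with cred_fscmp()
on both the search and the insert side of the rbtree. With generic creds,
the same user can present distinct (copied) cred objects, so address order
can never find an equivalent entry — and whatever comparator orders inserts
must also order lookups. A reduced sketch of the field-wise three-way
compare; the real cred_fscmp() also compares group lists:

	#include <stdio.h>

	struct cred_sketch { unsigned fsuid, fsgid; };

	/* Three-way compare on the fields that matter for fs access.
	 * Both the rbtree search and the rbtree insert must use this
	 * same function, or lookups diverge from the stored order. */
	static int cred_fscmp_sketch(const struct cred_sketch *a,
				     const struct cred_sketch *b)
	{
		if (a->fsuid != b->fsuid)
			return a->fsuid < b->fsuid ? -1 : 1;
		if (a->fsgid != b->fsgid)
			return a->fsgid < b->fsgid ? -1 : 1;
		return 0;
	}

	int main(void)
	{
		struct cred_sketch a = { 1000, 1000 };
		struct cred_sketch b = { 1000, 1000 };	/* distinct object */

		/* pointer identity misses; field compare finds the hit */
		printf("ptr equal: %d, fscmp: %d\n",
		       &a == &b, cred_fscmp_sketch(&a, &b));
		return 0;
	}
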
@@ -2366,7 +2408,7 @@
 	if (cache == NULL)
 		return;
 	RB_CLEAR_NODE(&cache->rb_node);
-	cache->cred = get_rpccred(set->cred);
+	cache->cred = get_cred(set->cred);
 	cache->mask = set->mask;
 
 	/* The above field assignments must be visible
@@ -2430,7 +2472,7 @@
 }
 EXPORT_SYMBOL_GPL(nfs_access_set_mask);
 
-static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
+static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
 {
 	struct nfs_access_entry cache;
 	bool may_block = (mask & MAY_NOT_BLOCK) == 0;
@@ -2494,7 +2536,7 @@
 	return mask;
 }
 
-int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags)
 {
 	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
@@ -2519,7 +2561,7 @@
 
 int nfs_permission(struct inode *inode, int mask)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred = current_cred();
 	int res = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
@@ -2553,20 +2595,11 @@
 
 	/* Always try fast lookups first */
 	rcu_read_lock();
-	cred = rpc_lookup_cred_nonblock();
-	if (!IS_ERR(cred))
-		res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
-	else
-		res = PTR_ERR(cred);
+	res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
 	rcu_read_unlock();
 	if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
 		/* Fast lookup failed, try the slow way */
-		cred = rpc_lookup_cred();
-		if (!IS_ERR(cred)) {
-			res = nfs_do_access(inode, cred, mask);
-			put_rpccred(cred);
-		} else
-			res = PTR_ERR(cred);
+		res = nfs_do_access(inode, cred, mask);
 	}
 out:
 	if (!res && (mask & MAY_EXEC))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 33824a0..040a50f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/direct.c
  *
@@ -63,13 +64,6 @@
 
 static struct kmem_cache *nfs_direct_cachep;
 
-/*
- * This represents a set of asynchronous requests that we're waiting on
- */
-struct nfs_direct_mirror {
-	ssize_t count;
-};
-
 struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
@@ -83,9 +77,6 @@
 	atomic_t		io_count;	/* i/os we're waiting for */
 	spinlock_t		lock;		/* protect completion state */
 
-	struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
-	int			mirror_count;
-
 	loff_t			io_start;	/* Start offset for I/O */
 	ssize_t			count,		/* bytes actually processed */
 				max_count,	/* max expected count */
@@ -122,34 +113,44 @@
 }
 
 static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+			    const struct nfs_pgio_header *hdr,
+			    ssize_t dreq_len)
 {
-	int i;
-	ssize_t count;
+	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+		return;
+	if (dreq->max_count >= dreq_len) {
+		dreq->max_count = dreq_len;
+		if (dreq->count > dreq_len)
+			dreq->count = dreq_len;
 
-	WARN_ON_ONCE(dreq->count >= dreq->max_count);
-
-	if (dreq->mirror_count == 1) {
-		dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
-		dreq->count += hdr->good_bytes;
-	} else {
-		/* mirrored writes */
-		count = dreq->mirrors[hdr->pgio_mirror_idx].count;
-		if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
-			count = hdr->io_start + hdr->good_bytes - dreq->io_start;
-			dreq->mirrors[hdr->pgio_mirror_idx].count = count;
-		}
-		/* update the dreq->count by finding the minimum agreed count from all
-		 * mirrors */
-		count = dreq->mirrors[0].count;
-
-		for (i = 1; i < dreq->mirror_count; i++)
-			count = min(count, dreq->mirrors[i].count);
-
-		dreq->count = count;
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+			dreq->error = hdr->error;
+		else /* Clear outstanding error if this is EOF */
+			dreq->error = 0;
 	}
 }
 
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+		       const struct nfs_pgio_header *hdr)
+{
+	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+	ssize_t dreq_len = 0;
+
+	if (hdr_end > dreq->io_start)
+		dreq_len = hdr_end - dreq->io_start;
+
+	nfs_direct_handle_truncated(dreq, hdr, dreq_len);
+
+	if (dreq_len > dreq->max_count)
+		dreq_len = dreq->max_count;
+
+	if (dreq->count < dreq_len)
+		dreq->count = dreq_len;
+}
+
 /*
  * nfs_direct_select_verf - select the right verifier
  * @dreq - direct request possibly spanning multiple servers
@@ -292,18 +293,6 @@
 	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
 }
 
-static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
-					     struct nfs_pageio_descriptor *pgio,
-					     struct nfs_page *req)
-{
-	int mirror_count = 1;
-
-	if (pgio->pg_ops->pg_get_mirror_count)
-		mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
-	dreq->mirror_count = mirror_count;
-}
-
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -318,7 +307,6 @@
 	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */
 	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
-	dreq->mirror_count = 1;
 	spin_lock_init(&dreq->lock);
 
 	return dreq;
@@ -400,15 +388,13 @@
 	unsigned long bytes = 0;
 	struct nfs_direct_req *dreq = hdr->dreq;
 
-	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
-		goto out_put;
-
 	spin_lock(&dreq->lock);
-	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
-		dreq->error = hdr->error;
-	else
-		nfs_direct_good_bytes(dreq, hdr);
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		spin_unlock(&dreq->lock);
+		goto out_put;
+	}
 
+	nfs_direct_count_bytes(dreq, hdr);
 	spin_unlock(&dreq->lock);
 
 	while (!list_empty(&hdr->pages)) {
@@ -428,7 +414,7 @@
 	hdr->release(hdr);
 }
 
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
 {
 	struct nfs_page *req;
 
@@ -492,7 +478,7 @@
 			struct nfs_page *req;
 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 			/* XXX do we need to do the eof zeroing found in async_filler? */
-			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+			req = nfs_create_request(dreq->ctx, pagevec[i],
 						 pgbase, req_len);
 			if (IS_ERR(req)) {
 				result = PTR_ERR(req);
@@ -639,33 +625,27 @@
 	LIST_HEAD(reqs);
 	struct nfs_commit_info cinfo;
 	LIST_HEAD(failed);
-	int i;
 
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 
 	dreq->count = 0;
+	dreq->max_count = 0;
+	list_for_each_entry(req, &reqs, wb_list)
+		dreq->max_count += req->wb_bytes;
 	dreq->verf.committed = NFS_INVALID_STABLE_HOW;
 	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
-	for (i = 0; i < dreq->mirror_count; i++)
-		dreq->mirrors[i].count = 0;
 	get_dreq(dreq);
 
 	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 
-	req = nfs_list_entry(reqs.next);
-	nfs_direct_setup_mirroring(dreq, &desc, req);
-	if (desc.pg_error < 0) {
-		list_splice_init(&reqs, &failed);
-		goto out_failed;
-	}
-
 	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+		/* Bump the transmission count */
+		req->wb_nio++;
 		if (!nfs_pageio_add_request(&desc, req)) {
-			nfs_list_remove_request(req);
-			nfs_list_add_request(req, &failed);
+			nfs_list_move_request(req, &failed);
 			spin_lock(&cinfo.inode->i_lock);
 			dreq->flags = 0;
 			if (desc.pg_error < 0)
@@ -678,7 +658,6 @@
 	}
 	nfs_pageio_complete(&desc);
 
-out_failed:
 	while (!list_empty(&failed)) {
 		req = nfs_list_entry(failed.next);
 		nfs_list_remove_request(req);
@@ -704,6 +683,11 @@
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
 		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+			/*
+			 * Despite the reboot, the write was successful,
+			 * so reset wb_nio.
+			 */
+			req->wb_nio = 0;
 			/* Note the rewrite will go through mds */
 			nfs_mark_request_commit(req, NULL, &cinfo, 0);
 		} else
@@ -775,17 +759,16 @@
 	bool request_commit = false;
 	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
-	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
-		goto out_put;
-
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 
 	spin_lock(&dreq->lock);
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		spin_unlock(&dreq->lock);
+		goto out_put;
+	}
 
-	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
-		dreq->error = hdr->error;
-	if (dreq->error == 0) {
-		nfs_direct_good_bytes(dreq, hdr);
+	nfs_direct_count_bytes(dreq, hdr);
+	if (hdr->good_bytes != 0) {
 		if (nfs_write_need_commit(hdr)) {
 			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
 				request_commit = true;
@@ -821,7 +804,7 @@
 	hdr->release(hdr);
 }
 
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
 {
 	struct nfs_page *req;
 
@@ -900,14 +883,13 @@
 			struct nfs_page *req;
 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 
-			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+			req = nfs_create_request(dreq->ctx, pagevec[i],
 						 pgbase, req_len);
 			if (IS_ERR(req)) {
 				result = PTR_ERR(req);
 				break;
 			}
 
-			nfs_direct_setup_mirroring(dreq, &desc, req);
 			if (desc.pg_error < 0) {
 				nfs_free_request(req);
 				result = desc.pg_error;
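
The direct-I/O rework above drops the per-mirror byte tallies in favour of
nfs_direct_count_bytes(), which recomputes progress from each completing
header: progress is the furthest byte any reply reached past the request's
start, clamped to the (possibly truncated) request size, and it never moves
backwards. A pure-function restatement with worked checks; offsets are
arbitrary:

	#include <assert.h>

	static long long count_bytes(long long io_start, long long max_count,
				     long long count,
				     long long hdr_io_start,
				     long long hdr_good_bytes)
	{
		long long hdr_end = hdr_io_start + hdr_good_bytes;
		long long dreq_len = hdr_end > io_start ? hdr_end - io_start : 0;

		if (dreq_len > max_count)
			dreq_len = max_count;
		return dreq_len > count ? dreq_len : count;
	}

	int main(void)
	{
		/* 64k request at offset 4096; a reply covering 32k of it */
		assert(count_bytes(4096, 65536, 0, 4096, 32768) == 32768);
		/* an overlapping shorter reply must not rewind the count */
		assert(count_bytes(4096, 65536, 32768, 4096, 8192) == 32768);
		/* a reply overshooting the request is clamped */
		assert(count_bytes(4096, 65536, 0, 4096, 1 << 20) == 65536);
		return 0;
	}
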
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658..aec769a 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -22,7 +22,8 @@
 	char *ip_addr = NULL;
 	int ip_len;
 
-	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+	ip_len = dns_query(net, NULL, name, namelen, NULL, &ip_addr, NULL,
+			   false);
 	if (ip_len > 0)
 		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
 	else
@@ -65,6 +66,7 @@
 
 	struct sockaddr_storage addr;
 	size_t addrlen;
+	struct rcu_head rcu_head;
 };
 
 
@@ -101,13 +103,21 @@
 	}
 }
 
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
+{
+	struct nfs_dns_ent *item;
+
+	item = container_of(head, struct nfs_dns_ent, rcu_head);
+	kfree(item->hostname);
+	kfree(item);
+}
+
 static void nfs_dns_ent_put(struct kref *ref)
 {
 	struct nfs_dns_ent *item;
 
 	item = container_of(ref, struct nfs_dns_ent, h.ref);
-	kfree(item->hostname);
-	kfree(item);
+	call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu);
 }
 
 static struct cache_head *nfs_dns_ent_alloc(void)
@@ -195,7 +205,7 @@
 {
 	struct cache_head *ch;
 
-	ch = sunrpc_cache_lookup(cd,
+	ch = sunrpc_cache_lookup_rcu(cd,
 			&key->h,
 			nfs_dns_hash(key));
 	if (!ch)
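
The lookup above moves to sunrpc_cache_lookup_rcu(), so readers may traverse
the DNS cache without taking the cache lock; correspondingly, the final kref
put now defers the actual kfree() through call_rcu(), so a concurrent
lockless reader can never touch freed memory. A toy model of the
deferred-free pattern, with an explicit drain standing in for the RCU grace
period:

	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct rcu_head_sketch {
		struct rcu_head_sketch *next;
		void (*func)(struct rcu_head_sketch *);
	};

	static struct rcu_head_sketch *pending;

	/* Queue the free instead of performing it inline. */
	static void call_rcu_sketch(struct rcu_head_sketch *h,
				    void (*func)(struct rcu_head_sketch *))
	{
		h->func = func;
		h->next = pending;
		pending = h;
	}

	static void rcu_drain_sketch(void)	/* "grace period elapsed" */
	{
		while (pending) {
			struct rcu_head_sketch *h = pending;

			pending = h->next;
			h->func(h);
		}
	}

	struct dns_ent_sketch {
		char *hostname;
		struct rcu_head_sketch rcu_head;
	};

	static void dns_ent_free(struct rcu_head_sketch *h)
	{
		struct dns_ent_sketch *e = (struct dns_ent_sketch *)
			((char *)h - offsetof(struct dns_ent_sketch, rcu_head));

		free(e->hostname);
		free(e);
		printf("freed after grace period\n");
	}

	int main(void)
	{
		struct dns_ent_sketch *e = calloc(1, sizeof(*e));

		/* last reference dropped: defer, don't free */
		call_rcu_sketch(&e->rcu_head, dns_ent_free);
		/* lockless readers may still be walking the hash here */
		rcu_drain_sketch();
		return 0;
	}
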
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 29553fd..95dc905 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/nfs/file.c
  *
@@ -89,8 +90,8 @@
 
 /**
  * nfs_revalidate_size - Revalidate the file size
- * @inode - pointer to inode struct
- * @file - pointer to struct file
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
  *
  * Revalidates the file length. This is basically a wrapper around
  * nfs_revalidate_inode() that takes into account the fact that we may
@@ -147,7 +148,7 @@
 		return 0;
 
 	/* Flush writes to the server and return any errors */
-	return vfs_fsync(file, 0);
+	return nfs_wb_all(inode);
 }
 
 ssize_t
@@ -199,13 +200,6 @@
  * Flush any dirty pages for this process, and check for write errors.
  * The return status from this call provides a reliable indication of
  * whether any write errors occurred for this process.
- *
- * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
- * disk, but it retrieves and clears ctx->error after synching, despite
- * the two being set at the same time in nfs_context_set_write_error().
- * This is because the former is used to notify the _next_ call to
- * nfs_file_write() that a write error occurred, and hence cause it to
- * fall back to doing a synchronous write.
  */
 static int
 nfs_file_fsync_commit(struct file *file, int datasync)
@@ -220,11 +214,8 @@
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
 	status = nfs_commit_inode(inode, FLUSH_SYNC);
-	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
-		ret = xchg(&ctx->error, 0);
-		if (ret)
-			goto out;
-	}
+	if (status == 0)
+		status = file_check_and_advance_wb_err(file);
 	if (status < 0) {
 		ret = status;
 		goto out;
@@ -245,13 +236,7 @@
 	trace_nfs_fsync_enter(inode);
 
 	do {
-		struct nfs_open_context *ctx = nfs_file_open_context(file);
-		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
-			int ret2 = xchg(&ctx->error, 0);
-			if (ret2)
-				ret = ret2;
-		}
+		ret = file_write_and_wait_range(file, start, end);
 		if (ret != 0)
 			break;
 		ret = nfs_file_fsync_commit(file, datasync);
@@ -276,6 +261,12 @@
  * then a modify/write/read cycle when writing to a page in the
  * page cache.
  *
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't been read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
  * The modify/write/read cycle may occur if a page is read before
  * being completely filled by the writer.  In this situation, the
  * page must be completely written to stable storage on the server
@@ -291,26 +282,32 @@
  * and that the new data won't completely replace the old data in
  * that range of the file.
  */
-static int nfs_want_read_modify_write(struct file *file, struct page *page,
-			loff_t pos, unsigned len)
+static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
 {
 	unsigned int pglen = nfs_page_length(page);
 	unsigned int offset = pos & (PAGE_SIZE - 1);
 	unsigned int end = offset + len;
 
-	if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
-		if (!PageUptodate(page))
-			return 1;
-		return 0;
-	}
+	return !pglen || (end >= pglen && !offset);
+}
 
-	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
-	    !PageUptodate(page) &&		/* Uptodate? */
-	    !PagePrivate(page) &&		/* i/o request already? */
-	    pglen &&				/* valid bytes of file? */
-	    (end < pglen || offset))		/* replace all valid bytes? */
-		return 1;
-	return 0;
+static bool nfs_want_read_modify_write(struct file *file, struct page *page,
+			loff_t pos, unsigned int len)
+{
+	/*
+	 * Up-to-date pages, pages with an in-flight request, and
+	 * full-page writes don't need read/modify/write.
+	 */
+	if (PageUptodate(page) || PagePrivate(page) ||
+	    nfs_full_page_write(page, pos, len))
+		return false;
+
+	if (pnfs_ld_read_whole_page(file->f_mapping->host))
+		return true;
+	/* Open for reading too? */
+	if (file->f_mode & FMODE_READ)
+		return true;
+	return false;
 }
 
 /*
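
nfs_want_read_modify_write() above is rebuilt around the nfs_full_page_write()
helper: a write needs no prior read when the page is up to date, already has
an I/O request in flight, or will be completely overwritten. The predicate
lifted out with worked checks, PAGE_SIZE assumed to be 4096:

	#include <assert.h>
	#include <stdbool.h>

	#define PAGE_SIZE 4096u

	/* A write "covers the page" if the page has no valid bytes yet, or
	 * it starts at offset 0 and runs past every valid byte (pglen). */
	static bool full_page_write(unsigned pglen, long long pos, unsigned len)
	{
		unsigned offset = pos & (PAGE_SIZE - 1);
		unsigned end = offset + len;

		return !pglen || (end >= pglen && !offset);
	}

	int main(void)
	{
		assert(full_page_write(0, 0, 10));		/* empty page */
		assert(full_page_write(512, 4096, 512));	/* covers all */
		assert(!full_page_write(4096, 4096 + 100, 200)); /* partial */
		return 0;
	}
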
@@ -588,8 +585,7 @@
 	struct nfs_open_context *ctx;
 
 	ctx = nfs_file_open_context(filp);
-	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
-	    nfs_ctx_key_to_expire(ctx, inode))
+	if (nfs_ctx_key_to_expire(ctx, inode))
 		return 1;
 	return 0;
 }
@@ -643,7 +639,7 @@
 
 	/* Return error values */
 	if (nfs_need_check_write(file, inode)) {
-		int err = vfs_fsync(file, 0);
+		int err = nfs_wb_all(inode);
 		if (err < 0)
 			result = err;
 	}
@@ -697,7 +693,7 @@
 	 * Flush all pending writes before doing anything
 	 * with locks..
 	 */
-	vfs_fsync(filp, 0);
+	nfs_wb_all(inode);
 
 	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
 	if (!IS_ERR(l_ctx)) {
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
index 8516cdf..de05631 100644
--- a/fs/nfs/filelayout/Makefile
+++ b/fs/nfs/filelayout/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the pNFS Files Layout Driver kernel module
 #
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d175724..c9b605f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -904,7 +904,7 @@
 	status = filelayout_check_deviceid(lo, fl, gfp_flags);
 	if (status) {
 		pnfs_put_lseg(lseg);
-		lseg = ERR_PTR(status);
+		lseg = NULL;
 	}
 out:
 	return lseg;
@@ -917,7 +917,7 @@
 	pnfs_generic_pg_check_layout(pgio);
 	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
-						      req->wb_context,
+						      nfs_req_openctx(req),
 						      0,
 						      NFS4_MAX_UINT64,
 						      IOMODE_READ,
@@ -944,7 +944,7 @@
 	pnfs_generic_pg_check_layout(pgio);
 	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
-						      req->wb_context,
+						      nfs_req_openctx(req),
 						      0,
 						      NFS4_MAX_UINT64,
 						      IOMODE_RW,
@@ -1164,6 +1164,8 @@
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
 	.owner			= THIS_MODULE,
+	.flags			= PNFS_LAYOUTGET_ON_OPEN,
+	.max_layoutget_response	= 4096, /* 1 page or so... */
 	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
 	.free_layout_hdr	= filelayout_free_layout_hdr,
 	.alloc_lseg		= filelayout_alloc_lseg,
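
The filelayout fix above matters because fl_pnfs_update_layout()'s callers
test the returned segment against NULL only; handing back ERR_PTR(status)
leaks an error-encoded pointer into paths that will dereference anything
non-NULL. The kernel's ERR_PTR convention, reduced to a self-contained
sketch:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_ERRNO 4095	/* kernel: top 4095 addresses encode errors */

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline bool IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	int main(void)
	{
		void *p = ERR_PTR(-5);

		/* an error pointer is not NULL, so "if (!p)" won't catch it */
		printf("IS_ERR: %d, err: %ld, looks NULL: %d\n",
		       IS_ERR(p), PTR_ERR(p), p == NULL);
		return 0;
	}
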
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
index 1d2c9f6..49f0342 100644
--- a/fs/nfs/flexfilelayout/Makefile
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the pNFS Flexfile Layout Driver kernel module
 #
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e0fe9a0..5657b7f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Module for pnfs flexfile layout driver.
  *
@@ -7,8 +8,10 @@
  */
 
 #include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 
 #include <linux/sunrpc/metrics.h>
 
@@ -27,8 +30,7 @@
 #define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
 #define FF_LAYOUTRETURN_MAXERR 20
 
-
-static struct group_info	*ff_zero_group;
+static unsigned short io_maxretrans;
 
 static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
 		struct nfs_pgio_header *hdr);
@@ -226,16 +228,14 @@
 
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
-	struct rpc_cred	*cred;
+	const struct cred	*cred;
 
 	ff_layout_remove_mirror(mirror);
 	kfree(mirror->fh_versions);
 	cred = rcu_access_pointer(mirror->ro_cred);
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 	cred = rcu_access_pointer(mirror->rw_cred);
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
 	kfree(mirror);
 }
@@ -413,8 +413,10 @@
 
 	for (i = 0; i < fls->mirror_array_cnt; i++) {
 		struct nfs4_ff_layout_mirror *mirror;
-		struct auth_cred acred = { .group_info = ff_zero_group };
-		struct rpc_cred	__rcu *cred;
+		struct cred *kcred;
+		const struct cred __rcu *cred;
+		kuid_t uid;
+		kgid_t gid;
 		u32 ds_count, fh_count, id;
 		int j;
 
@@ -482,21 +484,28 @@
 		if (rc)
 			goto out_err_free;
 
-		acred.uid = make_kuid(&init_user_ns, id);
+		uid = make_kuid(&init_user_ns, id);
 
 		/* group */
 		rc = decode_name(&stream, &id);
 		if (rc)
 			goto out_err_free;
 
-		acred.gid = make_kgid(&init_user_ns, id);
+		gid = make_kgid(&init_user_ns, id);
 
-		/* find the cred for it */
-		rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
-		if (IS_ERR(cred)) {
-			rc = PTR_ERR(cred);
-			goto out_err_free;
+		if (gfp_flags & __GFP_FS)
+			kcred = prepare_kernel_cred(NULL);
+		else {
+			unsigned int nofs_flags = memalloc_nofs_save();
+			kcred = prepare_kernel_cred(NULL);
+			memalloc_nofs_restore(nofs_flags);
 		}
+		rc = -ENOMEM;
+		if (!kcred)
+			goto out_err_free;
+		kcred->fsuid = uid;
+		kcred->fsgid = gid;
+		cred = RCU_INITIALIZER(kcred);
 
 		if (lgr->range.iomode == IOMODE_READ)
 			rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
@@ -519,8 +528,8 @@
 
 		dprintk("%s: iomode %s uid %u gid %u\n", __func__,
 			lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
-			from_kuid(&init_user_ns, acred.uid),
-			from_kgid(&init_user_ns, acred.gid));
+			from_kuid(&init_user_ns, uid),
+			from_kgid(&init_user_ns, gid));
 	}
 
 	p = xdr_inline_decode(&stream, 4);
@@ -783,30 +792,82 @@
 	}
 }
 
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+{
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	if (devid)
+		nfs4_mark_deviceid_unavailable(devid);
+}
+
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+{
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	if (devid)
+		nfs4_mark_deviceid_available(devid);
+}
+
 static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
-				  int start_idx,
-				  int *best_idx)
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+			     int start_idx, int *best_idx,
+			     bool check_device)
 {
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_pnfs_ds *ds;
 	bool fail_return = false;
 	int idx;
 
-	/* mirrors are sorted by efficiency */
+	/* mirrors are initially sorted by efficiency */
 	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
 		if (idx+1 == fls->mirror_array_cnt)
-			fail_return = true;
-		ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
-		if (ds) {
-			*best_idx = idx;
-			return ds;
-		}
+			fail_return = !check_device;
+
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+		if (!ds)
+			continue;
+
+		if (check_device &&
+		    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+			continue;
+
+		*best_idx = idx;
+		return ds;
 	}
 
 	return NULL;
 }
 
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+				 int start_idx, int *best_idx)
+{
+	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+				   int start_idx, int *best_idx)
+{
+	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+				  int start_idx, int *best_idx)
+{
+	struct nfs4_pnfs_ds *ds;
+
+	ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+	if (ds)
+		return ds;
+	return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+}
+
 static void
 ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
 		      struct nfs_page *req,
@@ -814,7 +875,7 @@
 {
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-					   req->wb_context,
+					   nfs_req_openctx(req),
 					   0,
 					   NFS4_MAX_UINT64,
 					   IOMODE_READ,
@@ -868,13 +929,21 @@
 	pgm = &pgio->pg_mirrors[0];
 	pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
 
+	if (NFS_SERVER(pgio->pg_inode)->flags &
+			(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+		pgio->pg_maxretrans = io_maxretrans;
 	return;
 out_nolseg:
 	if (pgio->pg_error < 0)
 		return;
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_READ,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
+	pgio->pg_maxretrans = 0;
 	nfs_pageio_reset_read_mds(pgio);
 }
 
@@ -893,7 +962,7 @@
 	pnfs_generic_pg_check_layout(pgio);
 	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   req->wb_context,
+						   nfs_req_openctx(req),
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
@@ -920,7 +989,8 @@
 		goto out_mds;
 
 	for (i = 0; i < pgio->pg_mirror_count; i++) {
-		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
 		if (!ds) {
 			if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
 				goto out_mds;
@@ -931,15 +1001,22 @@
 			goto retry;
 		}
 		pgm = &pgio->pg_mirrors[i];
-		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
 		pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
 	}
 
+	if (NFS_SERVER(pgio->pg_inode)->flags &
+			(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+		pgio->pg_maxretrans = io_maxretrans;
 	return;
 
 out_mds:
+	trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
+	pgio->pg_maxretrans = 0;
 	nfs_pageio_reset_write_mds(pgio);
 }
 
@@ -949,7 +1026,7 @@
 {
 	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   req->wb_context,
+						   nfs_req_openctx(req),
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
@@ -964,6 +1041,10 @@
 	if (pgio->pg_lseg)
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
+	trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
+			0, NFS4_MAX_UINT64, IOMODE_RW,
+			NFS_I(pgio->pg_inode)->layout,
+			pgio->pg_lseg);
 	/* no lseg means that pnfs is not in use, so no mirroring here */
 	nfs_pageio_reset_write_mds(pgio);
 out:
@@ -1013,6 +1094,10 @@
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_write_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_RW, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 	}
 }
@@ -1032,6 +1117,10 @@
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
+		trace_pnfs_mds_fallback_read_done(hdr->inode,
+				hdr->args.offset, hdr->args.count,
+				IOMODE_READ, NFS_I(hdr->inode)->layout,
+				hdr->lseg);
 		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 	}
 }
@@ -1153,8 +1242,10 @@
 {
 	int vers = clp->cl_nfs_mod->rpc_vers->number;
 
-	if (task->tk_status >= 0)
+	if (task->tk_status >= 0) {
+		ff_layout_mark_ds_reachable(lseg, idx);
 		return 0;
+	}
 
 	/* Handle the case of an invalid layout segment */
 	if (!pnfs_is_valid_lseg(lseg))
@@ -1217,6 +1308,8 @@
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
 				       GFP_NOIO);
+	if (status == NFS4ERR_NXIO)
+		ff_layout_mark_ds_unreachable(lseg, idx);
 	pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
 	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
@@ -1225,6 +1318,7 @@
 static int ff_layout_read_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
+	int new_idx = hdr->pgio_mirror_idx;
 	int err;
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
@@ -1243,8 +1337,8 @@
 	case -NFS4ERR_RESET_TO_PNFS:
 		if (ff_layout_choose_best_ds_for_read(hdr->lseg,
 					hdr->pgio_mirror_idx + 1,
-					&hdr->pgio_mirror_idx))
-			goto out_eagain;
+					&new_idx))
+			goto out_layouterror;
 		set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
 		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
@@ -1255,6 +1349,10 @@
 	}
 
 	return 0;
+out_layouterror:
+	ff_layout_read_record_layoutstats_done(task, hdr);
+	ff_layout_send_layouterror(hdr->lseg);
+	hdr->pgio_mirror_idx = new_idx;
 out_eagain:
 	rpc_restart_call_prepare(task);
 	return -EAGAIN;
@@ -1288,15 +1386,6 @@
 		(unsigned long long) NFS_I(inode)->layout->plh_lwb);
 }
 
-static bool
-ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
-{
-	/* No mirroring for now */
-	struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
-
-	return ff_layout_test_devid_unavailable(node);
-}
-
 static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
 		struct nfs_pgio_header *hdr)
 {
@@ -1327,10 +1416,6 @@
 		rpc_exit(task, -EIO);
 		return -EIO;
 	}
-	if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
-		rpc_exit(task, -EHOSTDOWN);
-		return -EAGAIN;
-	}
 
 	ff_layout_read_record_layoutstats_start(task, hdr);
 	return 0;
@@ -1394,9 +1479,10 @@
 	struct nfs_pgio_header *hdr = data;
 
 	ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
-	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+		ff_layout_send_layouterror(hdr->lseg);
 		pnfs_read_resend_pnfs(hdr);
-	else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+	} else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
 		ff_layout_reset_read(hdr);
 	pnfs_generic_rw_release(data);
 }
@@ -1508,11 +1594,6 @@
 		return -EIO;
 	}
 
-	if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
-		rpc_exit(task, -EHOSTDOWN);
-		return -EAGAIN;
-	}
-
 	ff_layout_write_record_layoutstats_start(task, hdr);
 	return 0;
 }
@@ -1568,9 +1649,10 @@
 	struct nfs_pgio_header *hdr = data;
 
 	ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
-	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+		ff_layout_send_layouterror(hdr->lseg);
 		ff_layout_reset_write(hdr, true);
-	else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+	} else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
 		ff_layout_reset_write(hdr, false);
 	pnfs_generic_rw_release(data);
 }
@@ -1698,7 +1780,8 @@
 	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	struct rpc_cred *ds_cred;
+	struct nfs4_ff_layout_mirror *mirror;
+	const struct cred *ds_cred;
 	loff_t offset = hdr->args.offset;
 	u32 idx = hdr->pgio_mirror_idx;
 	int vers;
@@ -1708,20 +1791,21 @@
 		__func__, hdr->inode->i_ino,
 		hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
-	ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+	mirror = FF_LAYOUT_COMP(lseg, idx);
+	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
 	if (!ds)
 		goto out_failed;
 
-	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
 						   hdr->inode);
 	if (IS_ERR(ds_clnt))
 		goto out_failed;
 
-	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
 	if (!ds_cred)
 		goto out_failed;
 
-	vers = nfs4_ff_layout_ds_version(lseg, idx);
+	vers = nfs4_ff_layout_ds_version(mirror);
 
 	dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
 		ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1729,13 +1813,11 @@
 	hdr->pgio_done_cb = ff_layout_read_done_cb;
 	refcount_inc(&ds->ds_clp->cl_count);
 	hdr->ds_clp = ds->ds_clp;
-	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+	fh = nfs4_ff_layout_select_ds_fh(mirror);
 	if (fh)
 		hdr->args.fh = fh;
 
-	if (vers == 4 &&
-		!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
-		goto out_failed;
+	nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
 
 	/*
 	 * Note that if we ever decide to split across DSes,
@@ -1749,12 +1831,15 @@
 			  vers == 3 ? &ff_layout_read_call_ops_v3 :
 				      &ff_layout_read_call_ops_v4,
 			  0, RPC_TASK_SOFTCONN);
-	put_rpccred(ds_cred);
+	put_cred(ds_cred);
 	return PNFS_ATTEMPTED;
 
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1765,26 +1850,28 @@
 	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	struct rpc_cred *ds_cred;
+	struct nfs4_ff_layout_mirror *mirror;
+	const struct cred *ds_cred;
 	loff_t offset = hdr->args.offset;
 	int vers;
 	struct nfs_fh *fh;
 	int idx = hdr->pgio_mirror_idx;
 
-	ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+	mirror = FF_LAYOUT_COMP(lseg, idx);
+	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
 	if (!ds)
 		goto out_failed;
 
-	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
 						   hdr->inode);
 	if (IS_ERR(ds_clnt))
 		goto out_failed;
 
-	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
 	if (!ds_cred)
 		goto out_failed;
 
-	vers = nfs4_ff_layout_ds_version(lseg, idx);
+	vers = nfs4_ff_layout_ds_version(mirror);
 
 	dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
 		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1795,13 +1882,11 @@
 	refcount_inc(&ds->ds_clp->cl_count);
 	hdr->ds_clp = ds->ds_clp;
 	hdr->ds_commit_idx = idx;
-	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+	fh = nfs4_ff_layout_select_ds_fh(mirror);
 	if (fh)
 		hdr->args.fh = fh;
 
-	if (vers == 4 &&
-		!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
-		goto out_failed;
+	nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
 
 	/*
 	 * Note that if we ever decide to split across DSes,
@@ -1814,12 +1899,15 @@
 			  vers == 3 ? &ff_layout_write_call_ops_v3 :
 				      &ff_layout_write_call_ops_v4,
 			  sync, RPC_TASK_SOFTCONN);
-	put_rpccred(ds_cred);
+	put_cred(ds_cred);
 	return PNFS_ATTEMPTED;
 
 out_failed:
 	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
+	trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
+			hdr->args.offset, hdr->args.count,
+			IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1844,7 +1932,8 @@
 	struct pnfs_layout_segment *lseg = data->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	struct rpc_cred *ds_cred;
+	struct nfs4_ff_layout_mirror *mirror;
+	const struct cred *ds_cred;
 	u32 idx;
 	int vers, ret;
 	struct nfs_fh *fh;
@@ -1854,20 +1943,21 @@
 		goto out_err;
 
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
-	ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+	mirror = FF_LAYOUT_COMP(lseg, idx);
+	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
 	if (!ds)
 		goto out_err;
 
-	ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
 						   data->inode);
 	if (IS_ERR(ds_clnt))
 		goto out_err;
 
-	ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
 	if (!ds_cred)
 		goto out_err;
 
-	vers = nfs4_ff_layout_ds_version(lseg, idx);
+	vers = nfs4_ff_layout_ds_version(mirror);
 
 	dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
 		data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -1884,7 +1974,7 @@
 				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
 					       &ff_layout_commit_call_ops_v4,
 				   how, RPC_TASK_SOFTCONN);
-	put_rpccred(ds_cred);
+	put_cred(ds_cred);
 	return ret;
 out_err:
 	pnfs_generic_prepare_to_resend_writes(data);
@@ -2031,7 +2121,7 @@
 
 	dprintk("%s: Begin\n", __func__);
 
-	xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+	xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
 
 	ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
 	ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
@@ -2097,6 +2187,52 @@
 	return -ENOMEM;
 }
 
+#ifdef CONFIG_NFS_V4_2
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+	struct nfs42_layout_error *errors;
+	LIST_HEAD(head);
+
+	if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+		return;
+	ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+	if (list_empty(&head))
+		return;
+
+	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
+			sizeof(*errors), GFP_NOFS);
+	if (errors != NULL) {
+		const struct nfs4_ff_layout_ds_err *pos;
+		size_t n = 0;
+
+		list_for_each_entry(pos, &head, list) {
+			errors[n].offset = pos->offset;
+			errors[n].length = pos->length;
+			nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+			errors[n].errors[0].dev_id = pos->deviceid;
+			errors[n].errors[0].status = pos->status;
+			errors[n].errors[0].opnum = pos->opnum;
+			n++;
+			if (!list_is_last(&pos->list, &head) &&
+			    n < NFS42_LAYOUTERROR_MAX)
+				continue;
+			if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+				break;
+			n = 0;
+		}
+		kfree(errors);
+	}
+	ff_layout_free_ds_ioerr(&head);
+}
+#else
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
 static int
 ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
 {
@@ -2355,6 +2491,7 @@
 	.name			= "LAYOUT_FLEX_FILES",
 	.owner			= THIS_MODULE,
 	.flags			= PNFS_LAYOUTGET_ON_OPEN,
+	.max_layoutget_response	= 4096, /* 1 page or so... */
 	.set_layoutdriver	= ff_layout_set_layoutdriver,
 	.alloc_layout_hdr	= ff_layout_alloc_layout_hdr,
 	.free_layout_hdr	= ff_layout_free_layout_hdr,
@@ -2382,11 +2519,6 @@
 {
 	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
 	       __func__);
-	if (!ff_zero_group) {
-		ff_zero_group = groups_alloc(0);
-		if (!ff_zero_group)
-			return -ENOMEM;
-	}
 	return pnfs_register_layoutdriver(&flexfilelayout_type);
 }
 
@@ -2395,10 +2527,6 @@
 	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
 	       __func__);
 	pnfs_unregister_layoutdriver(&flexfilelayout_type);
-	if (ff_zero_group) {
-		put_group_info(ff_zero_group);
-		ff_zero_group = NULL;
-	}
 }
 
 MODULE_ALIAS("nfs-layouttype4-4");
@@ -2408,3 +2536,7 @@
 
 module_init(nfs4flexfilelayout_init);
 module_exit(nfs4flexfilelayout_exit);
+
+module_param(io_maxretrans, ushort, 0644);
+MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client "
+			"retries an I/O request before returning an error.");
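
The credential hunk in ff_layout_alloc_lseg() above replaces the rpc_cred
lookup with a direct prepare_kernel_cred() call, bracketing it with
memalloc_nofs_save()/memalloc_nofs_restore() when the caller's gfp mask
lacks __GFP_FS. prepare_kernel_cred() takes no gfp argument, so the scoped
NOFS API is the way to keep its internal GFP_KERNEL allocations from
recursing into the filesystem. A condensed sketch of the pattern, assuming
kernel context (alloc_ds_cred is a hypothetical wrapper, not a kernel
function):

#include <linux/cred.h>
#include <linux/gfp.h>
#include <linux/sched/mm.h>

static struct cred *alloc_ds_cred(gfp_t gfp_flags)
{
	struct cred *kcred;
	unsigned int nofs_flags;

	if (gfp_flags & __GFP_FS)
		return prepare_kernel_cred(NULL);

	/* Degrade every allocation in this scope to GFP_NOFS. */
	nofs_flags = memalloc_nofs_save();
	kcred = prepare_kernel_cred(NULL);
	memalloc_nofs_restore(nofs_flags);
	return kcred;
}
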
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index de50a34..2f36996 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -81,8 +81,8 @@
 	u32				fh_versions_cnt;
 	struct nfs_fh			*fh_versions;
 	nfs4_stateid			stateid;
-	struct rpc_cred	__rcu		*ro_cred;
-	struct rpc_cred	__rcu		*rw_cred;
+	const struct cred __rcu		*ro_cred;
+	const struct cred __rcu		*rw_cred;
 	refcount_t			ref;
 	spinlock_t			lock;
 	unsigned long			flags;
@@ -132,16 +132,6 @@
 			    generic_hdr);
 }
 
-static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
-{
-	if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
-	    FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
-	    FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
-		return NULL;
-	return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
-}
-
 static inline struct nfs4_ff_layout_ds *
 FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
 {
@@ -151,9 +141,25 @@
 static inline struct nfs4_ff_layout_mirror *
 FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
 {
-	if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
-		return NULL;
-	return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+	if (idx < fls->mirror_array_cnt)
+		return fls->mirror_array[idx];
+	return NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+
+	if (mirror != NULL) {
+		struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+
+		if (!IS_ERR_OR_NULL(mirror_ds))
+			return &mirror_ds->id_node;
+	}
+	return NULL;
 }
 
 static inline u32
@@ -174,28 +180,10 @@
 	return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
 }
 
-static inline bool
-ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
-{
-	/*
-	 * Flexfiles should never mark a DS unavailable, but if it does
-	 * print a (ratelimited) warning as this can affect performance.
-	 */
-	if (nfs4_test_deviceid_unavailable(node)) {
-		u32 *p = (u32 *)node->deviceid.data;
-
-		pr_warn_ratelimited("NFS: flexfiles layout referencing an "
-				"unavailable device [%x%x%x%x]\n",
-				p[0], p[1], p[2], p[3]);
-		return true;
-	}
-	return false;
-}
-
 static inline int
-nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
 {
-	return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+	return mirror->mirror_ds->ds_versions[0].version;
 }
 
 struct nfs4_ff_layout_ds *
@@ -207,6 +195,7 @@
 			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
 			     u64 length, int status, enum nfs_opnum4 opnum,
 			     gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
 int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
 void ff_layout_free_ds_ioerr(struct list_head *head);
 unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
@@ -214,23 +203,23 @@
 		struct list_head *head,
 		unsigned int maxnum);
 struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
-				u32 mirror_idx,
-				nfs4_stateid *stateid);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+		nfs4_stateid *stateid);
 
 struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+			  struct nfs4_ff_layout_mirror *mirror,
 			  bool fail_return);
 
 struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
-				 u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
 				 struct nfs_client *ds_clp,
 				 struct inode *inode);
-struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
-				       u32 ds_idx, struct rpc_cred *mdscred);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+					 const struct pnfs_layout_range *range,
+					 const struct cred *mdscred);
 bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
 bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index a8df2f4..3eda40a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -18,7 +18,7 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
-static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
 static unsigned int dataserver_retrans;
 
 static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
@@ -183,56 +183,6 @@
 	return NULL;
 }
 
-static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
-		struct nfs4_deviceid_node *devid)
-{
-	nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid);
-	if (!ff_layout_has_available_ds(lseg))
-		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
-				lseg);
-}
-
-static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
-				   struct nfs4_ff_layout_mirror *mirror,
-				   bool create)
-{
-	if (mirror == NULL || IS_ERR(mirror->mirror_ds))
-		goto outerr;
-	if (mirror->mirror_ds == NULL) {
-		if (create) {
-			struct nfs4_deviceid_node *node;
-			struct pnfs_layout_hdr *lh = lseg->pls_layout;
-			struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
-
-			node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
-					&mirror->devid, lh->plh_lc_cred,
-					GFP_KERNEL);
-			if (node)
-				mirror_ds = FF_LAYOUT_MIRROR_DS(node);
-
-			/* check for race with another call to this function */
-			if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
-			    mirror_ds != ERR_PTR(-ENODEV))
-				nfs4_put_deviceid_node(node);
-		} else
-			goto outerr;
-	}
-
-	if (IS_ERR(mirror->mirror_ds))
-		goto outerr;
-
-	if (mirror->mirror_ds->ds == NULL) {
-		struct nfs4_deviceid_node *devid;
-		devid = &mirror->mirror_ds->id_node;
-		ff_layout_mark_devid_invalid(lseg, devid);
-		return false;
-	}
-	return true;
-outerr:
-	pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
-	return false;
-}
-
 static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
 			    u64 offset, u64 length)
 {
@@ -307,7 +257,7 @@
 	if (status == 0)
 		return 0;
 
-	if (mirror->mirror_ds == NULL)
+	if (IS_ERR_OR_NULL(mirror->mirror_ds))
 		return -EINVAL;
 
 	dserr = kmalloc(sizeof(*dserr), gfp_flags);
@@ -326,14 +276,13 @@
 	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
 	ff_layout_add_ds_error_locked(flo, dserr);
 	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-
 	return 0;
 }
 
-static struct rpc_cred *
+static const struct cred *
 ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
 {
-	struct rpc_cred *cred, __rcu **pcred;
+	const struct cred *cred, __rcu **pcred;
 
 	if (iomode == IOMODE_READ)
 		pcred = &mirror->ro_cred;
@@ -346,53 +295,61 @@
 		if (!cred)
 			break;
 
-		cred = get_rpccred_rcu(cred);
+		cred = get_cred_rcu(cred);
 	} while(!cred);
 	rcu_read_unlock();
 	return cred;
 }
 
 struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
 {
-	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
-	struct nfs_fh *fh = NULL;
-
-	if (!ff_layout_mirror_valid(lseg, mirror, false)) {
-		pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
-			__func__, mirror_idx);
-		goto out;
-	}
-
 	/* FIXME: For now assume there is only 1 version available for the DS */
-	fh = &mirror->fh_versions[0];
-out:
-	return fh;
+	return &mirror->fh_versions[0];
 }
 
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
-				u32 mirror_idx,
-				nfs4_stateid *stateid)
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+		nfs4_stateid *stateid)
 {
-	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+	if (nfs4_ff_layout_ds_version(mirror) == 4)
+		nfs4_stateid_copy(stateid, &mirror->stateid);
+}
 
-	if (!ff_layout_mirror_valid(lseg, mirror, false)) {
-		pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
-			__func__, mirror_idx);
-		goto out;
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+			 struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror == NULL)
+		goto outerr;
+	if (mirror->mirror_ds == NULL) {
+		struct nfs4_deviceid_node *node;
+		struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+		node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+				&mirror->devid, lo->plh_lc_cred,
+				GFP_KERNEL);
+		if (node)
+			mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+		/* check for race with another call to this function */
+		if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+		    mirror_ds != ERR_PTR(-ENODEV))
+			nfs4_put_deviceid_node(node);
 	}
 
-	nfs4_stateid_copy(stateid, &mirror->stateid);
-	return 1;
-out:
-	return 0;
+	if (IS_ERR(mirror->mirror_ds))
+		goto outerr;
+
+	return true;
+outerr:
+	return false;
 }
 
 /**
  * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
  * @lseg: the layout segment we're operating on
- * @ds_idx: index of the DS to use
+ * @mirror: layout mirror describing the DS to use
  * @fail_return: return layout on connect failure?
  *
  * Try to prepare a DS connection to accept an RPC call. This involves
@@ -407,26 +364,18 @@
  * Returns a pointer to a connected DS object on success or NULL on failure.
  */
 struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+			  struct nfs4_ff_layout_mirror *mirror,
 			  bool fail_return)
 {
-	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
 	struct nfs4_pnfs_ds *ds = NULL;
-	struct nfs4_deviceid_node *devid;
 	struct inode *ino = lseg->pls_layout->plh_inode;
 	struct nfs_server *s = NFS_SERVER(ino);
 	unsigned int max_payload;
 	int status;
 
-	if (!ff_layout_mirror_valid(lseg, mirror, true)) {
-		pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
-			__func__, ds_idx);
-		goto out;
-	}
-
-	devid = &mirror->mirror_ds->id_node;
-	if (ff_layout_test_devid_unavailable(devid))
-		goto out_fail;
+	if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+		goto noconnect;
 
 	ds = mirror->mirror_ds->ds;
 	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -437,8 +386,8 @@
 	/* FIXME: For now we assume the server sent only one version of NFS
 	 * to use for the DS.
 	 */
-	status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
-			     dataserver_retrans,
+	status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+			     dataserver_timeo, dataserver_retrans,
 			     mirror->mirror_ds->ds_versions[0].version,
 			     mirror->mirror_ds->ds_versions[0].minor_version);
 
@@ -453,11 +402,12 @@
 			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
 		goto out;
 	}
-out_fail:
+noconnect:
 	ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				 mirror, lseg->pls_range.offset,
 				 lseg->pls_range.length, NFS4ERR_NXIO,
 				 OP_ILLEGAL, GFP_NOIO);
+	ff_layout_send_layouterror(lseg);
 	if (fail_return || !ff_layout_has_available_ds(lseg))
 		pnfs_error_mark_layout_for_return(ino, lseg);
 	ds = NULL;
@@ -465,33 +415,36 @@
 	return ds;
 }
 
-struct rpc_cred *
-ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
-		      struct rpc_cred *mdscred)
+const struct cred *
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+		      const struct pnfs_layout_range *range,
+		      const struct cred *mdscred)
 {
-	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
-	if (mirror) {
-		cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+	if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
+		cred = ff_layout_get_mirror_cred(mirror, range->iomode);
 		if (!cred)
-			cred = get_rpccred(mdscred);
+			cred = get_cred(mdscred);
 	} else {
-		cred = get_rpccred(mdscred);
+		cred = get_cred(mdscred);
 	}
 	return cred;
 }
 
 /**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with the MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
 struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
 				 struct nfs_client *ds_clp, struct inode *inode)
 {
-	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-
 	switch (mirror->mirror_ds->ds_versions[0].version) {
 	case 3:
 		/* For NFSv3 DS, flavor is set when creating DS connections */
@@ -608,7 +561,7 @@
 			if (IS_ERR(mirror->mirror_ds))
 				continue;
 			devid = &mirror->mirror_ds->id_node;
-			if (!ff_layout_test_devid_unavailable(devid))
+			if (!nfs4_test_deviceid_unavailable(devid))
 				return true;
 		}
 	}
@@ -629,7 +582,7 @@
 		if (!mirror->mirror_ds)
 			continue;
 		devid = &mirror->mirror_ds->id_node;
-		if (ff_layout_test_devid_unavailable(devid))
+		if (nfs4_test_deviceid_unavailable(devid))
 			return false;
 	}
 
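
The new ff_layout_init_mirror_ds() above resolves concurrent device
lookups with a lockless install-once cmpxchg: the first caller publishes
its nfs4_ff_layout_ds, and later callers see the non-NULL old value and
drop their redundant deviceid reference (with a carve-out for the
ERR_PTR(-ENODEV) sentinel, which holds no reference and is not modeled
here). A user-space model of the idiom using C11 atomics; all names are
illustrative:

#include <stdatomic.h>
#include <stdio.h>

struct ds_node { int refs; };

static _Atomic(struct ds_node *) mirror_ds;	/* starts out NULL */

static void put_node(struct ds_node *n) { n->refs--; }

static struct ds_node *install_once(struct ds_node *candidate)
{
	struct ds_node *expected = NULL;

	/* Models: if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds)) ... */
	if (!atomic_compare_exchange_strong(&mirror_ds, &expected, candidate)) {
		put_node(candidate);	/* lost the race: drop our reference */
		return expected;	/* and use the winner's pointer */
	}
	return candidate;
}

int main(void)
{
	struct ds_node a = { .refs = 1 }, b = { .refs = 1 };

	install_once(&a);
	install_once(&b);
	printf("winner is a: %d, b.refs after losing: %d\n",
	       atomic_load(&mirror_ds) == &a, b.refs);
	return 0;
}
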
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 666415d..15f2714 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* NFS FS-Cache index structure definition
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/init.h>
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 4dc8878..3800ab6 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* NFS filesystem cache interface
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #include <linux/init.h>
@@ -118,6 +114,10 @@
 	struct rb_node **p, *parent;
 	int diff;
 
+	nfss->fscache_key = NULL;
+	nfss->fscache = NULL;
+	if (!(nfss->options & NFS_OPTION_FSCACHE))
+		return;
 	if (!uniq) {
 		uniq = "";
 		ulen = 1;
@@ -230,10 +230,11 @@
 void nfs_fscache_init_inode(struct inode *inode)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
+	struct nfs_server *nfss = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	nfsi->fscache = NULL;
-	if (!S_ISREG(inode->i_mode))
+	if (!(nfss->fscache && S_ISREG(inode->i_mode)))
 		return;
 
 	memset(&auxdata, 0, sizeof(auxdata));
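
Taken together, the two fscache.c hunks above appear to establish the
invariant that nfss->fscache is non-NULL only when the mount requested
NFS_OPTION_FSCACHE, and that per-inode cookies are created only under a
live server cookie; that is what lets the fscache.h hunk below drop its
separate flag test. A small runnable model of the guard chain (struct
names are stand-ins, not the kernel's types):

#include <stdbool.h>
#include <stdio.h>

struct server { bool opt_fscache; void *fscache; };
struct inode  { struct server *srv; bool is_reg; void *fscache; };

static void server_get_cookie(struct server *s)
{
	s->fscache = NULL;
	if (!s->opt_fscache)		/* mirrors the new early return */
		return;
	s->fscache = s;			/* stand-in for a real cookie */
}

static void inode_init_cookie(struct inode *i)
{
	i->fscache = NULL;
	if (!(i->srv->fscache && i->is_reg))	/* mirrors the new test */
		return;
	i->fscache = i;
}

int main(void)
{
	struct server s = { .opt_fscache = false };
	struct inode ino = { .srv = &s, .is_reg = true };

	server_get_cookie(&s);
	inode_init_cookie(&ino);
	/* With the invariant in place, one pointer test answers both. */
	printf("fscache state: %s\n", s.fscache ? "yes" : "no ");
	return 0;
}
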
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 161ba2e..ad041cf 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* NFS filesystem cache interface definitions
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
  */
 
 #ifndef _NFS_FSCACHE_H
@@ -186,7 +182,7 @@
  */
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 {
-	if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+	if (server->fscache)
 		return "yes";
 	return "no ";
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 391dafa..878c4c5 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* getroot.c: get the root dentry for an NFS mount
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b65aee4..2a03bfe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/nfs/inode.c
  *
@@ -50,6 +51,7 @@
 #include "pnfs.h"
 #include "nfs.h"
 #include "netns.h"
+#include "sysfs.h"
 
 #include "nfstrace.h"
 
@@ -143,6 +145,7 @@
 
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
  */
 int nfs_sync_mapping(struct address_space *mapping)
 {
@@ -206,7 +209,7 @@
 	}
 
 	if (inode->i_mapping->nrpages == 0)
-		flags &= ~NFS_INO_INVALID_DATA;
+		flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
 	nfsi->cache_validity |= flags;
 	if (flags & NFS_INO_INVALID_DATA)
 		nfs_fscache_invalidate(inode);
@@ -650,7 +653,8 @@
 	i_size_write(inode, offset);
 	/* Optimisation */
 	if (offset == 0)
-		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
+				NFS_INO_DATA_INVAL_DEFER);
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
 
 	spin_unlock(&inode->i_lock);
@@ -857,15 +861,14 @@
 
 static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
 {
-	struct nfs_lock_context *head = &ctx->lock_context;
-	struct nfs_lock_context *pos = head;
+	struct nfs_lock_context *pos;
 
-	do {
+	list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) {
 		if (pos->lockowner != current->files)
 			continue;
-		refcount_inc(&pos->count);
-		return pos;
-	} while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
+		if (refcount_inc_not_zero(&pos->count))
+			return pos;
+	}
 	return NULL;
 }
 
@@ -874,10 +877,10 @@
 	struct nfs_lock_context *res, *new = NULL;
 	struct inode *inode = d_inode(ctx->dentry);
 
-	spin_lock(&inode->i_lock);
+	rcu_read_lock();
 	res = __nfs_find_lock_context(ctx);
+	rcu_read_unlock();
 	if (res == NULL) {
-		spin_unlock(&inode->i_lock);
 		new = kmalloc(sizeof(*new), GFP_KERNEL);
 		if (new == NULL)
 			return ERR_PTR(-ENOMEM);
@@ -885,14 +888,18 @@
 		spin_lock(&inode->i_lock);
 		res = __nfs_find_lock_context(ctx);
 		if (res == NULL) {
-			list_add_tail(&new->list, &ctx->lock_context.list);
-			new->open_context = ctx;
-			res = new;
-			new = NULL;
+			new->open_context = get_nfs_open_context(ctx);
+			if (new->open_context) {
+				list_add_tail_rcu(&new->list,
+						&ctx->lock_context.list);
+				res = new;
+				new = NULL;
+			} else
+				res = ERR_PTR(-EBADF);
 		}
+		spin_unlock(&inode->i_lock);
+		kfree(new);
 	}
-	spin_unlock(&inode->i_lock);
-	kfree(new);
 	return res;
 }
 EXPORT_SYMBOL_GPL(nfs_get_lock_context);
@@ -904,9 +911,10 @@
 
 	if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
 		return;
-	list_del(&l_ctx->list);
+	list_del_rcu(&l_ctx->list);
 	spin_unlock(&inode->i_lock);
-	kfree(l_ctx);
+	put_nfs_open_context(ctx);
+	kfree_rcu(l_ctx, rcu_head);
 }
 EXPORT_SYMBOL_GPL(nfs_put_lock_context);
 
@@ -951,18 +959,17 @@
 						struct file *filp)
 {
 	struct nfs_open_context *ctx;
-	struct rpc_cred *cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return ERR_CAST(cred);
+	const struct cred *cred = get_current_cred();
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx) {
-		put_rpccred(cred);
+		put_cred(cred);
 		return ERR_PTR(-ENOMEM);
 	}
 	nfs_sb_active(dentry->d_sb);
 	ctx->dentry = dget(dentry);
 	ctx->cred = cred;
+	ctx->ll_cred = NULL;
 	ctx->state = NULL;
 	ctx->mode = f_mode;
 	ctx->flags = 0;
@@ -978,9 +985,9 @@
 
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
-	if (ctx != NULL)
-		refcount_inc(&ctx->lock_context.count);
-	return ctx;
+	if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count))
+		return ctx;
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(get_nfs_open_context);
 
@@ -989,21 +996,21 @@
 	struct inode *inode = d_inode(ctx->dentry);
 	struct super_block *sb = ctx->dentry->d_sb;
 
-	if (!list_empty(&ctx->list)) {
-		if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
-			return;
-		list_del(&ctx->list);
-		spin_unlock(&inode->i_lock);
-	} else if (!refcount_dec_and_test(&ctx->lock_context.count))
+	if (!refcount_dec_and_test(&ctx->lock_context.count))
 		return;
+	if (!list_empty(&ctx->list)) {
+		spin_lock(&inode->i_lock);
+		list_del_rcu(&ctx->list);
+		spin_unlock(&inode->i_lock);
+	}
 	if (inode != NULL)
 		NFS_PROTO(inode)->close_context(ctx, is_sync);
-	if (ctx->cred != NULL)
-		put_rpccred(ctx->cred);
+	put_cred(ctx->cred);
 	dput(ctx->dentry);
 	nfs_sb_deactive(sb);
+	put_rpccred(ctx->ll_cred);
 	kfree(ctx->mdsthreshold);
-	kfree(ctx);
+	kfree_rcu(ctx, rcu_head);
 }
 
 void put_nfs_open_context(struct nfs_open_context *ctx)
@@ -1027,10 +1034,11 @@
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	if (ctx->mode & FMODE_WRITE)
-		list_add(&ctx->list, &nfsi->open_files);
-	else
-		list_add_tail(&ctx->list, &nfsi->open_files);
+	if (list_empty(&nfsi->open_files) &&
+	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+			NFS_INO_REVAL_FORCED;
+	list_add_tail_rcu(&ctx->list, &nfsi->open_files);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -1046,21 +1054,22 @@
 /*
  * Given an inode, search for an open context with the desired characteristics
  */
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *pos, *ctx = NULL;
 
-	spin_lock(&inode->i_lock);
-	list_for_each_entry(pos, &nfsi->open_files, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
 		if (cred != NULL && pos->cred != cred)
 			continue;
 		if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
 			continue;
 		ctx = get_nfs_open_context(pos);
-		break;
+		if (ctx)
+			break;
 	}
-	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 	return ctx;
 }
 
@@ -1078,9 +1087,6 @@
 		if (ctx->error < 0)
 			invalidate_inode_pages2(inode->i_mapping);
 		filp->private_data = NULL;
-		spin_lock(&inode->i_lock);
-		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
-		spin_unlock(&inode->i_lock);
 		put_nfs_open_context_sync(ctx);
 	}
 }
@@ -1100,6 +1106,7 @@
 	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nfs_open);
 
 /*
  * This function is called whenever some part of NFS notices that
@@ -1191,8 +1198,8 @@
 
 /**
  * nfs_revalidate_inode - Revalidate the inode attributes
- * @server - pointer to nfs_server struct
- * @inode - pointer to inode struct
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
  *
  * Updates inode attribute information by retrieving the data from the server.
  */
@@ -1262,8 +1269,8 @@
 
 /**
  * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
  */
 int nfs_revalidate_mapping(struct inode *inode,
 		struct address_space *mapping)
@@ -1312,7 +1319,8 @@
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
+			NFS_INO_DATA_INVAL_DEFER);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1329,19 +1337,11 @@
 {
 	struct inode *inode = &nfsi->vfs_inode;
 
-	assert_spin_locked(&inode->i_lock);
-
 	if (!S_ISREG(inode->i_mode))
 		return false;
 	if (list_empty(&nfsi->open_files))
 		return false;
-	/* Note: This relies on nfsi->open_files being ordered with writers
-	 *       being placed at the head of the list.
-	 *       See nfs_inode_attach_open_context()
-	 */
-	return (list_first_entry(&nfsi->open_files,
-			struct nfs_open_context,
-			list)->mode & FMODE_WRITE) == FMODE_WRITE;
+	return inode_is_open_for_write(inode);
 }
 
 static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
@@ -1386,8 +1386,8 @@
 
 /**
  * nfs_check_inode_attributes - verify consistency of the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
  *
  * Verifies the attribute cache. If we have just changed the attributes,
  * so that fattr carries weak cache consistency data, then it may
@@ -1403,12 +1403,22 @@
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
 		return 0;
 
+	if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+		/* Only a mounted-on-fileid? Just exit */
+		if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+			return 0;
 	/* Has the inode gone and changed behind our back? */
-	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+	} else if (nfsi->fileid != fattr->fileid) {
+		/* Is this perhaps the mounted-on fileid? */
+		if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+		    nfsi->fileid == fattr->mounted_on_fileid)
+			return 0;
 		return -ESTALE;
+	}
 	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		return -ESTALE;
 
+
 	if (!nfs_file_has_buffered_writers(nfsi)) {
 		/* Verify a few of the more important attributes */
 		if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
@@ -1587,8 +1597,8 @@
 
 /**
  * nfs_inode_attrs_need_update - check if the inode attributes need updating
- * @inode - pointer to inode
- * @fattr - attributes
+ * @inode: pointer to inode
+ * @fattr: attributes
  *
  * Attempt to divine whether or not an RPC call reply carrying stale
  * attributes got scheduled after another call carrying updated ones.
@@ -1629,8 +1639,8 @@
 
 /**
  * nfs_refresh_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
  *
  * Check that an RPC call that returned attributes has not overlapped with
  * other recent updates of the inode metadata, then decide whether it is
@@ -1664,8 +1674,8 @@
 
 /**
  * nfs_post_op_update_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
  *
  * After an operation that has changed the inode metadata, mark the
  * attribute cache as being invalid, then try to update it.
@@ -1694,8 +1704,8 @@
 
 /**
  * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
  *
  * After an operation that has changed the inode metadata, mark the
  * attribute cache as being invalid, then try to update it. Fake up
@@ -1746,8 +1756,8 @@
 
 /**
  * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
  *
  * After an operation that has changed the inode metadata, mark the
  * attribute cache as being invalid, then try to update it. Fake up
@@ -1768,18 +1778,6 @@
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
 
 
-static inline bool nfs_fileid_valid(struct nfs_inode *nfsi,
-				    struct nfs_fattr *fattr)
-{
-	bool ret1 = true, ret2 = true;
-
-	if (fattr->valid & NFS_ATTR_FATTR_FILEID)
-		ret1 = (nfsi->fileid == fattr->fileid);
-	if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
-		ret2 = (nfsi->fileid == fattr->mounted_on_fileid);
-	return ret1 || ret2;
-}
-
 /*
  * Many nfs protocol calls return the new file attributes after
  * an operation.  Here we update the inode to reflect the state
@@ -1810,7 +1808,16 @@
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if (!nfs_fileid_valid(nfsi, fattr)) {
+	if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+		/* Only a mounted-on-fileid? Just exit */
+		if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+			return 0;
+	/* Has the inode gone and changed behind our back? */
+	} else if (nfsi->fileid != fattr->fileid) {
+		/* Is this perhaps the mounted-on fileid? */
+		if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+		    nfsi->fileid == fattr->mounted_on_fileid)
+			return 0;
 		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
 			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
 			NFS_SERVER(inode)->nfs_client->cl_hostname,
@@ -1878,7 +1885,8 @@
 				dprintk("NFS: change_attr change on server for file %s/%ld\n",
 						inode->i_sb->s_id,
 						inode->i_ino);
-			}
+			} else if (!have_delegation)
+				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
 			inode_set_iversion_raw(inode, fattr->change_attr);
 			attr_changed = true;
 		}
@@ -2069,17 +2077,11 @@
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_inode);
 
-static void nfs_i_callback(struct rcu_head *head)
+void nfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
-
-void nfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, nfs_i_callback);
-}
-EXPORT_SYMBOL_GPL(nfs_destroy_inode);
+EXPORT_SYMBOL_GPL(nfs_free_inode);
 
 static inline void nfs4_init_once(struct nfs_inode *nfsi)
 {
@@ -2173,12 +2175,8 @@
 
 static void nfs_net_exit(struct net *net)
 {
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
 	nfs_fs_proc_net_exit(net);
-	nfs_cleanup_cb_ident_idr(net);
-	WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
-	WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+	nfs_clients_exit(net);
 }
 
 static struct pernet_operations nfs_net_ops = {
@@ -2195,6 +2193,10 @@
 {
 	int err;
 
+	err = nfs_sysfs_init();
+	if (err < 0)
+		goto out10;
+
 	err = register_pernet_subsys(&nfs_net_ops);
 	if (err < 0)
 		goto out9;
@@ -2258,6 +2260,8 @@
 out8:
 	unregister_pernet_subsys(&nfs_net_ops);
 out9:
+	nfs_sysfs_exit();
+out10:
 	return err;
 }
 
@@ -2274,6 +2278,7 @@
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
 	nfsiod_stop();
+	nfs_sysfs_exit();
 }
 
 /* Not quite true; I just maintain it */
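
Several inode.c hunks above convert the open-context and lock-context
lists from i_lock-protected walks to RCU: lookups now run under
rcu_read_lock() and may only take a reference via refcount_inc_not_zero(),
because an entry whose count already hit zero is on its way to
kfree_rcu(); writers still serialize list_add_tail_rcu()/list_del_rcu()
under i_lock. A sketch of the reader side, assuming kernel context
(struct lctx and find_lctx are illustrative names):

#include <linux/rculist.h>
#include <linux/refcount.h>

struct lctx {
	struct list_head list;
	refcount_t count;
	struct rcu_head rcu_head;
	void *owner;
};

static struct lctx *find_lctx(struct list_head *head, void *owner)
{
	struct lctx *pos;

	rcu_read_lock();
	list_for_each_entry_rcu(pos, head, list) {
		if (pos->owner != owner)
			continue;
		/* A zero count means the entry is already being freed via
		 * kfree_rcu(); it is visible to us but must not be taken. */
		if (refcount_inc_not_zero(&pos->count)) {
			rcu_read_unlock();
			return pos;
		}
	}
	rcu_read_unlock();
	return NULL;
}
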
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8357ff6..447a3c1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -16,14 +16,6 @@
 
 struct nfs_string;
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
 {
 	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
@@ -81,8 +73,10 @@
 	struct nfs_subversion *nfs_mod;
 	int proto;
 	u32 minorversion;
+	unsigned int nconnect;
 	struct net *net;
 	const struct rpc_timeout *timeparms;
+	const struct cred *cred;
 };
 
 /*
@@ -121,9 +115,10 @@
 		char			*export_path;
 		int			port;
 		unsigned short		protocol;
+		unsigned short		nconnect;
 	} nfs_server;
 
-	struct security_mnt_opts lsm_opts;
+	void			*lsm_opts;
 	struct net		*net;
 };
 
@@ -156,6 +151,7 @@
 /* client.c */
 extern const struct rpc_program nfs_program;
 extern void nfs_clients_init(struct net *net);
+extern void nfs_clients_exit(struct net *net);
 extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
 int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
 struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
@@ -168,7 +164,6 @@
 struct nfs_server *nfs_alloc_server(void);
 void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *);
 
-extern void nfs_cleanup_cb_ident_idr(struct net *);
 extern void nfs_put_client(struct nfs_client *);
 extern void nfs_free_client(struct nfs_client *);
 extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
@@ -254,7 +249,7 @@
 void nfs_pgio_header_free(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
 int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
-		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
 		      const struct rpc_call_ops *call_ops, int how, int flags);
 void nfs_free_request(struct nfs_page *req);
 struct nfs_pgio_mirror *
@@ -269,7 +264,7 @@
 static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
 		const struct nfs_open_context *ctx2)
 {
-	return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+	return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state;
 }
 
 /* nfs2xdr.c */
@@ -380,7 +375,7 @@
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
-extern void nfs_destroy_inode(struct inode *);
+extern void nfs_free_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
 extern int nfs_drop_inode(struct inode *);
 extern void nfs_clear_inode(struct inode *);
@@ -395,7 +390,6 @@
 extern struct file_system_type nfs_fs_type;
 extern struct file_system_type nfs_xdev_fs_type;
 #if IS_ENABLED(CONFIG_NFS_V4)
-extern struct file_system_type nfs4_xdev_fs_type;
 extern struct file_system_type nfs4_referral_fs_type;
 #endif
 bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
@@ -565,13 +559,13 @@
 			    const struct nfs_client_initdata *);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
 				struct nfs_client **result,
-				struct rpc_cred *cred);
+				const struct cred *cred);
 extern int nfs41_walk_client_list(struct nfs_client *clp,
 				struct nfs_client **result,
-				struct rpc_cred *cred);
-extern int nfs4_test_session_trunk(struct rpc_clnt *,
-				struct rpc_xprt *,
-				void *);
+				const struct cred *cred);
+extern void nfs4_test_session_trunk(struct rpc_clnt *clnt,
+				struct rpc_xprt *xprt,
+				void *data);
 
 static inline struct inode *nfs_igrab_and_active(struct inode *inode)
 {
@@ -756,6 +750,7 @@
 {
 	switch (err) {
 	case -ERESTARTSYS:
+	case -EINTR:
 	case -EACCES:
 	case -EDQUOT:
 	case -EFBIG:
@@ -764,15 +759,21 @@
 	case -EROFS:
 	case -ESTALE:
 	case -E2BIG:
+	case -ENOMEM:
+	case -ETIMEDOUT:
 		return true;
 	default:
 		return false;
 	}
 }
 
-static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+static inline bool nfs_error_is_fatal_on_server(int err)
 {
-	ctx->error = error;
-	smp_wmb();
-	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	switch (err) {
+	case 0:
+	case -ERESTARTSYS:
+	case -EINTR:
+		return false;
+	}
+	return nfs_error_is_fatal(err);
 }
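
nfs_match_open_context() above now compares credentials with cred_fscmp()
rather than pointer equality: after the rpc_cred to struct cred
conversion, one user commonly holds several distinct cred objects, and
only a field-wise comparison of the filesystem-relevant ids lets them
share an open context. A user-space model of the semantics (the real
cred_fscmp() also walks the supplementary group list, which is omitted
here):

#include <stdio.h>

struct cred { unsigned int fsuid, fsgid; };

static int cred_fscmp_model(const struct cred *a, const struct cred *b)
{
	if (a == b)
		return 0;
	if (a->fsuid != b->fsuid)
		return a->fsuid < b->fsuid ? -1 : 1;
	if (a->fsgid != b->fsgid)
		return a->fsgid < b->fsgid ? -1 : 1;
	return 0;
}

int main(void)
{
	/* Two distinct cred objects carrying the same user ids. */
	struct cred a = { 1000, 1000 }, b = { 1000, 1000 };

	printf("pointer equality: %d, cred_fscmp match: %d\n",
	       &a == &b, cred_fscmp_model(&a, &b) == 0);
	return 0;
}
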
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 9034b49..5088fda 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -25,7 +25,7 @@
 
 /**
  * nfs_start_io_read - declare the file is being used for buffered reads
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a buffered read operation is about to start, and ensure
  * that we block all direct I/O.
@@ -56,7 +56,7 @@
 
 /**
  * nfs_end_io_read - declare that the buffered read operation is done
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a buffered read operation is done, and release the shared
  * lock on inode->i_rwsem.
@@ -69,7 +69,7 @@
 
 /**
  * nfs_start_io_write - declare the file is being used for buffered writes
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a buffered read operation is about to start, and ensure
  * that we block all direct I/O.
@@ -83,7 +83,7 @@
 
 /**
  * nfs_end_io_write - declare that the buffered write operation is done
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a buffered write operation is done, and release the
  * lock on inode->i_rwsem.
@@ -105,7 +105,7 @@
 
 /**
  * nfs_end_io_direct - declare the file is being used for direct i/o
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a direct I/O operation is about to start, and ensure
  * that we block all buffered I/O.
@@ -136,7 +136,7 @@
 
 /**
  * nfs_end_io_direct - declare that the direct i/o operation is done
- * @inode - file inode
+ * @inode: file inode
  *
  * Declare that a direct I/O operation is done, and release the shared
  * lock on inode->i_rwsem.
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d979ff4..cb7c10e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -163,6 +163,7 @@
 		.program	= &mnt_program,
 		.version	= info->version,
 		.authflavor	= RPC_AUTH_UNIX,
+		.cred		= current_cred(),
 	};
 	struct rpc_clnt		*mnt_clnt;
 	int			status;
@@ -249,6 +250,7 @@
 		.version	= info->version,
 		.authflavor	= RPC_AUTH_UNIX,
 		.flags		= RPC_CLNT_CREATE_NOPING,
+		.cred		= current_cred(),
 	};
 	struct rpc_message msg	= {
 		.rpc_argp	= info->dirpath,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e5686be..9287eb6 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/namespace.c
  *
@@ -221,10 +222,10 @@
 
 /**
  * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry - parent directory
- * @fh - filehandle for new root dentry
- * @fattr - attributes for new root inode
- * @authflavor - security flavor to use when performing the mount
+ * @dentry: parent directory
+ * @fh: filehandle for new root dentry
+ * @fattr: attributes for new root inode
+ * @authflavor: security flavor to use when performing the mount
  *
  */
 struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index fc9978c..c8374f7 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -15,6 +15,8 @@
 	uint32_t major, minor;
 };
 
+struct nfs_netns_client;
+
 struct nfs_net {
 	struct cache_detail *nfs_dns_resolve;
 	struct rpc_pipe *bl_device_pipe;
@@ -29,6 +31,7 @@
 	unsigned short nfs_callback_tcpport6;
 	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
 #endif
+	struct nfs_netns_client *nfs_client;
 	spinlock_t nfs_client_lock;
 	ktime_t boot_time;
 #ifdef CONFIG_PROC_FS
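
Note: struct nfs_net is the per-network-namespace state of the NFS client,
reached through the existing nfs_net_id key. The new nfs_client member (of
the forward-declared struct nfs_netns_client) hangs per-netns client identity
off it. A sketch of the access pattern, with an illustrative helper name:

#include <net/netns/generic.h>

static struct nfs_netns_client *example_netns_client(struct net *net)
{
	struct nfs_net *nn = net_generic(net, nfs_net_id);

	return nn->nfs_client;	/* field added by the hunk above */
}
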
diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c
index 0a9782c..467f21e 100644
--- a/fs/nfs/nfs2super.c
+++ b/fs/nfs/nfs2super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2012 Netapp, Inc. All rights reserved.
  */
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 350675e..cbc17a2 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -22,6 +22,7 @@
 #include <linux/nfs.h>
 #include <linux/nfs2.h>
 #include <linux/nfs_fs.h>
+#include "nfstrace.h"
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
@@ -55,42 +56,16 @@
 
 #define NFS_attrstat_sz		(1+NFS_fattr_sz)
 #define NFS_diropres_sz		(1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz	(2)
-#define NFS_readres_sz		(1+NFS_fattr_sz+1)
+#define NFS_readlinkres_sz	(2+1)
+#define NFS_readres_sz		(1+NFS_fattr_sz+1+1)
 #define NFS_writeres_sz         (NFS_attrstat_sz)
 #define NFS_stat_sz		(1)
-#define NFS_readdirres_sz	(1)
+#define NFS_readdirres_sz	(1+1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
 static int nfs_stat_to_errno(enum nfs_stat);
 
 /*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-				 unsigned int base, unsigned int len,
-				 unsigned int bufsize)
-{
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
-	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
-}
-
-
-/*
  * Encode/decode NFSv2 basic data types
  *
  * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
@@ -101,6 +76,20 @@
  * or decoded inline.
  */
 
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
+{
+	if (clnt && clnt->cl_cred)
+		return clnt->cl_cred->user_ns;
+	return &init_user_ns;
+}
+
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
+{
+	if (rqstp->rq_task)
+		return rpc_userns(rqstp->rq_task->tk_client);
+	return &init_user_ns;
+}
+
 /*
  *	typedef opaque	nfsdata<>;
  */
@@ -110,8 +99,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	count = be32_to_cpup(p);
 	recvd = xdr_read_pages(xdr, count);
 	if (unlikely(count > recvd))
@@ -125,9 +114,6 @@
 		"count %u > recvd %u\n", count, recvd);
 	count = recvd;
 	goto out;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -157,13 +143,16 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
-	*status = be32_to_cpup(p);
+	if (unlikely(!p))
+		return -EIO;
+	if (unlikely(*p != cpu_to_be32(NFS_OK)))
+		goto out_status;
+	*status = 0;
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+out_status:
+	*status = be32_to_cpup(p);
+	trace_nfs_xdr_status(xdr, (int)*status);
+	return 0;
 }
 
 /*
@@ -205,14 +194,11 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	fh->size = NFS2_FHSIZE;
 	memcpy(fh->data, p, NFS2_FHSIZE);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -276,14 +262,15 @@
  *	};
  *
  */
-static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct user_namespace *userns)
 {
 	u32 rdev, type;
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 
 	fattr->valid |= NFS_ATTR_FATTR_V2;
 
@@ -291,10 +278,10 @@
 
 	fattr->mode = be32_to_cpup(p++);
 	fattr->nlink = be32_to_cpup(p++);
-	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	fattr->uid = make_kuid(userns, be32_to_cpup(p++));
 	if (!uid_valid(fattr->uid))
 		goto out_uid;
-	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	fattr->gid = make_kgid(userns, be32_to_cpup(p++));
 	if (!gid_valid(fattr->gid))
 		goto out_gid;
 		
@@ -325,9 +312,6 @@
 out_gid:
 	dprintk("NFS: returned invalid gid\n");
 	return -EINVAL;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -352,7 +336,8 @@
 	return p;
 }
 
-static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr,
+		struct user_namespace *userns)
 {
 	struct timespec ts;
 	__be32 *p;
@@ -364,11 +349,11 @@
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_UID)
-		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
+		*p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_GID)
-		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
+		*p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_SIZE)
@@ -416,23 +401,20 @@
 	u32 count;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	count = be32_to_cpup(p);
 	if (count > NFS3_MAXNAMLEN)
 		goto out_nametoolong;
 	p = xdr_inline_decode(xdr, count);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	*name = (const char *)p;
 	*length = count;
 	return 0;
 out_nametoolong:
 	dprintk("NFS: returned filename too long: %u\n", count);
 	return -ENAMETOOLONG;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -455,8 +437,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	length = be32_to_cpup(p);
 	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
 		goto out_size;
@@ -472,9 +454,6 @@
 	dprintk("NFS: server cheating in pathname result: "
 		"length %u > received %u\n", length, recvd);
 	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -488,7 +467,8 @@
  *	};
  */
 static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
-			   __u32 *op_status)
+			   __u32 *op_status,
+			   struct user_namespace *userns)
 {
 	enum nfs_stat status;
 	int error;
@@ -500,7 +480,7 @@
 		*op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
-	error = decode_fattr(xdr, result);
+	error = decode_fattr(xdr, result, userns);
 out:
 	return error;
 out_default:
@@ -535,19 +515,21 @@
  *		void;
  *	};
  */
-static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result,
+		struct user_namespace *userns)
 {
 	int error;
 
 	error = decode_fhandle(xdr, result->fh);
 	if (unlikely(error))
 		goto out;
-	error = decode_fattr(xdr, result->fattr);
+	error = decode_fattr(xdr, result->fattr, userns);
 out:
 	return error;
 }
 
-static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result,
+		struct user_namespace *userns)
 {
 	enum nfs_stat status;
 	int error;
@@ -557,7 +539,7 @@
 		goto out;
 	if (status != NFS_OK)
 		goto out_default;
-	error = decode_diropok(xdr, result);
+	error = decode_diropok(xdr, result, userns);
 out:
 	return error;
 out_default:
@@ -596,7 +578,7 @@
 	const struct nfs_sattrargs *args = data;
 
 	encode_fhandle(xdr, args->fh);
-	encode_sattr(xdr, args->sattr);
+	encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
 }
 
 static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
@@ -615,8 +597,8 @@
 	const struct nfs_readlinkargs *args = data;
 
 	encode_fhandle(xdr, args->fh);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->pglen, NFS_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, NFS_readlinkres_sz);
 }
 
 /*
@@ -651,8 +633,8 @@
 	const struct nfs_pgio_args *args = data;
 
 	encode_readargs(xdr, args);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->count, NFS_readres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, NFS_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -711,7 +693,7 @@
 	const struct nfs_createargs *args = data;
 
 	encode_diropargs(xdr, args->fh, args->name, args->len);
-	encode_sattr(xdr, args->sattr);
+	encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
 }
 
 static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
@@ -778,7 +760,7 @@
 
 	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
 	encode_path(xdr, args->pages, args->pathlen);
-	encode_sattr(xdr, args->sattr);
+	encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
 }
 
 /*
@@ -809,8 +791,8 @@
 	const struct nfs_readdirargs *args = data;
 
 	encode_readdirargs(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
-					args->count, NFS_readdirres_sz);
+	rpc_prepare_reply_pages(req, args->pages, 0,
+				args->count, NFS_readdirres_sz);
 }
 
 /*
@@ -840,13 +822,13 @@
 static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 void *result)
 {
-	return decode_attrstat(xdr, result, NULL);
+	return decode_attrstat(xdr, result, NULL, rpc_rqst_userns(req));
 }
 
 static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 void *result)
 {
-	return decode_diropres(xdr, result);
+	return decode_diropres(xdr, result, rpc_rqst_userns(req));
 }
 
 /*
@@ -901,7 +883,7 @@
 	result->op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
-	error = decode_fattr(xdr, result->fattr);
+	error = decode_fattr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	error = decode_nfsdata(xdr, result);
@@ -918,7 +900,8 @@
 
 	/* All NFSv2 writes are "file sync" writes */
 	result->verf->committed = NFS_FILE_SYNC;
-	return decode_attrstat(xdr, result->fattr, &result->op_status);
+	return decode_attrstat(xdr, result->fattr, &result->op_status,
+			rpc_rqst_userns(req));
 }
 
 /**
@@ -951,12 +934,12 @@
 	int error;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EAGAIN;
 	if (*p++ == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(p == NULL))
-			goto out_overflow;
+		if (unlikely(!p))
+			return -EAGAIN;
 		if (*p++ == xdr_zero)
 			return -EAGAIN;
 		entry->eof = 1;
@@ -964,8 +947,8 @@
 	}
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EAGAIN;
 	entry->ino = be32_to_cpup(p);
 
 	error = decode_filename_inline(xdr, &entry->name, &entry->len);
@@ -978,17 +961,13 @@
 	 */
 	entry->prev_cookie = entry->cookie;
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EAGAIN;
 	entry->cookie = be32_to_cpup(p);
 
 	entry->d_type = DT_UNKNOWN;
 
 	return 0;
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EAGAIN;
 }
 
 /*
@@ -1052,17 +1031,14 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	result->tsize  = be32_to_cpup(p++);
 	result->bsize  = be32_to_cpup(p++);
 	result->blocks = be32_to_cpup(p++);
 	result->bfree  = be32_to_cpup(p++);
 	result->bavail = be32_to_cpup(p);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
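
Two themes run through the nfs2xdr.c hunks. First, decode-buffer exhaustion
(xdr_inline_decode() returning NULL) now maps straight to -EIO, or to -EAGAIN
in the dirent decoder where it only means the current buffer of entries is
used up; the out-of-line print_overflow_msg() helper goes away, and non-OK
NFS statuses are captured by the new trace_nfs_xdr_status tracepoint rather
than dprintk. Second, on-the-wire uids and gids are now mapped through the
user namespace of whoever created the RPC client instead of init_user_ns. A
sketch of that mapping (illustrative helper, mirroring rpc_userns() above):

static kuid_t example_wire_uid_to_kuid(const struct rpc_clnt *clnt, u32 wire)
{
	struct user_namespace *userns = &init_user_ns;

	if (clnt && clnt->cl_cred)	/* cred pinned when the client was made */
		userns = clnt->cl_cred->user_ns;
	return make_kuid(userns, wire);	/* invalid mapping => -EINVAL above */
}
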
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9fce185..c5c3fc6 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -222,8 +222,6 @@
 	switch (status) {
 		case 0:
 			status = nfs_refresh_inode(inode, fattr);
-			set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
-			set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
 			break;
 		case -EPFNOSUPPORT:
 		case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 7879f2a..148ceb7 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/sunrpc/addr.h>
@@ -91,6 +92,7 @@
 		.proto = ds_proto,
 		.net = mds_clp->cl_net,
 		.timeparms = &ds_timeout,
+		.cred = mds_srv->cred,
 	};
 	struct nfs_client *clp;
 	char buf[INET6_ADDRSTRLEN + 1];
@@ -100,6 +102,9 @@
 		return ERR_PTR(-EINVAL);
 	cl_init.hostname = buf;
 
+	if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = mds_clp->cl_nconnect;
+
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ec8a9ef..9eb2f1a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -279,15 +279,17 @@
 	return data;
 }
 
-static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+static struct dentry *
+nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
 {
 	int status;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
 	nfs_post_op_update_inode(dir, data->res.dir_attr);
-	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
-	return status;
+	if (status != 0)
+		return ERR_PTR(status);
+
+	return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
 }
 
 static void nfs3_free_createdata(struct nfs3_createdata *data)
@@ -304,6 +306,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  create %pd\n", dentry);
@@ -330,7 +333,8 @@
 		goto out;
 
 	for (;;) {
-		status = nfs3_do_create(dir, dentry, data);
+		d_alias = nfs3_do_create(dir, dentry, data);
+		status = PTR_ERR_OR_ZERO(d_alias);
 
 		if (status != -ENOTSUPP)
 			break;
@@ -355,6 +359,9 @@
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	/* When we created the file with exclusive semantics, make
 	 * sure we set the attributes afterwards. */
 	if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -372,11 +379,13 @@
 		nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
 		dprintk("NFS reply setattr (post-create): %d\n", status);
 		if (status != 0)
-			goto out_release_acls;
+			goto out_dput;
 	}
 
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+out_dput:
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
@@ -504,6 +513,7 @@
 		  unsigned int len, struct iattr *sattr)
 {
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	if (len > NFS3_MAXPATHLEN)
@@ -522,7 +532,11 @@
 	data->arg.symlink.pathlen = len;
 	data->arg.symlink.sattr = sattr;
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
+
+	if (status == 0)
+		dput(d_alias);
 
 	nfs3_free_createdata(data);
 out:
@@ -535,6 +549,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mkdir %pd\n", dentry);
@@ -553,12 +568,18 @@
 	data->arg.mkdir.len = dentry->d_name.len;
 	data->arg.mkdir.sattr = sattr;
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
+
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
@@ -607,7 +628,7 @@
  * readdirplus.
  */
 static int
-nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred,
 		  u64 cookie, struct page **pages, unsigned int count, bool plus)
 {
 	struct inode		*dir = d_inode(dentry);
@@ -628,7 +649,7 @@
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_READDIR],
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
-		.rpc_cred	= cred
+		.rpc_cred	= cred,
 	};
 	int status = -ENOMEM;
 
@@ -660,6 +681,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mknod %pd %u:%u\n", dentry,
@@ -698,12 +720,17 @@
 		goto out;
 	}
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
@@ -786,6 +813,7 @@
 static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
 
 	if (hdr->pgio_done_cb != NULL)
 		return hdr->pgio_done_cb(task, hdr);
@@ -793,6 +821,9 @@
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
+	if (task->tk_status >= 0 && !server->read_hdrsize)
+		cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+
 	nfs_invalidate_atime(inode);
 	nfs_refresh_inode(inode, &hdr->fattr);
 	return 0;
@@ -802,6 +833,7 @@
 				 struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
+	hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize;
 }
 
 static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
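
Two details worth noting in the nfs3proc.c hunks. nfs3_do_create() now
returns a dentry: a non-NULL result is an alias obtained from
nfs_add_or_obtain() that the caller switches to and later releases, and the
unconditional dput() calls are safe because dput(NULL) is a no-op. In the
read path, the first successful READ records the decoded reply-header length
in server->read_hdrsize with a lock-free cmpxchg(), so only the first
completion's value sticks. A sketch of that publish-once idiom (hypothetical
helper):

static void example_record_hdrsize(unsigned int *slot, unsigned int replen)
{
	/* 0 -> replen exactly once; racing completions see a
	 * non-zero value and leave it alone. */
	if (replen && *slot == 0)
		cmpxchg(slot, 0, replen);
}
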
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 5c4394e..7c58094 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2012 Netapp, Inc. All rights reserved.
  */
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 64e4fa3..6027678 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,6 +21,7 @@
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfsacl.h>
+#include "nfstrace.h"
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
@@ -68,13 +69,13 @@
 #define NFS3_removeres_sz	(NFS3_setattrres_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
-#define NFS3_readres_sz		(1+NFS3_post_op_attr_sz+3)
+#define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1+1)
+#define NFS3_readres_sz		(1+NFS3_post_op_attr_sz+3+1)
 #define NFS3_writeres_sz	(1+NFS3_wcc_data_sz+4)
 #define NFS3_createres_sz	(1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
 #define NFS3_renameres_sz	(1+(2 * NFS3_wcc_data_sz))
 #define NFS3_linkres_sz		(1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz	(1+NFS3_post_op_attr_sz+2)
+#define NFS3_readdirres_sz	(1+NFS3_post_op_attr_sz+2+1)
 #define NFS3_fsstatres_sz	(1+NFS3_post_op_attr_sz+13)
 #define NFS3_fsinfores_sz	(1+NFS3_post_op_attr_sz+12)
 #define NFS3_pathconfres_sz	(1+NFS3_post_op_attr_sz+6)
@@ -84,7 +85,7 @@
 #define ACL3_setaclargs_sz	(NFS3_fh_sz+1+ \
 				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+ \
-				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
 static int nfs3_stat_to_errno(enum nfs_stat);
@@ -103,32 +104,20 @@
 	[NF3FIFO] = S_IFIFO,
 };
 
-/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-				 unsigned int base, unsigned int len,
-				 unsigned int bufsize)
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
 {
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+	if (clnt && clnt->cl_cred)
+		return clnt->cl_cred->user_ns;
+	return &init_user_ns;
 }
 
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
 {
-	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
+	if (rqstp->rq_task)
+		return rpc_userns(rqstp->rq_task->tk_client);
+	return &init_user_ns;
 }
 
-
 /*
  * Encode/decode NFSv3 basic data types
  *
@@ -151,13 +140,10 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	*value = be32_to_cpup(p);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_uint64(struct xdr_stream *xdr, u64 *value)
@@ -165,13 +151,10 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 8);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	xdr_decode_hyper(p, value);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -211,14 +194,14 @@
 	u32 count;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	count = be32_to_cpup(p);
 	if (count > NFS3_MAXNAMLEN)
 		goto out_nametoolong;
 	p = xdr_inline_decode(xdr, count);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	*name = (const char *)p;
 	*length = count;
 	return 0;
@@ -226,9 +209,6 @@
 out_nametoolong:
 	dprintk("NFS: returned filename too long: %u\n", count);
 	return -ENAMETOOLONG;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -249,8 +229,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	count = be32_to_cpup(p);
 	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
 		goto out_nametoolong;
@@ -267,9 +247,6 @@
 	dprintk("NFS: server cheating in pathname result: "
 		"count %u > recvd %u\n", count, recvd);
 	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -303,13 +280,10 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -330,13 +304,10 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -364,13 +335,16 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
-	*status = be32_to_cpup(p);
+	if (unlikely(!p))
+		return -EIO;
+	if (unlikely(*p != cpu_to_be32(NFS3_OK)))
+		goto out_status;
+	*status = 0;
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+out_status:
+	*status = be32_to_cpup(p);
+	trace_nfs_xdr_status(xdr, (int)*status);
+	return 0;
 }
 
 /*
@@ -453,23 +427,20 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	length = be32_to_cpup(p++);
 	if (unlikely(length > NFS3_FHSIZE))
 		goto out_toobig;
 	p = xdr_inline_decode(xdr, length);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	fh->size = length;
 	memcpy(fh->data, p, length);
 	return 0;
 out_toobig:
 	dprintk("NFS: file handle size (%u) too big\n", length);
 	return -E2BIG;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static void zero_nfs_fh3(struct nfs_fh *fh)
@@ -559,7 +530,8 @@
  *		set_mtime	mtime;
  *	};
  */
-static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr,
+		struct user_namespace *userns)
 {
 	struct timespec ts;
 	u32 nbytes;
@@ -594,13 +566,13 @@
 
 	if (attr->ia_valid & ATTR_UID) {
 		*p++ = xdr_one;
-		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
+		*p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
 	} else
 		*p++ = xdr_zero;
 
 	if (attr->ia_valid & ATTR_GID) {
 		*p++ = xdr_one;
-		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
+		*p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
 	} else
 		*p++ = xdr_zero;
 
@@ -649,23 +621,24 @@
  *		nfstime3	ctime;
  *	};
  */
-static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct user_namespace *userns)
 {
 	umode_t fmode;
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 
 	p = xdr_decode_ftype3(p, &fmode);
 
 	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
 	fattr->nlink = be32_to_cpup(p++);
-	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	fattr->uid = make_kuid(userns, be32_to_cpup(p++));
 	if (!uid_valid(fattr->uid))
 		goto out_uid;
-	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	fattr->gid = make_kgid(userns, be32_to_cpup(p++));
 	if (!gid_valid(fattr->gid))
 		goto out_gid;
 
@@ -690,9 +663,6 @@
 out_gid:
 	dprintk("NFS: returned invalid gid\n");
 	return -EINVAL;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -705,19 +675,17 @@
  *		void;
  *	};
  */
-static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct user_namespace *userns)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	if (*p != xdr_zero)
-		return decode_fattr3(xdr, fattr);
+		return decode_fattr3(xdr, fattr, userns);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -733,8 +701,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
 		| NFS_ATTR_FATTR_PRECHANGE
@@ -747,9 +715,6 @@
 	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
 
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -773,24 +738,22 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	if (*p != xdr_zero)
 		return decode_wcc_attr(xdr, fattr);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
-static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct user_namespace *userns)
 {
 	int error;
 
 	error = decode_pre_op_attr(xdr, fattr);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, fattr);
+	error = decode_post_op_attr(xdr, fattr, userns);
 out:
 	return error;
 }
@@ -808,15 +771,12 @@
 static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
 	__be32 *p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	if (*p != xdr_zero)
 		return decode_nfs_fh3(xdr, fh);
 	zero_nfs_fh3(fh);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -895,7 +855,7 @@
 {
 	const struct nfs3_sattrargs *args = data;
 	encode_nfs_fh3(xdr, args->fh);
-	encode_sattr3(xdr, args->sattr);
+	encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
 	encode_sattrguard3(xdr, args);
 }
 
@@ -953,8 +913,8 @@
 	const struct nfs3_readlinkargs *args = data;
 
 	encode_nfs_fh3(xdr, args->fh);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->pglen, NFS3_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, NFS3_readlinkres_sz);
 }
 
 /*
@@ -983,10 +943,11 @@
 				   const void *data)
 {
 	const struct nfs_pgio_args *args = data;
+	unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
 
 	encode_read3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->count, NFS3_readres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, replen);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -1055,13 +1016,14 @@
  *	};
  */
 static void encode_createhow3(struct xdr_stream *xdr,
-			      const struct nfs3_createargs *args)
+			      const struct nfs3_createargs *args,
+			      struct user_namespace *userns)
 {
 	encode_uint32(xdr, args->createmode);
 	switch (args->createmode) {
 	case NFS3_CREATE_UNCHECKED:
 	case NFS3_CREATE_GUARDED:
-		encode_sattr3(xdr, args->sattr);
+		encode_sattr3(xdr, args->sattr, userns);
 		break;
 	case NFS3_CREATE_EXCLUSIVE:
 		encode_createverf3(xdr, args->verifier);
@@ -1078,7 +1040,7 @@
 	const struct nfs3_createargs *args = data;
 
 	encode_diropargs3(xdr, args->fh, args->name, args->len);
-	encode_createhow3(xdr, args);
+	encode_createhow3(xdr, args, rpc_rqst_userns(req));
 }
 
 /*
@@ -1096,7 +1058,7 @@
 	const struct nfs3_mkdirargs *args = data;
 
 	encode_diropargs3(xdr, args->fh, args->name, args->len);
-	encode_sattr3(xdr, args->sattr);
+	encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
 }
 
 /*
@@ -1113,11 +1075,12 @@
  *	};
  */
 static void encode_symlinkdata3(struct xdr_stream *xdr,
-				const void *data)
+				const void *data,
+				struct user_namespace *userns)
 {
 	const struct nfs3_symlinkargs *args = data;
 
-	encode_sattr3(xdr, args->sattr);
+	encode_sattr3(xdr, args->sattr, userns);
 	encode_nfspath3(xdr, args->pages, args->pathlen);
 }
 
@@ -1128,7 +1091,7 @@
 	const struct nfs3_symlinkargs *args = data;
 
 	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
-	encode_symlinkdata3(xdr, args);
+	encode_symlinkdata3(xdr, args, rpc_rqst_userns(req));
 	xdr->buf->flags |= XDRBUF_WRITE;
 }
 
@@ -1157,24 +1120,26 @@
  *	};
  */
 static void encode_devicedata3(struct xdr_stream *xdr,
-			       const struct nfs3_mknodargs *args)
+			       const struct nfs3_mknodargs *args,
+			       struct user_namespace *userns)
 {
-	encode_sattr3(xdr, args->sattr);
+	encode_sattr3(xdr, args->sattr, userns);
 	encode_specdata3(xdr, args->rdev);
 }
 
 static void encode_mknoddata3(struct xdr_stream *xdr,
-			      const struct nfs3_mknodargs *args)
+			      const struct nfs3_mknodargs *args,
+			      struct user_namespace *userns)
 {
 	encode_ftype3(xdr, args->type);
 	switch (args->type) {
 	case NF3CHR:
 	case NF3BLK:
-		encode_devicedata3(xdr, args);
+		encode_devicedata3(xdr, args, userns);
 		break;
 	case NF3SOCK:
 	case NF3FIFO:
-		encode_sattr3(xdr, args->sattr);
+		encode_sattr3(xdr, args->sattr, userns);
 		break;
 	case NF3REG:
 	case NF3DIR:
@@ -1191,7 +1156,7 @@
 	const struct nfs3_mknodargs *args = data;
 
 	encode_diropargs3(xdr, args->fh, args->name, args->len);
-	encode_mknoddata3(xdr, args);
+	encode_mknoddata3(xdr, args, rpc_rqst_userns(req));
 }
 
 /*
@@ -1278,7 +1243,7 @@
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdir3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
+	rpc_prepare_reply_pages(req, args->pages, 0,
 				args->count, NFS3_readdirres_sz);
 }
 
@@ -1320,7 +1285,7 @@
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdirplus3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
+	rpc_prepare_reply_pages(req, args->pages, 0,
 				args->count, NFS3_readdirres_sz);
 }
 
@@ -1364,10 +1329,12 @@
 
 	encode_nfs_fh3(xdr, args->fh);
 	encode_uint32(xdr, args->mask);
-	if (args->mask & (NFS_ACL | NFS_DFACL))
-		prepare_reply_buffer(req, args->pages, 0,
+	if (args->mask & (NFS_ACL | NFS_DFACL)) {
+		rpc_prepare_reply_pages(req, args->pages, 0,
 					NFSACL_MAXPAGES << PAGE_SHIFT,
 					ACL3_getaclres_sz);
+		req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+	}
 }
 
 static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
@@ -1434,7 +1401,7 @@
 		goto out;
 	if (status != NFS3_OK)
 		goto out_default;
-	error = decode_fattr3(xdr, result);
+	error = decode_fattr3(xdr, result, rpc_rqst_userns(req));
 out:
 	return error;
 out_default:
@@ -1469,7 +1436,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result);
+	error = decode_wcc_data(xdr, result, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1504,6 +1471,7 @@
 				   struct xdr_stream *xdr,
 				   void *data)
 {
+	struct user_namespace *userns = rpc_rqst_userns(req);
 	struct nfs3_diropres *result = data;
 	enum nfs_stat status;
 	int error;
@@ -1516,14 +1484,14 @@
 	error = decode_nfs_fh3(xdr, result->fh);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, userns);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->dir_attr);
+	error = decode_post_op_attr(xdr, result->dir_attr, userns);
 out:
 	return error;
 out_default:
-	error = decode_post_op_attr(xdr, result->dir_attr);
+	error = decode_post_op_attr(xdr, result->dir_attr, userns);
 	if (unlikely(error))
 		goto out;
 	return nfs3_stat_to_errno(status);
@@ -1559,7 +1527,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1600,7 +1568,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result);
+	error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1640,8 +1608,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4 + 4 + 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	count = be32_to_cpup(p++);
 	eof = be32_to_cpup(p++);
 	ocount = be32_to_cpup(p++);
@@ -1664,27 +1632,27 @@
 	count = recvd;
 	eof = 0;
 	goto out;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 void *data)
 {
 	struct nfs_pgio_res *result = data;
+	unsigned int pos;
 	enum nfs_stat status;
 	int error;
 
+	pos = xdr_stream_pos(xdr);
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
+	result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
 	error = decode_read3resok(xdr, result);
 out:
 	return error;
@@ -1725,22 +1693,18 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4 + 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	result->count = be32_to_cpup(p++);
 	result->verf->committed = be32_to_cpup(p++);
 	if (unlikely(result->verf->committed > NFS_FILE_SYNC))
 		goto out_badvalue;
 	if (decode_writeverf3(xdr, &result->verf->verifier))
-		goto out_eio;
+		return -EIO;
 	return result->count;
 out_badvalue:
 	dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
 	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-out_eio:
-	return -EIO;
 }
 
 static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -1753,7 +1717,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->fattr);
+	error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	result->op_status = status;
@@ -1787,14 +1751,15 @@
  *	};
  */
 static int decode_create3resok(struct xdr_stream *xdr,
-			       struct nfs3_diropres *result)
+			       struct nfs3_diropres *result,
+			       struct user_namespace *userns)
 {
 	int error;
 
 	error = decode_post_op_fh3(xdr, result->fh);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, userns);
 	if (unlikely(error))
 		goto out;
 	/* The server isn't required to return a file handle.
@@ -1803,7 +1768,7 @@
 	 * values for the new object. */
 	if (result->fh->size == 0)
 		result->fattr->valid = 0;
-	error = decode_wcc_data(xdr, result->dir_attr);
+	error = decode_wcc_data(xdr, result->dir_attr, userns);
 out:
 	return error;
 }
@@ -1812,6 +1777,7 @@
 				   struct xdr_stream *xdr,
 				   void *data)
 {
+	struct user_namespace *userns = rpc_rqst_userns(req);
 	struct nfs3_diropres *result = data;
 	enum nfs_stat status;
 	int error;
@@ -1821,11 +1787,11 @@
 		goto out;
 	if (status != NFS3_OK)
 		goto out_default;
-	error = decode_create3resok(xdr, result);
+	error = decode_create3resok(xdr, result, userns);
 out:
 	return error;
 out_default:
-	error = decode_wcc_data(xdr, result->dir_attr);
+	error = decode_wcc_data(xdr, result->dir_attr, userns);
 	if (unlikely(error))
 		goto out;
 	return nfs3_stat_to_errno(status);
@@ -1860,7 +1826,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->dir_attr);
+	error = decode_wcc_data(xdr, result->dir_attr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1895,6 +1861,7 @@
 				   struct xdr_stream *xdr,
 				   void *data)
 {
+	struct user_namespace *userns = rpc_rqst_userns(req);
 	struct nfs_renameres *result = data;
 	enum nfs_stat status;
 	int error;
@@ -1902,10 +1869,10 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->old_fattr);
+	error = decode_wcc_data(xdr, result->old_fattr, userns);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->new_fattr);
+	error = decode_wcc_data(xdr, result->new_fattr, userns);
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1939,6 +1906,7 @@
 static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 void *data)
 {
+	struct user_namespace *userns = rpc_rqst_userns(req);
 	struct nfs3_linkres *result = data;
 	enum nfs_stat status;
 	int error;
@@ -1946,10 +1914,10 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, userns);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->dir_attr);
+	error = decode_wcc_data(xdr, result->dir_attr, userns);
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -1998,18 +1966,19 @@
 int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		       bool plus)
 {
+	struct user_namespace *userns = rpc_userns(entry->server->client);
 	struct nfs_entry old = *entry;
 	__be32 *p;
 	int error;
 	u64 new_cookie;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EAGAIN;
 	if (*p == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(p == NULL))
-			goto out_overflow;
+		if (unlikely(!p))
+			return -EAGAIN;
 		if (*p == xdr_zero)
 			return -EAGAIN;
 		entry->eof = 1;
@@ -2032,7 +2001,7 @@
 
 	if (plus) {
 		entry->fattr->valid = 0;
-		error = decode_post_op_attr(xdr, entry->fattr);
+		error = decode_post_op_attr(xdr, entry->fattr, userns);
 		if (unlikely(error))
 			return error;
 		if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
@@ -2045,8 +2014,8 @@
 
 		/* In fact, a post_op_fh3: */
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(p == NULL))
-			goto out_overflow;
+		if (unlikely(!p))
+			return -EAGAIN;
 		if (*p != xdr_zero) {
 			error = decode_nfs_fh3(xdr, entry->fh);
 			if (unlikely(error)) {
@@ -2063,9 +2032,6 @@
 
 	return 0;
 
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EAGAIN;
 out_truncated:
 	dprintk("NFS: directory entry contains invalid file handle\n");
 	*entry = old;
@@ -2107,11 +2073,12 @@
 }
 
 static int decode_readdir3resok(struct xdr_stream *xdr,
-				struct nfs3_readdirres *result)
+				struct nfs3_readdirres *result,
+				struct user_namespace *userns)
 {
 	int error;
 
-	error = decode_post_op_attr(xdr, result->dir_attr);
+	error = decode_post_op_attr(xdr, result->dir_attr, userns);
 	if (unlikely(error))
 		goto out;
 	/* XXX: do we need to check if result->verf != NULL ? */
@@ -2136,11 +2103,11 @@
 		goto out;
 	if (status != NFS3_OK)
 		goto out_default;
-	error = decode_readdir3resok(xdr, result);
+	error = decode_readdir3resok(xdr, result, rpc_rqst_userns(req));
 out:
 	return error;
 out_default:
-	error = decode_post_op_attr(xdr, result->dir_attr);
+	error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	return nfs3_stat_to_errno(status);
@@ -2177,8 +2144,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 8 * 6 + 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	p = xdr_decode_size3(p, &result->tbytes);
 	p = xdr_decode_size3(p, &result->fbytes);
 	p = xdr_decode_size3(p, &result->abytes);
@@ -2187,9 +2154,6 @@
 	xdr_decode_size3(p, &result->afiles);
 	/* ignore invarsec */
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
@@ -2203,7 +2167,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -2249,8 +2213,8 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	result->rtmax  = be32_to_cpup(p++);
 	result->rtpref = be32_to_cpup(p++);
 	result->rtmult = be32_to_cpup(p++);
@@ -2264,9 +2228,6 @@
 	/* ignore properties */
 	result->lease_time = 0;
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
@@ -2280,7 +2241,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -2322,15 +2283,12 @@
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4 * 6);
-	if (unlikely(p == NULL))
-		goto out_overflow;
+	if (unlikely(!p))
+		return -EIO;
 	result->max_link = be32_to_cpup(p++);
 	result->max_namelen = be32_to_cpup(p);
 	/* ignore remaining fields */
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
@@ -2344,7 +2302,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	if (status != NFS3_OK)
@@ -2386,7 +2344,7 @@
 	error = decode_nfsstat3(xdr, &status);
 	if (unlikely(error))
 		goto out;
-	error = decode_wcc_data(xdr, result->fattr);
+	error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
 	if (unlikely(error))
 		goto out;
 	result->op_status = status;
@@ -2402,14 +2360,15 @@
 #ifdef CONFIG_NFS_V3_ACL
 
 static inline int decode_getacl3resok(struct xdr_stream *xdr,
-				      struct nfs3_getaclres *result)
+				      struct nfs3_getaclres *result,
+				      struct user_namespace *userns)
 {
 	struct posix_acl **acl;
 	unsigned int *aclcnt;
 	size_t hdrlen;
 	int error;
 
-	error = decode_post_op_attr(xdr, result->fattr);
+	error = decode_post_op_attr(xdr, result->fattr, userns);
 	if (unlikely(error))
 		goto out;
 	error = decode_uint32(xdr, &result->mask);
@@ -2457,7 +2416,7 @@
 		goto out;
 	if (status != NFS3_OK)
 		goto out_default;
-	error = decode_getacl3resok(xdr, result);
+	error = decode_getacl3resok(xdr, result, rpc_rqst_userns(req));
 out:
 	return error;
 out_default:
@@ -2476,7 +2435,7 @@
 		goto out;
 	if (status != NFS3_OK)
 		goto out_default;
-	error = decode_post_op_attr(xdr, result);
+	error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
 out:
 	return error;
 out_default:
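
Note on the nfs3xdr.c sizing changes: the extra "+1" in each *_res_sz macro
reserves one XDR word for the pad that may follow variable-length data placed
in the page part of the receive buffer. For READ, the worst case breaks down
as follows (a sketch of the arithmetic, in 4-byte XDR words):

/*
 * NFS3_readres_sz = 1 (status) + NFS3_post_op_attr_sz
 *                 + 3 (count, eof, opaque length) + 1 (pad, new)
 *
 * After the first reply, nfs3_xdr_dec_read3res() measures the real
 * header: replen = words(status + post-op attrs) + 4, where the fixed 4
 * covers count, eof, the opaque length word and the pad. nfs3_read_done()
 * stores it in server->read_hdrsize, and nfs3_xdr_enc_read3args() uses it
 * instead of the worst case on later READs.
 */
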
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 19ec38f..901cca7 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -20,5 +20,8 @@
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 				   struct nfs42_layoutstat_data *);
 int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+			   const struct nfs42_layout_error *errors,
+			   size_t n);
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd..5196bfa 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -329,9 +329,6 @@
 	};
 	ssize_t err, err2;
 
-	if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
-		return -EOPNOTSUPP;
-
 	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
 	if (IS_ERR(src_lock))
 		return PTR_ERR(src_lock);
@@ -672,6 +669,170 @@
 	return 0;
 }
 
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+	struct nfs42_layouterror_data *data;
+	struct inode *inode = lseg->pls_layout->plh_inode;
+
+	data = kzalloc(sizeof(*data), gfp_flags);
+	if (data) {
+		data->args.inode = data->inode = nfs_igrab_and_active(inode);
+		if (data->inode) {
+			data->lseg = pnfs_get_lseg(lseg);
+			if (data->lseg)
+				return data;
+			nfs_iput_and_deactive(data->inode);
+		}
+		kfree(data);
+	}
+	return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+	pnfs_put_lseg(data->lseg);
+	nfs_iput_and_deactive(data->inode);
+	kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+	unsigned i;
+
+	spin_lock(&inode->i_lock);
+	if (!pnfs_layout_is_valid(lo)) {
+		spin_unlock(&inode->i_lock);
+		rpc_exit(task, 0);
+		return;
+	}
+	for (i = 0; i < data->args.num_errors; i++)
+		nfs4_stateid_copy(&data->args.errors[i].stateid,
+				&lo->plh_stateid);
+	spin_unlock(&inode->i_lock);
+	nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+			    &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_BADHANDLE:
+	case -ESTALE:
+		pnfs_destroy_layout(NFS_I(inode));
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match(&data->args.errors[0].stateid,
+					     &lo->plh_stateid)) {
+			LIST_HEAD(head);
+
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
+			pnfs_mark_layout_stateid_invalid(lo, &head);
+			spin_unlock(&inode->i_lock);
+			pnfs_free_lseg_list(&head);
+			nfs_commit_inode(inode, 0);
+		} else
+			spin_unlock(&inode->i_lock);
+		break;
+	case -NFS4ERR_OLD_STATEID:
+		spin_lock(&inode->i_lock);
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match_other(&data->args.errors[0].stateid,
+					&lo->plh_stateid)) {
+			/* Do we need to delay before resending? */
+			if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+						&data->args.errors[0].stateid))
+				rpc_delay(task, HZ);
+			rpc_restart_call_prepare(task);
+		}
+		spin_unlock(&inode->i_lock);
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+	}
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+
+	nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+	.rpc_call_prepare = nfs42_layouterror_prepare,
+	.rpc_call_done = nfs42_layouterror_done,
+	.rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+		const struct nfs42_layout_error *errors, size_t n)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs42_layouterror_data *data;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layouterror_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	unsigned int i;
+
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+		return -EOPNOTSUPP;
+	if (n > NFS42_LAYOUTERROR_MAX)
+		return -EINVAL;
+	data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+	if (!data)
+		return -ENOMEM;
+	for (i = 0; i < n; i++) {
+		data->args.errors[i] = errors[i];
+		data->args.num_errors++;
+		data->res.num_errors++;
+	}
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	task_setup.callback_data = data;
+	task_setup.rpc_client = NFS_SERVER(inode)->client;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
 static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
 		struct file *dst_f, struct nfs_lock_context *src_lock,
 		struct nfs_lock_context *dst_lock, loff_t src_offset,
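
Usage note: nfs42_proc_layouterror() is fire-and-forget. It queues an async
RPC task and returns once the task is running; nfs42_layouterror_prepare()
fills in each error's stateid from the current layout stateid, so callers can
leave that field zeroed. A hypothetical caller (field names follow the
encoder in nfs42xdr.c below):

static void example_report_one_error(struct pnfs_layout_segment *lseg,
				     const struct nfs4_deviceid *dev_id,
				     u64 offset, u64 length,
				     u32 nfserr, u32 opnum)
{
	struct nfs42_layout_error error = {
		.offset		= offset,
		.length		= length,
		.errors[0]	= {
			.dev_id	= *dev_id,
			.status	= nfserr,
			.opnum	= opnum,
		},
	};

	nfs42_proc_layouterror(lseg, &error, 1);	/* returns once queued */
}
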
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 69f72ed..aed865a 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -51,6 +51,15 @@
 					1 /* opaque devaddr4 length */ + \
 					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
 #define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
+#define encode_device_error_maxsz	(XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+					1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					1 /* Array size */ + \
+					encode_device_error_maxsz)
+#define decode_layouterror_maxsz	(op_decode_hdr_maxsz)
 #define encode_clone_maxsz		(encode_stateid_maxsz + \
 					encode_stateid_maxsz + \
 					2 /* src offset */ + \
@@ -59,43 +68,53 @@
 #define decode_clone_maxsz		(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_allocate_maxsz + \
 					 encode_getattr_maxsz)
 #define NFS4_dec_allocate_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_allocate_maxsz + \
 					 decode_getattr_maxsz)
 #define NFS4_enc_copy_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_savefh_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_copy_maxsz + \
 					 encode_commit_maxsz)
 #define NFS4_dec_copy_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_savefh_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_copy_maxsz + \
 					 decode_commit_maxsz)
 #define NFS4_enc_offload_cancel_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_offload_cancel_maxsz)
 #define NFS4_dec_offload_cancel_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_offload_cancel_maxsz)
 #define NFS4_enc_deallocate_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_deallocate_maxsz + \
 					 encode_getattr_maxsz)
 #define NFS4_dec_deallocate_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_deallocate_maxsz + \
 					 decode_getattr_maxsz)
 #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
 					 encode_seek_maxsz)
 #define NFS4_dec_seek_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_seek_maxsz)
 #define NFS4_enc_layoutstats_sz		(compound_encode_hdr_maxsz + \
@@ -106,6 +125,16 @@
 					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 NFS42_LAYOUTERROR_MAX * \
+					 encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 NFS42_LAYOUTERROR_MAX * \
+					 decode_layouterror_maxsz)
 #define NFS4_enc_clone_sz		(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
@@ -223,6 +252,34 @@
 	xdr_encode_hyper(p, args->count);
 }
 
+static void encode_device_error(struct xdr_stream *xdr,
+				const struct nfs42_device_error *error)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+	p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+			NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(error->status);
+	*p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+			       const struct nfs42_layout_error *args,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+	p = reserve_space(xdr, 8 + 8);
+	p = xdr_encode_hyper(p, args->offset);
+	p = xdr_encode_hyper(p, args->length);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(1);
+	encode_device_error(xdr, &args->errors[0]);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -381,6 +438,27 @@
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const void *data)
+{
+	const struct nfs42_layouterror_args *args = data;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+	int i;
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	for (i = 0; i < args->num_errors; i++)
+		encode_layouterror(xdr, &args->errors[i], &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -394,7 +472,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	count = be32_to_cpup(p);
 	if (count > 1)
 		return -EREMOTEIO;
@@ -402,18 +480,14 @@
 		status = decode_opaque_fixed(xdr, &res->stateid,
 				NFS4_STATEID_SIZE);
 		if (unlikely(status))
-			goto out_overflow;
+			return -EIO;
 	}
 	p = xdr_inline_decode(xdr, 8 + 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	p = xdr_decode_hyper(p, &res->count);
 	res->verifier.committed = be32_to_cpup(p);
 	return decode_verifier(xdr, &res->verifier.verifier);
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_copy_requirements(struct xdr_stream *xdr,
@@ -422,14 +496,11 @@
 
 	p = xdr_inline_decode(xdr, 4 + 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 
 	res->consecutive = be32_to_cpup(p++);
 	res->synchronous = be32_to_cpup(p++);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
@@ -474,15 +545,11 @@
 
 	p = xdr_inline_decode(xdr, 4 + 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 
 	res->sr_eof = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &res->sr_offset);
 	return 0;
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_layoutstats(struct xdr_stream *xdr)
@@ -495,6 +562,11 @@
 	return decode_op_hdr(xdr, OP_CLONE);
 }
 
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -704,4 +776,30 @@
 	return status;
 }
 
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs42_layouterror_res *res = data;
+	struct compound_hdr hdr;
+	int status, i;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+
+	for (i = 0; i < res->num_errors && status == 0; i++)
+		status = decode_layouterror(xdr);
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
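
An aside on the wire format used above: xdr_encode_hyper() stores a 64-bit quantity as two big-endian 32-bit words, most significant word first, which is what encode_layouterror() relies on for the LAYOUTERROR offset and length fields. Below is a minimal userspace sketch of that layout; xdr_encode_hyper_demo() and the buffer handling are illustrative only, not kernel API.

/* Illustrative userspace model of XDR "hyper" (64-bit) marshalling:
 * the value goes on the wire big-endian, high word first. */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* htonl/ntohl */

static uint32_t *xdr_encode_hyper_demo(uint32_t *p, uint64_t val)
{
	*p++ = htonl(val >> 32);	/* most significant word first */
	*p++ = htonl(val & 0xffffffff);
	return p;
}

int main(void)
{
	uint32_t buf[4], *p = buf;

	p = xdr_encode_hyper_demo(p, 0x1122334455667788ULL);	/* offset */
	p = xdr_encode_hyper_demo(p, 4096);			/* length */
	for (unsigned int i = 0; i < 4; i++)
		printf("word %u: 0x%08x\n", i, ntohl(buf[i]));
	return 0;
}
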
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 63287d9..16b2e5c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -62,10 +62,11 @@
 	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
 	int	(*test_and_free_expired)(struct nfs_server *,
-			nfs4_stateid *, struct rpc_cred *);
+			nfs4_stateid *, const struct cred *);
 	struct nfs_seqid *
 		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
-	int	(*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
+	void	(*session_trunk)(struct rpc_clnt *clnt,
+			struct rpc_xprt *xprt, void *data);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -107,7 +108,7 @@
 	unsigned long        so_expires;
 	struct rb_node	     so_server_node;
 
-	struct rpc_cred	     *so_cred;	 /* Associated cred */
+	const struct cred    *so_cred;	 /* Associated cred */
 
 	spinlock_t	     so_lock;
 	atomic_t	     so_count;
@@ -190,9 +191,10 @@
 	unsigned int n_wronly;		/* Number of write-only references */
 	unsigned int n_rdwr;		/* Number of read/write references */
 	fmode_t state;			/* State on the server (R,W, or RW) */
-	atomic_t count;
+	refcount_t count;
 
 	wait_queue_head_t waitq;
+	struct rcu_head rcu_head;
 };
 
 
@@ -204,6 +206,7 @@
 	unsigned char delay : 1,
 		      recovering : 1,
 		      retry : 1;
+	bool interruptible;
 };
 
 struct nfs4_state_recovery_ops {
@@ -211,10 +214,10 @@
 	int state_flag_bit;
 	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
-	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
-	int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
+	int (*establish_clid)(struct nfs_client *, const struct cred *);
+	int (*reclaim_complete)(struct nfs_client *, const struct cred *);
 	int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
-		struct rpc_cred *);
+		const struct cred *);
 };
 
 struct nfs4_opendata {
@@ -244,19 +247,19 @@
 
 struct nfs4_add_xprt_data {
 	struct nfs_client	*clp;
-	struct rpc_cred		*cred;
+	const struct cred	*cred;
 };
 
 struct nfs4_state_maintenance_ops {
-	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
-	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
-	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
+	int (*sched_state_renewal)(struct nfs_client *, const struct cred *, unsigned);
+	const struct cred * (*get_state_renewal_cred)(struct nfs_client *);
+	int (*renew_lease)(struct nfs_client *, const struct cred *);
 };
 
 struct nfs4_mig_recovery_ops {
 	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
-		struct page *, struct rpc_cred *);
-	int (*fsid_present)(struct inode *, struct rpc_cred *);
+		struct page *, const struct cred *);
+	int (*fsid_present)(struct inode *, const struct cred *);
 };
 
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -285,21 +288,21 @@
 			  struct rpc_message *, struct nfs4_sequence_args *,
 			  struct nfs4_sequence_res *, int);
 extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int);
-extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
-extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
-extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred);
 extern int nfs4_destroy_clientid(struct nfs_client *clp);
-extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs41_init_clientid(struct nfs_client *, const struct cred *);
 extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
 				  struct nfs4_fs_locations *, struct page *);
 extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
-		struct page *page, struct rpc_cred *);
-extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
+		struct page *page, const struct cred *);
+extern int nfs4_proc_fsid_present(struct inode *, const struct cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
@@ -309,12 +312,12 @@
 		const struct nfs_lock_context *l_ctx,
 		fmode_t fmode);
 
-#if defined(CONFIG_NFS_V4_1)
-extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
-extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
+#if defined(CONFIG_NFS_V4_1)
+extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
+extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *);
 extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
 				  bool sync);
 extern int nfs4_detect_session_trunking(struct nfs_client *clp,
@@ -337,7 +340,6 @@
 _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
 		    struct rpc_clnt **clntp, struct rpc_message *msg)
 {
-	struct rpc_cred *newcred = NULL;
 	rpc_authflavor_t flavor;
 
 	if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
@@ -352,13 +354,7 @@
 			return false;
 	}
 	if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
-		spin_lock(&clp->cl_lock);
-		if (clp->cl_machine_cred != NULL)
-			/* don't call get_rpccred on the machine cred -
-			 * a reference will be held for life of clp */
-			newcred = clp->cl_machine_cred;
-		spin_unlock(&clp->cl_lock);
-		msg->rpc_cred = newcred;
+		msg->rpc_cred = rpc_machine_cred();
 
 		flavor = clp->cl_rpcclient->cl_auth->au_flavor;
 		WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
@@ -449,16 +445,16 @@
 
 
 /* nfs4state.c */
-struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp);
 int nfs4_discover_server_trunking(struct nfs_client *clp,
 			struct nfs_client **);
 int nfs40_discover_server_trunking(struct nfs_client *clp,
-			struct nfs_client **, struct rpc_cred *);
+			struct nfs_client **, const struct cred *);
 #if defined(CONFIG_NFS_V4_1)
 int nfs41_discover_server_trunking(struct nfs_client *clp,
-			struct nfs_client **, struct rpc_cred *);
+			struct nfs_client **, const struct cred *);
 extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
 extern void nfs41_notify_server(struct nfs_client *);
 #else
@@ -467,9 +463,10 @@
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_purge_state_owners(struct nfs_server *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct nfs4_state *, fmode_t);
@@ -493,9 +490,7 @@
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
 		const struct nfs_lock_context *, nfs4_stateid *,
-		struct rpc_cred **);
-extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
-		struct nfs4_state *state);
+		const struct cred **);
 extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
 		struct nfs4_state *state);
 
@@ -577,6 +572,15 @@
 	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
 }
 
+static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
+{
+	u32 seqid = be32_to_cpu(s1->seqid);
+
+	if (++seqid == 0)
+		++seqid;
+	s1->seqid = cpu_to_be32(seqid);
+}
+
 static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
 {
 	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
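
Two details in the helpers above are easy to miss: nfs4_stateid_is_newer() compares seqids with wraparound-safe serial arithmetic, and the new nfs4_stateid_seqid_inc() skips zero when the counter wraps, since a zero seqid is special-cased in NFSv4. A self-contained sketch of both behaviours, assuming host-order u32 values (the kernel versions operate on big-endian fields):

/* Userspace sketch of the seqid helpers: an increment that skips 0 on
 * wraparound, and an "is newer" test that stays correct across it. */
#include <stdint.h>
#include <stdio.h>

static uint32_t seqid_inc(uint32_t seqid)
{
	if (++seqid == 0)	/* 0 is reserved; skip it on wrap */
		++seqid;
	return seqid;
}

static int seqid_is_newer(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	printf("%u\n", seqid_inc(1));			/* 2 */
	printf("%u\n", seqid_inc(0xffffffff));		/* 1, not 0 */
	printf("%d\n", seqid_is_newer(2, 1));		/* 1 */
	printf("%d\n", seqid_is_newer(1, 0xfffffffe));	/* 1: newer across the wrap */
	return 0;
}
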
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8f53455..da62040 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -42,7 +43,7 @@
 }
 
 #ifdef CONFIG_NFS_V4_1
-/**
+/*
  * Per auth flavor data server rpc clients
  */
 struct nfs4_ds_server {
@@ -51,7 +52,9 @@
 };
 
 /**
- * Common lookup case for DS I/O
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavor to match
  */
 static struct nfs4_ds_server *
 nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
@@ -118,9 +121,13 @@
 }
 
 /**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with the MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
 struct rpc_clnt *
 nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
 {
@@ -145,7 +152,6 @@
 nfs4_shutdown_ds_clients(struct nfs_client *clp)
 {
 	struct nfs4_ds_server *dss;
-	LIST_HEAD(shutdown_list);
 
 	while (!list_empty(&clp->cl_ds_clients)) {
 		dss = list_entry(clp->cl_ds_clients.next,
@@ -284,7 +290,7 @@
 
 /**
  * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
  *
  * Returns zero on success, or a negative errno if some error occurred.
  */
@@ -312,7 +318,7 @@
 
 /**
  * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
  *
  * Returns zero on success, or a negative errno if some error occurred.
  */
@@ -360,9 +366,7 @@
  * nfs4_init_client - Initialise an NFS4 client record
  *
  * @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: callback IP address in presentation format
- * @authflavor: authentication flavor for underlying RPC transport
+ * @cl_init: pointer to nfs_client_initdata
  *
  * Returns pointer to an NFS client, or an ERR_PTR value.
  */
@@ -545,7 +549,7 @@
  */
 int nfs40_walk_client_list(struct nfs_client *new,
 			   struct nfs_client **result,
-			   struct rpc_cred *cred)
+			   const struct cred *cred)
 {
 	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
 	struct nfs_client *pos, *prev = NULL;
@@ -649,13 +653,13 @@
 
 /**
  * nfs4_detect_session_trunking - Checks for session trunking.
- *
- * Called after a successful EXCHANGE_ID on a multi-addr connection.
- * Upon success, add the transport.
- *
  * @clp:    original mount nfs_client
  * @res:    result structure from an exchange_id using the original mount
  *          nfs_client with a new multi_addr transport
+ * @xprt:   pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
  *
  * Returns zero on success, otherwise -EINVAL
  *
@@ -711,7 +715,7 @@
  */
 int nfs41_walk_client_list(struct nfs_client *new,
 			   struct nfs_client **result,
-			   struct rpc_cred *cred)
+			   const struct cred *cred)
 {
 	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
 	struct nfs_client *pos, *prev = NULL;
@@ -754,9 +758,12 @@
 
 static void nfs4_destroy_server(struct nfs_server *server)
 {
+	LIST_HEAD(freeme);
+
 	nfs_server_return_all_delegations(server);
 	unset_pnfs_layoutdriver(server);
-	nfs4_purge_state_owners(server);
+	nfs4_purge_state_owners(server, &freeme);
+	nfs4_free_state_owners(&freeme);
 }
 
 /*
@@ -855,7 +862,8 @@
 		const size_t addrlen,
 		const char *ip_addr,
 		int proto, const struct rpc_timeout *timeparms,
-		u32 minorversion, struct net *net)
+		u32 minorversion, unsigned int nconnect,
+		struct net *net)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
@@ -867,9 +875,12 @@
 		.minorversion = minorversion,
 		.net = net,
 		.timeparms = timeparms,
+		.cred = server->cred,
 	};
 	struct nfs_client *clp;
 
+	if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = nconnect;
 	if (server->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	if (server->options & NFS_OPTION_MIGRATION)
@@ -928,6 +939,7 @@
 		.minorversion = minor_version,
 		.net = mds_clp->cl_net,
 		.timeparms = &ds_timeout,
+		.cred = mds_srv->cred,
 	};
 	char buf[INET6_ADDRSTRLEN + 1];
 
@@ -935,6 +947,9 @@
 		return ERR_PTR(-EINVAL);
 	cl_init.hostname = buf;
 
+	if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = mds_clp->cl_nconnect;
+
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
@@ -1068,6 +1083,7 @@
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
+			data->nfs_server.nconnect,
 			data->net);
 	if (error < 0)
 		return error;
@@ -1104,6 +1120,8 @@
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	server->cred = get_cred(current_cred());
+
 	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
 
 	/* set up the general RPC client */
@@ -1140,6 +1158,8 @@
 	parent_server = NFS_SB(data->sb);
 	parent_client = parent_server->nfs_client;
 
+	server->cred = get_cred(parent_server->cred);
+
 	/* Initialise the client representation from the parent server */
 	nfs_server_copy_userdata(server, parent_server);
 
@@ -1153,6 +1173,7 @@
 				XPRT_TRANSPORT_RDMA,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (!error)
 		goto init_server;
@@ -1166,6 +1187,7 @@
 				XPRT_TRANSPORT_TCP,
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
+				parent_client->cl_nconnect,
 				parent_client->cl_net);
 	if (error < 0)
 		goto error;
@@ -1261,7 +1283,8 @@
 	set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	error = nfs4_set_client(server, hostname, sap, salen, buf,
 				clp->cl_proto, clnt->cl_timeout,
-				clp->cl_minorversion, net);
+				clp->cl_minorversion,
+				clp->cl_nconnect, net);
 	clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
 	if (error != 0) {
 		nfs_server_insert_lists(server);
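
The nfs4_purge_state_owners()/nfs4_free_state_owners() split introduced above follows a common kernel pattern: detach entries onto a caller-provided list while the lock is held, then free them after the lock is dropped, so that freeing (which may sleep) never happens under the spinlock. A rough userspace analogue with a pthread mutex; all names here are invented for the demo.

/* Collect-under-lock, free-outside-lock, in miniature. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct owner {
	struct owner *next;
	int id;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct owner *table;	/* protected by table_lock */

static struct owner *purge_owners(void)
{
	struct owner *freeme;

	pthread_mutex_lock(&table_lock);
	freeme = table;		/* unlink everything onto a private list */
	table = NULL;
	pthread_mutex_unlock(&table_lock);
	return freeme;
}

static void free_owners(struct owner *list)
{
	while (list) {		/* no lock held: safe even if free() blocks */
		struct owner *next = list->next;

		printf("freeing owner %d\n", list->id);
		free(list);
		list = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct owner *o = malloc(sizeof(*o));

		o->id = i;
		o->next = table;
		table = o;
	}
	free_owners(purge_owners());
	return 0;
}
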
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4288a6e..339663d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -49,7 +49,7 @@
 		return err;
 
 	if ((openflags & O_ACCMODE) == 3)
-		openflags--;
+		return nfs_open(inode, filp);
 
 	/* We can't create new files here */
 	openflags &= ~(O_CREAT|O_EXCL);
@@ -73,13 +73,13 @@
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		switch (err) {
-		case -EPERM:
-		case -EACCES:
-		case -EDQUOT:
-		case -ENOSPC:
-		case -EROFS:
-			goto out_put_ctx;
 		default:
+			goto out_put_ctx;
+		case -ENOENT:
+		case -ESTALE:
+		case -EISDIR:
+		case -ENOTDIR:
+		case -ELOOP:
 			goto out_drop;
 		}
 	}
@@ -125,22 +125,35 @@
 		return filemap_fdatawrite(file->f_mapping);
 
 	/* Flush writes to the server and return any errors */
-	return vfs_fsync(file, 0);
+	return nfs_wb_all(inode);
 }
 
 #ifdef CONFIG_NFS_V4_2
+static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+				      struct file *file_out, loff_t pos_out,
+				      size_t count, unsigned int flags)
+{
+	/* Only offload copy if superblock is the same */
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+		return -EXDEV;
+	if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+		return -EOPNOTSUPP;
+	if (file_inode(file_in) == file_inode(file_out))
+		return -EOPNOTSUPP;
+	return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
 static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    size_t count, unsigned int flags)
 {
 	ssize_t ret;
 
-	if (file_inode(file_in) == file_inode(file_out))
-		return -EINVAL;
-retry:
-	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
-	if (ret == -EAGAIN)
-		goto retry;
+	ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
+				     flags);
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(file_in, pos_in, file_out,
+					      pos_out, count, flags);
 	return ret;
 }
 
@@ -180,8 +193,9 @@
 	return nfs42_proc_allocate(filep, offset, len);
 }
 
-static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
-		struct file *dst_file, loff_t dst_off, u64 count)
+static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
+		struct file *dst_file, loff_t dst_off, loff_t count,
+		unsigned int remap_flags)
 {
 	struct inode *dst_inode = file_inode(dst_file);
 	struct nfs_server *server = NFS_SERVER(dst_inode);
@@ -190,6 +204,13 @@
 	bool same_inode = false;
 	int ret;
 
+	/* NFS does not support deduplication. */
+	if (remap_flags & REMAP_FILE_DEDUP)
+		return -EOPNOTSUPP;
+
+	if (remap_flags & ~REMAP_FILE_ADVISORY)
+		return -EINVAL;
+
 	/* check alignment w.r.t. clone_blksize */
 	ret = -EINVAL;
 	if (bs) {
@@ -240,7 +261,7 @@
 		inode_unlock(src_inode);
 	}
 out:
-	return ret;
+	return ret < 0 ? ret : count;
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -262,7 +283,7 @@
 	.copy_file_range = nfs4_copy_file_range,
 	.llseek		= nfs4_file_llseek,
 	.fallocate	= nfs42_fallocate,
-	.clone_file_range = nfs42_clone_file_range,
+	.remap_file_range = nfs42_remap_file_range,
 #else
 	.llseek		= nfs_file_llseek,
 #endif
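
The copy_file_range fallback above (offload when both files live on the same superblock and the server supports COPY, otherwise fall back on -EXDEV/-EOPNOTSUPP) can be imitated from userspace with the copy_file_range(2) syscall, available in glibc 2.27+. In this sketch a plain read/write loop stands in for generic_copy_file_range(), and error handling is abbreviated.

/* Userspace analogue of nfs4_copy_file_range()'s fallback: try the
 * offloaded copy first, drop to a byte copy when it is unsupported. */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

static ssize_t copy_range_with_fallback(int fd_in, int fd_out, size_t count)
{
	ssize_t ret = copy_file_range(fd_in, NULL, fd_out, NULL, count, 0);
	ssize_t copied = 0;
	char buf[4096];

	if (ret >= 0 || (errno != EXDEV && errno != EOPNOTSUPP))
		return ret;

	/* Fallback: ordinary read/write loop. */
	while ((size_t)copied < count) {
		size_t want = count - (size_t)copied;
		ssize_t n;

		if (want > sizeof(buf))
			want = sizeof(buf);
		n = read(fd_in, buf, want);
		if (n <= 0)
			return copied ? copied : n;
		if (write(fd_out, buf, n) != n)
			return -1;
		copied += n;
	}
	return copied;
}

int main(int argc, char **argv)
{
	struct stat st;
	int in, out;

	if (argc != 3)
		return 1;
	in = open(argv[1], O_RDONLY);
	out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return 1;
	return copy_range_with_fallback(in, out, st.st_size) < 0;
}
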
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b68..1e72963 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
 #include <linux/module.h>
 
 #include "internal.h"
@@ -59,7 +60,7 @@
 struct idmap_legacy_upcalldata {
 	struct rpc_pipe_msg pipe_msg;
 	struct idmap_msg idmap_msg;
-	struct key_construction	*key_cons;
+	struct key	*authkey;
 	struct idmap *idmap;
 };
 
@@ -68,8 +69,16 @@
 	struct rpc_pipe		*idmap_pipe;
 	struct idmap_legacy_upcalldata *idmap_upcall_data;
 	struct mutex		idmap_mutex;
+	const struct cred	*cred;
 };
 
+static struct user_namespace *idmap_userns(const struct idmap *idmap)
+{
+	if (idmap && idmap->cred)
+		return idmap->cred->user_ns;
+	return &init_user_ns;
+}
+
 /**
  * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
  * @fattr: fully initialised struct nfs_fattr
@@ -270,18 +279,19 @@
 					 const char *type, struct idmap *idmap)
 {
 	char *desc;
-	struct key *rkey;
+	struct key *rkey = ERR_PTR(-EAGAIN);
 	ssize_t ret;
 
 	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	rkey = request_key(&key_type_id_resolver, desc, "");
+	if (!idmap->cred || idmap->cred->user_ns == &init_user_ns)
+		rkey = request_key(&key_type_id_resolver, desc, "");
 	if (IS_ERR(rkey)) {
 		mutex_lock(&idmap->idmap_mutex);
 		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
-						desc, "", 0, idmap);
+						desc, NULL, "", 0, idmap);
 		mutex_unlock(&idmap->idmap_mutex);
 	}
 	if (!IS_ERR(rkey))
@@ -384,7 +394,7 @@
 	{ Opt_find_err, NULL }
 };
 
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
 static void idmap_release_pipe(struct inode *);
@@ -451,6 +461,9 @@
 	if (idmap == NULL)
 		return -ENOMEM;
 
+	mutex_init(&idmap->idmap_mutex);
+	idmap->cred = get_cred(clp->cl_rpcclient->cl_cred);
+
 	rpc_init_pipe_dir_object(&idmap->idmap_pdo,
 			&nfs_idmap_pipe_dir_object_ops,
 			idmap);
@@ -461,7 +474,6 @@
 		goto err;
 	}
 	idmap->idmap_pipe = pipe;
-	mutex_init(&idmap->idmap_mutex);
 
 	error = rpc_add_pipe_dir_object(clp->cl_net,
 			&clp->cl_rpcclient->cl_pipedir_objects,
@@ -474,6 +486,7 @@
 err_destroy_pipe:
 	rpc_destroy_pipe_data(idmap->idmap_pipe);
 err:
+	put_cred(idmap->cred);
 	kfree(idmap);
 	return error;
 }
@@ -490,6 +503,7 @@
 			&clp->cl_rpcclient->cl_pipedir_objects,
 			&idmap->idmap_pdo);
 	rpc_destroy_pipe_data(idmap->idmap_pipe);
+	put_cred(idmap->cred);
 	kfree(idmap);
 }
 
@@ -549,11 +563,12 @@
 static void
 nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
 {
-	struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+	struct key *authkey = idmap->idmap_upcall_data->authkey;
 
 	kfree(idmap->idmap_upcall_data);
 	idmap->idmap_upcall_data = NULL;
-	complete_request_key(cons, ret);
+	complete_request_key(authkey, ret);
+	key_put(authkey);
 }
 
 static void
@@ -563,15 +578,14 @@
 		nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
 }
 
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
-				   const char *op,
-				   void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
 {
 	struct idmap_legacy_upcalldata *data;
+	struct request_key_auth *rka = get_request_key_auth(authkey);
 	struct rpc_pipe_msg *msg;
 	struct idmap_msg *im;
 	struct idmap *idmap = (struct idmap *)aux;
-	struct key *key = cons->key;
+	struct key *key = rka->target_key;
 	int ret = -ENOKEY;
 
 	if (!aux)
@@ -586,7 +600,7 @@
 	msg = &data->pipe_msg;
 	im = &data->idmap_msg;
 	data->idmap = idmap;
-	data->key_cons = cons;
+	data->authkey = key_get(authkey);
 
 	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
 	if (ret < 0)
@@ -604,7 +618,7 @@
 out2:
 	kfree(data);
 out1:
-	complete_request_key(cons, ret);
+	complete_request_key(authkey, ret);
 	return ret;
 }
 
@@ -651,9 +665,10 @@
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
+	struct request_key_auth *rka;
 	struct rpc_inode *rpci = RPC_I(file_inode(filp));
 	struct idmap *idmap = (struct idmap *)rpci->private;
-	struct key_construction *cons;
+	struct key *authkey;
 	struct idmap_msg im;
 	size_t namelen_in;
 	int ret = -ENOKEY;
@@ -665,7 +680,8 @@
 	if (idmap->idmap_upcall_data == NULL)
 		goto out_noupcall;
 
-	cons = idmap->idmap_upcall_data->key_cons;
+	authkey = idmap->idmap_upcall_data->authkey;
+	rka = get_request_key_auth(authkey);
 
 	if (mlen != sizeof(im)) {
 		ret = -ENOSPC;
@@ -690,9 +706,9 @@
 
 	ret = nfs_idmap_read_and_verify_message(&im,
 			&idmap->idmap_upcall_data->idmap_msg,
-			cons->key, cons->authkey);
+			rka->target_key, authkey);
 	if (ret >= 0) {
-		key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+		key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
 		ret = mlen;
 	}
 
@@ -732,7 +748,7 @@
 	if (!nfs_map_string_to_numeric(name, namelen, &id))
 		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
 	if (ret == 0) {
-		*uid = make_kuid(&init_user_ns, id);
+		*uid = make_kuid(idmap_userns(idmap), id);
 		if (!uid_valid(*uid))
 			ret = -ERANGE;
 	}
@@ -749,7 +765,7 @@
 	if (!nfs_map_string_to_numeric(name, namelen, &id))
 		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
 	if (ret == 0) {
-		*gid = make_kgid(&init_user_ns, id);
+		*gid = make_kgid(idmap_userns(idmap), id);
 		if (!gid_valid(*gid))
 			ret = -ERANGE;
 	}
@@ -763,7 +779,7 @@
 	int ret = -EINVAL;
 	__u32 id;
 
-	id = from_kuid(&init_user_ns, uid);
+	id = from_kuid_munged(idmap_userns(idmap), uid);
 	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
 		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
 	if (ret < 0)
@@ -777,7 +793,7 @@
 	int ret = -EINVAL;
 	__u32 id;
 
-	id = from_kgid(&init_user_ns, gid);
+	id = from_kgid_munged(idmap_userns(idmap), gid);
 	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
 		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
 	if (ret < 0)
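
As in nfs_map_name_to_uid() above, the idmapping strategy is a numeric fast path (nfs_map_string_to_numeric) followed by a resolver lookup. A userspace sketch with getpwnam(3) standing in for the idmapper upcall; this is illustrative only, the kernel path goes through the keyring and rpc.idmapd pipe instead.

/* Numeric fast path, then resolver fallback, in miniature. */
#include <errno.h>
#include <pwd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

static int map_name_to_uid(const char *name, uid_t *uid)
{
	char *end;
	unsigned long id = strtoul(name, &end, 10);

	if (*name != '\0' && *end == '\0') {	/* all digits: use as-is */
		*uid = (uid_t)id;
		return 0;
	}

	struct passwd *pw = getpwnam(name);	/* "resolver" fallback */
	if (!pw)
		return -EINVAL;
	*uid = pw->pw_uid;
	return 0;
}

int main(void)
{
	uid_t uid;

	if (map_name_to_uid("1000", &uid) == 0)
		printf("numeric: %u\n", (unsigned)uid);
	if (map_name_to_uid("root", &uid) == 0)
		printf("resolved: %u\n", (unsigned)uid);
	return 0;
}
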
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24f06dc..2e460c3 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@
 
 /**
  * nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
  * @server: NFS server struct
  * @flavors: List of security tuples returned by SECINFO procedure
  *
@@ -288,8 +289,8 @@
 
 /**
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry - parent directory
- * @locations - array of NFSv4 server location information
+ * @dentry: parent directory
+ * @locations: array of NFSv4 server location information
  *
  */
 static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8220a16..caacf5e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -93,19 +93,19 @@
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label, struct inode *inode);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode);
-static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
 			    struct nfs_open_context *ctx, struct nfs4_label *ilabel,
 			    struct nfs4_label *olabel);
 #ifdef CONFIG_NFS_V4_1
 static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
-		struct rpc_cred *cred,
+		const struct cred *cred,
 		struct nfs4_slot *slot,
 		bool is_privileged);
 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
-		struct rpc_cred *);
+		const struct cred *);
 static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
-		struct rpc_cred *, bool);
+		const struct cred *, bool);
 #endif
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -361,7 +361,7 @@
 
 static void nfs4_test_and_free_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
 
@@ -370,7 +370,7 @@
 
 static void __nfs4_free_revoked_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	stateid->type = NFS4_REVOKED_STATEID_TYPE;
 	nfs4_test_and_free_stateid(server, stateid, cred);
@@ -378,7 +378,7 @@
 
 static void nfs4_free_revoked_stateid(struct nfs_server *server,
 		const nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	nfs4_stateid tmp;
 
@@ -400,17 +400,48 @@
 	return ret;
 }
 
-static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+static int nfs4_delay_killable(long *timeout)
 {
-	int res = 0;
-
 	might_sleep();
 
 	freezable_schedule_timeout_killable_unsafe(
 		nfs4_update_delay(timeout));
-	if (fatal_signal_pending(current))
-		res = -ERESTARTSYS;
-	return res;
+	if (!__fatal_signal_pending(current))
+		return 0;
+	return -EINTR;
+}
+
+static int nfs4_delay_interruptible(long *timeout)
+{
+	might_sleep();
+
+	freezable_schedule_timeout_interruptible(nfs4_update_delay(timeout));
+	if (!signal_pending(current))
+		return 0;
+	return __fatal_signal_pending(current) ? -EINTR : -ERESTARTSYS;
+}
+
+static int nfs4_delay(long *timeout, bool interruptible)
+{
+	if (interruptible)
+		return nfs4_delay_interruptible(timeout);
+	return nfs4_delay_killable(timeout);
+}
+
+static const nfs4_stateid *
+nfs4_recoverable_stateid(const nfs4_stateid *stateid)
+{
+	if (!stateid)
+		return NULL;
+	switch (stateid->type) {
+	case NFS4_OPEN_STATEID_TYPE:
+	case NFS4_LOCK_STATEID_TYPE:
+	case NFS4_DELEGATION_STATEID_TYPE:
+		return stateid;
+	default:
+		break;
+	}
+	return NULL;
 }
 
 /* This is the error handling routine for processes that are allowed
@@ -421,7 +452,7 @@
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
-	const nfs4_stateid *stateid = exception->stateid;
+	const nfs4_stateid *stateid;
 	struct inode *inode = exception->inode;
 	int ret = errorcode;
 
@@ -429,8 +460,9 @@
 	exception->recovering = 0;
 	exception->retry = 0;
 
+	stateid = nfs4_recoverable_stateid(exception->stateid);
 	if (stateid == NULL && state != NULL)
-		stateid = &state->stateid;
+		stateid = nfs4_recoverable_stateid(&state->stateid);
 
 	switch(errorcode) {
 		case 0:
@@ -546,7 +578,8 @@
 
 	ret = nfs4_do_handle_exception(server, errorcode, exception);
 	if (exception->delay) {
-		ret = nfs4_delay(server->client, &exception->timeout);
+		ret = nfs4_delay(&exception->timeout,
+				exception->interruptible);
 		goto out_retry;
 	}
 	if (exception->recovering) {
@@ -730,33 +763,41 @@
 	res->sr_slot = NULL;
 }
 
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+		u32 seqnr)
+{
+	if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
+		slot->seq_nr_highest_sent = seqnr;
+}
+
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
+		u32 seqnr)
+{
+	slot->seq_nr_highest_sent = seqnr;
+	slot->seq_nr_last_acked = seqnr;
+}
+
 static int nfs41_sequence_process(struct rpc_task *task,
 		struct nfs4_sequence_res *res)
 {
 	struct nfs4_session *session;
 	struct nfs4_slot *slot = res->sr_slot;
 	struct nfs_client *clp;
-	bool interrupted = false;
 	int ret = 1;
 
 	if (slot == NULL)
 		goto out_noaction;
 	/* don't increment the sequence number if the task wasn't sent */
-	if (!RPC_WAS_SENT(task))
+	if (!RPC_WAS_SENT(task) || slot->seq_done)
 		goto out;
 
 	session = slot->table->session;
 
-	if (slot->interrupted) {
-		if (res->sr_status != -NFS4ERR_DELAY)
-			slot->interrupted = 0;
-		interrupted = true;
-	}
-
 	trace_nfs4_sequence_done(session, res);
 	/* Check the SEQUENCE operation status */
 	switch (res->sr_status) {
 	case 0:
+		/* Mark this sequence number as having been acked */
+		nfs4_slot_sequence_acked(slot, slot->seq_nr);
 		/* Update the slot's sequence and clientid lease timer */
 		slot->seq_done = 1;
 		clp = session->clp;
@@ -771,9 +812,9 @@
 		 * sr_status remains 1 if an RPC level error occurred.
 		 * The server may or may not have processed the sequence
 		 * operation..
-		 * Mark the slot as having hosted an interrupted RPC call.
 		 */
-		slot->interrupted = 1;
+		nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+		slot->seq_done = 1;
 		goto out;
 	case -NFS4ERR_DELAY:
 		/* The server detected a resend of the RPC call and
@@ -784,6 +825,7 @@
 			__func__,
 			slot->slot_nr,
 			slot->seq_nr);
+		nfs4_slot_sequence_acked(slot, slot->seq_nr);
 		goto out_retry;
 	case -NFS4ERR_RETRY_UNCACHED_REP:
 	case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -791,6 +833,7 @@
 		 * The server thinks we tried to replay a request.
 		 * Retry the call after bumping the sequence ID.
 		 */
+		nfs4_slot_sequence_acked(slot, slot->seq_nr);
 		goto retry_new_seq;
 	case -NFS4ERR_BADSLOT:
 		/*
@@ -801,21 +844,28 @@
 			goto session_recover;
 		goto retry_nowait;
 	case -NFS4ERR_SEQ_MISORDERED:
+		nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
 		/*
-		 * Was the last operation on this sequence interrupted?
-		 * If so, retry after bumping the sequence number.
+		 * Were one or more calls using this slot interrupted?
+		 * If the server never received the request, then our
+		 * transmitted slot sequence number may be too high.
 		 */
-		if (interrupted)
-			goto retry_new_seq;
-		/*
-		 * Could this slot have been previously retired?
-		 * If so, then the server may be expecting seq_nr = 1!
-		 */
-		if (slot->seq_nr != 1) {
-			slot->seq_nr = 1;
+		if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+			slot->seq_nr--;
 			goto retry_nowait;
 		}
-		goto session_recover;
+		/*
+		 * RFC5661:
+		 * A retry might be sent while the original request is
+		 * still in progress on the replier. The replier SHOULD
+		 * deal with the issue by returning NFS4ERR_DELAY as the
+		 * reply to SEQUENCE or CB_SEQUENCE operation, but
+		 * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+		 *
+		 * Restart the search after a delay.
+		 */
+		slot->seq_nr = slot->seq_nr_highest_sent;
+		goto out_retry;
 	default:
 		/* Just update the slot sequence no. */
 		slot->seq_done = 1;
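
To make the NFS4ERR_SEQ_MISORDERED branch above easier to follow, here is an illustrative model of the decision it encodes: with more than one sent-but-unacknowledged seqid, step the slot back one and resend at once; otherwise assume the original call may still be in flight and restart from the highest seqnr sent, after a delay. All names below are invented for the demo; only the arithmetic mirrors the kernel logic.

/* Model of the SEQ_MISORDERED recovery decision. */
#include <stdint.h>
#include <stdio.h>

enum misordered_action { RETRY_NOW_LOWER_SEQ, RETRY_AFTER_DELAY };

static enum misordered_action
handle_seq_misordered(uint32_t *seq_nr, uint32_t highest_sent,
		      uint32_t last_acked)
{
	/* More than one sent-but-unacked seqid: the server may never
	 * have seen the previous request, so step back and resend. */
	if ((int32_t)(*seq_nr - last_acked) > 1) {
		(*seq_nr)--;
		return RETRY_NOW_LOWER_SEQ;
	}
	/* Otherwise restart from the highest seqnr transmitted. */
	*seq_nr = highest_sent;
	return RETRY_AFTER_DELAY;
}

int main(void)
{
	uint32_t seq = 7;

	printf("%d (seq=%u)\n", handle_seq_misordered(&seq, 7, 5), seq);
	printf("%d (seq=%u)\n", handle_seq_misordered(&seq, 6, 5), seq);
	return 0;
}
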
@@ -906,17 +956,6 @@
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
-		struct nfs4_slot *slot, struct rpc_cred *cred)
-{
-	struct rpc_task *task;
-
-	task = _nfs41_proc_sequence(client, cred, slot, true);
-	if (!IS_ERR(task))
-		rpc_put_task_async(task);
-}
-
 #else	/* !CONFIG_NFS_V4_1 */
 
 static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -937,16 +976,15 @@
 }
 EXPORT_SYMBOL_GPL(nfs4_sequence_done);
 
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
-		struct nfs4_slot *slot, struct rpc_cred *cred)
-{
-	WARN_ON_ONCE(1);
-	slot->interrupted = 0;
-}
-
 #endif	/* !CONFIG_NFS_V4_1 */
 
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
+{
+	res->sr_timestamp = jiffies;
+	res->sr_status_flags = 0;
+	res->sr_status = 1;
+}
+
 static
 void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
 		struct nfs4_sequence_res *res,
@@ -958,10 +996,6 @@
 	args->sa_slot = slot;
 
 	res->sr_slot = slot;
-	res->sr_timestamp = jiffies;
-	res->sr_status_flags = 0;
-	res->sr_status = 1;
-
 }
 
 int nfs4_setup_sequence(struct nfs_client *client,
@@ -977,43 +1011,43 @@
 	if (res->sr_slot != NULL)
 		goto out_start;
 
-	if (session) {
+	if (session)
 		tbl = &session->fc_slot_table;
-		task->tk_timeout = 0;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	/* The state manager will wait until the slot table is empty */
+	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+		goto out_sleep;
+
+	slot = nfs4_alloc_slot(tbl);
+	if (IS_ERR(slot)) {
+		if (slot == ERR_PTR(-ENOMEM))
+			goto out_sleep_timeout;
+		goto out_sleep;
 	}
-
-	for (;;) {
-		spin_lock(&tbl->slot_tbl_lock);
-		/* The state manager will wait until the slot table is empty */
-		if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
-			goto out_sleep;
-
-		slot = nfs4_alloc_slot(tbl);
-		if (IS_ERR(slot)) {
-			/* Try again in 1/4 second */
-			if (slot == ERR_PTR(-ENOMEM))
-				task->tk_timeout = HZ >> 2;
-			goto out_sleep;
-		}
-		spin_unlock(&tbl->slot_tbl_lock);
-
-		if (likely(!slot->interrupted))
-			break;
-		nfs4_sequence_process_interrupted(client,
-				slot, task->tk_msg.rpc_cred);
-	}
+	spin_unlock(&tbl->slot_tbl_lock);
 
 	nfs4_sequence_attach_slot(args, res, slot);
 
 	trace_nfs4_setup_sequence(session, args);
 out_start:
+	nfs41_sequence_res_init(res);
 	rpc_call_start(task);
 	return 0;
-
+out_sleep_timeout:
+	/* Try again in 1/4 second */
+	if (args->sa_privileged)
+		rpc_sleep_on_priority_timeout(&tbl->slot_tbl_waitq, task,
+				jiffies + (HZ >> 2), RPC_PRIORITY_PRIVILEGED);
+	else
+		rpc_sleep_on_timeout(&tbl->slot_tbl_waitq, task,
+				NULL, jiffies + (HZ >> 2));
+	spin_unlock(&tbl->slot_tbl_lock);
+	return -EAGAIN;
 out_sleep:
 	if (args->sa_privileged)
 		rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
-				NULL, RPC_PRIORITY_PRIVILEGED);
+				RPC_PRIORITY_PRIVILEGED);
 	else
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 	spin_unlock(&tbl->slot_tbl_lock);
@@ -1039,14 +1073,26 @@
 	.rpc_call_done = nfs40_call_sync_done,
 };
 
+static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
+{
+	int ret;
+	struct rpc_task *task;
+
+	task = rpc_run_task(task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	ret = task->tk_status;
+	rpc_put_task(task);
+	return ret;
+}
+
 static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 				   struct nfs_server *server,
 				   struct rpc_message *msg,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res)
 {
-	int ret;
-	struct rpc_task *task;
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_call_sync_data data = {
 		.seq_server = server,
@@ -1060,14 +1106,7 @@
 		.callback_data = &data
 	};
 
-	task = rpc_run_task(&task_setup);
-	if (IS_ERR(task))
-		ret = PTR_ERR(task);
-	else {
-		ret = task->tk_status;
-		rpc_put_task(task);
-	}
-	return ret;
+	return nfs4_call_sync_custom(&task_setup);
 }
 
 int nfs4_call_sync(struct rpc_clnt *clnt,
@@ -1148,6 +1187,18 @@
 	return true;
 }
 
+static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx)
+{
+	return ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+}
+
+static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx)
+{
+	fmode_t ret = ctx->mode & (FMODE_READ|FMODE_WRITE);
+
+	return (ctx->mode & FMODE_EXEC) ? FMODE_READ | ret : ret;
+}
+
 static u32
 nfs4_map_atomic_open_share(struct nfs_server *server,
 		fmode_t fmode, int openflags)
@@ -1239,10 +1290,20 @@
 	atomic_inc(&sp->so_count);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
-	p->o_arg.umask = current_umask();
 	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
 	p->o_arg.share_access = nfs4_map_atomic_open_share(server,
 			fmode, flags);
+	if (flags & O_CREAT) {
+		p->o_arg.umask = current_umask();
+		p->o_arg.label = nfs4_label_copy(p->a_label, label);
+		if (c->sattr != NULL && c->sattr->ia_valid != 0) {
+			p->o_arg.u.attrs = &p->attrs;
+			memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
+
+			memcpy(p->o_arg.u.verifier.data, c->verf,
+					sizeof(p->o_arg.u.verifier.data));
+		}
+	}
 	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
 	 * will return permission denied for all bits until close */
 	if (!(flags & O_EXCL)) {
@@ -1266,7 +1327,6 @@
 	p->o_arg.server = server;
 	p->o_arg.bitmask = nfs4_bitmask(server, label);
 	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
-	p->o_arg.label = nfs4_label_copy(p->a_label, label);
 	switch (p->o_arg.claim) {
 	case NFS4_OPEN_CLAIM_NULL:
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
@@ -1279,13 +1339,6 @@
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 		p->o_arg.fh = NFS_FH(d_inode(dentry));
 	}
-	if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) {
-		p->o_arg.u.attrs = &p->attrs;
-		memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
-
-		memcpy(p->o_arg.u.verifier.data, c->verf,
-				sizeof(p->o_arg.u.verifier.data));
-	}
 	p->c_arg.fh = &p->o_res.fh;
 	p->c_arg.stateid = &p->o_res.stateid;
 	p->c_arg.seqid = p->o_arg.seqid;
@@ -1349,12 +1402,20 @@
 	return false;
 }
 
-static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+		int open_mode, enum open_claim_type4 claim)
 {
 	int ret = 0;
 
 	if (open_mode & (O_EXCL|O_TRUNC))
 		goto out;
+	switch (claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_FH:
+		goto out;
+	default:
+		break;
+	}
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
 			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
@@ -1379,8 +1440,6 @@
 		return 0;
 	if ((delegation->type & fmode) != fmode)
 		return 0;
-	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
-		return 0;
 	switch (claim) {
 	case NFS4_OPEN_CLAIM_NULL:
 	case NFS4_OPEN_CLAIM_FH:
@@ -1547,6 +1606,10 @@
 
 static void nfs_set_open_stateid_locked(struct nfs4_state *state,
 		const nfs4_stateid *stateid, nfs4_stateid *freeme)
+	__must_hold(&state->owner->so_lock)
+	__must_hold(&state->seqlock)
+	__must_hold(RCU)
 {
 	DEFINE_WAIT(wait);
 	int status = 0;
@@ -1623,6 +1686,14 @@
 	write_sequnlock(&state->seqlock);
 }
 
+static void nfs_state_clear_open_state_flags(struct nfs4_state *state)
+{
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	clear_bit(NFS_OPEN_STATE, &state->flags);
+}
+
 static void nfs_state_set_delegation(struct nfs4_state *state,
 		const nfs4_stateid *deleg_stateid,
 		fmode_t fmode)
@@ -1737,7 +1808,6 @@
 static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 {
 	struct nfs4_state *state = opendata->state;
-	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
 	int open_mode = opendata->o_arg.open_flags;
 	fmode_t fmode = opendata->o_arg.fmode;
@@ -1747,14 +1817,14 @@
 
 	for (;;) {
 		spin_lock(&state->owner->so_lock);
-		if (can_open_cached(state, fmode, open_mode)) {
+		if (can_open_cached(state, fmode, open_mode, claim)) {
 			update_open_stateflags(state, fmode);
 			spin_unlock(&state->owner->so_lock);
 			goto out_return_state;
 		}
 		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
-		delegation = rcu_dereference(nfsi->delegation);
+		delegation = nfs4_get_valid_delegation(state->inode);
 		if (!can_open_delegated(delegation, fmode, claim)) {
 			rcu_read_unlock();
 			break;
@@ -1777,7 +1847,7 @@
 out:
 	return ERR_PTR(ret);
 out_return_state:
-	atomic_inc(&state->count);
+	refcount_inc(&state->count);
 	return state;
 }
 
@@ -1847,9 +1917,10 @@
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 update:
-	update_open_stateid(state, &data->o_res.stateid, NULL,
-			    data->o_arg.fmode);
-	atomic_inc(&state->count);
+	if (!update_open_stateid(state, &data->o_res.stateid,
+				NULL, data->o_arg.fmode))
+		return ERR_PTR(-EAGAIN);
+	refcount_inc(&state->count);
 
 	return state;
 }
@@ -1887,7 +1958,7 @@
 		return ERR_CAST(inode);
 	if (data->state != NULL && data->state->inode == inode) {
 		state = data->state;
-		atomic_inc(&state->count);
+		refcount_inc(&state->count);
 	} else
 		state = nfs4_get_open_state(inode, data->owner);
 	iput(inode);
@@ -1913,8 +1984,11 @@
 
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
-	update_open_stateid(state, &data->o_res.stateid, NULL,
-			data->o_arg.fmode);
+	if (!update_open_stateid(state, &data->o_res.stateid,
+				NULL, data->o_arg.fmode)) {
+		nfs4_put_open_state(state);
+		state = ERR_PTR(-EAGAIN);
+	}
 out:
 	nfs_release_seqid(data->o_arg.seqid);
 	return state;
@@ -1933,23 +2007,41 @@
 	return ret;
 }
 
-static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
+static struct nfs_open_context *
+nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_open_context *ctx;
 
-	spin_lock(&state->inode->i_lock);
-	list_for_each_entry(ctx, &nfsi->open_files, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
 		if (ctx->state != state)
 			continue;
-		get_nfs_open_context(ctx);
-		spin_unlock(&state->inode->i_lock);
+		if ((ctx->mode & mode) != mode)
+			continue;
+		if (!get_nfs_open_context(ctx))
+			continue;
+		rcu_read_unlock();
 		return ctx;
 	}
-	spin_unlock(&state->inode->i_lock);
+	rcu_read_unlock();
 	return ERR_PTR(-ENOENT);
 }
 
+static struct nfs_open_context *
+nfs4_state_find_open_context(struct nfs4_state *state)
+{
+	struct nfs_open_context *ctx;
+
+	ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
+	if (!IS_ERR(ctx))
+		return ctx;
+	ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
+	if (!IS_ERR(ctx))
+		return ctx;
+	return nfs4_state_find_open_context_mode(state, FMODE_READ);
+}
+
 static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
 		struct nfs4_state *state, enum open_claim_type4 claim)
 {
@@ -1960,7 +2052,7 @@
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
-	atomic_inc(&state->count);
+	refcount_inc(&state->count);
 	return opendata;
 }
 
@@ -1996,13 +2088,7 @@
 {
 	int ret;
 
-	/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
-	clear_bit(NFS_O_RDWR_STATE, &state->flags);
-	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 	/* memory barrier prior to reading state->n_* */
-	clear_bit(NFS_DELEGATED_STATE, &state->flags);
-	clear_bit(NFS_OPEN_STATE, &state->flags);
 	smp_rmb();
 	ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
 	if (ret != 0)
@@ -2078,6 +2164,8 @@
 	ctx = nfs4_state_find_open_context(state);
 	if (IS_ERR(ctx))
 		return -EAGAIN;
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	nfs_state_clear_open_state_flags(state);
 	ret = nfs4_do_open_reclaim(ctx, state);
 	put_nfs_open_context(ctx);
 	return ret;
@@ -2093,18 +2181,17 @@
 		case -ENOENT:
 		case -EAGAIN:
 		case -ESTALE:
+		case -ETIMEDOUT:
 			break;
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 		case -NFS4ERR_DEADSESSION:
-			set_bit(NFS_DELEGATED_STATE, &state->flags);
 			nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
 			return -EAGAIN;
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
-			set_bit(NFS_DELEGATED_STATE, &state->flags);
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
@@ -2125,7 +2212,6 @@
 			return -EAGAIN;
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_GRACE:
-			set_bit(NFS_DELEGATED_STATE, &state->flags);
 			ssleep(1);
 			return -EAGAIN;
 		case -ENOMEM:
@@ -2141,8 +2227,7 @@
 }
 
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
-		struct nfs4_state *state, const nfs4_stateid *stateid,
-		fmode_t type)
+		struct nfs4_state *state, const nfs4_stateid *stateid)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_opendata *opendata;
@@ -2153,20 +2238,23 @@
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
-	nfs_state_clear_delegation(state);
-	switch (type & (FMODE_READ|FMODE_WRITE)) {
-	case FMODE_READ|FMODE_WRITE:
-	case FMODE_WRITE:
+	if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) {
 		err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
 		if (err)
-			break;
+			goto out;
+	}
+	if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) {
 		err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
 		if (err)
-			break;
-		/* Fall through */
-	case FMODE_READ:
-		err = nfs4_open_recover_helper(opendata, FMODE_READ);
+			goto out;
 	}
+	if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) {
+		err = nfs4_open_recover_helper(opendata, FMODE_READ);
+		if (err)
+			goto out;
+	}
+	nfs_state_clear_delegation(state);
+out:
 	nfs4_opendata_put(opendata);
 	return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err);
 }
@@ -2276,10 +2364,11 @@
 	if (data->state != NULL) {
 		struct nfs_delegation *delegation;
 
-		if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
+		if (can_open_cached(data->state, data->o_arg.fmode,
+					data->o_arg.open_flags, claim))
 			goto out_no_action;
 		rcu_read_lock();
-		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+		delegation = nfs4_get_valid_delegation(data->state->inode);
 		if (can_open_delegated(delegation, data->o_arg.fmode, claim))
 			goto unlock_no_action;
 		rcu_read_unlock();
@@ -2413,6 +2502,7 @@
 	if (!ctx) {
 		nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1);
 		data->is_recover = true;
+		task_setup_data.flags |= RPC_TASK_TIMEOUT;
 	} else {
 		nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0);
 		pnfs_lgopen_prepare(data, ctx);
@@ -2457,7 +2547,7 @@
  * Note that in the non-execute case, we want to turn off permission
  * checking if we just created a new file (POSIX open() semantics).
  */
-static int nfs4_opendata_access(struct rpc_cred *cred,
+static int nfs4_opendata_access(const struct cred *cred,
 				struct nfs4_opendata *opendata,
 				struct nfs4_state *state, fmode_t fmode,
 				int openflags)
@@ -2619,12 +2709,13 @@
 {
 	/* NFSv4.0 doesn't allow for delegation recovery on open expire */
 	nfs40_clear_delegation_stateid(state);
+	nfs_state_clear_open_state_flags(state);
 	return nfs4_open_expired(sp, state);
 }
 
 static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	return -NFS4ERR_BAD_STATEID;
 }
@@ -2632,7 +2723,7 @@
 #if defined(CONFIG_NFS_V4_1)
 static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	int status;
 
@@ -2661,13 +2752,13 @@
 	return -NFS4ERR_EXPIRED;
 }
 
-static void nfs41_check_delegation_stateid(struct nfs4_state *state)
+static int nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid stateid;
 	struct nfs_delegation *delegation;
-	struct rpc_cred *cred;
-	int status;
+	const struct cred *cred = NULL;
+	int status, ret = NFS_OK;
 
 	/* Get the delegation credential for use by test/free_stateid */
 	rcu_read_lock();
@@ -2675,30 +2766,42 @@
 	if (delegation == NULL) {
 		rcu_read_unlock();
 		nfs_state_clear_delegation(state);
-		return;
+		return NFS_OK;
 	}
 
 	nfs4_stateid_copy(&stateid, &delegation->stateid);
-	if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
-		rcu_read_unlock();
-		nfs_state_clear_delegation(state);
-		return;
-	}
 
 	if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
 				&delegation->flags)) {
 		rcu_read_unlock();
-		return;
+		return NFS_OK;
 	}
 
-	cred = get_rpccred(delegation->cred);
+	if (delegation->cred)
+		cred = get_cred(delegation->cred);
 	rcu_read_unlock();
 	status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
 	trace_nfs4_test_delegation_stateid(state, NULL, status);
 	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
 		nfs_finish_clear_delegation_stateid(state, &stateid);
+	else
+		ret = status;
 
-	put_rpccred(cred);
+	put_cred(cred);
+	return ret;
+}
+
+static void nfs41_delegation_recover_stateid(struct nfs4_state *state)
+{
+	nfs4_stateid tmp;
+
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags) &&
+	    nfs4_copy_delegation_stateid(state->inode, state->state,
+				&tmp, NULL) &&
+	    nfs4_stateid_match_other(&state->stateid, &tmp))
+		nfs_state_set_delegation(state, &tmp, state->state);
+	else
+		nfs_state_clear_delegation(state);
 }
 
 /**
@@ -2721,7 +2824,7 @@
 	spin_lock(&state->state_lock);
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
 		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+			const struct cred *cred = lsp->ls_state->owner->so_cred;
 
 			refcount_inc(&lsp->ls_count);
 			spin_unlock(&state->state_lock);
@@ -2765,24 +2868,15 @@
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->open_stateid;
-	struct rpc_cred *cred = state->owner->so_cred;
+	const struct cred *cred = state->owner->so_cred;
 	int status;
 
-	if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
-		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  {
-			if (nfs4_have_delegation(state->inode, state->state))
-				return NFS_OK;
-			return -NFS4ERR_OPENMODE;
-		}
+	if (test_bit(NFS_OPEN_STATE, &state->flags) == 0)
 		return -NFS4ERR_BAD_STATEID;
-	}
 	status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
 	trace_nfs4_test_open_stateid(state, NULL, status);
 	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
-		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-		clear_bit(NFS_O_RDWR_STATE, &state->flags);
-		clear_bit(NFS_OPEN_STATE, &state->flags);
+		nfs_state_clear_open_state_flags(state);
 		stateid->type = NFS4_INVALID_STATEID_TYPE;
 		return status;
 	}
@@ -2795,7 +2889,11 @@
 {
 	int status;
 
-	nfs41_check_delegation_stateid(state);
+	status = nfs41_check_delegation_stateid(state);
+	if (status != NFS_OK)
+		return status;
+	nfs41_delegation_recover_stateid(state);
+
 	status = nfs41_check_expired_locks(state);
 	if (status != NFS_OK)
 		return status;
@@ -2848,14 +2946,13 @@
 }
 
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
-		fmode_t fmode,
-		int flags,
-		struct nfs_open_context *ctx)
+		int flags, struct nfs_open_context *ctx)
 {
 	struct nfs4_state_owner *sp = opendata->owner;
 	struct nfs_server *server = sp->so_server;
 	struct dentry *dentry;
 	struct nfs4_state *state;
+	fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
 	unsigned int seq;
 	int ret;
 
@@ -2894,7 +2991,8 @@
 	/* Parse layoutget results before we check for access */
 	pnfs_parse_lgopen(state->inode, opendata->lgp, ctx);
 
-	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
+	ret = nfs4_opendata_access(sp->so_cred, opendata, state,
+			acc_mode, flags);
 	if (ret != 0)
 		goto out;
 
@@ -2905,7 +3003,8 @@
 	}
 
 out:
-	nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+	if (!opendata->cancelled)
+		nfs4_sequence_free_slot(&opendata->o_res.seq_res);
 	return ret;
 }
 
@@ -2923,9 +3022,9 @@
 	struct nfs_server       *server = NFS_SERVER(dir);
 	struct nfs4_opendata *opendata;
 	struct dentry *dentry = ctx->dentry;
-	struct rpc_cred *cred = ctx->cred;
+	const struct cred *cred = ctx->cred;
 	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
-	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+	fmode_t fmode = _nfs4_ctx_to_openmode(ctx);
 	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
 	struct iattr *sattr = c->sattr;
 	struct nfs4_label *label = c->label;
@@ -2971,7 +3070,7 @@
 	if (d_really_is_positive(dentry))
 		opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
 
-	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
+	status = _nfs4_open_and_get_state(opendata, flags, ctx);
 	if (status != 0)
 		goto err_free_label;
 	state = ctx->state;
@@ -3031,7 +3130,9 @@
 					int *opened)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct nfs4_state *res;
 	struct nfs4_open_createattrs c = {
 		.label = label,
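The ".interruptible = true" initializer, added to many exception initializers in this file, lets the NFS4ERR_DELAY retry path sleep interruptibly so a signal can abort an otherwise unbounded retry loop. A hedged sketch of the delay helper this flag presumably selects inside nfs4_handle_exception() (details illustrative, not the definitive implementation):

static int nfs4_delay_interruptible(long *timeout)
{
	might_sleep();

	freezable_schedule_timeout_interruptible(nfs4_update_delay(timeout));
	if (!signal_pending(current))
		return 0;
	return __fatal_signal_pending(current) ? -EINTR : -ERESTARTSYS;
}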
@@ -3093,7 +3194,7 @@
 static int _nfs4_do_setattr(struct inode *inode,
 			    struct nfs_setattrargs *arg,
 			    struct nfs_setattrres *res,
-			    struct rpc_cred *cred,
+			    const struct cred *cred,
 			    struct nfs_open_context *ctx)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -3103,7 +3204,7 @@
 		.rpc_resp	= res,
 		.rpc_cred	= cred,
 	};
-	struct rpc_cred *delegation_cred = NULL;
+	const struct cred *delegation_cred = NULL;
 	unsigned long timestamp = jiffies;
 	bool truncate;
 	int status;
@@ -3117,7 +3218,7 @@
 
 	if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
 		/* Use that stateid */
-	} else if (ctx != NULL) {
+	} else if (ctx != NULL && ctx->state) {
 		struct nfs_lock_context *l_ctx;
 		if (!nfs4_valid_open_stateid(ctx->state))
 			return -EBADF;
@@ -3138,14 +3239,14 @@
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
 
-	put_rpccred(delegation_cred);
+	put_cred(delegation_cred);
 	if (status == 0 && ctx != NULL)
 		renew_lease(server, timestamp);
 	trace_nfs4_setattr(inode, &arg->stateid, status);
 	return status;
 }
 
-static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
 			   struct nfs_open_context *ctx, struct nfs4_label *ilabel,
 			   struct nfs4_label *olabel)
@@ -3209,6 +3310,75 @@
 	return pnfs_wait_on_layoutreturn(inode, task);
 }
 
+/*
+ * Update the seqid of an open stateid
+ */
+static void nfs4_sync_open_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state)
+{
+	__be32 seqid_open;
+	u32 dst_seqid;
+	int seq;
+
+	for (;;) {
+		if (!nfs4_valid_open_stateid(state))
+			break;
+		seq = read_seqbegin(&state->seqlock);
+		if (!nfs4_state_match_open_stateid_other(state, dst)) {
+			nfs4_stateid_copy(dst, &state->open_stateid);
+			if (read_seqretry(&state->seqlock, seq))
+				continue;
+			break;
+		}
+		seqid_open = state->open_stateid.seqid;
+		if (read_seqretry(&state->seqlock, seq))
+			continue;
+
+		dst_seqid = be32_to_cpu(dst->seqid);
+		if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
+			dst->seqid = seqid_open;
+		break;
+	}
+}
+
+/*
+ * Update the seqid of an open stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state)
+{
+	__be32 seqid_open;
+	u32 dst_seqid;
+	bool ret;
+	int seq;
+
+	for (;;) {
+		ret = false;
+		if (!nfs4_valid_open_stateid(state))
+			break;
+		seq = read_seqbegin(&state->seqlock);
+		if (!nfs4_state_match_open_stateid_other(state, dst)) {
+			if (read_seqretry(&state->seqlock, seq))
+				continue;
+			break;
+		}
+		seqid_open = state->open_stateid.seqid;
+		if (read_seqretry(&state->seqlock, seq))
+			continue;
+
+		dst_seqid = be32_to_cpu(dst->seqid);
+		if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
+			dst->seqid = cpu_to_be32(dst_seqid + 1);
+		else
+			dst->seqid = seqid_open;
+		ret = true;
+		break;
+	}
+
+	return ret;
+}
+
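Both helpers above compare seqids with wraparound-safe serial-number arithmetic rather than a plain less-than, so a seqid that recently wrapped past 2^32 - 1 still compares as newer. The idiom in isolation (hypothetical helper, for illustration only):

static bool seqid_newer(u32 a, u32 b)
{
	return (s32)(a - b) > 0;
}

/*
 * seqid_newer(2, 1)          -> true
 * seqid_newer(1, 0xffffffff) -> true, since (s32)(1 - 0xffffffff) == 2
 * seqid_newer(1, 1)          -> false
 */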
 struct nfs4_closedata {
 	struct inode *inode;
 	struct nfs4_state *state;
@@ -3259,32 +3429,11 @@
 	trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
 
 	/* Handle Layoutreturn errors */
-	if (calldata->arg.lr_args && task->tk_status != 0) {
-		switch (calldata->res.lr_ret) {
-		default:
-			calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
-			break;
-		case 0:
-			calldata->arg.lr_args = NULL;
-			calldata->res.lr_res = NULL;
-			break;
-		case -NFS4ERR_OLD_STATEID:
-			if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid,
-						&calldata->arg.lr_args->range,
-						calldata->inode))
-				goto lr_restart;
-			/* Fallthrough */
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_EXPIRED:
-		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
-		case -NFS4ERR_WRONG_CRED:
-			calldata->arg.lr_args = NULL;
-			calldata->res.lr_res = NULL;
-			goto lr_restart;
-		}
-	}
+	if (pnfs_roc_done(task, calldata->inode,
+				&calldata->arg.lr_args,
+				&calldata->res.lr_res,
+				&calldata->res.lr_ret) == -EAGAIN)
+		goto out_restart;
 
 	/* hmm. we are done with the inode, and in the process of freeing
 	 * the state_owner. we keep this around to process errors
@@ -3304,7 +3453,7 @@
 			break;
 		case -NFS4ERR_OLD_STATEID:
 			/* Did we race with OPEN? */
-			if (nfs4_refresh_open_stateid(&calldata->arg.stateid,
+			if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
 						state))
 				goto out_restart;
 			goto out_release;
@@ -3316,7 +3465,9 @@
 					task->tk_msg.rpc_cred);
 			/* Fallthrough */
 		case -NFS4ERR_BAD_STATEID:
-			break;
+			if (calldata->arg.fmode == 0)
+				break;
+			/* Fallthrough */
 		default:
 			task->tk_status = nfs4_async_handle_exception(task,
 					server, task->tk_status, &exception);
@@ -3331,8 +3482,6 @@
 	nfs_refresh_inode(calldata->inode, &calldata->fattr);
 	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
 	return;
-lr_restart:
-	calldata->res.lr_ret = 0;
 out_restart:
 	task->tk_status = 0;
 	rpc_restart_call_prepare(task);
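The open-coded layoutreturn error switch deleted above (and its twin in the delegreturn path further down) is consolidated into pnfs_roc_done(), added to fs/nfs/pnfs.c elsewhere in this patch. Its contract, inferred from the call sites here and hedged accordingly: it returns -EAGAIN when the caller should zero tk_status and restart the RPC, and it may clear *argpp/*respp so the layoutreturn is not resent.

int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
		  struct nfs4_layoutreturn_args **argpp,
		  struct nfs4_layoutreturn_res **respp,
		  int *ret);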
@@ -3373,8 +3522,8 @@
 	} else if (is_rdwr)
 		calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
 
-	if (!nfs4_valid_open_stateid(state) ||
-	    !nfs4_refresh_open_stateid(&calldata->arg.stateid, state))
+	nfs4_sync_open_stateid(&calldata->arg.stateid, state);
+	if (!nfs4_valid_open_stateid(state))
 		call_close = 0;
 	spin_unlock(&state->owner->so_lock);
 
@@ -3539,9 +3688,9 @@
 	if (ctx->state == NULL)
 		return;
 	if (is_sync)
-		nfs4_close_sync(ctx->state, ctx->mode);
+		nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx));
 	else
-		nfs4_close_state(ctx->state, ctx->mode);
+		nfs4_close_state(ctx->state, _nfs4_ctx_to_openmode(ctx));
 }
 
 #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
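_nfs4_ctx_to_openmode() and _nfs4_ctx_to_accessmode(), used in the hunks above, are introduced by an earlier part of this patch that is not shown in this section. Their presumed shape: OPEN share modes can only express READ and WRITE, while the ACCESS check must additionally honour EXEC:

static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx)
{
	return ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
}

static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx)
{
	return ctx->mode & (FMODE_READ|FMODE_WRITE);
}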
@@ -3644,7 +3793,9 @@
 
 int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
@@ -3686,7 +3837,9 @@
 static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fsinfo *info)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_lookup_root(server, fhandle, info);
@@ -3761,7 +3914,7 @@
 	}
 
 	/*
-	 * -EACCESS could mean that the user doesn't have correct permissions
+	 * -EACCES could mean that the user doesn't have correct permissions
 	 * to access the mount.  It could also mean that we tried to mount
 	 * with a gss auth flavor, but rpc.gssd isn't running.  Either way,
 	 * existing mount programs don't handle -EACCES very well so it should
@@ -3913,7 +4066,9 @@
 				struct nfs_fattr *fattr, struct nfs4_label *label,
 				struct inode *inode)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_getattr(server, fhandle, fattr, label, inode);
@@ -3946,7 +4101,7 @@
 		  struct iattr *sattr)
 {
 	struct inode *inode = d_inode(dentry);
-	struct rpc_cred *cred = NULL;
+	const struct cred *cred = NULL;
 	struct nfs_open_context *ctx = NULL;
 	struct nfs4_label *label = NULL;
 	int status;
@@ -4036,7 +4191,9 @@
 				   const struct qstr *name, struct nfs_fh *fhandle,
 				   struct nfs_fattr *fattr, struct nfs4_label *label)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct rpc_clnt *client = *clnt;
 	int err;
 	do {
@@ -4140,7 +4297,9 @@
 static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
 			     struct nfs_fattr *fattr, struct nfs4_label *label)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_lookupp(inode, fhandle, fattr, label);
@@ -4175,7 +4334,6 @@
 			return -ENOMEM;
 		args.bitmask = server->cache_consistency_bitmask;
 	}
-
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (!status) {
 		nfs_access_set_mask(entry, res.access);
@@ -4188,7 +4346,9 @@
 
 static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_access(inode, entry);
@@ -4243,7 +4403,9 @@
 static int nfs4_proc_readlink(struct inode *inode, struct page *page,
 		unsigned int pgbase, unsigned int pglen)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_readlink(inode, page, pgbase, pglen);
@@ -4319,7 +4481,9 @@
 
 static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct inode *inode = d_inode(dentry);
 	int err;
 
@@ -4340,7 +4504,9 @@
 
 static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 
 	do {
@@ -4499,7 +4665,9 @@
 
 static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(inode),
@@ -4606,7 +4774,9 @@
 static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 		struct page *page, unsigned int len, struct iattr *sattr)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct nfs4_label l, *label = NULL;
 	int err;
 
@@ -4645,7 +4815,9 @@
 		struct iattr *sattr)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct nfs4_label l, *label = NULL;
 	int err;
 
@@ -4664,7 +4836,7 @@
 	return err;
 }
 
-static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
 		u64 cookie, struct page **pages, unsigned int count, bool plus)
 {
 	struct inode		*dir = d_inode(dentry);
@@ -4702,10 +4874,12 @@
 	return status;
 }
 
-static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
 		u64 cookie, struct page **pages, unsigned int count, bool plus)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_readdir(dentry, cred, cookie,
@@ -4756,7 +4930,9 @@
 		struct iattr *sattr, dev_t rdev)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	struct nfs4_label l, *label = NULL;
 	int err;
 
@@ -4798,7 +4974,9 @@
 
 static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
@@ -4829,7 +5007,9 @@
 
 static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	unsigned long now = jiffies;
 	int err;
 
@@ -4891,7 +5071,9 @@
 static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_pathconf *pathconf)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 
 	do {
@@ -5230,7 +5412,7 @@
 	.rpc_release = nfs4_renew_release,
 };
 
-static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+static int nfs4_proc_async_renew(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5254,7 +5436,7 @@
 			&nfs4_renew_ops, data);
 }
 
-static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5460,7 +5642,9 @@
 
 static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	ssize_t ret;
 	do {
 		ret = __nfs4_get_acl_uncached(inode, buf, buflen);
@@ -5594,7 +5778,9 @@
 static int nfs4_get_security_label(struct inode *inode, void *buf,
 					size_t buflen)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 
 	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -5669,7 +5855,6 @@
 {
 	struct nfs4_label ilabel, *olabel = NULL;
 	struct nfs_fattr fattr;
-	struct rpc_cred *cred;
 	int status;
 
 	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -5682,10 +5867,6 @@
 	ilabel.label = (char *)buf;
 	ilabel.len = buflen;
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return PTR_ERR(cred);
-
 	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
 	if (IS_ERR(olabel)) {
 		status = -PTR_ERR(olabel);
@@ -5698,7 +5879,6 @@
 
 	nfs4_label_free(olabel);
 out:
-	put_rpccred(cred);
 	return status;
 }
 #endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
@@ -5867,13 +6047,13 @@
  * @clp: state data structure
  * @program: RPC program for NFSv4 callback service
  * @port: IP port number for NFS4 callback service
- * @cred: RPC credential to use for this call
+ * @cred: credential to use for this call
  * @res: where to place the result
  *
  * Returns zero, a negative errno, or a negative NFS4ERR status code.
  */
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
-		unsigned short port, struct rpc_cred *cred,
+		unsigned short port, const struct cred *cred,
 		struct nfs4_setclientid_res *res)
 {
 	nfs4_verifier sc_verifier;
@@ -5888,13 +6068,12 @@
 		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
-	struct rpc_task *task;
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_setclientid_ops,
 		.callback_data = &setclientid,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status;
 
@@ -5921,17 +6100,13 @@
 	dprintk("NFS call  setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_owner_id);
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		status = PTR_ERR(task);
-		goto out;
-	}
-	status = task->tk_status;
+
+	status = nfs4_call_sync_custom(&task_setup_data);
 	if (setclientid.sc_cred) {
+		kfree(clp->cl_acceptor);
 		clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
 		put_rpccred(setclientid.sc_cred);
 	}
-	rpc_put_task(task);
 out:
 	trace_nfs4_setclientid(clp, status);
 	dprintk("NFS reply setclientid: %d\n", status);
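nfs4_call_sync_custom(), used here and in several conversions below, is a new helper added elsewhere in this patch. Judging by the call sites, it simply runs a caller-prepared task synchronously and returns its status, which lets state-management operations pass extra setup such as RPC_TASK_NO_ROUND_ROBIN; per its name, that flag keeps these calls on the main transport rather than rotating across trunked connections. A sketch under that assumption:

static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
{
	struct rpc_task *task;
	int ret;

	task = rpc_run_task(task_setup);
	if (IS_ERR(task))
		return PTR_ERR(task);

	ret = task->tk_status;
	rpc_put_task(task);
	return ret;
}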
@@ -5941,14 +6116,14 @@
 /**
  * nfs4_proc_setclientid_confirm - Confirm client ID
  * @clp: state data structure
- * @res: result of a previous SETCLIENTID
- * @cred: RPC credential to use for this call
+ * @arg: result of a previous SETCLIENTID
+ * @cred: credential to use for this call
  *
  * Returns zero, a negative errno, or a negative NFS4ERR status code.
  */
 int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 		struct nfs4_setclientid_res *arg,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
@@ -5960,7 +6135,8 @@
 	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_clientid);
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_setclientid_confirm(clp, status);
 	dprintk("NFS reply setclientid_confirm: %d\n", status);
 	return status;
@@ -5998,32 +6174,11 @@
 	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
 
 	/* Handle Layoutreturn errors */
-	if (data->args.lr_args && task->tk_status != 0) {
-		switch(data->res.lr_ret) {
-		default:
-			data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
-			break;
-		case 0:
-			data->args.lr_args = NULL;
-			data->res.lr_res = NULL;
-			break;
-		case -NFS4ERR_OLD_STATEID:
-			if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid,
-						&data->args.lr_args->range,
-						data->inode))
-				goto lr_restart;
-			/* Fallthrough */
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_EXPIRED:
-		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
-		case -NFS4ERR_WRONG_CRED:
-			data->args.lr_args = NULL;
-			data->res.lr_res = NULL;
-			goto lr_restart;
-		}
-	}
+	if (pnfs_roc_done(task, data->inode,
+				&data->args.lr_args,
+				&data->res.lr_res,
+				&data->res.lr_ret) == -EAGAIN)
+		goto out_restart;
 
 	switch (task->tk_status) {
 	case 0:
@@ -6061,8 +6216,6 @@
 	}
 	data->rpc_status = task->tk_status;
 	return;
-lr_restart:
-	data->res.lr_ret = 0;
 out_restart:
 	task->tk_status = 0;
 	rpc_restart_call_prepare(task);
@@ -6111,7 +6264,7 @@
 	.rpc_release = nfs4_delegreturn_release,
 };
 
-static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
 {
 	struct nfs4_delegreturndata *data;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -6178,7 +6331,7 @@
 	return status;
 }
 
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = { };
@@ -6241,7 +6394,9 @@
 
 static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 
 	do {
@@ -6253,6 +6408,42 @@
 	return err;
 }
 
+/*
+ * Update the seqid of a lock stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret = false;
+
+	spin_lock(&state->state_lock);
+	if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid))
+		goto out;
+	if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
+		nfs4_stateid_seqid_inc(dst);
+	else
+		dst->seqid = lsp->ls_stateid.seqid;
+	ret = true;
+out:
+	spin_unlock(&state->state_lock);
+	return ret;
+}
+
+static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret;
+
+	spin_lock(&state->state_lock);
+	ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
+	nfs4_stateid_copy(dst, &lsp->ls_stateid);
+	spin_unlock(&state->state_lock);
+	return ret;
+}
+
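nfs4_stateid_is_newer() and nfs4_stateid_seqid_inc(), called above, come from this series' header changes. Hedged sketches, assuming the RFC 5661 rule that seqid 0 means "the current revision" and therefore must be skipped when an increment wraps:

static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1,
					 const nfs4_stateid *s2)
{
	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
}

static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
{
	u32 seqid = be32_to_cpu(s1->seqid);

	if (++seqid == 0)	/* skip the reserved value 0 on wrap */
		++seqid;
	s1->seqid = cpu_to_be32(seqid);
}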
 struct nfs4_unlockdata {
 	struct nfs_locku_args arg;
 	struct nfs_locku_res res;
@@ -6270,7 +6461,8 @@
 		struct nfs_seqid *seqid)
 {
 	struct nfs4_unlockdata *p;
-	struct inode *inode = lsp->ls_state->inode;
+	struct nfs4_state *state = lsp->ls_state;
+	struct inode *inode = state->inode;
 
 	p = kzalloc(sizeof(*p), GFP_NOFS);
 	if (p == NULL)
@@ -6280,12 +6472,15 @@
 	p->arg.seqid = seqid;
 	p->res.seqid = seqid;
 	p->lsp = lsp;
-	refcount_inc(&lsp->ls_count);
 	/* Ensure we don't close file until we're done freeing locks! */
 	p->ctx = get_nfs_open_context(ctx);
 	p->l_ctx = nfs_get_lock_context(ctx);
-	memcpy(&p->fl, fl, sizeof(p->fl));
+	locks_init_lock(&p->fl);
+	locks_copy_lock(&p->fl, fl);
 	p->server = NFS_SERVER(inode);
+	spin_lock(&state->state_lock);
+	nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid);
+	spin_unlock(&state->state_lock);
 	return p;
 }
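Replacing the memcpy() of the struct file_lock matters because that structure embeds list heads and other live state that must never be byte-copied. The two-step idiom initializes the destination first, then copies only the lock description:

struct file_lock dst;

locks_init_lock(&dst);		/* set up the embedded list heads */
locks_copy_lock(&dst, fl);	/* then copy owner, range, type, ops */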
 
@@ -6324,10 +6519,14 @@
 					task->tk_msg.rpc_cred);
 			/* Fall through */
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
-			if (!nfs4_stateid_match(&calldata->arg.stateid,
-						&calldata->lsp->ls_stateid))
+			if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
+						calldata->lsp))
+				rpc_restart_call_prepare(task);
+			break;
+		case -NFS4ERR_OLD_STATEID:
+			if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
+						calldata->lsp))
 				rpc_restart_call_prepare(task);
 			break;
 		default:
@@ -6350,7 +6549,6 @@
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		goto out_wait;
-	nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
 	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		goto out_no_action;
@@ -6504,9 +6702,9 @@
 	p->res.lock_seqid = p->arg.lock_seqid;
 	p->lsp = lsp;
 	p->server = server;
-	refcount_inc(&lsp->ls_count);
 	p->ctx = get_nfs_open_context(ctx);
-	memcpy(&p->fl, fl, sizeof(p->fl));
+	locks_init_lock(&p->fl);
+	locks_copy_lock(&p->fl, fl);
 	return p;
 out_free_seqid:
 	nfs_free_seqid(p->arg.open_seqid);
@@ -6805,6 +7003,7 @@
 	struct nfs4_exception exception = {
 		.state = state,
 		.inode = state->inode,
+		.interruptible = true,
 	};
 	int err;
 
@@ -6845,7 +7044,6 @@
 	struct task_struct	*task;
 	struct inode		*inode;
 	struct nfs_lowner	*owner;
-	bool			notified;
 };
 
 static int
@@ -6867,13 +7065,13 @@
 		/* Make sure it's for the right inode */
 		if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
 			return 0;
-
-		waiter->notified = true;
 	}
 
 	/* override "private" so we can use default_wake_function */
 	wait->private = waiter->task;
-	ret = autoremove_wake_function(wait, mode, flags, key);
+	ret = woken_wake_function(wait, mode, flags, key);
+	if (ret)
+		list_del_init(&wait->entry);
 	wait->private = waiter;
 	return ret;
 }
@@ -6882,7 +7080,6 @@
 nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	int status = -ERESTARTSYS;
-	unsigned long flags;
 	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs_client *clp = server->nfs_client;
@@ -6892,8 +7089,7 @@
 				    .s_dev = server->s_dev };
 	struct nfs4_lock_waiter waiter = { .task  = current,
 					   .inode = state->inode,
-					   .owner = &owner,
-					   .notified = false };
+					   .owner = &owner };
 	wait_queue_entry_t wait;
 
 	/* Don't bother with waitqueue if we don't expect a callback */
@@ -6903,27 +7099,22 @@
 	init_wait(&wait);
 	wait.private = &waiter;
 	wait.func = nfs4_wake_lock_waiter;
-	add_wait_queue(q, &wait);
 
 	while(!signalled()) {
-		waiter.notified = false;
+		add_wait_queue(q, &wait);
 		status = nfs4_proc_setlk(state, cmd, request);
-		if ((status != -EAGAIN) || IS_SETLK(cmd))
+		if ((status != -EAGAIN) || IS_SETLK(cmd)) {
+			finish_wait(q, &wait);
 			break;
+		}
 
 		status = -ERESTARTSYS;
-		spin_lock_irqsave(&q->lock, flags);
-		if (waiter.notified) {
-			spin_unlock_irqrestore(&q->lock, flags);
-			continue;
-		}
-		set_current_state(TASK_INTERRUPTIBLE);
-		spin_unlock_irqrestore(&q->lock, flags);
-
-		freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT);
+		freezer_do_not_count();
+		wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+		freezer_count();
+		finish_wait(q, &wait);
 	}
 
-	finish_wait(q, &wait);
 	return status;
 }
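The rewrite above retires the hand-rolled notified flag and set_current_state() sequence in favour of the woken_wake_function()/wait_woken() pair, which closes the race where a CB_NOTIFY_LOCK wakeup arrives between the failed SETLK attempt and the sleep. The generic pattern, with condition() as a placeholder:

DEFINE_WAIT_FUNC(wait, woken_wake_function);

add_wait_queue(q, &wait);
while (!condition()) {
	if (signal_pending(current))
		break;
	/* returns immediately if a wakeup already arrived */
	wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
}
remove_wait_queue(q, &wait);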
 #else /* !CONFIG_NFS_V4_1 */
@@ -7218,7 +7409,9 @@
 			   struct nfs4_fs_locations *fs_locations,
 			   struct page *page)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs4_proc_fs_locations(client, dir, name,
@@ -7239,7 +7432,7 @@
  */
 static int _nfs40_proc_get_locations(struct inode *inode,
 				     struct nfs4_fs_locations *locations,
-				     struct page *page, struct rpc_cred *cred)
+				     struct page *page, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_clnt *clnt = server->client;
@@ -7296,7 +7489,7 @@
  */
 static int _nfs41_proc_get_locations(struct inode *inode,
 				     struct nfs4_fs_locations *locations,
-				     struct page *page, struct rpc_cred *cred)
+				     struct page *page, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_clnt *clnt = server->client;
@@ -7355,13 +7548,15 @@
  */
 int nfs4_proc_get_locations(struct inode *inode,
 			    struct nfs4_fs_locations *locations,
-			    struct page *page, struct rpc_cred *cred)
+			    struct page *page, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_client *clp = server->nfs_client;
 	const struct nfs4_mig_recovery_ops *ops =
 					clp->cl_mvops->mig_recovery_ops;
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int status;
 
 	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
@@ -7386,7 +7581,7 @@
  * is appended to this compound to identify the client ID which is
  * performing recovery.
  */
-static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+static int _nfs40_proc_fsid_present(struct inode *inode, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
@@ -7432,7 +7627,7 @@
  * this operation is identified in the SEQUENCE operation in this
  * compound.
  */
-static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+static int _nfs41_proc_fsid_present(struct inode *inode, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_clnt *clnt = server->client;
@@ -7479,13 +7674,15 @@
  *  NFS4ERR code if some error occurred on the server, or a
  *  negative errno if a local failure occurred.
  */
-int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_client *clp = server->nfs_client;
 	const struct nfs4_mig_recovery_ops *ops =
 					clp->cl_mvops->mig_recovery_ops;
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int status;
 
 	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
@@ -7503,7 +7700,7 @@
 	return status;
 }
 
-/**
+/*
- * If 'use_integrity' is true and the state managment nfs_client
+ * If 'use_integrity' is true and the state management nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
  * and the machine credential as per RFC3530bis and RFC5661 Security
@@ -7513,6 +7710,8 @@
 static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
 {
 	int status;
+	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+	struct nfs_client *clp = NFS_SERVER(dir)->nfs_client;
 	struct nfs4_secinfo_arg args = {
 		.dir_fh = NFS_FH(dir),
 		.name   = name,
@@ -7525,34 +7724,46 @@
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
-	struct rpc_cred *cred = NULL;
+	struct nfs4_call_sync_data data = {
+		.seq_server = NFS_SERVER(dir),
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = clp->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
+	const struct cred *cred = NULL;
 
 	if (use_integrity) {
-		clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
-		cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+		clnt = clp->cl_rpcclient;
+		task_setup.rpc_client = clnt;
+
+		cred = nfs4_get_clid_cred(clp);
 		msg.rpc_cred = cred;
 	}
 
 	dprintk("NFS call  secinfo %s\n", name->name);
 
-	nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
-		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+	nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
 
-	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
-				&res.seq_res, 0);
 	dprintk("NFS reply  secinfo: %d\n", status);
 
-	if (cred)
-		put_rpccred(cred);
-
+	put_cred(cred);
 	return status;
 }
 
 int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
 		      struct nfs4_secinfo_flavors *flavors)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = -NFS4ERR_WRONGSEC;
@@ -7625,7 +7836,7 @@
 int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
 		struct rpc_xprt *xprt,
 		struct nfs_client *clp,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	int status;
 	struct nfs41_bind_conn_to_session_args args = {
@@ -7687,7 +7898,7 @@
 
 struct rpc_bind_conn_calldata {
 	struct nfs_client *clp;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 };
 
 static int
@@ -7700,7 +7911,7 @@
 	return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
 }
 
-int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, const struct cred *cred)
 {
 	struct rpc_bind_conn_calldata data = {
 		.clp = clp,
@@ -7866,7 +8077,7 @@
  * Wrapper for EXCHANGE_ID operation.
  */
 static struct rpc_task *
-nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
 			u32 sp4_how, struct rpc_xprt *xprt)
 {
 	struct rpc_message msg = {
@@ -7877,7 +8088,7 @@
 		.rpc_client = clp->cl_rpcclient,
 		.callback_ops = &nfs4_exchange_id_call_ops,
 		.rpc_message = &msg,
-		.flags = RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
 	};
 	struct nfs41_exchange_id_data *calldata;
 	int status;
@@ -7962,7 +8173,7 @@
  *
  * Wrapper for EXCHANGE_ID operation.
  */
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred,
 			u32 sp4_how)
 {
 	struct rpc_task *task;
@@ -8029,7 +8240,7 @@
  *
  * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
  */
-int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
 {
 	rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
 	int status;
@@ -8061,7 +8272,7 @@
  * @xprt: the rpc_xprt to test
  * @data: call data for _nfs4_proc_exchange_id.
  */
-int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
 			    void *data)
 {
 	struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
@@ -8078,20 +8289,22 @@
 	/* Test connection for session trunking. Async exchange_id call */
 	task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
 	if (IS_ERR(task))
-		return PTR_ERR(task);
+		return;
 
 	status = task->tk_status;
 	if (status == 0)
 		status = nfs4_detect_session_trunking(adata->clp,
 				task->tk_msg.rpc_resp, xprt);
 
+	if (status == 0)
+		rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+
 	rpc_put_task(task);
-	return status;
 }
 EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
 
 static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
@@ -8100,7 +8313,8 @@
 	};
 	int status;
 
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_clientid(clp, status);
 	if (status)
 		dprintk("NFS: Got error %d from the server %s on "
@@ -8109,7 +8323,7 @@
 }
 
 static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	unsigned int loop;
 	int ret;
@@ -8130,7 +8344,7 @@
 
 int nfs4_destroy_clientid(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	int ret = 0;
 
 	if (clp->cl_mvops->minor_version < 1)
@@ -8141,8 +8355,7 @@
 		goto out;
 	cred = nfs4_get_clid_cred(clp);
 	ret = nfs4_proc_destroy_clientid(clp, cred);
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 	switch (ret) {
 	case 0:
 	case -NFS4ERR_STALE_CLIENTID:
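Conversions like the one above drop the NULL check that guarded put_rpccred(). That is only safe because put_cred() in this kernel tolerates a NULL argument; roughly, paraphrasing include/linux/cred.h:

static inline void put_cred(const struct cred *_cred)
{
	struct cred *cred = (struct cred *) _cred;

	if (cred) {	/* NULL is explicitly allowed */
		validate_creds(cred);
		if (atomic_dec_and_test(&cred->usage))
			__put_cred(cred);
	}
}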
@@ -8152,6 +8365,8 @@
 	return ret;
 }
 
+#endif /* CONFIG_NFS_V4_1 */
+
 struct nfs4_get_lease_time_data {
 	struct nfs4_get_lease_time_args *args;
 	struct nfs4_get_lease_time_res *res;
@@ -8184,7 +8399,7 @@
 			(struct nfs4_get_lease_time_data *)calldata;
 
 	dprintk("--> %s\n", __func__);
-	if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+	if (!nfs4_sequence_done(task, &data->res->lr_seq_res))
 		return;
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
@@ -8207,7 +8422,6 @@
 
 int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 {
-	struct rpc_task *task;
 	struct nfs4_get_lease_time_args args;
 	struct nfs4_get_lease_time_res res = {
 		.lr_fsinfo = fsinfo,
@@ -8229,19 +8443,13 @@
 		.callback_data = &data,
 		.flags = RPC_TASK_TIMEOUT,
 	};
-	int status;
 
 	nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
-	task = rpc_run_task(&task_setup);
-
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-
-	status = task->tk_status;
-	rpc_put_task(task);
-	return status;
+	return nfs4_call_sync_custom(&task_setup);
 }
 
+#ifdef CONFIG_NFS_V4_1
+
 /*
  * Initialize the values to be used by the client in CREATE_SESSION
  * If nfs4_init_session set the fore channel request and response sizes,
@@ -8256,6 +8464,7 @@
 {
 	unsigned int max_rqst_sz, max_resp_sz;
 	unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
+	unsigned int max_bc_slots = rpc_num_bc_slots(clnt);
 
 	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
 	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -8278,6 +8487,8 @@
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
 	args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
+	if (args->bc_attrs.max_reqs > max_bc_slots)
+		args->bc_attrs.max_reqs = max_bc_slots;
 
 	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -8358,7 +8569,7 @@
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
@@ -8380,7 +8591,8 @@
 	nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
 	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_create_session(clp, status);
 
 	switch (status) {
@@ -8410,7 +8622,7 @@
  * It is the responsibility of the caller to verify the session is
  * expired before calling this routine.
  */
-int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
 {
 	int status;
 	unsigned *ptr;
@@ -8441,7 +8653,7 @@
  * The caller must serialize access to this routine.
  */
 int nfs4_proc_destroy_session(struct nfs4_session *session,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
@@ -8456,7 +8668,8 @@
 	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
 		return 0;
 
-	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
 	trace_nfs4_destroy_session(session->clp, status);
 
 	if (status)
@@ -8543,7 +8756,7 @@
 };
 
 static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
-		struct rpc_cred *cred,
+		const struct cred *cred,
 		struct nfs4_slot *slot,
 		bool is_privileged)
 {
@@ -8586,7 +8799,7 @@
 	return ret;
 }
 
-static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+static int nfs41_proc_async_sequence(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
 {
 	struct rpc_task *task;
 	int ret = 0;
@@ -8602,7 +8815,7 @@
 	return ret;
 }
 
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_sequence(struct nfs_client *clp, const struct cred *cred)
 {
 	struct rpc_task *task;
 	int ret;
@@ -8698,10 +8911,9 @@
  * Issue a global reclaim complete.
  */
 static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct nfs4_reclaim_complete_data *calldata;
-	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
 		.rpc_cred = cred,
@@ -8710,7 +8922,7 @@
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_reclaim_complete_call_ops,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status = -ENOMEM;
 
@@ -8725,15 +8937,7 @@
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		status = PTR_ERR(task);
-		goto out;
-	}
-	status = rpc_wait_for_completion_task(task);
-	if (status == 0)
-		status = task->tk_status;
-	rpc_put_task(task);
+	status = nfs4_call_sync_custom(&task_setup_data);
 out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
@@ -8913,10 +9117,12 @@
 	if (status != 0)
 		goto out;
 
-	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
-	if (task->tk_status < 0 || lgp->res.layoutp->len == 0) {
+	if (task->tk_status < 0) {
 		status = nfs4_layoutget_handle_exception(task, lgp, &exception);
 		*timeout = exception.timeout;
+	} else if (lgp->res.layoutp->len == 0) {
+		status = -EAGAIN;
+		*timeout = nfs4_update_delay(&exception.timeout);
 	} else
 		lseg = pnfs_layout_process(lgp);
 out:
@@ -8957,10 +9163,19 @@
 	if (!nfs41_sequence_process(task, &lrp->res.seq_res))
 		return;
 
+	/*
+	 * Was there an RPC level error? Assume the call succeeded,
+	 * and that we need to release the layout
+	 */
+	if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
+		lrp->res.lrs_present = 0;
+		return;
+	}
+
 	server = NFS_SERVER(lrp->args.inode);
 	switch (task->tk_status) {
 	case -NFS4ERR_OLD_STATEID:
-		if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid,
+		if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
 					&lrp->args.range,
 					lrp->args.inode))
 			goto out_restart;
@@ -9051,7 +9266,7 @@
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 		struct pnfs_device *pdev,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct nfs4_getdeviceinfo_args args = {
 		.pdev = pdev,
@@ -9083,7 +9298,7 @@
 
 int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 		struct pnfs_device *pdev,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct nfs4_exception exception = { };
 	int err;
@@ -9140,7 +9355,7 @@
 	pnfs_cleanup_layoutcommit(data);
 	nfs_post_op_update_inode_force_wcc(data->args.inode,
 					   data->res.fattr);
-	put_rpccred(data->cred);
+	put_cred(data->cred);
 	nfs_iput_and_deactive(data->inode);
 	kfree(data);
 }
@@ -9195,7 +9410,7 @@
 	return status;
 }
 
-/**
+/*
- * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * Use the state management nfs_client cl_rpcclient, which uses krb5i (if
  * possible) as per RFC3530bis and RFC5661 Security Considerations sections
  */
@@ -9216,22 +9431,35 @@
 		.rpc_resp = &res,
 	};
 	struct rpc_clnt *clnt = server->client;
-	struct rpc_cred *cred = NULL;
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
+	const struct cred *cred = NULL;
 	int status;
 
 	if (use_integrity) {
 		clnt = server->nfs_client->cl_rpcclient;
+		task_setup.rpc_client = clnt;
+
 		cred = nfs4_get_clid_cred(server->nfs_client);
 		msg.rpc_cred = cred;
 	}
 
 	dprintk("--> %s\n", __func__);
-	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
 	dprintk("<-- %s status=%d\n", __func__, status);
 
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 
 	return status;
 }
@@ -9240,7 +9468,9 @@
 nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 			   struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		/* first try using integrity protection */
@@ -9344,7 +9574,7 @@
 
 static int _nfs41_test_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	int status;
 	struct nfs41_test_stateid_args args = {
@@ -9405,9 +9635,11 @@
  */
 static int nfs41_test_stateid(struct nfs_server *server,
 		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const struct cred *cred)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
 	int err;
 	do {
 		err = _nfs41_test_stateid(server, stateid, cred);
@@ -9461,13 +9693,13 @@
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
  * @cred: credential
- * @is_recovery: set to true if this call needs to be privileged
+ * @privileged: set to true if this call needs to be privileged
  *
  * Note: this function is always asynchronous.
  */
 static int nfs41_free_stateid(struct nfs_server *server,
 		const nfs4_stateid *stateid,
-		struct rpc_cred *cred,
+		const struct cred *cred,
 		bool privileged)
 {
 	struct rpc_message msg = {
@@ -9508,7 +9740,7 @@
 static void
 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+	const struct cred *cred = lsp->ls_state->owner->so_cred;
 
 	nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
 	nfs4_free_lock_state(server, lsp);
@@ -9579,14 +9811,14 @@
 
 static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
 	.sched_state_renewal = nfs4_proc_async_renew,
-	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
+	.get_state_renewal_cred = nfs4_get_renew_cred,
 	.renew_lease = nfs4_proc_renew,
 };
 
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 	.sched_state_renewal = nfs41_proc_async_sequence,
-	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
+	.get_state_renewal_cred = nfs4_get_machine_cred,
 	.renew_lease = nfs4_proc_sequence,
 };
 #endif
@@ -9668,7 +9900,8 @@
 		| NFS_CAP_DEALLOCATE
 		| NFS_CAP_SEEK
 		| NFS_CAP_LAYOUTSTATS
-		| NFS_CAP_CLONE,
+		| NFS_CAP_CLONE
+		| NFS_CAP_LAYOUTERROR,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1f8c2ae..6ea431b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -57,7 +57,7 @@
 	const struct nfs4_state_maintenance_ops *ops;
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	long lease;
 	unsigned long last, now;
 	unsigned renew_flags = 0;
@@ -68,7 +68,6 @@
 	if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
 		goto out;
 
-	spin_lock(&clp->cl_lock);
 	lease = clp->cl_lease_time;
 	last = clp->cl_last_renewal;
 	now = jiffies;
@@ -79,8 +78,7 @@
 		renew_flags |= NFS4_RENEW_DELEGATION_CB;
 
 	if (renew_flags != 0) {
-		cred = ops->get_state_renewal_cred_locked(clp);
-		spin_unlock(&clp->cl_lock);
+		cred = ops->get_state_renewal_cred(clp);
 		if (cred == NULL) {
 			if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
 				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
@@ -92,7 +90,7 @@
 
 			/* Queue an asynchronous RENEW. */
 			ret = ops->sched_state_renewal(clp, cred, renew_flags);
-			put_rpccred(cred);
+			put_cred(cred);
 			switch (ret) {
 			default:
 				goto out_exp;
@@ -104,7 +102,6 @@
 	} else {
 		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
 				__func__);
-		spin_unlock(&clp->cl_lock);
 	}
 	nfs4_schedule_state_renewal(clp);
 out_exp:
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 769b856..4145a01 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/nfs/nfs4session.c
  *
@@ -55,7 +56,7 @@
 
 /**
  * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
- * @tbl - controlling slot table
+ * @tbl: controlling slot table
  *
  */
 void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
@@ -110,6 +111,8 @@
 		slot->table = tbl;
 		slot->slot_nr = slotid;
 		slot->seq_nr = seq_init;
+		slot->seq_nr_highest_sent = seq_init;
+		slot->seq_nr_last_acked = seq_init - 1;
 	}
 	return slot;
 }
@@ -276,7 +279,8 @@
 	p = &tbl->slots;
 	while (*p) {
 		(*p)->seq_nr = ivalue;
-		(*p)->interrupted = 0;
+		(*p)->seq_nr_highest_sent = ivalue;
+		(*p)->seq_nr_last_acked = ivalue - 1;
 		p = &(*p)->next;
 	}
 	tbl->highest_used_slotid = NFS4_NO_SLOT;
@@ -573,12 +577,11 @@
 void nfs4_destroy_session(struct nfs4_session *session)
 {
 	struct rpc_xprt *xprt;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
 	cred = nfs4_get_clid_cred(session->clp);
 	nfs4_proc_destroy_session(session, cred);
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 
 	rcu_read_lock();
 	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3c550f2..b996ee2 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -10,7 +10,7 @@
 
 /* maximum number of slots to use */
 #define NFS4_DEF_SLOT_TABLE_SIZE (64U)
-#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
 #define NFS4_MAX_SLOT_TABLE (1024U)
 #define NFS4_NO_SLOT ((u32)-1)
 
@@ -23,8 +23,9 @@
 	unsigned long		generation;
 	u32			slot_nr;
 	u32		 	seq_nr;
-	unsigned int		interrupted : 1,
-				privileged : 1,
+	u32		 	seq_nr_last_acked;
+	u32		 	seq_nr_highest_sent;
+	unsigned int		privileged : 1,
 				seq_done : 1;
 };
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d2f645d..0c6d53d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -87,7 +87,28 @@
 
 static DEFINE_MUTEX(nfs_clid_init_mutex);
 
-int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_setup_state_renewal(struct nfs_client *clp)
+{
+	int status;
+	struct nfs_fsinfo fsinfo;
+	unsigned long now;
+
+	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+		nfs4_schedule_state_renewal(clp);
+		return 0;
+	}
+
+	now = jiffies;
+	status = nfs4_proc_get_lease_time(clp, &fsinfo);
+	if (status == 0) {
+		nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
+		nfs4_schedule_state_renewal(clp);
+	}
+
+	return status;
+}
+
+int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred)
 {
 	struct nfs4_setclientid_res clid = {
 		.clientid = clp->cl_clientid,
@@ -114,7 +135,7 @@
 	if (status != 0)
 		goto out;
 	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-	nfs4_schedule_state_renewal(clp);
+	nfs4_setup_state_renewal(clp);
 out:
 	return status;
 }
@@ -134,7 +155,7 @@
  */
 int nfs40_discover_server_trunking(struct nfs_client *clp,
 				   struct nfs_client **result,
-				   struct rpc_cred *cred)
+				   const struct cred *cred)
 {
 	struct nfs4_setclientid_res clid = {
 		.clientid = clp->cl_clientid,
@@ -159,37 +180,32 @@
 		/* Sustain the lease, even if it's empty.  If the clientid4
 		 * goes stale it's of no use for trunking discovery. */
 		nfs4_schedule_state_renewal(*result);
+
+		/* If the client state need to recover, do it. */
+		/* If the client state needs to recover, do it. */
+			nfs4_schedule_state_manager(clp);
 	}
 out:
 	return status;
 }
 
-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp)
 {
-	struct rpc_cred *cred = NULL;
-
-	if (clp->cl_machine_cred != NULL)
-		cred = get_rpccred(clp->cl_machine_cred);
-	return cred;
+	return get_cred(rpc_machine_cred());
 }
 
 static void nfs4_root_machine_cred(struct nfs_client *clp)
 {
-	struct rpc_cred *cred, *new;
 
-	new = rpc_lookup_machine_cred(NULL);
-	spin_lock(&clp->cl_lock);
-	cred = clp->cl_machine_cred;
-	clp->cl_machine_cred = new;
-	spin_unlock(&clp->cl_lock);
-	if (cred != NULL)
-		put_rpccred(cred);
+	/* Force root creds instead of machine */
+	clp->cl_principal = NULL;
+	clp->cl_rpcclient->cl_principal = NULL;
 }
 
-static struct rpc_cred *
+static const struct cred *
 nfs4_get_renew_cred_server_locked(struct nfs_server *server)
 {
-	struct rpc_cred *cred = NULL;
+	const struct cred *cred = NULL;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 
@@ -199,29 +215,30 @@
 		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
 		if (list_empty(&sp->so_states))
 			continue;
-		cred = get_rpccred(sp->so_cred);
+		cred = get_cred(sp->so_cred);
 		break;
 	}
 	return cred;
 }
 
 /**
- * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * nfs4_get_renew_cred - Acquire credential for a renew operation
  * @clp: client state handle
  *
- * Returns an rpc_cred with reference count bumped, or NULL.
- * Caller must hold clp->cl_lock.
+ * Returns a cred with reference count bumped, or NULL.
+ * Takes and releases clp->cl_lock internally, so callers must not hold it.
  */
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp)
 {
-	struct rpc_cred *cred = NULL;
+	const struct cred *cred = NULL;
 	struct nfs_server *server;
 
 	/* Use machine credentials if available */
-	cred = nfs4_get_machine_cred_locked(clp);
+	cred = nfs4_get_machine_cred(clp);
 	if (cred != NULL)
 		goto out;
 
+	spin_lock(&clp->cl_lock);
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		cred = nfs4_get_renew_cred_server_locked(server);
@@ -229,6 +246,7 @@
 			break;
 	}
 	rcu_read_unlock();
+	spin_unlock(&clp->cl_lock);
 
 out:
 	return cred;
@@ -289,37 +307,16 @@
 
 #if defined(CONFIG_NFS_V4_1)
 
-static int nfs41_setup_state_renewal(struct nfs_client *clp)
-{
-	int status;
-	struct nfs_fsinfo fsinfo;
-	unsigned long now;
-
-	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
-		nfs4_schedule_state_renewal(clp);
-		return 0;
-	}
-
-	now = jiffies;
-	status = nfs4_proc_get_lease_time(clp, &fsinfo);
-	if (status == 0) {
-		nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
-		nfs4_schedule_state_renewal(clp);
-	}
-
-	return status;
-}
-
 static void nfs41_finish_session_reset(struct nfs_client *clp)
 {
 	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 	/* create_session negotiated new slot table */
 	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
-	nfs41_setup_state_renewal(clp);
+	nfs4_setup_state_renewal(clp);
 }
 
-int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred)
 {
 	int status;
 
@@ -354,7 +351,7 @@
  */
 int nfs41_discover_server_trunking(struct nfs_client *clp,
 				   struct nfs_client **result,
-				   struct rpc_cred *cred)
+				   const struct cred *cred)
 {
 	int status;
 
@@ -392,32 +389,32 @@
  * nfs4_get_clid_cred - Acquire credential for a setclientid operation
  * @clp: client state handle
  *
- * Returns an rpc_cred with reference count bumped, or NULL.
+ * Returns a cred with reference count bumped, or NULL.
  */
-struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
-	spin_lock(&clp->cl_lock);
-	cred = nfs4_get_machine_cred_locked(clp);
-	spin_unlock(&clp->cl_lock);
+	cred = nfs4_get_machine_cred(clp);
 	return cred;
 }
 
 static struct nfs4_state_owner *
-nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
+nfs4_find_state_owner_locked(struct nfs_server *server, const struct cred *cred)
 {
 	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp;
+	int cmp;
 
 	while (*p != NULL) {
 		parent = *p;
 		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+		cmp = cred_fscmp(cred, sp->so_cred);
 
-		if (cred < sp->so_cred)
+		if (cmp < 0)
 			p = &parent->rb_left;
-		else if (cred > sp->so_cred)
+		else if (cmp > 0)
 			p = &parent->rb_right;
 		else {
 			if (!list_empty(&sp->so_lru))
@@ -436,14 +433,16 @@
 	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp;
+	int cmp;
 
 	while (*p != NULL) {
 		parent = *p;
 		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+		cmp = cred_fscmp(new->so_cred, sp->so_cred);
 
-		if (new->so_cred < sp->so_cred)
+		if (cmp < 0)
 			p = &parent->rb_left;
-		else if (new->so_cred > sp->so_cred)
+		else if (cmp > 0)
 			p = &parent->rb_right;
 		else {
 			if (!list_empty(&sp->so_lru))
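The state-owner rb-tree previously ordered entries by raw cred pointer, which breaks once distinct struct cred instances can describe the same filesystem identity. cred_fscmp(), added alongside this conversion in kernel/cred.c, provides a total order over just the fs-relevant fields. A simplified sketch of the comparison it presumably performs (uids only; the real helper also compares fsgid and the supplementary group list):

static int cred_fscmp_sketch(const struct cred *a, const struct cred *b)
{
	if (uid_lt(a->fsuid, b->fsuid))
		return -1;
	if (uid_gt(a->fsuid, b->fsuid))
		return 1;
	return 0;	/* equal as far as filesystem access is concerned */
}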
@@ -490,7 +489,7 @@
  */
 static struct nfs4_state_owner *
 nfs4_alloc_state_owner(struct nfs_server *server,
-		struct rpc_cred *cred,
+		const struct cred *cred,
 		gfp_t gfp_flags)
 {
 	struct nfs4_state_owner *sp;
@@ -505,7 +504,7 @@
 		return NULL;
 	}
 	sp->so_server = server;
-	sp->so_cred = get_rpccred(cred);
+	sp->so_cred = get_cred(cred);
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
 	nfs4_init_seqid_counter(&sp->so_seqid);
@@ -534,7 +533,7 @@
 static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
 {
 	nfs4_destroy_seqid_counter(&sp->so_seqid);
-	put_rpccred(sp->so_cred);
+	put_cred(sp->so_cred);
 	ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
 	kfree(sp);
 }
@@ -568,11 +567,12 @@
  * nfs4_get_state_owner - Look up a state owner given a credential
  * @server: nfs_server to search
  * @cred: RPC credential to match
+ * @gfp_flags: allocation mode
  *
  * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
  */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
-					      struct rpc_cred *cred,
+					      const struct cred *cred,
 					      gfp_t gfp_flags)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -624,24 +624,39 @@
 /**
  * nfs4_purge_state_owners - Release all cached state owners
  * @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
  *
  * Called at umount time.  Remaining state owners will be on
  * the LRU with ref count of zero.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners().
  */
-void nfs4_purge_state_owners(struct nfs_server *server)
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp, *tmp;
-	LIST_HEAD(doomed);
 
 	spin_lock(&clp->cl_lock);
 	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
-		list_move(&sp->so_lru, &doomed);
+		list_move(&sp->so_lru, head);
 		nfs4_remove_state_owner_locked(sp);
 	}
 	spin_unlock(&clp->cl_lock);
+}
 
-	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+/**
+ * nfs4_free_state_owners - Release a list of cached state owners
+ * @head: list of state owners to free
+ *
+ * Frees a list of state owners that was generated by
+ * nfs4_purge_state_owners().
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+	struct nfs4_state_owner *sp, *tmp;
+
+	list_for_each_entry_safe(sp, tmp, head, so_lru) {
 		list_del(&sp->so_lru);
 		nfs4_free_state_owner(sp);
 	}
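The purge/free split above is deliberate: nfs4_purge_state_owners() only unhooks entries, so it can run while the caller holds the rcu read lock, and the actual frees (which may sleep) happen later via nfs4_free_state_owners() with no locks held. A condensed sketch of the pairing, mirroring the nfs4_do_reclaim() hunk later in this patch (the real loop also walks each server's state-owner tree):

	static void example_reclaim_cleanup(struct nfs_client *clp)
	{
		struct nfs_server *server;
		LIST_HEAD(freeme);

		rcu_read_lock();
		list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
			nfs4_purge_state_owners(server, &freeme);	/* unhook only */
		rcu_read_unlock();

		nfs4_free_state_owners(&freeme);	/* free with no locks held */
	}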
@@ -655,7 +670,7 @@
 	state = kzalloc(sizeof(*state), GFP_NOFS);
 	if (!state)
 		return NULL;
-	atomic_set(&state->count, 1);
+	refcount_set(&state->count, 1);
 	INIT_LIST_HEAD(&state->lock_states);
 	spin_lock_init(&state->state_lock);
 	seqlock_init(&state->seqlock);
@@ -684,12 +699,12 @@
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs4_state *state;
 
-	list_for_each_entry(state, &nfsi->open_states, inode_states) {
+	list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) {
 		if (state->owner != owner)
 			continue;
 		if (!nfs4_valid_open_stateid(state))
 			continue;
-		if (atomic_inc_not_zero(&state->count))
+		if (refcount_inc_not_zero(&state->count))
 			return state;
 	}
 	return NULL;
@@ -698,7 +713,7 @@
 static void
 nfs4_free_open_state(struct nfs4_state *state)
 {
-	kfree(state);
+	kfree_rcu(state, rcu_head);
 }
 
 struct nfs4_state *
@@ -707,9 +722,9 @@
 	struct nfs4_state *state, *new;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	spin_lock(&inode->i_lock);
+	rcu_read_lock();
 	state = __nfs4_find_state_byowner(inode, owner);
-	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 	if (state)
 		goto out;
 	new = nfs4_alloc_open_state();
@@ -720,7 +735,7 @@
 		state = new;
 		state->owner = owner;
 		atomic_inc(&owner->so_count);
-		list_add(&state->inode_states, &nfsi->open_states);
+		list_add_rcu(&state->inode_states, &nfsi->open_states);
 		ihold(inode);
 		state->inode = inode;
 		spin_unlock(&inode->i_lock);
@@ -743,10 +758,10 @@
 	struct inode *inode = state->inode;
 	struct nfs4_state_owner *owner = state->owner;
 
-	if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
+	if (!refcount_dec_and_lock(&state->count, &owner->so_lock))
 		return;
 	spin_lock(&inode->i_lock);
-	list_del(&state->inode_states);
+	list_del_rcu(&state->inode_states);
 	list_del(&state->open_states);
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&owner->so_lock);
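The hunks above convert the per-inode open_states list to RCU: __nfs4_find_state_byowner() now walks the list locklessly and pins a state only via refcount_inc_not_zero(), removal stays serialized by so_lock/i_lock and uses list_del_rcu(), and the state itself is freed through kfree_rcu() so a concurrent reader never touches freed memory. A generic sketch of the pattern under those assumptions (obj and its helpers are illustrative names, not NFS code):

	struct obj {
		struct list_head	node;
		refcount_t		count;
		struct rcu_head		rcu_head;
	};

	/* Reader: lockless lookup; only a still-live object gets referenced. */
	static struct obj *obj_find(struct list_head *head)
	{
		struct obj *o;

		rcu_read_lock();
		list_for_each_entry_rcu(o, head, node) {
			if (refcount_inc_not_zero(&o->count)) {
				rcu_read_unlock();
				return o;
			}
		}
		rcu_read_unlock();
		return NULL;
	}

	/* Writer: unlink under the lock, defer the free past all readers. */
	static void obj_release(struct obj *o, spinlock_t *lock)
	{
		if (!refcount_dec_and_lock(&o->count, lock))
			return;
		list_del_rcu(&o->node);
		spin_unlock(lock);
		kfree_rcu(o, rcu_head);
	}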
@@ -1000,22 +1015,6 @@
 	return ret;
 }
 
-bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
-{
-	bool ret;
-	int seq;
-
-	do {
-		ret = false;
-		seq = read_seqbegin(&state->seqlock);
-		if (nfs4_state_match_open_stateid_other(state, dst)) {
-			dst->seqid = state->open_stateid.seqid;
-			ret = true;
-		}
-	} while (read_seqretry(&state->seqlock, seq));
-	return ret;
-}
-
 bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
 	bool ret;
@@ -1041,7 +1040,7 @@
  */
 int nfs4_select_rw_stateid(struct nfs4_state *state,
 		fmode_t fmode, const struct nfs_lock_context *l_ctx,
-		nfs4_stateid *dst, struct rpc_cred **cred)
+		nfs4_stateid *dst, const struct cred **cred)
 {
 	int ret;
 
@@ -1064,8 +1063,7 @@
 		 * choose to use.
 		 */
 		goto out;
-	nfs4_copy_open_stateid(dst, state);
-	ret = 0;
+	ret = nfs4_copy_open_stateid(dst, state) ? 0 : -EAGAIN;
 out:
 	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
 		dst->seqid = 0;
@@ -1438,8 +1436,8 @@
 	struct nfs4_state *state;
 	bool found = false;
 
-	spin_lock(&inode->i_lock);
-	list_for_each_entry(ctx, &nfsi->open_files, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
 		state = ctx->state;
 		if (state == NULL)
 			continue;
@@ -1457,32 +1455,34 @@
 		    nfs4_state_mark_reclaim_nograce(clp, state))
 			found = true;
 	}
-	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 
 	nfs_inode_find_delegation_state_and_recover(inode, stateid);
 	if (found)
 		nfs4_schedule_state_manager(clp);
 }
 
-static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state, int err)
 {
 	struct inode *inode = state->inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *ctx;
 
-	spin_lock(&inode->i_lock);
-	list_for_each_entry(ctx, &nfsi->open_files, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
 		if (ctx->state != state)
 			continue;
 		set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+		pr_warn("NFSv4: state recovery failed for open file %pd2, error = %d\n",
+				ctx->dentry, err);
 	}
-	spin_unlock(&inode->i_lock);
+	rcu_read_unlock();
 }
 
 static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
 {
 	set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
-	nfs4_state_mark_open_context_bad(state);
+	nfs4_state_mark_open_context_bad(state, error);
 }
 
 
@@ -1513,6 +1513,7 @@
 		switch (status) {
 		case 0:
 			break;
+		case -ETIMEDOUT:
 		case -ESTALE:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
@@ -1550,10 +1551,63 @@
 	return status;
 }
 
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	struct nfs4_copy_state *copy;
+
+	if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags))
+		return;
+
+	spin_lock(&sp->so_server->nfs_client->cl_lock);
+	list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
+		if (!nfs4_stateid_match_other(&state->stateid, &copy->parent_state->stateid))
+			continue;
+		copy->flags = 1;
+		complete(&copy->completion);
+		break;
+	}
+	spin_unlock(&sp->so_server->nfs_client->cl_lock);
+}
+#else /* !CONFIG_NFS_V4_2 */
+static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
+					 struct nfs4_state *state)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
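nfs42_complete_copies() is the recovery-side half of a completion handshake: when the server reboots mid server-side copy, reclaim flags the matching nfs4_copy_state and calls complete() to unblock whoever is parked on copy->completion (the waiter lives in the NFSv4.2 copy code, outside this hunk). A generic sketch of that handshake, with an illustrative struct rather than the real nfs4_copy_state:

	struct copy_wait {
		struct completion	done;
		int			flags;	/* set by recovery before complete() */
	};

	/* Waiter side: park until the copy finishes or recovery wakes us. */
	static int copy_wait_for_done(struct copy_wait *cw)
	{
		if (wait_for_completion_interruptible(&cw->done))
			return -ERESTARTSYS;
		return cw->flags ? -EAGAIN : 0;	/* flagged => restart the copy */
	}

	/* Recovery side, as in nfs42_complete_copies() above. */
	static void copy_recovery_wake(struct copy_wait *cw)
	{
		cw->flags = 1;
		complete(&cw->done);
	}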
+
+static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
+				     const struct nfs4_state_recovery_ops *ops)
+{
+	struct nfs4_lock_state *lock;
+	int status;
+
+	status = ops->recover_open(sp, state);
+	if (status < 0)
+		return status;
+
+	status = nfs4_reclaim_locks(state, ops);
+	if (status < 0)
+		return status;
+
+	if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+		spin_lock(&state->state_lock);
+		list_for_each_entry(lock, &state->lock_states, ls_locks) {
+			if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+				pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+		}
+		spin_unlock(&state->state_lock);
+	}
+
+	nfs42_complete_copies(sp, state);
+	clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+	return status;
+}
+
 static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
 {
 	struct nfs4_state *state;
-	struct nfs4_lock_state *lock;
+	unsigned int loop = 0;
 	int status = 0;
 
 	/* Note: we rely on the sp->so_states list being ordered 
@@ -1574,79 +1628,53 @@
 			continue;
 		if (state->state == 0)
 			continue;
-		atomic_inc(&state->count);
+		refcount_inc(&state->count);
 		spin_unlock(&sp->so_lock);
-		status = ops->recover_open(sp, state);
-		if (status >= 0) {
-			status = nfs4_reclaim_locks(state, ops);
-			if (status >= 0) {
-				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-					spin_lock(&state->state_lock);
-					list_for_each_entry(lock, &state->lock_states, ls_locks) {
-						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
-							pr_warn_ratelimited("NFS: "
-									    "%s: Lock reclaim "
-									    "failed!\n", __func__);
-					}
-					spin_unlock(&state->state_lock);
-				}
-				clear_bit(NFS_STATE_RECLAIM_NOGRACE,
-					&state->flags);
-#ifdef CONFIG_NFS_V4_2
-				if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) {
-					struct nfs4_copy_state *copy;
+		status = __nfs4_reclaim_open_state(sp, state, ops);
 
-					spin_lock(&sp->so_server->nfs_client->cl_lock);
-					list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
-						if (memcmp(&state->stateid.other, &copy->parent_state->stateid.other, NFS4_STATEID_SIZE))
-							continue;
-						copy->flags = 1;
-						complete(&copy->completion);
-						printk("AGLO: server rebooted waking up the copy\n");
-						break;
-					}
-					spin_unlock(&sp->so_server->nfs_client->cl_lock);
-				}
-#endif /* CONFIG_NFS_V4_2 */
-				nfs4_put_open_state(state);
-				spin_lock(&sp->so_lock);
-				goto restart;
-			}
-		}
 		switch (status) {
-			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
-					__func__, status);
-				/* Fall through */
-			case -ENOENT:
-			case -ENOMEM:
-			case -EACCES:
-			case -EROFS:
-			case -EIO:
-			case -ESTALE:
-				/* Open state on this file cannot be recovered */
-				nfs4_state_mark_recovery_failed(state, status);
+		default:
+			if (status >= 0) {
+				loop = 0;
 				break;
-			case -EAGAIN:
-				ssleep(1);
-				/* Fall through */
-			case -NFS4ERR_ADMIN_REVOKED:
-			case -NFS4ERR_STALE_STATEID:
-			case -NFS4ERR_OLD_STATEID:
-			case -NFS4ERR_BAD_STATEID:
-			case -NFS4ERR_RECLAIM_BAD:
-			case -NFS4ERR_RECLAIM_CONFLICT:
-				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+			}
+			printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
+			/* Fall through */
+		case -ENOENT:
+		case -ENOMEM:
+		case -EACCES:
+		case -EROFS:
+		case -EIO:
+		case -ESTALE:
+			/* Open state on this file cannot be recovered */
+			nfs4_state_mark_recovery_failed(state, status);
+			break;
+		case -EAGAIN:
+			ssleep(1);
+			if (loop++ < 10) {
+				set_bit(ops->state_flag_bit, &state->flags);
 				break;
-			case -NFS4ERR_EXPIRED:
-			case -NFS4ERR_NO_GRACE:
-				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_BADSESSION:
-			case -NFS4ERR_BADSLOT:
-			case -NFS4ERR_BAD_HIGH_SLOT:
-			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-				goto out_err;
+			}
+			/* Fall through */
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_OLD_STATEID:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_RECLAIM_BAD:
+		case -NFS4ERR_RECLAIM_CONFLICT:
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+			break;
+		case -NFS4ERR_EXPIRED:
+		case -NFS4ERR_NO_GRACE:
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+			/* Fall through */
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -ETIMEDOUT:
+			goto out_err;
 		}
 		nfs4_put_open_state(state);
 		spin_lock(&sp->so_lock);
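The rewritten switch bounds what used to be an unconditional sleep-and-retry: -EAGAIN now re-arms the state via ops->state_flag_bit for at most ten one-second retries (the budget resets whenever an open reclaims cleanly), after which it falls through to the mark-reclaim-nograce cases. The control flow, reduced to a hypothetical standalone helper:

	/* Sketch: retry a transiently failing reclaim op, then escalate. */
	static int reclaim_with_bounded_retry(int (*op)(void))
	{
		unsigned int loop = 0;
		int status;

		do {
			status = op();
			if (status != -EAGAIN)
				break;
			ssleep(1);		/* matches the ssleep(1) above */
		} while (loop++ < 10);
		return status;			/* still -EAGAIN => escalate */
	}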
@@ -1723,7 +1751,7 @@
 
 static int nfs4_reclaim_complete(struct nfs_client *clp,
 				 const struct nfs4_state_recovery_ops *ops,
-				 struct rpc_cred *cred)
+				 const struct cred *cred)
 {
 	/* Notify the server we're done reclaiming our state */
 	if (ops->reclaim_complete)
@@ -1774,7 +1802,7 @@
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
 	const struct nfs4_state_recovery_ops *ops;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	int err;
 
 	if (!nfs4_state_clear_reclaim_reboot(clp))
@@ -1782,7 +1810,7 @@
 	ops = clp->cl_mvops->reboot_recovery_ops;
 	cred = nfs4_get_clid_cred(clp);
 	err = nfs4_reclaim_complete(clp, ops, cred);
-	put_rpccred(cred);
+	put_cred(cred);
 	if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
 		set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
 }
@@ -1796,38 +1824,38 @@
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
 	switch (error) {
-		case 0:
-			break;
-		case -NFS4ERR_CB_PATH_DOWN:
-			nfs40_handle_cb_pathdown(clp);
-			break;
-		case -NFS4ERR_NO_GRACE:
-			nfs4_state_end_reclaim_reboot(clp);
-			break;
-		case -NFS4ERR_STALE_CLIENTID:
-			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_start_reclaim_reboot(clp);
-			break;
-		case -NFS4ERR_EXPIRED:
-			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_start_reclaim_nograce(clp);
-			break;
-		case -NFS4ERR_BADSESSION:
-		case -NFS4ERR_BADSLOT:
-		case -NFS4ERR_BAD_HIGH_SLOT:
-		case -NFS4ERR_DEADSESSION:
-		case -NFS4ERR_SEQ_FALSE_RETRY:
-		case -NFS4ERR_SEQ_MISORDERED:
-			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-			/* Zero session reset errors */
-			break;
-		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-			set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
-			break;
-		default:
-			dprintk("%s: failed to handle error %d for server %s\n",
-					__func__, error, clp->cl_hostname);
-			return error;
+	case 0:
+		break;
+	case -NFS4ERR_CB_PATH_DOWN:
+		nfs40_handle_cb_pathdown(clp);
+		break;
+	case -NFS4ERR_NO_GRACE:
+		nfs4_state_end_reclaim_reboot(clp);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		nfs4_state_start_reclaim_reboot(clp);
+		break;
+	case -NFS4ERR_EXPIRED:
+		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		nfs4_state_start_reclaim_nograce(clp);
+		break;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+		/* Zero session reset errors */
+		break;
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+		break;
+	default:
+		dprintk("%s: failed to handle error %d for server %s\n",
+				__func__, error, clp->cl_hostname);
+		return error;
 	}
 	dprintk("%s: handled error %d for server %s\n", __func__, error,
 			clp->cl_hostname);
@@ -1839,12 +1867,13 @@
 	struct nfs4_state_owner *sp;
 	struct nfs_server *server;
 	struct rb_node *pos;
+	LIST_HEAD(freeme);
 	int status = 0;
 
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-		nfs4_purge_state_owners(server);
+		nfs4_purge_state_owners(server, &freeme);
 		spin_lock(&clp->cl_lock);
 		for (pos = rb_first(&server->state_owners);
 		     pos != NULL;
@@ -1873,12 +1902,13 @@
 		spin_unlock(&clp->cl_lock);
 	}
 	rcu_read_unlock();
+	nfs4_free_state_owners(&freeme);
 	return 0;
 }
 
 static int nfs4_check_lease(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	const struct nfs4_state_maintenance_ops *ops =
 		clp->cl_mvops->state_renewal_ops;
 	int status;
@@ -1886,9 +1916,7 @@
 	/* Is the client already known to have an expired lease? */
 	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		return 0;
-	spin_lock(&clp->cl_lock);
-	cred = ops->get_state_renewal_cred_locked(clp);
-	spin_unlock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred(clp);
 	if (cred == NULL) {
 		cred = nfs4_get_clid_cred(clp);
 		status = -ENOKEY;
@@ -1896,7 +1924,7 @@
 			goto out;
 	}
 	status = ops->renew_lease(clp, cred);
-	put_rpccred(cred);
+	put_cred(cred);
 	if (status == -ETIMEDOUT) {
 		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 		return 0;
@@ -1930,7 +1958,6 @@
 		return -EPERM;
 	case -EACCES:
 	case -NFS4ERR_DELAY:
-	case -ETIMEDOUT:
 	case -EAGAIN:
 		ssleep(1);
 		break;
@@ -1956,7 +1983,7 @@
 
 static int nfs4_establish_lease(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	const struct nfs4_state_recovery_ops *ops =
 		clp->cl_mvops->reboot_recovery_ops;
 	int status;
@@ -1968,7 +1995,7 @@
 	if (cred == NULL)
 		return -ENOENT;
 	status = ops->establish_clid(clp, cred);
-	put_rpccred(cred);
+	put_cred(cred);
 	if (status != 0)
 		return status;
 	pnfs_destroy_all_layouts(clp);
@@ -2015,7 +2042,7 @@
  *
  * Returns zero or a negative NFS4ERR status code.
  */
-static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_fs_locations *locations = NULL;
@@ -2052,8 +2079,10 @@
 	}
 
 	status = nfs4_begin_drain_session(clp);
-	if (status != 0)
-		return status;
+	if (status != 0) {
+		result = status;
+		goto out;
+	}
 
 	status = nfs4_replace_transport(server, locations);
 	if (status != 0) {
@@ -2085,14 +2114,12 @@
 	const struct nfs4_state_maintenance_ops *ops =
 				clp->cl_mvops->state_renewal_ops;
 	struct nfs_server *server;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
 	dprintk("%s: migration reported on \"%s\"\n", __func__,
 			clp->cl_hostname);
 
-	spin_lock(&clp->cl_lock);
-	cred = ops->get_state_renewal_cred_locked(clp);
-	spin_unlock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred(clp);
 	if (cred == NULL)
 		return -NFS4ERR_NOENT;
 
@@ -2113,13 +2140,13 @@
 		rcu_read_unlock();
 		status = nfs4_try_migration(server, cred);
 		if (status < 0) {
-			put_rpccred(cred);
+			put_cred(cred);
 			return status;
 		}
 		goto restart;
 	}
 	rcu_read_unlock();
-	put_rpccred(cred);
+	put_cred(cred);
 	return 0;
 }
 
@@ -2133,14 +2160,12 @@
 	const struct nfs4_state_maintenance_ops *ops =
 				clp->cl_mvops->state_renewal_ops;
 	struct nfs_server *server;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
 	dprintk("%s: lease moved reported on \"%s\"\n", __func__,
 			clp->cl_hostname);
 
-	spin_lock(&clp->cl_lock);
-	cred = ops->get_state_renewal_cred_locked(clp);
-	spin_unlock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred(clp);
 	if (cred == NULL)
 		return -NFS4ERR_NOENT;
 
@@ -2168,7 +2193,7 @@
 	rcu_read_unlock();
 
 out:
-	put_rpccred(cred);
+	put_cred(cred);
 	return 0;
 }
 
@@ -2191,7 +2216,7 @@
 	const struct nfs4_state_recovery_ops *ops =
 				clp->cl_mvops->reboot_recovery_ops;
 	struct rpc_clnt *clnt;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	int i, status;
 
 	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
@@ -2207,7 +2232,7 @@
 		goto out_unlock;
 
 	status = ops->detect_trunking(clp, result, cred);
-	put_rpccred(cred);
+	put_cred(cred);
 	switch (status) {
 	case 0:
 	case -EINTR:
@@ -2338,8 +2363,7 @@
 {
 	/* FIXME: For now, we destroy all layouts. */
 	pnfs_destroy_all_layouts(clp);
-	/* FIXME: For now, we test all delegations+open state+locks. */
-	nfs41_handle_some_state_revoked(clp);
+	nfs_test_expired_all_delegations(clp);
 	dprintk("%s: Recallable state revoked on server %s!\n", __func__,
 			clp->cl_hostname);
 }
@@ -2398,7 +2422,7 @@
 
 static int nfs4_reset_session(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	int status;
 
 	if (!nfs4_has_session(clp))
@@ -2436,14 +2460,13 @@
 	dprintk("%s: session reset was successful for server %s!\n",
 			__func__, clp->cl_hostname);
 out:
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 	return status;
 }
 
 static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 {
-	struct rpc_cred *cred;
+	const struct cred *cred;
 	int ret;
 
 	if (!nfs4_has_session(clp))
@@ -2453,8 +2476,7 @@
 		return ret;
 	cred = nfs4_get_clid_cred(clp);
 	ret = nfs4_proc_bind_conn_to_session(clp, cred);
-	if (cred)
-		put_rpccred(cred);
+	put_cred(cred);
 	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
 	switch (ret) {
 	case 0:
@@ -2566,7 +2588,7 @@
 		}
 
 		/* Now recover expired state... */
-		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+		if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			section = "reclaim nograce";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->nograce_recovery_ops);
@@ -2574,6 +2596,7 @@
 				continue;
 			if (status < 0)
 				goto out_error;
+			clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
 		}
 
 		nfs4_end_drain_session(clp);
@@ -2592,7 +2615,7 @@
 			return;
 		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
 			return;
-	} while (refcount_read(&clp->cl_count) > 1);
+	} while (refcount_read(&clp->cl_count) > 1 && !signalled());
 	goto out_drain;
 
 out_error:
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6fb7cb6..04c5706 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com>
  */
@@ -50,7 +51,7 @@
 
 static const struct super_operations nfs4_sops = {
 	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
+	.free_inode	= nfs_free_inode,
 	.write_inode	= nfs4_write_inode,
 	.drop_inode	= nfs_drop_inode,
 	.statfs		= nfs_statfs,
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index e9fb3e5..1a8f376 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -16,4 +16,12 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
 #endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b1483b3..b2f395f 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -10,157 +10,302 @@
 
 #include <linux/tracepoint.h>
 
+TRACE_DEFINE_ENUM(EPERM);
+TRACE_DEFINE_ENUM(ENOENT);
+TRACE_DEFINE_ENUM(EIO);
+TRACE_DEFINE_ENUM(ENXIO);
+TRACE_DEFINE_ENUM(EACCES);
+TRACE_DEFINE_ENUM(EEXIST);
+TRACE_DEFINE_ENUM(EXDEV);
+TRACE_DEFINE_ENUM(ENOTDIR);
+TRACE_DEFINE_ENUM(EISDIR);
+TRACE_DEFINE_ENUM(EFBIG);
+TRACE_DEFINE_ENUM(ENOSPC);
+TRACE_DEFINE_ENUM(EROFS);
+TRACE_DEFINE_ENUM(EMLINK);
+TRACE_DEFINE_ENUM(ENAMETOOLONG);
+TRACE_DEFINE_ENUM(ENOTEMPTY);
+TRACE_DEFINE_ENUM(EDQUOT);
+TRACE_DEFINE_ENUM(ESTALE);
+TRACE_DEFINE_ENUM(EBADHANDLE);
+TRACE_DEFINE_ENUM(EBADCOOKIE);
+TRACE_DEFINE_ENUM(ENOTSUPP);
+TRACE_DEFINE_ENUM(ETOOSMALL);
+TRACE_DEFINE_ENUM(EREMOTEIO);
+TRACE_DEFINE_ENUM(EBADTYPE);
+TRACE_DEFINE_ENUM(EAGAIN);
+TRACE_DEFINE_ENUM(ELOOP);
+TRACE_DEFINE_ENUM(EOPNOTSUPP);
+TRACE_DEFINE_ENUM(EDEADLK);
+TRACE_DEFINE_ENUM(ENOMEM);
+TRACE_DEFINE_ENUM(EKEYEXPIRED);
+TRACE_DEFINE_ENUM(ETIMEDOUT);
+TRACE_DEFINE_ENUM(ERESTARTSYS);
+TRACE_DEFINE_ENUM(ECONNREFUSED);
+TRACE_DEFINE_ENUM(ECONNRESET);
+TRACE_DEFINE_ENUM(ENETUNREACH);
+TRACE_DEFINE_ENUM(EHOSTUNREACH);
+TRACE_DEFINE_ENUM(EHOSTDOWN);
+TRACE_DEFINE_ENUM(EPIPE);
+TRACE_DEFINE_ENUM(EPFNOSUPPORT);
+TRACE_DEFINE_ENUM(EPROTONOSUPPORT);
+
+TRACE_DEFINE_ENUM(NFS4_OK);
+TRACE_DEFINE_ENUM(NFS4ERR_ACCESS);
+TRACE_DEFINE_ENUM(NFS4ERR_ATTRNOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_ADMIN_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_BACK_CHAN_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_BADCHAR);
+TRACE_DEFINE_ENUM(NFS4ERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADIOMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLABEL);
+TRACE_DEFINE_ENUM(NFS4ERR_BADNAME);
+TRACE_DEFINE_ENUM(NFS4ERR_BADOWNER);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADXDR);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_HIGH_SLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SEQID);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SESSION_DIGEST);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_CB_PATH_DOWN);
+TRACE_DEFINE_ENUM(NFS4ERR_CLID_INUSE);
+TRACE_DEFINE_ENUM(NFS4ERR_CLIENTID_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_COMPLETE_ALREADY);
+TRACE_DEFINE_ENUM(NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADLOCK);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DELAY);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_ALREADY_WANTED);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_DIRDELEG_UNAVAIL);
+TRACE_DEFINE_ENUM(NFS4ERR_DQUOT);
+TRACE_DEFINE_ENUM(NFS4ERR_ENCR_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_EXIST);
+TRACE_DEFINE_ENUM(NFS4ERR_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FBIG);
+TRACE_DEFINE_ENUM(NFS4ERR_FHEXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FILE_OPEN);
+TRACE_DEFINE_ENUM(NFS4ERR_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_HASH_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_INVAL);
+TRACE_DEFINE_ENUM(NFS4ERR_IO);
+TRACE_DEFINE_ENUM(NFS4ERR_ISDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTTRYLATER);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTUNAVAILABLE);
+TRACE_DEFINE_ENUM(NFS4ERR_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKS_HELD);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCK_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_MINOR_VERS_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4ERR_MLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFS4ERR_NOENT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOFILEHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_NOMATCHING_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOSPC);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_ONLY_OP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_NO_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_NXIO);
+TRACE_DEFINE_ENUM(NFS4ERR_OLD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_OPENMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_ILLEGAL);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_NOT_IN_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_PERM);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_IO_HOLE);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_NO_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECALLCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_BAD);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_CONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_REJECT_DELEG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+TRACE_DEFINE_ENUM(NFS4ERR_REQ_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_RESOURCE);
+TRACE_DEFINE_ENUM(NFS4ERR_RESTOREFH);
+TRACE_DEFINE_ENUM(NFS4ERR_RETRY_UNCACHED_REP);
+TRACE_DEFINE_ENUM(NFS4ERR_RETURNCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_ROFS);
+TRACE_DEFINE_ENUM(NFS4ERR_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_SHARE_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQUENCE_POS);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_FALSE_RETRY);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_MISORDERED);
+TRACE_DEFINE_ENUM(NFS4ERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_CLIENTID);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_SYMLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFS4ERR_TOO_MANY_OPS);
+TRACE_DEFINE_ENUM(NFS4ERR_UNKNOWN_LAYOUTTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_UNSAFE_COMPOUND);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONGSEC);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_CRED);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_XDEV);
+
 #define show_nfsv4_errors(error) \
 	__print_symbolic(error, \
 		{ NFS4_OK, "OK" }, \
 		/* Mapped by nfs4_stat_to_errno() */ \
-		{ -EPERM, "EPERM" }, \
-		{ -ENOENT, "ENOENT" }, \
-		{ -EIO, "EIO" }, \
-		{ -ENXIO, "ENXIO" }, \
-		{ -EACCES, "EACCES" }, \
-		{ -EEXIST, "EEXIST" }, \
-		{ -EXDEV, "EXDEV" }, \
-		{ -ENOTDIR, "ENOTDIR" }, \
-		{ -EISDIR, "EISDIR" }, \
-		{ -EFBIG, "EFBIG" }, \
-		{ -ENOSPC, "ENOSPC" }, \
-		{ -EROFS, "EROFS" }, \
-		{ -EMLINK, "EMLINK" }, \
-		{ -ENAMETOOLONG, "ENAMETOOLONG" }, \
-		{ -ENOTEMPTY, "ENOTEMPTY" }, \
-		{ -EDQUOT, "EDQUOT" }, \
-		{ -ESTALE, "ESTALE" }, \
-		{ -EBADHANDLE, "EBADHANDLE" }, \
-		{ -EBADCOOKIE, "EBADCOOKIE" }, \
-		{ -ENOTSUPP, "ENOTSUPP" }, \
-		{ -ETOOSMALL, "ETOOSMALL" }, \
-		{ -EREMOTEIO, "EREMOTEIO" }, \
-		{ -EBADTYPE, "EBADTYPE" }, \
-		{ -EAGAIN, "EAGAIN" }, \
-		{ -ELOOP, "ELOOP" }, \
-		{ -EOPNOTSUPP, "EOPNOTSUPP" }, \
-		{ -EDEADLK, "EDEADLK" }, \
+		{ EPERM, "EPERM" }, \
+		{ ENOENT, "ENOENT" }, \
+		{ EIO, "EIO" }, \
+		{ ENXIO, "ENXIO" }, \
+		{ EACCES, "EACCES" }, \
+		{ EEXIST, "EEXIST" }, \
+		{ EXDEV, "EXDEV" }, \
+		{ ENOTDIR, "ENOTDIR" }, \
+		{ EISDIR, "EISDIR" }, \
+		{ EFBIG, "EFBIG" }, \
+		{ ENOSPC, "ENOSPC" }, \
+		{ EROFS, "EROFS" }, \
+		{ EMLINK, "EMLINK" }, \
+		{ ENAMETOOLONG, "ENAMETOOLONG" }, \
+		{ ENOTEMPTY, "ENOTEMPTY" }, \
+		{ EDQUOT, "EDQUOT" }, \
+		{ ESTALE, "ESTALE" }, \
+		{ EBADHANDLE, "EBADHANDLE" }, \
+		{ EBADCOOKIE, "EBADCOOKIE" }, \
+		{ ENOTSUPP, "ENOTSUPP" }, \
+		{ ETOOSMALL, "ETOOSMALL" }, \
+		{ EREMOTEIO, "EREMOTEIO" }, \
+		{ EBADTYPE, "EBADTYPE" }, \
+		{ EAGAIN, "EAGAIN" }, \
+		{ ELOOP, "ELOOP" }, \
+		{ EOPNOTSUPP, "EOPNOTSUPP" }, \
+		{ EDEADLK, "EDEADLK" }, \
 		/* RPC errors */ \
-		{ -ENOMEM, "ENOMEM" }, \
-		{ -EKEYEXPIRED, "EKEYEXPIRED" }, \
-		{ -ETIMEDOUT, "ETIMEDOUT" }, \
-		{ -ERESTARTSYS, "ERESTARTSYS" }, \
-		{ -ECONNREFUSED, "ECONNREFUSED" }, \
-		{ -ECONNRESET, "ECONNRESET" }, \
-		{ -ENETUNREACH, "ENETUNREACH" }, \
-		{ -EHOSTUNREACH, "EHOSTUNREACH" }, \
-		{ -EHOSTDOWN, "EHOSTDOWN" }, \
-		{ -EPIPE, "EPIPE" }, \
-		{ -EPFNOSUPPORT, "EPFNOSUPPORT" }, \
-		{ -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
+		{ ENOMEM, "ENOMEM" }, \
+		{ EKEYEXPIRED, "EKEYEXPIRED" }, \
+		{ ETIMEDOUT, "ETIMEDOUT" }, \
+		{ ERESTARTSYS, "ERESTARTSYS" }, \
+		{ ECONNREFUSED, "ECONNREFUSED" }, \
+		{ ECONNRESET, "ECONNRESET" }, \
+		{ ENETUNREACH, "ENETUNREACH" }, \
+		{ EHOSTUNREACH, "EHOSTUNREACH" }, \
+		{ EHOSTDOWN, "EHOSTDOWN" }, \
+		{ EPIPE, "EPIPE" }, \
+		{ EPFNOSUPPORT, "EPFNOSUPPORT" }, \
+		{ EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
 		/* NFSv4 native errors */ \
-		{ -NFS4ERR_ACCESS, "ACCESS" }, \
-		{ -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
-		{ -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
-		{ -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
-		{ -NFS4ERR_BADCHAR, "BADCHAR" }, \
-		{ -NFS4ERR_BADHANDLE, "BADHANDLE" }, \
-		{ -NFS4ERR_BADIOMODE, "BADIOMODE" }, \
-		{ -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
-		{ -NFS4ERR_BADLABEL, "BADLABEL" }, \
-		{ -NFS4ERR_BADNAME, "BADNAME" }, \
-		{ -NFS4ERR_BADOWNER, "BADOWNER" }, \
-		{ -NFS4ERR_BADSESSION, "BADSESSION" }, \
-		{ -NFS4ERR_BADSLOT, "BADSLOT" }, \
-		{ -NFS4ERR_BADTYPE, "BADTYPE" }, \
-		{ -NFS4ERR_BADXDR, "BADXDR" }, \
-		{ -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
-		{ -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
-		{ -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
-		{ -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
-		{ -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
-		{ -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
-		{ -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
-		{ -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
-		{ -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
-		{ -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
-		{ -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
+		{ NFS4ERR_ACCESS, "ACCESS" }, \
+		{ NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
+		{ NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
+		{ NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
+		{ NFS4ERR_BADCHAR, "BADCHAR" }, \
+		{ NFS4ERR_BADHANDLE, "BADHANDLE" }, \
+		{ NFS4ERR_BADIOMODE, "BADIOMODE" }, \
+		{ NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
+		{ NFS4ERR_BADLABEL, "BADLABEL" }, \
+		{ NFS4ERR_BADNAME, "BADNAME" }, \
+		{ NFS4ERR_BADOWNER, "BADOWNER" }, \
+		{ NFS4ERR_BADSESSION, "BADSESSION" }, \
+		{ NFS4ERR_BADSLOT, "BADSLOT" }, \
+		{ NFS4ERR_BADTYPE, "BADTYPE" }, \
+		{ NFS4ERR_BADXDR, "BADXDR" }, \
+		{ NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
+		{ NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
+		{ NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
+		{ NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
+		{ NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
+		{ NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
+		{ NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+		{ NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
+		{ NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
+		{ NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
+		{ NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
 			"CONN_NOT_BOUND_TO_SESSION" }, \
-		{ -NFS4ERR_DEADLOCK, "DEADLOCK" }, \
-		{ -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
-		{ -NFS4ERR_DELAY, "DELAY" }, \
-		{ -NFS4ERR_DELEG_ALREADY_WANTED, \
+		{ NFS4ERR_DEADLOCK, "DEADLOCK" }, \
+		{ NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
+		{ NFS4ERR_DELAY, "DELAY" }, \
+		{ NFS4ERR_DELEG_ALREADY_WANTED, \
 			"DELEG_ALREADY_WANTED" }, \
-		{ -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
-		{ -NFS4ERR_DENIED, "DENIED" }, \
-		{ -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
-		{ -NFS4ERR_DQUOT, "DQUOT" }, \
-		{ -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
-		{ -NFS4ERR_EXIST, "EXIST" }, \
-		{ -NFS4ERR_EXPIRED, "EXPIRED" }, \
-		{ -NFS4ERR_FBIG, "FBIG" }, \
-		{ -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
-		{ -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
-		{ -NFS4ERR_GRACE, "GRACE" }, \
-		{ -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
-		{ -NFS4ERR_INVAL, "INVAL" }, \
-		{ -NFS4ERR_IO, "IO" }, \
-		{ -NFS4ERR_ISDIR, "ISDIR" }, \
-		{ -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
-		{ -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
-		{ -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
-		{ -NFS4ERR_LOCKED, "LOCKED" }, \
-		{ -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
-		{ -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
-		{ -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
-		{ -NFS4ERR_MLINK, "MLINK" }, \
-		{ -NFS4ERR_MOVED, "MOVED" }, \
-		{ -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
-		{ -NFS4ERR_NOENT, "NOENT" }, \
-		{ -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
-		{ -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
-		{ -NFS4ERR_NOSPC, "NOSPC" }, \
-		{ -NFS4ERR_NOTDIR, "NOTDIR" }, \
-		{ -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
-		{ -NFS4ERR_NOTSUPP, "NOTSUPP" }, \
-		{ -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
-		{ -NFS4ERR_NOT_SAME, "NOT_SAME" }, \
-		{ -NFS4ERR_NO_GRACE, "NO_GRACE" }, \
-		{ -NFS4ERR_NXIO, "NXIO" }, \
-		{ -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
-		{ -NFS4ERR_OPENMODE, "OPENMODE" }, \
-		{ -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
-		{ -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
-		{ -NFS4ERR_PERM, "PERM" }, \
-		{ -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
-		{ -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
-		{ -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
-		{ -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
-		{ -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
-		{ -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
-		{ -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
-		{ -NFS4ERR_REP_TOO_BIG_TO_CACHE, \
+		{ NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
+		{ NFS4ERR_DENIED, "DENIED" }, \
+		{ NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
+		{ NFS4ERR_DQUOT, "DQUOT" }, \
+		{ NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
+		{ NFS4ERR_EXIST, "EXIST" }, \
+		{ NFS4ERR_EXPIRED, "EXPIRED" }, \
+		{ NFS4ERR_FBIG, "FBIG" }, \
+		{ NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
+		{ NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
+		{ NFS4ERR_GRACE, "GRACE" }, \
+		{ NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
+		{ NFS4ERR_INVAL, "INVAL" }, \
+		{ NFS4ERR_IO, "IO" }, \
+		{ NFS4ERR_ISDIR, "ISDIR" }, \
+		{ NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
+		{ NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
+		{ NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
+		{ NFS4ERR_LOCKED, "LOCKED" }, \
+		{ NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
+		{ NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
+		{ NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
+		{ NFS4ERR_MLINK, "MLINK" }, \
+		{ NFS4ERR_MOVED, "MOVED" }, \
+		{ NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
+		{ NFS4ERR_NOENT, "NOENT" }, \
+		{ NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
+		{ NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
+		{ NFS4ERR_NOSPC, "NOSPC" }, \
+		{ NFS4ERR_NOTDIR, "NOTDIR" }, \
+		{ NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
+		{ NFS4ERR_NOTSUPP, "NOTSUPP" }, \
+		{ NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
+		{ NFS4ERR_NOT_SAME, "NOT_SAME" }, \
+		{ NFS4ERR_NO_GRACE, "NO_GRACE" }, \
+		{ NFS4ERR_NXIO, "NXIO" }, \
+		{ NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
+		{ NFS4ERR_OPENMODE, "OPENMODE" }, \
+		{ NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
+		{ NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
+		{ NFS4ERR_PERM, "PERM" }, \
+		{ NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
+		{ NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
+		{ NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
+		{ NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
+		{ NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
+		{ NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
+		{ NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
+		{ NFS4ERR_REP_TOO_BIG_TO_CACHE, \
 			"REP_TOO_BIG_TO_CACHE" }, \
-		{ -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
-		{ -NFS4ERR_RESOURCE, "RESOURCE" }, \
-		{ -NFS4ERR_RESTOREFH, "RESTOREFH" }, \
-		{ -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
-		{ -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
-		{ -NFS4ERR_ROFS, "ROFS" }, \
-		{ -NFS4ERR_SAME, "SAME" }, \
-		{ -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
-		{ -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
-		{ -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
-		{ -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
-		{ -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
-		{ -NFS4ERR_STALE, "STALE" }, \
-		{ -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
-		{ -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
-		{ -NFS4ERR_SYMLINK, "SYMLINK" }, \
-		{ -NFS4ERR_TOOSMALL, "TOOSMALL" }, \
-		{ -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
-		{ -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
-		{ -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
-		{ -NFS4ERR_WRONGSEC, "WRONGSEC" }, \
-		{ -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
-		{ -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
-		{ -NFS4ERR_XDEV, "XDEV" })
+		{ NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
+		{ NFS4ERR_RESOURCE, "RESOURCE" }, \
+		{ NFS4ERR_RESTOREFH, "RESTOREFH" }, \
+		{ NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
+		{ NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
+		{ NFS4ERR_ROFS, "ROFS" }, \
+		{ NFS4ERR_SAME, "SAME" }, \
+		{ NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
+		{ NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
+		{ NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
+		{ NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
+		{ NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
+		{ NFS4ERR_STALE, "STALE" }, \
+		{ NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
+		{ NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
+		{ NFS4ERR_SYMLINK, "SYMLINK" }, \
+		{ NFS4ERR_TOOSMALL, "TOOSMALL" }, \
+		{ NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
+		{ NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
+		{ NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
+		{ NFS4ERR_WRONGSEC, "WRONGSEC" }, \
+		{ NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
+		{ NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
+		{ NFS4ERR_XDEV, "XDEV" })
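The TRACE_DEFINE_ENUM() run exists because __print_symbolic() is resolved inside the kernel, while user-space parsers (trace-cmd, perf) work from the format string and can only decode constants that were exported via TRACE_DEFINE_ENUM(); anything else renders as a raw number. Dropping the leading minus signs matches the new convention in the events below: the error field stores the positive magnitude and TP_printk() emits -__entry->error. A minimal sketch of the export idiom with a made-up enum (WIDGET_* is hypothetical):

	/* Export the values so user-space tools can decode the symbols. */
	enum widget_state { WIDGET_IDLE = 1, WIDGET_BUSY = 2 };

	TRACE_DEFINE_ENUM(WIDGET_IDLE);
	TRACE_DEFINE_ENUM(WIDGET_BUSY);

	#define show_widget_state(s)			\
		__print_symbolic(s,			\
			{ WIDGET_IDLE, "IDLE" },	\
			{ WIDGET_BUSY, "BUSY" })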
 
 #define show_open_flags(flags) \
 	__print_flags(flags, "|", \
@@ -203,7 +348,7 @@
 
 		TP_STRUCT__entry(
 			__string(dstaddr, clp->cl_hostname)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -212,8 +357,8 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) dstaddr=%s",
-			__entry->error,
+			"error=%ld (%s) dstaddr=%s",
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			__get_str(dstaddr)
 		)
@@ -275,7 +420,7 @@
 			__field(unsigned int, highest_slotid)
 			__field(unsigned int, target_highest_slotid)
 			__field(unsigned int, status_flags)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -290,10 +435,10 @@
 			__entry->error = res->sr_status;
 		),
 		TP_printk(
-			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+			"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
 			"highest_slotid=%u target_highest_slotid=%u "
 			"status_flags=%u (%s)",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			__entry->session,
 			__entry->slot_nr,
@@ -322,7 +467,7 @@
 			__field(unsigned int, seq_nr)
 			__field(unsigned int, highest_slotid)
 			__field(unsigned int, cachethis)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -331,13 +476,13 @@
 			__entry->seq_nr = args->csa_sequenceid;
 			__entry->highest_slotid = args->csa_highestslotid;
 			__entry->cachethis = args->csa_cachethis;
-			__entry->error = -be32_to_cpu(status);
+			__entry->error = be32_to_cpu(status);
 		),
 
 		TP_printk(
-			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+			"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
 			"highest_slotid=%u",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			__entry->session,
 			__entry->slot_nr,
@@ -345,6 +490,44 @@
 			__entry->highest_slotid
 		)
 );
+
+TRACE_EVENT(nfs4_cb_seqid_err,
+		TP_PROTO(
+			const struct cb_sequenceargs *args,
+			__be32 status
+		),
+		TP_ARGS(args, status),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, session)
+			__field(unsigned int, slot_nr)
+			__field(unsigned int, seq_nr)
+			__field(unsigned int, highest_slotid)
+			__field(unsigned int, cachethis)
+			__field(unsigned long, error)
+		),
+
+		TP_fast_assign(
+			__entry->session = nfs_session_id_hash(&args->csa_sessionid);
+			__entry->slot_nr = args->csa_slotid;
+			__entry->seq_nr = args->csa_sequenceid;
+			__entry->highest_slotid = args->csa_highestslotid;
+			__entry->cachethis = args->csa_cachethis;
+			__entry->error = be32_to_cpu(status);
+		),
+
+		TP_printk(
+			"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+			"highest_slotid=%u",
+			-__entry->error,
+			show_nfsv4_errors(__entry->error),
+			__entry->session,
+			__entry->slot_nr,
+			__entry->seq_nr,
+			__entry->highest_slotid
+		)
+);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 TRACE_EVENT(nfs4_setup_sequence,
@@ -379,6 +562,42 @@
 		)
 );
 
+TRACE_EVENT(nfs4_xdr_status,
+		TP_PROTO(
+			const struct xdr_stream *xdr,
+			u32 op,
+			int error
+		),
+
+		TP_ARGS(xdr, op, error),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, task_id)
+			__field(unsigned int, client_id)
+			__field(u32, xid)
+			__field(u32, op)
+			__field(unsigned long, error)
+		),
+
+		TP_fast_assign(
+			const struct rpc_rqst *rqstp = xdr->rqst;
+			const struct rpc_task *task = rqstp->rq_task;
+
+			__entry->task_id = task->tk_pid;
+			__entry->client_id = task->tk_client->cl_clid;
+			__entry->xid = be32_to_cpu(rqstp->rq_xid);
+			__entry->op = op;
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"task:%u@%d xid=0x%08x error=%ld (%s) operation=%u",
+			__entry->task_id, __entry->client_id, __entry->xid,
+			-__entry->error, show_nfsv4_errors(__entry->error),
+			__entry->op
+		)
+);
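nfs4_xdr_status ties an XDR decode failure to the issuing RPC task and xid, which makes wire-level problems debuggable without a packet capture. A hedged sketch of a call site (the real hooks added by this patch sit in the fs/nfs/nfs4xdr.c decode helpers; example_decode_op and nfserr are illustrative):

	/* Hypothetical decode helper: emit the tracepoint and bail on failure. */
	static int example_decode_op(struct xdr_stream *xdr, u32 op, int nfserr)
	{
		if (nfserr) {
			trace_nfs4_xdr_status(xdr, op, nfserr);
			return nfserr;
		}
		return 0;
	}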
+
 DECLARE_EVENT_CLASS(nfs4_open_event,
 		TP_PROTO(
 			const struct nfs_open_context *ctx,
@@ -389,7 +608,7 @@
 		TP_ARGS(ctx, flags, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(unsigned int, flags)
 			__field(unsigned int, fmode)
 			__field(dev_t, dev)
@@ -407,7 +626,7 @@
 			const struct nfs4_state *state = ctx->state;
 			const struct inode *inode = NULL;
 
-			__entry->error = error;
+			__entry->error = -error;
 			__entry->flags = flags;
 			__entry->fmode = (__force unsigned int)ctx->mode;
 			__entry->dev = ctx->dentry->d_sb->s_dev;
@@ -439,11 +658,11 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) flags=%d (%s) fmode=%s "
+			"error=%ld (%s) flags=%d (%s) fmode=%s "
 			"fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
 			"openstateid=%d:0x%08x",
-			 __entry->error,
+			 -__entry->error,
 			 show_nfsv4_errors(__entry->error),
 			 __entry->flags,
 			 show_open_flags(__entry->flags),
@@ -525,7 +744,7 @@
 			__field(u32, fhandle)
 			__field(u64, fileid)
 			__field(unsigned int, fmode)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 		),
@@ -545,9 +764,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
+			"error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu "
 			"fhandle=0x%08x openstateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			__entry->fmode ?  show_fmode_flags(__entry->fmode) :
 					  "closed",
@@ -558,6 +777,13 @@
 		)
 );
 
+TRACE_DEFINE_ENUM(F_GETLK);
+TRACE_DEFINE_ENUM(F_SETLK);
+TRACE_DEFINE_ENUM(F_SETLKW);
+TRACE_DEFINE_ENUM(F_RDLCK);
+TRACE_DEFINE_ENUM(F_WRLCK);
+TRACE_DEFINE_ENUM(F_UNLCK);
+
 #define show_lock_cmd(type) \
 	__print_symbolic((int)type, \
 		{ F_GETLK, "GETLK" }, \
@@ -580,7 +806,7 @@
 		TP_ARGS(request, state, cmd, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, cmd)
 			__field(char, type)
 			__field(loff_t, start)
@@ -610,10 +836,10 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) cmd=%s:%s range=%lld:%lld "
+			"error=%ld (%s) cmd=%s:%s range=%lld:%lld "
 			"fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			show_lock_cmd(__entry->cmd),
 			show_lock_type(__entry->type),
@@ -650,7 +876,7 @@
 		TP_ARGS(request, state, lockstateid, cmd, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, cmd)
 			__field(char, type)
 			__field(loff_t, start)
@@ -686,10 +912,10 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) cmd=%s:%s range=%lld:%lld "
+			"error=%ld (%s) cmd=%s:%s range=%lld:%lld "
 			"fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"stateid=%d:0x%08x lockstateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			show_lock_cmd(__entry->cmd),
 			show_lock_type(__entry->type),
@@ -755,7 +981,7 @@
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
 			__field(u32, fhandle)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 		),
@@ -771,9 +997,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+			"error=%ld (%s) dev=%02x:%02x fhandle=0x%08x "
 			"stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			__entry->fhandle,
@@ -792,7 +1018,7 @@
 		TP_ARGS(state, lsp, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
@@ -814,9 +1040,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -849,7 +1075,7 @@
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(u64, dir)
 			__string(name, name->name)
 		),
@@ -857,13 +1083,13 @@
 		TP_fast_assign(
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
-			__entry->error = error;
+			__entry->error = -error;
 			__assign_str(name, name->name);
 		),
 
 		TP_printk(
-			"error=%d (%s) name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) name=%02x:%02x:%llu/%s",
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->dir,
@@ -899,7 +1125,7 @@
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
 			__field(u64, ino)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -909,8 +1135,8 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) inode=%02x:%02x:%llu",
-			__entry->error,
+			"error=%ld (%s) inode=%02x:%02x:%llu",
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->ino
@@ -930,7 +1156,7 @@
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(u64, olddir)
 			__string(oldname, oldname->name)
 			__field(u64, newdir)
@@ -947,9 +1173,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) oldname=%02x:%02x:%llu/%s "
+			"error=%ld (%s) oldname=%02x:%02x:%llu/%s "
 			"newname=%02x:%02x:%llu/%s",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->olddir,
@@ -972,19 +1198,19 @@
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
-			__entry->error,
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1023,7 +1249,7 @@
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 		),
@@ -1040,9 +1266,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1080,7 +1306,7 @@
 			__field(u32, fhandle)
 			__field(u64, fileid)
 			__field(unsigned int, valid)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -1092,9 +1318,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"valid=%s",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1127,7 +1353,7 @@
 		TP_ARGS(clp, fhandle, inode, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
@@ -1148,9 +1374,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"dstaddr=%s",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1182,7 +1408,7 @@
 		TP_ARGS(clp, fhandle, inode, stateid, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
@@ -1209,9 +1435,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"stateid=%d:0x%08x dstaddr=%s",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1245,7 +1471,7 @@
 		TP_ARGS(name, len, id, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(u32, id)
 			__dynamic_array(char, name, len > 0 ? len + 1 : 1)
 		),
@@ -1260,8 +1486,8 @@
 		),
 
 		TP_printk(
-			"error=%d id=%u name=%s",
-			__entry->error,
+			"error=%ld (%s) id=%u name=%s",
+			-__entry->error, show_nfsv4_errors(__entry->error),
 			__entry->id,
 			__get_str(name)
 		)
@@ -1294,7 +1520,7 @@
 			__field(u64, fileid)
 			__field(loff_t, offset)
 			__field(size_t, count)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 		),
@@ -1308,7 +1534,7 @@
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
 			__entry->offset = hdr->args.offset;
 			__entry->count = hdr->args.count;
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__entry->stateid_seq =
 				be32_to_cpu(state->stateid.seqid);
 			__entry->stateid_hash =
@@ -1316,9 +1542,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"offset=%lld count=%zu stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1354,7 +1580,7 @@
 			__field(u64, fileid)
 			__field(loff_t, offset)
 			__field(size_t, count)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 		),
@@ -1368,7 +1594,7 @@
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
 			__entry->offset = hdr->args.offset;
 			__entry->count = hdr->args.count;
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__entry->stateid_seq =
 				be32_to_cpu(state->stateid.seqid);
 			__entry->stateid_hash =
@@ -1376,9 +1602,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"offset=%lld count=%zu stateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1415,7 +1641,7 @@
 			__field(u64, fileid)
 			__field(loff_t, offset)
 			__field(size_t, count)
-			__field(int, error)
+			__field(unsigned long, error)
 		),
 
 		TP_fast_assign(
@@ -1429,9 +1655,9 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"offset=%lld count=%zu",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1451,6 +1677,10 @@
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
 
+TRACE_DEFINE_ENUM(IOMODE_READ);
+TRACE_DEFINE_ENUM(IOMODE_RW);
+TRACE_DEFINE_ENUM(IOMODE_ANY);
+
 #define show_pnfs_iomode(iomode) \
 	__print_symbolic(iomode, \
 		{ IOMODE_READ, "READ" }, \
@@ -1475,7 +1705,7 @@
 			__field(u32, iomode)
 			__field(u64, offset)
 			__field(u64, count)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(int, stateid_seq)
 			__field(u32, stateid_hash)
 			__field(int, layoutstateid_seq)
@@ -1508,10 +1738,10 @@
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
 			"layoutstateid=%d:0x%08x",
-			__entry->error,
+			-__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
@@ -1528,6 +1758,21 @@
 DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_MDSTHRESH);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NOMEM);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BULK_RECALL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETURN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT);
+
 #define show_pnfs_update_layout_reason(reason)				\
 	__print_symbolic(reason,					\
 		{ PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" },		\
@@ -1542,7 +1787,8 @@
 		{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },	\
 		{ PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },	\
 		{ PNFS_UPDATE_LAYOUT_RETRY, "retrying" },	\
-		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \
+		{ PNFS_UPDATE_LAYOUT_EXIT, "exit" })
 
 TRACE_EVENT(pnfs_update_layout,
 		TP_PROTO(struct inode *inode,
@@ -1601,6 +1847,78 @@
 		)
 );
 
+DECLARE_EVENT_CLASS(pnfs_layout_event,
+		TP_PROTO(struct inode *inode,
+			loff_t pos,
+			u64 count,
+			enum pnfs_iomode iomode,
+			struct pnfs_layout_hdr *lo,
+			struct pnfs_layout_segment *lseg
+		),
+		TP_ARGS(inode, pos, count, iomode, lo, lseg),
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u32, fhandle)
+			__field(loff_t, pos)
+			__field(u64, count)
+			__field(enum pnfs_iomode, iomode)
+			__field(int, layoutstateid_seq)
+			__field(u32, layoutstateid_hash)
+			__field(long, lseg)
+		),
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->pos = pos;
+			__entry->count = count;
+			__entry->iomode = iomode;
+			if (lo != NULL) {
+				__entry->layoutstateid_seq =
+				be32_to_cpu(lo->plh_stateid.seqid);
+				__entry->layoutstateid_hash =
+				nfs_stateid_hash(&lo->plh_stateid);
+			} else {
+				__entry->layoutstateid_seq = 0;
+				__entry->layoutstateid_hash = 0;
+			}
+			__entry->lseg = (long)lseg;
+		),
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"iomode=%s pos=%llu count=%llu "
+			"layoutstateid=%d:0x%08x lseg=0x%lx",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_pnfs_iomode(__entry->iomode),
+			(unsigned long long)__entry->pos,
+			(unsigned long long)__entry->count,
+			__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+			__entry->lseg
+		)
+);
+
+#define DEFINE_PNFS_LAYOUT_EVENT(name) \
+	DEFINE_EVENT(pnfs_layout_event, name, \
+		TP_PROTO(struct inode *inode, \
+			loff_t pos, \
+			u64 count, \
+			enum pnfs_iomode iomode, \
+			struct pnfs_layout_hdr *lo, \
+			struct pnfs_layout_segment *lseg \
+		), \
+		TP_ARGS(inode, pos, count, iomode, lo, lseg))
+
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* _TRACE_NFS4_H */
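
The recurring change in the nfs4trace.h hunks above stores the NFSv4 error as an unsigned long magnitude (`error < 0 ? -error : 0`) and re-negates it in TP_printk, so the ring buffer never holds a sign-extended value while the printed line still reads as a negative errno. A minimal sketch of the convention follows; the event name is hypothetical and not part of the patch, while nfs_fhandle_hash() and show_nfsv4_errors() are helpers this header already uses:

    /* Hedged sketch only: "nfs4_example_done" is illustrative. */
    TRACE_EVENT(nfs4_example_done,
    		TP_PROTO(const struct inode *inode, int error),
    		TP_ARGS(inode, error),
    		TP_STRUCT__entry(
    			__field(u32, fhandle)
    			__field(unsigned long, error)	/* errno magnitude, or 0 */
    		),
    		TP_fast_assign(
    			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
    			/* store -errno as a positive value; success stays 0 */
    			__entry->error = error < 0 ? -error : 0;
    		),
    		TP_printk("error=%ld (%s) fhandle=0x%08x",
    			-__entry->error,	/* print back as a negative errno */
    			show_nfsv4_errors(__entry->error),
    			__entry->fhandle)
    );
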
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b7bde12..ab07db0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -54,6 +54,7 @@
 #include <linux/nfs_fs.h>
 
 #include "nfs4_fs.h"
+#include "nfs4trace.h"
 #include "internal.h"
 #include "nfs4idmap.h"
 #include "nfs4session.h"
@@ -214,14 +215,14 @@
 				 nfs4_fattr_bitmap_maxsz)
 #define encode_read_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 3)
-#define decode_read_maxsz	(op_decode_hdr_maxsz + 2)
+#define decode_read_maxsz	(op_decode_hdr_maxsz + 2 + 1)
 #define encode_readdir_maxsz	(op_encode_hdr_maxsz + \
 				 2 + encode_verifier_maxsz + 5 + \
 				nfs4_label_maxsz)
 #define decode_readdir_maxsz	(op_decode_hdr_maxsz + \
-				 decode_verifier_maxsz)
+				 decode_verifier_maxsz + 1)
 #define encode_readlink_maxsz	(op_encode_hdr_maxsz)
-#define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1)
+#define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1 + 1)
 #define encode_write_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 4)
 #define decode_write_maxsz	(op_decode_hdr_maxsz + \
@@ -283,14 +284,14 @@
 #define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
 #define encode_getacl_maxsz	(encode_getattr_maxsz)
 #define decode_getacl_maxsz	(op_decode_hdr_maxsz + \
-				 nfs4_fattr_bitmap_maxsz + 1)
+				 nfs4_fattr_bitmap_maxsz + 1 + 1)
 #define encode_setacl_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 3)
 #define decode_setacl_maxsz	(decode_setattr_maxsz)
 #define encode_fs_locations_maxsz \
 				(encode_getattr_maxsz)
 #define decode_fs_locations_maxsz \
-				(0)
+				(1)
 #define encode_secinfo_maxsz	(op_encode_hdr_maxsz + nfs4_name_maxsz)
 #define decode_secinfo_maxsz	(op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
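
Several decode_*_maxsz values above (and more later in this file) gain a trailing `+ 1`. The reason is XDR padding: variable-length opaque results are rounded up to a 4-byte boundary, and when the payload is steered into pages the reply buffer still has to reserve that one padding word. The accounting, as an illustrative macro that is not itself defined by this patch:

    /* Illustrative only. */
    #define decode_opaque_into_pages_maxsz(datalen) \
    	(1 /* length word */ + \
    	 XDR_QUADLEN(datalen) /* payload, in 4-byte words */ + \
    	 1 /* possible XDR padding word */)
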
 
@@ -391,12 +392,13 @@
 				1 /* opaque devaddr4 length */ + \
 				  /* devaddr4 payload is read into page */ \
 				1 /* notification bitmap length */ + \
-				1 /* notification bitmap, word 0 */)
+				1 /* notification bitmap, word 0 */ + \
+				1 /* possible XDR padding */)
 #define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
 				encode_stateid_maxsz)
 #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
 				decode_stateid_maxsz + \
-				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
 #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz +          \
 				2 /* offset */ + \
 				2 /* length */ + \
@@ -835,6 +837,7 @@
 #define NFS4_dec_sequence_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_sequence_maxsz)
+#endif
 #define NFS4_enc_get_lease_time_sz	(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_putrootfh_maxsz + \
@@ -843,6 +846,7 @@
 					 decode_sequence_maxsz + \
 					 decode_putrootfh_maxsz + \
 					 decode_fsinfo_maxsz)
+#if defined(CONFIG_NFS_V4_1)
 #define NFS4_enc_reclaim_complete_sz	(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_reclaim_complete_maxsz)
@@ -1015,12 +1019,11 @@
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
 
 	/* initialize running count of expected bytes in reply.
 	 * NOTE: the replied tag SHOULD be the same as the one sent,
 	 * but this is not required as a MUST for the server to do so. */
-	hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+	hdr->replen = 3 + hdr->taglen;
 
 	WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
 	encode_string(xdr, hdr->taglen, hdr->tag);
@@ -1171,7 +1174,7 @@
 		} else
 			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
-	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+	if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
 		*p++ = cpu_to_be32(label->lfs);
 		*p++ = cpu_to_be32(label->pi);
 		*p++ = cpu_to_be32(label->len);
@@ -2340,9 +2343,9 @@
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	if (args->lg_args) {
 		encode_layoutget(xdr, args->lg_args, &hdr);
-		xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-				 args->lg_args->layout.pages,
-				 0, args->lg_args->layout.pglen);
+		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+					args->lg_args->layout.pglen,
+					hdr.replen);
 	}
 	encode_nops(&hdr);
 }
@@ -2386,9 +2389,9 @@
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	if (args->lg_args) {
 		encode_layoutget(xdr, args->lg_args, &hdr);
-		xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-				 args->lg_args->layout.pages,
-				 0, args->lg_args->layout.pglen);
+		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+					args->lg_args->layout.pglen,
+					hdr.replen);
 	}
 	encode_nops(&hdr);
 }
@@ -2498,8 +2501,8 @@
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_readlink(xdr, args, req, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-			args->pgbase, args->pglen);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, hdr.replen);
 	encode_nops(&hdr);
 }
 
@@ -2519,11 +2522,8 @@
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_readdir(xdr, args, req, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-			 args->pgbase, args->count);
-	dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
-			__func__, hdr.replen << 2, args->pages,
-			args->pgbase, args->count);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr.replen);
 	encode_nops(&hdr);
 }
 
@@ -2543,8 +2543,8 @@
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_read(xdr, args, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-			 args->pages, args->pgbase, args->count);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr.replen);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
 }
@@ -2590,9 +2590,8 @@
 	encode_getattr(xdr, nfs4_acl_bitmap, NULL,
 			ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-		args->acl_pages, 0, args->acl_len);
-
+	rpc_prepare_reply_pages(req, args->acl_pages, 0,
+				args->acl_len, replen + 1);
 	encode_nops(&hdr);
 }
 
@@ -2813,9 +2812,8 @@
 		encode_fs_locations(xdr, args->bitmask, &hdr);
 	}
 
-	/* Set up reply kvec to capture returned fs_locations array. */
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-			 (struct page **)&args->page, 0, PAGE_SIZE);
+	rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+				PAGE_SIZE, replen + 1);
 	encode_nops(&hdr);
 }
 
@@ -2961,6 +2959,8 @@
 	encode_nops(&hdr);
 }
 
+#endif
+
 /*
  * a GET_LEASE_TIME request
  */
@@ -2981,6 +2981,8 @@
 	encode_nops(&hdr);
 }
 
+#ifdef CONFIG_NFS_V4_1
+
 /*
  * a RECLAIM_COMPLETE request
  */
@@ -3017,10 +3019,8 @@
 
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
-	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
-			 args->pdev->pages, args->pdev->pgbase,
-			 args->pdev->pglen);
-
+	rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+				args->pdev->pglen, hdr.replen - 2);
 	encode_nops(&hdr);
 }
 
@@ -3041,9 +3041,8 @@
 	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
 	encode_layoutget(xdr, args, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-	    args->layout.pages, 0, args->layout.pglen);
-
+	rpc_prepare_reply_pages(req, args->layout.pages, 0,
+				args->layout.pglen, hdr.replen);
 	encode_nops(&hdr);
 }
 
@@ -3144,22 +3143,12 @@
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
-}
-
 static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string,
 			NFS4_OPAQUE_LIMIT);
-	if (unlikely(ret < 0)) {
-		if (ret == -EBADMSG)
-			print_overflow_msg(__func__, xdr);
+	if (unlikely(ret < 0))
 		return -EIO;
-	}
 	*len = ret;
 	return 0;
 }
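
Most of the remaining hunks in this file repeat one mechanical conversion: xdr_inline_decode() returns NULL once the receive buffer is exhausted, and the decoder now returns -EIO directly instead of jumping to an out_overflow label whose only extra work was the deleted print_overflow_msg() dprintk. The shape, as a self-contained sketch with a hypothetical function name:

    static int decode_u32_example(struct xdr_stream *xdr, u32 *res)
    {
    	/* NULL means fewer than 4 bytes remain in the receive buffer */
    	__be32 *p = xdr_inline_decode(xdr, 4);

    	if (unlikely(!p))
    		return -EIO;	/* was: goto out_overflow */
    	*res = be32_to_cpup(p);
    	return 0;
    }
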
@@ -3170,22 +3159,19 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	hdr->status = be32_to_cpup(p++);
 	hdr->taglen = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, hdr->taglen + 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
 	hdr->nops = be32_to_cpup(p);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
@@ -3201,11 +3187,14 @@
 	opnum = be32_to_cpup(p++);
 	if (unlikely(opnum != expected))
 		goto out_bad_operation;
+	if (unlikely(*p != cpu_to_be32(NFS_OK)))
+		goto out_status;
+	*nfs_retval = 0;
+	return true;
+out_status:
 	nfserr = be32_to_cpup(p);
-	if (nfserr == NFS_OK)
-		*nfs_retval = 0;
-	else
-		*nfs_retval = nfs4_stat_to_errno(nfserr);
+	trace_nfs4_xdr_status(xdr, opnum, nfserr);
+	*nfs_retval = nfs4_stat_to_errno(nfserr);
 	return true;
 out_bad_operation:
 	dprintk("nfs: Server returned operation"
@@ -3214,7 +3203,6 @@
 	*nfs_retval = -EREMOTEIO;
 	return false;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	*nfs_retval = -EIO;
 	return false;
 }
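
The rewritten op header check above does two things: the common success case compares the raw wire word against a pre-byteswapped constant, so be32_to_cpup() only runs on the error path, and every non-OK status now fires the new xdr status tracepoint before being mapped to an errno. The endianness trick in isolation (helper name is illustrative):

    static inline bool xdr_word_is_nfs_ok(const __be32 *p)
    {
    	/* cpu_to_be32(NFS_OK) is a compile-time constant, so the hot
    	 * path is a single load-and-compare with no byte swap. */
    	return *p == cpu_to_be32(NFS_OK);
    }
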
@@ -3235,10 +3223,9 @@
 	char *str;
 
 	p = xdr_inline_decode(xdr, 12);
-	if (likely(p))
-		return decode_opaque_inline(xdr, &strlen, &str);
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	if (unlikely(!p))
+		return -EIO;
+	return decode_opaque_inline(xdr, &strlen, &str);
 }
 
 static ssize_t
@@ -3249,10 +3236,9 @@
 	ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
 	if (likely(ret >= 0))
 		return ret;
-	if (ret == -EMSGSIZE)
-		return sz;
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	if (ret != -EMSGSIZE)
+		return -EIO;
+	return sz;
 }
 
 static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -3268,13 +3254,10 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	*attrlen = be32_to_cpup(p);
 	*savep = xdr_stream_pos(xdr);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -3303,7 +3286,7 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*type = be32_to_cpup(p);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
@@ -3314,9 +3297,6 @@
 	}
 	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
@@ -3330,15 +3310,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*type = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
 	}
 	dprintk("%s: expire type=0x%x\n", __func__, *type);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -3352,7 +3329,7 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
@@ -3360,9 +3337,6 @@
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -3376,16 +3350,13 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3398,15 +3369,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3419,15 +3387,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -3442,7 +3407,7 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
 		p = xdr_inline_decode(xdr, 16);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		p = xdr_decode_hyper(p, &fsid->major);
 		xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -3452,9 +3417,6 @@
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3467,15 +3429,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
-	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
+	dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
@@ -3487,14 +3446,11 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
 		*res = -be32_to_cpup(p);
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
@@ -3516,7 +3472,7 @@
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
 	__be32 *p;
-	int len;
+	u32 len;
 
 	if (fh != NULL)
 		memset(fh, 0, sizeof(*fh));
@@ -3526,13 +3482,13 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		len = be32_to_cpup(p);
 		if (len > NFS4_FHSIZE)
 			return -EIO;
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		if (fh != NULL) {
 			memcpy(fh->data, p, len);
 			fh->size = len;
@@ -3540,9 +3496,6 @@
 		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3555,15 +3508,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3577,16 +3527,13 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3600,16 +3547,13 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3623,15 +3567,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3645,15 +3586,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3667,15 +3605,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -3686,7 +3621,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	n = be32_to_cpup(p);
 	if (n == 0)
 		goto root_path;
@@ -3718,9 +3653,6 @@
 	dprintk(" status %d", status);
 	status = -EIO;
 	goto out;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -3745,7 +3677,7 @@
 		goto out;
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		goto out_eio;
 	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
@@ -3758,7 +3690,7 @@
 		loc = &res->locations[res->nlocations];
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			goto out_eio;
 		m = be32_to_cpup(p);
 
 		dprintk("%s: servers:\n", __func__);
@@ -3796,8 +3728,6 @@
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
 out_eio:
 	status = -EIO;
 	goto out;
@@ -3814,15 +3744,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -3836,15 +3763,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*maxlink = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -3858,15 +3782,12 @@
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*maxname = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3881,7 +3802,7 @@
 		uint64_t maxread;
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
@@ -3890,9 +3811,6 @@
 	}
 	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3907,7 +3825,7 @@
 		uint64_t maxwrite;
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
@@ -3916,9 +3834,6 @@
 	}
 	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3933,7 +3848,7 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		tmp = be32_to_cpup(p);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3941,9 +3856,6 @@
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3957,16 +3869,13 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		*nlink = be32_to_cpup(p);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
@@ -4011,10 +3920,9 @@
 		return NFS_ATTR_FATTR_OWNER;
 	}
 out:
-	if (len != -EBADMSG)
-		return 0;
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	if (len == -EBADMSG)
+		return -EIO;
+	return 0;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4046,10 +3954,9 @@
 		return NFS_ATTR_FATTR_GROUP;
 	}
 out:
-	if (len != -EBADMSG)
-		return 0;
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	if (len == -EBADMSG)
+		return -EIO;
+	return 0;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -4066,7 +3973,7 @@
 
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		major = be32_to_cpup(p++);
 		minor = be32_to_cpup(p);
 		tmp = MKDEV(major, minor);
@@ -4077,9 +3984,6 @@
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4093,15 +3997,12 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4115,15 +4016,12 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4137,15 +4035,12 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -4159,7 +4054,7 @@
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -4167,9 +4062,6 @@
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
 	return ret;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static __be32 *
@@ -4189,12 +4081,9 @@
 
 	p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	xdr_decode_nfstime4(p, time);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4265,19 +4154,19 @@
 	if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		lfs = be32_to_cpup(p++);
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		pi = be32_to_cpup(p++);
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		len = be32_to_cpup(p++);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		if (len < NFS4_MAXLABELLEN) {
 			if (label) {
 				memcpy(label->label, p, len);
@@ -4295,10 +4184,6 @@
 		dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
 			(char *)label->label, label->len, label->pi, label->lfs);
 	return status;
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4342,14 +4227,11 @@
 
 	p = xdr_inline_decode(xdr, 20);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
 	xdr_decode_hyper(p, &cinfo->after);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
@@ -4363,24 +4245,19 @@
 		return status;
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p);
 	*supported = supp;
 	*access = acc;
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
 {
 	ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
-	if (unlikely(ret < 0)) {
-		print_overflow_msg(__func__, xdr);
+	if (unlikely(ret < 0))
 		return -EIO;
-	}
 	return 0;
 }
 
@@ -4460,13 +4337,11 @@
 		return status;
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -4574,13 +4449,10 @@
 	if (likely(bitmap[0] & hint_bit)) {
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		xdr_decode_hyper(p, res);
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_first_threshold_item4(struct xdr_stream *xdr,
@@ -4593,10 +4465,8 @@
 
 	/* layout type */
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p)) {
-		print_overflow_msg(__func__, xdr);
+	if (unlikely(!p))
 		return -EIO;
-	}
 	res->l_type = be32_to_cpup(p);
 
 	/* thi_hintset bitmap */
@@ -4654,7 +4524,7 @@
 			return -EREMOTEIO;
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		num = be32_to_cpup(p);
 		if (num == 0)
 			return 0;
@@ -4667,9 +4537,6 @@
 		bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
 	}
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4857,7 +4724,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	fsinfo->nlayouttypes = be32_to_cpup(p);
 
 	/* pNFS is not supported by the underlying file system */
@@ -4867,7 +4734,7 @@
 	/* Decode and set first layout type, move xdr->p past unused types */
 	p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 
 	/* If we get too many, then just cap it at the max */
 	if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
@@ -4879,9 +4746,6 @@
 	for(i = 0; i < fsinfo->nlayouttypes; ++i)
 		fsinfo->layouttype[i] = be32_to_cpup(p++);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 /*
@@ -4915,10 +4779,8 @@
 	*res = 0;
 	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p)) {
-			print_overflow_msg(__func__, xdr);
+		if (unlikely(!p))
 			return -EIO;
-		}
 		*res = be32_to_cpup(p);
 		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
 	}
@@ -4937,10 +4799,8 @@
 	*res = 0;
 	if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p)) {
-			print_overflow_msg(__func__, xdr);
+		if (unlikely(!p))
 			return -EIO;
-		}
 		*res = be32_to_cpup(p);
 		bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
 	}
@@ -5016,19 +4876,16 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	len = be32_to_cpup(p);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
 	p = xdr_inline_decode(xdr, len);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	memcpy(fh->data, p, len);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5052,7 +4909,7 @@
 
 	p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
 	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++); /* 4 byte read */
@@ -5069,11 +4926,9 @@
 	p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
 	namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
 	p = xdr_inline_decode(xdr, namelen); /* variable size field */
-	if (likely(p))
-		return -NFS4ERR_DENIED;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	if (likely(!p))
+		return -EIO;
+	return -NFS4ERR_DENIED;
 }
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -5142,7 +4997,7 @@
 
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case NFS4_LIMIT_SIZE:
@@ -5156,9 +5011,6 @@
 	maxsize >>= PAGE_SHIFT;
 	*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_rw_delegation(struct xdr_stream *xdr,
@@ -5173,7 +5025,7 @@
 		return status;
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->do_recall = be32_to_cpup(p);
 
 	switch (delegation_type) {
@@ -5186,9 +5038,6 @@
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL);
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5198,7 +5047,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	why_no_delegation = be32_to_cpup(p);
 	switch (why_no_delegation) {
 		case WND4_CONTENTION:
@@ -5207,9 +5056,6 @@
 			/* Ignore for now */
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5219,7 +5065,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	delegation_type = be32_to_cpup(p);
 	res->delegation_type = 0;
 	switch (delegation_type) {
@@ -5232,9 +5078,6 @@
 		return decode_no_delegation(xdr, res);
 	}
 	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5256,7 +5099,7 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->rflags = be32_to_cpup(p++);
 	bmlen = be32_to_cpup(p);
 	if (bmlen > 10)
@@ -5264,7 +5107,7 @@
 
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
 		res->attrset[i] = be32_to_cpup(p++);
@@ -5275,9 +5118,6 @@
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
 	return -EIO;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -5326,7 +5166,7 @@
 		return status;
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
 	recvd = xdr_read_pages(xdr, count);
@@ -5339,9 +5179,6 @@
 	res->eof = eof;
 	res->count = count;
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -5374,7 +5211,7 @@
 	/* Convert length of symlink */
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	len = be32_to_cpup(p);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
@@ -5395,9 +5232,6 @@
 	 */
 	xdr_terminate_string(rcvbuf, len);
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5500,7 +5334,6 @@
 		return status;
 	if (decode_bitmap4(xdr, NULL, 0) >= 0)
 		return 0;
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -5512,7 +5345,7 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
@@ -5523,7 +5356,7 @@
 	if (nfserr == NFS_OK) {
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		p = xdr_decode_hyper(p, &res->clientid);
 		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
@@ -5532,28 +5365,25 @@
 		/* skip netid string */
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 
 		/* skip uaddr string */
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		return -NFSERR_CLID_INUSE;
 	} else
 		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -5572,13 +5402,10 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
 	return decode_write_verifier(xdr, &res->verf->verifier);
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_delegreturn(struct xdr_stream *xdr)
@@ -5594,30 +5421,24 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	oid_len = be32_to_cpup(p);
 	if (oid_len > GSS_OID_MAX_LEN)
-		goto out_err;
+		return -EINVAL;
 
 	p = xdr_inline_decode(xdr, oid_len);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	memcpy(flavor->flavor_info.oid.data, p, oid_len);
 	flavor->flavor_info.oid.len = oid_len;
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	flavor->flavor_info.qop = be32_to_cpup(p++);
 	flavor->flavor_info.service = be32_to_cpup(p);
 
 	return 0;
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
-out_err:
-	return -EINVAL;
 }
 
 static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5629,7 +5450,7 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 
 	res->flavors->num_flavors = 0;
 	num_flavors = be32_to_cpup(p);
@@ -5641,7 +5462,7 @@
 
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		sec_flavor->flavor = be32_to_cpup(p);
 
 		if (sec_flavor->flavor == RPC_AUTH_GSS) {
@@ -5655,9 +5476,6 @@
 	status = 0;
 out:
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5711,11 +5529,11 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	xdr_decode_hyper(p, &res->clientid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->seqid = be32_to_cpup(p++);
 	res->flags = be32_to_cpup(p++);
 
@@ -5739,7 +5557,7 @@
 	/* server_owner4.so_minor_id */
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	p = xdr_decode_hyper(p, &res->server_owner->minor_id);
 
 	/* server_owner4.so_major_id */
@@ -5759,7 +5577,7 @@
 	/* Implementation Id */
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	impl_id_count = be32_to_cpup(p++);
 
 	if (impl_id_count) {
@@ -5778,16 +5596,13 @@
 		/* nii_date */
 		p = xdr_inline_decode(xdr, 12);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 		p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
 		res->impl_id->date.nseconds = be32_to_cpup(p);
 
 		/* if there's more than one entry, ignore the rest */
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -5798,7 +5613,7 @@
 
 	p = xdr_inline_decode(xdr, 28);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	val = be32_to_cpup(p++);	/* headerpadsz */
 	if (val)
 		return -EINVAL;		/* no support for header padding yet */
@@ -5816,12 +5631,9 @@
 	if (nr_attrs == 1) {
 		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -5844,7 +5656,7 @@
 	/* dir flags, rdma mode bool */
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 
 	res->dir = be32_to_cpup(p++);
 	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
@@ -5855,9 +5667,6 @@
 		res->use_conn_in_rdma_mode = true;
 
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_create_session(struct xdr_stream *xdr,
@@ -5875,7 +5684,7 @@
 	/* seqid, flags */
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->seqid = be32_to_cpup(p++);
 	res->flags = be32_to_cpup(p);
 
@@ -5884,9 +5693,6 @@
 	if (!status)
 		status = decode_chan_attrs(xdr, &res->bc_attrs);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -5967,7 +5773,6 @@
 	res->sr_status = status;
 	return status;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	status = -EIO;
 	goto out_err;
 #else  /* CONFIG_NFS_V4_1 */
@@ -5995,7 +5800,7 @@
 		if (status == -ETOOSMALL) {
 			p = xdr_inline_decode(xdr, 4);
 			if (unlikely(!p))
-				goto out_overflow;
+				return -EIO;
 			pdev->mincount = be32_to_cpup(p);
 			dprintk("%s: Min count too small. mincnt = %u\n",
 				__func__, pdev->mincount);
@@ -6005,7 +5810,7 @@
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	type = be32_to_cpup(p++);
 	if (type != pdev->layout_type) {
 		dprintk("%s: layout mismatch req: %u pdev: %u\n",
@@ -6019,19 +5824,19 @@
 	 */
 	pdev->mincount = be32_to_cpup(p);
 	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
-		goto out_overflow;
+		return -EIO;
 
 	/* Parse notification bitmap, verifying that it is zero. */
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	len = be32_to_cpup(p);
 	if (len) {
 		uint32_t i;
 
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 
 		res->notification = be32_to_cpup(p++);
 		for (i = 1; i < len; i++) {
@@ -6043,9 +5848,6 @@
 		}
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
@@ -6115,7 +5917,6 @@
 	res->status = status;
 	return status;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	status = -EIO;
 	goto out;
 }
@@ -6131,16 +5932,13 @@
 		return status;
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->lrs_present = be32_to_cpup(p);
 	if (res->lrs_present)
 		status = decode_layout_stateid(xdr, &res->stateid);
 	else
 		nfs4_stateid_copy(&res->stateid, &invalid_stateid);
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_layoutcommit(struct xdr_stream *xdr,
@@ -6158,19 +5956,16 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	sizechanged = be32_to_cpup(p);
 
 	if (sizechanged) {
 		/* throw away new size */
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EIO;
 	}
 	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 
 static int decode_test_stateid(struct xdr_stream *xdr,
@@ -6186,21 +5981,17 @@
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	num_res = be32_to_cpup(p++);
 	if (num_res != 1)
-		goto out;
+		return -EIO;
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EIO;
 	res->status = be32_to_cpup(p++);
 
 	return status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-out:
-	return -EIO;
 }
 
 static int decode_free_stateid(struct xdr_stream *xdr,
@@ -7337,6 +7128,8 @@
 	return status;
 }
 
+#endif
+
 /*
  * Decode GET_LEASE_TIME response
  */
@@ -7358,6 +7151,8 @@
 	return status;
 }
 
+#ifdef CONFIG_NFS_V4_1
+
 /*
  * Decode RECLAIM_COMPLETE response
  */
@@ -7570,11 +7365,11 @@
 	uint64_t new_cookie;
 	__be32 *p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EAGAIN;
 	if (*p == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
-			goto out_overflow;
+			return -EAGAIN;
 		if (*p == xdr_zero)
 			return -EAGAIN;
 		entry->eof = 1;
@@ -7583,13 +7378,13 @@
 
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EAGAIN;
 	p = xdr_decode_hyper(p, &new_cookie);
 	entry->len = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, entry->len);
 	if (unlikely(!p))
-		goto out_overflow;
+		return -EAGAIN;
 	entry->name = (const char *) p;
 
 	/*
@@ -7601,14 +7396,14 @@
 	entry->fattr->valid = 0;
 
 	if (decode_attr_bitmap(xdr, bitmap) < 0)
-		goto out_overflow;
+		return -EAGAIN;
 
 	if (decode_attr_length(xdr, &len, &savep) < 0)
-		goto out_overflow;
+		return -EAGAIN;
 
 	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
 			NULL, entry->label, entry->server) < 0)
-		goto out_overflow;
+		return -EAGAIN;
 	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
 		entry->ino = entry->fattr->mounted_on_fileid;
 	else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
@@ -7622,10 +7417,6 @@
 	entry->cookie = new_cookie;
 
 	return 0;
-
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EAGAIN;
 }
 
 /*
@@ -7770,7 +7561,7 @@
 	PROC41(CREATE_SESSION,	enc_create_session,	dec_create_session),
 	PROC41(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
 	PROC41(SEQUENCE,	enc_sequence,		dec_sequence),
-	PROC41(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
 	PROC41(RECLAIM_COMPLETE,enc_reclaim_complete,	dec_reclaim_complete),
 	PROC41(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo),
 	PROC41(LAYOUTGET,	enc_layoutget,		dec_layoutget),
@@ -7791,6 +7582,7 @@
 	PROC42(COPY,		enc_copy,		dec_copy),
 	PROC42(OFFLOAD_CANCEL,	enc_offload_cancel,	dec_offload_cancel),
 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
+	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
 };
 
 static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
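
Two earlier hunks in this file belong together: encode_compound_hdr() now seeds hdr->replen with only the COMPOUND portion of the reply (status word, tag length word, tag, op count word, hence `3 + hdr->taglen`), and every xdr_inline_pages() call is replaced by rpc_prepare_reply_pages(), which accounts for the transport-level RPC header and auth slack itself. A sketch of the new division of labor; the comments are interpretive, not taken from the patch:

    static void encode_compound_hdr_sketch(struct xdr_stream *xdr,
    					struct rpc_rqst *req,
    					struct compound_hdr *hdr)
    {
    	/* COMPOUND reply prefix only:
    	 *   1 word status + 1 word tag length + tag + 1 word nops.
    	 * RPC_REPHDRSIZE and cr_auth->au_rslack are no longer added
    	 * here; rpc_prepare_reply_pages() covers them later, which is
    	 * why the req->rq_cred dereference could be dropped. */
    	hdr->replen = 3 + hdr->taglen;
    	encode_string(xdr, hdr->taglen, hdr->tag);
    	/* remainder of the encoder is unchanged */
    }
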
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index b60d5fb..a90b363 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -11,3 +11,4 @@
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
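
The new export matters because a tracepoint's static key and probe list live in the module that compiled the trace header with CREATE_TRACE_POINTS, while callers can sit in separately loaded NFS modules. A hedged sketch of the pattern; the exact call signature of the event is outside this excerpt, so it is only referenced in a comment:

    /* Defining module: instantiate and export the tracepoint. */
    #define CREATE_TRACE_POINTS
    #include "nfstrace.h"
    EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);

    /* Any other NFS module may then include nfstrace.h (without
     * CREATE_TRACE_POINTS) and invoke trace_nfs_xdr_status(...)
     * directly, which is what the nfs4xdr.c hunk above relies on. */
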
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index bd60f8d..361cc10 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -11,6 +11,16 @@
 #include <linux/tracepoint.h>
 #include <linux/iversion.h>
 
+TRACE_DEFINE_ENUM(DT_UNKNOWN);
+TRACE_DEFINE_ENUM(DT_FIFO);
+TRACE_DEFINE_ENUM(DT_CHR);
+TRACE_DEFINE_ENUM(DT_DIR);
+TRACE_DEFINE_ENUM(DT_BLK);
+TRACE_DEFINE_ENUM(DT_REG);
+TRACE_DEFINE_ENUM(DT_LNK);
+TRACE_DEFINE_ENUM(DT_SOCK);
+TRACE_DEFINE_ENUM(DT_WHT);
+
 #define nfs_show_file_type(ftype) \
 	__print_symbolic(ftype, \
 			{ DT_UNKNOWN, "UNKNOWN" }, \
@@ -23,25 +33,57 @@
 			{ DT_SOCK, "SOCK" }, \
 			{ DT_WHT, "WHT" })
 
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_DATA);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ATIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACCESS);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACL);
+TRACE_DEFINE_ENUM(NFS_INO_REVAL_PAGECACHE);
+TRACE_DEFINE_ENUM(NFS_INO_REVAL_FORCED);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_LABEL);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_CHANGE);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
+
 #define nfs_show_cache_validity(v) \
 	__print_flags(v, "|", \
-			{ NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \
 			{ NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
 			{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
 			{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
 			{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
 			{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
 			{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
-			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" })
+			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
+			{ NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
+			{ NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \
+			{ NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
+			{ NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
+			{ NFS_INO_INVALID_OTHER, "INVALID_OTHER" })
+
+TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
+TRACE_DEFINE_ENUM(NFS_INO_STALE);
+TRACE_DEFINE_ENUM(NFS_INO_ACL_LRU_SET);
+TRACE_DEFINE_ENUM(NFS_INO_INVALIDATING);
+TRACE_DEFINE_ENUM(NFS_INO_FSCACHE);
+TRACE_DEFINE_ENUM(NFS_INO_FSCACHE_LOCK);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMIT);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMITTING);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTSTATS);
+TRACE_DEFINE_ENUM(NFS_INO_ODIRECT);
 
 #define nfs_show_nfsi_flags(v) \
 	__print_flags(v, "|", \
-			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
-			{ 1 << NFS_INO_STALE, "STALE" }, \
-			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
-			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
-			{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
-			{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
+			{ BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
+			{ BIT(NFS_INO_STALE), "STALE" }, \
+			{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
+			{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
+			{ BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
+			{ BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \
+			{ BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
+			{ BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
+			{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
+			{ BIT(NFS_INO_ODIRECT), "ODIRECT" })
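
The blocks of TRACE_DEFINE_ENUM() added throughout this header exist for userspace decoding: __print_flags()/__print_symbolic() strings are resolved by tools such as trace-cmd from the event's format file, where a bare enum or BIT(enum) expression would otherwise be an unparseable token. TRACE_DEFINE_ENUM() records the numeric value so the symbolic names survive the trip. A compressed illustration (the flag macro name is hypothetical):

    TRACE_DEFINE_ENUM(NFS_INO_STALE);	/* exports the value to the format file */

    #define nfs_show_one_flag(v) \
    	__print_flags(v, "|", \
    			{ BIT(NFS_INO_STALE), "STALE" })
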
 
 DECLARE_EVENT_CLASS(nfs_inode_event,
 		TP_PROTO(
@@ -83,7 +125,7 @@
 		TP_ARGS(inode, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(unsigned char, type)
@@ -96,7 +138,7 @@
 
 		TP_fast_assign(
 			const struct nfs_inode *nfsi = NFS_I(inode);
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = nfsi->fileid;
 			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
@@ -108,10 +150,10 @@
 		),
 
 		TP_printk(
-			"error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
 			"type=%u (%s) version=%llu size=%lld "
-			"cache_validity=%lu (%s) nfs_flags=%ld (%s)",
-			__entry->error,
+			"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
+			-__entry->error, nfs_show_status(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
 			__entry->fhandle,
@@ -158,13 +200,39 @@
 DEFINE_NFS_INODE_EVENT(nfs_access_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit);
 
+TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
+TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
+TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
+TRACE_DEFINE_ENUM(LOOKUP_PARENT);
+TRACE_DEFINE_ENUM(LOOKUP_REVAL);
+TRACE_DEFINE_ENUM(LOOKUP_RCU);
+TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL);
+TRACE_DEFINE_ENUM(LOOKUP_OPEN);
+TRACE_DEFINE_ENUM(LOOKUP_CREATE);
+TRACE_DEFINE_ENUM(LOOKUP_EXCL);
+TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
+TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
+TRACE_DEFINE_ENUM(LOOKUP_ROOT);
+TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
+TRACE_DEFINE_ENUM(LOOKUP_DOWN);
+
 #define show_lookup_flags(flags) \
-	__print_flags((unsigned long)flags, "|", \
-			{ LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
+	__print_flags(flags, "|", \
+			{ LOOKUP_FOLLOW, "FOLLOW" }, \
 			{ LOOKUP_DIRECTORY, "DIRECTORY" }, \
+			{ LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
+			{ LOOKUP_PARENT, "PARENT" }, \
+			{ LOOKUP_REVAL, "REVAL" }, \
+			{ LOOKUP_RCU, "RCU" }, \
+			{ LOOKUP_NO_REVAL, "NO_REVAL" }, \
 			{ LOOKUP_OPEN, "OPEN" }, \
 			{ LOOKUP_CREATE, "CREATE" }, \
-			{ LOOKUP_EXCL, "EXCL" })
+			{ LOOKUP_EXCL, "EXCL" }, \
+			{ LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
+			{ LOOKUP_JUMPED, "JUMPED" }, \
+			{ LOOKUP_ROOT, "ROOT" }, \
+			{ LOOKUP_EMPTY, "EMPTY" }, \
+			{ LOOKUP_DOWN, "DOWN" })
 
 DECLARE_EVENT_CLASS(nfs_lookup_event,
 		TP_PROTO(
@@ -176,7 +244,7 @@
 		TP_ARGS(dir, dentry, flags),
 
 		TP_STRUCT__entry(
-			__field(unsigned int, flags)
+			__field(unsigned long, flags)
 			__field(dev_t, dev)
 			__field(u64, dir)
 			__string(name, dentry->d_name.name)
@@ -190,7 +258,7 @@
 		),
 
 		TP_printk(
-			"flags=%u (%s) name=%02x:%02x:%llu/%s",
+			"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
 			__entry->flags,
 			show_lookup_flags(__entry->flags),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -219,8 +287,8 @@
 		TP_ARGS(dir, dentry, flags, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
-			__field(unsigned int, flags)
+			__field(unsigned long, error)
+			__field(unsigned long, flags)
 			__field(dev_t, dev)
 			__field(u64, dir)
 			__string(name, dentry->d_name.name)
@@ -229,14 +297,14 @@
 		TP_fast_assign(
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__entry->flags = flags;
 			__assign_str(name, dentry->d_name.name);
 		),
 
 		TP_printk(
-			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+			-__entry->error, nfs_show_status(__entry->error),
 			__entry->flags,
 			show_lookup_flags(__entry->flags),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -260,15 +328,43 @@
 DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
 DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
 
+TRACE_DEFINE_ENUM(O_WRONLY);
+TRACE_DEFINE_ENUM(O_RDWR);
+TRACE_DEFINE_ENUM(O_CREAT);
+TRACE_DEFINE_ENUM(O_EXCL);
+TRACE_DEFINE_ENUM(O_NOCTTY);
+TRACE_DEFINE_ENUM(O_TRUNC);
+TRACE_DEFINE_ENUM(O_APPEND);
+TRACE_DEFINE_ENUM(O_NONBLOCK);
+TRACE_DEFINE_ENUM(O_DSYNC);
+TRACE_DEFINE_ENUM(O_DIRECT);
+TRACE_DEFINE_ENUM(O_LARGEFILE);
+TRACE_DEFINE_ENUM(O_DIRECTORY);
+TRACE_DEFINE_ENUM(O_NOFOLLOW);
+TRACE_DEFINE_ENUM(O_NOATIME);
+TRACE_DEFINE_ENUM(O_CLOEXEC);
+
 #define show_open_flags(flags) \
-	__print_flags((unsigned long)flags, "|", \
+	__print_flags(flags, "|", \
+		{ O_WRONLY, "O_WRONLY" }, \
+		{ O_RDWR, "O_RDWR" }, \
 		{ O_CREAT, "O_CREAT" }, \
 		{ O_EXCL, "O_EXCL" }, \
+		{ O_NOCTTY, "O_NOCTTY" }, \
 		{ O_TRUNC, "O_TRUNC" }, \
 		{ O_APPEND, "O_APPEND" }, \
+		{ O_NONBLOCK, "O_NONBLOCK" }, \
 		{ O_DSYNC, "O_DSYNC" }, \
 		{ O_DIRECT, "O_DIRECT" }, \
-		{ O_DIRECTORY, "O_DIRECTORY" })
+		{ O_LARGEFILE, "O_LARGEFILE" }, \
+		{ O_DIRECTORY, "O_DIRECTORY" }, \
+		{ O_NOFOLLOW, "O_NOFOLLOW" }, \
+		{ O_NOATIME, "O_NOATIME" }, \
+		{ O_CLOEXEC, "O_CLOEXEC" })
+
+TRACE_DEFINE_ENUM(FMODE_READ);
+TRACE_DEFINE_ENUM(FMODE_WRITE);
+TRACE_DEFINE_ENUM(FMODE_EXEC);
 
 #define show_fmode_flags(mode) \
 	__print_flags(mode, "|", \
@@ -286,7 +382,7 @@
 		TP_ARGS(dir, ctx, flags),
 
 		TP_STRUCT__entry(
-			__field(unsigned int, flags)
+			__field(unsigned long, flags)
 			__field(unsigned int, fmode)
 			__field(dev_t, dev)
 			__field(u64, dir)
@@ -302,7 +398,7 @@
 		),
 
 		TP_printk(
-			"flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s",
+			"flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
 			__entry->flags,
 			show_open_flags(__entry->flags),
 			show_fmode_flags(__entry->fmode),
@@ -323,8 +419,8 @@
 		TP_ARGS(dir, ctx, flags, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
-			__field(unsigned int, flags)
+			__field(unsigned long, error)
+			__field(unsigned long, flags)
 			__field(unsigned int, fmode)
 			__field(dev_t, dev)
 			__field(u64, dir)
@@ -332,7 +428,7 @@
 		),
 
 		TP_fast_assign(
-			__entry->error = error;
+			__entry->error = -error;
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
@@ -341,9 +437,9 @@
 		),
 
 		TP_printk(
-			"error=%d flags=%u (%s) fmode=%s "
+			"error=%ld (%s) flags=0x%lx (%s) fmode=%s "
 			"name=%02x:%02x:%llu/%s",
-			__entry->error,
+			-__entry->error, nfs_show_status(__entry->error),
 			__entry->flags,
 			show_open_flags(__entry->flags),
 			show_fmode_flags(__entry->fmode),
@@ -363,7 +459,7 @@
 		TP_ARGS(dir, dentry, flags),
 
 		TP_STRUCT__entry(
-			__field(unsigned int, flags)
+			__field(unsigned long, flags)
 			__field(dev_t, dev)
 			__field(u64, dir)
 			__string(name, dentry->d_name.name)
@@ -377,7 +473,7 @@
 		),
 
 		TP_printk(
-			"flags=%u (%s) name=%02x:%02x:%llu/%s",
+			"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
 			__entry->flags,
 			show_open_flags(__entry->flags),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -397,15 +493,15 @@
 		TP_ARGS(dir, dentry, flags, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
-			__field(unsigned int, flags)
+			__field(unsigned long, error)
+			__field(unsigned long, flags)
 			__field(dev_t, dev)
 			__field(u64, dir)
 			__string(name, dentry->d_name.name)
 		),
 
 		TP_fast_assign(
-			__entry->error = error;
+			__entry->error = -error;
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
@@ -413,8 +509,8 @@
 		),
 
 		TP_printk(
-			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+			-__entry->error, nfs_show_status(__entry->error),
 			__entry->flags,
 			show_open_flags(__entry->flags),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -469,7 +565,7 @@
 		TP_ARGS(dir, dentry, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u64, dir)
 			__string(name, dentry->d_name.name)
@@ -478,13 +574,13 @@
 		TP_fast_assign(
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__assign_str(name, dentry->d_name.name);
 		),
 
 		TP_printk(
-			"error=%d name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) name=%02x:%02x:%llu/%s",
+			-__entry->error, nfs_show_status(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->dir,
 			__get_str(name)
@@ -557,7 +653,7 @@
 		TP_ARGS(inode, dir, dentry, error),
 
 		TP_STRUCT__entry(
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(dev_t, dev)
 			__field(u64, fileid)
 			__field(u64, dir)
@@ -568,13 +664,13 @@
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->dir = NFS_FILEID(dir);
-			__entry->error = error;
+			__entry->error = error < 0 ? -error : 0;
 			__assign_str(name, dentry->d_name.name);
 		),
 
 		TP_printk(
-			"error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+			-__entry->error, nfs_show_status(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			__entry->fileid,
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -642,7 +738,7 @@
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(u64, old_dir)
 			__string(old_name, old_dentry->d_name.name)
 			__field(u64, new_dir)
@@ -651,17 +747,17 @@
 
 		TP_fast_assign(
 			__entry->dev = old_dir->i_sb->s_dev;
+			__entry->error = -error;
 			__entry->old_dir = NFS_FILEID(old_dir);
 			__entry->new_dir = NFS_FILEID(new_dir);
-			__entry->error = error;
 			__assign_str(old_name, old_dentry->d_name.name);
 			__assign_str(new_name, new_dentry->d_name.name);
 		),
 
 		TP_printk(
-			"error=%d old_name=%02x:%02x:%llu/%s "
+			"error=%ld (%s) old_name=%02x:%02x:%llu/%s "
 			"new_name=%02x:%02x:%llu/%s",
-			__entry->error,
+			-__entry->error, nfs_show_status(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->old_dir,
 			__get_str(old_name),
@@ -697,7 +793,7 @@
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
-			__field(int, error)
+			__field(unsigned long, error)
 			__field(u64, dir)
 			__dynamic_array(char, name, data->args.name.len + 1)
 		),
@@ -707,15 +803,15 @@
 			size_t len = data->args.name.len;
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
-			__entry->error = error;
+			__entry->error = -error;
 			memcpy(__get_str(name),
 				data->args.name.name, len);
 			__get_str(name)[len] = 0;
 		),
 
 		TP_printk(
-			"error=%d name=%02x:%02x:%llu/%s",
-			__entry->error,
+			"error=%ld (%s) name=%02x:%02x:%llu/%s",
+			-__entry->error, nfs_show_status(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->dir,
 			__get_str(name)
@@ -969,6 +1065,108 @@
 		)
 );
 
+TRACE_DEFINE_ENUM(NFS_OK);
+TRACE_DEFINE_ENUM(NFSERR_PERM);
+TRACE_DEFINE_ENUM(NFSERR_NOENT);
+TRACE_DEFINE_ENUM(NFSERR_IO);
+TRACE_DEFINE_ENUM(NFSERR_NXIO);
+TRACE_DEFINE_ENUM(ECHILD);
+TRACE_DEFINE_ENUM(NFSERR_EAGAIN);
+TRACE_DEFINE_ENUM(NFSERR_ACCES);
+TRACE_DEFINE_ENUM(NFSERR_EXIST);
+TRACE_DEFINE_ENUM(NFSERR_XDEV);
+TRACE_DEFINE_ENUM(NFSERR_NODEV);
+TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFSERR_ISDIR);
+TRACE_DEFINE_ENUM(NFSERR_INVAL);
+TRACE_DEFINE_ENUM(NFSERR_FBIG);
+TRACE_DEFINE_ENUM(NFSERR_NOSPC);
+TRACE_DEFINE_ENUM(NFSERR_ROFS);
+TRACE_DEFINE_ENUM(NFSERR_MLINK);
+TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFSERR_DQUOT);
+TRACE_DEFINE_ENUM(NFSERR_STALE);
+TRACE_DEFINE_ENUM(NFSERR_REMOTE);
+TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
+TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
+TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
+
+#define nfs_show_status(x) \
+	__print_symbolic(x, \
+			{ NFS_OK, "OK" }, \
+			{ NFSERR_PERM, "PERM" }, \
+			{ NFSERR_NOENT, "NOENT" }, \
+			{ NFSERR_IO, "IO" }, \
+			{ NFSERR_NXIO, "NXIO" }, \
+			{ ECHILD, "CHILD" }, \
+			{ NFSERR_EAGAIN, "AGAIN" }, \
+			{ NFSERR_ACCES, "ACCES" }, \
+			{ NFSERR_EXIST, "EXIST" }, \
+			{ NFSERR_XDEV, "XDEV" }, \
+			{ NFSERR_NODEV, "NODEV" }, \
+			{ NFSERR_NOTDIR, "NOTDIR" }, \
+			{ NFSERR_ISDIR, "ISDIR" }, \
+			{ NFSERR_INVAL, "INVAL" }, \
+			{ NFSERR_FBIG, "FBIG" }, \
+			{ NFSERR_NOSPC, "NOSPC" }, \
+			{ NFSERR_ROFS, "ROFS" }, \
+			{ NFSERR_MLINK, "MLINK" }, \
+			{ NFSERR_OPNOTSUPP, "OPNOTSUPP" }, \
+			{ NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
+			{ NFSERR_NOTEMPTY, "NOTEMPTY" }, \
+			{ NFSERR_DQUOT, "DQUOT" }, \
+			{ NFSERR_STALE, "STALE" }, \
+			{ NFSERR_REMOTE, "REMOTE" }, \
+			{ NFSERR_WFLUSH, "WFLUSH" }, \
+			{ NFSERR_BADHANDLE, "BADHANDLE" }, \
+			{ NFSERR_NOT_SYNC, "NOTSYNC" }, \
+			{ NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
+			{ NFSERR_NOTSUPP, "NOTSUPP" }, \
+			{ NFSERR_TOOSMALL, "TOOSMALL" }, \
+			{ NFSERR_SERVERFAULT, "REMOTEIO" }, \
+			{ NFSERR_BADTYPE, "BADTYPE" }, \
+			{ NFSERR_JUKEBOX, "JUKEBOX" })
+
+TRACE_EVENT(nfs_xdr_status,
+		TP_PROTO(
+			const struct xdr_stream *xdr,
+			int error
+		),
+
+		TP_ARGS(xdr, error),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, task_id)
+			__field(unsigned int, client_id)
+			__field(u32, xid)
+			__field(unsigned long, error)
+		),
+
+		TP_fast_assign(
+			const struct rpc_rqst *rqstp = xdr->rqst;
+			const struct rpc_task *task = rqstp->rq_task;
+
+			__entry->task_id = task->tk_pid;
+			__entry->client_id = task->tk_client->cl_clid;
+			__entry->xid = be32_to_cpu(rqstp->rq_xid);
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"task:%u@%d xid=0x%08x error=%ld (%s)",
+			__entry->task_id, __entry->client_id, __entry->xid,
+			-__entry->error, nfs_show_status(__entry->error)
+		)
+);
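
The new nfs_xdr_status event fires when reply decoding fails and ties the failure to
the owning RPC task and transaction id. The XID is kept big-endian on the wire (XDR
encodes all 32-bit quantities that way), hence the be32_to_cpu() before recording it.
A stand-in for the same conversion in portable C, working from the raw wire bytes:

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t be32_to_host(const unsigned char b[4])
	{
		return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
		       ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
	}

	int main(void)
	{
		unsigned char wire[4] = { 0x12, 0x34, 0x56, 0x78 };

		printf("xid=0x%08x\n", be32_to_host(wire)); /* 0x12345678 */
		return 0;
	}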
+
 #endif /* _TRACE_NFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 3dbd15b..20b3717 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/pagelist.c
  *
@@ -16,8 +17,8 @@
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
-#include <linux/nfs_page.h>
 #include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
 #include <linux/nfs_mount.h>
 #include <linux/export.h>
 
@@ -47,7 +48,7 @@
 
 	hdr->req = nfs_list_entry(mirror->pg_list.next);
 	hdr->inode = desc->pg_inode;
-	hdr->cred = hdr->req->wb_context->cred;
+	hdr->cred = nfs_req_openctx(hdr->req)->cred;
 	hdr->io_start = req_offset(hdr->req);
 	hdr->good_bytes = mirror->pg_count;
 	hdr->io_completion = desc->pg_io_completion;
@@ -63,20 +64,20 @@
 
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
-	spin_lock(&hdr->lock);
-	if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
-	    || pos < hdr->io_start + hdr->good_bytes) {
+	unsigned int new = pos - hdr->io_start;
+
+	if (hdr->good_bytes > new) {
+		hdr->good_bytes = new;
 		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
-		hdr->good_bytes = pos - hdr->io_start;
-		hdr->error = error;
+		if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags))
+			hdr->error = error;
 	}
-	spin_unlock(&hdr->lock);
 }
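
The rewritten nfs_set_pgio_error() drops hdr->lock: the test_and_set_bit() ensures
only the first reporter records hdr->error, and good_bytes only ever shrinks toward
the first failing offset. How concurrent updates stay safe without the lock is not
shown in this hunk; what remains is a simple monotonic clamp:

	#include <stdio.h>

	int main(void)
	{
		unsigned long io_start = 4096;		/* first byte of this I/O */
		unsigned long good_bytes = 8192;	/* optimistic initial span */
		unsigned long fail_pos = 6144;		/* error reported at this offset */
		unsigned long new = fail_pos - io_start;

		if (good_bytes > new)
			good_bytes = new;
		printf("good_bytes=%lu\n", good_bytes);	/* 2048 */
		return 0;
	}

The eof truncation in fs/nfs/read.c later in this patch uses the same shape.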
 
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
-	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO);
+	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
 	if (p)
 		INIT_LIST_HEAD(&p->wb_list);
 	return p;
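
nfs_page_alloc() moving from GFP_NOIO to GFP_KERNEL (and, further down, the pagevec
allocation dropping its FMODE_WRITE special case and the mirror array moving off
GFP_NOFS) assumes that any caller which must not recurse into filesystem or I/O
reclaim now declares that with the scoped-allocation API instead of every allocation
site hardcoding a restricted mask. A sketch of the scoped pattern, for illustration
(memalloc_nofs_save()/memalloc_nofs_restore() are the real kernel interfaces; the
surrounding function is hypothetical, not from this patch):

	/* hypothetical writeback-side caller */
	static void *alloc_from_writeback(struct kmem_cache *cachep)
	{
		unsigned int nofs_flags = memalloc_nofs_save();
		void *p = kmem_cache_zalloc(cachep, GFP_KERNEL);

		memalloc_nofs_restore(nofs_flags);
		return p;	/* the allocation could not recurse into the fs */
	}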
@@ -295,25 +296,13 @@
 		nfs_release_request(head);
 }
 
-/**
- * nfs_create_request - Create an NFS read/write request.
- * @ctx: open context to use
- * @page: page to write
- * @last: last nfs request created for this page group or NULL if head
- * @offset: starting offset within the page for the write
- * @count: number of bytes to read/write
- *
- * The page must be locked by the caller. This makes sure we never
- * create two different requests for the same page.
- * User should ensure it is safe to sleep in this function.
- */
-struct nfs_page *
-nfs_create_request(struct nfs_open_context *ctx, struct page *page,
-		   struct nfs_page *last, unsigned int offset,
+static struct nfs_page *
+__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page,
+		   unsigned int pgbase, unsigned int offset,
 		   unsigned int count)
 {
 	struct nfs_page		*req;
-	struct nfs_lock_context *l_ctx;
+	struct nfs_open_context *ctx = l_ctx->open_context;
 
 	if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
 		return ERR_PTR(-EBADF);
@@ -322,13 +311,8 @@
 	if (req == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	/* get lock context early so we can deal with alloc failures */
-	l_ctx = nfs_get_lock_context(ctx);
-	if (IS_ERR(l_ctx)) {
-		nfs_page_free(req);
-		return ERR_CAST(l_ctx);
-	}
 	req->wb_lock_context = l_ctx;
+	refcount_inc(&l_ctx->count);
 	atomic_inc(&l_ctx->io_count);
 
 	/* Initialize the request struct. Initially, we assume a
@@ -340,17 +324,61 @@
 		get_page(page);
 	}
 	req->wb_offset  = offset;
-	req->wb_pgbase	= offset;
+	req->wb_pgbase	= pgbase;
 	req->wb_bytes   = count;
-	req->wb_context = get_nfs_open_context(ctx);
 	kref_init(&req->wb_kref);
-	nfs_page_group_init(req, last);
+	req->wb_nio = 0;
 	return req;
 }
 
 /**
+ * nfs_create_request - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @page: page to write
+ * @offset: starting offset within the page for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * The caller must also ensure it is safe to sleep, as this function can block.
+ */
+struct nfs_page *
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
+		   unsigned int offset, unsigned int count)
+{
+	struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx);
+	struct nfs_page *ret;
+
+	if (IS_ERR(l_ctx))
+		return ERR_CAST(l_ctx);
+	ret = __nfs_create_request(l_ctx, page, offset, offset, count);
+	if (!IS_ERR(ret))
+		nfs_page_group_init(ret, NULL);
+	nfs_put_lock_context(l_ctx);
+	return ret;
+}
+
+static struct nfs_page *
+nfs_create_subreq(struct nfs_page *req, struct nfs_page *last,
+		  unsigned int pgbase, unsigned int offset,
+		  unsigned int count)
+{
+	struct nfs_page *ret;
+
+	ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
+			pgbase, offset, count);
+	if (!IS_ERR(ret)) {
+		nfs_lock_request(ret);
+		ret->wb_index = req->wb_index;
+		nfs_page_group_init(ret, last);
+		ret->wb_nio = req->wb_nio;
+	}
+	return ret;
+}
+
+/**
  * nfs_unlock_request - Unlock request and wake up sleepers.
- * @req:
+ * @req: pointer to request
  */
 void nfs_unlock_request(struct nfs_page *req)
 {
@@ -368,7 +396,7 @@
 
 /**
  * nfs_unlock_and_release_request - Unlock request and release the nfs_page
- * @req:
+ * @req: pointer to request
  */
 void nfs_unlock_and_release_request(struct nfs_page *req)
 {
@@ -386,8 +414,8 @@
 static void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
-	struct nfs_open_context *ctx = req->wb_context;
 	struct nfs_lock_context *l_ctx = req->wb_lock_context;
+	struct nfs_open_context *ctx;
 
 	if (page != NULL) {
 		put_page(page);
@@ -396,16 +424,13 @@
 	if (l_ctx != NULL) {
 		if (atomic_dec_and_test(&l_ctx->io_count)) {
 			wake_up_var(&l_ctx->io_count);
+			ctx = l_ctx->open_context;
 			if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
 				rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
 		}
 		nfs_put_lock_context(l_ctx);
 		req->wb_lock_context = NULL;
 	}
-	if (ctx != NULL) {
-		put_nfs_open_context(ctx);
-		req->wb_context = NULL;
-	}
 }
 
 /**
@@ -461,7 +486,7 @@
  * @prev: previous request in desc, or NULL
  * @req: this request
  *
- * Returns zero if @req can be coalesced into @desc, otherwise it returns
+ * Returns zero if @req cannot be coalesced into @desc, otherwise it returns
  * the size of the request.
  */
 size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -494,7 +519,6 @@
 
 	if (hdr) {
 		INIT_LIST_HEAD(&hdr->pages);
-		spin_lock_init(&hdr->lock);
 		hdr->rw_ops = ops;
 	}
 	return hdr;
@@ -532,7 +556,6 @@
  * nfs_pgio_rpcsetup - Set up arguments for a pageio call
  * @hdr: The pageio hdr
  * @count: Number of bytes to read
- * @offset: Initial offset
  * @how: How to commit data (writes only)
  * @cinfo: Commit information for the call (writes only)
  */
@@ -552,7 +575,7 @@
 	hdr->args.pgbase = req->wb_pgbase;
 	hdr->args.pages  = hdr->page_array.pagevec;
 	hdr->args.count  = count;
-	hdr->args.context = get_nfs_open_context(req->wb_context);
+	hdr->args.context = get_nfs_open_context(nfs_req_openctx(req));
 	hdr->args.lock_context = req->wb_lock_context;
 	hdr->args.stable  = NFS_UNSTABLE;
 	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
@@ -567,7 +590,7 @@
 	}
 
 	hdr->res.fattr   = &hdr->fattr;
-	hdr->res.count   = count;
+	hdr->res.count   = 0;
 	hdr->res.eof     = 0;
 	hdr->res.verf    = &hdr->verf;
 	nfs_fattr_init(&hdr->fattr);
@@ -588,7 +611,7 @@
 }
 
 int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
-		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
 		      const struct rpc_call_ops *call_ops, int how, int flags)
 {
 	struct rpc_task *task;
@@ -635,7 +658,6 @@
 
 /**
  * nfs_pgio_error - Clean up from a pageio error
- * @desc: IO descriptor
  * @hdr: pageio header
  */
 static void nfs_pgio_error(struct nfs_pgio_header *hdr)
@@ -701,6 +723,7 @@
 	desc->pg_mirrors_dynamic = NULL;
 	desc->pg_mirrors = desc->pg_mirrors_static;
 	nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
+	desc->pg_maxretrans = 0;
 }
 
 /**
@@ -752,8 +775,6 @@
 	if (pagecount <= ARRAY_SIZE(pg_array->page_array))
 		pg_array->pagevec = pg_array->page_array;
 	else {
-		if (hdr->rw_mode == FMODE_WRITE)
-			gfp_flags = GFP_NOIO;
 		pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags);
 		if (!pg_array->pagevec) {
 			pg_array->npages = 0;
@@ -769,8 +790,7 @@
 	pageused = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &hdr->pages);
+		nfs_list_move_request(req, &hdr->pages);
 
 		if (!last_page || last_page != req->wb_page) {
 			pageused++;
@@ -829,7 +849,7 @@
 	desc->pg_mirrors_dynamic = NULL;
 	if (mirror_count == 1)
 		return desc->pg_mirrors_static;
-	ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS);
+	ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
 	if (ret != NULL) {
 		for (i = 0; i < mirror_count; i++)
 			nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
@@ -894,6 +914,7 @@
  * nfs_can_coalesce_requests - test two requests for compatibility
  * @prev: pointer to nfs_page
  * @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pageio_descriptor
  *
  * The nfs_page structures 'prev' and 'req' are compared to ensure that the
  * page data area they describe is contiguous, and that their RPC
@@ -909,9 +930,9 @@
 	struct file_lock_context *flctx;
 
 	if (prev) {
-		if (!nfs_match_open_context(req->wb_context, prev->wb_context))
+		if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
 			return false;
-		flctx = d_inode(req->wb_context->dentry)->i_flctx;
+		flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
 		if (flctx != NULL &&
 		    !(list_empty_careful(&flctx->flc_posix) &&
 		      list_empty_careful(&flctx->flc_flock)) &&
@@ -960,10 +981,18 @@
 			return 0;
 		mirror->pg_base = req->wb_pgbase;
 	}
+
+	if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) {
+		if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR)
+			desc->pg_error = -ETIMEDOUT;
+		else
+			desc->pg_error = -EIO;
+		return 0;
+	}
+
 	if (!nfs_can_coalesce_requests(prev, req, desc))
 		return 0;
-	nfs_list_remove_request(req);
-	nfs_list_add_request(req, &mirror->pg_list);
+	nfs_list_move_request(req, &mirror->pg_list);
 	mirror->pg_count += req->wb_bytes;
 	return 1;
 }
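
This hunk is the consumer of the new req->wb_nio counter and the desc->pg_maxretrans
cap initialized earlier in the patch (wb_nio is bumped on resend in code outside this
hunk): once a request has been retried past the cap, the descriptor fails it with
-ETIMEDOUT under the new softerr semantics, or -EIO otherwise, instead of retrying
forever. Restated as standalone C (struct layouts invented for the demo; only the
field names mirror the diff):

	#include <errno.h>
	#include <stdio.h>

	struct demo_req  { unsigned int wb_nio; };
	struct demo_desc { unsigned int pg_maxretrans; int pg_error; int softerr; };

	static int check_retrans(struct demo_desc *d, const struct demo_req *r)
	{
		if (d->pg_maxretrans && r->wb_nio > d->pg_maxretrans) {
			d->pg_error = d->softerr ? -ETIMEDOUT : -EIO;
			return 0;	/* refuse to coalesce this request */
		}
		return 1;
	}

	int main(void)
	{
		struct demo_desc d = { .pg_maxretrans = 2, .softerr = 1 };
		struct demo_req r = { .wb_nio = 3 };

		printf("accepted=%d pg_error=%d\n", check_retrans(&d, &r), d.pg_error);
		return 0;
	}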
@@ -989,6 +1018,16 @@
 	}
 }
 
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+		struct nfs_page *req)
+{
+	LIST_HEAD(head);
+
+	nfs_list_move_request(req, &head);
+	desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
+}
+
 /**
  * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
  * @desc: destination io descriptor
@@ -1026,10 +1065,8 @@
 			nfs_page_group_unlock(req);
 			desc->pg_moreio = 1;
 			nfs_pageio_doio(desc);
-			if (desc->pg_error < 0)
-				return 0;
-			if (mirror->pg_recoalesce)
-				return 0;
+			if (desc->pg_error < 0 || mirror->pg_recoalesce)
+				goto out_cleanup_subreq;
 			/* retry add_request for this subreq */
 			nfs_page_group_lock(req);
 			continue;
@@ -1045,14 +1082,10 @@
 		pgbase += subreq->wb_bytes;
 
 		if (bytes_left) {
-			subreq = nfs_create_request(req->wb_context,
-					req->wb_page,
-					subreq, pgbase, bytes_left);
+			subreq = nfs_create_subreq(req, subreq, pgbase,
+					offset, bytes_left);
 			if (IS_ERR(subreq))
 				goto err_ptr;
-			nfs_lock_request(subreq);
-			subreq->wb_offset  = offset;
-			subreq->wb_index = req->wb_index;
 		}
 	} while (bytes_left > 0);
 
@@ -1062,6 +1095,10 @@
 	desc->pg_error = PTR_ERR(subreq);
 	nfs_page_group_unlock(req);
 	return 0;
+out_cleanup_subreq:
+	if (req != subreq)
+		nfs_pageio_cleanup_request(desc, subreq);
+	return 0;
 }
 
 static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1080,7 +1117,6 @@
 			struct nfs_page *req;
 
 			req = list_first_entry(&head, struct nfs_page, wb_list);
-			nfs_list_remove_request(req);
 			if (__nfs_pageio_add_request(desc, req))
 				continue;
 			if (desc->pg_error < 0) {
@@ -1121,7 +1157,8 @@
 
 	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
 		mirror = &desc->pg_mirrors[midx];
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+				desc->pg_error);
 	}
 }
 
@@ -1150,35 +1187,29 @@
 			     lastreq = lastreq->wb_this_page)
 				;
 
-			dupreq = nfs_create_request(req->wb_context,
-					req->wb_page, lastreq, pgbase, bytes);
+			dupreq = nfs_create_subreq(req, lastreq,
+					pgbase, offset, bytes);
 
+			nfs_page_group_unlock(req);
 			if (IS_ERR(dupreq)) {
-				nfs_page_group_unlock(req);
 				desc->pg_error = PTR_ERR(dupreq);
 				goto out_failed;
 			}
-
-			nfs_lock_request(dupreq);
-			nfs_page_group_unlock(req);
-			dupreq->wb_offset = offset;
-			dupreq->wb_index = req->wb_index;
 		} else
 			dupreq = req;
 
 		if (nfs_pgio_has_mirroring(desc))
 			desc->pg_mirror_idx = midx;
 		if (!nfs_pageio_add_request_mirror(desc, dupreq))
-			goto out_failed;
+			goto out_cleanup_subreq;
 	}
 
 	return 1;
 
+out_cleanup_subreq:
+	if (req != dupreq)
+		nfs_pageio_cleanup_request(desc, dupreq);
 out_failed:
-	/* remember fatal errors */
-	if (nfs_error_is_fatal(desc->pg_error))
-		nfs_context_set_write_error(req->wb_context,
-						desc->pg_error);
 	nfs_pageio_error_cleanup(desc);
 	return 0;
 }
@@ -1199,7 +1230,7 @@
 		desc->pg_mirror_idx = mirror_idx;
 	for (;;) {
 		nfs_pageio_doio(desc);
-		if (!mirror->pg_recoalesce)
+		if (desc->pg_error < 0 || !mirror->pg_recoalesce)
 			break;
 		if (!nfs_do_recoalesce(desc))
 			break;
@@ -1220,21 +1251,23 @@
 int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 		      struct nfs_pgio_header *hdr)
 {
-	LIST_HEAD(failed);
+	LIST_HEAD(pages);
 
 	desc->pg_io_completion = hdr->io_completion;
 	desc->pg_dreq = hdr->dreq;
-	while (!list_empty(&hdr->pages)) {
-		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+	list_splice_init(&hdr->pages, &pages);
+	while (!list_empty(&pages)) {
+		struct nfs_page *req = nfs_list_entry(pages.next);
 
-		nfs_list_remove_request(req);
 		if (!nfs_pageio_add_request(desc, req))
-			nfs_list_add_request(req, &failed);
+			break;
 	}
 	nfs_pageio_complete(desc);
-	if (!list_empty(&failed)) {
-		list_move(&failed, &hdr->pages);
-		return desc->pg_error < 0 ? desc->pg_error : -EIO;
+	if (!list_empty(&pages)) {
+		int err = desc->pg_error < 0 ? desc->pg_error : -EIO;
+		hdr->completion_ops->error_cleanup(&pages, err);
+		nfs_set_pgio_error(hdr, err, hdr->io_start);
+		return err;
 	}
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7d9a51e..bb80034 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -275,7 +275,7 @@
 		list_del_init(&lo->plh_layouts);
 		spin_unlock(&clp->cl_lock);
 	}
-	put_rpccred(lo->plh_lc_cred);
+	put_cred(lo->plh_lc_cred);
 	return ld->free_layout_hdr(lo);
 }
 
@@ -359,9 +359,10 @@
 }
 
 /*
- * Update the seqid of a layout stateid
+ * Update the seqid of a layout stateid after receiving
+ * NFS4ERR_OLD_STATEID
  */
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode)
 {
@@ -377,7 +378,15 @@
 
 	spin_lock(&inode->i_lock);
 	lo = NFS_I(inode)->layout;
-	if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+	if (lo &&  pnfs_layout_is_valid(lo) &&
+	    nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+		/* Is our call using the most recent seqid? If so, bump it */
+		if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
+			nfs4_stateid_seqid_inc(dst);
+			ret = true;
+			goto out;
+		}
+		/* Try to update the seqid to the most recent */
 		err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
 		if (err != -EBUSY) {
 			dst->seqid = lo->plh_stateid.seqid;
@@ -385,6 +394,7 @@
 			ret = true;
 		}
 	}
+out:
 	spin_unlock(&inode->i_lock);
 	pnfs_free_lseg_list(&head);
 	return ret;
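
On NFS4ERR_OLD_STATEID the client now distinguishes two cases: if the stateid it sent
is already as recent as the cached one, it just bumps the seqid it sent and retries;
only otherwise does it resync the seqid from the cached layout. A userspace analogue
of the seqid bump (the skip over zero on wraparound is an assumption drawn from the
NFSv4.x rule that a seqid of zero is special; it is not shown in this hunk):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t seqid_inc(uint32_t seqid)
	{
		if (++seqid == 0)	/* zero is reserved, skip it */
			++seqid;
		return seqid;
	}

	int main(void)
	{
		printf("%u %u\n", seqid_inc(5), seqid_inc(UINT32_MAX)); /* 6 1 */
		return 0;
	}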
@@ -758,22 +768,35 @@
 pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
 		struct nfs_server *server,
 		struct list_head *layout_list)
+	__must_hold(&clp->cl_lock)
+	__must_hold(RCU)
 {
 	struct pnfs_layout_hdr *lo, *next;
 	struct inode *inode;
 
 	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
-		if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+		if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+		    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+		    !list_empty(&lo->plh_bulk_destroy))
 			continue;
+		/* If the sb is being destroyed, just bail */
+		if (!nfs_sb_active(server->super))
+			break;
 		inode = igrab(lo->plh_inode);
-		if (inode == NULL)
-			continue;
-		list_del_init(&lo->plh_layouts);
-		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
-			continue;
-		rcu_read_unlock();
-		spin_unlock(&clp->cl_lock);
-		iput(inode);
+		if (inode != NULL) {
+			list_del_init(&lo->plh_layouts);
+			if (pnfs_layout_add_bulk_destroy_list(inode,
+						layout_list))
+				continue;
+			rcu_read_unlock();
+			spin_unlock(&clp->cl_lock);
+			iput(inode);
+		} else {
+			rcu_read_unlock();
+			spin_unlock(&clp->cl_lock);
+			set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+		}
+		nfs_sb_deactive(server->super);
 		spin_lock(&clp->cl_lock);
 		rcu_read_lock();
 		return -EAGAIN;
@@ -811,7 +834,7 @@
 		/* Free all lsegs that are attached to commit buckets */
 		nfs_commit_inode(inode, 0);
 		pnfs_put_layout_hdr(lo);
-		iput(inode);
+		nfs_iput_and_deactive(inode);
 	}
 	return ret;
 }
@@ -965,7 +988,7 @@
 	struct page **pages;
 	int i;
 
-	pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+	pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
 	if (!pages) {
 		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
 		return NULL;
@@ -975,7 +998,7 @@
 		pages[i] = alloc_page(gfp_flags);
 		if (!pages[i]) {
 			dprintk("%s: failed to allocate page\n", __func__);
-			nfs4_free_pages(pages, size);
+			nfs4_free_pages(pages, i);
 			return NULL;
 		}
 	}
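
Two related fixes in nfs4_alloc_pages(): kcalloc() becomes kmalloc_array() because
every slot is assigned immediately, and the error path now frees only the i pages
actually allocated. With the zeroing allocator the old nfs4_free_pages(pages, size)
happened to be safe since the tail entries were NULL; without zeroing it would have
handed uninitialized pointers to the free routine. The general shape of the pattern,
runnable in plain C:

	#include <stdio.h>
	#include <stdlib.h>

	static char **alloc_bufs(size_t n, size_t bufsz)
	{
		char **v = malloc(n * sizeof(*v));	/* no zeroing, like kmalloc_array() */
		size_t i;

		if (!v)
			return NULL;
		for (i = 0; i < n; i++) {
			v[i] = malloc(bufsz);
			if (!v[i]) {
				while (i--)
					free(v[i]);	/* only what was allocated */
				free(v);
				return NULL;
			}
		}
		return v;
	}

	int main(void)
	{
		char **v = alloc_bufs(4, 4096);
		size_t i;

		printf("%s\n", v ? "ok" : "failed");
		for (i = 0; v && i < 4; i++)
			free(v[i]);
		free(v);
		return 0;
	}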
@@ -991,6 +1014,7 @@
 	   gfp_t gfp_flags)
 {
 	struct nfs_server *server = pnfs_find_server(ino, ctx);
+	size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
 	size_t max_pages = max_response_pages(server);
 	struct nfs4_layoutget *lgp;
 
@@ -1000,6 +1024,12 @@
 	if (lgp == NULL)
 		return NULL;
 
+	if (max_reply_sz) {
+		size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages < max_pages)
+			max_pages = npages;
+	}
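
When the layout driver advertises a maximum LAYOUTGET reply size, it is converted to a
page count with a round-up shift, i.e. the usual DIV_ROUND_UP(max_reply_sz, PAGE_SIZE),
and used to clamp max_pages. Numerically:

	#include <stdio.h>

	#define DEMO_PAGE_SHIFT 12			/* assume 4 KiB pages */
	#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

	int main(void)
	{
		unsigned long max_reply_sz = 10000;
		unsigned long npages =
			(max_reply_sz + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT;

		printf("npages=%lu\n", npages);	/* 3 */
		return 0;
	}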
+
 	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
 	if (!lgp->args.layout.pages) {
 		kfree(lgp);
@@ -1031,7 +1061,7 @@
 	lgp->args.ctx = get_nfs_open_context(ctx);
 	nfs4_stateid_copy(&lgp->args.stateid, stateid);
 	lgp->gfp_flags = gfp_flags;
-	lgp->cred = get_rpccred(ctx->cred);
+	lgp->cred = get_cred(ctx->cred);
 	return lgp;
 }
 
@@ -1042,7 +1072,7 @@
 	nfs4_free_pages(lgp->args.layout.pages, max_pages);
 	if (lgp->args.inode)
 		pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
-	put_rpccred(lgp->cred);
+	put_cred(lgp->cred);
 	put_nfs_open_context(lgp->args.ctx);
 	kfree(lgp);
 }
@@ -1317,7 +1347,7 @@
 bool pnfs_roc(struct inode *ino,
 		struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
-		const struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_open_context *ctx;
@@ -1332,6 +1362,7 @@
 	if (!nfs_have_layout(ino))
 		return false;
 retry:
+	rcu_read_lock();
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
 	if (!lo || !pnfs_layout_is_valid(lo) ||
@@ -1342,6 +1373,7 @@
 	pnfs_get_layout_hdr(lo);
 	if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
 		spin_unlock(&ino->i_lock);
+		rcu_read_unlock();
 		wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
 				TASK_UNINTERRUPTIBLE);
 		pnfs_put_layout_hdr(lo);
@@ -1355,7 +1387,7 @@
 		skip_read = true;
 	}
 
-	list_for_each_entry(ctx, &nfsi->open_files, list) {
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
 		state = ctx->state;
 		if (state == NULL)
 			continue;
@@ -1403,6 +1435,7 @@
 
 out_noroc:
 	spin_unlock(&ino->i_lock);
+	rcu_read_unlock();
 	pnfs_layoutcommit_inode(ino, true);
 	if (roc) {
 		struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
@@ -1417,6 +1450,52 @@
 	return false;
 }
 
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret)
+{
+	struct nfs4_layoutreturn_args *arg = *argpp;
+	int retval = -EAGAIN;
+
+	if (!arg)
+		return 0;
+	/* Handle Layoutreturn errors */
+	switch (*ret) {
+	case 0:
+		retval = 0;
+		break;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		/* Was there an RPC level error? If not, retry */
+		if (task->tk_rpc_status == 0)
+			break;
+		/* If the call was not sent, let caller handle it */
+		if (!RPC_WAS_SENT(task))
+			return 0;
+		/*
+		 * Otherwise, assume the call succeeded and
+		 * that we need to release the layout
+		 */
+		*ret = 0;
+		(*respp)->lrs_present = 0;
+		retval = 0;
+		break;
+	case -NFS4ERR_DELAY:
+		/* Let the caller handle the retry */
+		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
+		return 0;
+	case -NFS4ERR_OLD_STATEID:
+		if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
+					&arg->range, inode))
+			break;
+		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
+		return -EAGAIN;
+	}
+	*argpp = NULL;
+	*respp = NULL;
+	return retval;
+}
+
 void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
 		int ret)
@@ -1426,10 +1505,15 @@
 	const nfs4_stateid *res_stateid = NULL;
 	struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
 
-	if (ret == 0) {
-		arg_stateid = &args->stateid;
+	switch (ret) {
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		break;
+	case 0:
 		if (res->lrs_present)
 			res_stateid = &res->stateid;
+		/* Fallthrough */
+	default:
+		arg_stateid = &args->stateid;
 	}
 	pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
 			res_stateid);
@@ -1573,7 +1657,7 @@
 	INIT_LIST_HEAD(&lo->plh_return_segs);
 	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
 	lo->plh_inode = ino;
-	lo->plh_lc_cred = get_rpccred(ctx->cred);
+	lo->plh_lc_cred = get_cred(ctx->cred);
 	lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
 	return lo;
 }
@@ -1866,8 +1950,8 @@
 	    atomic_read(&lo->plh_outstanding) != 0) {
 		spin_unlock(&ino->i_lock);
 		lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
-					atomic_read(&lo->plh_outstanding)));
-		if (IS_ERR(lseg) || !list_empty(&lo->plh_segs))
+					!atomic_read(&lo->plh_outstanding)));
+		if (IS_ERR(lseg))
 			goto out_put_layout_hdr;
 		pnfs_put_layout_hdr(lo);
 		goto lookup_again;
@@ -1880,18 +1964,13 @@
 		goto out_unlock;
 	}
 
-	if (!nfs4_valid_open_stateid(ctx->state)) {
-		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
-				PNFS_UPDATE_LAYOUT_INVALID_OPEN);
-		goto out_unlock;
-	}
-
 	/*
 	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
 	 * stateid, or it has been invalidated, then we must use the open
 	 * stateid.
 	 */
 	if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+		int status;
 
 		/*
 		 * The first layoutget for the file. Need to serialize per
@@ -1911,13 +1990,21 @@
 		}
 
 		first = true;
-		if (nfs4_select_rw_stateid(ctx->state,
+		status = nfs4_select_rw_stateid(ctx->state,
 					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
-					NULL, &stateid, NULL) != 0) {
+					NULL, &stateid, NULL);
+		if (status != 0) {
+			lseg = ERR_PTR(status);
 			trace_pnfs_update_layout(ino, pos, count,
 					iomode, lo, lseg,
 					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
-			goto out_unlock;
+			if (status != -EAGAIN)
+				goto out_unlock;
+			spin_unlock(&ino->i_lock);
+			nfs4_schedule_stateid_recovery(server, ctx->state);
+			pnfs_clear_first_layoutget(lo);
+			pnfs_put_layout_hdr(lo);
+			goto lookup_again;
 		}
 	} else {
 		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
@@ -2006,6 +2093,8 @@
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
+	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+				 PNFS_UPDATE_LAYOUT_EXIT);
 	pnfs_put_layout_hdr(lo);
 out:
 	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -2413,7 +2502,7 @@
 			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   req->wb_context,
+						   nfs_req_openctx(req),
 						   req_offset(req),
 						   rd_size,
 						   IOMODE_READ,
@@ -2440,12 +2529,12 @@
 	pnfs_generic_pg_check_range(pgio, req);
 	if (pgio->pg_lseg == NULL) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   req->wb_context,
+						   nfs_req_openctx(req),
 						   req_offset(req),
 						   wb_size,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_KERNEL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -2918,7 +3007,7 @@
 	spin_unlock(&inode->i_lock);
 
 	data->args.inode = inode;
-	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
+	data->cred = get_cred(nfsi->layout->plh_lc_cred);
 	nfs_fattr_init(&data->fattr);
 	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
 	data->res.fattr = &data->fattr;
@@ -2931,7 +3020,7 @@
 	if (ld->prepare_layoutcommit) {
 		status = ld->prepare_layoutcommit(&data->args);
 		if (status) {
-			put_rpccred(data->cred);
+			put_cred(data->cred);
 			spin_lock(&inode->i_lock);
 			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
 			if (end_pos > nfsi->layout->plh_lwb)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ece367e..f8a3806 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -104,6 +104,7 @@
 	NFS_LAYOUT_RETURN_REQUESTED,	/* Return this layout ASAP */
 	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 	NFS_LAYOUT_FIRST_LAYOUTGET,	/* Serialize first layoutget */
+	NFS_LAYOUT_INODE_FREEING,	/* The inode is being freed */
 };
 
 enum layoutdriver_policy_flags {
@@ -125,6 +126,7 @@
 	struct module *owner;
 	unsigned flags;
 	unsigned max_deviceinfo_size;
+	unsigned max_layoutget_response;
 
 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
 	int (*clear_layoutdriver) (struct nfs_server *);
@@ -199,7 +201,7 @@
 	u32			plh_return_seq;
 	enum pnfs_iomode	plh_return_iomode;
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
-	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
+	const struct cred	*plh_lc_cred; /* layoutcommit cred */
 	struct inode		*plh_inode;
 };
 
@@ -229,7 +231,7 @@
 extern size_t max_response_pages(struct nfs_server *server);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev,
-				   struct rpc_cred *cred);
+				   const struct cred *cred);
 extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
 
@@ -259,7 +261,7 @@
 		bool is_recall);
 int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
 		bool is_recall);
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode);
 void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -279,7 +281,11 @@
 bool pnfs_roc(struct inode *ino,
 		struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
-		const struct rpc_cred *cred);
+		const struct cred *cred);
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret);
 void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
 		int ret);
@@ -342,12 +348,13 @@
 
 struct nfs4_deviceid_node *
 nfs4_find_get_deviceid(struct nfs_server *server,
-		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		const struct nfs4_deviceid *id, const struct cred *cred,
 		gfp_t gfp_mask);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
 			     const struct nfs4_deviceid *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
 void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
 bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
@@ -456,7 +463,7 @@
 pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
 {
-	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (lseg == NULL || ld->mark_request_commit == NULL)
@@ -468,7 +475,7 @@
 static inline bool
 pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (ld == NULL || ld->clear_request_commit == NULL)
@@ -693,11 +700,20 @@
 pnfs_roc(struct inode *ino,
 		struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
-		const struct rpc_cred *cred)
+		const struct cred *cred)
 {
 	return false;
 }
 
+static inline int
+pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret)
+{
+	return 0;
+}
+
 static inline void
 pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
@@ -782,7 +798,7 @@
 {
 }
 
-static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode)
 {
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index e8a07b3..537b80d 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -94,7 +94,7 @@
 static struct nfs4_deviceid_node *
 nfs4_get_device_info(struct nfs_server *server,
 		const struct nfs4_deviceid *dev_id,
-		struct rpc_cred *cred, gfp_t gfp_flags)
+		const struct cred *cred, gfp_t gfp_flags)
 {
 	struct nfs4_deviceid_node *d = NULL;
 	struct pnfs_device *pdev = NULL;
@@ -184,7 +184,7 @@
 
 struct nfs4_deviceid_node *
 nfs4_find_get_deviceid(struct nfs_server *server,
-		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		const struct nfs4_deviceid *id, const struct cred *cred,
 		gfp_t gfp_mask)
 {
 	long hash = nfs4_deviceid_hash(id);
@@ -284,10 +284,22 @@
 EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
 
 void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+		smp_mb__after_atomic();
+	}
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+void
 nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
 {
 	node->timestamp_unavailable = jiffies;
+	smp_mb__before_atomic();
 	set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	smp_mb__after_atomic();
 }
 EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
 
@@ -302,6 +314,7 @@
 		if (time_in_range(node->timestamp_unavailable, start, end))
 			return true;
 		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+		smp_mb__after_atomic();
 	}
 	return false;
 }
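
set_bit() and clear_bit() are not ordered on their own, so the added
smp_mb__before_atomic()/smp_mb__after_atomic() calls supply the ordering: the intent
appears to be that any CPU observing NFS_DEVICEID_UNAVAILABLE also observes the
timestamp_unavailable store made just before it. A runnable C11 analogue of the same
publish/observe pairing, using release/acquire in place of the kernel's barrier
flavour:

	#include <stdatomic.h>
	#include <stdio.h>

	static unsigned long timestamp_unavailable;
	static atomic_bool unavailable;

	static void mark_unavailable(unsigned long now)
	{
		timestamp_unavailable = now;	/* published first */
		atomic_store_explicit(&unavailable, 1, memory_order_release);
	}

	static int test_unavailable(unsigned long *ts)
	{
		if (!atomic_load_explicit(&unavailable, memory_order_acquire))
			return 0;
		*ts = timestamp_unavailable;	/* ordered after the flag load */
		return 1;
	}

	int main(void)
	{
		unsigned long ts;

		mark_unavailable(12345);
		if (test_unavailable(&ts))
			printf("unavailable since %lu\n", ts);
		return 0;
	}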
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index d5e4d3c..82af480 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Common NFS I/O  operations for the pnfs file based
  * layout drivers.
@@ -626,11 +627,16 @@
 			/* Add this address as an alias */
 			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
 					rpc_clnt_test_and_add_xprt, NULL);
-		} else
-			clp = get_v3_ds_connect(mds_srv,
-					(struct sockaddr *)&da->da_addr,
-					da->da_addrlen, IPPROTO_TCP,
-					timeo, retrans);
+			continue;
+		}
+		clp = get_v3_ds_connect(mds_srv,
+				(struct sockaddr *)&da->da_addr,
+				da->da_addrlen, IPPROTO_TCP,
+				timeo, retrans);
+		if (IS_ERR(clp))
+			continue;
+		clp->cl_rpcclient->cl_softerr = 0;
+		clp->cl_rpcclient->cl_softrtry = 0;
 	}
 
 	if (IS_ERR(clp)) {
@@ -686,7 +692,7 @@
 					  rpc_clnt_setup_test_and_add_xprt,
 					  &rpcdata);
 			if (xprtdata.cred)
-				put_rpccred(xprtdata.cred);
+				put_cred(xprtdata.cred);
 		} else {
 			clp = nfs4_set_ds_client(mds_srv,
 						(struct sockaddr *)&da->da_addr,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e0c257b..0f7288b 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -490,7 +490,7 @@
  * from nfs_readdir by calling the decode_entry function directly.
  */
 static int
-nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+nfs_proc_readdir(struct dentry *dentry, const struct cred *cred,
 		 u64 cookie, struct page **pages, unsigned int count, bool plus)
 {
 	struct inode		*dir = d_inode(dentry);
@@ -594,7 +594,8 @@
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
 		 * as it is guaranteed to always return the file attributes
 		 */
-		if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+		if ((hdr->res.count == 0 && hdr->args.count > 0) ||
+		    hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
 			hdr->res.eof = 1;
 	}
 	return 0;
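
NFSv2 READ replies carry no eof flag, so the client emulates one from the returned
attributes. The new first clause also treats a zero-byte result for a non-empty
request as eof, which stops an endless re-read loop when the file shrank on the server
after its size was cached. As plain logic:

	#include <stdio.h>

	static int emulate_eof(unsigned long res_count, unsigned long args_count,
			       unsigned long long offset, unsigned long long fsize)
	{
		return (res_count == 0 && args_count > 0) ||
		       offset + res_count >= fsize;
	}

	int main(void)
	{
		printf("%d\n", emulate_eof(0, 4096, 8192, 1ULL << 20));	/* 1: eof */
		printf("%d\n", emulate_eof(4096, 4096, 0, 1ULL << 20));	/* 0 */
		return 0;
	}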
@@ -615,8 +616,10 @@
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	if (task->tk_status >= 0)
+	if (task->tk_status >= 0) {
+		hdr->res.count = hdr->args.count;
 		nfs_writeback_update_inode(hdr);
+	}
 	return 0;
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 48d7277..cfe0b58 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/read.c
  *
@@ -90,19 +91,25 @@
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
-static void nfs_readpage_release(struct nfs_page *req)
+static void nfs_readpage_release(struct nfs_page *req, int error)
 {
-	struct inode *inode = d_inode(req->wb_context->dentry);
+	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
+	struct page *page = req->wb_page;
 
 	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
 		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
 		(long long)req_offset(req));
 
+	if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
+		SetPageError(page);
 	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-		if (PageUptodate(req->wb_page))
-			nfs_readpage_to_fscache(inode, req->wb_page, 0);
+		struct address_space *mapping = page_file_mapping(page);
 
-		unlock_page(req->wb_page);
+		if (PageUptodate(page))
+			nfs_readpage_to_fscache(inode, page, 0);
+		else if (!PageError(page) && !PagePrivate(page))
+			generic_error_remove_page(mapping, page);
+		unlock_page(page);
 	}
 	nfs_release_request(req);
 }
@@ -118,7 +125,7 @@
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
-	new = nfs_create_request(ctx, page, NULL, 0, len);
+	new = nfs_create_request(ctx, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
 		return PTR_ERR(new);
@@ -130,7 +137,7 @@
 			     &nfs_async_read_completion_ops);
 	if (!nfs_pageio_add_request(&pgio, new)) {
 		nfs_list_remove_request(new);
-		nfs_readpage_release(new);
+		nfs_readpage_release(new, pgio.pg_error);
 	}
 	nfs_pageio_complete(&pgio);
 
@@ -152,6 +159,7 @@
 static void nfs_read_completion(struct nfs_pgio_header *hdr)
 {
 	unsigned long bytes = 0;
+	int error;
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 		goto out;
@@ -178,14 +186,19 @@
 				zero_user_segment(page, start, end);
 			}
 		}
+		error = 0;
 		bytes += req->wb_bytes;
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
 			if (bytes <= hdr->good_bytes)
 				nfs_page_group_set_uptodate(req);
+			else {
+				error = hdr->error;
+				xchg(&nfs_req_openctx(req)->error, error);
+			}
 		} else
 			nfs_page_group_set_uptodate(req);
 		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
+		nfs_readpage_release(req, error);
 	}
 out:
 	hdr->release(hdr);
@@ -205,14 +218,14 @@
 }
 
 static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
 {
 	struct nfs_page	*req;
 
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
+		nfs_readpage_release(req, error);
 	}
 }
 
@@ -276,16 +289,14 @@
 				struct nfs_pgio_header *hdr)
 {
 	if (hdr->res.eof) {
-		loff_t bound;
+		loff_t pos = hdr->args.offset + hdr->res.count;
+		unsigned int new = pos - hdr->io_start;
 
-		bound = hdr->args.offset + hdr->res.count;
-		spin_lock(&hdr->lock);
-		if (bound < hdr->io_start + hdr->good_bytes) {
+		if (hdr->good_bytes > new) {
+			hdr->good_bytes = new;
 			set_bit(NFS_IOHDR_EOF, &hdr->flags);
 			clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
-			hdr->good_bytes = bound - hdr->io_start;
 		}
-		spin_unlock(&hdr->lock);
 	} else if (hdr->res.count < hdr->args.count)
 		nfs_readpage_retry(task, hdr);
 }
@@ -338,8 +349,13 @@
 			goto out;
 	}
 
+	xchg(&ctx->error, 0);
 	error = nfs_readpage_async(ctx, inode, page);
-
+	if (!error) {
+		error = wait_on_page_locked_killable(page);
+		if (!PageUptodate(page) && !error)
+			error = xchg(&ctx->error, 0);
+	}
 out:
 	put_nfs_open_context(ctx);
 	return error;
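
The synchronous read path now clears ctx->error up front, starts the async read, waits
killably for the page lock, and, if the page still is not uptodate, harvests the error
that nfs_read_completion() latched into the open context (via the xchg() added earlier
in this patch). Using an exchange means reading the error also resets the latch in one
atomic step, so it is reported exactly once. A C11 analogue:

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int ctx_error;

	static void record_error(int err) { atomic_store(&ctx_error, err); }
	static int take_error(void)       { return atomic_exchange(&ctx_error, 0); }

	int main(void)
	{
		record_error(-5);	/* e.g. -EIO from the completion path */
		printf("first=%d second=%d\n", take_error(), take_error()); /* -5 0 */
		return 0;
	}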
@@ -365,7 +381,7 @@
 	if (len == 0)
 		return nfs_return_empty_page(page);
 
-	new = nfs_create_request(desc->ctx, page, NULL, 0, len);
+	new = nfs_create_request(desc->ctx, page, 0, len);
 	if (IS_ERR(new))
 		goto out_error;
 
@@ -373,8 +389,8 @@
 		zero_user_segment(page, len, PAGE_SIZE);
 	if (!nfs_pageio_add_request(desc->pgio, new)) {
 		nfs_list_remove_request(new);
-		nfs_readpage_release(new);
 		error = desc->pgio->pg_error;
+		nfs_readpage_release(new, error);
 		goto out;
 	}
 	return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ac4b2f0..a84df7d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/nfs/super.c
  *
@@ -76,9 +77,11 @@
 #define NFS_DEFAULT_VERSION 2
 #endif
 
+#define NFS_MAX_CONNECTIONS 16
+
 enum {
 	/* Mount options that take no arguments */
-	Opt_soft, Opt_hard,
+	Opt_soft, Opt_softerr, Opt_hard,
 	Opt_posix, Opt_noposix,
 	Opt_cto, Opt_nocto,
 	Opt_ac, Opt_noac,
@@ -107,6 +110,7 @@
 	Opt_nfsvers,
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
+	Opt_nconnect,
 	Opt_lookupcache,
 	Opt_fscache_uniq,
 	Opt_local_lock,
@@ -125,6 +129,7 @@
 	{ Opt_sloppy, "sloppy" },
 
 	{ Opt_soft, "soft" },
+	{ Opt_softerr, "softerr" },
 	{ Opt_hard, "hard" },
 	{ Opt_deprecated, "intr" },
 	{ Opt_deprecated, "nointr" },
@@ -179,6 +184,8 @@
 	{ Opt_mounthost, "mounthost=%s" },
 	{ Opt_mountaddr, "mountaddr=%s" },
 
+	{ Opt_nconnect, "nconnect=%s" },
+
 	{ Opt_lookupcache, "lookupcache=%s" },
 	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_local_lock, "local_lock=%s" },
@@ -309,7 +316,7 @@
 
 const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
+	.free_inode	= nfs_free_inode,
 	.write_inode	= nfs_write_inode,
 	.drop_inode	= nfs_drop_inode,
 	.statfs		= nfs_statfs,
@@ -450,10 +457,8 @@
 		struct dentry *pd_dentry;
 
 		pd_dentry = dget_parent(dentry);
-		if (pd_dentry != NULL) {
-			nfs_zap_caches(d_inode(pd_dentry));
-			dput(pd_dentry);
-		}
+		nfs_zap_caches(d_inode(pd_dentry));
+		dput(pd_dentry);
 	}
 	nfs_free_fattr(res.fattr);
 	if (error < 0)
@@ -580,7 +585,7 @@
 	}
 	default:
 		if (showdefaults)
-			seq_printf(m, ",mountaddr=unspecified");
+			seq_puts(m, ",mountaddr=unspecified");
 	}
 
 	if (nfss->mountd_version || showdefaults)
@@ -628,7 +633,8 @@
 		const char *str;
 		const char *nostr;
 	} nfs_info[] = {
-		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
+		{ NFS_MOUNT_SOFT, ",soft", "" },
+		{ NFS_MOUNT_SOFTERR, ",softerr", "" },
 		{ NFS_MOUNT_POSIX, ",posix", "" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
@@ -658,6 +664,8 @@
 		seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
 	if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
 		seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
+	if (!(nfss->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)))
+		seq_puts(m, ",hard");
 	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 		if (nfss->flags & nfs_infop->flag)
 			seq_puts(m, nfs_infop->str);
@@ -668,6 +676,8 @@
 	seq_printf(m, ",proto=%s",
 		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
 	rcu_read_unlock();
+	if (clp->cl_nconnect > 0)
+		seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
 	if (version == 4) {
 		if (nfss->port != NFS_PORT)
 			seq_printf(m, ",port=%u", nfss->port);
@@ -685,29 +695,29 @@
 		nfs_show_nfsv4_options(m, nfss, showdefaults);
 
 	if (nfss->options & NFS_OPTION_FSCACHE)
-		seq_printf(m, ",fsc");
+		seq_puts(m, ",fsc");
 
 	if (nfss->options & NFS_OPTION_MIGRATION)
-		seq_printf(m, ",migration");
+		seq_puts(m, ",migration");
 
 	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
 		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
-			seq_printf(m, ",lookupcache=none");
+			seq_puts(m, ",lookupcache=none");
 		else
-			seq_printf(m, ",lookupcache=pos");
+			seq_puts(m, ",lookupcache=pos");
 	}
 
 	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
 	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
 
 	if (!local_flock && !local_fcntl)
-		seq_printf(m, ",local_lock=none");
+		seq_puts(m, ",local_lock=none");
 	else if (local_flock && local_fcntl)
-		seq_printf(m, ",local_lock=all");
+		seq_puts(m, ",local_lock=all");
 	else if (local_flock)
-		seq_printf(m, ",local_lock=flock");
+		seq_puts(m, ",local_lock=flock");
 	else
-		seq_printf(m, ",local_lock=posix");
+		seq_puts(m, ",local_lock=posix");
 }
 
 /*
@@ -730,11 +740,21 @@
 EXPORT_SYMBOL_GPL(nfs_show_options);
 
 #if IS_ENABLED(CONFIG_NFS_V4)
+static void show_lease(struct seq_file *m, struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	unsigned long expire;
+
+	seq_printf(m, ",lease_time=%ld", clp->cl_lease_time / HZ);
+	expire = clp->cl_last_renewal + clp->cl_lease_time;
+	seq_printf(m, ",lease_expired=%ld",
+		   time_after(expire, jiffies) ?  0 : (jiffies - expire) / HZ);
+}
 #ifdef CONFIG_NFS_V4_1
 static void show_sessions(struct seq_file *m, struct nfs_server *server)
 {
 	if (nfs4_has_session(server->nfs_client))
-		seq_printf(m, ",sessions");
+		seq_puts(m, ",sessions");
 }
 #else
 static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
@@ -811,7 +831,7 @@
 	/*
 	 * Display all mount option settings
 	 */
-	seq_printf(m, "\n\topts:\t");
+	seq_puts(m, "\n\topts:\t");
 	seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw");
 	seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : "");
 	seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : "");
@@ -822,7 +842,7 @@
 
 	show_implementation_id(m, nfss);
 
-	seq_printf(m, "\n\tcaps:\t");
+	seq_puts(m, "\n\tcaps:\t");
 	seq_printf(m, "caps=0x%x", nfss->caps);
 	seq_printf(m, ",wtmult=%u", nfss->wtmult);
 	seq_printf(m, ",dtsize=%u", nfss->dtsize);
@@ -831,13 +851,14 @@
 
 #if IS_ENABLED(CONFIG_NFS_V4)
 	if (nfss->nfs_client->rpc_ops->version == 4) {
-		seq_printf(m, "\n\tnfsv4:\t");
+		seq_puts(m, "\n\tnfsv4:\t");
 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
 		seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
 		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
 		show_sessions(m, nfss);
 		show_pnfs(m, nfss);
+		show_lease(m, nfss);
 	}
 #endif
 
@@ -869,20 +890,20 @@
 		preempt_enable();
 	}
 
-	seq_printf(m, "\n\tevents:\t");
+	seq_puts(m, "\n\tevents:\t");
 	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
 		seq_printf(m, "%lu ", totals.events[i]);
-	seq_printf(m, "\n\tbytes:\t");
+	seq_puts(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
 #ifdef CONFIG_NFS_FSCACHE
 	if (nfss->options & NFS_OPTION_FSCACHE) {
-		seq_printf(m, "\n\tfsc:\t");
+		seq_puts(m, "\n\tfsc:\t");
 		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
 			seq_printf(m, "%Lu ", totals.fscache[i]);
 	}
 #endif
-	seq_printf(m, "\n");
+	seq_putc(m, '\n');
 
 	rpc_clnt_show_stats(m, nfss->client);
 
@@ -929,7 +950,7 @@
 		data->minorversion	= 0;
 		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
-		security_init_mnt_opts(&data->lsm_opts);
+		data->lsm_opts		= NULL;
 	}
 	return data;
 }
@@ -1206,7 +1227,7 @@
 static int nfs_parse_mount_options(char *raw,
 				   struct nfs_parsed_mount_data *mnt)
 {
-	char *p, *string, *secdata;
+	char *p, *string;
 	int rc, sloppy = 0, invalid_option = 0;
 	unsigned short protofamily = AF_UNSPEC;
 	unsigned short mountfamily = AF_UNSPEC;
@@ -1217,20 +1238,10 @@
 	}
 	dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw);
 
-	secdata = alloc_secdata();
-	if (!secdata)
-		goto out_nomem;
-
-	rc = security_sb_copy_data(raw, secdata);
+	rc = security_sb_eat_lsm_opts(raw, &mnt->lsm_opts);
 	if (rc)
 		goto out_security_failure;
 
-	rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts);
-	if (rc)
-		goto out_security_failure;
-
-	free_secdata(secdata);
-
 	while ((p = strsep(&raw, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		unsigned long option;
@@ -1249,9 +1260,14 @@
 		 */
 		case Opt_soft:
 			mnt->flags |= NFS_MOUNT_SOFT;
+			mnt->flags &= ~NFS_MOUNT_SOFTERR;
+			break;
+		case Opt_softerr:
+			mnt->flags |= NFS_MOUNT_SOFTERR;
+			mnt->flags &= ~NFS_MOUNT_SOFT;
 			break;
 		case Opt_hard:
-			mnt->flags &= ~NFS_MOUNT_SOFT;
+			mnt->flags &= ~(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR);
 			break;
 		case Opt_posix:
 			mnt->flags |= NFS_MOUNT_POSIX;
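
soft, softerr and hard now form a tri-state: each case sets its own bit (if any) and
clears the other, so the last option given wins, and the mount-option dump above
prints ",hard" only when neither soft bit is set. The same last-one-wins logic in
standalone C (bit values are illustrative, not the kernel's):

	#include <stdio.h>
	#include <string.h>

	#define DEMO_SOFT    0x1
	#define DEMO_SOFTERR 0x2

	static unsigned int apply(unsigned int flags, const char *opt)
	{
		if (!strcmp(opt, "soft")) {
			flags |= DEMO_SOFT;
			flags &= ~DEMO_SOFTERR;
		} else if (!strcmp(opt, "softerr")) {
			flags |= DEMO_SOFTERR;
			flags &= ~DEMO_SOFT;
		} else if (!strcmp(opt, "hard")) {
			flags &= ~(DEMO_SOFT | DEMO_SOFTERR);
		}
		return flags;
	}

	int main(void)
	{
		unsigned int f = apply(apply(0, "soft"), "softerr");

		/* prints soft=0 softerr=1: softerr overrode soft */
		printf("soft=%d softerr=%d\n",
		       !!(f & DEMO_SOFT), !!(f & DEMO_SOFTERR));
		return 0;
	}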
@@ -1549,6 +1565,11 @@
 			if (mnt->mount_server.addrlen == 0)
 				goto out_invalid_address;
 			break;
+		case Opt_nconnect:
+			if (nfs_get_option_ul_bound(args, &option, 1, NFS_MAX_CONNECTIONS))
+				goto out_invalid_value;
+			mnt->nfs_server.nconnect = option;
+			break;
 		case Opt_lookupcache:
 			string = match_strdup(args);
 			if (string == NULL)
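The nconnect parse relies on a range-checked helper. A minimal sketch of what nfs_get_option_ul_bound() must guarantee (the real helper lives elsewhere in fs/nfs/super.c; this standalone version is illustrative only):

	/* Parse a decimal option value and enforce an inclusive
	 * [lo, hi] range, as Opt_nconnect does with
	 * 1..NFS_MAX_CONNECTIONS. Returns 0 on success. */
	static int parse_ul_bound(const char *s, unsigned long *out,
				  unsigned long lo, unsigned long hi)
	{
		unsigned long v;
		int rc = kstrtoul(s, 10, &v);

		if (rc)
			return rc;		/* not a valid number */
		if (v < lo || v > hi)
			return -ERANGE;		/* outside the bounds */
		*out = v;
		return 0;
	}

On success the stored value caps how many transport connections the client opens to the server, e.g. mount -o nconnect=4.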
@@ -1682,7 +1703,6 @@
 	printk(KERN_INFO "NFS: not enough memory to parse option\n");
 	return 0;
 out_security_failure:
-	free_secdata(secdata);
 	printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
 	return 0;
 }
@@ -1906,6 +1926,11 @@
 	size_t len;
 	char *end;
 
+	if (unlikely(!dev_name || !*dev_name)) {
+		dfprintk(MOUNT, "NFS: device name not specified\n");
+		return -EINVAL;
+	}
+
 	/* Is the host name protected with square brackets? */
 	if (*dev_name == '[') {
 		end = strchr(++dev_name, ']');
@@ -1925,7 +1950,7 @@
 		/* kill possible hostname list: not supported */
 		comma = strchr(dev_name, ',');
 		if (comma != NULL && comma < end)
-			*comma = 0;
+			len = comma - dev_name;
 	}
 
 	if (len > maxnamlen)
@@ -2047,7 +2072,8 @@
 		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
 		args->nfs_server.port = ntohs(data->addr.sin_port);
-		if (!nfs_verify_server_address(sap))
+		if (sap->sa_family != AF_INET ||
+		    !nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (!(data->flags & NFS_MOUNT_TCP))
@@ -2081,14 +2107,9 @@
 		if (data->context[0]){
 #ifdef CONFIG_SECURITY_SELINUX
 			int rc;
-			char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL);
-			if (!opts_str)
-				return -ENOMEM;
-			strcpy(opts_str, "context=");
 			data->context[NFS_MAX_CONTEXT_LEN] = '\0';
-			strcat(opts_str, &data->context[0]);
-			rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts);
-			kfree(opts_str);
+			rc = security_add_mnt_opt("context", data->context,
+					strlen(data->context), &args->lsm_opts);
 			if (rc)
 				return rc;
 #else
@@ -2168,7 +2189,10 @@
 
 	if (args->version == 4) {
 #if IS_ENABLED(CONFIG_NFS_V4)
-		port = NFS_PORT;
+		if (args->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+			port = NFS_RDMA_PORT;
+		else
+			port = NFS_PORT;
 		max_namelen = NFS4_MAXNAMLEN;
 		max_pathlen = NFS4_MAXPATHLEN;
 		nfs_validate_transport_protocol(args);
@@ -2178,8 +2202,11 @@
 #else
 		goto out_v4_not_compiled;
 #endif /* CONFIG_NFS_V4 */
-	} else
+	} else {
 		nfs_set_mount_transport_protocol(args);
+		if (args->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+			port = NFS_RDMA_PORT;
+	}
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
@@ -2233,6 +2260,7 @@
 	    data->acdirmin != nfss->acdirmin / HZ ||
 	    data->acdirmax != nfss->acdirmax / HZ ||
 	    data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+	    (data->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) ||
 	    data->nfs_server.port != nfss->port ||
 	    data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
 	    !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
@@ -2265,7 +2293,7 @@
 					   options->version <= 6))))
 		return 0;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = nfs_alloc_parsed_mount_data();
 	if (data == NULL)
 		return -ENOMEM;
 
@@ -2304,8 +2332,10 @@
 
 	/* compare new mount options with old ones */
 	error = nfs_compare_remount_data(nfss, data);
+	if (!error)
+		error = security_sb_remount(sb, data->lsm_opts);
 out:
-	kfree(data);
+	nfs_free_parsed_mount_data(data);
 	return error;
 }
 EXPORT_SYMBOL_GPL(nfs_remount);
@@ -2352,6 +2382,15 @@
 		sb->s_flags |= SB_POSIXACL;
 		sb->s_time_gran = 1;
 		sb->s_export_op = &nfs_export_ops;
+	} else
+		sb->s_time_gran = 1000;
+
+	if (server->nfs_client->rpc_ops->version != 4) {
+		sb->s_time_min = 0;
+		sb->s_time_max = U32_MAX;
+	} else {
+		sb->s_time_min = S64_MIN;
+		sb->s_time_max = S64_MAX;
 	}
 
  	nfs_initialise_sb(sb);
@@ -2372,7 +2411,6 @@
 	sb->s_maxbytes = old_sb->s_maxbytes;
 	sb->s_xattr = old_sb->s_xattr;
 	sb->s_op = old_sb->s_op;
-	sb->s_time_gran = 1;
 	sb->s_export_op = old_sb->s_export_op;
 
 	if (server->nfs_client->rpc_ops->version != 2) {
@@ -2380,6 +2418,16 @@
 		 * so ourselves when necessary.
 		 */
 		sb->s_flags |= SB_POSIXACL;
+		sb->s_time_gran = 1;
+	} else
+		sb->s_time_gran = 1000;
+
+	if (server->nfs_client->rpc_ops->version != 4) {
+		sb->s_time_min = 0;
+		sb->s_time_max = U32_MAX;
+	} else {
+		sb->s_time_min = S64_MIN;
+		sb->s_time_max = S64_MAX;
 	}
 
  	nfs_initialise_sb(sb);
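Read together, the two hunks above give each protocol version its on-the-wire timestamp envelope (a summary inferred from the NFS wire formats, not stated in the patch):

	NFSv2:	s_time_gran = 1000 ns (seconds + microseconds),	range 0 .. U32_MAX
	NFSv3:	s_time_gran = 1 ns, unsigned 32-bit seconds,	range 0 .. U32_MAX
	NFSv4:	s_time_gran = 1 ns, signed 64-bit seconds,	range S64_MIN .. S64_MAX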
@@ -2409,8 +2457,7 @@
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->auth_info.flavor_len > 0 &&
-	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+	if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
 Ebusy:
@@ -2479,6 +2526,21 @@
 	return 1;
 }
 
+static int nfs_compare_userns(const struct nfs_server *old,
+		const struct nfs_server *new)
+{
+	const struct user_namespace *oldns = &init_user_ns;
+	const struct user_namespace *newns = &init_user_ns;
+
+	if (old->client && old->client->cl_cred)
+		oldns = old->client->cl_cred->user_ns;
+	if (new->client && new->client->cl_cred)
+		newns = new->client->cl_cred->user_ns;
+	if (oldns != newns)
+		return 0;
+	return 1;
+}
+
 static int nfs_compare_super(struct super_block *sb, void *data)
 {
 	struct nfs_sb_mountdata *sb_mntdata = data;
@@ -2492,6 +2554,8 @@
 		return 0;
 	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
 		return 0;
+	if (!nfs_compare_userns(old, server))
+		return 0;
 	return nfs_compare_mount_options(sb, server, mntflags);
 }
 
@@ -2543,7 +2607,7 @@
 	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
 		kflags |= SECURITY_LSM_NATIVE_LABELS;
 
-	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+	error = security_sb_set_mnt_opts(s, mount_info->parsed->lsm_opts,
 						kflags, &kflags_out);
 	if (error)
 		goto err;
@@ -2581,6 +2645,13 @@
 }
 EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
 
+static void nfs_set_readahead(struct backing_dev_info *bdi,
+			      unsigned long iomax_pages)
+{
+	bdi->ra_pages = VM_READAHEAD_PAGES;
+	bdi->io_pages = iomax_pages;
+}
+
 struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 				   int flags, const char *dev_name,
 				   struct nfs_mount_info *mount_info,
@@ -2623,7 +2694,7 @@
 			mntroot = ERR_PTR(error);
 			goto error_splat_super;
 		}
-		s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+		nfs_set_readahead(s->s_bdi, server->rpages);
 		server->super = s;
 	}
 
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 06eb44b..25ba299 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -26,8 +26,9 @@
  * and straight-forward than readdir caching.
  */
 
-static int nfs_symlink_filler(struct inode *inode, struct page *page)
+static int nfs_symlink_filler(void *data, struct page *page)
 {
+	struct inode *inode = data;
 	int error;
 
 	error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE);
@@ -65,8 +66,8 @@
 		err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 		if (err)
 			return err;
-		page = read_cache_page(&inode->i_data, 0,
-					(filler_t *)nfs_symlink_filler, inode);
+		page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
+				inode);
 		if (IS_ERR(page))
 			return ERR_CAST(page);
 	}
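The new signature matches the generic filler_t that read_cache_page() expects, so the function-pointer cast at the call site disappears. A minimal sketch of the resulting pattern (illustrative names; the filler's real duty of populating the page, marking it up to date, and unlocking it is elided here, as it is in the hunk):

	static int my_filler(void *data, struct page *page)
	{
		struct inode *inode = data;	/* recover the typed cookie */

		/* ... fill the page from 'inode', then SetPageUptodate()
		 * and unlock_page() on success ... */
		return 0;
	}

	/* caller: page = read_cache_page(mapping, 0, my_filler, inode); */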
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
new file mode 100644
index 0000000..4f3390b
--- /dev/null
+++ b/fs/nfs/sysfs.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/nfs_fs.h>
+#include <linux/rcupdate.h>
+
+#include "nfs4_fs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+struct kobject *nfs_client_kobj;
+static struct kset *nfs_client_kset;
+
+static void nfs_netns_object_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
+		struct kobject *kobj)
+{
+	return &net_ns_type_operations;
+}
+
+static struct kobj_type nfs_netns_object_type = {
+	.release = nfs_netns_object_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.child_ns_type = nfs_netns_object_child_ns_type,
+};
+
+static struct kobject *nfs_netns_object_alloc(const char *name,
+		struct kset *kset, struct kobject *parent)
+{
+	struct kobject *kobj;
+
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	if (kobj) {
+		kobj->kset = kset;
+		if (kobject_init_and_add(kobj, &nfs_netns_object_type,
+					parent, "%s", name) == 0)
+			return kobj;
+		kobject_put(kobj);
+	}
+	return NULL;
+}
+
+int nfs_sysfs_init(void)
+{
+	nfs_client_kset = kset_create_and_add("nfs", NULL, fs_kobj);
+	if (!nfs_client_kset)
+		return -ENOMEM;
+	nfs_client_kobj = nfs_netns_object_alloc("net", nfs_client_kset, NULL);
+	if (!nfs_client_kobj) {
+		kset_unregister(nfs_client_kset);
+		nfs_client_kset = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void nfs_sysfs_exit(void)
+{
+	kobject_put(nfs_client_kobj);
+	kset_unregister(nfs_client_kset);
+}
+
+static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct nfs_netns_client *c = container_of(kobj,
+			struct nfs_netns_client,
+			kobject);
+	return scnprintf(buf, PAGE_SIZE, "%s\n", c->identifier);
+}
+
+/* Strip trailing '\n' */
+static size_t nfs_string_strip(const char *c, size_t len)
+{
+	while (len > 0 && c[len-1] == '\n')
+		--len;
+	return len;
+}
+
+static ssize_t nfs_netns_identifier_store(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct nfs_netns_client *c = container_of(kobj,
+			struct nfs_netns_client,
+			kobject);
+	const char *old;
+	char *p;
+	size_t len;
+
+	len = nfs_string_strip(buf, min_t(size_t, count, CONTAINER_ID_MAXLEN));
+	if (!len)
+		return 0;
+	p = kmemdup_nul(buf, len, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	old = xchg(&c->identifier, p);
+	if (old) {
+		synchronize_rcu();
+		kfree(old);
+	}
+	return count;
+}
+
+static void nfs_netns_client_release(struct kobject *kobj)
+{
+	struct nfs_netns_client *c = container_of(kobj,
+			struct nfs_netns_client,
+			kobject);
+
+	if (c->identifier)
+		kfree(c->identifier);
+	kfree(c);
+}
+
+static const void *nfs_netns_client_namespace(struct kobject *kobj)
+{
+	return container_of(kobj, struct nfs_netns_client, kobject)->net;
+}
+
+static struct kobj_attribute nfs_netns_client_id = __ATTR(identifier,
+		0644, nfs_netns_identifier_show, nfs_netns_identifier_store);
+
+static struct attribute *nfs_netns_client_attrs[] = {
+	&nfs_netns_client_id.attr,
+	NULL,
+};
+
+static struct kobj_type nfs_netns_client_type = {
+	.release = nfs_netns_client_release,
+	.default_attrs = nfs_netns_client_attrs,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.namespace = nfs_netns_client_namespace,
+};
+
+static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
+		struct net *net)
+{
+	struct nfs_netns_client *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (p) {
+		p->net = net;
+		p->kobject.kset = nfs_client_kset;
+		if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type,
+					parent, "nfs_client") == 0)
+			return p;
+		kobject_put(&p->kobject);
+	}
+	return NULL;
+}
+
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net)
+{
+	struct nfs_netns_client *clp;
+
+	clp = nfs_netns_client_alloc(nfs_client_kobj, net);
+	if (clp) {
+		netns->nfs_client = clp;
+		kobject_uevent(&clp->kobject, KOBJ_ADD);
+	}
+}
+
+void nfs_netns_sysfs_destroy(struct nfs_net *netns)
+{
+	struct nfs_netns_client *clp = netns->nfs_client;
+
+	if (clp) {
+		kobject_uevent(&clp->kobject, KOBJ_REMOVE);
+		kobject_del(&clp->kobject);
+		kobject_put(&clp->kobject);
+		netns->nfs_client = NULL;
+	}
+}
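The identifier store above publishes the new string with xchg() and defers kfree() until after synchronize_rcu(). The matching reader discipline is not part of this file; a sketch of it, assuming consumers access the identifier inside an RCU read-side critical section (use_identifier() is hypothetical):

	rcu_read_lock();
	id = READ_ONCE(clp->identifier);	/* may be NULL */
	if (id)
		use_identifier(id);	/* hypothetical consumer; must not
					 * sleep or cache 'id' past unlock */
	rcu_read_unlock();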
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
new file mode 100644
index 0000000..f1b2741
--- /dev/null
+++ b/fs/nfs/sysfs.h
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#ifndef __NFS_SYSFS_H
+#define __NFS_SYSFS_H
+
+#define CONTAINER_ID_MAXLEN (64)
+
+struct nfs_netns_client {
+	struct kobject kobject;
+	struct net *net;
+	const char *identifier;
+};
+
+extern struct kobject *nfs_client_kobj;
+
+extern int nfs_sysfs_init(void);
+extern void nfs_sysfs_exit(void);
+
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net);
+void nfs_netns_sysfs_destroy(struct nfs_net *netns);
+
+#endif
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index fd61bf0..0effeee 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -31,7 +31,7 @@
 static void
 nfs_free_unlinkdata(struct nfs_unlinkdata *data)
 {
-	put_rpccred(data->cred);
+	put_cred(data->cred);
 	kfree(data->args.name.name);
 	kfree(data);
 }
@@ -39,6 +39,7 @@
 /**
  * nfs_async_unlink_done - Sillydelete post-processing
  * @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
  *
  * Do the directory attribute update.
  */
@@ -54,7 +55,7 @@
 
 /**
  * nfs_async_unlink_release - Release the sillydelete data.
- * @task: rpc_task of the sillydelete
+ * @calldata: struct nfs_unlinkdata to release
  *
  * We need to call nfs_put_unlinkdata as a 'tk_release' task since the
  * rpc_task would be freed too.
@@ -159,8 +160,8 @@
 
 /**
  * nfs_async_unlink - asynchronous unlinking of a file
- * @dir: parent directory of dentry
- * @dentry: dentry to unlink
+ * @dentry: parent directory of dentry
+ * @name: name of dentry to unlink
  */
 static int
 nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
@@ -177,11 +178,7 @@
 		goto out_free;
 	data->args.name.len = name->len;
 
-	data->cred = rpc_lookup_cred();
-	if (IS_ERR(data->cred)) {
-		status = PTR_ERR(data->cred);
-		goto out_free_name;
-	}
+	data->cred = get_current_cred();
 	data->res.dir_attr = &data->dir_attr;
 	init_waitqueue_head(&data->wq);
 
@@ -202,8 +199,7 @@
 	return 0;
 out_unlock:
 	spin_unlock(&dentry->d_lock);
-	put_rpccred(data->cred);
-out_free_name:
+	put_cred(data->cred);
 	kfree(data->args.name.name);
 out_free:
 	kfree(data);
@@ -307,7 +303,7 @@
 	iput(data->old_dir);
 	iput(data->new_dir);
 	nfs_sb_deactive(sb);
-	put_rpccred(data->cred);
+	put_cred(data->cred);
 	kfree(data);
 }
 
@@ -329,6 +325,7 @@
  * @new_dir: target directory for the rename
  * @old_dentry: original dentry to be renamed
  * @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
  *
  * It's expected that valid references to the dentries and inodes are held
  */
@@ -352,12 +349,7 @@
 		return ERR_PTR(-ENOMEM);
 	task_setup_data.callback_data = data;
 
-	data->cred = rpc_lookup_cred();
-	if (IS_ERR(data->cred)) {
-		struct rpc_task *task = ERR_CAST(data->cred);
-		kfree(data);
-		return task;
-	}
+	data->cred = get_current_cred();
 
 	msg.rpc_argp = &data->args;
 	msg.rpc_resp = &data->res;
@@ -404,12 +396,6 @@
 		nfs_cancel_async_unlink(dentry);
 		return;
 	}
-
-	/*
-	 * vfs_unlink and the like do not issue this when a file is
-	 * sillyrenamed, so do it here.
-	 */
-	fsnotify_nameremove(dentry, 0);
 }
 
 #define SILLYNAME_PREFIX ".nfs"
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 586726a..52cab65 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/nfs/write.c
  *
@@ -26,6 +27,7 @@
 #include <linux/iversion.h>
 
 #include <linux/uaccess.h>
+#include <linux/sched/mm.h>
 
 #include "delegation.h"
 #include "internal.h"
@@ -55,6 +57,7 @@
 static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_inode_remove_request(struct nfs_page *req);
 static void nfs_clear_request_commit(struct nfs_page *req);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode);
@@ -101,7 +104,7 @@
 
 static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
 
 	memset(p, 0, sizeof(*p));
 	p->rw_mode = FMODE_WRITE;
@@ -238,9 +241,15 @@
 }
 
 /* A writeback failed: mark the page as bad, and invalidate the page cache */
-static void nfs_set_pageerror(struct page *page)
+static void nfs_set_pageerror(struct address_space *mapping)
 {
-	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+	nfs_zap_mapping(mapping->host, mapping);
+}
+
+static void nfs_mapping_set_error(struct page *page, int error)
+{
+	SetPageError(page);
+	mapping_set_error(page_file_mapping(page), error);
 }
 
 /*
@@ -581,26 +590,15 @@
 	return ERR_PTR(ret);
 }
 
-static void nfs_write_error_remove_page(struct nfs_page *req)
+static void nfs_write_error(struct nfs_page *req, int error)
 {
+	nfs_set_pageerror(page_file_mapping(req->wb_page));
+	nfs_mapping_set_error(req->wb_page, error);
+	nfs_inode_remove_request(req);
 	nfs_end_page_writeback(req);
-	generic_error_remove_page(page_file_mapping(req->wb_page),
-				  req->wb_page);
 	nfs_release_request(req);
 }
 
-static bool
-nfs_error_is_fatal_on_server(int err)
-{
-	switch (err) {
-	case 0:
-	case -ERESTARTSYS:
-	case -EINTR:
-		return false;
-	}
-	return nfs_error_is_fatal(err);
-}
-
 /*
  * Find an associated nfs write request, and prepare to flush it out
  * May return an error if the user signalled nfs_wait_on_request().
@@ -621,31 +619,32 @@
 	nfs_set_page_writeback(page);
 	WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
 
-	ret = 0;
 	/* If there is a fatal error that covers this write, just exit */
-	if (nfs_error_is_fatal_on_server(req->wb_context->error))
+	ret = pgio->pg_error;
+	if (nfs_error_is_fatal_on_server(ret))
 		goto out_launder;
 
+	ret = 0;
 	if (!nfs_pageio_add_request(pgio, req)) {
 		ret = pgio->pg_error;
 		/*
 		 * Remove the problematic req upon fatal errors on the server
 		 */
 		if (nfs_error_is_fatal(ret)) {
-			nfs_context_set_write_error(req->wb_context, ret);
 			if (nfs_error_is_fatal_on_server(ret))
 				goto out_launder;
-		}
+		} else
+			ret = -EAGAIN;
 		nfs_redirty_request(req);
-		ret = -EAGAIN;
+		pgio->pg_error = 0;
 	} else
 		nfs_add_stats(page_file_mapping(page)->host,
 				NFSIOS_WRITEPAGES, 1);
 out:
 	return ret;
 out_launder:
-	nfs_write_error_remove_page(req);
-	return ret;
+	nfs_write_error(req, ret);
+	return 0;
 }
 
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
@@ -657,7 +656,7 @@
 	ret = nfs_page_async_flush(pgio, page);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
-		ret = 0;
+		ret = AOP_WRITEPAGE_ACTIVATE;
 	}
 	return ret;
 }
@@ -676,10 +675,11 @@
 	nfs_pageio_init_write(&pgio, inode, 0,
 				false, &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio);
+	pgio.pg_error = 0;
 	nfs_pageio_complete(&pgio);
 	if (err < 0)
 		return err;
-	if (pgio.pg_error < 0)
+	if (nfs_error_is_fatal(pgio.pg_error))
 		return pgio.pg_error;
 	return 0;
 }
@@ -689,7 +689,8 @@
 	int ret;
 
 	ret = nfs_writepage_locked(page, wbc);
-	unlock_page(page);
+	if (ret != AOP_WRITEPAGE_ACTIVATE)
+		unlock_page(page);
 	return ret;
 }
 
@@ -698,7 +699,8 @@
 	int ret;
 
 	ret = nfs_do_writepage(page, wbc, data);
-	unlock_page(page);
+	if (ret != AOP_WRITEPAGE_ACTIVATE)
+		unlock_page(page);
 	return ret;
 }
 
@@ -711,11 +713,12 @@
 {
 	struct inode *inode = mapping->host;
 	struct nfs_pageio_descriptor pgio;
-	struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS);
+	struct nfs_io_completion *ioc;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
+	ioc = nfs_io_completion_alloc(GFP_KERNEL);
 	if (ioc)
 		nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
 
@@ -723,13 +726,14 @@
 				&nfs_async_write_completion_ops);
 	pgio.pg_io_completion = ioc;
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
+	pgio.pg_error = 0;
 	nfs_pageio_complete(&pgio);
 	nfs_io_completion_put(ioc);
 
 	if (err < 0)
 		goto out_err;
 	err = pgio.pg_error;
-	if (err < 0)
+	if (nfs_error_is_fatal(err))
 		goto out_err;
 	return 0;
 out_err:
@@ -782,7 +786,6 @@
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *head;
 
-	atomic_long_dec(&nfsi->nrequests);
 	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
 		head = req->wb_head;
 
@@ -795,8 +798,10 @@
 		spin_unlock(&mapping->private_lock);
 	}
 
-	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
 		nfs_release_request(req);
+		atomic_long_dec(&nfsi->nrequests);
+	}
 }
 
 static void
@@ -864,7 +869,6 @@
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
- * @dst: commit list head
  * @cinfo: holds list lock and accounting info
  *
  * This sets the PG_CLEAN bit, updates the cinfo count of
@@ -953,7 +957,8 @@
 nfs_clear_request_commit(struct nfs_page *req)
 {
 	if (test_bit(PG_CLEAN, &req->wb_flags)) {
-		struct inode *inode = d_inode(req->wb_context->dentry);
+		struct nfs_open_context *ctx = nfs_req_openctx(req);
+		struct inode *inode = d_inode(ctx->dentry);
 		struct nfs_commit_info cinfo;
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
@@ -993,11 +998,13 @@
 		nfs_list_remove_request(req);
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
 		    (hdr->good_bytes < bytes)) {
-			nfs_set_pageerror(req->wb_page);
-			nfs_context_set_write_error(req->wb_context, hdr->error);
+			nfs_set_pageerror(page_file_mapping(req->wb_page));
+			nfs_mapping_set_error(req->wb_page, hdr->error);
 			goto remove_req;
 		}
 		if (nfs_write_need_commit(hdr)) {
+			/* Reset wb_nio, since the write was successful. */
+			req->wb_nio = 0;
 			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
 				hdr->pgio_mirror_idx);
@@ -1131,6 +1138,7 @@
 		req->wb_bytes = end - req->wb_offset;
 	else
 		req->wb_bytes = rqend - req->wb_offset;
+	req->wb_nio = 0;
 	return req;
 out_flushme:
 	/*
@@ -1160,7 +1168,7 @@
 	req = nfs_try_to_update_request(inode, page, offset, bytes);
 	if (req != NULL)
 		goto out;
-	req = nfs_create_request(ctx, page, NULL, offset, bytes);
+	req = nfs_create_request(ctx, page, offset, bytes);
 	if (IS_ERR(req))
 		goto out;
 	nfs_inode_add_request(inode, req);
@@ -1205,7 +1213,7 @@
 			return 0;
 		l_ctx = req->wb_lock_context;
 		do_flush = req->wb_page != page ||
-			!nfs_match_open_context(req->wb_context, ctx);
+			!nfs_match_open_context(nfs_req_openctx(req), ctx);
 		if (l_ctx && flctx &&
 		    !(list_empty_careful(&flctx->flc_posix) &&
 		      list_empty_careful(&flctx->flc_flock))) {
@@ -1233,9 +1241,12 @@
 nfs_key_timeout_notify(struct file *filp, struct inode *inode)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
-	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
 
-	return rpcauth_key_timeout_notify(auth, ctx->cred);
+	if (nfs_ctx_key_to_expire(ctx, inode) &&
+	    !ctx->ll_cred)
+		/* Already expired! */
+		return -EACCES;
+	return 0;
 }
 
 /*
@@ -1244,8 +1255,23 @@
 bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
 {
 	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+	struct rpc_cred *cred = ctx->ll_cred;
+	struct auth_cred acred = {
+		.cred = ctx->cred,
+	};
 
-	return rpcauth_cred_key_to_expire(auth, ctx->cred);
+	if (cred && !cred->cr_ops->crmatch(&acred, cred, 0)) {
+		put_rpccred(cred);
+		ctx->ll_cred = NULL;
+		cred = NULL;
+	}
+	if (!cred)
+		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
+	if (!cred || IS_ERR(cred))
+		return true;
+	ctx->ll_cred = cred;
+	return !!(cred->cr_ops->crkey_timeout &&
+		  cred->cr_ops->crkey_timeout(cred));
 }
 
 /*
@@ -1329,7 +1355,8 @@
 		unsigned int offset, unsigned int count)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
-	struct inode	*inode = page_file_mapping(page)->host;
+	struct address_space *mapping = page_file_mapping(page);
+	struct inode	*inode = mapping->host;
 	int		status = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1347,7 +1374,7 @@
 
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
-		nfs_set_pageerror(page);
+		nfs_set_pageerror(mapping);
 	else
 		__set_page_dirty_nobuffers(page);
 out:
@@ -1386,26 +1413,31 @@
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	/* Bump the transmission count */
+	req->wb_nio++;
 	nfs_mark_request_dirty(req);
-	set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
+	set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
 	nfs_end_page_writeback(req);
 	nfs_release_request(req);
 }
 
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
 {
 	struct nfs_page	*req;
 
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_redirty_request(req);
+		if (nfs_error_is_fatal(error))
+			nfs_write_error(req, error);
+		else
+			nfs_redirty_request(req);
 	}
 }
 
 static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
 {
-	nfs_async_write_error(&hdr->pages);
+	nfs_async_write_error(&hdr->pages, 0);
 	filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
 			hdr->args.offset + hdr->args.count - 1);
 }
@@ -1704,7 +1736,8 @@
 		     struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
-	struct inode *inode = d_inode(first->wb_context->dentry);
+	struct nfs_open_context *ctx = nfs_req_openctx(first);
+	struct inode *inode = d_inode(ctx->dentry);
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -1712,7 +1745,7 @@
 	list_splice_init(head, &data->pages);
 
 	data->inode	  = inode;
-	data->cred	  = first->wb_context->cred;
+	data->cred	  = ctx->cred;
 	data->lseg	  = lseg; /* reference transferred */
 	/* only set lwb for pnfs commit */
 	if (lseg)
@@ -1725,7 +1758,7 @@
 	/* Note: we always request a commit of the entire inode */
 	data->args.offset = 0;
 	data->args.count  = 0;
-	data->context     = get_nfs_open_context(first->wb_context);
+	data->context     = get_nfs_open_context(ctx);
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
@@ -1808,14 +1841,15 @@
 			nfs_clear_page_commit(req->wb_page);
 
 		dprintk("NFS:       commit (%s/%llu %d@%lld)",
-			req->wb_context->dentry->d_sb->s_id,
-			(unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
+			nfs_req_openctx(req)->dentry->d_sb->s_id,
+			(unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)),
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (status < 0) {
-			nfs_context_set_write_error(req->wb_context, status);
-			if (req->wb_page)
+			if (req->wb_page) {
+				nfs_mapping_set_error(req->wb_page, status);
 				nfs_inode_remove_request(req);
+			}
 			dprintk_cont(", error = %d\n", status);
 			goto next;
 		}
@@ -1832,7 +1866,7 @@
 		/* We have a mismatch. Write the page again */
 		dprintk_cont(" mismatch\n");
 		nfs_mark_request_dirty(req);
-		set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
+		set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
 	next:
 		nfs_unlock_and_release_request(req);
 		/* Latency breaker */
@@ -2121,7 +2155,7 @@
 	 * This allows larger machines to have larger/more transfers.
 	 * Limit the default to 256M
 	 */
-	nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
 	if (nfs_congestion_kb > 256*1024)
 		nfs_congestion_kb = 256*1024;
 
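For scale, a worked example with assumed values (not from the patch): on a machine with 4 GiB of RAM and 4 KiB pages, totalram_pages() is 2^20, int_sqrt() of that is 1024, and 16 * 1024 << (PAGE_SHIFT - 10) = 16384 << 2 = 65536 KB, i.e. a 64 MB congestion threshold. The 256 MB cap (256*1024 KB) only bites on machines beyond roughly 64 GiB of RAM.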
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index d153ca3..4bebe83 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for Linux filesystem routines that are shared by client and server.
 #
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index 5be08f0..b73d9dd 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Common code for control of lockd and nfsv4 grace periods.
  *
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 538f142..8ceb642 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/nfs_common/nfsacl.c
  *
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 20b1c17..10cefb0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,7 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config NFSD
 	tristate "NFS server support"
 	depends on INET
 	depends on FILE_LOCKING
+	depends on FSNOTIFY
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
@@ -146,7 +148,7 @@
 
 config NFSD_FAULT_INJECTION
 	bool "NFS server manual fault injection"
-	depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
+	depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN
 	help
 	  This option enables support for manually injecting faults
 	  into the NFS server.  This is intended to be used for
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 2bfb58e..6a40b1a 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,7 +11,8 @@
 nfsd-y			+= trace.o
 
 nfsd-y 			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
-			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+			   export.o auth.o lockd.o nfscache.o nfsxdr.o \
+			   stats.o filecache.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 4cd7c69..ba14d2f 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,14 +39,6 @@
 struct svc_fh;
 struct svc_rqst;
 
-/*
- * Maximum ACL we'll accept from a client; chosen (somewhat
- * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
- * high-order allocation.  This allows 204 ACEs on x86_64:
- */
-#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
-			/ sizeof(struct nfs4_ace))
-
 int nfs4_acl_bytes(int entries);
 int nfs4_acl_get_whotype(char *, u32);
 __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 4fb1f72..9bbaa67 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -15,6 +15,7 @@
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
@@ -121,15 +122,13 @@
 {
 	loff_t new_size = lcp->lc_last_wr + 1;
 	struct iattr iattr = { .ia_valid = 0 };
-	struct timespec ts;
 	int error;
 
-	ts = timespec64_to_timespec(inode->i_mtime);
 	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
-	    timespec_compare(&lcp->lc_mtime, &ts) < 0)
-		lcp->lc_mtime = timespec64_to_timespec(current_time(inode));
+	    timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+		lcp->lc_mtime = current_time(inode);
 	iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
-	iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = timespec_to_timespec64(lcp->lc_mtime);
+	iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
 
 	if (new_size > i_size_read(inode)) {
 		iattr.ia_valid |= ATTR_SIZE;
@@ -406,7 +405,7 @@
 nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
-	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+	struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
 
 	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
 			nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b7559c6..10ec5ec 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -10,6 +10,7 @@
 #define NFSCACHE_H
 
 #include <linux/sunrpc/svc.h>
+#include "netns.h"
 
 /*
  * Representation of a reply cache entry.
@@ -19,18 +20,22 @@
  * is much larger than a sockaddr_in6.
  */
 struct svc_cacherep {
-	struct list_head	c_lru;
+	struct {
+		/* Keep often-read xid, csum in the same cache line: */
+		__be32			k_xid;
+		__wsum			k_csum;
+		u32			k_proc;
+		u32			k_prot;
+		u32			k_vers;
+		unsigned int		k_len;
+		struct sockaddr_in6	k_addr;
+	} c_key;
 
+	struct rb_node		c_node;
+	struct list_head	c_lru;
 	unsigned char		c_state,	/* unused, inprog, done */
 				c_type,		/* status, buffer */
 				c_secure : 1;	/* req came from port < 1024 */
-	struct sockaddr_in6	c_addr;
-	__be32			c_xid;
-	u32			c_prot;
-	u32			c_proc;
-	u32			c_vers;
-	unsigned int		c_len;
-	__wsum			c_csum;
 	unsigned long		c_timestamp;
 	union {
 		struct kvec	u_vec;
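Grouping every lookup field into the embedded c_key keeps the hot xid and checksum on one cache line and lets a lookup compare candidates with a single memcmp over the key. A sketch of that comparison (an inference; the patch's actual lookup helper is outside this hunk, and a whole-struct memcmp is only sound if any padding is zeroed):

	static int cache_key_cmp(const struct svc_cacherep *a,
				 const struct svc_cacherep *b)
	{
		/* all match fields are contiguous inside c_key */
		return memcmp(&a->c_key, &b->c_key, sizeof(a->c_key));
	}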
@@ -73,8 +78,8 @@
 /* Checksum this amount of the request */
 #define RC_CSUMLEN		(256U)
 
-int	nfsd_reply_cache_init(void);
-void	nfsd_reply_cache_shutdown(void);
+int	nfsd_reply_cache_init(struct nfsd_net *);
+void	nfsd_reply_cache_shutdown(struct nfsd_net *);
 int	nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
 int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a1143f7..15422c9 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -22,6 +22,7 @@
 #include "nfsfh.h"
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -46,7 +47,7 @@
 	    !test_bit(CACHE_NEGATIVE, &key->h.flags))
 		path_put(&key->ek_path);
 	auth_domain_put(key->ek_client);
-	kfree(key);
+	kfree_rcu(key, ek_rcu);
 }
 
 static void expkey_request(struct cache_detail *cd,
@@ -232,6 +233,17 @@
 		return NULL;
 }
 
+static void expkey_flush(void)
+{
+	/*
+	 * Take the nfsd_mutex here to ensure that the file cache is not
+	 * destroyed while we're in the middle of flushing.
+	 */
+	mutex_lock(&nfsd_mutex);
+	nfsd_file_cache_purge(current->nsproxy->net_ns);
+	mutex_unlock(&nfsd_mutex);
+}
+
 static const struct cache_detail svc_expkey_cache_template = {
 	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
@@ -244,6 +256,7 @@
 	.init		= expkey_init,
 	.update       	= expkey_update,
 	.alloc		= expkey_alloc,
+	.flush		= expkey_flush,
 };
 
 static int
@@ -265,7 +278,7 @@
 	struct cache_head *ch;
 	int hash = svc_expkey_hash(item);
 
-	ch = sunrpc_cache_lookup(cd, &item->h, hash);
+	ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
 	if (ch)
 		return container_of(ch, struct svc_expkey, h);
 	else
@@ -314,7 +327,7 @@
 	auth_domain_put(exp->ex_client);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp->ex_uuid);
-	kfree(exp);
+	kfree_rcu(exp, ex_rcu);
 }
 
 static void svc_export_request(struct cache_detail *cd,
@@ -570,13 +583,13 @@
 		err = get_int(&mesg, &an_int);
 		if (err)
 			goto out3;
-		exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
+		exp.ex_anon_uid= make_kuid(current_user_ns(), an_int);
 
 		/* anon gid */
 		err = get_int(&mesg, &an_int);
 		if (err)
 			goto out3;
-		exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
+		exp.ex_anon_gid= make_kgid(current_user_ns(), an_int);
 
 		/* fsid */
 		err = get_int(&mesg, &an_int);
@@ -780,7 +793,7 @@
 	struct cache_head *ch;
 	int hash = svc_export_hash(exp);
 
-	ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+	ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash);
 	if (ch)
 		return container_of(ch, struct svc_export, h);
 	else
@@ -1170,15 +1183,17 @@
 static void exp_flags(struct seq_file *m, int flag, int fsid,
 		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)
 {
+	struct user_namespace *userns = m->file->f_cred->user_ns;
+
 	show_expflags(m, flag, NFSEXP_ALLFLAGS);
 	if (flag & NFSEXP_FSID)
 		seq_printf(m, ",fsid=%d", fsid);
-	if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) &&
-	    !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2)))
-		seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu));
-	if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) &&
-	    !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2)))
-		seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong));
+	if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) &&
+	    !uid_eq(anonu, make_kuid(userns, 0x10000-2)))
+		seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu));
+	if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) &&
+	    !gid_eq(anong, make_kgid(userns, 0x10000-2)))
+		seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong));
 	if (fsloc && fsloc->locations_count > 0) {
 		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
 		int i;
@@ -1216,9 +1231,9 @@
 }
 
 const struct seq_operations nfs_exports_op = {
-	.start	= cache_seq_start,
-	.next	= cache_seq_next,
-	.stop	= cache_seq_stop,
+	.start	= cache_seq_start_rcu,
+	.next	= cache_seq_next_rcu,
+	.stop	= cache_seq_stop_rcu,
 	.show	= e_show,
 };
 
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index c8b7412..e7daa1f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -61,6 +61,7 @@
 	u32			ex_layout_types;
 	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
+	struct rcu_head		ex_rcu;
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
@@ -75,6 +76,7 @@
 	u32			ek_fsid[6];
 
 	struct path		ek_path;
+	struct rcu_head		ek_rcu;
 };
 
 #define EX_ISSYNC(exp)		(!((exp)->ex_flags & NFSEXP_ASYNC))
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 8483125..76bee0a 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -127,24 +127,16 @@
 	},
 };
 
-int nfsd_fault_inject_init(void)
+void nfsd_fault_inject_init(void)
 {
 	unsigned int i;
 	struct nfsd_fault_inject_op *op;
 	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 
 	debug_dir = debugfs_create_dir("nfsd", NULL);
-	if (!debug_dir)
-		goto fail;
 
 	for (i = 0; i < ARRAY_SIZE(inject_ops); i++) {
 		op = &inject_ops[i];
-		if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
-			goto fail;
+		debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd);
 	}
-	return 0;
-
-fail:
-	nfsd_fault_inject_cleanup();
-	return -ENOMEM;
 }
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
new file mode 100644
index 0000000..ef55e9b
--- /dev/null
+++ b/fs/nfsd/filecache.c
@@ -0,0 +1,934 @@
+/*
+ * Open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ */
+
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/list_lru.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+
+#include "vfs.h"
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_FH
+
+/* FIXME: dynamically size this for the machine somehow? */
+#define NFSD_FILE_HASH_BITS                   12
+#define NFSD_FILE_HASH_SIZE                  (1 << NFSD_FILE_HASH_BITS)
+#define NFSD_LAUNDRETTE_DELAY		     (2 * HZ)
+
+#define NFSD_FILE_LRU_RESCAN		     (0)
+#define NFSD_FILE_SHUTDOWN		     (1)
+#define NFSD_FILE_LRU_THRESHOLD		     (4096UL)
+#define NFSD_FILE_LRU_LIMIT		     (NFSD_FILE_LRU_THRESHOLD << 2)
+
+/* We only care about NFSD_MAY_READ/WRITE for this cache */
+#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+struct nfsd_fcache_bucket {
+	struct hlist_head	nfb_head;
+	spinlock_t		nfb_lock;
+	unsigned int		nfb_count;
+	unsigned int		nfb_maxcount;
+};
+
+static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+
+static struct kmem_cache		*nfsd_file_slab;
+static struct kmem_cache		*nfsd_file_mark_slab;
+static struct nfsd_fcache_bucket	*nfsd_file_hashtbl;
+static struct list_lru			nfsd_file_lru;
+static long				nfsd_file_lru_flags;
+static struct fsnotify_group		*nfsd_file_fsnotify_group;
+static atomic_long_t			nfsd_filecache_count;
+static struct delayed_work		nfsd_filecache_laundrette;
+
+enum nfsd_file_laundrette_ctl {
+	NFSD_FILE_LAUNDRETTE_NOFLUSH = 0,
+	NFSD_FILE_LAUNDRETTE_MAY_FLUSH
+};
+
+static void
+nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl)
+{
+	long count = atomic_long_read(&nfsd_filecache_count);
+
+	if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+		return;
+
+	/* Be more aggressive about scanning if over the threshold */
+	if (count > NFSD_FILE_LRU_THRESHOLD)
+		mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0);
+	else
+		schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY);
+
+	if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH)
+		return;
+
+	/* ...and don't delay flushing if we're out of control */
+	if (count >= NFSD_FILE_LRU_LIMIT)
+		flush_delayed_work(&nfsd_filecache_laundrette);
+}
+
+static void
+nfsd_file_slab_free(struct rcu_head *rcu)
+{
+	struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu);
+
+	put_cred(nf->nf_cred);
+	kmem_cache_free(nfsd_file_slab, nf);
+}
+
+static void
+nfsd_file_mark_free(struct fsnotify_mark *mark)
+{
+	struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark,
+						  nfm_mark);
+
+	kmem_cache_free(nfsd_file_mark_slab, nfm);
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_get(struct nfsd_file_mark *nfm)
+{
+	if (!atomic_inc_not_zero(&nfm->nfm_ref))
+		return NULL;
+	return nfm;
+}
+
+static void
+nfsd_file_mark_put(struct nfsd_file_mark *nfm)
+{
+	if (atomic_dec_and_test(&nfm->nfm_ref)) {
+
+		fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group);
+		fsnotify_put_mark(&nfm->nfm_mark);
+	}
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+{
+	int			err;
+	struct fsnotify_mark	*mark;
+	struct nfsd_file_mark	*nfm = NULL, *new;
+	struct inode *inode = nf->nf_inode;
+
+	do {
+		mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+		mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
+				nfsd_file_fsnotify_group);
+		if (mark) {
+			nfm = nfsd_file_mark_get(container_of(mark,
+						 struct nfsd_file_mark,
+						 nfm_mark));
+			mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+			fsnotify_put_mark(mark);
+			if (likely(nfm))
+				break;
+		} else
+			mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+
+		/* allocate a new nfm */
+		new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
+		if (!new)
+			return NULL;
+		fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group);
+		new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF;
+		atomic_set(&new->nfm_ref, 1);
+
+		err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0);
+
+		/*
+		 * If the add was successful, then return the object.
+		 * Otherwise, we need to put the reference we hold on the
+		 * nfm_mark. The fsnotify code will take a reference and put
+		 * it on failure, so we can't just free it directly. It's also
+		 * not safe to call fsnotify_destroy_mark on it as the
+		 * mark->group will be NULL. Thus, we can't let the nfm_ref
+		 * counter drive the destruction at this point.
+		 */
+		if (likely(!err))
+			nfm = new;
+		else
+			fsnotify_put_mark(&new->nfm_mark);
+	} while (unlikely(err == -EEXIST));
+
+	return nfm;
+}
+
+static struct nfsd_file *
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+		struct net *net)
+{
+	struct nfsd_file *nf;
+
+	nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
+	if (nf) {
+		INIT_HLIST_NODE(&nf->nf_node);
+		INIT_LIST_HEAD(&nf->nf_lru);
+		nf->nf_file = NULL;
+		nf->nf_cred = get_current_cred();
+		nf->nf_net = net;
+		nf->nf_flags = 0;
+		nf->nf_inode = inode;
+		nf->nf_hashval = hashval;
+		atomic_set(&nf->nf_ref, 1);
+		nf->nf_may = may & NFSD_FILE_MAY_MASK;
+		if (may & NFSD_MAY_NOT_BREAK_LEASE) {
+			if (may & NFSD_MAY_WRITE)
+				__set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
+			if (may & NFSD_MAY_READ)
+				__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+		}
+		nf->nf_mark = NULL;
+		trace_nfsd_file_alloc(nf);
+	}
+	return nf;
+}
+
+static bool
+nfsd_file_free(struct nfsd_file *nf)
+{
+	bool flush = false;
+
+	trace_nfsd_file_put_final(nf);
+	if (nf->nf_mark)
+		nfsd_file_mark_put(nf->nf_mark);
+	if (nf->nf_file) {
+		get_file(nf->nf_file);
+		filp_close(nf->nf_file, NULL);
+		fput(nf->nf_file);
+		flush = true;
+	}
+	call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+	return flush;
+}
+
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
+{
+	struct file *file = nf->nf_file;
+	struct address_space *mapping;
+
+	if (!file || !(file->f_mode & FMODE_WRITE))
+		return false;
+	mapping = file->f_mapping;
+	return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+		mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+}
+
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+	struct file *file = nf->nf_file;
+
+	if (!file || !(file->f_mode & FMODE_WRITE))
+		return 0;
+	return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+}
+
+static bool
+nfsd_file_in_use(struct nfsd_file *nf)
+{
+	return nfsd_file_check_writeback(nf) ||
+			nfsd_file_check_write_error(nf);
+}
+
+static void
+nfsd_file_do_unhash(struct nfsd_file *nf)
+{
+	lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+	trace_nfsd_file_unhash(nf);
+
+	if (nfsd_file_check_write_error(nf))
+		nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+	--nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
+	hlist_del_rcu(&nf->nf_node);
+	if (!list_empty(&nf->nf_lru))
+		list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+	atomic_long_dec(&nfsd_filecache_count);
+}
+
+static bool
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+	if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		nfsd_file_do_unhash(nf);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Return true if the file was unhashed.
+ */
+static bool
+nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+{
+	lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+	trace_nfsd_file_unhash_and_release_locked(nf);
+	if (!nfsd_file_unhash(nf))
+		return false;
+	/* keep final reference for nfsd_file_lru_dispose */
+	if (atomic_add_unless(&nf->nf_ref, -1, 1))
+		return true;
+
+	list_add(&nf->nf_lru, dispose);
+	return true;
+}
+
+static int
+nfsd_file_put_noref(struct nfsd_file *nf)
+{
+	int count;
+	trace_nfsd_file_put(nf);
+
+	count = atomic_dec_return(&nf->nf_ref);
+	if (!count) {
+		WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+		nfsd_file_free(nf);
+	}
+	return count;
+}
+
+void
+nfsd_file_put(struct nfsd_file *nf)
+{
+	bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+	bool unused = !nfsd_file_in_use(nf);
+
+	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+	if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused)
+		nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH);
+}
+
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
+{
+	if (likely(atomic_inc_not_zero(&nf->nf_ref)))
+		return nf;
+	return NULL;
+}
+
+static void
+nfsd_file_dispose_list(struct list_head *dispose)
+{
+	struct nfsd_file *nf;
+
+	while (!list_empty(dispose)) {
+		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+		list_del(&nf->nf_lru);
+		nfsd_file_put_noref(nf);
+	}
+}
+
+static void
+nfsd_file_dispose_list_sync(struct list_head *dispose)
+{
+	bool flush = false;
+	struct nfsd_file *nf;
+
+	while (!list_empty(dispose)) {
+		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+		list_del(&nf->nf_lru);
+		if (!atomic_dec_and_test(&nf->nf_ref))
+			continue;
+		if (nfsd_file_free(nf))
+			flush = true;
+	}
+	if (flush)
+		flush_delayed_fput();
+}
+
+/*
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static enum lru_status
+nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+		 spinlock_t *lock, void *arg)
+	__releases(lock)
+	__acquires(lock)
+{
+	struct list_head *head = arg;
+	struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+	/*
+	 * Do a lockless refcount check. The hashtable holds one reference, so
+	 * we look to see if anything else has a reference, or if any have
+	 * been put since the shrinker last ran. Those don't get unhashed and
+	 * released.
+	 *
+	 * Note that in the put path, we set the flag and then decrement the
+	 * counter. Here we check the counter and then test and clear the flag.
+	 * That order is deliberate to ensure that we can do this locklessly.
+	 */
+	if (atomic_read(&nf->nf_ref) > 1)
+		goto out_skip;
+
+	/*
+	 * Don't throw out files that are still undergoing I/O or
+	 * that have uncleared errors pending.
+	 */
+	if (nfsd_file_check_writeback(nf))
+		goto out_skip;
+
+	if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
+		goto out_rescan;
+
+	if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+		goto out_skip;
+
+	list_lru_isolate_move(lru, &nf->nf_lru, head);
+	return LRU_REMOVED;
+out_rescan:
+	set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags);
+out_skip:
+	return LRU_SKIP;
+}
+
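/*
 * Editorial note, not part of the patch: the ordering in
 * nfsd_file_lru_cb() above is what makes the lockless check safe
 * against a racing nfsd_file_put(). The put path sets
 * NFSD_FILE_REFERENCED before dropping its reference; the scanner
 * reads nf_ref before test-and-clearing the flag. If a put races the
 * scan, either the scanner still observes nf_ref > 1 and skips the
 * entry, or the decrement has already happened, in which case
 * REFERENCED was necessarily set first and the test_and_clear routes
 * the entry to out_rescan instead of evicting it. Only entries with
 * no extra reference and no recent put reach list_lru_isolate_move().
 */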
+static void
+nfsd_file_lru_dispose(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nfsd_file *nf = list_first_entry(head,
+				struct nfsd_file, nf_lru);
+		list_del_init(&nf->nf_lru);
+		spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+		nfsd_file_do_unhash(nf);
+		spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+		nfsd_file_put_noref(nf);
+	}
+}
+
+static unsigned long
+nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
+{
+	return list_lru_count(&nfsd_file_lru);
+}
+
+static unsigned long
+nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
+{
+	LIST_HEAD(head);
+	unsigned long ret;
+
+	ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head);
+	nfsd_file_lru_dispose(&head);
+	return ret;
+}
+
+static struct shrinker	nfsd_file_shrinker = {
+	.scan_objects = nfsd_file_lru_scan,
+	.count_objects = nfsd_file_lru_count,
+	.seeks = 1,
+};
+
+static void
+__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
+			struct list_head *dispose)
+{
+	struct nfsd_file	*nf;
+	struct hlist_node	*tmp;
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+		if (inode == nf->nf_inode)
+			nfsd_file_unhash_and_release_locked(nf, dispose);
+	}
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+}
+
+/**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them and
+ * destroy any that had their last reference put. Also ensure that any of the
+ * fputs also have their final __fput done as well.
+ */
+void
+nfsd_file_close_inode_sync(struct inode *inode)
+{
+	unsigned int		hashval = (unsigned int)hash_long(inode->i_ino,
+						NFSD_FILE_HASH_BITS);
+	LIST_HEAD(dispose);
+
+	__nfsd_file_close_inode(inode, hashval, &dispose);
+	trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+	nfsd_file_dispose_list_sync(&dispose);
+}
+
+/**
+ * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them and
+ * destroy any that had their last reference put.
+ */
+static void
+nfsd_file_close_inode(struct inode *inode)
+{
+	unsigned int		hashval = (unsigned int)hash_long(inode->i_ino,
+						NFSD_FILE_HASH_BITS);
+	LIST_HEAD(dispose);
+
+	__nfsd_file_close_inode(inode, hashval, &dispose);
+	trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+	nfsd_file_dispose_list(&dispose);
+}
+
+/**
+ * nfsd_file_delayed_close - close unused nfsd_files
+ * @work: dummy
+ *
+ * Walk the LRU list and close any entries that have not been used since
+ * the last scan.
+ *
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static void
+nfsd_file_delayed_close(struct work_struct *work)
+{
+	LIST_HEAD(head);
+
+	list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX);
+
+	if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags))
+		nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH);
+
+	if (!list_empty(&head)) {
+		nfsd_file_lru_dispose(&head);
+		flush_delayed_fput();
+	}
+}
+
+static int
+nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
+			    void *data)
+{
+	struct file_lock *fl = data;
+
+	/* Only close files for F_SETLEASE leases */
+	if (fl->fl_flags & FL_LEASE)
+		nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+	return 0;
+}
+
+static struct notifier_block nfsd_file_lease_notifier = {
+	.notifier_call = nfsd_file_lease_notifier_call,
+};
+
+static int
+nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				u32 mask, const void *data, int data_type,
+				const struct qstr *file_name, u32 cookie,
+				struct fsnotify_iter_info *iter_info)
+{
+	trace_nfsd_file_fsnotify_handle_event(inode, mask);
+
+	/* Should be no marks on non-regular files */
+	if (!S_ISREG(inode->i_mode)) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	/* don't close files if this was not the last link */
+	if (mask & FS_ATTRIB) {
+		if (inode->i_nlink)
+			return 0;
+	}
+
+	nfsd_file_close_inode(inode);
+	return 0;
+}
+
+
+static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
+	.handle_event = nfsd_file_fsnotify_handle_event,
+	.free_mark = nfsd_file_mark_free,
+};
+
+int
+nfsd_file_cache_init(void)
+{
+	int		ret = -ENOMEM;
+	unsigned int	i;
+
+	clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+	if (nfsd_file_hashtbl)
+		return 0;
+
+	nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+				sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+	if (!nfsd_file_hashtbl) {
+		pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
+		goto out_err;
+	}
+
+	nfsd_file_slab = kmem_cache_create("nfsd_file",
+				sizeof(struct nfsd_file), 0, 0, NULL);
+	if (!nfsd_file_slab) {
+		pr_err("nfsd: unable to create nfsd_file_slab\n");
+		goto out_err;
+	}
+
+	nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
+					sizeof(struct nfsd_file_mark), 0, 0, NULL);
+	if (!nfsd_file_mark_slab) {
+		pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
+		goto out_err;
+	}
+
+
+	ret = list_lru_init(&nfsd_file_lru);
+	if (ret) {
+		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
+		goto out_err;
+	}
+
+	ret = register_shrinker(&nfsd_file_shrinker);
+	if (ret) {
+		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+		goto out_lru;
+	}
+
+	ret = lease_register_notifier(&nfsd_file_lease_notifier);
+	if (ret) {
+		pr_err("nfsd: unable to register lease notifier: %d\n", ret);
+		goto out_shrinker;
+	}
+
+	nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+	if (IS_ERR(nfsd_file_fsnotify_group)) {
+		pr_err("nfsd: unable to create fsnotify group: %ld\n",
+			PTR_ERR(nfsd_file_fsnotify_group));
+		nfsd_file_fsnotify_group = NULL;
+		goto out_notifier;
+	}
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+		spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+	}
+
+	INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close);
+out:
+	return ret;
+out_notifier:
+	lease_unregister_notifier(&nfsd_file_lease_notifier);
+out_shrinker:
+	unregister_shrinker(&nfsd_file_shrinker);
+out_lru:
+	list_lru_destroy(&nfsd_file_lru);
+out_err:
+	kmem_cache_destroy(nfsd_file_slab);
+	nfsd_file_slab = NULL;
+	kmem_cache_destroy(nfsd_file_mark_slab);
+	nfsd_file_mark_slab = NULL;
+	kfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+	goto out;
+}
+
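/*
 * Editorial note, not part of the patch: the single out_err label can
 * tear down every resource unconditionally only because the cleanup
 * primitives tolerate objects that were never created:
 * kmem_cache_destroy() and kfree() are both no-ops when passed NULL,
 * so whichever allocation failed first, the later ones are "freed"
 * harmlessly.
 */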
+/*
+ * Note this can deadlock with nfsd_file_lru_cb.
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+	unsigned int		i;
+	struct nfsd_file	*nf;
+	struct hlist_node	*next;
+	LIST_HEAD(dispose);
+	bool del;
+
+	if (!nfsd_file_hashtbl)
+		return;
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+		spin_lock(&nfb->nfb_lock);
+		hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+			if (net && nf->nf_net != net)
+				continue;
+			del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+
+			/*
+			 * Deadlock detected! Something marked this entry as
+			 * unhashed, but hasn't removed it from the hash list.
+			 */
+			WARN_ON_ONCE(!del);
+		}
+		spin_unlock(&nfb->nfb_lock);
+		nfsd_file_dispose_list(&dispose);
+	}
+}
+
+void
+nfsd_file_cache_shutdown(void)
+{
+	LIST_HEAD(dispose);
+
+	set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+	lease_unregister_notifier(&nfsd_file_lease_notifier);
+	unregister_shrinker(&nfsd_file_shrinker);
+	/*
+	 * make sure all callers of nfsd_file_lru_cb are done before
+	 * calling nfsd_file_cache_purge
+	 */
+	cancel_delayed_work_sync(&nfsd_filecache_laundrette);
+	nfsd_file_cache_purge(NULL);
+	list_lru_destroy(&nfsd_file_lru);
+	rcu_barrier();
+	fsnotify_put_group(nfsd_file_fsnotify_group);
+	nfsd_file_fsnotify_group = NULL;
+	kmem_cache_destroy(nfsd_file_slab);
+	nfsd_file_slab = NULL;
+	fsnotify_wait_marks_destroyed();
+	kmem_cache_destroy(nfsd_file_mark_slab);
+	nfsd_file_mark_slab = NULL;
+	kfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+}
+
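+/*
+ * Do two credentials grant the same file access? A cached open is only
+ * reused when fsuid, fsgid and the supplementary group list all match,
+ * since those are what governed the permission checks at open time.
+ */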
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+	int i;
+
+	if (!uid_eq(c1->fsuid, c2->fsuid))
+		return false;
+	if (!gid_eq(c1->fsgid, c2->fsgid))
+		return false;
+	if (c1->group_info == NULL || c2->group_info == NULL)
+		return c1->group_info == c2->group_info;
+	if (c1->group_info->ngroups != c2->group_info->ngroups)
+		return false;
+	for (i = 0; i < c1->group_info->ngroups; i++) {
+		if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+			return false;
+	}
+	return true;
+}
+
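+/*
+ * Search one hash chain for an entry that matches the inode, net and
+ * credentials, and whose open mode covers all of the requested may_flags.
+ * The caller must hold either the RCU read lock or the bucket lock.
+ */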
+static struct nfsd_file *
+nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
+			unsigned int hashval, struct net *net)
+{
+	struct nfsd_file *nf;
+	unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+				 nf_node) {
+		if ((need & nf->nf_may) != need)
+			continue;
+		if (nf->nf_inode != inode)
+			continue;
+		if (nf->nf_net != net)
+			continue;
+		if (!nfsd_match_cred(nf->nf_cred, current_cred()))
+			continue;
+		if (nfsd_file_get(nf) != NULL)
+			return nf;
+	}
+	return NULL;
+}
+
+/**
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode of the file to check
+ *
+ * Scan the hashtable for open files that match this inode. Returns true if
+ * there are any, and false if not.
+ */
+bool
+nfsd_file_is_cached(struct inode *inode)
+{
+	bool			ret = false;
+	struct nfsd_file	*nf;
+	unsigned int		hashval;
+
+	hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+				 nf_node) {
+		if (inode == nf->nf_inode) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+	return ret;
+}
+
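+/*
+ * Find or create a cached open file for this filehandle. A new entry is
+ * hashed with NFSD_FILE_PENDING set before the actual open is attempted;
+ * other callers that find it wait for that bit to clear, and retry the
+ * lookup if the entry was unhashed because construction failed.
+ */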
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **pnf)
+{
+	__be32	status;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_file *nf, *new;
+	struct inode *inode;
+	unsigned int hashval;
+
+	/* FIXME: skip this if fh_dentry is already set? */
+	status = fh_verify(rqstp, fhp, S_IFREG,
+				may_flags|NFSD_MAY_OWNER_OVERRIDE);
+	if (status != nfs_ok)
+		return status;
+
+	inode = d_inode(fhp->fh_dentry);
+	hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+retry:
+	rcu_read_lock();
+	nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+	rcu_read_unlock();
+	if (nf)
+		goto wait_for_construction;
+
+	new = nfsd_file_alloc(inode, may_flags, hashval, net);
+	if (!new) {
+		trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
+					NULL, nfserr_jukebox);
+		return nfserr_jukebox;
+	}
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+	if (nf == NULL)
+		goto open_file;
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	nfsd_file_slab_free(&new->nf_rcu);
+
+wait_for_construction:
+	wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+	/* Did construction of this file fail? */
+	if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		nfsd_file_put_noref(nf);
+		goto retry;
+	}
+
+	this_cpu_inc(nfsd_file_cache_hits);
+
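+	/*
+	 * The NFSD_FILE_BREAK_* flags indicate that a lease break is
+	 * pending on this file. Unless the caller asked us not to, try
+	 * to break the lease now and clear the flags on success.
+	 */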
+	if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
+		bool write = (may_flags & NFSD_MAY_WRITE);
+
+		if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
+		    (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
+			status = nfserrno(nfsd_open_break_lease(
+					file_inode(nf->nf_file), may_flags));
+			if (status == nfs_ok) {
+				clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+				if (write)
+					clear_bit(NFSD_FILE_BREAK_WRITE,
+						  &nf->nf_flags);
+			}
+		}
+	}
+out:
+	if (status == nfs_ok) {
+		*pnf = nf;
+	} else {
+		nfsd_file_put(nf);
+		nf = NULL;
+	}
+
+	trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+	return status;
+open_file:
+	nf = new;
+	/* Take reference for the hashtable */
+	atomic_inc(&nf->nf_ref);
+	__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+	__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+	list_lru_add(&nfsd_file_lru, &nf->nf_lru);
+	hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
+	++nfsd_file_hashtbl[hashval].nfb_count;
+	nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
+			nfsd_file_hashtbl[hashval].nfb_count);
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	atomic_long_inc(&nfsd_filecache_count);
+
+	nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+	if (nf->nf_mark)
+		status = nfsd_open_verified(rqstp, fhp, S_IFREG,
+				may_flags, &nf->nf_file);
+	else
+		status = nfserr_jukebox;
+	/*
+	 * If construction failed, or we raced with a call to unlink()
+	 * then unhash.
+	 */
+	if (status != nfs_ok || inode->i_nlink == 0) {
+		bool do_free;
+		spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+		do_free = nfsd_file_unhash(nf);
+		spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+		if (do_free)
+			nfsd_file_put_noref(nf);
+	}
+	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+	goto out;
+}
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+{
+	unsigned int i, count = 0, longest = 0;
+	unsigned long hits = 0;
+
+	/*
+	 * No need for spinlocks here since we're not terribly interested in
+	 * accuracy. We do take the nfsd_mutex simply to ensure that we
+	 * don't end up racing with server shutdown.
+	 */
+	mutex_lock(&nfsd_mutex);
+	if (nfsd_file_hashtbl) {
+		for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+			count += nfsd_file_hashtbl[i].nfb_count;
+			longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
+		}
+	}
+	mutex_unlock(&nfsd_mutex);
+
+	for_each_possible_cpu(i)
+		hits += per_cpu(nfsd_file_cache_hits, i);
+
+	seq_printf(m, "total entries: %u\n", count);
+	seq_printf(m, "longest chain: %u\n", longest);
+	seq_printf(m, "cache hits:    %lu\n", hits);
+	return 0;
+}
+
+int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_file_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
new file mode 100644
index 0000000..851d9ab
--- /dev/null
+++ b/fs/nfsd/filecache.h
@@ -0,0 +1,61 @@
+#ifndef _FS_NFSD_FILECACHE_H
+#define _FS_NFSD_FILECACHE_H
+
+#include <linux/fsnotify_backend.h>
+
+/*
+ * This is the fsnotify_mark container that nfsd attaches to the files that it
+ * is holding open. Note that we have a separate refcount here aside from the
+ * one in the fsnotify_mark. We only want a single fsnotify_mark attached to
+ * the inode, and for each nfsd_file to hold a reference to it.
+ *
+ * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us
+ * how to put that reference. If there are still outstanding nfsd_files that
+ * reference the mark, then we would want to call fsnotify_put_mark on it.
+ * If there were not, then we'd need to call fsnotify_destroy_mark. Since we
+ * can't really tell the difference, we use nfm_ref to keep track of how
+ * many nfsd_files hold references to the mark. When that counter goes to zero
+ * then we know to call fsnotify_destroy_mark on it.
+ */
+struct nfsd_file_mark {
+	struct fsnotify_mark	nfm_mark;
+	atomic_t		nfm_ref;
+};
+
+/*
+ * A representation of a file that has been opened by knfsd. These are hashed
+ * in the hashtable by inode pointer value. Note that this object doesn't
+ * hold a reference to the inode by itself, so the nf_inode pointer should
+ * never be dereferenced, only used for comparison.
+ */
+struct nfsd_file {
+	struct hlist_node	nf_node;
+	struct list_head	nf_lru;
+	struct rcu_head		nf_rcu;
+	struct file		*nf_file;
+	const struct cred	*nf_cred;
+	struct net		*nf_net;
+#define NFSD_FILE_HASHED	(0)
+#define NFSD_FILE_PENDING	(1)
+#define NFSD_FILE_BREAK_READ	(2)
+#define NFSD_FILE_BREAK_WRITE	(3)
+#define NFSD_FILE_REFERENCED	(4)
+	unsigned long		nf_flags;
+	struct inode		*nf_inode;
+	unsigned int		nf_hashval;
+	atomic_t		nf_ref;
+	unsigned char		nf_may;
+	struct nfsd_file_mark	*nf_mark;
+};
+
+int nfsd_file_cache_init(void);
+void nfsd_file_cache_purge(struct net *);
+void nfsd_file_cache_shutdown(void);
+void nfsd_file_put(struct nfsd_file *nf);
+struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+void nfsd_file_close_inode_sync(struct inode *inode);
+bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **nfp);
+int	nfsd_file_cache_stats_open(struct inode *, struct file *);
+#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 426f550..9a4ef81 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -1,21 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * per net namespace data structures for nfsd
  *
  * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
 #ifndef __NFSD_NETNS_H__
@@ -55,6 +42,11 @@
 	bool grace_ended;
 	time_t boot_time;
 
+	/* internal mount of the "nfsd" pseudofilesystem: */
+	struct vfsmount *nfsd_mnt;
+
+	struct dentry *nfsd_client_dir;
+
 	/*
 	 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
 	 * used in reboot/reset lease grace period processing
@@ -104,11 +96,15 @@
 	time_t nfsd4_grace;
 	bool somebody_reclaimed;
 
+	bool track_reclaim_completes;
+	atomic_t nr_reclaim_complete;
+
 	bool nfsd_net_up;
 	bool lockd_up;
 
 	/* Time of server startup */
 	struct timespec64 nfssvc_boot;
+	seqlock_t boot_lock;
 
 	/*
 	 * Max number of connections this nfsd container will allow. Defaults
@@ -116,6 +112,7 @@
 	 */
 	unsigned int max_connections;
 
+	u32 clientid_base;
 	u32 clientid_counter;
 	u32 clverifier_counter;
 
@@ -123,10 +120,67 @@
 
 	wait_queue_head_t ntf_wq;
 	atomic_t ntf_refcnt;
+
+	/*
+	 * clientid and stateid data for construction of net unique COPY
+	 * stateids.
+	 */
+	u32		s2s_cp_cl_id;
+	struct idr	s2s_cp_stateids;
+	spinlock_t	s2s_cp_lock;
+
+	/*
+	 * Version information
+	 */
+	bool *nfsd_versions;
+	bool *nfsd4_minorversions;
+
+	/*
+	 * Duplicate reply cache
+	 */
+	struct nfsd_drc_bucket   *drc_hashtbl;
+	struct kmem_cache        *drc_slab;
+
+	/* max number of entries allowed in the cache */
+	unsigned int             max_drc_entries;
+
+	/* number of significant bits in the hash value */
+	unsigned int             maskbits;
+	unsigned int             drc_hashsize;
+
+	/*
+	 * Stats and other tracking of the duplicate reply cache.
+	 * These fields and the "rc" fields in nfsdstats are modified
+	 * with only the per-bucket cache lock, which isn't really safe
+	 * and should be fixed if we want the statistics to be
+	 * completely accurate.
+	 */
+
+	/* total number of entries */
+	atomic_t                 num_drc_entries;
+
+	/* cache misses due only to checksum comparison failures */
+	unsigned int             payload_misses;
+
+	/* amount of memory (in bytes) currently consumed by the DRC */
+	unsigned int             drc_mem_usage;
+
+	/* longest hash chain seen */
+	unsigned int             longest_chain;
+
+	/* size of cache when we saw the longest hash chain */
+	unsigned int             longest_chain_cachesize;
+
+	struct shrinker		nfsd_reply_cache_shrinker;
 };
 
 /* Simple check to find out if a given net was properly initialized */
 #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
 
+extern void nfsd_netns_free_versions(struct nfsd_net *nn);
+
 extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
 #endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086..cea68d8 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -172,13 +172,8 @@
 	nfserr = nfsd_read(rqstp, &resp->fh,
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
-				  &resp->count);
-	if (nfserr == 0) {
-		struct inode	*inode = d_inode(resp->fh.fh_dentry);
-		resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
-							inode->i_size);
-	}
-
+				  &resp->count,
+				  &resp->eof);
 	RETURN_STATUS(nfserr);
 }
 
@@ -442,7 +437,9 @@
 	struct nfsd3_readdirargs *argp = rqstp->rq_argp;
 	struct nfsd3_readdirres  *resp = rqstp->rq_resp;
 	__be32		nfserr;
-	int		count;
+	int		count = 0;
+	struct page	**p;
+	caddr_t		page_addr = NULL;
 
 	dprintk("nfsd: READDIR(3)  %s %d bytes at %d\n",
 				SVCFH_fmt(&argp->fh),
@@ -462,9 +459,31 @@
 	nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, 
 					&resp->common, nfs3svc_encode_entry);
 	memcpy(resp->verf, argp->verf, 8);
-	resp->count = resp->buffer - argp->buffer;
-	if (resp->offset)
-		xdr_encode_hyper(resp->offset, argp->cookie);
+	count = 0;
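+	/*
+	 * The encoded entries may span several response pages; walk them
+	 * to find how many bytes were consumed. resp->count is in 32-bit
+	 * words, hence the shift below.
+	 */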
+	for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+		page_addr = page_address(*p);
+
+		if (((caddr_t)resp->buffer >= page_addr) &&
+		    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+			count += (caddr_t)resp->buffer - page_addr;
+			break;
+		}
+		count += PAGE_SIZE;
+	}
+	resp->count = count >> 2;
+	if (resp->offset) {
+		loff_t offset = argp->cookie;
+
+		if (unlikely(resp->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*resp->offset = htonl(offset >> 32);
+			*resp->offset1 = htonl(offset & 0xffffffff);
+			resp->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(resp->offset, offset);
+		}
+		resp->offset = NULL;
+	}
 
 	RETURN_STATUS(nfserr);
 }
@@ -533,6 +552,7 @@
 		} else {
 			xdr_encode_hyper(resp->offset, offset);
 		}
+		resp->offset = NULL;
 	}
 
 	RETURN_STATUS(nfserr);
@@ -576,7 +596,7 @@
 	resp->f_wtmax  = max_blocksize;
 	resp->f_wtpref = max_blocksize;
 	resp->f_wtmult = PAGE_SIZE;
-	resp->f_dtpref = PAGE_SIZE;
+	resp->f_dtpref = max_blocksize;
 	resp->f_maxfilesize = ~(u32) 0;
 	resp->f_properties = NFS3_FSF_DEFAULT;
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4..86e5658 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -27,6 +27,7 @@
 	NF3SOCK, NF3BAD,  NF3LNK, NF3BAD,
 };
 
+
 /*
  * XDR functions for basic NFS types
  */
@@ -96,7 +97,7 @@
 }
 
 static __be32 *
-decode_sattr3(__be32 *p, struct iattr *iap)
+decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns)
 {
 	u32	tmp;
 
@@ -107,12 +108,12 @@
 		iap->ia_mode = ntohl(*p++);
 	}
 	if (*p++) {
-		iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++));
+		iap->ia_uid = make_kuid(userns, ntohl(*p++));
 		if (uid_valid(iap->ia_uid))
 			iap->ia_valid |= ATTR_UID;
 	}
 	if (*p++) {
-		iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++));
+		iap->ia_gid = make_kgid(userns, ntohl(*p++));
 		if (gid_valid(iap->ia_gid))
 			iap->ia_valid |= ATTR_GID;
 	}
@@ -165,12 +166,13 @@
 encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	      struct kstat *stat)
 {
+	struct user_namespace *userns = nfsd_user_namespace(rqstp);
 	struct timespec ts;
 	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
 	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
 	*p++ = htonl((u32) stat->nlink);
-	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
-	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
+	*p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+	*p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
 	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
 		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
 	} else {
@@ -325,7 +327,7 @@
 	p = decode_fh(p, &args->fh);
 	if (!p)
 		return 0;
-	p = decode_sattr3(p, &args->attrs);
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	if ((args->check_guard = ntohl(*p++)) != 0) { 
 		struct timespec time; 
@@ -455,7 +457,7 @@
 	switch (args->createmode = ntohl(*p++)) {
 	case NFS3_CREATE_UNCHECKED:
 	case NFS3_CREATE_GUARDED:
-		p = decode_sattr3(p, &args->attrs);
+		p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
 		break;
 	case NFS3_CREATE_EXCLUSIVE:
 		args->verf = p;
@@ -476,7 +478,7 @@
 	if (!(p = decode_fh(p, &args->fh)) ||
 	    !(p = decode_filename(p, &args->name, &args->len)))
 		return 0;
-	p = decode_sattr3(p, &args->attrs);
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -491,7 +493,7 @@
 	if (!(p = decode_fh(p, &args->ffh)) ||
 	    !(p = decode_filename(p, &args->fname, &args->flen)))
 		return 0;
-	p = decode_sattr3(p, &args->attrs);
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	args->tlen = ntohl(*p++);
 
@@ -519,7 +521,7 @@
 
 	if (args->ftype == NF3BLK  || args->ftype == NF3CHR
 	 || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
-		p = decode_sattr3(p, &args->attrs);
+		p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
 		args->major = ntohl(*p++);
@@ -573,6 +575,9 @@
 nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd3_readdirargs *args = rqstp->rq_argp;
+	int len;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
 	p = decode_fh(p, &args->fh);
 	if (!p)
 		return 0;
@@ -580,8 +585,14 @@
 	args->verf   = p; p += 2;
 	args->dircount = ~0;
 	args->count  = ntohl(*p++);
-	args->count  = min_t(u32, args->count, PAGE_SIZE);
-	args->buffer = page_address(*(rqstp->rq_next_page++));
+	len = args->count  = min_t(u32, args->count, max_blocksize);
+
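+	/*
+	 * Reserve enough response pages for a maximal reply; the dirlist
+	 * buffer starts at the first of them.
+	 */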
+	while (len > 0) {
+		struct page *p = *(rqstp->rq_next_page++);
+		if (!args->buffer)
+			args->buffer = page_address(p);
+		len -= PAGE_SIZE;
+	}
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -741,14 +752,16 @@
 {
 	struct nfsd3_writeres *resp = rqstp->rq_resp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 verf[2];
 
 	p = encode_wcc_data(rqstp, p, &resp->fh);
 	if (resp->status == 0) {
 		*p++ = htonl(resp->count);
 		*p++ = htonl(resp->committed);
 		/* unique identifier, y2038 overflow can be ignored */
-		*p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
-		*p++ = htonl(nn->nfssvc_boot.tv_nsec);
+		nfsd_copy_boot_verifier(verf, nn);
+		*p++ = verf[0];
+		*p++ = verf[1];
 	}
 	return xdr_ressize_check(rqstp, p);
 }
@@ -921,6 +934,7 @@
 		} else {
 			xdr_encode_hyper(cd->offset, offset64);
 		}
+		cd->offset = NULL;
 	}
 
 	/*
@@ -1114,13 +1128,15 @@
 {
 	struct nfsd3_commitres *resp = rqstp->rq_resp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 verf[2];
 
 	p = encode_wcc_data(rqstp, p, &resp->fh);
 	/* Write verifier */
 	if (resp->status == 0) {
 		/* unique identifier, y2038 overflow can be ignored */
-		*p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
-		*p++ = htonl(nn->nfssvc_boot.tv_nsec);
+		nfsd_copy_boot_verifier(verf, nn);
+		*p++ = verf[0];
+		*p++ = verf[1];
 	}
 	return xdr_ressize_check(rqstp, p);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 601bf33..5241114 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,7 @@
 #include "state.h"
 #include "netns.h"
 #include "xdr4cb.h"
+#include "xdr4.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -59,16 +60,6 @@
 	int		status;
 };
 
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
-	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
-}
-
 static __be32 *xdr_encode_empty_array(__be32 *p)
 {
 	*p++ = xdr_zero;
@@ -105,6 +96,7 @@
 	OP_CB_WANTS_CANCELLED		= 12,
 	OP_CB_NOTIFY_LOCK		= 13,
 	OP_CB_NOTIFY_DEVICEID		= 14,
+	OP_CB_OFFLOAD			= 15,
 	OP_CB_ILLEGAL			= 10044
 };
 
@@ -238,7 +230,6 @@
 	*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
 	return 0;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 out_unexpected:
 	dprintk("NFSD: Callback server returned operation %d but "
@@ -307,7 +298,6 @@
 	hdr->nops = be32_to_cpup(p);
 	return 0;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	return -EIO;
 }
 
@@ -435,7 +425,6 @@
 	cb->cb_seq_status = status;
 	return status;
 out_overflow:
-	print_overflow_msg(__func__, xdr);
 	status = -EIO;
 	goto out;
 }
@@ -523,11 +512,9 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb != NULL) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
 
 	return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
 }
@@ -615,11 +602,10 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
 	return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
 }
 #endif /* CONFIG_NFSD_PNFS */
@@ -674,15 +660,108 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
 	return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
 }
 
 /*
+ * struct write_response4 {
+ *	stateid4	wr_callback_id<1>;
+ *	length4		wr_count;
+ *	stable_how4	wr_committed;
+ *	verifier4	wr_writeverf;
+ * };
+ * union offload_info4 switch (nfsstat4 coa_status) {
+ *	case NFS4_OK:
+ *		write_response4	coa_resok4;
+ *	default:
+ *	length4		coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ *	nfs_fh4		coa_fh;
+ *	stateid4	coa_stateid;
+ *	offload_info4	coa_offload_info;
+ * };
+ */
+static void encode_offload_info4(struct xdr_stream *xdr,
+				 __be32 nfserr,
+				 const struct nfsd4_copy *cp)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p++ = nfserr;
+	if (!nfserr) {
+		p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+		p = xdr_encode_empty_array(p);
+		p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
+		*p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
+		p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+					    NFS4_VERIFIER_SIZE);
+	} else {
+		p = xdr_reserve_space(xdr, 8);
+		/* We always return success if bytes were written */
+		p = xdr_encode_hyper(p, 0);
+	}
+}
+
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+				   __be32 nfserr,
+				   const struct knfsd_fh *fh,
+				   const struct nfsd4_copy *cp,
+				   struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	*p++ = cpu_to_be32(OP_CB_OFFLOAD);
+	encode_nfs_fh4(xdr, fh);
+	encode_stateid4(xdr, &cp->cp_res.cb_stateid);
+	encode_offload_info4(xdr, nfserr, cp);
+
+	hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const void *data)
+{
+	const struct nfsd4_callback *cb = data;
+	const struct nfsd4_copy *cp =
+		container_of(cb, struct nfsd4_copy, cp_cb);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfsd4_callback *cb = data;
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
+	return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
+}
+/*
  * RPC procedure tables
  */
 #define PROC(proc, call, argtype, restype)				\
@@ -703,6 +782,7 @@
 	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
 #endif
 	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
+	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
 };
 
 static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -746,24 +826,23 @@
 	return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
 }
 
-static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
 {
 	if (clp->cl_minorversion == 0) {
-		char *principal = clp->cl_cred.cr_targ_princ ?
-					clp->cl_cred.cr_targ_princ : "nfs";
-		struct rpc_cred *cred;
+		client->cl_principal = clp->cl_cred.cr_targ_princ ?
+			clp->cl_cred.cr_targ_princ : "nfs";
 
-		cred = rpc_lookup_machine_cred(principal);
-		if (!IS_ERR(cred))
-			get_rpccred(cred);
-		return cred;
+		return get_cred(rpc_machine_cred());
 	} else {
-		struct rpc_auth *auth = client->cl_auth;
-		struct auth_cred acred = {};
+		struct cred *kcred;
 
-		acred.uid = ses->se_cb_sec.uid;
-		acred.gid = ses->se_cb_sec.gid;
-		return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
+		kcred = prepare_kernel_cred(NULL);
+		if (!kcred)
+			return NULL;
+
+		kcred->uid = ses->se_cb_sec.uid;
+		kcred->gid = ses->se_cb_sec.gid;
+		return kcred;
 	}
 }
 
@@ -784,9 +863,10 @@
 		.program	= &cb_program,
 		.version	= 1,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
+		.cred		= current_cred(),
 	};
 	struct rpc_clnt *client;
-	struct rpc_cred *cred;
+	const struct cred *cred;
 
 	if (clp->cl_minorversion == 0) {
 		if (!clp->cl_cred.cr_principal &&
@@ -816,9 +896,9 @@
 		return PTR_ERR(client);
 	}
 	cred = get_backchannel_cred(clp, client, ses);
-	if (IS_ERR(cred)) {
+	if (!cred) {
 		rpc_shutdown_client(client);
-		return PTR_ERR(cred);
+		return -ENOMEM;
 	}
 	clp->cl_cb_client = client;
 	clp->cl_cb_cred = cred;
@@ -926,8 +1006,9 @@
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	if (minorversion) {
-		if (!nfsd41_cb_get_slot(clp, task))
+		if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task))
 			return;
+		cb->cb_holds_slot = true;
 	}
 	rpc_call_start(task);
 }
@@ -948,12 +1029,15 @@
 		 * the submission code will error out, so we don't need to
 		 * handle that case here.
 		 */
-		if (task->tk_flags & RPC_TASK_KILLED)
+		if (RPC_SIGNALLED(task))
 			goto need_restart;
 
 		return true;
 	}
 
+	if (!cb->cb_holds_slot)
+		goto need_restart;
+
 	switch (cb->cb_seq_status) {
 	case 0:
 		/*
@@ -992,12 +1076,13 @@
 			cb->cb_seq_status);
 	}
 
+	cb->cb_holds_slot = false;
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_wake_up_next(&clp->cl_cb_waitq);
 	dprintk("%s: freed slot, new seqid=%d\n", __func__,
 		clp->cl_cb_session->se_cb_seq_nr);
 
-	if (task->tk_flags & RPC_TASK_KILLED)
+	if (RPC_SIGNALLED(task))
 		goto need_restart;
 out:
 	return ret;
@@ -1033,10 +1118,11 @@
 		rpc_restart_call_prepare(task);
 		return;
 	case 1:
-		break;
-	case -1:
-		/* Network partition? */
-		nfsd4_mark_cb_down(clp, task->tk_status);
+		switch (task->tk_status) {
+		case -EIO:
+		case -ETIMEDOUT:
+			nfsd4_mark_cb_down(clp, task->tk_status);
+		}
 		break;
 	default:
 		BUG();
@@ -1116,7 +1202,7 @@
 	if (clp->cl_cb_client) {
 		rpc_shutdown_client(clp->cl_cb_client);
 		clp->cl_cb_client = NULL;
-		put_rpccred(clp->cl_cb_cred);
+		put_cred(clp->cl_cb_cred);
 		clp->cl_cb_cred = NULL;
 	}
 	if (clp->cl_cb_conn.cb_xprt) {
@@ -1199,6 +1285,7 @@
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
+	cb->cb_holds_slot = false;
 }
 
 void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a5bb765..d1f2852 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,6 +65,7 @@
 	u32               id;
 	char              name[IDMAP_NAMESZ];
 	char              authname[IDMAP_NAMESZ];
+	struct rcu_head	  rcu_head;
 };
 
 /* Common entry handling */
@@ -82,14 +83,14 @@
 	new->type = itm->type;
 
 	strlcpy(new->name, itm->name, sizeof(new->name));
-	strlcpy(new->authname, itm->authname, sizeof(new->name));
+	strlcpy(new->authname, itm->authname, sizeof(new->authname));
 }
 
 static void
 ent_put(struct kref *ref)
 {
 	struct ent *map = container_of(ref, struct ent, h.ref);
-	kfree(map);
+	kfree_rcu(map, rcu_head);
 }
 
 static struct cache_head *
@@ -264,8 +265,8 @@
 static struct ent *
 idtoname_lookup(struct cache_detail *cd, struct ent *item)
 {
-	struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
-						    idtoname_hash(item));
+	struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+							idtoname_hash(item));
 	if (ch)
 		return container_of(ch, struct ent, h);
 	else
@@ -422,8 +423,8 @@
 static struct ent *
 nametoid_lookup(struct cache_detail *cd, struct ent *item)
 {
-	struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
-						    nametoid_hash(item));
+	struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+							nametoid_hash(item));
 	if (ch)
 		return container_of(ch, struct ent, h);
 	else
@@ -633,7 +634,7 @@
 		return nfserr_inval;
 
 	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
-	*uid = make_kuid(&init_user_ns, id);
+	*uid = make_kuid(nfsd_user_namespace(rqstp), id);
 	if (!uid_valid(*uid))
 		status = nfserr_badowner;
 	return status;
@@ -650,7 +651,7 @@
 		return nfserr_inval;
 
 	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
-	*gid = make_kgid(&init_user_ns, id);
+	*gid = make_kgid(nfsd_user_namespace(rqstp), id);
 	if (!gid_valid(*gid))
 		status = nfserr_badowner;
 	return status;
@@ -659,13 +660,13 @@
 __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp,
 			 kuid_t uid)
 {
-	u32 id = from_kuid(&init_user_ns, uid);
+	u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid);
 	return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id);
 }
 
 __be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp,
 			  kgid_t gid)
 {
-	u32 id = from_kgid(&init_user_ns, gid);
+	u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid);
 	return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id);
 }
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2b36aa0..2681c70 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -169,8 +169,8 @@
 	spin_unlock(&fp->fi_lock);
 
 	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
-		vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
-	fput(ls->ls_file);
+		vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
+	nfsd_file_put(ls->ls_file);
 
 	if (ls->ls_recalled)
 		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -197,7 +197,7 @@
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = ls;
 	fl->fl_pid = current->tgid;
-	fl->fl_file = ls->ls_file;
+	fl->fl_file = ls->ls_file->nf_file;
 
 	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
 	if (status) {
@@ -236,13 +236,13 @@
 			NFSPROC4_CLNT_CB_LAYOUT);
 
 	if (parent->sc_type == NFS4_DELEG_STID)
-		ls->ls_file = get_file(fp->fi_deleg_file);
+		ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
 	else
 		ls->ls_file = find_any_file(fp);
 	BUG_ON(!ls->ls_file);
 
 	if (nfsd4_layout_setlease(ls)) {
-		fput(ls->ls_file);
+		nfsd_file_put(ls->ls_file);
 		put_nfs4_file(fp);
 		kmem_cache_free(nfs4_layout_stateid_cache, ls);
 		return NULL;
@@ -626,7 +626,7 @@
 
 	argv[0] = (char *)nfsd_recall_failed;
 	argv[1] = addr_str;
-	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+	argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
 	argv[3] = NULL;
 
 	error = call_usermodehelper(nfsd_recall_failed, argv, envp,
@@ -656,7 +656,6 @@
 	struct nfsd_net *nn;
 	ktime_t now, cutoff;
 	const struct nfsd4_layout_ops *ops;
-	LIST_HEAD(reaplist);
 
 
 	switch (task->tk_status) {
@@ -694,7 +693,7 @@
 			ops->fence_client(ls);
 		else
 			nfsd4_cb_layout_fail(ls);
-		return -1;
+		return 1;
 	case -NFS4ERR_NOMATCHING_LAYOUT:
 		trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
 		task->tk_status = 0;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9d6b4f0..4e3e77b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -36,6 +36,7 @@
 #include <linux/file.h>
 #include <linux/falloc.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
 
 #include "idmap.h"
 #include "cache.h"
@@ -426,6 +427,7 @@
 				goto out;
 			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 			reclaim = true;
+			/* fall through */
 		case NFS4_OPEN_CLAIM_FH:
 		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 			status = do_open_fhandle(rqstp, cstate, open);
@@ -566,17 +568,11 @@
 
 static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
 {
-	__be32 verf[2];
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	__be32 *verf = (__be32 *)verifier->data;
 
-	/*
-	 * This is opaque to client, so no need to byte-swap. Use
-	 * __force to keep sparse happy. y2038 time_t overflow is
-	 * irrelevant in this usage.
-	 */
-	verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
-	verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
-	memcpy(verifier->data, verf, sizeof(verifier->data));
+	BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+	nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
 }
 
 static __be32
@@ -759,7 +755,7 @@
 	struct nfsd4_read *read = &u->read;
 	__be32 status;
 
-	read->rd_filp = NULL;
+	read->rd_nf = NULL;
 	if (read->rd_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
@@ -780,7 +776,7 @@
 	/* check stateid */
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					&read->rd_stateid, RD_STATE,
-					&read->rd_filp, &read->rd_tmp_file);
+					&read->rd_nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
@@ -796,8 +792,8 @@
 static void
 nfsd4_read_release(union nfsd4_op_u *u)
 {
-	if (u->read.rd_filp)
-		fput(u->read.rd_filp);
+	if (u->read.rd_nf)
+		nfsd_file_put(u->read.rd_nf);
 	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
 			     u->read.rd_offset, u->read.rd_length);
 }
@@ -862,8 +858,7 @@
 	struct nfsd4_rename *rename = &u->rename;
 	__be32 status;
 
-	if (opens_in_grace(SVC_NET(rqstp)) &&
-		!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
+	if (opens_in_grace(SVC_NET(rqstp)))
 		return nfserr_grace;
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
 			     rename->rn_snamelen, &cstate->current_fh,
@@ -953,7 +948,7 @@
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		status = nfs4_preprocess_stateid_op(rqstp, cstate,
 				&cstate->current_fh, &setattr->sa_stateid,
-				WR_STATE, NULL, NULL);
+				WR_STATE, NULL);
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
 			return status;
@@ -992,7 +987,7 @@
 {
 	struct nfsd4_write *write = &u->write;
 	stateid_t *stateid = &write->wr_stateid;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	__be32 status = nfs_ok;
 	unsigned long cnt;
 	int nvecs;
@@ -1004,7 +999,7 @@
 	trace_nfsd_write_start(rqstp, &cstate->current_fh,
 			       write->wr_offset, cnt);
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-						stateid, WR_STATE, &filp, NULL);
+						stateid, WR_STATE, &nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
@@ -1015,14 +1010,12 @@
 
 	nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
 				      &write->wr_head, write->wr_buflen);
-	if (!nvecs)
-		return nfserr_io;
 	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
 
-	status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
+	status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file,
 				write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
 				write->wr_how_written);
-	fput(filp);
+	nfsd_file_put(nf);
 
 	write->wr_bytes_written = cnt;
 	trace_nfsd_write_done(rqstp, &cstate->current_fh,
@@ -1032,8 +1025,8 @@
 
 static __be32
 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-		  stateid_t *src_stateid, struct file **src,
-		  stateid_t *dst_stateid, struct file **dst)
+		  stateid_t *src_stateid, struct nfsd_file **src,
+		  stateid_t *dst_stateid, struct nfsd_file **dst)
 {
 	__be32 status;
 
@@ -1041,22 +1034,22 @@
 		return nfserr_nofilehandle;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
-					    src_stateid, RD_STATE, src, NULL);
+					    src_stateid, RD_STATE, src);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
 		goto out;
 	}
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-					    dst_stateid, WR_STATE, dst, NULL);
+					    dst_stateid, WR_STATE, dst);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
 		goto out_put_src;
 	}
 
 	/* fix up for NFS-specific error code */
-	if (!S_ISREG(file_inode(*src)->i_mode) ||
-	    !S_ISREG(file_inode(*dst)->i_mode)) {
+	if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
+	    !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) {
 		status = nfserr_wrong_type;
 		goto out_put_dst;
 	}
@@ -1064,9 +1057,9 @@
 out:
 	return status;
 out_put_dst:
-	fput(*dst);
+	nfsd_file_put(*dst);
 out_put_src:
-	fput(*src);
+	nfsd_file_put(*src);
 	goto out;
 }
 
@@ -1075,7 +1068,7 @@
 		union nfsd4_op_u *u)
 {
 	struct nfsd4_clone *clone = &u->clone;
-	struct file *src, *dst;
+	struct nfsd_file *src, *dst;
 	__be32 status;
 
 	status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
@@ -1083,68 +1076,306 @@
 	if (status)
 		goto out;
 
-	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
-			dst, clone->cl_dst_pos, clone->cl_count);
+	status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos,
+			dst->nf_file, clone->cl_dst_pos, clone->cl_count);
 
-	fput(dst);
-	fput(src);
+	nfsd_file_put(dst);
+	nfsd_file_put(src);
 out:
 	return status;
 }
 
+void nfs4_put_copy(struct nfsd4_copy *copy)
+{
+	if (!refcount_dec_and_test(&copy->refcount))
+		return;
+	kfree(copy);
+}
+
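+/*
+ * Test and set copy->stopped under the client's async_lock. Returns the
+ * old value, so exactly one caller sees false and goes on to stop the
+ * copy thread.
+ */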
+static bool
+check_and_set_stop_copy(struct nfsd4_copy *copy)
+{
+	bool value;
+
+	spin_lock(&copy->cp_clp->async_lock);
+	value = copy->stopped;
+	if (!copy->stopped)
+		copy->stopped = true;
+	spin_unlock(&copy->cp_clp->async_lock);
+	return value;
+}
+
+static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+{
+	/* only 1 thread should stop the copy */
+	if (!check_and_set_stop_copy(copy))
+		kthread_stop(copy->copy_task);
+	nfs4_put_copy(copy);
+}
+
+static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp)
+{
+	struct nfsd4_copy *copy = NULL;
+
+	spin_lock(&clp->async_lock);
+	if (!list_empty(&clp->async_copies)) {
+		copy = list_first_entry(&clp->async_copies, struct nfsd4_copy,
+					copies);
+		refcount_inc(&copy->refcount);
+	}
+	spin_unlock(&clp->async_lock);
+	return copy;
+}
+
+void nfsd4_shutdown_copy(struct nfs4_client *clp)
+{
+	struct nfsd4_copy *copy;
+
+	while ((copy = nfsd4_get_copy(clp)) != NULL)
+		nfsd4_stop_copy(copy);
+}
+
+static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+{
+	struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+
+	nfs4_put_copy(copy);
+}
+
+static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+				 struct rpc_task *task)
+{
+	return 1;
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+	.release = nfsd4_cb_offload_release,
+	.done = nfsd4_cb_offload_done
+};
+
+static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+{
+	copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+	copy->cp_synchronous = sync;
+	gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+}
+
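+/*
+ * Copy in a loop, since nfsd_copy_file_range() may copy fewer bytes than
+ * requested. A synchronous copy makes a single pass; an async copy keeps
+ * going until it is done, hits an error, or its kthread is told to stop.
+ */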
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+{
+	ssize_t bytes_copied = 0;
+	size_t bytes_total = copy->cp_count;
+	u64 src_pos = copy->cp_src_pos;
+	u64 dst_pos = copy->cp_dst_pos;
+
+	do {
+		if (kthread_should_stop())
+			break;
+		bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
+				src_pos, copy->nf_dst->nf_file, dst_pos,
+				bytes_total);
+		if (bytes_copied <= 0)
+			break;
+		bytes_total -= bytes_copied;
+		copy->cp_res.wr_bytes_written += bytes_copied;
+		src_pos += bytes_copied;
+		dst_pos += bytes_copied;
+	} while (bytes_total > 0 && !copy->cp_synchronous);
+	return bytes_copied;
+}
+
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+{
+	__be32 status;
+	ssize_t bytes;
+
+	bytes = _nfsd_copy_file_range(copy);
+	/*
+	 * For an async copy we ignore the error here; the client can
+	 * always retry the copy to get the error.
+	 */
+	if (bytes < 0 && !copy->cp_res.wr_bytes_written)
+		status = nfserrno(bytes);
+	else {
+		nfsd4_init_copy_res(copy, sync);
+		status = nfs_ok;
+	}
+
+	nfsd_file_put(copy->nf_src);
+	nfsd_file_put(copy->nf_dst);
+	return status;
+}
+
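+/*
+ * Duplicate the fields the async worker needs into its private nfsd4_copy,
+ * taking extra references on the source and destination files so that they
+ * stay open for the lifetime of the background copy.
+ */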
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+{
+	dst->cp_src_pos = src->cp_src_pos;
+	dst->cp_dst_pos = src->cp_dst_pos;
+	dst->cp_count = src->cp_count;
+	dst->cp_synchronous = src->cp_synchronous;
+	memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+	memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+	dst->cp_clp = src->cp_clp;
+	dst->nf_dst = nfsd_file_get(src->nf_dst);
+	dst->nf_src = nfsd_file_get(src->nf_src);
+	memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+}
+
+static void cleanup_async_copy(struct nfsd4_copy *copy)
+{
+	nfs4_free_cp_state(copy);
+	nfsd_file_put(copy->nf_dst);
+	nfsd_file_put(copy->nf_src);
+	spin_lock(&copy->cp_clp->async_lock);
+	list_del(&copy->copies);
+	spin_unlock(&copy->cp_clp->async_lock);
+	nfs4_put_copy(copy);
+}
+
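+/*
+ * Body of the async copy kthread: perform the copy, then send the client
+ * a CB_OFFLOAD callback with the result. The callback gets its own
+ * nfsd4_copy, since this one is torn down by cleanup_async_copy().
+ */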
+static int nfsd4_do_async_copy(void *data)
+{
+	struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+	struct nfsd4_copy *cb_copy;
+
+	copy->nfserr = nfsd4_do_copy(copy, 0);
+	cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+	if (!cb_copy)
+		goto out;
+	refcount_set(&cb_copy->refcount, 1); /* released by the callback */
+	memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+	cb_copy->cp_clp = copy->cp_clp;
+	cb_copy->nfserr = copy->nfserr;
+	memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+	nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+			&nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+	nfsd4_run_cb(&cb_copy->cp_cb);
+out:
+	cleanup_async_copy(copy);
+	return 0;
+}
+
 static __be32
 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		union nfsd4_op_u *u)
 {
 	struct nfsd4_copy *copy = &u->copy;
-	struct file *src, *dst;
 	__be32 status;
-	ssize_t bytes;
+	struct nfsd4_copy *async_copy = NULL;
 
-	status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
-				   &copy->cp_dst_stateid, &dst);
+	status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
+				   &copy->nf_src, &copy->cp_dst_stateid,
+				   &copy->nf_dst);
 	if (status)
 		goto out;
 
-	bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
-			dst, copy->cp_dst_pos, copy->cp_count);
+	copy->cp_clp = cstate->clp;
+	memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+		sizeof(struct knfsd_fh));
+	if (!copy->cp_synchronous) {
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	if (bytes < 0)
-		status = nfserrno(bytes);
-	else {
-		copy->cp_res.wr_bytes_written = bytes;
-		copy->cp_res.wr_stable_how = NFS_UNSTABLE;
-		copy->cp_synchronous = 1;
-		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+		status = nfserrno(-ENOMEM);
+		async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+		if (!async_copy)
+			goto out;
+		if (!nfs4_init_cp_state(nn, copy)) {
+			kfree(async_copy);
+			goto out;
+		}
+		refcount_set(&async_copy->refcount, 1);
+		memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
+			sizeof(copy->cp_stateid));
+		dup_copy_fields(copy, async_copy);
+		async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+				async_copy, "%s", "copy thread");
+		if (IS_ERR(async_copy->copy_task))
+			goto out_err;
+		spin_lock(&async_copy->cp_clp->async_lock);
+		list_add(&async_copy->copies,
+				&async_copy->cp_clp->async_copies);
+		spin_unlock(&async_copy->cp_clp->async_lock);
+		wake_up_process(async_copy->copy_task);
 		status = nfs_ok;
-	}
-
-	fput(src);
-	fput(dst);
+	} else
+		status = nfsd4_do_copy(copy, 1);
 out:
 	return status;
+out_err:
+	cleanup_async_copy(async_copy);
+	goto out;
+}
+
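+/*
+ * Find an async copy by its copy stateid and take a reference to it.
+ * Returns NULL if no matching copy is in flight for this client.
+ */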
+struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+	struct nfsd4_copy *copy;
+
+	spin_lock(&clp->async_lock);
+	list_for_each_entry(copy, &clp->async_copies, copies) {
+		if (memcmp(&copy->cp_stateid, stateid, NFS4_STATEID_SIZE))
+			continue;
+		refcount_inc(&copy->refcount);
+		spin_unlock(&clp->async_lock);
+		return copy;
+	}
+	spin_unlock(&clp->async_lock);
+	return NULL;
+}
+
+static __be32
+nfsd4_offload_cancel(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     union nfsd4_op_u *u)
+{
+	struct nfsd4_offload_status *os = &u->offload_status;
+	__be32 status = 0;
+	struct nfsd4_copy *copy;
+	struct nfs4_client *clp = cstate->clp;
+
+	copy = find_async_copy(clp, &os->stateid);
+	if (copy)
+		nfsd4_stop_copy(copy);
+	else
+		status = nfserr_bad_stateid;
+
+	return status;
 }
 
 static __be32
 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfsd4_fallocate *fallocate, int flags)
 {
-	__be32 status = nfserr_notsupp;
-	struct file *file;
+	__be32 status;
+	struct nfsd_file *nf;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &fallocate->falloc_stateid,
-					    WR_STATE, &file, NULL);
+					    WR_STATE, &nf);
 	if (status != nfs_ok) {
 		dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
 		return status;
 	}
 
-	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
+	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
 				     fallocate->falloc_offset,
 				     fallocate->falloc_length,
 				     flags);
-	fput(file);
+	nfsd_file_put(nf);
+	return status;
+}
+static __be32
+nfsd4_offload_status(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     union nfsd4_op_u *u)
+{
+	struct nfsd4_offload_status *os = &u->offload_status;
+	__be32 status = 0;
+	struct nfsd4_copy *copy;
+	struct nfs4_client *clp = cstate->clp;
+
+	copy = find_async_copy(clp, &os->stateid);
+	if (copy) {
+		os->count = copy->cp_res.wr_bytes_written;
+		nfs4_put_copy(copy);
+	} else
+		status = nfserr_bad_stateid;
+
 	return status;
 }
 
@@ -1170,11 +1401,11 @@
 	struct nfsd4_seek *seek = &u->seek;
 	int whence;
 	__be32 status;
-	struct file *file;
+	struct nfsd_file *nf;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &seek->seek_stateid,
-					    RD_STATE, &file, NULL);
+					    RD_STATE, &nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
 		return status;
@@ -1196,14 +1427,14 @@
 	 * Note:  This call does change file->f_pos, but nothing in NFSD
 	 *        should ever use file->f_pos.
 	 */
-	seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+	seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence);
 	if (seek->seek_pos < 0)
 		status = nfserrno(seek->seek_pos);
-	else if (seek->seek_pos >= i_size_read(file_inode(file)))
+	else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file)))
 		seek->seek_eof = true;
 
 out:
-	fput(file);
+	nfsd_file_put(nf);
 	return status;
 }
 
@@ -1691,6 +1922,7 @@
 	struct nfsd4_compound_state *cstate = &resp->cstate;
 	struct svc_fh *current_fh = &cstate->current_fh;
 	struct svc_fh *save_fh = &cstate->save_fh;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	__be32		status;
 
 	svcxdr_init_encode(rqstp, resp);
@@ -1713,7 +1945,7 @@
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
+	if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
 		goto out;
 	status = nfserr_resource;
 	if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
@@ -2050,6 +2282,14 @@
 		1 /* cr_synchronous */) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+					     struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		2 /* osr_count */ +
+		1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
@@ -2436,25 +2676,25 @@
 	/* NFSv4.2 operations */
 	[OP_ALLOCATE] = {
 		.op_func = nfsd4_allocate,
-		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_ALLOCATE",
 		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
 	[OP_DEALLOCATE] = {
 		.op_func = nfsd4_deallocate,
-		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DEALLOCATE",
 		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
 	[OP_CLONE] = {
 		.op_func = nfsd4_clone,
-		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_CLONE",
 		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
 	[OP_COPY] = {
 		.op_func = nfsd4_copy,
-		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_COPY",
 		.op_rsize_bop = nfsd4_copy_rsize,
 	},
@@ -2463,6 +2703,17 @@
 		.op_name = "OP_SEEK",
 		.op_rsize_bop = nfsd4_seek_rsize,
 	},
+	[OP_OFFLOAD_STATUS] = {
+		.op_func = nfsd4_offload_status,
+		.op_name = "OP_OFFLOAD_STATUS",
+		.op_rsize_bop = nfsd4_offload_status_rsize,
+	},
+	[OP_OFFLOAD_CANCEL] = {
+		.op_func = nfsd4_offload_cancel,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_OFFLOAD_CANCEL",
+		.op_rsize_bop = nfsd4_only_status_rsize,
+	},
 };
 
 /**
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c247fa..cdc75ad 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -59,8 +59,13 @@
 	void (*remove)(struct nfs4_client *);
 	int (*check)(struct nfs4_client *);
 	void (*grace_done)(struct nfsd_net *);
+	uint8_t version;
+	size_t msglen;
 };
 
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
+
 /* Globals */
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 
@@ -126,7 +131,6 @@
 		SHASH_DESC_ON_STACK(desc, tfm);
 
 		desc->tfm = tfm;
-		desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
 		status = crypto_shash_digest(desc, clname->data, clname->len,
 					     cksum.data);
@@ -170,12 +174,34 @@
 }
 
 static void
+__nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
+		const char *dname, int len, struct nfsd_net *nn)
+{
+	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
+	struct nfs4_client_reclaim *crp;
+
+	name.data = kmemdup(dname, len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+	name.len = len;
+	crp = nfs4_client_to_reclaim(name, princhash, nn);
+	if (!crp) {
+		kfree(name.data);
+		return;
+	}
+	crp->cr_clp = clp;
+}
+
+static void
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char dname[HEXDIR_LEN];
 	struct dentry *dir, *dentry;
-	struct nfs4_client_reclaim *crp;
 	int status;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
@@ -221,11 +247,9 @@
 out_unlock:
 	inode_unlock(d_inode(dir));
 	if (status == 0) {
-		if (nn->in_grace) {
-			crp = nfs4_client_to_reclaim(dname, nn);
-			if (crp)
-				crp->cr_clp = clp;
-		}
+		if (nn->in_grace)
+			__nfsd4_create_reclaim_record_grace(clp, dname,
+					HEXDIR_LEN, nn);
 		vfs_fsync(nn->rec_file, 0);
 	} else {
 		printk(KERN_ERR "NFSD: failed to write recovery record"
@@ -346,10 +370,29 @@
 }
 
 static void
+__nfsd4_remove_reclaim_record_grace(const char *dname, int len,
+		struct nfsd_net *nn)
+{
+	struct xdr_netobj name;
+	struct nfs4_client_reclaim *crp;
+
+	name.data = kmemdup(dname, len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+	name.len = len;
+	crp = nfsd4_find_reclaim_client(name, nn);
+	kfree(name.data);
+	if (crp)
+		nfs4_remove_reclaim_record(crp, nn);
+}
+
+static void
 nfsd4_remove_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
-	struct nfs4_client_reclaim *crp;
 	char dname[HEXDIR_LEN];
 	int status;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -374,12 +417,9 @@
 	nfs4_reset_creds(original_cred);
 	if (status == 0) {
 		vfs_fsync(nn->rec_file, 0);
-		if (nn->in_grace) {
-			/* remove reclaim record */
-			crp = nfsd4_find_reclaim_client(dname, nn);
-			if (crp)
-				nfs4_remove_reclaim_record(crp, nn);
-		}
+		if (nn->in_grace)
+			__nfsd4_remove_reclaim_record_grace(dname,
+					HEXDIR_LEN, nn);
 	}
 out_drop_write:
 	mnt_drop_write_file(nn->rec_file);
@@ -393,14 +433,31 @@
 purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	int status;
+	struct xdr_netobj name;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name, nn))
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
+		/* Keep trying; maybe the others are OK: */
 		return 0;
+	}
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out;
+	}
+	name.len = HEXDIR_LEN;
+	if (nfs4_has_reclaimed_state(name, nn))
+		goto out_free;
 
 	status = vfs_rmdir(d_inode(parent), child);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
+out_free:
+	kfree(name.data);
+out:
 	/* Keep trying, success or failure: */
 	return 0;
 }
@@ -430,13 +487,25 @@
 static int
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
+	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
+
 	if (child->d_name.len != HEXDIR_LEN - 1) {
-		printk("nfsd4: illegal name %pd in recovery directory\n",
-				child);
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
 		/* Keep trying; maybe the others are OK: */
 		return 0;
 	}
-	nfs4_client_to_reclaim(child->d_name.name, nn);
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out;
+	}
+	name.len = HEXDIR_LEN;
+	if (!nfs4_client_to_reclaim(name, princhash, nn))
+		kfree(name.data);
+out:
 	return 0;
 }
 
@@ -565,6 +634,7 @@
 	status = nfsd4_load_reboot_recovery_data(net);
 	if (status)
 		goto err;
+	printk("NFSD: Using legacy client tracking operations.\n");
 	return 0;
 
 err:
@@ -616,6 +686,7 @@
 	char dname[HEXDIR_LEN];
 	struct nfs4_client_reclaim *crp;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct xdr_netobj name;
 
 	/* did we already find that this client is stable? */
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
@@ -628,13 +699,22 @@
 	}
 
 	/* look for it in the reclaim hashtable otherwise */
-	crp = nfsd4_find_reclaim_client(dname, nn);
+	name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out_enoent;
+	}
+	name.len = HEXDIR_LEN;
+	crp = nfsd4_find_reclaim_client(name, nn);
+	kfree(name.data);
 	if (crp) {
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 		crp->cr_clp = clp;
 		return 0;
 	}
 
+out_enoent:
 	return -ENOENT;
 }
 
@@ -645,6 +725,8 @@
 	.remove		= nfsd4_remove_clid_dir,
 	.check		= nfsd4_check_legacy_client,
 	.grace_done	= nfsd4_recdir_purge_old,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 /* Globals */
@@ -657,37 +739,40 @@
 	spinlock_t		 cn_lock;
 	struct list_head	 cn_list;
 	unsigned int		 cn_xid;
+	bool			 cn_has_legacy;
+	struct crypto_shash	*cn_tfm;
 };
 
 struct cld_upcall {
 	struct list_head	 cu_list;
 	struct cld_net		*cu_net;
-	struct task_struct	*cu_task;
-	struct cld_msg		 cu_msg;
+	struct completion	 cu_done;
+	union {
+		struct cld_msg_hdr	 cu_hdr;
+		struct cld_msg		 cu_msg;
+		struct cld_msg_v2	 cu_msg_v2;
+	} cu_u;
 };
 
 static int
-__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 	struct rpc_pipe_msg msg;
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u);
+	struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info,
+					  nfsd_net_id);
 
 	memset(&msg, 0, sizeof(msg));
 	msg.data = cmsg;
-	msg.len = sizeof(*cmsg);
+	msg.len = nn->client_tracking_ops->msglen;
 
-	/*
-	 * Set task state before we queue the upcall. That prevents
-	 * wake_up_process in the downcall from racing with schedule.
-	 */
-	set_current_state(TASK_UNINTERRUPTIBLE);
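+	/*
+	 * Waiting on a completion (rather than wake_up_process() on a
+	 * saved task pointer) means a downcall that arrives before we
+	 * sleep cannot be lost.
+	 */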
 	ret = rpc_queue_upcall(pipe, &msg);
 	if (ret < 0) {
-		set_current_state(TASK_RUNNING);
 		goto out;
 	}
 
-	schedule();
+	wait_for_completion(&cup->cu_done);
 
 	if (msg.errno < 0)
 		ret = msg.errno;
@@ -696,7 +781,7 @@
 }
 
 static int
-cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 
@@ -712,34 +797,107 @@
 }
 
 static ssize_t
+__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
+		struct nfsd_net *nn)
+{
+	uint8_t cmd, princhashlen;
+	struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
+	uint16_t namelen;
+	struct cld_net *cn = nn->cld_net;
+
+	if (get_user(cmd, &cmsg->cm_cmd)) {
+		dprintk("%s: error when copying cmd from userspace", __func__);
+		return -EFAULT;
+	}
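+	/*
+	 * A Cld_GraceStart downcall is nfsdcld answering our GraceStart
+	 * upcall with -EINPROGRESS: one reply per record in its database,
+	 * each carrying a client name (plus a principal hash for v2) to
+	 * add to the reclaim table.
+	 */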
+	if (cmd == Cld_GraceStart) {
+		if (nn->client_tracking_ops->version >= 2) {
+			const struct cld_clntinfo __user *ci;
+
+			ci = &cmsg->cm_u.cm_clntinfo;
+			if (get_user(namelen, &ci->cc_name.cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+			get_user(princhashlen, &ci->cc_princhash.cp_len);
+			if (princhashlen > 0) {
+				princhash.data = memdup_user(
+						&ci->cc_princhash.cp_data,
+						princhashlen);
+				if (IS_ERR_OR_NULL(princhash.data))
+					return -EFAULT;
+				princhash.len = princhashlen;
+			} else
+				princhash.len = 0;
+		} else {
+			const struct cld_name __user *cnm;
+
+			cnm = &cmsg->cm_u.cm_name;
+			if (get_user(namelen, &cnm->cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&cnm->cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+		}
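+		/*
+		 * Records converted from the legacy on-disk format hold
+		 * hex-hashed client names, which nfsdcld marks with a
+		 * "hash:" prefix.  Strip the prefix and note that legacy
+		 * records exist so the check routines also try a
+		 * hashed-name lookup.
+		 */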
+		if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
+			name.len = name.len - 5;
+			memmove(name.data, name.data + 5, name.len);
+			cn->cn_has_legacy = true;
+		}
+		if (!nfs4_client_to_reclaim(name, princhash, nn)) {
+			kfree(name.data);
+			kfree(princhash.data);
+			return -EFAULT;
+		}
+		return nn->client_tracking_ops->msglen;
+	}
+	return -EFAULT;
+}
+
+static ssize_t
 cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
 	struct cld_upcall *tmp, *cup;
-	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
+	struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
+	struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src;
 	uint32_t xid;
 	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
 						nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
+	int16_t status;
 
-	if (mlen != sizeof(*cmsg)) {
+	if (mlen != nn->client_tracking_ops->msglen) {
 		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
-			sizeof(*cmsg));
+			nn->client_tracking_ops->msglen);
 		return -EINVAL;
 	}
 
 	/* copy just the xid so we can try to find that */
-	if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+	if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) {
 		dprintk("%s: error when copying xid from userspace", __func__);
 		return -EFAULT;
 	}
 
+	/*
+	 * copy the status so we know whether to remove the upcall from the
+	 * list (for -EINPROGRESS, we just want to make sure the xid is
+	 * valid, not remove the upcall from the list)
+	 */
+	if (get_user(status, &hdr->cm_status)) {
+		dprintk("%s: error when copying status from userspace", __func__);
+		return -EFAULT;
+	}
+
 	/* walk the list and find corresponding xid */
 	cup = NULL;
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+		if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) {
 			cup = tmp;
-			list_del_init(&cup->cu_list);
+			if (status != -EINPROGRESS)
+				list_del_init(&cup->cu_list);
 			break;
 		}
 	}
@@ -751,10 +909,13 @@
 		return -EINVAL;
 	}
 
-	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+	if (status == -EINPROGRESS)
+		return __cld_pipe_inprogress_downcall(cmsg, nn);
+
+	if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0)
 		return -EFAULT;
 
-	wake_up_process(cup->cu_task);
+	complete(&cup->cu_done);
 	return mlen;
 }
 
@@ -763,13 +924,13 @@
 {
 	struct cld_msg *cmsg = msg->data;
 	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
-						 cu_msg);
+						 cu_u.cu_msg);
 
 	/* errno >= 0 means we got a downcall */
 	if (msg->errno >= 0)
 		return;
 
-	wake_up_process(cup->cu_task);
+	complete(&cup->cu_done);
 }
 
 static const struct rpc_pipe_ops cld_upcall_ops = {
@@ -826,7 +987,7 @@
 
 /* Initialize rpc_pipefs pipe for communication with client tracking daemon */
 static int
-nfsd4_init_cld_pipe(struct net *net)
+__nfsd4_init_cld_pipe(struct net *net)
 {
 	int ret;
 	struct dentry *dentry;
@@ -857,6 +1018,7 @@
 	}
 
 	cn->cn_pipe->dentry = dentry;
+	cn->cn_has_legacy = false;
 	nn->cld_net = cn;
 	return 0;
 
@@ -869,6 +1031,17 @@
 	return ret;
 }
 
+static int
+nfsd4_init_cld_pipe(struct net *net)
+{
+	int status;
+
+	status = __nfsd4_init_cld_pipe(net);
+	if (!status)
+		printk("NFSD: Using old nfsdcld client tracking operations.\n");
+	return status;
+}
+
 static void
 nfsd4_remove_cld_pipe(struct net *net)
 {
@@ -877,14 +1050,17 @@
 
 	nfsd4_cld_unregister_net(net, cn->cn_pipe);
 	rpc_destroy_pipe_data(cn->cn_pipe);
+	if (cn->cn_tfm)
+		crypto_free_shash(cn->cn_tfm);
 	kfree(nn->cld_net);
 	nn->cld_net = NULL;
 }
 
 static struct cld_upcall *
-alloc_cld_upcall(struct cld_net *cn)
+alloc_cld_upcall(struct nfsd_net *nn)
 {
 	struct cld_upcall *new, *tmp;
+	struct cld_net *cn = nn->cld_net;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
@@ -894,20 +1070,20 @@
 restart_search:
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+		if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) {
 			cn->cn_xid++;
 			spin_unlock(&cn->cn_lock);
 			goto restart_search;
 		}
 	}
-	new->cu_task = current;
-	new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
-	put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+	init_completion(&new->cu_done);
+	new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version;
+	put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid);
 	new->cu_net = cn;
 	list_add(&new->cu_list, &cn->cn_list);
 	spin_unlock(&cn->cn_lock);
 
-	dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+	dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid);
 
 	return new;
 }
@@ -936,20 +1112,20 @@
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Create;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Create;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -962,6 +1138,75 @@
 
 /* Ask daemon to create a new record */
 static void
+nfsd4_cld_create_v2(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct cld_msg_v2 *cmsg;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* Don't upcall if it's already stored */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cmsg = &cup->cu_u.cu_msg_v2;
+	cmsg->cm_cmd = Cld_Create;
+	cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len;
+	memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+	if (clp->cl_cred.cr_raw_principal)
+		principal = clp->cl_cred.cr_raw_principal;
+	else if (clp->cl_cred.cr_principal)
+		principal = clp->cl_cred.cr_principal;
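+	/*
+	 * If the client has a krb5 principal, store a SHA-256 digest of
+	 * it with the record; nfsd4_cld_check_v2() recomputes the digest
+	 * at reclaim time and requires a match.
+	 */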
+	if (principal) {
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = crypto_shash_digest(desc, principal, strlen(principal),
+					  cksum.data);
+		shash_desc_zero(desc);
+		if (ret) {
+			kfree(cksum.data);
+			goto out;
+		}
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
+		memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
+		       cksum.data, cksum.len);
+		kfree(cksum.data);
+	} else
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
+
+	ret = cld_pipe_upcall(cn->cn_pipe, cmsg);
+	if (!ret) {
+		ret = cmsg->cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+out:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		pr_err("NFSD: Unable to create client record on stable storage: %d\n",
+				ret);
+}
+
+/* Ask daemon to remove a client record */
+static void
 nfsd4_cld_remove(struct nfs4_client *clp)
 {
 	int ret;
@@ -973,20 +1218,20 @@
 	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Remove;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Remove;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -997,9 +1242,14 @@
 				"record from stable storage: %d\n", ret);
 }
 
-/* Check for presence of a record, and update its timestamp */
+/*
+ * For older nfsdcld's that do not allow us to "slurp" the clients
+ * from the tracking database during startup.
+ *
+ * Check for presence of a record, and update its timestamp
+ */
 static int
-nfsd4_cld_check(struct nfs4_client *clp)
+nfsd4_cld_check_v0(struct nfs4_client *clp)
 {
 	int ret;
 	struct cld_upcall *cup;
@@ -1010,21 +1260,21 @@
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return 0;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		printk(KERN_ERR "NFSD: Unable to check client record on "
 				"stable storage: %d\n", -ENOMEM);
 		return -ENOMEM;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Check;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Check;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1032,24 +1282,173 @@
 	return ret;
 }
 
-static void
-nfsd4_cld_grace_done(struct nfsd_net *nn)
+/*
+ * For newer nfsdcld's that allow us to "slurp" the clients
+ * from the tracking database during startup.
+ *
+ * Check for presence of a record in the reclaim_str_hashtbl
+ */
+static int
+nfsd4_cld_check(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	int status;
+	char dname[HEXDIR_LEN];
+	struct xdr_netobj name;
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp)
+		goto found;
+
+	if (cn->cn_has_legacy) {
+		status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+		if (status)
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data!\n",
+				__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+		if (crp)
+			goto found;
+
+	}
+	return -ENOENT;
+found:
+	crp->cr_clp = clp;
+	return 0;
+}
+
+static int
+nfsd4_cld_check_v2(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	int status;
+	char dname[HEXDIR_LEN];
+	struct xdr_netobj name;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp)
+		goto found;
+
+	if (cn->cn_has_legacy) {
+		status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+		if (status)
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data\n",
+					__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+		if (crp)
+			goto found;
+
+	}
+	return -ENOENT;
+found:
+	if (crp->cr_princhash.len) {
+		if (clp->cl_cred.cr_raw_principal)
+			principal = clp->cl_cred.cr_raw_principal;
+		else if (clp->cl_cred.cr_principal)
+			principal = clp->cl_cred.cr_principal;
+		if (principal == NULL)
+			return -ENOENT;
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL)
+			return -ENOENT;
+		status = crypto_shash_digest(desc, principal, strlen(principal),
+					     cksum.data);
+		shash_desc_zero(desc);
+		if (status) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		if (memcmp(crp->cr_princhash.data, cksum.data,
+				crp->cr_princhash.len)) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		kfree(cksum.data);
+	}
+	crp->cr_clp = clp;
+	return 0;
+}
+
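+/*
+ * Ask a new-style nfsdcld to begin the grace period and to stream back
+ * every client record it has stored (as -EINPROGRESS downcalls), so the
+ * reclaim table is populated before clients start reclaiming.
+ */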
+static int
+nfsd4_cld_grace_start(struct nfsd_net *nn)
 {
 	int ret;
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
+
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get clients from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+/* For older nfsdcld's that need cm_gracetime */
+static void
+nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+	if (!ret)
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1057,13 +1456,223 @@
 		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
 }
 
-static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+/*
+ * For newer nfsdcld's that do not need cm_gracetime.  We also need to call
+ * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl.
+ */
+static void
+nfsd4_cld_grace_done(struct nfsd_net *nn)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+	if (!ret)
+		ret = cup->cu_u.cu_msg.cm_status;
+
+	free_cld_upcall(cup);
+out_err:
+	nfs4_release_reclaim(nn);
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+static int
+nfs4_cld_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int i;
+
+	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
+						sizeof(struct list_head),
+						GFP_KERNEL);
+	if (!nn->reclaim_str_hashtbl)
+		return -ENOMEM;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+	nn->reclaim_str_hashtbl_size = 0;
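+	/*
+	 * With a pre-populated reclaim table we can count RECLAIM_COMPLETEs
+	 * and end the grace period early once every known client has
+	 * finished reclaiming (see inc_reclaim_complete()).
+	 */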
+	nn->track_reclaim_completes = true;
+	atomic_set(&nn->nr_reclaim_complete, 0);
+
+	return 0;
+}
+
+static void
+nfs4_cld_state_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nn->track_reclaim_completes = false;
+	kfree(nn->reclaim_str_hashtbl);
+}
+
+static bool
+cld_running(struct nfsd_net *nn)
+{
+	struct cld_net *cn = nn->cld_net;
+	struct rpc_pipe *pipe = cn->cn_pipe;
+
+	return pipe->nreaders || pipe->nwriters;
+}
+
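+/*
+ * Ask nfsdcld which upcall version it supports, clamp the answer to
+ * [1, CLD_UPCALL_VERSION], and select the matching tracking ops (v2
+ * adds a principal hash to the create and check upcalls).
+ */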
+static int
+nfsd4_cld_get_version(struct nfsd_net *nn)
+{
+	int ret = 0;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+	uint8_t version;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+	if (!ret) {
+		ret = cup->cu_u.cu_msg.cm_status;
+		if (ret)
+			goto out_free;
+		version = cup->cu_u.cu_msg.cm_u.cm_version;
+		dprintk("%s: userspace returned version %u\n",
+				__func__, version);
+		if (version < 1)
+			version = 1;
+		else if (version > CLD_UPCALL_VERSION)
+			version = CLD_UPCALL_VERSION;
+
+		switch (version) {
+		case 1:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+			break;
+		case 2:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2;
+			break;
+		default:
+			break;
+		}
+	}
+out_free:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get version from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+static int
+nfsd4_cld_tracking_init(struct net *net)
+{
+	int status;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool running;
+	int retries = 10;
+
+	status = nfs4_cld_state_init(net);
+	if (status)
+		return status;
+
+	status = __nfsd4_init_cld_pipe(net);
+	if (status)
+		goto err_shutdown;
+	nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(nn->cld_net->cn_tfm)) {
+		status = PTR_ERR(nn->cld_net->cn_tfm);
+		goto err_remove;
+	}
+
+	/*
+	 * rpc pipe upcalls take 30 seconds to time out, so we don't want to
+	 * queue an upcall unless we know that nfsdcld is running (because we
+	 * want this to fail fast so that nfsd4_client_tracking_init() can try
+	 * the next client tracking method).  nfsdcld should already be running
+	 * before nfsd is started, so the wait here is for nfsdcld to open the
+	 * pipefs file we just created.
+	 */
+	while (!(running = cld_running(nn)) && retries--)
+		msleep(100);
+
+	if (!running) {
+		status = -ETIMEDOUT;
+		goto err_remove;
+	}
+
+	status = nfsd4_cld_get_version(nn);
+	if (status == -EOPNOTSUPP)
+		pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n");
+
+	status = nfsd4_cld_grace_start(nn);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n");
+		nfs4_release_reclaim(nn);
+		goto err_remove;
+	} else
+		printk("NFSD: Using nfsdcld client tracking operations.\n");
+	return 0;
+
+err_remove:
+	nfsd4_remove_cld_pipe(net);
+err_shutdown:
+	nfs4_cld_state_shutdown(net);
+	return status;
+}
+
+static void
+nfsd4_cld_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nfs4_release_reclaim(nn);
+	nfsd4_remove_cld_pipe(net);
+	nfs4_cld_state_shutdown(net);
+}
+
+/* For older nfsdcld's */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
 	.init		= nfsd4_init_cld_pipe,
 	.exit		= nfsd4_remove_cld_pipe,
 	.create		= nfsd4_cld_create,
 	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check_v0,
+	.grace_done	= nfsd4_cld_grace_done_v0,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
+};
+
+/* For newer nfsdcld's */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+	.init		= nfsd4_cld_tracking_init,
+	.exit		= nfsd4_cld_tracking_exit,
+	.create		= nfsd4_cld_create,
+	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check,
 	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
+};
+
+/* v2 create/check ops include the principal, if available */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
+	.init		= nfsd4_cld_tracking_init,
+	.exit		= nfsd4_cld_tracking_exit,
+	.create		= nfsd4_cld_create_v2,
+	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check_v2,
+	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 2,
+	.msglen		= sizeof(struct cld_msg_v2),
 };
 
 /* upcall via usermodehelper */
@@ -1273,6 +1882,8 @@
 
 	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
 	kfree(grace_start);
+	if (!ret)
+		printk("NFSD: Using UMH upcall client tracking operations.\n");
 	return ret;
 }
 
@@ -1409,6 +2020,8 @@
 	.remove		= nfsd4_umh_cltrack_remove,
 	.check		= nfsd4_umh_cltrack_check,
 	.grace_done	= nfsd4_umh_cltrack_grace_done,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 int
@@ -1422,9 +2035,20 @@
 	if (nn->client_tracking_ops)
 		goto do_init;
 
+	/* First, try to use nfsdcld */
+	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+	status = nn->client_tracking_ops->init(net);
+	if (!status)
+		return status;
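+	/*
+	 * -ETIMEDOUT means nfsdcld never opened the pipe, i.e. the daemon
+	 * is not running at all, so don't bother retrying with the v0
+	 * nfsdcld ops either.
+	 */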
+	if (status != -ETIMEDOUT) {
+		nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
+		status = nn->client_tracking_ops->init(net);
+		if (!status)
+			return status;
+	}
+
 	/*
-	 * First, try a UMH upcall. It should succeed or fail quickly, so
-	 * there's little harm in trying that first.
+	 * Next, try the UMH upcall.
 	 */
 	nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
 	status = nn->client_tracking_ops->init(net);
@@ -1432,25 +2056,23 @@
 		return status;
 
 	/*
-	 * See if the recoverydir exists and is a directory. If it is,
-	 * then use the legacy ops.
+	 * Finally, see if the recoverydir exists and is a directory.
+	 * If it is, then use the legacy ops.
 	 */
 	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
 	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
 	if (!status) {
 		status = d_is_dir(path.dentry);
 		path_put(&path);
-		if (status)
-			goto do_init;
+		if (!status) {
+			status = -EINVAL;
+			goto out;
+		}
 	}
 
-	/* Finally, try to use nfsdcld */
-	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
-	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
-			"removed in 3.10. Please transition to using "
-			"nfsdcltrack.\n");
 do_init:
 	status = nn->client_tracking_ops->init(net);
+out:
 	if (status) {
 		printk(KERN_WARNING "NFSD: Unable to initialize client "
 				    "recovery tracking! (%d)\n", status);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9c6d1d5..c65aeaa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/jhash.h>
+#include <linux/string_helpers.h>
 #include "xdr4.h"
 #include "xdr4cb.h"
 #include "vfs.h"
@@ -49,6 +50,7 @@
 
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -77,6 +79,7 @@
 /* forward declarations */
 static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
 static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+void nfsd4_end_grace(struct nfsd_net *nn);
 
 /* Locking: */
 
@@ -98,6 +101,13 @@
  */
 static DECLARE_WAIT_QUEUE_HEAD(close_wq);
 
+/*
+ * A waitqueue where a writer to clients/#/ctl destroying a client can
+ * wait for cl_rpc_users to drop to 0 and then for the client to be
+ * unhashed.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(expiry_wq);
+
 static struct kmem_cache *client_slab;
 static struct kmem_cache *openowner_slab;
 static struct kmem_cache *lockowner_slab;
@@ -137,7 +147,7 @@
 
 	if (is_client_expired(clp))
 		return nfserr_expired;
-	atomic_inc(&clp->cl_refcount);
+	atomic_inc(&clp->cl_rpc_users);
 	return nfs_ok;
 }
 
@@ -169,20 +179,24 @@
 
 	lockdep_assert_held(&nn->client_lock);
 
-	if (!atomic_dec_and_test(&clp->cl_refcount))
+	if (!atomic_dec_and_test(&clp->cl_rpc_users))
 		return;
 	if (!is_client_expired(clp))
 		renew_client_locked(clp);
+	else
+		wake_up_all(&expiry_wq);
 }
 
 static void put_client_renew(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
-	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
+	if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock))
 		return;
 	if (!is_client_expired(clp))
 		renew_client_locked(clp);
+	else
+		wake_up_all(&expiry_wq);
 	spin_unlock(&nn->client_lock);
 }
 
@@ -238,7 +252,7 @@
 	}
 	spin_unlock(&nn->blocked_locks_lock);
 	if (found)
-		posix_unblock_lock(&found->nbl_lock);
+		locks_delete_block(&found->nbl_lock);
 	return found;
 }
 
@@ -265,6 +279,7 @@
 static void
 free_blocked_lock(struct nfsd4_blocked_lock *nbl)
 {
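+	/*
+	 * The callers no longer dequeue the blocked lock themselves, so
+	 * make sure it is off the blocker list before freeing it.
+	 */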
+	locks_delete_block(&nbl->nbl_lock);
 	locks_release_private(&nbl->nbl_lock);
 	kfree(nbl);
 }
@@ -293,11 +308,18 @@
 		nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
 					nbl_lru);
 		list_del_init(&nbl->nbl_lru);
-		posix_unblock_lock(&nbl->nbl_lock);
 		free_blocked_lock(nbl);
 	}
 }
 
+static void
+nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
+{
+	struct nfsd4_blocked_lock	*nbl = container_of(cb,
+						struct nfsd4_blocked_lock, nbl_cb);
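+
+	/* dequeue it from the blocker list before notifying the client */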
+	locks_delete_block(&nbl->nbl_lock);
+}
+
 static int
 nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
@@ -325,6 +347,7 @@
 }
 
 static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+	.prepare	= nfsd4_cb_notify_lock_prepare,
 	.done		= nfsd4_cb_notify_lock_done,
 	.release	= nfsd4_cb_notify_lock_release,
 };
@@ -407,18 +430,18 @@
 	}
 }
 
-static struct file *
+static struct nfsd_file *
 __nfs4_get_fd(struct nfs4_file *f, int oflag)
 {
 	if (f->fi_fds[oflag])
-		return get_file(f->fi_fds[oflag]);
+		return nfsd_file_get(f->fi_fds[oflag]);
 	return NULL;
 }
 
-static struct file *
+static struct nfsd_file *
 find_writeable_file_locked(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	lockdep_assert_held(&f->fi_lock);
 
@@ -428,10 +451,10 @@
 	return ret;
 }
 
-static struct file *
+static struct nfsd_file *
 find_writeable_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = find_writeable_file_locked(f);
@@ -440,9 +463,10 @@
 	return ret;
 }
 
-static struct file *find_readable_file_locked(struct nfs4_file *f)
+static struct nfsd_file *
+find_readable_file_locked(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	lockdep_assert_held(&f->fi_lock);
 
@@ -452,10 +476,10 @@
 	return ret;
 }
 
-static struct file *
+static struct nfsd_file *
 find_readable_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = find_readable_file_locked(f);
@@ -464,10 +488,10 @@
 	return ret;
 }
 
-struct file *
+struct nfsd_file *
 find_any_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = __nfs4_get_fd(f, O_RDWR);
@@ -568,17 +592,17 @@
 	might_lock(&fp->fi_lock);
 
 	if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
-		struct file *f1 = NULL;
-		struct file *f2 = NULL;
+		struct nfsd_file *f1 = NULL;
+		struct nfsd_file *f2 = NULL;
 
 		swap(f1, fp->fi_fds[oflag]);
 		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
 			swap(f2, fp->fi_fds[O_RDWR]);
 		spin_unlock(&fp->fi_lock);
 		if (f1)
-			fput(f1);
+			nfsd_file_put(f1);
 		if (f2)
-			fput(f2);
+			nfsd_file_put(f2);
 	}
 }
 
@@ -684,7 +708,8 @@
 
 	idr_preload(GFP_KERNEL);
 	spin_lock(&cl->cl_lock);
-	new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
+	/* Reserving 0 for start of file in nfsdfs "states" file: */
+	new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT);
 	spin_unlock(&cl->cl_lock);
 	idr_preload_end();
 	if (new_id < 0)
@@ -713,6 +738,36 @@
 	return NULL;
 }
 
+/*
+ * Create a unique stateid_t to represent each COPY.
+ */
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
+{
+	int new_id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&nn->s2s_cp_lock);
+	new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT);
+	spin_unlock(&nn->s2s_cp_lock);
+	idr_preload_end();
+	if (new_id < 0)
+		return 0;
+	copy->cp_stateid.si_opaque.so_id = new_id;
+	copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time;
+	copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+	return 1;
+}
+
+void nfs4_free_cp_state(struct nfsd4_copy *copy)
+{
+	struct nfsd_net *nn;
+
+	nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+	spin_lock(&nn->s2s_cp_lock);
+	idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id);
+	spin_unlock(&nn->s2s_cp_lock);
+}
+
 static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 {
 	struct nfs4_stid *stid;
@@ -880,25 +935,25 @@
 
 static void put_deleg_file(struct nfs4_file *fp)
 {
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 
 	spin_lock(&fp->fi_lock);
 	if (--fp->fi_delegees == 0)
-		swap(filp, fp->fi_deleg_file);
+		swap(nf, fp->fi_deleg_file);
 	spin_unlock(&fp->fi_lock);
 
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 }
 
 static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
-	struct file *filp = fp->fi_deleg_file;
+	struct nfsd_file *nf = fp->fi_deleg_file;
 
 	WARN_ON_ONCE(!fp->fi_delegees);
 
-	vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+	vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
 	put_deleg_file(fp);
 }
 
@@ -1028,9 +1083,9 @@
 	return id & CLIENT_HASH_MASK;
 }
 
-static unsigned int clientstr_hashval(const char *name)
+static unsigned int clientstr_hashval(struct xdr_netobj name)
 {
-	return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+	return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK;
 }
 
 /*
@@ -1236,11 +1291,14 @@
 {
 	struct nfs4_ol_stateid *stp = openlockstateid(stid);
 	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
-	struct file *file;
+	struct nfsd_file *nf;
 
-	file = find_any_file(stp->st_stid.sc_file);
-	if (file)
-		filp_close(file, (fl_owner_t)lo);
+	nf = find_any_file(stp->st_stid.sc_file);
+	if (nf) {
+		get_file(nf->nf_file);
+		filp_close(nf->nf_file, (fl_owner_t)lo);
+		nfsd_file_put(nf);
+	}
 	nfs4_free_ol_stateid(stid);
 }
 
@@ -1510,21 +1568,39 @@
  * re-negotiate active sessions and reduce their slot usage to make
  * room for new connections. For now we just fail the create session.
  */
-static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
 {
 	u32 slotsize = slot_bytes(ca);
 	u32 num = ca->maxreqs;
-	int avail;
+	unsigned long avail, total_avail;
+	unsigned int scale_factor;
 
 	spin_lock(&nfsd_drc_lock);
-	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
-		    nfsd_drc_max_mem - nfsd_drc_mem_used);
+	if (nfsd_drc_max_mem > nfsd_drc_mem_used)
+		total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+	else
+		/* We have handed out more space than we chose in
+		 * set_max_drc() to allow.  That isn't really a
+		 * problem as long as that doesn't make us think we
+		 * have lots more due to integer overflow.
+		 */
+		total_avail = 0;
+	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
 	/*
-	 * Never use more than a third of the remaining memory,
-	 * unless it's the only way to give this client a slot:
+	 * Never use more than a fraction of the remaining memory,
+	 * unless it's the only way to give this client a slot.
+	 * The chosen fraction is either 1/8 or 1/number of threads,
+	 * whichever is smaller.  This ensures there are adequate
+	 * slots to support multiple clients per thread.
+	 * Give the client one slot even if that would require
+	 * over-allocation--it is better than failure.
 	 */
-	avail = clamp_t(int, avail, slotsize, avail/3);
+	scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads);
+
+	avail = clamp_t(unsigned long, avail, slotsize,
+			total_avail/scale_factor);
 	num = min_t(int, num, avail / slotsize);
+	num = max_t(int, num, 1);
 	nfsd_drc_mem_used += num * slotsize;
 	spin_unlock(&nfsd_drc_lock);
 
@@ -1804,7 +1880,7 @@
 	clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
 	if (clp == NULL)
 		return NULL;
-	clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
+	xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL);
 	if (clp->cl_name.data == NULL)
 		goto err_no_name;
 	clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE,
@@ -1814,10 +1890,9 @@
 		goto err_no_hashtbl;
 	for (i = 0; i < OWNER_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
-	clp->cl_name.len = name.len;
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	idr_init(&clp->cl_stateids);
-	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_rpc_users, 0);
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
@@ -1827,6 +1902,8 @@
 #ifdef CONFIG_NFSD_PNFS
 	INIT_LIST_HEAD(&clp->cl_lo_states);
 #endif
+	INIT_LIST_HEAD(&clp->async_copies);
+	spin_lock_init(&clp->async_lock);
 	spin_lock_init(&clp->cl_lock);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
@@ -1837,6 +1914,25 @@
 	return NULL;
 }
 
+static void __free_client(struct kref *k)
+{
+	struct nfsdfs_client *c = container_of(k, struct nfsdfs_client, cl_ref);
+	struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs);
+
+	free_svc_cred(&clp->cl_cred);
+	kfree(clp->cl_ownerstr_hashtbl);
+	kfree(clp->cl_name.data);
+	kfree(clp->cl_nii_domain.data);
+	kfree(clp->cl_nii_name.data);
+	idr_destroy(&clp->cl_stateids);
+	kmem_cache_free(client_slab, clp);
+}
+
+static void drop_client(struct nfs4_client *clp)
+{
+	kref_put(&clp->cl_nfsdfs.cl_ref, __free_client);
+}
+
 static void
 free_client(struct nfs4_client *clp)
 {
@@ -1849,11 +1945,12 @@
 		free_session(ses);
 	}
 	rpc_destroy_wait_queue(&clp->cl_cb_waitq);
-	free_svc_cred(&clp->cl_cred);
-	kfree(clp->cl_ownerstr_hashtbl);
-	kfree(clp->cl_name.data);
-	idr_destroy(&clp->cl_stateids);
-	kmem_cache_free(client_slab, clp);
+	if (clp->cl_nfsd_dentry) {
+		nfsd_client_rmdir(clp->cl_nfsd_dentry);
+		clp->cl_nfsd_dentry = NULL;
+		wake_up_all(&expiry_wq);
+	}
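+	/*
+	 * Files still open under the client's nfsdfs directory hold a
+	 * reference; the structure is freed on the final kref put.
+	 */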
+	drop_client(clp);
 }
 
 /* must be called under the client_lock */
@@ -1894,7 +1991,7 @@
 
 static __be32 mark_client_expired_locked(struct nfs4_client *clp)
 {
-	if (atomic_read(&clp->cl_refcount))
+	if (atomic_read(&clp->cl_rpc_users))
 		return nfserr_jukebox;
 	unhash_client_locked(clp);
 	return nfs_ok;
@@ -1942,10 +2039,12 @@
 		}
 	}
 	nfsd4_return_all_client_layouts(clp);
+	nfsd4_shutdown_copy(clp);
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	free_client(clp);
+	wake_up_all(&expiry_wq);
 }
 
 static void
@@ -1955,6 +2054,22 @@
 	__destroy_client(clp);
 }
 
+static void inc_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes)
+		return;
+	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
+		return;
+	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size) {
+		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
+				clp->net->ns.inum);
+		nfsd4_end_grace(nn);
+	}
+}
+
 static void expire_client(struct nfs4_client *clp)
 {
 	unhash_client(clp);
@@ -2006,11 +2121,6 @@
 	return memcmp(o1->data, o2->data, o1->len);
 }
 
-static int same_name(const char *n1, const char *n2)
-{
-	return 0 == memcmp(n1, n2, HEXDIR_LEN);
-}
-
 static int
 same_verf(nfs4_verifier *v1, nfs4_verifier *v2)
 {
@@ -2145,6 +2255,342 @@
 	return s;
 }
 
+static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
+{
+	struct nfsdfs_client *nc;
+	nc = get_nfsdfs_client(inode);
+	if (!nc)
+		return NULL;
+	return container_of(nc, struct nfs4_client, cl_nfsdfs);
+}
+
+static void seq_quote_mem(struct seq_file *m, char *data, int len)
+{
+	seq_printf(m, "\"");
+	seq_escape_mem_ascii(m, data, len);
+	seq_printf(m, "\"");
+}
+
+static int client_info_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct nfs4_client *clp;
+	u64 clid;
+
+	clp = get_nfsdfs_clp(inode);
+	if (!clp)
+		return -ENXIO;
+	memcpy(&clid, &clp->cl_clientid, sizeof(clid));
+	seq_printf(m, "clientid: 0x%llx\n", clid);
+	seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
+	seq_printf(m, "name: ");
+	seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
+	seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
+	if (clp->cl_nii_domain.data) {
+		seq_printf(m, "Implementation domain: ");
+		seq_quote_mem(m, clp->cl_nii_domain.data,
+					clp->cl_nii_domain.len);
+		seq_printf(m, "\nImplementation name: ");
+		seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len);
+		seq_printf(m, "\nImplementation time: [%ld, %ld]\n",
+			clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
+	}
+	drop_client(clp);
+
+	return 0;
+}
+
+static int client_info_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, client_info_show, inode);
+}
+
+static const struct file_operations client_info_fops = {
+	.open		= client_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
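+/*
+ * The "states" seq_file walks the client's stateid idr, using the
+ * stateid's id as the file position; ids are allocated starting at 1,
+ * leaving 0 free to mean start-of-file.
+ */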
+static void *states_start(struct seq_file *s, loff_t *pos)
+	__acquires(&clp->cl_lock)
+{
+	struct nfs4_client *clp = s->private;
+	unsigned long id = *pos;
+	void *ret;
+
+	spin_lock(&clp->cl_lock);
+	ret = idr_get_next_ul(&clp->cl_stateids, &id);
+	*pos = id;
+	return ret;
+}
+
+static void *states_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct nfs4_client *clp = s->private;
+	unsigned long id = *pos;
+	void *ret;
+
+	id++;
+	ret = idr_get_next_ul(&clp->cl_stateids, &id);
+	*pos = id;
+	return ret;
+}
+
+static void states_stop(struct seq_file *s, void *v)
+	__releases(&clp->cl_lock)
+{
+	struct nfs4_client *clp = s->private;
+
+	spin_unlock(&clp->cl_lock);
+}
+
+static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
+{
+	struct inode *inode = f->nf_inode;
+
+	seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
+		   MAJOR(inode->i_sb->s_dev),
+		   MINOR(inode->i_sb->s_dev),
+		   inode->i_ino);
+}
+
+static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
+{
+	seq_printf(s, "owner: ");
+	seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
+}
+
+static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_ol_stateid *ols;
+	struct nfs4_file *nf;
+	struct nfsd_file *file;
+	struct nfs4_stateowner *oo;
+	unsigned int access, deny;
+
+	if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID)
+		return 0; /* XXX: or SEQ_SKIP? */
+	ols = openlockstateid(st);
+	oo = ols->st_stateowner;
+	nf = st->sc_file;
+	file = find_any_file(nf);
+
+	seq_printf(s, "- 0x%16phN: { type: open, ", &st->sc_stateid);
+
+	access = bmap_to_share_mode(ols->st_access_bmap);
+	deny   = bmap_to_share_mode(ols->st_deny_bmap);
+
+	seq_printf(s, "access: \%s\%s, ",
+		access & NFS4_SHARE_ACCESS_READ ? "r" : "-",
+		access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
+	seq_printf(s, "deny: \%s\%s, ",
+		deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
+		deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
+
+	nfs4_show_superblock(s, file);
+	seq_printf(s, ", ");
+	nfs4_show_owner(s, oo);
+	seq_printf(s, " }\n");
+	nfsd_file_put(file);
+
+	return 0;
+}
+
+static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_ol_stateid *ols;
+	struct nfs4_file *nf;
+	struct nfsd_file *file;
+	struct nfs4_stateowner *oo;
+
+	ols = openlockstateid(st);
+	oo = ols->st_stateowner;
+	nf = st->sc_file;
+	file = find_any_file(nf);
+
+	seq_printf(s, "- 0x%16phN: { type: lock, ", &st->sc_stateid);
+
+	/*
+	 * Note: a lock stateid isn't really the same thing as a lock,
+	 * it's the locking state held by one owner on a file, and there
+	 * may be multiple (or no) lock ranges associated with it.
+	 * (The same is true of open stateids.)
+	 */
+
+	nfs4_show_superblock(s, file);
+	/* XXX: open stateid? */
+	seq_printf(s, ", ");
+	nfs4_show_owner(s, oo);
+	seq_printf(s, " }\n");
+	nfsd_file_put(file);
+
+	return 0;
+}
+
+static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_delegation *ds;
+	struct nfs4_file *nf;
+	struct nfsd_file *file;
+
+	ds = delegstateid(st);
+	nf = st->sc_file;
+	file = nf->fi_deleg_file;
+
+	seq_printf(s, "- 0x%16phN: { type: deleg, ", &st->sc_stateid);
+
+	/* Kinda dead code as long as we only support read delegs: */
+	seq_printf(s, "access: %s, ",
+		ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
+
+	/* XXX: lease time, whether it's being recalled. */
+
+	nfs4_show_superblock(s, file);
+	seq_printf(s, " }\n");
+
+	return 0;
+}
+
+static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfsd_file *file;
+
+	ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
+	file = ls->ls_file;
+
+	seq_printf(s, "- 0x%16phN: { type: layout, ", &st->sc_stateid);
+
+	/* XXX: What else would be useful? */
+
+	nfs4_show_superblock(s, file);
+	seq_printf(s, " }\n");
+
+	return 0;
+}
+
+static int states_show(struct seq_file *s, void *v)
+{
+	struct nfs4_stid *st = v;
+
+	switch (st->sc_type) {
+	case NFS4_OPEN_STID:
+		return nfs4_show_open(s, st);
+	case NFS4_LOCK_STID:
+		return nfs4_show_lock(s, st);
+	case NFS4_DELEG_STID:
+		return nfs4_show_deleg(s, st);
+	case NFS4_LAYOUT_STID:
+		return nfs4_show_layout(s, st);
+	default:
+		return 0; /* XXX: or SEQ_SKIP? */
+	}
+	/* XXX: copy stateids? */
+}
+
+static struct seq_operations states_seq_ops = {
+	.start = states_start,
+	.next = states_next,
+	.stop = states_stop,
+	.show = states_show
+};
+
+static int client_states_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *s;
+	struct nfs4_client *clp;
+	int ret;
+
+	clp = get_nfsdfs_clp(inode);
+	if (!clp)
+		return -ENXIO;
+
+	ret = seq_open(file, &states_seq_ops);
+	if (ret)
+		return ret;
+	s = file->private_data;
+	s->private = clp;
+	return 0;
+}
+
+static int client_opens_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct nfs4_client *clp = m->private;
+
+	/* XXX: alternatively, we could get/drop in seq start/stop */
+	drop_client(clp);
+	return 0;
+}
+
+static const struct file_operations client_states_fops = {
+	.open		= client_states_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= client_opens_release,
+};
+
+/*
+ * Normally we refuse to destroy clients that are in use, but here the
+ * administrator is telling us to just do it.  We also want to wait
+ * so the caller has a guarantee that the client's locks are gone by
+ * the time the write returns:
+ */
+static void force_expire_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	bool already_expired;
+
+	spin_lock(&clp->cl_lock);
+	clp->cl_time = 0;
+	spin_unlock(&clp->cl_lock);
+
+	wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
+	spin_lock(&nn->client_lock);
+	already_expired = list_empty(&clp->cl_lru);
+	if (!already_expired)
+		unhash_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+
+	if (!already_expired)
+		expire_client(clp);
+	else
+		wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL);
+}
+
+static ssize_t client_ctl_write(struct file *file, const char __user *buf,
+				   size_t size, loff_t *pos)
+{
+	char *data;
+	struct nfs4_client *clp;
+
+	data = simple_transaction_get(file, buf, size);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+	if (size != 7 || 0 != memcmp(data, "expire\n", 7))
+		return -EINVAL;
+	clp = get_nfsdfs_clp(file_inode(file));
+	if (!clp)
+		return -ENXIO;
+	force_expire_client(clp);
+	drop_client(clp);
+	return 7;
+}
+
+static const struct file_operations client_ctl_fops = {
+	.write		= client_ctl_write,
+	.release	= simple_transaction_release,
+};
+
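+/* contents of the per-client "clients/#/" directory: */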
+static const struct tree_descr client_files[] = {
+	[0] = {"info", &client_info_fops, S_IRUSR},
+	[1] = {"states", &client_states_fops, S_IRUSR},
+	[2] = {"ctl", &client_ctl_fops, S_IRUSR|S_IWUSR},
+	[3] = {""},
+};
+
 static struct nfs4_client *create_client(struct xdr_netobj name,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
@@ -2152,6 +2598,7 @@
 	struct sockaddr *sa = svc_addr(rqstp);
 	int ret;
 	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	clp = alloc_client(name);
 	if (clp == NULL)
@@ -2162,13 +2609,22 @@
 		free_client(clp);
 		return NULL;
 	}
+	gen_clid(clp, nn);
+	kref_init(&clp->cl_nfsdfs.cl_ref);
 	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	copy_verf(clp, verf);
-	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
+	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
 	clp->cl_cb_session = NULL;
 	clp->net = net;
+	clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs,
+			clp->cl_clientid.cl_id - nn->clientid_base,
+			client_files);
+	if (!clp->cl_nfsd_dentry) {
+		free_client(clp);
+		return NULL;
+	}
 	return clp;
 }
 
@@ -2475,7 +2931,24 @@
 		|| !list_empty(&clp->cl_lo_states)
 #endif
 		|| !list_empty(&clp->cl_delegations)
-		|| !list_empty(&clp->cl_sessions);
+		|| !list_empty(&clp->cl_sessions)
+		|| !list_empty(&clp->async_copies);
+}
+
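+/*
+ * Stash the implementation id the client sent in EXCHANGE_ID so it
+ * can be shown in the per-client "info" file.
+ */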
+static __be32 copy_impl_id(struct nfs4_client *clp,
+				struct nfsd4_exchange_id *exid)
+{
+	if (!exid->nii_domain.data)
+		return 0;
+	xdr_netobj_dup(&clp->cl_nii_domain, &exid->nii_domain, GFP_KERNEL);
+	if (!clp->cl_nii_domain.data)
+		return nfserr_jukebox;
+	xdr_netobj_dup(&clp->cl_nii_name, &exid->nii_name, GFP_KERNEL);
+	if (!clp->cl_nii_name.data)
+		return nfserr_jukebox;
+	clp->cl_nii_time.tv_sec = exid->nii_time.tv_sec;
+	clp->cl_nii_time.tv_nsec = exid->nii_time.tv_nsec;
+	return 0;
 }
 
 __be32
@@ -2504,6 +2977,9 @@
 	new = create_client(exid->clname, rqstp, &verf);
 	if (new == NULL)
 		return nfserr_jukebox;
+	status = copy_impl_id(new, exid);
+	if (status)
+		goto out_nolock;
 
 	switch (exid->spa_how) {
 	case SP4_MACH_CRED:
@@ -2542,6 +3018,7 @@
 		break;
 	default:				/* checked by xdr code */
 		WARN_ON_ONCE(1);
+		/* fall through */
 	case SP4_SSV:
 		status = nfserr_encr_alg_unsupp;
 		goto out_nolock;
@@ -2611,7 +3088,6 @@
 	new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
 	new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
 
-	gen_clid(new, nn);
 	add_to_unconfirmed(new);
 	swap(new, conf);
 out_copy:
@@ -2716,10 +3192,10 @@
 	 * performance.  When short on memory we therefore prefer to
 	 * decrease number of slots instead of their size.  Clients that
 	 * request larger slots than they need will get poor results:
+	 * Note that we always allow at least one slot, because our
+	 * accounting is soft and provides no guarantees either way.
 	 */
-	ca->maxreqs = nfsd4_get_drc_mem(ca);
-	if (!ca->maxreqs)
-		return nfserr_jukebox;
+	ca->maxreqs = nfsd4_get_drc_mem(ca, nn);
 
 	return nfs_ok;
 }
@@ -3310,6 +3786,7 @@
 
 	status = nfs_ok;
 	nfsd4_client_record_create(cstate->session->se_client);
+	inc_reclaim_complete(cstate->session->se_client);
 out:
 	return status;
 }
@@ -3354,7 +3831,7 @@
 		copy_clid(new, conf);
 		gen_confirm(new, nn);
 	} else /* case 4 (new client) or cases 2, 3 (client reboot): */
-		gen_clid(new, nn);
+		;
 	new->cl_minorversion = 0;
 	gen_callback(new, setclid, rqstp);
 	add_to_unconfirmed(new);
@@ -3575,12 +4052,11 @@
 	if (!sop)
 		return NULL;
 
-	sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+	xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL);
 	if (!sop->so_owner.data) {
 		kmem_cache_free(slab, sop);
 		return NULL;
 	}
-	sop->so_owner.len = owner->len;
 
 	INIT_LIST_HEAD(&sop->so_stateids);
 	sop->so_client = clp;
@@ -3914,6 +4390,9 @@
 	switch (task->tk_status) {
 	case 0:
 		return 1;
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, 2 * HZ);
+		return 0;
 	case -EBADHANDLE:
 	case -NFS4ERR_BAD_STATEID:
 		/*
@@ -3926,7 +4405,7 @@
 		}
 		/*FALLTHRU*/
 	default:
-		return -1;
+		return 1;
 	}
 }
 
@@ -4032,7 +4511,7 @@
 		spin_unlock(&nn->client_lock);
 		return nfserr_expired;
 	}
-	atomic_inc(&found->cl_refcount);
+	atomic_inc(&found->cl_rpc_users);
 	spin_unlock(&nn->client_lock);
 
 	/* Cache the nfs4_client in cstate! */
@@ -4195,7 +4674,7 @@
 		struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
 		struct nfsd4_open *open)
 {
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	__be32 status;
 	int oflag = nfs4_access_to_omode(open->op_share_access);
 	int access = nfs4_access_to_access(open->op_share_access);
@@ -4231,18 +4710,18 @@
 
 	if (!fp->fi_fds[oflag]) {
 		spin_unlock(&fp->fi_lock);
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
+		status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
 		if (status)
 			goto out_put_access;
 		spin_lock(&fp->fi_lock);
 		if (!fp->fi_fds[oflag]) {
-			fp->fi_fds[oflag] = filp;
-			filp = NULL;
+			fp->fi_fds[oflag] = nf;
+			nf = NULL;
 		}
 	}
 	spin_unlock(&fp->fi_lock);
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status)
@@ -4311,7 +4790,7 @@
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = (fl_owner_t)dp;
 	fl->fl_pid = current->tgid;
-	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file;
+	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
 	return fl;
 }
 
@@ -4321,7 +4800,7 @@
 {
 	int status = 0;
 	struct nfs4_delegation *dp;
-	struct file *filp;
+	struct nfsd_file *nf;
 	struct file_lock *fl;
 
 	/*
@@ -4332,8 +4811,8 @@
 	if (fp->fi_had_conflict)
 		return ERR_PTR(-EAGAIN);
 
-	filp = find_readable_file(fp);
-	if (!filp) {
+	nf = find_readable_file(fp);
+	if (!nf) {
 		/* We should always have a readable file here */
 		WARN_ON_ONCE(1);
 		return ERR_PTR(-EBADF);
@@ -4343,17 +4822,17 @@
 	if (nfs4_delegation_exists(clp, fp))
 		status = -EAGAIN;
 	else if (!fp->fi_deleg_file) {
-		fp->fi_deleg_file = filp;
+		fp->fi_deleg_file = nf;
 		/* increment early to prevent fi_deleg_file from being
 		 * cleared */
 		fp->fi_delegees = 1;
-		filp = NULL;
+		nf = NULL;
 	} else
 		fp->fi_delegees++;
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 	if (status)
 		return ERR_PTR(status);
 
@@ -4366,7 +4845,7 @@
 	if (!fl)
 		goto out_clnt_odstate;
 
-	status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
+	status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
 	if (fl)
 		locks_free_lock(fl);
 	if (status)
@@ -4386,7 +4865,7 @@
 
 	return dp;
 out_unlock:
-	vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp);
+	vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
 out_clnt_odstate:
 	put_clnt_odstate(dp->dl_clnt_odstate);
 	nfs4_put_stid(&dp->dl_stid);
@@ -4669,7 +5148,6 @@
 	if (nn->grace_ended)
 		return;
 
-	dprintk("NFSD: end of grace period\n");
 	nn->grace_ended = true;
 	/*
 	 * If the server goes down again right now, an NFSv4
@@ -4705,6 +5183,10 @@
 	unsigned long double_grace_period_end = nn->boot_time +
 						2 * nn->nfsd4_lease;
 
+	if (nn->track_reclaim_completes &&
+			atomic_read(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size)
+		return false;
 	if (!nn->somebody_reclaimed)
 		return false;
 	nn->somebody_reclaimed = false;
@@ -4735,6 +5217,7 @@
 		new_timeo = 0;
 		goto out;
 	}
+	dprintk("NFSD: end of grace period\n");
 	nfsd4_end_grace(nn);
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&nn->client_lock);
@@ -4829,7 +5312,6 @@
 		nbl = list_first_entry(&reaplist,
 					struct nfsd4_blocked_lock, nbl_lru);
 		list_del_init(&nbl->nbl_lru);
-		posix_unblock_lock(&nbl->nbl_lock);
 		free_blocked_lock(nbl);
 	}
 out:
@@ -5054,7 +5536,7 @@
 	return nfs_ok;
 }
 
-static struct file *
+static struct nfsd_file *
 nfs4_find_file(struct nfs4_stid *s, int flags)
 {
 	if (!s)
@@ -5064,7 +5546,7 @@
 	case NFS4_DELEG_STID:
 		if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
 			return NULL;
-		return get_file(s->sc_file->fi_deleg_file);
+		return nfsd_file_get(s->sc_file->fi_deleg_file);
 	case NFS4_OPEN_STID:
 	case NFS4_LOCK_STID:
 		if (flags & RD_STATE)
@@ -5078,7 +5560,7 @@
 }
 
 static __be32
-nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags)
+nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags)
 {
 	__be32 status;
 
@@ -5090,32 +5572,28 @@
 
 static __be32
 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
-		struct file **filpp, bool *tmp_file, int flags)
+		struct nfsd_file **nfp, int flags)
 {
 	int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
-	struct file *file;
+	struct nfsd_file *nf;
 	__be32 status;
 
-	file = nfs4_find_file(s, flags);
-	if (file) {
+	nf = nfs4_find_file(s, flags);
+	if (nf) {
 		status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
 				acc | NFSD_MAY_OWNER_OVERRIDE);
 		if (status) {
-			fput(file);
-			return status;
+			nfsd_file_put(nf);
+			goto out;
 		}
-
-		*filpp = file;
 	} else {
-		status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp);
+		status = nfsd_file_acquire(rqstp, fhp, acc, &nf);
 		if (status)
 			return status;
-
-		if (tmp_file)
-			*tmp_file = true;
 	}
-
-	return 0;
+	*nfp = nf;
+out:
+	return status;
 }
 
 /*
@@ -5124,7 +5602,7 @@
 __be32
 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
-		stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
+		stateid_t *stateid, int flags, struct nfsd_file **nfp)
 {
 	struct inode *ino = d_inode(fhp->fh_dentry);
 	struct net *net = SVC_NET(rqstp);
@@ -5132,10 +5610,8 @@
 	struct nfs4_stid *s = NULL;
 	__be32 status;
 
-	if (filpp)
-		*filpp = NULL;
-	if (tmp_file)
-		*tmp_file = false;
+	if (nfp)
+		*nfp = NULL;
 
 	if (grace_disallows_io(net, ino))
 		return nfserr_grace;
@@ -5161,7 +5637,7 @@
 		break;
 	case NFS4_OPEN_STID:
 	case NFS4_LOCK_STID:
-		status = nfs4_check_olstateid(fhp, openlockstateid(s), flags);
+		status = nfs4_check_olstateid(openlockstateid(s), flags);
 		break;
 	default:
 		status = nfserr_bad_stateid;
@@ -5172,8 +5648,8 @@
 	status = nfs4_check_fh(fhp, s);
 
 done:
-	if (!status && filpp)
-		status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags);
+	if (status == nfs_ok && nfp)
+		status = nfs4_check_file(rqstp, fhp, s, nfp, flags);
 out:
 	if (s)
 		nfs4_put_stid(s);
@@ -5662,12 +6138,11 @@
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		lo = (struct nfs4_lockowner *) fl->fl_owner;
-		deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
-					lo->lo_owner.so_owner.len, GFP_KERNEL);
+		xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
+						GFP_KERNEL);
 		if (!deny->ld_owner.data)
 			/* We just don't care that much */
 			goto nevermind;
-		deny->ld_owner.len = lo->lo_owner.so_owner.len;
 		deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
 	} else {
 nevermind:
@@ -5934,7 +6409,7 @@
 	struct nfs4_ol_stateid *lock_stp = NULL;
 	struct nfs4_ol_stateid *open_stp = NULL;
 	struct nfs4_file *fp;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
@@ -6016,8 +6491,8 @@
 			/* Fallthrough */
 		case NFS4_READ_LT:
 			spin_lock(&fp->fi_lock);
-			filp = find_readable_file_locked(fp);
-			if (filp)
+			nf = find_readable_file_locked(fp);
+			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			spin_unlock(&fp->fi_lock);
 			fl_type = F_RDLCK;
@@ -6028,8 +6503,8 @@
 			/* Fallthrough */
 		case NFS4_WRITE_LT:
 			spin_lock(&fp->fi_lock);
-			filp = find_writeable_file_locked(fp);
-			if (filp)
+			nf = find_writeable_file_locked(fp);
+			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			spin_unlock(&fp->fi_lock);
 			fl_type = F_WRLCK;
@@ -6039,7 +6514,7 @@
 		goto out;
 	}
 
-	if (!filp) {
+	if (!nf) {
 		status = nfserr_openmode;
 		goto out;
 	}
@@ -6055,7 +6530,7 @@
 	file_lock->fl_type = fl_type;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
 	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = filp;
+	file_lock->fl_file = nf->nf_file;
 	file_lock->fl_flags = fl_flags;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = lock->lk_offset;
@@ -6077,7 +6552,7 @@
 		spin_unlock(&nn->blocked_locks_lock);
 	}
 
-	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
+	err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock);
 	switch (err) {
 	case 0: /* success! */
 		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
@@ -6112,8 +6587,8 @@
 		}
 		free_blocked_lock(nbl);
 	}
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 	if (lock_stp) {
 		/* Bump seqid manually if the 4.0 replay owner is openowner */
 		if (cstate->replay_owner &&
@@ -6148,11 +6623,11 @@
  */
 static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
 {
-	struct file *file;
-	__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	struct nfsd_file *nf;
+	__be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
 	if (!err) {
-		err = nfserrno(vfs_test_lock(file, lock));
-		fput(file);
+		err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+		nfsd_file_put(nf);
 	}
 	return err;
 }
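
The lock paths above all follow the same conversion: a raw struct file obtained with nfsd_open() and released with fput() becomes a cached struct nfsd_file obtained with nfsd_file_acquire() and released with nfsd_file_put(). A minimal userspace model of the refcounted-handle lifetime this relies on (nf_acquire/nf_get/nf_put and struct nf_handle are illustrative names, not the kernel API):

	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Stand-in for struct nfsd_file: an open fd plus a reference count. */
	struct nf_handle {
		int fd;
		int refcount;
	};

	static struct nf_handle *nf_acquire(const char *path, int flags)
	{
		struct nf_handle *nf = malloc(sizeof(*nf));

		if (!nf)
			return NULL;
		nf->fd = open(path, flags);
		if (nf->fd < 0) {
			free(nf);
			return NULL;
		}
		nf->refcount = 1;
		return nf;
	}

	static void nf_get(struct nf_handle *nf)
	{
		nf->refcount++;			/* like nfsd_file_get() */
	}

	static void nf_put(struct nf_handle *nf)
	{
		if (--nf->refcount == 0) {	/* like nfsd_file_put() */
			close(nf->fd);
			free(nf);
		}
	}

	int main(void)
	{
		struct nf_handle *nf = nf_acquire("/dev/null", O_RDONLY);

		if (!nf)
			return 1;
		nf_get(nf);	/* a second user, e.g. a pending lock */
		nf_put(nf);	/* first user done; fd stays open */
		nf_put(nf);	/* last reference gone; fd closed */
		return 0;
	}

The point of the file cache is that the put side need not actually close anything; the real nfsd_file_put() can leave the underlying file open for reuse by the next request.
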
@@ -6196,15 +6671,15 @@
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
 			file_lock->fl_type = F_RDLCK;
-		break;
+			break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
 			file_lock->fl_type = F_WRLCK;
-		break;
+			break;
 		default:
 			dprintk("NFSD: nfs4_lockt: bad lock type!\n");
 			status = nfserr_inval;
-		goto out;
+			goto out;
 	}
 
 	lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
@@ -6240,7 +6715,7 @@
 {
 	struct nfsd4_locku *locku = &u->locku;
 	struct nfs4_ol_stateid *stp;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	struct file_lock *file_lock = NULL;
 	__be32 status;
 	int err;
@@ -6258,8 +6733,8 @@
 					&stp, nn);
 	if (status)
 		goto out;
-	filp = find_any_file(stp->st_stid.sc_file);
-	if (!filp) {
+	nf = find_any_file(stp->st_stid.sc_file);
+	if (!nf) {
 		status = nfserr_lock_range;
 		goto put_stateid;
 	}
@@ -6267,13 +6742,13 @@
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
-		goto fput;
+		goto put_file;
 	}
 
 	file_lock->fl_type = F_UNLCK;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
 	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = filp;
+	file_lock->fl_file = nf->nf_file;
 	file_lock->fl_flags = FL_POSIX;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = locku->lu_offset;
@@ -6282,14 +6757,14 @@
 						locku->lu_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
+	err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL);
 	if (err) {
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
 	nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
-fput:
-	fput(filp);
+put_file:
+	nfsd_file_put(nf);
 put_stateid:
 	mutex_unlock(&stp->st_mutex);
 	nfs4_put_stid(&stp->st_stid);
@@ -6301,7 +6776,7 @@
 
 out_nfserr:
 	status = nfserrno(err);
-	goto fput;
+	goto put_file;
 }
 
 /*
@@ -6314,17 +6789,17 @@
 {
 	struct file_lock *fl;
 	int status = false;
-	struct file *filp = find_any_file(fp);
+	struct nfsd_file *nf = find_any_file(fp);
 	struct inode *inode;
 	struct file_lock_context *flctx;
 
-	if (!filp) {
+	if (!nf) {
 		/* Any valid lock stateid should have some sort of access */
 		WARN_ON_ONCE(1);
 		return status;
 	}
 
-	inode = locks_inode(filp);
+	inode = locks_inode(nf->nf_file);
 	flctx = inode->i_flctx;
 
 	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
@@ -6337,7 +6812,7 @@
 		}
 		spin_unlock(&flctx->flc_lock);
 	}
-	fput(filp);
+	nfsd_file_put(nf);
 	return status;
 }
 
@@ -6415,7 +6890,7 @@
 }
 
 bool
-nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
+nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
 {
 	struct nfs4_client_reclaim *crp;
 
@@ -6425,20 +6900,27 @@
 
 /*
  * failure => all reset bets are off, nfserr_no_grace...
+ *
+ * The caller is responsible for freeing name.data if NULL is returned (it
+ * will be freed in nfs4_remove_reclaim_record in the normal case).
  */
 struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
+nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
+		struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp;
 
-	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
+	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", name.len, name.data);
 	crp = alloc_reclaim();
 	if (crp) {
 		strhashval = clientstr_hashval(name);
 		INIT_LIST_HEAD(&crp->cr_strhash);
 		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
-		memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+		crp->cr_name.data = name.data;
+		crp->cr_name.len = name.len;
+		crp->cr_princhash.data = princhash.data;
+		crp->cr_princhash.len = princhash.len;
 		crp->cr_clp = NULL;
 		nn->reclaim_str_hashtbl_size++;
 	}
@@ -6449,6 +6931,8 @@
 nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
 {
 	list_del(&crp->cr_strhash);
+	kfree(crp->cr_name.data);
+	kfree(crp->cr_princhash.data);
 	kfree(crp);
 	nn->reclaim_str_hashtbl_size--;
 }
@@ -6472,16 +6956,16 @@
 /*
  * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
 struct nfs4_client_reclaim *
-nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
+nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp = NULL;
 
-	dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
+	dprintk("NFSD: nfs4_find_reclaim_client for name %.*s\n", name.len, name.data);
 
-	strhashval = clientstr_hashval(recdir);
+	strhashval = clientstr_hashval(name);
 	list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
-		if (same_name(crp->cr_recdir, recdir)) {
+		if (compare_blob(&crp->cr_name, &name) == 0) {
 			return crp;
 		}
 	}
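
Reclaim records are now keyed by a variable-length xdr_netobj rather than a fixed HEXDIR_LEN string, so the lookup above uses compare_blob() where same_name() used to do a bounded string compare. A plausible standalone rendering of such a blob comparison, ordering by length first and then bytewise, which gives one consistent total order (the in-kernel helper may differ in detail):

	#include <stdio.h>
	#include <string.h>

	struct netobj {
		unsigned int len;
		const unsigned char *data;
	};

	static int blob_cmp(const struct netobj *a, const struct netobj *b)
	{
		if (a->len != b->len)
			return a->len < b->len ? -1 : 1;
		return memcmp(a->data, b->data, a->len);
	}

	int main(void)
	{
		struct netobj x = { 4, (const unsigned char *)"abcd" };
		struct netobj y = { 4, (const unsigned char *)"abce" };

		printf("%d\n", blob_cmp(&x, &y) < 0);	/* 1: x orders first */
		return 0;
	}
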
@@ -6516,7 +7000,7 @@
 static inline void
 put_client(struct nfs4_client *clp)
 {
-	atomic_dec(&clp->cl_refcount);
+	atomic_dec(&clp->cl_rpc_users);
 }
 
 static struct nfs4_client *
@@ -6634,7 +7118,7 @@
 		return;
 
 	lockdep_assert_held(&nn->client_lock);
-	atomic_inc(&clp->cl_refcount);
+	atomic_inc(&clp->cl_rpc_users);
 	list_add(&lst->st_locks, collect);
 }
 
@@ -6663,7 +7147,7 @@
 				 * Despite the fact that these functions deal
 				 * with 64-bit integers for "count", we must
 				 * ensure that it doesn't blow up the
-				 * clp->cl_refcount. Throw a warning if we
+				 * clp->cl_rpc_users. Throw a warning if we
 				 * start to approach INT_MAX here.
 				 */
 				WARN_ON_ONCE(count == (INT_MAX / 2));
@@ -6787,7 +7271,7 @@
 		if (func) {
 			func(oop);
 			if (collect) {
-				atomic_inc(&clp->cl_refcount);
+				atomic_inc(&clp->cl_rpc_users);
 				list_add(&oop->oo_perclient, collect);
 			}
 		}
@@ -6795,7 +7279,7 @@
 		/*
 		 * Despite the fact that these functions deal with
 		 * 64-bit integers for "count", we must ensure that
-		 * it doesn't blow up the clp->cl_refcount. Throw a
+		 * it doesn't blow up the clp->cl_rpc_users. Throw a
 		 * warning if we start to approach INT_MAX here.
 		 */
 		WARN_ON_ONCE(count == (INT_MAX / 2));
@@ -6925,7 +7409,7 @@
 			if (dp->dl_time != 0)
 				continue;
 
-			atomic_inc(&clp->cl_refcount);
+			atomic_inc(&clp->cl_rpc_users);
 			WARN_ON(!unhash_delegation_locked(dp));
 			list_add(&dp->dl_recall_lru, victims);
 		}
@@ -6933,7 +7417,7 @@
 		/*
 		 * Despite the fact that these functions deal with
 		 * 64-bit integers for "count", we must ensure that
-		 * it doesn't blow up the clp->cl_refcount. Throw a
+		 * it doesn't blow up the clp->cl_rpc_users. Throw a
 		 * warning if we start to approach INT_MAX here.
 		 */
 		WARN_ON_ONCE(count == (INT_MAX / 2));
@@ -7160,6 +7644,8 @@
 	INIT_LIST_HEAD(&nn->close_lru);
 	INIT_LIST_HEAD(&nn->del_recall_lru);
 	spin_lock_init(&nn->client_lock);
+	spin_lock_init(&nn->s2s_cp_lock);
+	idr_init(&nn->s2s_cp_stateids);
 
 	spin_lock_init(&nn->blocked_locks_lock);
 	INIT_LIST_HEAD(&nn->blocked_locks_lru);
@@ -7217,10 +7703,19 @@
 		return ret;
 	locks_start_grace(net, &nn->nfsd4_manager);
 	nfsd4_client_tracking_init(net);
+	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
+		goto skip_grace;
 	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
 	       nn->nfsd4_grace, net->ns.inum);
 	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
 	return 0;
+
+skip_grace:
+	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
+			net->ns.inum);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
+	nfsd4_end_grace(nn);
+	return 0;
 }
 
 /* initialization to perform when the nfsd service is started: */
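
The skip_grace exit added above encodes a simple observation: the grace period exists only so surviving clients can reclaim their state, so if reclaim completion is being tracked and the recovery database lists no clients, grace can be lifted immediately. A toy model of that decision (function and parameter names are illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	static bool can_skip_grace(bool track_reclaim_completes,
				   unsigned int reclaim_entries)
	{
		/* Only safe when we can positively prove nobody has state
		 * to reclaim; without tracking, assume someone might. */
		return track_reclaim_completes && reclaim_entries == 0;
	}

	int main(void)
	{
		printf("%d\n", can_skip_grace(true, 0));	/* 1: lift grace now */
		printf("%d\n", can_skip_grace(true, 3));	/* 0: wait for clients */
		printf("%d\n", can_skip_grace(false, 0));	/* 0: no evidence */
		return 0;
	}
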
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 418fa9c..533d0fc 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -49,6 +49,7 @@
 #include "cache.h"
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -203,6 +204,13 @@
 	return p;
 }
 
+static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
+{
+	unsigned int this = (char *)argp->end - (char *)argp->p;
+
+	return this + argp->pagelen;
+}
+
 static int zero_clientid(clientid_t *clid)
 {
 	return (clid->cl_boot == 0) && (clid->cl_id == 0);
@@ -211,10 +219,10 @@
 /**
  * svcxdr_tmpalloc - allocate memory to be freed after compound processing
  * @argp: NFSv4 compound argument structure
- * @p: pointer to be freed (with kfree())
+ * @len: length of buffer to allocate
  *
- * Marks @p to be freed when processing the compound operation
- * described in @argp finishes.
+ * Allocates a buffer of size @len to be freed when processing the compound
+ * operation described in @argp finishes.
  */
 static void *
 svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
@@ -269,19 +277,13 @@
 	return ret;
 }
 
-/*
- * We require the high 32 bits of 'seconds' to be 0, and
- * we ignore all 32 bits of 'nseconds'.
- */
 static __be32
-nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
 {
 	DECODE_HEAD;
-	u64 sec;
 
 	READ_BUF(12);
-	p = xdr_decode_hyper(p, &sec);
-	tv->tv_sec = sec;
+	p = xdr_decode_hyper(p, &tv->tv_sec);
 	tv->tv_nsec = be32_to_cpup(p++);
 	if (tv->tv_nsec >= (u32)1000000000)
 		return nfserr_inval;
@@ -320,7 +322,6 @@
 		   struct iattr *iattr, struct nfs4_acl **acl,
 		   struct xdr_netobj *label, int *umask)
 {
-	struct timespec ts;
 	int expected_len, len = 0;
 	u32 dummy32;
 	char *buf;
@@ -354,7 +355,12 @@
 		READ_BUF(4); len += 4;
 		nace = be32_to_cpup(p++);
 
-		if (nace > NFS4_ACL_MAX)
+		if (nace > compoundargs_bytes_left(argp)/20)
+			/*
+			 * Even with 4-byte names there wouldn't be
+			 * space for that many aces; something fishy is
+			 * going on:
+			 */
 			return nfserr_fbig;
 
 		*acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
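
The old bound was a fixed NFS4_ACL_MAX; the new one is derived from how much of the request is actually left, since per the comment above any well-formed ACE costs at least 20 XDR bytes (type, flag, access mask, who-length, and at least one 4-byte word of who data). A self-contained version of that plausibility check:

	#include <stdbool.h>
	#include <stdio.h>

	#define ACE_MIN_XDR_BYTES 20	/* 4 words of fields + 4-byte who string */

	/* Reject a claimed ACE count that could not possibly fit in the bytes
	 * remaining in the request, the idea behind compoundargs_bytes_left(). */
	static bool ace_count_plausible(unsigned int nace, unsigned int bytes_left)
	{
		return nace <= bytes_left / ACE_MIN_XDR_BYTES;
	}

	int main(void)
	{
		printf("%d\n", ace_count_plausible(3, 1024));		/* 1: fine */
		printf("%d\n", ace_count_plausible(1u << 30, 1024));	/* 0: fishy */
		return 0;
	}

Checking the count before calling svcxdr_tmpalloc() matters because nfs4_acl_bytes(nace) is attacker-influenced; without the bound, a bogus count could demand an enormous allocation.
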
@@ -422,8 +428,7 @@
 		switch (dummy32) {
 		case NFS4_SET_TO_CLIENT_TIME:
 			len += 12;
-			status = nfsd4_decode_time(argp, &ts);
-			iattr->ia_atime = timespec_to_timespec64(ts);
+			status = nfsd4_decode_time(argp, &iattr->ia_atime);
 			if (status)
 				return status;
 			iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
@@ -442,8 +447,7 @@
 		switch (dummy32) {
 		case NFS4_SET_TO_CLIENT_TIME:
 			len += 12;
-			status = nfsd4_decode_time(argp, &ts);
-			iattr->ia_mtime = timespec_to_timespec64(ts);
+			status = nfsd4_decode_time(argp, &iattr->ia_mtime);
 			if (status)
 				return status;
 			iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
@@ -521,6 +525,7 @@
 static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
 {
 	DECODE_HEAD;
+	struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
 	u32 dummy, uid, gid;
 	char *machine_name;
 	int i;
@@ -563,8 +568,8 @@
 			dummy = be32_to_cpup(p++);
 			READ_BUF(dummy * 4);
 			if (cbs->flavor == (u32)(-1)) {
-				kuid_t kuid = make_kuid(&init_user_ns, uid);
-				kgid_t kgid = make_kgid(&init_user_ns, gid);
+				kuid_t kuid = make_kuid(userns, uid);
+				kgid_t kgid = make_kgid(userns, gid);
 				if (uid_valid(kuid) && gid_valid(kgid)) {
 					cbs->uid = kuid;
 					cbs->gid = kgid;
@@ -1397,7 +1402,6 @@
 		goto xdr_error;
 	}
 
-	/* Ignore Implementation ID */
 	READ_BUF(4);    /* nfs_impl_id4 array length */
 	dummy = be32_to_cpup(p++);
 
@@ -1405,21 +1409,19 @@
 		goto xdr_error;
 
 	if (dummy == 1) {
-		/* nii_domain */
-		READ_BUF(4);
-		dummy = be32_to_cpup(p++);
-		READ_BUF(dummy);
-		p += XDR_QUADLEN(dummy);
+		status = nfsd4_decode_opaque(argp, &exid->nii_domain);
+		if (status)
+			goto xdr_error;
 
 		/* nii_name */
-		READ_BUF(4);
-		dummy = be32_to_cpup(p++);
-		READ_BUF(dummy);
-		p += XDR_QUADLEN(dummy);
+		status = nfsd4_decode_opaque(argp, &exid->nii_name);
+		if (status)
+			goto xdr_error;
 
 		/* nii_date */
-		READ_BUF(12);
-		p += 3;
+		status = nfsd4_decode_time(argp, &exid->nii_time);
+		if (status)
+			goto xdr_error;
 	}
 	DECODE_TAIL;
 }
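
Instead of skipping the client-supplied nfs_impl_id4, EXCHANGE_ID now keeps nii_domain, nii_name and nii_time. Each of the two strings is a standard XDR opaque: a 4-byte big-endian length followed by data padded to a 4-byte boundary. A minimal sketch of such a decoder in plain C, not the kernel's READ_BUF machinery:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <arpa/inet.h>

	/* Returns bytes consumed, or 0 if the buffer cannot hold the claim. */
	static size_t xdr_read_opaque(const uint8_t *buf, size_t buflen,
				      const uint8_t **data, uint32_t *len)
	{
		uint32_t n;
		size_t padded;

		if (buflen < 4)
			return 0;
		memcpy(&n, buf, 4);
		n = ntohl(n);
		if (n > buflen - 4)		/* bounds-check before padding */
			return 0;
		padded = ((size_t)n + 3) & ~(size_t)3;
		if (padded > buflen - 4)
			return 0;
		*data = buf + 4;
		*len = n;
		return 4 + padded;
	}

	int main(void)
	{
		const uint8_t wire[] = { 0, 0, 0, 5, 'h', 'e', 'l', 'l', 'o', 0, 0, 0 };
		const uint8_t *d;
		uint32_t len;

		if (xdr_read_opaque(wire, sizeof(wire), &d, &len))
			printf("%.*s\n", (int)len, (const char *)d);	/* hello */
		return 0;
	}
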
@@ -1429,7 +1431,6 @@
 			    struct nfsd4_create_session *sess)
 {
 	DECODE_HEAD;
-	u32 dummy;
 
 	READ_BUF(16);
 	COPYMEM(&sess->clientid, 8);
@@ -1438,7 +1439,7 @@
 
 	/* Fore channel attrs */
 	READ_BUF(28);
-	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	p++; /* headerpadsz is always 0 */
 	sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
 	sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
 	sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1455,7 +1456,7 @@
 
 	/* Back channel attrs */
 	READ_BUF(28);
-	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	p++; /* headerpadsz is always 0 */
 	sess->back_channel.maxreq_sz = be32_to_cpup(p++);
 	sess->back_channel.maxresp_sz = be32_to_cpup(p++);
 	sess->back_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1747,7 +1748,6 @@
 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 {
 	DECODE_HEAD;
-	unsigned int tmp;
 
 	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
 	if (status)
@@ -1762,12 +1762,19 @@
 	p = xdr_decode_hyper(p, &copy->cp_count);
 	p++; /* ca_consecutive: we always do consecutive copies */
 	copy->cp_synchronous = be32_to_cpup(p++);
-	tmp = be32_to_cpup(p); /* Source server list not supported */
+	/* tmp = be32_to_cpup(p); Source server list not supported */
 
 	DECODE_TAIL;
 }
 
 static __be32
+nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_offload_status *os)
+{
+	return nfsd4_decode_stateid(argp, &os->stateid);
+}
+
+static __be32
 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
 	DECODE_HEAD;
@@ -1873,8 +1880,8 @@
 	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTERROR]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTSTATS]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_offload_status,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_offload_status,
 	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2413,8 +2420,10 @@
 	__be32 status;
 	int err;
 	struct nfs4_acl *acl = NULL;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 	void *context = NULL;
 	int contextlen;
+#endif
 	bool contextsupport = false;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
@@ -2899,12 +2908,14 @@
 			*p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA);
 	}
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
 		status = nfsd4_encode_security_label(xdr, rqstp, context,
 								contextlen);
 		if (status)
 			goto out;
 	}
+#endif
 
 	attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
 	write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
@@ -3217,9 +3228,8 @@
 	if (!p)
 		return nfserr_resource;
 	encode_cinfo(p, &create->cr_cinfo);
-	nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+	return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
 			create->cr_bmval[1], create->cr_bmval[2]);
-	return 0;
 }
 
 static __be32
@@ -3462,7 +3472,7 @@
 
 	len = maxcount;
 	nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
-				  file, read->rd_offset, &maxcount);
+				  file, read->rd_offset, &maxcount, &eof);
 	read->rd_length = maxcount;
 	if (nfserr) {
 		/*
@@ -3474,9 +3484,6 @@
 		return nfserr;
 	}
 
-	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
-				d_inode(read->rd_fhp->fh_dentry)->i_size);
-
 	*(p++) = htonl(eof);
 	*(p++) = htonl(maxcount);
 
@@ -3547,15 +3554,13 @@
 
 	len = maxcount;
 	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount);
+			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
+			    &eof);
 	read->rd_length = maxcount;
 	if (nfserr)
 		return nfserr;
 	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
 
-	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
-				d_inode(read->rd_fhp->fh_dentry)->i_size);
-
 	tmp = htonl(eof);
 	write_bytes_to_xdr_buf(xdr->buf, starting_len    , &tmp, 4);
 	tmp = htonl(maxcount);
@@ -3574,11 +3579,14 @@
 {
 	unsigned long maxcount;
 	struct xdr_stream *xdr = &resp->xdr;
-	struct file *file = read->rd_filp;
+	struct file *file;
 	int starting_len = xdr->buf->len;
-	struct raparms *ra = NULL;
 	__be32 *p;
 
+	if (nfserr)
+		return nfserr;
+	file = read->rd_nf->nf_file;
+
 	p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
 	if (!p) {
 		WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
@@ -3596,18 +3604,12 @@
 			 (xdr->buf->buflen - xdr->buf->len));
 	maxcount = min_t(unsigned long, maxcount, read->rd_length);
 
-	if (read->rd_tmp_file)
-		ra = nfsd_init_raparms(file);
-
 	if (file->f_op->splice_read &&
 	    test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
 		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
 	else
 		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
 
-	if (ra)
-		nfsd_put_raparams(file, ra);
-
 	if (nfserr)
 		xdr_truncate_encode(xdr, starting_len);
 
@@ -4224,15 +4226,27 @@
 #endif /* CONFIG_NFSD_PNFS */
 
 static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+		struct nfsd42_write_res *write, bool sync)
 {
 	__be32 *p;
-
-	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+	p = xdr_reserve_space(&resp->xdr, 4);
 	if (!p)
 		return nfserr_resource;
 
-	*p++ = cpu_to_be32(0);
+	if (sync)
+		*p++ = cpu_to_be32(0);
+	else {
+		__be32 nfserr;
+		*p++ = cpu_to_be32(1);
+		nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+		if (nfserr)
+			return nfserr;
+	}
+	p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
 	p = xdr_encode_hyper(p, write->wr_bytes_written);
 	*p++ = cpu_to_be32(write->wr_stable_how);
 	p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
@@ -4246,7 +4260,8 @@
 {
 	__be32 *p;
 
-	nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+	nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+			copy->cp_synchronous);
 	if (nfserr)
 		return nfserr;
 
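The COPY reply change is easiest to see on the wire: write_response4 starts with a wr_callback_id<1> array, which the old code always encoded as empty (synchronous copies only). Now an async copy encodes exactly one stateid there, which the client later matches against CB_OFFLOAD. A descriptive sketch of the two shapes (names here are mine, not the kernel structures):

	#include <stdint.h>
	#include <stdio.h>

	static void emit_write_res(int synchronous, uint64_t bytes_written)
	{
		if (synchronous)
			printf("wr_callback_id: []");		/* count = 0 */
		else
			printf("wr_callback_id: [stateid]");	/* count = 1 */
		printf("  bytes=%llu + stable_how + verifier\n",
		       (unsigned long long)bytes_written);
	}

	int main(void)
	{
		emit_write_res(1, 65536);
		emit_write_res(0, 65536);
		return 0;
	}

That is why the encoder above reserves 4 bytes first, branches on sync, and only then reserves the fixed-size tail.
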
@@ -4257,6 +4272,22 @@
 }
 
 static __be32
+nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+			    struct nfsd4_offload_status *os)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 8 + 4);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_hyper(p, os->count);
+	*p++ = cpu_to_be32(0);
+
+	return nfserr;
+}
+
+static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_seek *seek)
 {
@@ -4359,7 +4390,7 @@
 	[OP_LAYOUTERROR]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTSTATS]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OFFLOAD_CANCEL]	= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_offload_status,
 	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index dbdeb9d..96352ab 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -9,6 +9,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sunrpc/addr.h>
@@ -30,52 +31,17 @@
 #define TARGET_BUCKET_SIZE	64
 
 struct nfsd_drc_bucket {
+	struct rb_root rb_head;
 	struct list_head lru_head;
 	spinlock_t cache_lock;
 };
 
-static struct nfsd_drc_bucket	*drc_hashtbl;
-static struct kmem_cache	*drc_slab;
-
-/* max number of entries allowed in the cache */
-static unsigned int		max_drc_entries;
-
-/* number of significant bits in the hash value */
-static unsigned int		maskbits;
-static unsigned int		drc_hashsize;
-
-/*
- * Stats and other tracking of on the duplicate reply cache. All of these and
- * the "rc" fields in nfsdstats are protected by the cache_lock
- */
-
-/* total number of entries */
-static atomic_t			num_drc_entries;
-
-/* cache misses due only to checksum comparison failures */
-static unsigned int		payload_misses;
-
-/* amount of memory (in bytes) currently consumed by the DRC */
-static unsigned int		drc_mem_usage;
-
-/* longest hash chain seen */
-static unsigned int		longest_chain;
-
-/* size of cache when we saw the longest hash chain */
-static unsigned int		longest_chain_cachesize;
-
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc);
 
-static struct shrinker nfsd_reply_cache_shrinker = {
-	.scan_objects = nfsd_reply_cache_scan,
-	.count_objects = nfsd_reply_cache_count,
-	.seeks	= 1,
-};
-
 /*
  * Put a cap on the size of the DRC based on the amount of available
  * low memory in the machine.
@@ -93,12 +59,15 @@
  * ...with a hard cap of 256k entries. In the worst case, each entry will be
  * ~1k, so the above numbers should give a rough max of the amount of memory
  * used in k.
+ *
+ * XXX: these limits are per-container, so memory used will increase
+ * linearly with the number of containers.  Maybe that's OK.
  */
 static unsigned int
 nfsd_cache_size_limit(void)
 {
 	unsigned int limit;
-	unsigned long low_pages = totalram_pages - totalhigh_pages;
+	unsigned long low_pages = totalram_pages() - totalhigh_pages();
 
 	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
 	return min_t(unsigned int, limit, 256*1024);
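
nfsd_cache_size_limit() is unchanged in shape, only ported to the totalram_pages()/totalhigh_pages() accessors: the DRC grows with the square root of low memory and is clamped at 256k entries. A userspace rendering of the arithmetic (PAGE_SHIFT fixed at 12 here as an assumption; build with -lm):

	#include <math.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12			/* assume 4 KiB pages */

	static unsigned int cache_size_limit(unsigned long low_pages)
	{
		unsigned int limit =
			(16 * (unsigned int)sqrt((double)low_pages))
				<< (PAGE_SHIFT - 10);

		return limit < 256 * 1024 ? limit : 256 * 1024;
	}

	int main(void)
	{
		/* 4 GiB of low memory = 1Mi pages, sqrt = 1024: 64k entries. */
		printf("%u\n", cache_size_limit(1024UL * 1024));	/* 65536 */
		return 0;
	}

As the new XXX comment notes, this limit now applies per net namespace, so total memory use scales with the number of containers.
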
@@ -115,108 +84,133 @@
 }
 
 static u32
-nfsd_cache_hash(__be32 xid)
+nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
 {
-	return hash_32(be32_to_cpu(xid), maskbits);
+	return hash_32(be32_to_cpu(xid), nn->maskbits);
 }
 
 static struct svc_cacherep *
-nfsd_reply_cache_alloc(void)
+nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
+			struct nfsd_net *nn)
 {
 	struct svc_cacherep	*rp;
 
-	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+	rp = kmem_cache_alloc(nn->drc_slab, GFP_KERNEL);
 	if (rp) {
 		rp->c_state = RC_UNUSED;
 		rp->c_type = RC_NOCACHE;
+		RB_CLEAR_NODE(&rp->c_node);
 		INIT_LIST_HEAD(&rp->c_lru);
+
+		memset(&rp->c_key, 0, sizeof(rp->c_key));
+		rp->c_key.k_xid = rqstp->rq_xid;
+		rp->c_key.k_proc = rqstp->rq_proc;
+		rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp));
+		rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp)));
+		rp->c_key.k_prot = rqstp->rq_prot;
+		rp->c_key.k_vers = rqstp->rq_vers;
+		rp->c_key.k_len = rqstp->rq_arg.len;
+		rp->c_key.k_csum = csum;
 	}
 	return rp;
 }
 
 static void
-nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+				struct nfsd_net *nn)
 {
 	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
-		drc_mem_usage -= rp->c_replvec.iov_len;
+		nn->drc_mem_usage -= rp->c_replvec.iov_len;
 		kfree(rp->c_replvec.iov_base);
 	}
-	list_del(&rp->c_lru);
-	atomic_dec(&num_drc_entries);
-	drc_mem_usage -= sizeof(*rp);
-	kmem_cache_free(drc_slab, rp);
+	if (rp->c_state != RC_UNUSED) {
+		rb_erase(&rp->c_node, &b->rb_head);
+		list_del(&rp->c_lru);
+		atomic_dec(&nn->num_drc_entries);
+		nn->drc_mem_usage -= sizeof(*rp);
+	}
+	kmem_cache_free(nn->drc_slab, rp);
 }
 
 static void
-nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+			struct nfsd_net *nn)
 {
 	spin_lock(&b->cache_lock);
-	nfsd_reply_cache_free_locked(rp);
+	nfsd_reply_cache_free_locked(b, rp, nn);
 	spin_unlock(&b->cache_lock);
 }
 
-int nfsd_reply_cache_init(void)
+int nfsd_reply_cache_init(struct nfsd_net *nn)
 {
 	unsigned int hashsize;
 	unsigned int i;
 	int status = 0;
 
-	max_drc_entries = nfsd_cache_size_limit();
-	atomic_set(&num_drc_entries, 0);
-	hashsize = nfsd_hashsize(max_drc_entries);
-	maskbits = ilog2(hashsize);
+	nn->max_drc_entries = nfsd_cache_size_limit();
+	atomic_set(&nn->num_drc_entries, 0);
+	hashsize = nfsd_hashsize(nn->max_drc_entries);
+	nn->maskbits = ilog2(hashsize);
 
-	status = register_shrinker(&nfsd_reply_cache_shrinker);
+	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
+	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
+	nn->nfsd_reply_cache_shrinker.seeks = 1;
+	status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
 	if (status)
-		return status;
-
-	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
-					0, 0, NULL);
-	if (!drc_slab)
 		goto out_nomem;
 
-	drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
-	if (!drc_hashtbl) {
-		drc_hashtbl = vzalloc(array_size(hashsize,
-						 sizeof(*drc_hashtbl)));
-		if (!drc_hashtbl)
-			goto out_nomem;
+	nn->drc_slab = kmem_cache_create("nfsd_drc",
+				sizeof(struct svc_cacherep), 0, 0, NULL);
+	if (!nn->drc_slab)
+		goto out_shrinker;
+
+	nn->drc_hashtbl = kcalloc(hashsize,
+				sizeof(*nn->drc_hashtbl), GFP_KERNEL);
+	if (!nn->drc_hashtbl) {
+		nn->drc_hashtbl = vzalloc(array_size(hashsize,
+						 sizeof(*nn->drc_hashtbl)));
+		if (!nn->drc_hashtbl)
+			goto out_slab;
 	}
 
 	for (i = 0; i < hashsize; i++) {
-		INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
-		spin_lock_init(&drc_hashtbl[i].cache_lock);
+		INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
+		spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
 	}
-	drc_hashsize = hashsize;
+	nn->drc_hashsize = hashsize;
 
 	return 0;
+out_slab:
+	kmem_cache_destroy(nn->drc_slab);
+out_shrinker:
+	unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
 out_nomem:
 	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
-	nfsd_reply_cache_shutdown();
 	return -ENOMEM;
 }
 
-void nfsd_reply_cache_shutdown(void)
+void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
 {
 	struct svc_cacherep	*rp;
 	unsigned int i;
 
-	unregister_shrinker(&nfsd_reply_cache_shrinker);
+	unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
 
-	for (i = 0; i < drc_hashsize; i++) {
-		struct list_head *head = &drc_hashtbl[i].lru_head;
+	for (i = 0; i < nn->drc_hashsize; i++) {
+		struct list_head *head = &nn->drc_hashtbl[i].lru_head;
 		while (!list_empty(head)) {
 			rp = list_first_entry(head, struct svc_cacherep, c_lru);
-			nfsd_reply_cache_free_locked(rp);
+			nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i],
+									rp, nn);
 		}
 	}
 
-	kvfree(drc_hashtbl);
-	drc_hashtbl = NULL;
-	drc_hashsize = 0;
+	kvfree(nn->drc_hashtbl);
+	nn->drc_hashtbl = NULL;
+	nn->drc_hashsize = 0;
 
-	kmem_cache_destroy(drc_slab);
-	drc_slab = NULL;
+	kmem_cache_destroy(nn->drc_slab);
+	nn->drc_slab = NULL;
 }
 
 /*
@@ -231,7 +225,7 @@
 }
 
 static long
-prune_bucket(struct nfsd_drc_bucket *b)
+prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
 {
 	struct svc_cacherep *rp, *tmp;
 	long freed = 0;
@@ -243,10 +237,10 @@
 		 */
 		if (rp->c_state == RC_INPROG)
 			continue;
-		if (atomic_read(&num_drc_entries) <= max_drc_entries &&
+		if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries &&
 		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
 			break;
-		nfsd_reply_cache_free_locked(rp);
+		nfsd_reply_cache_free_locked(b, rp, nn);
 		freed++;
 	}
 	return freed;
@@ -257,18 +251,18 @@
  * Also prune the oldest ones when the total exceeds the max number of entries.
  */
 static long
-prune_cache_entries(void)
+prune_cache_entries(struct nfsd_net *nn)
 {
 	unsigned int i;
 	long freed = 0;
 
-	for (i = 0; i < drc_hashsize; i++) {
-		struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+	for (i = 0; i < nn->drc_hashsize; i++) {
+		struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];
 
 		if (list_empty(&b->lru_head))
 			continue;
 		spin_lock(&b->cache_lock);
-		freed += prune_bucket(b);
+		freed += prune_bucket(b, nn);
 		spin_unlock(&b->cache_lock);
 	}
 	return freed;
@@ -277,13 +271,19 @@
 static unsigned long
 nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	return atomic_read(&num_drc_entries);
+	struct nfsd_net *nn = container_of(shrink,
+				struct nfsd_net, nfsd_reply_cache_shrinker);
+
+	return atomic_read(&nn->num_drc_entries);
 }
 
 static unsigned long
 nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	return prune_cache_entries();
+	struct nfsd_net *nn = container_of(shrink,
+				struct nfsd_net, nfsd_reply_cache_shrinker);
+
+	return prune_cache_entries(nn);
 }
 /*
  * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -318,62 +318,62 @@
 	return csum;
 }
 
-static bool
-nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+static int
+nfsd_cache_key_cmp(const struct svc_cacherep *key,
+			const struct svc_cacherep *rp, struct nfsd_net *nn)
 {
-	/* Check RPC XID first */
-	if (rqstp->rq_xid != rp->c_xid)
-		return false;
-	/* compare checksum of NFS data */
-	if (csum != rp->c_csum) {
-		++payload_misses;
-		return false;
-	}
+	if (key->c_key.k_xid == rp->c_key.k_xid &&
+	    key->c_key.k_csum != rp->c_key.k_csum)
+		++nn->payload_misses;
 
-	/* Other discriminators */
-	if (rqstp->rq_proc != rp->c_proc ||
-	    rqstp->rq_prot != rp->c_prot ||
-	    rqstp->rq_vers != rp->c_vers ||
-	    rqstp->rq_arg.len != rp->c_len ||
-	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
-	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
-		return false;
-
-	return true;
+	return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
 }
 
 /*
  * Search the request hash for an entry that matches the given rqstp.
  * Must be called with cache_lock held. Returns the found entry or
- * NULL on failure.
+ * inserts the given key and returns it on a miss.
  */
 static struct svc_cacherep *
-nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
-		__wsum csum)
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
+			struct nfsd_net *nn)
 {
-	struct svc_cacherep	*rp, *ret = NULL;
-	struct list_head 	*rh = &b->lru_head;
+	struct svc_cacherep	*rp, *ret = key;
+	struct rb_node		**p = &b->rb_head.rb_node,
+				*parent = NULL;
 	unsigned int		entries = 0;
+	int cmp;
 
-	list_for_each_entry(rp, rh, c_lru) {
+	while (*p != NULL) {
 		++entries;
-		if (nfsd_cache_match(rqstp, csum, rp)) {
+		parent = *p;
+		rp = rb_entry(parent, struct svc_cacherep, c_node);
+
+		cmp = nfsd_cache_key_cmp(key, rp, nn);
+		if (cmp < 0)
+			p = &parent->rb_left;
+		else if (cmp > 0)
+			p = &parent->rb_right;
+		else {
 			ret = rp;
-			break;
+			goto out;
 		}
 	}
-
+	rb_link_node(&key->c_node, parent, p);
+	rb_insert_color(&key->c_node, &b->rb_head);
+out:
 	/* tally hash chain length stats */
-	if (entries > longest_chain) {
-		longest_chain = entries;
-		longest_chain_cachesize = atomic_read(&num_drc_entries);
-	} else if (entries == longest_chain) {
+	if (entries > nn->longest_chain) {
+		nn->longest_chain = entries;
+		nn->longest_chain_cachesize = atomic_read(&nn->num_drc_entries);
+	} else if (entries == nn->longest_chain) {
 		/* prefer to keep the smallest cachesize possible here */
-		longest_chain_cachesize = min_t(unsigned int,
-				longest_chain_cachesize,
-				atomic_read(&num_drc_entries));
+		nn->longest_chain_cachesize = min_t(unsigned int,
+				nn->longest_chain_cachesize,
+				atomic_read(&nn->num_drc_entries));
 	}
 
+	lru_put_end(b, ret);
 	return ret;
 }
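
nfsd_cache_insert() replaces the old list scan with a combined lookup-or-insert on an rbtree: the preallocated entry doubles as the search key, and on a miss it is linked in exactly where the search fell off the tree. A toy equivalent with a plain unbalanced BST (the kernel additionally uses rb_link_node()/rb_insert_color() to keep the tree balanced):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct node {
		struct node *left, *right;
		char key[16];		/* stands in for the memcmp'd c_key */
	};

	static struct node *insert_or_find(struct node **root, struct node *key)
	{
		struct node **p = root;

		while (*p) {
			int cmp = memcmp(key->key, (*p)->key, sizeof(key->key));

			if (cmp < 0)
				p = &(*p)->left;
			else if (cmp > 0)
				p = &(*p)->right;
			else
				return *p;	/* hit: caller frees the key */
		}
		*p = key;			/* miss: key becomes the entry */
		return key;
	}

	int main(void)
	{
		struct node *root = NULL;
		struct node *a = calloc(1, sizeof(*a));
		struct node *b = calloc(1, sizeof(*b));

		if (!a || !b)
			return 1;
		strcpy(a->key, "xid-1");
		strcpy(b->key, "xid-1");
		insert_or_find(&root, a);
		printf("%s\n", insert_or_find(&root, b) == a ? "hit" : "miss");
		free(a);
		free(b);
		return 0;
	}

This is also why nfsd_cache_lookup() below can drop the separate search-then-insert dance: one tree walk under the bucket lock does both.
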
 
@@ -387,14 +387,12 @@
 int
 nfsd_cache_lookup(struct svc_rqst *rqstp)
 {
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct svc_cacherep	*rp, *found;
 	__be32			xid = rqstp->rq_xid;
-	u32			proto =  rqstp->rq_prot,
-				vers = rqstp->rq_vers,
-				proc = rqstp->rq_proc;
 	__wsum			csum;
-	u32 hash = nfsd_cache_hash(xid);
-	struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
+	u32 hash = nfsd_cache_hash(xid, nn);
+	struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
 	int type = rqstp->rq_cachetype;
 	int rtn = RC_DOIT;
 
@@ -410,60 +408,38 @@
 	 * Since the common case is a cache miss followed by an insert,
 	 * preallocate an entry.
 	 */
-	rp = nfsd_reply_cache_alloc();
-	spin_lock(&b->cache_lock);
-	if (likely(rp)) {
-		atomic_inc(&num_drc_entries);
-		drc_mem_usage += sizeof(*rp);
-	}
-
-	/* go ahead and prune the cache */
-	prune_bucket(b);
-
-	found = nfsd_cache_search(b, rqstp, csum);
-	if (found) {
-		if (likely(rp))
-			nfsd_reply_cache_free_locked(rp);
-		rp = found;
-		goto found_entry;
-	}
-
+	rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
 	if (!rp) {
 		dprintk("nfsd: unable to allocate DRC entry!\n");
-		goto out;
+		return rtn;
+	}
+
+	spin_lock(&b->cache_lock);
+	found = nfsd_cache_insert(b, rp, nn);
+	if (found != rp) {
+		nfsd_reply_cache_free_locked(NULL, rp, nn);
+		rp = found;
+		goto found_entry;
 	}
 
 	nfsdstats.rcmisses++;
 	rqstp->rq_cacherep = rp;
 	rp->c_state = RC_INPROG;
-	rp->c_xid = xid;
-	rp->c_proc = proc;
-	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
-	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
-	rp->c_prot = proto;
-	rp->c_vers = vers;
-	rp->c_len = rqstp->rq_arg.len;
-	rp->c_csum = csum;
 
-	lru_put_end(b, rp);
+	atomic_inc(&nn->num_drc_entries);
+	nn->drc_mem_usage += sizeof(*rp);
 
-	/* release any buffer */
-	if (rp->c_type == RC_REPLBUFF) {
-		drc_mem_usage -= rp->c_replvec.iov_len;
-		kfree(rp->c_replvec.iov_base);
-		rp->c_replvec.iov_base = NULL;
-	}
-	rp->c_type = RC_NOCACHE;
+	/* go ahead and prune the cache */
+	prune_bucket(b, nn);
  out:
 	spin_unlock(&b->cache_lock);
 	return rtn;
 
 found_entry:
-	nfsdstats.rchits++;
 	/* We found a matching entry which is either in progress or done. */
-	lru_put_end(b, rp);
-
+	nfsdstats.rchits++;
 	rtn = RC_DROPIT;
+
 	/* Request being processed */
 	if (rp->c_state == RC_INPROG)
 		goto out;
@@ -489,7 +465,7 @@
 		break;
 	default:
 		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
-		nfsd_reply_cache_free_locked(rp);
+		nfsd_reply_cache_free_locked(b, rp, nn);
 	}
 
 	goto out;
@@ -514,6 +490,7 @@
 void
 nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct svc_cacherep *rp = rqstp->rq_cacherep;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
 	u32		hash;
@@ -524,15 +501,15 @@
 	if (!rp)
 		return;
 
-	hash = nfsd_cache_hash(rp->c_xid);
-	b = &drc_hashtbl[hash];
+	hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
+	b = &nn->drc_hashtbl[hash];
 
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
 	len >>= 2;
 
 	/* Don't cache excessive amounts of data and XDR failures */
 	if (!statp || len > (256 >> 2)) {
-		nfsd_reply_cache_free(b, rp);
+		nfsd_reply_cache_free(b, rp, nn);
 		return;
 	}
 
@@ -547,18 +524,18 @@
 		bufsize = len << 2;
 		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
 		if (!cachv->iov_base) {
-			nfsd_reply_cache_free(b, rp);
+			nfsd_reply_cache_free(b, rp, nn);
 			return;
 		}
 		cachv->iov_len = bufsize;
 		memcpy(cachv->iov_base, statp, bufsize);
 		break;
 	case RC_NOCACHE:
-		nfsd_reply_cache_free(b, rp);
+		nfsd_reply_cache_free(b, rp, nn);
 		return;
 	}
 	spin_lock(&b->cache_lock);
-	drc_mem_usage += bufsize;
+	nn->drc_mem_usage += bufsize;
 	lru_put_end(b, rp);
 	rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
 	rp->c_type = cachetype;
@@ -594,21 +571,26 @@
  */
 static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "max entries:           %u\n", max_drc_entries);
+	struct nfsd_net *nn = m->private;
+
+	seq_printf(m, "max entries:           %u\n", nn->max_drc_entries);
 	seq_printf(m, "num entries:           %u\n",
-			atomic_read(&num_drc_entries));
-	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
-	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
+			atomic_read(&nn->num_drc_entries));
+	seq_printf(m, "hash buckets:          %u\n", 1 << nn->maskbits);
+	seq_printf(m, "mem usage:             %u\n", nn->drc_mem_usage);
 	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
 	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
 	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
-	seq_printf(m, "payload misses:        %u\n", payload_misses);
-	seq_printf(m, "longest chain len:     %u\n", longest_chain);
-	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
+	seq_printf(m, "payload misses:        %u\n", nn->payload_misses);
+	seq_printf(m, "longest chain len:     %u\n", nn->longest_chain);
+	seq_printf(m, "cachesize at longest:  %u\n", nn->longest_chain_cachesize);
 	return 0;
 }
 
 int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, nfsd_reply_cache_stats_show, NULL);
+	struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
+								nfsd_net_id);
+
+	return single_open(file, nfsd_reply_cache_stats_show, nn);
 }
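
With the DRC moved into struct nfsd_net, the shrinker callbacks can no longer consult globals; they recover their nfsd_net from the embedded shrinker with container_of(), and the stats file carries its nfsd_net through seq_file's private pointer. The container_of idiom in isolation:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct shrinker { int seeks; };

	struct net_state {			/* stands in for struct nfsd_net */
		unsigned int num_entries;
		struct shrinker shrinker;	/* embedded, one per namespace */
	};

	static unsigned int count_objects(struct shrinker *s)
	{
		struct net_state *ns = container_of(s, struct net_state, shrinker);

		return ns->num_entries;
	}

	int main(void)
	{
		struct net_state ns = { .num_entries = 42 };

		printf("%u\n", count_objects(&ns.shrinker));	/* 42 */
		return 0;
	}
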
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7fb9f7c..11b42c5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Syscall interface to knfsd.
  *
@@ -7,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
+#include <linux/fs_context.h>
 
 #include <linux/sunrpc/svcsock.h>
 #include <linux/lockd/lockd.h>
@@ -15,6 +17,7 @@
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/module.h>
+#include <linux/fsnotify.h>
 
 #include "idmap.h"
 #include "nfsd.h"
@@ -52,6 +55,7 @@
 	NFSD_RecoveryDir,
 	NFSD_V4EndGrace,
 #endif
+	NFSD_MaxReserved
 };
 
 /*
@@ -439,7 +443,7 @@
 			return rv;
 		if (newthreads < 0)
 			return -EINVAL;
-		rv = nfsd_svc(newthreads, net);
+		rv = nfsd_svc(newthreads, net, file->f_cred);
 		if (rv < 0)
 			return rv;
 	} else
@@ -537,14 +541,14 @@
 }
 
 static ssize_t
-nfsd_print_version_support(char *buf, int remaining, const char *sep,
-		unsigned vers, int minor)
+nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining,
+		const char *sep, unsigned vers, int minor)
 {
 	const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u";
-	bool supported = !!nfsd_vers(vers, NFSD_TEST);
+	bool supported = !!nfsd_vers(nn, vers, NFSD_TEST);
 
 	if (vers == 4 && minor >= 0 &&
-	    !nfsd_minorversion(minor, NFSD_TEST))
+	    !nfsd_minorversion(nn, minor, NFSD_TEST))
 		supported = false;
 	if (minor == 0 && supported)
 		/*
@@ -599,20 +603,20 @@
 			switch(num) {
 			case 2:
 			case 3:
-				nfsd_vers(num, cmd);
+				nfsd_vers(nn, num, cmd);
 				break;
 			case 4:
 				if (*minorp == '.') {
-					if (nfsd_minorversion(minor, cmd) < 0)
+					if (nfsd_minorversion(nn, minor, cmd) < 0)
 						return -EINVAL;
-				} else if ((cmd == NFSD_SET) != nfsd_vers(num, NFSD_TEST)) {
+				} else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) {
 					/*
 					 * Either we have +4 and no minors are enabled,
 					 * or we have -4 and at least one minor is enabled.
 					 * In either case, propagate 'cmd' to all minors.
 					 */
 					minor = 0;
-					while (nfsd_minorversion(minor, cmd) >= 0)
+					while (nfsd_minorversion(nn, minor, cmd) >= 0)
 						minor++;
 				}
 				break;
@@ -624,7 +628,7 @@
 		/* If all get turned off, turn them back on, as
 		 * having no versions is BAD
 		 */
-		nfsd_reset_versions();
+		nfsd_reset_versions(nn);
 	}
 
 	/* Now write current state into reply buffer */
@@ -633,12 +637,12 @@
 	remaining = SIMPLE_TRANSACTION_LIMIT;
 	for (num=2 ; num <= 4 ; num++) {
 		int minor;
-		if (!nfsd_vers(num, NFSD_AVAIL))
+		if (!nfsd_vers(nn, num, NFSD_AVAIL))
 			continue;
 
 		minor = -1;
 		do {
-			len = nfsd_print_version_support(buf, remaining,
+			len = nfsd_print_version_support(nn, buf, remaining,
 					sep, num, minor);
 			if (len >= remaining)
 				goto out;
@@ -717,7 +721,7 @@
  * a socket of a supported family/protocol, and we use it as an
  * nfsd listener.
  */
-static ssize_t __write_ports_addfd(char *buf, struct net *net)
+static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred)
 {
 	char *mesg = buf;
 	int fd, err;
@@ -736,7 +740,7 @@
 	if (err != 0)
 		return err;
 
-	err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
+	err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
 	if (err < 0) {
 		nfsd_destroy(net);
 		return err;
@@ -751,7 +755,7 @@
 * A transport listener is added by writing its transport name and
  * a port number.
  */
-static ssize_t __write_ports_addxprt(char *buf, struct net *net)
+static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred)
 {
 	char transport[16];
 	struct svc_xprt *xprt;
@@ -769,12 +773,12 @@
 		return err;
 
 	err = svc_create_xprt(nn->nfsd_serv, transport, net,
-				PF_INET, port, SVC_SOCK_ANONYMOUS);
+				PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
 	if (err < 0)
 		goto out_err;
 
 	err = svc_create_xprt(nn->nfsd_serv, transport, net,
-				PF_INET6, port, SVC_SOCK_ANONYMOUS);
+				PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
 
@@ -799,10 +803,10 @@
 		return __write_ports_names(buf, net);
 
 	if (isdigit(buf[0]))
-		return __write_ports_addfd(buf, net);
+		return __write_ports_addfd(buf, net, file->f_cred);
 
 	if (isalpha(buf[0]))
-		return __write_ports_addxprt(buf, net);
+		return __write_ports_addxprt(buf, net, file->f_cred);
 
 	return -EINVAL;
 }
@@ -1126,6 +1130,8 @@
 		case 'Y':
 		case 'y':
 		case '1':
+			if (!nn->nfsd_serv)
+				return -EBUSY;
 			nfsd4_end_grace(nn);
 			break;
 		default:
@@ -1144,8 +1150,200 @@
  *	populating the filesystem.
  */
 
-static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
+/* Basically copying rpc_get_inode. */
+static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
 {
+	struct inode *inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+	/* Following advice from simple_fill_super documentation: */
+	inode->i_ino = iunique(sb, NFSD_MaxReserved);
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &simple_dir_inode_operations;
+		inc_nlink(inode);
+	default:
+		break;
+	}
+	return inode;
+}
+
+static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl)
+{
+	struct inode *inode;
+
+	inode = nfsd_get_inode(dir->i_sb, mode);
+	if (!inode)
+		return -ENOMEM;
+	if (ncl) {
+		inode->i_private = ncl;
+		kref_get(&ncl->cl_ref);
+	}
+	d_add(dentry, inode);
+	inc_nlink(dir);
+	fsnotify_mkdir(dir, dentry);
+	return 0;
+}
+
+static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
+{
+	struct inode *dir = parent->d_inode;
+	struct dentry *dentry;
+	int ret = -ENOMEM;
+
+	inode_lock(dir);
+	dentry = d_alloc_name(parent, name);
+	if (!dentry)
+		goto out_err;
+	ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl);
+	if (ret)
+		goto out_err;
+out:
+	inode_unlock(dir);
+	return dentry;
+out_err:
+	dput(dentry);
+	dentry = ERR_PTR(ret);
+	goto out;
+}
+
+static void clear_ncl(struct inode *inode)
+{
+	struct nfsdfs_client *ncl = inode->i_private;
+
+	inode->i_private = NULL;
+	kref_put(&ncl->cl_ref, ncl->cl_release);
+}
+
+static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
+{
+	struct nfsdfs_client *nc = inode->i_private;
+
+	if (nc)
+		kref_get(&nc->cl_ref);
+	return nc;
+}
+
+struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
+{
+	struct nfsdfs_client *nc;
+
+	inode_lock_shared(inode);
+	nc = __get_nfsdfs_client(inode);
+	inode_unlock_shared(inode);
+	return nc;
+}
+/* from __rpc_unlink */
+static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
+{
+	int ret;
+
+	clear_ncl(d_inode(dentry));
+	dget(dentry);
+	ret = simple_unlink(dir, dentry);
+	d_delete(dentry);
+	dput(dentry);
+	WARN_ON_ONCE(ret);
+}
+
+static void nfsdfs_remove_files(struct dentry *root)
+{
+	struct dentry *dentry, *tmp;
+
+	list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) {
+		if (!simple_positive(dentry)) {
+			WARN_ON_ONCE(1); /* I think this can't happen? */
+			continue;
+		}
+		nfsdfs_remove_file(d_inode(root), dentry);
+	}
+}
+
+/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
+ * code instead. */
+static int nfsdfs_create_files(struct dentry *root,
+					const struct tree_descr *files)
+{
+	struct inode *dir = d_inode(root);
+	struct inode *inode;
+	struct dentry *dentry;
+	int i;
+
+	inode_lock(dir);
+	for (i = 0; files->name && files->name[0]; i++, files++) {
+		if (!files->name)
+			continue;
+		dentry = d_alloc_name(root, files->name);
+		if (!dentry)
+			goto out;
+		inode = nfsd_get_inode(d_inode(root)->i_sb,
+					S_IFREG | files->mode);
+		if (!inode) {
+			dput(dentry);
+			goto out;
+		}
+		inode->i_fop = files->ops;
+		inode->i_private = __get_nfsdfs_client(dir);
+		d_add(dentry, inode);
+		fsnotify_create(dir, dentry);
+	}
+	inode_unlock(dir);
+	return 0;
+out:
+	nfsdfs_remove_files(root);
+	inode_unlock(dir);
+	return -ENOMEM;
+}
+
+/* on success, returns the dentry of a new directory unique to that client */
+struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+		struct nfsdfs_client *ncl, u32 id,
+		const struct tree_descr *files)
+{
+	struct dentry *dentry;
+	char name[11];
+	int ret;
+
+	sprintf(name, "%u", id);
+
+	dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
+	if (IS_ERR(dentry)) /* XXX: tossing errors? */
+		return NULL;
+	ret = nfsdfs_create_files(dentry, files);
+	if (ret) {
+		nfsd_client_rmdir(dentry);
+		return NULL;
+	}
+	return dentry;
+}
+
+/* Taken from __rpc_rmdir: */
+void nfsd_client_rmdir(struct dentry *dentry)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *inode = d_inode(dentry);
+	int ret;
+
+	inode_lock(dir);
+	nfsdfs_remove_files(dentry);
+	clear_ncl(inode);
+	dget(dentry);
+	ret = simple_rmdir(dir, dentry);
+	WARN_ON_ONCE(ret);
+	d_delete(dentry);
+	inode_unlock(dir);
+}
+
+static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+							nfsd_net_id);
+	struct dentry *dentry;
+	int ret;
+
 	static const struct tree_descr nfsd_files[] = {
 		[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
 		[NFSD_Export_features] = {"export_features",
@@ -1174,15 +1372,39 @@
 #endif
 		/* last one */ {""}
 	};
-	get_net(sb->s_fs_info);
-	return simple_fill_super(sb, 0x6e667364, nfsd_files);
+
+	ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+	if (ret)
+		return ret;
+	dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	nn->nfsd_client_dir = dentry;
+	return 0;
 }
 
-static struct dentry *nfsd_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int nfsd_fs_get_tree(struct fs_context *fc)
 {
-	struct net *net = current->nsproxy->net_ns;
-	return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
+	return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns));
+}
+
+static void nfsd_fs_free_fc(struct fs_context *fc)
+{
+	if (fc->s_fs_info)
+		put_net(fc->s_fs_info);
+}
+
+static const struct fs_context_operations nfsd_fs_context_ops = {
+	.free		= nfsd_fs_free_fc,
+	.get_tree	= nfsd_fs_get_tree,
+};
+
+static int nfsd_init_fs_context(struct fs_context *fc)
+{
+	put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(fc->net_ns->user_ns);
+	fc->ops = &nfsd_fs_context_ops;
+	return 0;
 }
 
 static void nfsd_umount(struct super_block *sb)
@@ -1196,7 +1418,7 @@
 static struct file_system_type nfsd_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfsd",
-	.mount		= nfsd_mount,
+	.init_fs_context = nfsd_init_fs_context,
 	.kill_sb	= nfsd_umount,
 };
 MODULE_ALIAS_FS("nfsd");
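
The new clients/ directory code pins each nfsdfs_client with a kref: every inode created under the client directory stores a counted reference in i_private, clear_ncl() drops it on removal, and cl_release runs only once the last holder is gone. A minimal model of that get/put-with-release pattern (the real kref uses atomics):

	#include <stdio.h>

	struct kref_like {
		int count;
		void (*release)(struct kref_like *);
	};

	static void kref_like_get(struct kref_like *k)
	{
		k->count++;
	}

	static void kref_like_put(struct kref_like *k)
	{
		if (--k->count == 0)
			k->release(k);
	}

	static void client_release(struct kref_like *k)
	{
		printf("client state freed\n");
	}

	int main(void)
	{
		struct kref_like cl = { .count = 1, .release = client_release };

		kref_like_get(&cl);	/* an inode stores a reference */
		kref_like_put(&cl);	/* rmdir: clear_ncl() drops it */
		kref_like_put(&cl);	/* original owner done: release runs */
		return 0;
	}
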
@@ -1229,6 +1451,7 @@
 static __net_init int nfsd_init_net(struct net *net)
 {
 	int retval;
+	struct vfsmount *mnt;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	retval = nfsd_export_init(net);
@@ -1237,16 +1460,36 @@
 	retval = nfsd_idmap_init(net);
 	if (retval)
 		goto out_idmap_error;
-	nn->nfsd4_lease = 45;	/* default lease time */
-	nn->nfsd4_grace = 45;
+	nn->nfsd_versions = NULL;
+	nn->nfsd4_minorversions = NULL;
+	retval = nfsd_reply_cache_init(nn);
+	if (retval)
+		goto out_drc_error;
+	nn->nfsd4_lease = 90;	/* default lease time */
+	nn->nfsd4_grace = 90;
 	nn->somebody_reclaimed = false;
+	nn->track_reclaim_completes = false;
 	nn->clverifier_counter = prandom_u32();
-	nn->clientid_counter = prandom_u32();
+	nn->clientid_base = prandom_u32();
+	nn->clientid_counter = nn->clientid_base + 1;
+	nn->s2s_cp_cl_id = nn->clientid_counter++;
 
 	atomic_set(&nn->ntf_refcnt, 0);
 	init_waitqueue_head(&nn->ntf_wq);
+	seqlock_init(&nn->boot_lock);
+
+	mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
+	if (IS_ERR(mnt)) {
+		retval = PTR_ERR(mnt);
+		goto out_mount_err;
+	}
+	nn->nfsd_mnt = mnt;
 	return 0;
 
+out_mount_err:
+	nfsd_reply_cache_shutdown(nn);
+out_drc_error:
+	nfsd_idmap_shutdown(net);
 out_idmap_error:
 	nfsd_export_shutdown(net);
 out_export_error:
@@ -1255,8 +1498,13 @@
 
 static __net_exit void nfsd_exit_net(struct net *net)
 {
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	mntput(nn->nfsd_mnt);
+	nfsd_reply_cache_shutdown(nn);
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
+	nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
 }
 
 static struct pernet_operations nfsd_net_ops = {
@@ -1283,13 +1531,8 @@
 	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
-	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
-	if (retval)
-		goto out_exit_pnfs;
+	nfsd_fault_inject_init(); /* nfsd fault injection controls */
 	nfsd_stat_init();	/* Statistics */
-	retval = nfsd_reply_cache_init();
-	if (retval)
-		goto out_free_stat;
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	retval = create_proc_exports_entry();
 	if (retval)
@@ -1303,11 +1546,8 @@
 	remove_proc_entry("fs/nfs", NULL);
 out_free_lockd:
 	nfsd_lockd_shutdown();
-	nfsd_reply_cache_shutdown();
-out_free_stat:
 	nfsd_stat_shutdown();
 	nfsd_fault_inject_cleanup();
-out_exit_pnfs:
 	nfsd4_exit_pnfs();
 out_free_slabs:
 	nfsd4_free_slabs();
@@ -1320,7 +1560,6 @@
 
 static void __exit exit_nfsd(void)
 {
-	nfsd_reply_cache_shutdown();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
 	nfsd_stat_shutdown();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 0668999..af29475 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -17,10 +17,12 @@
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/msg_prot.h>
 
 #include <uapi/linux/nfsd/debug.h>
 
+#include "netns.h"
 #include "stats.h"
 #include "export.h"
 
@@ -73,7 +75,7 @@
 /*
  * Function prototypes.
  */
-int		nfsd_svc(int nrservs, struct net *net);
+int		nfsd_svc(int nrservs, struct net *net, const struct cred *cred);
 int		nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
 
 int		nfsd_nrthreads(struct net *);
@@ -85,6 +87,16 @@
 
 void		nfsd_destroy(struct net *net);
 
+struct nfsdfs_client {
+	struct kref cl_ref;
+	void (*cl_release)(struct kref *kref);
+};
+
+struct nfsdfs_client *get_nfsdfs_client(struct inode *);
+struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+		struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
+void nfsd_client_rmdir(struct dentry *dentry);
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 #ifdef CONFIG_NFSD_V2_ACL
 extern const struct svc_version nfsd_acl_version2;
@@ -98,10 +110,12 @@
 #endif
 #endif
 
+struct nfsd_net;
+
 enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
-int nfsd_vers(int vers, enum vers_op change);
-int nfsd_minorversion(u32 minorversion, enum vers_op change);
-void nfsd_reset_versions(void);
+int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
+int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
+void nfsd_reset_versions(struct nfsd_net *nn);
 int nfsd_create_serv(struct net *net);
 
 extern int nfsd_max_blksize;
@@ -110,6 +124,12 @@
 {
 	return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
 }
+static inline struct user_namespace *
+nfsd_user_namespace(const struct svc_rqst *rqstp)
+{
+	const struct cred *cred = rqstp->rq_xprt->xpt_cred;
+	return cred ? cred->user_ns : &init_user_ns;
+}
 
 /* 
  * NFSv4 State
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0d20fd1..c83ddac 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -172,6 +172,7 @@
 	struct nfsd_readargs *argp = rqstp->rq_argp;
 	struct nfsd_readres *resp = rqstp->rq_resp;
 	__be32	nfserr;
+	u32 eof;
 
 	dprintk("nfsd: READ    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -195,7 +196,8 @@
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
-				  &resp->count);
+				  &resp->count,
+				  &eof);
 
 	if (nfserr) return nfserr;
 	return fh_getattr(&resp->fh, &resp->stat);
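
The read path now threads an eof out-parameter down through nfsd_read() and friends, which is what let the NFSv4 encoders earlier in this patch drop nfsd_eof_on_read() and its after-the-fact i_size comparison. A simplified stand-in for the judgment the lower layer now makes at read time:

	#include <stdbool.h>
	#include <stdio.h>

	/* After reading `got` bytes at `offset`, did we reach end of file?
	 * (Simplification: the real code also handles short-read subtleties.) */
	static bool read_hits_eof(unsigned long long offset, unsigned long got,
				  unsigned long long i_size)
	{
		return offset + got >= i_size;
	}

	int main(void)
	{
		printf("%d\n", read_hits_eof(4096, 512, 4608));	/* 1: at EOF */
		printf("%d\n", read_hits_eof(0, 512, 4608));	/* 0: more data */
		return 0;
	}

Letting the filesystem report EOF as part of the read avoids a race where i_size changes between the read and the encode.
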
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 89cb484..fdf7ed4 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -27,11 +27,30 @@
 #include "cache.h"
 #include "vfs.h"
 #include "netns.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static int			nfsd_acl_rpcbind_set(struct net *,
+						     const struct svc_program *,
+						     u32, int,
+						     unsigned short,
+						     unsigned short);
+static __be32			nfsd_acl_init_request(struct svc_rqst *,
+						const struct svc_program *,
+						struct svc_process_info *);
+#endif
+static int			nfsd_rpcbind_set(struct net *,
+						 const struct svc_program *,
+						 u32, int,
+						 unsigned short,
+						 unsigned short);
+static __be32			nfsd_init_request(struct svc_rqst *,
+						const struct svc_program *,
+						struct svc_process_info *);
 
 /*
  * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
@@ -86,6 +105,8 @@
 	.pg_class		= "nfsd",
 	.pg_stats		= &nfsd_acl_svcstats,
 	.pg_authenticate	= &svc_set_client,
+	.pg_init_request	= nfsd_acl_init_request,
+	.pg_rpcbind_set		= nfsd_acl_rpcbind_set,
 };
 
 static struct svc_stat	nfsd_acl_svcstats = {
@@ -105,7 +126,6 @@
 
 #define NFSD_MINVERS    	2
 #define NFSD_NRVERS		ARRAY_SIZE(nfsd_version)
-static const struct svc_version *nfsd_versions[NFSD_NRVERS];
 
 struct svc_program		nfsd_program = {
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
@@ -113,77 +133,136 @@
 #endif
 	.pg_prog		= NFS_PROGRAM,		/* program number */
 	.pg_nvers		= NFSD_NRVERS,		/* nr of entries in nfsd_version */
-	.pg_vers		= nfsd_versions,	/* version table */
+	.pg_vers		= nfsd_version,		/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
 	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
-
+	.pg_init_request	= nfsd_init_request,
+	.pg_rpcbind_set		= nfsd_rpcbind_set,
 };
 
-static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
-	[0] = 1,
-	[1] = 1,
-	[2] = 1,
-};
+static bool
+nfsd_support_version(int vers)
+{
+	if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS)
+		return nfsd_version[vers] != NULL;
+	return false;
+}
 
-int nfsd_vers(int vers, enum vers_op change)
+static bool *
+nfsd_alloc_versions(void)
+{
+	bool *vers = kmalloc_array(NFSD_NRVERS, sizeof(bool), GFP_KERNEL);
+	unsigned i;
+
+	if (vers) {
+		/* All compiled versions are enabled by default */
+		for (i = 0; i < NFSD_NRVERS; i++)
+			vers[i] = nfsd_support_version(i);
+	}
+	return vers;
+}
+
+static bool *
+nfsd_alloc_minorversions(void)
+{
+	bool *vers = kmalloc_array(NFSD_SUPPORTED_MINOR_VERSION + 1,
+			sizeof(bool), GFP_KERNEL);
+	unsigned i;
+
+	if (vers) {
+		/* All minor versions are enabled by default */
+		for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
+			vers[i] = nfsd_support_version(4);
+	}
+	return vers;
+}
+
+void
+nfsd_netns_free_versions(struct nfsd_net *nn)
+{
+	kfree(nn->nfsd_versions);
+	kfree(nn->nfsd4_minorversions);
+	nn->nfsd_versions = NULL;
+	nn->nfsd4_minorversions = NULL;
+}
+
+static void
+nfsd_netns_init_versions(struct nfsd_net *nn)
+{
+	if (!nn->nfsd_versions) {
+		nn->nfsd_versions = nfsd_alloc_versions();
+		nn->nfsd4_minorversions = nfsd_alloc_minorversions();
+		if (!nn->nfsd_versions || !nn->nfsd4_minorversions)
+			nfsd_netns_free_versions(nn);
+	}
+}
+
+int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
 		return 0;
 	switch(change) {
 	case NFSD_SET:
-		nfsd_versions[vers] = nfsd_version[vers];
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-		if (vers < NFSD_ACL_NRVERS)
-			nfsd_acl_versions[vers] = nfsd_acl_version[vers];
-#endif
+		if (nn->nfsd_versions)
+			nn->nfsd_versions[vers] = nfsd_support_version(vers);
 		break;
 	case NFSD_CLEAR:
-		nfsd_versions[vers] = NULL;
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-		if (vers < NFSD_ACL_NRVERS)
-			nfsd_acl_versions[vers] = NULL;
-#endif
+		nfsd_netns_init_versions(nn);
+		if (nn->nfsd_versions)
+			nn->nfsd_versions[vers] = false;
 		break;
 	case NFSD_TEST:
-		return nfsd_versions[vers] != NULL;
+		if (nn->nfsd_versions)
+			return nn->nfsd_versions[vers];
+		/* Fallthrough */
 	case NFSD_AVAIL:
-		return nfsd_version[vers] != NULL;
+		return nfsd_support_version(vers);
 	}
 	return 0;
 }
 
 static void
-nfsd_adjust_nfsd_versions4(void)
+nfsd_adjust_nfsd_versions4(struct nfsd_net *nn)
 {
 	unsigned i;
 
 	for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) {
-		if (nfsd_supported_minorversions[i])
+		if (nn->nfsd4_minorversions[i])
 			return;
 	}
-	nfsd_vers(4, NFSD_CLEAR);
+	nfsd_vers(nn, 4, NFSD_CLEAR);
 }
 
-int nfsd_minorversion(u32 minorversion, enum vers_op change)
+int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change)
 {
 	if (minorversion > NFSD_SUPPORTED_MINOR_VERSION &&
 	    change != NFSD_AVAIL)
 		return -1;
+
 	switch(change) {
 	case NFSD_SET:
-		nfsd_supported_minorversions[minorversion] = true;
-		nfsd_vers(4, NFSD_SET);
+		if (nn->nfsd4_minorversions) {
+			nfsd_vers(nn, 4, NFSD_SET);
+			nn->nfsd4_minorversions[minorversion] =
+				nfsd_vers(nn, 4, NFSD_TEST);
+		}
 		break;
 	case NFSD_CLEAR:
-		nfsd_supported_minorversions[minorversion] = false;
-		nfsd_adjust_nfsd_versions4();
+		nfsd_netns_init_versions(nn);
+		if (nn->nfsd4_minorversions) {
+			nn->nfsd4_minorversions[minorversion] = false;
+			nfsd_adjust_nfsd_versions4(nn);
+		}
 		break;
 	case NFSD_TEST:
-		return nfsd_supported_minorversions[minorversion];
+		if (nn->nfsd4_minorversions)
+			return nn->nfsd4_minorversions[minorversion];
+		return nfsd_vers(nn, 4, NFSD_TEST);
 	case NFSD_AVAIL:
-		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
+		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION &&
+			nfsd_vers(nn, 4, NFSD_AVAIL);
 	}
 	return 0;
 }
@@ -205,7 +284,7 @@
 	return rv;
 }
 
-static int nfsd_init_socks(struct net *net)
+static int nfsd_init_socks(struct net *net, const struct cred *cred)
 {
 	int error;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -214,12 +293,12 @@
 		return 0;
 
 	error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
-					SVC_SOCK_DEFAULTS);
+					SVC_SOCK_DEFAULTS, cred);
 	if (error < 0)
 		return error;
 
 	error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
-					SVC_SOCK_DEFAULTS);
+					SVC_SOCK_DEFAULTS, cred);
 	if (error < 0)
 		return error;
 
@@ -235,22 +314,17 @@
 	if (nfsd_users++)
 		return 0;
 
-	/*
-	 * Readahead param cache - will no-op if it already exists.
-	 * (Note therefore results will be suboptimal if number of
-	 * threads is modified after nfsd start.)
-	 */
-	ret = nfsd_racache_init(2*nrservs);
+	ret = nfsd_file_cache_init();
 	if (ret)
 		goto dec_users;
 
 	ret = nfs4_state_start();
 	if (ret)
-		goto out_racache;
+		goto out_file_cache;
 	return 0;
 
-out_racache:
-	nfsd_racache_shutdown();
+out_file_cache:
+	nfsd_file_cache_shutdown();
 dec_users:
 	nfsd_users--;
 	return ret;
@@ -262,19 +336,44 @@
 		return;
 
 	nfs4_state_shutdown();
-	nfsd_racache_shutdown();
+	nfsd_file_cache_shutdown();
 }
 
-static bool nfsd_needs_lockd(void)
+static bool nfsd_needs_lockd(struct nfsd_net *nn)
 {
-#if defined(CONFIG_NFSD_V3)
-	return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL);
-#else
-	return (nfsd_versions[2] != NULL);
-#endif
+	return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
 }
 
-static int nfsd_startup_net(int nrservs, struct net *net)
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+{
+	int seq = 0;
+
+	do {
+		read_seqbegin_or_lock(&nn->boot_lock, &seq);
+		/*
+	 * This is opaque to the client, so no need to byte-swap. Use
+		 * __force to keep sparse happy. y2038 time_t overflow is
+		 * irrelevant in this usage
+		 */
+		verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+		verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+	} while (need_seqretry(&nn->boot_lock, seq));
+	done_seqretry(&nn->boot_lock, seq);
+}
+
+static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+{
+	ktime_get_real_ts64(&nn->nfssvc_boot);
+}
+
+void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+{
+	write_seqlock(&nn->boot_lock);
+	nfsd_reset_boot_verifier_locked(nn);
+	write_sequnlock(&nn->boot_lock);
+}
+
+static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int ret;
@@ -285,12 +384,12 @@
 	ret = nfsd_startup_generic(nrservs);
 	if (ret)
 		return ret;
-	ret = nfsd_init_socks(net);
+	ret = nfsd_init_socks(net, cred);
 	if (ret)
 		goto out_socks;
 
-	if (nfsd_needs_lockd() && !nn->lockd_up) {
-		ret = lockd_up(net);
+	if (nfsd_needs_lockd(nn) && !nn->lockd_up) {
+		ret = lockd_up(net, cred);
 		if (ret)
 			goto out_socks;
 		nn->lockd_up = 1;
@@ -317,6 +416,7 @@
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nfsd_file_cache_purge(net);
 	nfs4_state_shutdown_net(net);
 	if (nn->lockd_up) {
 		lockd_down(net);
@@ -422,20 +522,20 @@
 	nfsd_export_flush(net);
 }
 
-void nfsd_reset_versions(void)
+void nfsd_reset_versions(struct nfsd_net *nn)
 {
 	int i;
 
 	for (i = 0; i < NFSD_NRVERS; i++)
-		if (nfsd_vers(i, NFSD_TEST))
+		if (nfsd_vers(nn, i, NFSD_TEST))
 			return;
 
 	for (i = 0; i < NFSD_NRVERS; i++)
 		if (i != 4)
-			nfsd_vers(i, NFSD_SET);
+			nfsd_vers(nn, i, NFSD_SET);
 		else {
 			int minor = 0;
-			while (nfsd_minorversion(minor, NFSD_SET) >= 0)
+			while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0)
 				minor++;
 		}
 }
@@ -503,7 +603,7 @@
 	}
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
-	nfsd_reset_versions();
+	nfsd_reset_versions(nn);
 	nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 						&nfsd_thread_sv_ops);
 	if (nn->nfsd_serv == NULL)
@@ -525,7 +625,7 @@
 #endif
 	}
 	atomic_inc(&nn->ntf_refcnt);
-	ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
+	nfsd_reset_boot_verifier(nn);
 	return 0;
 }
 
@@ -623,7 +723,7 @@
  * this is the first time nrservs is nonzero.
  */
 int
-nfsd_svc(int nrservs, struct net *net)
+nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 {
 	int	error;
 	bool	nfsd_up_before;
@@ -645,7 +745,7 @@
 
 	nfsd_up_before = nn->nfsd_net_up;
 
-	error = nfsd_startup_net(nrservs, net);
+	error = nfsd_startup_net(nrservs, net, cred);
 	if (error)
 		goto out_destroy;
 	error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
@@ -667,6 +767,101 @@
 	return error;
 }
 
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static bool
+nfsd_support_acl_version(int vers)
+{
+	if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS)
+		return nfsd_acl_version[vers] != NULL;
+	return false;
+}
+
+static int
+nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp,
+		     u32 version, int family, unsigned short proto,
+		     unsigned short port)
+{
+	if (!nfsd_support_acl_version(version) ||
+	    !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST))
+		return 0;
+	return svc_generic_rpcbind_set(net, progp, version, family,
+			proto, port);
+}
+
+static __be32
+nfsd_acl_init_request(struct svc_rqst *rqstp,
+		      const struct svc_program *progp,
+		      struct svc_process_info *ret)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	int i;
+
+	if (likely(nfsd_support_acl_version(rqstp->rq_vers) &&
+	    nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST)))
+		return svc_generic_init_request(rqstp, progp, ret);
+
+	ret->mismatch.lovers = NFSD_ACL_NRVERS;
+	for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) {
+		if (nfsd_support_acl_version(i) &&
+		    nfsd_vers(nn, i, NFSD_TEST)) {
+			ret->mismatch.lovers = i;
+			break;
+		}
+	}
+	if (ret->mismatch.lovers == NFSD_ACL_NRVERS)
+		return rpc_prog_unavail;
+	ret->mismatch.hivers = NFSD_ACL_MINVERS;
+	for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) {
+		if (nfsd_support_acl_version(i) &&
+		    nfsd_vers(nn, i, NFSD_TEST)) {
+			ret->mismatch.hivers = i;
+			break;
+		}
+	}
+	return rpc_prog_mismatch;
+}
+#endif
+
+static int
+nfsd_rpcbind_set(struct net *net, const struct svc_program *progp,
+		 u32 version, int family, unsigned short proto,
+		 unsigned short port)
+{
+	if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST))
+		return 0;
+	return svc_generic_rpcbind_set(net, progp, version, family,
+			proto, port);
+}
+
+static __be32
+nfsd_init_request(struct svc_rqst *rqstp,
+		  const struct svc_program *progp,
+		  struct svc_process_info *ret)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	int i;
+
+	if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST)))
+		return svc_generic_init_request(rqstp, progp, ret);
+
+	ret->mismatch.lovers = NFSD_NRVERS;
+	for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) {
+		if (nfsd_vers(nn, i, NFSD_TEST)) {
+			ret->mismatch.lovers = i;
+			break;
+		}
+	}
+	if (ret->mismatch.lovers == NFSD_NRVERS)
+		return rpc_prog_unavail;
+	ret->mismatch.hivers = NFSD_MINVERS;
+	for (i = NFSD_NRVERS - 1; i >= NFSD_MINVERS; i--) {
+		if (nfsd_vers(nn, i, NFSD_TEST)) {
+			ret->mismatch.hivers = i;
+			break;
+		}
+	}
+	return rpc_prog_mismatch;
+}
 
 /*
  * This is the NFS server kernel thread
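Aside: nfsd_copy_boot_verifier() above reads the boot timestamp under a seqlock, so readers never block the writer and simply retry if nfsd_reset_boot_verifier() races with them. A minimal userspace analog of that retry pattern (hypothetical demo code with simplified fencing, not part of this patch):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Sequence counter: odd while an update is in flight. */
static atomic_uint seq;
static uint32_t verf[2];

static void write_verifier(uint32_t sec, uint32_t nsec)
{
	atomic_fetch_add(&seq, 1);	/* counter goes odd: update begins */
	verf[0] = sec;
	verf[1] = nsec;
	atomic_fetch_add(&seq, 1);	/* counter even again: update done */
}

static void copy_verifier(uint32_t out[2])
{
	unsigned int s;

	do {
		/* Wait for a stable (even) counter, then take a snapshot. */
		while ((s = atomic_load(&seq)) & 1)
			;
		out[0] = verf[0];
		out[1] = verf[1];
		/* If the counter moved while we copied, the snapshot is torn. */
	} while (atomic_load(&seq) != s);
}

int main(void)
{
	uint32_t v[2];

	write_verifier(1575158400, 0);
	copy_verifier(v);
	printf("boot verifier: %u.%09u\n", v[0], v[1]);
	return 0;
}

A real seqlock also orders the data accesses with memory barriers, and read_seqbegin_or_lock() can fall back to taking the lock after repeated retries; the sketch omits both.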
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 6b2e8b7..b51fe51 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -71,7 +71,7 @@
 }
 
 static __be32 *
-decode_sattr(__be32 *p, struct iattr *iap)
+decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
 {
 	u32	tmp, tmp1;
 
@@ -86,12 +86,12 @@
 		iap->ia_mode = tmp;
 	}
 	if ((tmp = ntohl(*p++)) != (u32)-1) {
-		iap->ia_uid = make_kuid(&init_user_ns, tmp);
+		iap->ia_uid = make_kuid(userns, tmp);
 		if (uid_valid(iap->ia_uid))
 			iap->ia_valid |= ATTR_UID;
 	}
 	if ((tmp = ntohl(*p++)) != (u32)-1) {
-		iap->ia_gid = make_kgid(&init_user_ns, tmp);
+		iap->ia_gid = make_kgid(userns, tmp);
 		if (gid_valid(iap->ia_gid))
 			iap->ia_valid |= ATTR_GID;
 	}
@@ -129,6 +129,7 @@
 encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	     struct kstat *stat)
 {
+	struct user_namespace *userns = nfsd_user_namespace(rqstp);
 	struct dentry	*dentry = fhp->fh_dentry;
 	int type;
 	struct timespec64 time;
@@ -139,8 +140,8 @@
 	*p++ = htonl(nfs_ftypes[type >> 12]);
 	*p++ = htonl((u32) stat->mode);
 	*p++ = htonl((u32) stat->nlink);
-	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
-	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
+	*p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+	*p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
 
 	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
@@ -216,7 +217,7 @@
 	p = decode_fh(p, &args->fh);
 	if (!p)
 		return 0;
-	p = decode_sattr(p, &args->attrs);
+	p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -319,7 +320,7 @@
 	if (   !(p = decode_fh(p, &args->fh))
 	    || !(p = decode_filename(p, &args->name, &args->len)))
 		return 0;
-	p = decode_sattr(p, &args->attrs);
+	p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -398,7 +399,7 @@
 			return 0;
 		p += xdrlen;
 	}
-	decode_sattr(p, &args->attrs);
+	decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
 
 	return 1;
 }
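Aside: switching from &init_user_ns to nfsd_user_namespace(rqstp) makes encoded owner IDs relative to the user namespace of whoever created the transport, and the _munged variants never fail: an unmappable ID becomes the overflow ID rather than an error. A self-contained sketch of that behavior, using a hypothetical single-range mapping rather than the kernel's real structures:

#include <stdio.h>
#include <stdint.h>

#define OVERFLOW_ID 65534u	/* like the kernel's default overflowuid */

/* Hypothetical single-range mapping, like one uid_map line: "ns host count". */
struct id_range { uint32_t ns_base, host_base, count; };

/* Analog of from_kuid(): fails (-1) when the host ID has no mapping. */
static int64_t from_host(const struct id_range *m, uint32_t host_id)
{
	if (host_id >= m->host_base && host_id - m->host_base < m->count)
		return (int64_t)m->ns_base + (host_id - m->host_base);
	return -1;
}

/* Analog of from_kuid_munged(): never fails, substitutes the overflow ID. */
static uint32_t from_host_munged(const struct id_range *m, uint32_t host_id)
{
	int64_t id = from_host(m, host_id);

	return id < 0 ? OVERFLOW_ID : (uint32_t)id;
}

int main(void)
{
	struct id_range m = { .ns_base = 0, .host_base = 100000, .count = 65536 };

	printf("%lld\n", (long long)from_host(&m, 100123));	/* 123 */
	printf("%u\n", from_host_munged(&m, 5));		/* 65534 */
	return 0;
}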
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b15dac..46f56af 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -39,6 +39,7 @@
 #include <linux/refcount.h>
 #include <linux/sunrpc/svc_xprt.h>
 #include "nfsfh.h"
+#include "nfsd.h"
 
 typedef struct {
 	u32             cl_boot;
@@ -70,6 +71,7 @@
 	int cb_seq_status;
 	int cb_status;
 	bool cb_need_restart;
+	bool cb_holds_slot;
 };
 
 struct nfsd4_callback_ops {
@@ -315,6 +317,10 @@
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_minorversion;
+	/* NFSv4.1 client implementation id: */
+	struct xdr_netobj	cl_nii_domain;
+	struct xdr_netobj	cl_nii_name;
+	struct timespec		cl_nii_time;
 
 	/* for v4.0 and v4.1 callbacks: */
 	struct nfs4_cb_conn	cl_cb_conn;
@@ -327,7 +333,7 @@
 #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
 					 1 << NFSD4_CLIENT_CB_KILL)
 	unsigned long		cl_flags;
-	struct rpc_cred		*cl_cb_cred;
+	const struct cred	*cl_cb_cred;
 	struct rpc_clnt		*cl_cb_client;
 	u32			cl_cb_ident;
 #define NFSD4_CB_UP		0
@@ -346,15 +352,21 @@
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	/* number of rpc's in progress over an associated session: */
-	atomic_t		cl_refcount;
+	atomic_t		cl_rpc_users;
+	struct nfsdfs_client	cl_nfsdfs;
 	struct nfs4_op_map      cl_spo_must_allow;
 
+	/* debugging info directory under nfsd/clients/ : */
+	struct dentry		*cl_nfsd_dentry;
+
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 	struct net		*net;
+	struct list_head	async_copies;	/* list of async copies */
+	spinlock_t		async_lock;	/* lock for async copies */
 };
 
 /* struct nfs4_client_reset
@@ -365,7 +377,8 @@
 struct nfs4_client_reclaim {
 	struct list_head	cr_strhash;	/* hash by cr_name */
 	struct nfs4_client	*cr_clp;	/* pointer to associated clp */
-	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
+	struct xdr_netobj	cr_name;	/* recovery dir name */
+	struct xdr_netobj	cr_princhash;
 };
 
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
@@ -494,7 +507,7 @@
 	};
 	struct list_head	fi_clnt_odstate;
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
-	struct file *		fi_fds[3];
+	struct nfsd_file	*fi_fds[3];
 	/*
 	 * Each open or lock stateid contributes 0-4 to the counts
 	 * below depending on which bits are set in st_access_bitmap:
@@ -504,7 +517,7 @@
 	 */
 	atomic_t		fi_access[2];
 	u32			fi_share_deny;
-	struct file		*fi_deleg_file;
+	struct nfsd_file	*fi_deleg_file;
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
@@ -553,7 +566,7 @@
 	spinlock_t			ls_lock;
 	struct list_head		ls_layouts;
 	u32				ls_layout_type;
-	struct file			*ls_file;
+	struct nfsd_file		*ls_file;
 	struct nfsd4_callback		ls_recall;
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
@@ -573,6 +586,7 @@
 	NFSPROC4_CLNT_CB_NULL = 0,
 	NFSPROC4_CLNT_CB_RECALL,
 	NFSPROC4_CLNT_CB_LAYOUT,
+	NFSPROC4_CLNT_CB_OFFLOAD,
 	NFSPROC4_CLNT_CB_SEQUENCE,
 	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 };
@@ -599,21 +613,24 @@
 
 struct nfsd4_compound_state;
 struct nfsd_net;
+struct nfsd4_copy;
 
 extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
-		stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
+		stateid_t *stateid, int flags, struct nfsd_file **filp);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn);
 struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
 				  void (*sc_free)(struct nfs4_stid *));
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
+void nfs4_free_cp_state(struct nfsd4_copy *copy);
 void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
-extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name,
 							struct nfsd_net *nn);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
@@ -626,18 +643,22 @@
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
 extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
-extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
-							struct nfsd_net *nn);
-extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
+				struct xdr_netobj princhash, struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
 struct nfs4_file *find_file(struct knfsd_fh *fh);
 void put_nfs4_file(struct nfs4_file *fi);
+extern void nfs4_put_copy(struct nfsd4_copy *copy);
+extern struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
 static inline void get_nfs4_file(struct nfs4_file *fi)
 {
 	refcount_inc(&fi->fi_ref);
 }
-struct file *find_any_file(struct nfs4_file *f);
+struct nfsd_file *find_any_file(struct nfs4_file *f);
 
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
@@ -652,7 +673,7 @@
 
 /* nfs fault injection functions */
 #ifdef CONFIG_NFSD_FAULT_INJECTION
-int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_init(void);
 void nfsd_fault_inject_cleanup(void);
 
 u64 nfsd_inject_print_clients(void);
@@ -673,7 +694,7 @@
 u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
 u64 nfsd_inject_recall_delegations(u64);
 #else /* CONFIG_NFSD_FAULT_INJECTION */
-static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_init(void) {}
 static inline void nfsd_fault_inject_cleanup(void) {}
 #endif /* CONFIG_NFSD_FAULT_INJECTION */
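Aside: nfs4_client_reclaim now carries struct xdr_netobj names (counted byte strings, optionally with a principal hash) instead of a fixed HEXDIR_LEN buffer, so recovery names of arbitrary length can be stored. A lookup over such keys needs a length-plus-bytes comparison of roughly this shape (hypothetical helper, not the patch's actual lookup code):

#include <stdio.h>
#include <string.h>

/* Mirrors struct xdr_netobj: a counted byte string, not NUL-terminated. */
struct netobj {
	unsigned int len;
	unsigned char *data;
};

/* Hypothetical equality check of the kind a reclaim-table lookup needs. */
static int netobj_equal(const struct netobj *a, const struct netobj *b)
{
	return a->len == b->len && memcmp(a->data, b->data, a->len) == 0;
}

int main(void)
{
	unsigned char n1[] = "client-23/example.net";
	unsigned char n2[] = "client-23/example.net";
	struct netobj a = { sizeof(n1) - 1, n1 };
	struct netobj b = { sizeof(n2) - 1, n2 };

	printf("%s\n", netobj_equal(&a, &b) ? "match" : "no match");
	return 0;
}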
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 80933e4..ffc78a0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -126,6 +126,8 @@
 DEFINE_NFSD_ERR_EVENT(write_err);
 
 #include "state.h"
+#include "filecache.h"
+#include "vfs.h"
 
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
 	TP_PROTO(stateid_t *stp),
@@ -164,6 +166,144 @@
 DEFINE_STATEID_EVENT(layout_recall_fail);
 DEFINE_STATEID_EVENT(layout_recall_release);
 
+#define show_nf_flags(val)						\
+	__print_flags(val, "|",						\
+		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
+		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
+		{ 1 << NFSD_FILE_BREAK_READ,	"BREAK_READ" },		\
+		{ 1 << NFSD_FILE_BREAK_WRITE,	"BREAK_WRITE" },	\
+		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"})
+
+/* FIXME: This should probably be fleshed out in the future. */
+#define show_nf_may(val)						\
+	__print_flags(val, "|",						\
+		{ NFSD_MAY_READ,		"READ" },		\
+		{ NFSD_MAY_WRITE,		"WRITE" },		\
+		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" })
+
+DECLARE_EVENT_CLASS(nfsd_file_class,
+	TP_PROTO(struct nfsd_file *nf),
+	TP_ARGS(nf),
+	TP_STRUCT__entry(
+		__field(unsigned int, nf_hashval)
+		__field(void *, nf_inode)
+		__field(int, nf_ref)
+		__field(unsigned long, nf_flags)
+		__field(unsigned char, nf_may)
+		__field(struct file *, nf_file)
+	),
+	TP_fast_assign(
+		__entry->nf_hashval = nf->nf_hashval;
+		__entry->nf_inode = nf->nf_inode;
+		__entry->nf_ref = atomic_read(&nf->nf_ref);
+		__entry->nf_flags = nf->nf_flags;
+		__entry->nf_may = nf->nf_may;
+		__entry->nf_file = nf->nf_file;
+	),
+	TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+		__entry->nf_hashval,
+		__entry->nf_inode,
+		__entry->nf_ref,
+		show_nf_flags(__entry->nf_flags),
+		show_nf_may(__entry->nf_may),
+		__entry->nf_file)
+)
+
+#define DEFINE_NFSD_FILE_EVENT(name) \
+DEFINE_EVENT(nfsd_file_class, name, \
+	TP_PROTO(struct nfsd_file *nf), \
+	TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+
+TRACE_EVENT(nfsd_file_acquire,
+	TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
+		 struct inode *inode, unsigned int may_flags,
+		 struct nfsd_file *nf, __be32 status),
+
+	TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+
+	TP_STRUCT__entry(
+		__field(__be32, xid)
+		__field(unsigned int, hash)
+		__field(void *, inode)
+		__field(unsigned int, may_flags)
+		__field(int, nf_ref)
+		__field(unsigned long, nf_flags)
+		__field(unsigned char, nf_may)
+		__field(struct file *, nf_file)
+		__field(__be32, status)
+	),
+
+	TP_fast_assign(
+		__entry->xid = rqstp->rq_xid;
+		__entry->hash = hash;
+		__entry->inode = inode;
+		__entry->may_flags = may_flags;
+		__entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0;
+		__entry->nf_flags = nf ? nf->nf_flags : 0;
+		__entry->nf_may = nf ? nf->nf_may : 0;
+		__entry->nf_file = nf ? nf->nf_file : NULL;
+		__entry->status = status;
+	),
+
+	TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+			be32_to_cpu(__entry->xid), __entry->hash, __entry->inode,
+			show_nf_may(__entry->may_flags), __entry->nf_ref,
+			show_nf_flags(__entry->nf_flags),
+			show_nf_may(__entry->nf_may), __entry->nf_file,
+			be32_to_cpu(__entry->status))
+);
+
+DECLARE_EVENT_CLASS(nfsd_file_search_class,
+	TP_PROTO(struct inode *inode, unsigned int hash, int found),
+	TP_ARGS(inode, hash, found),
+	TP_STRUCT__entry(
+		__field(struct inode *, inode)
+		__field(unsigned int, hash)
+		__field(int, found)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->hash = hash;
+		__entry->found = found;
+	),
+	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+			__entry->inode, __entry->found)
+);
+
+#define DEFINE_NFSD_FILE_SEARCH_EVENT(name)				\
+DEFINE_EVENT(nfsd_file_search_class, name,				\
+	TP_PROTO(struct inode *inode, unsigned int hash, int found),	\
+	TP_ARGS(inode, hash, found))
+
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+	TP_PROTO(struct inode *inode, u32 mask),
+	TP_ARGS(inode, mask),
+	TP_STRUCT__entry(
+		__field(struct inode *, inode)
+		__field(unsigned int, nlink)
+		__field(umode_t, mode)
+		__field(u32, mask)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->nlink = inode->i_nlink;
+		__entry->mode = inode->i_mode;
+		__entry->mask = mask;
+	),
+	TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+			__entry->nlink, __entry->mode, __entry->mask)
+);
+
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
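Aside: show_nf_flags() above leans on __print_flags() to render a bitmask as a "|"-separated list in trace output. The decoding amounts to the following (userspace sketch; the bit positions used here are illustrative, the real NFSD_FILE_* values come from filecache.h):

#include <stdio.h>
#include <string.h>

enum { F_HASHED = 1 << 0, F_PENDING = 1 << 1, F_REFERENCED = 1 << 4 };

static const struct { unsigned long bit; const char *name; } names[] = {
	{ F_HASHED,	"HASHED" },
	{ F_PENDING,	"PENDING" },
	{ F_REFERENCED,	"REFERENCED" },
};

/* Render a mask as "A|B|C", the way __print_flags() formats trace fields. */
static void print_flags(unsigned long mask, char *buf, size_t buflen)
{
	size_t i;

	buf[0] = '\0';
	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		if (!(mask & names[i].bit))
			continue;
		if (buf[0] != '\0')
			strncat(buf, "|", buflen - strlen(buf) - 1);
		strncat(buf, names[i].name, buflen - strlen(buf) - 1);
	}
}

int main(void)
{
	char buf[64];

	print_flags(F_HASHED | F_REFERENCED, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints HASHED|REFERENCED */
	return 0;
}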
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b53e763..bd0a385 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -44,38 +44,11 @@
 
 #include "nfsd.h"
 #include "vfs.h"
+#include "filecache.h"
 #include "trace.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
-
-/*
- * This is a cache of readahead params that help us choose the proper
- * readahead strategy. Initially, we set all readahead parameters to 0
- * and let the VFS handle things.
- * If you increase the number of cached files very much, you'll need to
- * add a hash table here.
- */
-struct raparms {
-	struct raparms		*p_next;
-	unsigned int		p_count;
-	ino_t			p_ino;
-	dev_t			p_dev;
-	int			p_set;
-	struct file_ra_state	p_ra;
-	unsigned int		p_hindex;
-};
-
-struct raparm_hbucket {
-	struct raparms		*pb_head;
-	spinlock_t		pb_lock;
-} ____cacheline_aligned_in_smp;
-
-#define RAPARM_HASH_BITS	4
-#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
-#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
-static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
-
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
  * a mount point.
@@ -396,10 +369,23 @@
 	bool		get_write_count;
 	bool		size_change = (iap->ia_valid & ATTR_SIZE);
 
-	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+	if (iap->ia_valid & ATTR_SIZE) {
 		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
-	if (iap->ia_valid & ATTR_SIZE)
 		ftype = S_IFREG;
+	}
+
+	/*
+	 * If utimes(2) and friends are called with times not NULL, we should
+	 * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission
+	 * will return EACCES, when the caller's effective UID does not match
+	 * the owner of the file, and the caller is not privileged. In this
+	 * situation, we should return EPERM (notify_change will return this).
+	 */
+	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
+		accmode |= NFSD_MAY_OWNER_OVERRIDE;
+		if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)))
+			accmode |= NFSD_MAY_WRITE;
+	}
 
 	/* Callers that do fh_verify should do the fh_want_write: */
 	get_write_count = !fhp->fh_dentry;
@@ -541,8 +527,14 @@
 __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
 		u64 dst_pos, u64 count)
 {
-	return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
-					     count));
+	loff_t cloned;
+
+	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+	if (cloned < 0)
+		return nfserrno(cloned);
+	if (count && cloned != count)
+		return nfserrno(-EINVAL);
+	return 0;
 }
 
 ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
@@ -680,7 +672,7 @@
 }
 #endif /* CONFIG_NFSD_V3 */
 
-static int nfsd_open_break_lease(struct inode *inode, int access)
+int nfsd_open_break_lease(struct inode *inode, int access)
 {
 	unsigned int mode;
 
@@ -696,8 +688,8 @@
  * and additional flags.
  * N.B. After this call fhp needs an fh_put
  */
-__be32
-nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+static __be32
+__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 			int may_flags, struct file **filp)
 {
 	struct path	path;
@@ -707,25 +699,6 @@
 	__be32		err;
 	int		host_err = 0;
 
-	validate_process_creds();
-
-	/*
-	 * If we get here, then the client has already done an "open",
-	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
-	 * in case a chmod has now revoked permission.
-	 *
-	 * Arguably we should also allow the owner override for
-	 * directories, but we never have and it doesn't seem to have
-	 * caused anyone a problem.  If we were to change this, note
-	 * also that our filldir callbacks would need a variant of
-	 * lookup_one_len that doesn't check permissions.
-	 */
-	if (type == S_IFREG)
-		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
-	err = fh_verify(rqstp, fhp, type, may_flags);
-	if (err)
-		goto out;
-
 	path.mnt = fhp->fh_export->ex_path.mnt;
 	path.dentry = fhp->fh_dentry;
 	inode = d_inode(path.dentry);
@@ -779,67 +752,46 @@
 out_nfserr:
 	err = nfserrno(host_err);
 out:
+	return err;
+}
+
+__be32
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+		int may_flags, struct file **filp)
+{
+	__be32 err;
+
+	validate_process_creds();
+	/*
+	 * If we get here, then the client has already done an "open",
+	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
+	 * in case a chmod has now revoked permission.
+	 *
+	 * Arguably we should also allow the owner override for
+	 * directories, but we never have and it doesn't seem to have
+	 * caused anyone a problem.  If we were to change this, note
+	 * also that our filldir callbacks would need a variant of
+	 * lookup_one_len that doesn't check permissions.
+	 */
+	if (type == S_IFREG)
+		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+	err = fh_verify(rqstp, fhp, type, may_flags);
+	if (!err)
+		err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
 	validate_process_creds();
 	return err;
 }
 
-struct raparms *
-nfsd_init_raparms(struct file *file)
+__be32
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+		int may_flags, struct file **filp)
 {
-	struct inode *inode = file_inode(file);
-	dev_t dev = inode->i_sb->s_dev;
-	ino_t ino = inode->i_ino;
-	struct raparms	*ra, **rap, **frap = NULL;
-	int depth = 0;
-	unsigned int hash;
-	struct raparm_hbucket *rab;
+	__be32 err;
 
-	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
-	rab = &raparm_hash[hash];
-
-	spin_lock(&rab->pb_lock);
-	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
-		if (ra->p_ino == ino && ra->p_dev == dev)
-			goto found;
-		depth++;
-		if (ra->p_count == 0)
-			frap = rap;
-	}
-	depth = nfsdstats.ra_size;
-	if (!frap) {	
-		spin_unlock(&rab->pb_lock);
-		return NULL;
-	}
-	rap = frap;
-	ra = *frap;
-	ra->p_dev = dev;
-	ra->p_ino = ino;
-	ra->p_set = 0;
-	ra->p_hindex = hash;
-found:
-	if (rap != &rab->pb_head) {
-		*rap = ra->p_next;
-		ra->p_next   = rab->pb_head;
-		rab->pb_head = ra;
-	}
-	ra->p_count++;
-	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
-	spin_unlock(&rab->pb_lock);
-
-	if (ra->p_set)
-		file->f_ra = ra->p_ra;
-	return ra;
-}
-
-void nfsd_put_raparams(struct file *file, struct raparms *ra)
-{
-	struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-
-	spin_lock(&rab->pb_lock);
-	ra->p_ra = file->f_ra;
-	ra->p_set = 1;
-	ra->p_count--;
-	spin_unlock(&rab->pb_lock);
+	validate_process_creds();
+	err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+	validate_process_creds();
+	return err;
 }
 
 /*
@@ -882,12 +834,23 @@
 	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
+static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
+		size_t expected)
+{
+	if (expected != 0 && len == 0)
+		return 1;
+	if (offset+len >= i_size_read(file_inode(file)))
+		return 1;
+	return 0;
+}
+
 static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			       struct file *file, loff_t offset,
-			       unsigned long *count, int host_err)
+			       unsigned long *count, u32 *eof, ssize_t host_err)
 {
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
+		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
 		*count = host_err;
 		fsnotify_access(file);
 		trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
@@ -899,7 +862,8 @@
 }
 
 __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-			struct file *file, loff_t offset, unsigned long *count)
+			struct file *file, loff_t offset, unsigned long *count,
+			u32 *eof)
 {
 	struct splice_desc sd = {
 		.len		= 0,
@@ -907,25 +871,27 @@
 		.pos		= offset,
 		.u.data		= rqstp,
 	};
-	int host_err;
+	ssize_t host_err;
 
 	trace_nfsd_read_splice(rqstp, fhp, offset, *count);
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 	host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
-	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
 __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  struct file *file, loff_t offset,
-		  struct kvec *vec, int vlen, unsigned long *count)
+		  struct kvec *vec, int vlen, unsigned long *count,
+		  u32 *eof)
 {
 	struct iov_iter iter;
-	int host_err;
+	loff_t ppos = offset;
+	ssize_t host_err;
 
 	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
-	iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
-	host_err = vfs_iter_read(file, &iter, &offset, 0);
-	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+	iov_iter_kvec(&iter, READ, vec, vlen, *count);
+	host_err = vfs_iter_read(file, &iter, &ppos, 0);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
 /*
@@ -999,15 +965,19 @@
 	if (stable && !use_wgather)
 		flags |= RWF_SYNC;
 
-	iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt);
+	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
 	host_err = vfs_iter_write(file, &iter, &pos, flags);
 	if (host_err < 0)
 		goto out_nfserr;
 	nfsdstats.io_write += *cnt;
 	fsnotify_modify(file);
 
-	if (stable && use_wgather)
+	if (stable && use_wgather) {
 		host_err = wait_for_concurrent_writes(file);
+		if (host_err < 0)
+			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+						 nfsd_net_id));
+	}
 
 out_nfserr:
 	if (host_err >= 0) {
@@ -1028,27 +998,25 @@
  * N.B. After this call fhp needs an fh_put
  */
 __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
+	u32 *eof)
 {
+	struct nfsd_file	*nf;
 	struct file *file;
-	struct raparms	*ra;
 	__be32 err;
 
 	trace_nfsd_read_start(rqstp, fhp, offset, *count);
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
 	if (err)
 		return err;
 
-	ra = nfsd_init_raparms(file);
-
+	file = nf->nf_file;
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
-		err = nfsd_splice_read(rqstp, fhp, file, offset, count);
+		err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
 	else
-		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count);
+		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
 
-	if (ra)
-		nfsd_put_raparams(file, ra);
-	fput(file);
+	nfsd_file_put(nf);
 
 	trace_nfsd_read_done(rqstp, fhp, offset, *count);
 
@@ -1064,17 +1032,18 @@
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	   struct kvec *vec, int vlen, unsigned long *cnt, int stable)
 {
-	struct file *file = NULL;
-	__be32 err = 0;
+	struct nfsd_file *nf;
+	__be32 err;
 
 	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
 
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
 	if (err)
 		goto out;
 
-	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
-	fput(file);
+	err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec,
+			vlen, cnt, stable);
+	nfsd_file_put(nf);
 out:
 	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
 	return err;
@@ -1094,9 +1063,9 @@
 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
                loff_t offset, unsigned long count)
 {
-	struct file	*file;
-	loff_t		end = LLONG_MAX;
-	__be32		err = nfserr_inval;
+	struct nfsd_file	*nf;
+	loff_t			end = LLONG_MAX;
+	__be32			err = nfserr_inval;
 
 	if (offset < 0)
 		goto out;
@@ -1106,20 +1075,27 @@
 			goto out;
 	}
 
-	err = nfsd_open(rqstp, fhp, S_IFREG,
-			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
+	err = nfsd_file_acquire(rqstp, fhp,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, offset, end, 0);
+		int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
 
-		if (err2 != -EINVAL)
-			err = nfserrno(err2);
-		else
+		switch (err2) {
+		case 0:
+			break;
+		case -EINVAL:
 			err = nfserr_notsupp;
+			break;
+		default:
+			err = nfserrno(err2);
+			nfsd_reset_boot_verifier(net_generic(nf->nf_net,
+						 nfsd_net_id));
+		}
 	}
 
-	fput(file);
+	nfsd_file_put(nf);
 out:
 	return err;
 }
@@ -1276,7 +1252,6 @@
 		int type, dev_t rdev, struct svc_fh *resfhp)
 {
 	struct dentry	*dentry, *dchild = NULL;
-	struct inode	*dirp;
 	__be32		err;
 	int		host_err;
 
@@ -1288,7 +1263,6 @@
 		return err;
 
 	dentry = fhp->fh_dentry;
-	dirp = d_inode(dentry);
 
 	host_err = fh_want_write(fhp);
 	if (host_err)
@@ -1409,6 +1383,7 @@
 					*created = 1;
 				break;
 			}
+			/* fall through */
 		case NFS4_CREATE_EXCLUSIVE4_1:
 			if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
 			    && d_inode(dchild)->i_atime.tv_sec == v_atime
@@ -1417,7 +1392,7 @@
 					*created = 1;
 				goto set_attr;
 			}
-			 /* fallthru */
+			/* fall through */
 		case NFS3_CREATE_GUARDED:
 			err = nfserr_exist;
 		}
@@ -1641,6 +1616,26 @@
 	goto out_unlock;
 }
 
+static void
+nfsd_close_cached_files(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode && S_ISREG(inode->i_mode))
+		nfsd_file_close_inode_sync(inode);
+}
+
+static bool
+nfsd_has_cached_files(struct dentry *dentry)
+{
+	bool		ret = false;
+	struct inode *inode = d_inode(dentry);
+
+	if (inode && S_ISREG(inode->i_mode))
+		ret = nfsd_file_is_cached(inode);
+	return ret;
+}
+
 /*
  * Rename a file
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1653,6 +1648,7 @@
 	struct inode	*fdir, *tdir;
 	__be32		err;
 	int		host_err;
+	bool		has_cached = false;
 
 	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
 	if (err)
@@ -1671,6 +1667,7 @@
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
 		goto out;
 
+retry:
 	host_err = fh_want_write(ffhp);
 	if (host_err) {
 		err = nfserrno(host_err);
@@ -1710,11 +1707,16 @@
 	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
 		goto out_dput_new;
 
-	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
-	if (!host_err) {
-		host_err = commit_metadata(tfhp);
-		if (!host_err)
-			host_err = commit_metadata(ffhp);
+	if (nfsd_has_cached_files(ndentry)) {
+		has_cached = true;
+		goto out_dput_old;
+	} else {
+		host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+		if (!host_err) {
+			host_err = commit_metadata(tfhp);
+			if (!host_err)
+				host_err = commit_metadata(ffhp);
+		}
 	}
  out_dput_new:
 	dput(ndentry);
@@ -1727,12 +1729,26 @@
 	 * as that would do the wrong thing if the two directories
 	 * were the same, so again we do it by hand.
 	 */
-	fill_post_wcc(ffhp);
-	fill_post_wcc(tfhp);
+	if (!has_cached) {
+		fill_post_wcc(ffhp);
+		fill_post_wcc(tfhp);
+	}
 	unlock_rename(tdentry, fdentry);
 	ffhp->fh_locked = tfhp->fh_locked = false;
 	fh_drop_write(ffhp);
 
+	/*
+	 * If the target dentry has cached open files, then we need to try to
+	 * close them prior to doing the rename. Flushing delayed fput
+	 * shouldn't be done with locks held however, so we delay it until this
+	 * point and then reattempt the whole shebang.
+	 */
+	if (has_cached) {
+		has_cached = false;
+		nfsd_close_cached_files(ndentry);
+		dput(ndentry);
+		goto retry;
+	}
 out:
 	return err;
 }
@@ -1768,25 +1784,30 @@
 	rdentry = lookup_one_len(fname, dentry, flen);
 	host_err = PTR_ERR(rdentry);
 	if (IS_ERR(rdentry))
-		goto out_nfserr;
+		goto out_drop_write;
 
 	if (d_really_is_negative(rdentry)) {
 		dput(rdentry);
-		err = nfserr_noent;
-		goto out;
+		host_err = -ENOENT;
+		goto out_drop_write;
 	}
 
 	if (!type)
 		type = d_inode(rdentry)->i_mode & S_IFMT;
 
-	if (type != S_IFDIR)
+	if (type != S_IFDIR) {
+		nfsd_close_cached_files(rdentry);
 		host_err = vfs_unlink(dirp, rdentry, NULL);
-	else
+	} else {
 		host_err = vfs_rmdir(dirp, rdentry);
+	}
+
 	if (!host_err)
 		host_err = commit_metadata(fhp);
 	dput(rdentry);
 
+out_drop_write:
+	fh_drop_write(fhp);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
@@ -2054,63 +2075,3 @@
 
 	return err? nfserrno(err) : 0;
 }
-
-void
-nfsd_racache_shutdown(void)
-{
-	struct raparms *raparm, *last_raparm;
-	unsigned int i;
-
-	dprintk("nfsd: freeing readahead buffers.\n");
-
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		raparm = raparm_hash[i].pb_head;
-		while(raparm) {
-			last_raparm = raparm;
-			raparm = raparm->p_next;
-			kfree(last_raparm);
-		}
-		raparm_hash[i].pb_head = NULL;
-	}
-}
-/*
- * Initialize readahead param cache
- */
-int
-nfsd_racache_init(int cache_size)
-{
-	int	i;
-	int	j = 0;
-	int	nperbucket;
-	struct raparms **raparm = NULL;
-
-
-	if (raparm_hash[0].pb_head)
-		return 0;
-	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
-	nperbucket = max(2, nperbucket);
-	cache_size = nperbucket * RAPARM_HASH_SIZE;
-
-	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
-
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		spin_lock_init(&raparm_hash[i].pb_lock);
-
-		raparm = &raparm_hash[i].pb_head;
-		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
-			if (!*raparm)
-				goto out_nomem;
-			raparm = &(*raparm)->p_next;
-		}
-		*raparm = NULL;
-	}
-
-	nfsdstats.ra_size = cache_size;
-	return 0;
-
-out_nomem:
-	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
-	nfsd_racache_shutdown();
-	return -ENOMEM;
-}
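Aside: the new nfsd_eof_on_read() in vfs.c replaces the inline helper removed from vfs.h below. It reports eof both for a zero-byte short read and for a read that reaches the current file size. The same decision rule as a standalone function, with the file size passed in explicitly (hypothetical demo, not kernel code):

#include <stdio.h>

/* Mirrors the rule in nfsd_eof_on_read(); file size is a parameter here. */
static unsigned int eof_on_read(long long offset, long len,
				unsigned long expected, long long file_size)
{
	if (expected != 0 && len == 0)
		return 1;	/* asked for bytes, got none: eof */
	if (offset + len >= file_size)
		return 1;	/* read extends to (or past) end of file */
	return 0;
}

int main(void)
{
	printf("%u\n", eof_on_read(0, 4096, 4096, 8192));	/* 0 */
	printf("%u\n", eof_on_read(4096, 4096, 4096, 8192));	/* 1 */
	printf("%u\n", eof_on_read(8192, 0, 4096, 8192));	/* 1 */
	return 0;
}

As the removed vfs.h comment noted, the size check is not atomic with the read, so a concurrent append can make eof stale; the guarantee that matters is that a zero-byte read always carries eof.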
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a7e1073..a13fd9d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -40,8 +40,6 @@
 typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
 
 /* nfsd/vfs.c */
-int		nfsd_racache_init(int);
-void		nfsd_racache_shutdown(void);
 int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		                struct svc_export **expp);
 __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -75,18 +73,23 @@
 __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 				loff_t, unsigned long);
 #endif /* CONFIG_NFSD_V3 */
+int 		nfsd_open_break_lease(struct inode *, int);
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
-struct raparms;
+__be32		nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+				int, struct file **);
 __be32		nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
-				unsigned long *count);
+				unsigned long *count,
+				u32 *eof);
 __be32		nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
 				struct kvec *vec, int vlen,
-				unsigned long *count);
+				unsigned long *count,
+				u32 *eof);
 __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
-				loff_t, struct kvec *, int, unsigned long *);
+				loff_t, struct kvec *, int, unsigned long *,
+				u32 *eof);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
 				struct kvec *, int, unsigned long *, int);
 __be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -115,13 +118,13 @@
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
-struct raparms *nfsd_init_raparms(struct file *file);
-void		nfsd_put_raparams(struct file *file, struct raparms *ra);
-
 static inline int fh_want_write(struct svc_fh *fh)
 {
-	int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
+	int ret;
 
+	if (fh->fh_want_write)
+		return 0;
+	ret = mnt_want_write(fh->fh_export->ex_path.mnt);
 	if (!ret)
 		fh->fh_want_write = true;
 	return ret;
@@ -149,23 +152,4 @@
 	       || createmode == NFS4_CREATE_EXCLUSIVE4_1;
 }
 
-static inline bool nfsd_eof_on_read(long requested, long read,
-				loff_t offset, loff_t size)
-{
-	/* We assume a short read means eof: */
-	if (requested > read)
-		return true;
-	/*
-	 * A non-short read might also reach end of file.  The spec
-	 * still requires us to set eof in that case.
-	 *
-	 * Further operations may have modified the file size since
-	 * the read, so the following check is not atomic with the read.
-	 * We've only seen that cause a problem for a client in the case
-	 * where the read returned a count of 0 without setting eof.
-	 * That case was fixed by the addition of the above check.
-	 */
-	return (offset + read >= size);
-}
-
 #endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 2cb29e9..99ff9f4 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -151,7 +151,7 @@
 	__be32			status;
 	struct svc_fh		fh;
 	unsigned long		count;
-	int			eof;
+	__u32			eof;
 };
 
 struct nfsd3_writeres {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 17c453a..f4737d6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,15 +273,14 @@
 
 
 struct nfsd4_read {
-	stateid_t	rd_stateid;         /* request */
-	u64		rd_offset;          /* request */
-	u32		rd_length;          /* request */
-	int		rd_vlen;
-	struct file     *rd_filp;
-	bool		rd_tmp_file;
+	stateid_t		rd_stateid;         /* request */
+	u64			rd_offset;          /* request */
+	u32			rd_length;          /* request */
+	int			rd_vlen;
+	struct nfsd_file	*rd_nf;
 	
-	struct svc_rqst *rd_rqstp;          /* response */
-	struct svc_fh * rd_fhp;             /* response */
+	struct svc_rqst		*rd_rqstp;          /* response */
+	struct svc_fh		*rd_fhp;             /* response */
 };
 
 struct nfsd4_readdir {
@@ -410,6 +409,9 @@
 	int		spa_how;
 	u32             spo_must_enforce[3];
 	u32             spo_must_allow[3];
+	struct xdr_netobj nii_domain;
+	struct xdr_netobj nii_name;
+	struct timespec64 nii_time;
 };
 
 struct nfsd4_sequence {
@@ -472,7 +474,7 @@
 	u32			lc_reclaim;	/* request */
 	u32			lc_newoffset;	/* request */
 	u64			lc_last_wr;	/* request */
-	struct timespec		lc_mtime;	/* request */
+	struct timespec64	lc_mtime;	/* request */
 	u32			lc_layout_type;	/* request */
 	u32			lc_up_len;	/* layout length */
 	void			*lc_up_layout;	/* decoded by callback */
@@ -511,6 +513,7 @@
 	u64			wr_bytes_written;
 	u32			wr_stable_how;
 	nfs4_verifier		wr_verifier;
+	stateid_t		cb_stateid;
 };
 
 struct nfsd4_copy {
@@ -526,6 +529,23 @@
 
 	/* response */
 	struct nfsd42_write_res	cp_res;
+
+	/* for cb_offload */
+	struct nfsd4_callback	cp_cb;
+	__be32			nfserr;
+	struct knfsd_fh		fh;
+
+	struct nfs4_client      *cp_clp;
+
+	struct nfsd_file        *nf_src;
+	struct nfsd_file        *nf_dst;
+
+	stateid_t		cp_stateid;
+
+	struct list_head	copies;
+	struct task_struct	*copy_task;
+	refcount_t		refcount;
+	bool			stopped;
 };
 
 struct nfsd4_seek {
@@ -539,6 +559,15 @@
 	loff_t		seek_pos;
 };
 
+struct nfsd4_offload_status {
+	/* request */
+	stateid_t	stateid;
+
+	/* response */
+	u64		count;
+	u32		status;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	const struct nfsd4_operation *		opdesc;
@@ -597,6 +626,7 @@
 		struct nfsd4_fallocate		deallocate;
 		struct nfsd4_clone		clone;
 		struct nfsd4_copy		copy;
+		struct nfsd4_offload_status	offload_status;
 		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 517239a..547cf07 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -38,3 +38,13 @@
 #define NFS4_dec_cb_notify_lock_sz	(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+#define enc_cb_offload_info_sz		(1 + 1 + 2 + 1 +		\
+					XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define NFS4_enc_cb_offload_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					enc_nfs4_fh_sz +		\
+					enc_stateid_sz +		\
+					enc_cb_offload_info_sz)
+#define NFS4_dec_cb_offload_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
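Aside: these *_sz macros count 4-byte XDR words, with XDR_QUADLEN() rounding a byte length up to whole words. Taking NFS4_VERIFIER_SIZE as 8 bytes, the new offload-info payload works out to 7 words. A standalone arithmetic check (the compound and sequence header constants live in other kernel headers):

#include <stdio.h>

/* XDR sizes count 4-byte words; XDR_QUADLEN rounds a byte count up to words. */
#define XDR_QUADLEN(l)		(((l) + 3) >> 2)
#define NFS4_VERIFIER_SIZE	8

#define enc_cb_offload_info_sz	(1 + 1 + 2 + 1 +		\
				XDR_QUADLEN(NFS4_VERIFIER_SIZE))

int main(void)
{
	/* 1 + 1 + 2 + 1 + 2 = 7 words, i.e. 28 bytes on the wire */
	printf("cb_offload info: %d words, %d bytes\n",
	       enc_cb_offload_info_sz, enc_cb_offload_info_sz * 4);
	return 0;
}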
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 80da8eb..254d102 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config NILFS2_FS
 	tristate "NILFS2 file system support"
 	select CRC32
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ebb24a3..4391fd3 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -168,24 +168,18 @@
 	ctxt->newbh = NULL;
 
 	if (inode->i_blkbits == PAGE_SHIFT) {
-		lock_page(obh->b_page);
-		/*
-		 * We cannot call radix_tree_preload for the kernels older
-		 * than 2.6.23, because it is not exported for modules.
-		 */
+		struct page *opage = obh->b_page;
+		lock_page(opage);
 retry:
-		err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
-		if (err)
-			goto failed_unlock;
 		/* BUG_ON(oldkey != obh->b_page->index); */
-		if (unlikely(oldkey != obh->b_page->index))
-			NILFS_PAGE_BUG(obh->b_page,
+		if (unlikely(oldkey != opage->index))
+			NILFS_PAGE_BUG(opage,
 				       "invalid oldkey %lld (newkey=%lld)",
 				       (unsigned long long)oldkey,
 				       (unsigned long long)newkey);
 
 		xa_lock_irq(&btnc->i_pages);
-		err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+		err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
 		xa_unlock_irq(&btnc->i_pages);
 		/*
 		 * Note: page->index will not change to newkey until
@@ -193,17 +187,16 @@
 		 * To protect the page in intermediate state, the page lock
 		 * is held.
 		 */
-		radix_tree_preload_end();
 		if (!err)
 			return 0;
-		else if (err != -EEXIST)
+		else if (err != -EBUSY)
 			goto failed_unlock;
 
 		err = invalidate_inode_pages2_range(btnc, newkey, newkey);
 		if (!err)
 			goto retry;
 		/* fallback to copy mode */
-		unlock_page(obh->b_page);
+		unlock_page(opage);
 	}
 
 	nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -243,9 +236,8 @@
 		mark_buffer_dirty(obh);
 
 		xa_lock_irq(&btnc->i_pages);
-		radix_tree_delete(&btnc->i_pages, oldkey);
-		radix_tree_tag_set(&btnc->i_pages, newkey,
-				   PAGECACHE_TAG_DIRTY);
+		__xa_erase(&btnc->i_pages, oldkey);
+		__xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
 		xa_unlock_irq(&btnc->i_pages);
 
 		opage->index = obh->b_blocknr = newkey;
@@ -274,9 +266,7 @@
 		return;
 
 	if (nbh == NULL) {	/* blocksize == pagesize */
-		xa_lock_irq(&btnc->i_pages);
-		radix_tree_delete(&btnc->i_pages, newkey);
-		xa_unlock_irq(&btnc->i_pages);
+		xa_erase_irq(&btnc->i_pages, newkey);
 		unlock_page(ctxt->bh->b_page);
 	} else
 		brelse(nbh);
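Aside: the rekey path above now expects __xa_insert() to fail with -EBUSY when the new key is already occupied (the removed radix-tree code saw -EEXIST), invalidates the stale page, and retries, falling back to copy mode if the invalidation fails. The occupy-check, invalidate, retry shape reduced to a toy table (hypothetical analog, not the XArray API):

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

#define NSLOTS 16
static const char *slots[NSLOTS];	/* stand-in for the btnode cache index */

/* Like __xa_insert(): never overwrites, reports -EBUSY on an occupied key. */
static int slot_insert(unsigned long key, const char *entry)
{
	if (slots[key % NSLOTS])
		return -EBUSY;
	slots[key % NSLOTS] = entry;
	return 0;
}

/* Stand-in for invalidate_inode_pages2_range() dropping a stale entry. */
static void slot_invalidate(unsigned long key)
{
	slots[key % NSLOTS] = NULL;
}

static int move_entry(unsigned long oldkey, unsigned long newkey)
{
	const char *entry = slots[oldkey % NSLOTS];
	int err;

retry:
	err = slot_insert(newkey, entry);
	if (err == -EBUSY) {
		/* the kernel may fail here and fall back to copy mode */
		slot_invalidate(newkey);
		goto retry;
	}
	if (!err)
		slot_invalidate(oldkey);
	return err;
}

int main(void)
{
	slots[1] = "block@oldkey";
	slots[2] = "stale";
	printf("move: %d, new slot: %s\n", move_entry(1, 2), slots[2]);
	return 0;
}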
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 9b96d79..91b9dac 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -148,13 +148,8 @@
 
 	oldflags = NILFS_I(inode)->i_flags;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
-	 * relevant capability.
-	 */
-	ret = -EPERM;
-	if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
+	ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (ret)
 		goto out;
 
 	ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a2f247b..42395ba 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -252,7 +252,6 @@
 void nilfs_inode_add_blocks(struct inode *inode, int n);
 void nilfs_inode_sub_blocks(struct inode *inode, int n);
 extern struct inode *nilfs_new_inode(struct inode *, umode_t);
-extern void nilfs_free_inode(struct inode *);
 extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
@@ -289,7 +288,6 @@
 
 /* super.c */
 extern struct inode *nilfs_alloc_inode(struct super_block *);
-extern void nilfs_destroy_inode(struct inode *);
 
 extern __printf(3, 4)
 void __nilfs_msg(struct super_block *sb, const char *level,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 329a056..d7fc8d3 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -289,7 +289,7 @@
  * @dmap: destination page cache
  * @smap: source page cache
  *
- * No pages must no be added to the cache during this process.
+ * No pages must be added to the cache during this process.
  * This must be ensured by the caller.
  */
 void nilfs_copy_back_pages(struct address_space *dmap,
@@ -298,7 +298,6 @@
 	struct pagevec pvec;
 	unsigned int i, n;
 	pgoff_t index = 0;
-	int err;
 
 	pagevec_init(&pvec);
 repeat:
@@ -313,35 +312,34 @@
 		lock_page(page);
 		dpage = find_lock_page(dmap, offset);
 		if (dpage) {
-			/* override existing page on the destination cache */
+			/* overwrite existing page in the destination cache */
 			WARN_ON(PageDirty(dpage));
 			nilfs_copy_page(dpage, page, 0);
 			unlock_page(dpage);
 			put_page(dpage);
+			/* Do we not need to remove page from smap here? */
 		} else {
-			struct page *page2;
+			struct page *p;
 
 			/* move the page to the destination cache */
 			xa_lock_irq(&smap->i_pages);
-			page2 = radix_tree_delete(&smap->i_pages, offset);
-			WARN_ON(page2 != page);
-
+			p = __xa_erase(&smap->i_pages, offset);
+			WARN_ON(page != p);
 			smap->nrpages--;
 			xa_unlock_irq(&smap->i_pages);
 
 			xa_lock_irq(&dmap->i_pages);
-			err = radix_tree_insert(&dmap->i_pages, offset, page);
-			if (unlikely(err < 0)) {
-				WARN_ON(err == -EEXIST);
+			p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
+			if (unlikely(p)) {
+				/* Probably -ENOMEM */
 				page->mapping = NULL;
-				put_page(page); /* for cache */
+				put_page(page);
 			} else {
 				page->mapping = dmap;
 				dmap->nrpages++;
 				if (PageDirty(page))
-					radix_tree_tag_set(&dmap->i_pages,
-							   offset,
-							   PAGECACHE_TAG_DIRTY);
+					__xa_set_mark(&dmap->i_pages, offset,
+							PAGECACHE_TAG_DIRTY);
 			}
 			xa_unlock_irq(&dmap->i_pages);
 		}
@@ -467,8 +465,7 @@
 	if (mapping) {
 		xa_lock_irq(&mapping->i_pages);
 		if (test_bit(PG_dirty, &page->flags)) {
-			radix_tree_tag_clear(&mapping->i_pages,
-					     page_index(page),
+			__xa_clear_mark(&mapping->i_pages, page_index(page),
 					     PAGECACHE_TAG_DIRTY);
 			xa_unlock_irq(&mapping->i_pages);
 			return clear_page_dirty_for_io(page);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 26290aa..5729ee8 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -155,21 +155,14 @@
 	return &ii->vfs_inode;
 }
 
-static void nilfs_i_callback(struct rcu_head *head)
+static void nilfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
 	if (nilfs_is_metadata_file_inode(inode))
 		nilfs_mdt_destroy(inode);
 
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-void nilfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, nilfs_i_callback);
-}
-
 static int nilfs_sync_super(struct super_block *sb, int flag)
 {
 	struct the_nilfs *nilfs = sb->s_fs_info;
@@ -686,7 +679,7 @@
 
 static const struct super_operations nilfs_sops = {
 	.alloc_inode    = nilfs_alloc_inode,
-	.destroy_inode  = nilfs_destroy_inode,
+	.free_inode     = nilfs_free_inode,
 	.dirty_inode    = nilfs_dirty_inode,
 	.evict_inode    = nilfs_evict_inode,
 	.put_super      = nilfs_put_super,
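
The hunk above moves nilfs2 from ->destroy_inode plus a private call_rcu() to the newer ->free_inode hook, where the VFS itself defers the callback past an RCU grace period. A hedged sketch of the same conversion for a hypothetical filesystem "foo" (cache creation at module init is assumed):

#include <linux/fs.h>
#include <linux/slab.h>

struct foo_inode {
	/* filesystem-private fields would precede this */
	struct inode vfs_inode;
};

static struct kmem_cache *foo_inode_cachep;	/* created at module init */

static inline struct foo_inode *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode, vfs_inode);
}

static struct inode *foo_alloc_inode(struct super_block *sb)
{
	struct foo_inode *fi = kmem_cache_alloc(foo_inode_cachep, GFP_KERNEL);

	return fi ? &fi->vfs_inode : NULL;
}

static void foo_free_inode(struct inode *inode)
{
	/* runs after an RCU grace period; only release memory here */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static const struct super_operations foo_sops = {
	.alloc_inode	= foo_alloc_inode,
	.free_inode	= foo_free_inode,	/* replaces .destroy_inode */
};
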
diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig
index e2ce79e..5a63303 100644
--- a/fs/nls/Kconfig
+++ b/fs/nls/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Native language support configuration
 #
diff --git a/fs/no-block.c b/fs/no-block.c
index 6e40e42..481c0f0 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* no-block.c: implementation of routines required for non-BLOCK configuration
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 2a24249..c020d26 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config FSNOTIFY
 	def_bool n
 	select SRCU
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index f9c1ca1..3df7def 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config DNOTIFY
 	bool "Dnotify support"
 	select FSNOTIFY
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
index f145251..121b4dd 100644
--- a/fs/notify/dnotify/Makefile
+++ b/fs/notify/dnotify/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_DNOTIFY)		+= dnotify.o
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 58d77dc..7a42c2e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Directory notifications for Linux.
  *
@@ -5,16 +6,6 @@
  *
  * Copyright (C) 2009 Eric Paris <Red Hat Inc>
  * dnotify was largely rewritten to use the new fsnotify infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 #include <linux/fs.h>
 #include <linux/module.h>
@@ -22,6 +13,7 @@
 #include <linux/sched/signal.h>
 #include <linux/dnotify.h>
 #include <linux/init.h>
+#include <linux/security.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/fdtable.h>
@@ -81,7 +73,7 @@
 static int dnotify_handle_event(struct fsnotify_group *group,
 				struct inode *inode,
 				u32 mask, const void *data, int data_type,
-				const unsigned char *file_name, u32 cookie,
+				const struct qstr *file_name, u32 cookie,
 				struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
@@ -288,6 +280,17 @@
 		goto out_err;
 	}
 
+	/*
+	 * convert the userspace DN_* "arg" to the internal FS_*
+	 * defined in fsnotify
+	 */
+	mask = convert_arg(arg);
+
+	error = security_path_notify(&filp->f_path, mask,
+			FSNOTIFY_OBJ_TYPE_INODE);
+	if (error)
+		goto out_err;
+
 	/* expect most fcntl to add new rather than augment old */
 	dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
 	if (!dn) {
@@ -302,9 +305,6 @@
 		goto out_err;
 	}
 
-	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
-	mask = convert_arg(arg);
-
 	/* set up the new_fsn_mark and new_dn_mark */
 	new_fsn_mark = &new_dn_mark->fsn_mark;
 	fsnotify_init_mark(new_fsn_mark, dnotify_group);
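
The dnotify hunks above compute the internal FS_* mask before the new security_path_notify() hook so that LSMs see the real event mask rather than the raw DN_* argument. For context, a minimal userspace sketch of the fcntl(F_NOTIFY) interface this path serves (watching /tmp is an arbitrary choice; error handling elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t notified;

static void on_sigio(int sig)
{
	(void)sig;
	notified = 1;
}

int main(void)
{
	int dfd = open("/tmp", O_RDONLY | O_DIRECTORY);

	if (dfd < 0)
		return 1;
	signal(SIGIO, on_sigio);	/* SIGIO is dnotify's default signal */
	/* the DN_* "arg" below is what convert_arg() translates to FS_* */
	if (fcntl(dfd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MULTISHOT) < 0)
		return 1;
	while (!notified)
		pause();
	printf("directory modified\n");
	close(dfd);
	return 0;
}
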
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce..8b9103f 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -1,7 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config FANOTIFY
 	bool "Filesystem wide access notification"
 	select FSNOTIFY
-	select ANON_INODES
+	select EXPORTFS
 	default n
 	---help---
 	   Say Y here to enable fanotify support.  fanotify is a file access
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
index 0999213..25ef222 100644
--- a/fs/notify/fanotify/Makefile
+++ b/fs/notify/fanotify/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 29dee96..5778d13 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
 #include <linux/wait.h>
 #include <linux/audit.h>
 #include <linux/sched/mm.h>
+#include <linux/statfs.h>
 
 #include "fanotify.h"
 
 static bool should_merge(struct fsnotify_event *old_fsn,
 			 struct fsnotify_event *new_fsn)
 {
-	struct fanotify_event_info *old, *new;
+	struct fanotify_event *old, *new;
 
 	pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
 	old = FANOTIFY_E(old_fsn);
 	new = FANOTIFY_E(new_fsn);
 
-	if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
-	    old->path.mnt == new->path.mnt &&
-	    old->path.dentry == new->path.dentry)
-		return true;
+	if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+	    old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+		return false;
+
+	if (fanotify_event_has_path(old)) {
+		return old->path.mnt == new->path.mnt &&
+			old->path.dentry == new->path.dentry;
+	} else if (fanotify_event_has_fid(old)) {
+		/*
+		 * We want to merge many dirent events in the same dir (i.e.
+		 * creates/unlinks/renames), but we do not want to merge dirent
+		 * events referring to subdirs with dirent events referring to
+	 * non-subdirs; otherwise, the user won't be able to tell from a
+	 * mask of FAN_CREATE|FAN_DELETE|FAN_ONDIR whether it describes a
+	 * mkdir+unlink pair or an rmdir+create pair of events.
+		 */
+		return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
+			fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+	}
+
+	/* Do not merge events if we failed to encode fid */
 	return false;
 }
 
@@ -36,20 +54,22 @@
 static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 {
 	struct fsnotify_event *test_event;
+	struct fanotify_event *new;
 
 	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+	new = FANOTIFY_E(event);
 
 	/*
 	 * Don't merge a permission event with any other event so that we know
 	 * the event structure we have created in fanotify_handle_event() is the
 	 * one we should check for permission response.
 	 */
-	if (fanotify_is_perm_event(event->mask))
+	if (fanotify_is_perm_event(new->mask))
 		return 0;
 
 	list_for_each_entry_reverse(test_event, list, list) {
 		if (should_merge(test_event, event)) {
-			test_event->mask |= event->mask;
+			FANOTIFY_E(test_event)->mask |= new->mask;
 			return 1;
 		}
 	}
@@ -57,15 +77,44 @@
 	return 0;
 }
 
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
 static int fanotify_get_response(struct fsnotify_group *group,
-				 struct fanotify_perm_event_info *event,
+				 struct fanotify_perm_event *event,
 				 struct fsnotify_iter_info *iter_info)
 {
 	int ret;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	wait_event(group->fanotify_data.access_waitq, event->response);
+	ret = wait_event_killable(group->fanotify_data.access_waitq,
+				  event->state == FAN_EVENT_ANSWERED);
+	/* Signal pending? */
+	if (ret < 0) {
+		spin_lock(&group->notification_lock);
+		/* Event reported to userspace and no answer yet? */
+		if (event->state == FAN_EVENT_REPORTED) {
+			/* Event will get freed once userspace answers to it */
+			event->state = FAN_EVENT_CANCELED;
+			spin_unlock(&group->notification_lock);
+			return ret;
+		}
+		/* Event not yet reported? Just remove it. */
+		if (event->state == FAN_EVENT_INIT)
+			fsnotify_remove_queued_event(group, &event->fae.fse);
+		/*
+		 * Event may be also answered in case signal delivery raced
+		 * with wakeup. In that case we have nothing to do besides
+		 * freeing the event and reporting error.
+		 */
+		spin_unlock(&group->notification_lock);
+		goto out;
+	}
 
 	/* userspace responded, convert to something usable */
 	switch (event->response & ~FAN_AUDIT) {
@@ -81,19 +130,27 @@
 	if (event->response & FAN_AUDIT)
 		audit_fanotify(event->response & ~FAN_AUDIT);
 
-	event->response = 0;
-
 	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
 		 group, event, ret);
-	
+out:
+	fsnotify_destroy_event(group, &event->fae.fse);
+
 	return ret;
 }
 
-static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info,
-				       u32 event_mask, const void *data,
-				       int data_type)
+/*
+ * This function returns a mask for an event that only contains the flags
+ * that have been specifically requested by the user. Flags that may have
+ * been included within the event mask, but have not been explicitly
+ * requested by the user, will not be present in the returned mask.
+ */
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+				     struct fsnotify_iter_info *iter_info,
+				     u32 event_mask, const void *data,
+				     int data_type)
 {
 	__u32 marks_mask = 0, marks_ignored_mask = 0;
+	__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
 	const struct path *path = data;
 	struct fsnotify_mark *mark;
 	int type;
@@ -101,14 +158,14 @@
 	pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
 		 __func__, iter_info->report_mask, event_mask, data, data_type);
 
-	/* if we don't have enough info to send an event to userspace say no */
-	if (data_type != FSNOTIFY_EVENT_PATH)
-		return false;
-
-	/* sorry, fanotify only gives a damn about files and dirs */
-	if (!d_is_reg(path->dentry) &&
-	    !d_can_lookup(path->dentry))
-		return false;
+	if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		/* Do we have path to open a file descriptor? */
+		if (data_type != FSNOTIFY_EVENT_PATH)
+			return 0;
+		/* Path type events are only relevant for files and dirs */
+		if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+			return 0;
+	}
 
 	fsnotify_foreach_obj_type(type) {
 		if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -127,55 +184,152 @@
 		marks_ignored_mask |= mark->ignored_mask;
 	}
 
-	if (d_is_dir(path->dentry) &&
+	test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+	/*
+	 * dirent modification events (create/delete/move) do not carry the
+	 * child entry name/inode information. Instead, we report FAN_ONDIR
+	 * for mkdir/rmdir so user can differentiate them from creat/unlink.
+	 *
+	 * For backward compatibility and consistency, do not report FAN_ONDIR
+	 * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+	 * to user in FAN_REPORT_FID mode for all event types.
+	 */
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		/* Do not report FAN_ONDIR without any event */
+		if (!(test_mask & ~FAN_ONDIR))
+			return 0;
+	} else {
+		user_mask &= ~FAN_ONDIR;
+	}
+
+	if (event_mask & FS_ISDIR &&
 	    !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
-		return false;
+		return 0;
 
-	if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
-				 ~marks_ignored_mask)
-		return true;
-
-	return false;
+	return test_mask & user_mask;
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
-						 struct inode *inode, u32 mask,
-						 const struct path *path)
+static int fanotify_encode_fid(struct fanotify_event *event,
+			       struct inode *inode, gfp_t gfp,
+			       __kernel_fsid_t *fsid)
 {
-	struct fanotify_event_info *event = NULL;
+	struct fanotify_fid *fid = &event->fid;
+	int dwords, bytes = 0;
+	int err, type;
+
+	fid->ext_fh = NULL;
+	dwords = 0;
+	err = -ENOENT;
+	type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+	if (!dwords)
+		goto out_err;
+
+	bytes = dwords << 2;
+	if (bytes > FANOTIFY_INLINE_FH_LEN) {
+		/* Treat failure to allocate fh as failure to allocate event */
+		err = -ENOMEM;
+		fid->ext_fh = kmalloc(bytes, gfp);
+		if (!fid->ext_fh)
+			goto out_err;
+	}
+
+	type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+					&dwords, NULL);
+	err = -EINVAL;
+	if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+		goto out_err;
+
+	fid->fsid = *fsid;
+	event->fh_len = bytes;
+
+	return type;
+
+out_err:
+	pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+			    "type=%d, bytes=%d, err=%i)\n",
+			    fsid->val[0], fsid->val[1], type, bytes, err);
+	kfree(fid->ext_fh);
+	fid->ext_fh = NULL;
+	event->fh_len = 0;
+
+	return FILEID_INVALID;
+}
+
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+					const void *data, int data_type)
+{
+	if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+		return to_tell;
+	else if (data_type == FSNOTIFY_EVENT_INODE)
+		return (struct inode *)data;
+	else if (data_type == FSNOTIFY_EVENT_PATH)
+		return d_inode(((struct path *)data)->dentry);
+	return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+					    struct inode *inode, u32 mask,
+					    const void *data, int data_type,
+					    __kernel_fsid_t *fsid)
+{
+	struct fanotify_event *event = NULL;
 	gfp_t gfp = GFP_KERNEL_ACCOUNT;
+	struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
 
 	/*
 	 * For queues with unlimited length lost events are not expected and
 	 * can possibly have security implications. Avoid losing events when
-	 * memory is short.
+	 * memory is short. For limited size queues, avoid the OOM killer in
+	 * the target monitoring memcg as it may have security repercussions.
 	 */
 	if (group->max_events == UINT_MAX)
 		gfp |= __GFP_NOFAIL;
+	else
+		gfp |= __GFP_RETRY_MAYFAIL;
 
 	/* Whoever is interested in the event, pays for the allocation. */
 	memalloc_use_memcg(group->memcg);
 
 	if (fanotify_is_perm_event(mask)) {
-		struct fanotify_perm_event_info *pevent;
+		struct fanotify_perm_event *pevent;
 
 		pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
 		if (!pevent)
 			goto out;
 		event = &pevent->fae;
 		pevent->response = 0;
+		pevent->state = FAN_EVENT_INIT;
 		goto init;
 	}
 	event = kmem_cache_alloc(fanotify_event_cachep, gfp);
 	if (!event)
 		goto out;
 init: __maybe_unused
-	fsnotify_init_event(&event->fse, inode, mask);
-	event->tgid = get_pid(task_tgid(current));
-	if (path) {
-		event->path = *path;
+	fsnotify_init_event(&event->fse, inode);
+	event->mask = mask;
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
+		event->pid = get_pid(task_pid(current));
+	else
+		event->pid = get_pid(task_tgid(current));
+	event->fh_len = 0;
+	if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		/* Report the event without a file identifier on encode error */
+		event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+	} else if (data_type == FSNOTIFY_EVENT_PATH) {
+		event->fh_type = FILEID_ROOT;
+		event->path = *((struct path *)data);
 		path_get(&event->path);
 	} else {
+		event->fh_type = FILEID_INVALID;
 		event->path.mnt = NULL;
 		event->path.dentry = NULL;
 	}
@@ -184,28 +338,75 @@
 	return event;
 }
 
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+	int type;
+	__kernel_fsid_t fsid = {};
+
+	fsnotify_foreach_obj_type(type) {
+		struct fsnotify_mark_connector *conn;
+
+		if (!fsnotify_iter_should_report_type(iter_info, type))
+			continue;
+
+		conn = READ_ONCE(iter_info->marks[type]->connector);
+		/* Mark is just getting destroyed or created? */
+		if (!conn)
+			continue;
+		if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID))
+			continue;
+		/* Pairs with smp_wmb() in fsnotify_add_mark_list() */
+		smp_rmb();
+		fsid = conn->fsid;
+		if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+			continue;
+		return fsid;
+	}
+
+	return fsid;
+}
+
 static int fanotify_handle_event(struct fsnotify_group *group,
 				 struct inode *inode,
 				 u32 mask, const void *data, int data_type,
-				 const unsigned char *file_name, u32 cookie,
+				 const struct qstr *file_name, u32 cookie,
 				 struct fsnotify_iter_info *iter_info)
 {
 	int ret = 0;
-	struct fanotify_event_info *event;
+	struct fanotify_event *event;
 	struct fsnotify_event *fsn_event;
+	__kernel_fsid_t fsid = {};
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
 	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
 	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
 	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
 	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
 	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
 	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
 	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
 	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
 	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+	BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
+	BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
 
-	if (!fanotify_should_send_event(iter_info, mask, data, data_type))
+	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+
+	mask = fanotify_group_event_mask(group, iter_info, mask, data,
+					 data_type);
+	if (!mask)
 		return 0;
 
 	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
@@ -220,7 +421,15 @@
 			return 0;
 	}
 
-	event = fanotify_alloc_event(group, inode, mask, data);
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		fsid = fanotify_get_fsid(iter_info);
+		/* Racing with mark destruction or creation? */
+		if (!fsid.val[0] && !fsid.val[1])
+			return 0;
+	}
+
+	event = fanotify_alloc_event(group, inode, mask, data, data_type,
+				     &fsid);
 	ret = -ENOMEM;
 	if (unlikely(!event)) {
 		/*
@@ -236,7 +445,7 @@
 	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
 	if (ret) {
 		/* Permission events shouldn't be merged */
-		BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
+		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
 		/* Our event wasn't used in the end. Free it. */
 		fsnotify_destroy_event(group, fsn_event);
 
@@ -244,7 +453,6 @@
 	} else if (fanotify_is_perm_event(mask)) {
 		ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
 					    iter_info);
-		fsnotify_destroy_event(group, fsn_event);
 	}
 finish:
 	if (fanotify_is_perm_event(mask))
@@ -264,12 +472,15 @@
 
 static void fanotify_free_event(struct fsnotify_event *fsn_event)
 {
-	struct fanotify_event_info *event;
+	struct fanotify_event *event;
 
 	event = FANOTIFY_E(fsn_event);
-	path_put(&event->path);
-	put_pid(event->tgid);
-	if (fanotify_is_perm_event(fsn_event->mask)) {
+	if (fanotify_event_has_path(event))
+		path_put(&event->path);
+	else if (fanotify_event_has_ext_fh(event))
+		kfree(event->fid.ext_fh);
+	put_pid(event->pid);
+	if (fanotify_is_perm_event(event->mask)) {
 		kmem_cache_free(fanotify_perm_event_cachep,
 				FANOTIFY_PE(fsn_event));
 		return;
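
Most of the fanotify.c changes above implement the FAN_EVENT_INIT/REPORTED/ANSWERED/CANCELED life cycle for permission events. The userspace half of that handshake looks roughly like this sketch (assumes fan_fd came from fanotify_init(FAN_CLASS_CONTENT, O_RDONLY) with a FAN_OPEN_PERM mark already in place; error handling elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

static void answer_events(int fan_fd)
{
	char buf[4096];
	ssize_t len = read(fan_fd, buf, sizeof(buf));
	struct fanotify_event_metadata *md;

	for (md = (struct fanotify_event_metadata *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		if (md->mask & FAN_OPEN_PERM) {
			struct fanotify_response resp = {
				.fd = md->fd,
				.response = FAN_ALLOW,	/* or FAN_DENY */
			};

			/* this write is what process_access_response() consumes */
			write(fan_fd, &resp, sizeof(resp));
		}
		if (md->fd >= 0)
			close(md->fd);
	}
}
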
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 8609ba0..68b3050 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/path.h>
 #include <linux/slab.h>
+#include <linux/exportfs.h>
 
 extern struct kmem_cache *fanotify_mark_cache;
 extern struct kmem_cache *fanotify_event_cachep;
 extern struct kmem_cache *fanotify_perm_event_cachep;
 
+/* Possible states of the permission event */
+enum {
+	FAN_EVENT_INIT,
+	FAN_EVENT_REPORTED,
+	FAN_EVENT_ANSWERED,
+	FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_event by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN	(3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN	(4 << 2)
+#endif
+
+struct fanotify_fid {
+	__kernel_fsid_t fsid;
+	union {
+		unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+		unsigned char *ext_fh;
+	};
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+				    unsigned int fh_len)
+{
+	return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+				      struct fanotify_fid *fid2,
+				      unsigned int fh_len)
+{
+	return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+		fid1->fsid.val[1] == fid2->fsid.val[1] &&
+		!memcmp(fanotify_fid_fh(fid1, fh_len),
+			fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
 /*
  * Structure for normal fanotify events. It gets allocated in
  * fanotify_handle_event() and freed when the information is retrieved by
  * userspace
  */
-struct fanotify_event_info {
+struct fanotify_event {
 	struct fsnotify_event fse;
+	u32 mask;
 	/*
-	 * We hold ref to this path so it may be dereferenced at any point
-	 * during this object's lifetime
+	 * These fields are outside fanotify_fid to pack fanotify_event nicely
+	 * on 64bit arch and to use fh_type as an indication of whether path
+	 * or fid is used in the union:
+	 * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
 	 */
-	struct path path;
-	struct pid *tgid;
+	u8 fh_type;
+	u8 fh_len;
+	u16 pad;
+	union {
+		/*
+		 * We hold ref to this path so it may be dereferenced at any
+		 * point during this object's lifetime
+		 */
+		struct path path;
+		/*
+		 * With FAN_REPORT_FID, we do not hold any reference on the
+		 * victim object. Instead we store its NFS file handle and its
+		 * filesystem's fsid as a unique identifier.
+		 */
+		struct fanotify_fid fid;
+	};
+	struct pid *pid;
 };
 
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+	return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+	return event->fh_type != FILEID_ROOT &&
+		event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+	return fanotify_event_has_fid(event) &&
+		event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+	return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
 /*
  * Structure for permission fanotify events. It gets allocated and freed in
  * fanotify_handle_event() since we wait there for user response. When the
@@ -29,29 +115,31 @@
  * group->notification_list to group->fanotify_data.access_list to wait for
  * user response.
  */
-struct fanotify_perm_event_info {
-	struct fanotify_event_info fae;
-	int response;	/* userspace answer to question */
+struct fanotify_perm_event {
+	struct fanotify_event fae;
+	unsigned short response;	/* userspace answer to the event */
+	unsigned short state;		/* state of the event */
 	int fd;		/* fd we passed to userspace for this event */
 };
 
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
 FANOTIFY_PE(struct fsnotify_event *fse)
 {
-	return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+	return container_of(fse, struct fanotify_perm_event, fae.fse);
 }
 
 static inline bool fanotify_is_perm_event(u32 mask)
 {
 	return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) &&
-		mask & FAN_ALL_PERM_EVENTS;
+		mask & FANOTIFY_PERM_EVENTS;
 }
 
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
 {
-	return container_of(fse, struct fanotify_event_info, fse);
+	return container_of(fse, struct fanotify_event, fse);
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
-						 struct inode *inode, u32 mask,
-						 const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+					    struct inode *inode, u32 mask,
+					    const void *data, int data_type,
+					    __kernel_fsid_t *fsid);
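
struct fanotify_fid above uses a small-buffer optimization: handles up to FANOTIFY_INLINE_FH_LEN bytes are stored inline, longer ones in a kmalloc'ed buffer, with the length deciding which union member is live. The same pattern in isolation, with hypothetical "blob" names:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

#define BLOB_INLINE_LEN	16

struct blob {
	unsigned int len;
	union {
		unsigned char inline_buf[BLOB_INLINE_LEN];
		unsigned char *ext_buf;
	};
};

static void *blob_data(struct blob *b)
{
	/* small payloads live in the struct; large ones are kmalloc'ed */
	return b->len <= BLOB_INLINE_LEN ? b->inline_buf : b->ext_buf;
}

static int blob_set(struct blob *b, const void *data, unsigned int len)
{
	if (len > BLOB_INLINE_LEN) {
		b->ext_buf = kmalloc(len, GFP_KERNEL);
		if (!b->ext_buf)
			return -ENOMEM;
	}
	b->len = len;
	memcpy(blob_data(b), data, len);
	return 0;
}

static void blob_free(struct blob *b)
{
	/* mirrors the kfree(event->fid.ext_fh) in fanotify_free_event() */
	if (b->len > BLOB_INLINE_LEN)
		kfree(b->ext_buf);
}
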
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6905488..8508ab5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 #include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
 
 #include <asm/ioctls.h>
 
@@ -47,33 +49,55 @@
 struct kmem_cache *fanotify_event_cachep __read_mostly;
 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+	if (!fanotify_event_has_fid(event))
+		return 0;
+
+	return roundup(sizeof(struct fanotify_event_info_fid) +
+		       sizeof(struct file_handle) + event->fh_len,
+		       FANOTIFY_EVENT_ALIGN);
+}
+
 /*
  * Get an fsnotify notification event if one exists and is small
  * enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When a permission event is dequeued, its state is
+ * updated accordingly.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 					    size_t count)
 {
-	assert_spin_locked(&group->notification_lock);
+	size_t event_size = FAN_EVENT_METADATA_LEN;
+	struct fsnotify_event *fsn_event = NULL;
 
 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
+	spin_lock(&group->notification_lock);
 	if (fsnotify_notify_queue_is_empty(group))
-		return NULL;
+		goto out;
 
-	if (FAN_EVENT_METADATA_LEN > count)
-		return ERR_PTR(-EINVAL);
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		event_size += fanotify_event_info_len(
+			FANOTIFY_E(fsnotify_peek_first_event(group)));
+	}
 
-	/* held the notification_lock the whole time, so this is the
-	 * same event we peeked above */
-	return fsnotify_remove_first_event(group);
+	if (event_size > count) {
+		fsn_event = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	fsn_event = fsnotify_remove_first_event(group);
+	if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+		FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+	spin_unlock(&group->notification_lock);
+	return fsn_event;
 }
 
 static int create_fd(struct fsnotify_group *group,
-		     struct fanotify_event_info *event,
+		     struct fanotify_event *event,
 		     struct file **file)
 {
 	int client_fd;
@@ -114,62 +138,32 @@
 	return client_fd;
 }
 
-static int fill_event_metadata(struct fsnotify_group *group,
-			       struct fanotify_event_metadata *metadata,
-			       struct fsnotify_event *fsn_event,
-			       struct file **file)
+/*
+ * Finish processing of permission event by setting it to ANSWERED state and
+ * drop group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+				    struct fanotify_perm_event *event,
+				    unsigned int response)
+				    __releases(&group->notification_lock)
 {
-	int ret = 0;
-	struct fanotify_event_info *event;
+	bool destroy = false;
 
-	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
-		 group, metadata, fsn_event);
-
-	*file = NULL;
-	event = container_of(fsn_event, struct fanotify_event_info, fse);
-	metadata->event_len = FAN_EVENT_METADATA_LEN;
-	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
-	metadata->vers = FANOTIFY_METADATA_VERSION;
-	metadata->reserved = 0;
-	metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
-	metadata->pid = pid_vnr(event->tgid);
-	if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
-		metadata->fd = FAN_NOFD;
-	else {
-		metadata->fd = create_fd(group, event, file);
-		if (metadata->fd < 0)
-			ret = metadata->fd;
-	}
-
-	return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
-				struct fsnotify_group *group, int fd)
-{
-	struct fanotify_perm_event_info *event, *return_e = NULL;
-
-	spin_lock(&group->notification_lock);
-	list_for_each_entry(event, &group->fanotify_data.access_list,
-			    fae.fse.list) {
-		if (event->fd != fd)
-			continue;
-
-		list_del_init(&event->fae.fse.list);
-		return_e = event;
-		break;
-	}
+	assert_spin_locked(&group->notification_lock);
+	event->response = response;
+	if (event->state == FAN_EVENT_CANCELED)
+		destroy = true;
+	else
+		event->state = FAN_EVENT_ANSWERED;
 	spin_unlock(&group->notification_lock);
-
-	pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
-	return return_e;
+	if (destroy)
+		fsnotify_destroy_event(group, &event->fae.fse);
 }
 
 static int process_access_response(struct fsnotify_group *group,
 				   struct fanotify_response *response_struct)
 {
-	struct fanotify_perm_event_info *event;
+	struct fanotify_perm_event *event;
 	int fd = response_struct->fd;
 	int response = response_struct->response;
 
@@ -191,45 +185,128 @@
 	if (fd < 0)
 		return -EINVAL;
 
-	if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
 		return -EINVAL;
 
-	event = dequeue_event(group, fd);
-	if (!event)
-		return -ENOENT;
+	spin_lock(&group->notification_lock);
+	list_for_each_entry(event, &group->fanotify_data.access_list,
+			    fae.fse.list) {
+		if (event->fd != fd)
+			continue;
 
-	event->response = response;
-	wake_up(&group->fanotify_data.access_waitq);
+		list_del_init(&event->fae.fse.list);
+		finish_permission_event(group, event, response);
+		wake_up(&group->fanotify_data.access_waitq);
+		return 0;
+	}
+	spin_unlock(&group->notification_lock);
+
+	return -ENOENT;
+}
+
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+	struct fanotify_event_info_fid info = { };
+	struct file_handle handle = { };
+	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh;
+	size_t fh_len = event->fh_len;
+	size_t len = fanotify_event_info_len(event);
+
+	if (!len)
+		return 0;
+
+	if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+		return -EFAULT;
+
+	/* Copy event info fid header followed by variable sized file handle */
+	info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+	info.hdr.len = len;
+	info.fsid = event->fid.fsid;
+	if (copy_to_user(buf, &info, sizeof(info)))
+		return -EFAULT;
+
+	buf += sizeof(info);
+	len -= sizeof(info);
+	handle.handle_type = event->fh_type;
+	handle.handle_bytes = fh_len;
+	if (copy_to_user(buf, &handle, sizeof(handle)))
+		return -EFAULT;
+
+	buf += sizeof(handle);
+	len -= sizeof(handle);
+	/*
+	 * For an inline fh, copy through stack to exclude the copy from
+	 * usercopy hardening protections.
+	 */
+	fh = fanotify_event_fh(event);
+	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
+		memcpy(bounce, fh, fh_len);
+		fh = bounce;
+	}
+	if (copy_to_user(buf, fh, fh_len))
+		return -EFAULT;
+
+	/* Pad with 0's */
+	buf += fh_len;
+	len -= fh_len;
+	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+	if (len > 0 && clear_user(buf, len))
+		return -EFAULT;
 
 	return 0;
 }
 
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
-				  struct fsnotify_event *event,
-				  char __user *buf)
+				  struct fsnotify_event *fsn_event,
+				  char __user *buf, size_t count)
 {
-	struct fanotify_event_metadata fanotify_event_metadata;
-	struct file *f;
-	int fd, ret;
+	struct fanotify_event_metadata metadata;
+	struct fanotify_event *event;
+	struct file *f = NULL;
+	int ret, fd = FAN_NOFD;
 
-	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
 
-	ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
-	if (ret < 0)
-		return ret;
+	event = container_of(fsn_event, struct fanotify_event, fse);
+	metadata.event_len = FAN_EVENT_METADATA_LEN;
+	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+	metadata.vers = FANOTIFY_METADATA_VERSION;
+	metadata.reserved = 0;
+	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+	metadata.pid = pid_vnr(event->pid);
 
-	fd = fanotify_event_metadata.fd;
+	if (fanotify_event_has_path(event)) {
+		fd = create_fd(group, event, &f);
+		if (fd < 0)
+			return fd;
+	} else if (fanotify_event_has_fid(event)) {
+		metadata.event_len += fanotify_event_info_len(event);
+	}
+	metadata.fd = fd;
+
 	ret = -EFAULT;
-	if (copy_to_user(buf, &fanotify_event_metadata,
-			 fanotify_event_metadata.event_len))
+	/*
+	 * Sanity check copy size in case the event_len computed by
+	 * get_one_event() and the one filled in here ever get out of sync.
+	 */
+	if (WARN_ON_ONCE(metadata.event_len > count))
+		goto out_close_fd;
+
+	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
 		goto out_close_fd;
 
 	if (fanotify_is_perm_event(event->mask))
-		FANOTIFY_PE(event)->fd = fd;
+		FANOTIFY_PE(fsn_event)->fd = fd;
 
-	if (fd != FAN_NOFD)
+	if (fanotify_event_has_path(event)) {
 		fd_install(fd, f);
-	return fanotify_event_metadata.event_len;
+	} else if (fanotify_event_has_fid(event)) {
+		ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+		if (ret < 0)
+			return ret;
+	}
+
+	return metadata.event_len;
 
 out_close_fd:
 	if (fd != FAN_NOFD) {
@@ -270,10 +347,7 @@
 
 	add_wait_queue(&group->notification_waitq, &wait);
 	while (1) {
-		spin_lock(&group->notification_lock);
 		kevent = get_one_event(group, count);
-		spin_unlock(&group->notification_lock);
-
 		if (IS_ERR(kevent)) {
 			ret = PTR_ERR(kevent);
 			break;
@@ -295,7 +369,7 @@
 			continue;
 		}
 
-		ret = copy_event_to_user(group, kevent, buf);
+		ret = copy_event_to_user(group, kevent, buf, count);
 		if (unlikely(ret == -EOPENSTALE)) {
 			/*
 			 * We cannot report events with stale fd so drop it.
@@ -310,11 +384,13 @@
 		 * Permission events get queued to wait for response.  Other
 		 * events can be destroyed now.
 		 */
-		if (!fanotify_is_perm_event(kevent->mask)) {
+		if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
 			fsnotify_destroy_event(group, kevent);
 		} else {
 			if (ret <= 0) {
-				FANOTIFY_PE(kevent)->response = FAN_DENY;
+				spin_lock(&group->notification_lock);
+				finish_permission_event(group,
+					FANOTIFY_PE(kevent), FAN_DENY);
 				wake_up(&group->fanotify_data.access_waitq);
 			} else {
 				spin_lock(&group->notification_lock);
@@ -364,7 +440,7 @@
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
-	struct fanotify_perm_event_info *event, *next;
+	struct fanotify_perm_event *event;
 	struct fsnotify_event *fsn_event;
 
 	/*
@@ -379,13 +455,12 @@
 	 * and simulate reply from userspace.
 	 */
 	spin_lock(&group->notification_lock);
-	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
-				 fae.fse.list) {
-		pr_debug("%s: found group=%p event=%p\n", __func__, group,
-			 event);
-
+	while (!list_empty(&group->fanotify_data.access_list)) {
+		event = list_first_entry(&group->fanotify_data.access_list,
+				struct fanotify_perm_event, fae.fse.list);
 		list_del_init(&event->fae.fse.list);
-		event->response = FAN_ALLOW;
+		finish_permission_event(group, event, FAN_ALLOW);
+		spin_lock(&group->notification_lock);
 	}
 
 	/*
@@ -395,13 +470,14 @@
 	 */
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		fsn_event = fsnotify_remove_first_event(group);
-		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+		if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
 			spin_unlock(&group->notification_lock);
 			fsnotify_destroy_event(group, fsn_event);
-			spin_lock(&group->notification_lock);
 		} else {
-			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+			finish_permission_event(group, FANOTIFY_PE(fsn_event),
+						FAN_ALLOW);
 		}
+		spin_lock(&group->notification_lock);
 	}
 	spin_unlock(&group->notification_lock);
 
@@ -452,7 +528,8 @@
 };
 
 static int fanotify_find_path(int dfd, const char __user *filename,
-			      struct path *path, unsigned int flags)
+			      struct path *path, unsigned int flags, __u64 mask,
+			      unsigned int obj_type)
 {
 	int ret;
 
@@ -491,8 +568,15 @@
 
 	/* you can only watch an inode if you have read permissions on it */
 	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret) {
+		path_put(path);
+		goto out;
+	}
+
+	ret = security_path_notify(path, mask, obj_type);
 	if (ret)
 		path_put(path);
+
 out:
 	return ret;
 }
@@ -506,18 +590,10 @@
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
-		__u32 tmask = fsn_mark->mask & ~mask;
-
-		if (flags & FAN_MARK_ONDIR)
-			tmask &= ~FAN_ONDIR;
-
 		oldmask = fsn_mark->mask;
-		fsn_mark->mask = tmask;
+		fsn_mark->mask &= ~mask;
 	} else {
-		__u32 tmask = fsn_mark->ignored_mask & ~mask;
-		if (flags & FAN_MARK_ONDIR)
-			tmask &= ~FAN_ONDIR;
-		fsn_mark->ignored_mask = tmask;
+		fsn_mark->ignored_mask &= ~mask;
 	}
 	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
 	spin_unlock(&fsn_mark->lock);
@@ -563,6 +639,13 @@
 				    mask, flags);
 }
 
+static int fanotify_remove_sb_mark(struct fsnotify_group *group,
+				      struct super_block *sb, __u32 mask,
+				      unsigned int flags)
+{
+	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags);
+}
+
 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 				      struct inode *inode, __u32 mask,
 				      unsigned int flags)
@@ -579,19 +662,10 @@
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
-		__u32 tmask = fsn_mark->mask | mask;
-
-		if (flags & FAN_MARK_ONDIR)
-			tmask |= FAN_ONDIR;
-
 		oldmask = fsn_mark->mask;
-		fsn_mark->mask = tmask;
+		fsn_mark->mask |= mask;
 	} else {
-		__u32 tmask = fsn_mark->ignored_mask | mask;
-		if (flags & FAN_MARK_ONDIR)
-			tmask |= FAN_ONDIR;
-
-		fsn_mark->ignored_mask = tmask;
+		fsn_mark->ignored_mask |= mask;
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
@@ -602,7 +676,8 @@
 
 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   fsnotify_connp_t *connp,
-						   unsigned int type)
+						   unsigned int type,
+						   __kernel_fsid_t *fsid)
 {
 	struct fsnotify_mark *mark;
 	int ret;
@@ -615,7 +690,7 @@
 		return ERR_PTR(-ENOMEM);
 
 	fsnotify_init_mark(mark, group);
-	ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
 	if (ret) {
 		fsnotify_put_mark(mark);
 		return ERR_PTR(ret);
@@ -627,7 +702,8 @@
 
 static int fanotify_add_mark(struct fsnotify_group *group,
 			     fsnotify_connp_t *connp, unsigned int type,
-			     __u32 mask, unsigned int flags)
+			     __u32 mask, unsigned int flags,
+			     __kernel_fsid_t *fsid)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
@@ -635,7 +711,7 @@
 	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_mark(connp, group);
 	if (!fsn_mark) {
-		fsn_mark = fanotify_add_new_mark(group, connp, type);
+		fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
 		if (IS_ERR(fsn_mark)) {
 			mutex_unlock(&group->mark_mutex);
 			return PTR_ERR(fsn_mark);
@@ -652,15 +728,23 @@
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
-				      unsigned int flags)
+				      unsigned int flags, __kernel_fsid_t *fsid)
 {
 	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
+}
+
+static int fanotify_add_sb_mark(struct fsnotify_group *group,
+				struct super_block *sb, __u32 mask,
+				unsigned int flags, __kernel_fsid_t *fsid)
+{
+	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
+				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
 				   struct inode *inode, __u32 mask,
-				   unsigned int flags)
+				   unsigned int flags, __kernel_fsid_t *fsid)
 {
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -671,11 +755,11 @@
 	 */
 	if ((flags & FAN_MARK_IGNORED_MASK) &&
 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
-	    (atomic_read(&inode->i_writecount) > 0))
+	    inode_is_open_for_write(inode))
 		return 0;
 
 	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
 }
 
 /* fanotify syscalls */
@@ -684,18 +768,18 @@
 	struct fsnotify_group *group;
 	int f_flags, fd;
 	struct user_struct *user;
-	struct fanotify_event_info *oevent;
+	struct fanotify_event *oevent;
 
-	pr_debug("%s: flags=%d event_f_flags=%d\n",
-		__func__, flags, event_f_flags);
+	pr_debug("%s: flags=%x event_f_flags=%x\n",
+		 __func__, flags, event_f_flags);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 #ifdef CONFIG_AUDITSYSCALL
-	if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
 #else
-	if (flags & ~FAN_ALL_INIT_FLAGS)
+	if (flags & ~FANOTIFY_INIT_FLAGS)
 #endif
 		return -EINVAL;
 
@@ -711,6 +795,10 @@
 		return -EINVAL;
 	}
 
+	if ((flags & FAN_REPORT_FID) &&
+	    (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+		return -EINVAL;
+
 	user = get_current_user();
 	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
 		free_uid(user);
@@ -731,10 +819,12 @@
 	}
 
 	group->fanotify_data.user = user;
+	group->fanotify_data.flags = flags;
 	atomic_inc(&user->fanotify_listeners);
 	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
-	oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+	oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+				      FSNOTIFY_EVENT_NONE, NULL);
 	if (unlikely(!oevent)) {
 		fd = -ENOMEM;
 		goto out_destroy_group;
@@ -746,7 +836,7 @@
 	group->fanotify_data.f_flags = event_f_flags;
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
-	switch (flags & FAN_ALL_CLASS_BITS) {
+	switch (flags & FANOTIFY_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
 		group->priority = FS_PRIO_0;
 		break;
@@ -783,7 +873,6 @@
 		fd = -EPERM;
 		if (!capable(CAP_AUDIT_WRITE))
 			goto out_destroy_group;
-		group->fanotify_data.audit = true;
 	}
 
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
@@ -797,6 +886,64 @@
 	return fd;
 }
 
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+	__kernel_fsid_t root_fsid;
+	int err;
+
+	/*
+	 * Make sure path is not in a filesystem with zero fsid (e.g. tmpfs).
+	 */
+	err = vfs_get_fsid(path->dentry, fsid);
+	if (err)
+		return err;
+
+	if (!fsid->val[0] && !fsid->val[1])
+		return -ENODEV;
+
+	/*
+	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+	 * which uses a different fsid than sb root.
+	 */
+	err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+	if (err)
+		return err;
+
+	if (root_fsid.val[0] != fsid->val[0] ||
+	    root_fsid.val[1] != fsid->val[1])
+		return -EXDEV;
+
+	/*
+	 * We need to make sure that the file system supports at least
+	 * encoding a file handle so user can use name_to_handle_at() to
+	 * compare fid returned with event to the file handle of watched
+	 * objects. However, name_to_handle_at() requires that the
+	 * filesystem also supports decoding file handles.
+	 */
+	if (!path->dentry->d_sb->s_export_op ||
+	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static int fanotify_events_supported(struct path *path, __u64 mask)
+{
+	/*
+	 * Some filesystems such as 'proc' acquire unusual locks when opening
+	 * files. For them fanotify permission events have high chances of
+	 * deadlocking the system - open done when reporting fanotify event
+	 * blocks on this "unusual" lock while another process holding the lock
+	 * waits for fanotify permission event to be answered. Just disallow
+	 * permission events for such filesystems.
+	 */
+	if (mask & FANOTIFY_PERM_EVENTS &&
+	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
+		return -EINVAL;
+	return 0;
+}
+
 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 			    int dfd, const char  __user *pathname)
 {
@@ -805,7 +952,10 @@
 	struct fsnotify_group *group;
 	struct fd f;
 	struct path path;
-	u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD;
+	__kernel_fsid_t __fsid, *fsid = NULL;
+	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
+	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+	unsigned int obj_type;
 	int ret;
 
 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
@@ -815,8 +965,23 @@
 	if (mask & ((__u64)0xffffffff << 32))
 		return -EINVAL;
 
-	if (flags & ~FAN_ALL_MARK_FLAGS)
+	if (flags & ~FANOTIFY_MARK_FLAGS)
 		return -EINVAL;
+
+	switch (mark_type) {
+	case FAN_MARK_INODE:
+		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
+		break;
+	case FAN_MARK_MOUNT:
+		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+		break;
+	case FAN_MARK_FILESYSTEM:
+		obj_type = FSNOTIFY_OBJ_TYPE_SB;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
 	case FAN_MARK_ADD:		/* fallthrough */
 	case FAN_MARK_REMOVE:
@@ -824,20 +989,15 @@
 			return -EINVAL;
 		break;
 	case FAN_MARK_FLUSH:
-		if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
+		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
 			return -EINVAL;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (mask & FAN_ONDIR) {
-		flags |= FAN_MARK_ONDIR;
-		mask &= ~FAN_ONDIR;
-	}
-
 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
-		valid_mask |= FAN_ALL_PERM_EVENTS;
+		valid_mask |= FANOTIFY_PERM_EVENTS;
 
 	if (mask & ~valid_mask)
 		return -EINVAL;
@@ -857,25 +1017,54 @@
 	 * allowed to set permissions events.
 	 */
 	ret = -EINVAL;
-	if (mask & FAN_ALL_PERM_EVENTS &&
+	if (mask & FANOTIFY_PERM_EVENTS &&
 	    group->priority == FS_PRIO_0)
 		goto fput_and_out;
 
+	/*
+	 * Events with data type inode do not carry enough information to report
+	 * event->fd, so we do not allow setting a mask for inode events unless
+	 * the group supports reporting fid.
+	 * Inode events are not supported on a mount mark, because they do not
+	 * carry enough information (i.e. path) to be filtered by mount point.
+	 */
+	if (mask & FANOTIFY_INODE_EVENTS &&
+	    (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+	     mark_type == FAN_MARK_MOUNT))
+		goto fput_and_out;
+
 	if (flags & FAN_MARK_FLUSH) {
 		ret = 0;
-		if (flags & FAN_MARK_MOUNT)
+		if (mark_type == FAN_MARK_MOUNT)
 			fsnotify_clear_vfsmount_marks_by_group(group);
+		else if (mark_type == FAN_MARK_FILESYSTEM)
+			fsnotify_clear_sb_marks_by_group(group);
 		else
 			fsnotify_clear_inode_marks_by_group(group);
 		goto fput_and_out;
 	}
 
-	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	ret = fanotify_find_path(dfd, pathname, &path, flags,
+			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
 	if (ret)
 		goto fput_and_out;
 
+	if (flags & FAN_MARK_ADD) {
+		ret = fanotify_events_supported(&path, mask);
+		if (ret)
+			goto path_put_and_out;
+	}
+
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		ret = fanotify_test_fid(&path, &__fsid);
+		if (ret)
+			goto path_put_and_out;
+
+		fsid = &__fsid;
+	}
+
 	/* inode held in place by reference to path; group by fget on fd */
-	if (!(flags & FAN_MARK_MOUNT))
+	if (mark_type == FAN_MARK_INODE)
 		inode = path.dentry->d_inode;
 	else
 		mnt = path.mnt;
@@ -883,21 +1072,32 @@
 	/* create/update an inode mark */
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
 	case FAN_MARK_ADD:
-		if (flags & FAN_MARK_MOUNT)
-			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+		if (mark_type == FAN_MARK_MOUNT)
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+							 flags, fsid);
+		else if (mark_type == FAN_MARK_FILESYSTEM)
+			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+						   flags, fsid);
 		else
-			ret = fanotify_add_inode_mark(group, inode, mask, flags);
+			ret = fanotify_add_inode_mark(group, inode, mask,
+						      flags, fsid);
 		break;
 	case FAN_MARK_REMOVE:
-		if (flags & FAN_MARK_MOUNT)
-			ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+		if (mark_type == FAN_MARK_MOUNT)
+			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+							    flags);
+		else if (mark_type == FAN_MARK_FILESYSTEM)
+			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+						      flags);
 		else
-			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+			ret = fanotify_remove_inode_mark(group, inode, mask,
+							 flags);
 		break;
 	default:
 		ret = -EINVAL;
 	}
 
+path_put_and_out:
 	path_put(&path);
 fput_and_out:
 	fdput(f);
@@ -934,12 +1134,15 @@
  */
 static int __init fanotify_user_setup(void)
 {
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
 					 SLAB_PANIC|SLAB_ACCOUNT);
-	fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+	fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
 		fanotify_perm_event_cachep =
-			KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
 	}
 
 	return 0;
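
copy_fid_to_user() above emits a fanotify_event_info_fid record followed by a struct file_handle and the raw handle bytes. A hedged userspace sketch of consuming such events, comparing the reported handle against one from name_to_handle_at() as the fanotify_test_fid() comment suggests ("/mnt" and the 128-byte handle buffer are arbitrary; error handling elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	int fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
	char buf[4096], handle_buf[sizeof(struct file_handle) + 128];
	struct file_handle *watched = (struct file_handle *)handle_buf;
	struct fanotify_event_metadata *md;
	int mount_id;
	ssize_t len;

	watched->handle_bytes = 128;
	name_to_handle_at(AT_FDCWD, "/mnt", watched, &mount_id, 0);
	fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
		      FAN_CREATE | FAN_DELETE | FAN_ONDIR, AT_FDCWD, "/mnt");

	len = read(fan_fd, buf, sizeof(buf));
	for (md = (struct fanotify_event_metadata *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		struct fanotify_event_info_fid *fid;
		struct file_handle *fh;

		if (md->event_len <= FAN_EVENT_METADATA_LEN)
			continue;	/* no info record attached */
		fid = (struct fanotify_event_info_fid *)(md + 1);
		if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
			continue;
		/* same layout copy_fid_to_user() produced above */
		fh = (struct file_handle *)fid->handle;
		if (fh->handle_bytes == watched->handle_bytes &&
		    !memcmp(fh->f_handle, watched->f_handle, fh->handle_bytes))
			printf("event 0x%llx on the watched dir itself\n",
			       (unsigned long long)md->mask);
	}
	return 0;
}
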
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 86fcf58..1e2bfd2 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -15,6 +15,7 @@
 #include <linux/exportfs.h>
 
 #include "inotify/inotify.h"
+#include "fdinfo.h"
 #include "fsnotify.h"
 
 #if defined(CONFIG_PROC_FS)
@@ -131,37 +132,20 @@
 
 		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
 			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+	} else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
+		struct super_block *sb = fsnotify_conn_sb(mark->connector);
+
+		seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
+			   sb->s_dev, mflags, mark->mask, mark->ignored_mask);
 	}
 }
 
 void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct fsnotify_group *group = f->private_data;
-	unsigned int flags = 0;
-
-	switch (group->priority) {
-	case FS_PRIO_0:
-		flags |= FAN_CLASS_NOTIF;
-		break;
-	case FS_PRIO_1:
-		flags |= FAN_CLASS_CONTENT;
-		break;
-	case FS_PRIO_2:
-		flags |= FAN_CLASS_PRE_CONTENT;
-		break;
-	}
-
-	if (group->max_events == UINT_MAX)
-		flags |= FAN_UNLIMITED_QUEUE;
-
-	if (group->fanotify_data.max_marks == UINT_MAX)
-		flags |= FAN_UNLIMITED_MARKS;
-
-	if (group->fanotify_data.audit)
-		flags |= FAN_ENABLE_AUDIT;
 
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
-		   flags, group->fanotify_data.f_flags);
+		   group->fanotify_data.flags, group->fanotify_data.f_flags);
 
 	show_fdinfo(m, f, fanotify_fdinfo);
 }
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 170a733..2ecef61 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/dcache.h>
@@ -48,7 +35,7 @@
  * Called during unmount with no locks held, so needs to be safe against
  * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
  */
-void fsnotify_unmount_inodes(struct super_block *sb)
+static void fsnotify_unmount_inodes(struct super_block *sb)
 {
 	struct inode *inode, *iput_inode = NULL;
 
@@ -101,6 +88,12 @@
 		       !atomic_long_read(&sb->s_fsnotify_inode_refs));
 }
 
+void fsnotify_sb_delete(struct super_block *sb)
+{
+	fsnotify_unmount_inodes(sb);
+	fsnotify_clear_marks_by_sb(sb);
+}
+
 /*
  * Given an inode, first check if we care what happens to our children.  Inotify
  * and dnotify both tell their parents about events.  If we care about any event
@@ -173,10 +166,10 @@
 		take_dentry_name_snapshot(&name, dentry);
 		if (path)
 			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
-				       name.name, 0);
+				       &name.name, 0);
 		else
 			ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-				       name.name, 0);
+				       &name.name, 0);
 		release_dentry_name_snapshot(&name);
 	}
 
@@ -189,7 +182,7 @@
 static int send_to_group(struct inode *to_tell,
 			 __u32 mask, const void *data,
 			 int data_is, u32 cookie,
-			 const unsigned char *file_name,
+			 const struct qstr *file_name,
 			 struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_group *group = NULL;
@@ -319,21 +312,22 @@
  * notification event in whatever means they feel necessary.
  */
 int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
-	     const unsigned char *file_name, u32 cookie)
+	     const struct qstr *file_name, u32 cookie)
 {
 	struct fsnotify_iter_info iter_info = {};
-	struct mount *mnt;
+	struct super_block *sb = to_tell->i_sb;
+	struct mount *mnt = NULL;
+	__u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
 	int ret = 0;
 	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
 
-	if (data_is == FSNOTIFY_EVENT_PATH)
+	if (data_is == FSNOTIFY_EVENT_PATH) {
 		mnt = real_mount(((const struct path *)data)->mnt);
-	else
-		mnt = NULL;
-
-	/* An event "on child" is not intended for a mount mark */
+		mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
+	}
+	/* An event "on child" is not intended for a mount/sb mark */
 	if (mask & FS_EVENT_ON_CHILD)
-		mnt = NULL;
+		mnt_or_sb_mask = 0;
 
 	/*
 	 * Optimization: srcu_read_lock() has a memory barrier which can
@@ -342,31 +336,32 @@
 	 * SRCU because we have no references to any objects and do not
 	 * need SRCU to keep them "alive".
 	 */
-	if (!to_tell->i_fsnotify_marks &&
+	if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
 	    (!mnt || !mnt->mnt_fsnotify_marks))
 		return 0;
 	/*
 	 * if this is a modify event we may need to clear the ignored masks
-	 * otherwise return if neither the inode nor the vfsmount care about
+	 * otherwise return if neither the inode nor the vfsmount/sb care about
 	 * this type of event.
 	 */
 	if (!(mask & FS_MODIFY) &&
-	    !(test_mask & to_tell->i_fsnotify_mask) &&
-	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+	    !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask)))
 		return 0;
 
 	iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
 
 	iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
 		fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+	iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+		fsnotify_first_mark(&sb->s_fsnotify_marks);
 	if (mnt) {
 		iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
 			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
 	}
 
 	/*
-	 * We need to merge inode & vfsmount mark lists so that inode mark
-	 * ignore masks are properly reflected for mount mark notifications.
+	 * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
+	 * ignore masks are properly reflected for mount/sb mark notifications.
 	 * That's why this traversal is so complicated...
 	 */
 	while (fsnotify_iter_select_report_types(&iter_info)) {
@@ -392,7 +387,7 @@
 {
 	int ret;
 
-	BUG_ON(hweight32(ALL_FSNOTIFY_BITS) != 23);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
 
 	ret = init_srcu_struct(&fsnotify_mark_srcu);
 	if (ret)
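
Note on the interface change above: fsnotify() and the fsnotify_parent() path now hand the child name around as a struct qstr (pointer plus cached length), taken from a dentry name snapshot so the name cannot change underneath the event. A minimal sketch of a post-change caller; the mask, data type, and the direct use of dentry->d_name here are illustrative assumptions, not code from this patch:

	/* Sketch only: report a modify event against a watched inode. */
	static void example_report(struct dentry *dentry, const struct path *path)
	{
		struct inode *inode = d_inode(dentry);

		/* &dentry->d_name is a struct qstr, so handlers can read
		 * file_name->len instead of calling strlen(). */
		fsnotify(inode, FS_MODIFY, path, FSNOTIFY_EVENT_PATH,
			 &dentry->d_name, 0);
	}
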
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 7902653..f346282 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -21,6 +21,12 @@
 	return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
 }
 
+static inline struct super_block *fsnotify_conn_sb(
+				struct fsnotify_mark_connector *conn)
+{
+	return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+}
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -43,8 +49,11 @@
 {
 	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
 }
-/* Wait until all marks queued for destruction are destroyed */
-extern void fsnotify_wait_marks_destroyed(void);
+/* run the list of all marks associated with sb and destroy them */
+static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
+{
+	fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+}
 
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
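
With fsnotify_unmount_inodes() now static, fsnotify_sb_delete() is the single entry point for notification teardown at unmount: it evicts watched inodes first, then destroys every mark attached to the superblock itself via the fsnotify_clear_marks_by_sb() helper above. A sketch of the expected call site; placing it in the generic superblock shutdown path is an assumption from context, not shown in this diff:

	/* Sketch: superblock teardown orders the dcache shrink before the
	 * notification cleanup. The function shown is hypothetical. */
	static void example_shutdown_super(struct super_block *sb)
	{
		shrink_dcache_for_umount(sb);
		fsnotify_sb_delete(sb);		/* inodes first, then sb marks */
	}
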
diff --git a/fs/notify/group.c b/fs/notify/group.c
index c03b836..133f723 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/list.h>
@@ -121,6 +108,7 @@
 	if (refcount_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -150,6 +138,7 @@
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
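
The two EXPORT_SYMBOL_GPL additions open the group API to modules. A minimal sketch of module-side usage, assuming a caller-supplied fsnotify_ops table (my_ops and its handlers are placeholders, not defined by this patch):

	/* Sketch: allocate a notification group, use it, drop it. */
	static const struct fsnotify_ops my_ops = {
		.handle_event	= my_handle_event,	/* hypothetical */
		.free_event	= my_free_event,	/* hypothetical */
	};

	static int example_group_use(void)
	{
		struct fsnotify_group *group = fsnotify_alloc_group(&my_ops);

		if (IS_ERR(group))
			return PTR_ERR(group);
		/* ... add marks with fsnotify_add_mark(), consume events ... */
		fsnotify_put_group(group);	/* drops the creation reference */
		return 0;
	}
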
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b981fc0..6736e47 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,6 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INOTIFY_USER
 	bool "Inotify support for userspace"
-	select ANON_INODES
 	select FSNOTIFY
 	default y
 	---help---
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index a380dab..812237e 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d..3f246f7 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
 
 struct inotify_event_info {
 	struct fsnotify_event fse;
+	u32 mask;
 	int wd;
 	u32 sync_cookie;
 	int name_len;
@@ -26,7 +27,7 @@
 extern int inotify_handle_event(struct fsnotify_group *group,
 				struct inode *inode,
 				u32 mask, const void *data, int data_type,
-				const unsigned char *file_name, u32 cookie,
+				const struct qstr *file_name, u32 cookie,
 				struct fsnotify_iter_info *iter_info);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4..d510223 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * fs/inotify_user.c - inotify support for userspace
  *
@@ -10,16 +11,6 @@
  *
  * Copyright (C) 2009 Eric Paris <Red Hat Inc>
  * inotify was largely rewritten to make use of the fsnotify infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/dcache.h> /* d_unlinked */
@@ -43,11 +34,11 @@
 {
 	struct inotify_event_info *old, *new;
 
-	if (old_fsn->mask & FS_IN_IGNORED)
-		return false;
 	old = INOTIFY_E(old_fsn);
 	new = INOTIFY_E(new_fsn);
-	if ((old_fsn->mask == new_fsn->mask) &&
+	if (old->mask & FS_IN_IGNORED)
+		return false;
+	if ((old->mask == new->mask) &&
 	    (old_fsn->inode == new_fsn->inode) &&
 	    (old->name_len == new->name_len) &&
 	    (!old->name_len || !strcmp(old->name, new->name)))
@@ -67,7 +58,7 @@
 int inotify_handle_event(struct fsnotify_group *group,
 			 struct inode *inode,
 			 u32 mask, const void *data, int data_type,
-			 const unsigned char *file_name, u32 cookie,
+			 const struct qstr *file_name, u32 cookie,
 			 struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
@@ -89,7 +80,7 @@
 			return 0;
 	}
 	if (file_name) {
-		len = strlen(file_name);
+		len = file_name->len;
 		alloc_len += len + 1;
 	}
 
@@ -99,9 +90,13 @@
 	i_mark = container_of(inode_mark, struct inotify_inode_mark,
 			      fsn_mark);
 
-	/* Whoever is interested in the event, pays for the allocation. */
+	/*
+	 * Whoever is interested in the event, pays for the allocation. Do not
+	 * trigger OOM killer in the target monitoring memcg as it may have
+	 * security repercussion.
+	 */
 	memalloc_use_memcg(group->memcg);
-	event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT);
+	event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 	memalloc_unuse_memcg();
 
 	if (unlikely(!event)) {
@@ -113,13 +108,23 @@
 		return -ENOMEM;
 	}
 
+	/*
+	 * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
+	 * for fanotify. inotify never reported IN_ISDIR with those events.
+	 * It looks like an oversight, but to avoid the risk of breaking
+	 * existing inotify programs, mask the flag out from those events.
+	 */
+	if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+		mask &= ~IN_ISDIR;
+
 	fsn_event = &event->fse;
-	fsnotify_init_event(fsn_event, inode, mask);
+	fsnotify_init_event(fsn_event, inode);
+	event->mask = mask;
 	event->wd = i_mark->wd;
 	event->sync_cookie = cookie;
 	event->name_len = len;
 	if (len)
-		strcpy(event->name, file_name);
+		strcpy(event->name, file_name->name);
 
 	ret = fsnotify_add_event(group, fsn_event, inotify_merge);
 	if (ret) {
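
Two things to note in this hunk. First, struct fsnotify_event no longer carries a mask, so inotify keeps a private copy in inotify_event_info and every consumer goes through the container:

	/* Sketch: recover the inotify-private mask from a generic event
	 * via the INOTIFY_E() container_of helper from inotify.h. */
	static u32 example_event_mask(struct fsnotify_event *fsn_event)
	{
		return INOTIFY_E(fsn_event)->mask;	/* was fsn_event->mask */
	}

Second, the allocation now charges the listener's memcg and adds __GFP_RETRY_MAYFAIL, so a group that exhausts its cgroup budget gets a queue-overflow event instead of invoking the OOM killer in the watched task's context.
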
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ac6978d..107537a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * fs/inotify_user.c - inotify support for userspace
  *
@@ -10,16 +11,6 @@
  *
  * Copyright (C) 2009 Eric Paris <Red Hat Inc>
  * inotify was largely rewritten to make use of the fsnotify infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/file.h>
@@ -39,6 +30,7 @@
 #include <linux/poll.h>
 #include <linux/wait.h>
 #include <linux/memcontrol.h>
+#include <linux/security.h>
 
 #include "inotify.h"
 #include "../fdinfo.h"
@@ -54,8 +46,6 @@
 
 #include <linux/sysctl.h>
 
-static int zero;
-
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
@@ -63,7 +53,7 @@
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_user_watches",
@@ -71,7 +61,7 @@
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.extra1		= SYSCTL_ZERO,
 	},
 	{
 		.procname	= "max_queued_events",
@@ -79,7 +69,7 @@
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
+		.extra1		= SYSCTL_ZERO
 	},
 	{ }
 };
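
SYSCTL_ZERO (with its siblings in <linux/sysctl.h>) replaces the per-file "static int zero" pattern, so bounds live in shared read-only storage instead of a writable variable per file. The shape of an entry after the conversion, using a hypothetical knob:

	/* Sketch: a minimal ctl_table entry using the shared bound.
	 * example_value is a placeholder, not from this patch. */
	static int example_value;

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_value",
			.data		= &example_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= SYSCTL_ZERO,	/* shared constant */
		},
		{ }
	};
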
@@ -189,7 +179,7 @@
 	 */
 	pad_name_len = round_event_name_len(fsn_event);
 	inotify_event.len = pad_name_len;
-	inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+	inotify_event.mask = inotify_mask_to_arg(event->mask);
 	inotify_event.wd = event->wd;
 	inotify_event.cookie = event->sync_cookie;
 
@@ -342,7 +332,8 @@
 /*
  * find_inode - resolve a user-given path to a specific inode
  */
-static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+static int inotify_find_inode(const char __user *dirname, struct path *path,
+						unsigned int flags, __u64 mask)
 {
 	int error;
 
@@ -351,8 +342,15 @@
 		return error;
 	/* you can only watch an inode if you have read permissions on it */
 	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error) {
+		path_put(path);
+		return error;
+	}
+	error = security_path_notify(path, mask,
+				FSNOTIFY_OBJ_TYPE_INODE);
 	if (error)
 		path_put(path);
+
 	return error;
 }
 
@@ -519,8 +517,10 @@
 	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark)
 		return -ENOENT;
-	else if (create)
-		return -EEXIST;
+	else if (create) {
+		ret = -EEXIST;
+		goto out;
+	}
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
@@ -548,6 +548,7 @@
 	/* return the wd */
 	ret = i_mark->wd;
 
+out:
 	/* match the get from fsnotify_find_mark() */
 	fsnotify_put_mark(fsn_mark);
 
@@ -634,7 +635,8 @@
 		return ERR_PTR(-ENOMEM);
 	}
 	group->overflow_event = &oevent->fse;
-	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	fsnotify_init_event(group->overflow_event, NULL);
+	oevent->mask = FS_Q_OVERFLOW;
 	oevent->wd = -1;
 	oevent->sync_cookie = 0;
 	oevent->name_len = 0;
@@ -724,8 +726,10 @@
 		return -EBADF;
 
 	/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
-	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
-		return -EINVAL;
+	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
 
 	/* verify that this is indeed an inotify instance */
 	if (unlikely(f.file->f_op != &inotify_fops)) {
@@ -738,7 +742,8 @@
 	if (mask & IN_ONLYDIR)
 		flags |= LOOKUP_DIRECTORY;
 
-	ret = inotify_find_inode(pathname, &path, flags);
+	ret = inotify_find_inode(pathname, &path, flags,
+			(mask & IN_ALL_EVENTS));
 	if (ret)
 		goto fput_and_out;
 
@@ -815,7 +820,7 @@
 	BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
 	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
 
-	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
+	BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
 					       SLAB_PANIC|SLAB_ACCOUNT);
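
The IN_MASK_ADD/IN_MASK_CREATE rejection in inotify_add_watch() sits after fdget(), so the old early return leaked a file reference; routing it through fput_and_out closes that leak. The corrected error-path shape in isolation:

	/* Sketch: every failure after fdget() must reach fdput(). */
	struct fd f = fdget(fd);

	if (!f.file)
		return -EBADF;
	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
		ret = -EINVAL;
		goto fput_and_out;	/* not a bare return */
	}
	/* ... */
fput_and_out:
	fdput(f);
	return ret;

This file also threads the requested event mask (mask & IN_ALL_EVENTS) down into inotify_find_inode(), so the new security_path_notify() LSM hook can see which events a watch is asking for before the mark is placed.
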
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 09535f6..1d96216 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -82,6 +69,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
+#include <linux/ratelimit.h>
 
 #include <linux/atomic.h>
 
@@ -115,6 +103,8 @@
 		return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
 	else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
 		return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
+	else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
+		return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
 	return NULL;
 }
 
@@ -195,6 +185,8 @@
 		atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
 	} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
 		fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
+	} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
+		fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
 	}
 
 	rcu_assign_pointer(*(conn->obj), NULL);
@@ -234,13 +226,13 @@
 
 void fsnotify_put_mark(struct fsnotify_mark *mark)
 {
-	struct fsnotify_mark_connector *conn;
+	struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
 	void *objp = NULL;
 	unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
 	bool free_conn = false;
 
 	/* Catch marks that were actually never attached to object */
-	if (!mark->connector) {
+	if (!conn) {
 		if (refcount_dec_and_test(&mark->refcnt))
 			fsnotify_final_mark_destroy(mark);
 		return;
@@ -250,10 +242,9 @@
 	 * We have to be careful so that traversals of obj_list under lock can
 	 * safely grab mark reference.
 	 */
-	if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+	if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
 		return;
 
-	conn = mark->connector;
 	hlist_del_init_rcu(&mark->obj_list);
 	if (hlist_empty(&conn->list)) {
 		objp = fsnotify_detach_connector_from_object(conn, &type);
@@ -261,7 +252,7 @@
 	} else {
 		__fsnotify_recalc_mask(conn);
 	}
-	mark->connector = NULL;
+	WRITE_ONCE(mark->connector, NULL);
 	spin_unlock(&conn->lock);
 
 	fsnotify_drop_object(type, objp);
@@ -285,6 +276,7 @@
 	queue_delayed_work(system_unbound_wq, &reaper_work,
 			   FSNOTIFY_REAPER_DELAY);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /*
  * Get mark reference when we found the mark via lockless traversal of object
@@ -439,6 +431,7 @@
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 /*
  * Sorting function for lists of fsnotify marks.
@@ -477,7 +470,8 @@
 }
 
 static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
-					       unsigned int type)
+					       unsigned int type,
+					       __kernel_fsid_t *fsid)
 {
 	struct inode *inode = NULL;
 	struct fsnotify_mark_connector *conn;
@@ -489,6 +483,14 @@
 	INIT_HLIST_HEAD(&conn->list);
 	conn->type = type;
 	conn->obj = connp;
+	/* Cache fsid of filesystem containing the object */
+	if (fsid) {
+		conn->fsid = *fsid;
+		conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
+	} else {
+		conn->fsid.val[0] = conn->fsid.val[1] = 0;
+		conn->flags = 0;
+	}
 	if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
 		inode = igrab(fsnotify_conn_inode(conn));
 	/*
@@ -540,7 +542,7 @@
  */
 static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
 				  fsnotify_connp_t *connp, unsigned int type,
-				  int allow_dups)
+				  int allow_dups, __kernel_fsid_t *fsid)
 {
 	struct fsnotify_mark *lmark, *last = NULL;
 	struct fsnotify_mark_connector *conn;
@@ -549,15 +551,41 @@
 
 	if (WARN_ON(!fsnotify_valid_obj_type(type)))
 		return -EINVAL;
+
+	/* Backend is expected to check for zero fsid (e.g. tmpfs) */
+	if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+		return -ENODEV;
+
 restart:
 	spin_lock(&mark->lock);
 	conn = fsnotify_grab_connector(connp);
 	if (!conn) {
 		spin_unlock(&mark->lock);
-		err = fsnotify_attach_connector_to_object(connp, type);
+		err = fsnotify_attach_connector_to_object(connp, type, fsid);
 		if (err)
 			return err;
 		goto restart;
+	} else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
+		conn->fsid = *fsid;
+		/* Pairs with smp_rmb() in fanotify_get_fsid() */
+		smp_wmb();
+		conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
+	} else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
+		   (fsid->val[0] != conn->fsid.val[0] ||
+		    fsid->val[1] != conn->fsid.val[1])) {
+		/*
+		 * Backend is expected to check for non uniform fsid
+		 * (e.g. btrfs), but maybe we missed something?
+		 * Only allow setting conn->fsid once to non zero fsid.
+		 * inotify and non-fid fanotify groups do not set nor test
+		 * conn->fsid.
+		 */
+		pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+				    "%x.%x != %x.%x\n", __func__, conn->type,
+				    fsid->val[0], fsid->val[1],
+				    conn->fsid.val[0], conn->fsid.val[1]);
+		err = -EXDEV;
+		goto out_err;
 	}
 
 	/* is mark the first mark? */
@@ -588,7 +616,12 @@
 	/* mark should be the last entry.  last is the current last entry */
 	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
 added:
-	mark->connector = conn;
+	/*
+	 * Since connector is attached to object using cmpxchg() we are
+	 * guaranteed that connector initialization is fully visible by anyone
+	 * seeing mark->connector set.
+	 */
+	WRITE_ONCE(mark->connector, conn);
 out_err:
 	spin_unlock(&conn->lock);
 	spin_unlock(&mark->lock);
@@ -602,7 +635,7 @@
  */
 int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 			     fsnotify_connp_t *connp, unsigned int type,
-			     int allow_dups)
+			     int allow_dups, __kernel_fsid_t *fsid)
 {
 	struct fsnotify_group *group = mark->group;
 	int ret = 0;
@@ -623,7 +656,7 @@
 	fsnotify_get_mark(mark); /* for g_list */
 	spin_unlock(&mark->lock);
 
-	ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+	ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
 	if (ret)
 		goto err;
 
@@ -644,16 +677,17 @@
 }
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
-		      unsigned int type, int allow_dups)
+		      unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
 {
 	int ret;
 	struct fsnotify_group *group = mark->group;
 
 	mutex_lock(&group->mark_mutex);
-	ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+	ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
 	mutex_unlock(&group->mark_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 /*
  * Given a list of marks, find the mark associated with given group. If found
@@ -680,6 +714,7 @@
 	spin_unlock(&conn->lock);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fsnotify_find_mark);
 
 /* Clear any marks in a group with given type mask */
 void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
@@ -776,7 +811,9 @@
 	refcount_set(&mark->refcnt, 1);
 	fsnotify_get_group(group);
 	mark->group = group;
+	WRITE_ONCE(mark->connector, NULL);
 }
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
 /*
  * Destroy all marks in destroy_list, waits for SRCU period to finish before
@@ -805,3 +842,4 @@
 {
 	flush_delayed_work(&reaper_work);
 }
+EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
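
The mark API now threads an optional __kernel_fsid_t through fsnotify_add_mark() so fanotify's FID mode can cache the filesystem id on the connector: the fsid is written once, published with an smp_wmb() that pairs with the reader's smp_rmb(), and a mismatch on an existing connector fails with -EXDEV. Callers that do not report fids simply pass NULL. Both call styles, sketched (obtaining the fsid from a statfs result is an assumption about the fanotify side, not shown here):

	/* Sketch: inotify-style caller, no fsid caching. */
	ret = fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
				FSNOTIFY_OBJ_TYPE_INODE, /* allow_dups */ 0,
				NULL);

	/* Sketch: fid-aware caller caches the filesystem id. */
	__kernel_fsid_t fsid = buf.f_fsid;	/* buf: e.g. from vfs_statfs() */

	ret = fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
				FSNOTIFY_OBJ_TYPE_INODE, 0, &fsid);

The READ_ONCE()/WRITE_ONCE() pairs on mark->connector in this file are the other half of the same lockless story: a connector published by cmpxchg() may be observed without the mark lock held, so the accesses need to be marked.
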
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e367..75d79d6 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -71,7 +58,7 @@
 			    struct fsnotify_event *event)
 {
 	/* Overflow events are per-group and we don't want to free them */
-	if (!event || event->mask == FS_Q_OVERFLOW)
+	if (!event || event == group->overflow_event)
 		return;
 	/*
 	 * If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +128,18 @@
 	return ret;
 }
 
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+				  struct fsnotify_event *event)
+{
+	assert_spin_locked(&group->notification_lock);
+	/*
+	 * We need to init list head for the case of overflow event so that
+	 * check in fsnotify_add_event() works
+	 */
+	list_del_init(&event->list);
+	group->q_len--;
+}
+
 /*
  * Remove and return the first event from the notification list.  It is the
  * responsibility of the caller to destroy the obtained event
@@ -155,13 +154,7 @@
 
 	event = list_first_entry(&group->notification_list,
 				 struct fsnotify_event, list);
-	/*
-	 * We need to init list head for the case of overflow event so that
-	 * check in fsnotify_add_event() works
-	 */
-	list_del_init(&event->list);
-	group->q_len--;
-
+	fsnotify_remove_queued_event(group, event);
 	return event;
 }
 
@@ -194,23 +187,3 @@
 	}
 	spin_unlock(&group->notification_lock);
 }
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- *	parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
-			 u32 mask)
-{
-	INIT_LIST_HEAD(&event->list);
-	event->inode = inode;
-	event->mask = mask;
-}
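
fsnotify_destroy_event() can no longer look at event->mask (the field moved into the backends), so the overflow event is now recognized by pointer identity with the group's singleton, which is the more direct test anyway. Dequeueing is split out as fsnotify_remove_queued_event() so a backend can unqueue an event it is still holding; the helper asserts the notification lock, so usage looks like:

	/* Sketch: drop one event that is known to be on the queue. */
	spin_lock(&group->notification_lock);
	fsnotify_remove_queued_event(group, event);	/* list_del_init + q_len-- */
	spin_unlock(&group->notification_lock);
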
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 60702d6..a043164 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/proc_ns.h>
@@ -85,13 +86,12 @@
 	inode->i_fop = &ns_file_operations;
 	inode->i_private = ns;
 
-	dentry = d_alloc_pseudo(mnt->mnt_sb, &empty_name);
+	dentry = d_alloc_anon(mnt->mnt_sb);
 	if (!dentry) {
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
 	d_instantiate(dentry, inode);
-	dentry->d_flags |= DCACHE_RCUACCESS;
 	dentry->d_fsdata = (void *)ns->ops;
 	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
 	if (d) {
@@ -106,17 +106,16 @@
 void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
-	struct ns_common *ns;
 	void *ret;
 
-again:
-	ns = ns_get_cb(private_data);
-	if (!ns)
-		return ERR_PTR(-ENOENT);
+	do {
+		struct ns_common *ns = ns_get_cb(private_data);
+		if (!ns)
+			return ERR_PTR(-ENOENT);
 
-	ret = __ns_get_path(path, ns);
-	if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
-		goto again;
+		ret = __ns_get_path(path, ns);
+	} while (ret == ERR_PTR(-EAGAIN));
+
 	return ret;
 }
 
@@ -155,7 +154,7 @@
 	if (fd < 0)
 		return fd;
 
-	while (1) {
+	do {
 		struct ns_common *relative;
 
 		relative = get_ns(ns);
@@ -165,10 +164,8 @@
 		}
 
 		err = __ns_get_path(&path, relative);
-		if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
-			continue;
-		break;
-	}
+	} while (err == ERR_PTR(-EAGAIN));
+
 	if (IS_ERR(err)) {
 		put_unused_fd(fd);
 		return PTR_ERR(err);
@@ -262,15 +259,20 @@
 	.evict_inode = nsfs_evict,
 	.show_path = nsfs_show_path,
 };
-static struct dentry *nsfs_mount(struct file_system_type *fs_type,
-			int flags, const char *dev_name, void *data)
+
+static int nsfs_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
-			&ns_dentry_operations, NSFS_MAGIC);
+	struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->ops = &nsfs_ops;
+	ctx->dops = &ns_dentry_operations;
+	return 0;
 }
+
 static struct file_system_type nsfs = {
 	.name = "nsfs",
-	.mount = nsfs_mount,
+	.init_fs_context = nsfs_init_fs_context,
 	.kill_sb = kill_anon_super,
 };
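
Two conversions worth noting in nsfs. The -EAGAIN retry loops now compare the return value against ERR_PTR(-EAGAIN) directly, which is valid because ERR_PTR() of a constant folds to a constant pointer; d_alloc_anon() replaces d_alloc_pseudo() and dentry freeing is always RCU-delayed now, so the manual DCACHE_RCUACCESS flag is gone. The mount-time change follows the generic pseudo-filesystem recipe from <linux/pseudo_fs.h>; the same recipe for a hypothetical filesystem (all examplefs_* names and the magic are placeholders):

	#include <linux/pseudo_fs.h>

	/* Sketch: generic mount_pseudo() -> init_fs_context conversion. */
	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

		if (!ctx)
			return -ENOMEM;
		ctx->ops = &examplefs_sops;	/* struct super_operations */
		ctx->dops = &examplefs_dops;	/* struct dentry_operations */
		return 0;
	}
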
 
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index f5a868c..de9fb5c 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config NTFS_FS
 	tristate "NTFS file system support"
 	select NLS
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 8946130..7202a1e 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * aops.c - NTFS kernel address space operations and page cache handling.
  *
  * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/errno.h>
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 820d6ea..f0962d4 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /**
  * aops.h - Defines for NTFS kernel address space operations and page cache
  *	    handling.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_AOPS_H
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 44a39a0..d563abc 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/buffer_head.h>
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 3c8b74c..fe0890d 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
  *	      Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_ATTRIB_H
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index ec130c5..0675b24 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2004-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
index 72c9ad8..9dd2224 100644
--- a/fs/ntfs/bitmap.h
+++ b/fs/ntfs/bitmap.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * bitmap.h - Defines for NTFS kernel bitmap handling.  Part of the Linux-NTFS
  *	      project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_BITMAP_H
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
index 4a28ab3..3ab6ec9 100644
--- a/fs/ntfs/collate.c
+++ b/fs/ntfs/collate.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * collate.c - NTFS kernel collation handling.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "collate.h"
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
index aba8334..f225561 100644
--- a/fs/ntfs/collate.h
+++ b/fs/ntfs/collate.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * collate.h - Defines for NTFS kernel collation handling.  Part of the
  *	       Linux-NTFS project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_COLLATE_H
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index df7c32b..d2f9d6a 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * compress.c - NTFS kernel compressed attributes handling.
  *		Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/fs.h>
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 825a54e..a3c1c56 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include "debug.h"
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 61bf091..6fdef38 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_DEBUG_H
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 1a24be9..3c48114 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/buffer_head.h>
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
index aea7582..0e32675 100644
--- a/fs/ntfs/dir.h
+++ b/fs/ntfs/dir.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
  *	   the Linux-NTFS project.
  *
  * Copyright (c) 2002-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_DIR_H
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
index 927b5bf..f30c139 100644
--- a/fs/ntfs/endian.h
+++ b/fs/ntfs/endian.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
  *	      Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_ENDIAN_H
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 331910f..f42967b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/backing-dev.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 0d645f3..d46c2c0 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2004-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/slab.h>
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
index 8745469..bb3c3ae 100644
--- a/fs/ntfs/index.h
+++ b/fs/ntfs/index.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * index.h - Defines for NTFS kernel index handling.  Part of the Linux-NTFS
  *	     project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_INDEX_H
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index bd3221c..6c73884 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * inode.c - NTFS kernel inode handling.
  *
  * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/buffer_head.h>
@@ -332,23 +318,11 @@
 	return NULL;
 }
 
-static void ntfs_i_callback(struct rcu_head *head)
+void ntfs_free_big_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
 }
 
-void ntfs_destroy_big_inode(struct inode *inode)
-{
-	ntfs_inode *ni = NTFS_I(inode);
-
-	ntfs_debug("Entering.");
-	BUG_ON(ni->page);
-	if (!atomic_dec_and_test(&ni->count))
-		BUG();
-	call_rcu(&inode->i_rcu, ntfs_i_callback);
-}
-
 static inline ntfs_inode *ntfs_alloc_extent_inode(void)
 {
 	ntfs_inode *ni;
@@ -2287,6 +2261,9 @@
 			ni->ext.base_ntfs_ino = NULL;
 		}
 	}
+	BUG_ON(ni->page);
+	if (!atomic_dec_and_test(&ni->count))
+		BUG();
 	return;
 }
 
@@ -2922,15 +2899,18 @@
 			ia_valid |= ATTR_MTIME | ATTR_CTIME;
 		}
 	}
-	if (ia_valid & ATTR_ATIME)
-		vi->i_atime = timespec64_trunc(attr->ia_atime,
-					       vi->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		vi->i_mtime = timespec64_trunc(attr->ia_mtime,
-					       vi->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		vi->i_ctime = timespec64_trunc(attr->ia_ctime,
-					       vi->i_sb->s_time_gran);
+	if (ia_valid & ATTR_ATIME) {
+		vi->i_atime = timestamp_truncate(attr->ia_atime,
+					       vi);
+	}
+	if (ia_valid & ATTR_MTIME) {
+		vi->i_mtime = timestamp_truncate(attr->ia_mtime,
+					       vi);
+	}
+	if (ia_valid & ATTR_CTIME) {
+		vi->i_ctime = timestamp_truncate(attr->ia_ctime,
+					       vi);
+	}
 	mark_inode_dirty(vi);
 out:
 	return err;
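
Two independent modernizations in this file. The ->destroy_inode()/call_rcu() dance becomes a plain ntfs_free_big_inode() because the VFS now provides the ->free_inode() superblock method and handles the RCU grace period itself; the BUG_ON() sanity checks move into the inode-clearing path. The expected wiring (the super_operations table itself is outside this excerpt, so this is a sketch):

	/* Sketch: how the renamed hook is wired up. */
	static const struct super_operations example_sops = {
		.alloc_inode	= ntfs_alloc_big_inode,
		.free_inode	= ntfs_free_big_inode,	/* VFS defers past RCU */
		.evict_inode	= ntfs_evict_big_inode,
	};

Separately, timestamp_truncate() supersedes timespec64_trunc() as part of the y2038 work: it takes the inode rather than a bare granularity, so it can clamp the value to the filesystem's representable time range as well as truncate it.
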
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index b3c3469..98e670f 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
  *	     the Linux-NTFS project.
  *
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_INODE_H
@@ -278,7 +264,7 @@
 		u32 name_len);
 
 extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
-extern void ntfs_destroy_big_inode(struct inode *inode);
+extern void ntfs_free_big_inode(struct inode *inode);
 extern void ntfs_evict_big_inode(struct inode *vi);
 
 extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 809c0e6..8542276 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
  *	      project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_LAYOUT_H
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 27a24a4..eda9972 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2004-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 2adb043..1589a6d 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
  *		Linux-NTFS project.
  *
  * Copyright (c) 2004-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_LCNALLOC_H
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 353379f..a0c40f1 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2002-2007 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index aa2b6ac..429d490 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
  *	       the Linux-NTFS project.
  *
  * Copyright (c) 2000-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_LOGFILE_H
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index ab172e5..842b0bf 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_MALLOC_H
@@ -47,7 +33,7 @@
 		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
 		/* return (void *)__get_free_page(gfp_mask); */
 	}
-	if (likely((size >> PAGE_SHIFT) < totalram_pages))
+	if (likely((size >> PAGE_SHIFT) < totalram_pages()))
 		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
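The hunk above tracks the mm-core change that turned totalram_pages into an atomic counter read through an accessor function, so the raw global becomes a call. A minimal sketch of the resulting allocation guard, assuming kernel headers where totalram_pages() exists (as in v5.4):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: refuse allocations larger than physical memory before
 * falling back to vmalloc, mirroring the check patched above.
 */
static void *bounded_vmalloc(unsigned long size, gfp_t gfp_mask)
{
	if (likely((size >> PAGE_SHIFT) < totalram_pages()))
		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
	return NULL;	/* request exceeds total RAM */
}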
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index fb14d17..3aac5c9 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/buffer_head.h>
@@ -85,7 +71,7 @@
 	}
 	/* Read, map, and pin the page. */
 	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (likely(!IS_ERR(page))) {
+	if (!IS_ERR(page)) {
 		/* Catch multi sector transfer fixup errors. */
 		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
 				ofs)))) {
@@ -168,7 +154,7 @@
 	mutex_lock(&ni->mrec_lock);
 
 	m = map_mft_record_page(ni);
-	if (likely(!IS_ERR(m)))
+	if (!IS_ERR(m))
 		return m;
 
 	mutex_unlock(&ni->mrec_lock);
@@ -285,7 +271,7 @@
 		m = map_mft_record(ni);
 		/* map_mft_record() has incremented this on success. */
 		atomic_dec(&ni->count);
-		if (likely(!IS_ERR(m))) {
+		if (!IS_ERR(m)) {
 			/* Verify the sequence number. */
 			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
 				ntfs_debug("Done 1.");
@@ -1317,7 +1303,7 @@
 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
 			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
 		up_write(&mftbmp_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft bitmap attribute.");
@@ -1748,7 +1734,7 @@
 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	rl = ntfs_attr_find_vcn_nolock(mft_ni,
 			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
 		up_write(&mft_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft data attribute.");
@@ -1790,7 +1776,7 @@
 	do {
 		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
 				true);
-		if (likely(!IS_ERR(rl2)))
+		if (!IS_ERR(rl2))
 			break;
 		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
 			ntfs_error(vol->sb, "Failed to allocate the minimal "
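The likely()/unlikely() removals around IS_ERR() in this file are not behavioral changes: IS_ERR() is built on IS_ERR_VALUE(), which already carries an unlikely() hint, so the extra wrapper was redundant. A small illustration of the now-preferred plain form:

#include <linux/err.h>

/* Sketch: the bare IS_ERR() test is already branch-hinted as unlikely
 * inside the macro, so no explicit hint is needed at the call site.
 */
static long check_ptr(void *p)
{
	if (IS_ERR(p))
		return PTR_ERR(p);	/* decode errno from the pointer */
	return 0;
}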
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index b52bf87..17bfefc 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
  *	   Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_MFT_H
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
index 5a858d8..16b3c88 100644
--- a/fs/ntfs/mst.c
+++ b/fs/ntfs/mst.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * mst.c - NTFS multi sector transfer protection handling code. Part of the
  *	   Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "ntfs.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 4690cd7..4e6a44b 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
  *	     project.
  *
  * Copyright (c) 2001-2006 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/dcache.h>
@@ -129,7 +115,7 @@
 		dent_ino = MREF(mref);
 		ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
 		dent_inode = ntfs_iget(vol->sb, dent_ino);
-		if (likely(!IS_ERR(dent_inode))) {
+		if (!IS_ERR(dent_inode)) {
 			/* Consistency check. */
 			if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
 					NTFS_I(dent_inode)->seq_no ||
@@ -312,7 +298,7 @@
 	/* Get the mft record of the inode belonging to the child dentry. */
 	mrec = map_mft_record(ni);
 	if (IS_ERR(mrec))
-		return (struct dentry *)mrec;
+		return ERR_CAST(mrec);
 	/* Find the first file name attribute in the mft record. */
 	ctx = ntfs_attr_get_search_ctx(ni, mrec);
 	if (unlikely(!ctx)) {
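ERR_CAST() replaces the open-coded cast of an error pointer: it keeps the encoded errno while documenting that only an error value, not a real object, crosses the pointer-type boundary. A sketch with hypothetical types and a hypothetical helper:

#include <linux/err.h>

struct widget;
struct gadget;

/* Assumed helper returning a valid pointer or ERR_PTR(-errno). */
extern struct widget *widget_lookup(int id);

static struct gadget *gadget_from_widget(int id)
{
	struct widget *w = widget_lookup(id);

	if (IS_ERR(w))
		return ERR_CAST(w);	/* same errno, new pointer type */
	/* ... real conversion elided for brevity ... */
	return NULL;
}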
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index 12de47b..e81376e 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * ntfs.h - Defines for NTFS Linux kernel driver.
  *
  * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
  * Copyright (C) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_H
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index 9793e68..9160480 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * quota.c - NTFS kernel quota ($Quota) handling.  Part of the Linux-NTFS
  *	     project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
index 4cbe559..fe3132a 100644
--- a/fs/ntfs/quota.h
+++ b/fs/ntfs/quota.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * quota.h - Defines for NTFS kernel quota ($Quota) handling.  Part of the
  *	     Linux-NTFS project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_QUOTA_H
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index eac7d67..97932fb 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /**
  * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002-2005 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "debug.h"
@@ -965,7 +951,7 @@
 	}
 	/* Now combine the new and old runlists checking for overlaps. */
 	old_rl = ntfs_runlists_merge(old_rl, rl);
-	if (likely(!IS_ERR(old_rl)))
+	if (!IS_ERR(old_rl))
 		return old_rl;
 	ntfs_free(rl);
 	ntfs_error(vol->sb, "Failed to merge runlists.");
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index 47728fb..38de0a3 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
  *	       Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_RUNLIST_H
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index bb7159f..7dc3bc6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2001,2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
@@ -1489,7 +1475,7 @@
 	kfree(name);
 	/* Get the inode. */
 	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) {
+	if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
@@ -2676,7 +2662,7 @@
  */
 static const struct super_operations ntfs_sops = {
 	.alloc_inode	= ntfs_alloc_big_inode,	  /* VFS: Allocate new inode. */
-	.destroy_inode	= ntfs_destroy_big_inode, /* VFS: Deallocate inode. */
+	.free_inode	= ntfs_free_big_inode, /* VFS: Deallocate inode. */
 #ifdef NTFS_RW
 	.write_inode	= ntfs_write_inode,	/* VFS: Write dirty inode to
 						   disk. */
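The switch from .destroy_inode to .free_inode follows the VFS change that invokes ->free_inode() after an RCU grace period, letting filesystems drop their own call_rcu() boilerplate. The new callback plausibly reduces to a single cache free; treat this body as illustrative of the pattern rather than a quote of the driver:

static void ntfs_free_big_inode(struct inode *inode)
{
	/* Runs from RCU callback context; just free the in-memory inode. */
	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
}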
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index a503156..a030d00 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
  *	      the Linux-NTFS project. Adapted from the old NTFS driver,
  *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
  *
  * Copyright (c) 2002-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef DEBUG
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
index d4f8ce9..96bb229 100644
--- a/fs/ntfs/sysctl.h
+++ b/fs/ntfs/sysctl.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
  *	      the Linux-NTFS project. Adapted from the old NTFS driver,
  *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
  *
  * Copyright (c) 2002-2004 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_SYSCTL_H
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
index 24cd719..6b63261 100644
--- a/fs/ntfs/time.h
+++ b/fs/ntfs/time.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_TIME_H
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
index 8c8053b..9a47859 100644
--- a/fs/ntfs/types.h
+++ b/fs/ntfs/types.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * types.h - Defines for NTFS Linux kernel driver specific types.
  *	     Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_TYPES_H
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 005ca4b..a6b6c64 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2006 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/slab.h>
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index e2f72ca..4ebe84a 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
  *	      Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
  * Copyright (c) 2001-2006 Anton Altaparmakov
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS source
- * in the file COPYING); if not, write to the Free Software Foundation,
- * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "malloc.h"
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
index b2bc0d5..9097a0b 100644
--- a/fs/ntfs/usnjrnl.c
+++ b/fs/ntfs/usnjrnl.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * usnjrnl.c - NTFS kernel transaction log ($UsnJrnl) handling.  Part of the
  *	       Linux-NTFS project.
  *
  * Copyright (c) 2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifdef NTFS_RW
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 00d8e6b..85f531b 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -1,23 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
  *	       Part of the Linux-NTFS project.
  *
  * Copyright (c) 2005 Anton Altaparmakov
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_USNJRNL_H
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 4f579b0..930a9ae 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -1,24 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
  *	      of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
- *
- * This program/include file is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program/include file is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS
- * distribution in the file COPYING); if not, write to the Free Software
- * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef _LINUX_NTFS_VOLUME_H
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 77a8de5..46bba20 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on NET && SYSFS && CONFIGFS_FS
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 99ee093..cc9b32b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)
 
 obj-$(CONFIG_OCFS2_FS) += 	\
 	ocfs2.o			\
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 917fadc..3e7da39 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -8,15 +9,6 @@
  * CREDITS:
  * Lots of code in this file is copied from linux/fs/ext3/acl.c.
  * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/init.h>
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 7be0bb7..127b134 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * acl.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef OCFS2_ACL_H
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a342f00..f9baefc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Extent allocs and frees
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -5106,8 +5092,6 @@
 	 * rightmost extent list.
 	 */
 	if (path->p_tree_depth) {
-		struct ocfs2_extent_block *eb;
-
 		ret = ocfs2_read_extent_block(et->et_ci,
 					      ocfs2_et_get_last_eb_blk(et),
 					      &last_eb_bh);
@@ -5115,8 +5099,6 @@
 			mlog_errno(ret);
 			goto out;
 		}
-
-		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 	}
 
 	if (rec->e_cpos == split_rec->e_cpos &&
@@ -6011,6 +5993,7 @@
 	struct buffer_head *data_alloc_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
+	struct ocfs2_journal *journal = osb->journal;
 
 	BUG_ON(inode_trylock(tl_inode));
 
@@ -6031,6 +6014,20 @@
 		goto out;
 	}
 
+	/* Appending to the truncate log (TA) and flushing it (TF) are two
+	 * separate transactions. Both can be committed but not checkpointed.
+	 * If a crash occurs in between, both are replayed even though some
+	 * clusters were already released to the global bitmap, so replaying
+	 * the truncate log frees those clusters a second time: a double free.
+	 */
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
 						       GLOBAL_BITMAP_SYSTEM_INODE,
 						       OCFS2_INVALID_SLOT);
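The added flush forces both committed-but-not-checkpointed transactions to stable storage before clusters are released, closing the replay window described in the comment. The locking pattern in isolation, as a minimal sketch:

#include <linux/jbd2.h>

/* Sketch: quiesce new journal handles, force every committed
 * transaction to be checkpointed, then let updates resume.
 */
static int flush_whole_journal(journal_t *journal)
{
	int err;

	jbd2_journal_lock_updates(journal);	/* block new handles */
	err = jbd2_journal_flush(journal);	/* commit + checkpoint all */
	jbd2_journal_unlock_updates(journal);
	return err;
}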
@@ -6209,17 +6206,17 @@
 	if (le16_to_cpu(tl->tl_used)) {
 		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
 
-		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+		/*
+		 * Assuming the write-out below goes well, this copy will be
+		 * passed back to recovery for processing.
+		 */
+		*tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL);
 		if (!(*tl_copy)) {
 			status = -ENOMEM;
 			mlog_errno(status);
 			goto bail;
 		}
 
-		/* Assuming the write-out below goes well, this copy
-		 * will be passed back to recovery for processing. */
-		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
-
 		/* All we need to do to clear the truncate log is set
 		 * tl_used. */
 		tl->tl_used = 0;
@@ -6810,6 +6807,8 @@
 			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
+	loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
+	loff_t length = to - from;
 
 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
 	if (ret)
@@ -6829,7 +6828,8 @@
 	if (ret < 0)
 		mlog_errno(ret);
 	else if (ocfs2_should_order_data(inode)) {
-		ret = ocfs2_jbd2_file_inode(handle, inode);
+		ret = ocfs2_jbd2_inode_add_write(handle, inode,
+						 start_byte, length);
 		if (ret < 0)
 			mlog_errno(ret);
 	}
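ocfs2_jbd2_inode_add_write() now takes an explicit byte range, so only the written span is ordered through the journal rather than the whole inode. The range computation used above, isolated as a sketch (names are illustrative):

#include <linux/mm.h>
#include <linux/types.h>

/* Sketch: translate a page index plus intra-page offsets into the
 * (start_byte, length) pair handed to the journal.
 */
static void written_range(pgoff_t index, unsigned int from, unsigned int to,
			  loff_t *start_byte, loff_t *length)
{
	*start_byte = ((loff_t)index << PAGE_SHIFT) + from;
	*length = to - from;
}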
@@ -7536,10 +7536,11 @@
 	return count;
 }
 
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
 {
 	struct ocfs2_super *osb = OCFS2_SB(sb);
-	u64 start, len, trimmed, first_group, last_group, group;
+	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
 	int ret, cnt;
 	u32 first_bit, last_bit, minlen;
 	struct buffer_head *main_bm_bh = NULL;
@@ -7547,7 +7548,6 @@
 	struct buffer_head *gd_bh = NULL;
 	struct ocfs2_dinode *main_bm;
 	struct ocfs2_group_desc *gd = NULL;
-	struct ocfs2_trim_fs_info info, *pinfo = NULL;
 
 	start = range->start >> osb->s_clustersize_bits;
 	len = range->len >> osb->s_clustersize_bits;
@@ -7556,6 +7556,9 @@
 	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
 		return -EINVAL;
 
+	trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -7574,64 +7577,34 @@
 	}
 	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
-	if (start >= le32_to_cpu(main_bm->i_clusters)) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	len = range->len >> osb->s_clustersize_bits;
-	if (start + len > le32_to_cpu(main_bm->i_clusters))
-		len = le32_to_cpu(main_bm->i_clusters) - start;
-
-	trace_ocfs2_trim_fs(start, len, minlen);
-
-	ocfs2_trim_fs_lock_res_init(osb);
-	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
-	if (ret < 0) {
-		if (ret != -EAGAIN) {
-			mlog_errno(ret);
-			ocfs2_trim_fs_lock_res_uninit(osb);
+	/*
+	 * Do some checks before trimming the first group.
+	 */
+	if (!group) {
+		if (start >= le32_to_cpu(main_bm->i_clusters)) {
+			ret = -EINVAL;
 			goto out_unlock;
 		}
 
-		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
-		     "finish, which is running from another node.\n",
-		     osb->dev_str);
-		ret = ocfs2_trim_fs_lock(osb, &info, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_trim_fs_lock_res_uninit(osb);
-			goto out_unlock;
-		}
+		if (start + len > le32_to_cpu(main_bm->i_clusters))
+			len = le32_to_cpu(main_bm->i_clusters) - start;
 
-		if (info.tf_valid && info.tf_success &&
-		    info.tf_start == start && info.tf_len == len &&
-		    info.tf_minlen == minlen) {
-			/* Avoid sending duplicated trim to a shared device */
-			mlog(ML_NOTICE, "The same trim on device (%s) was "
-			     "just done from node (%u), return.\n",
-			     osb->dev_str, info.tf_nodenum);
-			range->len = info.tf_trimlen;
-			goto out_trimunlock;
-		}
+		/*
+		 * Determine first and last group to examine based on
+		 * start and len
+		 */
+		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+		if (first_group == osb->first_cluster_group_blkno)
+			first_bit = start;
+		else
+			first_bit = start - ocfs2_blocks_to_clusters(sb,
+								first_group);
+		last_group = ocfs2_which_cluster_group(main_bm_inode,
+						       start + len - 1);
+		group = first_group;
 	}
 
-	info.tf_nodenum = osb->node_num;
-	info.tf_start = start;
-	info.tf_len = len;
-	info.tf_minlen = minlen;
-
-	/* Determine first and last group to examine based on start and len */
-	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
-	if (first_group == osb->first_cluster_group_blkno)
-		first_bit = start;
-	else
-		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
-	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
-	last_bit = osb->bitmap_cpg;
-
-	trimmed = 0;
-	for (group = first_group; group <= last_group;) {
+	do {
 		if (first_bit + len >= osb->bitmap_cpg)
 			last_bit = osb->bitmap_cpg;
 		else
@@ -7663,21 +7636,81 @@
 			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
 		else
 			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
-	}
-	range->len = trimmed * sb->s_blocksize;
+	} while (0);
 
-	info.tf_trimlen = range->len;
-	info.tf_success = (ret ? 0 : 1);
-	pinfo = &info;
-out_trimunlock:
-	ocfs2_trim_fs_unlock(osb, pinfo);
-	ocfs2_trim_fs_lock_res_uninit(osb);
 out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 0);
 	brelse(main_bm_bh);
+	main_bm_bh = NULL;
 out_mutex:
 	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
+
+	/*
+	 * If trimming of all the groups is not yet done, release the main_bm
+	 * related locks first (to avoid starving concurrent I/O), then go on
+	 * to trim the next group.
+	 */
+	if (ret >= 0 && group <= last_group)
+		goto next_group;
 out:
+	range->len = trimmed * sb->s_blocksize;
+	return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+	ocfs2_trim_fs_lock_res_init(osb);
+
+	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+	if (ret < 0) {
+		if (ret != -EAGAIN) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			return ret;
+		}
+
+		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+		     "finish, which is running from another node.\n",
+		     osb->dev_str);
+		ret = ocfs2_trim_fs_lock(osb, &info, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			return ret;
+		}
+
+		if (info.tf_valid && info.tf_success &&
+		    info.tf_start == range->start &&
+		    info.tf_len == range->len &&
+		    info.tf_minlen == range->minlen) {
+			/* Avoid sending duplicated trim to a shared device */
+			mlog(ML_NOTICE, "The same trim on device (%s) was "
+			     "just done from node (%u), return.\n",
+			     osb->dev_str, info.tf_nodenum);
+			range->len = info.tf_trimlen;
+			goto out;
+		}
+	}
+
+	info.tf_nodenum = osb->node_num;
+	info.tf_start = range->start;
+	info.tf_len = range->len;
+	info.tf_minlen = range->minlen;
+
+	ret = ocfs2_trim_mainbm(sb, range);
+
+	info.tf_trimlen = range->len;
+	info.tf_success = (ret < 0 ? 0 : 1);
+	pinfo = &info;
+out:
+	ocfs2_trim_fs_unlock(osb, pinfo);
+	ocfs2_trim_fs_lock_res_uninit(osb);
 	return ret;
 }
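The restructured ocfs2_trim_fs()/ocfs2_trim_mainbm() split trims one cluster group per lock hold, re-acquiring the bitmap locks for each group so concurrent I/O is not starved for the duration of a long trim. The control flow, reduced to a runnable userspace sketch with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Sketch: hold the lock for one group at a time, dropping it between
 * groups exactly like the goto next_group loop above.
 */
static int trim_groups(int first_group, int last_group)
{
	int group = first_group;
	int ret = 0;

next_group:
	pthread_mutex_lock(&bm_lock);
	printf("trimming group %d\n", group);	/* stand-in for real work */
	group++;
	pthread_mutex_unlock(&bm_lock);
	if (ret >= 0 && group <= last_group)
		goto next_group;
	return ret;
}

int main(void)
{
	return trim_groups(0, 3);
}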
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 250bcac..7f973dd 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_ALLOC_H
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7578bd5..9cd0a68 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -30,6 +16,7 @@
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
 #include <linux/uio.h>
+#include <linux/mm.h>
 
 #include <cluster/masklog.h>
 
@@ -397,7 +384,7 @@
 	 * Check whether a remote node truncated this file - we just
 	 * drop out in that case as it's not worth handling here.
 	 */
-	last = list_entry(pages->prev, struct page, lru);
+	last = lru_to_page(pages);
 	start = (loff_t)last->index << PAGE_SHIFT;
 	if (start >= i_size_read(inode))
 		goto out_unlock;
@@ -955,7 +942,8 @@
 
 		if (tmppage && page_has_buffers(tmppage)) {
 			if (ocfs2_should_order_data(inode))
-				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+				ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
+							   user_pos, user_len);
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -1392,8 +1380,7 @@
 unlock:
 	spin_unlock(&oi->ip_lock);
 out:
-	if (new)
-		kfree(new);
+	kfree(new);
 	return ret;
 }
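Dropping the NULL check is safe because kfree(NULL) is defined to be a no-op, so the guard added nothing:

#include <linux/slab.h>

static void release_buf(void *p)
{
	kfree(p);	/* well-defined even when p == NULL */
}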
 
@@ -2037,8 +2024,14 @@
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (handle && ocfs2_should_order_data(inode))
-				ocfs2_jbd2_file_inode(handle, inode);
+			if (handle && ocfs2_should_order_data(inode)) {
+				loff_t start_byte =
+					((loff_t)tmppage->index << PAGE_SHIFT) +
+					from;
+				loff_t length = to - from;
+				ocfs2_jbd2_inode_add_write(handle, inode,
+							   start_byte, length);
+			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
@@ -2056,7 +2049,8 @@
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		ocfs2_update_inode_fsync_trans(handle, inode, 1);
+		if (handle)
+			ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	}
 	if (handle)
 		ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2153,13 +2147,30 @@
 	struct ocfs2_dio_write_ctxt *dwc = NULL;
 	struct buffer_head *di_bh = NULL;
 	u64 p_blkno;
-	loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+	unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+	loff_t pos = iblock << i_blkbits;
+	sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
 	unsigned len, total_len = bh_result->b_size;
 	int ret = 0, first_get_block = 0;
 
 	len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
 	len = min(total_len, len);
 
+	/*
+	 * bh_result->b_size is counted in get_more_blocks according to the
+	 * write "pos" and "end"; we may need to map twice to return different
+	 * buffer states: 1. the area within i_size is not marked NEW;
+	 * 2. the area beyond i_size is marked NEW.
+	 *
+	 *		   iblock    endblk
+	 * |--------|---------|---------|---------
+	 * |<-------area in file------->|
+	 */
+
+	if ((iblock <= endblk) &&
+	    ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+		len = (endblk - iblock + 1) << i_blkbits;
+
 	mlog(0, "get block of %lu at %llu:%u req %u\n",
 			inode->i_ino, pos, len, total_len);
 
@@ -2243,6 +2254,9 @@
 	if (desc->c_needs_zero)
 		set_buffer_new(bh_result);
 
+	if (iblock > endblk)
+		set_buffer_new(bh_result);
+
 	/* May sleep in end_io. It should not happen in an irq context. So defer
 	 * it to the dio work queue. */
 	set_buffer_defer_completion(bh_result);
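The clamp added above splits a mapping request that straddles i_size, so the in-file part is returned without NEW and the tail beyond i_size is mapped separately with NEW set. A standalone arithmetic check of the clamp, with hypothetical sizes:

#include <stdio.h>

int main(void)
{
	unsigned int i_blkbits = 12;			/* 4K blocks */
	long long i_size = 20 * 1024;			/* 20K file */
	long long iblock = 3, len = 16 * 1024;		/* spans blocks 3..6 */
	long long endblk = (i_size - 1) >> i_blkbits;	/* last in-file block: 4 */

	if (iblock <= endblk && (iblock + ((len - 1) >> i_blkbits)) > endblk)
		len = (endblk - iblock + 1) << i_blkbits;
	printf("clamped len = %lld\n", len);		/* prints 8192 */
	return 0;
}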
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 3494a62..70ed438 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_AOPS_H
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 0725e60..eaf042f 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Checksum and ECC codes for the OCFS2 userspace library.
  *
  * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
@@ -239,68 +231,36 @@
 }
 DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
 
-static struct dentry *blockcheck_debugfs_create(const char *name,
-						struct dentry *parent,
-						u64 *value)
-{
-	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
-				   &blockcheck_fops);
-}
-
 static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
 {
 	if (stats) {
-		debugfs_remove(stats->b_debug_check);
-		stats->b_debug_check = NULL;
-		debugfs_remove(stats->b_debug_failure);
-		stats->b_debug_failure = NULL;
-		debugfs_remove(stats->b_debug_recover);
-		stats->b_debug_recover = NULL;
-		debugfs_remove(stats->b_debug_dir);
+		debugfs_remove_recursive(stats->b_debug_dir);
 		stats->b_debug_dir = NULL;
 	}
 }
 
-static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
-					  struct dentry *parent)
+static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent)
 {
-	int rc = -EINVAL;
+	struct dentry *dir;
 
-	if (!stats)
-		goto out;
+	dir = debugfs_create_dir("blockcheck", parent);
+	stats->b_debug_dir = dir;
 
-	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
-	if (!stats->b_debug_dir)
-		goto out;
+	debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
+			    &stats->b_check_count, &blockcheck_fops);
 
-	stats->b_debug_check =
-		blockcheck_debugfs_create("blocks_checked",
-					  stats->b_debug_dir,
-					  &stats->b_check_count);
+	debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
+			    &stats->b_failure_count, &blockcheck_fops);
 
-	stats->b_debug_failure =
-		blockcheck_debugfs_create("checksums_failed",
-					  stats->b_debug_dir,
-					  &stats->b_failure_count);
+	debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
+			    &stats->b_recover_count, &blockcheck_fops);
 
-	stats->b_debug_recover =
-		blockcheck_debugfs_create("ecc_recoveries",
-					  stats->b_debug_dir,
-					  &stats->b_recover_count);
-	if (stats->b_debug_check && stats->b_debug_failure &&
-	    stats->b_debug_recover)
-		rc = 0;
-
-out:
-	if (rc)
-		ocfs2_blockcheck_debug_remove(stats);
-	return rc;
 }
 #else
-static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
-						 struct dentry *parent)
+static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+						  struct dentry *parent)
 {
-	return 0;
 }
 
 static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
@@ -309,10 +269,10 @@
 #endif  /* CONFIG_DEBUG_FS */
 
 /* Always-called wrappers for starting and stopping the debugfs files */
-int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
-					   struct dentry *parent)
+void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					    struct dentry *parent)
 {
-	return ocfs2_blockcheck_debug_install(stats, parent);
+	ocfs2_blockcheck_debug_install(stats, parent);
 }
 
 void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
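The blockcheck rework adopts the current debugfs convention: creation results are not checked, since debugfs failures are deliberately non-fatal, and a single debugfs_remove_recursive() tears down the whole directory, which is why the three per-file dentries leave the stats struct. The pattern in miniature, with hypothetical names:

#include <linux/debugfs.h>

static struct dentry *demo_dir;
static u64 demo_count;

static void demo_debugfs_install(struct dentry *parent)
{
	/* No error checking: debugfs entries are best-effort. */
	demo_dir = debugfs_create_dir("demo", parent);
	debugfs_create_u64("count", 0400, demo_dir, &demo_count);
}

static void demo_debugfs_remove(void)
{
	debugfs_remove_recursive(demo_dir);	/* drops "count" too */
	demo_dir = NULL;
}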
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index d4b69fe..8f17d2c 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Checksum and ECC codes for the OCFS2 userspace library.
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef OCFS2_BLOCKCHECK_H
@@ -33,9 +25,6 @@
 	 * ocfs2_blockcheck_stats_debugfs_install()
 	 */
 	struct dentry *b_debug_dir;	/* Parent of the debugfs  files */
-	struct dentry *b_debug_check;	/* Exposes b_check_count */
-	struct dentry *b_debug_failure;	/* Exposes b_failure_count */
-	struct dentry *b_debug_recover;	/* Exposes b_recover_count */
 };
 
 
@@ -64,8 +53,8 @@
 				   struct ocfs2_blockcheck_stats *stats);
 
 /* Debug Initialization */
-int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
-					   struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					    struct dentry *parent);
 void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
 
 /*
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1d098c3..f0b104e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Buffer cache handling
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -99,25 +85,34 @@
 	return ret;
 }
 
+/* The caller must provide a bhs[] whose entries are either all NULL or
+ * all non-NULL; this makes read failures easier to handle.
+ */
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 			   unsigned int nr, struct buffer_head *bhs[])
 {
 	int status = 0;
 	unsigned int i;
 	struct buffer_head *bh;
+	int new_bh = 0;
 
 	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
 
 	if (!nr)
 		goto bail;
 
+	/* Don't put a buffer head and reset it to NULL if it was allocated
+	 * by the caller, since the caller cannot be aware of the alteration!
+	 */
+	new_bh = (bhs[0] == NULL);
+
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
 			bhs[i] = sb_getblk(osb->sb, block++);
 			if (bhs[i] == NULL) {
 				status = -ENOMEM;
 				mlog_errno(status);
-				goto bail;
+				break;
 			}
 		}
 		bh = bhs[i];
@@ -152,15 +147,31 @@
 #endif
 		}
 
-		clear_buffer_uptodate(bh);
 		get_bh(bh); /* for end_buffer_read_sync() */
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(REQ_OP_READ, 0, bh);
 	}
 
+read_failure:
 	for (i = nr; i > 0; i--) {
 		bh = bhs[i - 1];
 
+		if (unlikely(status)) {
+			if (new_bh && bh) {
+				/* If a middle bh fails, let the previous
+				 * bhs finish their reads and then put
+				 * them to avoid a bh leak.
+				 */
+				if (!buffer_jbd(bh))
+					wait_on_buffer(bh);
+				put_bh(bh);
+				bhs[i - 1] = NULL;
+			} else if (bh && buffer_uptodate(bh)) {
+				clear_buffer_uptodate(bh);
+			}
+			continue;
+		}
+
 		/* No need to wait on the buffer if it's managed by JBD. */
 		if (!buffer_jbd(bh))
 			wait_on_buffer(bh);
@@ -170,8 +181,7 @@
 			 * so we can safely record this and loop back
 			 * to cleanup the other buffers. */
 			status = -EIO;
-			put_bh(bh);
-			bhs[i - 1] = NULL;
+			goto read_failure;
 		}
 	}
 
@@ -179,6 +189,9 @@
 	return status;
 }
 
+/* The caller must provide a bhs[] whose entries are either all NULL or
+ * all non-NULL; this makes read failures easier to handle.
+ */
 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 		      struct buffer_head *bhs[], int flags,
 		      int (*validate)(struct super_block *sb,
@@ -188,6 +201,7 @@
 	int i, ignore_cache = 0;
 	struct buffer_head *bh;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	int new_bh = 0;
 
 	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
 
@@ -213,6 +227,11 @@
 		goto bail;
 	}
 
+	/* Don't put a buffer head and reset it to NULL if it was allocated
+	 * by the caller, since the caller cannot be aware of the alteration!
+	 */
+	new_bh = (bhs[0] == NULL);
+
 	ocfs2_metadata_cache_io_lock(ci);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
@@ -221,7 +240,8 @@
 				ocfs2_metadata_cache_io_unlock(ci);
 				status = -ENOMEM;
 				mlog_errno(status);
-				goto bail;
+				/* Don't forget to put the previous bhs! */
+				break;
 			}
 		}
 		bh = bhs[i];
@@ -306,7 +326,6 @@
 				continue;
 			}
 
-			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
 			if (validate)
 				set_buffer_needs_validate(bh);
@@ -316,16 +335,27 @@
 		}
 	}
 
-	status = 0;
-
+read_failure:
 	for (i = (nr - 1); i >= 0; i--) {
 		bh = bhs[i];
 
 		if (!(flags & OCFS2_BH_READAHEAD)) {
-			if (status) {
-				/* Clear the rest of the buffers on error */
-				put_bh(bh);
-				bhs[i] = NULL;
+			if (unlikely(status)) {
+				/* Clear the buffers on error, including those
+				 * whose reads already succeeded.
+				 */
+				if (new_bh && bh) {
+					/* If a middle bh failed, let the
+					 * earlier bhs finish their reads and
+					 * then put them to avoid a bh leak.
+					 */
+					if (!buffer_jbd(bh))
+						wait_on_buffer(bh);
+					put_bh(bh);
+					bhs[i] = NULL;
+				} else if (bh && buffer_uptodate(bh)) {
+					clear_buffer_uptodate(bh);
+				}
 				continue;
 			}
 			/* We know this can't have changed as we hold the
@@ -343,9 +373,7 @@
 				 * uptodate. */
 				status = -EIO;
 				clear_buffer_needs_validate(bh);
-				put_bh(bh);
-				bhs[i] = NULL;
-				continue;
+				goto read_failure;
 			}
 
 			if (buffer_needs_validate(bh)) {
@@ -355,11 +383,8 @@
 				BUG_ON(buffer_jbd(bh));
 				clear_buffer_needs_validate(bh);
 				status = validate(sb, bh);
-				if (status) {
-					put_bh(bh);
-					bhs[i] = NULL;
-					continue;
-				}
+				if (status)
+					goto read_failure;
 			}
 		}
 
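The cleanup contract introduced above is worth spelling out: when the caller passes an all-NULL bhs[], the read functions own every buffer head they allocate and release them on failure; when the caller passes all non-NULL entries, ownership stays with the caller and a failed read only leaves the bh non-uptodate. A minimal caller sketch under that contract (the helper name is hypothetical, not part of this patch):

	static int read_two_blocks_example(struct ocfs2_super *osb, u64 blkno)
	{
		/* All entries NULL: the read path allocates the bhs and,
		 * on error, puts them and resets the slots to NULL. */
		struct buffer_head *bhs[2] = { NULL, NULL };
		int ret;

		ret = ocfs2_read_blocks_sync(osb, blkno, 2, bhs);
		if (ret)
			return ret;	/* nothing to release here */

		/* ... use bhs[0] and bhs[1] ... */

		brelse(bhs[0]);
		brelse(bhs[1]);
		return 0;
	}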
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index b1bb70c..1c5e533 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Buffer cache handling functions defined
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_BUFFER_HEAD_IO_H
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index 1aefc03..0e5b732 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9b2ed62..a368350 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/kernel.h>
@@ -106,10 +92,6 @@
 #define O2HB_DEBUG_REGION_PINNED	"pinned"
 
 static struct dentry *o2hb_debug_dir;
-static struct dentry *o2hb_debug_livenodes;
-static struct dentry *o2hb_debug_liveregions;
-static struct dentry *o2hb_debug_quorumregions;
-static struct dentry *o2hb_debug_failedregions;
 
 static LIST_HEAD(o2hb_all_regions);
 
@@ -243,10 +225,6 @@
 	unsigned int		hr_region_num;
 
 	struct dentry		*hr_debug_dir;
-	struct dentry		*hr_debug_livenodes;
-	struct dentry		*hr_debug_regnum;
-	struct dentry		*hr_debug_elapsed_time;
-	struct dentry		*hr_debug_pinned;
 	struct o2hb_debug_buf	*hr_db_livenodes;
 	struct o2hb_debug_buf	*hr_db_regnum;
 	struct o2hb_debug_buf	*hr_db_elapsed_time;
@@ -582,9 +560,10 @@
 }
 
 static int o2hb_read_slots(struct o2hb_region *reg,
+			   unsigned int begin_slot,
 			   unsigned int max_slots)
 {
-	unsigned int current_slot=0;
+	unsigned int current_slot = begin_slot;
 	int status;
 	struct o2hb_bio_wait_ctxt wc;
 	struct bio *bio;
@@ -1093,9 +1072,14 @@
 	return find_last_bit(nodes, numbits);
 }
 
+static int o2hb_lowest_node(unsigned long *nodes, int numbits)
+{
+	return find_first_bit(nodes, numbits);
+}
+
 static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
-	int i, ret, highest_node;
+	int i, ret, highest_node, lowest_node;
 	int membership_change = 0, own_slot_ok = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -1120,7 +1104,8 @@
 	}
 
 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
-	if (highest_node >= O2NM_MAX_NODES) {
+	lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES);
+	if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) {
 		mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
 		ret = -EINVAL;
 		goto bail;
@@ -1130,7 +1115,7 @@
 	 * yet. Of course, if the node definitions have holes in them
 	 * then we're reading an empty slot anyway... Consider this
 	 * best-effort. */
-	ret = o2hb_read_slots(reg, highest_node + 1);
+	ret = o2hb_read_slots(reg, lowest_node, highest_node + 1);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
@@ -1191,7 +1176,7 @@
 	if (atomic_read(&reg->hr_steady_iterations) != 0) {
 		if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
 			printk(KERN_NOTICE "o2hb: Unable to stabilize "
-			       "heartbeart on region %s (%s)\n",
+			       "heartbeat on region %s (%s)\n",
 			       config_item_name(&reg->hr_item),
 			       reg->hr_dev_name);
 			atomic_set(&reg->hr_steady_iterations, 0);
@@ -1398,107 +1383,60 @@
 
 void o2hb_exit(void)
 {
-	debugfs_remove(o2hb_debug_failedregions);
-	debugfs_remove(o2hb_debug_quorumregions);
-	debugfs_remove(o2hb_debug_liveregions);
-	debugfs_remove(o2hb_debug_livenodes);
-	debugfs_remove(o2hb_debug_dir);
+	debugfs_remove_recursive(o2hb_debug_dir);
 	kfree(o2hb_db_livenodes);
 	kfree(o2hb_db_liveregions);
 	kfree(o2hb_db_quorumregions);
 	kfree(o2hb_db_failedregions);
 }
 
-static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
-					struct o2hb_debug_buf **db, int db_len,
-					int type, int size, int len, void *data)
+static void o2hb_debug_create(const char *name, struct dentry *dir,
+			      struct o2hb_debug_buf **db, int db_len, int type,
+			      int size, int len, void *data)
 {
 	*db = kmalloc(db_len, GFP_KERNEL);
 	if (!*db)
-		return NULL;
+		return;
 
 	(*db)->db_type = type;
 	(*db)->db_size = size;
 	(*db)->db_len = len;
 	(*db)->db_data = data;
 
-	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
-				   &o2hb_debug_fops);
+	debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
 }
 
-static int o2hb_debug_init(void)
+static void o2hb_debug_init(void)
 {
-	int ret = -ENOMEM;
-
 	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
-	if (!o2hb_debug_dir) {
-		mlog_errno(ret);
-		goto bail;
-	}
 
-	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
-						 o2hb_debug_dir,
-						 &o2hb_db_livenodes,
-						 sizeof(*o2hb_db_livenodes),
-						 O2HB_DB_TYPE_LIVENODES,
-						 sizeof(o2hb_live_node_bitmap),
-						 O2NM_MAX_NODES,
-						 o2hb_live_node_bitmap);
-	if (!o2hb_debug_livenodes) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir,
+			  &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes),
+			  O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap),
+			  O2NM_MAX_NODES, o2hb_live_node_bitmap);
 
-	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
-						   o2hb_debug_dir,
-						   &o2hb_db_liveregions,
-						   sizeof(*o2hb_db_liveregions),
-						   O2HB_DB_TYPE_LIVEREGIONS,
-						   sizeof(o2hb_live_region_bitmap),
-						   O2NM_MAX_REGIONS,
-						   o2hb_live_region_bitmap);
-	if (!o2hb_debug_liveregions) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir,
+			  &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions),
+			  O2HB_DB_TYPE_LIVEREGIONS,
+			  sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS,
+			  o2hb_live_region_bitmap);
 
-	o2hb_debug_quorumregions =
-			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
-					  o2hb_debug_dir,
-					  &o2hb_db_quorumregions,
-					  sizeof(*o2hb_db_quorumregions),
-					  O2HB_DB_TYPE_QUORUMREGIONS,
-					  sizeof(o2hb_quorum_region_bitmap),
-					  O2NM_MAX_REGIONS,
-					  o2hb_quorum_region_bitmap);
-	if (!o2hb_debug_quorumregions) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir,
+			  &o2hb_db_quorumregions,
+			  sizeof(*o2hb_db_quorumregions),
+			  O2HB_DB_TYPE_QUORUMREGIONS,
+			  sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS,
+			  o2hb_quorum_region_bitmap);
 
-	o2hb_debug_failedregions =
-			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
-					  o2hb_debug_dir,
-					  &o2hb_db_failedregions,
-					  sizeof(*o2hb_db_failedregions),
-					  O2HB_DB_TYPE_FAILEDREGIONS,
-					  sizeof(o2hb_failed_region_bitmap),
-					  O2NM_MAX_REGIONS,
-					  o2hb_failed_region_bitmap);
-	if (!o2hb_debug_failedregions) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	ret = 0;
-bail:
-	if (ret)
-		o2hb_exit();
-
-	return ret;
+	o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir,
+			  &o2hb_db_failedregions,
+			  sizeof(*o2hb_db_failedregions),
+			  O2HB_DB_TYPE_FAILEDREGIONS,
+			  sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS,
+			  o2hb_failed_region_bitmap);
 }
 
-int o2hb_init(void)
+void o2hb_init(void)
 {
 	int i;
 
@@ -1518,7 +1456,7 @@
 
 	o2hb_dependent_users = 0;
 
-	return o2hb_debug_init();
+	o2hb_debug_init();
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
@@ -1582,11 +1520,7 @@
 
 	kfree(reg->hr_slots);
 
-	debugfs_remove(reg->hr_debug_livenodes);
-	debugfs_remove(reg->hr_debug_regnum);
-	debugfs_remove(reg->hr_debug_elapsed_time);
-	debugfs_remove(reg->hr_debug_pinned);
-	debugfs_remove(reg->hr_debug_dir);
+	debugfs_remove_recursive(reg->hr_debug_dir);
 	kfree(reg->hr_db_livenodes);
 	kfree(reg->hr_db_regnum);
 	kfree(reg->hr_db_elapsed_time);
@@ -1801,7 +1735,7 @@
 	struct o2hb_disk_slot *slot;
 	struct o2hb_disk_heartbeat_block *hb_block;
 
-	ret = o2hb_read_slots(reg, reg->hr_blocks);
+	ret = o2hb_read_slots(reg, 0, reg->hr_blocks);
 	if (ret)
 		goto out;
 
@@ -2045,69 +1979,33 @@
 		: NULL;
 }
 
-static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+static void o2hb_debug_region_init(struct o2hb_region *reg,
+				   struct dentry *parent)
 {
-	int ret = -ENOMEM;
+	struct dentry *dir;
 
-	reg->hr_debug_dir =
-		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
-	if (!reg->hr_debug_dir) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent);
+	reg->hr_debug_dir = dir;
 
-	reg->hr_debug_livenodes =
-			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_livenodes),
-					  sizeof(*(reg->hr_db_livenodes)),
-					  O2HB_DB_TYPE_REGION_LIVENODES,
-					  sizeof(reg->hr_live_node_bitmap),
-					  O2NM_MAX_NODES, reg);
-	if (!reg->hr_debug_livenodes) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
+			  sizeof(*(reg->hr_db_livenodes)),
+			  O2HB_DB_TYPE_REGION_LIVENODES,
+			  sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
+			  reg);
 
-	reg->hr_debug_regnum =
-			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_regnum),
-					  sizeof(*(reg->hr_db_regnum)),
-					  O2HB_DB_TYPE_REGION_NUMBER,
-					  0, O2NM_MAX_NODES, reg);
-	if (!reg->hr_debug_regnum) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
+			  sizeof(*(reg->hr_db_regnum)),
+			  O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
 
-	reg->hr_debug_elapsed_time =
-			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_elapsed_time),
-					  sizeof(*(reg->hr_db_elapsed_time)),
-					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
-					  0, 0, reg);
-	if (!reg->hr_debug_elapsed_time) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
+			  &(reg->hr_db_elapsed_time),
+			  sizeof(*(reg->hr_db_elapsed_time)),
+			  O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
 
-	reg->hr_debug_pinned =
-			o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_pinned),
-					  sizeof(*(reg->hr_db_pinned)),
-					  O2HB_DB_TYPE_REGION_PINNED,
-					  0, 0, reg);
-	if (!reg->hr_debug_pinned) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
+			  sizeof(*(reg->hr_db_pinned)),
+			  O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
 
-	ret = 0;
-bail:
-	return ret;
 }
 
 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
@@ -2163,11 +2061,7 @@
 	if (ret)
 		goto unregister_handler;
 
-	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
-	if (ret) {
-		config_item_put(&reg->hr_item);
-		goto unregister_handler;
-	}
+	o2hb_debug_region_init(reg, o2hb_debug_dir);
 
 	return &reg->hr_item;
 
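The new begin_slot parameter lets the heartbeat thread skip disk slots below the lowest configured node instead of always starting at slot 0. A small illustration (not in the patch) using the same bitmap helpers:

	DECLARE_BITMAP(nodes, O2NM_MAX_NODES);
	int lowest_node, highest_node;

	bitmap_zero(nodes, O2NM_MAX_NODES);
	set_bit(5, nodes);
	set_bit(9, nodes);

	lowest_node = find_first_bit(nodes, O2NM_MAX_NODES);	/* 5 */
	highest_node = find_last_bit(nodes, O2NM_MAX_NODES);	/* 9 */

	/* The old code read slots 0..9; the new call
	 * o2hb_read_slots(reg, lowest_node, highest_node + 1)
	 * reads only slots 5..9. */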
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index a9e67ef..beed31e 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef O2CLUSTER_HEARTBEAT_H
@@ -78,7 +63,7 @@
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
 void o2hb_exit(void);
-int o2hb_init(void);
+void o2hb_init(void);
 int o2hb_check_node_heartbeating_no_sem(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d331c23..1d696c9 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/module.h>
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a396096..446e452 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef O2CLUSTER_MASKLOG_H
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 74a21f6..02bf4a1 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * debug functionality for o2net
  *
  * Copyright (C) 2005, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifdef CONFIG_DEBUG_FS
@@ -53,10 +38,6 @@
 #define SHOW_SOCK_STATS		1
 
 static struct dentry *o2net_dentry;
-static struct dentry *sc_dentry;
-static struct dentry *nst_dentry;
-static struct dentry *stats_dentry;
-static struct dentry *nodes_dentry;
 
 static DEFINE_SPINLOCK(o2net_debug_lock);
 
@@ -505,36 +486,23 @@
 
 void o2net_debugfs_exit(void)
 {
-	debugfs_remove(nodes_dentry);
-	debugfs_remove(stats_dentry);
-	debugfs_remove(sc_dentry);
-	debugfs_remove(nst_dentry);
-	debugfs_remove(o2net_dentry);
+	debugfs_remove_recursive(o2net_dentry);
 }
 
-int o2net_debugfs_init(void)
+void o2net_debugfs_init(void)
 {
 	umode_t mode = S_IFREG|S_IRUSR;
 
 	o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
-	if (o2net_dentry)
-		nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
-					o2net_dentry, NULL, &nst_seq_fops);
-	if (nst_dentry)
-		sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
-					o2net_dentry, NULL, &sc_seq_fops);
-	if (sc_dentry)
-		stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
-					o2net_dentry, NULL, &stats_seq_fops);
-	if (stats_dentry)
-		nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
-					o2net_dentry, NULL, &nodes_fops);
-	if (nodes_dentry)
-		return 0;
 
-	o2net_debugfs_exit();
-	mlog_errno(-ENOMEM);
-	return -ENOMEM;
+	debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL,
+			    &nst_seq_fops);
+	debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL,
+			    &sc_seq_fops);
+	debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL,
+			    &stats_seq_fops);
+	debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL,
+			    &nodes_fops);
 }
 
 #endif	/* CONFIG_DEBUG_FS */
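These conversions follow the tree-wide debugfs convention: creation failures propagate harmlessly (an error-pointer parent is tolerated by later debugfs calls), so callers stop checking return values, and one recursive remove tears down the whole directory. The reduced pattern, sketched with illustrative names (example_root, example_fops are not from this patch):

	static struct dentry *example_root;

	static void example_debugfs_init(void)
	{
		example_root = debugfs_create_dir("example", NULL);
		/* No error checks: a failed dir yields an error pointer
		 * that the file creation below silently accepts. */
		debugfs_create_file("state", S_IFREG|S_IRUSR, example_root,
				    NULL, &example_fops);
	}

	static void example_debugfs_exit(void)
	{
		/* Handles NULL and error-pointer roots gracefully. */
		debugfs_remove_recursive(example_root);
	}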
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166c..7a7640c 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/slab.h>
@@ -621,13 +607,15 @@
 	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
 
-	o2net_disconnect_node(node);
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		o2net_disconnect_node(node);
 
-	if (cluster->cl_has_local &&
-	    (cluster->cl_local_node == node->nd_num)) {
-		cluster->cl_has_local = 0;
-		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
-		o2net_stop_listening(node);
+		if (cluster->cl_has_local &&
+		    (cluster->cl_local_node == node->nd_num)) {
+			cluster->cl_has_local = 0;
+			cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+			o2net_stop_listening(node);
+		}
 	}
 
 	/* XXX call into net to stop this node from trading messages */
@@ -840,9 +828,7 @@
 {
 	int ret = -1;
 
-	ret = o2hb_init();
-	if (ret)
-		goto out;
+	o2hb_init();
 
 	ret = o2net_init();
 	if (ret)
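The identity check added around the disconnect path deserves a note: configfs permits creating a node item whose nd_num duplicates one already registered, and such an item never becomes cluster->cl_nodes[nd_num]. Presumably the guard keeps the removal of that stray item from disconnecting the node that genuinely owns the slot; in miniature:

	/* Only the item registered in the slot may tear down networking. */
	if (cluster->cl_nodes[node->nd_num] == node)
		o2net_disconnect_node(node);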
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 09ea2d3..3e00066 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef O2CLUSTER_NODEMANAGER_H
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 3f4151d..760d850 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * On-disk structures for ocfs2_heartbeat
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef _OCFS2_HEARTBEAT_H
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 49b5943..21ad307 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -7,22 +8,6 @@
  * for the ocfs2_nodemanager module.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef _OCFS2_NODEMANAGER_H
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index af2e747..5c424a0 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  *
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 /* This quorum hack is only here until we transition to some more rational
@@ -81,6 +67,7 @@
 	default:
 		WARN_ON(o2nm_single_cluster->cl_fence_method >=
 			O2NM_FENCE_METHODS);
+		/* fall through */
 	case O2NM_FENCE_RESET:
 		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
 		       "system by restarting ***\n");
@@ -89,7 +76,7 @@
 	};
 }
 
-/* Indicate that a timeout occurred on a hearbeat region write. The
+/* Indicate that a timeout occurred on a heartbeat region write. The
  * other nodes in the cluster may consider us dead at that time so we
  * want to "fence" ourselves so that we don't scribble on the disk
  * after they think they've recovered us. This can't solve all
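The added /* fall through */ comment is not cosmetic: it tells GCC's -Wimplicit-fallthrough that dropping from the default arm into O2NM_FENCE_RESET is intentional. The shape of the construct, sketched with hypothetical names:

	switch (method) {
	default:
		WARN_ON(1);		/* unknown fence method: complain... */
		/* fall through */	/* ...then handle it as a reset */
	case FENCE_RESET:
		emergency_restart();
		break;
	}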
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
index 6649cc6..6d45ce8 100644
--- a/fs/ocfs2/cluster/quorum.h
+++ b/fs/ocfs2/cluster/quorum.h
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef O2CLUSTER_QUORUM_H
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index b7f5727..d6067c3 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * OCFS2 cluster sysfs interface
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation,
- * version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #include <linux/kernel.h>
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
index d66b8ab..ce38051 100644
--- a/fs/ocfs2/cluster/sys.h
+++ b/fs/ocfs2/cluster/sys.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * Function prototypes for o2cb sysfs interface
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation,
- * version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef O2CLUSTER_SYS_H
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 7d9eea7..48a3398 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  *
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * ----
  *
  * Callers for this were originally written against a very simple synchronus
@@ -916,7 +902,7 @@
 {
 	struct kvec vec = { .iov_len = len, .iov_base = data, };
 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
-	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len);
+	iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len);
 	return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
 }
 
@@ -1776,7 +1762,7 @@
 		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
 
 	if (node_num != o2nm_this_node()) {
-		/* believe it or not, accept and node hearbeating testing
+		/* believe it or not, accept and node heartbeating testing
 		 * can succeed for this node before we got here.. so
 		 * only use set_nn_state to clear the persistent error
 		 * if that hasn't already happened */
@@ -2143,8 +2129,7 @@
 
 	o2quo_init();
 
-	if (o2net_debugfs_init())
-		goto out;
+	o2net_debugfs_init();
 
 	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index c571e84..de87cbf 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef O2CLUSTER_TCP_H
@@ -124,16 +109,15 @@
 struct o2net_sock_container;
 
 #ifdef CONFIG_DEBUG_FS
-int o2net_debugfs_init(void);
+void o2net_debugfs_init(void);
 void o2net_debugfs_exit(void);
 void o2net_debug_add_nst(struct o2net_send_tracking *nst);
 void o2net_debug_del_nst(struct o2net_send_tracking *nst);
 void o2net_debug_add_sc(struct o2net_sock_container *sc);
 void o2net_debug_del_sc(struct o2net_sock_container *sc);
 #else
-static inline int o2net_debugfs_init(void)
+static inline void o2net_debugfs_init(void)
 {
-	return 0;
 }
 static inline void o2net_debugfs_exit(void)
 {
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 0276f7f..e6a2b9d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -1,22 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef O2CLUSTER_TCP_INTERNAL_H
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2903730..42a61ee 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * dentry cache handling code
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -310,6 +296,18 @@
 
 out_attach:
 	spin_lock(&dentry_attach_lock);
+	if (unlikely(dentry->d_fsdata && !alias)) {
+		/* d_fsdata was set by a racing thread doing the same
+		 * attach as this one. Let the racing thread's result
+		 * stand and return here.
+		 */
+		spin_unlock(&dentry_attach_lock);
+		iput(dl->dl_inode);
+		ocfs2_lock_res_free(&dl->dl_lockres);
+		kfree(dl);
+		return 0;
+	}
+
 	dentry->d_fsdata = dl;
 	dl->dl_count++;
 	spin_unlock(&dentry_attach_lock);
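The dcache fix is an instance of the classic allocate-then-recheck pattern: build the new attachment outside the lock, then re-test the shared pointer under dentry_attach_lock and discard the local copy if another thread won. A generic sketch with hypothetical names:

	new = alloc_attachment();
	spin_lock(&attach_lock);
	if (shared->slot) {
		/* Lost the race: keep the winner's copy. */
		spin_unlock(&attach_lock);
		free_attachment(new);
		return 0;
	}
	shared->slot = new;
	spin_unlock(&attach_lock);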
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index 55f5889..3686a52 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_DCACHE_H
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c121abb..bdef72c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -19,21 +20,6 @@
  *   linux/fs/minix/dir.c
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -69,10 +55,6 @@
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
 
-static unsigned char ocfs2_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
 			       struct inode *dir,
@@ -1718,7 +1700,7 @@
 				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
 				de = de1;
 			}
-			de->file_type = OCFS2_FT_UNKNOWN;
+			de->file_type = FT_UNKNOWN;
 			if (blkno) {
 				de->inode = cpu_to_le64(blkno);
 				ocfs2_set_de_type(de, inode->i_mode);
@@ -1803,13 +1785,9 @@
 		}
 		offset += le16_to_cpu(de->rec_len);
 		if (le64_to_cpu(de->inode)) {
-			unsigned char d_type = DT_UNKNOWN;
-
-			if (de->file_type < OCFS2_FT_MAX)
-				d_type = ocfs2_filetype_table[de->file_type];
-
 			if (!dir_emit(ctx, de->name, de->name_len,
-				      le64_to_cpu(de->inode), d_type))
+				      le64_to_cpu(de->inode),
+				      fs_ftype_to_dtype(de->file_type)))
 				goto out;
 		}
 		ctx->pos += le16_to_cpu(de->rec_len);
@@ -1900,14 +1878,10 @@
 				break;
 			}
 			if (le64_to_cpu(de->inode)) {
-				unsigned char d_type = DT_UNKNOWN;
-
-				if (de->file_type < OCFS2_FT_MAX)
-					d_type = ocfs2_filetype_table[de->file_type];
 				if (!dir_emit(ctx, de->name,
 						de->name_len,
 						le64_to_cpu(de->inode),
-						d_type)) {
+					fs_ftype_to_dtype(de->file_type))) {
 					brelse(bh);
 					return 0;
 				}
@@ -3662,7 +3636,7 @@
 	int i, j, num_used;
 	u32 major_hash;
 	struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
-	struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+	struct ocfs2_dx_entry_list *orig_list, *tmp_list;
 	struct ocfs2_dx_entry *dx_entry;
 
 	tmp_list = &tmp_dx_leaf->dl_list;
@@ -3671,7 +3645,6 @@
 		orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
 		orig_list = &orig_dx_leaf->dl_list;
 		new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
-		new_list = &new_dx_leaf->dl_list;
 
 		num_used = le16_to_cpu(orig_list->de_num_used);
 
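Replacing the private ocfs2_filetype_table with fs_ftype_to_dtype() also absorbs the bounds check: the generic helper returns DT_UNKNOWN for any on-disk value it does not recognize. The two forms, condensed for comparison:

	/* before: private table plus an explicit range check */
	unsigned char d_type = DT_UNKNOWN;
	if (de->file_type < OCFS2_FT_MAX)
		d_type = ocfs2_filetype_table[de->file_type];

	/* after: the shared helper bounds-checks internally */
	unsigned char d_type = fs_ftype_to_dtype(de->file_type);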
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 3d8639f..e3e7d5d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_DIR_H
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index bd1aab1..38b2243 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,7 +1,7 @@
-ccflags-y := -Ifs/ocfs2
+# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y := -I $(srctree)/$(src)/..
 
 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
-
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index 3cfa114..6456c0f 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * externally exported dlm interfaces
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef DLMAPI_H
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 39831fc..4de89af 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * AST and BAST functionality for local and remote nodes
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d06e27e..aaf2454 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1,25 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * dlmcommon.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef DLMCOMMON_H
@@ -157,7 +142,6 @@
 	atomic_t res_tot_count;
 	atomic_t res_cur_count;
 
-	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
 
 	/* NOTE: Next three are protected by dlm_domain_lock */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 0bb1286..965f45d 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * underlying calls for lock conversion
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
index b2e3677..12d9c28 100644
--- a/fs/ocfs2/dlm/dlmconvert.h
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -1,25 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * dlmconvert.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef DLMCONVERT_H
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 9b984ca..4d0b452 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * debug functionality for the dlm
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #include <linux/types.h>
@@ -329,7 +314,7 @@
 {
 	char *buf;
 
-	buf = (char *) get_zeroed_page(GFP_NOFS);
+	buf = (char *) get_zeroed_page(GFP_ATOMIC);
 	if (buf) {
 		dump_mle(mle, buf, PAGE_SIZE - 1);
 		free_page((unsigned long)buf);
@@ -866,109 +851,42 @@
 /* end  - debug state funcs */
 
 /* files in subroot */
-int dlm_debug_init(struct dlm_ctxt *dlm)
+void dlm_debug_init(struct dlm_ctxt *dlm)
 {
-	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
 	/* for dumping dlm_ctxt */
-	dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
-						     S_IFREG|S_IRUSR,
-						     dlm->dlm_debugfs_subroot,
-						     dlm, &debug_state_fops);
-	if (!dc->debug_state_dentry) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
+	debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
 
 	/* for dumping lockres */
-	dc->debug_lockres_dentry =
-			debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
-					    S_IFREG|S_IRUSR,
-					    dlm->dlm_debugfs_subroot,
-					    dlm, &debug_lockres_fops);
-	if (!dc->debug_lockres_dentry) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
+	debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
 
 	/* for dumping mles */
-	dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
-						   S_IFREG|S_IRUSR,
-						   dlm->dlm_debugfs_subroot,
-						   dlm, &debug_mle_fops);
-	if (!dc->debug_mle_dentry) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
+	debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
 
 	/* for dumping lockres on the purge list */
-	dc->debug_purgelist_dentry =
-			debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
-					    S_IFREG|S_IRUSR,
-					    dlm->dlm_debugfs_subroot,
-					    dlm, &debug_purgelist_fops);
-	if (!dc->debug_purgelist_dentry) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
-
-	return 0;
-
-bail:
-	return -ENOMEM;
-}
-
-void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
-	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
-	if (dc) {
-		debugfs_remove(dc->debug_purgelist_dentry);
-		debugfs_remove(dc->debug_mle_dentry);
-		debugfs_remove(dc->debug_lockres_dentry);
-		debugfs_remove(dc->debug_state_dentry);
-		kfree(dc);
-		dc = NULL;
-	}
+	debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm,
+			    &debug_purgelist_fops);
 }
 
 /* subroot - domain dir */
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 {
 	dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
 						      dlm_debugfs_root);
-	if (!dlm->dlm_debugfs_subroot) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
-
-	dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
-				      GFP_KERNEL);
-	if (!dlm->dlm_debug_ctxt) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
-
-	return 0;
-bail:
-	dlm_destroy_debugfs_subroot(dlm);
-	return -ENOMEM;
 }
 
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-	debugfs_remove(dlm->dlm_debugfs_subroot);
+	debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
 }
 
 /* debugfs root */
-int dlm_create_debugfs_root(void)
+void dlm_create_debugfs_root(void)
 {
 	dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
-	if (!dlm_debugfs_root) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-	return 0;
 }
 
 void dlm_destroy_debugfs_root(void)
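Separately from the debugfs cleanup, the GFP_NOFS to GFP_ATOMIC switch in the mle dump path matters because the dump can run with a spinlock held, where a sleeping allocation is forbidden. The constraint in miniature (the lock shown is illustrative of the callers, not taken in this function):

	spin_lock(&dlm->master_lock);
	buf = (char *)get_zeroed_page(GFP_ATOMIC);	/* may fail, never sleeps */
	if (buf) {
		dump_mle(mle, buf, PAGE_SIZE - 1);
		free_page((unsigned long)buf);
	}
	spin_unlock(&dlm->master_lock);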
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 5ced548..f8fd868 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -1,25 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * dlmdebug.h
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef DLMDEBUG_H
@@ -29,13 +14,6 @@
 
 #ifdef CONFIG_DEBUG_FS
 
-struct dlm_debug_ctxt {
-	struct dentry *debug_state_dentry;
-	struct dentry *debug_lockres_dentry;
-	struct dentry *debug_mle_dentry;
-	struct dentry *debug_purgelist_dentry;
-};
-
 struct debug_lockres {
 	int dl_len;
 	char *dl_buf;
@@ -43,34 +21,27 @@
 	struct dlm_lock_resource *dl_res;
 };
 
-int dlm_debug_init(struct dlm_ctxt *dlm);
-void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+void dlm_debug_init(struct dlm_ctxt *dlm);
 
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
 
-int dlm_create_debugfs_root(void);
+void dlm_create_debugfs_root(void);
 void dlm_destroy_debugfs_root(void);
 
 #else
 
-static inline int dlm_debug_init(struct dlm_ctxt *dlm)
-{
-	return 0;
-}
-static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+static inline void dlm_debug_init(struct dlm_ctxt *dlm)
 {
 }
-static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-	return 0;
 }
 static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
 }
-static inline int dlm_create_debugfs_root(void)
+static inline void dlm_create_debugfs_root(void)
 {
-	return 0;
 }
 static inline void dlm_destroy_debugfs_root(void)
 {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2acd58b..ee6f459 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * defines domain join / leave apis
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #include <linux/module.h>
@@ -402,7 +387,6 @@
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
 	dlm_unregister_domain_handlers(dlm);
-	dlm_debug_shutdown(dlm);
 	dlm_complete_thread(dlm);
 	dlm_complete_recovery_thread(dlm);
 	dlm_destroy_dlm_worker(dlm);
@@ -1896,11 +1880,7 @@
 		goto bail;
 	}
 
-	status = dlm_debug_init(dlm);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	dlm_debug_init(dlm);
 
 	snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
 	dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
@@ -1957,7 +1937,6 @@
 
 	if (status) {
 		dlm_unregister_domain_handlers(dlm);
-		dlm_debug_shutdown(dlm);
 		dlm_complete_thread(dlm);
 		dlm_complete_recovery_thread(dlm);
 		dlm_destroy_dlm_worker(dlm);
@@ -2011,9 +1990,7 @@
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
-	ret = dlm_create_debugfs_subroot(dlm);
-	if (ret < 0)
-		goto leave;
+	dlm_create_debugfs_subroot(dlm);
 
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
@@ -2075,6 +2052,7 @@
 	mlog(0, "context init: refcount %u\n",
 		  kref_read(&dlm->dlm_refs));
 
+	ret = 0;
 leave:
 	if (ret < 0 && dlm) {
 		if (dlm->master_hash)
@@ -2361,9 +2339,7 @@
 		goto error;
 	}
 
-	status = dlm_create_debugfs_root();
-	if (status)
-		goto error;
+	dlm_create_debugfs_root();
 
 	return 0;
 error:
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 8a92814..7c21664 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -1,25 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * dlmdomain.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 #ifndef DLMDOMAIN_H
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 7496231..baff087 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * underlying calls for lock creation
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 826f056..74b768c 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * standalone DLM module
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
@@ -2176,7 +2161,7 @@
  * think that $RECOVERY is currently mastered by a dead node.  If so,
  * we wait a short time to allow that node to get notified by its own
  * heartbeat stack, then check again.  All $RECOVERY lock resources
- * mastered by dead nodes are purged when the hearbeat callback is
+ * mastered by dead nodes are purged when the heartbeat callback is
  * fired, so we can know for sure that it is safe to continue once
  * the node returns a live node or no node.  */
 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 802636d..064ce5b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * recovery stuff
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
@@ -1124,7 +1109,7 @@
 {
 	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
 	int mres_total_locks = be32_to_cpu(mres->total_locks);
-	int sz, ret = 0, status = 0;
+	int ret = 0, status = 0;
 	u8 orig_flags = mres->flags,
 	   orig_master = mres->master;
 
@@ -1132,9 +1117,6 @@
 	if (!mres->num_locks)
 		return 0;
 
-	sz = sizeof(struct dlm_migratable_lockres) +
-		(mres->num_locks * sizeof(struct dlm_migratable_lock));
-
 	/* add an all-done flag if we reached the last lock */
 	orig_flags = mres->flags;
 	BUG_ON(total_locks > mres_total_locks);
@@ -1148,7 +1130,8 @@
 
 	/* send it */
 	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
-				 sz, send_to, &status);
+				 struct_size(mres, ml, mres->num_locks),
+				 send_to, &status);
 	if (ret < 0) {
 		/* XXX: negative status is not handled.
 		 * this will end up killing this node. */
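The struct_size() conversion above is the usual kernel idiom for sizing a structure that ends in a flexible array member; unlike the removed open-coded arithmetic, it saturates on overflow. A small sketch of the idiom with a hypothetical message type (struct_size() comes from <linux/overflow.h>):

#include <linux/overflow.h>

struct example_msg {
	u32 num_locks;
	struct dlm_migratable_lock ml[];	/* flexible array member */
};

static size_t example_msg_size(struct example_msg *msg)
{
	/*
	 * Same value as sizeof(*msg) + msg->num_locks * sizeof(msg->ml[0]),
	 * but saturates to SIZE_MAX if the arithmetic overflows, so a
	 * corrupt num_locks cannot produce a short allocation or send size.
	 */
	return struct_size(msg, ml, msg->num_locks);
}
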
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 838a06d..61c51c2 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * standalone DLM module
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
@@ -531,7 +516,7 @@
 	assert_spin_locked(&res->spinlock);
 
 	/* don't shuffle secondary queues */
-	if ((res->owner == dlm->node_num)) {
+	if (res->owner == dlm->node_num) {
 		if (res->state & (DLM_LOCK_RES_MIGRATING |
 				  DLM_LOCK_RES_BLOCK_DIRTY))
 		    return;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 63d701c..3883633 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,22 +7,6 @@
  * underlying calls for unlocking locks
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  */
 
 
@@ -105,7 +90,8 @@
 	enum dlm_status status;
 	int actions = 0;
 	int in_use;
-        u8 owner;
+	u8 owner;
+	int recovery_wait = 0;
 
 	mlog(0, "master_node = %d, valblk = %d\n", master_node,
 	     flags & LKM_VALBLK);
@@ -208,9 +194,12 @@
 		}
 		if (flags & LKM_CANCEL)
 			lock->cancel_pending = 0;
-		else
-			lock->unlock_pending = 0;
-
+		else {
+			if (!lock->unlock_pending)
+				recovery_wait = 1;
+			else
+				lock->unlock_pending = 0;
+		}
 	}
 
 	/* get an extra ref on lock.  if we are just switching
@@ -244,6 +233,17 @@
 	spin_unlock(&res->spinlock);
 	wake_up(&res->wq);
 
+	if (recovery_wait) {
+		spin_lock(&res->spinlock);
+		/* An unlock request succeeds immediately once the owner has
+		 * died, and the lock has already been removed from the grant
+		 * list. We must wait until RECOVERING is done, or we miss the
+		 * chance to purge the lock resource, because the removal is
+		 * much faster than the RECOVERING process.
+		 */
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+		spin_unlock(&res->spinlock);
+	}
+
 	/* let the caller's final dlm_lock_put handle the actual kfree */
 	if (actions & DLM_UNLOCK_FREE_LOCK) {
 		/* this should always be coupled with list removal */
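__dlm_wait_on_lockres_flags() above parks the caller until recovery clears DLM_LOCK_RES_RECOVERING. As a rough illustration only — a simplified sketch of such a helper, not the actual dlmcommon.h implementation — it amounts to the classic wait-queue loop, entered and exited with res->spinlock held:

static void example_wait_on_lockres_flags(struct dlm_lock_resource *res,
					  int flags)
{
	DECLARE_WAITQUEUE(wait, current);

	assert_spin_locked(&res->spinlock);
	add_wait_queue(&res->wq, &wait);
	while (res->state & flags) {
		/* Mark ourselves asleep before dropping the lock so a
		 * wake_up() between unlock and schedule() is not lost. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&res->spinlock);
		schedule();
		spin_lock(&res->spinlock);
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&res->wq, &wait);
}
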
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index eed3db8..a9874e4 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,5 @@
-ccflags-y := -Ifs/ocfs2
+# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y := -I $(srctree)/$(src)/..
 
 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
 
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 602c71f..4f1668c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -9,21 +10,6 @@
  * which was a template for the fs side of this module.
  *
  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 /* Simple VFS hooks based on: */
@@ -179,7 +165,7 @@
 static int dlmfs_file_release(struct inode *inode,
 			      struct file *file)
 {
-	int level, status;
+	int level;
 	struct dlmfs_inode_private *ip = DLMFS_I(inode);
 	struct dlmfs_filp_private *fp = file->private_data;
 
@@ -188,7 +174,6 @@
 
 	mlog(0, "close called on inode %lu\n", inode->i_ino);
 
-	status = 0;
 	if (fp) {
 		level = fp->fp_lock_level;
 		if (level != DLM_LOCK_IV)
@@ -255,7 +240,7 @@
 	if (!count)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/* don't read past the lvb */
@@ -303,7 +288,7 @@
 	if (!count)
 		return 0;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/* don't write past the lvb */
@@ -350,17 +335,11 @@
 	return &ip->ip_vfs_inode;
 }
 
-static void dlmfs_i_callback(struct rcu_head *head)
+static void dlmfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
 
-static void dlmfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, dlmfs_i_callback);
-}
-
 static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
@@ -606,7 +585,7 @@
 static const struct super_operations dlmfs_ops = {
 	.statfs		= simple_statfs,
 	.alloc_inode	= dlmfs_alloc_inode,
-	.destroy_inode	= dlmfs_destroy_inode,
+	.free_inode	= dlmfs_free_inode,
 	.evict_inode	= dlmfs_evict_inode,
 	.drop_inode	= generic_delete_inode,
 };
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 9cecf48..525b14d 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -10,21 +11,6 @@
  * functions.
  *
  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/signal.h>
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index ede94a6..4bef7cd 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Userspace dlm defines
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 933aac5..6e774c5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Code which implements an OCFS2 specific interface to our DLM.
  *
  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/types.h>
@@ -440,6 +426,7 @@
 static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
 {
 	res->l_lock_refresh = 0;
+	res->l_lock_wait = 0;
 	memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
 	memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
 }
@@ -474,6 +461,8 @@
 
 	if (ret)
 		stats->ls_fail++;
+
+	stats->ls_last = ktime_to_us(ktime_get_real());
 }
 
 static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -481,6 +470,21 @@
 	lockres->l_lock_refresh++;
 }
 
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_mask_waiter *mw;
+
+	if (list_empty(&lockres->l_mask_waiters)) {
+		lockres->l_lock_wait = 0;
+		return;
+	}
+
+	mw = list_first_entry(&lockres->l_mask_waiters,
+				struct ocfs2_mask_waiter, mw_item);
+	lockres->l_lock_wait =
+			ktime_to_us(ktime_mono_to_real(mw->mw_lock_start));
+}
+
 static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
 {
 	mw->mw_lock_start = ktime_get();
@@ -496,6 +500,9 @@
 static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
 {
 }
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+}
 static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
 {
 }
@@ -686,6 +693,9 @@
 {
 	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
 
+	/* Only one trimfs thread is allowed to work at a time. */
+	mutex_lock(&osb->obs_trim_fs_mutex);
+
 	ocfs2_lock_res_init_once(lockres);
 	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
 	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +708,8 @@
 
 	ocfs2_simple_drop_lockres(osb, lockres);
 	ocfs2_lock_res_free(lockres);
+
+	mutex_unlock(&osb->obs_trim_fs_mutex);
 }
 
 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
@@ -884,6 +896,7 @@
 		list_del_init(&mw->mw_item);
 		mw->mw_status = 0;
 		complete(&mw->mw_complete);
+		ocfs2_track_lock_wait(lockres);
 	}
 }
 static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
@@ -1395,6 +1408,7 @@
 	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
 	mw->mw_mask = mask;
 	mw->mw_goal = goal;
+	ocfs2_track_lock_wait(lockres);
 }
 
 /* returns 0 if the mw that was removed was already satisfied, -EBUSY
@@ -1411,6 +1425,7 @@
 
 		list_del_init(&mw->mw_item);
 		init_completion(&mw->mw_complete);
+		ocfs2_track_lock_wait(lockres);
 	}
 
 	return ret;
@@ -2123,10 +2138,10 @@
 
 /* LVB only has room for 64 bits of time here so we pack it for
  * now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
 {
 	u64 res;
-	u64 sec = spec->tv_sec;
+	u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
 	u32 nsec = spec->tv_nsec;
 
 	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
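The clamp above bounds the seconds value to 34 bits (0x3ffffffff is 2^34 - 1), which is what lets both fields share one u64: 34 high bits of seconds plus 30 low bits of nanoseconds (999,999,999 fits in 30 bits). A round-trip sketch, assuming the shift and mask values implied by that clamp rather than quoting the file's actual OCFS2_SEC_SHIFT/OCFS2_NSEC_MASK definitions:

#define EX_SEC_SHIFT	30	/* 64 - 34 bits of seconds */
#define EX_NSEC_MASK	((1ULL << EX_SEC_SHIFT) - 1)

static u64 example_pack(u64 sec, u32 nsec)
{
	return (sec << EX_SEC_SHIFT) | (nsec & EX_NSEC_MASK);
}

static void example_unpack(u64 packed, u64 *sec, u32 *nsec)
{
	*sec = packed >> EX_SEC_SHIFT;
	*nsec = packed & EX_NSEC_MASK;
}

/* example_pack(1, 5) yields (1ULL << 30) | 5; unpacking returns 1s, 5ns. */
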
@@ -2142,7 +2157,6 @@
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
-	struct timespec ts;
 
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
@@ -2163,15 +2177,12 @@
 	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));
 	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
 	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
-	ts = timespec64_to_timespec(inode->i_atime);
 	lvb->lvb_iatime_packed  =
-		cpu_to_be64(ocfs2_pack_timespec(&ts));
-	ts = timespec64_to_timespec(inode->i_ctime);
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
 	lvb->lvb_ictime_packed =
-		cpu_to_be64(ocfs2_pack_timespec(&ts));
-	ts = timespec64_to_timespec(inode->i_mtime);
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
 	lvb->lvb_imtime_packed =
-		cpu_to_be64(ocfs2_pack_timespec(&ts));
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
 	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
 	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
 	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2180,7 +2191,7 @@
 	mlog_meta_lvb(0, lockres);
 }
 
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
 				  u64 packed_time)
 {
 	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
@@ -2189,7 +2200,6 @@
 
 static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 {
-	struct timespec ts;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
@@ -2217,15 +2227,12 @@
 	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
 	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
 	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
-	ocfs2_unpack_timespec(&ts,
+	ocfs2_unpack_timespec(&inode->i_atime,
 			      be64_to_cpu(lvb->lvb_iatime_packed));
-	inode->i_atime = timespec_to_timespec64(ts);
-	ocfs2_unpack_timespec(&ts,
+	ocfs2_unpack_timespec(&inode->i_mtime,
 			      be64_to_cpu(lvb->lvb_imtime_packed));
-	inode->i_mtime = timespec_to_timespec64(ts);
-	ocfs2_unpack_timespec(&ts,
+	ocfs2_unpack_timespec(&inode->i_ctime,
 			      be64_to_cpu(lvb->lvb_ictime_packed));
-	inode->i_ctime = timespec_to_timespec64(ts);
 	spin_unlock(&oi->ip_lock);
 }
 
@@ -2501,9 +2508,7 @@
 			ocfs2_inode_unlock(inode, ex);
 	}
 
-	if (local_bh)
-		brelse(local_bh);
-
+	brelse(local_bh);
 	return status;
 }
 
@@ -2586,8 +2591,7 @@
 		*level = 1;
 		if (ocfs2_should_update_atime(inode, vfsmnt))
 			ocfs2_update_inode_atime(inode, bh);
-		if (bh)
-			brelse(bh);
+		brelse(bh);
 	} else
 		*level = 0;
 
@@ -3005,7 +3009,7 @@
 
 	kref_init(&dlm_debug->d_refcnt);
 	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
-	dlm_debug->d_locking_state = NULL;
+	dlm_debug->d_filter_secs = 0;
 out:
 	return dlm_debug;
 }
@@ -3096,17 +3100,43 @@
  *	- Lock stats printed
  * New in version 3
  *	- Max time in lock stats is in usecs (instead of nsecs)
+ * New in version 4
+ *	- Add last pr/ex unlock times and first lock wait time in usecs
  */
-#define OCFS2_DLM_DEBUG_STR_VERSION 3
+#define OCFS2_DLM_DEBUG_STR_VERSION 4
 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 {
 	int i;
 	char *lvb;
 	struct ocfs2_lock_res *lockres = v;
+#ifdef CONFIG_OCFS2_FS_STATS
+	u64 now, last;
+	struct ocfs2_dlm_debug *dlm_debug =
+			((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug;
+#endif
 
 	if (!lockres)
 		return -EINVAL;
 
+#ifdef CONFIG_OCFS2_FS_STATS
+	if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
+		now = ktime_to_us(ktime_get_real());
+		if (lockres->l_lock_prmode.ls_last >
+		    lockres->l_lock_exmode.ls_last)
+			last = lockres->l_lock_prmode.ls_last;
+		else
+			last = lockres->l_lock_exmode.ls_last;
+		/*
+		 * Use the d_filter_secs field to filter the lock resources
+		 * dump. The default d_filter_secs value (0) filters nothing;
+		 * otherwise, only lock resources active within the last
+		 * d_filter_secs seconds are dumped.
+		 */
+		if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs)
+			return 0;
+	}
+#endif
+
 	seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
 
 	if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
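The filter above works on wall-clock microsecond timestamps recorded at each lock and unlock. A condensed sketch of the decision as a predicate (hypothetical helper; the real check is open-coded above, with USEC_PER_SEC spelled as 1000000):

/* True if the lock resource was active recently enough to be dumped. */
static bool example_should_dump(u64 now_us, u64 last_us, u32 filter_secs)
{
	if (!filter_secs)	/* 0 disables filtering entirely */
		return true;
	return div_u64(now_us - last_us, USEC_PER_SEC) <= filter_secs;
}

The window itself is exposed later in this patch as the locking_filter debugfs file, so writing, say, 5 to that file limits the locking_state dump to lock resources touched within the last five seconds.
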
@@ -3148,6 +3178,9 @@
 # define lock_max_prmode(_l)		((_l)->l_lock_prmode.ls_max)
 # define lock_max_exmode(_l)		((_l)->l_lock_exmode.ls_max)
 # define lock_refresh(_l)		((_l)->l_lock_refresh)
+# define lock_last_prmode(_l)		((_l)->l_lock_prmode.ls_last)
+# define lock_last_exmode(_l)		((_l)->l_lock_exmode.ls_last)
+# define lock_wait(_l)			((_l)->l_lock_wait)
 #else
 # define lock_num_prmode(_l)		(0)
 # define lock_num_exmode(_l)		(0)
@@ -3158,6 +3191,9 @@
 # define lock_max_prmode(_l)		(0)
 # define lock_max_exmode(_l)		(0)
 # define lock_refresh(_l)		(0)
+# define lock_last_prmode(_l)		(0ULL)
+# define lock_last_exmode(_l)		(0ULL)
+# define lock_wait(_l)			(0ULL)
 #endif
 	/* The following seq_print was added in version 2 of this output */
 	seq_printf(m, "%u\t"
@@ -3168,7 +3204,10 @@
 		   "%llu\t"
 		   "%u\t"
 		   "%u\t"
-		   "%u\t",
+		   "%u\t"
+		   "%llu\t"
+		   "%llu\t"
+		   "%llu\t",
 		   lock_num_prmode(lockres),
 		   lock_num_exmode(lockres),
 		   lock_num_prmode_failed(lockres),
@@ -3177,7 +3216,10 @@
 		   lock_total_exmode(lockres),
 		   lock_max_prmode(lockres),
 		   lock_max_exmode(lockres),
-		   lock_refresh(lockres));
+		   lock_refresh(lockres),
+		   lock_last_prmode(lockres),
+		   lock_last_exmode(lockres),
+		   lock_wait(lockres));
 
 	/* End the line */
 	seq_printf(m, "\n");
@@ -3231,36 +3273,23 @@
 	.llseek =	seq_lseek,
 };
 
-static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
 {
-	int ret = 0;
 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
 
-	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
-							 S_IFREG|S_IRUSR,
-							 osb->osb_debug_root,
-							 osb,
-							 &ocfs2_dlm_debug_fops);
-	if (!dlm_debug->d_locking_state) {
-		ret = -EINVAL;
-		mlog(ML_ERROR,
-		     "Unable to create locking state debugfs file.\n");
-		goto out;
-	}
+	debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+			    osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
 
-	ocfs2_get_dlm_debug(dlm_debug);
-out:
-	return ret;
+	debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+			   &dlm_debug->d_filter_secs);
 }
 
 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 {
 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
 
-	if (dlm_debug) {
-		debugfs_remove(dlm_debug->d_locking_state);
+	if (dlm_debug)
 		ocfs2_put_dlm_debug(dlm_debug);
-	}
 }
 
 int ocfs2_dlm_init(struct ocfs2_super *osb)
@@ -3273,11 +3302,7 @@
 		goto local;
 	}
 
-	status = ocfs2_dlm_init_debug(osb);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_dlm_init_debug(osb);
 
 	/* launch downconvert thread */
 	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
@@ -3603,7 +3628,7 @@
 	 * we can recover correctly from node failure. Otherwise, we may get
 	 * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
 	 */
-	if (!ocfs2_is_o2cb_active() &&
+	if (ocfs2_userspace_stack(osb) &&
 	    lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
 		lvb = 1;
 
@@ -4369,7 +4394,6 @@
 
 static int ocfs2_downconvert_thread(void *arg)
 {
-	int status = 0;
 	struct ocfs2_super *osb = arg;
 
 	/* only quit once we've been asked to stop and there is no more
@@ -4387,7 +4411,7 @@
 	}
 
 	osb->dc_task = NULL;
-	return status;
+	return 0;
 }
 
 void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 4ec1c82..b8fbed2 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * description here
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 4bf8d58..69ed278 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Functions to facilitate NFS exporting
  *
  * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -148,16 +134,24 @@
 	u64 blkno;
 	struct dentry *parent;
 	struct inode *dir = d_inode(child);
+	int set;
 
 	trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
 			       (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
 	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		parent = ERR_PTR(status);
-		goto bail;
+		goto unlock_nfs_sync;
 	}
 
 	status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +160,31 @@
 		goto bail_unlock;
 	}
 
+	status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL)
+			status = -ESTALE;
+		else
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		parent = ERR_PTR(status);
+		goto bail_unlock;
+	}
+
+	trace_ocfs2_get_dentry_test_bit(status, set);
+	if (!set) {
+		status = -ESTALE;
+		parent = ERR_PTR(status);
+		goto bail_unlock;
+	}
+
 	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
 
 bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
 
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
 bail:
 	trace_ocfs2_get_parent_end(parent);
 
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 41a7386..d485da0 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_EXPORT_H
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 06cb964..e3e2d1b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,20 +7,6 @@
  * Block/Cluster mapping functions
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2,  as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -603,8 +590,7 @@
 			*extent_flags = rec->e_flags;
 	}
 out:
-	if (eb_bh)
-		brelse(eb_bh);
+	brelse(eb_bh);
 	return ret;
 }
 
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1057586..e5464f6 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,20 +7,6 @@
  * In-memory file extent mappings for OCFS2.
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2,  as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef _EXTENT_MAP_H
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9fa35cb..9876db5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * File open, close, extend, truncate
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/capability.h>
@@ -720,7 +706,9 @@
  * Thus, we need to explicitly order the zeroed pages.
  */
 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
-						struct buffer_head *di_bh)
+						      struct buffer_head *di_bh,
+						      loff_t start_byte,
+						      loff_t length)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle = NULL;
@@ -736,7 +724,7 @@
 		goto out;
 	}
 
-	ret = ocfs2_jbd2_file_inode(handle, inode);
+	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -775,7 +763,9 @@
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
-	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
+						      abs_from,
+						      abs_to - abs_from);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -1240,6 +1230,7 @@
 			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
 			if (IS_ERR(transfer_to[USRQUOTA])) {
 				status = PTR_ERR(transfer_to[USRQUOTA]);
+				transfer_to[USRQUOTA] = NULL;
 				goto bail_unlock;
 			}
 		}
@@ -1249,6 +1240,7 @@
 			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
 			if (IS_ERR(transfer_to[GRPQUOTA])) {
 				status = PTR_ERR(transfer_to[GRPQUOTA]);
+				transfer_to[GRPQUOTA] = NULL;
 				goto bail_unlock;
 			}
 		}
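The two added NULL assignments matter because dqget() reports failure as an encoded error pointer, not as NULL; a shared cleanup path that calls dqput() on every non-NULL transfer_to[] slot would otherwise operate on the error value. A fragment sketching the idiom (the call site shown is illustrative):

struct dquot *dq;

dq = dqget(sb, make_kqid_uid(attr->ia_uid));
if (IS_ERR(dq)) {
	status = PTR_ERR(dq);	/* extract the errno ... */
	dq = NULL;		/* ... and make the slot safe for dqput() */
	goto bail_unlock;
}
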
@@ -2106,54 +2098,89 @@
 	return 0;
 }
 
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
-					    struct file *file,
-					    loff_t pos, size_t count,
-					    int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+					    struct buffer_head **di_bh,
+					    int meta_level,
+					    int overwrite_io,
+					    int write_sem,
+					    int wait)
 {
-	int ret;
-	struct buffer_head *di_bh = NULL;
-	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
-	u32 clusters =
-		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+	int ret = 0;
 
-	ret = ocfs2_inode_lock(inode, &di_bh, 1);
-	if (ret) {
-		mlog_errno(ret);
+	if (wait)
+		ret = ocfs2_inode_lock(inode, NULL, meta_level);
+	else
+		ret = ocfs2_try_inode_lock(inode,
+			overwrite_io ? NULL : di_bh, meta_level);
+	if (ret < 0)
 		goto out;
+
+	if (wait) {
+		if (write_sem)
+			down_write(&OCFS2_I(inode)->ip_alloc_sem);
+		else
+			down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	} else {
+		if (write_sem)
+			ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+		else
+			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+		if (!ret) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
 	}
 
-	*meta_level = 1;
-
-	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
-	if (ret)
-		mlog_errno(ret);
-out:
-	brelse(di_bh);
 	return ret;
+
+out_unlock:
+	brelse(*di_bh);
+	ocfs2_inode_unlock(inode, meta_level);
+out:
+	return ret;
+}
+
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+					       struct buffer_head **di_bh,
+					       int meta_level,
+					       int write_sem)
+{
+	if (write_sem)
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	else
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(*di_bh);
+	*di_bh = NULL;
+
+	if (meta_level >= 0)
+		ocfs2_inode_unlock(inode, meta_level);
 }
 
 static int ocfs2_prepare_inode_for_write(struct file *file,
 					 loff_t pos, size_t count, int wait)
 {
 	int ret = 0, meta_level = 0, overwrite_io = 0;
+	int write_sem = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	struct buffer_head *di_bh = NULL;
-	loff_t end;
+	u32 cpos;
+	u32 clusters;
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
 	 * if we need to make modifications here.
 	 */
 	for(;;) {
-		if (wait)
-			ret = ocfs2_inode_lock(inode, NULL, meta_level);
-		else
-			ret = ocfs2_try_inode_lock(inode,
-				overwrite_io ? NULL : &di_bh, meta_level);
+		ret = ocfs2_inode_lock_for_extent_tree(inode,
+						       &di_bh,
+						       meta_level,
+						       overwrite_io,
+						       write_sem,
+						       wait);
 		if (ret < 0) {
-			meta_level = -1;
 			if (ret != -EAGAIN)
 				mlog_errno(ret);
 			goto out;
@@ -2165,15 +2192,8 @@
 		 */
 		if (!wait && !overwrite_io) {
 			overwrite_io = 1;
-			if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
-				ret = -EAGAIN;
-				goto out_unlock;
-			}
 
 			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
-			brelse(di_bh);
-			di_bh = NULL;
-			up_read(&OCFS2_I(inode)->ip_alloc_sem);
 			if (ret < 0) {
 				if (ret != -EAGAIN)
 					mlog_errno(ret);
@@ -2192,7 +2212,10 @@
 		 * set inode->i_size at the end of a write. */
 		if (should_remove_suid(dentry)) {
 			if (meta_level == 0) {
-				ocfs2_inode_unlock(inode, meta_level);
+				ocfs2_inode_unlock_for_extent_tree(inode,
+								   &di_bh,
+								   meta_level,
+								   write_sem);
 				meta_level = 1;
 				continue;
 			}
@@ -2204,22 +2227,34 @@
 			}
 		}
 
-		end = pos + count;
-
 		ret = ocfs2_check_range_for_refcount(inode, pos, count);
 		if (ret == 1) {
-			ocfs2_inode_unlock(inode, meta_level);
-			meta_level = -1;
+			ocfs2_inode_unlock_for_extent_tree(inode,
+							   &di_bh,
+							   meta_level,
+							   write_sem);
+			ret = ocfs2_inode_lock_for_extent_tree(inode,
+							       &di_bh,
+							       meta_level,
+							       overwrite_io,
+							       1,
+							       wait);
+			write_sem = 1;
+			if (ret < 0) {
+				if (ret != -EAGAIN)
+					mlog_errno(ret);
+				goto out;
+			}
 
-			ret = ocfs2_prepare_inode_for_refcount(inode,
-							       file,
-							       pos,
-							       count,
-							       &meta_level);
+			cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+			clusters =
+				ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+			ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
 		}
 
 		if (ret < 0) {
-			mlog_errno(ret);
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
 			goto out_unlock;
 		}
 
@@ -2230,10 +2265,10 @@
 	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
 					    pos, count, wait);
 
-	brelse(di_bh);
-
-	if (meta_level >= 0)
-		ocfs2_inode_unlock(inode, meta_level);
+	ocfs2_inode_unlock_for_extent_tree(inode,
+					   &di_bh,
+					   meta_level,
+					   write_sem);
 
 out:
 	return ret;
@@ -2343,7 +2378,7 @@
 
 	written = __generic_file_write_iter(iocb, from);
 	/* buffered aio wouldn't have proper lock coverage today */
-	BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+	BUG_ON(written == -EIOCBQUEUED && !direct_io);
 
 	/*
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2463,7 +2498,7 @@
 	trace_generic_file_read_iter_ret(ret);
 
 	/* buffered aio wouldn't have proper lock coverage today */
-	BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+	BUG_ON(ret == -EIOCBQUEUED && !direct_io);
 
 	/* see ocfs2_file_write_iter */
 	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
@@ -2527,24 +2562,79 @@
 	return offset;
 }
 
-static int ocfs2_file_clone_range(struct file *file_in,
-				  loff_t pos_in,
-				  struct file *file_out,
-				  loff_t pos_out,
-				  u64 len)
+static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
+				     struct file *file_out, loff_t pos_out,
+				     loff_t len, unsigned int remap_flags)
 {
-	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-					 len, false);
-}
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+	struct buffer_head *in_bh = NULL, *out_bh = NULL;
+	bool same_inode = (inode_in == inode_out);
+	loff_t remapped = 0;
+	ssize_t ret;
 
-static int ocfs2_file_dedupe_range(struct file *file_in,
-				   loff_t pos_in,
-				   struct file *file_out,
-				   loff_t pos_out,
-				   u64 len)
-{
-	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-					  len, true);
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+	if (!ocfs2_refcount_tree(osb))
+		return -EOPNOTSUPP;
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	/* Lock both files against IO */
+	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+	if (ret)
+		return ret;
+
+	/* Check file eligibility and prepare for block sharing. */
+	ret = -EINVAL;
+	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+		goto out_unlock;
+
+	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+			&len, remap_flags);
+	if (ret < 0 || len == 0)
+		goto out_unlock;
+
+	/* Lock out changes to the allocation maps and remap. */
+	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+	if (!same_inode)
+		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+				  SINGLE_DEPTH_NESTING);
+
+	/* Zap any page cache for the destination file's range. */
+	truncate_inode_pages_range(&inode_out->i_data,
+				   round_down(pos_out, PAGE_SIZE),
+				   round_up(pos_out + len, PAGE_SIZE) - 1);
+
+	remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+			inode_out, out_bh, pos_out, len);
+	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+	if (!same_inode)
+		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+	if (remapped < 0) {
+		ret = remapped;
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	/*
+	 * Empty the extent map so that we may get the right extent
+	 * record from the disk.
+	 */
+	ocfs2_extent_map_trunc(inode_in, 0);
+	ocfs2_extent_map_trunc(inode_out, 0);
+
+	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+out_unlock:
+	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+	return remapped > 0 ? remapped : ret;
 }
 
 const struct inode_operations ocfs2_file_iops = {
@@ -2586,8 +2676,7 @@
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ocfs2_fallocate,
-	.clone_file_range = ocfs2_file_clone_range,
-	.dedupe_file_range = ocfs2_file_dedupe_range,
+	.remap_file_range = ocfs2_remap_file_range,
 };
 
 const struct file_operations ocfs2_dops = {
@@ -2633,8 +2722,7 @@
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ocfs2_fallocate,
-	.clone_file_range = ocfs2_file_clone_range,
-	.dedupe_file_range = ocfs2_file_dedupe_range,
+	.remap_file_range = ocfs2_remap_file_range,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {
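With the separate clone and dedupe hooks folded into ->remap_file_range, user space still reaches the code through the same interfaces as before, e.g. the FICLONERANGE ioctl. A runnable sketch (the mount-point paths are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	struct file_clone_range fcr;
	int src = open("/mnt/ocfs2/src", O_RDONLY);
	int dst = open("/mnt/ocfs2/dst", O_WRONLY | O_CREAT, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	memset(&fcr, 0, sizeof(fcr));
	fcr.src_fd = src;	/* offsets 0 and length 0: clone to EOF */
	if (ioctl(dst, FICLONERANGE, &fcr) < 0) {
		perror("FICLONERANGE");
		return 1;
	}
	return 0;
}
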
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 7eb7f03..4832cbc 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_FILE_H
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index f65f2b2..50f11bf 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Code which implements online file check.
  *
  * Copyright (C) 2016 SuSE.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/list.h>
@@ -193,6 +185,7 @@
 	ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck,
 					NULL, "filecheck");
 	if (ret) {
+		kobject_put(&entry->fs_kobj);
 		kfree(fcheck);
 		return ret;
 	}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index 6a22ee7..4d00677 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Online file check.
  *
  * Copyright (C) 2016 SuSE.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index d8208b2..60c5f99 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -7,21 +8,6 @@
  * up to date, and fire off recovery when needed.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 74b9c5d..5fedb2d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_HEARTBEAT_H
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7927924..7c9dfd5 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * vfs' aops, fops, dops and iops
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -548,7 +534,7 @@
 	 */
 	mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
 			!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
-			"Inode %llu: system file state is ambigous\n",
+			"Inode %llu: system file state is ambiguous\n",
 			(unsigned long long)args->fi_blkno);
 
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9b955f7..51a4f71 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_INODE_H
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 994726a..efeea20 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -106,16 +106,9 @@
 	flags = flags & mask;
 	flags |= oldflags & ~mask;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 */
-	status = -EPERM;
-	if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
-		(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
-		if (!capable(CAP_LINUX_IMMUTABLE))
-			goto bail_unlock;
-	}
+	status = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (status)
+		goto bail_unlock;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -290,7 +283,7 @@
 	if (inode_alloc)
 		inode_lock(inode_alloc);
 
-	if (o2info_coherent(&fi->ifi_req)) {
+	if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
 		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
 		if (status < 0) {
 			mlog_errno(status);
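
Editor's note: the open-coded capability check that guarded SETFLAGS is replaced by the
generic vfs_ioc_setflags_prepare() helper; the second hunk additionally refuses to take
the cluster lock through a NULL inode_alloc. A minimal userspace model of the rule the
helper enforces (flag values and names invented here; 'cap_immutable' stands in for
capable(CAP_LINUX_IMMUTABLE)):

	#include <stdbool.h>

	#define FL_IMMUTABLE 0x01	/* models OCFS2_IMMUTABLE_FL */
	#define FL_APPEND    0x02	/* models OCFS2_APPEND_FL */

	/* Touching an already-immutable file, or flipping IMMUTABLE/APPEND
	 * at all, requires the privileged capability; anything else passes. */
	static int setflags_prepare(unsigned int oldflags, unsigned int newflags,
				    bool cap_immutable)
	{
		if ((oldflags & FL_IMMUTABLE) ||
		    ((newflags ^ oldflags) & (FL_APPEND | FL_IMMUTABLE)))
			return cap_immutable ? 0 : -1;	/* -EPERM in the kernel */
		return 0;
	}
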
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bd34756..699a560 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Defines functions of journalling api
  *
  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -231,7 +217,8 @@
 	/* At this point, we know that no more recovery threads can be
 	 * launched, so wait for any recovery completion work to
 	 * complete. */
-	flush_workqueue(osb->ocfs2_wq);
+	if (osb->ocfs2_wq)
+		flush_workqueue(osb->ocfs2_wq);
 
 	/*
 	 * Now that recovery is shut down, and the osb is about to be
@@ -1017,7 +1004,8 @@
 			mlog_errno(status);
 	}
 
-	if (status == 0) {
+	/* Shutdown the kernel journal system */
+	if (!jbd2_journal_destroy(journal->j_journal) && !status) {
 		/*
 		 * Do not toggle if flush was unsuccessful otherwise
 		 * will leave dirty metadata in a "clean" journal
@@ -1026,9 +1014,6 @@
 		if (status < 0)
 			mlog_errno(status);
 	}
-
-	/* Shutdown the kernel journal system */
-	jbd2_journal_destroy(journal->j_journal);
 	journal->j_journal = NULL;
 
 	OCFS2_I(inode)->ip_open_count--;
@@ -1378,15 +1363,23 @@
 	int rm_quota_used = 0, i;
 	struct ocfs2_quota_recovery *qrec;
 
+	/* Whether quota is supported on this volume. */
+	int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+			OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+		|| OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+			OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
 	status = ocfs2_wait_on_mount(osb);
 	if (status < 0) {
 		goto bail;
 	}
 
-	rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
-	if (!rm_quota) {
-		status = -ENOMEM;
-		goto bail;
+	if (quota_enabled) {
+		rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+		if (!rm_quota) {
+			status = -ENOMEM;
+			goto bail;
+		}
 	}
 restart:
 	status = ocfs2_super_lock(osb, 1);
@@ -1422,9 +1415,14 @@
 		 * then quota usage would be out of sync until some node takes
 		 * the slot. So we remember which nodes need quota recovery
 		 * and when everything else is done, we recover quotas. */
-		for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
-		if (i == rm_quota_used)
-			rm_quota[rm_quota_used++] = slot_num;
+		if (quota_enabled) {
+			for (i = 0; i < rm_quota_used
+					&& rm_quota[i] != slot_num; i++)
+				;
+
+			if (i == rm_quota_used)
+				rm_quota[rm_quota_used++] = slot_num;
+		}
 
 		status = ocfs2_recover_node(osb, node_num, slot_num);
 skip_recovery:
@@ -1452,16 +1450,19 @@
 	/* Now it is right time to recover quotas... We have to do this under
 	 * superblock lock so that no one can start using the slot (and crash)
 	 * before we recover it */
-	for (i = 0; i < rm_quota_used; i++) {
-		qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
-		if (IS_ERR(qrec)) {
-			status = PTR_ERR(qrec);
-			mlog_errno(status);
-			continue;
+	if (quota_enabled) {
+		for (i = 0; i < rm_quota_used; i++) {
+			qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+			if (IS_ERR(qrec)) {
+				status = PTR_ERR(qrec);
+				mlog_errno(status);
+				continue;
+			}
+			ocfs2_queue_recovery_completion(osb->journal,
+					rm_quota[i],
+					NULL, NULL, qrec,
+					ORPHAN_NEED_TRUNCATE);
 		}
-		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
-						NULL, NULL, qrec,
-						ORPHAN_NEED_TRUNCATE);
 	}
 
 	ocfs2_super_unlock(osb, 1);
@@ -1483,7 +1484,8 @@
 
 	mutex_unlock(&osb->recovery_lock);
 
-	kfree(rm_quota);
+	if (quota_enabled)
+		kfree(rm_quota);
 
 	/* no one is calling kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
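
Editor's note: when the volume has no quota feature bits set, the recovery loop above now
skips allocating and scanning rm_quota[] entirely. The bookkeeping being reindented is a
linear-scan dedup; a standalone C sketch of it (illustrative only, not kernel code):

	/* Remember each slot number at most once; returns the new count.
	 * Mirrors the rm_quota[] handling in the recovery loop above. */
	static int remember_slot(int *slots, int used, int slot)
	{
		int i;

		for (i = 0; i < used && slots[i] != slot; i++)
			;
		if (i == used)
			slots[used++] = slot;
		return used;
	}
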
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 497a417..3103ba7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Defines journalling api and structures.
  *
  * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_JOURNAL_H
@@ -158,7 +144,6 @@
 void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
-void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
 
 void ocfs2_complete_recovery(struct work_struct *work);
 void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
@@ -246,8 +231,8 @@
  *                          ocfs2_journal_access_*() unless you intend to
  *                          manage the checksum by hand.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
- *                           the current handle commits.
+ *  ocfs2_jbd2_inode_add_write  - Mark an inode with range so that its data goes
+ *                                out before the current handle commits.
  */
 
 /* You must always start_trans with a number of buffs > 0, but it's
@@ -455,7 +440,7 @@
  * previous dirblock update in the free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
@@ -589,37 +574,12 @@
 	return ocfs2_extent_recs_per_gd(sb);
 }
 
-static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
-						unsigned int clusters_to_del,
-						struct ocfs2_dinode *fe,
-						struct ocfs2_extent_list *last_el)
+static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
+					     loff_t start_byte, loff_t length)
 {
- 	/* for dinode + all headers in this pass + update to next leaf */
-	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
-	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
-	int credits = 1 + tree_depth + 1;
-	int i;
-
-	i = next_free - 1;
-	BUG_ON(i < 0);
-
-	/* We may be deleting metadata blocks, so metadata alloc dinode +
-	   one desc. block for each possible delete. */
-	if (tree_depth && next_free == 1 &&
-	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
-		credits += 1 + tree_depth;
-
-	/* update to the truncate log. */
-	credits += OCFS2_TRUNCATE_LOG_UPDATE;
-
-	credits += ocfs2_quota_trans_credits(sb);
-
-	return credits;
-}
-
-static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
-{
-	return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
+	return jbd2_journal_inode_ranged_write(handle,
+					       &OCFS2_I(inode)->ip_jinode,
+					       start_byte, length);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
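
Editor's note: ocfs2_jbd2_file_inode() marked the whole inode for pre-commit writeback;
its replacement forwards an explicit (start_byte, length) pair to
jbd2_journal_inode_ranged_write(). A rough userspace model of what the ranged variant
buys, with all names invented for illustration:

	/* Accumulate the byte range a handle has dirtied; commit-time
	 * ordered writeback then flushes [start, end) instead of the
	 * whole file.  Initialize with end = -1 to mark the range empty. */
	struct ji_range { long long start, end; };

	static void ji_add_write(struct ji_range *r, long long start, long long len)
	{
		long long end = start + len;

		if (r->end < 0) {
			r->start = start;
			r->end = end;
			return;
		}
		if (start < r->start)
			r->start = start;
		if (end > r->end)
			r->end = end;
	}
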
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7642b67..720e9f9 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Node local data allocation
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -345,13 +331,18 @@
 	if (num_used
 	    || alloc->id1.bitmap1.i_used
 	    || alloc->id1.bitmap1.i_total
-	    || la->la_bm_off)
-		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+	    || la->la_bm_off) {
+		mlog(ML_ERROR, "inconsistency detected, clean journal with"
+		     " unrecovered local alloc, please run fsck.ocfs2!\n"
 		     "found = %u, set = %u, taken = %u, off = %u\n",
 		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
 		     le32_to_cpu(alloc->id1.bitmap1.i_total),
 		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
 
+		status = -EINVAL;
+		goto bail;
+	}
+
 	osb->local_alloc_bh = alloc_bh;
 	osb->local_alloc_state = OCFS2_LA_ENABLED;
 
@@ -386,7 +377,8 @@
 	struct ocfs2_dinode *alloc = NULL;
 
 	cancel_delayed_work(&osb->la_enable_wq);
-	flush_workqueue(osb->ocfs2_wq);
+	if (osb->ocfs2_wq)
+		flush_workqueue(osb->ocfs2_wq);
 
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
@@ -433,12 +425,11 @@
 	bh = osb->local_alloc_bh;
 	alloc = (struct ocfs2_dinode *) bh->b_data;
 
-	alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
+	alloc_copy = kmemdup(alloc, bh->b_size, GFP_NOFS);
 	if (!alloc_copy) {
 		status = -ENOMEM;
 		goto out_commit;
 	}
-	memcpy(alloc_copy, alloc, bh->b_size);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -835,7 +826,7 @@
 				     u32 *numbits,
 				     struct ocfs2_alloc_reservation *resv)
 {
-	int numfound = 0, bitoff, left, startoff, lastzero;
+	int numfound = 0, bitoff, left, startoff;
 	int local_resv = 0;
 	struct ocfs2_alloc_reservation r;
 	void *bitmap = NULL;
@@ -873,7 +864,6 @@
 	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
 
 	numfound = bitoff = startoff = 0;
-	lastzero = -1;
 	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
 	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
 		if (bitoff == left) {
@@ -1282,13 +1272,12 @@
 	 * local alloc shutdown won't try to double free main bitmap
 	 * bits. Make a copy so the sync function knows which bits to
 	 * free. */
-	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
+	alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS);
 	if (!alloc_copy) {
 		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
-	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
 	status = ocfs2_journal_access_di(handle,
 					 INODE_CACHE(local_alloc_inode),
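
Editor's note: both hunks above collapse a kmalloc()/memcpy() pair into kmemdup(). The
userspace equivalent is tiny, and the one-call form cannot mis-size the copy relative to
the allocation:

	#include <stdlib.h>
	#include <string.h>

	/* Userspace analogue of kmemdup(): allocate and copy in one step. */
	static void *memdup(const void *src, size_t len)
	{
		void *p = malloc(len);

		if (p)
			memcpy(p, src, len);
		return p;
	}
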
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 44a7d1f..e8a5cea 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_LOCALALLOC_H
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index d56f007..7edc4e5 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Userspace file locking support
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -52,6 +38,7 @@
 	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
 	    lockres->l_level > LKM_NLMODE) {
 		int old_level = 0;
+		struct file_lock request;
 
 		if (lockres->l_level == LKM_EXMODE)
 			old_level = 1;
@@ -66,11 +53,10 @@
 		 * level.
 		 */
 
-		locks_lock_file_wait(file,
-				&(struct file_lock) {
-					.fl_type = F_UNLCK,
-					.fl_flags = FL_FLOCK
-				});
+		locks_init_lock(&request);
+		request.fl_type = F_UNLCK;
+		request.fl_flags = FL_FLOCK;
+		locks_lock_file_wait(file, &request);
 
 		ocfs2_file_unlock(file);
 	}
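
Editor's note: the on-stack file_lock was previously built as a zero-initialized compound
literal; zero-filling leaves the structure's embedded list heads and wait queue in a
state the locking core cannot use, which locks_init_lock() sets up properly before
locks_lock_file_wait() may block. A userspace model of the difference (list layout
simplified):

	struct list_head { struct list_head *next, *prev; };

	/* An empty list head must point at itself; = { 0 } leaves
	 * next/prev NULL, which list primitives cannot handle.  This is
	 * the kind of setup locks_init_lock() performs. */
	static void init_list_head(struct list_head *h)
	{
		h->next = h;
		h->prev = h;
	}
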
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 496d488..389fe1f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes for Userspace file locking support
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_LOCKS_H
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 05220b3..3a44e46 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Code to deal with the mess that is clustered mmap.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f55f82c..758d966 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * move_extents.c
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 #include <linux/fs.h>
 #include <linux/types.h>
@@ -25,6 +17,7 @@
 #include "ocfs2_ioctl.h"
 
 #include "alloc.h"
+#include "localalloc.h"
 #include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -222,6 +215,7 @@
 	struct ocfs2_refcount_tree *ref_tree = NULL;
 	u32 new_phys_cpos, new_len;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	int need_free = 0;
 
 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
 		BUG_ON(!ocfs2_is_refcount_inode(inode));
@@ -312,6 +306,7 @@
 		if (!partial) {
 			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
 			ret = -ENOSPC;
+			need_free = 1;
 			goto out_commit;
 		}
 	}
@@ -336,6 +331,20 @@
 		mlog_errno(ret);
 
 out_commit:
+	if (need_free && context->data_ac) {
+		struct ocfs2_alloc_context *data_ac = context->data_ac;
+
+		if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					new_phys_cpos, new_len);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
+					new_len);
+	}
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock_mutex:
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
index 4e143e8..28cac43 100644
--- a/fs/ocfs2/move_extents.h
+++ b/fs/ocfs2/move_extents.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * move_extents.h
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 #ifndef OCFS2_MOVE_EXTENTS_H
 #define OCFS2_MOVE_EXTENTS_H
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b7ca84b..8ea51cf 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -19,21 +20,6 @@
  *   linux/fs/minix/dir.c
  *
 *   Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
@@ -2500,7 +2486,6 @@
 	struct inode *inode = NULL;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
-	struct ocfs2_dinode *di = NULL;
 	handle_t *handle = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct buffer_head *parent_di_bh = NULL;
@@ -2566,7 +2551,6 @@
 		goto leave;
 	}
 
-	di = (struct ocfs2_dinode *)new_di_bh->b_data;
 	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
 				  &orphan_insert, orphan_dir, false);
 	if (status < 0) {
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 1155918..cc091ed 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_NAMEI_H
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
index dfb313b..01ae48c 100644
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -9,20 +10,6 @@
  * mount it.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2,  as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef _OCFS1_FS_COMPAT_H
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0..9150cfa 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Defines macros and structures used in OCFS2
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_H
@@ -164,6 +150,7 @@
 
 	/* Storing max wait in usecs saves 24 bytes per inode */
 	u32		ls_max;		/* Max wait in USEC */
+	u64		ls_last;	/* Last unlock time in USEC */
 };
 #endif
 
@@ -205,6 +192,7 @@
 #ifdef CONFIG_OCFS2_FS_STATS
 	struct ocfs2_lock_stats  l_lock_prmode;		/* PR mode stats */
 	u32                      l_lock_refresh;	/* Disk refreshes */
+	u64                      l_lock_wait;	/* First lock wait time */
 	struct ocfs2_lock_stats  l_lock_exmode;		/* EX mode stats */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -235,7 +223,7 @@
 
 struct ocfs2_dlm_debug {
 	struct kref d_refcnt;
-	struct dentry *d_locking_state;
+	u32 d_filter_secs;
 	struct list_head d_lockres_tracking;
 };
 
@@ -407,10 +395,10 @@
 	struct ocfs2_lock_res osb_rename_lockres;
 	struct ocfs2_lock_res osb_nfs_sync_lockres;
 	struct ocfs2_lock_res osb_trim_fs_lockres;
+	struct mutex obs_trim_fs_mutex;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;
-	struct dentry *osb_ctxt;
 
 	wait_queue_head_t recovery_event;
 
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7071ad0..0db4a7e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,20 +7,6 @@
  * On-disk structures for OCFS2.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2,  as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef _OCFS2_FS_H
@@ -392,21 +379,6 @@
 #define OCFS2_HB_GLOBAL			"heartbeat=global"
 
 /*
- * OCFS2 directory file types.  Only the low 3 bits are used.  The
- * other bits are reserved for now.
- */
-#define OCFS2_FT_UNKNOWN	0
-#define OCFS2_FT_REG_FILE	1
-#define OCFS2_FT_DIR		2
-#define OCFS2_FT_CHRDEV		3
-#define OCFS2_FT_BLKDEV		4
-#define OCFS2_FT_FIFO		5
-#define OCFS2_FT_SOCK		6
-#define OCFS2_FT_SYMLINK	7
-
-#define OCFS2_FT_MAX		8
-
-/*
  * OCFS2_DIR_PAD defines the directory entries boundaries
  *
  * NOTE: It must be a multiple of 4
@@ -424,17 +396,6 @@
 #define	OCFS2_LINKS_HI_SHIFT	16
 #define	OCFS2_DX_ENTRIES_MAX	(0xffffffffU)
 
-#define S_SHIFT			12
-static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
-	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
-};
-
 
 /*
  * Convenience casts
@@ -1629,7 +1590,7 @@
 static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
 				    umode_t mode)
 {
-	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+	de->file_type = fs_umode_to_ftype(mode);
 }
 
 static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
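
Editor's note: the driver-private mode-to-dirent-type table is dropped in favor of the
shared fs_umode_to_ftype() helper. The underlying mapping is the classic S_IFMT-shift
lookup; a self-contained model of what the removed table (and the shared helper) compute,
with the FT values shown only for illustration:

	#include <sys/stat.h>

	#define MODEL_S_SHIFT 12

	/* Index the high mode bits into a small table; values mirror the
	 * removed OCFS2_FT_* constants. */
	static const unsigned char type_by_mode[S_IFMT >> MODEL_S_SHIFT] = {
		[S_IFREG  >> MODEL_S_SHIFT] = 1,	/* REG_FILE */
		[S_IFDIR  >> MODEL_S_SHIFT] = 2,	/* DIR */
		[S_IFCHR  >> MODEL_S_SHIFT] = 3,	/* CHRDEV */
		[S_IFBLK  >> MODEL_S_SHIFT] = 4,	/* BLKDEV */
		[S_IFIFO  >> MODEL_S_SHIFT] = 5,	/* FIFO */
		[S_IFSOCK >> MODEL_S_SHIFT] = 6,	/* SOCK */
		[S_IFLNK  >> MODEL_S_SHIFT] = 7,	/* SYMLINK */
	};

	static unsigned char umode_to_ftype(unsigned int mode)
	{
		return type_by_mode[(mode & S_IFMT) >> MODEL_S_SHIFT];
	}
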
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5b27ff1..d7b3173 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Defines OCFS2 ioctls.
  *
  * Copyright (C) 2010 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef OCFS2_IOCTL_H
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 7051b99..b4be849 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Defines OCFS2 lockid bits.
  *
  * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_LOCKID_H
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 2e45c8d..5c9c105 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Defines OCFS2 Locking version values.
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef OCFS2_LOCKINGVER_H
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a9..dc4bce1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@
 
 DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
 
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
 DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
 
 /* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee14..ee43e51 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * refcounttree.c
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/sort.h>
@@ -4135,7 +4127,6 @@
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
-	struct ocfs2_refcount_block *rb;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
 	struct ocfs2_refcount_tree *ref_tree;
 
@@ -4162,7 +4153,6 @@
 		mlog_errno(ret);
 		goto out;
 	}
-	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
 
 	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
 					  &ref_tree->rf_ci, ref_root_bh,
@@ -4468,9 +4458,9 @@
 }
 
 /* Update destination inode size, if necessary. */
-static int ocfs2_reflink_update_dest(struct inode *dest,
-				     struct buffer_head *d_bh,
-				     loff_t newlen)
+int ocfs2_reflink_update_dest(struct inode *dest,
+			      struct buffer_head *d_bh,
+			      loff_t newlen)
 {
 	handle_t *handle;
 	int ret;
@@ -4507,14 +4497,14 @@
 }
 
 /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
-				      struct buffer_head *s_bh,
-				      loff_t pos_in,
-				      struct inode *t_inode,
-				      struct buffer_head *t_bh,
-				      loff_t pos_out,
-				      loff_t len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+					 struct buffer_head *s_bh,
+					 loff_t pos_in,
+					 struct inode *t_inode,
+					 struct buffer_head *t_bh,
+					 loff_t pos_out,
+					 loff_t len,
+					 struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	struct ocfs2_extent_tree s_et;
 	struct ocfs2_extent_tree t_et;
@@ -4522,8 +4512,9 @@
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_refcount_tree *ref_tree;
 	struct ocfs2_super *osb;
+	loff_t remapped_bytes = 0;
 	loff_t pstart, plen;
-	u32 p_cluster, num_clusters, slast, spos, tpos;
+	u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
 	unsigned int ext_flags;
 	int ret = 0;
 
@@ -4605,30 +4596,34 @@
 next_loop:
 		spos += num_clusters;
 		tpos += num_clusters;
+		remapped_clus += num_clusters;
 	}
 
-out:
-	return ret;
+	goto out;
 out_unlock_refcount:
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
-	return ret;
+out:
+	remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+	remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+	return remapped_bytes > 0 ? remapped_bytes : ret;
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
-				      struct buffer_head *s_bh,
-				      loff_t pos_in,
-				      struct inode *t_inode,
-				      struct buffer_head *t_bh,
-				      loff_t pos_out,
-				      loff_t len)
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+				  struct buffer_head *s_bh,
+				  loff_t pos_in,
+				  struct inode *t_inode,
+				  struct buffer_head *t_bh,
+				  loff_t pos_out,
+				  loff_t len)
 {
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dis;
 	struct ocfs2_dinode *dit;
-	int ret;
+	loff_t ret;
 
 	osb = OCFS2_SB(s_inode->i_sb);
 	dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4700,7 +4695,7 @@
 	/* Actually remap extents now. */
 	ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
 					 pos_out, len, &dealloc);
-	if (ret) {
+	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
@@ -4715,23 +4710,24 @@
 }
 
 /* Lock an inode and grab a bh pointing to the inode. */
-static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
-				     struct buffer_head **bh1,
-				     struct inode *t_inode,
-				     struct buffer_head **bh2)
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+			      struct buffer_head **bh_s,
+			      struct inode *t_inode,
+			      struct buffer_head **bh_t)
 {
-	struct inode *inode1;
-	struct inode *inode2;
+	struct inode *inode1 = s_inode;
+	struct inode *inode2 = t_inode;
 	struct ocfs2_inode_info *oi1;
 	struct ocfs2_inode_info *oi2;
+	struct buffer_head *bh1 = NULL;
+	struct buffer_head *bh2 = NULL;
 	bool same_inode = (s_inode == t_inode);
+	bool need_swap = (inode1->i_ino > inode2->i_ino);
 	int status;
 
 	/* First grab the VFS and rw locks. */
 	lock_two_nondirectories(s_inode, t_inode);
-	inode1 = s_inode;
-	inode2 = t_inode;
-	if (inode1->i_ino > inode2->i_ino)
+	if (need_swap)
 		swap(inode1, inode2);
 
 	status = ocfs2_rw_lock(inode1, 1);
@@ -4754,17 +4750,13 @@
 	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
 				(unsigned long long)oi2->ip_blkno);
 
-	if (*bh1)
-		*bh1 = NULL;
-	if (*bh2)
-		*bh2 = NULL;
-
 	/* We always want to lock the one with the lower lockid first. */
 	if (oi1->ip_blkno > oi2->ip_blkno)
 		mlog_errno(-ENOLCK);
 
 	/* lock id1 */
-	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+	status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
+					 OI_LS_REFLINK_TARGET);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -4773,15 +4765,25 @@
 
 	/* lock id2 */
 	if (!same_inode) {
-		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+		status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
 						 OI_LS_REFLINK_TARGET);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
 			goto out_cl1;
 		}
-	} else
-		*bh2 = *bh1;
+	} else {
+		bh2 = bh1;
+	}
+
+	/*
+	 * If we swapped inode order above, we have to swap the buffer heads
+	 * before passing them back to the caller.
+	 */
+	if (need_swap)
+		swap(bh1, bh2);
+	*bh_s = bh1;
+	*bh_t = bh2;
 
 	trace_ocfs2_double_lock_end(
 			(unsigned long long)oi1->ip_blkno,
@@ -4791,8 +4793,7 @@
 
 out_cl1:
 	ocfs2_inode_unlock(inode1, 1);
-	brelse(*bh1);
-	*bh1 = NULL;
+	brelse(bh1);
 out_rw2:
 	ocfs2_rw_unlock(inode2, 1);
 out_i2:
@@ -4803,10 +4804,10 @@
 }
 
 /* Unlock both inodes and release buffers. */
-static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
-					struct buffer_head *s_bh,
-					struct inode *t_inode,
-					struct buffer_head *t_bh)
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+				 struct buffer_head *s_bh,
+				 struct inode *t_inode,
+				 struct buffer_head *t_bh)
 {
 	ocfs2_inode_unlock(s_inode, 1);
 	ocfs2_rw_unlock(s_inode, 1);
@@ -4818,82 +4819,3 @@
 	}
 	unlock_two_nondirectories(s_inode, t_inode);
 }
-
-/* Link a range of blocks from one file to another. */
-int ocfs2_reflink_remap_range(struct file *file_in,
-			      loff_t pos_in,
-			      struct file *file_out,
-			      loff_t pos_out,
-			      u64 len,
-			      bool is_dedupe)
-{
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
-	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
-	struct buffer_head *in_bh = NULL, *out_bh = NULL;
-	bool same_inode = (inode_in == inode_out);
-	ssize_t ret;
-
-	if (!ocfs2_refcount_tree(osb))
-		return -EOPNOTSUPP;
-	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
-		return -EROFS;
-
-	/* Lock both files against IO */
-	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
-	if (ret)
-		return ret;
-
-	/* Check file eligibility and prepare for block sharing. */
-	ret = -EINVAL;
-	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
-	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
-		goto out_unlock;
-
-	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-			&len, is_dedupe);
-	if (ret <= 0)
-		goto out_unlock;
-
-	/* Lock out changes to the allocation maps and remap. */
-	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-	if (!same_inode)
-		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
-				  SINGLE_DEPTH_NESTING);
-
-	ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-					 out_bh, pos_out, len);
-
-	/* Zap any page cache for the destination file's range. */
-	if (!ret)
-		truncate_inode_pages_range(&inode_out->i_data, pos_out,
-					   PAGE_ALIGN(pos_out + len) - 1);
-
-	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-	if (!same_inode)
-		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
-	/*
-	 * Empty the extent map so that we may get the right extent
-	 * record from the disk.
-	 */
-	ocfs2_extent_map_trunc(inode_in, 0);
-	ocfs2_extent_map_trunc(inode_out, 0);
-
-	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
-	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-	return 0;
-
-out_unlock:
-	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-	return ret;
-}
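
Editor's note: ocfs2_reflink_remap_blocks() and the extent remap now return a byte count
rather than 0/-errno, so callers can report partial progress. The return convention, as a
standalone sketch:

	/* Return bytes remapped when any progress was made, else the
	 * negative error.  'len' caps the answer because whole-cluster
	 * remapping can overshoot the requested range. */
	static long long remap_result(long long remapped_bytes, long long len,
				      int err)
	{
		if (remapped_bytes > len)
			remapped_bytes = len;
		return remapped_bytes > 0 ? remapped_bytes : err;
	}
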
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 4af55bf..0b90144 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * refcounttree.h
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 #ifndef OCFS2_REFCOUNTTREE_H
 #define OCFS2_REFCOUNTTREE_H
@@ -115,11 +107,23 @@
 			const char __user *oldname,
 			const char __user *newname,
 			bool preserve);
-int ocfs2_reflink_remap_range(struct file *file_in,
-			      loff_t pos_in,
-			      struct file *file_out,
-			      loff_t pos_out,
-			      u64 len,
-			      bool is_dedupe);
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+				  struct buffer_head *s_bh,
+				  loff_t pos_in,
+				  struct inode *t_inode,
+				  struct buffer_head *t_bh,
+				  loff_t pos_out,
+				  loff_t len);
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+			      struct buffer_head **bh1,
+			      struct inode *t_inode,
+			      struct buffer_head **bh2);
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+				 struct buffer_head *s_bh,
+				 struct inode *t_inode,
+				 struct buffer_head *t_bh);
+int ocfs2_reflink_update_dest(struct inode *dest,
+			      struct buffer_head *d_bh,
+			      loff_t newlen);
 
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 6a348b0..0249e8c 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -13,15 +14,6 @@
  * Universite Pierre et Marie Curie (Paris VI)
  *
  * The rest is copyright (C) 2010 Novell.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 42c2b80..6ac8812 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Allocation reservations function prototypes and structures.
  *
  * Copyright (C) 2010 Novell.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef	OCFS2_RESERVATIONS_H
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 18451e0..24eb52f 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -7,21 +8,6 @@
  * Inspired by ext3/resize.c.
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
index f38841a..0af0c02 100644
--- a/fs/ocfs2/resize.h
+++ b/fs/ocfs2/resize.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_RESIZE_H
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d740799..8caecee 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -1,26 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * slot_map.c
  *
- *
- *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/types.h>
@@ -55,7 +39,7 @@
 	unsigned int si_blocks;
 	struct buffer_head **si_bh;
 	unsigned int si_num_slots;
-	struct ocfs2_slot *si_slots;
+	struct ocfs2_slot si_slots[];
 };
 
 
@@ -420,9 +404,7 @@
 	struct inode *inode = NULL;
 	struct ocfs2_slot_info *si;
 
-	si = kzalloc(sizeof(struct ocfs2_slot_info) +
-		     (sizeof(struct ocfs2_slot) * osb->max_slots),
-		     GFP_KERNEL);
+	si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
 	if (!si) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -431,8 +413,6 @@
 
 	si->si_extended = ocfs2_uses_extended_slot_map(osb);
 	si->si_num_slots = osb->max_slots;
-	si->si_slots = (struct ocfs2_slot *)((char *)si +
-					     sizeof(struct ocfs2_slot_info));
 
 	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
 					    OCFS2_INVALID_SLOT);
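
Editor's note: the hand-rolled trailing allocation (a header plus an si_slots pointer
fixed up by address arithmetic) becomes a C99 flexible array member sized with
struct_size(), which also checks the size multiplication for overflow. A userspace model
of the layout, with a stand-in slot type:

	#include <stdlib.h>

	struct slot { int assigned, node; };	/* stand-in for ocfs2_slot */

	struct slot_info_model {
		unsigned int num_slots;
		struct slot  slots[];		/* flexible array member */
	};

	/* One allocation covers the header and the array; this sizing is
	 * what struct_size() wraps (plus overflow saturation) in the kernel. */
	static struct slot_info_model *alloc_slot_info(unsigned int n)
	{
		struct slot_info_model *si =
			calloc(1, sizeof(*si) + n * sizeof(si->slots[0]));

		if (si)
			si->num_slots = n;
		return si;
	}
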
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 601c95f..93b53e7 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * description here
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 220cae7..dbf8b57 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Code which interfaces ocfs2 with the o2cb stack.
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index d2fb97b..7397064 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/module.h>
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350b..8aa6a66 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -7,15 +8,6 @@
  * cluster stacks.
  *
  * Copyright (C) 2007, 2009 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/list.h>
@@ -48,12 +40,6 @@
  */
 static struct ocfs2_stack_plugin *active_stack;
 
-inline int ocfs2_is_o2cb_active(void)
-{
-	return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
 {
 	struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1..e9d26cb 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,15 +7,6 @@
  * Glue to the underlying cluster stack.
  *
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 
@@ -298,9 +290,6 @@
 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
 
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
 extern struct kset *ocfs2_kset;
 
 #endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f7c972f..69c21a3 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -7,21 +8,6 @@
  * Inspired by ext3 block groups.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 2d25017..f0a5d30 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Defines sub allocator api
  *
  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef _CHAINALLOC_H_
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b..c81e86c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * load/unload driver, mount/dismount volumes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/module.h>
@@ -134,7 +120,7 @@
 			    int block,
 			    int sect_size);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
-static void ocfs2_destroy_inode(struct inode *inode);
+static void ocfs2_free_inode(struct inode *inode);
 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
 static int ocfs2_enable_quotas(struct ocfs2_super *osb);
 static void ocfs2_disable_quotas(struct ocfs2_super *osb);
@@ -147,7 +133,7 @@
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
 	.alloc_inode	= ocfs2_alloc_inode,
-	.destroy_inode	= ocfs2_destroy_inode,
+	.free_inode	= ocfs2_free_inode,
 	.drop_inode	= ocfs2_drop_inode,
 	.evict_inode	= ocfs2_evict_inode,
 	.sync_fs	= ocfs2_sync_fs,
@@ -575,17 +561,11 @@
 	return &oi->vfs_inode;
 }
 
-static void ocfs2_i_callback(struct rcu_head *head)
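+/*
+ * ->free_inode is called by the VFS from RCU context, so the explicit
+ * call_rcu() that ->destroy_inode needed is no longer required.
+ */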
+static void ocfs2_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
 }
 
-static void ocfs2_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, ocfs2_i_callback);
-}
-
 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
 						unsigned int cbits)
 {
@@ -600,7 +580,6 @@
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
 	BUILD_BUG_ON(sizeof(sector_t) != 8);
 	/*
 	 * We might be limited by page cache size.
@@ -614,15 +593,6 @@
 		 */
 		bitshift = 31;
 	}
-# else
-	/*
-	 * We are limited by the size of sector_t. Use block size, as
-	 * that's what we expose to the VFS.
-	 */
-	bytes = 1 << bbits;
-	trim = 1;
-	bitshift = 31;
-# endif
 #endif
 
 	/*
@@ -1109,33 +1079,13 @@
 
 	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
 						 ocfs2_debugfs_root);
-	if (!osb->osb_debug_root) {
-		status = -EINVAL;
-		mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
-		goto read_super_error;
-	}
 
-	osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
-					    osb->osb_debug_root,
-					    osb,
-					    &ocfs2_osb_debug_fops);
-	if (!osb->osb_ctxt) {
-		status = -EINVAL;
-		mlog_errno(status);
-		goto read_super_error;
-	}
+	debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
+			    osb, &ocfs2_osb_debug_fops);
 
-	if (ocfs2_meta_ecc(osb)) {
-		status = ocfs2_blockcheck_stats_debugfs_install(
-						&osb->osb_ecc_stats,
-						osb->osb_debug_root);
-		if (status) {
-			mlog(ML_ERROR,
-			     "Unable to create blockcheck statistics "
-			     "files\n");
-			goto read_super_error;
-		}
-	}
+	if (ocfs2_meta_ecc(osb))
+		ocfs2_blockcheck_stats_debugfs_install(&osb->osb_ecc_stats,
+						       osb->osb_debug_root);
 
 	status = ocfs2_mount_volume(sb);
 	if (status < 0)
@@ -1622,11 +1572,6 @@
 		goto out2;
 
 	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
-	if (!ocfs2_debugfs_root) {
-		status = -ENOMEM;
-		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
-		goto out3;
-	}
 
 	ocfs2_set_locking_protocol();
 
@@ -1847,6 +1792,8 @@
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
+	mutex_init(&osb->obs_trim_fs_mutex);
+
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1912,8 +1859,6 @@
 
 	kset_unregister(osb->osb_dev_kset);
 
-	debugfs_remove(osb->osb_ctxt);
-
 	/* Orphan scan should be stopped as early as possible */
 	ocfs2_orphan_scan_stop(osb);
 
@@ -1969,7 +1914,7 @@
 		ocfs2_dlm_shutdown(osb, hangup_needed);
 
 	ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
-	debugfs_remove(osb->osb_debug_root);
+	debugfs_remove_recursive(osb->osb_debug_root);
 
 	if (hangup_needed)
 		ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index d4550c8..76facaf 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_SUPER_H
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 71ee424..167094d 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_SYMLINK_H
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 5965f38..bb701c4 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Initialize, read, write, etc. system files.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/fs.h>
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
index cc9ea66..a83dd96 100644
--- a/fs/ocfs2/sysfile.h
+++ b/fs/ocfs2/sysfile.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Function prototypes
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_SYSFILE_H
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 78f09c7..580852b 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -8,21 +9,6 @@
  *
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
  * Standard buffer head caching flags (uptodate, etc) are insufficient
  * in a clustered environment - a buffer may be marked up to date on
  * our local node but could have been modified by another cluster
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 0d826fe..77a30ca 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -6,21 +7,6 @@
  * Cluster uptodate tracking
  *
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_UPTODATE_H
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3a24ce3..90c830e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
@@ -8,15 +9,6 @@
  * CREDITS:
  * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
  * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/capability.h>
@@ -3833,7 +3825,6 @@
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int low_bucket = 0, bucket, high_bucket;
 	struct ocfs2_xattr_bucket *search;
-	u32 last_hash;
 	u64 blkno, lower_blkno = 0;
 
 	search = ocfs2_xattr_bucket_new(inode);
@@ -3877,8 +3868,6 @@
 		if (xh->xh_count)
 			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
 
-		last_hash = le32_to_cpu(xe->xe_name_hash);
-
 		/* record lower_blkno which may be the insert place. */
 		lower_blkno = blkno;
 
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1633cc1..9c80382 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -1,18 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * xattr.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #ifndef OCFS2_XATTR_H
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
index b1b9a0a..42b2ec3 100644
--- a/fs/omfs/Kconfig
+++ b/fs/omfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config OMFS_FS
 	tristate "SonicBlue Optimized MPEG File System support"
 	depends on BLOCK
diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile
index 8b82b63..ae16fc3 100644
--- a/fs/omfs/Makefile
+++ b/fs/omfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_OMFS_FS) += omfs.o
 
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 4bee3a7..a0f4565 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMFS (as used by RIO Karma) directory operations.
  * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
- * Released under GPL v2.
  */
 
 #include <linux/fs.h>
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index bf83e66..d640b93 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * OMFS (as used by RIO Karma) file operations.
  * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
- * Released under GPL v2.
  */
 
 #include <linux/module.h>
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ee14af9..b76ec6b 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Optimized MPEG FS - inode and super operations.
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
- * Released under GPL v2.
  */
 #include <linux/module.h>
 #include <linux/sched.h>
@@ -478,6 +478,10 @@
 
 	sb->s_maxbytes = 0xffffffff;
 
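+	/* OMFS stores timestamps as 64-bit millisecond counts. */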
+	sb->s_time_gran = NSEC_PER_MSEC;
+	sb->s_time_min = 0;
+	sb->s_time_max = U64_MAX / MSEC_PER_SEC;
+
 	sb_set_blocksize(sb, 0x200);
 
 	bh = sb_bread(sb, 0);
diff --git a/fs/open.c b/fs/open.c
index 0285ce7..b62f5c0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/open.c
  *
@@ -373,6 +374,25 @@
 				override_cred->cap_permitted;
 	}
 
+	/*
+	 * The new set of credentials can *only* be used in
+	 * task-synchronous circumstances, and does not need
+	 * RCU freeing, unless somebody then takes a separate
+	 * reference to it.
+	 *
+	 * NOTE! This is _only_ true because this credential
+	 * is used purely for override_creds() that installs
+	 * it as the subjective cred. Other threads will be
+	 * accessing ->real_cred, not the subjective cred.
+	 *
+	 * If somebody _does_ make a copy of this (using the
+	 * 'get_current_cred()' function), that will clear the
+	 * non_rcu field, because now that other user may be
+	 * expecting RCU freeing. But normal thread-synchronous
+	 * cred accesses will keep things non-RCU.
+	 */
+	override_cred->non_rcu = 1;
+
 	old_cred = override_creds(override_cred);
 retry:
 	res = user_path_at(dfd, filename, lookup_flags, &path);
@@ -733,6 +753,12 @@
 		return 0;
 	}
 
+	/* Any file opened for execve()/uselib() has to be a regular file. */
+	if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
+		error = -EACCES;
+		goto cleanup_file;
+	}
+
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 		error = get_write_access(inode);
 		if (unlikely(error))
@@ -750,7 +776,7 @@
 		f->f_mode |= FMODE_ATOMIC_POS;
 
 	f->f_op = fops_get(inode->i_fop);
-	if (unlikely(WARN_ON(!f->f_op))) {
+	if (WARN_ON(!f->f_op)) {
 		error = -ENODEV;
 		goto cleanup_all;
 	}
@@ -792,6 +818,14 @@
 		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
 			return -EINVAL;
 	}
+
+	/*
+	 * XXX: Huge page cache doesn't support writing yet. Drop all page
+	 * cache for this file before processing writes.
+	 */
+	if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
+		truncate_pagecache(inode, 0);
+
 	return 0;
 
 cleanup_all:
@@ -1209,3 +1243,22 @@
 }
 
 EXPORT_SYMBOL(nonseekable_open);
+
+/*
+ * stream_open is used by subsystems that want stream-like file descriptors.
+ * Such file descriptors are not seekable and don't have a notion of position
+ * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
+ * Contrary to file descriptors of other regular files, .read() and .write()
+ * can run simultaneously.
+ *
+ * stream_open never fails and is marked to return int so that it could be
+ * directly used as file_operations.open.
+ */
+int stream_open(struct inode *inode, struct file *filp)
+{
+	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+	filp->f_mode |= FMODE_STREAM;
+	return 0;
+}
+
+EXPORT_SYMBOL(stream_open);
diff --git a/fs/openpromfs/Makefile b/fs/openpromfs/Makefile
index 4563199..843c270 100644
--- a/fs/openpromfs/Makefile
+++ b/fs/openpromfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux Sun Openprom filesystem routines.
 #
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 607092f..40c8c2e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* inode.c: /proc/openprom handling routines
  *
  * Copyright (C) 1996-1999 Jakub Jelinek  (jakub@redhat.com)
@@ -8,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -199,10 +201,11 @@
 
 	child = dp->child;
 	while (child) {
-		int n = strlen(child->path_component_name);
+		const char *node_name = kbasename(child->full_name);
+		int n = strlen(node_name);
 
 		if (len == n &&
-		    !strncmp(child->path_component_name, name, len)) {
+		    !strncmp(node_name, name, len)) {
 			ent_type = op_inode_node;
 			ent_data.node = child;
 			ino = child->unique_id;
@@ -245,7 +248,7 @@
 		set_nlink(inode, 2);
 		break;
 	case op_inode_prop:
-		if (!strcmp(dp->name, "options") && (len == 17) &&
+		if (of_node_name_eq(dp, "options") && (len == 17) &&
 		    !strncmp (name, "security-password", 17))
 			inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
 		else
@@ -293,8 +296,8 @@
 	}
 	while (child) {
 		if (!dir_emit(ctx,
-			    child->path_component_name,
-			    strlen(child->path_component_name),
+			    kbasename(child->full_name),
+			    strlen(kbasename(child->full_name)),
 			    child->unique_id, DT_DIR))
 			goto out;
 
@@ -335,17 +338,11 @@
 	return &oi->vfs_inode;
 }
 
-static void openprom_i_callback(struct rcu_head *head)
+static void openprom_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(op_inode_cachep, OP_I(inode));
 }
 
-static void openprom_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, openprom_i_callback);
-}
-
 static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 {
 	struct inode *inode;
@@ -374,12 +371,12 @@
 
 static const struct super_operations openprom_sops = {
 	.alloc_inode	= openprom_alloc_inode,
-	.destroy_inode	= openprom_destroy_inode,
+	.free_inode	= openprom_free_inode,
 	.statfs		= simple_statfs,
 	.remount_fs	= openprom_remount,
 };
 
-static int openprom_fill_super(struct super_block *s, void *data, int silent)
+static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
 {
 	struct inode *root_inode;
 	struct op_inode_info *oi;
@@ -413,16 +410,25 @@
 	return ret;
 }
 
-static struct dentry *openprom_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int openpromfs_get_tree(struct fs_context *fc)
 {
-	return mount_single(fs_type, flags, data, openprom_fill_super);
+	return get_tree_single(fc, openprom_fill_super);
+}
+
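+/*
+ * Like the old mount_single(), get_tree_single() shares a single
+ * superblock across all mounts of openpromfs.
+ */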
+static const struct fs_context_operations openpromfs_context_ops = {
+	.get_tree	= openpromfs_get_tree,
+};
+
+static int openpromfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &openpromfs_context_ops;
+	return 0;
 }
 
 static struct file_system_type openprom_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "openpromfs",
-	.mount		= openprom_mount,
+	.init_fs_context = openpromfs_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 MODULE_ALIAS_FS("openpromfs");
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
index 1554c02..890c0ae 100644
--- a/fs/orangefs/Kconfig
+++ b/fs/orangefs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config ORANGEFS_FS
 	tristate "ORANGEFS (Powered by PVFS) support"
 	select FS_POSIX_ACL
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 1058741..eced272 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -142,7 +142,7 @@
 			rc = __orangefs_set_acl(inode, acl, type);
 		} else {
 			iattr.ia_valid = ATTR_MODE;
-			rc = orangefs_inode_setattr(inode, &iattr);
+			rc = __orangefs_setattr(inode, &iattr);
 		}
 
 		return rc;
@@ -167,12 +167,16 @@
 		error = __orangefs_set_acl(inode, default_acl,
 					   ACL_TYPE_DEFAULT);
 		posix_acl_release(default_acl);
+	} else {
+		inode->i_default_acl = NULL;
 	}
 
 	if (acl) {
 		if (!error)
 			error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
 		posix_acl_release(acl);
+	} else {
+		inode->i_acl = NULL;
 	}
 
 	/* If mode of the inode was changed, then do a forcible ->setattr */
@@ -181,7 +185,7 @@
 		inode->i_mode = mode;
 		iattr.ia_mode = mode;
 		iattr.ia_valid |= ATTR_MODE;
-		orangefs_inode_setattr(inode, &iattr);
+		__orangefs_setattr(inode, &iattr);
 	}
 
 	return error;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe7..a5612ab 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
  *
  * See COPYING in top-level directory.
  */
@@ -44,15 +45,16 @@
 /*
  * Post and wait for the I/O upcall to finish
  */
-static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
-		loff_t *offset, struct iov_iter *iter,
-		size_t total_size, loff_t readahead_size)
+ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
+    loff_t *offset, struct iov_iter *iter, size_t total_size,
+    loff_t readahead_size, struct orangefs_write_range *wr, int *index_return)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
 	struct orangefs_kernel_op_s *new_op = NULL;
-	int buffer_index = -1;
+	int buffer_index;
 	ssize_t ret;
+	size_t copy_amount;
 
 	new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
 	if (!new_op)
@@ -84,6 +86,10 @@
 	new_op->upcall.req.io.buf_index = buffer_index;
 	new_op->upcall.req.io.count = total_size;
 	new_op->upcall.req.io.offset = *offset;
+	if (type == ORANGEFS_IO_WRITE && wr) {
+		new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
+		new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
+	}
 
 	gossip_debug(GOSSIP_FILE_DEBUG,
 		     "%s(%pU): offset: %llu total_size: %zd\n",
@@ -128,7 +134,6 @@
 	 */
 	if (ret == -EAGAIN && op_state_purged(new_op)) {
 		orangefs_bufmap_put(buffer_index);
-		buffer_index = -1;
 		if (type == ORANGEFS_IO_WRITE)
 			iov_iter_revert(iter, total_size);
 		gossip_debug(GOSSIP_FILE_DEBUG,
@@ -168,7 +173,10 @@
 			 * trigger the write.
 			 */
 			case OP_VFS_STATE_INPROGR:
-				ret = total_size;
+				if (type == ORANGEFS_IO_READ)
+					ret = -EINTR;
+				else
+					ret = total_size;
 				break;
 			default:
 				gossip_err("%s: unexpected op state :%d:.\n",
@@ -204,8 +212,25 @@
 	 *       can further be kernel-space or user-space addresses.
 		 *       or it can pointers to struct page's
 		 */
+
+		/*
+		 * When reading, readahead_size will only be zero when
+		 * we're doing O_DIRECT, otherwise we got here from
+		 * orangefs_readpage.
+		 *
+		 * If we got here from orangefs_readpage we want to
+		 * copy either a page or the whole file into the io
+		 * vector, whichever is smaller.
+		 */
+		if (readahead_size)
+			copy_amount =
+				min(new_op->downcall.resp.io.amt_complete,
+					(__s64)PAGE_SIZE);
+		else
+			copy_amount = new_op->downcall.resp.io.amt_complete;
+
 		ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
-		    new_op->downcall.resp.io.amt_complete);
+			copy_amount);
 		if (ret < 0) {
 			gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
 			    __func__, (long)ret);
@@ -223,250 +248,127 @@
 
 out:
 	if (buffer_index >= 0) {
-		orangefs_bufmap_put(buffer_index);
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): PUT buffer_index %d\n",
-			     __func__, handle, buffer_index);
-		buffer_index = -1;
+		if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
+			/* readpage */
+			*index_return = buffer_index;
+			gossip_debug(GOSSIP_FILE_DEBUG,
+				"%s: hold on to buffer_index :%d:\n",
+				__func__, buffer_index);
+		} else {
+			/* O_DIRECT */
+			orangefs_bufmap_put(buffer_index);
+			gossip_debug(GOSSIP_FILE_DEBUG,
+				"%s(%pU): PUT buffer_index %d\n",
+				__func__, handle, buffer_index);
+		}
 	}
 	op_release(new_op);
 	return ret;
 }
 
-/*
- * Common entry point for read/write/readv/writev
- * This function will dispatch it to either the direct I/O
- * or buffered I/O path depending on the mount options and/or
- * augmented/extended metadata attached to the file.
- * Note: File extended attributes override any mount options.
- */
-static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
-		loff_t *offset, struct iov_iter *iter)
+int orangefs_revalidate_mapping(struct inode *inode)
 {
-	struct inode *inode = file->f_mapping->host;
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
-	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
-	size_t count = iov_iter_count(iter);
-	ssize_t total_count = 0;
-	ssize_t ret = -EINVAL;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long *bitlock = &orangefs_inode->bitlock;
+	int ret;
 
-	gossip_debug(GOSSIP_FILE_DEBUG,
-		"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
-		__func__,
-		handle,
-		(int)count);
-
-	if (type == ORANGEFS_IO_WRITE) {
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): proceeding with offset : %llu, "
-			     "size %d\n",
-			     __func__,
-			     handle,
-			     llu(*offset),
-			     (int)count);
-	}
-
-	if (count == 0) {
-		ret = 0;
-		goto out;
-	}
-
-	while (iov_iter_count(iter)) {
-		size_t each_count = iov_iter_count(iter);
-		size_t amt_complete;
-
-		/* how much to transfer in this loop iteration */
-		if (each_count > orangefs_bufmap_size_query())
-			each_count = orangefs_bufmap_size_query();
-
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): size of each_count(%d)\n",
-			     __func__,
-			     handle,
-			     (int)each_count);
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): BEFORE wait_for_io: offset is %d\n",
-			     __func__,
-			     handle,
-			     (int)*offset);
-
-		ret = wait_for_direct_io(type, inode, offset, iter,
-				each_count, 0);
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): return from wait_for_io:%d\n",
-			     __func__,
-			     handle,
-			     (int)ret);
-
-		if (ret < 0)
-			goto out;
-
-		*offset += ret;
-		total_count += ret;
-		amt_complete = ret;
-
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s(%pU): AFTER wait_for_io: offset is %d\n",
-			     __func__,
-			     handle,
-			     (int)*offset);
-
-		/*
-		 * if we got a short I/O operations,
-		 * fall out and return what we got so far
-		 */
-		if (amt_complete < each_count)
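+	/*
+	 * Bit 1 of bitlock serializes invalidation of the mapping: wait
+	 * until nobody else is invalidating, then either return because
+	 * the mapping is still considered fresh, or claim the bit and
+	 * invalidate it below.
+	 */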
+	while (1) {
+		ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
+		if (ret)
+			return ret;
+		spin_lock(&inode->i_lock);
+		if (test_bit(1, bitlock)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		if (!time_before(jiffies, orangefs_inode->mapping_time))
 			break;
-	} /*end while */
-
-out:
-	if (total_count > 0)
-		ret = total_count;
-	if (ret > 0) {
-		if (type == ORANGEFS_IO_READ) {
-			file_accessed(file);
-		} else {
-			file_update_time(file);
-			/*
-			 * Must invalidate to ensure write loop doesn't
-			 * prevent kernel from reading updated
-			 * attribute.  Size probably changed because of
-			 * the write, and other clients could update
-			 * any other attribute.
-			 */
-			orangefs_inode->getattr_time = jiffies - 1;
-		}
+		spin_unlock(&inode->i_lock);
+		return 0;
 	}
 
-	gossip_debug(GOSSIP_FILE_DEBUG,
-		     "%s(%pU): Value(%d) returned.\n",
-		     __func__,
-		     handle,
-		     (int)ret);
+	set_bit(1, bitlock);
+	smp_wmb();
+	spin_unlock(&inode->i_lock);
+
+	unmap_mapping_range(mapping, 0, 0, 0);
+	ret = filemap_write_and_wait(mapping);
+	if (!ret)
+		ret = invalidate_inode_pages2(mapping);
+
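+	/* The cache timeout is in milliseconds; convert it to jiffies. */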
+	orangefs_inode->mapping_time = jiffies +
+	    orangefs_cache_timeout_msecs * HZ / 1000;
+
+	clear_bit(1, bitlock);
+	smp_mb__after_atomic();
+	wake_up_bit(bitlock, 1);
 
 	return ret;
 }
 
-/*
- * Read data from a specified offset in a file (referenced by inode).
- * Data may be placed either in a user or kernel buffer.
- */
-ssize_t orangefs_inode_read(struct inode *inode,
-			    struct iov_iter *iter,
-			    loff_t *offset,
-			    loff_t readahead_size)
+static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
+    struct iov_iter *iter)
 {
-	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
-	size_t count = iov_iter_count(iter);
-	size_t bufmap_size;
-	ssize_t ret = -EINVAL;
+	int ret;
+	struct orangefs_read_options *ro;
 
 	orangefs_stats.reads++;
 
-	bufmap_size = orangefs_bufmap_size_query();
-	if (count > bufmap_size) {
-		gossip_debug(GOSSIP_FILE_DEBUG,
-			     "%s: count is too large (%zd/%zd)!\n",
-			     __func__, count, bufmap_size);
-		return -EINVAL;
-	}
-
-	gossip_debug(GOSSIP_FILE_DEBUG,
-		     "%s(%pU) %zd@%llu\n",
-		     __func__,
-		     &orangefs_inode->refn.khandle,
-		     count,
-		     llu(*offset));
-
-	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
-			count, readahead_size);
-	if (ret > 0)
-		*offset += ret;
-
-	gossip_debug(GOSSIP_FILE_DEBUG,
-		     "%s(%pU): Value(%zd) returned.\n",
-		     __func__,
-		     &orangefs_inode->refn.khandle,
-		     ret);
-
-	return ret;
-}
-
-static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	loff_t pos = iocb->ki_pos;
-	ssize_t rc = 0;
-
-	BUG_ON(iocb->private);
-
-	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
-
-	orangefs_stats.reads++;
-
-	rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
-	iocb->ki_pos = pos;
-
-	return rc;
-}
-
-static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	loff_t pos;
-	ssize_t rc;
-
-	BUG_ON(iocb->private);
-
-	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
-
-	inode_lock(file->f_mapping->host);
-
-	/* Make sure generic_write_checks sees an up to date inode size. */
-	if (file->f_flags & O_APPEND) {
-		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
-		    STATX_SIZE);
-		if (rc == -ESTALE)
-			rc = -EIO;
-		if (rc) {
-			gossip_err("%s: orangefs_inode_getattr failed, "
-			    "rc:%zd:.\n", __func__, rc);
-			goto out;
-		}
-	}
-
-	rc = generic_write_checks(iocb, iter);
-
-	if (rc <= 0) {
-		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
-			   __func__, rc);
-		goto out;
-	}
-
 	/*
-	 * if we are appending, generic_write_checks would have updated
-	 * pos to the end of the file, so we will wait till now to set
-	 * pos...
+	 * Remember how they set "count" in read(2) or pread(2) or whatever -
+	 * users can use count as a knob to control orangefs io size and later
+	 * we can try to help them fill as many pages as possible in readpage.
 	 */
-	pos = iocb->ki_pos;
-
-	rc = do_readv_writev(ORANGEFS_IO_WRITE,
-			     file,
-			     &pos,
-			     iter);
-	if (rc < 0) {
-		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
-			   __func__, rc);
-		goto out;
+	if (!iocb->ki_filp->private_data) {
+		iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL);
+		if (!iocb->ki_filp->private_data)
+			return -ENOMEM;
+		ro = iocb->ki_filp->private_data;
+		ro->blksiz = iter->count;
 	}
 
-	iocb->ki_pos = pos;
+	down_read(&file_inode(iocb->ki_filp)->i_rwsem);
+	ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+	if (ret)
+		goto out;
+
+	ret = generic_file_read_iter(iocb, iter);
+out:
+	up_read(&file_inode(iocb->ki_filp)->i_rwsem);
+	return ret;
+}
+
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
+    struct iov_iter *iter)
+{
+	int ret;
+
 	orangefs_stats.writes++;
 
-out:
+	if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
+		ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+		if (ret)
+			return ret;
+	}
 
-	inode_unlock(file->f_mapping->host);
-	return rc;
+	ret = generic_file_write_iter(iocb, iter);
+	return ret;
+}
+
+static int orangefs_getflags(struct inode *inode, unsigned long *uval)
+{
+	__u64 val = 0;
+	int ret;
+
+	ret = orangefs_inode_getxattr(inode,
+				      "user.pvfs2.meta_hint",
+				      &val, sizeof(val));
+	if (ret < 0 && ret != -ENODATA)
+		return ret;
+	else if (ret == -ENODATA)
+		val = 0;
+	*uval = val;
+	return 0;
 }
 
 /*
@@ -474,6 +376,7 @@
  */
 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file_inode(file);
 	int ret = -ENOTTY;
 	__u64 val = 0;
 	unsigned long uval;
@@ -487,20 +390,16 @@
 	 * and append flags
 	 */
 	if (cmd == FS_IOC_GETFLAGS) {
-		val = 0;
-		ret = orangefs_inode_getxattr(file_inode(file),
-					      "user.pvfs2.meta_hint",
-					      &val, sizeof(val));
-		if (ret < 0 && ret != -ENODATA)
+		ret = orangefs_getflags(inode, &uval);
+		if (ret)
 			return ret;
-		else if (ret == -ENODATA)
-			val = 0;
-		uval = val;
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
 			     (unsigned long long)uval);
 		return put_user(uval, (int __user *)arg);
 	} else if (cmd == FS_IOC_SETFLAGS) {
+		unsigned long old_uval;
+
 		ret = 0;
 		if (get_user(uval, (int __user *)arg))
 			return -EFAULT;
@@ -516,11 +415,17 @@
 			gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
 			return -EINVAL;
 		}
+		ret = orangefs_getflags(inode, &old_uval);
+		if (ret)
+			return ret;
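+		/*
+		 * vfs_ioc_setflags_prepare() rejects changes to the
+		 * immutable or append-only flag without
+		 * CAP_LINUX_IMMUTABLE.
+		 */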
+		ret = vfs_ioc_setflags_prepare(inode, old_uval, uval);
+		if (ret)
+			return ret;
 		val = uval;
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
 			     (unsigned long long)val);
-		ret = orangefs_inode_setxattr(file_inode(file),
+		ret = orangefs_inode_setxattr(inode,
 					      "user.pvfs2.meta_hint",
 					      &val, sizeof(val), 0);
 	}
@@ -532,14 +437,13 @@
 {
 	struct file *file = vmf->vma->vm_file;
 	int ret;
-
-	ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
-	    STATX_SIZE);
+	ret = orangefs_inode_getattr(file->f_mapping->host,
+	    ORANGEFS_GETATTR_SIZE);
 	if (ret == -ESTALE)
 		ret = -EIO;
 	if (ret) {
-		gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
-				__func__, ret);
+		gossip_err("%s: orangefs_inode_getattr failed, "
+		    "ret:%d:.\n", __func__, ret);
 		return VM_FAULT_SIGBUS;
 	}
 	return filemap_fault(vmf);
@@ -548,7 +452,7 @@
 static const struct vm_operations_struct orangefs_file_vm_ops = {
 	.fault = orangefs_fault,
 	.map_pages = filemap_map_pages,
-	.page_mkwrite = filemap_page_mkwrite,
+	.page_mkwrite = orangefs_page_mkwrite,
 };
 
 /*
@@ -556,15 +460,18 @@
  */
 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	int ret;
+
+	ret = orangefs_revalidate_mapping(file_inode(file));
+	if (ret)
+		return ret;
+
 	gossip_debug(GOSSIP_FILE_DEBUG,
 		     "orangefs_file_mmap: called on %s\n",
 		     (file ?
 			(char *)file->f_path.dentry->d_name.name :
 			(char *)"Unknown"));
 
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
-		return -EINVAL;
-
 	/* set the sequential readahead hint */
 	vma->vm_flags |= VM_SEQ_READ;
 	vma->vm_flags &= ~VM_RAND_READ;
@@ -604,8 +511,7 @@
 			gossip_debug(GOSSIP_INODE_DEBUG,
 			    "flush_racache finished\n");
 		}
-		truncate_inode_pages(file_inode(file)->i_mapping,
-				     0);
+
 	}
 	return 0;
 }
@@ -623,6 +529,11 @@
 		ORANGEFS_I(file_inode(file));
 	struct orangefs_kernel_op_s *new_op = NULL;
 
+	ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
+	    start, end);
+	if (ret < 0)
+		return ret;
+
 	new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
 	if (!new_op)
 		return -ENOMEM;
@@ -644,7 +555,7 @@
  * Change the file pointer position for an instance of an open file.
  *
  * \note If .llseek is overriden, we must acquire lock as described in
- *       Documentation/filesystems/Locking.
+ *       Documentation/filesystems/locking.rst.
  *
  * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
  * require much changes to the FS
@@ -660,8 +571,8 @@
 		 * NOTE: We are only interested in file size here,
 		 * so we set mask accordingly.
 		 */
-		ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
-		    STATX_SIZE);
+		ret = orangefs_inode_getattr(file->f_mapping->host,
+		    ORANGEFS_GETATTR_SIZE);
 		if (ret == -ESTALE)
 			ret = -EIO;
 		if (ret) {
@@ -704,6 +615,42 @@
 	return rc;
 }
 
+static int orangefs_file_open(struct inode * inode, struct file *file)
+{
+	file->private_data = NULL;
+	return generic_file_open(inode, file);
+}
+
+static int orangefs_flush(struct file *file, fl_owner_t id)
+{
+	/*
+	 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
+	 * service_operation in orangefs_fsync.
+	 *
+	 * Do not send fsync to OrangeFS server on a close.  Do send fsync
+	 * on an explicit fsync call.  This duplicates historical OrangeFS
+	 * behavior.
+	 */
+	struct inode *inode = file->f_mapping->host;
+	int r;
+
+	kfree(file->private_data);
+	file->private_data = NULL;
+
+	if (inode->i_state & I_DIRTY_TIME) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
+
+	r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
+	if (r > 0)
+		return 0;
+	else
+		return r;
+}
+
 /** ORANGEFS implementation of VFS file operations */
 const struct file_operations orangefs_file_operations = {
 	.llseek		= orangefs_file_llseek,
@@ -712,7 +659,8 @@
 	.lock		= orangefs_lock,
 	.unlocked_ioctl	= orangefs_ioctl,
 	.mmap		= orangefs_file_mmap,
-	.open		= generic_file_open,
+	.open		= orangefs_file_open,
+	.flush		= orangefs_flush,
 	.release	= orangefs_file_release,
 	.fsync		= orangefs_fsync,
 };
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 3193287..efb1219 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
  *
  * See COPYING in top-level directory.
  */
@@ -14,40 +15,312 @@
 #include "orangefs-kernel.h"
 #include "orangefs-bufmap.h"
 
-static int read_one_page(struct page *page)
+static int orangefs_writepage_locked(struct page *page,
+    struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct orangefs_write_range *wr = NULL;
+	struct iov_iter iter;
+	struct bio_vec bv;
+	size_t len, wlen;
+	ssize_t ret;
+	loff_t off;
+
+	set_page_writeback(page);
+
+	len = i_size_read(inode);
+	if (PagePrivate(page)) {
+		wr = (struct orangefs_write_range *)page_private(page);
+		WARN_ON(wr->pos >= len);
+		off = wr->pos;
+		if (off + wr->len > len)
+			wlen = len - off;
+		else
+			wlen = wr->len;
+	} else {
+		WARN_ON(1);
+		off = page_offset(page);
+		if (off + PAGE_SIZE > len)
+			wlen = len - off;
+		else
+			wlen = PAGE_SIZE;
+	}
+	/* Should've been handled in orangefs_invalidatepage. */
+	WARN_ON(off == len || off + wlen > len);
+
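+	/* Wrap just the dirty part of this page in a one-segment bvec. */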
+	bv.bv_page = page;
+	bv.bv_len = wlen;
+	bv.bv_offset = off % PAGE_SIZE;
+	WARN_ON(wlen == 0);
+	iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
+
+	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
+	    len, wr, NULL);
+	if (ret < 0) {
+		SetPageError(page);
+		mapping_set_error(page->mapping, ret);
+	} else {
+		ret = 0;
+	}
+	if (wr) {
+		kfree(wr);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+
+static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
-	int max_block;
-	ssize_t bytes_read = 0;
-	struct inode *inode = page->mapping->host;
-	const __u32 blocksize = PAGE_SIZE;
-	const __u32 blockbits = PAGE_SHIFT;
-	struct iov_iter to;
-	struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
+	ret = orangefs_writepage_locked(page, wbc);
+	unlock_page(page);
+	end_page_writeback(page);
+	return ret;
+}
 
-	iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+struct orangefs_writepages {
+	loff_t off;
+	size_t len;
+	kuid_t uid;
+	kgid_t gid;
+	int maxpages;
+	int npages;
+	struct page **pages;
+	struct bio_vec *bv;
+};
 
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		    "orangefs_readpage called with page %p\n",
-		     page);
+static int orangefs_writepages_work(struct orangefs_writepages *ow,
+    struct writeback_control *wbc)
+{
+	struct inode *inode = ow->pages[0]->mapping->host;
+	struct orangefs_write_range *wrp, wr;
+	struct iov_iter iter;
+	ssize_t ret;
+	size_t len;
+	loff_t off;
+	int i;
 
-	max_block = ((inode->i_size / blocksize) + 1);
+	len = i_size_read(inode);
 
-	if (page->index < max_block) {
-		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
-
-		bytes_read = orangefs_inode_read(inode,
-						 &to,
-						 &blockptr_offset,
-						 inode->i_size);
+	for (i = 0; i < ow->npages; i++) {
+		set_page_writeback(ow->pages[i]);
+		ow->bv[i].bv_page = ow->pages[i];
+		ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE,
+		    ow->off + ow->len) -
+		    max(ow->off, page_offset(ow->pages[i]));
+		if (i == 0)
+			ow->bv[i].bv_offset = ow->off -
+			    page_offset(ow->pages[i]);
+		else
+			ow->bv[i].bv_offset = 0;
 	}
+	iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+
+	WARN_ON(ow->off >= len);
+	if (ow->off + ow->len > len)
+		ow->len = len - ow->off;
+
+	off = ow->off;
+	wr.uid = ow->uid;
+	wr.gid = ow->gid;
+	ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
+	    0, &wr, NULL);
+	if (ret < 0) {
+		for (i = 0; i < ow->npages; i++) {
+			SetPageError(ow->pages[i]);
+			mapping_set_error(ow->pages[i]->mapping, ret);
+			if (PagePrivate(ow->pages[i])) {
+				wrp = (struct orangefs_write_range *)
+				    page_private(ow->pages[i]);
+				ClearPagePrivate(ow->pages[i]);
+				put_page(ow->pages[i]);
+				kfree(wrp);
+			}
+			end_page_writeback(ow->pages[i]);
+			unlock_page(ow->pages[i]);
+		}
+	} else {
+		ret = 0;
+		for (i = 0; i < ow->npages; i++) {
+			if (PagePrivate(ow->pages[i])) {
+				wrp = (struct orangefs_write_range *)
+				    page_private(ow->pages[i]);
+				ClearPagePrivate(ow->pages[i]);
+				put_page(ow->pages[i]);
+				kfree(wrp);
+			}
+			end_page_writeback(ow->pages[i]);
+			unlock_page(ow->pages[i]);
+		}
+	}
+	return ret;
+}
+
+static int orangefs_writepages_callback(struct page *page,
+    struct writeback_control *wbc, void *data)
+{
+	struct orangefs_writepages *ow = data;
+	struct orangefs_write_range *wr;
+	int ret;
+
+	if (!PagePrivate(page)) {
+		unlock_page(page);
+		/* It's not private so there's nothing to write, right? */
+		printk("writepages_callback not private!\n");
+		BUG();
+		return 0;
+	}
+	wr = (struct orangefs_write_range *)page_private(page);
+
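+	/*
+	 * ret stays -1 when this page cannot join the current batch; the
+	 * batch is then flushed and the page is written out on its own.
+	 */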
+	ret = -1;
+	if (ow->npages == 0) {
+		ow->off = wr->pos;
+		ow->len = wr->len;
+		ow->uid = wr->uid;
+		ow->gid = wr->gid;
+		ow->pages[ow->npages++] = page;
+		ret = 0;
+		goto done;
+	}
+	if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
+		orangefs_writepages_work(ow, wbc);
+		ow->npages = 0;
+		ret = -1;
+		goto done;
+	}
+	if (ow->off + ow->len == wr->pos) {
+		ow->len += wr->len;
+		ow->pages[ow->npages++] = page;
+		ret = 0;
+		goto done;
+	}
+done:
+	if (ret == -1) {
+		if (ow->npages) {
+			orangefs_writepages_work(ow, wbc);
+			ow->npages = 0;
+		}
+		ret = orangefs_writepage_locked(page, wbc);
+		mapping_set_error(page->mapping, ret);
+		unlock_page(page);
+		end_page_writeback(page);
+	} else {
+		if (ow->npages == ow->maxpages) {
+			orangefs_writepages_work(ow, wbc);
+			ow->npages = 0;
+		}
+	}
+	return ret;
+}
+
+static int orangefs_writepages(struct address_space *mapping,
+    struct writeback_control *wbc)
+{
+	struct orangefs_writepages *ow;
+	struct blk_plug plug;
+	int ret;
+	ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
+	if (!ow)
+		return -ENOMEM;
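+	/* Batch at most one shared memory buffer's worth of pages per op. */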
+	ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
+	ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
+	if (!ow->pages) {
+		kfree(ow);
+		return -ENOMEM;
+	}
+	ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
+	if (!ow->bv) {
+		kfree(ow->pages);
+		kfree(ow);
+		return -ENOMEM;
+	}
+	blk_start_plug(&plug);
+	ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
+	if (ow->npages)
+		ret = orangefs_writepages_work(ow, wbc);
+	blk_finish_plug(&plug);
+	kfree(ow->pages);
+	kfree(ow->bv);
+	kfree(ow);
+	return ret;
+}
+
+static int orangefs_launder_page(struct page *);
+
+static int orangefs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct iov_iter iter;
+	struct bio_vec bv;
+	ssize_t ret;
+	loff_t off; /* offset into this page */
+	pgoff_t index; /* which page */
+	struct page *next_page;
+	char *kaddr;
+	struct orangefs_read_options *ro = file->private_data;
+	loff_t read_size;
+	loff_t roundedup;
+	int buffer_index = -1; /* orangefs shared memory slot */
+	int slot_index;   /* index into slot */
+	int remaining;
+
+	/*
+	 * If they set some minuscule size for "count" in read(2)
+	 * (for example) then let's try to read a page, or the whole file
+	 * if it is smaller than a page. Once "count" goes over a page,
+	 * round it up to the next page size multiple (it is capped at the
+	 * file size below), do that much orangefs IO and try to fill as
+	 * many pages as we can from it.
+	 *
+	 * "count" should be represented in ro->blksiz.
+	 *
+	 * inode->i_size = file size.
+	 */
+	if (ro) {
+		if (ro->blksiz < PAGE_SIZE) {
+			if (inode->i_size < PAGE_SIZE)
+				read_size = inode->i_size;
+			else
+				read_size = PAGE_SIZE;
+		} else {
+			roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ?
+				((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE - 1)) :
+				ro->blksiz;
+			if (roundedup > inode->i_size)
+				read_size = inode->i_size;
+			else
+				read_size = roundedup;
+
+		}
+	} else {
+		read_size = PAGE_SIZE;
+	}
+	if (!read_size)
+		read_size = PAGE_SIZE;
+
+	if (PageDirty(page))
+		orangefs_launder_page(page);
+
+	off = page_offset(page);
+	index = off >> PAGE_SHIFT;
+	bv.bv_page = page;
+	bv.bv_len = PAGE_SIZE;
+	bv.bv_offset = 0;
+	iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
+
+	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
+	    read_size, inode->i_size, NULL, &buffer_index);
+	remaining = ret;
 	/* this will only zero remaining unread portions of the page data */
-	iov_iter_zero(~0U, &to);
+	iov_iter_zero(~0U, &iter);
 	/* takes care of potential aliasing */
 	flush_dcache_page(page);
-	if (bytes_read < 0) {
-		ret = bytes_read;
+	if (ret < 0) {
 		SetPageError(page);
+		unlock_page(page);
+		goto out;
 	} else {
 		SetPageUptodate(page);
 		if (PageError(page))
@@ -56,96 +329,469 @@
 	}
 	/* unlock the page after the ->readpage() routine completes */
 	unlock_page(page);
+
+	if (remaining > PAGE_SIZE) {
+		slot_index = 0;
+		while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
+			remaining -= PAGE_SIZE;
+			/*
+			 * It is an optimization to try and fill more than one
+			 * page... by now we've already gotten the single
+			 * page we were after; if stuff doesn't seem to
+			 * be going our way at this point, just return
+			 * and hope for the best.
+			 *
+			 * If we look for pages and they're already there is
+			 * one reason to give up, and if they're not there
+			 * and we can't create them is another reason.
+			 */
+
+			index++;
+			slot_index++;
+			next_page = find_get_page(inode->i_mapping, index);
+			if (next_page) {
+				gossip_debug(GOSSIP_FILE_DEBUG,
+					"%s: found next page, quitting\n",
+					__func__);
+				put_page(next_page);
+				goto out;
+			}
+			next_page = find_or_create_page(inode->i_mapping,
+							index,
+							GFP_KERNEL);
+			/*
+			 * I've never hit this; leave it as a printk for
+			 * now so it will be obvious.
+			 */
+			if (!next_page) {
+				printk("%s: can't create next page, quitting\n",
+					__func__);
+				goto out;
+			}
+			kaddr = kmap_atomic(next_page);
+			orangefs_bufmap_page_fill(kaddr,
+						buffer_index,
+						slot_index);
+			kunmap_atomic(kaddr);
+			SetPageUptodate(next_page);
+			unlock_page(next_page);
+			put_page(next_page);
+		}
+	}
+
+out:
+	if (buffer_index != -1)
+		orangefs_bufmap_put(buffer_index);
 	return ret;
 }
 
-static int orangefs_readpage(struct file *file, struct page *page)
+static int orangefs_write_begin(struct file *file,
+    struct address_space *mapping,
+    loff_t pos, unsigned len, unsigned flags, struct page **pagep,
+    void **fsdata)
 {
-	return read_one_page(page);
-}
-
-static int orangefs_readpages(struct file *file,
-			   struct address_space *mapping,
-			   struct list_head *pages,
-			   unsigned nr_pages)
-{
-	int page_idx;
+	struct orangefs_write_range *wr;
+	struct page *page;
+	pgoff_t index;
 	int ret;
 
-	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
+	index = pos >> PAGE_SHIFT;
 
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page;
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
 
-		page = list_entry(pages->prev, struct page, lru);
-		list_del(&page->lru);
-		if (!add_to_page_cache(page,
-				       mapping,
-				       page->index,
-				       readahead_gfp_mask(mapping))) {
-			ret = read_one_page(page);
-			gossip_debug(GOSSIP_INODE_DEBUG,
-				"failure adding page to cache, read_one_page returned: %d\n",
-				ret);
-	      } else {
-			put_page(page);
-	      }
+	*pagep = page;
+
+	if (PageDirty(page) && !PagePrivate(page)) {
+		/*
+		 * Should be impossible.  If it happens, launder the page
+		 * since we don't know what's dirty.  This will WARN in
+		 * orangefs_writepage_locked.
+		 */
+		ret = orangefs_launder_page(page);
+		if (ret)
+			return ret;
 	}
-	BUG_ON(!list_empty(pages));
+	if (PagePrivate(page)) {
+		struct orangefs_write_range *wr;
+		wr = (struct orangefs_write_range *)page_private(page);
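+		/*
+		 * Extend the pending range if this write continues it and
+		 * comes from the same fsuid/fsgid; otherwise launder the
+		 * page so the old range goes out under the credentials
+		 * that dirtied it.
+		 */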
+		if (wr->pos + wr->len == pos &&
+		    uid_eq(wr->uid, current_fsuid()) &&
+		    gid_eq(wr->gid, current_fsgid())) {
+			wr->len += len;
+			goto okay;
+		} else {
+			ret = orangefs_launder_page(page);
+			if (ret)
+				return ret;
+		}
+	}
+
+	wr = kmalloc(sizeof *wr, GFP_KERNEL);
+	if (!wr)
+		return -ENOMEM;
+
+	wr->pos = pos;
+	wr->len = len;
+	wr->uid = current_fsuid();
+	wr->gid = current_fsgid();
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long)wr);
+	get_page(page);
+okay:
 	return 0;
 }
 
+static int orangefs_write_end(struct file *file, struct address_space *mapping,
+    loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t last_pos = pos + copied;
+
+	/*
+	 * No need to use i_size_read() here; the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size)
+		i_size_write(inode, last_pos);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (!PageUptodate(page)) {
+		unsigned from = pos & (PAGE_SIZE - 1);
+		if (copied < len) {
+			zero_user(page, from + copied, len - copied);
+		}
+		/* Set fully written pages uptodate. */
+		if (pos == page_offset(page) &&
+		    (len == PAGE_SIZE || pos + len == inode->i_size)) {
+			zero_user_segment(page, from + copied, PAGE_SIZE);
+			SetPageUptodate(page);
+		}
+	}
+
+	set_page_dirty(page);
+	unlock_page(page);
+	put_page(page);
+
+	mark_inode_dirty_sync(file_inode(file));
+	return copied;
+}
+
 static void orangefs_invalidatepage(struct page *page,
 				 unsigned int offset,
 				 unsigned int length)
 {
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		     "orangefs_invalidatepage called on page %p "
-		     "(offset is %u)\n",
-		     page,
-		     offset);
+	struct orangefs_write_range *wr;
+	wr = (struct orangefs_write_range *)page_private(page);
 
-	ClearPageUptodate(page);
-	ClearPageMappedToDisk(page);
-	return;
+	if (offset == 0 && length == PAGE_SIZE) {
+		kfree((struct orangefs_write_range *)page_private(page));
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		put_page(page);
+		return;
+	/* write range entirely within invalidate range (or equal) */
+	} else if (page_offset(page) + offset <= wr->pos &&
+	    wr->pos + wr->len <= page_offset(page) + offset + length) {
+		kfree((struct orangefs_write_range *)page_private(page));
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		put_page(page);
+		/* XXX is this right? only caller in fs */
+		cancel_dirty_page(page);
+		return;
+	/* invalidate range chops off end of write range */
+	} else if (wr->pos < page_offset(page) + offset &&
+	    wr->pos + wr->len <= page_offset(page) + offset + length &&
+	     page_offset(page) + offset < wr->pos + wr->len) {
+		size_t x;
+		x = wr->pos + wr->len - (page_offset(page) + offset);
+		WARN_ON(x > wr->len);
+		wr->len -= x;
+		wr->uid = current_fsuid();
+		wr->gid = current_fsgid();
+	/* invalidate range chops off beginning of write range */
+	} else if (page_offset(page) + offset <= wr->pos &&
+	    page_offset(page) + offset + length < wr->pos + wr->len &&
+	    wr->pos < page_offset(page) + offset + length) {
+		size_t x;
+		x = page_offset(page) + offset + length - wr->pos;
+		WARN_ON(x > wr->len);
+		wr->pos += x;
+		wr->len -= x;
+		wr->uid = current_fsuid();
+		wr->gid = current_fsgid();
+	/* invalidate range entirely within write range (punch hole) */
+	} else if (wr->pos < page_offset(page) + offset &&
+	    page_offset(page) + offset + length < wr->pos + wr->len) {
+		/* XXX what do we do here... should not WARN_ON */
+		WARN_ON(1);
+		/* punch hole */
+		/*
+		 * should we just ignore this and write it out anyway?
+		 * it hardly makes sense
+		 */
+		return;
+	/* non-overlapping ranges */
+	} else {
+		/* WARN if they do overlap */
+		if (!((page_offset(page) + offset + length <= wr->pos) ^
+		    (wr->pos + wr->len <= page_offset(page) + offset))) {
+			WARN_ON(1);
+			printk("invalidate range offset %llu length %u\n",
+			    page_offset(page) + offset, length);
+			printk("write range offset %llu length %zu\n",
+			    wr->pos, wr->len);
+		}
+		return;
+	}
 
+	/*
+	 * Every branch above either returns (after freeing wr or after a
+	 * WARN) or falls through having modified wr, so the launder below
+	 * only runs when wr was trimmed.
+	 */
+
+	orangefs_launder_page(page);
 }
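The four overlap cases above are pure interval arithmetic. This standalone sketch reproduces the same case split with plain integers; struct wrange and the action names are hypothetical, used only to make the branches testable.

#include <assert.h>
#include <stdio.h>

/* A tracked write range [pos, pos + len) within one page. */
struct wrange { long long pos, len; };

enum action { DROP, TRIM_TAIL, TRIM_HEAD, FLUSH_HOLE, IGNORE };

/* Same case split as orangefs_invalidatepage for invalidate range
 * [io, io + il) against write range *wr. */
static enum action classify(struct wrange *wr, long long io, long long il)
{
	if (io <= wr->pos && wr->pos + wr->len <= io + il)
		return DROP;			/* write range fully covered */
	if (wr->pos < io && wr->pos + wr->len <= io + il &&
	    io < wr->pos + wr->len) {
		wr->len = io - wr->pos;		/* chop off end */
		return TRIM_TAIL;
	}
	if (io <= wr->pos && io + il < wr->pos + wr->len &&
	    wr->pos < io + il) {
		wr->len -= io + il - wr->pos;	/* chop off beginning */
		wr->pos = io + il;
		return TRIM_HEAD;
	}
	if (wr->pos < io && io + il < wr->pos + wr->len)
		return FLUSH_HOLE;		/* punch hole */
	return IGNORE;				/* disjoint ranges */
}

int main(void)
{
	struct wrange wr = { 100, 100 };	/* [100, 200) */

	assert(classify(&wr, 150, 100) == TRIM_TAIL && wr.len == 50);
	wr = (struct wrange){ 100, 100 };
	assert(classify(&wr, 50, 100) == TRIM_HEAD && wr.pos == 150);
	wr = (struct wrange){ 100, 100 };
	assert(classify(&wr, 120, 10) == FLUSH_HOLE);
	assert(classify(&wr, 300, 10) == IGNORE);
	assert(classify(&wr, 0, 4096) == DROP);
	printf("all overlap cases ok\n");
	return 0;
}

The punch-hole case maps to the WARN_ON above: splitting one write range into two would need a second tracking structure, so the kernel code just WARNs and returns.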
 
 static int orangefs_releasepage(struct page *page, gfp_t foo)
 {
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		     "orangefs_releasepage called on page %p\n",
-		     page);
-	return 0;
+	return !PagePrivate(page);
 }
 
-/*
- * Having a direct_IO entry point in the address_space_operations
- * struct causes the kernel to allows us to use O_DIRECT on
- * open. Nothing will ever call this thing, but in the future we
- * will need to be able to use O_DIRECT on open in order to support
- * AIO. Modeled after NFS, they do this too.
- */
+static void orangefs_freepage(struct page *page)
+{
+	if (PagePrivate(page)) {
+		kfree((struct orangefs_write_range *)page_private(page));
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+}
+
+static int orangefs_launder_page(struct page *page)
+{
+	int r = 0;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+	};
+	wait_on_page_writeback(page);
+	if (clear_page_dirty_for_io(page)) {
+		r = orangefs_writepage_locked(page, &wbc);
+		end_page_writeback(page);
+	}
+	return r;
+}
 
 static ssize_t orangefs_direct_IO(struct kiocb *iocb,
 				  struct iov_iter *iter)
 {
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		     "orangefs_direct_IO: %pD\n",
-		     iocb->ki_filp);
+	/*
+	 * Comment from original do_readv_writev:
+	 * Common entry point for read/write/readv/writev
+	 * This function will dispatch it to either the direct I/O
+	 * or buffered I/O path depending on the mount options and/or
+	 * augmented/extended metadata attached to the file.
+	 * Note: File extended attributes override any mount options.
+	 */
+	struct file *file = iocb->ki_filp;
+	loff_t pos = iocb->ki_pos;
+	enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ?
+	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ;
+	loff_t *offset = &pos;
+	struct inode *inode = file->f_mapping->host;
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
+	size_t count = iov_iter_count(iter);
+	ssize_t total_count = 0;
+	ssize_t ret = -EINVAL;
+	int i = 0;
 
-	return -EINVAL;
+	gossip_debug(GOSSIP_FILE_DEBUG,
+		"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
+		__func__,
+		handle,
+		(int)count);
+
+	if (type == ORANGEFS_IO_WRITE) {
+		gossip_debug(GOSSIP_FILE_DEBUG,
+			     "%s(%pU): proceeding with offset : %llu, "
+			     "size %d\n",
+			     __func__,
+			     handle,
+			     llu(*offset),
+			     (int)count);
+	}
+
+	if (count == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	while (iov_iter_count(iter)) {
+		size_t each_count = iov_iter_count(iter);
+		size_t amt_complete;
+		i++;
+
+		/* how much to transfer in this loop iteration */
+		if (each_count > orangefs_bufmap_size_query())
+			each_count = orangefs_bufmap_size_query();
+
+		gossip_debug(GOSSIP_FILE_DEBUG,
+			     "%s(%pU): size of each_count(%d)\n",
+			     __func__,
+			     handle,
+			     (int)each_count);
+		gossip_debug(GOSSIP_FILE_DEBUG,
+			     "%s(%pU): BEFORE wait_for_io: offset is %d\n",
+			     __func__,
+			     handle,
+			     (int)*offset);
+
+		ret = wait_for_direct_io(type, inode, offset, iter,
+				each_count, 0, NULL, NULL);
+		gossip_debug(GOSSIP_FILE_DEBUG,
+			     "%s(%pU): return from wait_for_io:%d\n",
+			     __func__,
+			     handle,
+			     (int)ret);
+
+		if (ret < 0)
+			goto out;
+
+		*offset += ret;
+		total_count += ret;
+		amt_complete = ret;
+
+		gossip_debug(GOSSIP_FILE_DEBUG,
+			     "%s(%pU): AFTER wait_for_io: offset is %d\n",
+			     __func__,
+			     handle,
+			     (int)*offset);
+
+		/*
+		 * if we got a short I/O operation,
+		 * fall out and return what we got so far
+		 */
+		if (amt_complete < each_count)
+			break;
+	} /* end while */
+
+out:
+	if (total_count > 0)
+		ret = total_count;
+	if (ret > 0) {
+		if (type == ORANGEFS_IO_READ) {
+			file_accessed(file);
+		} else {
+			file_update_time(file);
+			if (*offset > i_size_read(inode))
+				i_size_write(inode, *offset);
+		}
+	}
+
+	gossip_debug(GOSSIP_FILE_DEBUG,
+		     "%s(%pU): Value(%d) returned.\n",
+		     __func__,
+		     handle,
+		     (int)ret);
+
+	return ret;
 }
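A self-contained model of the chunking loop above: split the request into slot-sized pieces, accumulate, and stop early on a short transfer. xfer() is a made-up stand-in for wait_for_direct_io, and SLOT_SIZE for orangefs_bufmap_size_query().

#include <stdio.h>

#define SLOT_SIZE 4	/* stand-in for the shared-buffer slot size */

/* Hypothetical transport: moves up to n bytes, short after a limit. */
static long xfer(long n, long short_after)
{
	return short_after >= 0 && n > short_after ? short_after : n;
}

/* Same shape as the direct_IO loop: chunk, accumulate, bail early. */
static long chunked_io(long count, long short_after)
{
	long total = 0, each, ret;

	while (count - total > 0) {
		each = count - total;
		if (each > SLOT_SIZE)
			each = SLOT_SIZE;
		ret = xfer(each, short_after);
		if (ret < 0)
			return ret;
		total += ret;
		if (ret < each)		/* short I/O: return what we have */
			break;
	}
	return total;
}

int main(void)
{
	printf("full: %ld\n", chunked_io(10, -1));	/* prints 10 */
	printf("short: %ld\n", chunked_io(10, 2));	/* prints 2 */
	return 0;
}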
 
 /** ORANGEFS2 implementation of address space operations */
 static const struct address_space_operations orangefs_address_operations = {
+	.writepage = orangefs_writepage,
 	.readpage = orangefs_readpage,
-	.readpages = orangefs_readpages,
+	.writepages = orangefs_writepages,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.write_begin = orangefs_write_begin,
+	.write_end = orangefs_write_end,
 	.invalidatepage = orangefs_invalidatepage,
 	.releasepage = orangefs_releasepage,
+	.freepage = orangefs_freepage,
+	.launder_page = orangefs_launder_page,
 	.direct_IO = orangefs_direct_IO,
 };
 
+vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	unsigned long *bitlock = &orangefs_inode->bitlock;
+	vm_fault_t ret;
+	struct orangefs_write_range *wr;
+
+	sb_start_pagefault(inode->i_sb);
+
+	if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
+		ret = VM_FAULT_RETRY;
+		goto out;
+	}
+
+	lock_page(page);
+	if (PageDirty(page) && !PagePrivate(page)) {
+		/*
+		 * Should be impossible.  If it happens, launder the page
+		 * since we don't know what's dirty.  This will WARN in
+		 * orangefs_writepage_locked.
+		 */
+		if (orangefs_launder_page(page)) {
+			ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
+			goto out;
+		}
+	}
+	if (PagePrivate(page)) {
+		wr = (struct orangefs_write_range *)page_private(page);
+		if (uid_eq(wr->uid, current_fsuid()) &&
+		    gid_eq(wr->gid, current_fsgid())) {
+			wr->pos = page_offset(page);
+			wr->len = PAGE_SIZE;
+			goto okay;
+		} else {
+			if (orangefs_launder_page(page)) {
+				ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
+				goto out;
+			}
+		}
+	}
+	wr = kmalloc(sizeof *wr, GFP_KERNEL);
+	if (!wr) {
+		ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
+		goto out;
+	}
+	wr->pos = page_offset(page);
+	wr->len = PAGE_SIZE;
+	wr->uid = current_fsuid();
+	wr->gid = current_fsgid();
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long)wr);
+	get_page(page);
+okay:
+
+	file_update_time(vmf->vma->vm_file);
+	if (page->mapping != inode->i_mapping) {
+		unlock_page(page);
+		ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	/*
+	 * We mark the page dirty already here so that when freeze is in
+	 * progress, we are guaranteed that writeback during freezing will
+	 * see the dirty page and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	wait_for_stable_page(page);
+	ret = VM_FAULT_LOCKED;
+out:
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
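The fault path above hinges on one policy: the faulting writer may take over the page's pending write range only when it matches the recorded owner, otherwise the page is laundered first. A tiny sketch of that check; struct range_owner is an illustrative stand-in for the uid/gid fields of struct orangefs_write_range.

#include <stdbool.h>
#include <stdio.h>

struct range_owner { unsigned int uid, gid; };

/*
 * Reuse the pending range only when the current writer matches the
 * owner recorded at write_begin time; otherwise the caller must
 * write the page out before tracking the new write.
 */
static bool can_extend(const struct range_owner *wr,
		       unsigned int fsuid, unsigned int fsgid)
{
	return wr->uid == fsuid && wr->gid == fsgid;
}

int main(void)
{
	struct range_owner wr = { 1000, 1000 };

	printf("same owner: %d\n", can_extend(&wr, 1000, 1000));	/* 1 */
	printf("other owner: %d\n", can_extend(&wr, 1001, 1000));	/* 0 */
	return 0;
}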
+
 static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
@@ -162,7 +808,7 @@
 		     iattr->ia_size);
 
 	/* Ensure that we have a up to date size, so we know if it changed. */
-	ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
+	ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE);
 	if (ret == -ESTALE)
 		ret = -EIO;
 	if (ret) {
@@ -172,7 +818,11 @@
 	}
 	orig_size = i_size_read(inode);
 
-	truncate_setsize(inode, iattr->ia_size);
+	/* This is truncate_setsize in a different order. */
+	truncate_pagecache(inode, iattr->ia_size);
+	i_size_write(inode, iattr->ia_size);
+	if (iattr->ia_size > orig_size)
+		pagecache_isize_extended(inode, orig_size, iattr->ia_size);
 
 	new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
 	if (!new_op)
@@ -202,22 +852,33 @@
 	return ret;
 }
 
-/*
- * Change attributes of an object referenced by dentry.
- */
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
 {
-	int ret = -EINVAL;
-	struct inode *inode = dentry->d_inode;
+	int ret;
 
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		"%s: called on %pd\n",
-		__func__,
-		dentry);
-
-	ret = setattr_prepare(dentry, iattr);
-	if (ret)
-		goto out;
+	if (iattr->ia_valid & ATTR_MODE) {
+		if (iattr->ia_mode & (S_ISVTX)) {
+			if (is_root_handle(inode)) {
+				/*
+				 * allow sticky bit to be set on root (since
+				 * it shows up that way by default anyhow),
+				 * but don't show it to the server
+				 */
+				iattr->ia_mode -= S_ISVTX;
+			} else {
+				gossip_debug(GOSSIP_UTILS_DEBUG,
+					     "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+		if (iattr->ia_mode & (S_ISUID)) {
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
 
 	if (iattr->ia_valid & ATTR_SIZE) {
 		ret = orangefs_setattr_size(inode, iattr);
@@ -225,21 +886,51 @@
 			goto out;
 	}
 
+again:
+	spin_lock(&inode->i_lock);
+	if (ORANGEFS_I(inode)->attr_valid) {
+		if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) &&
+		    gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) {
+			ORANGEFS_I(inode)->attr_valid |= iattr->ia_valid;
+		} else {
+			spin_unlock(&inode->i_lock);
+			write_inode_now(inode, 1);
+			goto again;
+		}
+	} else {
+		ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
+		ORANGEFS_I(inode)->attr_uid = current_fsuid();
+		ORANGEFS_I(inode)->attr_gid = current_fsgid();
+	}
 	setattr_copy(inode, iattr);
+	spin_unlock(&inode->i_lock);
 	mark_inode_dirty(inode);
 
-	ret = orangefs_inode_setattr(inode, iattr);
-	gossip_debug(GOSSIP_INODE_DEBUG,
-		"%s: orangefs_inode_setattr returned %d\n",
-		__func__,
-		ret);
-
-	if (!ret && (iattr->ia_valid & ATTR_MODE))
+	if (iattr->ia_valid & ATTR_MODE)
 		/* change mod on a file that has ACLs */
 		ret = posix_acl_chmod(inode, inode->i_mode);
 
+	ret = 0;
 out:
-	gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ */
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int ret;
+	gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
+	    dentry);
+	ret = setattr_prepare(dentry, iattr);
+	if (ret)
+		goto out;
+	ret = __orangefs_setattr(d_inode(dentry), iattr);
+	sync_inode_metadata(d_inode(dentry), 1);
+out:
+	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
+	    ret);
 	return ret;
 }
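The again: loop in __orangefs_setattr coalesces pending attribute bits per owner and flushes when another user's changes are queued. A userspace sketch of that shape, with flush() standing in for write_inode_now; all names are illustrative.

#include <stdio.h>

/* Pending attribute state, modeled on the orangefs_inode_s fields. */
struct pending { int valid; unsigned int uid, gid; };

static void flush(struct pending *p)	/* stand-in for write_inode_now */
{
	p->valid = 0;
}

/* Coalesce with pending attributes of the same owner, else flush. */
static void record_setattr(struct pending *p, int valid,
			   unsigned int uid, unsigned int gid)
{
again:
	if (p->valid) {
		if (p->uid == uid && p->gid == gid) {
			p->valid |= valid;
			return;
		}
		flush(p);
		goto again;
	}
	p->valid = valid;
	p->uid = uid;
	p->gid = gid;
}

int main(void)
{
	struct pending p = { 0, 0, 0 };

	record_setattr(&p, 1, 1000, 1000);
	record_setattr(&p, 2, 1000, 1000);	/* coalesces to 3 */
	printf("same owner: valid %d\n", p.valid);
	record_setattr(&p, 4, 1001, 1000);	/* flushes, then records */
	printf("other owner: valid %d uid %u\n", p.valid, p.uid);
	return 0;
}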
 
@@ -249,23 +940,21 @@
 int orangefs_getattr(const struct path *path, struct kstat *stat,
 		     u32 request_mask, unsigned int flags)
 {
-	int ret = -ENOENT;
+	int ret;
 	struct inode *inode = path->dentry->d_inode;
 
 	gossip_debug(GOSSIP_INODE_DEBUG,
-		     "orangefs_getattr: called on %pd\n",
-		     path->dentry);
+		     "orangefs_getattr: called on %pd mask %u\n",
+		     path->dentry, request_mask);
 
-	ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
+	ret = orangefs_inode_getattr(inode,
+	    request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
 	if (ret == 0) {
 		generic_fillattr(inode, stat);
 
 		/* override block size reported to stat */
-		if (request_mask & STATX_SIZE)
-			stat->result_mask = STATX_BASIC_STATS;
-		else
-			stat->result_mask = STATX_BASIC_STATS &
-			    ~STATX_SIZE;
+		if (!(request_mask & STATX_SIZE))
+			stat->result_mask &= ~STATX_SIZE;
 
 		stat->attributes_mask = STATX_ATTR_IMMUTABLE |
 		    STATX_ATTR_APPEND;
@@ -287,7 +976,7 @@
 	gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
 
 	/* Make sure the permission (and other common attrs) are up to date. */
-	ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
+	ret = orangefs_inode_getattr(inode, 0);
 	if (ret < 0)
 		return ret;
 
@@ -307,7 +996,7 @@
 		iattr.ia_valid |= ATTR_CTIME;
 	if (flags & S_MTIME)
 		iattr.ia_valid |= ATTR_MTIME;
-	return orangefs_inode_setattr(inode, &iattr);
+	return __orangefs_setattr(inode, &iattr);
 }
 
 /* ORANGEFS2 implementation of VFS inode operations for files */
@@ -367,6 +1056,10 @@
 	struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
 	ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
 	ORANGEFS_I(inode)->refn.khandle = ref->khandle;
+	ORANGEFS_I(inode)->attr_valid = 0;
+	hash_init(ORANGEFS_I(inode)->xattr_cache);
+	ORANGEFS_I(inode)->mapping_time = jiffies - 1;
+	ORANGEFS_I(inode)->bitlock = 0;
 	return 0;
 }
 
@@ -405,10 +1098,14 @@
 			orangefs_test_inode,
 			orangefs_set_inode,
 			ref);
-	if (!inode || !(inode->i_state & I_NEW))
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
+	error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
 	if (error) {
 		iget_failed(inode);
 		return ERR_PTR(error);
@@ -448,22 +1145,16 @@
 
 	inode = new_inode(sb);
 	if (!inode)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	orangefs_set_inode(inode, ref);
 	inode->i_ino = hash;	/* needed for stat etc */
 
-	error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
+	error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
 	if (error)
 		goto out_iput;
 
 	orangefs_init_iops(inode);
-
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
-	inode->i_size = PAGE_SIZE;
 	inode->i_rdev = dev;
 
 	error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 625b058..3e7cf3d 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -58,7 +58,6 @@
 		goto out;
 
 	ref = new_op->downcall.resp.create.refn;
-	op_release(new_op);
 
 	inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref);
 	if (IS_ERR(inode)) {
@@ -77,21 +76,19 @@
 
 	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
-	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "%s: dentry instantiated for %pd\n",
 		     __func__,
 		     dentry);
 
-	dir->i_mtime = dir->i_ctime = current_time(dir);
 	memset(&iattr, 0, sizeof iattr);
-	iattr.ia_valid |= ATTR_MTIME;
-	orangefs_inode_setattr(dir, &iattr);
-	mark_inode_dirty_sync(dir);
+	iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+	__orangefs_setattr(dir, &iattr);
 	ret = 0;
 out:
+	op_release(new_op);
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "%s: %pd: returning %d\n",
 		     __func__,
@@ -157,7 +154,7 @@
 		     new_op->downcall.resp.lookup.refn.fs_id,
 		     ret);
 
-	if (ret >= 0) {
+	if (ret == 0) {
 		orangefs_set_timeout(dentry);
 		inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
 	} else if (ret == -ENOENT) {
@@ -210,11 +207,10 @@
 	if (!ret) {
 		drop_nlink(inode);
 
-		dir->i_mtime = dir->i_ctime = current_time(dir);
 		memset(&iattr, 0, sizeof iattr);
-		iattr.ia_valid |= ATTR_MTIME;
-		orangefs_inode_setattr(dir, &iattr);
-		mark_inode_dirty_sync(dir);
+		iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+		iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+		__orangefs_setattr(dir, &iattr);
 	}
 	return ret;
 }
@@ -228,7 +224,7 @@
 	struct orangefs_object_kref ref;
 	struct inode *inode;
 	struct iattr iattr;
-	int mode = 755;
+	int mode = 0755;
 	int ret;
 
 	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
@@ -269,7 +265,6 @@
 	}
 
 	ref = new_op->downcall.resp.sym.refn;
-	op_release(new_op);
 
 	inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref);
 	if (IS_ERR(inode)) {
@@ -292,21 +287,19 @@
 
 	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
-	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "Inode (Symlink) %pU -> %pd\n",
 		     get_khandle_from_ino(inode),
 		     dentry);
 
-	dir->i_mtime = dir->i_ctime = current_time(dir);
 	memset(&iattr, 0, sizeof iattr);
-	iattr.ia_valid |= ATTR_MTIME;
-	orangefs_inode_setattr(dir, &iattr);
-	mark_inode_dirty_sync(dir);
+	iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+	__orangefs_setattr(dir, &iattr);
 	ret = 0;
 out:
+	op_release(new_op);
 	return ret;
 }
 
@@ -346,7 +339,6 @@
 	}
 
 	ref = new_op->downcall.resp.mkdir.refn;
-	op_release(new_op);
 
 	inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref);
 	if (IS_ERR(inode)) {
@@ -361,8 +353,6 @@
 
 	d_instantiate_new(dentry, inode);
 	orangefs_set_timeout(dentry);
-	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "Inode (Directory) %pU -> %pd\n",
@@ -373,12 +363,12 @@
 	 * NOTE: we have no good way to keep nlink consistent for directories
 	 * across clients; keep constant at 1.
 	 */
-	dir->i_mtime = dir->i_ctime = current_time(dir);
 	memset(&iattr, 0, sizeof iattr);
-	iattr.ia_valid |= ATTR_MTIME;
-	orangefs_inode_setattr(dir, &iattr);
-	mark_inode_dirty_sync(dir);
+	iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+	__orangefs_setattr(dir, &iattr);
 out:
+	op_release(new_op);
 	return ret;
 }
 
@@ -389,6 +379,7 @@
 			unsigned int flags)
 {
 	struct orangefs_kernel_op_s *new_op;
+	struct iattr iattr;
 	int ret;
 
 	if (flags)
@@ -398,7 +389,10 @@
 		     "orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
 		     old_dentry, new_dentry, d_count(new_dentry));
 
-	ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
+	memset(&iattr, 0, sizeof iattr);
+	iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	iattr.ia_mtime = iattr.ia_ctime = current_time(new_dir);
+	__orangefs_setattr(new_dir, &iattr);
 
 	new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
 	if (!new_op)
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index c4e98c9..2bb916d 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -105,7 +105,7 @@
 			left = t;
 		else
 			left = t + (left - n);
-		if (unlikely(signal_pending(current)))
+		if (signal_pending(current))
 			left = -EINTR;
 	} while (left > 0);
 
@@ -269,7 +269,7 @@
 
 	/* map the pages */
 	ret = get_user_pages_fast((unsigned long)user_desc->ptr,
-			     bufmap->page_count, 1, bufmap->page_array);
+			     bufmap->page_count, FOLL_WRITE, bufmap->page_array);
 
 	if (ret < 0)
 		return ret;
@@ -538,3 +538,16 @@
 	}
 	return 0;
 }
+
+void orangefs_bufmap_page_fill(void *page_to,
+				int buffer_index,
+				int slot_index)
+{
+	struct orangefs_bufmap_desc *from;
+	void *page_from;
+
+	from = &__orangefs_bufmap->desc_array[buffer_index];
+	page_from = kmap_atomic(from->page_array[slot_index]);
+	memcpy(page_to, page_from, PAGE_SIZE);
+	kunmap_atomic(page_from);
+}
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
index c2c3c5a..75b2d28 100644
--- a/fs/orangefs/orangefs-bufmap.h
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -34,4 +34,6 @@
 			      int buffer_index,
 			      size_t size);
 
+void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index);
+
 #endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 0732cb0..25543a9 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -64,7 +64,7 @@
 	__u64 mask2;
 };
 
-static int orangefs_kernel_debug_init(void);
+static void orangefs_kernel_debug_init(void);
 
 static int orangefs_debug_help_open(struct inode *, struct file *);
 static void *help_start(struct seq_file *, loff_t *);
@@ -99,7 +99,6 @@
 static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
 static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
 
-static struct dentry *help_file_dentry;
 static struct dentry *client_debug_dentry;
 static struct dentry *debug_dir;
 
@@ -151,10 +150,8 @@
  * initialize kmod debug operations, create orangefs debugfs dir and
  * ORANGEFS_KMOD_DEBUG_HELP_FILE.
  */
-int orangefs_debugfs_init(int debug_mask)
+void orangefs_debugfs_init(int debug_mask)
 {
-	int rc = -ENOMEM;
-
 	/* convert input debug mask to a 64-bit unsigned integer */
         orangefs_gossip_debug_mask = (unsigned long long)debug_mask;
 
@@ -183,37 +180,21 @@
 		(unsigned long long)orangefs_gossip_debug_mask);
 
 	debug_dir = debugfs_create_dir("orangefs", NULL);
-	if (!debug_dir) {
-		pr_info("%s: debugfs_create_dir failed.\n", __func__);
-		goto out;
-	}
 
-	help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
-				  0444,
-				  debug_dir,
-				  debug_help_string,
-				  &debug_help_fops);
-	if (!help_file_dentry) {
-		pr_info("%s: debugfs_create_file failed.\n", __func__);
-		goto out;
-	}
+	debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE, 0444, debug_dir,
+			    debug_help_string, &debug_help_fops);
 
 	orangefs_debug_disabled = 0;
 
-	rc = orangefs_kernel_debug_init();
-
-out:
-
-	return rc;
+	orangefs_kernel_debug_init();
 }
 
 /*
  * initialize the kernel-debug file.
  */
-static int orangefs_kernel_debug_init(void)
+static void orangefs_kernel_debug_init(void)
 {
 	int rc = -ENOMEM;
-	struct dentry *ret;
 	char *k_buffer = NULL;
 
 	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
@@ -230,24 +211,11 @@
 		pr_info("%s: overflow 1!\n", __func__);
 	}
 
-	ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
-				  0444,
-				  debug_dir,
-				  k_buffer,
-				  &kernel_debug_fops);
-	if (!ret) {
-		pr_info("%s: failed to create %s.\n",
-			__func__,
-			ORANGEFS_KMOD_DEBUG_FILE);
-		goto out;
-	}
-
-	rc = 0;
+	debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
+			    &kernel_debug_fops);
 
 out:
-
 	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-	return rc;
 }
 
 
@@ -353,12 +321,6 @@
 						  debug_dir,
 						  c_buffer,
 						  &kernel_debug_fops);
-	if (!client_debug_dentry) {
-		pr_info("%s: failed to create updated %s.\n",
-			__func__,
-			ORANGEFS_CLIENT_DEBUG_FILE);
-		goto out;
-	}
 
 	rc = 0;
 
@@ -963,7 +925,7 @@
 	return ret;
 }
 
-int orangefs_debugfs_new_client_string(void __user *arg) 
+int orangefs_debugfs_new_client_string(void __user *arg)
 {
 	int ret;
 
@@ -1016,7 +978,7 @@
 	return 0;
 }
 
-int orangefs_debugfs_new_debug(void __user *arg) 
+int orangefs_debugfs_new_debug(void __user *arg)
 {
 	struct dev_mask_info_s mask_info = {0};
 	int ret;
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
index 51147f9..502f6de 100644
--- a/fs/orangefs/orangefs-debugfs.h
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-int orangefs_debugfs_init(int);
+void orangefs_debugfs_init(int);
 void orangefs_debugfs_cleanup(void);
 int orangefs_prepare_debugfs_help_string(int);
 int orangefs_debugfs_new_client_mask(void __user *);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 17b24ad..34a6c99 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -51,6 +51,7 @@
 #include <linux/rwsem.h>
 #include <linux/xattr.h>
 #include <linux/exportfs.h>
+#include <linux/hashtable.h>
 
 #include <asm/unaligned.h>
 
@@ -192,7 +193,13 @@
 	sector_t last_failed_block_index_read;
 
 	unsigned long getattr_time;
-	u32 getattr_mask;
+	unsigned long mapping_time;
+	int attr_valid;
+	kuid_t attr_uid;
+	kgid_t attr_gid;
+	unsigned long bitlock;
+
+	DECLARE_HASHTABLE(xattr_cache, 4);
 };
 
 /* per superblock private orangefs info */
@@ -217,10 +224,29 @@
 	unsigned long writes;
 };
 
+struct orangefs_cached_xattr {
+	struct hlist_node node;
+	char key[ORANGEFS_MAX_XATTR_NAMELEN];
+	char val[ORANGEFS_MAX_XATTR_VALUELEN];
+	ssize_t length;
+	unsigned long timeout;
+};
+
+struct orangefs_write_range {
+	loff_t pos;
+	size_t len;
+	kuid_t uid;
+	kgid_t gid;
+};
+
+struct orangefs_read_options {
+	ssize_t blksiz;
+};
+
 extern struct orangefs_stats orangefs_stats;
 
 /*
- * NOTE: See Documentation/filesystems/porting for information
+ * NOTE: See Documentation/filesystems/porting.rst for information
  * on implementing FOO_I and properly accessing fs private data
  */
 static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
@@ -329,13 +355,15 @@
 /*
  * defined in inode.c
  */
+vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
 struct inode *orangefs_new_inode(struct super_block *sb,
 			      struct inode *dir,
 			      int mode,
 			      dev_t dev,
 			      struct orangefs_object_kref *ref);
 
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
+int __orangefs_setattr(struct inode *, struct iattr *);
+int orangefs_setattr(struct dentry *, struct iattr *);
 
 int orangefs_getattr(const struct path *path, struct kstat *stat,
 		     u32 request_mask, unsigned int flags);
@@ -355,11 +383,6 @@
 struct inode *orangefs_iget(struct super_block *sb,
 			 struct orangefs_object_kref *ref);
 
-ssize_t orangefs_inode_read(struct inode *inode,
-			    struct iov_iter *iter,
-			    loff_t *offset,
-			    loff_t readahead_size);
-
 /*
  * defined in devorangefs-req.c
  */
@@ -371,6 +394,15 @@
 bool __is_daemon_in_service(void);
 
 /*
+ * defined in file.c
+ */
+int orangefs_revalidate_mapping(struct inode *);
+ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
+    struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *);
+ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
+    struct iov_iter *);
+
+/*
  * defined in orangefs-utils.c
  */
 __s32 fsid_of_op(struct orangefs_kernel_op_s *op);
@@ -386,12 +418,14 @@
 			 size_t size,
 			 int flags);
 
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
-    u32 request_mask);
+#define ORANGEFS_GETATTR_NEW 1
+#define ORANGEFS_GETATTR_SIZE 2
+
+int orangefs_inode_getattr(struct inode *, int);
 
 int orangefs_inode_check_changed(struct inode *inode);
 
-int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
+int orangefs_inode_setattr(struct inode *inode);
 
 bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
 
@@ -400,6 +434,7 @@
 extern struct mutex orangefs_request_mutex;
 extern int op_timeout_secs;
 extern int slot_timeout_secs;
+extern int orangefs_cache_timeout_msecs;
 extern int orangefs_dcache_timeout_msecs;
 extern int orangefs_getattr_timeout_msecs;
 extern struct list_head orangefs_superblocks;
@@ -426,6 +461,7 @@
 #define ORANGEFS_OP_CANCELLATION  4   /* this is a cancellation */
 #define ORANGEFS_OP_NO_MUTEX      8   /* don't acquire request_mutex */
 #define ORANGEFS_OP_ASYNC         16  /* Queue it, but don't wait */
+#define ORANGEFS_OP_WRITEBACK     32
 
 int service_operation(struct orangefs_kernel_op_s *op,
 		      const char *op_name,
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 85ef872..c010c1f 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * (C) 2001 Clemson University and The University of Chicago
  *
@@ -30,6 +31,7 @@
 __u64 orangefs_gossip_debug_mask;
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int orangefs_cache_timeout_msecs = 50;
 int orangefs_dcache_timeout_msecs = 50;
 int orangefs_getattr_timeout_msecs = 50;
 
@@ -127,9 +129,7 @@
 	if (ret)
 		goto cleanup_key_table;
 
-	ret = orangefs_debugfs_init(module_parm_debug_mask);
-	if (ret)
-		goto debugfs_init_failed;
+	orangefs_debugfs_init(module_parm_debug_mask);
 
 	ret = orangefs_sysfs_init();
 	if (ret)
@@ -159,8 +159,6 @@
 	orangefs_dev_cleanup();
 
 sysfs_init_failed:
-
-debugfs_init_failed:
 	orangefs_debugfs_cleanup();
 
 cleanup_key_table:
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index dd28079..3627ea9 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -62,6 +62,14 @@
  *			Slots are requested and waited for,
  *			the wait times out after slot_timeout_secs.
  *
+ * What:		/sys/fs/orangefs/cache_timeout_msecs
+ * Date:		Mar 2018
+ * Contact:		Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *			Time in milliseconds after which
+ *			orangefs_revalidate_mapping will invalidate the page
+ *			cache.
+ *
  * What:		/sys/fs/orangefs/dcache_timeout_msecs
  * Date:		Jul 2016
  * Contact:		Martin Brandenburg <martin@omnibond.com>
@@ -222,6 +230,13 @@
 				       slot_timeout_secs);
 			goto out;
 		} else if (!strcmp(attr->attr.name,
+				   "cache_timeout_msecs")) {
+			rc = scnprintf(buf,
+				       PAGE_SIZE,
+				       "%d\n",
+				       orangefs_cache_timeout_msecs);
+			goto out;
+		} else if (!strcmp(attr->attr.name,
 				   "dcache_timeout_msecs")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
@@ -277,6 +292,9 @@
 	} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
 		rc = kstrtoint(buf, 0, &slot_timeout_secs);
 		goto out;
+	} else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) {
+		rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs);
+		goto out;
 	} else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
 		rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
 		goto out;
@@ -323,7 +341,7 @@
 	/* Can't do a service_operation if the client is not running... */
 	rc = is_daemon_in_service();
 	if (rc) {
-		pr_info("%s: Client not running :%d:\n",
+		pr_info_ratelimited("%s: Client not running :%d:\n",
 			__func__,
 			is_daemon_in_service());
 		goto out;
@@ -818,6 +836,9 @@
 static struct orangefs_attribute slot_timeout_secs_attribute =
 	__ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
+static struct orangefs_attribute cache_timeout_msecs_attribute =
+	__ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
 static struct orangefs_attribute dcache_timeout_msecs_attribute =
 	__ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
 
@@ -861,6 +882,7 @@
 static struct attribute *orangefs_default_attrs[] = {
 	&op_timeout_secs_attribute.attr,
 	&slot_timeout_secs_attribute.attr,
+	&cache_timeout_msecs_attribute.attr,
 	&dcache_timeout_msecs_attribute.attr,
 	&getattr_timeout_msecs_attribute.attr,
 	&readahead_count_attribute.attr,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 804c8a2..d4b7ae7 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
  *
  * See COPYING in top-level directory.
  */
@@ -135,51 +136,37 @@
  * NOTE: in kernel land, we never use the sys_attr->link_target for
  * anything, so don't bother copying it into the sys_attr object here.
  */
-static inline int copy_attributes_from_inode(struct inode *inode,
-					     struct ORANGEFS_sys_attr_s *attrs,
-					     struct iattr *iattr)
+static inline void copy_attributes_from_inode(struct inode *inode,
+    struct ORANGEFS_sys_attr_s *attrs)
 {
-	umode_t tmp_mode;
-
-	if (!iattr || !inode || !attrs) {
-		gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
-			   "in copy_attributes_from_inode!\n",
-			   iattr,
-			   inode,
-			   attrs);
-		return -EINVAL;
-	}
-	/*
-	 * We need to be careful to only copy the attributes out of the
-	 * iattr object that we know are valid.
-	 */
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	attrs->mask = 0;
-	if (iattr->ia_valid & ATTR_UID) {
-		attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid);
+	if (orangefs_inode->attr_valid & ATTR_UID) {
+		attrs->owner = from_kuid(&init_user_ns, inode->i_uid);
 		attrs->mask |= ORANGEFS_ATTR_SYS_UID;
 		gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
 	}
-	if (iattr->ia_valid & ATTR_GID) {
-		attrs->group = from_kgid(&init_user_ns, iattr->ia_gid);
+	if (orangefs_inode->attr_valid & ATTR_GID) {
+		attrs->group = from_kgid(&init_user_ns, inode->i_gid);
 		attrs->mask |= ORANGEFS_ATTR_SYS_GID;
 		gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
 	}
 
-	if (iattr->ia_valid & ATTR_ATIME) {
+	if (orangefs_inode->attr_valid & ATTR_ATIME) {
 		attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
-		if (iattr->ia_valid & ATTR_ATIME_SET) {
-			attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
+		if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
+			attrs->atime = (time64_t)inode->i_atime.tv_sec;
 			attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
 		}
 	}
-	if (iattr->ia_valid & ATTR_MTIME) {
+	if (orangefs_inode->attr_valid & ATTR_MTIME) {
 		attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
-		if (iattr->ia_valid & ATTR_MTIME_SET) {
-			attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
+		if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
+			attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
 			attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
 		}
 	}
-	if (iattr->ia_valid & ATTR_CTIME)
+	if (orangefs_inode->attr_valid & ATTR_CTIME)
 		attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
 
 	/*
@@ -188,36 +175,10 @@
 	 * worry about ATTR_SIZE
 	 */
 
-	if (iattr->ia_valid & ATTR_MODE) {
-		tmp_mode = iattr->ia_mode;
-		if (tmp_mode & (S_ISVTX)) {
-			if (is_root_handle(inode)) {
-				/*
-				 * allow sticky bit to be set on root (since
-				 * it shows up that way by default anyhow),
-				 * but don't show it to the server
-				 */
-				tmp_mode -= S_ISVTX;
-			} else {
-				gossip_debug(GOSSIP_UTILS_DEBUG,
-					"%s: setting sticky bit not supported.\n",
-					__func__);
-				return -EINVAL;
-			}
-		}
-
-		if (tmp_mode & (S_ISUID)) {
-			gossip_debug(GOSSIP_UTILS_DEBUG,
-				"%s: setting setuid bit not supported.\n",
-				__func__);
-			return -EINVAL;
-		}
-
-		attrs->perms = ORANGEFS_util_translate_mode(tmp_mode);
+	if (orangefs_inode->attr_valid & ATTR_MODE) {
+		attrs->perms = ORANGEFS_util_translate_mode(inode->i_mode);
 		attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
 	}
-
-	return 0;
 }
 
 static int orangefs_inode_type(enum orangefs_ds_type objtype)
@@ -272,27 +233,30 @@
 	return 0;
 }
 
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
-    u32 request_mask)
+int orangefs_inode_getattr(struct inode *inode, int flags)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op;
 	loff_t inode_size;
 	int ret, type;
 
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
-	    get_khandle_from_ino(inode));
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n",
+	    __func__, get_khandle_from_ino(inode), flags);
 
-	if (!new && !bypass) {
-		/*
-		 * Must have all the attributes in the mask and be within cache
-		 * time.
-		 */
-		if ((request_mask & orangefs_inode->getattr_mask) ==
-		    request_mask &&
-		    time_before(jiffies, orangefs_inode->getattr_time))
-			return 0;
+again:
+	spin_lock(&inode->i_lock);
+	/* Must have all the attributes in the mask and be within cache time. */
+	if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
+	    orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+		if (orangefs_inode->attr_valid) {
+			spin_unlock(&inode->i_lock);
+			write_inode_now(inode, 1);
+			goto again;
+		}
+		spin_unlock(&inode->i_lock);
+		return 0;
 	}
+	spin_unlock(&inode->i_lock);
 
 	new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
 	if (!new_op)
@@ -302,7 +266,7 @@
 	 * Size is the hardest attribute to get.  The incremental cost of any
 	 * other attribute is essentially zero.
 	 */
-	if (request_mask & STATX_SIZE || new)
+	if (flags)
 		new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
 	else
 		new_op->upcall.req.getattr.mask =
@@ -313,13 +277,33 @@
 	if (ret != 0)
 		goto out;
 
-	if (!new) {
+again2:
+	spin_lock(&inode->i_lock);
+	/* Must have all the attributes in the mask and be within cache time. */
+	if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
+	    orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+		if (orangefs_inode->attr_valid) {
+			spin_unlock(&inode->i_lock);
+			write_inode_now(inode, 1);
+			goto again2;
+		}
+		if (inode->i_state & I_DIRTY_PAGES) {
+			ret = 0;
+			goto out_unlock;
+		}
+		gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n",
+		    __func__);
+		ret = 0;
+		goto out_unlock;
+	}
+
+	if (!(flags & ORANGEFS_GETATTR_NEW)) {
 		ret = orangefs_inode_is_stale(inode,
 		    &new_op->downcall.resp.getattr.attributes,
 		    new_op->downcall.resp.getattr.link_target);
 		if (ret) {
 			ret = -ESTALE;
-			goto out;
+			goto out_unlock;
 		}
 	}
 
@@ -329,30 +313,26 @@
 	case S_IFREG:
 		inode->i_flags = orangefs_inode_flags(&new_op->
 		    downcall.resp.getattr.attributes);
-		if (request_mask & STATX_SIZE || new) {
+		if (flags) {
 			inode_size = (loff_t)new_op->
 			    downcall.resp.getattr.attributes.size;
 			inode->i_size = inode_size;
 			inode->i_blkbits = ffs(new_op->downcall.resp.getattr.
 			    attributes.blksize);
-			spin_lock(&inode->i_lock);
 			inode->i_bytes = inode_size;
 			inode->i_blocks =
 			    (inode_size + 512 - inode_size % 512)/512;
-			spin_unlock(&inode->i_lock);
 		}
 		break;
 	case S_IFDIR:
-		if (request_mask & STATX_SIZE || new) {
+		if (flags) {
 			inode->i_size = PAGE_SIZE;
-			spin_lock(&inode->i_lock);
 			inode_set_bytes(inode, inode->i_size);
-			spin_unlock(&inode->i_lock);
 		}
 		set_nlink(inode, 1);
 		break;
 	case S_IFLNK:
-		if (new) {
+		if (flags & ORANGEFS_GETATTR_NEW) {
 			inode->i_size = (loff_t)strlen(new_op->
 			    downcall.resp.getattr.link_target);
 			ret = strscpy(orangefs_inode->link_target,
@@ -360,7 +340,7 @@
 			    ORANGEFS_NAME_MAX);
 			if (ret == -E2BIG) {
 				ret = -EIO;
-				goto out;
+				goto out_unlock;
 			}
 			inode->i_link = orangefs_inode->link_target;
 		}
@@ -370,7 +350,7 @@
 		/* XXX: ESTALE?  This is what is done if it is not new. */
 		orangefs_make_bad_inode(inode);
 		ret = -ESTALE;
-		goto out;
+		goto out_unlock;
 	}
 
 	inode->i_uid = make_kuid(&init_user_ns, new_op->
@@ -393,11 +373,9 @@
 
 	orangefs_inode->getattr_time = jiffies +
 	    orangefs_getattr_timeout_msecs*HZ/1000;
-	if (request_mask & STATX_SIZE || new)
-		orangefs_inode->getattr_mask = STATX_BASIC_STATS;
-	else
-		orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
 	ret = 0;
+out_unlock:
+	spin_unlock(&inode->i_lock);
 out:
 	op_release(new_op);
 	return ret;
@@ -436,7 +414,7 @@
  * issues a orangefs setattr request to make sure the new attribute values
  * take effect if successful.  returns 0 on success; -errno otherwise
  */
-int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
+int orangefs_inode_setattr(struct inode *inode)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op;
@@ -446,24 +424,31 @@
 	if (!new_op)
 		return -ENOMEM;
 
+	spin_lock(&inode->i_lock);
+	new_op->upcall.uid = from_kuid(&init_user_ns, orangefs_inode->attr_uid);
+	new_op->upcall.gid = from_kgid(&init_user_ns, orangefs_inode->attr_gid);
 	new_op->upcall.req.setattr.refn = orangefs_inode->refn;
-	ret = copy_attributes_from_inode(inode,
-		       &new_op->upcall.req.setattr.attributes,
-		       iattr);
-	if (ret >= 0) {
-		ret = service_operation(new_op, __func__,
-				get_interruptible_flag(inode));
-
-		gossip_debug(GOSSIP_UTILS_DEBUG,
-			     "orangefs_inode_setattr: returning %d\n",
-			     ret);
+	copy_attributes_from_inode(inode,
+	    &new_op->upcall.req.setattr.attributes);
+	orangefs_inode->attr_valid = 0;
+	if (!new_op->upcall.req.setattr.attributes.mask) {
+		spin_unlock(&inode->i_lock);
+		op_release(new_op);
+		return 0;
 	}
+	spin_unlock(&inode->i_lock);
+
+	ret = service_operation(new_op, __func__,
+	    get_interruptible_flag(inode) | ORANGEFS_OP_WRITEBACK);
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+	    "orangefs_inode_setattr: returning %d\n", ret);
+	if (ret)
+		orangefs_make_bad_inode(inode);
 
 	op_release(new_op);
 
 	if (ret == 0)
 		orangefs_inode->getattr_time = jiffies - 1;
-
 	return ret;
 }
 
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index dfaee90..ee5efdc 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -10,6 +10,7 @@
 #include "orangefs-bufmap.h"
 
 #include <linux/parser.h>
+#include <linux/hashtable.h>
 
 /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
 static struct kmem_cache *orangefs_inode_cache;
@@ -124,10 +125,18 @@
 	return &orangefs_inode->vfs_inode;
 }
 
-static void orangefs_i_callback(struct rcu_head *head)
+static void orangefs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_cached_xattr *cx;
+	struct hlist_node *tmp;
+	int i;
+
+	hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) {
+		hlist_del(&cx->node);
+		kfree(cx);
+	}
+
 	kmem_cache_free(orangefs_inode_cache, orangefs_inode);
 }
 
@@ -138,8 +147,13 @@
 	gossip_debug(GOSSIP_SUPER_DEBUG,
 			"%s: deallocated %p destroying inode %pU\n",
 			__func__, orangefs_inode, get_khandle_from_ino(inode));
+}
 
-	call_rcu(&inode->i_rcu, orangefs_i_callback);
+static int orangefs_write_inode(struct inode *inode,
+				struct writeback_control *wbc)
+{
+	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n");
+	return orangefs_inode_setattr(inode);
 }
 
 /*
@@ -299,7 +313,9 @@
 
 static const struct super_operations orangefs_s_ops = {
 	.alloc_inode = orangefs_alloc_inode,
+	.free_inode = orangefs_free_inode,
 	.destroy_inode = orangefs_destroy_inode,
+	.write_inode = orangefs_write_inode,
 	.drop_inode = generic_delete_inode,
 	.statfs = orangefs_statfs,
 	.remount_fs = orangefs_remount_fs,
@@ -397,15 +413,11 @@
 		struct orangefs_fs_mount_response *fs_mount,
 		void *data, int silent)
 {
-	int ret = -EINVAL;
-	struct inode *root = NULL;
-	struct dentry *root_dentry = NULL;
+	int ret;
+	struct inode *root;
+	struct dentry *root_dentry;
 	struct orangefs_object_kref root_object;
 
-	/* alloc and init our private orangefs sb info */
-	sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
-	if (!ORANGEFS_SB(sb))
-		return -ENOMEM;
 	ORANGEFS_SB(sb)->sb = sb;
 
 	ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
@@ -428,6 +440,10 @@
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
+	ret = super_setup_bdi(sb);
+	if (ret)
+		return ret;
+
 	root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
 	root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
 	gossip_debug(GOSSIP_SUPER_DEBUG,
@@ -506,6 +522,13 @@
 		goto free_op;
 	}
 
+	/* alloc and init our private orangefs sb info */
+	sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+	if (!ORANGEFS_SB(sb)) {
+		d = ERR_PTR(-ENOMEM);
+		goto free_op;
+	}
+
 	ret = orangefs_fill_sb(sb,
 	      &new_op->downcall.resp.fs_mount, data,
 	      flags & SB_SILENT ? 1 : 0);
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 0729d26..beafc33 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -19,7 +19,7 @@
 
 static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
 		long timeout,
-		bool interruptible)
+		int flags)
 			__acquires(op->lock);
 static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
 	__releases(op->lock);
@@ -143,9 +143,7 @@
 	if (!(flags & ORANGEFS_OP_NO_MUTEX))
 		mutex_unlock(&orangefs_request_mutex);
 
-	ret = wait_for_matching_downcall(op, timeout,
-					 flags & ORANGEFS_OP_INTERRUPTIBLE);
-
+	ret = wait_for_matching_downcall(op, timeout, flags);
 	gossip_debug(GOSSIP_WAIT_DEBUG,
 		     "%s: wait_for_matching_downcall returned %d for %p\n",
 		     __func__,
@@ -319,10 +317,12 @@
  */
 static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
 		long timeout,
-		bool interruptible)
+		int flags)
 			__acquires(op->lock)
 {
 	long n;
+	int writeback = flags & ORANGEFS_OP_WRITEBACK,
+	    interruptible = flags & ORANGEFS_OP_INTERRUPTIBLE;
 
 	/*
 	 * There's a "schedule_timeout" inside of these wait
@@ -330,10 +330,12 @@
 	 * user process that needs something done and is being
 	 * manipulated by the client-core process.
 	 */
-	if (interruptible)
+	if (writeback)
+		n = wait_for_completion_io_timeout(&op->waitq, timeout);
+	else if (!writeback && interruptible)
 		n = wait_for_completion_interruptible_timeout(&op->waitq,
-							      timeout);
-	else
+							      timeout);
+	else /* !writeback && !interruptible but compiler complains */
 		n = wait_for_completion_killable_timeout(&op->waitq, timeout);
 
 	spin_lock(&op->lock);
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 03bcb87..bdc285a 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
  *
  * See COPYING in top-level directory.
  */
@@ -14,7 +15,7 @@
 #include "orangefs-bufmap.h"
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
-
+#include <linux/hashtable.h>
 
 #define SYSTEM_ORANGEFS_KEY "system.pvfs2."
 #define SYSTEM_ORANGEFS_KEY_LEN 13
@@ -50,6 +51,35 @@
 	return internal_flag;
 }
 
+static unsigned int xattr_key(const char *key)
+{
+	unsigned int i = 0;
+	while (*key)
+		i += *key++;
+	return i % 16;
+}
+
+static struct orangefs_cached_xattr *find_cached_xattr(struct inode *inode,
+    const char *key)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_cached_xattr *cx;
+	struct hlist_head *h;
+	struct hlist_node *tmp;
+	h = &orangefs_inode->xattr_cache[xattr_key(key)];
+	if (hlist_empty(h))
+		return NULL;
+	hlist_for_each_entry_safe(cx, tmp, h, node) {
+/*		if (!time_before(jiffies, cx->timeout)) {
+			hlist_del(&cx->node);
+			kfree(cx);
+			continue;
+		}*/
+		if (!strcmp(cx->key, key))
+			return cx;
+	}
+	return NULL;
+}
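xattr_key is a byte-sum hash folded into the 16 buckets implied by DECLARE_HASHTABLE(xattr_cache, 4). The function works unchanged in userspace; a quick check of how typical attribute names spread across buckets:

#include <stdio.h>

/* Same byte-sum hash as xattr_key, folded into 16 buckets. */
static unsigned int xattr_key(const char *key)
{
	unsigned int i = 0;

	while (*key)
		i += *key++;
	return i % 16;
}

int main(void)
{
	const char *names[] = {
		"user.foo", "user.bar", "security.capability",
		"system.posix_acl_access",
	};
	unsigned int n;

	for (n = 0; n < sizeof names / sizeof names[0]; n++)
		printf("%-24s -> bucket %u\n", names[n], xattr_key(names[n]));
	return 0;
}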
 
 /*
  * Tries to get a specified key's attributes of a given
@@ -65,6 +95,7 @@
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op = NULL;
+	struct orangefs_cached_xattr *cx;
 	ssize_t ret = -ENOMEM;
 	ssize_t length = 0;
 	int fsuid;
@@ -93,6 +124,27 @@
 
 	down_read(&orangefs_inode->xattr_sem);
 
+	cx = find_cached_xattr(inode, name);
+	if (cx && time_before(jiffies, cx->timeout)) {
+		if (cx->length == -1) {
+			ret = -ENODATA;
+			goto out_unlock;
+		} else {
+			if (size == 0) {
+				ret = cx->length;
+				goto out_unlock;
+			}
+			if (cx->length > size) {
+				ret = -ERANGE;
+				goto out_unlock;
+			}
+			memcpy(buffer, cx->val, cx->length);
+			memset(buffer + cx->length, 0, size - cx->length);
+			ret = cx->length;
+			goto out_unlock;
+		}
+	}
+
 	new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
 	if (!new_op)
 		goto out_unlock;
@@ -117,6 +169,15 @@
 				     " does not exist!\n",
 				     get_khandle_from_ino(inode),
 				     (char *)new_op->upcall.req.getxattr.key);
+			cx = kmalloc(sizeof *cx, GFP_KERNEL);
+			if (cx) {
+				strcpy(cx->key, name);
+				cx->length = -1;
+				cx->timeout = jiffies +
+				    orangefs_getattr_timeout_msecs*HZ/1000;
+				hash_add(orangefs_inode->xattr_cache, &cx->node,
+				    xattr_key(cx->key));
+			}
 		}
 		goto out_release_op;
 	}
@@ -156,6 +217,23 @@
 
 	ret = length;
 
+	if (cx) {
+		strcpy(cx->key, name);
+		memcpy(cx->val, buffer, length);
+		cx->length = length;
+		cx->timeout = jiffies + HZ;
+	} else {
+		cx = kmalloc(sizeof *cx, GFP_KERNEL);
+		if (cx) {
+			strcpy(cx->key, name);
+			memcpy(cx->val, buffer, length);
+			cx->length = length;
+			cx->timeout = jiffies + HZ;
+			hash_add(orangefs_inode->xattr_cache, &cx->node,
+			    xattr_key(cx->key));
+		}
+	}
+
 out_release_op:
 	op_release(new_op);
 out_unlock:
@@ -168,6 +246,9 @@
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op = NULL;
+	struct orangefs_cached_xattr *cx;
+	struct hlist_head *h;
+	struct hlist_node *tmp;
 	int ret = -ENOMEM;
 
 	if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN)
@@ -209,6 +290,16 @@
 		     "orangefs_inode_removexattr: returning %d\n", ret);
 
 	op_release(new_op);
+
+	h = &orangefs_inode->xattr_cache[xattr_key(name)];
+	hlist_for_each_entry_safe(cx, tmp, h, node) {
+		if (!strcmp(cx->key, name)) {
+			hlist_del(&cx->node);
+			kfree(cx);
+			break;
+		}
+	}
+
 out_unlock:
 	up_write(&orangefs_inode->xattr_sem);
 	return ret;
@@ -226,6 +317,9 @@
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op;
 	int internal_flag = 0;
+	struct orangefs_cached_xattr *cx;
+	struct hlist_head *h;
+	struct hlist_node *tmp;
 	int ret = -ENOMEM;
 
 	gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -287,6 +381,16 @@
 
 	/* when request is serviced properly, free req op struct */
 	op_release(new_op);
+
+	h = &orangefs_inode->xattr_cache[xattr_key(name)];
+	hlist_for_each_entry_safe(cx, tmp, h, node) {
+		if (!strcmp(cx->key, name)) {
+			hlist_del(&cx->node);
+			kfree(cx);
+			break;
+		}
+	}
+
 out_unlock:
 	up_write(&orangefs_inode->xattr_sem);
 	return ret;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 2ef91be..444e2da 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config OVERLAY_FS
 	tristate "Overlay filesystem support"
 	select EXPORTFS
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 46e1ff8..9164c58 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the overlay filesystem.
 #
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 1cc797a..b801c63 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/module.h>
@@ -37,7 +34,7 @@
 }
 
 module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
-MODULE_PARM_DESC(ovl_check_copy_up, "Obsolete; does nothing");
+MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
 
 int ovl_copy_xattr(struct dentry *old, struct dentry *new)
 {
@@ -125,6 +122,7 @@
 	struct file *new_file;
 	loff_t old_pos = 0;
 	loff_t new_pos = 0;
+	loff_t cloned;
 	int error = 0;
 
 	if (len == 0)
@@ -141,11 +139,10 @@
 	}
 
 	/* Try to use clone_file_range to clone up within the same fs */
-	error = do_clone_file_range(old_file, 0, new_file, 0, len);
-	if (!error)
+	cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+	if (cloned == len)
 		goto out;
 	/* Couldn't clone, so now we try to copy the data */
-	error = 0;
 
 	/* FIXME: copy up sparse files efficiently */
 	while (len) {
@@ -395,7 +392,6 @@
 	struct dentry *destdir;
 	struct qstr destname;
 	struct dentry *workdir;
-	bool tmpfile;
 	bool origin;
 	bool indexed;
 	bool metacopy;
@@ -440,67 +436,28 @@
 	return err;
 }
 
-static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
-			    struct dentry **newdentry)
-{
-	int err;
-	struct dentry *upper;
-	struct inode *udir = d_inode(c->destdir);
-
-	upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
-	if (IS_ERR(upper))
-		return PTR_ERR(upper);
-
-	if (c->tmpfile)
-		err = ovl_do_link(temp, udir, upper);
-	else
-		err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
-
-	if (!err)
-		*newdentry = dget(c->tmpfile ? upper : temp);
-	dput(upper);
-
-	return err;
-}
-
-static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
-{
-	int err;
-	struct dentry *temp;
-	const struct cred *old_creds = NULL;
-	struct cred *new_creds = NULL;
-	struct ovl_cattr cattr = {
-		/* Can't properly set mode on creation because of the umask */
-		.mode = c->stat.mode & S_IFMT,
-		.rdev = c->stat.rdev,
-		.link = c->link
-	};
-
-	err = security_inode_copy_up(c->dentry, &new_creds);
-	temp = ERR_PTR(err);
-	if (err < 0)
-		goto out;
-
-	if (new_creds)
-		old_creds = override_creds(new_creds);
-
-	if (c->tmpfile)
-		temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
-	else
-		temp = ovl_create_temp(c->workdir, &cattr);
-out:
-	if (new_creds) {
-		revert_creds(old_creds);
-		put_cred(new_creds);
-	}
-
-	return temp;
-}
-
 static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
 {
 	int err;
 
+	/*
+	 * Copy up data first and then xattrs. Writing the data after the
+	 * xattrs would automatically clear the just-copied
+	 * security.capability xattr.
+	 */
+	if (S_ISREG(c->stat.mode) && !c->metacopy) {
+		struct path upperpath, datapath;
+
+		ovl_path_upper(c->dentry, &upperpath);
+		if (WARN_ON(upperpath.dentry != NULL))
+			return -EIO;
+		upperpath.dentry = temp;
+
+		ovl_path_lowerdata(c->dentry, &datapath);
+		err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+		if (err)
+			return err;
+	}
+
 	err = ovl_copy_xattr(c->lowerpath.dentry, temp);
 	if (err)
 		return err;
@@ -518,19 +475,6 @@
 			return err;
 	}
 
-	if (S_ISREG(c->stat.mode) && !c->metacopy) {
-		struct path upperpath, datapath;
-
-		ovl_path_upper(c->dentry, &upperpath);
-		BUG_ON(upperpath.dentry != NULL);
-		upperpath.dentry = temp;
-
-		ovl_path_lowerdata(c->dentry, &datapath);
-		err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
-		if (err)
-			return err;
-	}
-
 	if (c->metacopy) {
 		err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
 					 NULL, 0, -EOPNOTSUPP);
@@ -548,51 +492,148 @@
 	return err;
 }
 
-static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
+struct ovl_cu_creds {
+	const struct cred *old;
+	struct cred *new;
+};
+
+static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
 {
-	struct inode *udir = c->destdir->d_inode;
-	struct inode *inode;
-	struct dentry *newdentry = NULL;
-	struct dentry *temp;
 	int err;
 
-	temp = ovl_get_tmpfile(c);
+	cc->old = cc->new = NULL;
+	err = security_inode_copy_up(dentry, &cc->new);
+	if (err < 0)
+		return err;
+
+	if (cc->new)
+		cc->old = override_creds(cc->new);
+
+	return 0;
+}
+
+static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
+{
+	if (cc->new) {
+		revert_creds(cc->old);
+		put_cred(cc->new);
+	}
+}
+
+/*
+ * Copyup using workdir to prepare temp file.  Used when copying up directories,
+ * special files or when upper fs doesn't support O_TMPFILE.
+ */
+static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
+{
+	struct inode *inode;
+	struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+	struct dentry *temp, *upper;
+	struct ovl_cu_creds cc;
+	int err;
+	struct ovl_cattr cattr = {
+		/* Can't properly set mode on creation because of the umask */
+		.mode = c->stat.mode & S_IFMT,
+		.rdev = c->stat.rdev,
+		.link = c->link
+	};
+
+	err = ovl_lock_rename_workdir(c->workdir, c->destdir);
+	if (err)
+		return err;
+
+	err = ovl_prep_cu_creds(c->dentry, &cc);
+	if (err)
+		goto unlock;
+
+	temp = ovl_create_temp(c->workdir, &cattr);
+	ovl_revert_cu_creds(&cc);
+
+	err = PTR_ERR(temp);
+	if (IS_ERR(temp))
+		goto unlock;
+
+	err = ovl_copy_up_inode(c, temp);
+	if (err)
+		goto cleanup;
+
+	if (S_ISDIR(c->stat.mode) && c->indexed) {
+		err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
+		if (err)
+			goto cleanup;
+	}
+
+	upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+	err = PTR_ERR(upper);
+	if (IS_ERR(upper))
+		goto cleanup;
+
+	err = ovl_do_rename(wdir, temp, udir, upper, 0);
+	dput(upper);
+	if (err)
+		goto cleanup;
+
+	if (!c->metacopy)
+		ovl_set_upperdata(d_inode(c->dentry));
+	inode = d_inode(c->dentry);
+	ovl_inode_update(inode, temp);
+	if (S_ISDIR(inode->i_mode))
+		ovl_set_flag(OVL_WHITEOUTS, inode);
+unlock:
+	unlock_rename(c->workdir, c->destdir);
+
+	return err;
+
+cleanup:
+	ovl_cleanup(wdir, temp);
+	dput(temp);
+	goto unlock;
+}
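The workdir strategy is the in-kernel analogue of a familiar userspace idiom: build the file under a scratch name, then rename it into place so the destination name only ever refers to a complete file. A minimal sketch with illustrative paths:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char tmp[] = "/tmp/.copyup-XXXXXX";
	int fd;

	/* Create under a scratch name, like ovl_create_temp in workdir. */
	fd = mkstemp(tmp);
	if (fd < 0) {
		perror("mkstemp");
		return 1;
	}
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	close(fd);

	/* Atomically move into place, like ovl_do_rename to destdir. */
	if (rename(tmp, "/tmp/copied-up") < 0) {
		perror("rename");
		return 1;
	}
	return 0;
}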
+
+/* Copyup using O_TMPFILE which does not require cross dir locking */
+static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
+{
+	struct inode *udir = d_inode(c->destdir);
+	struct dentry *temp, *upper;
+	struct ovl_cu_creds cc;
+	int err;
+
+	err = ovl_prep_cu_creds(c->dentry, &cc);
+	if (err)
+		return err;
+
+	temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
+	ovl_revert_cu_creds(&cc);
+
 	if (IS_ERR(temp))
 		return PTR_ERR(temp);
 
 	err = ovl_copy_up_inode(c, temp);
 	if (err)
-		goto out;
+		goto out_dput;
 
-	if (S_ISDIR(c->stat.mode) && c->indexed) {
-		err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
-		if (err)
-			goto out;
-	}
+	inode_lock_nested(udir, I_MUTEX_PARENT);
 
-	if (c->tmpfile) {
-		inode_lock_nested(udir, I_MUTEX_PARENT);
-		err = ovl_install_temp(c, temp, &newdentry);
-		inode_unlock(udir);
-	} else {
-		err = ovl_install_temp(c, temp, &newdentry);
+	upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+	err = PTR_ERR(upper);
+	if (!IS_ERR(upper)) {
+		err = ovl_do_link(temp, udir, upper);
+		dput(upper);
 	}
+	inode_unlock(udir);
+
 	if (err)
-		goto out;
+		goto out_dput;
 
 	if (!c->metacopy)
 		ovl_set_upperdata(d_inode(c->dentry));
-	inode = d_inode(c->dentry);
-	ovl_inode_update(inode, newdentry);
-	if (S_ISDIR(inode->i_mode))
-		ovl_set_flag(OVL_WHITEOUTS, inode);
+	ovl_inode_update(d_inode(c->dentry), temp);
 
-out:
-	if (err && !c->tmpfile)
-		ovl_cleanup(d_inode(c->workdir), temp);
+	return 0;
+
+out_dput:
 	dput(temp);
 	return err;
-
 }
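
For reference, a minimal userspace sketch of the same O_TMPFILE-then-link technique the function above relies on; the /tmp paths are illustrative and error handling is abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char proc[64];
	int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);

	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	write(fd, "data", 4);	/* populate while still anonymous */

	/* give the finished file a name, as ovl_do_link() does above */
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, "/tmp/result", AT_SYMLINK_FOLLOW))
		perror("linkat");
	close(fd);
	return 0;
}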
 
 /*
@@ -646,18 +687,10 @@
 	}
 
 	/* Should we copyup with O_TMPFILE or with workdir? */
-	if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
-		c->tmpfile = true;
-		err = ovl_copy_up_locked(c);
-	} else {
-		err = ovl_lock_rename_workdir(c->workdir, c->destdir);
-		if (!err) {
-			err = ovl_copy_up_locked(c);
-			unlock_rename(c->workdir, c->destdir);
-		}
-	}
-
-
+	if (S_ISREG(c->stat.mode) && ofs->tmpfile)
+		err = ovl_copy_up_tmpfile(c);
+	else
+		err = ovl_copy_up_workdir(c);
 	if (err)
 		goto out;
 
@@ -706,6 +739,8 @@
 {
 	struct path upperpath, datapath;
 	int err;
+	char *capability = NULL;
+	ssize_t uninitialized_var(cap_size);
 
 	ovl_path_upper(c->dentry, &upperpath);
 	if (WARN_ON(upperpath.dentry == NULL))
@@ -715,15 +750,37 @@
 	if (WARN_ON(datapath.dentry == NULL))
 		return -EIO;
 
+	if (c->stat.size) {
+		err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
+					      &capability, 0);
+		if (err < 0 && err != -ENODATA)
+			goto out;
+	}
+
 	err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
 	if (err)
-		return err;
+		goto out_free;
+
+	/*
+	 * Writing to the upper file will clear the security.capability xattr.
+	 * We don't want that to happen during a normal copy-up operation.
+	 */
+	if (capability) {
+		err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+				      capability, cap_size, 0);
+		if (err)
+			goto out_free;
+	}
 
 	err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
 	if (err)
-		return err;
+		goto out_free;
 
 	ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+	kfree(capability);
+out:
 	return err;
 }
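
A userspace sketch of the same save-then-restore idea using getxattr(2)/setxattr(2); the file name is illustrative, and writing security.capability back requires CAP_SETFCAP:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len = getxattr("file", "security.capability",
			       buf, sizeof(buf));

	if (len < 0)
		return 0;	/* ENODATA: nothing to preserve */

	/* ... rewriting the file data clears the xattr ... */

	if (setxattr("file", "security.capability", buf, len, 0))
		perror("setxattr");	/* needs CAP_SETFCAP */
	return 0;
}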
 
@@ -849,14 +906,14 @@
 	return true;
 }
 
-int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
+int ovl_maybe_copy_up(struct dentry *dentry, int flags)
 {
 	int err = 0;
 
-	if (ovl_open_need_copy_up(dentry, file_flags)) {
+	if (ovl_open_need_copy_up(dentry, flags)) {
 		err = ovl_want_write(dentry);
 		if (!err) {
-			err = ovl_copy_up_flags(dentry, file_flags);
+			err = ovl_copy_up_flags(dentry, flags);
 			ovl_drop_write(dentry);
 		}
 	}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index b2aadd3..702aa63 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -21,7 +18,7 @@
 
 static unsigned short ovl_redirect_max = 256;
 module_param_named(redirect_max, ovl_redirect_max, ushort, 0644);
-MODULE_PARM_DESC(ovl_redirect_max,
+MODULE_PARM_DESC(redirect_max,
 		 "Maximum length of absolute redirect xattr value");
 
 static int ovl_set_redirect(struct dentry *dentry, bool samedir);
@@ -260,7 +257,7 @@
 		 * hashed directory inode aliases.
 		 */
 		inode = ovl_get_inode(dentry->d_sb, &oip);
-		if (WARN_ON(IS_ERR(inode)))
+		if (IS_ERR(inode))
 			return PTR_ERR(inode);
 	} else {
 		WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
@@ -414,13 +411,12 @@
 	if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
 		return 0;
 
-	size = posix_acl_to_xattr(NULL, acl, NULL, 0);
+	size = posix_acl_xattr_size(acl->a_count);
 	buffer = kmalloc(size, GFP_KERNEL);
 	if (!buffer)
 		return -ENOMEM;
 
-	size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	err = size;
+	err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
 	if (err < 0)
 		goto out_free;
 
@@ -668,7 +664,6 @@
 		    struct dentry *new)
 {
 	int err;
-	bool locked = false;
 	struct inode *inode;
 
 	err = ovl_want_write(old);
@@ -689,7 +684,7 @@
 			goto out_drop_write;
 	}
 
-	err = ovl_nlink_start(old, &locked);
+	err = ovl_nlink_start(old);
 	if (err)
 		goto out_drop_write;
 
@@ -702,7 +697,7 @@
 	if (err)
 		iput(inode);
 
-	ovl_nlink_end(old, locked);
+	ovl_nlink_end(old);
 out_drop_write:
 	ovl_drop_write(old);
 out:
@@ -827,7 +822,6 @@
 static int ovl_do_remove(struct dentry *dentry, bool is_dir)
 {
 	int err;
-	bool locked = false;
 	const struct cred *old_cred;
 	struct dentry *upperdentry;
 	bool lower_positive = ovl_lower_positive(dentry);
@@ -848,7 +842,7 @@
 	if (err)
 		goto out_drop_write;
 
-	err = ovl_nlink_start(dentry, &locked);
+	err = ovl_nlink_start(dentry);
 	if (err)
 		goto out_drop_write;
 
@@ -864,7 +858,7 @@
 		else
 			drop_nlink(dentry->d_inode);
 	}
-	ovl_nlink_end(dentry, locked);
+	ovl_nlink_end(dentry);
 
 	/*
 	 * Copy ctime
@@ -1028,7 +1022,6 @@
 		      unsigned int flags)
 {
 	int err;
-	bool locked = false;
 	struct dentry *old_upperdir;
 	struct dentry *new_upperdir;
 	struct dentry *olddentry;
@@ -1037,6 +1030,7 @@
 	bool old_opaque;
 	bool new_opaque;
 	bool cleanup_whiteout = false;
+	bool update_nlink = false;
 	bool overwrite = !(flags & RENAME_EXCHANGE);
 	bool is_dir = d_is_dir(old);
 	bool new_is_dir = d_is_dir(new);
@@ -1094,10 +1088,12 @@
 		err = ovl_copy_up(new);
 		if (err)
 			goto out_drop_write;
-	} else {
-		err = ovl_nlink_start(new, &locked);
+	} else if (d_inode(new)) {
+		err = ovl_nlink_start(new);
 		if (err)
 			goto out_drop_write;
+
+		update_nlink = true;
 	}
 
 	old_cred = ovl_override_creds(old->d_sb);
@@ -1226,7 +1222,8 @@
 	unlock_rename(new_upperdir, old_upperdir);
 out_revert_creds:
 	revert_creds(old_cred);
-	ovl_nlink_end(new, locked);
+	if (update_nlink)
+		ovl_nlink_end(new);
 out_drop_write:
 	ovl_drop_write(old);
 out:
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 54e5d17..73c9775 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Overlayfs NFS export support.
  *
  * Amir Goldstein <amir73il@gmail.com>
  *
  * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -230,9 +227,8 @@
 	/* Encode an upper or lower file handle */
 	fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
 				ovl_dentry_upper(dentry), !enc_lower);
-	err = PTR_ERR(fh);
 	if (IS_ERR(fh))
-		goto fail;
+		return PTR_ERR(fh);
 
 	err = -EOVERFLOW;
 	if (fh->len > buflen)
@@ -398,7 +394,7 @@
 	 * pointer because we hold no lock on the real dentry.
 	 */
 	take_dentry_name_snapshot(&name, real);
-	this = lookup_one_len(name.name, connected, strlen(name.name));
+	this = lookup_one_len(name.name.name, connected, name.name.len);
 	err = PTR_ERR(this);
 	if (IS_ERR(this)) {
 		goto fail;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 986313d..e235a63 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2017 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/cred.h>
@@ -11,6 +8,7 @@
 #include <linux/mount.h>
 #include <linux/xattr.h>
 #include <linux/uio.h>
+#include <linux/uaccess.h>
 #include "overlayfs.h"
 
 static char ovl_whatisit(struct inode *inode, struct inode *realinode)
@@ -29,10 +27,11 @@
 	struct inode *inode = file_inode(file);
 	struct file *realfile;
 	const struct cred *old_cred;
+	int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
 
 	old_cred = ovl_override_creds(inode->i_sb);
-	realfile = open_with_fake_path(&file->f_path, file->f_flags | O_NOATIME,
-				       realinode, current_cred());
+	realfile = open_with_fake_path(&file->f_path, flags, realinode,
+				       current_cred());
 	revert_creds(old_cred);
 
 	pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
@@ -50,7 +49,7 @@
 	int err;
 
 	/* No atime modification on underlying */
-	flags |= O_NOATIME;
+	flags |= O_NOATIME | FMODE_NONOTIFY;
 
 	/* If some flag changed that cannot be changed then something's amiss */
 	if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
@@ -116,11 +115,10 @@
 
 static int ovl_open(struct inode *inode, struct file *file)
 {
-	struct dentry *dentry = file_dentry(file);
 	struct file *realfile;
 	int err;
 
-	err = ovl_open_maybe_copy_up(dentry, file->f_flags);
+	err = ovl_maybe_copy_up(file_dentry(file), file->f_flags);
 	if (err)
 		return err;
 
@@ -145,11 +143,47 @@
 
 static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
 {
-	struct inode *realinode = ovl_inode_real(file_inode(file));
+	struct inode *inode = file_inode(file);
+	struct fd real;
+	const struct cred *old_cred;
+	ssize_t ret;
 
-	return generic_file_llseek_size(file, offset, whence,
-					realinode->i_sb->s_maxbytes,
-					i_size_read(realinode));
+	/*
+	 * The two special cases below do not need to involve real fs,
+	 * so we can optimize concurrent callers.
+	 */
+	if (offset == 0) {
+		if (whence == SEEK_CUR)
+			return file->f_pos;
+
+		if (whence == SEEK_SET)
+			return vfs_setpos(file, 0, 0);
+	}
+
+	ret = ovl_real_fdget(file, &real);
+	if (ret)
+		return ret;
+
+	/*
+	 * Overlay file f_pos is the master copy that is preserved
+	 * through copy up and modified on read/write, but only real
+	 * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose
+	 * limitations that are more strict than ->s_maxbytes for specific
+	 * files, so we use the real file to perform seeks.
+	 */
+	inode_lock(inode);
+	real.file->f_pos = file->f_pos;
+
+	old_cred = ovl_override_creds(inode->i_sb);
+	ret = vfs_llseek(real.file, offset, whence);
+	revert_creds(old_cred);
+
+	file->f_pos = real.file->f_pos;
+	inode_unlock(inode);
+
+	fdput(real);
+
+	return ret;
 }
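
This stacked llseek is what lets SEEK_HOLE/SEEK_DATA on an overlay file reach the real fs; a small userspace sketch of the extent walk it enables (file name from argv, assuming a sparse file):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	off_t data = 0, hole;
	int fd = open(argv[1], O_RDONLY);

	if (fd < 0)
		return 1;
	/* lseek() fails with ENXIO once no more data remains */
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data %lld..%lld\n", (long long)data, (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}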
 
 static void ovl_file_accessed(struct file *file)
@@ -372,34 +406,118 @@
 	return ret;
 }
 
-static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
+				unsigned long arg, unsigned int iflags)
 {
 	long ret;
 	struct inode *inode = file_inode(file);
+	unsigned int old_iflags;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	inode_lock(inode);
+
+	/* Check the capability before cred override */
+	ret = -EPERM;
+	old_iflags = READ_ONCE(inode->i_flags);
+	if (((iflags ^ old_iflags) & (S_APPEND | S_IMMUTABLE)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto unlock;
+
+	ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
+	if (ret)
+		goto unlock;
+
+	ret = ovl_real_ioctl(file, cmd, arg);
+
+	ovl_copyflags(ovl_inode_real(inode), inode);
+unlock:
+	inode_unlock(inode);
+
+	mnt_drop_write_file(file);
+
+	return ret;
+}
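
The userspace view of this path, sketched with the inode-flags ioctls; FS_APPEND_FL is the chattr +a bit, and flipping it is exactly what needs CAP_LINUX_IMMUTABLE above (file name from argv):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(int argc, char *argv[])
{
	int flags, fd = open(argv[1], O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	flags |= FS_APPEND_FL;			/* like chattr +a */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");	/* EPERM without CAP_LINUX_IMMUTABLE */
	return 0;
}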
+
+static unsigned int ovl_fsflags_to_iflags(unsigned int flags)
+{
+	unsigned int iflags = 0;
+
+	if (flags & FS_SYNC_FL)
+		iflags |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		iflags |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		iflags |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		iflags |= S_NOATIME;
+
+	return iflags;
+}
+
+static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	unsigned int flags;
+
+	if (get_user(flags, (int __user *) arg))
+		return -EFAULT;
+
+	return ovl_ioctl_set_flags(file, cmd, arg,
+				   ovl_fsflags_to_iflags(flags));
+}
+
+static unsigned int ovl_fsxflags_to_iflags(unsigned int xflags)
+{
+	unsigned int iflags = 0;
+
+	if (xflags & FS_XFLAG_SYNC)
+		iflags |= S_SYNC;
+	if (xflags & FS_XFLAG_APPEND)
+		iflags |= S_APPEND;
+	if (xflags & FS_XFLAG_IMMUTABLE)
+		iflags |= S_IMMUTABLE;
+	if (xflags & FS_XFLAG_NOATIME)
+		iflags |= S_NOATIME;
+
+	return iflags;
+}
+
+static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct fsxattr fa;
+
+	memset(&fa, 0, sizeof(fa));
+	if (copy_from_user(&fa, (void __user *) arg, sizeof(fa)))
+		return -EFAULT;
+
+	return ovl_ioctl_set_flags(file, cmd, arg,
+				   ovl_fsxflags_to_iflags(fa.fsx_xflags));
+}
+
+static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret;
 
 	switch (cmd) {
 	case FS_IOC_GETFLAGS:
+	case FS_IOC_FSGETXATTR:
 		ret = ovl_real_ioctl(file, cmd, arg);
 		break;
 
 	case FS_IOC_SETFLAGS:
-		if (!inode_owner_or_capable(inode))
-			return -EACCES;
+		ret = ovl_ioctl_set_fsflags(file, cmd, arg);
+		break;
 
-		ret = mnt_want_write_file(file);
-		if (ret)
-			return ret;
-
-		ret = ovl_copy_up_with_data(file_dentry(file));
-		if (!ret) {
-			ret = ovl_real_ioctl(file, cmd, arg);
-
-			inode_lock(inode);
-			ovl_copyflags(ovl_inode_real(inode), inode);
-			inode_unlock(inode);
-		}
-
-		mnt_drop_write_file(file);
+	case FS_IOC_FSSETXATTR:
+		ret = ovl_ioctl_set_fsxflags(file, cmd, arg);
 		break;
 
 	default:
@@ -434,14 +552,14 @@
 	OVL_DEDUPE,
 };
 
-static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
+static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 			    struct file *file_out, loff_t pos_out,
-			    u64 len, unsigned int flags, enum ovl_copyop op)
+			    loff_t len, unsigned int flags, enum ovl_copyop op)
 {
 	struct inode *inode_out = file_inode(file_out);
 	struct fd real_in, real_out;
 	const struct cred *old_cred;
-	ssize_t ret;
+	loff_t ret;
 
 	ret = ovl_real_fdget(file_out, &real_out);
 	if (ret)
@@ -462,12 +580,13 @@
 
 	case OVL_CLONE:
 		ret = vfs_clone_file_range(real_in.file, pos_in,
-					   real_out.file, pos_out, len);
+					   real_out.file, pos_out, len, flags);
 		break;
 
 	case OVL_DEDUPE:
 		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
-						real_out.file, pos_out, len);
+						real_out.file, pos_out, len,
+						flags);
 		break;
 	}
 	revert_creds(old_cred);
@@ -489,26 +608,31 @@
 			    OVL_COPY);
 }
 
-static int ovl_clone_file_range(struct file *file_in, loff_t pos_in,
-				struct file *file_out, loff_t pos_out, u64 len)
+static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
+				   struct file *file_out, loff_t pos_out,
+				   loff_t len, unsigned int remap_flags)
 {
-	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
-			    OVL_CLONE);
-}
+	enum ovl_copyop op;
 
-static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in,
-				 struct file *file_out, loff_t pos_out, u64 len)
-{
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		op = OVL_DEDUPE;
+	else
+		op = OVL_CLONE;
+
 	/*
 	 * Don't copy up because of a dedupe request; this wouldn't make sense
 	 * most of the time (data would be duplicated instead of deduplicated).
 	 */
-	if (!ovl_inode_upper(file_inode(file_in)) ||
-	    !ovl_inode_upper(file_inode(file_out)))
+	if (op == OVL_DEDUPE &&
+	    (!ovl_inode_upper(file_inode(file_in)) ||
+	     !ovl_inode_upper(file_inode(file_out))))
 		return -EPERM;
 
-	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
-			    OVL_DEDUPE);
+	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+			    remap_flags, op);
 }
 
 const struct file_operations ovl_file_operations = {
@@ -525,6 +649,5 @@
 	.compat_ioctl	= ovl_compat_ioctl,
 
 	.copy_file_range	= ovl_copy_file_range,
-	.clone_file_range	= ovl_clone_file_range,
-	.dedupe_file_range	= ovl_dedupe_file_range,
+	.remap_file_range	= ovl_remap_file_range,
 };
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 3b7ed5d..bc14781 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -386,7 +383,8 @@
 		return true;
 
 	/* Never list trusted.overlay, list other trusted for superuser only */
-	return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
+	return !ovl_is_private_xattr(s) &&
+	       ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
 }
 
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
@@ -553,15 +551,15 @@
 	int xinobits = ovl_xino_bits(inode->i_sb);
 
 	/*
-	 * When NFS export is enabled and d_ino is consistent with st_ino
-	 * (samefs or i_ino has enough bits to encode layer), set the same
-	 * value used for d_ino to i_ino, because nfsd readdirplus compares
-	 * d_ino values to i_ino values of child entries. When called from
+	 * When d_ino is consistent with st_ino (samefs or i_ino has enough
+	 * bits to encode layer), set the same value used for st_ino to i_ino,
+	 * so the inode number exposed via /proc/locks and the like will be
+	 * consistent with d_ino and st_ino values. An i_ino value inconsistent
+	 * with d_ino also causes nfsd readdirplus to fail.  When called from
 	 * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
 	 * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
 	 */
-	if (inode->i_sb->s_export_op &&
-	    (ovl_same_sb(inode->i_sb) || xinobits)) {
+	if (ovl_same_sb(inode->i_sb) || xinobits) {
 		inode->i_ino = ino;
 		if (xinobits && fsid && !(ino >> (64 - xinobits)))
 			inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
@@ -777,6 +775,54 @@
 	return inode;
 }
 
+bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir)
+{
+	struct inode *key = d_inode(dir);
+	struct inode *trap;
+	bool res;
+
+	trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
+	if (!trap)
+		return false;
+
+	res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) &&
+				  !ovl_inode_lower(trap);
+
+	iput(trap);
+	return res;
+}
+
+/*
+ * Create an inode cache entry for a layer root dir that will intentionally
+ * fail ovl_verify_inode(), so any lookup that finds a layer root will fail.
+ */
+struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
+{
+	struct inode *key = d_inode(dir);
+	struct inode *trap;
+
+	if (!d_is_dir(dir))
+		return ERR_PTR(-ENOTDIR);
+
+	trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test,
+			    ovl_inode_set, key);
+	if (!trap)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(trap->i_state & I_NEW)) {
+		/* Conflicting layer roots? */
+		iput(trap);
+		return ERR_PTR(-ELOOP);
+	}
+
+	trap->i_mode = S_IFDIR;
+	trap->i_flags = S_DEAD;
+	unlock_new_inode(trap);
+
+	return trap;
+}
+
 /*
  * Does overlay inode need to be hashed by lower inode?
  */
@@ -832,7 +878,7 @@
 	int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
 	bool is_dir, metacopy = false;
 	unsigned long ino = 0;
-	int err = -ENOMEM;
+	int err = oip->newinode ? -EEXIST : -ENOMEM;
 
 	if (!realinode)
 		realinode = d_inode(lowerdentry);
@@ -917,6 +963,7 @@
 	return inode;
 
 out_err:
+	pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err);
 	inode = ERR_PTR(err);
 	goto out;
 }
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index efd3723..e9717c2 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2011 Novell Inc.
  * Copyright (C) 2016 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -18,6 +15,7 @@
 #include "overlayfs.h"
 
 struct ovl_lookup_data {
+	struct super_block *sb;
 	struct qstr name;
 	bool is_dir;
 	bool opaque;
@@ -244,6 +242,12 @@
 		if (!d->metacopy || d->last)
 			goto out;
 	} else {
+		if (ovl_lookup_trap_inode(d->sb, this)) {
+			/* Caught in a trap of overlapping layers */
+			err = -ELOOP;
+			goto out_err;
+		}
+
 		if (last_element)
 			d->is_dir = true;
 		if (d->last)
@@ -819,6 +823,7 @@
 	int err;
 	bool metacopy = false;
 	struct ovl_lookup_data d = {
+		.sb = dentry->d_sb,
 		.name = dentry->d_name,
 		.is_dir = false,
 		.opaque = false,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index a3c0d95..6934bcf 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/kernel.h>
@@ -270,13 +267,16 @@
 bool ovl_test_flag(unsigned long flag, struct inode *inode);
 bool ovl_inuse_trylock(struct dentry *dentry);
 void ovl_inuse_unlock(struct dentry *dentry);
+bool ovl_is_inuse(struct dentry *dentry);
 bool ovl_need_index(struct dentry *dentry);
-int ovl_nlink_start(struct dentry *dentry, bool *locked);
-void ovl_nlink_end(struct dentry *dentry, bool locked);
+int ovl_nlink_start(struct dentry *dentry);
+void ovl_nlink_end(struct dentry *dentry);
 int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
 int ovl_check_metacopy_xattr(struct dentry *dentry);
 bool ovl_is_metacopy_dentry(struct dentry *dentry);
 char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+		     size_t padding);
 
 static inline bool ovl_is_impuredir(struct dentry *dentry)
 {
@@ -290,6 +290,16 @@
 	return ofs->xino_bits;
 }
 
+static inline int ovl_inode_lock(struct inode *inode)
+{
+	return mutex_lock_interruptible(&OVL_I(inode)->lock);
+}
+
+static inline void ovl_inode_unlock(struct inode *inode)
+{
+	mutex_unlock(&OVL_I(inode)->lock);
+}
+
 
 /* namei.c */
 int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
@@ -364,6 +374,8 @@
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
 struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
 			       bool is_upper);
+bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir);
+struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir);
 struct inode *ovl_get_inode(struct super_block *sb,
 			    struct ovl_inode_params *oip);
 static inline void ovl_copyattr(struct inode *from, struct inode *to)
@@ -409,7 +421,7 @@
 int ovl_copy_up(struct dentry *dentry);
 int ovl_copy_up_with_data(struct dentry *dentry);
 int ovl_copy_up_flags(struct dentry *dentry, int flags);
-int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
+int ovl_maybe_copy_up(struct dentry *dentry, int flags);
 int ovl_copy_xattr(struct dentry *old, struct dentry *new);
 int ovl_set_attr(struct dentry *upper, struct kstat *stat);
 struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index ec23703..a827928 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *
  * Copyright (C) 2011 Novell Inc.
  * Copyright (C) 2016 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 struct ovl_config {
@@ -29,6 +26,8 @@
 
 struct ovl_layer {
 	struct vfsmount *mnt;
+	/* Trap in ovl inode cache */
+	struct inode *trap;
 	struct ovl_sb *fs;
 	/* Index of this layer in fs root (upper idx == 0) */
 	int idx;
@@ -65,6 +64,11 @@
 	/* Did we take the inuse lock? */
 	bool upperdir_locked;
 	bool workdir_locked;
+	/* Traps in ovl inode cache */
+	struct inode *upperdir_trap;
+	struct inode *workbasedir_trap;
+	struct inode *workdir_trap;
+	struct inode *indexdir_trap;
 	/* Inode numbers in all layers do not use the high xino_bits */
 	unsigned int xino_bits;
 };
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index cc8303a..47a91c9 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0fb0a59..afbcb11 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *
  * Copyright (C) 2011 Novell Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <uapi/linux/magic.h>
@@ -31,29 +28,29 @@
 
 static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
 module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
-MODULE_PARM_DESC(ovl_redirect_dir_def,
+MODULE_PARM_DESC(redirect_dir,
 		 "Default to on or off for the redirect_dir feature");
 
 static bool ovl_redirect_always_follow =
 	IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW);
 module_param_named(redirect_always_follow, ovl_redirect_always_follow,
 		   bool, 0644);
-MODULE_PARM_DESC(ovl_redirect_always_follow,
+MODULE_PARM_DESC(redirect_always_follow,
 		 "Follow redirects even if redirect_dir feature is turned off");
 
 static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX);
 module_param_named(index, ovl_index_def, bool, 0644);
-MODULE_PARM_DESC(ovl_index_def,
+MODULE_PARM_DESC(index,
 		 "Default to on or off for the inodes index feature");
 
 static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT);
 module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644);
-MODULE_PARM_DESC(ovl_nfs_export_def,
+MODULE_PARM_DESC(nfs_export,
 		 "Default to on or off for the NFS export feature");
 
 static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO);
 module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644);
-MODULE_PARM_DESC(ovl_xino_auto_def,
+MODULE_PARM_DESC(xino_auto,
 		 "Auto enable xino feature");
 
 static void ovl_entry_stack_free(struct ovl_entry *oe)
@@ -66,7 +63,7 @@
 
 static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY);
 module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
-MODULE_PARM_DESC(ovl_metacopy_def,
+MODULE_PARM_DESC(metacopy,
 		 "Default to on or off for the metadata only copy up feature");
 
 static void ovl_dentry_release(struct dentry *dentry)
@@ -190,11 +187,13 @@
 	return &oi->vfs_inode;
 }
 
-static void ovl_i_callback(struct rcu_head *head)
+static void ovl_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ovl_inode *oi = OVL_I(inode);
 
-	kmem_cache_free(ovl_inode_cachep, OVL_I(inode));
+	kfree(oi->redirect);
+	mutex_destroy(&oi->lock);
+	kmem_cache_free(ovl_inode_cachep, oi);
 }
 
 static void ovl_destroy_inode(struct inode *inode)
@@ -207,16 +206,16 @@
 		ovl_dir_cache_free(inode);
 	else
 		iput(oi->lowerdata);
-	kfree(oi->redirect);
-	mutex_destroy(&oi->lock);
-
-	call_rcu(&inode->i_rcu, ovl_i_callback);
 }
 
 static void ovl_free_fs(struct ovl_fs *ofs)
 {
 	unsigned i;
 
+	iput(ofs->workbasedir_trap);
+	iput(ofs->indexdir_trap);
+	iput(ofs->workdir_trap);
+	iput(ofs->upperdir_trap);
 	dput(ofs->indexdir);
 	dput(ofs->workdir);
 	if (ofs->workdir_locked)
@@ -225,8 +224,10 @@
 	if (ofs->upperdir_locked)
 		ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
 	mntput(ofs->upper_mnt);
-	for (i = 0; i < ofs->numlower; i++)
+	for (i = 0; i < ofs->numlower; i++) {
+		iput(ofs->lower_layers[i].trap);
 		mntput(ofs->lower_layers[i].mnt);
+	}
 	for (i = 0; i < ofs->numlowerfs; i++)
 		free_anon_bdev(ofs->lower_fs[i].pseudo_dev);
 	kfree(ofs->lower_layers);
@@ -377,6 +378,7 @@
 
 static const struct super_operations ovl_super_operations = {
 	.alloc_inode	= ovl_alloc_inode,
+	.free_inode	= ovl_free_inode,
 	.destroy_inode	= ovl_destroy_inode,
 	.drop_inode	= generic_delete_inode,
 	.put_super	= ovl_put_super,
@@ -984,7 +986,45 @@
 	NULL
 };
 
-static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
+static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
+			  struct inode **ptrap, const char *name)
+{
+	struct inode *trap;
+	int err;
+
+	trap = ovl_get_trap_inode(sb, dir);
+	err = PTR_ERR_OR_ZERO(trap);
+	if (err) {
+		if (err == -ELOOP)
+			pr_err("overlayfs: conflicting %s path\n", name);
+		return err;
+	}
+
+	*ptrap = trap;
+	return 0;
+}
+
+/*
+ * Determine how we treat concurrent use of upperdir/workdir based on the
+ * index feature. This is papering over mount leaks of container runtimes;
+ * for example, an old overlay mount is leaked and its upperdir is then
+ * used as a lower layer in a new overlay mount.
+ */
+static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
+{
+	if (ofs->config.index) {
+		pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
+		       name);
+		return -EBUSY;
+	} else {
+		pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
+			name);
+		return 0;
+	}
+}
+
+static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
+			 struct path *upperpath)
 {
 	struct vfsmount *upper_mnt;
 	int err;
@@ -1004,6 +1044,11 @@
 	if (err)
 		goto out;
 
+	err = ovl_setup_trap(sb, upperpath->dentry, &ofs->upperdir_trap,
+			     "upperdir");
+	if (err)
+		goto out;
+
 	upper_mnt = clone_private_mount(upperpath);
 	err = PTR_ERR(upper_mnt);
 	if (IS_ERR(upper_mnt)) {
@@ -1015,14 +1060,12 @@
 	upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
 	ofs->upper_mnt = upper_mnt;
 
-	err = -EBUSY;
 	if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
 		ofs->upperdir_locked = true;
-	} else if (ofs->config.index) {
-		pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
-		goto out;
 	} else {
-		pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+		err = ovl_report_in_use(ofs, "upperdir");
+		if (err)
+			goto out;
 	}
 
 	err = 0;
@@ -1030,7 +1073,8 @@
 	return err;
 }
 
-static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
+static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
+			    struct path *workpath)
 {
 	struct vfsmount *mnt = ofs->upper_mnt;
 	struct dentry *temp;
@@ -1045,6 +1089,10 @@
 	if (!ofs->workdir)
 		goto out;
 
+	err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir");
+	if (err)
+		goto out;
+
 	/*
 	 * Upper should support d_type, else whiteouts are visible.  Given
 	 * workdir and upper are on same fs, we can do iterate_dir() on
@@ -1105,7 +1153,8 @@
 	return err;
 }
 
-static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
+static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
+			   struct path *upperpath)
 {
 	int err;
 	struct path workpath = { };
@@ -1126,29 +1175,29 @@
 
 	ofs->workbasedir = dget(workpath.dentry);
 
-	err = -EBUSY;
 	if (ovl_inuse_trylock(ofs->workbasedir)) {
 		ofs->workdir_locked = true;
-	} else if (ofs->config.index) {
-		pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
-		goto out;
 	} else {
-		pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+		err = ovl_report_in_use(ofs, "workdir");
+		if (err)
+			goto out;
 	}
 
-	err = ovl_make_workdir(ofs, &workpath);
+	err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap,
+			     "workdir");
 	if (err)
 		goto out;
 
-	err = 0;
+	err = ovl_make_workdir(sb, ofs, &workpath);
+
 out:
 	path_put(&workpath);
 
 	return err;
 }
 
-static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe,
-			    struct path *upperpath)
+static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
+			    struct ovl_entry *oe, struct path *upperpath)
 {
 	struct vfsmount *mnt = ofs->upper_mnt;
 	int err;
@@ -1167,6 +1216,11 @@
 
 	ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
 	if (ofs->indexdir) {
+		err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
+				     "indexdir");
+		if (err)
+			goto out;
+
 		/*
 		 * Verify upper root is exclusively associated with index dir.
 		 * Older kernels stored upper fh in "trusted.overlay.origin"
@@ -1197,10 +1251,30 @@
 	return err;
 }
 
-/* Get a unique fsid for the layer */
-static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb)
+static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
 {
 	unsigned int i;
+
+	if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt))
+		return true;
+
+	for (i = 0; i < ofs->numlowerfs; i++) {
+		/*
+		 * We use uuid to associate an overlay lower file handle with a
+		 * lower layer, so we can accept lower fs with null uuid as long
+		 * as all lower layers with null uuid are on the same fs.
+		 */
+		if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid))
+			return false;
+	}
+	return true;
+}
+
+/* Get a unique fsid for the layer */
+static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
+{
+	struct super_block *sb = path->mnt->mnt_sb;
+	unsigned int i;
 	dev_t dev;
 	int err;
 
@@ -1213,6 +1287,14 @@
 			return i + 1;
 	}
 
+	if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
+		ofs->config.index = false;
+		ofs->config.nfs_export = false;
+		pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
+			uuid_is_null(&sb->s_uuid) ? "null" : "conflicting",
+			path->dentry);
+	}
+
 	err = get_anon_bdev(&dev);
 	if (err) {
 		pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
@@ -1226,8 +1308,8 @@
 	return ofs->numlowerfs;
 }
 
-static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
-				unsigned int numlower)
+static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
+				struct path *stack, unsigned int numlower)
 {
 	int err;
 	unsigned int i;
@@ -1245,16 +1327,28 @@
 
 	for (i = 0; i < numlower; i++) {
 		struct vfsmount *mnt;
+		struct inode *trap;
 		int fsid;
 
-		err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb);
+		err = fsid = ovl_get_fsid(ofs, &stack[i]);
 		if (err < 0)
 			goto out;
 
+		err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
+		if (err)
+			goto out;
+
+		if (ovl_is_inuse(stack[i].dentry)) {
+			err = ovl_report_in_use(ofs, "lowerdir");
+			if (err)
+				goto out;
+		}
+
 		mnt = clone_private_mount(&stack[i]);
 		err = PTR_ERR(mnt);
 		if (IS_ERR(mnt)) {
 			pr_err("overlayfs: failed to clone lowerpath\n");
+			iput(trap);
 			goto out;
 		}
 
@@ -1264,6 +1358,7 @@
 		 */
 		mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
 
+		ofs->lower_layers[ofs->numlower].trap = trap;
 		ofs->lower_layers[ofs->numlower].mnt = mnt;
 		ofs->lower_layers[ofs->numlower].idx = i + 1;
 		ofs->lower_layers[ofs->numlower].fsid = fsid;
@@ -1358,7 +1453,7 @@
 		goto out_err;
 	}
 
-	err = ovl_get_lower_layers(ofs, stack, numlower);
+	err = ovl_get_lower_layers(sb, ofs, stack, numlower);
 	if (err)
 		goto out_err;
 
@@ -1390,6 +1485,77 @@
 	goto out;
 }
 
+/*
+ * Check if this layer root is a descendant of:
+ * - another layer of this overlayfs instance
+ * - upper/work dir of any overlayfs instance
+ */
+static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
+			   struct dentry *dentry, const char *name)
+{
+	struct dentry *next = dentry, *parent;
+	int err = 0;
+
+	if (!dentry)
+		return 0;
+
+	parent = dget_parent(next);
+
+	/* Walk back ancestors to root (inclusive) looking for traps */
+	while (!err && parent != next) {
+		if (ovl_lookup_trap_inode(sb, parent)) {
+			err = -ELOOP;
+			pr_err("overlayfs: overlapping %s path\n", name);
+		} else if (ovl_is_inuse(parent)) {
+			err = ovl_report_in_use(ofs, name);
+		}
+		next = parent;
+		parent = dget_parent(next);
+		dput(next);
+	}
+
+	dput(parent);
+
+	return err;
+}
+
+/*
+ * Check if any of the layers or work dirs overlap.
+ */
+static int ovl_check_overlapping_layers(struct super_block *sb,
+					struct ovl_fs *ofs)
+{
+	int i, err;
+
+	if (ofs->upper_mnt) {
+		err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root,
+				      "upperdir");
+		if (err)
+			return err;
+
+		/*
+		 * Checking workbasedir avoids hitting ovl_is_inuse(parent) of
+		 * this instance and covers overlapping work and index dirs,
+		 * unless the work or index dir has been moved since it was
+		 * created inside workbasedir.  In that case, we already have
+		 * their traps in the inode cache and will catch them on lookup.
+		 */
+		err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir");
+		if (err)
+			return err;
+	}
+
+	for (i = 0; i < ofs->numlower; i++) {
+		err = ovl_check_layer(sb, ofs,
+				      ofs->lower_layers[i].mnt->mnt_root,
+				      "lowerdir");
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
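
A hedged userspace analog of the ancestor walk above: test whether dir lies under root by climbing ".." and comparing (st_dev, st_ino) until ".." stops changing at "/" (helper name is made up):

#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

static int is_under(const char *dir, const char *root)
{
	struct stat st, rst, prev = { 0 };
	char path[PATH_MAX];

	if (stat(root, &rst) || stat(dir, &st))
		return 0;
	snprintf(path, sizeof(path), "%s", dir);
	/* walk back ancestors to "/" (whose ".." is itself) */
	while (st.st_dev != prev.st_dev || st.st_ino != prev.st_ino) {
		if (st.st_dev == rst.st_dev && st.st_ino == rst.st_ino)
			return 1;	/* hit root: dir is underneath it */
		prev = st;
		if (strlen(path) + 3 >= sizeof(path))
			break;
		strcat(path, "/..");
		if (stat(path, &st))
			break;
	}
	return 0;
}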
+
 static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct path upperpath = { };
@@ -1429,17 +1595,20 @@
 	if (ofs->config.xino != OVL_XINO_OFF)
 		ofs->xino_bits = BITS_PER_LONG - 32;
 
+	/* alloc/destroy_inode needed for setting up traps in inode cache */
+	sb->s_op = &ovl_super_operations;
+
 	if (ofs->config.upperdir) {
 		if (!ofs->config.workdir) {
 			pr_err("overlayfs: missing 'workdir'\n");
 			goto out_err;
 		}
 
-		err = ovl_get_upper(ofs, &upperpath);
+		err = ovl_get_upper(sb, ofs, &upperpath);
 		if (err)
 			goto out_err;
 
-		err = ovl_get_workdir(ofs, &upperpath);
+		err = ovl_get_workdir(sb, ofs, &upperpath);
 		if (err)
 			goto out_err;
 
@@ -1460,7 +1629,7 @@
 		sb->s_flags |= SB_RDONLY;
 
 	if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
-		err = ovl_get_indexdir(ofs, oe, &upperpath);
+		err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
 		if (err)
 			goto out_free_oe;
 
@@ -1473,6 +1642,10 @@
 
 	}
 
+	err = ovl_check_overlapping_layers(sb, ofs);
+	if (err)
+		goto out_free_oe;
+
 	/* Show index=off in /proc/mounts for forced r/o mount */
 	if (!ofs->indexdir) {
 		ofs->config.index = false;
@@ -1494,7 +1667,6 @@
 	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
 
 	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
-	sb->s_op = &ovl_super_operations;
 	sb->s_xattr = ovl_xattr_handlers;
 	sb->s_fs_info = ofs;
 	sb->s_flags |= SB_POSIXACL;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index ace4fe4..f5678a3 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2011 Novell Inc.
  * Copyright (C) 2016 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
  */
 
 #include <linux/fs.h>
@@ -65,8 +62,7 @@
  */
 int ovl_can_decode_fh(struct super_block *sb)
 {
-	if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry ||
-	    uuid_is_null(&sb->s_uuid))
+	if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
 		return 0;
 
 	return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
@@ -522,13 +518,13 @@
 
 int ovl_copy_up_start(struct dentry *dentry, int flags)
 {
-	struct ovl_inode *oi = OVL_I(d_inode(dentry));
+	struct inode *inode = d_inode(dentry);
 	int err;
 
-	err = mutex_lock_interruptible(&oi->lock);
+	err = ovl_inode_lock(inode);
 	if (!err && ovl_already_copied_up_locked(dentry, flags)) {
 		err = 1; /* Already copied up */
-		mutex_unlock(&oi->lock);
+		ovl_inode_unlock(inode);
 	}
 
 	return err;
@@ -536,7 +532,7 @@
 
 void ovl_copy_up_end(struct dentry *dentry)
 {
-	mutex_unlock(&OVL_I(d_inode(dentry))->lock);
+	ovl_inode_unlock(d_inode(dentry));
 }
 
 bool ovl_check_origin_xattr(struct dentry *dentry)
@@ -653,6 +649,18 @@
 	}
 }
 
+bool ovl_is_inuse(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	bool inuse;
+
+	spin_lock(&inode->i_lock);
+	inuse = (inode->i_state & I_OVL_INUSE);
+	spin_unlock(&inode->i_lock);
+
+	return inuse;
+}
+
 /*
  * Does this overlay dentry need to be indexed on copy up?
  */
@@ -739,14 +747,14 @@
  * Operations that change overlay inode and upper inode nlink need to be
  * synchronized with copy up for persistent nlink accounting.
  */
-int ovl_nlink_start(struct dentry *dentry, bool *locked)
+int ovl_nlink_start(struct dentry *dentry)
 {
-	struct ovl_inode *oi = OVL_I(d_inode(dentry));
+	struct inode *inode = d_inode(dentry);
 	const struct cred *old_cred;
 	int err;
 
-	if (!d_inode(dentry))
-		return 0;
+	if (WARN_ON(!inode))
+		return -ENOENT;
 
 	/*
 	 * With inodes index is enabled, we store the union overlay nlink
@@ -768,11 +776,11 @@
 			return err;
 	}
 
-	err = mutex_lock_interruptible(&oi->lock);
+	err = ovl_inode_lock(inode);
 	if (err)
 		return err;
 
-	if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+	if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
 		goto out;
 
 	old_cred = ovl_override_creds(dentry->d_sb);
@@ -787,27 +795,24 @@
 
 out:
 	if (err)
-		mutex_unlock(&oi->lock);
-	else
-		*locked = true;
+		ovl_inode_unlock(inode);
 
 	return err;
 }
 
-void ovl_nlink_end(struct dentry *dentry, bool locked)
+void ovl_nlink_end(struct dentry *dentry)
 {
-	if (locked) {
-		if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) &&
-		    d_inode(dentry)->i_nlink == 0) {
-			const struct cred *old_cred;
+	struct inode *inode = d_inode(dentry);
 
-			old_cred = ovl_override_creds(dentry->d_sb);
-			ovl_cleanup_index(dentry);
-			revert_creds(old_cred);
-		}
+	if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
+		const struct cred *old_cred;
 
-		mutex_unlock(&OVL_I(d_inode(dentry))->lock);
+		old_cred = ovl_override_creds(dentry->d_sb);
+		ovl_cleanup_index(dentry);
+		revert_creds(old_cred);
 	}
+
+	ovl_inode_unlock(inode);
 }
 
 int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
@@ -867,28 +872,49 @@
 	return (oe->numlower > 1);
 }
 
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+		     size_t padding)
+{
+	ssize_t res;
+	char *buf = NULL;
+
+	res = vfs_getxattr(dentry, name, NULL, 0);
+	if (res < 0) {
+		if (res == -ENODATA || res == -EOPNOTSUPP)
+			return -ENODATA;
+		goto fail;
+	}
+
+	if (res != 0) {
+		buf = kzalloc(res + padding, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+
+		res = vfs_getxattr(dentry, name, buf, res);
+		if (res < 0)
+			goto fail;
+	}
+	*value = buf;
+
+	return res;
+
+fail:
+	pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+			    name, res);
+	kfree(buf);
+	return res;
+}
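
The same probe-size-then-fetch pattern sketched against the userspace getxattr(2) API; the helper name is made up, and the second result is checked because the value can change between the two calls:

#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

static char *xattr_dup(const char *path, const char *name, ssize_t *lenp)
{
	char *buf;
	ssize_t len = getxattr(path, name, NULL, 0);	/* probe size */

	if (len < 0)
		return NULL;
	buf = calloc(1, len + 1);	/* +1 of zeroed padding, as above */
	if (buf && getxattr(path, name, buf, len) != len) {
		free(buf);		/* raced with a concurrent change */
		return NULL;
	}
	*lenp = len;
	return buf;
}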
+
 char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
 {
 	int res;
 	char *s, *next, *buf = NULL;
 
-	res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
-	if (res < 0) {
-		if (res == -ENODATA || res == -EOPNOTSUPP)
-			return NULL;
-		goto fail;
-	}
-
-	buf = kzalloc(res + padding + 1, GFP_KERNEL);
-	if (!buf)
-		return ERR_PTR(-ENOMEM);
-
-	if (res == 0)
-		goto invalid;
-
-	res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+	res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
+	if (res == -ENODATA)
+		return NULL;
 	if (res < 0)
-		goto fail;
+		return ERR_PTR(res);
 	if (res == 0)
 		goto invalid;
 
@@ -904,15 +930,9 @@
 	}
 
 	return buf;
-
-err_free:
-	kfree(buf);
-	return ERR_PTR(res);
-fail:
-	pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
-	goto err_free;
 invalid:
 	pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
 	res = -EINVAL;
-	goto err_free;
+	kfree(buf);
+	return ERR_PTR(res);
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c..8a2ab2f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/log2.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/magic.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
@@ -140,8 +141,7 @@
 	struct page *page = buf->page;
 
 	if (page_count(page) == 1) {
-		if (memcg_kmem_enabled())
-			memcg_kmem_uncharge(page, 0);
+		memcg_kmem_uncharge(page, 0);
 		__SetPageLocked(page);
 		return 0;
 	}
@@ -189,9 +189,9 @@
  *	in the tee() system call, when we duplicate the buffers in one
  *	pipe into another.
  */
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	get_page(buf->page);
+	return try_get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
@@ -226,8 +226,15 @@
 }
 EXPORT_SYMBOL(generic_pipe_buf_release);
 
+/* New data written to a pipe may be appended to a buffer with this type. */
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
-	.can_merge = 1,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = anon_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
@@ -235,13 +242,32 @@
 };
 
 static const struct pipe_buf_operations packet_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
+/**
+ * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
+ * @buf:	the buffer to mark
+ *
+ * Description:
+ *	This function ensures that no future writes will be merged into the
+ *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
+ *	share the same backing page.
+ */
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+	if (buf->ops == &anon_pipe_buf_ops)
+		buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
+static bool pipe_buf_can_merge(struct pipe_buffer *buf)
+{
+	return buf->ops == &anon_pipe_buf_ops;
+}
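
Why merging must be blocked: tee(2) duplicates pipe buffers by reference, so two pipes can share one backing page and appending to it from either side would corrupt the other. A minimal sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int a[2], b[2];
	char buf[5];

	if (pipe(a) || pipe(b))
		return 1;
	write(a[1], "hello", 5);
	tee(a[0], b[1], 5, 0);	/* b now shares a's buffer page */
	read(b[0], buf, 5);
	printf("%.5s\n", buf);	/* hello */
	return 0;
}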
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -379,7 +405,7 @@
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		int offset = buf->offset + buf->len;
 
-		if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
 			ret = pipe_buf_confirm(pipe, buf);
 			if (ret)
 				goto out;
@@ -1157,16 +1183,20 @@
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-static struct dentry *pipefs_mount(struct file_system_type *fs_type,
-			 int flags, const char *dev_name, void *data)
+
+static int pipefs_init_fs_context(struct fs_context *fc)
 {
-	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
-			&pipefs_dentry_operations, PIPEFS_MAGIC);
+	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->ops = &pipefs_ops;
+	ctx->dops = &pipefs_dentry_operations;
+	return 0;
 }
 
 static struct file_system_type pipe_fs_type = {
 	.name		= "pipefs",
-	.mount		= pipefs_mount,
+	.init_fs_context = pipefs_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 53d411a..49f6d7f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -1,15 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/pnode.c
  *
  * (C) Copyright IBM Corporation 2005.
- *	Released under GPL v2.
  *	Author : Ram Pai (linuxram@us.ibm.com)
- *
  */
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
 #include <linux/nsproxy.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 #include "pnode.h"
 
@@ -213,7 +213,6 @@
 }
 
 /* all accesses are serialized by namespace_sem */
-static struct user_namespace *user_ns;
 static struct mount *last_dest, *first_source, *last_source, *dest_master;
 static struct mountpoint *mp;
 static struct hlist_head *list;
@@ -259,13 +258,9 @@
 			type |= CL_MAKE_SHARED;
 	}
 		
-	/* Notice when we are propagating across user namespaces */
-	if (m->mnt_ns->user_ns != user_ns)
-		type |= CL_UNPRIVILEGED;
 	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
 	if (IS_ERR(child))
 		return PTR_ERR(child);
-	child->mnt.mnt_flags &= ~MNT_LOCKED;
 	mnt_set_mountpoint(m, mp, child);
 	last_dest = m;
 	last_source = child;
@@ -302,7 +297,6 @@
 	 * propagate_one(); everything is serialized by namespace_sem,
 	 * so globals will do just fine.
 	 */
-	user_ns = current->nsproxy->mnt_ns->user_ns;
 	last_dest = dest_mnt;
 	first_source = source_mnt;
 	last_source = source_mnt;
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65..49a058c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -1,9 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  *  linux/fs/pnode.h
  *
  * (C) Copyright IBM Corporation 2005.
- *	Released under GPL v2.
- *
  */
 #ifndef _LINUX_PNODE_H
 #define _LINUX_PNODE_H
@@ -27,8 +26,7 @@
 #define CL_MAKE_SHARED 		0x08
 #define CL_PRIVATE 		0x10
 #define CL_SHARED_TO_SLAVE	0x20
-#define CL_UNPRIVILEGED		0x40
-#define CL_COPY_MNT_NS_FILE	0x80
+#define CL_COPY_MNT_NS_FILE	0x40
 
 #define CL_COPY_ALL		(CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2fd0fde..84ad1c9 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  *
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b..cb5629b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config PROC_FS
 	bool "/proc file system support" if EXPERT
 	default y
@@ -57,7 +58,8 @@
 	  snapshot.
 
 	  If you say Y here, the collected device dumps will be added
-	  as ELF notes to /proc/vmcore.
+	  as ELF notes to /proc/vmcore. You can still disable device
+	  dump using the kernel command line option 'novmcoredd'.
 
 config PROC_SYSCTL
 	bool "Sysctl support (/proc/sys)" if EXPERT
@@ -71,7 +73,7 @@
 	  interface is through /proc/sys.  If you say Y here a tree of
 	  modifiable sysctl entries will be generated beneath the
           /proc/sys directory. They are explained in the files
-	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  in <file:Documentation/admin-guide/sysctl/>.  Note that enabling this
 	  option will enlarge the kernel by at least 8 KB.
 
 	  As it is generally a good thing, you should say Y here unless
@@ -97,3 +99,7 @@
 
 	  Say Y if you are running any user-space software which benefits from
 	  this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+	def_bool n
+	depends on PROC_FS
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0ceb3b6..46dcb6f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
-	seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+	seq_puts(m, "\nSpeculation_Store_Bypass:\t");
 	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
 	case -EINVAL:
-		seq_printf(m, "unknown");
+		seq_puts(m, "unknown");
 		break;
 	case PR_SPEC_NOT_AFFECTED:
-		seq_printf(m, "not vulnerable");
+		seq_puts(m, "not vulnerable");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
-		seq_printf(m, "thread force mitigated");
+		seq_puts(m, "thread force mitigated");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
-		seq_printf(m, "thread mitigated");
+		seq_puts(m, "thread mitigated");
 		break;
 	case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
-		seq_printf(m, "thread vulnerable");
+		seq_puts(m, "thread vulnerable");
 		break;
 	case PR_SPEC_DISABLE:
-		seq_printf(m, "globally mitigated");
+		seq_puts(m, "globally mitigated");
 		break;
 	default:
-		seq_printf(m, "vulnerable");
+		seq_puts(m, "vulnerable");
 		break;
 	}
 	seq_putc(m, '\n');
@@ -381,9 +381,9 @@
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
@@ -392,6 +392,15 @@
 	seq_putc(m, '\n');
 }
 
+static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
+{
+	bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
+
+	if (thp_enabled)
+		thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+	seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -406,6 +415,7 @@
 	if (mm) {
 		task_mem(m, mm);
 		task_core_dumping(m, mm);
+		task_thp_status(m, mm);
 		mmput(mm);
 	}
 	task_sig(m, task);
@@ -452,7 +462,7 @@
 		 * a program is not able to use ptrace(2) in that case. It is
 		 * safe because the task has stopped executing permanently.
 		 */
-		if (permitted && (task->flags & PF_DUMPCORE)) {
+		if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
 			if (try_get_task_stack(task)) {
 				eip = KSTK_EIP(task);
 				esp = KSTK_ESP(task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07b..ebea950 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
 #include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/namei.h>
@@ -92,7 +93,6 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/stat.h>
-#include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #include <trace/events/oom.h>
 #include "internal.h"
@@ -140,9 +140,13 @@
 #define REG(NAME, MODE, fops)				\
 	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
 #define ONE(NAME, MODE, show)				\
-	NOD(NAME, (S_IFREG|(MODE)), 			\
+	NOD(NAME, (S_IFREG|(MODE)),			\
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
+#define ATTR(LSM, NAME, MODE)				\
+	NOD(NAME, (S_IFREG|(MODE)),			\
+		NULL, &proc_pid_attr_operations,	\
+		{ .lsm = LSM })
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -205,12 +209,53 @@
 	return result;
 }
 
+/*
+ * If the user used setproctitle(), we just get the string from
+ * user space at arg_start, and limit it to a maximum of one page.
+ */
+static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
+				size_t count, unsigned long pos,
+				unsigned long arg_start)
+{
+	char *page;
+	int ret, got;
+
+	if (pos >= PAGE_SIZE)
+		return 0;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	ret = 0;
+	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
+	if (got > 0) {
+		int len = strnlen(page, got);
+
+		/* Include the NUL character if it was found */
+		if (len < got)
+			len++;
+
+		if (len > pos) {
+			len -= pos;
+			if (len > count)
+				len = count;
+			len -= copy_to_user(buf, page+pos, len);
+			if (!len)
+				len = -EFAULT;
+			ret = len;
+		}
+	}
+	free_page((unsigned long)page);
+	return ret;
+}
+
 static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
 			      size_t count, loff_t *ppos)
 {
 	unsigned long arg_start, arg_end, env_start, env_end;
 	unsigned long pos, len;
-	char *page;
+	char *page, c;
 
 	/* Check if process spawned far enough to have cmdline. */
 	if (!mm->env_end)
@@ -227,28 +272,42 @@
 		return 0;
 
 	/*
-	 * We have traditionally allowed the user to re-write
-	 * the argument strings and overflow the end result
-	 * into the environment section. But only do that if
-	 * the environment area is contiguous to the arguments.
+	 * We allow setproctitle() to overwrite the argument
+	 * strings, and overflow past the original end. But
+	 * only when it overflows into the environment area.
 	 */
-	if (env_start != arg_end || env_start >= env_end)
+	if (env_start != arg_end || env_end < env_start)
 		env_start = env_end = arg_end;
-
-	/* .. and limit it to a maximum of one page of slop */
-	if (env_end >= arg_end + PAGE_SIZE)
-		env_end = arg_end + PAGE_SIZE - 1;
+	len = env_end - arg_start;
 
 	/* We're not going to care if "*ppos" has high bits set */
-	pos = arg_start + *ppos;
-
-	/* .. but we do check the result is in the proper range */
-	if (pos < arg_start || pos >= env_end)
+	pos = *ppos;
+	if (pos >= len)
+		return 0;
+	if (count > len - pos)
+		count = len - pos;
+	if (!count)
 		return 0;
 
-	/* .. and we never go past env_end */
-	if (env_end - pos < count)
-		count = env_end - pos;
+	/*
+	 * Magical special case: if the argv[] end byte is not
+	 * zero, the user has overwritten it with setproctitle(3).
+	 *
+	 * Possible future enhancement: do this only once when
+	 * pos is 0, and set a flag in the 'struct file'.
+	 */
+	if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
+		return get_mm_proctitle(mm, buf, count, pos, arg_start);
+
+	/*
+	 * For the non-setproctitle() case we limit things strictly
+	 * to the [arg_start, arg_end[ range.
+	 */
+	pos += arg_start;
+	if (pos < arg_start || pos >= arg_end)
+		return 0;
+	if (count > arg_end - pos)
+		count = arg_end - pos;
 
 	page = (char *)__get_free_page(GFP_KERNEL);
 	if (!page)
@@ -258,48 +317,11 @@
 	while (count) {
 		int got;
 		size_t size = min_t(size_t, PAGE_SIZE, count);
-		long offset;
 
-		/*
-		 * Are we already starting past the official end?
-		 * We always include the last byte that is *supposed*
-		 * to be NUL
-		 */
-		offset = (pos >= arg_end) ? pos - arg_end + 1 : 0;
-
-		got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON);
-		if (got <= offset)
+		got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+		if (got <= 0)
 			break;
-		got -= offset;
-
-		/* Don't walk past a NUL character once you hit arg_end */
-		if (pos + got >= arg_end) {
-			int n = 0;
-
-			/*
-			 * If we started before 'arg_end' but ended up
-			 * at or after it, we start the NUL character
-			 * check at arg_end-1 (where we expect the normal
-			 * EOF to be).
-			 *
-			 * NOTE! This is smaller than 'got', because
-			 * pos + got >= arg_end
-			 */
-			if (pos < arg_end)
-				n = arg_end - pos - 1;
-
-			/* Cut off at first NUL after 'n' */
-			got = n + strnlen(page+n, offset+got-n);
-			if (got < offset)
-				break;
-			got -= offset;
-
-			/* Include the NUL if it existed */
-			if (got < size)
-				got++;
-		}
-
-		got -= copy_to_user(buf, page+offset, got);
+		got -= copy_to_user(buf, page, got);
 		if (unlikely(!got)) {
 			if (!len)
 				len = -EFAULT;
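
The rewritten get_mm_cmdline() splits into two regimes: the strict [arg_start, arg_end[ walk above, and get_mm_proctitle() when the probe byte at arg_end - 1 is non-NUL, i.e. the process rewrote its argv in place. A hedged userspace sketch of triggering the proctitle path (run with no arguments, so argv[0]'s terminator is the probed byte):

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char buf[256];
	size_t n;
	FILE *f;

	(void)argc;
	/* Clobber argv[0] and its terminating NUL in place. */
	memset(argv[0], 'X', strlen(argv[0]) + 1);

	f = fopen("/proc/self/cmdline", "r");
	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	/* Prints the rewritten title (possibly followed by the start of
	 * the environment, up to the first NUL the kernel found). */
	printf("cmdline now: %s\n", buf);
	fclose(f);
	return 0;
}
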
@@ -403,7 +425,6 @@
 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
-	struct stack_trace trace;
 	unsigned long *entries;
 	int err;
 
@@ -426,20 +447,17 @@
 	if (!entries)
 		return -ENOMEM;
 
-	trace.nr_entries	= 0;
-	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
-	trace.entries		= entries;
-	trace.skip		= 0;
-
 	err = lock_trace(task);
 	if (!err) {
-		unsigned int i;
+		unsigned int i, nr_entries;
 
-		save_stack_trace_tsk(task, &trace);
+		nr_entries = stack_trace_save_tsk(task, entries,
+						  MAX_STACK_TRACE_DEPTH, 0);
 
-		for (i = 0; i < trace.nr_entries; i++) {
+		for (i = 0; i < nr_entries; i++) {
 			seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
 		}
+
 		unlock_trace(task);
 	}
 	kfree(entries);
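
proc_pid_stack() now uses the simplified stacktrace API: stack_trace_save_tsk() takes the destination array, its capacity, and a skip count, and returns how many entries it stored, so the struct stack_trace scaffolding disappears. The call pattern, as a kernel-context sketch (not a standalone program):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

static void dump_task_stack(struct task_struct *task)
{
	unsigned long entries[64];
	unsigned int i, nr;

	/* The final argument skips that many leading entries. */
	nr = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
	for (i = 0; i < nr; i++)
		pr_info("[<0>] %pB\n", (void *)entries[i]);
}
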
@@ -456,7 +474,7 @@
 			      struct pid *pid, struct task_struct *task)
 {
 	if (unlikely(!sched_info_on()))
-		seq_printf(m, "0 0 0\n");
+		seq_puts(m, "0 0 0\n");
 	else
 		seq_printf(m, "%llu %llu %lu\n",
 		   (unsigned long long)task->se.sum_exec_runtime,
@@ -485,10 +503,9 @@
 				   lr->count, lr->time, lr->max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 				unsigned long bt = lr->backtrace[q];
+
 				if (!bt)
 					break;
-				if (bt == ULONG_MAX)
-					break;
 				seq_printf(m, " %ps", (void *)bt);
 			}
 			seq_putc(m, '\n');
@@ -511,7 +528,7 @@
 
 	if (!task)
 		return -ESRCH;
-	clear_all_latency_tracing(task);
+	clear_tsk_latency_tracing(task);
 	put_task_struct(task);
 
 	return count;
@@ -530,11 +547,10 @@
 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
-	unsigned long totalpages = totalram_pages + total_swap_pages;
+	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
-	points = oom_badness(task, NULL, NULL, totalpages) *
-					1000 / totalpages;
+	points = oom_badness(task, totalpages) * 1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
@@ -581,8 +597,10 @@
 	/*
 	 * print the file header
 	 */
-       seq_printf(m, "%-25s %-20s %-20s %-10s\n",
-		  "Limit", "Soft Limit", "Hard Limit", "Units");
+	seq_puts(m, "Limit                     "
+		"Soft Limit           "
+		"Hard Limit           "
+		"Units     \n");
 
 	for (i = 0; i < RLIM_NLIMITS; i++) {
 		if (rlim[i].rlim_cur == RLIM_INFINITY)
@@ -610,24 +628,25 @@
 static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
 			    struct pid *pid, struct task_struct *task)
 {
-	long nr;
-	unsigned long args[6], sp, pc;
+	struct syscall_info info;
+	u64 *args = &info.data.args[0];
 	int res;
 
 	res = lock_trace(task);
 	if (res)
 		return res;
 
-	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+	if (task_current_syscall(task, &info))
 		seq_puts(m, "running\n");
-	else if (nr < 0)
-		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+	else if (info.data.nr < 0)
+		seq_printf(m, "%d 0x%llx 0x%llx\n",
+			   info.data.nr, info.sp, info.data.instruction_pointer);
 	else
 		seq_printf(m,
-		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
-		       nr,
+		       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+		       info.data.nr,
 		       args[0], args[1], args[2], args[3], args[4], args[5],
-		       sp, pc);
+		       info.sp, info.data.instruction_pointer);
 	unlock_trace(task);
 
 	return 0;
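
proc_pid_syscall() switches from six loose output parameters to struct syscall_info, whose u64 fields print the same whether the inspected task is native or compat. The file still has three shapes: "running", "-1 sp pc" for a task blocked outside a syscall, and the full nine-field line. A hedged userspace sketch parsing the nine-field form:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t args[6], sp, pc;
	long nr;
	FILE *f;

	f = fopen("/proc/self/syscall", "r");
	if (!f)
		return 1;
	/* While we read, the task is blocked in read(2), so the full form appears. */
	if (fscanf(f, "%ld %" SCNx64 " %" SCNx64 " %" SCNx64 " %" SCNx64
		      " %" SCNx64 " %" SCNx64 " %" SCNx64 " %" SCNx64,
		   &nr, &args[0], &args[1], &args[2], &args[3],
		   &args[4], &args[5], &sp, &pc) == 9)
		printf("in syscall %ld, pc 0x%" PRIx64 "\n", nr, pc);
	fclose(f);
	return 0;
}
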
@@ -1084,10 +1103,6 @@
 
 			task_lock(p);
 			if (!p->vfork_done && process_shares_mm(p, mm)) {
-				pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
-						task_pid_nr(p), p->comm,
-						p->signal->oom_score_adj, oom_adj,
-						task_pid_nr(task), task->comm);
 				p->signal->oom_score_adj = oom_adj;
 				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
 					p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1208,7 +1223,7 @@
 	.llseek		= default_llseek,
 };
 
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
 #define TMPBUFLEN 11
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
@@ -1964,9 +1979,12 @@
 		goto out;
 
 	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
-		down_read(&mm->mmap_sem);
-		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
-		up_read(&mm->mmap_sem);
+		status = down_read_killable(&mm->mmap_sem);
+		if (!status) {
+			exact_vma_exists = !!find_exact_vma(mm, vm_start,
+							    vm_end);
+			up_read(&mm->mmap_sem);
+		}
 	}
 
 	mmput(mm);
@@ -2012,8 +2030,11 @@
 	if (rc)
 		goto out_mmput;
 
+	rc = down_read_killable(&mm->mmap_sem);
+	if (rc)
+		goto out_mmput;
+
 	rc = -ENOENT;
-	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
 		*path = vma->vm_file->f_path;
@@ -2109,7 +2130,11 @@
 	if (!mm)
 		goto out_put_task;
 
-	down_read(&mm->mmap_sem);
+	result = ERR_PTR(-EINTR);
+	if (down_read_killable(&mm->mmap_sem))
+		goto out_put_mm;
+
+	result = ERR_PTR(-ENOENT);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (!vma)
 		goto out_no_vma;
@@ -2120,6 +2145,7 @@
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
+out_put_mm:
 	mmput(mm);
 out_put_task:
 	put_task_struct(task);
@@ -2140,11 +2166,12 @@
 	struct task_struct *task;
 	struct mm_struct *mm;
 	unsigned long nr_files, pos, i;
-	struct flex_array *fa = NULL;
-	struct map_files_info info;
+	GENRADIX(struct map_files_info) fa;
 	struct map_files_info *p;
 	int ret;
 
+	genradix_init(&fa);
+
 	ret = -ENOENT;
 	task = get_proc_task(file_inode(file));
 	if (!task)
@@ -2161,7 +2188,12 @@
 	mm = get_task_mm(task);
 	if (!mm)
 		goto out_put_task;
-	down_read(&mm->mmap_sem);
+
+	ret = down_read_killable(&mm->mmap_sem);
+	if (ret) {
+		mmput(mm);
+		goto out_put_task;
+	}
 
 	nr_files = 0;
 
@@ -2176,35 +2208,22 @@
 	 */
 
 	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-		if (vma->vm_file && ++pos > ctx->pos)
-			nr_files++;
-	}
+		if (!vma->vm_file)
+			continue;
+		if (++pos <= ctx->pos)
+			continue;
 
-	if (nr_files) {
-		fa = flex_array_alloc(sizeof(info), nr_files,
-					GFP_KERNEL);
-		if (!fa || flex_array_prealloc(fa, 0, nr_files,
-						GFP_KERNEL)) {
+		p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+		if (!p) {
 			ret = -ENOMEM;
-			if (fa)
-				flex_array_free(fa);
 			up_read(&mm->mmap_sem);
 			mmput(mm);
 			goto out_put_task;
 		}
-		for (i = 0, vma = mm->mmap, pos = 2; vma;
-				vma = vma->vm_next) {
-			if (!vma->vm_file)
-				continue;
-			if (++pos <= ctx->pos)
-				continue;
 
-			info.start = vma->vm_start;
-			info.end = vma->vm_end;
-			info.mode = vma->vm_file->f_mode;
-			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
-				BUG();
-		}
+		p->start = vma->vm_start;
+		p->end = vma->vm_end;
+		p->mode = vma->vm_file->f_mode;
 	}
 	up_read(&mm->mmap_sem);
 	mmput(mm);
@@ -2213,7 +2232,7 @@
 		char buf[4 * sizeof(long) + 2];	/* max: %lx-%lx\0 */
 		unsigned int len;
 
-		p = flex_array_get(fa, i);
+		p = genradix_ptr(&fa, i);
 		len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
 		if (!proc_fill_cache(file, ctx,
 				      buf, len,
@@ -2223,12 +2242,11 @@
 			break;
 		ctx->pos++;
 	}
-	if (fa)
-		flex_array_free(fa);
 
 out_put_task:
 	put_task_struct(task);
 out:
+	genradix_free(&fa);
 	return ret;
 }
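
The readdir path above swaps flex_array, which needed an exact preallocation pass and explicit put/get calls, for the generic radix tree: genradix_ptr_alloc() grows the tree on demand and hands back a pointer straight into it, so the two-pass walk collapses into one. The pattern in isolation, as a kernel-context sketch with an illustrative payload struct:

#include <linux/generic-radix-tree.h>
#include <linux/slab.h>

struct demo_info {			/* illustrative payload */
	unsigned long start, end;
};

static int demo_fill(unsigned int n)
{
	GENRADIX(struct demo_info) fa;
	struct demo_info *p;
	unsigned int i;
	int ret;

	genradix_init(&fa);
	for (i = 0; i < n; i++) {
		/* Allocates backing pages as the index grows. */
		p = genradix_ptr_alloc(&fa, i, GFP_KERNEL);
		if (!p) {
			genradix_free(&fa);
			return -ENOMEM;
		}
		p->start = i;
	}
	p = genradix_ptr(&fa, 0);	/* plain lookup, no allocation */
	ret = p ? 0 : -ENOENT;
	genradix_free(&fa);
	return ret;
}
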
 
@@ -2356,10 +2374,13 @@
 		return -ESRCH;
 
 	if (p != current) {
-		if (!capable(CAP_SYS_NICE)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
 			count = -EPERM;
 			goto out;
 		}
+		rcu_read_unlock();
 
 		err = security_task_setscheduler(p);
 		if (err) {
@@ -2392,11 +2413,14 @@
 		return -ESRCH;
 
 	if (p != current) {
-
-		if (!capable(CAP_SYS_NICE)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
 			err = -EPERM;
 			goto out;
 		}
+		rcu_read_unlock();
+
 		err = security_task_getscheduler(p);
 		if (err)
 			goto out;
@@ -2451,11 +2475,10 @@
 
 static struct dentry *proc_pident_lookup(struct inode *dir, 
 					 struct dentry *dentry,
-					 const struct pid_entry *ents,
-					 unsigned int nents)
+					 const struct pid_entry *p,
+					 const struct pid_entry *end)
 {
 	struct task_struct *task = get_proc_task(dir);
-	const struct pid_entry *p, *last;
 	struct dentry *res = ERR_PTR(-ENOENT);
 
 	if (!task)
@@ -2465,8 +2488,7 @@
 	 * Yes, it does not scale. And it should not. Don't add
 	 * new entries into /proc/<tgid>/ without very good reasons.
 	 */
-	last = &ents[nents];
-	for (p = ents; p < last; p++) {
+	for (; p < end; p++) {
 		if (p->len != dentry->d_name.len)
 			continue;
 		if (!memcmp(dentry->d_name.name, p->name, p->len)) {
@@ -2517,7 +2539,7 @@
 	if (!task)
 		return -ESRCH;
 
-	length = security_getprocattr(task,
+	length = security_getprocattr(task, PROC_I(inode)->op.lsm,
 				      (char*)file->f_path.dentry->d_name.name,
 				      &p);
 	put_task_struct(task);
@@ -2546,6 +2568,11 @@
 		rcu_read_unlock();
 		return -EACCES;
 	}
+	/* Prevent changes to overridden credentials. */
+	if (current_cred() != current_real_cred()) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
 	rcu_read_unlock();
 
 	if (count > PAGE_SIZE)
@@ -2566,7 +2593,9 @@
 	if (rv < 0)
 		goto out_free;
 
-	rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+	rv = security_setprocattr(PROC_I(inode)->op.lsm,
+				  file->f_path.dentry->d_name.name, page,
+				  count);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 out_free:
 	kfree(page);
@@ -2580,13 +2609,53 @@
 	.llseek		= generic_file_llseek,
 };
 
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+			     struct dir_context *ctx) \
+{ \
+	return proc_pident_readdir(filp, ctx, \
+				   LSM##_attr_dir_stuff, \
+				   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+	.read		= generic_read_dir, \
+	.iterate	= proc_##LSM##_attr_dir_iterate, \
+	.llseek		= default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+				struct dentry *dentry, unsigned int flags) \
+{ \
+	return proc_pident_lookup(dir, dentry, \
+				  LSM##_attr_dir_stuff, \
+				  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+	.lookup		= proc_##LSM##_attr_dir_lookup, \
+	.getattr	= pid_getattr, \
+	.setattr	= proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+	ATTR("smack", "current",	0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
 static const struct pid_entry attr_dir_stuff[] = {
-	REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-	REG("prev",       S_IRUGO,	   proc_pid_attr_operations),
-	REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-	REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-	REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	ATTR(NULL, "current",		0666),
+	ATTR(NULL, "prev",		0444),
+	ATTR(NULL, "exec",		0666),
+	ATTR(NULL, "fscreate",		0666),
+	ATTR(NULL, "keycreate",		0666),
+	ATTR(NULL, "sockcreate",	0666),
+#ifdef CONFIG_SECURITY_SMACK
+	DIR("smack",			0555,
+	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
 };
 
 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
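
LSM_DIR_OPS() stamps out the iterate/lookup/inode-ops triple a security module needs to expose its own /proc/<pid>/attr/<lsm>/ subdirectory; Smack is the first user. For a hypothetical module "foo" the pieces would look like this (the names are illustrative, not from the patch):

#ifdef CONFIG_SECURITY_FOO		/* hypothetical LSM "foo" */
static const struct pid_entry foo_attr_dir_stuff[] = {
	ATTR("foo", "current",	0666),
};
LSM_DIR_OPS(foo);
#endif

	/* ...and the matching hook in attr_dir_stuff[]: */
#ifdef CONFIG_SECURITY_FOO
	DIR("foo",			0555,
	    proc_foo_attr_dir_inode_ops, proc_foo_attr_dir_ops),
#endif
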
@@ -2605,7 +2674,8 @@
 				struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
-				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+				  attr_dir_stuff,
+				  attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
 }
 
 static const struct inode_operations proc_attr_dir_inode_operations = {
@@ -2905,6 +2975,21 @@
 }
 #endif /* CONFIG_LIVEPATCH */
 
+#ifdef CONFIG_STACKLEAK_METRICS
+static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+				struct pid *pid, struct task_struct *task)
+{
+	unsigned long prev_depth = THREAD_SIZE -
+				(task->prev_lowest_stack & (THREAD_SIZE - 1));
+	unsigned long depth = THREAD_SIZE -
+				(task->lowest_stack & (THREAD_SIZE - 1));
+
+	seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+							prev_depth, depth);
+	return 0;
+}
+#endif /* CONFIG_STACKLEAK_METRICS */
+
 /*
  * Thread groups
  */
@@ -2979,7 +3064,7 @@
 	ONE("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
@@ -3006,6 +3091,12 @@
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_STACKLEAK_METRICS
+	ONE("stack_depth", S_IRUGO, proc_stack_depth),
+#endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3020,10 +3111,19 @@
 	.llseek		= generic_file_llseek,
 };
 
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	if (file->f_op != &proc_tgid_base_operations)
+		return ERR_PTR(-EBADF);
+
+	return proc_pid(file_inode(file));
+}
+
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
-				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+				  tgid_base_stuff,
+				  tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
 }
 
 static const struct inode_operations proc_tgid_base_inode_operations = {
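
tgid_pidfd_to_pid() is the procfs half of the pidfd plumbing: it lets an open /proc/<tgid> directory fd stand in for a process handle, which is what pidfd_send_signal(2) accepts. A hedged userspace sketch (assumes the headers define SYS_pidfd_send_signal, syscall 424 on x86-64):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int pidfd;

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s", argv[1]);
	pidfd = open(path, O_DIRECTORY | O_CLOEXEC);
	if (pidfd < 0 ||
	    syscall(SYS_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0) {
		perror("pidfd_send_signal");
		return 1;
	}
	close(pidfd);
	return 0;
}
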
@@ -3139,7 +3239,7 @@
 	return d_splice_alias(inode, dentry);
 }
 
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
 {
 	struct task_struct *task;
 	unsigned tgid;
@@ -3364,7 +3464,7 @@
 	ONE("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
@@ -3384,6 +3484,9 @@
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3395,7 +3498,8 @@
 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
-				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+				  tid_base_stuff,
+				  tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
 }
 
 static const struct file_operations proc_tid_base_operations = {
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 954caf0..dfe6ce3 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2010 Werner Fink, Jiri Slaby
- *
- * Licensed under GPLv2
  */
 
 #include <linux/console.h>
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ae1094..64e9ee1 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * proc/fs/generic.c --- generic routines for the proc-fs
  *
@@ -256,7 +257,7 @@
 		inode = proc_get_inode(dir->i_sb, de);
 		if (!inode)
 			return ERR_PTR(-ENOMEM);
-		d_set_d_op(dentry, &proc_misc_dentry_ops);
+		d_set_d_op(dentry, de->proc_dops);
 		return d_splice_alias(inode, dentry);
 	}
 	read_unlock(&proc_subdir_lock);
@@ -429,6 +430,8 @@
 	INIT_LIST_HEAD(&ent->pde_openers);
 	proc_set_user(ent, (*parent)->uid, (*parent)->gid);
 
+	ent->proc_dops = &proc_misc_dentry_ops;
+
 out:
 	return ent;
 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fc5306a..dbe43a5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,7 +24,6 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
-#include <linux/magic.h>
 
 #include <linux/uaccess.h>
 
@@ -59,7 +58,6 @@
 static struct inode *proc_alloc_inode(struct super_block *sb)
 {
 	struct proc_inode *ei;
-	struct inode *inode;
 
 	ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
 	if (!ei)
@@ -71,21 +69,14 @@
 	ei->sysctl = NULL;
 	ei->sysctl_entry = NULL;
 	ei->ns_ops = NULL;
-	inode = &ei->vfs_inode;
-	return inode;
+	return &ei->vfs_inode;
 }
 
-static void proc_i_callback(struct rcu_head *head)
+static void proc_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
-static void proc_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, proc_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
@@ -124,13 +115,12 @@
 	return 0;
 }
 
-static const struct super_operations proc_sops = {
+const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
-	.destroy_inode	= proc_destroy_inode,
+	.free_inode	= proc_free_inode,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
-	.remount_fs	= proc_remount,
 	.show_options	= proc_show_options,
 };
 
@@ -210,7 +200,8 @@
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	loff_t rv = -EINVAL;
 	if (use_pde(pde)) {
-		loff_t (*llseek)(struct file *, loff_t, int);
+		typeof_member(struct file_operations, llseek) llseek;
+
 		llseek = pde->proc_fops->llseek;
 		if (!llseek)
 			llseek = default_llseek;
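
The local function-pointer declarations in these wrappers are rewritten with typeof_member(), so each variable's type is derived from struct file_operations itself rather than re-spelled by hand. The macro, as defined in include/linux/kernel.h, and its effect:

/* The type of member m of struct type T, computed without an instance. */
#define typeof_member(T, m)	typeof(((T *)0)->m)

/* So instead of re-spelling the signature by hand:
 *	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
 * the declaration tracks struct file_operations automatically:
 */
typeof_member(struct file_operations, read) read;
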
@@ -222,10 +213,11 @@
 
 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
-	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	ssize_t rv = -EIO;
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, read) read;
+
 		read = pde->proc_fops->read;
 		if (read)
 			rv = read(file, buf, count, ppos);
@@ -236,10 +228,11 @@
 
 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
-	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	ssize_t rv = -EIO;
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, write) write;
+
 		write = pde->proc_fops->write;
 		if (write)
 			rv = write(file, buf, count, ppos);
@@ -252,8 +245,9 @@
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	__poll_t rv = DEFAULT_POLLMASK;
-	__poll_t (*poll)(struct file *, struct poll_table_struct *);
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, poll) poll;
+
 		poll = pde->proc_fops->poll;
 		if (poll)
 			rv = poll(file, pts);
@@ -266,8 +260,9 @@
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
-	long (*ioctl)(struct file *, unsigned int, unsigned long);
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, unlocked_ioctl) ioctl;
+
 		ioctl = pde->proc_fops->unlocked_ioctl;
 		if (ioctl)
 			rv = ioctl(file, cmd, arg);
@@ -281,8 +276,9 @@
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
-	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, compat_ioctl) compat_ioctl;
+
 		compat_ioctl = pde->proc_fops->compat_ioctl;
 		if (compat_ioctl)
 			rv = compat_ioctl(file, cmd, arg);
@@ -296,8 +292,9 @@
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	int rv = -EIO;
-	int (*mmap)(struct file *, struct vm_area_struct *);
 	if (use_pde(pde)) {
+		typeof_member(struct file_operations, mmap) mmap;
+
 		mmap = pde->proc_fops->mmap;
 		if (mmap)
 			rv = mmap(file, vma);
@@ -315,7 +312,7 @@
 	unsigned long rv = -EIO;
 
 	if (use_pde(pde)) {
-		typeof(proc_reg_get_unmapped_area) *get_area;
+		typeof_member(struct file_operations, get_unmapped_area) get_area;
 
 		get_area = pde->proc_fops->get_unmapped_area;
 #ifdef CONFIG_MMU
@@ -336,8 +333,8 @@
 {
 	struct proc_dir_entry *pde = PDE(inode);
 	int rv = 0;
-	int (*open)(struct inode *, struct file *);
-	int (*release)(struct inode *, struct file *);
+	typeof_member(struct file_operations, open) open;
+	typeof_member(struct file_operations, release) release;
 	struct pde_opener *pdeo;
 
 	/*
@@ -490,48 +487,3 @@
 	       pde_put(de);
 	return inode;
 }
-
-int proc_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
-	struct inode *root_inode;
-	int ret;
-
-	if (!proc_parse_options(data, ns))
-		return -EINVAL;
-
-	/* User space would break if executables or devices appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
-	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
-	s->s_magic = PROC_SUPER_MAGIC;
-	s->s_op = &proc_sops;
-	s->s_time_gran = 1;
-
-	/*
-	 * procfs isn't actually a stacking filesystem; however, there is
-	 * too much magic going on inside it to permit stacking things on
-	 * top of it
-	 */
-	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-	
-	pde_get(&proc_root);
-	root_inode = proc_get_inode(s, &proc_root);
-	if (!root_inode) {
-		pr_err("proc_fill_super: get root inode failed\n");
-		return -ENOMEM;
-	}
-
-	s->s_root = d_make_root(root_inode);
-	if (!s->s_root) {
-		pr_err("proc_fill_super: allocate dentry failed\n");
-		return -ENOMEM;
-	}
-
-	ret = proc_setup_self(s);
-	if (ret) {
-		return ret;
-	}
-	return proc_setup_thread_self(s);
-}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f..cd0c8d5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* Internal procfs definitions
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/proc_fs.h>
@@ -44,6 +40,7 @@
 	struct completion *pde_unload_completion;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
+	const struct dentry_operations *proc_dops;
 	union {
 		const struct seq_operations *seq_ops;
 		int (*single_show)(struct seq_file *, void *);
@@ -81,6 +78,7 @@
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,
 		struct task_struct *task);
+	const char *lsm;
 };
 
 struct proc_inode {
@@ -161,7 +159,7 @@
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
 
 /* Lookups */
@@ -205,13 +203,12 @@
 	struct completion *c;
 } __randomize_layout;
 extern const struct inode_operations proc_link_inode_operations;
-
 extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
 
 void proc_init_kmemcache(void);
 void set_proc_pid_nlink(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
 extern void proc_entry_rundown(struct proc_dir_entry *);
 
 /*
@@ -269,10 +266,8 @@
  * root.c
  */
 extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
 
 extern void proc_self_init(void);
-extern int proc_remount(struct super_block *, int *, char *);
 
 /*
  * task_[no]mmu.c
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d297fe4..e2ed8e0 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -22,7 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
@@ -31,6 +31,7 @@
 #include <linux/ioport.h>
 #include <linux/memory.h>
 #include <linux/sched/task.h>
+#include <linux/security.h>
 #include <asm/sections.h>
 #include "internal.h"
 
@@ -54,6 +55,28 @@
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+	if (mem_pfn_is_ram)
+		return -EBUSY;
+	mem_pfn_is_ram = fn;
+	return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+	if (mem_pfn_is_ram)
+		return mem_pfn_is_ram(pfn);
+	else
+		return 1;
+}
+
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
 		       int type)
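
register_mem_pfn_is_ram() lets a single backend veto /proc/kcore reads of pfns it knows are not backed by RAM (the read path zero-fills them instead); with no callback registered, every pfn is treated as RAM. A hedged sketch of a hypothetical backend registering at init time (my_backend_pfn_is_backed() is an invented helper):

static int my_pfn_is_ram(unsigned long pfn)
{
	/* my_backend_pfn_is_backed() stands in for the backend's real check. */
	return my_backend_pfn_is_backed(pfn) ? 1 : 0;
}

static int __init my_backend_init(void)
{
	/* Fails with -EBUSY if another backend registered first. */
	return register_mem_pfn_is_ram(my_pfn_is_ram);
}
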
@@ -465,6 +488,11 @@
 				goto out;
 			}
 			m = NULL;	/* skip the list anchor */
+		} else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+			if (clear_user(buffer, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_VMALLOC) {
 			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
@@ -518,9 +546,14 @@
 
 static int open_kcore(struct inode *inode, struct file *filp)
 {
+	int ret = security_locked_down(LOCKDOWN_KCORE);
+
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
+	if (ret)
+		return ret;
+
 	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!filp->private_data)
 		return -ENOMEM;
@@ -588,7 +621,7 @@
 /*
  * MODULES_VADDR has no intersection with VMALLOC_ADDR.
  */
-struct kcore_list kcore_modules;
+static struct kcore_list kcore_modules;
 static void __init add_modules_range(void)
 {
 	if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index d066947..8468bae 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -10,9 +10,6 @@
 #include <linux/seqlock.h>
 #include <linux/time.h>
 
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index edda898..8c1f1bb 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -8,7 +8,6 @@
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
 #include <linux/percpu.h>
-#include <linux/quicklist.h>
 #include <linux/seq_file.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
@@ -38,6 +37,7 @@
 	long cached;
 	long available;
 	unsigned long pages[NR_LRU_LISTS];
+	unsigned long sreclaimable, sunreclaim;
 	int lru;
 
 	si_meminfo(&i);
@@ -53,6 +53,8 @@
 		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	available = si_mem_available();
+	sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
+	sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
 
 	show_val_kb(m, "MemTotal:       ", i.totalram);
 	show_val_kb(m, "MemFree:        ", i.freeram);
@@ -94,21 +96,15 @@
 	show_val_kb(m, "Mapped:         ",
 		    global_node_page_state(NR_FILE_MAPPED));
 	show_val_kb(m, "Shmem:          ", i.sharedram);
-	show_val_kb(m, "Slab:           ",
-		    global_node_page_state(NR_SLAB_RECLAIMABLE) +
-		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
-
-	show_val_kb(m, "SReclaimable:   ",
-		    global_node_page_state(NR_SLAB_RECLAIMABLE));
-	show_val_kb(m, "SUnreclaim:     ",
-		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
+	show_val_kb(m, "KReclaimable:   ", sreclaimable +
+		    global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+	show_val_kb(m, "Slab:           ", sreclaimable + sunreclaim);
+	show_val_kb(m, "SReclaimable:   ", sreclaimable);
+	show_val_kb(m, "SUnreclaim:     ", sunreclaim);
 	seq_printf(m, "KernelStack:    %8lu kB\n",
 		   global_zone_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables:     ",
 		    global_zone_page_state(NR_PAGETABLE));
-#ifdef CONFIG_QUICKLIST
-	show_val_kb(m, "Quicklists:     ", quicklist_total_size());
-#endif
 
 	show_val_kb(m, "NFS_Unstable:   ",
 		    global_node_page_state(NR_UNSTABLE_NFS));
@@ -120,7 +116,7 @@
 	show_val_kb(m, "Committed_AS:   ", committed);
 	seq_printf(m, "VmallocTotal:   %8lu kB\n",
 		   (unsigned long)VMALLOC_TOTAL >> 10);
-	show_val_kb(m, "VmallocUsed:    ", 0ul);
+	show_val_kb(m, "VmallocUsed:    ", vmalloc_nr_pages());
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
 	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
@@ -136,6 +132,10 @@
 		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+	show_val_kb(m, "FileHugePages:  ",
+		    global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+	show_val_kb(m, "FilePmdMapped:  ",
+		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
 
 #ifdef CONFIG_CMA
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3b63be6..14c2bad 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* nommu.c: mmu-less memory info files
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/init.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a..7c952ee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -42,11 +42,13 @@
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
-		if (!ppage || PageSlab(ppage))
+		/*
+		 * TODO: ZONE_DEVICE support requires identifying
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
+
+		if (!ppage || PageSlab(ppage) || page_has_type(ppage))
 			pcount = 0;
 		else
 			pcount = page_mapcount(ppage);
@@ -152,8 +154,8 @@
 	else if (page_count(page) == 0 && is_free_buddy_page(page))
 		u |= 1 << KPF_BUDDY;
 
-	if (PageBalloon(page))
-		u |= 1 << KPF_BALLOON;
+	if (PageOffline(page))
+		u |= 1 << KPF_OFFLINE;
 	if (PageTable(page))
 		u |= 1 << KPF_PGTABLE;
 
@@ -216,10 +218,11 @@
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
+		/*
+		 * TODO: ZONE_DEVICE support requires identifying
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
 
 		if (put_user(stable_page_flags(ppage), out)) {
 			ret = -EFAULT;
@@ -261,10 +264,11 @@
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
+		/*
+		 * TODO: ZONE_DEVICE support requires identifying
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
 
 		if (ppage)
 			ino = page_cgroup_ino(ppage);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d5e0fcb..76ae278 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/proc/net.c
  *
@@ -38,6 +39,22 @@
 	return maybe_get_net(PDE_NET(PDE(inode)));
 }
 
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return 0;
+}
+
+static const struct dentry_operations proc_net_dentry_ops = {
+	.d_revalidate	= proc_net_d_revalidate,
+	.d_delete	= always_delete_dentry,
+};
+
+static void pde_force_lookup(struct proc_dir_entry *pde)
+{
+	/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+	pde->proc_dops = &proc_net_dentry_ops;
+}
+
 static int seq_open_net(struct inode *inode, struct file *file)
 {
 	unsigned int state_size = PDE(inode)->state_size;
@@ -90,6 +107,7 @@
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_seq_fops;
 	p->seq_ops = ops;
 	p->state_size = state_size;
@@ -133,6 +151,7 @@
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_seq_fops;
 	p->seq_ops = ops;
 	p->state_size = state_size;
@@ -181,6 +200,7 @@
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_single_fops;
 	p->single_show = show;
 	return proc_register(parent, p);
@@ -223,6 +243,7 @@
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_single_fops;
 	p->single_show = show;
 	p->write = write;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 4d598a3..d80989b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/bpf-cgroup.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -21,6 +22,10 @@
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+/* shared constants to be used in various sysctls */
+const int sysctl_vals[] = { 0, 1, INT_MAX };
+EXPORT_SYMBOL(sysctl_vals);
+
 /* Support for permanently empty directories */
 
 struct ctl_table sysctl_mount_point[] = {
@@ -498,6 +503,10 @@
 
 	if (root->set_ownership)
 		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+	else {
+		inode->i_uid = GLOBAL_ROOT_UID;
+		inode->i_gid = GLOBAL_ROOT_GID;
+	}
 
 	return inode;
 }
@@ -569,8 +578,8 @@
 	struct inode *inode = file_inode(filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	void *new_buf = NULL;
 	ssize_t error;
-	size_t res;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -588,11 +597,27 @@
 	if (!table->proc_handler)
 		goto out;
 
+	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
+					   ppos, &new_buf);
+	if (error)
+		goto out;
+
 	/* careful: calling conventions are nasty here */
-	res = count;
-	error = table->proc_handler(table, write, buf, &res, ppos);
+	if (new_buf) {
+		mm_segment_t old_fs;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		error = table->proc_handler(table, write, (void __user *)new_buf,
+					    &count, ppos);
+		set_fs(old_fs);
+		kfree(new_buf);
+	} else {
+		error = table->proc_handler(table, write, buf, &count, ppos);
+	}
+
 	if (!error)
-		error = res;
+		error = count;
 out:
 	sysctl_head_finish(head);
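
BPF_CGROUP_RUN_PROG_SYSCTL() gives BPF_PROG_TYPE_CGROUP_SYSCTL programs attached to the cgroup a chance to observe, rewrite, or reject the access; when a program substitutes new_buf, the handler runs under KERNEL_DS because the replacement buffer lives in kernel memory. A minimal sketch of such a program (section name and header as in the kernel's libbpf; returning 0 makes the access fail with -EPERM):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/sysctl")
int sysctl_read_only(struct bpf_sysctl *ctx)
{
	/* 0 => reject (caller sees -EPERM), 1 => allow. */
	return ctx->write ? 0 : 1;
}

char _license[] SEC("license") = "GPL";
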
 
@@ -1626,8 +1651,11 @@
 	if (--header->nreg)
 		return;
 
-	put_links(header);
-	start_unregistering(header);
+	if (parent) {
+		put_links(header);
+		start_unregistering(header);
+	}
+
 	if (!--header->count)
 		kfree_rcu(header, rcu);
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d..0b7c8df 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -19,86 +19,176 @@
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_context.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
 #include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 
-enum {
-	Opt_gid, Opt_hidepid, Opt_err,
+struct proc_fs_context {
+	struct pid_namespace	*pid_ns;
+	unsigned int		mask;
+	int			hidepid;
+	int			gid;
 };
 
-static const match_table_t tokens = {
-	{Opt_hidepid, "hidepid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_err, NULL},
+enum proc_param {
+	Opt_gid,
+	Opt_hidepid,
 };
 
-int proc_parse_options(char *options, struct pid_namespace *pid)
+static const struct fs_parameter_spec proc_param_specs[] = {
+	fsparam_u32("gid",	Opt_gid),
+	fsparam_u32("hidepid",	Opt_hidepid),
+	{}
+};
+
+static const struct fs_parameter_description proc_fs_parameters = {
+	.name		= "proc",
+	.specs		= proc_param_specs,
+};
+
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
+	struct proc_fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
 
-	if (!options)
-		return 1;
+	opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
+	switch (opt) {
+	case Opt_gid:
+		ctx->gid = result.uint_32;
+		break;
 
-		args[0].to = args[0].from = NULL;
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_gid:
-			if (match_int(&args[0], &option))
-				return 0;
-			pid->pid_gid = make_kgid(current_user_ns(), option);
-			break;
-		case Opt_hidepid:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < HIDEPID_OFF ||
-			    option > HIDEPID_INVISIBLE) {
-				pr_err("proc: hidepid value must be between 0 and 2.\n");
-				return 0;
-			}
-			pid->hide_pid = option;
-			break;
-		default:
-			pr_err("proc: unrecognized mount option \"%s\" "
-			       "or missing value\n", p);
-			return 0;
-		}
+	case Opt_hidepid:
+		ctx->hidepid = result.uint_32;
+		if (ctx->hidepid < HIDEPID_OFF ||
+		    ctx->hidepid > HIDEPID_INVISIBLE)
+			return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+		break;
+
+	default:
+		return -EINVAL;
 	}
 
-	return 1;
+	ctx->mask |= 1 << opt;
+	return 0;
 }
 
-int proc_remount(struct super_block *sb, int *flags, char *data)
+static void proc_apply_options(struct super_block *s,
+			       struct fs_context *fc,
+			       struct pid_namespace *pid_ns,
+			       struct user_namespace *user_ns)
 {
+	struct proc_fs_context *ctx = fc->fs_private;
+
+	if (ctx->mask & (1 << Opt_gid))
+		pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+	if (ctx->mask & (1 << Opt_hidepid))
+		pid_ns->hide_pid = ctx->hidepid;
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+	struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+	struct inode *root_inode;
+	int ret;
+
+	proc_apply_options(s, fc, pid_ns, current_user_ns());
+
+	/* User space would break if executables or devices appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = PROC_SUPER_MAGIC;
+	s->s_op = &proc_sops;
+	s->s_time_gran = 1;
+
+	/*
+	 * procfs isn't actually a stacking filesystem; however, there is
+	 * too much magic going on inside it to permit stacking things on
+	 * top of it
+	 */
+	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+	
+	/* procfs dentries and inodes don't require IO to create */
+	s->s_shrink.seeks = 0;
+
+	pde_get(&proc_root);
+	root_inode = proc_get_inode(s, &proc_root);
+	if (!root_inode) {
+		pr_err("proc_fill_super: get root inode failed\n");
+		return -ENOMEM;
+	}
+
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root) {
+		pr_err("proc_fill_super: allocate dentry failed\n");
+		return -ENOMEM;
+	}
+
+	ret = proc_setup_self(s);
+	if (ret) {
+		return ret;
+	}
+	return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
 	struct pid_namespace *pid = sb->s_fs_info;
 
 	sync_filesystem(sb);
-	return !proc_parse_options(data, pid);
+
+	proc_apply_options(sb, fc, pid, current_user_ns());
+	return 0;
 }
 
-static struct dentry *proc_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int proc_get_tree(struct fs_context *fc)
 {
-	struct pid_namespace *ns;
+	struct proc_fs_context *ctx = fc->fs_private;
 
-	if (flags & SB_KERNMOUNT) {
-		ns = data;
-		data = NULL;
-	} else {
-		ns = task_active_pid_ns(current);
-	}
+	return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns);
+}
 
-	return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+static void proc_fs_context_free(struct fs_context *fc)
+{
+	struct proc_fs_context *ctx = fc->fs_private;
+
+	put_pid_ns(ctx->pid_ns);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+	.free		= proc_fs_context_free,
+	.parse_param	= proc_parse_param,
+	.get_tree	= proc_get_tree,
+	.reconfigure	= proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+	struct proc_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+	put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+	fc->fs_private = ctx;
+	fc->ops = &proc_fs_context_ops;
+	return 0;
 }
 
 static void proc_kill_sb(struct super_block *sb)
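
The conversion to the fs_context API moves option parsing into proc_parse_param() with a typed parameter table and makes first mount and remount share proc_apply_options(). The same machinery is what the new mount syscalls drive from userspace; a hedged sketch using raw syscall numbers (fsopen/fsconfig/fsmount, Linux 5.2+; the FSCONFIG_* values are written out numerically since older libcs lack the definitions):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "proc", 0);
	if (fsfd < 0)
		return perror("fsopen"), 1;
	/* 1 = FSCONFIG_SET_STRING, 6 = FSCONFIG_CMD_CREATE */
	syscall(SYS_fsconfig, fsfd, 1, "hidepid", "2", 0);
	syscall(SYS_fsconfig, fsfd, 6, NULL, NULL, 0);
	mfd = syscall(SYS_fsmount, fsfd, 0, 0);
	if (mfd < 0)
		return perror("fsmount"), 1;
	/* mfd can now be attached to the tree with move_mount(2). */
	close(mfd);
	close(fsfd);
	return 0;
}
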
@@ -115,10 +205,11 @@
 }
 
 static struct file_system_type proc_fs_type = {
-	.name		= "proc",
-	.mount		= proc_mount,
-	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.name			= "proc",
+	.init_fs_context	= proc_init_fs_context,
+	.parameters		= &proc_fs_parameters,
+	.kill_sb		= proc_kill_sb,
+	.fs_flags		= FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
 };
 
 void __init proc_root_init(void)
@@ -154,9 +245,9 @@
 
 static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
 {
-	if (!proc_pid_lookup(dir, dentry, flags))
+	if (!proc_pid_lookup(dentry, flags))
 		return NULL;
-	
+
 	return proc_lookup(dir, dentry, flags);
 }
 
@@ -209,9 +300,28 @@
 
 int pid_ns_prepare_proc(struct pid_namespace *ns)
 {
+	struct proc_fs_context *ctx;
+	struct fs_context *fc;
 	struct vfsmount *mnt;
 
-	mnt = kern_mount_data(&proc_fs_type, ns);
+	fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	if (fc->user_ns != ns->user_ns) {
+		put_user_ns(fc->user_ns);
+		fc->user_ns = get_user_ns(ns->user_ns);
+	}
+
+	ctx = fc->fs_private;
+	if (ctx->pid_ns != ns) {
+		put_pid_ns(ctx->pid_ns);
+		get_pid_ns(ns);
+		ctx->pid_ns = ns;
+	}
+
+	mnt = fc_mount(fc);
+	put_fs_context(fc);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e..57c0a10 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@
 	struct inode *root_inode = d_inode(s->s_root);
 	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *self;
+	int ret = -ENOMEM;
 	
 	inode_lock(root_inode);
 	self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_self_inode_operations;
 			d_add(self, inode);
+			ret = 0;
 		} else {
 			dput(self);
-			self = ERR_PTR(-ENOMEM);
 		}
-	} else {
-		self = ERR_PTR(-ENOMEM);
 	}
 	inode_unlock(root_inode);
-	if (IS_ERR(self)) {
+
+	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/self\n");
-		return PTR_ERR(self);
-	}
-	ns->proc_self = self;
-	return 0;
+	else
+		ns->proc_self = self;
+
+	return ret;
 }
 
 void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7..80c305f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
 
 #ifdef arch_idle_time
 
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 idle;
 
-	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+	idle = kcs->cpustat[CPUTIME_IDLE];
 	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
 		idle += arch_idle_time(cpu);
 	return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 iowait;
 
-	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+	iowait = kcs->cpustat[CPUTIME_IOWAIT];
 	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
 		iowait += arch_idle_time(cpu);
 	return iowait;
@@ -45,7 +45,7 @@
 
 #else
 
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 idle, idle_usecs = -1ULL;
 
@@ -54,14 +54,14 @@
 
 	if (idle_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
-		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+		idle = kcs->cpustat[CPUTIME_IDLE];
 	else
 		idle = idle_usecs * NSEC_PER_USEC;
 
 	return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 {
 	u64 iowait, iowait_usecs = -1ULL;
 
@@ -70,7 +70,7 @@
 
 	if (iowait_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
-		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+		iowait = kcs->cpustat[CPUTIME_IOWAIT];
 	else
 		iowait = iowait_usecs * NSEC_PER_USEC;
 
@@ -79,6 +79,31 @@
 
 #endif
 
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+	static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+	while (gap > 0) {
+		unsigned int inc;
+
+		inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+		seq_write(p, zeros, 2 * inc);
+		gap -= inc;
+	}
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+	unsigned int i, next = 0;
+
+	for_each_active_irq(i) {
+		show_irq_gap(p, i - next);
+		seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+		next = i + 1;
+	}
+	show_irq_gap(p, nr_irqs - next);
+}
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
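
show_all_irqs() formats counts only for interrupts that exist and back-fills the (potentially huge) gaps in the sparse IRQ space from a preformatted " 0" string, sixteen entries per seq_write(). The trick in miniature, as a standalone program:

#include <stdio.h>

static void show_gap(unsigned int gap)
{
	static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";

	while (gap > 0) {
		unsigned int inc = gap < 16 ? gap : 16;

		fwrite(zeros, 1, 2 * inc, stdout);
		gap -= inc;
	}
}

int main(void)
{
	const unsigned int active[] = { 0, 1, 9, 12 };	/* sparse "IRQ" numbers */
	unsigned int i, next = 0;

	for (i = 0; i < sizeof(active) / sizeof(active[0]); i++) {
		show_gap(active[i] - next);
		printf(" %u", 100 + active[i]);	/* stand-in for kstat_irqs_usr() */
		next = active[i] + 1;
	}
	show_gap(16 - next);	/* pad out to a notional nr_irqs of 16 */
	putchar('\n');
	return 0;
}
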
@@ -95,16 +120,18 @@
 	getboottime64(&boottime);
 
 	for_each_possible_cpu(i) {
-		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
-		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-		idle += get_idle_time(i);
-		iowait += get_iowait_time(i);
-		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+		user += kcs->cpustat[CPUTIME_USER];
+		nice += kcs->cpustat[CPUTIME_NICE];
+		system += kcs->cpustat[CPUTIME_SYSTEM];
+		idle += get_idle_time(kcs, i);
+		iowait += get_iowait_time(kcs, i);
+		irq += kcs->cpustat[CPUTIME_IRQ];
+		softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+		steal += kcs->cpustat[CPUTIME_STEAL];
+		guest += kcs->cpustat[CPUTIME_GUEST];
+		guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
 		sum += kstat_cpu_irqs_sum(i);
 		sum += arch_irq_stat_cpu(i);
 
@@ -130,17 +157,19 @@
 	seq_putc(p, '\n');
 
 	for_each_online_cpu(i) {
+		struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
-		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-		idle = get_idle_time(i);
-		iowait = get_iowait_time(i);
-		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		user = kcs->cpustat[CPUTIME_USER];
+		nice = kcs->cpustat[CPUTIME_NICE];
+		system = kcs->cpustat[CPUTIME_SYSTEM];
+		idle = get_idle_time(kcs, i);
+		iowait = get_iowait_time(kcs, i);
+		irq = kcs->cpustat[CPUTIME_IRQ];
+		softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+		steal = kcs->cpustat[CPUTIME_STEAL];
+		guest = kcs->cpustat[CPUTIME_GUEST];
+		guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
 		seq_printf(p, "cpu%d", i);
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -156,9 +185,7 @@
 	}
 	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
 
-	/* sum again ? it could be updated? */
-	for_each_irq_nr(j)
-		seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+	show_all_irqs(p);
 
 	seq_printf(p,
 		"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a027473..9442631 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
@@ -59,7 +59,7 @@
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
-	SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
 	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
 	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
 	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -166,7 +166,11 @@
 	if (!mm || !mmget_not_zero(mm))
 		return NULL;
 
-	down_read(&mm->mmap_sem);
+	if (down_read_killable(&mm->mmap_sem)) {
+		mmput(mm);
+		return ERR_PTR(-EINTR);
+	}
+
 	hold_task_mempolicy(priv);
 	priv->tail_vma = get_gate_vma(mm);
 
@@ -413,21 +417,58 @@
 	unsigned long lazyfree;
 	unsigned long anonymous_thp;
 	unsigned long shmem_thp;
+	unsigned long file_thp;
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
 	u64 pss;
+	u64 pss_anon;
+	u64 pss_file;
+	u64 pss_shmem;
 	u64 pss_locked;
 	u64 swap_pss;
 	bool check_shmem_swap;
 };
 
-static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty)
+static void smaps_page_accumulate(struct mem_size_stats *mss,
+		struct page *page, unsigned long size, unsigned long pss,
+		bool dirty, bool locked, bool private)
 {
-	int i, nr = compound ? 1 << compound_order(page) : 1;
+	mss->pss += pss;
+
+	if (PageAnon(page))
+		mss->pss_anon += pss;
+	else if (PageSwapBacked(page))
+		mss->pss_shmem += pss;
+	else
+		mss->pss_file += pss;
+
+	if (locked)
+		mss->pss_locked += pss;
+
+	if (dirty || PageDirty(page)) {
+		if (private)
+			mss->private_dirty += size;
+		else
+			mss->shared_dirty += size;
+	} else {
+		if (private)
+			mss->private_clean += size;
+		else
+			mss->shared_clean += size;
+	}
+}
+
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+		bool compound, bool young, bool dirty, bool locked)
+{
+	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
 
+	/*
+	 * First accumulate quantities that depend only on |size| and the type
+	 * of the compound page.
+	 */
 	if (PageAnon(page)) {
 		mss->anonymous += size;
 		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
@@ -440,35 +481,25 @@
 		mss->referenced += size;
 
 	/*
+	 * Then accumulate quantities that may depend on sharing, or that may
+	 * differ page-by-page.
+	 *
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
 	 */
 	if (page_count(page) == 1) {
-		if (dirty || PageDirty(page))
-			mss->private_dirty += size;
-		else
-			mss->private_clean += size;
-		mss->pss += (u64)size << PSS_SHIFT;
+		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
+			locked, true);
 		return;
 	}
-
 	for (i = 0; i < nr; i++, page++) {
 		int mapcount = page_mapcount(page);
-
-		if (mapcount >= 2) {
-			if (dirty || PageDirty(page))
-				mss->shared_dirty += PAGE_SIZE;
-			else
-				mss->shared_clean += PAGE_SIZE;
-			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
-		} else {
-			if (dirty || PageDirty(page))
-				mss->private_dirty += PAGE_SIZE;
-			else
-				mss->private_clean += PAGE_SIZE;
-			mss->pss += PAGE_SIZE << PSS_SHIFT;
-		}
+		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+		if (mapcount >= 2)
+			pss /= mapcount;
+		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
+				      mapcount < 2);
 	}
 }
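
PSS stays in fixed point throughout: each page contributes (PAGE_SIZE << PSS_SHIFT) / mapcount, and the total is shifted back down when printed, so shared pages are charged fractionally without floating point. With PSS_SHIFT = 12, a 4 KiB page mapped three times charges 5592405 units per mapping. In miniature:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define PSS_SHIFT	12	/* as in fs/proc/task_mmu.c */

int main(void)
{
	uint64_t pss = 0;
	int mapcount = 3, i;

	for (i = 0; i < mapcount; i++)
		pss += ((uint64_t)PAGE_SIZE << PSS_SHIFT) / mapcount;

	/* 3 * 5592405 = 16777215, one unit shy of 4096 << 12: prints 4095. */
	printf("pss = %llu bytes\n", (unsigned long long)(pss >> PSS_SHIFT));
	return 0;
}
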
 
@@ -483,13 +514,16 @@
 
 	return 0;
 }
-#endif
+#else
+#define smaps_pte_hole		NULL
+#endif /* CONFIG_SHMEM */
 
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
 
 	if (pte_present(*pte)) {
@@ -521,7 +555,7 @@
 		if (!page)
 			return;
 
-		if (radix_tree_exceptional_entry(page))
+		if (xa_is_value(page))
 			mss->swap += PAGE_SIZE;
 		else
 			put_page(page);
@@ -532,7 +566,7 @@
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -541,6 +575,7 @@
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page;
 
 	/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -554,8 +589,8 @@
 	else if (is_zone_device_page(page))
 		/* pass */;
 	else
-		VM_BUG_ON_PAGE(1, page);
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+		mss->file_thp += HPAGE_PMD_SIZE;
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -697,21 +732,24 @@
 	}
 	return 0;
 }
+#else
+#define smaps_hugetlb_range	NULL
 #endif /* HUGETLB_PAGE */
 
+static const struct mm_walk_ops smaps_walk_ops = {
+	.pmd_entry		= smaps_pte_range,
+	.hugetlb_entry		= smaps_hugetlb_range,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+	.pmd_entry		= smaps_pte_range,
+	.hugetlb_entry		= smaps_hugetlb_range,
+	.pte_hole		= smaps_pte_hole,
+};
+
 static void smap_gather_stats(struct vm_area_struct *vma,
 			     struct mem_size_stats *mss)
 {
-	struct mm_walk smaps_walk = {
-		.pmd_entry = smaps_pte_range,
-#ifdef CONFIG_HUGETLB_PAGE
-		.hugetlb_entry = smaps_hugetlb_range,
-#endif
-		.mm = vma->vm_mm,
-	};
-
-	smaps_walk.private = mss;
-
 #ifdef CONFIG_SHMEM
 	/* In case of smaps_rollup, reset the value from previous vma */
 	mss->check_shmem_swap = false;
@@ -733,25 +771,36 @@
 			mss->swap += shmem_swapped;
 		} else {
 			mss->check_shmem_swap = true;
-			smaps_walk.pte_hole = smaps_pte_hole;
+			walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
+			return;
 		}
 	}
 #endif
-
 	/* mmap_sem is held in m_start */
-	walk_page_vma(vma, &smaps_walk);
-	if (vma->vm_flags & VM_LOCKED)
-		mss->pss_locked += mss->pss;
+	walk_page_vma(vma, &smaps_walk_ops, mss);
 }
 
 #define SEQ_PUT_DEC(str, val) \
 		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
 
 /* Show the contents common for smaps and smaps_rollup */
-static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+	bool rollup_mode)
 {
 	SEQ_PUT_DEC("Rss:            ", mss->resident);
 	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
+	if (rollup_mode) {
+		/*
+		 * These are meaningful only for smaps_rollup, otherwise two of
+		 * them are zero, and the other one is the same as Pss.
+		 */
+		SEQ_PUT_DEC(" kB\nPss_Anon:       ",
+			mss->pss_anon >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_File:       ",
+			mss->pss_file >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_Shmem:      ",
+			mss->pss_shmem >> PSS_SHIFT);
+	}
 	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
 	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
 	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
@@ -761,6 +810,7 @@
 	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
 	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
 	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
 	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
 				  mss->private_hugetlb >> 10, 7);
@@ -788,7 +838,10 @@
 	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
 	seq_puts(m, " kB\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, false);
+
+	seq_printf(m, "THPeligible:		%d\n",
+		   transparent_hugepage_enabled(vma));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
@@ -820,7 +873,10 @@
 
 	memset(&mss, 0, sizeof(mss));
 
-	down_read(&mm->mmap_sem);
+	ret = down_read_killable(&mm->mmap_sem);
+	if (ret)
+		goto out_put_mm;
+
 	hold_task_mempolicy(priv);
 
 	for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
@@ -833,12 +889,13 @@
 	seq_pad(m, ' ');
 	seq_puts(m, "[rollup]\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, true);
 
 	release_task_mempolicy(priv);
 	up_read(&mm->mmap_sem);
-	mmput(mm);
 
+out_put_mm:
+	mmput(mm);
 out_put_task:
 	put_task_struct(priv->task);
 	priv->task = NULL;
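
This hunk, clear_refs_write() below, and the pagemap read path all make the same substitution: down_read() becomes down_read_killable(), so a reader stuck behind a long mmap_sem hold can still be killed. The shape of the conversion, as a minimal sketch with a hypothetical caller:

	static int inspect_mm(struct mm_struct *mm)
	{
		int ret;

		/* Returns -EINTR if a fatal signal arrives while sleeping. */
		ret = down_read_killable(&mm->mmap_sem);
		if (ret)
			return ret;	/* propagate instead of blocking forever */

		/* ... walk or inspect VMAs here ... */

		up_read(&mm->mmap_sem);
		return 0;
	}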
@@ -940,10 +997,12 @@
 	pte_t ptent = *pte;
 
 	if (pte_present(ptent)) {
-		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
-		ptent = pte_wrprotect(ptent);
+		pte_t old_pte;
+
+		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
-		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -1067,6 +1126,11 @@
 	return 0;
 }
 
+static const struct mm_walk_ops clear_refs_walk_ops = {
+	.pmd_entry		= clear_refs_pte_range,
+	.test_walk		= clear_refs_test_walk,
+};
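
The same conversion repeats for every walker in this file (smaps_walk_ops and smaps_shmem_walk_ops above, show_numa_ops below): the v5.4 pagewalk API replaces the on-stack struct mm_walk with a const, file-scope struct mm_walk_ops, passes mm and the private pointer as arguments of walk_page_range()/walk_page_vma(), and lets a config-dependent callback be stubbed out with a `#define ... NULL` instead of an #ifdef inside the initializer. A hypothetical walker against that API — the function and counter names are illustrative, not from this diff:

	#include <linux/pagewalk.h>

	struct pte_counter {
		unsigned long present;
	};

	static int count_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
	{
		struct pte_counter *pc = walk->private;

		if (pte_present(*pte))
			pc->present++;
		return 0;
	}

	static const struct mm_walk_ops count_walk_ops = {
		.pte_entry = count_pte_entry,
	};

	static unsigned long count_present_ptes(struct mm_struct *mm)
	{
		struct pte_counter pc = { 0 };

		down_read(&mm->mmap_sem);
		/* mm and private now travel as arguments, not struct fields. */
		walk_page_range(mm, 0, mm->highest_vm_end,
				&count_walk_ops, &pc);
		up_read(&mm->mmap_sem);
		return pc.present;
	}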
+
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -1096,15 +1160,10 @@
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct mmu_notifier_range range;
 		struct clear_refs_private cp = {
 			.type = type,
 		};
-		struct mm_walk clear_refs_walk = {
-			.pmd_entry = clear_refs_pte_range,
-			.test_walk = clear_refs_test_walk,
-			.mm = mm,
-			.private = &cp,
-		};
 
 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
 			if (down_write_killable(&mm->mmap_sem)) {
@@ -1121,7 +1180,10 @@
 			goto out_mm;
 		}
 
-		down_read(&mm->mmap_sem);
+		if (down_read_killable(&mm->mmap_sem)) {
+			count = -EINTR;
+			goto out_mm;
+		}
 		tlb_gather_mmu(&tlb, mm, 0, -1);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -1132,6 +1194,24 @@
 					count = -EINTR;
 					goto out_mm;
 				}
+				/*
+				 * Avoid modifying vma->vm_flags
+				 * without locked ops while the
+				 * coredump reads the vm_flags.
+				 */
+				if (!mmget_still_valid(mm)) {
+					/*
+					 * Silently return "count"
+					 * as if get_task_mm() had
+					 * failed. FIXME: should this
+					 * function also return
+					 * -ESRCH when get_task_mm()
+					 * fails, as it does when
+					 * get_proc_task() fails?
+					 */
+					up_write(&mm->mmap_sem);
+					goto out_mm;
+				}
 				for (vma = mm->mmap; vma; vma = vma->vm_next) {
 					vma->vm_flags &= ~VM_SOFTDIRTY;
 					vma_set_page_prot(vma);
@@ -1139,11 +1219,15 @@
 				downgrade_write(&mm->mmap_sem);
 				break;
 			}
-			mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+						0, NULL, mm, 0, -1UL);
+			mmu_notifier_invalidate_range_start(&range);
 		}
-		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
+		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+				&cp);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
-			mmu_notifier_invalidate_range_end(mm, 0, -1);
+			mmu_notifier_invalidate_range_end(&range);
 		tlb_finish_mmu(&tlb, 0, -1);
 		up_read(&mm->mmap_sem);
 out_mm:
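
The soft-dirty path above also adopts the v5.4 mmu_notifier convention: the event type and the (start, end) span are packed into an on-stack mmu_notifier_range by mmu_notifier_range_init(), and the same object is handed to both the start and end calls. Sketched in isolation, assuming the v5.4 signatures:

	#include <linux/mmu_notifier.h>

	static void wrprotect_whole_mm(struct mm_struct *mm)
	{
		struct mmu_notifier_range range;

		/* args: range, event, flags, vma (none here), mm, start, end */
		mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL,
					mm, 0, -1UL);
		mmu_notifier_invalidate_range_start(&range);

		/* ... clear hardware dirty/write bits over [start, end) ... */

		mmu_notifier_invalidate_range_end(&range);
	}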
@@ -1247,7 +1331,7 @@
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
 		flags |= PM_PRESENT;
-		page = _vm_normal_page(vma, addr, pte, true);
+		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
@@ -1413,8 +1497,16 @@
 
 	return err;
 }
+#else
+#define pagemap_hugetlb_range	NULL
 #endif /* HUGETLB_PAGE */
 
+static const struct mm_walk_ops pagemap_ops = {
+	.pmd_entry	= pagemap_pmd_range,
+	.pte_hole	= pagemap_pte_hole,
+	.hugetlb_entry	= pagemap_hugetlb_range,
+};
+
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
  *
@@ -1446,7 +1538,6 @@
 {
 	struct mm_struct *mm = file->private_data;
 	struct pagemapread pm;
-	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
@@ -1474,14 +1565,6 @@
 	if (!pm.buffer)
 		goto out_mm;
 
-	pagemap_walk.pmd_entry = pagemap_pmd_range;
-	pagemap_walk.pte_hole = pagemap_pte_hole;
-#ifdef CONFIG_HUGETLB_PAGE
-	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
-#endif
-	pagemap_walk.mm = mm;
-	pagemap_walk.private = &pm;
-
 	src = *ppos;
 	svpfn = src / PM_ENTRY_BYTES;
 	start_vaddr = svpfn << PAGE_SHIFT;
@@ -1507,8 +1590,10 @@
 		/* overflow ? */
 		if (end < start_vaddr || end > end_vaddr)
 			end = end_vaddr;
-		down_read(&mm->mmap_sem);
-		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		ret = down_read_killable(&mm->mmap_sem);
+		if (ret)
+			goto out_free;
+		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
 		up_read(&mm->mmap_sem);
 		start_vaddr = end;
 
@@ -1720,6 +1805,11 @@
 }
 #endif
 
+static const struct mm_walk_ops show_numa_ops = {
+	.hugetlb_entry = gather_hugetlb_stats,
+	.pmd_entry = gather_pte_stats,
+};
+
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
@@ -1731,12 +1821,6 @@
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct mm_walk walk = {
-		.hugetlb_entry = gather_hugetlb_stats,
-		.pmd_entry = gather_pte_stats,
-		.private = md,
-		.mm = mm,
-	};
 	struct mempolicy *pol;
 	char buffer[64];
 	int nid;
@@ -1770,7 +1854,7 @@
 		seq_puts(m, " huge");
 
 	/* mmap_sem is held by m_start */
-	walk_page_vma(vma, &walk);
+	walk_page_vma(vma, &show_numa_ops, md);
 
 	if (!md->pages)
 		goto out;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68..7907e64 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@
 	else
 		bytes += kobjsize(current->files);
 
-	if (current->sighand && atomic_read(&current->sighand->count) > 1)
+	if (current->sighand && refcount_read(&current->sighand->count) > 1)
 		sbytes += kobjsize(current->sighand);
 	else
 		bytes += kobjsize(current->sighand);
@@ -178,7 +178,7 @@
 		seq_file_path(m, file, "");
 	} else if (mm && is_stack(vma)) {
 		seq_pad(m, ' ');
-		seq_printf(m, "[stack]");
+		seq_puts(m, "[stack]");
 	}
 
 	seq_putc(m, '\n');
@@ -211,7 +211,11 @@
 	if (!mm || !mmget_not_zero(mm))
 		return NULL;
 
-	down_read(&mm->mmap_sem);
+	if (down_read_killable(&mm->mmap_sem)) {
+		mmput(mm);
+		return ERR_PTR(-EINTR);
+	}
+
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010..f61ae53 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@
 	struct inode *root_inode = d_inode(s->s_root);
 	struct pid_namespace *ns = proc_pid_ns(root_inode);
 	struct dentry *thread_self;
+	int ret = -ENOMEM;
 
 	inode_lock(root_inode);
 	thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_thread_self_inode_operations;
 			d_add(thread_self, inode);
+			ret = 0;
 		} else {
 			dput(thread_self);
-			thread_self = ERR_PTR(-ENOMEM);
 		}
-	} else {
-		thread_self = ERR_PTR(-ENOMEM);
 	}
 	inode_unlock(root_inode);
-	if (IS_ERR(thread_self)) {
+
+	if (ret)
 		pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
-		return PTR_ERR(thread_self);
-	}
-	ns->proc_thread_self = thread_self;
-	return 0;
+	else
+		ns->proc_thread_self = thread_self;
+
+	return ret;
 }
 
 void __init proc_thread_self_init(void)
diff --git a/fs/proc/util.c b/fs/proc/util.c
index b161cfa..98f8adc 100644
--- a/fs/proc/util.c
+++ b/fs/proc/util.c
@@ -1,4 +1,5 @@
 #include <linux/dcache.h>
+#include "internal.h"
 
 unsigned name_to_int(const struct qstr *qstr)
 {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728..7b13988 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *	fs/proc/vmcore.c Interface for accessing the crash
  * 				 dump from the system's previous life.
@@ -16,14 +17,17 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
+#include <asm/pgtable.h>
 #include <asm/io.h>
 #include "internal.h"
 
@@ -51,6 +55,9 @@
 /* Device Dump list and mutex to synchronize access to list */
 static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
+
+static bool vmcoredd_disabled;
+core_param(novmcoredd, vmcoredd_disabled, bool, 0);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Device Dump Size */
@@ -97,8 +104,9 @@
 }
 
 /* Reads a page from the oldmem device from given offset. */
-static ssize_t read_from_oldmem(char *buf, size_t count,
-				u64 *ppos, int userbuf)
+ssize_t read_from_oldmem(char *buf, size_t count,
+			 u64 *ppos, int userbuf,
+			 bool encrypted)
 {
 	unsigned long pfn, offset;
 	size_t nr_bytes;
@@ -120,8 +128,15 @@
 		if (pfn_is_ram(pfn) == 0)
 			memset(buf, 0, nr_bytes);
 		else {
-			tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-						offset, userbuf);
+			if (encrypted)
+				tmp = copy_oldmem_page_encrypted(pfn, buf,
+								 nr_bytes,
+								 offset,
+								 userbuf);
+			else
+				tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+						       offset, userbuf);
+
 			if (tmp < 0)
 				return tmp;
 		}
@@ -155,7 +170,7 @@
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0);
+	return read_from_oldmem(buf, count, ppos, 0, false);
 }
 
 /*
@@ -163,7 +178,7 @@
  */
 ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0);
+	return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
 }
 
 /*
@@ -173,10 +188,21 @@
 				  unsigned long from, unsigned long pfn,
 				  unsigned long size, pgprot_t prot)
 {
+	prot = pgprot_encrypted(prot);
 	return remap_pfn_range(vma, from, pfn, size, prot);
 }
 
 /*
+ * Architectures which support memory encryption override this.
+ */
+ssize_t __weak
+copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+			   unsigned long offset, int userbuf)
+{
+	return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
+}
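
copy_oldmem_page_encrypted() uses the same __weak convention as the elfcorehdr helpers above: the generic definition links in unless an architecture supplies a strong one. A toy userspace illustration of that linkage rule — the names are hypothetical, nothing here is from the vmcore code:

	/* generic.c: the fallback, marked weak like the kernel's __weak */
	#include <stdio.h>

	__attribute__((weak)) void copy_page_demo(void)
	{
		puts("generic copy (no encryption support)");
	}

	int main(void)
	{
		copy_page_demo();
		return 0;
	}

	/*
	 * arch.c (optional): a strong definition replaces the weak one
	 * at link time:
	 *
	 *	void copy_page_demo(void)
	 *	{
	 *		puts("arch copy with in-place decryption");
	 *	}
	 *
	 * cc generic.c        -> "generic copy (no encryption support)"
	 * cc generic.c arch.c -> "arch copy with in-place decryption"
	 */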
+
+/*
  * Copy to either kernel or user space
  */
 static int copy_to(void *target, void *src, size_t size, int userbuf)
@@ -351,7 +377,8 @@
 					    m->offset + m->size - *fpos,
 					    buflen);
 			start = m->paddr + *fpos - m->offset;
-			tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+			tmp = read_from_oldmem(buffer, tsz, &start,
+					       userbuf, mem_encrypt_active());
 			if (tmp < 0)
 				return tmp;
 			buflen -= tsz;
@@ -401,7 +428,7 @@
 		if (rc < 0) {
 			unlock_page(page);
 			put_page(page);
-			return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+			return vmf_error(rc);
 		}
 		SetPageUptodate(page);
 	}
@@ -1429,6 +1456,11 @@
 	size_t data_size;
 	int ret;
 
+	if (vmcoredd_disabled) {
+		pr_err_once("Device dump is disabled\n");
+		return -EINVAL;
+	}
+
 	if (!data || !strlen(data->dump_name) ||
 	    !data->vmcoredd_callback || !data->size)
 		return -EINVAL;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e16fb8f..273ee82 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -88,7 +88,7 @@
 static void show_type(struct seq_file *m, struct super_block *sb)
 {
 	mangle(m, sb->s_type->name);
-	if (sb->s_subtype && sb->s_subtype[0]) {
+	if (sb->s_subtype) {
 		seq_putc(m, '.');
 		mangle(m, sb->s_subtype);
 	}
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 503086f..8f0369a 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config PSTORE
 	tristate "Persistent store support"
 	select CRYPTO if PSTORE_COMPRESS
@@ -141,7 +142,6 @@
 	tristate "Log panic/oops to a RAM buffer"
 	depends on PSTORE
 	depends on HAS_IOMEM
-	depends on HAVE_MEMBLOCK
 	select REED_SOLOMON
 	select REED_SOLOMON_ENC8
 	select REED_SOLOMON_DEC8
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 06aab07..bfbfc26 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright 2012  Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/kernel.h>
@@ -120,27 +112,13 @@
 
 void pstore_register_ftrace(void)
 {
-	struct dentry *file;
-
 	if (!psinfo->write)
 		return;
 
 	pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
-	if (!pstore_ftrace_dir) {
-		pr_err("%s: unable to create pstore directory\n", __func__);
-		return;
-	}
 
-	file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir,
-				   NULL, &pstore_knob_fops);
-	if (!file) {
-		pr_err("%s: unable to create record_ftrace file\n", __func__);
-		goto err_file;
-	}
-
-	return;
-err_file:
-	debugfs_remove(pstore_ftrace_dir);
+	debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, NULL,
+			    &pstore_knob_fops);
 }
 
 void pstore_unregister_ftrace(void)
@@ -148,7 +126,7 @@
 	mutex_lock(&pstore_ftrace_lock);
 	if (pstore_ftrace_enabled) {
 		unregister_ftrace_function(&pstore_ftrace_ops);
-		pstore_ftrace_enabled = 0;
+		pstore_ftrace_enabled = false;
 	}
 	mutex_unlock(&pstore_ftrace_lock);
 
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5fcb845..7fbe8f0 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Persistent Storage - ramfs parts.
  *
  * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <linux/module.h>
@@ -115,7 +103,7 @@
 
 	rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
 
-	seq_printf(s, "CPU:%d ts:%llu %08lx  %08lx  %pf <- %pF\n",
+	seq_printf(s, "CPU:%d ts:%llu %08lx  %08lx  %ps <- %pS\n",
 		   pstore_ftrace_decode_cpu(rec),
 		   pstore_ftrace_read_timestamp(rec),
 		   rec->ip, rec->parent_ip, (void *)rec->ip,
@@ -330,65 +318,21 @@
 		goto fail;
 	inode->i_mode = S_IFREG | 0444;
 	inode->i_fop = &pstore_file_operations;
+	scnprintf(name, sizeof(name), "%s-%s-%llu%s",
+			pstore_type_to_name(record->type),
+			record->psi->name, record->id,
+			record->compressed ? ".enc.z" : "");
+
 	private = kzalloc(sizeof(*private), GFP_KERNEL);
 	if (!private)
-		goto fail_alloc;
-	private->record = record;
-
-	switch (record->type) {
-	case PSTORE_TYPE_DMESG:
-		scnprintf(name, sizeof(name), "dmesg-%s-%llu%s",
-			  record->psi->name, record->id,
-			  record->compressed ? ".enc.z" : "");
-		break;
-	case PSTORE_TYPE_CONSOLE:
-		scnprintf(name, sizeof(name), "console-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_FTRACE:
-		scnprintf(name, sizeof(name), "ftrace-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_MCE:
-		scnprintf(name, sizeof(name), "mce-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_PPC_RTAS:
-		scnprintf(name, sizeof(name), "rtas-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_PPC_OF:
-		scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_PPC_COMMON:
-		scnprintf(name, sizeof(name), "powerpc-common-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_PMSG:
-		scnprintf(name, sizeof(name), "pmsg-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_PPC_OPAL:
-		scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	case PSTORE_TYPE_UNKNOWN:
-		scnprintf(name, sizeof(name), "unknown-%s-%llu",
-			  record->psi->name, record->id);
-		break;
-	default:
-		scnprintf(name, sizeof(name), "type%d-%s-%llu",
-			  record->type, record->psi->name, record->id);
-		break;
-	}
+		goto fail_inode;
 
 	dentry = d_alloc_name(root, name);
 	if (!dentry)
 		goto fail_private;
 
+	private->record = record;
 	inode->i_size = private->total_size = size;
-
 	inode->i_private = private;
 
 	if (record->time.tv_sec)
@@ -404,7 +348,7 @@
 
 fail_private:
 	free_pstore_private(private);
-fail_alloc:
+fail_inode:
 	iput(inode);
 
 fail:
@@ -482,12 +426,10 @@
 	.kill_sb	= pstore_kill_sb,
 };
 
-static int __init init_pstore_fs(void)
+int __init pstore_init_fs(void)
 {
 	int err;
 
-	pstore_choose_compression();
-
 	/* Create a convenient mount point for people to access pstore */
 	err = sysfs_create_mount_point(fs_kobj, "pstore");
 	if (err)
@@ -500,14 +442,9 @@
 out:
 	return err;
 }
-module_init(init_pstore_fs)
 
-static void __exit exit_pstore_fs(void)
+void __exit pstore_exit_fs(void)
 {
 	unregister_filesystem(&pstore_fs_type);
 	sysfs_remove_mount_point(fs_kobj, "pstore");
 }
-module_exit(exit_pstore_fs)
-
-MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
-MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index fb767e2..7062ea4 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -37,7 +37,8 @@
 extern void	pstore_record_init(struct pstore_record *record,
 				   struct pstore_info *psi);
 
-/* Called during module_init() */
-extern void __init pstore_choose_compression(void);
+/* Called during pstore init/exit. */
+int __init	pstore_init_fs(void);
+void __exit	pstore_exit_fs(void);
 
 #endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 15e99d5..3d70246 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Persistent Storage - platform driver interface parts.
  *
  * Copyright (C) 2007-2008 Google, Inc.
  * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #define pr_fmt(fmt) "pstore: " fmt
@@ -59,6 +47,19 @@
 		 "enabling this option is not safe, it may lead to further "
 		 "corruption on Oopses)");
 
+/* Names should be in the same order as the enum pstore_type_id */
+static const char * const pstore_type_names[] = {
+	"dmesg",
+	"mce",
+	"console",
+	"ftrace",
+	"rtas",
+	"powerpc-ofw",
+	"powerpc-common",
+	"pmsg",
+	"powerpc-opal",
+};
+
 static int pstore_new_entry;
 
 static void pstore_timefunc(struct timer_list *);
@@ -104,6 +105,30 @@
 /* Tag each group of saved records with a sequence number */
 static int	oopscount;
 
+const char *pstore_type_to_name(enum pstore_type_id type)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(pstore_type_names) != PSTORE_TYPE_MAX);
+
+	if (WARN_ON_ONCE(type >= PSTORE_TYPE_MAX))
+		return "unknown";
+
+	return pstore_type_names[type];
+}
+EXPORT_SYMBOL_GPL(pstore_type_to_name);
+
+enum pstore_type_id pstore_name_to_type(const char *name)
+{
+	int i;
+
+	for (i = 0; i < PSTORE_TYPE_MAX; i++) {
+		if (!strcmp(pstore_type_names[i], name))
+			return i;
+	}
+
+	return PSTORE_TYPE_MAX;
+}
+EXPORT_SYMBOL_GPL(pstore_name_to_type);
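
This pair replaces the long per-type switch removed from fs/pstore/inode.c above: filenames become pstore_type_to_name(type) plus any backend suffix, and ram.c maps region names back with pstore_name_to_type(). The invariant the BUILD_BUG_ON enforces — exactly one name per enum value, in enum order — can be modelled in plain C with _Static_assert; the enum values below are illustrative, the real ones live in include/linux/pstore.h:

	#include <stdio.h>
	#include <string.h>

	enum type_id { TYPE_DMESG, TYPE_MCE, TYPE_CONSOLE, TYPE_MAX };

	static const char * const type_names[] = { "dmesg", "mce", "console" };

	/* Compile-time analogue of the kernel's BUILD_BUG_ON(). */
	_Static_assert(sizeof(type_names) / sizeof(type_names[0]) == TYPE_MAX,
		       "type_names[] out of sync with enum type_id");

	static const char *type_to_name(enum type_id t)
	{
		return t < TYPE_MAX ? type_names[t] : "unknown";
	}

	static enum type_id name_to_type(const char *name)
	{
		for (int i = 0; i < TYPE_MAX; i++)
			if (!strcmp(type_names[i], name))
				return i;
		return TYPE_MAX;	/* sentinel: no such type */
	}

	int main(void)
	{
		enum type_id t = name_to_type("console");

		printf("console -> %d -> %s\n", t, type_to_name(t));
		return 0;
	}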
+
 static const char *get_reason_str(enum kmsg_dump_reason reason)
 {
 	switch (reason) {
@@ -124,26 +149,27 @@
 	}
 }
 
-bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+/*
+ * Should pstore_dump() wait for a concurrent pstore_dump()? If
+ * not, the current pstore_dump() will report a failure to dump
+ * and return.
+ */
+static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
 {
-	/*
-	 * In case of NMI path, pstore shouldn't be blocked
-	 * regardless of reason.
-	 */
+	/* In NMI path, pstore shouldn't block regardless of reason. */
 	if (in_nmi())
 		return true;
 
 	switch (reason) {
 	/* In panic case, other cpus are stopped by smp_send_stop(). */
 	case KMSG_DUMP_PANIC:
-	/* Emergency restart shouldn't be blocked by spin lock. */
+	/* Emergency restart shouldn't be blocked. */
 	case KMSG_DUMP_EMERG:
 		return true;
 	default:
 		return false;
 	}
 }
-EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
 
 #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
 static int zbufsize_deflate(size_t size)
@@ -258,53 +284,61 @@
 	return outlen;
 }
 
-static int pstore_decompress(void *in, void *out,
-			     unsigned int inlen, unsigned int outlen)
-{
-	int ret;
-
-	ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen);
-	if (ret) {
-		pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
-		return ret;
-	}
-
-	return outlen;
-}
-
 static void allocate_buf_for_compression(void)
 {
+	struct crypto_comp *ctx;
+	int size;
+	char *buf;
+
+	/* Skip if not built-in or compression backend not selected yet. */
 	if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
 		return;
 
+	/* Skip if no pstore backend yet or compression init already done. */
+	if (!psinfo || tfm)
+		return;
+
 	if (!crypto_has_comp(zbackend->name, 0, 0)) {
-		pr_err("No %s compression\n", zbackend->name);
+		pr_err("Unknown compression: %s\n", zbackend->name);
 		return;
 	}
 
-	big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize);
-	if (big_oops_buf_sz <= 0)
-		return;
-
-	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
-	if (!big_oops_buf) {
-		pr_err("allocate compression buffer error!\n");
+	size = zbackend->zbufsize(psinfo->bufsize);
+	if (size <= 0) {
+		pr_err("Invalid compression size for %s: %d\n",
+		       zbackend->name, size);
 		return;
 	}
 
-	tfm = crypto_alloc_comp(zbackend->name, 0, 0);
-	if (IS_ERR_OR_NULL(tfm)) {
-		kfree(big_oops_buf);
-		big_oops_buf = NULL;
-		pr_err("crypto_alloc_comp() failed!\n");
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf) {
+		pr_err("Failed %d byte compression buffer allocation for: %s\n",
+		       size, zbackend->name);
 		return;
 	}
+
+	ctx = crypto_alloc_comp(zbackend->name, 0, 0);
+	if (IS_ERR_OR_NULL(ctx)) {
+		kfree(buf);
+		pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
+		       PTR_ERR(ctx));
+		return;
+	}
+
+	/* A non-NULL big_oops_buf indicates compression is available. */
+	tfm = ctx;
+	big_oops_buf_sz = size;
+	big_oops_buf = buf;
+
+	pr_info("Using crash dump compression: %s\n", zbackend->name);
 }
 
 static void free_buf_for_compression(void)
 {
-	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm))
+	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
 		crypto_free_comp(tfm);
+		tfm = NULL;
+	}
 	kfree(big_oops_buf);
 	big_oops_buf = NULL;
 	big_oops_buf_sz = 0;
@@ -348,9 +382,8 @@
 }
 
 /*
- * callback from kmsg_dump. (s2,l2) has the most recently
- * written bytes, older bytes are in (s1,l1). Save as much
- * as we can from the end of the buffer.
+ * callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the
+ * end of the buffer.
  */
 static void pstore_dump(struct kmsg_dumper *dumper,
 			enum kmsg_dump_reason reason)
@@ -358,23 +391,23 @@
 	unsigned long	total = 0;
 	const char	*why;
 	unsigned int	part = 1;
-	unsigned long	flags = 0;
-	int		is_locked;
 	int		ret;
 
 	why = get_reason_str(reason);
 
-	if (pstore_cannot_block_path(reason)) {
-		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
-		if (!is_locked) {
-			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
-				       , in_nmi() ? "NMI" : why);
+	if (down_trylock(&psinfo->buf_lock)) {
+		/* Failed to acquire lock: give up if we cannot wait. */
+		if (pstore_cannot_wait(reason)) {
+			pr_err("dump skipped in %s path: may corrupt error record\n",
+				in_nmi() ? "NMI" : why);
 			return;
 		}
-	} else {
-		spin_lock_irqsave(&psinfo->buf_lock, flags);
-		is_locked = 1;
+		if (down_interruptible(&psinfo->buf_lock)) {
+			pr_err("could not grab semaphore?!\n");
+			return;
+		}
 	}
+
 	oopscount++;
 	while (total < kmsg_bytes) {
 		char *dst;
@@ -391,7 +424,7 @@
 		record.part = part;
 		record.buf = psinfo->buf;
 
-		if (big_oops_buf && is_locked) {
+		if (big_oops_buf) {
 			dst = big_oops_buf;
 			dst_size = big_oops_buf_sz;
 		} else {
@@ -409,7 +442,7 @@
 					  dst_size, &dump_size))
 			break;
 
-		if (big_oops_buf && is_locked) {
+		if (big_oops_buf) {
 			zipped_len = pstore_compress(dst, psinfo->buf,
 						header_size + dump_size,
 						psinfo->bufsize);
@@ -432,8 +465,8 @@
 		total += record.size;
 		part++;
 	}
-	if (is_locked)
-		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+	up(&psinfo->buf_lock);
 }
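
psinfo->buf_lock is now a semaphore (sema_init() appears in pstore_register() below), which is what lets a second dump wait instead of being dropped: try the lock without sleeping first, and fall back to a sleeping acquire only when the context tolerates it. The control flow in isolation, assuming the kernel semaphore API:

	#include <linux/errno.h>
	#include <linux/semaphore.h>
	#include <linux/types.h>

	static int grab_buf_lock(struct semaphore *lock, bool cannot_wait)
	{
		if (down_trylock(lock) == 0)
			return 0;		/* got it without sleeping */

		if (cannot_wait)
			return -EBUSY;		/* NMI/panic: give up, don't spin */

		/* Safe context: sleep until the concurrent dump finishes. */
		return down_interruptible(lock);
	}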
 
 static struct kmsg_dumper pstore_dumper = {
@@ -456,31 +489,17 @@
 #ifdef CONFIG_PSTORE_CONSOLE
 static void pstore_console_write(struct console *con, const char *s, unsigned c)
 {
-	const char *e = s + c;
+	struct pstore_record record;
 
-	while (s < e) {
-		struct pstore_record record;
-		unsigned long flags;
+	if (!c)
+		return;
 
-		pstore_record_init(&record, psinfo);
-		record.type = PSTORE_TYPE_CONSOLE;
+	pstore_record_init(&record, psinfo);
+	record.type = PSTORE_TYPE_CONSOLE;
 
-		if (c > psinfo->bufsize)
-			c = psinfo->bufsize;
-
-		if (oops_in_progress) {
-			if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
-				break;
-		} else {
-			spin_lock_irqsave(&psinfo->buf_lock, flags);
-		}
-		record.buf = (char *)s;
-		record.size = c;
-		psinfo->write(&record);
-		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
-		s += c;
-		c = e - s;
-	}
+	record.buf = (char *)s;
+	record.size = c;
+	psinfo->write(&record);
 }
 
 static struct console pstore_console = {
@@ -569,6 +588,7 @@
 		psi->write_user = pstore_write_user_compat;
 	psinfo = psi;
 	mutex_init(&psinfo->read_mutex);
+	sema_init(&psinfo->buf_lock, 1);
 	spin_unlock(&pstore_lock);
 
 	if (owner && !try_module_get(owner)) {
@@ -576,7 +596,8 @@
 		return -EINVAL;
 	}
 
-	allocate_buf_for_compression();
+	if (psi->flags & PSTORE_FLAGS_DMESG)
+		allocate_buf_for_compression();
 
 	if (pstore_is_mounted())
 		pstore_get_records(0);
@@ -636,8 +657,9 @@
 
 static void decompress_record(struct pstore_record *record)
 {
+	int ret;
 	int unzipped_len;
-	char *decompressed;
+	char *unzipped, *workspace;
 
 	if (!record->compressed)
 		return;
@@ -648,35 +670,42 @@
 		return;
 	}
 
-	/* No compression method has created the common buffer. */
+	/* Missing compression buffer means compression was not initialized. */
 	if (!big_oops_buf) {
-		pr_warn("no decompression buffer allocated\n");
+		pr_warn("no decompression method initialized!\n");
 		return;
 	}
 
-	unzipped_len = pstore_decompress(record->buf, big_oops_buf,
-					 record->size, big_oops_buf_sz);
-	if (unzipped_len <= 0) {
-		pr_err("decompression failed: %d\n", unzipped_len);
+	/* Allocate enough space to hold max decompression and ECC. */
+	unzipped_len = big_oops_buf_sz;
+	workspace = kmalloc(unzipped_len + record->ecc_notice_size,
+			    GFP_KERNEL);
+	if (!workspace)
 		return;
-	}
 
-	/* Build new buffer for decompressed contents. */
-	decompressed = kmalloc(unzipped_len + record->ecc_notice_size,
-			       GFP_KERNEL);
-	if (!decompressed) {
-		pr_err("decompression ran out of memory\n");
+	/* After decompression "unzipped_len" is almost certainly smaller. */
+	ret = crypto_comp_decompress(tfm, record->buf, record->size,
+					  workspace, &unzipped_len);
+	if (ret) {
+		pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
+		kfree(workspace);
 		return;
 	}
-	memcpy(decompressed, big_oops_buf, unzipped_len);
 
 	/* Append ECC notice to decompressed buffer. */
-	memcpy(decompressed + unzipped_len, record->buf + record->size,
+	memcpy(workspace + unzipped_len, record->buf + record->size,
 	       record->ecc_notice_size);
 
-	/* Swap out compresed contents with decompressed contents. */
+	/* Copy decompressed contents into a minimum-sized allocation. */
+	unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
+			   GFP_KERNEL);
+	kfree(workspace);
+	if (!unzipped)
+		return;
+
+	/* Swap out compressed contents with decompressed contents. */
 	kfree(record->buf);
-	record->buf = decompressed;
+	record->buf = unzipped;
 	record->size = unzipped_len;
 	record->compressed = false;
 }
@@ -774,14 +803,43 @@
 	for (step = zbackends; step->name; step++) {
 		if (!strcmp(compress, step->name)) {
 			zbackend = step;
-			pr_info("using %s compression\n", zbackend->name);
 			return;
 		}
 	}
 }
 
+static int __init pstore_init(void)
+{
+	int ret;
+
+	pstore_choose_compression();
+
+	/*
+	 * Check if any pstore backends registered earlier but did not
+	 * initialize compression because crypto was not ready. If so,
+	 * initialize compression now.
+	 */
+	allocate_buf_for_compression();
+
+	ret = pstore_init_fs();
+	if (ret)
+		return ret;
+
+	return 0;
+}
+late_initcall(pstore_init);
+
+static void __exit pstore_exit(void)
+{
+	pstore_exit_fs();
+}
+module_exit(pstore_exit)
+
 module_param(compress, charp, 0444);
 MODULE_PARM_DESC(compress, "Pstore compression to use");
 
 module_param(backend, charp, 0444);
 MODULE_PARM_DESC(backend, "Pstore backend to use");
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 24db02d..d8542ec 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright 2014  Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/cdev.h>
@@ -33,7 +25,7 @@
 	record.size = count;
 
 	/* check outside lock, page in any data. write_user also checks */
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	mutex_lock(&pmsg_lock);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 03cd593..8caff83 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * RAM Oops/Panic logger
  *
  * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
  * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -110,7 +96,6 @@
 };
 
 static struct platform_device *dummy;
-static struct ramoops_platform_data *dummy_data;
 
 static int ramoops_pstore_open(struct pstore_info *psi)
 {
@@ -124,31 +109,28 @@
 }
 
 static struct persistent_ram_zone *
-ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
-		     u64 *id,
-		     enum pstore_type_id *typep, enum pstore_type_id type,
-		     bool update)
+ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
+		     struct pstore_record *record)
 {
 	struct persistent_ram_zone *prz;
-	int i = (*c)++;
 
 	/* Give up if we never existed or have hit the end. */
-	if (!przs || i >= max)
+	if (!przs)
 		return NULL;
 
-	prz = przs[i];
+	prz = przs[id];
 	if (!prz)
 		return NULL;
 
 	/* Update old/shadowed buffer. */
-	if (update)
+	if (prz->type == PSTORE_TYPE_DMESG)
 		persistent_ram_save_old(prz);
 
 	if (!persistent_ram_old_size(prz))
 		return NULL;
 
-	*typep = type;
-	*id = i;
+	record->type = prz->type;
+	record->id = id;
 
 	return prz;
 }
@@ -162,6 +144,7 @@
 	if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu-%c\n%n",
 		   (time64_t *)&time->tv_sec, &time->tv_nsec, &data_type,
 		   &header_length) == 3) {
+		time->tv_nsec *= 1000;
 		if (data_type == 'C')
 			*compressed = true;
 		else
@@ -169,6 +152,7 @@
 	} else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu\n%n",
 			  (time64_t *)&time->tv_sec, &time->tv_nsec,
 			  &header_length) == 2) {
+		time->tv_nsec *= 1000;
 		*compressed = false;
 	} else {
 		time->tv_sec = 0;
@@ -255,10 +239,8 @@
 
 	/* Find the next valid persistent_ram_zone for DMESG */
 	while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
-		prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
-					   cxt->max_dump_cnt, &record->id,
-					   &record->type,
-					   PSTORE_TYPE_DMESG, 1);
+		prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++,
+					   record);
 		if (!prz_ok(prz))
 			continue;
 		header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
@@ -272,22 +254,18 @@
 		}
 	}
 
-	if (!prz_ok(prz))
-		prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
-					   1, &record->id, &record->type,
-					   PSTORE_TYPE_CONSOLE, 0);
+	if (!prz_ok(prz) && !cxt->console_read_cnt++)
+		prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record);
 
-	if (!prz_ok(prz))
-		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
-					   1, &record->id, &record->type,
-					   PSTORE_TYPE_PMSG, 0);
+	if (!prz_ok(prz) && !cxt->pmsg_read_cnt++)
+		prz = ramoops_get_next_prz(&cxt->mprz, 0 /* single */, record);
 
 	/* ftrace is last since it may want to dynamically allocate memory. */
 	if (!prz_ok(prz)) {
-		if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
-			prz = ramoops_get_next_prz(cxt->fprzs,
-					&cxt->ftrace_read_cnt, 1, &record->id,
-					&record->type, PSTORE_TYPE_FTRACE, 0);
+		if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) &&
+		    !cxt->ftrace_read_cnt++) {
+			prz = ramoops_get_next_prz(cxt->fprzs, 0 /* single */,
+						   record);
 		} else {
 			/*
 			 * Build a new dummy record which combines all the
@@ -299,15 +277,12 @@
 					  GFP_KERNEL);
 			if (!tmp_prz)
 				return -ENOMEM;
+			prz = tmp_prz;
 			free_prz = true;
 
 			while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
 				prz_next = ramoops_get_next_prz(cxt->fprzs,
-						&cxt->ftrace_read_cnt,
-						cxt->max_ftrace_cnt,
-						&record->id,
-						&record->type,
-						PSTORE_TYPE_FTRACE, 0);
+						cxt->ftrace_read_cnt++, record);
 
 				if (!prz_ok(prz_next))
 					continue;
@@ -321,7 +296,6 @@
 					goto out;
 			}
 			record->id = 0;
-			prz = tmp_prz;
 		}
 	}
 
@@ -359,17 +333,15 @@
 static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
 				     struct pstore_record *record)
 {
-	char *hdr;
+	char hdr[36]; /* "===="(4), %lld(20), "."(1), %06lu(6), "-%c\n"(3) */
 	size_t len;
 
-	hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
+	len = scnprintf(hdr, sizeof(hdr),
+		RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
 		(time64_t)record->time.tv_sec,
 		record->time.tv_nsec / 1000,
 		record->compressed ? 'C' : 'D');
-	WARN_ON_ONCE(!hdr);
-	len = hdr ? strlen(hdr) : 0;
 	persistent_ram_write(prz, hdr, len);
-	kfree(hdr);
 
 	return len;
 }
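
The fixed 36-byte buffer removes a kasprintf() that could fail in the oops path; the on-media header layout is unchanged: "====", seconds, '.', microseconds, '-', then 'C' or 'D'. That microsecond field is also why ramoops_read_kmsg_hdr() above gained the two `tv_nsec *= 1000` lines — reads must scale back up to nanoseconds. A runnable round-trip of the format, assuming the "====" value of RAMOOPS_KERNMSG_HDR:

	#include <stdbool.h>
	#include <stdio.h>

	#define HDR "===="	/* RAMOOPS_KERNMSG_HDR */

	int main(void)
	{
		/* "===="(4) + %lld(20) + "."(1) + %06lu(6) + "-%c\n"(3) */
		char hdr[36];
		long long sec = 1575158400;
		unsigned long usec = 123456;
		long long rsec;
		unsigned long rnsec;
		char type;
		bool compressed = false;
		int len;

		len = snprintf(hdr, sizeof(hdr), HDR "%lld.%06lu-%c\n",
			       sec, usec, 'C');
		printf("wrote %d bytes: %s", len, hdr);

		/* Read side: parse, then scale microseconds to nanoseconds. */
		if (sscanf(hdr, HDR "%lld.%lu-%c\n", &rsec, &rnsec, &type) == 3) {
			rnsec *= 1000;
			compressed = (type == 'C');
		}
		printf("sec=%lld nsec=%lu compressed=%d\n",
		       rsec, rnsec, compressed);
		return 0;
	}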
@@ -437,6 +409,9 @@
 
 	/* Build header and append record contents. */
 	hlen = ramoops_write_kmsg_hdr(prz, record);
+	if (!hlen)
+		return -ENOMEM;
+
 	size = record->size;
 	if (size + hlen > prz->buffer_size)
 		size = prz->buffer_size - hlen;
@@ -587,9 +562,16 @@
 		goto fail;
 
 	for (i = 0; i < *cnt; i++) {
+		char *label;
+
+		if (*cnt == 1)
+			label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
+		else
+			label = kasprintf(GFP_KERNEL, "ramoops:%s(%d/%d)",
+					  name, i, *cnt - 1);
 		prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
-						  &cxt->ecc_info,
-						  cxt->memtype, flags);
+					       &cxt->ecc_info,
+					       cxt->memtype, flags, label);
 		if (IS_ERR(prz_ar[i])) {
 			err = PTR_ERR(prz_ar[i]);
 			dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
@@ -604,6 +586,7 @@
 			goto fail;
 		}
 		*paddr += zone_sz;
+		prz_ar[i]->type = pstore_name_to_type(name);
 	}
 
 	*przs = prz_ar;
@@ -619,6 +602,8 @@
 			    struct persistent_ram_zone **prz,
 			    phys_addr_t *paddr, size_t sz, u32 sig)
 {
+	char *label;
+
 	if (!sz)
 		return 0;
 
@@ -629,8 +614,9 @@
 		return -ENOMEM;
 	}
 
+	label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
 	*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
-				  cxt->memtype, 0);
+				  cxt->memtype, PRZ_FLAG_ZAP_OLD, label);
 	if (IS_ERR(*prz)) {
 		int err = PTR_ERR(*prz);
 
@@ -639,9 +625,8 @@
 		return err;
 	}
 
-	persistent_ram_zap(*prz);
-
 	*paddr += sz;
+	(*prz)->type = pstore_name_to_type(name);
 
 	return 0;
 }
@@ -672,6 +657,7 @@
 			    struct ramoops_platform_data *pdata)
 {
 	struct device_node *of_node = pdev->dev.of_node;
+	struct device_node *parent_node;
 	struct resource *res;
 	u32 value;
 	int ret;
@@ -706,6 +692,26 @@
 
 #undef parse_size
 
+	/*
+	 * Some old Chromebooks relied on the kernel setting the
+	 * console_size and pmsg_size to the record size since that's
+	 * what the downstream kernel did.  These same Chromebooks had
+	 * "ramoops" straight under the root node, which doesn't
+	 * conform to the current upstream bindings (though it was
+	 * arguably acceptable under a prior version of the bindings).
+	 * Let's make those old Chromebooks work by detecting that
+	 * we're not a child of "reserved-memory" and mimicking the
+	 * expected behavior.
+	 */
+	parent_node = of_get_parent(of_node);
+	if (!of_node_name_eq(parent_node, "reserved-memory") &&
+	    !pdata->console_size && !pdata->ftrace_size &&
+	    !pdata->pmsg_size && !pdata->ecc_info.ecc_size) {
+		pdata->console_size = pdata->record_size;
+		pdata->pmsg_size = pdata->record_size;
+	}
+	of_node_put(parent_node);
+
 	return 0;
 }
 
@@ -713,24 +719,12 @@
 {
 	struct device *dev = &pdev->dev;
 	struct ramoops_platform_data *pdata = dev->platform_data;
+	struct ramoops_platform_data pdata_local;
 	struct ramoops_context *cxt = &oops_cxt;
 	size_t dump_mem_sz;
 	phys_addr_t paddr;
 	int err = -EINVAL;
 
-	if (dev_of_node(dev) && !pdata) {
-		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
-		if (!pdata) {
-			pr_err("cannot allocate platform data buffer\n");
-			err = -ENOMEM;
-			goto fail_out;
-		}
-
-		err = ramoops_parse_dt(pdev, pdata);
-		if (err < 0)
-			goto fail_out;
-	}
-
 	/*
 	 * Only a single ramoops area allowed at a time, so fail extra
 	 * probes.
@@ -740,6 +734,15 @@
 		goto fail_out;
 	}
 
+	if (dev_of_node(dev) && !pdata) {
+		pdata = &pdata_local;
+		memset(pdata, 0, sizeof(*pdata));
+
+		err = ramoops_parse_dt(pdev, pdata);
+		if (err < 0)
+			goto fail_out;
+	}
+
 	/* Make sure we didn't get bogus platform data pointer. */
 	if (!pdata) {
 		pr_err("NULL platform data\n");
@@ -777,7 +780,7 @@
 
 	dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
 			- cxt->pmsg_size;
-	err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
+	err = ramoops_init_przs("dmesg", dev, cxt, &cxt->dprzs, &paddr,
 				dump_mem_sz, cxt->record_size,
 				&cxt->max_dump_cnt, 0, 0);
 	if (err)
@@ -806,26 +809,35 @@
 
 	cxt->pstore.data = cxt;
 	/*
+	 * Prepare frontend flags based on which areas are initialized.
+	 * For ramoops_init_przs() cases, the "max count" variable tells
+	 * whether regions are present. For ramoops_init_prz() cases,
+	 * the size of the single region is the indicator.
+	 */
+	cxt->pstore.flags = 0;
+	if (cxt->max_dump_cnt)
+		cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
+	if (cxt->console_size)
+		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+	if (cxt->max_ftrace_cnt)
+		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
+	if (cxt->pmsg_size)
+		cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
+
+	/*
 	 * Since bufsize is only used for dmesg crash dumps, it
 	 * must match the size of the dprz record (after PRZ header
 	 * and ECC bytes have been accounted for).
 	 */
-	cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
-	cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
-	if (!cxt->pstore.buf) {
-		pr_err("cannot allocate pstore crash dump buffer\n");
-		err = -ENOMEM;
-		goto fail_clear;
+	if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) {
+		cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
+		cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+		if (!cxt->pstore.buf) {
+			pr_err("cannot allocate pstore crash dump buffer\n");
+			err = -ENOMEM;
+			goto fail_clear;
+		}
 	}
-	spin_lock_init(&cxt->pstore.buf_lock);
-
-	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
-	if (cxt->console_size)
-		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
-	if (cxt->ftrace_size)
-		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
-	if (cxt->pmsg_size)
-		cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
 
 	err = pstore_register(&cxt->pstore);
 	if (err) {
@@ -845,9 +857,9 @@
 	ramoops_pmsg_size = pdata->pmsg_size;
 	ramoops_ftrace_size = pdata->ftrace_size;
 
-	pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
+	pr_info("using 0x%lx@0x%llx, ecc: %d\n",
 		cxt->size, (unsigned long long)cxt->phys_addr,
-		cxt->ecc_info.ecc_size, cxt->ecc_info.block_size);
+		cxt->ecc_info.ecc_size);
 
 	return 0;
 
@@ -899,13 +911,12 @@
 {
 	platform_device_unregister(dummy);
 	dummy = NULL;
-
-	kfree(dummy_data);
-	dummy_data = NULL;
 }
 
 static void __init ramoops_register_dummy(void)
 {
+	struct ramoops_platform_data pdata;
+
 	/*
 	 * Prepare a dummy platform data structure to carry the module
 	 * parameters. If mem_size isn't set, then there are no module
@@ -916,30 +927,25 @@
 
 	pr_info("using module parameters\n");
 
-	dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
-	if (!dummy_data) {
-		pr_info("could not allocate pdata\n");
-		return;
-	}
-
-	dummy_data->mem_size = mem_size;
-	dummy_data->mem_address = mem_address;
-	dummy_data->mem_type = mem_type;
-	dummy_data->record_size = record_size;
-	dummy_data->console_size = ramoops_console_size;
-	dummy_data->ftrace_size = ramoops_ftrace_size;
-	dummy_data->pmsg_size = ramoops_pmsg_size;
-	dummy_data->dump_oops = dump_oops;
-	dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+	memset(&pdata, 0, sizeof(pdata));
+	pdata.mem_size = mem_size;
+	pdata.mem_address = mem_address;
+	pdata.mem_type = mem_type;
+	pdata.record_size = record_size;
+	pdata.console_size = ramoops_console_size;
+	pdata.ftrace_size = ramoops_ftrace_size;
+	pdata.pmsg_size = ramoops_pmsg_size;
+	pdata.dump_oops = dump_oops;
+	pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
 
 	/*
 	 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
 	 * (using 1 byte for ECC isn't of much use anyway).
 	 */
-	dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+	pdata.ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
 
 	dummy = platform_device_register_data(NULL, "ramoops", -1,
-			dummy_data, sizeof(struct ramoops_platform_data));
+			&pdata, sizeof(pdata));
 	if (IS_ERR(dummy)) {
 		pr_info("could not create platform device: %ld\n",
 			PTR_ERR(dummy));
@@ -959,7 +965,7 @@
 
 	return ret;
 }
-late_initcall(ramoops_init);
+postcore_initcall(ramoops_init);
 
 static void __exit ramoops_exit(void)
 {
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 0792595..8823f65 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  */
 
-#define pr_fmt(fmt) "persistent_ram: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/device.h>
 #include <linux/err.h>
@@ -29,6 +20,16 @@
 #include <linux/vmalloc.h>
 #include <asm/page.h>
 
+/**
+ * struct persistent_ram_buffer - persistent circular RAM buffer
+ *
+ * @sig:
+ *	signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
+ * @start:
+ *	offset into @data where the stored bytes begin
+ * @size:
+ *	number of valid bytes stored in @data
+ */
 struct persistent_ram_buffer {
 	uint32_t    sig;
 	atomic_t    start;
@@ -347,7 +348,7 @@
 	int rem, ret = 0, c = count;
 	size_t start;
 
-	if (unlikely(!access_ok(VERIFY_READ, s, count)))
+	if (unlikely(!access_ok(s, count)))
 		return -EFAULT;
 	if (unlikely(c > prz->buffer_size)) {
 		s += c - prz->buffer_size;
@@ -438,12 +439,13 @@
 }
 
 static void *persistent_ram_iomap(phys_addr_t start, size_t size,
-		unsigned int memtype)
+		unsigned int memtype, char *label)
 {
 	void *va;
 
-	if (!request_mem_region(start, size, "persistent_ram")) {
-		pr_err("request mem region (0x%llx@0x%llx) failed\n",
+	if (!request_mem_region(start, size, label ?: "ramoops")) {
+		pr_err("request mem region (%s 0x%llx@0x%llx) failed\n",
+			label ?: "ramoops",
 			(unsigned long long)size, (unsigned long long)start);
 		return NULL;
 	}
@@ -470,7 +472,8 @@
 	if (pfn_valid(start >> PAGE_SHIFT))
 		prz->vaddr = persistent_ram_vmap(start, size, memtype);
 	else
-		prz->vaddr = persistent_ram_iomap(start, size, memtype);
+		prz->vaddr = persistent_ram_iomap(start, size, memtype,
+						  prz->label);
 
 	if (!prz->vaddr) {
 		pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -488,32 +491,42 @@
 				    struct persistent_ram_ecc_info *ecc_info)
 {
 	int ret;
+	bool zap = !!(prz->flags & PRZ_FLAG_ZAP_OLD);
 
 	ret = persistent_ram_init_ecc(prz, ecc_info);
-	if (ret)
+	if (ret) {
+		pr_warn("ECC failed %s\n", prz->label);
 		return ret;
+	}
 
 	sig ^= PERSISTENT_RAM_SIG;
 
 	if (prz->buffer->sig == sig) {
+		if (buffer_size(prz) == 0) {
+			pr_debug("found existing empty buffer\n");
+			return 0;
+		}
+
 		if (buffer_size(prz) > prz->buffer_size ||
-		    buffer_start(prz) > buffer_size(prz))
+		    buffer_start(prz) > buffer_size(prz)) {
 			pr_info("found existing invalid buffer, size %zu, start %zu\n",
 				buffer_size(prz), buffer_start(prz));
-		else {
+			zap = true;
+		} else {
 			pr_debug("found existing buffer, size %zu, start %zu\n",
 				 buffer_size(prz), buffer_start(prz));
 			persistent_ram_save_old(prz);
-			return 0;
 		}
 	} else {
 		pr_debug("no valid data in buffer (sig = 0x%08x)\n",
 			 prz->buffer->sig);
+		prz->buffer->sig = sig;
+		zap = true;
 	}
 
-	/* Rewind missing or invalid memory area. */
-	prz->buffer->sig = sig;
-	persistent_ram_zap(prz);
+	/* Reset missing, invalid, or single-use memory area. */
+	if (zap)
+		persistent_ram_zap(prz);
 
 	return 0;
 }
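
The validation above only needs @start and @size — as documented in the new persistent_ram_buffer kernel-doc — to decide whether a zone holds recoverable data: together they bound the valid region of @data, wrapping at the capacity. A simplified single-threaded model of that ring; the real zone uses atomics and a raw spinlock, and these names are hypothetical:

	#include <stdio.h>

	#define CAP 8

	struct prz_model {
		size_t start;	/* offset of the oldest valid byte */
		size_t size;	/* valid bytes; saturates at CAP */
		char data[CAP];
	};

	static void prz_write(struct prz_model *z, const char *s, size_t n)
	{
		for (size_t i = 0; i < n; i++) {
			z->data[(z->start + z->size) % CAP] = s[i];
			if (z->size < CAP)
				z->size++;			 /* still filling */
			else
				z->start = (z->start + 1) % CAP; /* drop oldest */
		}
	}

	int main(void)
	{
		struct prz_model z = { 0 };

		prz_write(&z, "0123456789AB", 12);	/* 12 bytes, 8 slots */
		for (size_t i = 0; i < z.size; i++)
			putchar(z.data[(z.start + i) % CAP]);
		putchar('\n');				/* prints "456789AB" */
		return 0;
	}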
@@ -541,12 +554,13 @@
 	prz->ecc_info.par = NULL;
 
 	persistent_ram_free_old(prz);
+	kfree(prz->label);
 	kfree(prz);
 }
 
 struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
 			u32 sig, struct persistent_ram_ecc_info *ecc_info,
-			unsigned int memtype, u32 flags)
+			unsigned int memtype, u32 flags, char *label)
 {
 	struct persistent_ram_zone *prz;
 	int ret = -ENOMEM;
@@ -560,6 +574,7 @@
 	/* Initialize general buffer state. */
 	raw_spin_lock_init(&prz->buffer_lock);
 	prz->flags = flags;
+	prz->label = label;
 
 	ret = persistent_ram_buffer_map(start, size, prz, memtype);
 	if (ret)
@@ -569,6 +584,12 @@
 	if (ret)
 		goto err;
 
+	pr_debug("attached %s 0x%zx@0x%llx: %zu header, %zu data, %zu ecc (%d/%d)\n",
+		prz->label, prz->size, (unsigned long long)prz->paddr,
+		sizeof(*prz->buffer), prz->buffer_size,
+		prz->size - sizeof(*prz->buffer) - prz->buffer_size,
+		prz->ecc_info.ecc_size, prz->ecc_info.block_size);
+
 	return prz;
 err:
 	persistent_ram_free(prz);
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index 5f60899..45b5b98 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
 	depends on BLOCK
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 4a283b3..42d4fc2 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux qnx4-filesystem routines.
 #
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3d46fe3..e8da1cd 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * QNX4 file system, Linux implementation.
  *
@@ -28,14 +29,14 @@
 static const struct super_operations qnx4_sops;
 
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
-static void qnx4_destroy_inode(struct inode *inode);
+static void qnx4_free_inode(struct inode *inode);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
 static int qnx4_statfs(struct dentry *, struct kstatfs *);
 
 static const struct super_operations qnx4_sops =
 {
 	.alloc_inode	= qnx4_alloc_inode,
-	.destroy_inode	= qnx4_destroy_inode,
+	.free_inode	= qnx4_free_inode,
 	.statfs		= qnx4_statfs,
 	.remount_fs	= qnx4_remount,
 };
@@ -200,6 +201,8 @@
 	s->s_op = &qnx4_sops;
 	s->s_magic = QNX4_SUPER_MAGIC;
 	s->s_flags |= SB_RDONLY;	/* Yup, read-only yet */
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
 
 	/* Check the superblock signature. Since the qnx4 code is
 	   dangerous, we should leave as quickly as possible
@@ -342,17 +345,11 @@
 	return &ei->vfs_inode;
 }
 
-static void qnx4_i_callback(struct rcu_head *head)
+static void qnx4_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
-static void qnx4_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, qnx4_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
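
The qnx4 conversion above (mirrored for qnx6 below) follows the newer ->free_inode contract: the VFS queues the RCU callback itself and invokes ->free_inode from it, so the per-filesystem qnx4_i_callback()/call_rcu() boilerplate disappears. The minimal shape of the new convention, sketched with a hypothetical filesystem:

	#include <linux/fs.h>
	#include <linux/slab.h>

	static struct kmem_cache *foo_inode_cachep;	/* hypothetical cache */

	struct foo_inode_info {
		/* ... fs-private fields ... */
		struct inode vfs_inode;		/* embedded VFS inode */
	};

	static inline struct foo_inode_info *FOO_I(struct inode *inode)
	{
		return container_of(inode, struct foo_inode_info, vfs_inode);
	}

	/* Invoked by the VFS from its own RCU callback; just free the
	 * container object that embeds the struct inode. */
	static void foo_free_inode(struct inode *inode)
	{
		kmem_cache_free(foo_inode_cachep, FOO_I(inode));
	}

	static const struct super_operations foo_sops = {
		.free_inode	= foo_free_inode,
	};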
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
index edbba5c..6a9d6bc 100644
--- a/fs/qnx6/Kconfig
+++ b/fs/qnx6/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config QNX6FS_FS
 	tristate "QNX6 file system support (read only)"
 	depends on BLOCK && CRC32
diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile
index 5e6bae6..bd4e6c8 100644
--- a/fs/qnx6/Makefile
+++ b/fs/qnx6/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux qnx4-filesystem routines.
 #
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 4aeb26b..345db56 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * QNX6 file system, Linux implementation.
  *
@@ -29,14 +30,14 @@
 
 static void qnx6_put_super(struct super_block *sb);
 static struct inode *qnx6_alloc_inode(struct super_block *sb);
-static void qnx6_destroy_inode(struct inode *inode);
+static void qnx6_free_inode(struct inode *inode);
 static int qnx6_remount(struct super_block *sb, int *flags, char *data);
 static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int qnx6_show_options(struct seq_file *seq, struct dentry *root);
 
 static const struct super_operations qnx6_sops = {
 	.alloc_inode	= qnx6_alloc_inode,
-	.destroy_inode	= qnx6_destroy_inode,
+	.free_inode	= qnx6_free_inode,
 	.put_super	= qnx6_put_super,
 	.statfs		= qnx6_statfs,
 	.remount_fs	= qnx6_remount,
@@ -428,6 +429,8 @@
 	s->s_op = &qnx6_sops;
 	s->s_magic = QNX6_SUPER_MAGIC;
 	s->s_flags |= SB_RDONLY;        /* Yup, read-only yet */
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
 
 	/* ease the later tree level calculations */
 	sbi = QNX6_SB(s);
@@ -602,17 +605,11 @@
 	return &ei->vfs_inode;
 }
 
-static void qnx6_i_callback(struct rcu_head *head)
+static void qnx6_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode));
 }
 
-static void qnx6_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, qnx6_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 4a09975..7218314 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 #  Quota configuration
 #
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index fc20e06..6e826b4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -9,7 +9,7 @@
  * on the Melbourne quota system as used on BSD derived systems. The internal
  * implementation is based on one of the several variants of the LINUX
  * inode-subsystem with added complexity of the diskquota system.
- * 
+ *
  * Author:	Marco van Wieringen <mvw@planets.elm.net>
  *
  * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
@@ -51,7 +51,7 @@
  *		Added journalled quota support, fix lock inversion problems
  *		Jan Kara, <jack@suse.cz>, 2003,2004
  *
- * (C) Copyright 1994 - 1997 Marco van Wieringen 
+ * (C) Copyright 1994 - 1997 Marco van Wieringen
  */
 
 #include <linux/errno.h>
@@ -197,7 +197,7 @@
 		int qm;
 
 		spin_unlock(&dq_list_lock);
-		
+
 		for (qm = 0; module_names[qm].qm_fmt_id &&
 			     module_names[qm].qm_fmt_id != id; qm++)
 			;
@@ -223,9 +223,9 @@
 
 /*
  * Dquot List Management:
- * The quota code uses three lists for dquot management: the inuse_list,
- * free_dquots, and dquot_hash[] array. A single dquot structure may be
- * on all three lists, depending on its current state.
+ * The quota code uses four lists for dquot management: the inuse_list,
+ * free_dquots, dqi_dirty_list, and dquot_hash[] array. A single dquot
+ * structure may be on some of those lists, depending on its current state.
  *
  * All dquots are placed to the end of inuse_list when first created, and this
  * list is used for invalidate operation, which must look at every dquot.
@@ -236,6 +236,11 @@
  * dqstats.free_dquots gives the number of dquots on the list. When
  * dquot is invalidated it's completely released from memory.
  *
+ * Dirty dquots are added to the dqi_dirty_list of quota_info when marked
+ * dirty, and this list is searched when writing dirty dquots back to the
+ * quota file. Note that some filesystems do dirty dquot tracking on their
+ * own (e.g. in a journal) and thus don't use dqi_dirty_list.
+ *
  * Dquots with a specific identity (device, type and id) are placed on
  * one of the dquot_hash[] hash chains. This provides an efficient search
  * mechanism to locate a specific dquot.
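
Editor's note: the four-list arrangement described above is easiest to see at the dirty-list transition. A simplified sketch of how a dquot reaches dqi_dirty_list, modeled on dquot_mark_dquot_dirty() in this file with the dq_list_lock locking omitted (not a verbatim excerpt):

    static int mark_dirty_sketch(struct dquot *dquot)
    {
            struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

            /* already dirty? then it is already queued for writeback */
            if (test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
                    return 1;
            list_add(&dquot->dq_dirty,
                     &dqopt->info[dquot->dq_id.type].dqi_dirty_list);
            return 0;
    }
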
@@ -424,10 +429,11 @@
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 
 	mutex_lock(&dquot->dq_lock);
-	if (!test_bit(DQ_READ_B, &dquot->dq_flags))
+	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
 		ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
-	if (ret < 0)
-		goto out_iolock;
+		if (ret < 0)
+			goto out_iolock;
+	}
 	/* Make sure flags update is visible after dquot has been filled */
 	smp_mb__before_atomic();
 	set_bit(DQ_READ_B, &dquot->dq_flags);
@@ -1049,7 +1055,9 @@
 		struct list_head *tofree_head)
 {
 	struct inode *inode;
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
+#endif
 
 	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -1061,8 +1069,10 @@
 		 */
 		spin_lock(&dq_data_lock);
 		if (!IS_NOQUOTA(inode)) {
+#ifdef CONFIG_QUOTA_DEBUG
 			if (unlikely(inode_get_rsv_space(inode) > 0))
 				reserved = 1;
+#endif
 			remove_inode_dquot_ref(inode, type, tofree_head);
 		}
 		spin_unlock(&dq_data_lock);
@@ -1663,7 +1673,7 @@
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!dquots[cnt])
 			continue;
-		if (flags & DQUOT_SPACE_RESERVE) {
+		if (reserve) {
 			ret = dquot_add_space(dquots[cnt], 0, number, flags,
 					      &warn[cnt]);
 		} else {
@@ -1676,13 +1686,11 @@
 				if (!dquots[cnt])
 					continue;
 				spin_lock(&dquots[cnt]->dq_dqb_lock);
-				if (flags & DQUOT_SPACE_RESERVE) {
-					dquots[cnt]->dq_dqb.dqb_rsvspace -=
-									number;
-				} else {
-					dquots[cnt]->dq_dqb.dqb_curspace -=
-									number;
-				}
+				if (reserve)
+					dquot_free_reserved_space(dquots[cnt],
+								  number);
+				else
+					dquot_decr_space(dquots[cnt], number);
 				spin_unlock(&dquots[cnt]->dq_dqb_lock);
 			}
 			spin_unlock(&inode->i_lock);
@@ -1733,7 +1741,7 @@
 					continue;
 				/* Back out changes we already did */
 				spin_lock(&dquots[cnt]->dq_dqb_lock);
-				dquots[cnt]->dq_dqb.dqb_curinodes--;
+				dquot_decr_inodes(dquots[cnt], 1);
 				spin_unlock(&dquots[cnt]->dq_dqb_lock);
 			}
 			goto warn_put_all;
@@ -1993,8 +2001,8 @@
 				       &warn_to[cnt]);
 		if (ret)
 			goto over_quota;
-		ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
-				      &warn_to[cnt]);
+		ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
+				      DQUOT_SPACE_WARN, &warn_to[cnt]);
 		if (ret) {
 			spin_lock(&transfer_to[cnt]->dq_dqb_lock);
 			dquot_decr_inodes(transfer_to[cnt], inode_usage);
@@ -2397,7 +2405,7 @@
 out_fmt:
 	put_quota_format(fmt);
 
-	return error; 
+	return error;
 }
 
 /* Reenable quotas on remount RW */
@@ -2723,7 +2731,7 @@
 
 	if (check_blim) {
 		if (!dm->dqb_bsoftlimit ||
-		    dm->dqb_curspace + dm->dqb_rsvspace < dm->dqb_bsoftlimit) {
+		    dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 		} else if (!(di->d_fieldmask & QC_SPC_TIMER))
@@ -2732,7 +2740,7 @@
 	}
 	if (check_ilim) {
 		if (!dm->dqb_isoftlimit ||
-		    dm->dqb_curinodes < dm->dqb_isoftlimit) {
+		    dm->dqb_curinodes <= dm->dqb_isoftlimit) {
 			dm->dqb_itime = 0;
 			clear_bit(DQ_INODES_B, &dquot->dq_flags);
 		} else if (!(di->d_fieldmask & QC_INO_TIMER))
@@ -2775,7 +2783,7 @@
 	struct qc_type_state *tstate;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int type;
-  
+
 	memset(state, 0, sizeof(*state));
 	for (type = 0; type < MAXQUOTAS; type++) {
 		if (!sb_has_quota_active(sb, type))
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index f0cbf58..cb13fb7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -331,9 +331,9 @@
 	return flags;
 }
 
-static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
+static int quota_getstate(struct super_block *sb, int type,
+			  struct fs_quota_stat *fqs)
 {
-	int type;
 	struct qc_state state;
 	int ret;
 
@@ -349,14 +349,7 @@
 	if (!fqs->qs_flags)
 		return -ENOSYS;
 	fqs->qs_incoredqs = state.s_incoredqs;
-	/*
-	 * GETXSTATE quotactl has space for just one set of time limits so
-	 * report them for the first enabled quota type
-	 */
-	for (type = 0; type < MAXQUOTAS; type++)
-		if (state.s_state[type].flags & QCI_ACCT_ENABLED)
-			break;
-	BUG_ON(type == MAXQUOTAS);
+
 	fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
 	fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
 	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
@@ -391,22 +384,22 @@
 	return 0;
 }
 
-static int quota_getxstate(struct super_block *sb, void __user *addr)
+static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
 {
 	struct fs_quota_stat fqs;
 	int ret;
 
 	if (!sb->s_qcop->get_state)
 		return -ENOSYS;
-	ret = quota_getstate(sb, &fqs);
+	ret = quota_getstate(sb, type, &fqs);
 	if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
 		return -EFAULT;
 	return ret;
 }
 
-static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
+static int quota_getstatev(struct super_block *sb, int type,
+			   struct fs_quota_statv *fqs)
 {
-	int type;
 	struct qc_state state;
 	int ret;
 
@@ -422,14 +415,7 @@
 	if (!fqs->qs_flags)
 		return -ENOSYS;
 	fqs->qs_incoredqs = state.s_incoredqs;
-	/*
-	 * GETXSTATV quotactl has space for just one set of time limits so
-	 * report them for the first enabled quota type
-	 */
-	for (type = 0; type < MAXQUOTAS; type++)
-		if (state.s_state[type].flags & QCI_ACCT_ENABLED)
-			break;
-	BUG_ON(type == MAXQUOTAS);
+
 	fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
 	fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
 	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
@@ -455,7 +441,7 @@
 	return 0;
 }
 
-static int quota_getxstatev(struct super_block *sb, void __user *addr)
+static int quota_getxstatev(struct super_block *sb, int type, void __user *addr)
 {
 	struct fs_quota_statv fqs;
 	int ret;
@@ -474,7 +460,7 @@
 	default:
 		return -EINVAL;
 	}
-	ret = quota_getstatev(sb, &fqs);
+	ret = quota_getstatev(sb, type, &fqs);
 	if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
 		return -EFAULT;
 	return ret;
@@ -744,9 +730,9 @@
 	case Q_XQUOTARM:
 		return quota_rmxquota(sb, addr);
 	case Q_XGETQSTAT:
-		return quota_getxstate(sb, addr);
+		return quota_getxstate(sb, type, addr);
 	case Q_XGETQSTATV:
-		return quota_getxstatev(sb, addr);
+		return quota_getxstatev(sb, type, addr);
 	case Q_XSETQLIM:
 		return quota_setxquota(sb, type, id, addr);
 	case Q_XGETQUOTA:
@@ -791,7 +777,8 @@
 /* Return true if quotactl command is manipulating quota on/off state */
 static bool quotactl_cmd_onoff(int cmd)
 {
-	return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF);
+	return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) ||
+		 (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF);
 }
 
 /*
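
Editor's note: with the type argument now threaded through, the XFS-style state calls report the limits of the quota type the caller actually asked for, rather than scanning for the first enabled type. A hedged userspace illustration (the device path and show_grace() helper are hypothetical):

    #include <stdio.h>
    #include <sys/quota.h>
    #include <linux/dqblk_xfs.h>  /* Q_XGETQSTAT, struct fs_quota_stat */

    /* sketch: print the group-quota block grace period for one device */
    static void show_grace(const char *dev)
    {
            struct fs_quota_stat fqs;

            if (quotactl(QCMD(Q_XGETQSTAT, GRPQUOTA), dev, 0,
                         (void *)&fqs) == 0)
                    printf("block grace period: %d seconds\n",
                           (int)fqs.qs_btimelimit);
    }
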
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index bb3f59b..a6f856f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *	vfsv0 quota IO operations on file
  */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 7ac5298..c740e55 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/quota.h>
@@ -127,7 +128,7 @@
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	ulong blocks;
-	size_t off; 
+	size_t off;
 	struct v2_disk_dqheader dqhead;
 	ssize_t size;
 	loff_t isize;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index a73e5b3..53429c2 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *	vfsv0 quota IO operations on file
  */
@@ -78,7 +79,7 @@
 	struct v2_disk_dqheader dqhead;
 	static const uint quota_magics[] = V2_INITQMAGICS;
 	static const uint quota_versions[] = V2_INITQVERSIONS;
- 
+
 	if (v2_read_header(sb, type, &dqhead))
 		return 0;
 	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index c71e65d..d1198ad 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux ramfs routines.
 #
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 3ac1f23..4146954 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* file-nommu.c: no-MMU version of ramfs
  *
  * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/module.h>
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2..d82636e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -36,6 +36,8 @@
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include "internal.h"
 
 struct ramfs_mount_opts {
@@ -175,62 +177,52 @@
 	.show_options	= ramfs_show_options,
 };
 
-enum {
+enum ramfs_param {
 	Opt_mode,
-	Opt_err
 };
 
-static const match_table_t tokens = {
-	{Opt_mode, "mode=%o"},
-	{Opt_err, NULL}
+static const struct fs_parameter_spec ramfs_param_specs[] = {
+	fsparam_u32oct("mode",	Opt_mode),
+	{}
 };
 
-static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+const struct fs_parameter_description ramfs_fs_parameters = {
+	.name		= "ramfs",
+	.specs		= ramfs_param_specs,
+};
+
+static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	int token;
-	char *p;
+	struct fs_parse_result result;
+	struct ramfs_fs_info *fsi = fc->s_fs_info;
+	int opt;
 
-	opts->mode = RAMFS_DEFAULT_MODE;
-
-	while ((p = strsep(&data, ",")) != NULL) {
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_mode:
-			if (match_octal(&args[0], &option))
-				return -EINVAL;
-			opts->mode = option & S_IALLUGO;
-			break;
+	opt = fs_parse(fc, &ramfs_fs_parameters, param, &result);
+	if (opt < 0) {
 		/*
 		 * We might like to report bad mount options here;
 		 * but traditionally ramfs has ignored all mount options,
 		 * and as it is used as a !CONFIG_SHMEM simple substitute
 		 * for tmpfs, better continue to ignore other mount options.
 		 */
-		}
+		if (opt == -ENOPARAM)
+			opt = 0;
+		return opt;
+	}
+
+	switch (opt) {
+	case Opt_mode:
+		fsi->mount_opts.mode = result.uint_32 & S_IALLUGO;
+		break;
 	}
 
 	return 0;
 }
 
-int ramfs_fill_super(struct super_block *sb, void *data, int silent)
+static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct ramfs_fs_info *fsi;
+	struct ramfs_fs_info *fsi = sb->s_fs_info;
 	struct inode *inode;
-	int err;
-
-	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
-	sb->s_fs_info = fsi;
-	if (!fsi)
-		return -ENOMEM;
-
-	err = ramfs_parse_options(data, &fsi->mount_opts);
-	if (err)
-		return err;
 
 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
 	sb->s_blocksize		= PAGE_SIZE;
@@ -247,10 +239,34 @@
 	return 0;
 }
 
-struct dentry *ramfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ramfs_get_tree(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, data, ramfs_fill_super);
+	return get_tree_nodev(fc, ramfs_fill_super);
+}
+
+static void ramfs_free_fc(struct fs_context *fc)
+{
+	kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations ramfs_context_ops = {
+	.free		= ramfs_free_fc,
+	.parse_param	= ramfs_parse_param,
+	.get_tree	= ramfs_get_tree,
+};
+
+int ramfs_init_fs_context(struct fs_context *fc)
+{
+	struct ramfs_fs_info *fsi;
+
+	fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+	if (!fsi)
+		return -ENOMEM;
+
+	fsi->mount_opts.mode = RAMFS_DEFAULT_MODE;
+	fc->s_fs_info = fsi;
+	fc->ops = &ramfs_context_ops;
+	return 0;
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -261,17 +277,14 @@
 
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
-	.mount		= ramfs_mount,
+	.init_fs_context = ramfs_init_fs_context,
+	.parameters	= &ramfs_fs_parameters,
 	.kill_sb	= ramfs_kill_sb,
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
-int __init init_ramfs_fs(void)
+static int __init init_ramfs_fs(void)
 {
-	static unsigned long once;
-
-	if (test_and_set_bit(0, &once))
-		return 0;
 	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
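
Editor's note: the ramfs rewrite is a textbook conversion to the fs_context mount API: option parsing moves from a match_table_t string scanner to a typed fs_parameter_spec table, and per-sb allocation moves from fill_super into init_fs_context. Stripped to its skeleton, the pattern the hunks above implement looks like this (condensed; myfs_* names are placeholders, not a compilable excerpt of the file):

    enum { Opt_mode };

    static const struct fs_parameter_spec myfs_param_specs[] = {
            fsparam_u32oct("mode", Opt_mode),  /* typed octal, no match_octal() */
            {}
    };

    static const struct fs_context_operations myfs_context_ops = {
            .free        = myfs_free_fc,      /* frees fc->s_fs_info on error paths */
            .parse_param = myfs_parse_param,  /* handles one typed option at a time */
            .get_tree    = myfs_get_tree,     /* get_tree_nodev(fc, myfs_fill_super) */
    };

    static int myfs_init_fs_context(struct fs_context *fc)
    {
            fc->s_fs_info = kzalloc(sizeof(struct myfs_fs_info), GFP_KERNEL);
            if (!fc->s_fs_info)
                    return -ENOMEM;
            fc->ops = &myfs_context_ops;
            return 0;
    }

From userspace nothing changes: mount -t ramfs -o mode=0700 ramfs /mnt still works; the octal value simply arrives pre-parsed in result.uint_32 instead of being scanned with match_octal().
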
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index a9d8ae8..3257fc1 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* internal.h: ramfs internal definitions
  *
  * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 8a2737f..5bbf587 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -331,7 +331,7 @@
 }
 #endif
 
-#ifdef __ARCH_WANT_SYS_LLSEEK
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned long, offset_low, loff_t __user *, result,
 		unsigned int, whence)
@@ -365,29 +365,37 @@
 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 {
 	struct inode *inode;
-	loff_t pos;
 	int retval = -EINVAL;
 
 	inode = file_inode(file);
 	if (unlikely((ssize_t) count < 0))
 		return retval;
-	pos = *ppos;
-	if (unlikely(pos < 0)) {
-		if (!unsigned_offsets(file))
-			return retval;
-		if (count >= -pos) /* both values are in 0..LLONG_MAX */
-			return -EOVERFLOW;
-	} else if (unlikely((loff_t) (pos + count) < 0)) {
-		if (!unsigned_offsets(file))
-			return retval;
+
+	/*
+	 * ranged mandatory locking does not apply to streams - it makes sense
+	 * only for files where position has a meaning.
+	 */
+	if (ppos) {
+		loff_t pos = *ppos;
+
+		if (unlikely(pos < 0)) {
+			if (!unsigned_offsets(file))
+				return retval;
+			if (count >= -pos) /* both values are in 0..LLONG_MAX */
+				return -EOVERFLOW;
+		} else if (unlikely((loff_t) (pos + count) < 0)) {
+			if (!unsigned_offsets(file))
+				return retval;
+		}
+
+		if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+			retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
+					read_write == READ ? F_RDLCK : F_WRLCK);
+			if (retval < 0)
+				return retval;
+		}
 	}
 
-	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
-		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
-				read_write == READ ? F_RDLCK : F_WRLCK);
-		if (retval < 0)
-			return retval;
-	}
 	return security_file_permission(file,
 				read_write == READ ? MAY_READ : MAY_WRITE);
 }
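
Editor's note: the NULL-ppos convention introduced here exists for FMODE_STREAM files, where a position is meaningless; such files opt in at open time. A minimal sketch of a character-device open producing a stream fd (the mydev naming is hypothetical; stream_open() is the real helper from fs/open.c):

    static int mydev_open(struct inode *inode, struct file *file)
    {
            /* sets FMODE_STREAM; subsequent reads and writes then arrive
             * with ppos == NULL and f_pos is neither read nor updated */
            return stream_open(inode, file);
    }
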
@@ -400,12 +408,13 @@
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
-	kiocb.ki_pos = *ppos;
+	kiocb.ki_pos = (ppos ? *ppos : 0);
 	iov_iter_init(&iter, READ, &iov, 1, len);
 
 	ret = call_read_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
-	*ppos = kiocb.ki_pos;
+	if (ppos)
+		*ppos = kiocb.ki_pos;
 	return ret;
 }
 
@@ -426,7 +435,7 @@
 	ssize_t result;
 
 	old_fs = get_fs();
-	set_fs(get_ds());
+	set_fs(KERNEL_DS);
 	/* The cast to a user pointer is valid due to the set_fs() */
 	result = vfs_read(file, (void __user *)buf, count, pos);
 	set_fs(old_fs);
@@ -442,7 +451,7 @@
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
-	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
+	if (unlikely(!access_ok(buf, count)))
 		return -EFAULT;
 
 	ret = rw_verify_area(READ, file, pos, count);
@@ -468,18 +477,18 @@
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
-	kiocb.ki_pos = *ppos;
+	kiocb.ki_pos = (ppos ? *ppos : 0);
 	iov_iter_init(&iter, WRITE, &iov, 1, len);
 
 	ret = call_write_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
-	if (ret > 0)
+	if (ret > 0 && ppos)
 		*ppos = kiocb.ki_pos;
 	return ret;
 }
 
-ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
-		    loff_t *pos)
+static ssize_t __vfs_write(struct file *file, const char __user *p,
+			   size_t count, loff_t *pos)
 {
 	if (file->f_op->write)
 		return file->f_op->write(file, p, count, pos);
@@ -499,7 +508,7 @@
 		return -EINVAL;
 
 	old_fs = get_fs();
-	set_fs(get_ds());
+	set_fs(KERNEL_DS);
 	p = (__force const char __user *)buf;
 	if (count > MAX_RW_COUNT)
 		count =  MAX_RW_COUNT;
@@ -521,7 +530,7 @@
 	ssize_t res;
 
 	old_fs = get_fs();
-	set_fs(get_ds());
+	set_fs(KERNEL_DS);
 	/* The cast to a user pointer is valid due to the set_fs() */
 	res = vfs_write(file, (__force const char __user *)buf, count, pos);
 	set_fs(old_fs);
@@ -538,7 +547,7 @@
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
-	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+	if (unlikely(!access_ok(buf, count)))
 		return -EFAULT;
 
 	ret = rw_verify_area(WRITE, file, pos, count);
@@ -558,14 +567,10 @@
 	return ret;
 }
 
-static inline loff_t file_pos_read(struct file *file)
+/* file_ppos returns &file->f_pos, or NULL if the file is a stream */
+static inline loff_t *file_ppos(struct file *file)
 {
-	return file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
-	file->f_pos = pos;
+	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 }
 
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
@@ -574,10 +579,14 @@
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos = file_pos_read(f.file);
-		ret = vfs_read(f.file, buf, count, &pos);
-		if (ret >= 0)
-			file_pos_write(f.file, pos);
+		loff_t pos, *ppos = file_ppos(f.file);
+		if (ppos) {
+			pos = *ppos;
+			ppos = &pos;
+		}
+		ret = vfs_read(f.file, buf, count, ppos);
+		if (ret >= 0 && ppos)
+			f.file->f_pos = pos;
 		fdput_pos(f);
 	}
 	return ret;
@@ -594,10 +603,14 @@
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos = file_pos_read(f.file);
-		ret = vfs_write(f.file, buf, count, &pos);
-		if (ret >= 0)
-			file_pos_write(f.file, pos);
+		loff_t pos, *ppos = file_ppos(f.file);
+		if (ppos) {
+			pos = *ppos;
+			ppos = &pos;
+		}
+		ret = vfs_write(f.file, buf, count, ppos);
+		if (ret >= 0 && ppos)
+			f.file->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -672,14 +685,15 @@
 	ret = kiocb_set_rw_flags(&kiocb, flags);
 	if (ret)
 		return ret;
-	kiocb.ki_pos = *ppos;
+	kiocb.ki_pos = (ppos ? *ppos : 0);
 
 	if (type == READ)
 		ret = call_read_iter(filp, &kiocb, iter);
 	else
 		ret = call_write_iter(filp, &kiocb, iter);
 	BUG_ON(ret == -EIOCBQUEUED);
-	*ppos = kiocb.ki_pos;
+	if (ppos)
+		*ppos = kiocb.ki_pos;
 	return ret;
 }
 
@@ -718,9 +732,6 @@
 	return ret;
 }
 
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
 /**
  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
  *     into the kernel and check that it is valid.
@@ -810,7 +821,7 @@
 			goto out;
 		}
 		if (type >= 0
-		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+		    && unlikely(!access_ok(buf, len))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -856,7 +867,7 @@
 	*ret_pointer = iov;
 
 	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 		goto out;
 
 	/*
@@ -881,7 +892,7 @@
 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 			goto out;
 		if (type >= 0 &&
-		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+		    !access_ok(compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1015,10 +1026,14 @@
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos = file_pos_read(f.file);
-		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
-		if (ret >= 0)
-			file_pos_write(f.file, pos);
+		loff_t pos, *ppos = file_ppos(f.file);
+		if (ppos) {
+			pos = *ppos;
+			ppos = &pos;
+		}
+		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+		if (ret >= 0 && ppos)
+			f.file->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -1035,10 +1050,14 @@
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos = file_pos_read(f.file);
-		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
-		if (ret >= 0)
-			file_pos_write(f.file, pos);
+		loff_t pos, *ppos = file_ppos(f.file);
+		if (ppos) {
+			pos = *ppos;
+			ppos = &pos;
+		}
+		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+		if (ret >= 0 && ppos)
+			f.file->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -1241,6 +1260,9 @@
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
+	if (pos == -1)
+		return do_compat_readv(fd, vec, vlen, flags);
+
 	return do_compat_preadv64(fd, vec, vlen, pos, flags);
 }
 #endif
@@ -1347,6 +1369,9 @@
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
+	if (pos == -1)
+		return do_compat_writev(fd, vec, vlen, flags);
+
 	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
 }
 #endif
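
Editor's note: these two compat fixes restore the documented preadv2()/pwritev2() behavior for an offset of -1, which means "use and update the current file position", matching what the native entry points already did. A hedged userspace illustration (read_here() is a hypothetical wrapper):

    #define _GNU_SOURCE
    #include <sys/uio.h>

    /* sketch: vectored read at the current position; offset -1 is the
     * "use and advance f_pos" case the compat paths above now honor */
    static ssize_t read_here(int fd, void *buf, size_t len)
    {
            struct iovec iov = { .iov_base = buf, .iov_len = len };

            return preadv2(fd, &iov, 1, -1, 0);
    }
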
@@ -1407,7 +1432,6 @@
 		goto fput_in;
 	if (!(out.file->f_mode & FMODE_WRITE))
 		goto fput_out;
-	retval = -EINVAL;
 	in_inode = file_inode(in.file);
 	out_inode = file_inode(out.file);
 	out_pos = out.file->f_pos;
@@ -1541,6 +1565,58 @@
 }
 #endif
 
+/**
+ * generic_copy_file_range - copy data between two files
+ * @file_in:	file structure to read from
+ * @pos_in:	file offset to read from
+ * @file_out:	file structure to write data to
+ * @pos_out:	file offset to write data to
+ * @len:	amount of data to copy
+ * @flags:	copy flags
+ *
+ * This is a generic filesystem helper to copy data from one file to another.
+ * It has no constraints on the source or destination file owners - the files
+ * can belong to different superblocks and different filesystem types. Short
+ * copies are allowed.
+ *
+ * This should be called from the @file_out filesystem, as per the
+ * ->copy_file_range() method.
+ *
+ * Returns the number of bytes copied or a negative error indicating the
+ * failure.
+ */
+
+ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				size_t len, unsigned int flags)
+{
+	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+}
+EXPORT_SYMBOL(generic_copy_file_range);
+
+static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  size_t len, unsigned int flags)
+{
+	/*
+	 * Although we now allow filesystems to handle cross-sb copies, passing
+	 * a file of the wrong filesystem type to a filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range &&
+	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
+		return file_out->f_op->copy_file_range(file_in, pos_in,
+						       file_out, pos_out,
+						       len, flags);
+
+	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+				       flags);
+}
+
 /*
  * copy_file_range() differs from regular file read and write in that it
  * specifically allows returning partial success.  When it does so is up to
@@ -1550,17 +1626,15 @@
 			    struct file *file_out, loff_t pos_out,
 			    size_t len, unsigned int flags)
 {
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
 	ssize_t ret;
 
 	if (flags != 0)
 		return -EINVAL;
 
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
+	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
+				       flags);
+	if (unlikely(ret))
+		return ret;
 
 	ret = rw_verify_area(READ, file_in, &pos_in, len);
 	if (unlikely(ret))
@@ -1570,15 +1644,6 @@
 	if (unlikely(ret))
 		return ret;
 
-	if (!(file_in->f_mode & FMODE_READ) ||
-	    !(file_out->f_mode & FMODE_WRITE) ||
-	    (file_out->f_flags & O_APPEND))
-		return -EBADF;
-
-	/* this could be relaxed once a method supports cross-fs copies */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-
 	if (len == 0)
 		return 0;
 
@@ -1588,25 +1653,23 @@
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
 	 */
-	if (file_in->f_op->clone_file_range) {
-		ret = file_in->f_op->clone_file_range(file_in, pos_in,
-				file_out, pos_out, len);
-		if (ret == 0) {
-			ret = len;
+	if (file_in->f_op->remap_file_range &&
+	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+		loff_t cloned;
+
+		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+				file_out, pos_out,
+				min_t(loff_t, MAX_RW_COUNT, len),
+				REMAP_FILE_CAN_SHORTEN);
+		if (cloned > 0) {
+			ret = cloned;
 			goto done;
 		}
 	}
 
-	if (file_out->f_op->copy_file_range) {
-		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
-						      pos_out, len, flags);
-		if (ret != -EOPNOTSUPP)
-			goto done;
-	}
-
-	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-
+	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+				flags);
+	WARN_ON_ONCE(ret == -EOPNOTSUPP);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
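
Editor's note: taken together, the new checks plus do_copy_file_range() mean a copy_file_range() call may now cross superblocks, falling back to the splice-based generic copy instead of failing with EXDEV. A hedged userspace sketch (copy_some() is hypothetical; the fds are assumed to be open regular files):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>

    /* sketch: copy len bytes between two open files, letting the kernel
     * pick a clone, a driver copy, or the splice fallback */
    static ssize_t copy_some(int fd_in, int fd_out, size_t len)
    {
            /* NULL offsets: use and advance both file positions */
            ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);

            if (n < 0)
                    perror("copy_file_range");
            return n;  /* may be a short copy; callers should loop */
    }
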
@@ -1686,11 +1749,12 @@
 	return ret;
 }
 
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+			     bool write)
 {
 	struct inode *inode = file_inode(file);
 
-	if (unlikely(pos < 0))
+	if (unlikely(pos < 0 || len < 0))
 		return -EINVAL;
 
 	 if (unlikely((loff_t) (pos + len) < 0))
@@ -1708,209 +1772,90 @@
 
 	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
-
 /*
- * Check that the two inodes are eligible for cloning, the ranges make
- * sense, and then flush all dirty data.  Caller must ensure that the
- * inodes have been locked against any other modifications.
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else.  Assume that the offsets have already been checked for block
+ * alignment.
  *
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
- * the usual negative error code.
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ *
+ * Shorten the request if possible.
  */
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-			       struct inode *inode_out, loff_t pos_out,
-			       u64 *len, bool is_dedupe)
+static int generic_remap_check_len(struct inode *inode_in,
+				   struct inode *inode_out,
+				   loff_t pos_out,
+				   loff_t *len,
+				   unsigned int remap_flags)
 {
-	loff_t bs = inode_out->i_sb->s_blocksize;
-	loff_t blen;
-	loff_t isize;
-	bool same_inode = (inode_in == inode_out);
-	int ret;
+	u64 blkmask = i_blocksize(inode_in) - 1;
+	loff_t new_len = *len;
 
-	/* Don't touch certain kinds of inodes */
-	if (IS_IMMUTABLE(inode_out))
-		return -EPERM;
-
-	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-		return -ETXTBSY;
-
-	/* Don't reflink dirs, pipes, sockets... */
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
-	/* Are we going all the way to the end? */
-	isize = i_size_read(inode_in);
-	if (isize == 0)
+	if ((*len & blkmask) == 0)
 		return 0;
 
-	/* Zero length dedupe exits immediately; reflink goes to EOF. */
-	if (*len == 0) {
-		if (is_dedupe || pos_in == isize)
-			return 0;
-		if (pos_in > isize)
-			return -EINVAL;
-		*len = isize - pos_in;
+	if ((remap_flags & REMAP_FILE_DEDUP) ||
+	    pos_out + *len < i_size_read(inode_out))
+		new_len &= ~blkmask;
+
+	if (new_len == *len)
+		return 0;
+
+	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+		*len = new_len;
+		return 0;
 	}
 
-	/* Ensure offsets don't wrap and the input is inside i_size */
-	if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-	    pos_in + *len > isize)
-		return -EINVAL;
-
-	/* Don't allow dedupe past EOF in the dest file */
-	if (is_dedupe) {
-		loff_t	disize;
-
-		disize = i_size_read(inode_out);
-		if (pos_out >= disize || pos_out + *len > disize)
-			return -EINVAL;
-	}
-
-	/* If we're linking to EOF, continue to the block boundary. */
-	if (pos_in + *len == isize)
-		blen = ALIGN(isize, bs) - pos_in;
-	else
-		blen = *len;
-
-	/* Only reflink if we're aligned to block boundaries */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-		return -EINVAL;
-
-	/* Don't allow overlapped reflink within the same file */
-	if (same_inode) {
-		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-			return -EINVAL;
-	}
-
-	/* Wait for the completion of any pending IOs on both files */
-	inode_dio_wait(inode_in);
-	if (!same_inode)
-		inode_dio_wait(inode_out);
-
-	ret = filemap_write_and_wait_range(inode_in->i_mapping,
-			pos_in, pos_in + *len - 1);
-	if (ret)
-		return ret;
-
-	ret = filemap_write_and_wait_range(inode_out->i_mapping,
-			pos_out, pos_out + *len - 1);
-	if (ret)
-		return ret;
-
-	/*
-	 * Check that the extents are the same.
-	 */
-	if (is_dedupe) {
-		bool		is_same = false;
-
-		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
-				inode_out, pos_out, *len, &is_same);
-		if (ret)
-			return ret;
-		if (!is_same)
-			return -EBADE;
-	}
-
-	return 1;
+	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
 
-int do_clone_file_range(struct file *file_in, loff_t pos_in,
-			struct file *file_out, loff_t pos_out, u64 len)
-{
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
-	int ret;
-
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
-	/*
-	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
-	 * the same mount. Practically, they only need to be on the same file
-	 * system.
-	 */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-
-	if (!(file_in->f_mode & FMODE_READ) ||
-	    !(file_out->f_mode & FMODE_WRITE) ||
-	    (file_out->f_flags & O_APPEND))
-		return -EBADF;
-
-	if (!file_in->f_op->clone_file_range)
-		return -EOPNOTSUPP;
-
-	ret = clone_verify_area(file_in, pos_in, len, false);
-	if (ret)
-		return ret;
-
-	ret = clone_verify_area(file_out, pos_out, len, true);
-	if (ret)
-		return ret;
-
-	if (pos_in + len > i_size_read(inode_in))
-		return -EINVAL;
-
-	ret = file_in->f_op->clone_file_range(file_in, pos_in,
-			file_out, pos_out, len);
-	if (!ret) {
-		fsnotify_access(file_in);
-		fsnotify_modify(file_out);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(do_clone_file_range);
-
-int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-			 struct file *file_out, loff_t pos_out, u64 len)
-{
-	int ret;
-
-	file_start_write(file_out);
-	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
-	file_end_write(file_out);
-
-	return ret;
-}
-EXPORT_SYMBOL(vfs_clone_file_range);
-
-/*
- * Read a page's worth of file data into the page cache.  Return the page
- * locked.
- */
+/* Read a page's worth of file data into the page cache. */
 static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
 {
-	struct address_space *mapping;
 	struct page *page;
-	pgoff_t n;
 
-	n = offset >> PAGE_SHIFT;
-	mapping = inode->i_mapping;
-	page = read_mapping_page(mapping, n, NULL);
+	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
 	if (IS_ERR(page))
 		return page;
 	if (!PageUptodate(page)) {
 		put_page(page);
 		return ERR_PTR(-EIO);
 	}
-	lock_page(page);
 	return page;
 }
 
 /*
+ * Lock two pages, ensuring that we lock in offset order if the pages are from
+ * the same file.
+ */
+static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+{
+	/* Always lock in order of increasing index. */
+	if (page1->index > page2->index)
+		swap(page1, page2);
+
+	lock_page(page1);
+	if (page1 != page2)
+		lock_page(page2);
+}
+
+/* Unlock two pages, being careful not to unlock the same page twice. */
+static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+{
+	unlock_page(page1);
+	if (page1 != page2)
+		unlock_page(page2);
+}
+
+/*
  * Compare extents of two files to see if they are the same.
  * Caller must have locked both inodes to prevent write races.
  */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-				  struct inode *dest, loff_t destoff,
-				  loff_t len, bool *is_same)
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+					 struct inode *dest, loff_t destoff,
+					 loff_t len, bool *is_same)
 {
 	loff_t src_poff;
 	loff_t dest_poff;
@@ -1941,10 +1886,24 @@
 		dest_page = vfs_dedupe_get_page(dest, destoff);
 		if (IS_ERR(dest_page)) {
 			error = PTR_ERR(dest_page);
-			unlock_page(src_page);
 			put_page(src_page);
 			goto out_error;
 		}
+
+		vfs_lock_two_pages(src_page, dest_page);
+
+		/*
+		 * Now that we've locked both pages, make sure they're still
+		 * mapped to the file data we're interested in.  If not,
+		 * someone is invalidating pages on us and we lose.
+		 */
+		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
+		    src_page->mapping != src->i_mapping ||
+		    dest_page->mapping != dest->i_mapping) {
+			same = false;
+			goto unlock;
+		}
+
 		src_addr = kmap_atomic(src_page);
 		dest_addr = kmap_atomic(dest_page);
 
@@ -1956,8 +1915,8 @@
 
 		kunmap_atomic(dest_addr);
 		kunmap_atomic(src_addr);
-		unlock_page(dest_page);
-		unlock_page(src_page);
+unlock:
+		vfs_unlock_two_pages(src_page, dest_page);
 		put_page(dest_page);
 		put_page(src_page);
 
@@ -1975,23 +1934,188 @@
 out_error:
 	return error;
 }
-EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 
-int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-			      struct file *dst_file, loff_t dst_pos, u64 len)
+/*
+ * Check that the two inodes are eligible for cloning, the ranges make
+ * sense, and then flush all dirty data.  Caller must ensure that the
+ * inodes have been locked against any other modifications.
+ *
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
+ */
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *len, unsigned int remap_flags)
 {
-	s64 ret;
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	bool same_inode = (inode_in == inode_out);
+	int ret;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode_out))
+		return -EPERM;
+
+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+		return -ETXTBSY;
+
+	/* Don't reflink dirs, pipes, sockets... */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	/* Zero length dedupe exits immediately; reflink goes to EOF. */
+	if (*len == 0) {
+		loff_t isize = i_size_read(inode_in);
+
+		if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
+			return 0;
+		if (pos_in > isize)
+			return -EINVAL;
+		*len = isize - pos_in;
+		if (*len == 0)
+			return 0;
+	}
+
+	/* Check that we don't violate system file offset limits. */
+	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+			remap_flags);
+	if (ret)
+		return ret;
+
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	ret = filemap_write_and_wait_range(inode_in->i_mapping,
+			pos_in, pos_in + *len - 1);
+	if (ret)
+		return ret;
+
+	ret = filemap_write_and_wait_range(inode_out->i_mapping,
+			pos_out, pos_out + *len - 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Check that the extents are the same.
+	 */
+	if (remap_flags & REMAP_FILE_DEDUP) {
+		bool		is_same = false;
+
+		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+				inode_out, pos_out, *len, &is_same);
+		if (ret)
+			return ret;
+		if (!is_same)
+			return -EBADE;
+	}
+
+	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+			remap_flags);
+	if (ret)
+		return ret;
+
+	/* If we can't alter the file contents, we're done. */
+	if (!(remap_flags & REMAP_FILE_DEDUP))
+		ret = file_modified(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_remap_file_range_prep);
+
+loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+			   struct file *file_out, loff_t pos_out,
+			   loff_t len, unsigned int remap_flags)
+{
+	loff_t ret;
+
+	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
+
+	/*
+	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
+	 * the same mount. Practically, they only need to be on the same file
+	 * system.
+	 */
+	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+		return -EXDEV;
+
+	ret = generic_file_rw_checks(file_in, file_out);
+	if (ret < 0)
+		return ret;
+
+	if (!file_in->f_op->remap_file_range)
+		return -EOPNOTSUPP;
+
+	ret = remap_verify_area(file_in, pos_in, len, false);
+	if (ret)
+		return ret;
+
+	ret = remap_verify_area(file_out, pos_out, len, true);
+	if (ret)
+		return ret;
+
+	ret = file_in->f_op->remap_file_range(file_in, pos_in,
+			file_out, pos_out, len, remap_flags);
+	if (ret < 0)
+		return ret;
+
+	fsnotify_access(file_in);
+	fsnotify_modify(file_out);
+	return ret;
+}
+EXPORT_SYMBOL(do_clone_file_range);
+
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    loff_t len, unsigned int remap_flags)
+{
+	loff_t ret;
+
+	file_start_write(file_out);
+	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+				  remap_flags);
+	file_end_write(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+/* Check whether we are allowed to dedupe the destination file */
+static bool allow_file_dedupe(struct file *file)
+{
+	if (capable(CAP_SYS_ADMIN))
+		return true;
+	if (file->f_mode & FMODE_WRITE)
+		return true;
+	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
+		return true;
+	if (!inode_permission(file_inode(file), MAY_WRITE))
+		return true;
+	return false;
+}
+
+loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+				 struct file *dst_file, loff_t dst_pos,
+				 loff_t len, unsigned int remap_flags)
+{
+	loff_t ret;
+
+	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+				     REMAP_FILE_CAN_SHORTEN));
 
 	ret = mnt_want_write_file(dst_file);
 	if (ret)
 		return ret;
 
-	ret = clone_verify_area(dst_file, dst_pos, len, true);
+	ret = remap_verify_area(dst_file, dst_pos, len, true);
 	if (ret < 0)
 		goto out_drop_write;
 
-	ret = -EINVAL;
-	if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
+	ret = -EPERM;
+	if (!allow_file_dedupe(dst_file))
 		goto out_drop_write;
 
 	ret = -EXDEV;
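
Editor's note: do_clone_file_range() is the kernel side of the FICLONE/FICLONERANGE ioctls, and the EXDEV check above is why reflinks still require a single filesystem even though plain copies no longer do. A hedged userspace sketch (reflink_all() is hypothetical; both fds are assumed to be on the same reflink-capable filesystem, e.g. XFS or Btrfs):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>  /* FICLONERANGE, struct file_clone_range */

    /* sketch: reflink all of fd_in over fd_out starting at offset 0 */
    static int reflink_all(int fd_in, int fd_out)
    {
            struct file_clone_range fcr = {
                    .src_fd      = fd_in,
                    .src_offset  = 0,
                    .src_length  = 0,  /* 0 means "share to EOF" */
                    .dest_offset = 0,
            };

            if (ioctl(fd_out, FICLONERANGE, &fcr) < 0) {
                    perror("FICLONERANGE");  /* EXDEV if sbs differ */
                    return -1;
            }
            return 0;
    }
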
@@ -2003,11 +2127,16 @@
 		goto out_drop_write;
 
 	ret = -EINVAL;
-	if (!dst_file->f_op->dedupe_file_range)
+	if (!dst_file->f_op->remap_file_range)
 		goto out_drop_write;
 
-	ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
-						dst_file, dst_pos, len);
+	if (len == 0) {
+		ret = 0;
+		goto out_drop_write;
+	}
+
+	ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+			dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
 out_drop_write:
 	mnt_drop_write_file(dst_file);
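
Editor's note: vfs_dedupe_file_range_one() backs the FIDEDUPERANGE ioctl, and the allow_file_dedupe() relaxation above is what lets an unprivileged user dedupe into a file they could otherwise write. A hedged userspace sketch (dedupe_one() is hypothetical):

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>  /* FIDEDUPERANGE, struct file_dedupe_range */

    /* sketch: ask the kernel to share one 4 KiB extent if contents match */
    static int dedupe_one(int fd_in, int fd_out)
    {
            struct file_dedupe_range *fdr;
            int ret = -1;

            fdr = calloc(1, sizeof(*fdr) +
                            sizeof(struct file_dedupe_range_info));
            if (!fdr)
                    return -1;
            fdr->src_offset = 0;
            fdr->src_length = 4096;
            fdr->dest_count = 1;
            fdr->info[0].dest_fd = fd_out;
            fdr->info[0].dest_offset = 0;

            if (ioctl(fd_in, FIDEDUPERANGE, fdr) == 0 &&
                fdr->info[0].status == FILE_DEDUPE_RANGE_SAME)
                    ret = 0;  /* info[0].bytes_deduped says how much */
            free(fdr);
            return ret;
    }
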
 
@@ -2024,7 +2153,7 @@
 	int i;
 	int ret;
 	u16 count = same->dest_count;
-	int deduped;
+	loff_t deduped;
 
 	if (!(file->f_mode & FMODE_READ))
 		return -EINVAL;
@@ -2035,17 +2164,18 @@
 	off = same->src_offset;
 	len = same->src_length;
 
-	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode))
-		goto out;
+		return -EISDIR;
 
-	ret = -EINVAL;
 	if (!S_ISREG(src->i_mode))
-		goto out;
+		return -EINVAL;
 
-	ret = clone_verify_area(file, off, len, false);
+	if (!file->f_op->remap_file_range)
+		return -EOPNOTSUPP;
+
+	ret = remap_verify_area(file, off, len, false);
 	if (ret < 0)
-		goto out;
+		return ret;
 	ret = 0;
 
 	if (off + len > i_size_read(src))
@@ -2075,7 +2205,8 @@
 		}
 
 		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-						    info->dest_offset, len);
+						    info->dest_offset, len,
+						    REMAP_FILE_CAN_SHORTEN);
 		if (deduped == -EBADE)
 			info->status = FILE_DEDUPE_RANGE_DIFFERS;
 		else if (deduped < 0)
@@ -2087,10 +2218,8 @@
 		fdput(dst_fd);
 next_loop:
 		if (fatal_signal_pending(current))
-			goto out;
+			break;
 	}
-
-out:
 	return ret;
 }
 EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/readdir.c b/fs/readdir.c
index d97f548..d26d5ea 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,9 +20,23 @@
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/compat.h>
-
 #include <linux/uaccess.h>
 
+#include <asm/unaligned.h>
+
+/*
+ * Note the "unsafe_put_user() semantics: we goto a
+ * label for errors.
+ */
+#define unsafe_copy_dirent_name(_dst, _src, _len, label) do {	\
+	char __user *dst = (_dst);				\
+	const char *src = (_src);				\
+	size_t len = (_len);					\
+	unsafe_put_user(0, dst+len, label);			\
+	unsafe_copy_to_user(dst, src, len, label);		\
+} while (0)
+
+
 int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
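
Editor's note: unsafe_copy_dirent_name() only makes sense inside a user_access_begin()/user_access_end() window, which is the batched-uaccess pattern the rewritten emitters below adopt: one up-front range check, then a series of unchecked stores that jump to a shared error label. Schematically (struct pair and fill_pair() are illustrative names, not kernel API):

    /* shape of the batched user-copy pattern used below */
    static int fill_pair(struct pair __user *udst, int a, int b)
    {
            if (!user_access_begin(udst, sizeof(*udst)))  /* one range check */
                    return -EFAULT;
            unsafe_put_user(a, &udst->first, efault);     /* unchecked stores */
            unsafe_put_user(b, &udst->second, efault);
            user_access_end();
            return 0;
    efault:
            user_access_end();
            return -EFAULT;
    }
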
@@ -65,6 +79,40 @@
 EXPORT_SYMBOL(iterate_dir);
 
 /*
+ * POSIX says that a dirent name cannot contain NULL or a '/'.
+ *
+ * It's not 100% clear what we should really do in this case.
+ * The filesystem is clearly corrupted, but returning a hard
+ * error means that you now don't see any of the other names
+ * either, so that isn't a perfect alternative.
+ *
+ * And if you return an error, what error do you use? Several
+ * filesystems seem to have decided on EUCLEAN being the error
+ * code for EFSCORRUPTED, and that may be the error to use. Or
+ * just EIO, which is perhaps more obvious to users.
+ *
+ * In order to see the other file names in the directory, the
+ * caller might want to make this a "soft" error: skip the
+ * entry, and return the error at the end instead.
+ *
+ * Note that this should likely do a "memchr(name, 0, len)"
+ * check too, since that would be filesystem corruption as
+ * well. However, that case can't actually confuse user space,
+ * which has to do a strlen() on the name anyway to find the
+ * filename length, and the above "soft error" worry means
+ * that it's probably better left alone until we have that
+ * issue clarified.
+ */
+static int verify_dirent_name(const char *name, int len)
+{
+	if (!len)
+		return -EIO;
+	if (memchr(name, '/', len))
+		return -EIO;
+	return 0;
+}
+
+/*
  * Traditional linux readdir() handling..
  *
  * "count=1" is a special case, meaning that the buffer is one
@@ -105,7 +153,7 @@
 	}
 	buf->result++;
 	dirent = buf->dirent;
-	if (!access_ok(VERIFY_WRITE, dirent,
+	if (!access_ok(dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
@@ -173,6 +221,9 @@
 	int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
 		sizeof(long));
 
+	buf->error = verify_dirent_name(name, namlen);
+	if (unlikely(buf->error))
+		return buf->error;
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
@@ -182,28 +233,31 @@
 		return -EOVERFLOW;
 	}
 	dirent = buf->previous;
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user(offset, &dirent->d_off))
-			goto efault;
-	}
+	if (dirent && signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Note! This range-checks 'previous' (which may be NULL).
+	 * The real range was checked in getdents
+	 */
+	if (!user_access_begin(dirent, sizeof(*dirent)))
+		goto efault;
+	if (dirent)
+		unsafe_put_user(offset, &dirent->d_off, efault_end);
 	dirent = buf->current_dir;
-	if (__put_user(d_ino, &dirent->d_ino))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
-		goto efault;
-	if (__put_user(d_type, (char __user *) dirent + reclen - 1))
-		goto efault;
+	unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
+	unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+	unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
+	unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+	user_access_end();
+
 	buf->previous = dirent;
 	dirent = (void __user *)dirent + reclen;
 	buf->current_dir = dirent;
 	buf->count -= reclen;
 	return 0;
+efault_end:
+	user_access_end();
 efault:
 	buf->error = -EFAULT;
 	return -EFAULT;
@@ -221,7 +275,7 @@
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
@@ -259,34 +313,38 @@
 	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
 		sizeof(u64));
 
+	buf->error = verify_dirent_name(name, namlen);
+	if (unlikely(buf->error))
+		return buf->error;
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
 	dirent = buf->previous;
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user(offset, &dirent->d_off))
-			goto efault;
-	}
+	if (dirent && signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Note! This range-checks 'previous' (which may be NULL).
+	 * The real range was checked in getdents
+	 */
+	if (!user_access_begin(dirent, sizeof(*dirent)))
+		goto efault;
+	if (dirent)
+		unsafe_put_user(offset, &dirent->d_off, efault_end);
 	dirent = buf->current_dir;
-	if (__put_user(ino, &dirent->d_ino))
-		goto efault;
-	if (__put_user(0, &dirent->d_off))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (__put_user(d_type, &dirent->d_type))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
-		goto efault;
+	unsafe_put_user(ino, &dirent->d_ino, efault_end);
+	unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+	unsafe_put_user(d_type, &dirent->d_type, efault_end);
+	unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+	user_access_end();
+
 	buf->previous = dirent;
 	dirent = (void __user *)dirent + reclen;
 	buf->current_dir = dirent;
 	buf->count -= reclen;
 	return 0;
+efault_end:
+	user_access_end();
 efault:
 	buf->error = -EFAULT;
 	return -EFAULT;
@@ -304,7 +362,7 @@
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
@@ -365,7 +423,7 @@
 	}
 	buf->result++;
 	dirent = buf->dirent;
-	if (!access_ok(VERIFY_WRITE, dirent,
+	if (!access_ok(dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
@@ -475,7 +533,7 @@
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 86e71c0..8fd54ed 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config REISERFS_FS
 	tristate "Reiserfs support"
 	select CRC32
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index a39a562..bd29c58 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -26,14 +26,5 @@
 reiserfs-objs += xattr_acl.o
 endif
 
-# gcc -O2 (the kernel default)  is overaggressive on ppc32 when many inline
-# functions are used.  This causes the compiler to advance the stack
-# pointer out of the available stack space, corrupting kernel space,
-# and causing a panic. Since this behavior only affects ppc32, this ifeq
-# will work around it. If any other architecture displays this behavior,
-# add it here.
-ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
-
 TAGS:
 	etags *.c
-
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 9c02d96..4075e41 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -239,10 +239,8 @@
 static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
 	struct buffer_info bi;
 	int n;
-	struct item_head *ih;
 
 	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
 	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
@@ -251,7 +249,6 @@
 	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
 	       "PAP-12010: tree can not be empty");
 
-	ih = item_head(tbS0, item_pos);
 	buffer_info_init_tbS0(tb, &bi);
 
 	/* Delete or truncate the item */
@@ -298,7 +295,6 @@
 	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 		/* part of new item falls into L[0] */
 		int new_item_len, shift;
-		int version;
 
 		ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
 
@@ -317,8 +313,6 @@
 		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
 			     min_t(int, tb->zeroes_num, ih_item_len(ih)));
 
-		version = ih_version(ih);
-
 		/*
 		 * Calculate key component, item length and body to
 		 * insert into S[0]
@@ -632,7 +626,6 @@
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tbS0);
 	struct buffer_info bi;
-	int ret;
 
 	/* new item or part of it doesn't fall into R[0] */
 	if (n - tb->rnum[0] >= tb->item_pos) {
@@ -646,13 +639,11 @@
 	if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
 		loff_t old_key_comp, old_len, r_zeroes_number;
 		const char *r_body;
-		int version, shift;
+		int shift;
 		loff_t offset;
 
 		leaf_shift_right(tb, tb->rnum[0] - 1, -1);
 
-		version = ih_version(ih);
-
 		/* Remember key component and item length */
 		old_key_comp = le_ih_k_offset(ih);
 		old_len = ih_item_len(ih);
@@ -698,7 +689,7 @@
 		/* whole new item falls into R[0] */
 
 		/* Shift rnum[0]-1 items to R[0] */
-		ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+		leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
 
 		/* Insert new item into R[0] */
 		buffer_info_init_right(tb, &bi);
@@ -950,14 +941,12 @@
 	if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
 		int old_key_comp, old_len, r_zeroes_number;
 		const char *r_body;
-		int version;
 
 		/* Move snum[i]-1 items from S[0] to S_new[i] */
 		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
 				tb->S_new[i]);
 
 		/* Remember key component and item length */
-		version = ih_version(ih);
 		old_key_comp = le_ih_k_offset(ih);
 		old_len = ih_item_len(ih);
 
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6b0ddb2..1170922 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -376,7 +376,6 @@
 		       int to, int to_bytes, short *snum012, int flow)
 {
 	int i;
-	int cur_free;
 	int units;
 	struct virtual_node *vn = tb->tb_vn;
 	int total_node_size, max_node_size, current_item_size;
@@ -438,7 +437,6 @@
 	/* leaf level */
 	needed_nodes = 1;
 	total_node_size = 0;
-	cur_free = max_node_size;
 
 	/* start from 'from'-th item */
 	start_item = from;
@@ -1734,14 +1732,12 @@
 	 * and Fh is its father.
 	 */
 	struct buffer_head *Sh, *Fh;
-	int maxsize, ret;
+	int ret;
 	int lfree, rfree /* free space in L and R */ ;
 
 	Sh = PATH_H_PBUFFER(tb->tb_path, h);
 	Fh = PATH_H_PPARENT(tb->tb_path, h);
 
-	maxsize = MAX_CHILD_SIZE(Sh);
-
 	/*
 	 * using tb->insert_size[h], which is negative in this case,
 	 * create_virtual_node calculates:
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index acbbaf7..45e1a5d 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -74,13 +74,11 @@
 				err = -EPERM;
 				goto setflags_out;
 			}
-			if (((flags ^ REISERFS_I(inode)->
-			      i_attrs) & (REISERFS_IMMUTABLE_FL |
-					  REISERFS_APPEND_FL))
-			    && !capable(CAP_LINUX_IMMUTABLE)) {
-				err = -EPERM;
+			err = vfs_ioc_setflags_prepare(inode,
+						     REISERFS_I(inode)->i_attrs,
+						     flags);
+			if (err)
 				goto setflags_out;
-			}
 			if ((flags & REISERFS_NOTAIL_FL) &&
 			    S_ISREG(inode->i_mode)) {
 				int result;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 8a76f9d..4b3e3e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -94,7 +94,7 @@
 			struct super_block *sb);
 static void release_journal_dev(struct super_block *super,
 			       struct reiserfs_journal *journal);
-static int dirty_one_transaction(struct super_block *s,
+static void dirty_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl);
 static void flush_async_commits(struct work_struct *work);
 static void queue_log_writer(struct super_block *s);
@@ -891,7 +891,6 @@
 	struct list_head *entry;
 	unsigned int trans_id = jl->j_trans_id;
 	unsigned int other_trans_id;
-	unsigned int first_trans_id;
 
 find_first:
 	/*
@@ -914,8 +913,6 @@
 		return 0;
 	}
 
-	first_trans_id = first_jl->j_trans_id;
-
 	entry = &first_jl->j_list;
 	while (1) {
 		other_jl = JOURNAL_LIST_ENTRY(entry);
@@ -1351,7 +1348,7 @@
 			      struct reiserfs_journal_list *jl, int flushall)
 {
 	struct reiserfs_journal_list *pjl;
-	struct reiserfs_journal_cnode *cn, *last;
+	struct reiserfs_journal_cnode *cn;
 	int count;
 	int was_jwait = 0;
 	int was_dirty = 0;
@@ -1509,7 +1506,6 @@
 					 b_blocknr, __func__);
 		}
 free_cnode:
-		last = cn;
 		cn = cn->next;
 		if (saved_bh) {
 			/*
@@ -1682,12 +1678,11 @@
 }
 
 /* used by flush_commit_list */
-static int dirty_one_transaction(struct super_block *s,
+static void dirty_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl)
 {
 	struct reiserfs_journal_cnode *cn;
 	struct reiserfs_journal_list *pjl;
-	int ret = 0;
 
 	jl->j_state |= LIST_DIRTY;
 	cn = jl->j_realblock;
@@ -1716,7 +1711,6 @@
 		}
 		cn = cn->next;
 	}
-	return ret;
 }
 
 static int kupdate_transactions(struct super_block *s,
@@ -1794,7 +1788,6 @@
 {
 	unsigned long len = 0;
 	unsigned long cur_len;
-	int ret;
 	int i;
 	int limit = 256;
 	struct reiserfs_journal_list *tjl;
@@ -1831,9 +1824,9 @@
 	 * transactions, but only bother if we've actually spanned
 	 * across multiple lists
 	 */
-	if (flush_jl != jl) {
-		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-	}
+	if (flush_jl != jl)
+		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+
 	flush_journal_list(s, flush_jl, 1);
 	put_journal_list(s, flush_jl);
 	put_journal_list(s, jl);
@@ -1844,7 +1837,7 @@
  * removes any nodes in table with name block and dev as bh.
 * only touches the hnext and hprev pointers.
  */
-void remove_journal_hash(struct super_block *sb,
+static void remove_journal_hash(struct super_block *sb,
 			 struct reiserfs_journal_cnode **table,
 			 struct reiserfs_journal_list *jl,
 			 unsigned long block, int remove_freed)
@@ -1913,7 +1906,6 @@
 			      struct super_block *sb, int error)
 {
 	struct reiserfs_transaction_handle myth;
-	int flushed = 0;
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	/*
@@ -1935,7 +1927,6 @@
 						     1);
 			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
 			do_journal_end(&myth, FLUSH_ALL);
-			flushed = 1;
 		}
 	}
 
@@ -3446,9 +3437,8 @@
 	if (cn == journal->j_last) {
 		journal->j_last = cn->prev;
 	}
-	if (bh)
-		remove_journal_hash(sb, journal->j_hash_table, NULL,
-				    bh->b_blocknr, 0);
+	remove_journal_hash(sb, journal->j_hash_table, NULL,
+			    bh->b_blocknr, 0);
 	clear_buffer_journaled(bh);	/* don't log this one */
 
 	if (!already_cleaned) {
@@ -3990,7 +3980,6 @@
 	struct buffer_head *c_bh;	/* commit bh */
 	struct buffer_head *d_bh;	/* desc bh */
 	int cur_write_start = 0;	/* start index of current log write */
-	int old_start;
 	int i;
 	int flush;
 	int wait_on_commit;
@@ -4247,7 +4236,6 @@
 	journal->j_num_work_lists++;
 
 	/* reset journal values for the next transaction */
-	old_start = journal->j_start;
 	journal->j_start =
 	    (journal->j_start + journal->j_len +
 	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index f5cebd7..7f86856 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -1322,7 +1322,7 @@
 	char *item;
 	struct reiserfs_de_head *deh;
 	char *insert_point;
-	int i, old_entry_num;
+	int i;
 	struct buffer_head *bh = bi->bi_bh;
 
 	if (new_entry_count == 0)
@@ -1362,7 +1362,6 @@
 		put_deh_location(&deh[i],
 				 deh_location(&deh[i]) + paste_size);
 
-	old_entry_num = ih_entry_count(ih);
 	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
 
 	/* prepare space for pasted records */
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 415d66c..34baf5c 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -183,13 +183,12 @@
 	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
 	int old_max = sb_oid_maxsize(disk_sb);
 	struct reiserfs_super_block_v1 *disk_sb_v1;
-	__le32 *objectid_map, *new_objectid_map;
+	__le32 *objectid_map;
 	int i;
 
 	disk_sb_v1 =
 	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
 	objectid_map = (__le32 *) (disk_sb_v1 + 1);
-	new_objectid_map = (__le32 *) (disk_sb + 1);
 
 	if (cur_size > new_size) {
 		/*
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 9fed1c0..500f200 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -746,9 +746,6 @@
 
 static void check_internal_block_head(struct buffer_head *bh)
 {
-	struct block_head *blkh;
-
-	blkh = B_BLK_HEAD(bh);
 	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
 		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
 
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea..da9ebe3 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -593,7 +593,6 @@
 	struct buffer_head *bh;
 	struct path_element *last_element;
 	int node_level, retval;
-	int right_neighbor_of_leaf_node;
 	int fs_gen;
 	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
 	b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
@@ -614,8 +613,6 @@
 
 	pathrelse(search_path);
 
-	right_neighbor_of_leaf_node = 0;
-
 	/*
 	 * With each iteration of this loop we search through the items in the
 	 * current node, and calculate the next current node(next path element)
@@ -701,7 +698,6 @@
 			 */
 			block_number = SB_ROOT_BLOCK(sb);
 			expected_level = -1;
-			right_neighbor_of_leaf_node = 0;
 
 			/* repeat search from the root */
 			continue;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1fc934d..d69b4ac 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -650,17 +650,11 @@
 	return &ei->vfs_inode;
 }
 
-static void reiserfs_i_callback(struct rcu_head *head)
+static void reiserfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void reiserfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, reiserfs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
@@ -815,7 +809,7 @@
 
 static const struct super_operations reiserfs_sops = {
 	.alloc_inode = reiserfs_alloc_inode,
-	.destroy_inode = reiserfs_destroy_inode,
+	.free_inode = reiserfs_free_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
 	.evict_inode = reiserfs_evict_inode,
@@ -1982,6 +1976,9 @@
 		goto error_unlocked;
 	}
 
+	s->s_time_min = 0;
+	s->s_time_max = U32_MAX;
+
 	rs = SB_DISK_SUPER_BLOCK(s);
 	/*
 	 * Let's do basic sanity check to verify that underlying device is not
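
The switch from .destroy_inode to .free_inode moves the RCU deferral into the VFS core: the filesystem now only frees its slab object, and the core invokes that from its own RCU callback. Roughly, and simplified from the core logic in fs/inode.c (names here are illustrative):

	static void i_callback_sketch(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		/* The core defers through call_rcu() once, then hands the
		 * inode to the filesystem's ->free_inode. */
		inode->i_sb->s_op->free_inode(inode);
	}

This is why reiserfs_i_callback() and its call_rcu() boilerplate can be deleted wholesale.
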
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 32d8986..b5b26d8 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -450,6 +450,15 @@
 
 static inline __u32 xattr_hash(const char *msg, int len)
 {
+	/*
+	 * csum_partial() gives different results for little-endian and
+	 * big-endian hosts. Images created on little-endian hosts and
+	 * mounted on big-endian hosts (and vice versa) will see csum
+	 * mismatches when trying to fetch xattrs. Treating the hash as
+	 * __wsum would lower the frequency of mismatch. This is an
+	 * endianness bug in reiserfs. The return statement would produce
+	 * a sparse warning; leave the warning unfixed so it stays as a
+	 * reminder of the bug.
+	 */
 	return csum_partial(msg, len, 0);
 }
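
For context on the sparse warning mentioned above: csum_partial() returns the bitwise type __wsum rather than __u32, so the bare return trips sparse's type checking. Silencing it, which the comment deliberately forbids, would look roughly like this:

	__wsum sum = csum_partial(msg, len, 0);

	/* A __force cast quiets sparse but hides the endianness bug. */
	return (__force __u32)sum;
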
 
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index ce2d6bc..ad4c457 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config ROMFS_FS
 	tristate "ROM file system support"
 	depends on BLOCK || MTD
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
index 95217b8..e90df22 100644
--- a/fs/romfs/internal.h
+++ b/fs/romfs/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /* RomFS internal definitions
  *
  * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/romfs_fs.h>
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 1118a0d..2c4a231 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* NOMMU mmap support for RomFS on MTD devices
  *
  * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/mm.h>
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index f86f51f..6b2b436 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* RomFS storage access routines
  *
  * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #include <linux/fs.h>
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6ccb519..e582d00 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -65,7 +65,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
@@ -381,18 +381,11 @@
 /*
  * return a spent inode to the slab cache
  */
-static void romfs_i_callback(struct rcu_head *head)
+static void romfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void romfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, romfs_i_callback);
-}
-
 /*
  * get filesystem statistics
  */
@@ -430,18 +423,17 @@
 /*
  * remounting must involve read-only
  */
-static int romfs_remount(struct super_block *sb, int *flags, char *data)
+static int romfs_reconfigure(struct fs_context *fc)
 {
-	sync_filesystem(sb);
-	*flags |= SB_RDONLY;
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_RDONLY;
 	return 0;
 }
 
 static const struct super_operations romfs_super_ops = {
 	.alloc_inode	= romfs_alloc_inode,
-	.destroy_inode	= romfs_destroy_inode,
+	.free_inode	= romfs_free_inode,
 	.statfs		= romfs_statfs,
-	.remount_fs	= romfs_remount,
 };
 
 /*
@@ -464,7 +456,7 @@
 /*
  * fill in the superblock
  */
-static int romfs_fill_super(struct super_block *sb, void *data, int silent)
+static int romfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct romfs_super_block *rsb;
 	struct inode *root;
@@ -485,6 +477,8 @@
 	sb->s_maxbytes = 0xFFFFFFFF;
 	sb->s_magic = ROMFS_MAGIC;
 	sb->s_flags |= SB_RDONLY | SB_NOATIME;
+	sb->s_time_min = 0;
+	sb->s_time_max = 0;
 	sb->s_op = &romfs_super_ops;
 
 #ifdef CONFIG_ROMFS_ON_MTD
@@ -511,8 +505,8 @@
 
 	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
 	    img_size < ROMFH_SIZE) {
-		if (!silent)
-			pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n",
+		if (!(fc->sb_flags & SB_SILENT))
+			errorf(fc, "VFS: Can't find a romfs filesystem on dev %s.\n",
 			       sb->s_id);
 		goto error_rsb_inval;
 	}
@@ -525,7 +519,7 @@
 	storage = sb->s_mtd ? "MTD" : "the block layer";
 
 	len = strnlen(rsb->name, ROMFS_MAXFN);
-	if (!silent)
+	if (!(fc->sb_flags & SB_SILENT))
 		pr_notice("Mounting image '%*.*s' through %s\n",
 			  (unsigned) len, (unsigned) len, rsb->name, storage);
 
@@ -555,23 +549,34 @@
 /*
  * get a superblock for mounting
  */
-static struct dentry *romfs_mount(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data)
+static int romfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *ret = ERR_PTR(-EINVAL);
+	int ret = -EINVAL;
 
 #ifdef CONFIG_ROMFS_ON_MTD
-	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
+	ret = get_tree_mtd(fc, romfs_fill_super);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-	if (ret == ERR_PTR(-EINVAL))
-		ret = mount_bdev(fs_type, flags, dev_name, data,
-				  romfs_fill_super);
+	if (ret == -EINVAL)
+		ret = get_tree_bdev(fc, romfs_fill_super);
 #endif
 	return ret;
 }
 
+static const struct fs_context_operations romfs_context_ops = {
+	.get_tree	= romfs_get_tree,
+	.reconfigure	= romfs_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int romfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &romfs_context_ops;
+	return 0;
+}
+
 /*
  * destroy a romfs superblock in the appropriate manner
  */
@@ -594,7 +599,7 @@
 static struct file_system_type romfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "romfs",
-	.mount		= romfs_mount,
+	.init_fs_context = romfs_init_fs_context,
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
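
With init_fs_context wired up, romfs is also reachable through the new mount-API syscalls. A hedged userspace sketch, assuming a kernel of 5.2 or later and headers that define SYS_fsopen and the FSCONFIG_* constants; /dev/mtdblock0 and /mnt are placeholder paths and error handling is mostly elided:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int mount_romfs(void)
	{
		/* fsopen() creates a blank context; the kernel side calls
		 * romfs_init_fs_context() to install romfs_context_ops. */
		int fsfd = syscall(SYS_fsopen, "romfs", 0);
		if (fsfd < 0)
			return -1;

		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source",
			"/dev/mtdblock0", 0);
		/* FSCONFIG_CMD_CREATE ends up in romfs_get_tree() above. */
		syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

		int mfd = syscall(SYS_fsmount, fsfd, 0, 0);
		syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt",
			MOVE_MOUNT_F_EMPTY_PATH);
		close(mfd);
		close(fsfd);
		return 0;
	}

The legacy mount(2) path continues to work: the VFS builds an fs_context internally and drives the same get_tree path.
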
diff --git a/fs/select.c b/fs/select.c
index 4a6b6e4..53a0c14 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -287,12 +287,20 @@
 	return 0;
 }
 
-static int poll_select_copy_remaining(struct timespec64 *end_time,
-				      void __user *p,
-				      int timeval, int ret)
+enum poll_time_type {
+	PT_TIMEVAL = 0,
+	PT_OLD_TIMEVAL = 1,
+	PT_TIMESPEC = 2,
+	PT_OLD_TIMESPEC = 3,
+};
+
+static int poll_select_finish(struct timespec64 *end_time,
+			      void __user *p,
+			      enum poll_time_type pt_type, int ret)
 {
 	struct timespec64 rts;
-	struct timeval rtv;
+
+	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
 
 	if (!p)
 		return ret;
@@ -310,18 +318,40 @@
 		rts.tv_sec = rts.tv_nsec = 0;
 
 
-	if (timeval) {
-		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
-			memset(&rtv, 0, sizeof(rtv));
-		rtv.tv_sec = rts.tv_sec;
-		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+	switch (pt_type) {
+	case PT_TIMEVAL:
+		{
+			struct timeval rtv;
 
-		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+				memset(&rtv, 0, sizeof(rtv));
+			rtv.tv_sec = rts.tv_sec;
+			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+			if (!copy_to_user(p, &rtv, sizeof(rtv)))
+				return ret;
+		}
+		break;
+	case PT_OLD_TIMEVAL:
+		{
+			struct old_timeval32 rtv;
+
+			rtv.tv_sec = rts.tv_sec;
+			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+			if (!copy_to_user(p, &rtv, sizeof(rtv)))
+				return ret;
+		}
+		break;
+	case PT_TIMESPEC:
+		if (!put_timespec64(&rts, p))
 			return ret;
-
-	} else if (!put_timespec64(&rts, p))
-		return ret;
-
+		break;
+	case PT_OLD_TIMESPEC:
+		if (!put_old_timespec32(&rts, p))
+			return ret;
+		break;
+	default:
+		BUG();
+	}
 	/*
 	 * If an application puts its timeval in read-only memory, we
 	 * don't want the Linux-specific update to the timeval to
@@ -353,9 +383,6 @@
 #define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
 
 /*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
  * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
  */
 static inline
@@ -689,9 +716,7 @@
 	}
 
 	ret = core_sys_select(n, inp, outp, exp, to);
-	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
-
-	return ret;
+	return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
 }
 
 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
@@ -701,51 +726,38 @@
 }
 
 static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
-		       fd_set __user *exp, struct timespec __user *tsp,
-		       const sigset_t __user *sigmask, size_t sigsetsize)
+		       fd_set __user *exp, void __user *tsp,
+		       const sigset_t __user *sigmask, size_t sigsetsize,
+		       enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (get_timespec64(&ts, tsp))
-			return -EFAULT;
+		switch (type) {
+		case PT_TIMESPEC:
+			if (get_timespec64(&ts, tsp))
+				return -EFAULT;
+			break;
+		case PT_OLD_TIMESPEC:
+			if (get_old_timespec32(&ts, tsp))
+				return -EFAULT;
+			break;
+		default:
+			BUG();
+		}
 
 		to = &end_time;
 		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		/* XXX: Don't preclude handling different sized sigset_t's.  */
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = core_sys_select(n, inp, outp, exp, to);
-	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	return ret;
+	return poll_select_finish(&end_time, tsp, type, ret);
 }
 
 /*
@@ -755,23 +767,45 @@
  * the sigset size.
  */
 SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
-		fd_set __user *, exp, struct timespec __user *, tsp,
+		fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
 		void __user *, sig)
 {
 	size_t sigsetsize = 0;
 	sigset_t __user *up = NULL;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
 		    || __get_user(up, (sigset_t __user * __user *)sig)
 		    || __get_user(sigsetsize,
 				(size_t __user *)(sig+sizeof(void *))))
 			return -EFAULT;
 	}
 
-	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
+	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_TIMESPEC);
 }
 
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct old_timespec32 __user *, tsp,
+		void __user *, sig)
+{
+	size_t sigsetsize = 0;
+	sigset_t __user *up = NULL;
+
+	if (sig) {
+		if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
+		    || __get_user(up, (sigset_t __user * __user *)sig)
+		    || __get_user(sigsetsize,
+				(size_t __user *)(sig+sizeof(void *))))
+			return -EFAULT;
+	}
+
+	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_OLD_TIMESPEC);
+}
+
+#endif
+
 #ifdef __ARCH_WANT_SYS_OLD_SELECT
 struct sel_arg_struct {
 	unsigned long n;
@@ -888,7 +922,7 @@
 		if (!count) {
 			count = wait->error;
 			if (signal_pending(current))
-				count = -EINTR;
+				count = -ERESTARTNOHAND;
 		}
 		if (count || timed_out)
 			break;
@@ -927,7 +961,7 @@
 		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
- 	int err = -EFAULT, fdcount, len, size;
+	int err = -EFAULT, fdcount, len;
 	/* Allocate small arguments on the stack to save memory and be
 	   faster - use long to make sure the buffer is aligned properly
 	   on 64 bit archs to avoid unaligned access */
@@ -955,8 +989,8 @@
 			break;
 
 		len = min(todo, POLLFD_PER_PAGE);
-		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
-		walk = walk->next = kmalloc(size, GFP_KERNEL);
+		walk = walk->next = kmalloc(struct_size(walk, entries, len),
+					    GFP_KERNEL);
 		if (!walk) {
 			err = -ENOMEM;
 			goto out_fds;
@@ -1003,7 +1037,7 @@
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	if (ret == -EINTR) {
+	if (ret == -ERESTARTNOHAND) {
 		restart_block->fn = do_restart_poll;
 		ret = -ERESTART_RESTARTBLOCK;
 	}
@@ -1024,7 +1058,7 @@
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	if (ret == -EINTR) {
+	if (ret == -ERESTARTNOHAND) {
 		struct restart_block *restart_block;
 
 		restart_block = &current->restart_block;
@@ -1045,10 +1079,9 @@
 }
 
 SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
-		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
+		struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
 		size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
@@ -1061,90 +1094,44 @@
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		/* XXX: Don't preclude handling different sized sigset_t's.  */
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
-
-	/* We can restart this syscall, usually */
-	if (ret == -EINTR) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-		ret = -ERESTARTNOHAND;
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	return ret;
+	return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret);
 }
 
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
+		struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	struct timespec64 ts, end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (get_old_timespec32(&ts, tsp))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	ret = set_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
+
+	ret = do_sys_poll(ufds, nfds, to);
+	return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret);
+}
+#endif
+
 #ifdef CONFIG_COMPAT
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
-static
-int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
-				      int timeval, int ret)
-{
-	struct timespec64 ts;
-
-	if (!p)
-		return ret;
-
-	if (current->personality & STICKY_TIMEOUTS)
-		goto sticky;
-
-	/* No update for zero timeout */
-	if (!end_time->tv_sec && !end_time->tv_nsec)
-		return ret;
-
-	ktime_get_ts64(&ts);
-	ts = timespec64_sub(*end_time, ts);
-	if (ts.tv_sec < 0)
-		ts.tv_sec = ts.tv_nsec = 0;
-
-	if (timeval) {
-		struct compat_timeval rtv;
-
-		rtv.tv_sec = ts.tv_sec;
-		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
-
-		if (!copy_to_user(p, &rtv, sizeof(rtv)))
-			return ret;
-	} else {
-		if (!compat_put_timespec64(&ts, p))
-			return ret;
-	}
-	/*
-	 * If an application puts its timeval in read-only memory, we
-	 * don't want the Linux-specific update to the timeval to
-	 * cause a fault after the select has completed
-	 * successfully. However, because we're not updating the
-	 * timeval, we can't restart the system call.
-	 */
-
-sticky:
-	if (ret == -ERESTARTNOHAND)
-		ret = -EINTR;
-	return ret;
-}
-
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
@@ -1257,10 +1244,10 @@
 
 static int do_compat_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timeval __user *tvp)
+	struct old_timeval32 __user *tvp)
 {
 	struct timespec64 end_time, *to = NULL;
-	struct compat_timeval tv;
+	struct old_timeval32 tv;
 	int ret;
 
 	if (tvp) {
@@ -1275,14 +1262,12 @@
 	}
 
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
-
-	return ret;
+	return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret);
 }
 
 COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
 	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
-	struct compat_timeval __user *, tvp)
+	struct old_timeval32 __user *, tvp)
 {
 	return do_compat_select(n, inp, outp, exp, tvp);
 }
@@ -1307,81 +1292,93 @@
 
 static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
-	compat_size_t sigsetsize)
+	void __user *tsp, compat_sigset_t __user *sigmask,
+	compat_size_t sigsetsize, enum poll_time_type type)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (compat_get_timespec64(&ts, tsp))
-			return -EFAULT;
+		switch (type) {
+		case PT_OLD_TIMESPEC:
+			if (get_old_timespec32(&ts, tsp))
+				return -EFAULT;
+			break;
+		case PT_TIMESPEC:
+			if (get_timespec64(&ts, tsp))
+				return -EFAULT;
+			break;
+		default:
+			BUG();
+		}
 
 		to = &end_time;
 		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	return ret;
+	return poll_select_finish(&end_time, tsp, type, ret);
 }
 
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
 	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
-	struct compat_timespec __user *, tsp, void __user *, sig)
+	struct __kernel_timespec __user *, tsp, void __user *, sig)
 {
 	compat_size_t sigsetsize = 0;
 	compat_uptr_t up = 0;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig,
+		if (!access_ok(sig,
+				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+				__get_user(up, (compat_uptr_t __user *)sig) ||
+				__get_user(sigsetsize,
+				(compat_size_t __user *)(sig+sizeof(up))))
+			return -EFAULT;
+	}
+
+	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+				 sigsetsize, PT_TIMESPEC);
+}
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct old_timespec32 __user *, tsp, void __user *, sig)
+{
+	compat_size_t sigsetsize = 0;
+	compat_uptr_t up = 0;
+
+	if (sig) {
+		if (!access_ok(sig,
 				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
 		    	__get_user(up, (compat_uptr_t __user *)sig) ||
 		    	__get_user(sigsetsize,
 				(compat_size_t __user *)(sig+sizeof(up))))
 			return -EFAULT;
 	}
+
 	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
-				 sigsetsize);
+				 sigsetsize, PT_OLD_TIMESPEC);
 }
 
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
-	unsigned int,  nfds, struct compat_timespec __user *, tsp,
+#endif
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
+	unsigned int,  nfds, struct old_timespec32 __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (compat_get_timespec64(&ts, tsp))
+		if (get_old_timespec32(&ts, tsp))
 			return -EFAULT;
 
 		to = &end_time;
@@ -1389,36 +1386,38 @@
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
-
-	/* We can restart this syscall, usually */
-	if (ret == -EINTR) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-				sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-		ret = -ERESTARTNOHAND;
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	return ret;
+	return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret);
 }
 #endif
+
+/* New compat syscall for 64-bit time_t */
+COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
+	unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+	struct timespec64 ts, end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (get_timespec64(&ts, tsp))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	ret = set_compat_user_sigmask(sigmask, sigsetsize);
+	if (ret)
+		return ret;
+
+	ret = do_sys_poll(ufds, nfds, to);
+	return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret);
+}
+
+#endif
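
The set_user_sigmask()/poll_select_finish() pairing preserves the ppoll()/pselect6() contract: the caller's mask is installed atomically for the wait, and restore_saved_sigmask_unless() defers restoring it when a signal must first be delivered (the -ERESTARTNOHAND case). A hedged userspace sketch of that contract:

	#define _GNU_SOURCE
	#include <poll.h>
	#include <signal.h>
	#include <time.h>

	int wait_for_stdin(void)
	{
		struct pollfd pfd = { .fd = 0, .events = POLLIN };
		struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
		sigset_t mask;

		/* Start from the current mask, but allow SIGUSR1 to
		 * interrupt the wait. */
		sigprocmask(SIG_SETMASK, NULL, &mask);
		sigdelset(&mask, SIGUSR1);

		/* The kernel swaps the mask in, sleeps, and restores it
		 * with no window in which SIGUSR1 could be lost. */
		return ppoll(&pfd, 1, &ts, &mask);
	}
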
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1dea7a8..1600034 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -119,6 +119,7 @@
 		}
 		if (seq_has_overflowed(m))
 			goto Eoverflow;
+		p = m->op->next(m, p, &m->index);
 		if (pos + m->count > offset) {
 			m->from = offset - pos;
 			m->count -= m->from;
@@ -126,7 +127,6 @@
 		}
 		pos += m->count;
 		m->count = 0;
-		p = m->op->next(m, p, &m->index);
 		if (pos == offset)
 			break;
 	}
@@ -317,6 +317,7 @@
 	switch (whence) {
 	case SEEK_CUR:
 		offset += file->f_pos;
+		/* fall through */
 	case SEEK_SET:
 		if (offset < 0)
 			break;
@@ -383,6 +384,17 @@
 }
 EXPORT_SYMBOL(seq_escape);
 
+void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int ret;
+
+	ret = string_escape_mem_ascii(src, isz, buf, size);
+	seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem_ascii);
+
 void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
 	int len;
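
The traverse() change above relies on the seq_file iterator contract: ->next() advances the record index exactly once per record, and calling it before the mid-record bookkeeping keeps m->index in step with the record actually left in the buffer when a read lands mid-record. A minimal sketch of that contract, with illustrative names:

	static void *demo_start(struct seq_file *m, loff_t *pos)
	{
		return *pos < 3 ? pos : NULL;	/* three records: 0, 1, 2 */
	}

	static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
	{
		++*pos;				/* advance exactly once */
		return *pos < 3 ? pos : NULL;
	}

	static void demo_stop(struct seq_file *m, void *v)
	{
	}

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "record %lld\n", *(loff_t *)v);
		return 0;
	}

	static const struct seq_operations demo_seq_ops = {
		.start	= demo_start,
		.next	= demo_next,
		.stop	= demo_stop,
		.show	= demo_show,
	};
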
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 4fcd149..44b6845 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -79,7 +79,7 @@
  * Copied from copy_siginfo_to_user() in kernel/signal.c
  */
 static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
-			     siginfo_t const *kinfo)
+			     kernel_siginfo_t const *kinfo)
 {
 	struct signalfd_siginfo new;
 
@@ -163,7 +163,7 @@
 	return sizeof(*uinfo);
 }
 
-static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info,
 				int nonblock)
 {
 	ssize_t ret;
@@ -176,6 +176,7 @@
 		if (!nonblock)
 			break;
 		ret = -EAGAIN;
+		/* fall through */
 	default:
 		spin_unlock_irq(&current->sighand->siglock);
 		return ret;
@@ -215,7 +216,7 @@
 	struct signalfd_siginfo __user *siginfo;
 	int nonblock = file->f_flags & O_NONBLOCK;
 	ssize_t ret, total = 0;
-	siginfo_t info;
+	kernel_siginfo_t info;
 
 	count /= sizeof(struct signalfd_siginfo);
 	if (!count)
diff --git a/fs/splice.c b/fs/splice.c
index b3daa97..9841272 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * "splice": joining two ropes together by interweaving their strands.
  *
@@ -138,7 +139,6 @@
 }
 
 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = page_cache_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
@@ -156,7 +156,6 @@
 }
 
 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = user_page_pipe_buf_steal,
@@ -301,7 +300,7 @@
 	struct kiocb kiocb;
 	int idx, ret;
 
-	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
+	iov_iter_pipe(&to, READ, pipe, len);
 	idx = to.idx;
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
@@ -326,22 +325,20 @@
 EXPORT_SYMBOL(generic_file_splice_read);
 
 const struct pipe_buf_operations default_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
-static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
-				    struct pipe_buffer *buf)
+int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+			     struct pipe_buffer *buf)
 {
 	return 1;
 }
 
 /* Pipe buffer operations for a socket and similar. */
 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_nosteal,
@@ -357,7 +354,7 @@
 	ssize_t res;
 
 	old_fs = get_fs();
-	set_fs(get_ds());
+	set_fs(KERNEL_DS);
 	/* The cast to a user pointer is valid due to the set_fs() */
 	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 	set_fs(old_fs);
@@ -386,7 +383,7 @@
 	 */
 	offset = *ppos & ~PAGE_MASK;
 
-	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
+	iov_iter_pipe(&to, READ, pipe, len + offset);
 
 	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 	if (res <= 0)
@@ -745,8 +742,7 @@
 			left -= this_len;
 		}
 
-		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
-			      sd.total_len - left);
+		iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
 		ret = vfs_iter_write(out, &from, &sd.pos, 0);
 		if (ret <= 0)
 			break;
@@ -946,11 +942,16 @@
 	sd->flags &= ~SPLICE_F_NONBLOCK;
 	more = sd->flags & SPLICE_F_MORE;
 
+	WARN_ON_ONCE(pipe->nrbufs != 0);
+
 	while (len) {
 		size_t read_len;
 		loff_t pos = sd->pos, prev_pos = pos;
 
-		ret = do_splice_to(in, &pos, pipe, len, flags);
+		/* Don't try to read more than the pipe has space for. */
+		read_len = min_t(size_t, len,
+				 (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+		ret = do_splice_to(in, &pos, pipe, read_len, flags);
 		if (unlikely(ret <= 0))
 			goto out_release;
 
@@ -1119,6 +1120,9 @@
 		if (ipipe == opipe)
 			return -EINVAL;
 
+		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+			flags |= SPLICE_F_NONBLOCK;
+
 		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
 	}
 
@@ -1144,6 +1148,9 @@
 		if (unlikely(ret < 0))
 			return ret;
 
+		if (in->f_flags & O_NONBLOCK)
+			flags |= SPLICE_F_NONBLOCK;
+
 		file_start_write(out);
 		ret = do_splice_from(ipipe, out, &offset, len, flags);
 		file_end_write(out);
@@ -1168,6 +1175,9 @@
 			offset = in->f_pos;
 		}
 
+		if (out->f_flags & O_NONBLOCK)
+			flags |= SPLICE_F_NONBLOCK;
+
 		pipe_lock(opipe);
 		ret = wait_for_space(opipe, flags);
 		if (!ret)
@@ -1346,7 +1356,7 @@
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1357,7 +1367,7 @@
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1372,7 +1382,7 @@
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1383,7 +1393,7 @@
 
 	error = compat_import_iovec(type, iov32, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1584,7 +1594,11 @@
 			 * Get a reference to this pipe buffer,
 			 * so we can copy the contents over.
 			 */
-			pipe_buf_get(ipipe, ibuf);
+			if (!pipe_buf_get(ipipe, ibuf)) {
+				if (ret == 0)
+					ret = -EFAULT;
+				break;
+			}
 			*obuf = *ibuf;
 
 			/*
@@ -1593,6 +1607,8 @@
 			 */
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+			pipe_buf_mark_unmergeable(obuf);
+
 			obuf->len = len;
 			opipe->nrbufs++;
 			ibuf->offset += obuf->len;
@@ -1656,7 +1672,11 @@
 		 * Get a reference to this pipe buffer,
 		 * so we can copy the contents over.
 		 */
-		pipe_buf_get(ipipe, ibuf);
+		if (!pipe_buf_get(ipipe, ibuf)) {
+			if (ret == 0)
+				ret = -EFAULT;
+			break;
+		}
 
 		obuf = opipe->bufs + nbuf;
 		*obuf = *ibuf;
@@ -1667,6 +1687,8 @@
 		 */
 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+		pipe_buf_mark_unmergeable(obuf);
+
 		if (obuf->len > len)
 			obuf->len = len;
 
@@ -1713,6 +1735,9 @@
 	 * copying the data.
 	 */
 	if (ipipe && opipe && ipipe != opipe) {
+		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+			flags |= SPLICE_F_NONBLOCK;
+
 		/*
 		 * Keep going, unless we encounter an error. The ipipe/opipe
 		 * ordering doesn't really matter.
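
do_sys_poll()'s allocation above now uses struct_size() from <linux/overflow.h>, which sizes a struct with a trailing flexible array and saturates on overflow so that kmalloc() fails instead of under-allocating. A minimal sketch with an illustrative struct (not the kernel's real poll_list):

	struct demo_list {
		struct demo_list *next;
		int len;
		struct pollfd entries[];	/* flexible array member */
	};

	static struct demo_list *alloc_demo(int len)
	{
		struct demo_list *p;

		/* struct_size(p, entries, len) evaluates to
		 * sizeof(*p) + len * sizeof(p->entries[0]); only sizeof
		 * is applied to p, so it may be uninitialized here. */
		p = kmalloc(struct_size(p, entries, len), GFP_KERNEL);
		if (p)
			p->len = len;
		return p;
	}
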
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1adb334..916e78f 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config SQUASHFS
 	tristate "SquashFS 4.0 - Squashed file system support"
 	depends on BLOCK
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index f098b9f..4f9b9fb 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * block.c
  */
 
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 0839efa..5062326 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * cache.c
  */
 
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 8366398..d57bef9 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * decompressor.c
  */
 
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 0f5a8e4..ec86175 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 #ifndef DECOMPRESSOR_H
 #define DECOMPRESSOR_H
 /*
@@ -6,20 +7,6 @@
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * decompressor.h
  */
 
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index d6008a6..c181dee 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  Copyright (c) 2013
  *  Minchan Kim <minchan@kernel.org>
- *
- *  This work is licensed under the terms of the GNU GPL, version 2. See
- *  the COPYING file in the top-level directory.
  */
 #include <linux/types.h>
 #include <linux/mutex.h>
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 23a9c28..2a2a2d1 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/types.h>
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index a6c7592..550c3e5 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/types.h>
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index a5845f9..a2ade63 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * dir.c
  */
 
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 8073b65..ae2c87b 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * export.c
  */
 
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index f1c1430..7b11283 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * file.c
  */
 
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
index a9ba8d9..54c17b7 100644
--- a/fs/squashfs/file_cache.c
+++ b/fs/squashfs/file_cache.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/fs.h>
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 80db1b8..a4894cc 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/fs.h>
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 0681fea..49602b9 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * fragment.c
  */
 
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index d38ea3d..6be5afe 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * id.c
  */
 
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index e9793b1..2446314 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * inode.c
  */
 
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index 95da653..c4e47e0 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013, 2014
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/buffer_head.h>
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 934c17e..aa3c3da 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010 LG Electronics
  * Chan Jeong <chan.jeong@lge.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * lzo_wrapper.c
  */
 
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 40c10d9..11e4539 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * namei.c
  */
 
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 9b7b1b6..520d323 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 98537ea..2e3073a 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 #ifndef PAGE_ACTOR_H
 #define PAGE_ACTOR_H
 /*
  * Copyright (c) 2013
  * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #ifndef CONFIG_SQUASHFS_FILE_DIRECT
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index f89f8a7..2797763 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * squashfs.h
  */
 
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 4e6853f..7187bd1 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 #ifndef SQUASHFS_FS
 #define SQUASHFS_FS
 /*
@@ -6,20 +7,6 @@
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * squashfs_fs.h
  */
 
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d09fcd6..2c82d6f 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 #ifndef SQUASHFS_FS_I
 #define SQUASHFS_FS_I
 /*
@@ -6,20 +7,6 @@
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * squashfs_fs_i.h
  */
 
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index ef69c31..34c21ff 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 #ifndef SQUASHFS_FS_SB
 #define SQUASHFS_FS_SB
 /*
@@ -6,20 +7,6 @@
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * squashfs_fs_sb.h
  */
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 40e6573..0cc4cee 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * super.c
  */
 
@@ -30,6 +17,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
@@ -49,26 +37,27 @@
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
 
-static const struct squashfs_decompressor *supported_squashfs_filesystem(short
-	major, short minor, short id)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(
+	struct fs_context *fc,
+	short major, short minor, short id)
 {
 	const struct squashfs_decompressor *decompressor;
 
 	if (major < SQUASHFS_MAJOR) {
-		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
-			"filesystems are unsupported\n", major, minor);
+		errorf(fc, "Major/Minor mismatch, older Squashfs %d.%d "
+		       "filesystems are unsupported", major, minor);
 		return NULL;
 	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
-		ERROR("Major/Minor mismatch, trying to mount newer "
-			"%d.%d filesystem\n", major, minor);
-		ERROR("Please update your kernel\n");
+		errorf(fc, "Major/Minor mismatch, trying to mount newer "
+		       "%d.%d filesystem", major, minor);
+		errorf(fc, "Please update your kernel");
 		return NULL;
 	}
 
 	decompressor = squashfs_lookup_decompressor(id);
 	if (!decompressor->supported) {
-		ERROR("Filesystem uses \"%s\" compression. This is not "
-			"supported\n", decompressor->name);
+		errorf(fc, "Filesystem uses \"%s\" compression. This is not supported",
+		       decompressor->name);
 		return NULL;
 	}
 
@@ -76,7 +65,7 @@
 }
 
 
-static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
+static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct squashfs_sb_info *msblk;
 	struct squashfs_super_block *sblk = NULL;
@@ -111,7 +100,7 @@
 	sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk));
 
 	if (IS_ERR(sblk)) {
-		ERROR("unable to read squashfs_super_block\n");
+		errorf(fc, "unable to read squashfs_super_block");
 		err = PTR_ERR(sblk);
 		sblk = NULL;
 		goto failed_mount;
@@ -122,14 +111,15 @@
 	/* Check it is a SQUASHFS superblock */
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
-		if (!silent)
-			ERROR("Can't find a SQUASHFS superblock on %pg\n",
-						sb->s_bdev);
+		if (!(fc->sb_flags & SB_SILENT))
+			errorf(fc, "Can't find a SQUASHFS superblock on %pg",
+			       sb->s_bdev);
 		goto failed_mount;
 	}
 
 	/* Check the MAJOR & MINOR versions and lookup compression type */
 	msblk->decompressor = supported_squashfs_filesystem(
+			fc,
 			le16_to_cpu(sblk->s_major),
 			le16_to_cpu(sblk->s_minor),
 			le16_to_cpu(sblk->compression));
@@ -146,15 +136,15 @@
 	/* Check block size for sanity */
 	msblk->block_size = le32_to_cpu(sblk->block_size);
 	if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
-		goto failed_mount;
+		goto insanity;
 
 	/*
 	 * Check the system page size is not larger than the filesystem
 	 * block size (by default 128K).  This is currently not supported.
 	 */
 	if (PAGE_SIZE > msblk->block_size) {
-		ERROR("Page size > filesystem block size (%d).  This is "
-			"currently not supported!\n", msblk->block_size);
+		errorf(fc, "Page size > filesystem block size (%d).  This is "
+		       "currently not supported!", msblk->block_size);
 		goto failed_mount;
 	}
 
@@ -165,12 +155,12 @@
 
 	/* Check that block_size and block_log match */
 	if (msblk->block_size != (1 << msblk->block_log))
-		goto failed_mount;
+		goto insanity;
 
 	/* Check the root inode for sanity */
 	root_inode = le64_to_cpu(sblk->root_inode);
 	if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
-		goto failed_mount;
+		goto insanity;
 
 	msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
 	msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
@@ -196,6 +186,8 @@
 		(u64) le64_to_cpu(sblk->id_table_start));
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_min = 0;
+	sb->s_time_max = U32_MAX;
 	sb->s_flags |= SB_RDONLY;
 	sb->s_op = &squashfs_super_ops;
 
@@ -210,7 +202,7 @@
 	msblk->read_page = squashfs_cache_init("data",
 		squashfs_max_decompressors(), msblk->block_size);
 	if (msblk->read_page == NULL) {
-		ERROR("Failed to allocate read_page block\n");
+		errorf(fc, "Failed to allocate read_page block");
 		goto failed_mount;
 	}
 
@@ -218,7 +210,7 @@
 	if (IS_ERR(msblk->stream)) {
 		err = PTR_ERR(msblk->stream);
 		msblk->stream = NULL;
-		goto failed_mount;
+		goto insanity;
 	}
 
 	/* Handle xattrs */
@@ -233,7 +225,7 @@
 	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
 		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
 	if (IS_ERR(msblk->xattr_id_table)) {
-		ERROR("unable to read xattr id index table\n");
+		errorf(fc, "unable to read xattr id index table");
 		err = PTR_ERR(msblk->xattr_id_table);
 		msblk->xattr_id_table = NULL;
 		if (err != -ENOTSUPP)
@@ -247,7 +239,7 @@
 		le64_to_cpu(sblk->id_table_start), next_table,
 		le16_to_cpu(sblk->no_ids));
 	if (IS_ERR(msblk->id_table)) {
-		ERROR("unable to read id index table\n");
+		errorf(fc, "unable to read id index table");
 		err = PTR_ERR(msblk->id_table);
 		msblk->id_table = NULL;
 		goto failed_mount;
@@ -263,7 +255,7 @@
 	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
 		lookup_table_start, next_table, msblk->inodes);
 	if (IS_ERR(msblk->inode_lookup_table)) {
-		ERROR("unable to read inode lookup table\n");
+		errorf(fc, "unable to read inode lookup table");
 		err = PTR_ERR(msblk->inode_lookup_table);
 		msblk->inode_lookup_table = NULL;
 		goto failed_mount;
@@ -288,7 +280,7 @@
 	msblk->fragment_index = squashfs_read_fragment_index_table(sb,
 		le64_to_cpu(sblk->fragment_table_start), next_table, fragments);
 	if (IS_ERR(msblk->fragment_index)) {
-		ERROR("unable to read fragment index table\n");
+		errorf(fc, "unable to read fragment index table");
 		err = PTR_ERR(msblk->fragment_index);
 		msblk->fragment_index = NULL;
 		goto failed_mount;
@@ -299,13 +291,13 @@
 	/* Sanity check directory_table */
 	if (msblk->directory_table > next_table) {
 		err = -EINVAL;
-		goto failed_mount;
+		goto insanity;
 	}
 
 	/* Sanity check inode_table */
 	if (msblk->inode_table >= msblk->directory_table) {
 		err = -EINVAL;
-		goto failed_mount;
+		goto insanity;
 	}
 
 	/* allocate root */
@@ -334,6 +326,8 @@
 	kfree(sblk);
 	return 0;
 
+insanity:
+	errorf(fc, "squashfs image failed sanity check");
 failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
@@ -349,6 +343,28 @@
 	return err;
 }
 
+static int squashfs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, squashfs_fill_super);
+}
+
+static int squashfs_reconfigure(struct fs_context *fc)
+{
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_RDONLY;
+	return 0;
+}
+
+static const struct fs_context_operations squashfs_context_ops = {
+	.get_tree	= squashfs_get_tree,
+	.reconfigure	= squashfs_reconfigure,
+};
+
+static int squashfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &squashfs_context_ops;
+	return 0;
+}
 
 static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -371,14 +387,6 @@
 }
 
 
-static int squashfs_remount(struct super_block *sb, int *flags, char *data)
-{
-	sync_filesystem(sb);
-	*flags |= SB_RDONLY;
-	return 0;
-}
-
-
 static void squashfs_put_super(struct super_block *sb)
 {
 	if (sb->s_fs_info) {
@@ -397,14 +405,6 @@
 	}
 }
 
-
-static struct dentry *squashfs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
-}
-
-
 static struct kmem_cache *squashfs_inode_cachep;
 
 
@@ -473,22 +473,15 @@
 }
 
 
-static void squashfs_i_callback(struct rcu_head *head)
+static void squashfs_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
 }
 
-static void squashfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, squashfs_i_callback);
-}
-
-
 static struct file_system_type squashfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "squashfs",
-	.mount = squashfs_mount,
+	.init_fs_context = squashfs_init_fs_context,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV
 };
@@ -496,10 +489,9 @@
 
 static const struct super_operations squashfs_super_ops = {
 	.alloc_inode = squashfs_alloc_inode,
-	.destroy_inode = squashfs_destroy_inode,
+	.free_inode = squashfs_free_inode,
 	.statfs = squashfs_statfs,
 	.put_super = squashfs_put_super,
-	.remount_fs = squashfs_remount
 };
 
 module_init(init_squashfs_fs);
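
The hunks above convert squashfs from the legacy .mount/.remount_fs interface to the fs_context API: fill_super now receives the fs_context, failures are reported through errorf() so userspace can retrieve them via the new mount API, and remount collapses to a reconfigure hook. As a rough sketch (hypothetical "examplefs" names, not squashfs code), the resulting pattern for a block-device filesystem is:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        /* read and validate the on-disk superblock; report failures with
         * errorf(fc, ...) so userspace can retrieve the message */
        return 0;
}

static int examplefs_get_tree(struct fs_context *fc)
{
        return get_tree_bdev(fc, examplefs_fill_super);
}

static const struct fs_context_operations examplefs_context_ops = {
        .get_tree       = examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
        fc->ops = &examplefs_context_ops;
        return 0;
}

static struct file_system_type examplefs_fs_type = {
        .owner                  = THIS_MODULE,
        .name                   = "examplefs",
        .init_fs_context        = examplefs_init_fs_context,
        .kill_sb                = kill_block_super,
        .fs_flags               = FS_REQUIRES_DEV,
};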
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index befeba0..1430613 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * symlink.c
  */
 
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 1548b37..e1e3f3d 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * xattr.c
  */
 
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index afe70f8..184129a 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * xattr.h
  */
 
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index c89607d..d99e084 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2010
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * xattr_id.c
  */
 
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 6bfaef7..4b2f205 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * xz_wrapper.c
  */
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 2ec24d1..f2226af 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
  * zlib_wrapper.c
  */
 
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index eeaabf8..b448c2a 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -1,19 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Squashfs - a compressed read only filesystem for Linux
  *
  * Copyright (c) 2016-present, Facebook, Inc.
  * All rights reserved.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  * zstd_wrapper.c
  */
 
diff --git a/fs/stack.c b/fs/stack.c
index a54e33e..4ef2c05 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/export.h>
 #include <linux/fs.h>
 #include <linux/fs_stack.h>
@@ -21,11 +22,10 @@
 	i_size = i_size_read(src);
 
 	/*
-	 * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to
-	 * keep the two halves of i_blocks in sync despite SMP or PREEMPT -
-	 * though stat's generic_fillattr() doesn't bother, and we won't be
-	 * applying quotas (where i_blocks does become important) at the
-	 * upper level.
+	 * But on 32-bit, we ought to make an effort to keep the two halves of
+	 * i_blocks in sync despite SMP or PREEMPT - though stat's
+	 * generic_fillattr() doesn't bother, and we won't be applying quotas
+	 * (where i_blocks does become important) at the upper level.
 	 *
 	 * We don't actually know what locking is used at the lower level;
 	 * but if it's a filesystem that supports quotas, it will be using
@@ -44,9 +44,9 @@
 	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
 	 * is called, so take i_lock for that case.
 	 *
-	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
-	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
-	 * for that case too, and do both at once by combining the tests.
+	 * And on 32-bit, continue our effort to keep the two halves of
+	 * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case
+	 * too, and do both at once by combining the tests.
 	 *
 	 * There is none of this locking overhead in the 64-bit case.
 	 */
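
The rewritten comments keep the pre-LBDAF reasoning: loff_t and blkcnt_t are 64 bits wide, so on 32-bit SMP or PREEMPT a plain load or store can tear between the two halves. A condensed sketch of the pattern the comments describe (simplified, not the exact upstream body of fsstack_copy_inode_size()):

void example_copy_inode_size(struct inode *dst, struct inode *src)
{
        loff_t i_size = i_size_read(src);  /* seqcount-protected on 32-bit SMP */
        blkcnt_t i_blocks;

        /* i_blocks has no seqcount; take i_lock so both halves are read
         * consistently when the type is wider than a machine word */
        if (sizeof(i_blocks) > sizeof(long))
                spin_lock(&src->i_lock);
        i_blocks = src->i_blocks;
        if (sizeof(i_blocks) > sizeof(long))
                spin_unlock(&src->i_lock);

        /* mirror the same care on the destination side */
        if (sizeof(i_blocks) > sizeof(long) || sizeof(i_size) > sizeof(long))
                spin_lock(&dst->i_lock);
        i_size_write(dst, i_size);
        dst->i_blocks = i_blocks;
        if (sizeof(i_blocks) > sizeof(long) || sizeof(i_size) > sizeof(long))
                spin_unlock(&dst->i_lock);
}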
diff --git a/fs/stat.c b/fs/stat.c
index f8e6fb2..c38e4c2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -45,11 +45,6 @@
 	stat->ctime = inode->i_ctime;
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
-
-	if (IS_NOATIME(inode))
-		stat->result_mask &= ~STATX_ATIME;
-	if (IS_AUTOMOUNT(inode))
-		stat->attributes |= STATX_ATTR_AUTOMOUNT;
 }
 EXPORT_SYMBOL(generic_fillattr);
 
@@ -75,6 +70,13 @@
 	stat->result_mask |= STATX_BASIC_STATS;
 	request_mask &= STATX_ALL;
 	query_flags &= KSTAT_QUERY_FLAGS;
+
+	/* allow the fs to override these if it really wants to */
+	if (IS_NOATIME(inode))
+		stat->result_mask &= ~STATX_ATIME;
+	if (IS_AUTOMOUNT(inode))
+		stat->attributes |= STATX_ATTR_AUTOMOUNT;
+
 	if (inode->i_op->getattr)
 		return inode->i_op->getattr(path, stat, request_mask,
 					    query_flags);
@@ -280,6 +282,8 @@
 
 #endif /* __ARCH_WANT_OLD_STAT */
 
+#ifdef __ARCH_WANT_NEW_STAT
+
 #if BITS_PER_LONG == 32
 #  define choose_32_64(a,b) a
 #else
@@ -378,6 +382,7 @@
 
 	return error;
 }
+#endif
 
 static int do_readlinkat(int dfd, const char __user *pathname,
 			 char __user *buf, int bufsiz)
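
Moving the IS_NOATIME()/IS_AUTOMOUNT() handling from generic_fillattr() into vfs_getattr_nosec() means the VFS applies the defaults before ->getattr() runs, so a filesystem may override them, as the new comment says. A hedged sketch of what that enables (hypothetical examplefs):

static int examplefs_getattr(const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);

        generic_fillattr(inode, stat);
        /* the VFS already set STATX_ATTR_AUTOMOUNT / cleared STATX_ATIME
         * as appropriate; the filesystem may now adjust those defaults */
        stat->attributes |= STATX_ATTR_COMPRESSED;
        stat->attributes_mask |= STATX_ATTR_COMPRESSED;
        return 0;
}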
diff --git a/fs/statfs.c b/fs/statfs.c
index f021662..2616424 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@
 	return retval;
 }
 
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+	struct kstatfs st;
+	int error;
+
+	error = statfs_by_dentry(dentry, &st);
+	if (error)
+		return error;
+
+	*fsid = st.f_fsid;
+	return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
 int vfs_statfs(const struct path *path, struct kstatfs *buf)
 {
 	int error;
@@ -304,19 +318,10 @@
 static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
 {
 	struct compat_statfs64 buf;
-	if (sizeof(ubuf->f_bsize) == 4) {
-		if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
-		     kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
-			return -EOVERFLOW;
-		/* f_files and f_ffree may be -1; it's okay
-		 * to stuff that into 32 bits */
-		if (kbuf->f_files != 0xffffffffffffffffULL
-		 && (kbuf->f_files & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-		if (kbuf->f_ffree != 0xffffffffffffffffULL
-		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-	}
+
+	if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+		return -EOVERFLOW;
+
 	memset(&buf, 0, sizeof(struct compat_statfs64));
 	buf.f_type = kbuf->f_type;
 	buf.f_bsize = kbuf->f_bsize;
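
vfs_get_fsid() exposes the fsid to in-kernel callers (fanotify's FAN_REPORT_FID support is the motivating user) without filling a whole statfs buffer, while the compat_statfs64 hunk drops the per-field overflow checks in favour of checking only f_bsize and f_frsize. Typical use of the new helper (sketch):

        __kernel_fsid_t fsid;
        int err;

        err = vfs_get_fsid(dentry, &fsid);
        if (err)
                return err;
        /* fsid.val[0] / fsid.val[1] now identify the filesystem instance */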
diff --git a/fs/super.c b/fs/super.c
index f3a8c00..cfadab2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -32,9 +32,12 @@
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
+#include <linux/fscrypt.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 
 static int thaw_super_locked(struct super_block *sb);
@@ -255,6 +258,8 @@
 	s->s_maxbytes = MAX_NON_LFS;
 	s->s_op = &default_op;
 	s->s_time_gran = 1000000000;
+	s->s_time_min = TIME64_MIN;
+	s->s_time_max = TIME64_MAX;
 	s->cleancache_poolid = CLEANCACHE_NO_POOL;
 
 	s->s_shrink.seeks = DEFAULT_SEEKS;
@@ -288,6 +293,7 @@
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
 		security_sb_free(s);
+		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
 		kfree(s->s_subtype);
 		call_rcu(&s->rcu, destroy_super_rcu);
@@ -442,7 +448,7 @@
 		sync_filesystem(sb);
 		sb->s_flags &= ~SB_ACTIVE;
 
-		fsnotify_unmount_inodes(sb);
+		fsnotify_sb_delete(sb);
 		cgroup_writeback_umount();
 
 		evict_inodes(sb);
@@ -474,29 +480,115 @@
 
 EXPORT_SYMBOL(generic_shutdown_super);
 
+bool mount_capable(struct fs_context *fc)
+{
+	if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT))
+		return capable(CAP_SYS_ADMIN);
+	else
+		return ns_capable(fc->user_ns, CAP_SYS_ADMIN);
+}
+
 /**
- *	sget_userns -	find or create a superblock
- *	@type:	filesystem type superblock should belong to
- *	@test:	comparison callback
- *	@set:	setup callback
- *	@flags:	mount flags
- *	@user_ns: User namespace for the super_block
- *	@data:	argument to each of them
+ * sget_fc - Find or create a superblock
+ * @fc:	Filesystem context.
+ * @test: Comparison callback
+ * @set: Setup callback
+ *
+ * Find or create a superblock using the parameters stored in the filesystem
+ * context and the two callback functions.
+ *
+ * If an extant superblock is matched, then that will be returned with an
+ * elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info and s_id will be set and
+ * the set() callback will be invoked), the superblock will be published and it
+ * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
+ * as yet unset.
  */
-struct super_block *sget_userns(struct file_system_type *type,
+struct super_block *sget_fc(struct fs_context *fc,
+			    int (*test)(struct super_block *, struct fs_context *),
+			    int (*set)(struct super_block *, struct fs_context *))
+{
+	struct super_block *s = NULL;
+	struct super_block *old;
+	struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
+	int err;
+
+retry:
+	spin_lock(&sb_lock);
+	if (test) {
+		hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
+			if (test(old, fc))
+				goto share_extant_sb;
+		}
+	}
+	if (!s) {
+		spin_unlock(&sb_lock);
+		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+		if (!s)
+			return ERR_PTR(-ENOMEM);
+		goto retry;
+	}
+
+	s->s_fs_info = fc->s_fs_info;
+	err = set(s, fc);
+	if (err) {
+		s->s_fs_info = NULL;
+		spin_unlock(&sb_lock);
+		destroy_unused_super(s);
+		return ERR_PTR(err);
+	}
+	fc->s_fs_info = NULL;
+	s->s_type = fc->fs_type;
+	s->s_iflags |= fc->s_iflags;
+	strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+	list_add_tail(&s->s_list, &super_blocks);
+	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+	spin_unlock(&sb_lock);
+	get_filesystem(s->s_type);
+	register_shrinker_prepared(&s->s_shrink);
+	return s;
+
+share_extant_sb:
+	if (user_ns != old->s_user_ns) {
+		spin_unlock(&sb_lock);
+		destroy_unused_super(s);
+		return ERR_PTR(-EBUSY);
+	}
+	if (!grab_super(old))
+		goto retry;
+	destroy_unused_super(s);
+	return old;
+}
+EXPORT_SYMBOL(sget_fc);
+
+/**
+ *	sget	-	find or create a superblock
+ *	@type:	  filesystem type superblock should belong to
+ *	@test:	  comparison callback
+ *	@set:	  setup callback
+ *	@flags:	  mount flags
+ *	@data:	  argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-			int flags, struct user_namespace *user_ns,
+			int flags,
 			void *data)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	struct super_block *s = NULL;
 	struct super_block *old;
 	int err;
 
-	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) &&
-	    !(type->fs_flags & FS_USERNS_MOUNT) &&
-	    !capable(CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here, so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & SB_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
@@ -537,39 +629,6 @@
 	register_shrinker_prepared(&s->s_shrink);
 	return s;
 }
-
-EXPORT_SYMBOL(sget_userns);
-
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-
-	/* We don't yet pass the user namespace of the parent
-	 * mount through to here so always use &init_user_ns
-	 * until that changes.
-	 */
-	if (flags & SB_SUBMOUNT)
-		user_ns = &init_user_ns;
-
-	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	return sget_userns(type, test, set, flags, user_ns, data);
-}
-
 EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
@@ -834,28 +893,35 @@
 }
 
 /**
- *	do_remount_sb - asks filesystem to change mount options.
- *	@sb:	superblock in question
- *	@sb_flags: revised superblock flags
- *	@data:	the rest of options
- *      @force: whether or not to force the change
+ * reconfigure_super - asks filesystem to change superblock parameters
+ * @fc: The superblock and configuration
  *
- *	Alters the mount options of a mounted file system.
+ * Alters the configuration parameters of a live superblock.
  */
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int reconfigure_super(struct fs_context *fc)
 {
+	struct super_block *sb = fc->root->d_sb;
 	int retval;
-	int remount_ro;
+	bool remount_ro = false;
+	bool force = fc->sb_flags & SB_FORCE;
 
+	if (fc->sb_flags_mask & ~MS_RMT_MASK)
+		return -EINVAL;
 	if (sb->s_writers.frozen != SB_UNFROZEN)
 		return -EBUSY;
 
+	retval = security_sb_remount(sb, fc->security);
+	if (retval)
+		return retval;
+
+	if (fc->sb_flags_mask & SB_RDONLY) {
 #ifdef CONFIG_BLOCK
-	if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
-		return -EACCES;
+		if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+			return -EACCES;
 #endif
 
-	remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+		remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+	}
 
 	if (remount_ro) {
 		if (!hlist_empty(&sb->s_pins)) {
@@ -866,13 +932,14 @@
 				return 0;
 			if (sb->s_writers.frozen != SB_UNFROZEN)
 				return -EBUSY;
-			remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+			remount_ro = !sb_rdonly(sb);
 		}
 	}
 	shrink_dcache_sb(sb);
 
-	/* If we are remounting RDONLY and current sb is read/write,
-	   make sure there are no rw files opened */
+	/* If we are reconfiguring to RDONLY and current sb is read/write,
+	 * make sure there are no files open for writing.
+	 */
 	if (remount_ro) {
 		if (force) {
 			sb->s_readonly_remount = 1;
@@ -884,8 +951,8 @@
 		}
 	}
 
-	if (sb->s_op->remount_fs) {
-		retval = sb->s_op->remount_fs(sb, &sb_flags, data);
+	if (fc->ops->reconfigure) {
+		retval = fc->ops->reconfigure(fc);
 		if (retval) {
 			if (!force)
 				goto cancel_readonly;
@@ -894,7 +961,9 @@
 			     sb->s_type->name, retval);
 		}
 	}
-	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
+
+	WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
+				 (fc->sb_flags & fc->sb_flags_mask)));
 	/* Needs to be ordered wrt mnt_is_readonly() */
 	smp_wmb();
 	sb->s_readonly_remount = 0;
@@ -921,10 +990,15 @@
 	down_write(&sb->s_umount);
 	if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
 	    !sb_rdonly(sb)) {
-		/*
-		 * What lock protects sb->s_flags??
-		 */
-		do_remount_sb(sb, SB_RDONLY, NULL, 1);
+		struct fs_context *fc;
+
+		fc = fs_context_for_reconfigure(sb->s_root,
+					SB_RDONLY | SB_FORCE, SB_RDONLY);
+		if (!IS_ERR(fc)) {
+			if (parse_monolithic_mount_data(fc, NULL) == 0)
+				(void)reconfigure_super(fc);
+			put_fs_context(fc);
+		}
 	}
 	up_write(&sb->s_umount);
 }
@@ -1042,51 +1116,138 @@
 }
 EXPORT_SYMBOL(kill_litter_super);
 
-static int ns_test_super(struct super_block *sb, void *data)
+int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
 {
-	return sb->s_fs_info == data;
-}
-
-static int ns_set_super(struct super_block *sb, void *data)
-{
-	sb->s_fs_info = data;
 	return set_anon_super(sb, NULL);
 }
+EXPORT_SYMBOL(set_anon_super_fc);
 
-struct dentry *mount_ns(struct file_system_type *fs_type,
-	int flags, void *data, void *ns, struct user_namespace *user_ns,
-	int (*fill_super)(struct super_block *, void *, int))
+static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct super_block *sb;
-
-	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
-	 * over the namespace.
-	 */
-	if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
-			 user_ns, ns);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (!sb->s_root) {
-		int err;
-		err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
-		if (err) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(err);
-		}
-
-		sb->s_flags |= SB_ACTIVE;
-	}
-
-	return dget(sb->s_root);
+	return sb->s_fs_info == fc->s_fs_info;
 }
 
-EXPORT_SYMBOL(mount_ns);
+static int test_single_super(struct super_block *s, struct fs_context *fc)
+{
+	return 1;
+}
+
+/**
+ * vfs_get_super - Get a superblock with a search key set in s_fs_info.
+ * @fc: The filesystem context holding the parameters
+ * @keying: How to distinguish superblocks
+ * @fill_super: Helper to initialise a new superblock
+ *
+ * Search for a superblock and create a new one if not found.  The search
+ * criterion is controlled by @keying.  If the search fails, a new superblock
+ * is created and @fill_super() is called to initialise it.
+ *
+ * @keying can take one of a number of values:
+ *
+ * (1) vfs_get_single_super - Only one superblock of this type may exist on the
+ *     system.  This is typically used for special system filesystems.
+ *
+ * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
+ *     distinct keys (where the key is in s_fs_info).  Searching for the same
+ *     key again will turn up the superblock for that key.
+ *
+ * (3) vfs_get_independent_super - Multiple superblocks may exist and are
+ *     unkeyed.  Each call will get a new superblock.
+ *
+ * A permissions check is made by sget_fc() unless we're getting a superblock
+ * for a kernel-internal mount or a submount.
+ */
+int vfs_get_super(struct fs_context *fc,
+		  enum vfs_get_super_keying keying,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	int (*test)(struct super_block *, struct fs_context *);
+	struct super_block *sb;
+	int err;
+
+	switch (keying) {
+	case vfs_get_single_super:
+	case vfs_get_single_reconf_super:
+		test = test_single_super;
+		break;
+	case vfs_get_keyed_super:
+		test = test_keyed_super;
+		break;
+	case vfs_get_independent_super:
+		test = NULL;
+		break;
+	default:
+		BUG();
+	}
+
+	sb = sget_fc(fc, test, set_anon_super_fc);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		err = fill_super(sb, fc);
+		if (err)
+			goto error;
+
+		sb->s_flags |= SB_ACTIVE;
+		fc->root = dget(sb->s_root);
+	} else {
+		fc->root = dget(sb->s_root);
+		if (keying == vfs_get_single_reconf_super) {
+			err = reconfigure_super(fc);
+			if (err < 0) {
+				dput(fc->root);
+				fc->root = NULL;
+				goto error;
+			}
+		}
+	}
+
+	return 0;
+
+error:
+	deactivate_locked_super(sb);
+	return err;
+}
+EXPORT_SYMBOL(vfs_get_super);
+
+int get_tree_nodev(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_nodev);
+
+int get_tree_single(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_single_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_single);
+
+int get_tree_single_reconf(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_single_reconf);
+
+int get_tree_keyed(struct fs_context *fc,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc),
+		void *key)
+{
+	fc->s_fs_info = key;
+	return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_keyed);
 
 #ifdef CONFIG_BLOCK
+
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
@@ -1096,6 +1257,102 @@
 	return 0;
 }
 
+static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+	return set_bdev_super(s, fc->sget_key);
+}
+
+static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+	return s->s_bdev == fc->sget_key;
+}
+
+/**
+ * get_tree_bdev - Get a superblock based on a single block device
+ * @fc: The filesystem context holding the parameters
+ * @fill_super: Helper to initialise a new superblock
+ */
+int get_tree_bdev(struct fs_context *fc,
+		int (*fill_super)(struct super_block *,
+				  struct fs_context *))
+{
+	struct block_device *bdev;
+	struct super_block *s;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
+	int error = 0;
+
+	if (!(fc->sb_flags & SB_RDONLY))
+		mode |= FMODE_WRITE;
+
+	if (!fc->source)
+		return invalf(fc, "No source specified");
+
+	bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
+	if (IS_ERR(bdev)) {
+		errorf(fc, "%s: Can't open blockdev", fc->source);
+		return PTR_ERR(bdev);
+	}
+
+	/* Once the superblock is inserted into the list by sget_fc(), s_umount
+	 * will protect the lockfs code from trying to start a snapshot while
+	 * we are mounting.
+	 */
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		blkdev_put(bdev, mode);
+		warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+		return -EBUSY;
+	}
+
+	fc->sb_flags |= SB_NOSEC;
+	fc->sget_key = bdev;
+	s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	if (IS_ERR(s)) {
+		blkdev_put(bdev, mode);
+		return PTR_ERR(s);
+	}
+
+	if (s->s_root) {
+		/* Don't summarily change the RO/RW state. */
+		if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
+			warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+			deactivate_locked_super(s);
+			blkdev_put(bdev, mode);
+			return -EBUSY;
+		}
+
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
+		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	} else {
+		s->s_mode = mode;
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		sb_set_blocksize(s, block_size(bdev));
+		error = fill_super(s, fc);
+		if (error) {
+			deactivate_locked_super(s);
+			return error;
+		}
+
+		s->s_flags |= SB_ACTIVE;
+		bdev->bd_super = s;
+	}
+
+	BUG_ON(fc->root);
+	fc->root = dget(s->s_root);
+	return 0;
+}
+EXPORT_SYMBOL(get_tree_bdev);
+
 static int test_bdev_super(struct super_block *s, void *data)
 {
 	return (void *)s->s_bdev == data;
@@ -1211,6 +1468,31 @@
 }
 EXPORT_SYMBOL(mount_nodev);
 
+static int reconfigure_single(struct super_block *s,
+			      int flags, void *data)
+{
+	struct fs_context *fc;
+	int ret;
+
+	/* The caller really needs to be passing fc down into mount_single(),
+	 * then a chunk of this can be removed.  [Bollocks -- AV]
+	 * Better yet, reconfiguration shouldn't happen, but rather the second
+	 * mount should be rejected if the parameters are not compatible.
+	 */
+	fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	ret = parse_monolithic_mount_data(fc, data);
+	if (ret < 0)
+		goto out;
+
+	ret = reconfigure_super(fc);
+out:
+	put_fs_context(fc);
+	return ret;
+}
+
 static int compare_single(struct super_block *s, void *p)
 {
 	return 1;
@@ -1228,43 +1510,52 @@
 		return ERR_CAST(s);
 	if (!s->s_root) {
 		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return ERR_PTR(error);
-		}
-		s->s_flags |= SB_ACTIVE;
+		if (!error)
+			s->s_flags |= SB_ACTIVE;
 	} else {
-		do_remount_sb(s, flags, data, 0);
+		error = reconfigure_single(s, flags, data);
+	}
+	if (unlikely(error)) {
+		deactivate_locked_super(s);
+		return ERR_PTR(error);
 	}
 	return dget(s->s_root);
 }
 EXPORT_SYMBOL(mount_single);
 
-struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_get_tree - Get the mountable root
+ * @fc: The superblock configuration context.
+ *
+ * The filesystem is invoked to get or create a superblock which can then later
+ * be used for mounting.  The filesystem places a pointer to the root to be
+ * used for mounting in @fc->root.
+ */
+int vfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *root;
 	struct super_block *sb;
-	char *secdata = NULL;
-	int error = -ENOMEM;
+	int error;
 
-	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
-		secdata = alloc_secdata();
-		if (!secdata)
-			goto out;
+	if (fc->root)
+		return -EBUSY;
 
-		error = security_sb_copy_data(data, secdata);
-		if (error)
-			goto out_free_secdata;
+	/* Get the mountable root in fc->root, with a ref on the root and a ref
+	 * on the superblock.
+	 */
+	error = fc->ops->get_tree(fc);
+	if (error < 0)
+		return error;
+
+	if (!fc->root) {
+		pr_err("Filesystem %s get_tree() didn't set fc->root\n",
+		       fc->fs_type->name);
+		/* We don't know what the locking state of the superblock is -
+		 * if there is a superblock.
+		 */
+		BUG();
 	}
 
-	root = type->mount(type, flags, name, data);
-	if (IS_ERR(root)) {
-		error = PTR_ERR(root);
-		goto out_free_secdata;
-	}
-	sb = root->d_sb;
-	BUG_ON(!sb);
+	sb = fc->root->d_sb;
 	WARN_ON(!sb->s_bdi);
 
 	/*
@@ -1276,9 +1567,11 @@
 	smp_wmb();
 	sb->s_flags |= SB_BORN;
 
-	error = security_sb_kern_mount(sb, flags, secdata);
-	if (error)
-		goto out_sb;
+	error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
+	if (unlikely(error)) {
+		fc_drop_locked(fc);
+		return error;
+	}
 
 	/*
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
@@ -1287,19 +1580,11 @@
 	 * violate this rule.
 	 */
 	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
-		"negative value (%lld)\n", type->name, sb->s_maxbytes);
+		"negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
 
-	up_write(&sb->s_umount);
-	free_secdata(secdata);
-	return root;
-out_sb:
-	dput(root);
-	deactivate_locked_super(sb);
-out_free_secdata:
-	free_secdata(secdata);
-out:
-	return ERR_PTR(error);
+	return 0;
 }
+EXPORT_SYMBOL(vfs_get_tree);
 
 /*
  * Setup private BDI for given superblock. It gets automatically cleaned up
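
With sget_fc(), vfs_get_super() and the get_tree_*() wrappers in place, most filesystems no longer open-code superblock lookup. For an unkeyed, memory-backed filesystem the whole mount path reduces to a sketch like this (hypothetical names):

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        /* set sb->s_op, allocate the root inode, fill fc-independent state */
        return 0;
}

static int examplefs_get_tree(struct fs_context *fc)
{
        /* unkeyed: every mount creates a fresh superblock; a keyed
         * filesystem would instead call get_tree_keyed(fc, fill_super, key)
         * and be matched against sb->s_fs_info by test_keyed_super() */
        return get_tree_nodev(fc, examplefs_fill_super);
}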
diff --git a/fs/sync.c b/fs/sync.c
index b54e054..4d1ff01 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -234,58 +234,10 @@
 	return do_fsync(fd, 1);
 }
 
-/*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
- * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
- *
- * The flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
- * before performing the write.
- *
- * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
- * range which are not presently under writeback. Note that this may block for
- * significant periods due to exhaustion of disk request structures.
- *
- * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
- * after performing the write.
- *
- * Useful combinations of the flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
- * under writeout.  This is a start-write-for-data-integrity operation.
- *
- * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
- * are not presently under writeout.  This is an asynchronous flush-to-disk
- * operation.  Not suitable for data integrity operations.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
- * completion of writeout of all pages in the range.  This will be used after an
- * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
- * for that operation to complete and to return the result.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
- * a traditional sync() operation.  This is a write-for-data-integrity operation
- * which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
- *
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
- * I/O errors or ENOSPC conditions and will return those to the caller, after
- * clearing the EIO and ENOSPC flags in the address_space.
- *
- * It should be noted that none of these operations write out the file's
- * metadata.  So unless the application is strictly performing overwrites of
- * already-instantiated disk blocks, there are no guarantees here that the data
- * will be available after a crash.
- */
-int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-			 unsigned int flags)
+int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
+		    unsigned int flags)
 {
 	int ret;
-	struct fd f;
 	struct address_space *mapping;
 	loff_t endbyte;			/* inclusive */
 	umode_t i_mode;
@@ -325,41 +277,105 @@
 	else
 		endbyte--;		/* inclusive */
 
-	ret = -EBADF;
-	f = fdget(fd);
-	if (!f.file)
-		goto out;
-
-	i_mode = file_inode(f.file)->i_mode;
+	i_mode = file_inode(file)->i_mode;
 	ret = -ESPIPE;
 	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 			!S_ISLNK(i_mode))
-		goto out_put;
+		goto out;
 
-	mapping = f.file->f_mapping;
+	mapping = file->f_mapping;
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
+		int sync_mode = WB_SYNC_NONE;
+
+		if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
+			     SYNC_FILE_RANGE_WRITE_AND_WAIT)
+			sync_mode = WB_SYNC_ALL;
+
 		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						 WB_SYNC_NONE);
+						 sync_mode);
 		if (ret < 0)
-			goto out_put;
+			goto out;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);
 
-out_put:
-	fdput(f);
 out:
 	return ret;
 }
 
+/*
+ * ksys_sync_file_range() permits finely controlled syncing over a segment of
+ * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
+ * zero then ksys_sync_file_range() will operate from offset out to EOF.
+ *
+ * The flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
+ * before performing the write.
+ *
+ * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
+ * range which are not presently under writeback. Note that this may block for
+ * significant periods due to exhaustion of disk request structures.
+ *
+ * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
+ * after performing the write.
+ *
+ * Useful combinations of the flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
+ * in the range which were dirty on entry to ksys_sync_file_range() are placed
+ * under writeout.  This is a start-write-for-data-integrity operation.
+ *
+ * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
+ * are not presently under writeout.  This is an asynchronous flush-to-disk
+ * operation.  Not suitable for data integrity operations.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
+ * completion of writeout of all pages in the range.  This will be used after an
+ * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
+ * for that operation to complete and to return the result.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
+ * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
+ * a traditional sync() operation.  This is a write-for-data-integrity operation
+ * which will ensure that all pages in the range which were dirty on entry to
+ * ksys_sync_file_range() are written to disk.  It should be noted that disk
+ * caches are not flushed by this call, so there are no guarantees here that the
+ * data will be available on disk after a crash.
+ *
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
+ * I/O errors or ENOSPC conditions and will return those to the caller, after
+ * clearing the EIO and ENOSPC flags in the address_space.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
+ */
+int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
+			 unsigned int flags)
+{
+	int ret;
+	struct fd f;
+
+	ret = -EBADF;
+	f = fdget(fd);
+	if (f.file)
+		ret = sync_file_range(f.file, offset, nbytes, flags);
+
+	fdput(f);
+	return ret;
+}
+
 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 				unsigned int, flags)
 {
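
On the userspace side, the combinations described in the comment block map directly onto sync_file_range(2); the WAIT_BEFORE|WRITE|WAIT_AFTER combination now also has the SYNC_FILE_RANGE_WRITE_AND_WAIT name and selects WB_SYNC_ALL writeback. A minimal sketch (glibc exposes the wrapper with _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("data.bin", O_WRONLY);

        if (fd < 0)
                return 1;
        /* write-for-data-integrity over the first 1 MiB; per the comment
         * above, this still does not flush the disk write cache */
        if (sync_file_range(fd, 0, 1 << 20,
                            SYNC_FILE_RANGE_WAIT_BEFORE |
                            SYNC_FILE_RANGE_WRITE |
                            SYNC_FILE_RANGE_WAIT_AFTER) != 0)
                perror("sync_file_range");
        close(fd);
        return 0;
}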
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index b275601..b0fe1cc 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config SYSFS
 	bool "sysfs file system support" if EXPERT
 	default y
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 6eff6e1..0906b9e 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the sysfs virtual filesystem
 #
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index feeae80..aa85f28 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,7 +43,8 @@
 	kuid_t uid;
 	kgid_t gid;
 
-	BUG_ON(!kobj);
+	if (WARN_ON(!kobj))
+		return -EINVAL;
 
 	if (kobj->parent)
 		parent = kobj->parent->sd;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0a7252a..130fc6f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -17,7 +17,6 @@
 #include <linux/seq_file.h>
 
 #include "sysfs.h"
-#include "../kernfs/kernfs-internal.h"
 
 /*
  * Determine ktype->sysfs_ops for the given kernfs_node.  This function
@@ -325,7 +324,8 @@
 	kuid_t uid;
 	kgid_t gid;
 
-	BUG_ON(!kobj || !kobj->sd || !attr);
+	if (WARN_ON(!kobj || !kobj->sd || !attr))
+		return -EINVAL;
 
 	kobject_get_ownership(kobj, &uid, &gid);
 	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
@@ -334,7 +334,7 @@
 }
 EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
 
-int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr)
 {
 	int err = 0;
 	int i;
@@ -493,9 +493,10 @@
 	return ret;
 }
 
-void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
+void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
 {
 	int i;
+
 	for (i = 0; ptr[i]; i++)
 		sysfs_remove_file(kobj, ptr[i]);
 }
@@ -537,7 +538,8 @@
 	kuid_t uid;
 	kgid_t gid;
 
-	BUG_ON(!kobj || !kobj->sd || !attr);
+	if (WARN_ON(!kobj || !kobj->sd || !attr))
+		return -EINVAL;
 
 	kobject_get_ownership(kobj, &uid, &gid);
 	return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1eb2d63..d41c21f 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -112,7 +112,8 @@
 	kgid_t gid;
 	int error;
 
-	BUG_ON(!kobj || (!update && !kobj->sd));
+	if (WARN_ON(!kobj || (!update && !kobj->sd)))
+		return -EINVAL;
 
 	/* Updates may happen before the object has been instantiated */
 	if (unlikely(update && !kobj->sd))
@@ -174,6 +175,26 @@
 }
 EXPORT_SYMBOL_GPL(sysfs_create_group);
 
+static int internal_create_groups(struct kobject *kobj, int update,
+				  const struct attribute_group **groups)
+{
+	int error = 0;
+	int i;
+
+	if (!groups)
+		return 0;
+
+	for (i = 0; groups[i]; i++) {
+		error = internal_create_group(kobj, update, groups[i]);
+		if (error) {
+			while (--i >= 0)
+				sysfs_remove_group(kobj, groups[i]);
+			break;
+		}
+	}
+	return error;
+}
+
 /**
  * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
  * @kobj:	The kobject to create the group on
@@ -190,25 +211,29 @@
 int sysfs_create_groups(struct kobject *kobj,
 			const struct attribute_group **groups)
 {
-	int error = 0;
-	int i;
-
-	if (!groups)
-		return 0;
-
-	for (i = 0; groups[i]; i++) {
-		error = sysfs_create_group(kobj, groups[i]);
-		if (error) {
-			while (--i >= 0)
-				sysfs_remove_group(kobj, groups[i]);
-			break;
-		}
-	}
-	return error;
+	return internal_create_groups(kobj, 0, groups);
 }
 EXPORT_SYMBOL_GPL(sysfs_create_groups);
 
 /**
+ * sysfs_update_groups - given a directory kobject, update a bunch of attribute groups
+ * @kobj:	The kobject to update the group on
+ * @groups:	The attribute groups to update, NULL terminated
+ *
+ * This function updates a bunch of attribute groups.  If an error occurs when
+ * updating a group, all previously updated groups will be removed together
+ * with already existing (not updated) attributes.
+ *
+ * Returns 0 on success or error code from sysfs_update_group on failure.
+ */
+int sysfs_update_groups(struct kobject *kobj,
+			const struct attribute_group **groups)
+{
+	return internal_create_groups(kobj, 1, groups);
+}
+EXPORT_SYMBOL_GPL(sysfs_update_groups);
+
+/**
  * sysfs_update_group - given a directory kobject, update an attribute group
  * @kobj:	The kobject to update the group on
  * @grp:	The attribute group to update
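
sysfs_update_groups() takes the same NULL-terminated array that sysfs_create_groups() took, via the shared internal_create_groups() loop. A usage sketch with hypothetical attributes:

static struct attribute *example_attrs[] = {
        &dev_attr_foo.attr,     /* from a hypothetical DEVICE_ATTR_RW(foo) */
        NULL,
};

static const struct attribute_group example_group = {
        .attrs = example_attrs,
};

static const struct attribute_group *example_groups[] = {
        &example_group,
        NULL,
};

static int example_publish(struct kobject *kobj)
{
        int error = sysfs_create_groups(kobj, example_groups);

        if (error)
                return error;
        /* later, e.g. after state that feeds .is_visible changes: */
        return sysfs_update_groups(kobj, example_groups);
}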
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92682fc..db81cfb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,34 +13,70 @@
 #include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <net/net_namespace.h>
 
 #include "sysfs.h"
 
 static struct kernfs_root *sysfs_root;
 struct kernfs_node *sysfs_root_kn;
 
-static struct dentry *sysfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *root;
-	void *ns;
-	bool new_sb = false;
+	struct kernfs_fs_context *kfc = fc->fs_private;
+	int ret;
 
-	if (!(flags & SB_KERNMOUNT)) {
+	ret = kernfs_get_tree(fc);
+	if (ret)
+		return ret;
+
+	if (kfc->new_sb_created)
+		fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+	return 0;
+}
+
+static void sysfs_fs_context_free(struct fs_context *fc)
+{
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	if (kfc->ns_tag)
+		kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
+	kernfs_free_fs_context(fc);
+	kfree(kfc);
+}
+
+static const struct fs_context_operations sysfs_fs_context_ops = {
+	.free		= sysfs_fs_context_free,
+	.get_tree	= sysfs_get_tree,
+};
+
+static int sysfs_init_fs_context(struct fs_context *fc)
+{
+	struct kernfs_fs_context *kfc;
+	struct net *netns;
+
+	if (!(fc->sb_flags & SB_KERNMOUNT)) {
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
-			return ERR_PTR(-EPERM);
+			return -EPERM;
 	}
 
-	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
-	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
-				SYSFS_MAGIC, &new_sb, ns);
-	if (!new_sb)
-		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
-	else if (!IS_ERR(root))
-		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+	kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
+	if (!kfc)
+		return -ENOMEM;
 
-	return root;
+	kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	kfc->root = sysfs_root;
+	kfc->magic = SYSFS_MAGIC;
+	fc->fs_private = kfc;
+	fc->ops = &sysfs_fs_context_ops;
+	if (netns) {
+		put_user_ns(fc->user_ns);
+		fc->user_ns = get_user_ns(netns->user_ns);
+	}
+	fc->global = true;
+	return 0;
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
@@ -52,10 +88,10 @@
 }
 
 static struct file_system_type sysfs_fs_type = {
-	.name		= "sysfs",
-	.mount		= sysfs_mount,
-	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.name			= "sysfs",
+	.init_fs_context	= sysfs_init_fs_context,
+	.kill_sb		= sysfs_kill_sb,
+	.fs_flags		= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
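
The sysfs conversion above is an instance of the generic fs_context mount API. A hedged sketch of the same shape for a hypothetical filesystem (demo_fill_super and the other demo names are illustrative only):

	static void demo_free_fc(struct fs_context *fc)
	{
		kfree(fc->fs_private);
	}

	static int demo_get_tree(struct fs_context *fc)
	{
		/* demo_fill_super is assumed to exist elsewhere */
		return get_tree_nodev(fc, demo_fill_super);
	}

	static const struct fs_context_operations demo_context_ops = {
		.free		= demo_free_fc,
		.get_tree	= demo_get_tree,
	};

	static int demo_init_fs_context(struct fs_context *fc)
	{
		struct demo_fs_context *ctx;	/* hypothetical private type */

		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return -ENOMEM;

		fc->fs_private = ctx;
		fc->ops = &demo_context_ops;
		return 0;
	}

	static struct file_system_type demo_fs_type = {
		.name			= "demo",
		.init_fs_context	= demo_init_fs_context,
		.kill_sb		= kill_anon_super,
	};
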
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 215c225..c4deecc 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -23,7 +23,8 @@
 {
 	struct kernfs_node *kn, *target = NULL;
 
-	BUG_ON(!name || !parent);
+	if (WARN_ON(!name || !parent))
+		return -EINVAL;
 
 	/*
 	 * We don't own @target_kobj and it may be removed at any time.
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
index 33aeb4b..d4edf7d 100644
--- a/fs/sysv/Kconfig
+++ b/fs/sysv/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
 	depends on BLOCK
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
index 7a75e70..17d12ba 100644
--- a/fs/sysv/Makefile
+++ b/fs/sysv/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux SystemV/Coherent filesystem routines.
 #
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 273736f..02b1d9d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -313,17 +313,11 @@
 	return &si->vfs_inode;
 }
 
-static void sysv_i_callback(struct rcu_head *head)
+static void sysv_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
-static void sysv_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, sysv_i_callback);
-}
-
 static void init_once(void *p)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
@@ -333,7 +327,7 @@
 
 const struct super_operations sysv_sops = {
 	.alloc_inode	= sysv_alloc_inode,
-	.destroy_inode	= sysv_destroy_inode,
+	.free_inode	= sysv_free_in_core_inode,
 	.write_inode	= sysv_write_inode,
 	.evict_inode	= sysv_evict_inode,
 	.put_super	= sysv_put_super,
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 4d5d204..ea2414b 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -28,21 +28,6 @@
 	return err;
 }
 
-static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
-{
-	/* Truncate the name in place, avoids having to define a compare
-	   function. */
-	if (qstr->len > SYSV_NAMELEN) {
-		qstr->len = SYSV_NAMELEN;
-		qstr->hash = full_name_hash(dentry, qstr->name, qstr->len);
-	}
-	return 0;
-}
-
-const struct dentry_operations sysv_dentry_operations = {
-	.d_hash		= sysv_hash,
-};
-
 static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
 {
 	struct inode * inode = NULL;
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 89765dd..cc8e2ed 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/sysv/inode.c
  *
@@ -312,7 +313,6 @@
 
 	flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
 	
-	sbi->s_truncate = 1;
 	sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
 	sbi->s_inodes_per_block = bsize >> 6;
 	sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
@@ -334,8 +334,6 @@
 	sb->s_op = &sysv_sops;
 	if (sbi->s_forced_ro)
 		sb->s_flags |= SB_RDONLY;
-	if (sbi->s_truncate)
-		sb->s_d_op = &sysv_dentry_operations;
 	root_inode = sysv_iget(sb, SYSV_ROOT_INO);
 	if (IS_ERR(root_inode)) {
 		printk("SysV FS: get root inode failed\n");
@@ -370,7 +368,8 @@
 	sbi->s_block_base = 0;
 	mutex_init(&sbi->s_lock);
 	sb->s_fs_info = sbi;
-
+	sb->s_time_min = 0;
+	sb->s_time_max = U32_MAX;
 	sb_set_blocksize(sb, BLOCK_SIZE);
 
 	for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) {
@@ -489,6 +488,8 @@
 	sbi->s_type = FSTYPE_V7;
 	mutex_init(&sbi->s_lock);
 	sb->s_fs_info = sbi;
+	sb->s_time_min = 0;
+	sb->s_time_max = U32_MAX;
 	
 	sb_set_blocksize(sb, 512);
 
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index e913698..1cff585 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -23,8 +23,6 @@
 	struct super_block *s_sb;	/* VFS superblock */
 	int	       s_type;		/* file system type: FSTYPE_{XENIX|SYSV|COH} */
 	char	       s_bytesex;	/* bytesex (le/be/pdp) */
-	char	       s_truncate;	/* if 1: names > SYSV_NAMELEN chars are truncated */
-					/* if 0: they are disallowed (ENAMETOOLONG) */
 	unsigned int   s_inodes_per_block;	/* number of inodes per block */
 	unsigned int   s_inodes_per_block_1;	/* inodes_per_block - 1 */
 	unsigned int   s_inodes_per_block_bits;	/* log2(inodes_per_block) */
@@ -166,7 +164,6 @@
 extern const struct file_operations sysv_dir_operations;
 extern const struct address_space_operations sysv_aops;
 extern const struct super_operations sysv_sops;
-extern const struct dentry_operations sysv_dentry_operations;
 
 
 enum {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d69ad80..48305ba 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -471,7 +471,11 @@
 				break;
 		}
 		spin_unlock_irq(&ctx->wqh.lock);
-		cpu_relax();
+
+		if (isalarm(ctx))
+			hrtimer_cancel_wait_running(&ctx->t.alarm.timer);
+		else
+			hrtimer_cancel_wait_running(&ctx->t.tmr);
 	}
 
 	/*
@@ -560,30 +564,30 @@
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
-		const struct compat_itimerspec __user *, utmr,
-		struct compat_itimerspec __user *, otmr)
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
+		const struct old_itimerspec32 __user *, utmr,
+		struct old_itimerspec32 __user *, otmr)
 {
 	struct itimerspec64 new, old;
 	int ret;
 
-	if (get_compat_itimerspec64(&new, utmr))
+	if (get_old_itimerspec32(&new, utmr))
 		return -EFAULT;
 	ret = do_timerfd_settime(ufd, flags, &new, &old);
 	if (ret)
 		return ret;
-	if (otmr && put_compat_itimerspec64(&old, otmr))
+	if (otmr && put_old_itimerspec32(&old, otmr))
 		return -EFAULT;
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
-		struct compat_itimerspec __user *, otmr)
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
+		struct old_itimerspec32 __user *, otmr)
 {
 	struct itimerspec64 kotmr;
 	int ret = do_timerfd_gettime(ufd, &kotmr);
 	if (ret)
 		return ret;
-	return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0;
+	return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0;
 }
 #endif
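
The rename from COMPAT_SYSCALL_DEFINE to timerfd_settime32/timerfd_gettime32 is invisible to applications; the flow through the libc wrappers stays the same. A minimal userspace sketch (plain POSIX/glibc API, not kernel code):

	#include <sys/timerfd.h>
	#include <unistd.h>
	#include <stdint.h>

	static int demo_timer(void)
	{
		struct itimerspec its = {
			.it_value = { .tv_sec = 1 },	/* first expiry after 1s */
		};
		uint64_t expirations;
		int fd = timerfd_create(CLOCK_MONOTONIC, 0);

		if (fd < 0)
			return -1;
		if (timerfd_settime(fd, 0, &its, NULL) < 0) {
			close(fd);
			return -1;
		}
		/* read() blocks until the timer fires, then yields the count */
		if (read(fd, &expirations, sizeof(expirations)) != sizeof(expirations)) {
			close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}
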
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
index 82fa35b..7c35a28 100644
--- a/fs/tracefs/Makefile
+++ b/fs/tracefs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 tracefs-objs	:= inode.o
 
 obj-$(CONFIG_TRACING)	+= tracefs.o
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 7098c49..0caa151 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  inode.c - part of tracefs, a pseudo file system for activating tracing
  *
@@ -5,12 +6,7 @@
  *
  *  Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
  *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License version
- *	2 as published by the Free Software Foundation.
- *
  * tracefs is the file system that is used by the tracing infrastructure.
- *
  */
 
 #include <linux/module.h>
@@ -20,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/tracefs.h>
 #include <linux/fsnotify.h>
+#include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
@@ -394,6 +391,9 @@
 	struct dentry *dentry;
 	struct inode *inode;
 
+	if (security_locked_down(LOCKDOWN_TRACEFS))
+		return NULL;
+
 	if (!(mode & S_IFMT))
 		mode |= S_IFREG;
 	BUG_ON(!S_ISREG(mode));
@@ -509,9 +509,12 @@
 			switch (dentry->d_inode->i_mode & S_IFMT) {
 			case S_IFDIR:
 				ret = simple_rmdir(parent->d_inode, dentry);
+				if (!ret)
+					fsnotify_rmdir(parent->d_inode, dentry);
 				break;
 			default:
 				simple_unlink(parent->d_inode, dentry);
+				fsnotify_unlink(parent->d_inode, dentry);
 				break;
 			}
 			if (!ret)
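
The security_locked_down() hook used above returns 0 when the operation is allowed and a negative error (-EPERM from the lockdown LSM) otherwise, so callers simply bail out early. A hedged sketch of that caller-side pattern:

	#include <linux/security.h>

	/* hypothetical helper guarding a kernel-internals interface */
	static int demo_guarded_op(void)
	{
		int err;

		err = security_locked_down(LOCKDOWN_TRACEFS);
		if (err)
			return err;	/* kernel is locked down; refuse */

		/* ... perform the operation ... */
		return 0;
	}
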
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bbc7854..69932bc 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config UBIFS_FS
 	tristate "UBIFS file system support"
 	select CRC16
@@ -5,15 +6,20 @@
 	select CRYPTO if UBIFS_FS_ADVANCED_COMPR
 	select CRYPTO if UBIFS_FS_LZO
 	select CRYPTO if UBIFS_FS_ZLIB
+	select CRYPTO if UBIFS_FS_ZSTD
 	select CRYPTO_LZO if UBIFS_FS_LZO
 	select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
+	select CRYPTO_ZSTD if UBIFS_FS_ZSTD
+	select CRYPTO_HASH_INFO
+	select UBIFS_FS_XATTR if FS_ENCRYPTION
 	depends on MTD_UBI
 	help
 	  UBIFS is a file system for flash devices which works on top of UBI.
 
+if UBIFS_FS
+
 config UBIFS_FS_ADVANCED_COMPR
 	bool "Advanced compression options"
-	depends on UBIFS_FS
 	help
 	  This option allows to explicitly choose which compressions, if any,
 	  are enabled in UBIFS. Removing compressors means inability to read
@@ -23,7 +29,6 @@
 
 config UBIFS_FS_LZO
 	bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
-	depends on UBIFS_FS
 	default y
 	help
 	   LZO compressor is generally faster than zlib but compresses worse.
@@ -31,14 +36,20 @@
 
 config UBIFS_FS_ZLIB
 	bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
-	depends on UBIFS_FS
 	default y
 	help
 	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
 
-config UBIFS_ATIME_SUPPORT
-	bool "Access time support" if UBIFS_FS
+config UBIFS_FS_ZSTD
+	bool "ZSTD compression support" if UBIFS_FS_ADVANCED_COMPR
 	depends on UBIFS_FS
+	default y
+	help
+	  ZSTD compression is a big win in speed over Zlib and
+	  in compression ratio over LZO. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+	bool "Access time support"
 	default n
 	help
 	  Originally UBIFS did not support atime, because it looked like a bad idea due
@@ -53,7 +64,6 @@
 
 config UBIFS_FS_XATTR
 	bool "UBIFS XATTR support"
-	depends on UBIFS_FS
 	default y
 	help
 	  Saying Y here includes support for extended attributes (xattrs).
@@ -62,20 +72,9 @@
 
 	  If unsure, say Y.
 
-config UBIFS_FS_ENCRYPTION
-	bool "UBIFS Encryption"
-	depends on UBIFS_FS && UBIFS_FS_XATTR && BLOCK
-	select FS_ENCRYPTION
-	default n
-	help
-	  Enable encryption of UBIFS files and directories. This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
 config UBIFS_FS_SECURITY
 	bool "UBIFS Security Labels"
-	depends on UBIFS_FS && UBIFS_FS_XATTR
+	depends on UBIFS_FS_XATTR
 	default y
 	help
 	  Security labels provide an access control facility to support Linux
@@ -85,3 +84,17 @@
 	  the extended attribute support in advance.
 
 	  If you are not using a security module, say N.
+
+config UBIFS_FS_AUTHENTICATION
+	bool "UBIFS authentication support"
+	select KEYS
+	select CRYPTO_HMAC
+	select SYSTEM_DATA_VERIFICATION
+	help
+	  Enable authentication support for UBIFS. This feature offers protection
+	  against offline changes for both data and metadata of the filesystem.
+	  If you say yes here, you should also select a hashing algorithm such as
+	  sha256; these are not selected automatically since there are many
+	  different options.
+
+endif # UBIFS_FS
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 6197d7e..5c4b845 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -6,5 +6,6 @@
 ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
 ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
 ubifs-y += misc.o
-ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
+ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
 ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
+ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
new file mode 100644
index 0000000..8cdbd53
--- /dev/null
+++ b/fs/ubifs/auth.c
@@ -0,0 +1,568 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2018 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ */
+
+/*
+ * This file implements various helper functions for UBIFS authentication support.
+ */
+
+#include <linux/crypto.h>
+#include <linux/verification.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <crypto/algapi.h>
+#include <keys/user-type.h>
+#include <keys/asymmetric-type.h>
+
+#include "ubifs.h"
+
+/**
+ * ubifs_node_calc_hash - calculate the hash of a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to calculate a hash for
+ * @hash: the returned hash
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node,
+			    u8 *hash)
+{
+	const struct ubifs_ch *ch = node;
+	SHASH_DESC_ON_STACK(shash, c->hash_tfm);
+	int err;
+
+	shash->tfm = c->hash_tfm;
+
+	err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash);
+	if (err < 0)
+		return err;
+	return 0;
+}
+
+/**
+ * ubifs_hash_calc_hmac - calculate a HMAC from a hash
+ * @c: UBIFS file-system description object
+ * @hash: the node to calculate a HMAC for
+ * @hmac: the returned HMAC
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash,
+				 u8 *hmac)
+{
+	SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+	int err;
+
+	shash->tfm = c->hmac_tfm;
+
+	err = crypto_shash_digest(shash, hash, c->hash_len, hmac);
+	if (err < 0)
+		return err;
+	return 0;
+}
+
+/**
+ * ubifs_prepare_auth_node - Prepare an authentication node
+ * @c: UBIFS file-system description object
+ * @node: the node to calculate a hash for
+ * @inhash: input hash of previous nodes
+ *
+ * This function prepares an authentication node for writing onto flash.
+ * It creates a HMAC from the given input hash and writes it to the node.
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
+			     struct shash_desc *inhash)
+{
+	struct ubifs_auth_node *auth = node;
+	u8 *hash;
+	int err;
+
+	hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
+	if (!hash)
+		return -ENOMEM;
+
+	{
+		SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
+
+		hash_desc->tfm = c->hash_tfm;
+		ubifs_shash_copy_state(c, inhash, hash_desc);
+
+		err = crypto_shash_final(hash_desc, hash);
+		if (err)
+			goto out;
+	}
+
+	err = ubifs_hash_calc_hmac(c, hash, auth->hmac);
+	if (err)
+		goto out;
+
+	auth->ch.node_type = UBIFS_AUTH_NODE;
+	ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0);
+
+	err = 0;
+out:
+	kfree(hash);
+
+	return err;
+}
+
+static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c,
+					 struct crypto_shash *tfm)
+{
+	struct shash_desc *desc;
+	int err;
+
+	if (!ubifs_authenticated(c))
+		return NULL;
+
+	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	desc->tfm = tfm;
+
+	err = crypto_shash_init(desc);
+	if (err) {
+		kfree(desc);
+		return ERR_PTR(err);
+	}
+
+	return desc;
+}
+
+/**
+ * __ubifs_hash_get_desc - get a descriptor suitable for hashing a node
+ * @c: UBIFS file-system description object
+ *
+ * This function returns a descriptor suitable for hashing a node. Free after use
+ * with kfree.
+ */
+struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c)
+{
+	return ubifs_get_desc(c, c->hash_tfm);
+}
+
+/**
+ * ubifs_bad_hash - Report hash mismatches
+ * @c: UBIFS file-system description object
+ * @node: the node
+ * @hash: the expected hash
+ * @lnum: the LEB @node was read from
+ * @offs: offset in LEB @node was read from
+ *
+ * This function reports a hash mismatch when a node has a different hash than
+ * expected.
+ */
+void ubifs_bad_hash(const struct ubifs_info *c, const void *node, const u8 *hash,
+		    int lnum, int offs)
+{
+	int len = min(c->hash_len, 20);
+	int cropped = len != c->hash_len;
+	const char *cont = cropped ? "..." : "";
+
+	u8 calc[UBIFS_HASH_ARR_SZ];
+
+	__ubifs_node_calc_hash(c, node, calc);
+
+	ubifs_err(c, "hash mismatch on node at LEB %d:%d", lnum, offs);
+	ubifs_err(c, "hash expected:   %*ph%s", len, hash, cont);
+	ubifs_err(c, "hash calculated: %*ph%s", len, calc, cont);
+}
+
+/**
+ * __ubifs_node_check_hash - check the hash of a node against given hash
+ * @c: UBIFS file-system description object
+ * @node: the node
+ * @expected: the expected hash
+ *
+ * This function calculates a hash over a node and compares it to the given hash.
+ * Returns 0 if both hashes are equal or authentication is disabled; otherwise a
+ * negative error code is returned.
+ */
+int __ubifs_node_check_hash(const struct ubifs_info *c, const void *node,
+			    const u8 *expected)
+{
+	u8 calc[UBIFS_HASH_ARR_SZ];
+	int err;
+
+	err = __ubifs_node_calc_hash(c, node, calc);
+	if (err)
+		return err;
+
+	if (ubifs_check_hash(c, expected, calc))
+		return -EPERM;
+
+	return 0;
+}
+
+/**
+ * ubifs_sb_verify_signature - verify the signature of a superblock
+ * @c: UBIFS file-system description object
+ * @sup: The superblock node
+ *
+ * To support offline signed images the superblock can be signed with a
+ * PKCS#7 signature. The signature is placed directly behind the superblock
+ * node in an ubifs_sig_node.
+ *
+ * Returns 0 when the signature can be successfully verified or a negative
+ * error code if not.
+ */
+int ubifs_sb_verify_signature(struct ubifs_info *c,
+			      const struct ubifs_sb_node *sup)
+{
+	int err;
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	const struct ubifs_sig_node *signode;
+
+	sleb = ubifs_scan(c, UBIFS_SB_LNUM, UBIFS_SB_NODE_SZ, c->sbuf, 0);
+	if (IS_ERR(sleb)) {
+		err = PTR_ERR(sleb);
+		return err;
+	}
+
+	if (sleb->nodes_cnt == 0) {
+		ubifs_err(c, "Unable to find signature node");
+		err = -EINVAL;
+		goto out_destroy;
+	}
+
+	snod = list_first_entry(&sleb->nodes, struct ubifs_scan_node, list);
+
+	if (snod->type != UBIFS_SIG_NODE) {
+		ubifs_err(c, "Signature node is of wrong type");
+		err = -EINVAL;
+		goto out_destroy;
+	}
+
+	signode = snod->node;
+
+	if (le32_to_cpu(signode->len) > snod->len + sizeof(struct ubifs_sig_node)) {
+		ubifs_err(c, "invalid signature len %d", le32_to_cpu(signode->len));
+		err = -EINVAL;
+		goto out_destroy;
+	}
+
+	if (le32_to_cpu(signode->type) != UBIFS_SIGNATURE_TYPE_PKCS7) {
+		ubifs_err(c, "Signature type %d is not supported",
+			  le32_to_cpu(signode->type));
+		err = -EINVAL;
+		goto out_destroy;
+	}
+
+	err = verify_pkcs7_signature(sup, sizeof(struct ubifs_sb_node),
+				     signode->sig, le32_to_cpu(signode->len),
+				     NULL, VERIFYING_UNSPECIFIED_SIGNATURE,
+				     NULL, NULL);
+
+	if (err)
+		ubifs_err(c, "Failed to verify signature");
+	else
+		ubifs_msg(c, "Successfully verified super block signature");
+
+out_destroy:
+	ubifs_scan_destroy(sleb);
+
+	return err;
+}
+
+/**
+ * ubifs_init_authentication - initialize UBIFS authentication support
+ * @c: UBIFS file-system description object
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_init_authentication(struct ubifs_info *c)
+{
+	struct key *keyring_key;
+	const struct user_key_payload *ukp;
+	int err;
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+
+	if (!c->auth_hash_name) {
+		ubifs_err(c, "authentication hash name needed with authentication");
+		return -EINVAL;
+	}
+
+	c->auth_hash_algo = match_string(hash_algo_name, HASH_ALGO__LAST,
+					 c->auth_hash_name);
+	if ((int)c->auth_hash_algo < 0) {
+		ubifs_err(c, "Unknown hash algo %s specified",
+			  c->auth_hash_name);
+		return -EINVAL;
+	}
+
+	snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+		 c->auth_hash_name);
+
+	keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL);
+
+	if (IS_ERR(keyring_key)) {
+		ubifs_err(c, "Failed to request key: %ld",
+			  PTR_ERR(keyring_key));
+		return PTR_ERR(keyring_key);
+	}
+
+	down_read(&keyring_key->sem);
+
+	if (keyring_key->type != &key_type_logon) {
+		ubifs_err(c, "key type must be logon");
+		err = -ENOKEY;
+		goto out;
+	}
+
+	ukp = user_key_payload_locked(keyring_key);
+	if (!ukp) {
+		/* key was revoked before we acquired its semaphore */
+		err = -EKEYREVOKED;
+		goto out;
+	}
+
+	c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0);
+	if (IS_ERR(c->hash_tfm)) {
+		err = PTR_ERR(c->hash_tfm);
+		ubifs_err(c, "Can not allocate %s: %d",
+			  c->auth_hash_name, err);
+		goto out;
+	}
+
+	c->hash_len = crypto_shash_digestsize(c->hash_tfm);
+	if (c->hash_len > UBIFS_HASH_ARR_SZ) {
+		ubifs_err(c, "hash %s is bigger than maximum allowed hash size (%d > %d)",
+			  c->auth_hash_name, c->hash_len, UBIFS_HASH_ARR_SZ);
+		err = -EINVAL;
+		goto out_free_hash;
+	}
+
+	c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(c->hmac_tfm)) {
+		err = PTR_ERR(c->hmac_tfm);
+		ubifs_err(c, "Can not allocate %s: %d", hmac_name, err);
+		goto out_free_hash;
+	}
+
+	c->hmac_desc_len = crypto_shash_digestsize(c->hmac_tfm);
+	if (c->hmac_desc_len > UBIFS_HMAC_ARR_SZ) {
+		ubifs_err(c, "hmac %s is bigger than maximum allowed hmac size (%d > %d)",
+			  hmac_name, c->hmac_desc_len, UBIFS_HMAC_ARR_SZ);
+		err = -EINVAL;
+		goto out_free_hash;
+	}
+
+	err = crypto_shash_setkey(c->hmac_tfm, ukp->data, ukp->datalen);
+	if (err)
+		goto out_free_hmac;
+
+	c->authenticated = true;
+
+	c->log_hash = ubifs_hash_get_desc(c);
+	if (IS_ERR(c->log_hash)) {
+		err = PTR_ERR(c->log_hash);
+		goto out_free_hmac;
+	}
+
+	err = 0;
+
+out_free_hmac:
+	if (err)
+		crypto_free_shash(c->hmac_tfm);
+out_free_hash:
+	if (err)
+		crypto_free_shash(c->hash_tfm);
+out:
+	up_read(&keyring_key->sem);
+	key_put(keyring_key);
+
+	return err;
+}
+
+/**
+ * __ubifs_exit_authentication - release resource
+ * @c: UBIFS file-system description object
+ *
+ * This function releases the authentication related resources.
+ */
+void __ubifs_exit_authentication(struct ubifs_info *c)
+{
+	if (!ubifs_authenticated(c))
+		return;
+
+	crypto_free_shash(c->hmac_tfm);
+	crypto_free_shash(c->hash_tfm);
+	kfree(c->log_hash);
+}
+
+/**
+ * ubifs_node_calc_hmac - calculate the HMAC of a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to insert a HMAC into.
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is inserted
+ * @hmac: returned HMAC
+ *
+ * This function calculates a HMAC of a UBIFS node. The HMAC is expected to be
+ * embedded into the node, so this area is not covered by the HMAC. Also not
+ * covered are the UBIFS_NODE_MAGIC and the CRC of the node.
+ */
+static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node,
+				int len, int ofs_hmac, void *hmac)
+{
+	SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+	int hmac_len = c->hmac_desc_len;
+	int err;
+
+	ubifs_assert(c, ofs_hmac > 8);
+	ubifs_assert(c, ofs_hmac + hmac_len < len);
+
+	shash->tfm = c->hmac_tfm;
+
+	err = crypto_shash_init(shash);
+	if (err)
+		return err;
+
+	/* behind common node header CRC up to HMAC begin */
+	err = crypto_shash_update(shash, node + 8, ofs_hmac - 8);
+	if (err < 0)
+		return err;
+
+	/* behind HMAC, if any */
+	if (len - ofs_hmac - hmac_len > 0) {
+		err = crypto_shash_update(shash, node + ofs_hmac + hmac_len,
+			    len - ofs_hmac - hmac_len);
+		if (err < 0)
+			return err;
+	}
+
+	return crypto_shash_final(shash, hmac);
+}
+
+/**
+ * __ubifs_node_insert_hmac - insert a HMAC into a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to insert a HMAC into.
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is inserted
+ *
+ * This function inserts a HMAC at offset @ofs_hmac into the node given in
+ * @node.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *node, int len,
+			    int ofs_hmac)
+{
+	return ubifs_node_calc_hmac(c, node, len, ofs_hmac, node + ofs_hmac);
+}
+
+/**
+ * __ubifs_node_verify_hmac - verify the HMAC of a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to verify
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is located
+ *
+ * This function verifies the HMAC at offset @ofs_hmac of the node given in
+ * @node. Returns 0 if successful or a negative error code otherwise.
+ */
+int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *node,
+			     int len, int ofs_hmac)
+{
+	int hmac_len = c->hmac_desc_len;
+	u8 *hmac;
+	int err;
+
+	hmac = kmalloc(hmac_len, GFP_NOFS);
+	if (!hmac)
+		return -ENOMEM;
+
+	err = ubifs_node_calc_hmac(c, node, len, ofs_hmac, hmac);
+	if (err) {
+		kfree(hmac);
+		return err;
+	}
+
+	err = crypto_memneq(hmac, node + ofs_hmac, hmac_len);
+
+	kfree(hmac);
+
+	if (!err)
+		return 0;
+
+	return -EPERM;
+}
+
+int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src,
+			     struct shash_desc *target)
+{
+	u8 *state;
+	int err;
+
+	state = kmalloc(crypto_shash_descsize(src->tfm), GFP_NOFS);
+	if (!state)
+		return -ENOMEM;
+
+	err = crypto_shash_export(src, state);
+	if (err)
+		goto out;
+
+	err = crypto_shash_import(target, state);
+
+out:
+	kfree(state);
+
+	return err;
+}
+
+/**
+ * ubifs_hmac_wkm - Create a HMAC of the well known message
+ * @c: UBIFS file-system description object
+ * @hmac: The HMAC of the well known message
+ *
+ * This function creates a HMAC of a well known message. This is used
+ * to check if the provided key is suitable to authenticate a UBIFS
+ * image. This is only a convenience to the user to provide a better
+ * error message when the wrong key is provided.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
+{
+	SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+	int err;
+	const char well_known_message[] = "UBIFS";
+
+	if (!ubifs_authenticated(c))
+		return 0;
+
+	shash->tfm = c->hmac_tfm;
+
+	err = crypto_shash_init(shash);
+	if (err)
+		return err;
+
+	err = crypto_shash_update(shash, well_known_message,
+				  sizeof(well_known_message) - 1);
+	if (err < 0)
+		return err;
+
+	err = crypto_shash_final(shash, hmac);
+	if (err)
+		return err;
+	return 0;
+}
+
+/*
+ * ubifs_hmac_zero - test if a HMAC is zero
+ * @c: UBIFS file-system description object
+ * @hmac: the HMAC to test
+ *
+ * This function tests if a HMAC is zero and returns true if it is
+ * and false otherwise.
+ */
+bool ubifs_hmac_zero(struct ubifs_info *c, const u8 *hmac)
+{
+	return !memchr_inv(hmac, 0, c->hmac_desc_len);
+}
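
The helpers in auth.c all lean on the same one-shot shash pattern: allocate a tfm once, then digest buffers through a stack descriptor. A self-contained sketch of that pattern (the algorithm name is illustrative):

	#include <crypto/hash.h>

	static int demo_digest(const void *buf, unsigned int len, u8 *out)
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("sha256", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			err = crypto_shash_digest(desc, buf, len, out);
		}

		crypto_free_shash(tfm);
		return err;
	}
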
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 7ef22ba..c0b84e9 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -63,7 +51,7 @@
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
 	down_read(&c->vfs_sb->s_umount);
-	writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
+	writeback_inodes_sb_nr(c->vfs_sb, nr_to_write, WB_REASON_FS_FREE_SPACE);
 	up_read(&c->vfs_sb->s_umount);
 }
 
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 591f2c7..ad292c5 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 565cb56..3a92e6a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  * Copyright (C) 2006, 2007 University of Szeged, Hungary
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  *          Zoltan Sogor
@@ -71,6 +59,24 @@
 };
 #endif
 
+#ifdef CONFIG_UBIFS_FS_ZSTD
+static DEFINE_MUTEX(zstd_enc_mutex);
+static DEFINE_MUTEX(zstd_dec_mutex);
+
+static struct ubifs_compressor zstd_compr = {
+	.compr_type = UBIFS_COMPR_ZSTD,
+	.comp_mutex = &zstd_enc_mutex,
+	.decomp_mutex = &zstd_dec_mutex,
+	.name = "zstd",
+	.capi_name = "zstd",
+};
+#else
+static struct ubifs_compressor zstd_compr = {
+	.compr_type = UBIFS_COMPR_ZSTD,
+	.name = "zstd",
+};
+#endif
+
 /* All UBIFS compressors */
 struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
@@ -228,13 +234,19 @@
 	if (err)
 		return err;
 
-	err = compr_init(&zlib_compr);
+	err = compr_init(&zstd_compr);
 	if (err)
 		goto out_lzo;
 
+	err = compr_init(&zlib_compr);
+	if (err)
+		goto out_zstd;
+
 	ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
 	return 0;
 
+out_zstd:
+	compr_exit(&zstd_compr);
 out_lzo:
 	compr_exit(&lzo_compr);
 	return err;
@@ -247,4 +259,5 @@
 {
 	compr_exit(&lzo_compr);
 	compr_exit(&zlib_compr);
+	compr_exit(&zstd_compr);
 }
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 4aaedf2..22be7ae 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -29,8 +29,8 @@
 {
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	void *p = &dn->data;
-	struct page *ret;
 	unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE);
+	int err;
 
 	ubifs_assert(c, pad_len <= *out_len);
 	dn->compr_size = cpu_to_le16(in_len);
@@ -39,11 +39,11 @@
 	if (pad_len != in_len)
 		memset(p + in_len, 0, pad_len - in_len);
 
-	ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len,
-			offset_in_page(&dn->data), block, GFP_NOFS);
-	if (IS_ERR(ret)) {
-		ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret));
-		return PTR_ERR(ret);
+	err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len,
+					    offset_in_page(p), block, GFP_NOFS);
+	if (err) {
+		ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err);
+		return err;
 	}
 	*out_len = pad_len;
 
@@ -64,10 +64,11 @@
 	}
 
 	ubifs_assert(c, dlen <= UBIFS_BLOCK_SIZE);
-	err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen,
-			offset_in_page(&dn->data), block);
+	err = fscrypt_decrypt_block_inplace(inode, virt_to_page(&dn->data),
+					    dlen, offset_in_page(&dn->data),
+					    block);
 	if (err) {
-		ubifs_err(c, "fscrypt_decrypt_page failed: %i", err);
+		ubifs_err(c, "fscrypt_decrypt_block_inplace() failed: %d", err);
 		return err;
 	}
 	*out_len = clen;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 564e330..e4b5278 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -165,6 +153,8 @@
 		return "commit start node";
 	case UBIFS_ORPH_NODE:
 		return "orphan node";
+	case UBIFS_AUTH_NODE:
+		return "auth node";
 	default:
 		return "unknown node";
 	}
@@ -542,6 +532,10 @@
 			       (unsigned long long)le64_to_cpu(orph->inos[i]));
 		break;
 	}
+	case UBIFS_AUTH_NODE:
+	{
+		break;
+	}
 	default:
 		pr_err("node type %d was not recognized\n",
 		       (int)ch->node_type);
@@ -1597,7 +1591,6 @@
 				err = PTR_ERR(child);
 				goto out_unlock;
 			}
-			zbr->znode = child;
 		}
 
 		znode = child;
@@ -2807,115 +2800,68 @@
  * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
  * @c: UBIFS file-system description object
  *
- * This function creates all debugfs files for this instance of UBIFS. Returns
- * zero in case of success and a negative error code in case of failure.
+ * This function creates all debugfs files for this instance of UBIFS.
  *
  * Note, the only reason we have not merged this function with the
  * 'ubifs_debugging_init()' function is because it is better to initialize
  * debugfs interfaces at the very end of the mount process, and remove them at
  * the very beginning of the mount process.
  */
-int dbg_debugfs_init_fs(struct ubifs_info *c)
+void dbg_debugfs_init_fs(struct ubifs_info *c)
 {
-	int err, n;
+	int n;
 	const char *fname;
-	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
-	if (!IS_ENABLED(CONFIG_DEBUG_FS))
-		return 0;
-
 	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
 		     c->vi.ubi_num, c->vi.vol_id);
 	if (n == UBIFS_DFS_DIR_LEN) {
 		/* The array size is too small */
-		fname = UBIFS_DFS_DIR_NAME;
-		dent = ERR_PTR(-EINVAL);
-		goto out;
+		return;
 	}
 
 	fname = d->dfs_dir_name;
-	dent = debugfs_create_dir(fname, dfs_rootdir);
-	if (IS_ERR_OR_NULL(dent))
-		goto out;
-	d->dfs_dir = dent;
+	d->dfs_dir = debugfs_create_dir(fname, dfs_rootdir);
 
 	fname = "dump_lprops";
-	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_dump_lprops = dent;
+	d->dfs_dump_lprops = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c,
+						 &dfs_fops);
 
 	fname = "dump_budg";
-	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_dump_budg = dent;
+	d->dfs_dump_budg = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c,
+					       &dfs_fops);
 
 	fname = "dump_tnc";
-	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_dump_tnc = dent;
+	d->dfs_dump_tnc = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c,
+					      &dfs_fops);
 
 	fname = "chk_general";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_chk_gen = dent;
+	d->dfs_chk_gen = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					     d->dfs_dir, c, &dfs_fops);
 
 	fname = "chk_index";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_chk_index = dent;
+	d->dfs_chk_index = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					       d->dfs_dir, c, &dfs_fops);
 
 	fname = "chk_orphans";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_chk_orph = dent;
+	d->dfs_chk_orph = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					      d->dfs_dir, c, &dfs_fops);
 
 	fname = "chk_lprops";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_chk_lprops = dent;
+	d->dfs_chk_lprops = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+						d->dfs_dir, c, &dfs_fops);
 
 	fname = "chk_fs";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_chk_fs = dent;
+	d->dfs_chk_fs = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					    d->dfs_dir, c, &dfs_fops);
 
 	fname = "tst_recovery";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_tst_rcvry = dent;
+	d->dfs_tst_rcvry = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					       d->dfs_dir, c, &dfs_fops);
 
 	fname = "ro_error";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
-				   &dfs_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	d->dfs_ro_error = dent;
-
-	return 0;
-
-out_remove:
-	debugfs_remove_recursive(d->dfs_dir);
-out:
-	err = dent ? PTR_ERR(dent) : -ENODEV;
-	ubifs_err(c, "cannot create \"%s\" debugfs file or directory, error %d\n",
-		  fname, err);
-	return err;
+	d->dfs_ro_error = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					      d->dfs_dir, c, &dfs_fops);
 }
 
 /**
@@ -2924,8 +2870,7 @@
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	if (IS_ENABLED(CONFIG_DEBUG_FS))
-		debugfs_remove_recursive(c->dbg->dfs_dir);
+	debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
 struct ubifs_global_debug_info ubifs_dbg;
@@ -3001,75 +2946,38 @@
  *
  * UBIFS uses debugfs file-system to expose various debugging knobs to
  * user-space. This function creates "ubifs" directory in the debugfs
- * file-system. Returns zero in case of success and a negative error code in
- * case of failure.
+ * file-system.
  */
-int dbg_debugfs_init(void)
+void dbg_debugfs_init(void)
 {
-	int err;
 	const char *fname;
-	struct dentry *dent;
-
-	if (!IS_ENABLED(CONFIG_DEBUG_FS))
-		return 0;
 
 	fname = "ubifs";
-	dent = debugfs_create_dir(fname, NULL);
-	if (IS_ERR_OR_NULL(dent))
-		goto out;
-	dfs_rootdir = dent;
+	dfs_rootdir = debugfs_create_dir(fname, NULL);
 
 	fname = "chk_general";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_chk_gen = dent;
+	dfs_chk_gen = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir,
+					  NULL, &dfs_global_fops);
 
 	fname = "chk_index";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_chk_index = dent;
+	dfs_chk_index = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					    dfs_rootdir, NULL, &dfs_global_fops);
 
 	fname = "chk_orphans";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_chk_orph = dent;
+	dfs_chk_orph = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					   dfs_rootdir, NULL, &dfs_global_fops);
 
 	fname = "chk_lprops";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_chk_lprops = dent;
+	dfs_chk_lprops = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					     dfs_rootdir, NULL, &dfs_global_fops);
 
 	fname = "chk_fs";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_chk_fs = dent;
+	dfs_chk_fs = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir,
+					 NULL, &dfs_global_fops);
 
 	fname = "tst_recovery";
-	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
-				   &dfs_global_fops);
-	if (IS_ERR_OR_NULL(dent))
-		goto out_remove;
-	dfs_tst_rcvry = dent;
-
-	return 0;
-
-out_remove:
-	debugfs_remove_recursive(dfs_rootdir);
-out:
-	err = dent ? PTR_ERR(dent) : -ENODEV;
-	pr_err("UBIFS error (pid %d): cannot create \"%s\" debugfs file or directory, error %d\n",
-	       current->pid, fname, err);
-	return err;
+	dfs_tst_rcvry = debugfs_create_file(fname, S_IRUSR | S_IWUSR,
+					    dfs_rootdir, NULL, &dfs_global_fops);
 }
 
 /**
@@ -3077,8 +2985,7 @@
  */
 void dbg_debugfs_exit(void)
 {
-	if (IS_ENABLED(CONFIG_DEBUG_FS))
-		debugfs_remove_recursive(dfs_rootdir);
+	debugfs_remove_recursive(dfs_rootdir);
 }
 
 void ubifs_assert_failed(struct ubifs_info *c, const char *expr,
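
The rewrite above follows the current debugfs convention: creation results need no checking, because errors propagate as ERR_PTR parents that the debugfs core tolerates, and debugfs_remove_recursive() copes with NULL and error pointers. A minimal sketch (demo_fops is assumed to exist elsewhere):

	#include <linux/debugfs.h>

	static struct dentry *demo_dir;

	static void demo_debugfs_init(void)
	{
		demo_dir = debugfs_create_dir("demo", NULL);
		debugfs_create_file("knob", 0600, demo_dir, NULL, &demo_fops);
	}

	static void demo_debugfs_exit(void)
	{
		debugfs_remove_recursive(demo_dir);	/* NULL/ERR_PTR safe */
	}
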
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 64c6977..7763639 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -1,21 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -309,9 +297,9 @@
 int dbg_leb_map(struct ubifs_info *c, int lnum);
 
 /* Debugfs-related stuff */
-int dbg_debugfs_init(void);
+void dbg_debugfs_init(void);
 void dbg_debugfs_exit(void);
-int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_init_fs(struct ubifs_info *c);
 void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
 #endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5767b37..0b98e3c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  * Copyright (C) 2006, 2007 University of Szeged, Hungary
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  *          Zoltan Sogor
@@ -220,11 +208,9 @@
 
 	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
 
-	err = fscrypt_prepare_lookup(dir, dentry, flags);
-	if (err)
-		return ERR_PTR(err);
-
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+	err = fscrypt_prepare_lookup(dir, dentry, &nm);
+	if (err == -ENOENT)
+		return d_splice_alias(NULL, dentry);
 	if (err)
 		return ERR_PTR(err);
 
@@ -792,16 +778,14 @@
 		dentry, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		err = fscrypt_get_encryption_info(dir);
-		if (err && err != -ENOKEY)
-			return err;
-	}
-
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
 	if (err)
 		return err;
 
+	err = ubifs_purge_xattrs(inode);
+	if (err)
+		return err;
+
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
 	ubifs_assert(c, inode_is_locked(dir));
@@ -902,16 +886,14 @@
 	if (err)
 		return err;
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		err = fscrypt_get_encryption_info(dir);
-		if (err && err != -ENOKEY)
-			return err;
-	}
-
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
 	if (err)
 		return err;
 
+	err = ubifs_purge_xattrs(inode);
+	if (err)
+		return err;
+
 	sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
 	err = ubifs_budget_space(c, &req);
@@ -1294,9 +1276,14 @@
 		old_dentry, old_inode->i_ino, old_dir->i_ino,
 		new_dentry, new_dir->i_ino, flags);
 
-	if (unlink)
+	if (unlink) {
 		ubifs_assert(c, inode_is_locked(new_inode));
 
+		err = ubifs_purge_xattrs(new_inode);
+		if (err)
+			return err;
+	}
+
 	if (unlink && is_dir) {
 		err = ubifs_check_dir_empty(new_inode);
 		if (err)
@@ -1652,9 +1639,7 @@
 #ifdef CONFIG_UBIFS_FS_XATTR
 	.listxattr   = ubifs_listxattr,
 #endif
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
 	.update_time = ubifs_update_time,
-#endif
 	.tmpfile     = ubifs_tmpfile,
 };
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1b78f2e..cd52585 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -1090,15 +1078,18 @@
 		inode->i_uid = attr->ia_uid;
 	if (attr->ia_valid & ATTR_GID)
 		inode->i_gid = attr->ia_gid;
-	if (attr->ia_valid & ATTR_ATIME)
-		inode->i_atime = timespec64_trunc(attr->ia_atime,
-						  inode->i_sb->s_time_gran);
-	if (attr->ia_valid & ATTR_MTIME)
-		inode->i_mtime = timespec64_trunc(attr->ia_mtime,
-						  inode->i_sb->s_time_gran);
-	if (attr->ia_valid & ATTR_CTIME)
-		inode->i_ctime = timespec64_trunc(attr->ia_ctime,
-						  inode->i_sb->s_time_gran);
+	if (attr->ia_valid & ATTR_ATIME) {
+		inode->i_atime = timestamp_truncate(attr->ia_atime,
+						  inode);
+	}
+	if (attr->ia_valid & ATTR_MTIME) {
+		inode->i_mtime = timestamp_truncate(attr->ia_mtime,
+						  inode);
+	}
+	if (attr->ia_valid & ATTR_CTIME) {
+		inode->i_ctime = timestamp_truncate(attr->ia_ctime,
+						  inode);
+	}
 	if (attr->ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 
@@ -1375,7 +1366,6 @@
 	return 0;
 }
 
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
 /**
  * ubifs_update_time - update time of inode.
  * @inode: inode to update
@@ -1392,6 +1382,9 @@
 	int iflags = I_DIRTY_TIME;
 	int err, release;
 
+	if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
+		return generic_update_time(inode, time, flags);
+
 	err = ubifs_budget_space(c, &req);
 	if (err)
 		return err;
@@ -1414,7 +1407,6 @@
 		ubifs_release_budget(c, &req);
 	return 0;
 }
-#endif
 
 /**
  * update_mctime - update mtime and ctime of an inode.
@@ -1481,7 +1473,7 @@
 {
 	int rc;
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
@@ -1623,9 +1615,10 @@
 	if (err)
 		return err;
 	vma->vm_ops = &ubifs_file_vm_ops;
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
-	file_accessed(file);
-#endif
+
+	if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
+		file_accessed(file);
+
 	return 0;
 }
 
@@ -1663,9 +1656,7 @@
 #ifdef CONFIG_UBIFS_FS_XATTR
 	.listxattr   = ubifs_listxattr,
 #endif
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
 	.update_time = ubifs_update_time,
-#endif
 };
 
 const struct inode_operations ubifs_symlink_inode_operations = {
@@ -1675,9 +1666,7 @@
 #ifdef CONFIG_UBIFS_FS_XATTR
 	.listxattr   = ubifs_listxattr,
 #endif
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
 	.update_time = ubifs_update_time,
-#endif
 };
 
 const struct file_operations ubifs_file_operations = {
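
Dropping the CONFIG_UBIFS_ATIME_SUPPORT #ifdefs in favour of IS_ENABLED() keeps the disabled branch compiled (so it cannot bit-rot) while the optimizer removes it. A small sketch of the idiom, under the same config symbol:

	static int demo_update_time(struct inode *inode, struct timespec64 *time,
				    int flags)
	{
		/* constant-folded: the branch vanishes when the option is off */
		if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
			return generic_update_time(inode, time, flags);

		/* option-specific budgeted path would follow here */
		return 0;
	}
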
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index f964683..873e6e1 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -747,12 +735,6 @@
 	return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
 }
 
-static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
-			   int size)
-{
-	swap(*a, *b);
-}
-
 /**
  * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
  * @c: the UBIFS file-system description object
@@ -772,8 +754,7 @@
 	       sizeof(void *) * c->dirty_idx.cnt);
 	/* Sort it so that the dirtiest is now at the end */
 	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
-	     (int (*)(const void *, const void *))cmp_dirty_idx,
-	     (void (*)(void *, void *, int))swap_dirty_idx);
+	     (int (*)(const void *, const void *))cmp_dirty_idx, NULL);
 	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
 	if (c->dirty_idx.cnt)
 		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
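
Passing NULL as the swap callback is what makes swap_dirty_idx() above removable: lib/sort.c falls back to its built-in generic swap when no function is supplied. A minimal sketch of sort() used that way:

	#include <linux/sort.h>

	static int demo_cmp(const void *a, const void *b)
	{
		int x = *(const int *)a, y = *(const int *)b;

		return (x > y) - (x < y);	/* avoids subtraction overflow */
	}

	static void demo_sort(int *arr, size_t n)
	{
		sort(arr, n, sizeof(*arr), demo_cmp, NULL);	/* NULL = generic swap */
	}
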
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d2680e0..62cb3db 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -254,7 +242,8 @@
 			     snod->type == UBIFS_DATA_NODE ||
 			     snod->type == UBIFS_DENT_NODE ||
 			     snod->type == UBIFS_XENT_NODE ||
-			     snod->type == UBIFS_TRUN_NODE);
+			     snod->type == UBIFS_TRUN_NODE ||
+			     snod->type == UBIFS_AUTH_NODE);
 
 		if (snod->type != UBIFS_INO_NODE  &&
 		    snod->type != UBIFS_DATA_NODE &&
@@ -364,12 +353,13 @@
 
 	/* Write nodes to their new location. Use the first-fit strategy */
 	while (1) {
-		int avail;
+		int avail, moved = 0;
 		struct ubifs_scan_node *snod, *tmp;
 
 		/* Move data nodes */
 		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-			avail = c->leb_size - wbuf->offs - wbuf->used;
+			avail = c->leb_size - wbuf->offs - wbuf->used -
+					ubifs_auth_node_sz(c);
 			if  (snod->len > avail)
 				/*
 				 * Do not skip data nodes in order to optimize
@@ -377,14 +367,21 @@
 				 */
 				break;
 
+			err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
+						 snod->node, snod->len);
+			if (err)
+				goto out;
+
 			err = move_node(c, sleb, snod, wbuf);
 			if (err)
 				goto out;
+			moved = 1;
 		}
 
 		/* Move non-data nodes */
 		list_for_each_entry_safe(snod, tmp, &nondata, list) {
-			avail = c->leb_size - wbuf->offs - wbuf->used;
+			avail = c->leb_size - wbuf->offs - wbuf->used -
+					ubifs_auth_node_sz(c);
 			if (avail < min)
 				break;
 
@@ -402,9 +399,41 @@
 				continue;
 			}
 
+			err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
+						 snod->node, snod->len);
+			if (err)
+				goto out;
+
 			err = move_node(c, sleb, snod, wbuf);
 			if (err)
 				goto out;
+			moved = 1;
+		}
+
+		if (ubifs_authenticated(c) && moved) {
+			struct ubifs_auth_node *auth;
+
+			auth = kmalloc(ubifs_auth_node_sz(c), GFP_NOFS);
+			if (!auth) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			err = ubifs_prepare_auth_node(c, auth,
+						c->jheads[GCHD].log_hash);
+			if (err) {
+				kfree(auth);
+				goto out;
+			}
+
+			err = ubifs_wbuf_write_nolock(wbuf, auth,
+						      ubifs_auth_node_sz(c));
+			if (err) {
+				kfree(auth);
+				goto out;
+			}
+
+			ubifs_add_dirt(c, wbuf->lnum, ubifs_auth_node_sz(c));
 		}
 
 		if (list_empty(&sleb->nodes) && list_empty(&nondata))
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 099bec9..8ceb514 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  * Copyright (C) 2006, 2007 University of Szeged, Hungary
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  *          Zoltan Sogor
@@ -365,6 +353,68 @@
 	return sqnum;
 }
 
+void ubifs_init_node(struct ubifs_info *c, void *node, int len, int pad)
+{
+	struct ubifs_ch *ch = node;
+	unsigned long long sqnum = next_sqnum(c);
+
+	ubifs_assert(c, len >= UBIFS_CH_SZ);
+
+	ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	ch->len = cpu_to_le32(len);
+	ch->group_type = UBIFS_NO_NODE_GROUP;
+	ch->sqnum = cpu_to_le64(sqnum);
+	ch->padding[0] = ch->padding[1] = 0;
+
+	if (pad) {
+		len = ALIGN(len, 8);
+		pad = ALIGN(len, c->min_io_size) - len;
+		ubifs_pad(c, node + len, pad);
+	}
+}
+
+void ubifs_crc_node(struct ubifs_info *c, void *node, int len)
+{
+	struct ubifs_ch *ch = node;
+	uint32_t crc;
+
+	crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+	ch->crc = cpu_to_le32(crc);
+}
+
+/**
+ * ubifs_prepare_node_hmac - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @hmac_offs: offset of the HMAC in the node
+ * @pad: if the buffer has to be padded
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC, fills the common header, and adds proper padding up to
+ * the next minimum I/O unit if @pad is not zero. If @hmac_offs is positive,
+ * an HMAC is inserted into the node at the given offset.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len,
+			    int hmac_offs, int pad)
+{
+	int err;
+
+	ubifs_init_node(c, node, len, pad);
+
+	if (hmac_offs > 0) {
+		err = ubifs_node_insert_hmac(c, node, len, hmac_offs);
+		if (err)
+			return err;
+	}
+
+	ubifs_crc_node(c, node, len);
+
+	return 0;
+}
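+
+/*
+ * The helpers above exist so that an HMAC can be inserted between header
+ * initialization and CRC calculation. A caller doing this by hand would
+ * follow the same sequence as ubifs_prepare_node_hmac(), roughly:
+ *
+ *	ubifs_init_node(c, node, len, pad);
+ *	ubifs_node_insert_hmac(c, node, len, hmac_offs);
+ *	ubifs_crc_node(c, node, len);
+ *
+ * The CRC must come last, since it covers the HMAC bytes as well.
+ */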
+
 /**
  * ubifs_prepare_node - prepare node to be written to flash.
  * @c: UBIFS file-system description object
@@ -378,25 +428,11 @@
  */
 void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
 {
-	uint32_t crc;
-	struct ubifs_ch *ch = node;
-	unsigned long long sqnum = next_sqnum(c);
-
-	ubifs_assert(c, len >= UBIFS_CH_SZ);
-
-	ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
-	ch->len = cpu_to_le32(len);
-	ch->group_type = UBIFS_NO_NODE_GROUP;
-	ch->sqnum = cpu_to_le64(sqnum);
-	ch->padding[0] = ch->padding[1] = 0;
-	crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
-	ch->crc = cpu_to_le32(crc);
-
-	if (pad) {
-		len = ALIGN(len, 8);
-		pad = ALIGN(len, c->min_io_size) - len;
-		ubifs_pad(c, node + len, pad);
-	}
+	/*
+	 * Deliberately ignore the return value since this function can only
+	 * fail when an HMAC offset is given.
+	 */
+	ubifs_prepare_node_hmac(c, node, len, 0, pad);
 }
 
 /**
@@ -849,6 +885,48 @@
 }
 
 /**
+ * ubifs_write_node_hmac - write node to the media.
+ * @c: UBIFS file-system description object
+ * @buf: the node to write
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @hmac_offs: offset of the HMAC within the node
+ *
+ * This function automatically fills in the node magic number, assigns a
+ * sequence number, and calculates the node CRC checksum. The length of the
+ * @buf buffer has to be aligned to the minimal I/O unit size. This function
+ * automatically appends a padding node and padding bytes if needed. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum,
+			  int offs, int hmac_offs)
+{
+	int err, buf_len = ALIGN(len, c->min_io_size);
+
+	dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
+	       lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
+	       buf_len);
+	ubifs_assert(c, lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(c, offs % c->min_io_size == 0 && offs < c->leb_size);
+	ubifs_assert(c, !c->ro_media && !c->ro_mount);
+	ubifs_assert(c, !c->space_fixup);
+
+	if (c->ro_error)
+		return -EROFS;
+
+	err = ubifs_prepare_node_hmac(c, buf, len, hmac_offs, 1);
+	if (err)
+		return err;
+
+	err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
+	if (err)
+		ubifs_dump_node(c, buf);
+
+	return err;
+}
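+
+/*
+ * Example (from the master node write path in master.c below): the caller
+ * passes the offset of the node's embedded HMAC field, e.g.
+ *
+ *	err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs,
+ *				    offsetof(struct ubifs_mst_node, hmac));
+ *
+ * while ubifs_write_node() passes -1 to skip HMAC insertion.
+ */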
+
+/**
  * ubifs_write_node - write node to the media.
  * @c: UBIFS file-system description object
  * @buf: the node to write
@@ -865,25 +943,7 @@
 int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
 		     int offs)
 {
-	int err, buf_len = ALIGN(len, c->min_io_size);
-
-	dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
-	       lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
-	       buf_len);
-	ubifs_assert(c, lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
-	ubifs_assert(c, offs % c->min_io_size == 0 && offs < c->leb_size);
-	ubifs_assert(c, !c->ro_media && !c->ro_mount);
-	ubifs_assert(c, !c->space_fixup);
-
-	if (c->ro_error)
-		return -EROFS;
-
-	ubifs_prepare_node(c, buf, len, 1);
-	err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
-	if (err)
-		ubifs_dump_node(c, buf);
-
-	return err;
+	return ubifs_write_node_hmac(c, buf, len, lnum, offs, -1);
 }
 
 /**
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 0164bcc..5dc5abc 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  * Copyright (C) 2006, 2007 University of Szeged, Hungary
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Zoltan Sogor
  *          Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
@@ -28,6 +16,11 @@
 #include <linux/mount.h>
 #include "ubifs.h"
 
+/* Must be kept consistent with the checked flags in ioctl2ubifs() */
+#define UBIFS_SUPPORTED_IOCTL_FLAGS \
+	(FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \
+	 FS_IMMUTABLE_FL | FS_DIRSYNC_FL)
+
 /**
  * ubifs_set_inode_flags - set VFS inode flags.
  * @inode: VFS inode to set flags for
@@ -114,18 +107,11 @@
 	if (err)
 		return err;
 
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 */
 	mutex_lock(&ui->ui_mutex);
 	oldflags = ubifs2ioctl(ui->flags);
-	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			err = -EPERM;
-			goto out_unlock;
-		}
-	}
+	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+	if (err)
+		goto out_unlock;
 
 	ui->flags = ioctl2ubifs(flags);
 	ubifs_set_inode_flags(inode);
@@ -169,6 +155,9 @@
 		if (get_user(flags, (int __user *) arg))
 			return -EFAULT;
 
+		if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS)
+			return -EOPNOTSUPP;
+
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~FS_DIRSYNC_FL;
 
@@ -185,7 +174,6 @@
 		return err;
 	}
 	case FS_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
 		struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 		err = ubifs_enable_encryption(c);
@@ -193,17 +181,24 @@
 			return err;
 
 		return fscrypt_ioctl_set_policy(file, (const void __user *)arg);
-#else
-		return -EOPNOTSUPP;
-#endif
 	}
-	case FS_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+	case FS_IOC_GET_ENCRYPTION_POLICY:
 		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
-#else
-		return -EOPNOTSUPP;
-#endif
-	}
+
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+		return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
+
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+		return fscrypt_ioctl_add_key(file, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+		return fscrypt_ioctl_remove_key(file, (void __user *)arg);
+
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+		return fscrypt_ioctl_remove_key_all_users(file,
+							  (void __user *)arg);
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+		return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
 
 	default:
 		return -ENOTTY;
@@ -222,6 +217,11 @@
 		break;
 	case FS_IOC_SET_ENCRYPTION_POLICY:
 	case FS_IOC_GET_ENCRYPTION_POLICY:
+	case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+	case FS_IOC_ADD_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY:
+	case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+	case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 802565a..4fd9683 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -90,6 +78,12 @@
 	memset(trun->padding, 0, 12);
 }
 
+static void ubifs_add_auth_dirt(struct ubifs_info *c, int lnum)
+{
+	if (ubifs_authenticated(c))
+		ubifs_add_dirt(c, lnum, ubifs_auth_node_sz(c));
+}
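+
+/*
+ * Authentication nodes are written after the journal data but are never
+ * referenced from the TNC, so each one is accounted as dirty space right
+ * away and can be reclaimed by garbage collection later.
+ */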
+
 /**
  * reserve_space - reserve space in the journal.
  * @c: UBIFS file-system description object
@@ -228,34 +222,33 @@
 	return err;
 }
 
-/**
- * write_node - write node to a journal head.
- * @c: UBIFS file-system description object
- * @jhead: journal head
- * @node: node to write
- * @len: node length
- * @lnum: LEB number written is returned here
- * @offs: offset written is returned here
- *
- * This function writes a node to reserved space of journal head @jhead.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- */
-static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
-		      int *lnum, int *offs)
+static int ubifs_hash_nodes(struct ubifs_info *c, void *node,
+			     int len, struct shash_desc *hash)
 {
-	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+	int auth_node_size = ubifs_auth_node_sz(c);
+	int err;
 
-	ubifs_assert(c, jhead != GCHD);
+	while (1) {
+		const struct ubifs_ch *ch = node;
+		int nodelen = le32_to_cpu(ch->len);
 
-	*lnum = c->jheads[jhead].wbuf.lnum;
-	*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+		ubifs_assert(c, len >= auth_node_size);
 
-	dbg_jnl("jhead %s, LEB %d:%d, len %d",
-		dbg_jhead(jhead), *lnum, *offs, len);
-	ubifs_prepare_node(c, node, len, 0);
+		if (len == auth_node_size)
+			break;
 
-	return ubifs_wbuf_write_nolock(wbuf, node, len);
+		ubifs_assert(c, len > nodelen);
+		ubifs_assert(c, ch->magic == cpu_to_le32(UBIFS_NODE_MAGIC));
+
+		err = ubifs_shash_update(c, hash, (void *)node, nodelen);
+		if (err)
+			return err;
+
+		node += ALIGN(nodelen, 8);
+		len -= ALIGN(nodelen, 8);
+	}
+
+	return ubifs_prepare_auth_node(c, node, hash);
 }
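+
+/*
+ * The buffer passed to ubifs_hash_nodes() is laid out as a run of 8-byte
+ * aligned nodes followed by space reserved for one authentication node,
+ * roughly:
+ *
+ *	| node 0 |pad| node 1 |pad| ... |pad| auth node |
+ *
+ * Each node is hashed into @hash, and the trailing slot is then filled
+ * with an authentication node sealing the current hash state.
+ */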
 
 /**
@@ -268,9 +261,9 @@
  * @offs: offset written is returned here
  * @sync: non-zero if the write-buffer has to be synchronized
  *
- * This function is the same as 'write_node()' but it does not assume the
- * buffer it is writing is a node, so it does not prepare it (which means
- * initializing common header and calculating CRC).
+ * This function writes data to the reserved space of journal head @jhead.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
  */
 static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 		      int *lnum, int *offs, int sync)
@@ -285,6 +278,12 @@
 	dbg_jnl("jhead %s, LEB %d:%d, len %d",
 		dbg_jhead(jhead), *lnum, *offs, len);
 
+	if (ubifs_authenticated(c)) {
+		err = ubifs_hash_nodes(c, buf, len, c->jheads[jhead].log_hash);
+		if (err)
+			return err;
+	}
+
 	err = ubifs_wbuf_write_nolock(wbuf, buf, len);
 	if (err)
 		return err;
@@ -548,6 +547,9 @@
 	struct ubifs_dent_node *dent;
 	struct ubifs_ino_node *ino;
 	union ubifs_key dent_key, ino_key;
+	u8 hash_dent[UBIFS_HASH_ARR_SZ];
+	u8 hash_ino[UBIFS_HASH_ARR_SZ];
+	u8 hash_ino_host[UBIFS_HASH_ARR_SZ];
 
 	ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex));
 
@@ -570,7 +572,10 @@
 
 	len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
 	/* Make sure to also account for extended attributes */
-	len += host_ui->data_len;
+	if (ubifs_authenticated(c))
+		len += ALIGN(host_ui->data_len, 8) + ubifs_auth_node_sz(c);
+	else
+		len += host_ui->data_len;
 
 	dent = kzalloc(len, GFP_NOFS);
 	if (!dent)
@@ -602,11 +607,21 @@
 
 	zero_dent_node_unused(dent);
 	ubifs_prep_grp_node(c, dent, dlen, 0);
+	err = ubifs_node_calc_hash(c, dent, hash_dent);
+	if (err)
+		goto out_release;
 
 	ino = (void *)dent + aligned_dlen;
 	pack_inode(c, ino, inode, 0);
+	err = ubifs_node_calc_hash(c, ino, hash_ino);
+	if (err)
+		goto out_release;
+
 	ino = (void *)ino + aligned_ilen;
 	pack_inode(c, ino, dir, 1);
+	err = ubifs_node_calc_hash(c, ino, hash_ino_host);
+	if (err)
+		goto out_release;
 
 	if (last_reference) {
 		err = ubifs_add_orphan(c, inode->i_ino);
@@ -628,6 +643,7 @@
 	}
 	release_head(c, BASEHD);
 	kfree(dent);
+	ubifs_add_auth_dirt(c, lnum);
 
 	if (deletion) {
 		if (nm->hash)
@@ -638,7 +654,8 @@
 			goto out_ro;
 		err = ubifs_add_dirt(c, lnum, dlen);
 	} else
-		err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
+		err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen,
+				       hash_dent, nm);
 	if (err)
 		goto out_ro;
 
@@ -650,14 +667,14 @@
 	 */
 	ino_key_init(c, &ino_key, inode->i_ino);
 	ino_offs = dent_offs + aligned_dlen;
-	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
+	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen, hash_ino);
 	if (err)
 		goto out_ro;
 
 	ino_key_init(c, &ino_key, dir->i_ino);
 	ino_offs += aligned_ilen;
 	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
-			    UBIFS_INO_NODE_SZ + host_ui->data_len);
+			    UBIFS_INO_NODE_SZ + host_ui->data_len, hash_ino_host);
 	if (err)
 		goto out_ro;
 
@@ -706,10 +723,12 @@
 			 const union ubifs_key *key, const void *buf, int len)
 {
 	struct ubifs_data_node *data;
-	int err, lnum, offs, compr_type, out_len, compr_len;
+	int err, lnum, offs, compr_type, out_len, compr_len, auth_len;
 	int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
+	int write_len;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	bool encrypted = ubifs_crypt_is_encrypted(inode);
+	u8 hash[UBIFS_HASH_ARR_SZ];
 
 	dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
 		(unsigned long)key_inum(c, key), key_block(c, key), len);
@@ -718,7 +737,9 @@
 	if (encrypted)
 		dlen += UBIFS_CIPHER_BLOCK_SIZE;
 
-	data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
+	auth_len = ubifs_auth_node_sz(c);
+
+	data = kmalloc(dlen + auth_len, GFP_NOFS | __GFP_NOWARN);
 	if (!data) {
 		/*
 		 * Fall-back to the write reserve buffer. Note, we might be
@@ -757,20 +778,33 @@
 	}
 
 	dlen = UBIFS_DATA_NODE_SZ + out_len;
+	if (ubifs_authenticated(c))
+		write_len = ALIGN(dlen, 8) + auth_len;
+	else
+		write_len = dlen;
+
 	data->compr_type = cpu_to_le16(compr_type);
 
 	/* Make reservation before allocating sequence numbers */
-	err = make_reservation(c, DATAHD, dlen);
+	err = make_reservation(c, DATAHD, write_len);
 	if (err)
 		goto out_free;
 
-	err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
+	ubifs_prepare_node(c, data, dlen, 0);
+	err = write_head(c, DATAHD, data, write_len, &lnum, &offs, 0);
 	if (err)
 		goto out_release;
+
+	err = ubifs_node_calc_hash(c, data, hash);
+	if (err)
+		goto out_release;
+
 	ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
 	release_head(c, DATAHD);
 
-	err = ubifs_tnc_add(c, key, lnum, offs, dlen);
+	ubifs_add_auth_dirt(c, lnum);
+
+	err = ubifs_tnc_add(c, key, lnum, offs, dlen, hash);
 	if (err)
 		goto out_ro;
 
@@ -806,9 +840,12 @@
 int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
 {
 	int err, lnum, offs;
-	struct ubifs_ino_node *ino;
+	struct ubifs_ino_node *ino, *ino_start;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
+	int sync = 0, write_len = 0, ilen = UBIFS_INO_NODE_SZ;
+	int last_reference = !inode->i_nlink;
+	int kill_xattrs = ui->xattr_cnt && last_reference;
+	u8 hash[UBIFS_HASH_ARR_SZ];
 
 	dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
 
@@ -817,20 +854,79 @@
 	 * need to synchronize the write-buffer either.
 	 */
 	if (!last_reference) {
-		len += ui->data_len;
+		ilen += ui->data_len;
 		sync = IS_SYNC(inode);
+	} else if (kill_xattrs) {
+		write_len += UBIFS_INO_NODE_SZ * ui->xattr_cnt;
 	}
-	ino = kmalloc(len, GFP_NOFS);
+
+	if (ubifs_authenticated(c))
+		write_len += ALIGN(ilen, 8) + ubifs_auth_node_sz(c);
+	else
+		write_len += ilen;
+
+	ino_start = ino = kmalloc(write_len, GFP_NOFS);
 	if (!ino)
 		return -ENOMEM;
 
 	/* Make reservation before allocating sequence numbers */
-	err = make_reservation(c, BASEHD, len);
+	err = make_reservation(c, BASEHD, write_len);
 	if (err)
 		goto out_free;
 
+	if (kill_xattrs) {
+		union ubifs_key key;
+		struct fscrypt_name nm = {0};
+		struct inode *xino;
+		struct ubifs_dent_node *xent, *pxent = NULL;
+
+		if (ui->xattr_cnt >= ubifs_xattr_max_cnt(c)) {
+			ubifs_err(c, "Cannot delete inode, it has too many xattrs!");
+			err = -EPERM;
+			goto out_release;
+		}
+
+		lowest_xent_key(c, &key, inode->i_ino);
+		while (1) {
+			xent = ubifs_tnc_next_ent(c, &key, &nm);
+			if (IS_ERR(xent)) {
+				err = PTR_ERR(xent);
+				if (err == -ENOENT)
+					break;
+
+				goto out_release;
+			}
+
+			fname_name(&nm) = xent->name;
+			fname_len(&nm) = le16_to_cpu(xent->nlen);
+
+			xino = ubifs_iget(c->vfs_sb, xent->inum);
+			if (IS_ERR(xino)) {
+				err = PTR_ERR(xino);
+				ubifs_err(c, "dead directory entry '%s', error %d",
+					  xent->name, err);
+				ubifs_ro_mode(c, err);
+				goto out_release;
+			}
+			ubifs_assert(c, ubifs_inode(xino)->xattr);
+
+			clear_nlink(xino);
+			pack_inode(c, ino, xino, 0);
+			ino = (void *)ino + UBIFS_INO_NODE_SZ;
+			iput(xino);
+
+			kfree(pxent);
+			pxent = xent;
+			key_read(c, &xent->key, &key);
+		}
+		kfree(pxent);
+	}
+
 	pack_inode(c, ino, inode, 1);
-	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+	err = ubifs_node_calc_hash(c, ino, hash);
+	if (err)
+		goto out_release;
+
+	err = write_head(c, BASEHD, ino_start, write_len, &lnum, &offs, sync);
 	if (err)
 		goto out_release;
 	if (!sync)
@@ -838,17 +934,19 @@
 					  inode->i_ino);
 	release_head(c, BASEHD);
 
+	ubifs_add_auth_dirt(c, lnum);
+
 	if (last_reference) {
 		err = ubifs_tnc_remove_ino(c, inode->i_ino);
 		if (err)
 			goto out_ro;
 		ubifs_delete_orphan(c, inode->i_ino);
-		err = ubifs_add_dirt(c, lnum, len);
+		err = ubifs_add_dirt(c, lnum, write_len);
 	} else {
 		union ubifs_key key;
 
 		ino_key_init(c, &key, inode->i_ino);
-		err = ubifs_tnc_add(c, &key, lnum, offs, len);
+		err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash);
 	}
 	if (err)
 		goto out_ro;
@@ -857,7 +955,7 @@
 	spin_lock(&ui->ui_lock);
 	ui->synced_i_size = ui->ui_size;
 	spin_unlock(&ui->ui_lock);
-	kfree(ino);
+	kfree(ino_start);
 	return 0;
 
 out_release:
@@ -866,7 +964,7 @@
 	ubifs_ro_mode(c, err);
 	finish_reservation(c);
 out_free:
-	kfree(ino);
+	kfree(ino_start);
 	return err;
 }
 
@@ -906,8 +1004,8 @@
 
 	ubifs_assert(c, inode->i_nlink == 0);
 
-	if (ui->del_cmtno != c->cmt_no)
-		/* A commit happened for sure */
+	if (ui->xattr_cnt || ui->del_cmtno != c->cmt_no)
+		/* A commit happened for sure or inode hosts xattrs */
 		return ubifs_jnl_write_inode(c, inode);
 
 	down_read(&c->commit_sem);
@@ -958,6 +1056,10 @@
 	int aligned_dlen1, aligned_dlen2;
 	int twoparents = (fst_dir != snd_dir);
 	void *p;
+	u8 hash_dent1[UBIFS_HASH_ARR_SZ];
+	u8 hash_dent2[UBIFS_HASH_ARR_SZ];
+	u8 hash_p1[UBIFS_HASH_ARR_SZ];
+	u8 hash_p2[UBIFS_HASH_ARR_SZ];
 
 	ubifs_assert(c, ubifs_inode(fst_dir)->data_len == 0);
 	ubifs_assert(c, ubifs_inode(snd_dir)->data_len == 0);
@@ -973,6 +1075,8 @@
 	if (twoparents)
 		len += plen;
 
+	len += ubifs_auth_node_sz(c);
+
 	dent1 = kzalloc(len, GFP_NOFS);
 	if (!dent1)
 		return -ENOMEM;
@@ -993,6 +1097,9 @@
 	set_dent_cookie(c, dent1);
 	zero_dent_node_unused(dent1);
 	ubifs_prep_grp_node(c, dent1, dlen1, 0);
+	err = ubifs_node_calc_hash(c, dent1, hash_dent1);
+	if (err)
+		goto out_release;
 
 	/* Make new dent for 2nd entry */
 	dent2 = (void *)dent1 + aligned_dlen1;
@@ -1006,14 +1113,26 @@
 	set_dent_cookie(c, dent2);
 	zero_dent_node_unused(dent2);
 	ubifs_prep_grp_node(c, dent2, dlen2, 0);
+	err = ubifs_node_calc_hash(c, dent2, hash_dent2);
+	if (err)
+		goto out_release;
 
 	p = (void *)dent2 + aligned_dlen2;
-	if (!twoparents)
+	if (!twoparents) {
 		pack_inode(c, p, fst_dir, 1);
-	else {
+		err = ubifs_node_calc_hash(c, p, hash_p1);
+		if (err)
+			goto out_release;
+	} else {
 		pack_inode(c, p, fst_dir, 0);
+		err = ubifs_node_calc_hash(c, p, hash_p1);
+		if (err)
+			goto out_release;
 		p += ALIGN(plen, 8);
 		pack_inode(c, p, snd_dir, 1);
+		err = ubifs_node_calc_hash(c, p, hash_p2);
+		if (err)
+			goto out_release;
 	}
 
 	err = write_head(c, BASEHD, dent1, len, &lnum, &offs, sync);
@@ -1027,28 +1146,30 @@
 	}
 	release_head(c, BASEHD);
 
+	ubifs_add_auth_dirt(c, lnum);
+
 	dent_key_init(c, &key, snd_dir->i_ino, snd_nm);
-	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm);
+	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, snd_nm);
 	if (err)
 		goto out_ro;
 
 	offs += aligned_dlen1;
 	dent_key_init(c, &key, fst_dir->i_ino, fst_nm);
-	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm);
+	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, fst_nm);
 	if (err)
 		goto out_ro;
 
 	offs += aligned_dlen2;
 
 	ino_key_init(c, &key, fst_dir->i_ino);
-	err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+	err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p1);
 	if (err)
 		goto out_ro;
 
 	if (twoparents) {
 		offs += ALIGN(plen, 8);
 		ino_key_init(c, &key, snd_dir->i_ino);
-		err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+		err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p2);
 		if (err)
 			goto out_ro;
 	}
@@ -1101,6 +1222,11 @@
 	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
 	int move = (old_dir != new_dir);
 	struct ubifs_inode *uninitialized_var(new_ui);
+	u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
+	u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
+	u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
+	u8 hash_dent1[UBIFS_HASH_ARR_SZ];
+	u8 hash_dent2[UBIFS_HASH_ARR_SZ];
 
 	ubifs_assert(c, ubifs_inode(old_dir)->data_len == 0);
 	ubifs_assert(c, ubifs_inode(new_dir)->data_len == 0);
@@ -1123,6 +1249,9 @@
 	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
 	if (move)
 		len += plen;
+
+	len += ubifs_auth_node_sz(c);
+
 	dent = kzalloc(len, GFP_NOFS);
 	if (!dent)
 		return -ENOMEM;
@@ -1143,6 +1272,9 @@
 	set_dent_cookie(c, dent);
 	zero_dent_node_unused(dent);
 	ubifs_prep_grp_node(c, dent, dlen1, 0);
+	err = ubifs_node_calc_hash(c, dent, hash_dent1);
+	if (err)
+		goto out_release;
 
 	dent2 = (void *)dent + aligned_dlen1;
 	dent2->ch.node_type = UBIFS_DENT_NODE;
@@ -1162,19 +1294,36 @@
 	set_dent_cookie(c, dent2);
 	zero_dent_node_unused(dent2);
 	ubifs_prep_grp_node(c, dent2, dlen2, 0);
+	err = ubifs_node_calc_hash(c, dent2, hash_dent2);
+	if (err)
+		goto out_release;
 
 	p = (void *)dent2 + aligned_dlen2;
 	if (new_inode) {
 		pack_inode(c, p, new_inode, 0);
+		err = ubifs_node_calc_hash(c, p, hash_new_inode);
+		if (err)
+			goto out_release;
+
 		p += ALIGN(ilen, 8);
 	}
 
-	if (!move)
+	if (!move) {
 		pack_inode(c, p, old_dir, 1);
-	else {
+		err = ubifs_node_calc_hash(c, p, hash_old_dir);
+		if (err)
+			goto out_release;
+	} else {
 		pack_inode(c, p, old_dir, 0);
+		err = ubifs_node_calc_hash(c, p, hash_old_dir);
+		if (err)
+			goto out_release;
+
 		p += ALIGN(plen, 8);
 		pack_inode(c, p, new_dir, 1);
+		err = ubifs_node_calc_hash(c, p, hash_new_dir);
+		if (err)
+			goto out_release;
 	}
 
 	if (last_reference) {
@@ -1200,15 +1349,17 @@
 	}
 	release_head(c, BASEHD);
 
+	ubifs_add_auth_dirt(c, lnum);
+
 	dent_key_init(c, &key, new_dir->i_ino, new_nm);
-	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm);
+	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, new_nm);
 	if (err)
 		goto out_ro;
 
 	offs += aligned_dlen1;
 	if (whiteout) {
 		dent_key_init(c, &key, old_dir->i_ino, old_nm);
-		err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm);
+		err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
 		if (err)
 			goto out_ro;
 
@@ -1227,21 +1378,21 @@
 	offs += aligned_dlen2;
 	if (new_inode) {
 		ino_key_init(c, &key, new_inode->i_ino);
-		err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
+		err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash_new_inode);
 		if (err)
 			goto out_ro;
 		offs += ALIGN(ilen, 8);
 	}
 
 	ino_key_init(c, &key, old_dir->i_ino);
-	err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+	err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
 	if (err)
 		goto out_ro;
 
 	if (move) {
 		offs += ALIGN(plen, 8);
 		ino_key_init(c, &key, new_dir->i_ino);
-		err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+		err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_new_dir);
 		if (err)
 			goto out_ro;
 	}
@@ -1360,6 +1511,8 @@
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	ino_t inum = inode->i_ino;
 	unsigned int blk;
+	u8 hash_ino[UBIFS_HASH_ARR_SZ];
+	u8 hash_dn[UBIFS_HASH_ARR_SZ];
 
 	dbg_jnl("ino %lu, size %lld -> %lld",
 		(unsigned long)inum, old_size, new_size);
@@ -1369,6 +1522,9 @@
 
 	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
 	     UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+
+	sz += ubifs_auth_node_sz(c);
+
 	ino = kmalloc(sz, GFP_NOFS);
 	if (!ino)
 		return -ENOMEM;
@@ -1414,16 +1570,28 @@
 
 	/* Must make reservation before allocating sequence numbers */
 	len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
-	if (dlen)
+
+	if (ubifs_authenticated(c))
+		len += ALIGN(dlen, 8) + ubifs_auth_node_sz(c);
+	else
 		len += dlen;
+
 	err = make_reservation(c, BASEHD, len);
 	if (err)
 		goto out_free;
 
 	pack_inode(c, ino, inode, 0);
+	err = ubifs_node_calc_hash(c, ino, hash_ino);
+	if (err)
+		goto out_release;
+
 	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
-	if (dlen)
+	if (dlen) {
 		ubifs_prep_grp_node(c, dn, dlen, 1);
+		err = ubifs_node_calc_hash(c, dn, hash_dn);
+		if (err)
+			goto out_release;
+	}
 
 	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
 	if (err)
@@ -1432,15 +1600,17 @@
 		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
 	release_head(c, BASEHD);
 
+	ubifs_add_auth_dirt(c, lnum);
+
 	if (dlen) {
 		sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
-		err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
+		err = ubifs_tnc_add(c, &key, lnum, sz, dlen, hash_dn);
 		if (err)
 			goto out_ro;
 	}
 
 	ino_key_init(c, &key, inum);
-	err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
+	err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ, hash_ino);
 	if (err)
 		goto out_ro;
 
@@ -1495,12 +1665,13 @@
 			   const struct inode *inode,
 			   const struct fscrypt_name *nm)
 {
-	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
+	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen, write_len;
 	struct ubifs_dent_node *xent;
 	struct ubifs_ino_node *ino;
 	union ubifs_key xent_key, key1, key2;
 	int sync = IS_DIRSYNC(host);
 	struct ubifs_inode *host_ui = ubifs_inode(host);
+	u8 hash[UBIFS_HASH_ARR_SZ];
 
 	ubifs_assert(c, inode->i_nlink == 0);
 	ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex));
@@ -1514,12 +1685,14 @@
 	hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
 	len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
 
-	xent = kzalloc(len, GFP_NOFS);
+	write_len = len + ubifs_auth_node_sz(c);
+
+	xent = kzalloc(write_len, GFP_NOFS);
 	if (!xent)
 		return -ENOMEM;
 
 	/* Make reservation before allocating sequence numbers */
-	err = make_reservation(c, BASEHD, len);
+	err = make_reservation(c, BASEHD, write_len);
 	if (err) {
 		kfree(xent);
 		return err;
@@ -1540,11 +1713,16 @@
 	pack_inode(c, ino, inode, 0);
 	ino = (void *)ino + UBIFS_INO_NODE_SZ;
 	pack_inode(c, ino, host, 1);
+	err = ubifs_node_calc_hash(c, ino, hash);
+	if (err)
+		goto out_release;
 
-	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
+	err = write_head(c, BASEHD, xent, write_len, &lnum, &xent_offs, sync);
 	if (!sync && !err)
 		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
 	release_head(c, BASEHD);
+
+	ubifs_add_auth_dirt(c, lnum);
 	kfree(xent);
 	if (err)
 		goto out_ro;
@@ -1572,7 +1750,7 @@
 
 	/* And update TNC with the new host inode position */
 	ino_key_init(c, &key1, host->i_ino);
-	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
+	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen, hash);
 	if (err)
 		goto out_ro;
 
@@ -1583,6 +1761,9 @@
 	mark_inode_clean(c, host_ui);
 	return 0;
 
+out_release:
+	kfree(xent);
+	release_head(c, BASEHD);
 out_ro:
 	ubifs_ro_mode(c, err);
 	finish_reservation(c);
@@ -1610,6 +1791,8 @@
 	struct ubifs_ino_node *ino;
 	union ubifs_key key;
 	int sync = IS_DIRSYNC(host);
+	u8 hash_host[UBIFS_HASH_ARR_SZ];
+	u8 hash[UBIFS_HASH_ARR_SZ];
 
 	dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
 	ubifs_assert(c, host->i_nlink > 0);
@@ -1621,6 +1804,8 @@
 	aligned_len1 = ALIGN(len1, 8);
 	aligned_len = aligned_len1 + ALIGN(len2, 8);
 
+	aligned_len += ubifs_auth_node_sz(c);
+
 	ino = kzalloc(aligned_len, GFP_NOFS);
 	if (!ino)
 		return -ENOMEM;
@@ -1631,7 +1816,13 @@
 		goto out_free;
 
 	pack_inode(c, ino, host, 0);
+	err = ubifs_node_calc_hash(c, ino, hash_host);
+	if (err)
+		goto out_release;
 	pack_inode(c, (void *)ino + aligned_len1, inode, 1);
+	err = ubifs_node_calc_hash(c, (void *)ino + aligned_len1, hash);
+	if (err)
+		goto out_release;
 
 	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
 	if (!sync && !err) {
@@ -1644,13 +1835,15 @@
 	if (err)
 		goto out_ro;
 
+	ubifs_add_auth_dirt(c, lnum);
+
 	ino_key_init(c, &key, host->i_ino);
-	err = ubifs_tnc_add(c, &key, lnum, offs, len1);
+	err = ubifs_tnc_add(c, &key, lnum, offs, len1, hash_host);
 	if (err)
 		goto out_ro;
 
 	ino_key_init(c, &key, inode->i_ino);
-	err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
+	err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2, hash);
 	if (err)
 		goto out_ro;
 
@@ -1662,6 +1855,8 @@
 	kfree(ino);
 	return 0;
 
+out_release:
+	release_head(c, BASEHD);
 out_ro:
 	ubifs_ro_mode(c, err);
 	finish_reservation(c);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 2feff6c..afa704f 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -1,21 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 86b0828..b6ac9c4 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -236,6 +224,7 @@
 	bud->lnum = lnum;
 	bud->start = offs;
 	bud->jhead = jhead;
+	bud->log_hash = NULL;
 
 	ref->ch.node_type = UBIFS_REF_NODE;
 	ref->lnum = cpu_to_le32(bud->lnum);
@@ -275,6 +264,14 @@
 	if (err)
 		goto out_unlock;
 
+	err = ubifs_shash_update(c, c->log_hash, ref, UBIFS_REF_NODE_SZ);
+	if (err)
+		goto out_unlock;
+
+	err = ubifs_shash_copy_state(c, c->log_hash, c->jheads[jhead].log_hash);
+	if (err)
+		goto out_unlock;
+
 	c->lhead_offs += c->ref_node_alsz;
 
 	ubifs_add_bud(c, bud);
@@ -377,6 +374,14 @@
 	cs->cmt_no = cpu_to_le64(c->cmt_no);
 	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
 
+	err = ubifs_shash_init(c, c->log_hash);
+	if (err)
+		goto out;
+
+	err = ubifs_shash_update(c, c->log_hash, cs, UBIFS_CS_NODE_SZ);
+	if (err < 0)
+		goto out;
+
 	/*
 	 * Note, we do not lock 'c->log_mutex' because this is the commit start
 	 * phase and we are exclusively using the log. And we do not lock
@@ -402,6 +407,12 @@
 
 		ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
 		len += UBIFS_REF_NODE_SZ;
+
+		err = ubifs_shash_update(c, c->log_hash, ref,
+					 UBIFS_REF_NODE_SZ);
+		if (err)
+			goto out;
+		ubifs_shash_copy_state(c, c->log_hash, c->jheads[i].log_hash);
 	}
 
 	ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
@@ -427,10 +438,7 @@
 	*ltail_lnum = c->lhead_lnum;
 
 	c->lhead_offs += len;
-	if (c->lhead_offs == c->leb_size) {
-		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
-		c->lhead_offs = 0;
-	}
+	ubifs_assert(c, c->lhead_offs < c->leb_size);
 
 	remove_buds(c);
 
@@ -516,6 +524,7 @@
 		if (err)
 			return err;
 		list_del(&bud->list);
+		kfree(bud->log_hash);
 		kfree(bud);
 	}
 	mutex_lock(&c->log_mutex);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index fa8d775..29826c5 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 3139337..e21abf2 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -604,11 +592,12 @@
  * @lpt_first: LEB number of first LPT LEB
  * @lpt_lebs: number of LEBs for LPT is passed and returned here
+ * @big_lpt: whether to use the big LPT model is passed and returned here
+ * @hash: hash of the LPT is returned here
  *
  * This function returns %0 on success and a negative error code on failure.
  */
 int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
-			  int *lpt_lebs, int *big_lpt)
+			  int *lpt_lebs, int *big_lpt, u8 *hash)
 {
 	int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
 	int blnum, boffs, bsz, bcnt;
@@ -617,6 +606,7 @@
 	void *buf = NULL, *p;
 	struct ubifs_lpt_lprops *ltab = NULL;
 	int *lsave = NULL;
+	struct shash_desc *desc;
 
 	err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
 	if (err)
@@ -630,6 +620,10 @@
 	/* Needed by 'ubifs_pack_lsave()' */
 	c->main_first = c->leb_cnt - *main_lebs;
 
+	desc = ubifs_hash_get_desc(c);
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+
 	lsave = kmalloc_array(c->lsave_cnt, sizeof(int), GFP_KERNEL);
 	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
 	nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
@@ -677,6 +671,10 @@
 
 	/* Add first pnode */
 	ubifs_pack_pnode(c, p, pnode);
+	err = ubifs_shash_update(c, desc, p, c->pnode_sz);
+	if (err)
+		goto out;
+
 	p += c->pnode_sz;
 	len = c->pnode_sz;
 	pnode->num += 1;
@@ -711,6 +709,10 @@
 			len = 0;
 		}
 		ubifs_pack_pnode(c, p, pnode);
+		err = ubifs_shash_update(c, desc, p, c->pnode_sz);
+		if (err)
+			goto out;
+
 		p += c->pnode_sz;
 		len += c->pnode_sz;
 		/*
@@ -830,6 +832,10 @@
 	if (err)
 		goto out;
 
+	err = ubifs_shash_final(c, desc, hash);
+	if (err)
+		goto out;
+
 	c->nhead_lnum = lnum;
 	c->nhead_offs = ALIGN(len, c->min_io_size);
 
@@ -853,6 +859,7 @@
 		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
 out:
 	c->ltab = NULL;
+	kfree(desc);
 	kfree(lsave);
 	vfree(ltab);
 	vfree(buf);
@@ -1439,26 +1446,25 @@
 }
 
 /**
- * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * ubifs_pnode_lookup - lookup a pnode in the LPT.
  * @c: UBIFS file-system description object
- * @lnum: LEB number to lookup
+ * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT)
  *
- * This function returns a pointer to the LEB properties on success or a
- * negative error code on failure.
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
  */
-struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i)
 {
-	int err, i, h, iip, shft;
+	int err, h, iip, shft;
 	struct ubifs_nnode *nnode;
-	struct ubifs_pnode *pnode;
 
 	if (!c->nroot) {
 		err = ubifs_read_nnode(c, NULL, 0);
 		if (err)
 			return ERR_PTR(err);
 	}
+	i <<= UBIFS_LPT_FANOUT_SHIFT;
 	nnode = c->nroot;
-	i = lnum - c->main_first;
 	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
 	for (h = 1; h < c->lpt_hght; h++) {
 		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
@@ -1468,7 +1474,24 @@
 			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-	pnode = ubifs_get_pnode(c, nnode, iip);
+	return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+	int i, iip;
+	struct ubifs_pnode *pnode;
+
+	i = lnum - c->main_first;
+	pnode = ubifs_pnode_lookup(c, i >> UBIFS_LPT_FANOUT_SHIFT);
 	if (IS_ERR(pnode))
 		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
@@ -1620,6 +1643,131 @@
 }
 
 /**
+ * ubifs_lpt_calc_hash - Calculate hash of the LPT pnodes
+ * @c: UBIFS file-system description object
+ * @hash: the returned hash of the LPT pnodes
+ *
+ * This function iterates over the LPT pnodes and creates a hash over them.
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash)
+{
+	struct ubifs_nnode *nnode, *nn;
+	struct ubifs_cnode *cnode;
+	struct shash_desc *desc;
+	int iip = 0, i;
+	int bufsiz = max_t(int, c->nnode_sz, c->pnode_sz);
+	void *buf;
+	int err;
+
+	if (!ubifs_authenticated(c))
+		return 0;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return err;
+	}
+
+	desc = ubifs_hash_get_desc(c);
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+
+	buf = kmalloc(bufsiz, GFP_NOFS);
+	if (!buf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cnode = (struct ubifs_cnode *)c->nroot;
+
+	while (cnode) {
+		nnode = cnode->parent;
+		nn = (struct ubifs_nnode *)cnode;
+		if (cnode->level > 1) {
+			while (iip < UBIFS_LPT_FANOUT) {
+				if (nn->nbranch[iip].lnum == 0) {
+					/* Go right */
+					iip++;
+					continue;
+				}
+
+				nnode = ubifs_get_nnode(c, nn, iip);
+				if (IS_ERR(nnode)) {
+					err = PTR_ERR(nnode);
+					goto out;
+				}
+
+				/* Go down */
+				iip = 0;
+				cnode = (struct ubifs_cnode *)nnode;
+				break;
+			}
+			if (iip < UBIFS_LPT_FANOUT)
+				continue;
+		} else {
+			struct ubifs_pnode *pnode;
+
+			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+				if (nn->nbranch[i].lnum == 0)
+					continue;
+				pnode = ubifs_get_pnode(c, nn, i);
+				if (IS_ERR(pnode)) {
+					err = PTR_ERR(pnode);
+					goto out;
+				}
+
+				ubifs_pack_pnode(c, buf, pnode);
+				err = ubifs_shash_update(c, desc, buf,
+							 c->pnode_sz);
+				if (err)
+					goto out;
+			}
+		}
+		/* Go up and to the right */
+		iip = cnode->iip + 1;
+		cnode = (struct ubifs_cnode *)nnode;
+	}
+
+	err = ubifs_shash_final(c, desc, hash);
+out:
+	kfree(desc);
+	kfree(buf);
+
+	return err;
+}
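+
+/*
+ * The loop above is an iterative depth-first walk of the nnode tree:
+ * levels above 1 descend via ubifs_get_nnode(), while level 1 packs each
+ * pnode into its on-flash format with ubifs_pack_pnode() before hashing,
+ * so the result matches the hash computed over the pnodes as they are
+ * written (see ubifs_create_dflt_lpt() above, which also hashes packed
+ * pnodes).
+ */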
+
+/**
+ * lpt_check_hash - check the hash of the LPT.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates a hash over all pnodes in the LPT and compares
+ * it with the hash stored in the master node. Returns %0 on success and a
+ * negative error code on failure.
+ */
+static int lpt_check_hash(struct ubifs_info *c)
+{
+	int err;
+	u8 hash[UBIFS_HASH_ARR_SZ];
+
+	if (!ubifs_authenticated(c))
+		return 0;
+
+	err = ubifs_lpt_calc_hash(c, hash);
+	if (err)
+		return err;
+
+	if (ubifs_check_hash(c, c->mst_node->hash_lpt, hash)) {
+		err = -EPERM;
+		ubifs_err(c, "Failed to authenticate LPT");
+	} else {
+		err = 0;
+	}
+
+	return err;
+}
+
+/**
  * lpt_init_rd - initialize the LPT for reading.
  * @c: UBIFS file-system description object
  *
@@ -1660,6 +1808,10 @@
 	if (err)
 		return err;
 
+	err = lpt_check_hash(c);
+	if (err)
+		return err;
+
 	dbg_lp("space_bits %d", c->space_bits);
 	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
 	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 7ce3099..ff5e041 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -619,38 +607,6 @@
 }
 
 /**
- * pnode_lookup - lookup a pnode in the LPT.
- * @c: UBIFS file-system description object
- * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT))
- *
- * This function returns a pointer to the pnode on success or a negative
- * error code on failure.
- */
-static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
-{
-	int err, h, iip, shft;
-	struct ubifs_nnode *nnode;
-
-	if (!c->nroot) {
-		err = ubifs_read_nnode(c, NULL, 0);
-		if (err)
-			return ERR_PTR(err);
-	}
-	i <<= UBIFS_LPT_FANOUT_SHIFT;
-	nnode = c->nroot;
-	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
-	for (h = 1; h < c->lpt_hght; h++) {
-		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-		shft -= UBIFS_LPT_FANOUT_SHIFT;
-		nnode = ubifs_get_nnode(c, nnode, iip);
-		if (IS_ERR(nnode))
-			return ERR_CAST(nnode);
-	}
-	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-	return ubifs_get_pnode(c, nnode, iip);
-}
-
-/**
  * add_pnode_dirt - add dirty space to LPT LEB properties.
  * @c: UBIFS file-system description object
  * @pnode: pnode for which to add dirt
@@ -702,7 +658,7 @@
 {
 	struct ubifs_pnode *pnode;
 
-	pnode = pnode_lookup(c, 0);
+	pnode = ubifs_pnode_lookup(c, 0);
 	if (IS_ERR(pnode))
 		return PTR_ERR(pnode);
 
@@ -956,7 +912,7 @@
 	struct ubifs_pnode *pnode;
 	struct ubifs_nbranch *branch;
 
-	pnode = pnode_lookup(c, node_num);
+	pnode = ubifs_pnode_lookup(c, node_num);
 	if (IS_ERR(pnode))
 		return PTR_ERR(pnode);
 	branch = &pnode->parent->nbranch[pnode->iip];
@@ -1279,6 +1235,10 @@
 	if (err)
 		goto out;
 
+	err = ubifs_lpt_calc_hash(c, c->mst_node->hash_lpt);
+	if (err)
+		goto out;
+
 	/* Copy the LPT's own lprops for end commit to write */
 	memcpy(c->ltab_cmt, c->ltab,
 	       sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
@@ -1558,7 +1518,7 @@
 		struct ubifs_nbranch *branch;
 
 		cond_resched();
-		pnode = pnode_lookup(c, i);
+		pnode = ubifs_pnode_lookup(c, i);
 		if (IS_ERR(pnode))
 			return PTR_ERR(pnode);
 		branch = &pnode->parent->nbranch[pnode->iip];
@@ -1710,7 +1670,7 @@
 	for (i = 0; i < cnt; i++) {
 		struct ubifs_pnode *pnode;
 
-		pnode = pnode_lookup(c, i);
+		pnode = ubifs_pnode_lookup(c, i);
 		if (IS_ERR(pnode))
 			return PTR_ERR(pnode);
 		cond_resched();
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 9df4a41..52a85c0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -25,6 +13,75 @@
 #include "ubifs.h"
 
 /**
+ * ubifs_compare_master_node - compare two UBIFS master nodes
+ * @c: UBIFS file-system description object
+ * @m1: the first node
+ * @m2: the second node
+ *
+ * This function compares two UBIFS master nodes. Returns 0 if they are equal
+ * and nonzero if not.
+ */
+int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2)
+{
+	int ret;
+	int behind;
+	int hmac_offs = offsetof(struct ubifs_mst_node, hmac);
+
+	/*
+	 * Do not compare the common node header since the sequence number and
+	 * hence the CRC are different.
+	 */
+	ret = memcmp(m1 + UBIFS_CH_SZ, m2 + UBIFS_CH_SZ,
+		     hmac_offs - UBIFS_CH_SZ);
+	if (ret)
+		return ret;
+
+	/*
+	 * Do not compare the embedded HMAC either, as it must also differ
+	 * due to the different common node header.
+	 */
+	behind = hmac_offs + UBIFS_MAX_HMAC_LEN;
+
+	if (UBIFS_MST_NODE_SZ > behind)
+		return memcmp(m1 + behind, m2 + behind, UBIFS_MST_NODE_SZ - behind);
+
+	return 0;
+}
+
+/**
+ * mst_node_check_hash - Check hash of a master node
+ * @c: UBIFS file-system description object
+ * @mst: The master node
+ * @expected: The expected hash of the master node
+ *
+ * This checks the hash of a master node against a given expected hash.
+ * Note that we have two master nodes on a UBIFS image which have different
+ * sequence numbers and consequently different CRCs. To be able to match
+ * both master nodes we exclude the common node header containing the sequence
+ * number and CRC from the hash.
+ *
+ * Returns 0 if the hashes are equal, a negative error code otherwise.
+ */
+static int mst_node_check_hash(const struct ubifs_info *c,
+			       const struct ubifs_mst_node *mst,
+			       const u8 *expected)
+{
+	u8 calc[UBIFS_MAX_HASH_LEN];
+	const void *node = mst;
+
+	SHASH_DESC_ON_STACK(shash, c->hash_tfm);
+
+	shash->tfm = c->hash_tfm;
+
+	crypto_shash_digest(shash, node + sizeof(struct ubifs_ch),
+			    UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc);
+
+	if (ubifs_check_hash(c, expected, calc))
+		return -EPERM;
+
+	return 0;
+}
+
+/**
  * scan_for_master - search the valid master node.
  * @c: UBIFS file-system description object
  *
@@ -37,7 +94,7 @@
 {
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
-	int lnum, offs = 0, nodes_cnt;
+	int lnum, offs = 0, nodes_cnt, err;
 
 	lnum = UBIFS_MST_LNUM;
 
@@ -69,12 +126,31 @@
 		goto out_dump;
 	if (snod->offs != offs)
 		goto out;
-	if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
-		   (void *)snod->node + UBIFS_CH_SZ,
-		   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+	if (ubifs_compare_master_node(c, c->mst_node, snod->node))
 		goto out;
+
 	c->mst_offs = offs;
 	ubifs_scan_destroy(sleb);
+
+	if (!ubifs_authenticated(c))
+		return 0;
+
+	if (ubifs_hmac_zero(c, c->mst_node->hmac)) {
+		err = mst_node_check_hash(c, c->mst_node,
+					  c->sup_node->hash_mst);
+		if (err)
+			ubifs_err(c, "Failed to verify master node hash");
+	} else {
+		err = ubifs_node_verify_hmac(c, c->mst_node,
+					sizeof(struct ubifs_mst_node),
+					offsetof(struct ubifs_mst_node, hmac));
+		if (err)
+			ubifs_err(c, "Failed to verify master node HMAC");
+	}
+
+	if (err)
+		return -EPERM;
+
 	return 0;
 
 out:
@@ -305,6 +381,8 @@
 	c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
 	c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);
 
+	ubifs_copy_hash(c, c->mst_node->hash_root_idx, c->zroot.hash);
+
 	c->calc_idx_sz = c->bi.old_idx_sz;
 
 	if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
@@ -378,7 +456,9 @@
 	c->mst_offs = offs;
 	c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
 
-	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+	ubifs_copy_hash(c, c->zroot.hash, c->mst_node->hash_root_idx);
+	err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs,
+				    offsetof(struct ubifs_mst_node, hmac));
 	if (err)
 		return err;
 
@@ -389,7 +469,8 @@
 		if (err)
 			return err;
 	}
-	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+	err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs,
+				    offsetof(struct ubifs_mst_node, hmac));
 
 	return err;
 }
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 21d35d7..c97a4d5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -1,21 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -197,7 +185,8 @@
  */
 static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
 {
-	return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+	return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len)
+				   * child_cnt;
 }
 
 /**
@@ -212,7 +201,7 @@
 				      int bnum)
 {
 	return (struct ubifs_branch *)((void *)idx->branches +
-				       (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+			(UBIFS_BRANCH_SZ + c->key_len + c->hash_len) * bnum);
 }
 
 /**
@@ -287,6 +276,14 @@
 	return lnum;
 }
 
+static inline int ubifs_xattr_max_cnt(struct ubifs_info *c)
+{
+	int max_xattrs = (c->leb_size / 2) / UBIFS_INO_NODE_SZ;
+
+	ubifs_assert(c, max_xattrs < c->max_orphans);
+	return max_xattrs;
+}
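+
+/*
+ * A sketch of the likely reasoning behind this bound: deleting a host
+ * inode writes one deletion inode of UBIFS_INO_NODE_SZ per xattr into a
+ * single journal reservation (see ubifs_jnl_write_inode()), so the count
+ * is capped at what half a LEB can hold, and must also stay below the
+ * orphan limit, as asserted above.
+ */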
+
 const char *ubifs_assert_action_name(struct ubifs_info *c);
 
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 8f70494..3b4b411 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Author: Adrian Hunter
  */
 
@@ -54,30 +42,24 @@
 
 static int dbg_check_orphans(struct ubifs_info *c);
 
-/**
- * ubifs_add_orphan - add an orphan.
- * @c: UBIFS file-system description object
- * @inum: orphan inode number
- *
- * Add an orphan. This function is called when an inodes link count drops to
- * zero.
- */
-int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
+static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum,
+				       struct ubifs_orphan *parent_orphan)
 {
 	struct ubifs_orphan *orphan, *o;
 	struct rb_node **p, *parent = NULL;
 
 	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
 	if (!orphan)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	orphan->inum = inum;
 	orphan->new = 1;
+	INIT_LIST_HEAD(&orphan->child_list);
 
 	spin_lock(&c->orphan_lock);
 	if (c->tot_orphans >= c->max_orphans) {
 		spin_unlock(&c->orphan_lock);
 		kfree(orphan);
-		return -ENFILE;
+		return ERR_PTR(-ENFILE);
 	}
 	p = &c->orph_tree.rb_node;
 	while (*p) {
@@ -91,7 +73,7 @@
 			ubifs_err(c, "orphaned twice");
 			spin_unlock(&c->orphan_lock);
 			kfree(orphan);
-			return 0;
+			return ERR_PTR(-EINVAL);
 		}
 	}
 	c->tot_orphans += 1;
@@ -100,8 +82,111 @@
 	rb_insert_color(&orphan->rb, &c->orph_tree);
 	list_add_tail(&orphan->list, &c->orph_list);
 	list_add_tail(&orphan->new_list, &c->orph_new);
+
+	if (parent_orphan) {
+		list_add_tail(&orphan->child_list,
+			      &parent_orphan->child_list);
+	}
+
 	spin_unlock(&c->orphan_lock);
 	dbg_gen("ino %lu", (unsigned long)inum);
+	return orphan;
+}
+
+static struct ubifs_orphan *lookup_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *o;
+	struct rb_node *p;
+
+	p = c->orph_tree.rb_node;
+	while (p) {
+		o = rb_entry(p, struct ubifs_orphan, rb);
+		if (inum < o->inum)
+			p = p->rb_left;
+		else if (inum > o->inum)
+			p = p->rb_right;
+		else {
+			return o;
+		}
+	}
+	return NULL;
+}
+
+static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o)
+{
+	rb_erase(&o->rb, &c->orph_tree);
+	list_del(&o->list);
+	c->tot_orphans -= 1;
+
+	if (o->new) {
+		list_del(&o->new_list);
+		c->new_orphans -= 1;
+	}
+
+	kfree(o);
+}
+
+static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
+{
+	if (orph->del) {
+		dbg_gen("deleted twice ino %lu", orph->inum);
+		return;
+	}
+
+	if (orph->cmt) {
+		orph->del = 1;
+		orph->dnext = c->orph_dnext;
+		c->orph_dnext = orph;
+		dbg_gen("delete later ino %lu", orph->inum);
+		return;
+	}
+
+	__orphan_drop(c, orph);
+}
+
+/**
+ * ubifs_add_orphan - add an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Add an orphan. This function is called when an inode's link count drops to
+ * zero.
+ */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
+{
+	int err = 0;
+	ino_t xattr_inum;
+	union ubifs_key key;
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	struct fscrypt_name nm = {0};
+	struct ubifs_orphan *xattr_orphan;
+	struct ubifs_orphan *orphan;
+
+	orphan = orphan_add(c, inum, NULL);
+	if (IS_ERR(orphan))
+		return PTR_ERR(orphan);
+
+	lowest_xent_key(c, &key, inum);
+	while (1) {
+		xent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			if (err == -ENOENT)
+				break;
+			kfree(pxent);
+			return err;
+		}
+
+		fname_name(&nm) = xent->name;
+		fname_len(&nm) = le16_to_cpu(xent->nlen);
+		xattr_inum = le64_to_cpu(xent->inum);
+
+		xattr_orphan = orphan_add(c, xattr_inum, orphan);
+		if (IS_ERR(xattr_orphan)) {
+			kfree(pxent);
+			kfree(xent);
+			return PTR_ERR(xattr_orphan);
+		}
+
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key);
+	}
+	kfree(pxent);
+
 	return 0;
 }
 
@@ -114,49 +199,27 @@
  */
 void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 {
-	struct ubifs_orphan *o;
-	struct rb_node *p;
+	struct ubifs_orphan *orph, *child_orph, *tmp_o;
 
 	spin_lock(&c->orphan_lock);
-	p = c->orph_tree.rb_node;
-	while (p) {
-		o = rb_entry(p, struct ubifs_orphan, rb);
-		if (inum < o->inum)
-			p = p->rb_left;
-		else if (inum > o->inum)
-			p = p->rb_right;
-		else {
-			if (o->del) {
-				spin_unlock(&c->orphan_lock);
-				dbg_gen("deleted twice ino %lu",
-					(unsigned long)inum);
-				return;
-			}
-			if (o->cmt) {
-				o->del = 1;
-				o->dnext = c->orph_dnext;
-				c->orph_dnext = o;
-				spin_unlock(&c->orphan_lock);
-				dbg_gen("delete later ino %lu",
-					(unsigned long)inum);
-				return;
-			}
-			rb_erase(p, &c->orph_tree);
-			list_del(&o->list);
-			c->tot_orphans -= 1;
-			if (o->new) {
-				list_del(&o->new_list);
-				c->new_orphans -= 1;
-			}
-			spin_unlock(&c->orphan_lock);
-			kfree(o);
-			dbg_gen("inum %lu", (unsigned long)inum);
-			return;
-		}
+
+	orph = lookup_orphan(c, inum);
+	if (!orph) {
+		spin_unlock(&c->orphan_lock);
+		ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
+		dump_stack();
+
+		return;
 	}
+
+	list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) {
+		list_del(&child_orph->child_list);
+		orphan_delete(c, child_orph);
+	}
+
+	orphan_delete(c, orph);
+
 	spin_unlock(&c->orphan_lock);
-	ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
-	dump_stack();
 }
 
 /**
@@ -563,6 +626,7 @@
 {
 	struct ubifs_scan_node *snod;
 	struct ubifs_orph_node *orph;
+	struct ubifs_ino_node *ino = NULL;
 	unsigned long long cmt_no;
 	ino_t inum;
 	int i, n, err, first = 1;
@@ -609,17 +673,42 @@
 		if (first)
 			first = 0;
 
+		if (!ino) {
+			ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+			if (!ino)
+				return -ENOMEM;
+		}
+
 		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
 		for (i = 0; i < n; i++) {
+			union ubifs_key key1, key2;
+
 			inum = le64_to_cpu(orph->inos[i]);
-			dbg_rcvry("deleting orphaned inode %lu",
-				  (unsigned long)inum);
-			err = ubifs_tnc_remove_ino(c, inum);
+
+			ino_key_init(c, &key1, inum);
+			err = ubifs_tnc_lookup(c, &key1, ino);
 			if (err)
-				return err;
+				goto out_free;
+
+			/*
+			 * Check whether an inode can really get deleted.
+			 * linkat() with O_TMPFILE allows rebirth of an inode.
+			 */
+			if (ino->nlink == 0) {
+				dbg_rcvry("deleting orphaned inode %lu",
+					  (unsigned long)inum);
+
+				lowest_ino_key(c, &key1, inum);
+				highest_ino_key(c, &key2, inum);
+
+				err = ubifs_tnc_remove_range(c, &key1, &key2);
+				if (err)
+					goto out_ro;
+			}
+
 			err = insert_dead_orphan(c, inum);
 			if (err)
-				return err;
+				goto out_free;
 		}
 
 		*last_cmt_no = cmt_no;
@@ -631,7 +718,15 @@
 			*last_flagged = 0;
 	}
 
-	return 0;
+	err = 0;
+out_free:
+	kfree(ino);
+	return err;
+
+out_ro:
+	ubifs_ro_mode(c, err);
+	kfree(ino);
+	return err;
 }
 
 /**
@@ -744,26 +839,15 @@
 	struct rb_root root;
 };
 
-static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
+static bool dbg_find_orphan(struct ubifs_info *c, ino_t inum)
 {
-	struct ubifs_orphan *o;
-	struct rb_node *p;
+	bool found = false;
 
 	spin_lock(&c->orphan_lock);
-	p = c->orph_tree.rb_node;
-	while (p) {
-		o = rb_entry(p, struct ubifs_orphan, rb);
-		if (inum < o->inum)
-			p = p->rb_left;
-		else if (inum > o->inum)
-			p = p->rb_right;
-		else {
-			spin_unlock(&c->orphan_lock);
-			return 1;
-		}
-	}
+	found = !!lookup_orphan(c, inum);
 	spin_unlock(&c->orphan_lock);
-	return 0;
+
+	return found;
 }
 
 static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
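
For context on the nlink re-check added to do_kill_orphans() above: an orphan
node on flash may describe an inode that was later "reborn" via linkat(). A
hedged userspace sketch of how that happens (the mount point /mnt/ubifs and the
link name are hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char proc_path[64];
	/* O_TMPFILE creates an inode with nlink == 0; UBIFS records an
	 * orphan for it so replay can reclaim it after a power cut.
	 */
	int fd = open("/mnt/ubifs", O_TMPFILE | O_WRONLY, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}

	/* linkat() gives the inode a name again: nlink goes 0 -> 1, but
	 * the stale orphan node may still sit on flash. This is why
	 * do_kill_orphans() now looks up the inode and checks nlink
	 * before removing it from the index.
	 */
	snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc_path, AT_FDCWD, "/mnt/ubifs/reborn",
		   AT_SYMLINK_FOLLOW) < 0) {
		perror("linkat");
		return 1;
	}
	close(fd);
	return 0;
}
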
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 984e30e..f116f7b 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -212,7 +200,10 @@
 	save_flags = mst->flags;
 	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
 
-	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
+	err = ubifs_prepare_node_hmac(c, mst, UBIFS_MST_NODE_SZ,
+				      offsetof(struct ubifs_mst_node, hmac), 1);
+	if (err)
+		goto out;
 	err = ubifs_leb_change(c, lnum, mst, sz);
 	if (err)
 		goto out;
@@ -264,9 +255,7 @@
 			offs2 = (void *)mst2 - buf2;
 			if (offs1 == offs2) {
 				/* Same offset, so must be the same */
-				if (memcmp((void *)mst1 + UBIFS_CH_SZ,
-					   (void *)mst2 + UBIFS_CH_SZ,
-					   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+				if (ubifs_compare_master_node(c, mst1, mst2))
 					goto out_err;
 				mst = mst1;
 			} else if (offs2 + sz == offs1) {
@@ -829,7 +818,7 @@
 		goto out_err;
 	}
 	if (cs_node->ch.node_type != UBIFS_CS_NODE) {
-		ubifs_err(c, "Node a CS node, type is %d", cs_node->ch.node_type);
+		ubifs_err(c, "Not a CS node, type is %d", cs_node->ch.node_type);
 		goto out_err;
 	}
 	if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
@@ -1462,15 +1451,81 @@
 }
 
 /**
+ * inode_fix_size - fix inode size
+ * @c: UBIFS file-system description object
+ * @e: inode size information for recovery
+ */
+static int inode_fix_size(struct ubifs_info *c, struct size_entry *e)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	int err;
+
+	if (c->ro_mount)
+		ubifs_assert(c, !e->inode);
+
+	if (e->inode) {
+		/* Remounting rw, pick up inode we stored earlier */
+		inode = e->inode;
+	} else {
+		inode = ubifs_iget(c->vfs_sb, e->inum);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+
+		if (inode->i_size >= e->d_size) {
+			/*
+			 * The original inode in the index already has a size
+			 * big enough, nothing to do
+			 */
+			iput(inode);
+			return 0;
+		}
+
+		dbg_rcvry("ino %lu size %lld -> %lld",
+			  (unsigned long)e->inum,
+			  inode->i_size, e->d_size);
+
+		ui = ubifs_inode(inode);
+
+		inode->i_size = e->d_size;
+		ui->ui_size = e->d_size;
+		ui->synced_i_size = e->d_size;
+
+		e->inode = inode;
+	}
+
+	/*
+	 * In readonly mode just keep the inode pinned in memory until we go
+	 * readwrite. In readwrite mode write the inode to the journal with the
+	 * fixed size.
+	 */
+	if (c->ro_mount)
+		return 0;
+
+	err = ubifs_jnl_write_inode(c, inode);
+
+	iput(inode);
+
+	if (err)
+		return err;
+
+	rb_erase(&e->rb, &c->size_tree);
+	kfree(e);
+
+	return 0;
+}
+
+/**
  * ubifs_recover_size - recover inode size.
  * @c: UBIFS file-system description object
+ * @in_place: If true, do an in-place size fixup
  *
  * This function attempts to fix inode size discrepancies identified by the
  * 'ubifs_recover_size_accum()' function.
  *
 * This function returns %0 on success and a negative error code on failure.
  */
-int ubifs_recover_size(struct ubifs_info *c)
+int ubifs_recover_size(struct ubifs_info *c, bool in_place)
 {
 	struct rb_node *this = rb_first(&c->size_tree);
 
@@ -1479,6 +1534,9 @@
 		int err;
 
 		e = rb_entry(this, struct size_entry, rb);
+
+		this = rb_next(this);
+
 		if (!e->exists) {
 			union ubifs_key key;
 
@@ -1502,40 +1560,26 @@
 		}
 
 		if (e->exists && e->i_size < e->d_size) {
-			if (c->ro_mount) {
-				/* Fix the inode size and pin it in memory */
-				struct inode *inode;
-				struct ubifs_inode *ui;
+			ubifs_assert(c, !(c->ro_mount && in_place));
 
-				ubifs_assert(c, !e->inode);
+			/*
+			 * We found data that is outside the found inode size,
+			 * fixup the inode size
+			 */
 
-				inode = ubifs_iget(c->vfs_sb, e->inum);
-				if (IS_ERR(inode))
-					return PTR_ERR(inode);
-
-				ui = ubifs_inode(inode);
-				if (inode->i_size < e->d_size) {
-					dbg_rcvry("ino %lu size %lld -> %lld",
-						  (unsigned long)e->inum,
-						  inode->i_size, e->d_size);
-					inode->i_size = e->d_size;
-					ui->ui_size = e->d_size;
-					ui->synced_i_size = e->d_size;
-					e->inode = inode;
-					this = rb_next(this);
-					continue;
-				}
-				iput(inode);
-			} else {
-				/* Fix the size in place */
+			if (in_place) {
 				err = fix_size_in_place(c, e);
 				if (err)
 					return err;
 				iput(e->inode);
+			} else {
+				err = inode_fix_size(c, e);
+				if (err)
+					return err;
+				continue;
 			}
 		}
 
-		this = rb_next(this);
 		rb_erase(&e->rb, &c->size_tree);
 		kfree(e);
 	}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index c6f9b22..b28ac4d 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -34,6 +22,8 @@
 
 #include "ubifs.h"
 #include <linux/list_sort.h>
+#include <crypto/hash.h>
+#include <crypto/algapi.h>
 
 /**
  * struct replay_entry - replay list entry.
@@ -56,6 +46,7 @@
 	int lnum;
 	int offs;
 	int len;
+	u8 hash[UBIFS_HASH_ARR_SZ];
 	unsigned int deletion:1;
 	unsigned long long sqnum;
 	struct list_head list;
@@ -260,7 +251,7 @@
 			err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
 		else
 			err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
-					       r->len, &r->nm);
+					       r->len, r->hash, &r->nm);
 	} else {
 		if (r->deletion)
 			switch (key_type(c, &r->key)) {
@@ -285,7 +276,7 @@
 			}
 		else
 			err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
-					    r->len);
+					    r->len, r->hash);
 		if (err)
 			return err;
 
@@ -389,9 +380,9 @@
  * in case of success and a negative error code in case of failure.
  */
 static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
-		       union ubifs_key *key, unsigned long long sqnum,
-		       int deletion, int *used, loff_t old_size,
-		       loff_t new_size)
+		       const u8 *hash, union ubifs_key *key,
+		       unsigned long long sqnum, int deletion, int *used,
+		       loff_t old_size, loff_t new_size)
 {
 	struct replay_entry *r;
 
@@ -409,6 +400,7 @@
 	r->lnum = lnum;
 	r->offs = offs;
 	r->len = len;
+	ubifs_copy_hash(c, hash, r->hash);
 	r->deletion = !!deletion;
 	r->sqnum = sqnum;
 	key_copy(c, key, &r->key);
@@ -437,8 +429,9 @@
  * negative error code in case of failure.
  */
 static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
-		       union ubifs_key *key, const char *name, int nlen,
-		       unsigned long long sqnum, int deletion, int *used)
+		       const u8 *hash, union ubifs_key *key,
+		       const char *name, int nlen, unsigned long long sqnum,
+		       int deletion, int *used)
 {
 	struct replay_entry *r;
 	char *nbuf;
@@ -462,6 +455,7 @@
 	r->lnum = lnum;
 	r->offs = offs;
 	r->len = len;
+	ubifs_copy_hash(c, hash, r->hash);
 	r->deletion = !!deletion;
 	r->sqnum = sqnum;
 	key_copy(c, key, &r->key);
@@ -564,6 +558,116 @@
 	return data == 0xFFFFFFFF;
 }
 
+/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */
+static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash)
+{
+	SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
+
+	hash_desc->tfm = c->hash_tfm;
+
+	ubifs_shash_copy_state(c, log_hash, hash_desc);
+	return crypto_shash_final(hash_desc, hash);
+}
+
+static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac)
+{
+	SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm);
+
+	hmac_desc->tfm = c->hmac_tfm;
+
+	return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac);
+}
+
+/**
+ * authenticate_sleb - authenticate one scan LEB
+ * @c: UBIFS file-system description object
+ * @sleb: the scan LEB to authenticate
+ * @log_hash: the log hash state up to this LEB, updated as nodes are scanned
+ * @is_last: if true, this is the last LEB
+ *
+ * This function iterates over the buds of a single LEB authenticating all buds
+ * with the authentication nodes on this LEB. Authentication nodes are written
+ * after some buds and contain a HMAC covering the authentication node itself
+ * and the buds between the last authentication node and the current
+ * authentication node. It can happen that the last buds cannot be authenticated
+ * because a powercut happened when some nodes were written but not the
+ * corresponding authentication node. This function returns the number of nodes
+ * that could be authenticated or a negative error code.
+ */
+static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+			     struct shash_desc *log_hash, int is_last)
+{
+	int n_not_auth = 0;
+	struct ubifs_scan_node *snod;
+	int n_nodes = 0;
+	int err;
+	u8 *hash, *hmac;
+
+	if (!ubifs_authenticated(c))
+		return sleb->nodes_cnt;
+
+	hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
+	hmac = kmalloc(c->hmac_desc_len, GFP_NOFS);
+	if (!hash || !hmac) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+
+		n_nodes++;
+
+		if (snod->type == UBIFS_AUTH_NODE) {
+			struct ubifs_auth_node *auth = snod->node;
+
+			err = authenticate_sleb_hash(c, log_hash, hash);
+			if (err)
+				goto out;
+
+			err = authenticate_sleb_hmac(c, hash, hmac);
+			if (err)
+				goto out;
+
+			err = ubifs_check_hmac(c, auth->hmac, hmac);
+			if (err) {
+				err = -EPERM;
+				goto out;
+			}
+			n_not_auth = 0;
+		} else {
+			err = crypto_shash_update(log_hash, snod->node,
+						  snod->len);
+			if (err)
+				goto out;
+			n_not_auth++;
+		}
+	}
+
+	/*
+	 * A powercut can happen when some nodes were written, but not yet
+	 * the corresponding authentication node. This may only happen on
+	 * the last bud though.
+	 */
+	if (n_not_auth) {
+		if (is_last) {
+			dbg_mnt("%d unauthenticated nodes found on LEB %d, Ignoring them",
+				n_not_auth, sleb->lnum);
+			err = 0;
+		} else {
+			dbg_mnt("%d unauthenticated nodes found on non-last LEB %d",
+				n_not_auth, sleb->lnum);
+			err = -EPERM;
+		}
+	} else {
+		err = 0;
+	}
+out:
+	kfree(hash);
+	kfree(hmac);
+
+	return err ? err : n_nodes - n_not_auth;
+}
+
 /**
  * replay_bud - replay a bud logical eraseblock.
  * @c: UBIFS file-system description object
@@ -577,6 +681,7 @@
 {
 	int is_last = is_last_bud(c, b->bud);
 	int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
+	int n_nodes, n = 0;
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
 
@@ -596,6 +701,15 @@
 	if (IS_ERR(sleb))
 		return PTR_ERR(sleb);
 
+	n_nodes = authenticate_sleb(c, sleb, b->bud->log_hash, is_last);
+	if (n_nodes < 0) {
+		err = n_nodes;
+		goto out;
+	}
+
+	ubifs_shash_copy_state(c, b->bud->log_hash,
+			       c->jheads[b->bud->jhead].log_hash);
+
 	/*
 	 * The bud does not have to start from offset zero - the beginning of
 	 * the 'lnum' LEB may contain previously committed data. One of the
@@ -619,6 +733,7 @@
 	 */
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
+		u8 hash[UBIFS_HASH_ARR_SZ];
 		int deletion = 0;
 
 		cond_resched();
@@ -628,6 +743,8 @@
 			goto out_dump;
 		}
 
+		ubifs_node_calc_hash(c, snod->node, hash);
+
 		if (snod->sqnum > c->max_sqnum)
 			c->max_sqnum = snod->sqnum;
 
@@ -639,7 +756,7 @@
 
 			if (le32_to_cpu(ino->nlink) == 0)
 				deletion = 1;
-			err = insert_node(c, lnum, snod->offs, snod->len,
+			err = insert_node(c, lnum, snod->offs, snod->len, hash,
 					  &snod->key, snod->sqnum, deletion,
 					  &used, 0, new_size);
 			break;
@@ -651,7 +768,7 @@
 					  key_block(c, &snod->key) *
 					  UBIFS_BLOCK_SIZE;
 
-			err = insert_node(c, lnum, snod->offs, snod->len,
+			err = insert_node(c, lnum, snod->offs, snod->len, hash,
 					  &snod->key, snod->sqnum, deletion,
 					  &used, 0, new_size);
 			break;
@@ -665,7 +782,7 @@
 			if (err)
 				goto out_dump;
 
-			err = insert_dent(c, lnum, snod->offs, snod->len,
+			err = insert_dent(c, lnum, snod->offs, snod->len, hash,
 					  &snod->key, dent->name,
 					  le16_to_cpu(dent->nlen), snod->sqnum,
 					  !le64_to_cpu(dent->inum), &used);
@@ -691,11 +808,13 @@
 			 * functions which expect nodes to have keys.
 			 */
 			trun_key_init(c, &key, le32_to_cpu(trun->inum));
-			err = insert_node(c, lnum, snod->offs, snod->len,
+			err = insert_node(c, lnum, snod->offs, snod->len, hash,
 					  &key, snod->sqnum, 1, &used,
 					  old_size, new_size);
 			break;
 		}
+		case UBIFS_AUTH_NODE:
+			break;
 		default:
 			ubifs_err(c, "unexpected node type %d in bud LEB %d:%d",
 				  snod->type, lnum, snod->offs);
@@ -704,6 +823,10 @@
 		}
 		if (err)
 			goto out;
+
+		n++;
+		if (n == n_nodes)
+			break;
 	}
 
 	ubifs_assert(c, ubifs_search_bud(c, lnum));
@@ -782,6 +905,7 @@
 {
 	struct ubifs_bud *bud;
 	struct bud_entry *b;
+	int err;
 
 	dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
 
@@ -791,13 +915,21 @@
 
 	b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
 	if (!b) {
-		kfree(bud);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto out;
 	}
 
 	bud->lnum = lnum;
 	bud->start = offs;
 	bud->jhead = jhead;
+	bud->log_hash = ubifs_hash_get_desc(c);
+	if (IS_ERR(bud->log_hash)) {
+		err = PTR_ERR(bud->log_hash);
+		goto out;
+	}
+
+	ubifs_shash_copy_state(c, c->log_hash, bud->log_hash);
+
 	ubifs_add_bud(c, bud);
 
 	b->bud = bud;
@@ -805,6 +937,11 @@
 	list_add_tail(&b->list, &c->replay_buds);
 
 	return 0;
+out:
+	kfree(bud);
+	kfree(b);
+
+	return err;
 }
 
 /**
@@ -910,6 +1047,14 @@
 
 		c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
 		dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+
+		err = ubifs_shash_init(c, c->log_hash);
+		if (err)
+			goto out;
+
+		err = ubifs_shash_update(c, c->log_hash, node, UBIFS_CS_NODE_SZ);
+		if (err < 0)
+			goto out;
 	}
 
 	if (snod->sqnum < c->cs_sqnum) {
@@ -957,6 +1102,11 @@
 			if (err)
 				goto out_dump;
 
+			err = ubifs_shash_update(c, c->log_hash, ref,
+						 UBIFS_REF_NODE_SZ);
+			if (err)
+				goto out;
+
 			err = add_replay_bud(c, le32_to_cpu(ref->lnum),
 					     le32_to_cpu(ref->offs),
 					     le32_to_cpu(ref->jhead),
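
The authenticate_sleb() logic above is easiest to read as a running-hash chain.
A hedged pseudo-kernel sketch (buffer sizes are placeholders, error handling is
omitted, and the helpers are the ones visible in the diff):

/* Every ordinary node extends the running log hash; an authentication
 * node must carry an HMAC over the state accumulated so far, so nodes
 * cannot be dropped, reordered or forged between two auth nodes.
 */
crypto_shash_init(log_hash);			/* seeded at the CS node */
list_for_each_entry(snod, &sleb->nodes, list) {
	if (snod->type == UBIFS_AUTH_NODE) {
		struct ubifs_auth_node *auth = snod->node;
		u8 digest[64], mac[64];		/* placeholder sizes */

		authenticate_sleb_hash(c, log_hash, digest);
		authenticate_sleb_hmac(c, digest, mac);
		if (ubifs_check_hmac(c, auth->hmac, mac))
			return -EPERM;		/* chain broken here */
	} else {
		crypto_shash_update(log_hash, snod->node, snod->len);
	}
}
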
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf17f58..a551eb3 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -63,6 +51,17 @@
 /* Default time granularity in nanoseconds */
 #define DEFAULT_TIME_GRAN 1000000000
 
+static int get_default_compressor(struct ubifs_info *c)
+{
+	if (ubifs_compr_present(c, UBIFS_COMPR_LZO))
+		return UBIFS_COMPR_LZO;
+
+	if (ubifs_compr_present(c, UBIFS_COMPR_ZLIB))
+		return UBIFS_COMPR_ZLIB;
+
+	return UBIFS_COMPR_NONE;
+}
+
 /**
  * create_default_filesystem - format empty UBI volume.
  * @c: UBIFS file-system description object
@@ -82,10 +81,13 @@
 	int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
 	int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
 	int min_leb_cnt = UBIFS_MIN_LEB_CNT;
+	int idx_node_size;
 	long long tmp64, main_bytes;
 	__le64 tmp_le64;
 	__le32 tmp_le32;
 	struct timespec64 ts;
+	u8 hash[UBIFS_HASH_ARR_SZ];
+	u8 hash_lpt[UBIFS_HASH_ARR_SZ];
 
 	/* Some functions called from here depend on the @c->key_len field */
 	c->key_len = UBIFS_SK_LEN;
@@ -147,7 +149,7 @@
 	c->lsave_cnt = DEFAULT_LSAVE_CNT;
 	c->max_leb_cnt = c->leb_cnt;
 	err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
-				    &big_lpt);
+				    &big_lpt, hash_lpt);
 	if (err)
 		return err;
 
@@ -156,17 +158,35 @@
 
 	main_first = c->leb_cnt - main_lebs;
 
+	sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL);
+	mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+	idx_node_size = ubifs_idx_node_sz(c, 1);
+	idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL);
+	ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL);
+	cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL);
+
+	if (!sup || !mst || !idx || !ino || !cs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
 	/* Create default superblock */
-	tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
-	sup = kzalloc(tmp, GFP_KERNEL);
-	if (!sup)
-		return -ENOMEM;
 
 	tmp64 = (long long)max_buds * c->leb_size;
 	if (big_lpt)
 		sup_flags |= UBIFS_FLG_BIGLPT;
 	sup_flags |= UBIFS_FLG_DOUBLE_HASH;
 
+	if (ubifs_authenticated(c)) {
+		sup_flags |= UBIFS_FLG_AUTHENTICATION;
+		sup->hash_algo = cpu_to_le16(c->auth_hash_algo);
+		err = ubifs_hmac_wkm(c, sup->hmac_wkm);
+		if (err)
+			goto out;
+	} else {
+		sup->hash_algo = cpu_to_le16(0xffff);
+	}
+
 	sup->ch.node_type  = UBIFS_SB_NODE;
 	sup->key_hash      = UBIFS_KEY_HASH_R5;
 	sup->flags         = cpu_to_le32(sup_flags);
@@ -186,7 +206,7 @@
 	if (c->mount_opts.override_compr)
 		sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
 	else
-		sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
+		sup->default_compr = cpu_to_le16(get_default_compressor(c));
 
 	generate_random_uuid(sup->uuid);
 
@@ -197,17 +217,9 @@
 	sup->rp_size = cpu_to_le64(tmp64);
 	sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
 
-	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0);
-	kfree(sup);
-	if (err)
-		return err;
-
 	dbg_gen("default superblock created at LEB 0:0");
 
 	/* Create default master node */
-	mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
-	if (!mst)
-		return -ENOMEM;
 
 	mst->ch.node_type = UBIFS_MST_NODE;
 	mst->log_lnum     = cpu_to_le32(UBIFS_LOG_LNUM);
@@ -233,6 +245,7 @@
 	mst->empty_lebs   = cpu_to_le32(main_lebs - 2);
 	mst->idx_lebs     = cpu_to_le32(1);
 	mst->leb_cnt      = cpu_to_le32(c->leb_cnt);
+	ubifs_copy_hash(c, hash_lpt, mst->hash_lpt);
 
 	/* Calculate lprops statistics */
 	tmp64 = main_bytes;
@@ -253,24 +266,9 @@
 
 	mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
 
-	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0);
-	if (err) {
-		kfree(mst);
-		return err;
-	}
-	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
-			       0);
-	kfree(mst);
-	if (err)
-		return err;
-
 	dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
 
 	/* Create the root indexing node */
-	tmp = ubifs_idx_node_sz(c, 1);
-	idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
-	if (!idx)
-		return -ENOMEM;
 
 	c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
 	c->key_hash = key_r5_hash;
@@ -282,19 +280,11 @@
 	key_write_idx(c, &key, &br->key);
 	br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
 	br->len  = cpu_to_le32(UBIFS_INO_NODE_SZ);
-	err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0);
-	kfree(idx);
-	if (err)
-		return err;
 
 	dbg_gen("default root indexing node created LEB %d:0",
 		main_first + DEFAULT_IDX_LEB);
 
 	/* Create default root inode */
-	tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
-	ino = kzalloc(tmp, GFP_KERNEL);
-	if (!ino)
-		return -ENOMEM;
 
 	ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
 	ino->ch.node_type = UBIFS_INO_NODE;
@@ -317,12 +307,6 @@
 	/* Set compression enabled by default */
 	ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
 
-	err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
-			       main_first + DEFAULT_DATA_LEB, 0);
-	kfree(ino);
-	if (err)
-		return err;
-
 	dbg_gen("root inode created at LEB %d:0",
 		main_first + DEFAULT_DATA_LEB);
 
@@ -331,19 +315,54 @@
 	 * always the case during normal file-system operation. Write a fake
 	 * commit start node to the log.
 	 */
-	tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
-	cs = kzalloc(tmp, GFP_KERNEL);
-	if (!cs)
-		return -ENOMEM;
 
 	cs->ch.node_type = UBIFS_CS_NODE;
-	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
-	kfree(cs);
+
+	err = ubifs_write_node_hmac(c, sup, UBIFS_SB_NODE_SZ, 0, 0,
+				    offsetof(struct ubifs_sb_node, hmac));
 	if (err)
-		return err;
+		goto out;
+
+	err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
+			       main_first + DEFAULT_DATA_LEB, 0);
+	if (err)
+		goto out;
+
+	ubifs_node_calc_hash(c, ino, hash);
+	ubifs_copy_hash(c, hash, ubifs_branch_hash(c, br));
+
+	err = ubifs_write_node(c, idx, idx_node_size, main_first + DEFAULT_IDX_LEB, 0);
+	if (err)
+		goto out;
+
+	ubifs_node_calc_hash(c, idx, hash);
+	ubifs_copy_hash(c, hash, mst->hash_root_idx);
+
+	err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
+		offsetof(struct ubifs_mst_node, hmac));
+	if (err)
+		goto out;
+
+	err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
+			       0, offsetof(struct ubifs_mst_node, hmac));
+	if (err)
+		goto out;
+
+	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
+	if (err)
+		goto out;
 
 	ubifs_msg(c, "default file-system created");
-	return 0;
+
+	err = 0;
+out:
+	kfree(sup);
+	kfree(mst);
+	kfree(idx);
+	kfree(ino);
+	kfree(cs);
+
+	return err;
 }
 
 /**
@@ -498,7 +517,7 @@
  * code. Note, the user of this function is responsible of kfree()'ing the
  * returned superblock buffer.
  */
-struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
+static struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
 {
 	struct ubifs_sb_node *sup;
 	int err;
@@ -517,6 +536,74 @@
 	return sup;
 }
 
+static int authenticate_sb_node(struct ubifs_info *c,
+				const struct ubifs_sb_node *sup)
+{
+	unsigned int sup_flags = le32_to_cpu(sup->flags);
+	u8 hmac_wkm[UBIFS_HMAC_ARR_SZ];
+	int authenticated = !!(sup_flags & UBIFS_FLG_AUTHENTICATION);
+	int hash_algo;
+	int err;
+
+	if (c->authenticated && !authenticated) {
+		ubifs_err(c, "authenticated FS forced, but found FS without authentication");
+		return -EINVAL;
+	}
+
+	if (!c->authenticated && authenticated) {
+		ubifs_err(c, "authenticated FS found, but no key given");
+		return -EINVAL;
+	}
+
+	ubifs_msg(c, "Mounting in %sauthenticated mode",
+		  c->authenticated ? "" : "un");
+
+	if (!c->authenticated)
+		return 0;
+
+	if (!IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION))
+		return -EOPNOTSUPP;
+
+	hash_algo = le16_to_cpu(sup->hash_algo);
+	if (hash_algo >= HASH_ALGO__LAST) {
+		ubifs_err(c, "superblock uses unknown hash algo %d",
+			  hash_algo);
+		return -EINVAL;
+	}
+
+	if (strcmp(hash_algo_name[hash_algo], c->auth_hash_name)) {
+		ubifs_err(c, "This filesystem uses %s for hashing,"
+			     " but %s is specified", hash_algo_name[hash_algo],
+			     c->auth_hash_name);
+		return -EINVAL;
+	}
+
+	/*
+	 * The super block node can either be authenticated by a HMAC or
+	 * by a signature in a ubifs_sig_node directly following the
+	 * super block node to support offline image creation.
+	 */
+	if (ubifs_hmac_zero(c, sup->hmac)) {
+		err = ubifs_sb_verify_signature(c, sup);
+	} else {
+		err = ubifs_hmac_wkm(c, hmac_wkm);
+		if (err)
+			return err;
+		if (ubifs_check_hmac(c, hmac_wkm, sup->hmac_wkm)) {
+			ubifs_err(c, "provided key does not fit");
+			return -ENOKEY;
+		}
+		err = ubifs_node_verify_hmac(c, sup, sizeof(*sup),
+					     offsetof(struct ubifs_sb_node,
+						      hmac));
+	}
+
+	if (err)
+		ubifs_err(c, "Failed to authenticate superblock: %d", err);
+
+	return err;
+}
+
 /**
  * ubifs_write_sb_node - write superblock node.
  * @c: UBIFS file-system description object
@@ -527,8 +614,13 @@
 int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
 {
 	int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+	int err;
 
-	ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
+	err = ubifs_prepare_node_hmac(c, sup, UBIFS_SB_NODE_SZ,
+				      offsetof(struct ubifs_sb_node, hmac), 1);
+	if (err)
+		return err;
+
 	return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len);
 }
 
@@ -555,6 +647,8 @@
 	if (IS_ERR(sup))
 		return PTR_ERR(sup);
 
+	c->sup_node = sup;
+
 	c->fmt_version = le32_to_cpu(sup->fmt_version);
 	c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
 
@@ -603,7 +697,7 @@
 		c->key_hash = key_test_hash;
 		c->key_hash_type = UBIFS_KEY_HASH_TEST;
 		break;
-	};
+	}
 
 	c->key_fmt = sup->key_fmt;
 
@@ -640,6 +734,10 @@
 	c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH);
 	c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION);
 
+	err = authenticate_sb_node(c, sup);
+	if (err)
+		goto out;
+
 	if ((sup_flags & ~UBIFS_FLG_MASK) != 0) {
 		ubifs_err(c, "Unknown feature flags found: %#x",
 			  sup_flags & ~UBIFS_FLG_MASK);
@@ -647,31 +745,24 @@
 		goto out;
 	}
 
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
-	if (c->encrypted) {
+	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION) && c->encrypted) {
 		ubifs_err(c, "file system contains encrypted files but UBIFS"
 			     " was built without crypto support.");
 		err = -EINVAL;
 		goto out;
 	}
-#endif
 
 	/* Automatically increase file system size to the maximum size */
-	c->old_leb_cnt = c->leb_cnt;
 	if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
+		int old_leb_cnt = c->leb_cnt;
+
 		c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
-		if (c->ro_mount)
-			dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
-				c->old_leb_cnt,	c->leb_cnt);
-		else {
-			dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
-				c->old_leb_cnt, c->leb_cnt);
-			sup->leb_cnt = cpu_to_le32(c->leb_cnt);
-			err = ubifs_write_sb_node(c, sup);
-			if (err)
-				goto out;
-			c->old_leb_cnt = c->leb_cnt;
-		}
+		sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+
+		c->superblock_need_write = 1;
+
+		dbg_mnt("Auto resizing from %d LEBs to %d LEBs",
+			old_leb_cnt, c->leb_cnt);
 	}
 
 	c->log_bytes = (long long)c->log_lebs * c->leb_size;
@@ -686,7 +777,6 @@
 
 	err = validate_sb(c, sup);
 out:
-	kfree(sup);
 	return err;
 }
 
@@ -815,7 +905,7 @@
 int ubifs_fixup_free_space(struct ubifs_info *c)
 {
 	int err;
-	struct ubifs_sb_node *sup;
+	struct ubifs_sb_node *sup = c->sup_node;
 
 	ubifs_assert(c, c->space_fixup);
 	ubifs_assert(c, !c->ro_mount);
@@ -826,18 +916,11 @@
 	if (err)
 		return err;
 
-	sup = ubifs_read_sb_node(c);
-	if (IS_ERR(sup))
-		return PTR_ERR(sup);
-
 	/* Free-space fixup is no longer required */
 	c->space_fixup = 0;
 	sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
 
-	err = ubifs_write_sb_node(c, sup);
-	kfree(sup);
-	if (err)
-		return err;
+	c->superblock_need_write = 1;
 
 	ubifs_msg(c, "free space fixup complete");
 	return err;
@@ -846,7 +929,10 @@
 int ubifs_enable_encryption(struct ubifs_info *c)
 {
 	int err;
-	struct ubifs_sb_node *sup;
+	struct ubifs_sb_node *sup = c->sup_node;
+
+	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION))
+		return -EOPNOTSUPP;
 
 	if (c->encrypted)
 		return 0;
@@ -859,16 +945,11 @@
 		return -EINVAL;
 	}
 
-	sup = ubifs_read_sb_node(c);
-	if (IS_ERR(sup))
-		return PTR_ERR(sup);
-
 	sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION);
 
 	err = ubifs_write_sb_node(c, sup);
 	if (!err)
 		c->encrypted = 1;
-	kfree(sup);
 
 	return err;
 }
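
create_default_filesystem() now allocates every node buffer up front and
funnels all failures through one label, which also lets the hashes be threaded
bottom-up (root inode hash into the index branch, index hash into the master
node) before anything hits flash. The cleanup idiom in isolation (names are
generic, not from the tree); it works because kfree(NULL) is a no-op:

static int build_default_nodes(void)
{
	struct foo *a = kzalloc(sizeof(*a), GFP_KERNEL);
	struct bar *b = kzalloc(sizeof(*b), GFP_KERNEL);
	int err = -ENOMEM;

	if (!a || !b)
		goto out;		/* partial allocation is fine */

	err = write_nodes(a, b);	/* hypothetical work */
out:
	kfree(a);			/* kfree(NULL) is a no-op */
	kfree(b);
	return err;
}
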
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index ea88926..c69cdb5 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 5eb5958..d00a6f2 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index fec62e9..7d4547e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -129,9 +117,10 @@
 		goto out_ino;
 
 	inode->i_flags |= S_NOCMTIME;
-#ifndef CONFIG_UBIFS_ATIME_SUPPORT
-	inode->i_flags |= S_NOATIME;
-#endif
+
+	if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
+		inode->i_flags |= S_NOATIME;
+
 	set_nlink(inode, le32_to_cpu(ino->nlink));
 	i_uid_write(inode, le32_to_cpu(ino->uid));
 	i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -272,19 +261,14 @@
 	return &ui->vfs_inode;
 };
 
-static void ubifs_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	struct ubifs_inode *ui = ubifs_inode(inode);
-	kmem_cache_free(ubifs_inode_slab, ui);
-}
-
-static void ubifs_destroy_inode(struct inode *inode)
+static void ubifs_free_inode(struct inode *inode)
 {
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	kfree(ui->data);
-	call_rcu(&inode->i_rcu, ubifs_i_callback);
+	fscrypt_free_inode(inode);
+
+	kmem_cache_free(ubifs_inode_slab, ui);
 }
 
 /*
@@ -334,6 +318,16 @@
 	return err;
 }
 
+static int ubifs_drop_inode(struct inode *inode)
+{
+	int drop = generic_drop_inode(inode);
+
+	if (!drop)
+		drop = fscrypt_drop_inode(inode);
+
+	return drop;
+}
+
 static void ubifs_evict_inode(struct inode *inode)
 {
 	int err;
@@ -579,6 +573,11 @@
 	c->ranges[UBIFS_REF_NODE].len  = UBIFS_REF_NODE_SZ;
 	c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
 	c->ranges[UBIFS_CS_NODE].len   = UBIFS_CS_NODE_SZ;
+	c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ;
+	c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ +
+				UBIFS_MAX_HMAC_LEN;
+	c->ranges[UBIFS_SIG_NODE].min_len = UBIFS_SIG_NODE_SZ;
+	c->ranges[UBIFS_SIG_NODE].max_len = c->leb_size - UBIFS_SB_NODE_SZ;
 
 	c->ranges[UBIFS_INO_NODE].min_len  = UBIFS_INO_NODE_SZ;
 	c->ranges[UBIFS_INO_NODE].max_len  = UBIFS_MAX_INO_NODE_SZ;
@@ -620,6 +619,10 @@
 	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
 	if (c->max_bu_buf_len > c->leb_size)
 		c->max_bu_buf_len = c->leb_size;
+
+	/* Log is ready, preserve one LEB for commits. */
+	c->min_log_bytes = c->leb_size;
+
 	return 0;
 }
 
@@ -816,6 +819,11 @@
 		c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
 		c->jheads[i].wbuf.jhead = i;
 		c->jheads[i].grouped = 1;
+		c->jheads[i].log_hash = ubifs_hash_get_desc(c);
+		if (IS_ERR(c->jheads[i].log_hash)) {
+			err = PTR_ERR(c->jheads[i].log_hash);
+			goto out;
+		}
 	}
 
 	/*
@@ -826,6 +832,12 @@
 	c->jheads[GCHD].grouped = 0;
 
 	return 0;
+
+out:
+	while (i--)
+		kfree(c->jheads[i].log_hash);
+
+	return err;
 }
 
 /**
@@ -840,6 +852,7 @@
 		for (i = 0; i < c->jhead_cnt; i++) {
 			kfree(c->jheads[i].wbuf.buf);
 			kfree(c->jheads[i].wbuf.inodes);
+			kfree(c->jheads[i].log_hash);
 		}
 		kfree(c->jheads);
 		c->jheads = NULL;
@@ -924,6 +937,8 @@
  * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
  * Opt_override_compr: override default compressor
  * Opt_assert: set ubifs_assert() action
+ * Opt_auth_key: The key name used for authentication
+ * Opt_auth_hash_name: The hash type used for authentication
  * Opt_err: just end of array marker
  */
 enum {
@@ -935,6 +950,8 @@
 	Opt_no_chk_data_crc,
 	Opt_override_compr,
 	Opt_assert,
+	Opt_auth_key,
+	Opt_auth_hash_name,
 	Opt_ignore,
 	Opt_err,
 };
@@ -947,6 +964,8 @@
 	{Opt_chk_data_crc, "chk_data_crc"},
 	{Opt_no_chk_data_crc, "no_chk_data_crc"},
 	{Opt_override_compr, "compr=%s"},
+	{Opt_auth_key, "auth_key=%s"},
+	{Opt_auth_hash_name, "auth_hash_name=%s"},
 	{Opt_ignore, "ubi=%s"},
 	{Opt_ignore, "vol=%s"},
 	{Opt_assert, "assert=%s"},
@@ -1040,6 +1059,8 @@
 				c->mount_opts.compr_type = UBIFS_COMPR_LZO;
 			else if (!strcmp(name, "zlib"))
 				c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
+			else if (!strcmp(name, "zstd"))
+				c->mount_opts.compr_type = UBIFS_COMPR_ZSTD;
 			else {
 				ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
 				kfree(name);
@@ -1070,6 +1091,16 @@
 			kfree(act);
 			break;
 		}
+		case Opt_auth_key:
+			c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL);
+			if (!c->auth_key_name)
+				return -ENOMEM;
+			break;
+		case Opt_auth_hash_name:
+			c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL);
+			if (!c->auth_hash_name)
+				return -ENOMEM;
+			break;
 		case Opt_ignore:
 			break;
 		default:
@@ -1249,6 +1280,19 @@
 
 	c->mounting = 1;
 
+	if (c->auth_key_name) {
+		if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) {
+			err = ubifs_init_authentication(c);
+			if (err)
+				goto out_free;
+		} else {
+			ubifs_err(c, "auth_key_name, but UBIFS is built without"
+				  " authentication support");
+			err = -EINVAL;
+			goto out_free;
+		}
+	}
+
 	err = ubifs_read_superblock(c);
 	if (err)
 		goto out_free;
@@ -1270,8 +1314,7 @@
 	if (err)
 		goto out_free;
 
-	sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
-	sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
+	sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2;
 	c->cbuf = kmalloc(sz, GFP_NOFS);
 	if (!c->cbuf) {
 		err = -ENOMEM;
@@ -1334,6 +1377,26 @@
 			goto out_lpt;
 	}
 
+	/*
+	 * Handle offline signed images: Now that the master node is
+	 * written and its validation no longer depends on the hash
+	 * in the superblock, we can update the offline signed
+	 * superblock with an HMAC version.
+	 */
+	if (ubifs_authenticated(c) && ubifs_hmac_zero(c, c->sup_node->hmac)) {
+		err = ubifs_hmac_wkm(c, c->sup_node->hmac_wkm);
+		if (err)
+			goto out_lpt;
+		c->superblock_need_write = 1;
+	}
+
+	if (!c->ro_mount && c->superblock_need_write) {
+		err = ubifs_write_sb_node(c, c->sup_node);
+		if (err)
+			goto out_lpt;
+		c->superblock_need_write = 0;
+	}
+
 	err = dbg_check_idx_size(c, c->bi.old_idx_sz);
 	if (err)
 		goto out_lpt;
@@ -1367,12 +1430,21 @@
 		}
 
 		if (c->need_recovery) {
-			err = ubifs_recover_size(c);
-			if (err)
-				goto out_orphans;
+			if (!ubifs_authenticated(c)) {
+				err = ubifs_recover_size(c, true);
+				if (err)
+					goto out_orphans;
+			}
+
 			err = ubifs_rcvry_gc_commit(c);
 			if (err)
 				goto out_orphans;
+
+			if (ubifs_authenticated(c)) {
+				err = ubifs_recover_size(c, false);
+				if (err)
+					goto out_orphans;
+			}
 		} else {
 			err = take_gc_lnum(c);
 			if (err)
@@ -1391,7 +1463,7 @@
 		if (err)
 			goto out_orphans;
 	} else if (c->need_recovery) {
-		err = ubifs_recover_size(c);
+		err = ubifs_recover_size(c, false);
 		if (err)
 			goto out_orphans;
 	} else {
@@ -1430,9 +1502,7 @@
 	if (err)
 		goto out_infos;
 
-	err = dbg_debugfs_init_fs(c);
-	if (err)
-		goto out_infos;
+	dbg_debugfs_init_fs(c);
 
 	c->mounting = 0;
 
@@ -1499,6 +1569,8 @@
 		c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
 	dbg_gen("max. seq. number:    %llu", c->max_sqnum);
 	dbg_gen("commit number:       %llu", c->cmt_no);
+	dbg_gen("max. xattrs per inode: %d", ubifs_xattr_max_cnt(c));
+	dbg_gen("max orphans:           %d", c->max_orphans);
 
 	return 0;
 
@@ -1557,7 +1629,10 @@
 	free_wbufs(c);
 	free_orphans(c);
 	ubifs_lpt_free(c, 0);
+	ubifs_exit_authentication(c);
 
+	kfree(c->auth_key_name);
+	kfree(c->auth_hash_name);
 	kfree(c->cbuf);
 	kfree(c->rcvrd_mst_node);
 	kfree(c->mst_node);
@@ -1604,29 +1679,16 @@
 	if (err)
 		goto out;
 
-	if (c->old_leb_cnt != c->leb_cnt) {
-		struct ubifs_sb_node *sup;
-
-		sup = ubifs_read_sb_node(c);
-		if (IS_ERR(sup)) {
-			err = PTR_ERR(sup);
-			goto out;
-		}
-		sup->leb_cnt = cpu_to_le32(c->leb_cnt);
-		err = ubifs_write_sb_node(c, sup);
-		kfree(sup);
-		if (err)
-			goto out;
-	}
-
 	if (c->need_recovery) {
 		ubifs_msg(c, "completing deferred recovery");
 		err = ubifs_write_rcvrd_mst_node(c);
 		if (err)
 			goto out;
-		err = ubifs_recover_size(c);
-		if (err)
-			goto out;
+		if (!ubifs_authenticated(c)) {
+			err = ubifs_recover_size(c, true);
+			if (err)
+				goto out;
+		}
 		err = ubifs_clean_lebs(c, c->sbuf);
 		if (err)
 			goto out;
@@ -1648,6 +1710,16 @@
 			goto out;
 	}
 
+	if (c->superblock_need_write) {
+		struct ubifs_sb_node *sup = c->sup_node;
+
+		err = ubifs_write_sb_node(c, sup);
+		if (err)
+			goto out;
+
+		c->superblock_need_write = 0;
+	}
+
 	c->ileb_buf = vmalloc(c->leb_size);
 	if (!c->ileb_buf) {
 		err = -ENOMEM;
@@ -1692,10 +1764,19 @@
 			goto out;
 	}
 
-	if (c->need_recovery)
+	if (c->need_recovery) {
 		err = ubifs_rcvry_gc_commit(c);
-	else
+		if (err)
+			goto out;
+
+		if (ubifs_authenticated(c)) {
+			err = ubifs_recover_size(c, false);
+			if (err)
+				goto out;
+		}
+	} else {
 		err = ubifs_leb_unmap(c, c->gc_lnum);
+	}
 	if (err)
 		goto out;
 
@@ -1920,9 +2001,10 @@
 
 const struct super_operations ubifs_super_operations = {
 	.alloc_inode   = ubifs_alloc_inode,
-	.destroy_inode = ubifs_destroy_inode,
+	.free_inode    = ubifs_free_inode,
 	.put_super     = ubifs_put_super,
 	.write_inode   = ubifs_write_inode,
+	.drop_inode    = ubifs_drop_inode,
 	.evict_inode   = ubifs_evict_inode,
 	.statfs        = ubifs_statfs,
 	.dirty_inode   = ubifs_dirty_inode,
@@ -2087,9 +2169,7 @@
 #ifdef CONFIG_UBIFS_FS_XATTR
 	sb->s_xattr = ubifs_xattr_handlers;
 #endif
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
-	sb->s_cop = &ubifs_crypt_operations;
-#endif
+	fscrypt_set_ops(sb, &ubifs_crypt_operations);
 
 	mutex_lock(&c->umount_mutex);
 	err = mount_ubifs(c);
@@ -2187,15 +2267,16 @@
 		}
 	} else {
 		err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
-		if (err)
+		if (err) {
+			kfree(c);
 			goto out_deact;
+		}
 		/* We do not support atime */
 		sb->s_flags |= SB_ACTIVE;
-#ifndef CONFIG_UBIFS_ATIME_SUPPORT
-		sb->s_flags |= SB_NOATIME;
-#else
-		ubifs_msg(c, "full atime support is enabled.");
-#endif
+		if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
+			ubifs_msg(c, "full atime support is enabled.");
+		else
+			sb->s_flags |= SB_NOATIME;
 	}
 
 	/* 'fill_super()' opens ubi again so we must close it here */
@@ -2310,9 +2391,7 @@
 	if (err)
 		goto out_shrinker;
 
-	err = dbg_debugfs_init();
-	if (err)
-		goto out_compr;
+	dbg_debugfs_init();
 
 	err = register_filesystem(&ubifs_fs_type);
 	if (err) {
@@ -2324,7 +2403,6 @@
 
 out_dbg:
 	dbg_debugfs_exit();
-out_compr:
 	ubifs_compressors_exit();
 out_shrinker:
 	unregister_shrinker(&ubifs_shrinker_info);
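
Several super.c hunks above replace preprocessor conditionals with
IS_ENABLED(), so both branches are always compiled and type-checked before the
dead one is discarded. The pattern, restated:

/* Before: the disabled branch was invisible to the compiler, so a
 * config without the feature could hide build breakage.
 */
#ifndef CONFIG_UBIFS_ATIME_SUPPORT
	inode->i_flags |= S_NOATIME;
#endif

/* After: the branch is always parsed, then optimized away when the
 * option is off.
 */
	if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
		inode->i_flags |= S_NOATIME;
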
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index bf416e5..e8e7b0e 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -35,7 +23,7 @@
 #include "ubifs.h"
 
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
-			 int len, int lnum, int offs);
+			 struct ubifs_zbranch *zbr);
 static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
 			      struct ubifs_zbranch *zbr, void *node);
 
@@ -433,9 +421,7 @@
  * @c: UBIFS file-system description object
  * @buf: buffer to read to
  * @type: node type
- * @len: node length (not aligned)
- * @lnum: LEB number of node to read
- * @offs: offset of node to read
+ * @zbr: the zbranch describing the node to read
  *
  * This function tries to read a node of known type and length, checks it and
  * stores it in @buf. This function returns %1 if a node is present and %0 if
@@ -453,8 +439,11 @@
  * journal nodes may potentially be corrupted, so checking is required.
  */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
-			 int len, int lnum, int offs)
+			 struct ubifs_zbranch *zbr)
 {
+	int len = zbr->len;
+	int lnum = zbr->lnum;
+	int offs = zbr->offs;
 	int err, node_len;
 	struct ubifs_ch *ch = buf;
 	uint32_t crc, node_crc;
@@ -478,14 +467,19 @@
 	if (node_len != len)
 		return 0;
 
-	if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
-	    !c->remounting_rw)
-		return 1;
+	if (type != UBIFS_DATA_NODE || !c->no_chk_data_crc || c->mounting ||
+	    c->remounting_rw) {
+		crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+		node_crc = le32_to_cpu(ch->crc);
+		if (crc != node_crc)
+			return 0;
+	}
 
-	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
-	node_crc = le32_to_cpu(ch->crc);
-	if (crc != node_crc)
+	err = ubifs_node_check_hash(c, buf, zbr->hash);
+	if (err) {
+		ubifs_bad_hash(c, buf, zbr->hash, lnum, offs);
 		return 0;
+	}
 
 	return 1;
 }
@@ -507,8 +501,7 @@
 
 	dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
 
-	ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
-			    zbr->offs);
+	ret = try_read_node(c, node, key_type(c, key), zbr);
 	if (ret == 1) {
 		union ubifs_key node_key;
 		struct ubifs_dent_node *dent = node;
@@ -1165,8 +1158,8 @@
  *   o exact match, i.e. the found zero-level znode contains key @key, then %1
  *     is returned and slot number of the matched branch is stored in @n;
  *   o not exact match, which means that zero-level znode does not contain
- *     @key, then %0 is returned and slot number of the closest branch is stored
- *     in @n;
+ *     @key, then %0 is returned and slot number of the closest branch or %-1
+ *     is stored in @n; In this case calling tnc_next() is mandatory.
  *   o @key is so small that it is even less than the lowest key of the
  *     leftmost zero-level node, then %0 is returned and %0 is stored in @n.
  *
@@ -1713,6 +1706,12 @@
 		goto out;
 	}
 
+	err = ubifs_node_check_hash(c, buf, zbr->hash);
+	if (err) {
+		ubifs_bad_hash(c, buf, zbr->hash, zbr->lnum, zbr->offs);
+		return err;
+	}
+
 	len = le32_to_cpu(ch->len);
 	if (len != zbr->len) {
 		ubifs_err(c, "bad node length %d, expected %d", len, zbr->len);
@@ -1883,13 +1882,19 @@
 
 static int search_dh_cookie(struct ubifs_info *c, const union ubifs_key *key,
 			    struct ubifs_dent_node *dent, uint32_t cookie,
-			    struct ubifs_znode **zn, int *n)
+			    struct ubifs_znode **zn, int *n, int exact)
 {
 	int err;
 	struct ubifs_znode *znode = *zn;
 	struct ubifs_zbranch *zbr;
 	union ubifs_key *dkey;
 
+	if (!exact) {
+		err = tnc_next(c, &znode, n);
+		if (err)
+			return err;
+	}
+
 	for (;;) {
 		zbr = &znode->zbranch[*n];
 		dkey = &zbr->key;
@@ -1931,7 +1936,7 @@
 	if (unlikely(err < 0))
 		goto out_unlock;
 
-	err = search_dh_cookie(c, key, dent, cookie, &znode, &n);
+	err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err);
 
 out_unlock:
 	mutex_unlock(&c->tnc_mutex);
@@ -2260,13 +2265,14 @@
  * @lnum: LEB number of node
  * @offs: node offset
  * @len: node length
+ * @hash: The hash over the node
  *
  * This function adds a node with key @key to TNC. The node may be new or it may
  * obsolete some existing one. Returns %0 on success or negative error code on
  * failure.
  */
 int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
-		  int offs, int len)
+		  int offs, int len, const u8 *hash)
 {
 	int found, n, err = 0;
 	struct ubifs_znode *znode;
@@ -2281,6 +2287,7 @@
 		zbr.lnum = lnum;
 		zbr.offs = offs;
 		zbr.len = len;
+		ubifs_copy_hash(c, hash, zbr.hash);
 		key_copy(c, key, &zbr.key);
 		err = tnc_insert(c, znode, &zbr, n + 1);
 	} else if (found == 1) {
@@ -2291,6 +2298,7 @@
 		zbr->lnum = lnum;
 		zbr->offs = offs;
 		zbr->len = len;
+		ubifs_copy_hash(c, hash, zbr->hash);
 	} else
 		err = found;
 	if (!err)
@@ -2392,13 +2400,14 @@
  * @lnum: LEB number of node
  * @offs: node offset
  * @len: node length
+ * @hash: The hash over the node
  * @nm: node name
  *
  * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
  * may have collisions, like directory entry keys.
  */
 int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
-		     int lnum, int offs, int len,
+		     int lnum, int offs, int len, const u8 *hash,
 		     const struct fscrypt_name *nm)
 {
 	int found, n, err = 0;
@@ -2441,6 +2450,7 @@
 			zbr->lnum = lnum;
 			zbr->offs = offs;
 			zbr->len = len;
+			ubifs_copy_hash(c, hash, zbr->hash);
 			goto out_unlock;
 		}
 	}
@@ -2452,6 +2462,7 @@
 		zbr.lnum = lnum;
 		zbr.offs = offs;
 		zbr.len = len;
+		ubifs_copy_hash(c, hash, zbr.hash);
 		key_copy(c, key, &zbr.key);
 		err = tnc_insert(c, znode, &zbr, n + 1);
 		if (err)
@@ -2718,7 +2729,7 @@
 		if (unlikely(err < 0))
 			goto out_free;
 
-		err = search_dh_cookie(c, key, dent, cookie, &znode, &n);
+		err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err);
 		if (err)
 			goto out_free;
 	}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index dba87d0..a384a0f 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -38,6 +26,7 @@
 			 struct ubifs_znode *znode, int lnum, int offs, int len)
 {
 	struct ubifs_znode *zp;
+	u8 hash[UBIFS_HASH_ARR_SZ];
 	int i, err;
 
 	/* Make index node */
@@ -52,6 +41,7 @@
 		br->lnum = cpu_to_le32(zbr->lnum);
 		br->offs = cpu_to_le32(zbr->offs);
 		br->len = cpu_to_le32(zbr->len);
+		ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br));
 		if (!zbr->lnum || !zbr->len) {
 			ubifs_err(c, "bad ref in znode");
 			ubifs_dump_znode(c, znode);
@@ -62,6 +52,7 @@
 		}
 	}
 	ubifs_prepare_node(c, idx, len, 0);
+	ubifs_node_calc_hash(c, idx, hash);
 
 	znode->lnum = lnum;
 	znode->offs = offs;
@@ -78,10 +69,12 @@
 		zbr->lnum = lnum;
 		zbr->offs = offs;
 		zbr->len = len;
+		ubifs_copy_hash(c, hash, zbr->hash);
 	} else {
 		c->zroot.lnum = lnum;
 		c->zroot.offs = offs;
 		c->zroot.len = len;
+		ubifs_copy_hash(c, hash, c->zroot.hash);
 	}
 	c->calc_idx_sz += ALIGN(len, 8);
 
@@ -647,6 +640,8 @@
 			znode->cnext = c->cnext;
 			break;
 		}
+		znode->cparent = znode->parent;
+		znode->ciip = znode->iip;
 		znode->cnext = cnext;
 		znode = cnext;
 		cnt += 1;
@@ -840,6 +835,8 @@
 	}
 
 	while (1) {
+		u8 hash[UBIFS_HASH_ARR_SZ];
+
 		cond_resched();
 
 		znode = cnext;
@@ -857,6 +854,7 @@
 			br->lnum = cpu_to_le32(zbr->lnum);
 			br->offs = cpu_to_le32(zbr->offs);
 			br->len = cpu_to_le32(zbr->len);
+			ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br));
 			if (!zbr->lnum || !zbr->len) {
 				ubifs_err(c, "bad ref in znode");
 				ubifs_dump_znode(c, znode);
@@ -868,6 +866,23 @@
 		}
 		len = ubifs_idx_node_sz(c, znode->child_cnt);
 		ubifs_prepare_node(c, idx, len, 0);
+		ubifs_node_calc_hash(c, idx, hash);
+
+		mutex_lock(&c->tnc_mutex);
+
+		if (znode->cparent)
+			ubifs_copy_hash(c, hash,
+					znode->cparent->zbranch[znode->ciip].hash);
+
+		if (znode->parent) {
+			if (!ubifs_zn_obsolete(znode))
+				ubifs_copy_hash(c, hash,
+					znode->parent->zbranch[znode->iip].hash);
+		} else {
+			ubifs_copy_hash(c, hash, c->zroot.hash);
+		}
+
+		mutex_unlock(&c->tnc_mutex);
 
 		/* Determine the index node position */
 		if (lnum == -1) {
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index d90ee01..49cb34c 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Adrian Hunter
  *          Artem Bityutskiy (Битюцкий Артём)
  */
@@ -265,9 +253,7 @@
 /**
  * read_znode - read an indexing node from flash and fill znode.
  * @c: UBIFS file-system description object
- * @lnum: LEB of the indexing node to read
- * @offs: node offset
- * @len: node length
+ * @zzbr: the zbranch describing the node to read
  * @znode: znode to read to
  *
  * This function reads an indexing node from the flash media and fills znode
@@ -276,9 +262,12 @@
  * is wrong with it, this function prints complaint messages and returns
  * %-EINVAL.
  */
-static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
+static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr,
 		      struct ubifs_znode *znode)
 {
+	int lnum = zzbr->lnum;
+	int offs = zzbr->offs;
+	int len = zzbr->len;
 	int i, err, type, cmp;
 	struct ubifs_idx_node *idx;
 
@@ -292,6 +281,13 @@
 		return err;
 	}
 
+	err = ubifs_node_check_hash(c, idx, zzbr->hash);
+	if (err) {
+		ubifs_bad_hash(c, idx, zzbr->hash, lnum, offs);
+		kfree(idx);
+		return err;
+	}
+
 	znode->child_cnt = le16_to_cpu(idx->child_cnt);
 	znode->level = le16_to_cpu(idx->level);
 
@@ -308,13 +304,14 @@
 	}
 
 	for (i = 0; i < znode->child_cnt; i++) {
-		const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+		struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
 		struct ubifs_zbranch *zbr = &znode->zbranch[i];
 
 		key_read(c, &br->key, &zbr->key);
 		zbr->lnum = le32_to_cpu(br->lnum);
 		zbr->offs = le32_to_cpu(br->offs);
 		zbr->len  = le32_to_cpu(br->len);
+		ubifs_copy_hash(c, ubifs_branch_hash(c, br), zbr->hash);
 		zbr->znode = NULL;
 
 		/* Validate branch */
@@ -425,7 +422,7 @@
 	if (!znode)
 		return ERR_PTR(-ENOMEM);
 
-	err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
+	err = read_znode(c, zbr, znode);
 	if (err)
 		goto out;
 
@@ -496,5 +493,11 @@
 		return -EINVAL;
 	}
 
+	err = ubifs_node_check_hash(c, node, zbr->hash);
+	if (err) {
+		ubifs_bad_hash(c, node, zbr->hash, zbr->lnum, zbr->offs);
+		return err;
+	}
+
 	return 0;
 }
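The tnc_misc.c changes are the read half of the same scheme: read_znode() now receives the whole zbranch instead of a bare lnum/offs/len triple, so the expected hash travels with the location and every index or leaf node read from flash is verified before any of its fields are trusted. A hedged sketch of the verify-on-read pattern (toy XOR digest for illustration only; the kernel computes real digests via the crypto API and compares them with constant-time crypto_memneq()):

#include <errno.h>
#include <stddef.h>
#include <string.h>

#define HASH_LEN 32

struct ref {				/* stand-in for struct ubifs_zbranch */
	int lnum, offs, len;
	unsigned char hash[HASH_LEN];
};

static void toy_hash(const void *buf, size_t len, unsigned char out[HASH_LEN])
{
	const unsigned char *p = buf;

	memset(out, 0, HASH_LEN);
	while (len--)
		out[len % HASH_LEN] ^= p[len];	/* toy digest, not crypto */
}

/* Verify @buf (@ref->len bytes already read from @ref->lnum:@ref->offs). */
static int check_against_ref(const void *buf, const struct ref *ref)
{
	unsigned char got[HASH_LEN];

	toy_hash(buf, (size_t)ref->len, got);
	return memcmp(ref->hash, got, HASH_LEN) ? -EINVAL : 0;
}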
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index e8c23c9..3c9792c 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -1,21 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -286,6 +274,9 @@
 #define UBIFS_IDX_NODE_SZ  sizeof(struct ubifs_idx_node)
 #define UBIFS_CS_NODE_SZ   sizeof(struct ubifs_cs_node)
 #define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node)
+#define UBIFS_AUTH_NODE_SZ sizeof(struct ubifs_auth_node)
+#define UBIFS_SIG_NODE_SZ  sizeof(struct ubifs_sig_node)
+
 /* Extended attribute entry nodes are identical to directory entry nodes */
 #define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ
 /* Only this does not have to be multiple of 8 bytes */
@@ -300,12 +291,20 @@
 /* The largest UBIFS node */
 #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
 
+/* The maximum size of a hash, enough for sha512 */
+#define UBIFS_MAX_HASH_LEN 64
+
+/* The maximum size of an HMAC, enough for hmac(sha512) */
+#define UBIFS_MAX_HMAC_LEN 64
+
 /*
  * xattr name of UBIFS encryption context, we don't use a prefix
  * nor a long name to not waste space on the flash.
  */
 #define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
 
+/* Type field in ubifs_sig_node */
+#define UBIFS_SIGNATURE_TYPE_PKCS7	1
 
 /*
  * On-flash inode flags.
@@ -341,12 +340,14 @@
  * UBIFS_COMPR_NONE: no compression
  * UBIFS_COMPR_LZO: LZO compression
  * UBIFS_COMPR_ZLIB: ZLIB compression
+ * UBIFS_COMPR_ZSTD: ZSTD compression
  * UBIFS_COMPR_TYPES_CNT: count of supported compression types
  */
 enum {
 	UBIFS_COMPR_NONE,
 	UBIFS_COMPR_LZO,
 	UBIFS_COMPR_ZLIB,
+	UBIFS_COMPR_ZSTD,
 	UBIFS_COMPR_TYPES_CNT,
 };
 
@@ -365,6 +366,8 @@
  * UBIFS_IDX_NODE: index node
  * UBIFS_CS_NODE: commit start node
  * UBIFS_ORPH_NODE: orphan node
+ * UBIFS_AUTH_NODE: authentication node
+ * UBIFS_SIG_NODE: signature node
  * UBIFS_NODE_TYPES_CNT: count of supported node types
  *
  * Note, we index arrays by these numbers, so keep them low and contiguous.
@@ -384,6 +387,8 @@
 	UBIFS_IDX_NODE,
 	UBIFS_CS_NODE,
 	UBIFS_ORPH_NODE,
+	UBIFS_AUTH_NODE,
+	UBIFS_SIG_NODE,
 	UBIFS_NODE_TYPES_CNT,
 };
 
@@ -421,15 +426,19 @@
  * UBIFS_FLG_DOUBLE_HASH: store a 32bit cookie in directory entry nodes to
  *			  support 64bit cookies for lookups by hash
  * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files
+ * UBIFS_FLG_AUTHENTICATION: this filesystem contains hashes for authentication
  */
 enum {
 	UBIFS_FLG_BIGLPT = 0x02,
 	UBIFS_FLG_SPACE_FIXUP = 0x04,
 	UBIFS_FLG_DOUBLE_HASH = 0x08,
 	UBIFS_FLG_ENCRYPTION = 0x10,
+	UBIFS_FLG_AUTHENTICATION = 0x20,
 };
 
-#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION)
+#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT | UBIFS_FLG_SPACE_FIXUP | \
+		UBIFS_FLG_DOUBLE_HASH | UBIFS_FLG_ENCRYPTION | \
+		UBIFS_FLG_AUTHENTICATION)
 
 /**
  * struct ubifs_ch - common header node.
@@ -633,6 +642,12 @@
  * @time_gran: time granularity in nanoseconds
  * @uuid: UUID generated when the file system image was created
  * @ro_compat_version: UBIFS R/O compatibility version
+ * @hmac: HMAC to authenticate the superblock node
+ * @hmac_wkm: HMAC of a well-known message (the string "UBIFS") as a
+ *            convenience to the user to check if the correct key is passed.
+ * @hash_algo: The hash algo used for this filesystem (one of enum hash_algo)
+ * @hash_mst: hash of the master node, only valid for signed images in which the
+ *            master node does not contain a hmac
  */
 struct ubifs_sb_node {
 	struct ubifs_ch ch;
@@ -660,7 +675,11 @@
 	__le32 time_gran;
 	__u8 uuid[16];
 	__le32 ro_compat_version;
-	__u8 padding2[3968];
+	__u8 hmac[UBIFS_MAX_HMAC_LEN];
+	__u8 hmac_wkm[UBIFS_MAX_HMAC_LEN];
+	__le16 hash_algo;
+	__u8 hash_mst[UBIFS_MAX_HASH_LEN];
+	__u8 padding2[3774];
 } __packed;
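The shrunk @padding2 exactly absorbs the new fields: two 64-byte HMACs, a 2-byte hash_algo, and a 64-byte master-node hash add up to 194 bytes, the same as 3968 - 3774, so the on-flash superblock node keeps its size. An illustrative compile-time check of that arithmetic (not kernel code):

#define MAX_HASH_LEN 64
#define MAX_HMAC_LEN 64

_Static_assert(MAX_HMAC_LEN		/* hmac */
	       + MAX_HMAC_LEN		/* hmac_wkm */
	       + 2			/* __le16 hash_algo */
	       + MAX_HASH_LEN		/* hash_mst */
	       == 3968 - 3774, "superblock node size would change");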
 
 /**
@@ -695,6 +714,9 @@
  * @empty_lebs: number of empty logical eraseblocks
  * @idx_lebs: number of indexing logical eraseblocks
  * @leb_cnt: count of LEBs used by file-system
+ * @hash_root_idx: the hash of the root index node
+ * @hash_lpt: the hash of the LPT
+ * @hmac: HMAC to authenticate the master node
  * @padding: reserved for future, zeroes
  */
 struct ubifs_mst_node {
@@ -727,7 +749,10 @@
 	__le32 empty_lebs;
 	__le32 idx_lebs;
 	__le32 leb_cnt;
-	__u8 padding[344];
+	__u8 hash_root_idx[UBIFS_MAX_HASH_LEN];
+	__u8 hash_lpt[UBIFS_MAX_HASH_LEN];
+	__u8 hmac[UBIFS_MAX_HMAC_LEN];
+	__u8 padding[152];
 } __packed;
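The master node preserves the same invariant: the three new 64-byte fields total 192 bytes, matching the padding trim from 344 down to 152. As a one-line illustrative check:

_Static_assert(64 + 64 + 64 == 344 - 152, "master node size would change");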
 
 /**
@@ -747,11 +772,42 @@
 } __packed;
 
 /**
+ * struct ubifs_auth_node - node for authenticating other nodes
+ * @ch: common header
+ * @hmac: The HMAC
+ */
+struct ubifs_auth_node {
+	struct ubifs_ch ch;
+	__u8 hmac[];
+} __packed;
+
+/**
+ * struct ubifs_sig_node - node for signing other nodes
+ * @ch: common header
+ * @type: type of the signature; currently only UBIFS_SIGNATURE_TYPE_PKCS7
+ *        is supported
+ * @len: The length of the signature data
+ * @padding: reserved for future, zeroes
+ * @sig: The signature data
+ */
+struct ubifs_sig_node {
+	struct ubifs_ch ch;
+	__le32 type;
+	__le32 len;
+	__u8 padding[32];
+	__u8 sig[];
+} __packed;
+
+/**
  * struct ubifs_branch - key/reference/length branch
  * @lnum: LEB number of the target node
  * @offs: offset within @lnum
  * @len: target node length
  * @key: key
+ *
+ * In an authenticated UBIFS we have the hash of the referenced node after @key.
+ * This can't be added to the struct type definition because @key is a
+ * dynamically sized element already.
  */
 struct ubifs_branch {
 	__le32 lnum;
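Because @key is already a flexible, per-filesystem-sized element, the branch hash cannot appear as a named member and instead lives at sizeof(struct ubifs_branch) + key_len bytes from the start of the branch; the ubifs_branch_hash() helper added to ubifs.h further down in this patch performs exactly that pointer arithmetic. A standalone sketch of the layout (the key_len parameter is per-filesystem; the simplified header type is a stand-in):

#include <stdint.h>

struct branch_hdr {		/* fixed part of struct ubifs_branch */
	uint32_t lnum;
	uint32_t offs;
	uint32_t len;
	/* followed by key[key_len], then hash[hash_len] */
};

static inline uint8_t *branch_hash(struct branch_hdr *br, int key_len)
{
	return (uint8_t *)br + sizeof(*br) + key_len;
}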
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4368cde..c55f212 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1,21 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -39,8 +27,10 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/random.h>
+#include <crypto/hash_info.h>
+#include <crypto/hash.h>
+#include <crypto/algapi.h>
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
 
 #include "ubifs-media.h"
@@ -139,7 +129,7 @@
  */
 #define WORST_COMPR_FACTOR 2
 
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
 #else
 #define UBIFS_CIPHER_BLOCK_SIZE 0
@@ -157,6 +147,14 @@
 /* Maximum number of data nodes to bulk-read */
 #define UBIFS_MAX_BULK_READ 32
 
+#ifdef CONFIG_UBIFS_FS_AUTHENTICATION
+#define UBIFS_HASH_ARR_SZ UBIFS_MAX_HASH_LEN
+#define UBIFS_HMAC_ARR_SZ UBIFS_MAX_HMAC_LEN
+#else
+#define UBIFS_HASH_ARR_SZ 0
+#define UBIFS_HMAC_ARR_SZ 0
+#endif
+
 /*
  * Lockdep classes for UBIFS inode @ui_mutex.
  */
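Sizing the embedded arrays with these macros means an unauthenticated build pays nothing for the feature: with CONFIG_UBIFS_FS_AUTHENTICATION=n the hash and HMAC members of zbranch and friends become zero-length arrays and add no bytes to the in-memory structures. An illustrative standalone check (relies on the GNU C zero-length array extension the kernel itself uses; the type is a simplified stand-in):

#include <assert.h>

#define HASH_ARR_SZ 0			/* as when CONFIG_UBIFS_FS_AUTHENTICATION=n */

struct zbranch_like {
	int lnum, offs, len;
	unsigned char hash[HASH_ARR_SZ];	/* occupies no storage */
};

int main(void)
{
	assert(sizeof(struct zbranch_like) == 3 * sizeof(int));
	return 0;
}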
@@ -706,6 +704,7 @@
  * @jhead: journal head number this bud belongs to
  * @list: link in the list buds belonging to the same journal head
  * @rb: link in the tree of all buds
+ * @log_hash: the log hash from the commit start node up to this bud
  */
 struct ubifs_bud {
 	int lnum;
@@ -713,6 +712,7 @@
 	int jhead;
 	struct list_head list;
 	struct rb_node rb;
+	struct shash_desc *log_hash;
 };
 
 /**
@@ -720,6 +720,7 @@
  * @wbuf: head's write-buffer
  * @buds_list: list of bud LEBs belonging to this journal head
  * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
+ * @log_hash: the log hash from the commit start node up to this journal head
  *
  * Note, the @buds list is protected by the @c->buds_lock.
  */
@@ -727,6 +728,7 @@
 	struct ubifs_wbuf wbuf;
 	struct list_head buds_list;
 	unsigned int grouped:1;
+	struct shash_desc *log_hash;
 };
 
 /**
@@ -736,6 +738,7 @@
  * @lnum: LEB number of the target node (indexing node or data node)
  * @offs: target node offset within @lnum
  * @len: target node length
+ * @hash: the hash of the target node
  */
 struct ubifs_zbranch {
 	union ubifs_key key;
@@ -746,12 +749,15 @@
 	int lnum;
 	int offs;
 	int len;
+	u8 hash[UBIFS_HASH_ARR_SZ];
 };
 
 /**
  * struct ubifs_znode - in-memory representation of an indexing node.
  * @parent: parent znode or NULL if it is the root
  * @cnext: next znode to commit
+ * @cparent: parent node for this commit
+ * @ciip: index in cparent's zbranch array
  * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
  * @time: last access time (seconds)
  * @level: level of the entry in the TNC tree
@@ -769,6 +775,8 @@
 struct ubifs_znode {
 	struct ubifs_znode *parent;
 	struct ubifs_znode *cnext;
+	struct ubifs_znode *cparent;
+	int ciip;
 	unsigned long flags;
 	time64_t time;
 	int level;
@@ -904,6 +912,8 @@
  * @rb: rb-tree node of rb-tree of orphans sorted by inode number
  * @list: list head of list of orphans in order added
  * @new_list: list head of list of orphans added since the last commit
+ * @child_list: list of xattr children if this orphan hosts xattrs, list head
+ *              if this orphan is an xattr; not used otherwise.
  * @cnext: next orphan to commit
  * @dnext: next orphan to delete
  * @inum: inode number
@@ -915,6 +925,7 @@
 	struct rb_node rb;
 	struct list_head list;
 	struct list_head new_list;
+	struct list_head child_list;
 	struct ubifs_orphan *cnext;
 	struct ubifs_orphan *dnext;
 	ino_t inum;
@@ -983,6 +994,7 @@
  * struct ubifs_info - UBIFS file-system description data structure
  * (per-superblock).
  * @vfs_sb: VFS @struct super_block object
+ * @sup_node: The super block node as read from the device
  *
  * @highest_inum: highest used inode number
  * @max_sqnum: current global sequence number
@@ -1028,6 +1040,7 @@
  * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
  * @rw_incompat: the media is not R/W compatible
  * @assert_action: action to take when a ubifs_assert() fails
+ * @authenticated: flag indicating the FS is mounted in authenticated mode
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -1075,6 +1088,7 @@
  * @key_hash: direntry key hash function
  * @key_fmt: key format
  * @key_len: key length
+ * @hash_len: The length of the index node hashes
  * @fanout: fanout of the index tree (number of links per indexing node)
  *
  * @min_io_size: minimal input/output unit size
@@ -1090,7 +1104,6 @@
  *                used to store indexing nodes (@leb_size - @max_idx_node_sz)
  * @leb_cnt: count of logical eraseblocks
  * @max_leb_cnt: maximum count of logical eraseblocks
- * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
  * @ro_mount: the file-system was mounted as read-only
  * @ro_error: UBIFS switched to R/O mode because an error happened
@@ -1210,6 +1223,15 @@
  * @rp_uid: reserved pool user ID
  * @rp_gid: reserved pool group ID
  *
+ * @hash_tfm: the hash transformation used for hashing nodes
+ * @hmac_tfm: the HMAC transformation for this filesystem
+ * @hmac_desc_len: length of the HMAC used for authentication
+ * @auth_key_name: the authentication key name
+ * @auth_hash_name: the name of the hash algorithm used for authentication
+ * @auth_hash_algo: the authentication hash used for this fs
+ * @log_hash: the log hash from the commit start node up to the latest reference
+ *            node.
+ *
  * @empty: %1 if the UBI device is empty
  * @need_recovery: %1 if the file-system needs recovery
  * @replaying: %1 during journal replay
@@ -1230,6 +1252,7 @@
  */
 struct ubifs_info {
 	struct super_block *vfs_sb;
+	struct ubifs_sb_node *sup_node;
 
 	ino_t highest_inum;
 	unsigned long long max_sqnum;
@@ -1270,6 +1293,8 @@
 	unsigned int default_compr:2;
 	unsigned int rw_incompat:1;
 	unsigned int assert_action:2;
+	unsigned int authenticated:1;
+	unsigned int superblock_need_write:1;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1314,6 +1339,7 @@
 	uint32_t (*key_hash)(const char *str, int len);
 	int key_fmt;
 	int key_len;
+	int hash_len;
 	int fanout;
 
 	int min_io_size;
@@ -1326,7 +1352,6 @@
 	int idx_leb_size;
 	int leb_cnt;
 	int max_leb_cnt;
-	int old_leb_cnt;
 	unsigned int ro_media:1;
 	unsigned int ro_mount:1;
 	unsigned int ro_error:1;
@@ -1441,6 +1466,15 @@
 	kuid_t rp_uid;
 	kgid_t rp_gid;
 
+	struct crypto_shash *hash_tfm;
+	struct crypto_shash *hmac_tfm;
+	int hmac_desc_len;
+	char *auth_key_name;
+	char *auth_hash_name;
+	enum hash_algo auth_hash_algo;
+
+	struct shash_desc *log_hash;
+
 	/* The below fields are used only during mounting and re-mounting */
 	unsigned int empty:1;
 	unsigned int need_recovery:1;
@@ -1471,6 +1505,198 @@
 extern const struct inode_operations ubifs_symlink_inode_operations;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
+/* auth.c */
+static inline int ubifs_authenticated(const struct ubifs_info *c)
+{
+	return IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION) && c->authenticated;
+}
+
+struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c);
+static inline struct shash_desc *ubifs_hash_get_desc(const struct ubifs_info *c)
+{
+	return ubifs_authenticated(c) ? __ubifs_hash_get_desc(c) : NULL;
+}
+
+static inline int ubifs_shash_init(const struct ubifs_info *c,
+				   struct shash_desc *desc)
+{
+	if (ubifs_authenticated(c))
+		return crypto_shash_init(desc);
+	else
+		return 0;
+}
+
+static inline int ubifs_shash_update(const struct ubifs_info *c,
+				      struct shash_desc *desc, const void *buf,
+				      unsigned int len)
+{
+	int err = 0;
+
+	if (ubifs_authenticated(c)) {
+		err = crypto_shash_update(desc, buf, len);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static inline int ubifs_shash_final(const struct ubifs_info *c,
+				    struct shash_desc *desc, u8 *out)
+{
+	return ubifs_authenticated(c) ? crypto_shash_final(desc, out) : 0;
+}
+
+int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *buf,
+			  u8 *hash);
+static inline int ubifs_node_calc_hash(const struct ubifs_info *c,
+					const void *buf, u8 *hash)
+{
+	if (ubifs_authenticated(c))
+		return __ubifs_node_calc_hash(c, buf, hash);
+	else
+		return 0;
+}
+
+int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
+			     struct shash_desc *inhash);
+
+/**
+ * ubifs_check_hash - compare two hashes
+ * @c: UBIFS file-system description object
+ * @expected: first hash
+ * @got: second hash
+ *
+ * Compare two hashes @expected and @got. Returns 0 when they are equal and
+ * non-zero otherwise.
+ */
+static inline int ubifs_check_hash(const struct ubifs_info *c,
+				   const u8 *expected, const u8 *got)
+{
+	return crypto_memneq(expected, got, c->hash_len);
+}
+
+/**
+ * ubifs_check_hmac - compare two HMACs
+ * @c: UBIFS file-system description object
+ * @expected: first HMAC
+ * @got: second HMAC
+ *
+ * Compare two HMACs @expected and @got. Returns 0 when they are equal and
+ * non-zero otherwise.
+ */
+static inline int ubifs_check_hmac(const struct ubifs_info *c,
+				   const u8 *expected, const u8 *got)
+{
+	return crypto_memneq(expected, got, c->hmac_desc_len);
+}
+
+void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
+		    const u8 *hash, int lnum, int offs);
+
+int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf,
+			  const u8 *expected);
+static inline int ubifs_node_check_hash(const struct ubifs_info *c,
+					const void *buf, const u8 *expected)
+{
+	if (ubifs_authenticated(c))
+		return __ubifs_node_check_hash(c, buf, expected);
+	else
+		return 0;
+}
+
+int ubifs_init_authentication(struct ubifs_info *c);
+void __ubifs_exit_authentication(struct ubifs_info *c);
+static inline void ubifs_exit_authentication(struct ubifs_info *c)
+{
+	if (ubifs_authenticated(c))
+		__ubifs_exit_authentication(c);
+}
+
+/**
+ * ubifs_branch_hash - returns a pointer to the hash of a branch
+ * @c: UBIFS file-system description object
+ * @br: branch to get the hash from
+ *
+ * This returns a pointer to the hash of a branch. Since the key already is a
+ * dynamically sized object we cannot use a struct member here.
+ */
+static inline u8 *ubifs_branch_hash(struct ubifs_info *c,
+				    struct ubifs_branch *br)
+{
+	return (void *)br + sizeof(*br) + c->key_len;
+}
+
+/**
+ * ubifs_copy_hash - copy a hash
+ * @c: UBIFS file-system description object
+ * @from: source hash
+ * @to: destination hash
+ *
+ * With authentication this copies a hash, otherwise does nothing.
+ */
+static inline void ubifs_copy_hash(const struct ubifs_info *c, const u8 *from,
+				   u8 *to)
+{
+	if (ubifs_authenticated(c))
+		memcpy(to, from, c->hash_len);
+}
+
+int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf,
+			      int len, int ofs_hmac);
+static inline int ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf,
+					  int len, int ofs_hmac)
+{
+	if (ubifs_authenticated(c))
+		return __ubifs_node_insert_hmac(c, buf, len, ofs_hmac);
+	else
+		return 0;
+}
+
+int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *buf,
+			     int len, int ofs_hmac);
+static inline int ubifs_node_verify_hmac(const struct ubifs_info *c,
+					 const void *buf, int len, int ofs_hmac)
+{
+	if (ubifs_authenticated(c))
+		return __ubifs_node_verify_hmac(c, buf, len, ofs_hmac);
+	else
+		return 0;
+}
+
+/**
+ * ubifs_auth_node_sz - returns the size of an authentication node
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the size of an authentication node, which is
+ * 0 for unauthenticated filesystems or the real size of an auth node
+ * when authentication is enabled.
+ */
+static inline int ubifs_auth_node_sz(const struct ubifs_info *c)
+{
+	if (ubifs_authenticated(c))
+		return sizeof(struct ubifs_auth_node) + c->hmac_desc_len;
+	else
+		return 0;
+}
+int ubifs_sb_verify_signature(struct ubifs_info *c,
+			      const struct ubifs_sb_node *sup);
+bool ubifs_hmac_zero(struct ubifs_info *c, const u8 *hmac);
+
+int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac);
+
+int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src,
+			     struct shash_desc *target);
+static inline int ubifs_shash_copy_state(const struct ubifs_info *c,
+					   struct shash_desc *src,
+					   struct shash_desc *target)
+{
+	if (ubifs_authenticated(c))
+		return __ubifs_shash_copy_state(c, src, target);
+	else
+		return 0;
+}
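Taken together, these wrappers keep call sites free of CONFIG checks: every helper degrades to a cheap no-op when the filesystem is not authenticated. A hedged usage sketch of hashing two buffers into one digest (error paths trimmed; the function and its buffers are hypothetical, but the wrapper calls are the ones declared above, and the allocate-then-kfree() descriptor pattern follows how UBIFS uses these helpers elsewhere):

static int example_hash_two_bufs(struct ubifs_info *c,
				 const void *a, int alen,
				 const void *b, int blen,
				 u8 *out /* c->hash_len bytes */)
{
	struct shash_desc *desc;
	int err;

	desc = ubifs_hash_get_desc(c);	/* NULL when not authenticated */
	if (IS_ERR(desc))
		return PTR_ERR(desc);

	err = ubifs_shash_update(c, desc, a, alen);
	if (!err)
		err = ubifs_shash_update(c, desc, b, blen);
	if (!err)
		err = ubifs_shash_final(c, desc, out);

	kfree(desc);
	return err;
}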
+
 /* io.c */
 void ubifs_ro_mode(struct ubifs_info *c, int err);
 int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
@@ -1490,9 +1716,15 @@
 			 int lnum, int offs);
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs);
+int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum,
+			  int offs, int hmac_offs);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		     int offs, int quiet, int must_chk_crc);
+void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad);
+void ubifs_crc_node(struct ubifs_info *c, void *buf, int len);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
+int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len,
+			    int hmac_offs, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
 void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
@@ -1592,11 +1824,12 @@
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
 		     void *node, int *lnum, int *offs);
 int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
-		  int offs, int len);
+		  int offs, int len, const u8 *hash);
 int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
 		      int old_lnum, int old_offs, int lnum, int offs, int len);
 int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
-		     int lnum, int offs, int len, const struct fscrypt_name *nm);
+		     int lnum, int offs, int len, const u8 *hash,
+		     const struct fscrypt_name *nm);
 int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
 int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
 			const struct fscrypt_name *nm);
@@ -1659,12 +1892,12 @@
 void ubifs_wait_for_commit(struct ubifs_info *c);
 
 /* master.c */
+int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2);
 int ubifs_read_master(struct ubifs_info *c);
 int ubifs_write_master(struct ubifs_info *c);
 
 /* sb.c */
 int ubifs_read_superblock(struct ubifs_info *c);
-struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
 int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
 int ubifs_fixup_free_space(struct ubifs_info *c);
 int ubifs_enable_encryption(struct ubifs_info *c);
@@ -1693,7 +1926,7 @@
 /* lpt.c */
 int ubifs_calc_lpt_geom(struct ubifs_info *c);
 int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
-			  int *lpt_lebs, int *big_lpt);
+			  int *lpt_lebs, int *big_lpt, u8 *hash);
 int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
 struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
 struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
@@ -1712,6 +1945,7 @@
 				    struct ubifs_nnode *parent, int iip);
 struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
 				    struct ubifs_nnode *parent, int iip);
+struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i);
 int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
 void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
 void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
@@ -1720,6 +1954,7 @@
 /* Needed only in debugging code in lpt_commit.c */
 int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
 		       struct ubifs_nnode *nnode);
+int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash);
 
 /* lpt_commit.c */
 int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1754,9 +1989,7 @@
 /* file.c */
 int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
-#ifdef CONFIG_UBIFS_ATIME_SUPPORT
 int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
-#endif
 
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
@@ -1775,9 +2008,14 @@
 
 #ifdef CONFIG_UBIFS_FS_XATTR
 void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum);
+int ubifs_purge_xattrs(struct inode *host);
 #else
 static inline void ubifs_evict_xattr_inode(struct ubifs_info *c,
 					   ino_t xattr_inum) { }
+static inline int ubifs_purge_xattrs(struct inode *host)
+{
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_UBIFS_FS_SECURITY
@@ -1807,7 +2045,7 @@
 int ubifs_rcvry_gc_commit(struct ubifs_info *c);
 int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
 			     int deletion, loff_t new_size);
-int ubifs_recover_size(struct ubifs_info *c);
+int ubifs_recover_size(struct ubifs_info *c, bool in_place);
 void ubifs_destroy_size_tree(struct ubifs_info *c);
 
 /* ioctl.c */
@@ -1829,7 +2067,7 @@
 #include "misc.h"
 #include "key.h"
 
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
 static inline int ubifs_encrypt(const struct inode *inode,
 				struct ubifs_data_node *dn,
 				unsigned int in_len, unsigned int *out_len,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index f5ad1ed..9aefbb6 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * This file is part of UBIFS.
  *
  * Copyright (C) 2006-2008 Nokia Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 51
- * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
  * Authors: Artem Bityutskiy (Битюцкий Артём)
  *          Adrian Hunter
  */
@@ -61,12 +49,6 @@
 #include <linux/xattr.h>
 
 /*
- * Limit the number of extended attributes per inode so that the total size
- * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.
- */
-#define MAX_XATTRS_PER_INODE 65535
-
-/*
  * Extended attribute type constants.
  *
  * USER_XATTR: user extended attribute ("user.*")
@@ -106,7 +88,7 @@
 				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
 				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
 
-	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
+	if (host_ui->xattr_cnt >= ubifs_xattr_max_cnt(c)) {
 		ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more",
 			  host->i_ino, host_ui->xattr_cnt);
 		return -ENOSPC;
@@ -507,6 +489,69 @@
 	return err;
 }
 
+int ubifs_purge_xattrs(struct inode *host)
+{
+	union ubifs_key key;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	struct inode *xino;
+	struct fscrypt_name nm = {0};
+	int err;
+
+	if (ubifs_inode(host)->xattr_cnt < ubifs_xattr_max_cnt(c))
+		return 0;
+
+	ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
+		   host->i_ino);
+
+	lowest_xent_key(c, &key, host->i_ino);
+	while (1) {
+		xent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			break;
+		}
+
+		fname_name(&nm) = xent->name;
+		fname_len(&nm) = le16_to_cpu(xent->nlen);
+
+		xino = ubifs_iget(c->vfs_sb, le64_to_cpu(xent->inum));
+		if (IS_ERR(xino)) {
+			err = PTR_ERR(xino);
+			ubifs_err(c, "dead directory entry '%s', error %d",
+				  xent->name, err);
+			ubifs_ro_mode(c, err);
+			kfree(pxent);
+			return err;
+		}
+
+		ubifs_assert(c, ubifs_inode(xino)->xattr);
+
+		clear_nlink(xino);
+		err = remove_xattr(c, host, xino, &nm);
+		if (err) {
+			kfree(pxent);
+			iput(xino);
+			ubifs_err(c, "cannot remove xattr, error %d", err);
+			return err;
+		}
+
+		iput(xino);
+
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key);
+	}
+
+	kfree(pxent);
+	if (err != -ENOENT) {
+		ubifs_err(c, "cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	return 0;
+}
+
 /**
  * ubifs_evict_xattr_inode - Evict an xattr inode.
  * @c: UBIFS file-system description object
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index aa41505..6848de5 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config UDF_FS
 	tristate "UDF file system support"
 	select CRC_ITU_T
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index eb880f6..63981cd 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the linux udf-filesystem routines.
 #
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index fcda0fc..02f03fa 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -175,8 +175,8 @@
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int alloc_count = 0;
-	int bit, block, block_group, group_start;
-	int nr_groups, bitmap_nr;
+	int bit, block, block_group;
+	int bitmap_nr;
 	struct buffer_head *bh;
 	__u32 part_len;
 
@@ -189,10 +189,8 @@
 		block_count = part_len - first_block;
 
 	do {
-		nr_groups = udf_compute_nr_groups(sb, partition);
 		block = first_block + (sizeof(struct spaceBitmapDesc) << 3);
 		block_group = block >> (sb->s_blocksize_bits + 3);
-		group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
 
 		bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
 		if (bitmap_nr < 0)
@@ -327,6 +325,17 @@
 	newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
 		(sizeof(struct spaceBitmapDesc) << 3);
 
+	if (newblock >= sbi->s_partmaps[partition].s_partition_len) {
+		/*
+		 * Ran off the end of the bitmap, and bits following are
+		 * non-compliant (not all zero)
+		 */
+		udf_err(sb, "bitmap for partition %d corrupted (block %u marked as free, partition length is %u)\n",
+			partition, newblock,
+			sbi->s_partmaps[partition].s_partition_len);
+		goto error_return;
+	}
+
 	if (!udf_clear_bit(bit, bh->b_data)) {
 		udf_debug("bit already cleared for block %d\n", bit);
 		goto repeat;
@@ -652,12 +661,6 @@
 	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
 		udf_table_free_blocks(sb, map->s_uspace.s_table,
 				      bloc, offset, count);
-	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
-				       bloc, offset, count);
-	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		udf_table_free_blocks(sb, map->s_fspace.s_table,
-				      bloc, offset, count);
 	}
 
 	if (inode) {
@@ -684,16 +687,6 @@
 						      map->s_uspace.s_table,
 						      partition, first_block,
 						      block_count);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		allocated = udf_bitmap_prealloc_blocks(sb,
-						       map->s_fspace.s_bitmap,
-						       partition, first_block,
-						       block_count);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		allocated = udf_table_prealloc_blocks(sb,
-						      map->s_fspace.s_table,
-						      partition, first_block,
-						      block_count);
 	else
 		return 0;
 
@@ -717,14 +710,6 @@
 		block = udf_table_new_block(sb,
 					    map->s_uspace.s_table,
 					    partition, goal, err);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		block = udf_bitmap_new_block(sb,
-					     map->s_fspace.s_bitmap,
-					     partition, goal, err);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		block = udf_table_new_block(sb,
-					    map->s_fspace.s_table,
-					    partition, goal, err);
 	else {
 		*err = -EIO;
 		return 0;
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 9f24bd1..fb7f2c7 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -88,6 +88,20 @@
 #define ENTITYID_FLAGS_DIRTY		0x00
 #define ENTITYID_FLAGS_PROTECTED	0x01
 
+/* OSTA UDF 2.1.5.2 */
+#define UDF_ID_COMPLIANT "*OSTA UDF Compliant"
+
+/* OSTA UDF 2.1.5.3 */
+struct domainEntityIDSuffix {
+	uint16_t	revision;
+	uint8_t		flags;
+	uint8_t		reserved[5];
+};
+
+/* OSTA UDF 2.1.5.3 */
+#define ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT 0
+#define ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT 1
+
 /* Volume Structure Descriptor (ECMA 167r3 2/9.1) */
 #define VSD_STD_ID_LEN			5
 struct volStructDesc {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index cd31e4f..628941a 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -280,6 +280,9 @@
 			return error;
 	}
 
+	if (attr->ia_valid & ATTR_MODE)
+		udf_update_extra_perms(inode, attr->ia_mode);
+
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index f8e5872..0adb407 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -118,6 +118,9 @@
 	iinfo->i_lenAlloc = 0;
 	iinfo->i_use = 0;
 	iinfo->i_checkpoint = 1;
+	iinfo->i_extraPerms = FE_PERM_U_CHATTR;
+	udf_update_extra_perms(inode, mode);
+
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
 	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5df554a..ea80036 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -45,6 +45,13 @@
 
 #define EXTENT_MERGE_SIZE 5
 
+#define FE_MAPPED_PERMS	(FE_PERM_U_READ | FE_PERM_U_WRITE | FE_PERM_U_EXEC | \
+			 FE_PERM_G_READ | FE_PERM_G_WRITE | FE_PERM_G_EXEC | \
+			 FE_PERM_O_READ | FE_PERM_O_WRITE | FE_PERM_O_EXEC)
+
+#define FE_DELETE_PERMS	(FE_PERM_U_DELETE | FE_PERM_G_DELETE | \
+			 FE_PERM_O_DELETE)
+
 static umode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
 static int udf_sync_inode(struct inode *inode);
@@ -470,13 +477,15 @@
 	return NULL;
 }
 
-/* Extend the file by 'blocks' blocks, return the number of extents added */
+/* Extend the file with new blocks totaling 'new_block_bytes',
+ * return the number of extents added
+ */
 static int udf_do_extend_file(struct inode *inode,
 			      struct extent_position *last_pos,
 			      struct kernel_long_ad *last_ext,
-			      sector_t blocks)
+			      loff_t new_block_bytes)
 {
-	sector_t add;
+	uint32_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
 	struct kernel_lb_addr prealloc_loc = {};
@@ -486,7 +495,7 @@
 
 	/* The previous extent is fake and we should not extend by anything
 	 * - there's nothing to do... */
-	if (!blocks && fake)
+	if (!new_block_bytes && fake)
 		return 0;
 
 	iinfo = UDF_I(inode);
@@ -517,13 +526,12 @@
 	/* Can we merge with the previous extent? */
 	if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
 					EXT_NOT_RECORDED_NOT_ALLOCATED) {
-		add = ((1 << 30) - sb->s_blocksize -
-			(last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >>
-			sb->s_blocksize_bits;
-		if (add > blocks)
-			add = blocks;
-		blocks -= add;
-		last_ext->extLength += add << sb->s_blocksize_bits;
+		add = (1 << 30) - sb->s_blocksize -
+			(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+		if (add > new_block_bytes)
+			add = new_block_bytes;
+		new_block_bytes -= add;
+		last_ext->extLength += add;
 	}
 
 	if (fake) {
@@ -544,28 +552,27 @@
 	}
 
 	/* Managed to do everything necessary? */
-	if (!blocks)
+	if (!new_block_bytes)
 		goto out;
 
 	/* All further extents will be NOT_RECORDED_NOT_ALLOCATED */
 	last_ext->extLocation.logicalBlockNum = 0;
 	last_ext->extLocation.partitionReferenceNum = 0;
-	add = (1 << (30-sb->s_blocksize_bits)) - 1;
-	last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
-				(add << sb->s_blocksize_bits);
+	add = (1 << 30) - sb->s_blocksize;
+	last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add;
 
 	/* Create enough extents to cover the whole hole */
-	while (blocks > add) {
-		blocks -= add;
+	while (new_block_bytes > add) {
+		new_block_bytes -= add;
 		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				   last_ext->extLength, 1);
 		if (err)
 			return err;
 		count++;
 	}
-	if (blocks) {
+	if (new_block_bytes) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
-			(blocks << sb->s_blocksize_bits);
+			new_block_bytes;
 		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				   last_ext->extLength, 1);
 		if (err)
@@ -596,6 +603,24 @@
 	return count;
 }
 
+/* Extend the final block of the file to final_block_len bytes */
+static void udf_do_extend_final_block(struct inode *inode,
+				      struct extent_position *last_pos,
+				      struct kernel_long_ad *last_ext,
+				      uint32_t final_block_len)
+{
+	struct super_block *sb = inode->i_sb;
+	uint32_t added_bytes;
+
+	added_bytes = final_block_len -
+		      (last_ext->extLength & (sb->s_blocksize - 1));
+	last_ext->extLength += added_bytes;
+	UDF_I(inode)->i_lenExtents += added_bytes;
+
+	udf_write_aext(inode, last_pos, &last_ext->extLocation,
+			last_ext->extLength, 1);
+}
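Switching udf_do_extend_file() from block to byte granularity leans on the same extent-length cap as before: the on-disk length field keeps its low 30 bits for the byte count, and since extents are created in whole blocks here, the largest usable extent is (1 << 30) - blocksize bytes. A standalone sketch of the resulting splitting arithmetic (the values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int max_ext = (1U << 30) - blocksize; /* largest aligned extent */
	unsigned long long hole = 3ULL << 30;	/* extend by a 3 GiB hole */
	unsigned int count = 0;

	while (hole > max_ext) {		/* full-sized extents */
		hole -= max_ext;
		count++;
	}
	if (hole)				/* remainder extent */
		count++;
	printf("%u extents needed\n", count);	/* prints 4 here */
	return 0;
}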
+
 static int udf_extend_file(struct inode *inode, loff_t newsize)
 {
 
@@ -605,10 +630,12 @@
 	int8_t etype;
 	struct super_block *sb = inode->i_sb;
 	sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
+	unsigned long partial_final_block;
 	int adsize;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 	struct kernel_long_ad extent;
-	int err;
+	int err = 0;
+	int within_final_block;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
 		adsize = sizeof(struct short_ad);
@@ -618,18 +645,8 @@
 		BUG();
 
 	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+	within_final_block = (etype != -1);
 
-	/* File has extent covering the new size (could happen when extending
-	 * inside a block)? */
-	if (etype != -1)
-		return 0;
-	if (newsize & (sb->s_blocksize - 1))
-		offset++;
-	/* Extended file just to the boundary of the last file block? */
-	if (offset == 0)
-		return 0;
-
-	/* Truncate is extending the file by 'offset' blocks */
 	if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
 	    (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
 		/* File has no extents at all or has empty last
@@ -643,7 +660,22 @@
 				      &extent.extLength, 0);
 		extent.extLength |= etype << 30;
 	}
-	err = udf_do_extend_file(inode, &epos, &extent, offset);
+
+	partial_final_block = newsize & (sb->s_blocksize - 1);
+
+	/* File has extent covering the new size (could happen when extending
+	 * inside a block)?
+	 */
+	if (within_final_block) {
+		/* Extending file within the last file block */
+		udf_do_extend_final_block(inode, &epos, &extent,
+					  partial_final_block);
+	} else {
+		loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
+			     partial_final_block;
+		err = udf_do_extend_file(inode, &epos, &extent, add);
+	}
+
 	if (err < 0)
 		goto out;
 	err = 0;
@@ -745,6 +777,7 @@
 	/* Are we beyond EOF? */
 	if (etype == -1) {
 		int ret;
+		loff_t hole_len;
 		isBeyondEOF = true;
 		if (count) {
 			if (c)
@@ -760,7 +793,8 @@
 			startnum = (offset > 0);
 		}
 		/* Create extents for the hole between EOF and offset */
-		ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
+		hole_len = (loff_t)offset << inode->i_blkbits;
+		ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
 		if (ret < 0) {
 			*err = ret;
 			newblock = 0;
@@ -1242,8 +1276,10 @@
 		truncate_setsize(inode, newsize);
 		down_write(&iinfo->i_data_sem);
 		udf_clear_extent_cache(inode);
-		udf_truncate_extents(inode);
+		err = udf_truncate_extents(inode);
 		up_write(&iinfo->i_data_sem);
+		if (err)
+			return err;
 	}
 update_time:
 	inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -1357,6 +1393,12 @@
 
 	iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) &
 							ICBTAG_FLAG_AD_MASK;
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT &&
+	    iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG &&
+	    iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		ret = -EIO;
+		goto out;
+	}
 	iinfo->i_unique = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenExtents = 0;
@@ -1423,6 +1465,8 @@
 	else
 		inode->i_mode = udf_convert_permissions(fe);
 	inode->i_mode &= ~sbi->s_umask;
+	iinfo->i_extraPerms = le32_to_cpu(fe->permissions) & ~FE_MAPPED_PERMS;
+
 	read_unlock(&sbi->s_cred_lock);
 
 	link_count = le16_to_cpu(fe->fileLinkCount);
@@ -1450,6 +1494,8 @@
 		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
 		iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
 		iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
+		iinfo->i_streamdir = 0;
+		iinfo->i_lenStreams = 0;
 	} else {
 		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
 		    (inode->i_sb->s_blocksize_bits - 9);
@@ -1463,6 +1509,16 @@
 		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
 		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
 		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
+
+		/* Named streams */
+		iinfo->i_streamdir = (efe->streamDirectoryICB.extLength != 0);
+		iinfo->i_locStreamdir =
+			lelb_to_cpu(efe->streamDirectoryICB.extLocation);
+		iinfo->i_lenStreams = le64_to_cpu(efe->objectSize);
+		if (iinfo->i_lenStreams >= inode->i_size)
+			iinfo->i_lenStreams -= inode->i_size;
+		else
+			iinfo->i_lenStreams = 0;
 	}
 	inode->i_generation = iinfo->i_unique;
 
@@ -1584,6 +1640,23 @@
 	return mode;
 }
 
+void udf_update_extra_perms(struct inode *inode, umode_t mode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	/*
+	 * UDF 2.01 sec. 3.3.3.3 Note 2:
+	 * In Unix, delete permission tracks write
+	 */
+	iinfo->i_extraPerms &= ~FE_DELETE_PERMS;
+	if (mode & 0200)
+		iinfo->i_extraPerms |= FE_PERM_U_DELETE;
+	if (mode & 0020)
+		iinfo->i_extraPerms |= FE_PERM_G_DELETE;
+	if (mode & 0002)
+		iinfo->i_extraPerms |= FE_PERM_O_DELETE;
+}
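Since delete permission shadows the write bits, a file with mode 0644 gets only FE_PERM_U_DELETE while 0666 sets all three delete flags. A quick standalone check of the mapping (the U/G/O constants are stand-ins for the FE_PERM_*_DELETE values defined in ecma_167.h):

#include <assert.h>

#define U_DELETE 0x4	/* stand-in for FE_PERM_U_DELETE */
#define G_DELETE 0x2	/* stand-in for FE_PERM_G_DELETE */
#define O_DELETE 0x1	/* stand-in for FE_PERM_O_DELETE */

static unsigned int delete_bits(unsigned int mode)
{
	unsigned int p = 0;

	if (mode & 0200)
		p |= U_DELETE;	/* owner write -> owner delete */
	if (mode & 0020)
		p |= G_DELETE;	/* group write -> group delete */
	if (mode & 0002)
		p |= O_DELETE;	/* other write -> other delete */
	return p;
}

int main(void)
{
	assert(delete_bits(0644) == U_DELETE);
	assert(delete_bits(0666) == (U_DELETE | G_DELETE | O_DELETE));
	return 0;
}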
+
 int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
@@ -1656,10 +1729,7 @@
 		   ((inode->i_mode & 0070) << 2) |
 		   ((inode->i_mode & 0700) << 4);
 
-	udfperms |= (le32_to_cpu(fe->permissions) &
-		    (FE_PERM_O_DELETE | FE_PERM_O_CHATTR |
-		     FE_PERM_G_DELETE | FE_PERM_G_CHATTR |
-		     FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
+	udfperms |= iinfo->i_extraPerms;
 	fe->permissions = cpu_to_le32(udfperms);
 
 	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
@@ -1725,9 +1795,19 @@
 		       iinfo->i_ext.i_data,
 		       inode->i_sb->s_blocksize -
 					sizeof(struct extendedFileEntry));
-		efe->objectSize = cpu_to_le64(inode->i_size);
+		efe->objectSize =
+			cpu_to_le64(inode->i_size + iinfo->i_lenStreams);
 		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
+		if (iinfo->i_streamdir) {
+			struct long_ad *icb_lad = &efe->streamDirectoryICB;
+
+			icb_lad->extLocation =
+				cpu_to_lelb(iinfo->i_locStreamdir);
+			icb_lad->extLength =
+				cpu_to_le32(inode->i_sb->s_blocksize);
+		}
+
 		udf_adjust_time(iinfo, inode->i_atime);
 		udf_adjust_time(iinfo, inode->i_mtime);
 		udf_adjust_time(iinfo, inode->i_ctime);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 58cc241..77b6d89 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -304,21 +304,6 @@
 	if (dentry->d_name.len > UDF_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-#ifdef UDF_RECOVERY
-	/* temporary shorthand for specifying files by inode number */
-	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-		struct kernel_lb_addr lb = {
-			.logicalBlockNum = 0,
-			.partitionReferenceNum =
-				simple_strtoul(dentry->d_name.name + 3,
-						NULL, 0),
-		};
-		inode = udf_iget(dir->i_sb, lb);
-		if (IS_ERR(inode))
-			return inode;
-	} else
-#endif /* UDF_RECOVERY */
-
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (IS_ERR(fi))
 		return ERR_CAST(fi);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index c495db7..8c28e93 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -92,10 +92,6 @@
 static int udf_sync_fs(struct super_block *, int);
 static int udf_remount_fs(struct super_block *, int *, char *);
 static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
-static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
-			    struct kernel_lb_addr *);
-static void udf_load_fileset(struct super_block *, struct buffer_head *,
-			     struct kernel_lb_addr *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
@@ -151,9 +147,11 @@
 
 	ei->i_unique = 0;
 	ei->i_lenExtents = 0;
+	ei->i_lenStreams = 0;
 	ei->i_next_alloc_block = 0;
 	ei->i_next_alloc_goal = 0;
 	ei->i_strat4096 = 0;
+	ei->i_streamdir = 0;
 	init_rwsem(&ei->i_data_sem);
 	ei->cached_extent.lstart = -1;
 	spin_lock_init(&ei->i_extent_cache_lock);
@@ -161,17 +159,11 @@
 	return &ei->vfs_inode;
 }
 
-static void udf_i_callback(struct rcu_head *head)
+static void udf_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
-static void udf_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, udf_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -206,7 +198,7 @@
 /* Superblock operations */
 static const struct super_operations udf_sb_ops = {
 	.alloc_inode	= udf_alloc_inode,
-	.destroy_inode	= udf_destroy_inode,
+	.free_inode	= udf_free_in_core_inode,
 	.write_inode	= udf_write_inode,
 	.evict_inode	= udf_evict_inode,
 	.put_super	= udf_put_super,
@@ -277,8 +269,7 @@
 	int nr_groups = bitmap->s_nr_groups;
 
 	for (i = 0; i < nr_groups; i++)
-		if (bitmap->s_block_bitmap[i])
-			brelse(bitmap->s_block_bitmap[i]);
+		brelse(bitmap->s_block_bitmap[i]);
 
 	kvfree(bitmap);
 }
@@ -290,12 +281,8 @@
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
 		iput(map->s_uspace.s_table);
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		iput(map->s_fspace.s_table);
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
 		udf_sb_free_bitmap(map->s_uspace.s_bitmap);
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		udf_sb_free_bitmap(map->s_fspace.s_bitmap);
 	if (map->s_partition_type == UDF_SPARABLE_MAP15)
 		for (i = 0; i < 4; i++)
 			brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
@@ -576,6 +563,11 @@
 			if (!remount) {
 				if (uopt->nls_map)
 					unload_nls(uopt->nls_map);
+				/*
+				 * load_nls() failure is handled later in
+				 * udf_fill_super() after all options are
+				 * parsed.
+				 */
 				uopt->nls_map = load_nls(args[0].from);
 				uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
 			}
@@ -651,16 +643,67 @@
 	return error;
 }
 
-/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
-/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
-static loff_t udf_check_vsd(struct super_block *sb)
+/*
+ * Check a Volume Structure Descriptor. Returns -1 when we are at the end
+ * of the volume recognition area, 0 if the descriptor is valid but
+ * uninteresting, and 1 if we found one of the NSR descriptors we are
+ * looking for.
+ */
+static int identify_vsd(const struct volStructDesc *vsd)
+{
+	int ret = 0;
+
+	if (!memcmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) {
+		switch (vsd->structType) {
+		case 0:
+			udf_debug("ISO9660 Boot Record found\n");
+			break;
+		case 1:
+			udf_debug("ISO9660 Primary Volume Descriptor found\n");
+			break;
+		case 2:
+			udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
+			break;
+		case 3:
+			udf_debug("ISO9660 Volume Partition Descriptor found\n");
+			break;
+		case 255:
+			udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
+			break;
+		default:
+			udf_debug("ISO9660 VRS (%u) found\n", vsd->structType);
+			break;
+		}
+	} else if (!memcmp(vsd->stdIdent, VSD_STD_ID_BEA01, VSD_STD_ID_LEN))
+		; /* ret = 0 */
+	else if (!memcmp(vsd->stdIdent, VSD_STD_ID_NSR02, VSD_STD_ID_LEN))
+		ret = 1;
+	else if (!memcmp(vsd->stdIdent, VSD_STD_ID_NSR03, VSD_STD_ID_LEN))
+		ret = 1;
+	else if (!memcmp(vsd->stdIdent, VSD_STD_ID_BOOT2, VSD_STD_ID_LEN))
+		; /* ret = 0 */
+	else if (!memcmp(vsd->stdIdent, VSD_STD_ID_CDW02, VSD_STD_ID_LEN))
+		; /* ret = 0 */
+	else {
+		/* TEA01 or invalid id : end of volume recognition area */
+		ret = -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Check Volume Structure Descriptors (ECMA 167 2/9.1)
+ * We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1)
+ * @return   1 if NSR02 or NSR03 found,
+ *	    -1 if first sector read error, 0 otherwise
+ */
+static int udf_check_vsd(struct super_block *sb)
 {
 	struct volStructDesc *vsd = NULL;
 	loff_t sector = VSD_FIRST_SECTOR_OFFSET;
 	int sectorsize;
 	struct buffer_head *bh = NULL;
-	int nsr02 = 0;
-	int nsr03 = 0;
+	int nsr = 0;
 	struct udf_sb_info *sbi;
 
 	sbi = UDF_SB(sb);
@@ -684,71 +727,36 @@
 	 * activity. This actually happened with uninitialised SSD partitions
 	 * (all 0xFF) before the check for the limit and all valid IDs were
 	 * added */
-	for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET;
-	     sector += sectorsize) {
+	for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) {
 		/* Read a block */
 		bh = udf_tread(sb, sector >> sb->s_blocksize_bits);
 		if (!bh)
 			break;
 
-		/* Look for ISO  descriptors */
 		vsd = (struct volStructDesc *)(bh->b_data +
 					      (sector & (sb->s_blocksize - 1)));
-
-		if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
-				    VSD_STD_ID_LEN)) {
-			switch (vsd->structType) {
-			case 0:
-				udf_debug("ISO9660 Boot Record found\n");
-				break;
-			case 1:
-				udf_debug("ISO9660 Primary Volume Descriptor found\n");
-				break;
-			case 2:
-				udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
-				break;
-			case 3:
-				udf_debug("ISO9660 Volume Partition Descriptor found\n");
-				break;
-			case 255:
-				udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
-				break;
-			default:
-				udf_debug("ISO9660 VRS (%u) found\n",
-					  vsd->structType);
-				break;
-			}
-		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BEA01,
-				    VSD_STD_ID_LEN))
-			; /* nothing */
-		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01,
-				    VSD_STD_ID_LEN)) {
+		nsr = identify_vsd(vsd);
+		/* Found NSR or end? */
+		if (nsr) {
 			brelse(bh);
 			break;
-		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02,
-				    VSD_STD_ID_LEN))
-			nsr02 = sector;
-		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03,
-				    VSD_STD_ID_LEN))
-			nsr03 = sector;
-		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2,
-				    VSD_STD_ID_LEN))
-			; /* nothing */
-		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02,
-				    VSD_STD_ID_LEN))
-			; /* nothing */
-		else {
-			/* invalid id : end of volume recognition area */
-			brelse(bh);
-			break;
+		}
+		/*
+		 * Special handling for improperly formatted VRS (e.g., Win10)
+		 * where components are separated by 2048 bytes even though
+		 * sectors are 4K
+		 */
+		if (sb->s_blocksize == 4096) {
+			nsr = identify_vsd(vsd + 1);
+			/* Ignore unknown IDs... */
+			if (nsr < 0)
+				nsr = 0;
 		}
 		brelse(bh);
 	}
 
-	if (nsr03)
-		return nsr03;
-	else if (nsr02)
-		return nsr02;
+	if (nsr > 0)
+		return 1;
 	else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
 			VSD_FIRST_SECTOR_OFFSET)
 		return -1;
@@ -756,34 +764,82 @@
 		return 0;
 }
 
+static int udf_verify_domain_identifier(struct super_block *sb,
+					struct regid *ident, char *dname)
+{
+	struct domainEntityIDSuffix *suffix;
+
+	if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) {
+		udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname);
+		goto force_ro;
+	}
+	if (ident->flags & (1 << ENTITYID_FLAGS_DIRTY)) {
+		udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n",
+			 dname);
+		goto force_ro;
+	}
+	suffix = (struct domainEntityIDSuffix *)ident->identSuffix;
+	if (suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT) ||
+	    suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT)) {
+		if (!sb_rdonly(sb))
+			udf_warn(sb, "Descriptor for %s marked write protected. Forcing read only mount.\n",
+				 dname);
+		goto force_ro;
+	}
+	return 0;
+
+force_ro:
+	if (!sb_rdonly(sb))
+		return -EACCES;
+	UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+	return 0;
+}
+
+static int udf_load_fileset(struct super_block *sb, struct fileSetDesc *fset,
+			    struct kernel_lb_addr *root)
+{
+	int ret;
+
+	ret = udf_verify_domain_identifier(sb, &fset->domainIdent, "file set");
+	if (ret < 0)
+		return ret;
+
+	*root = lelb_to_cpu(fset->rootDirectoryICB.extLocation);
+	UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum);
+
+	udf_debug("Rootdir at block=%u, partition=%u\n",
+		  root->logicalBlockNum, root->partitionReferenceNum);
+	return 0;
+}
+
 static int udf_find_fileset(struct super_block *sb,
 			    struct kernel_lb_addr *fileset,
 			    struct kernel_lb_addr *root)
 {
 	struct buffer_head *bh = NULL;
 	uint16_t ident;
+	int ret;
 
-	if (fileset->logicalBlockNum != 0xFFFFFFFF ||
-	    fileset->partitionReferenceNum != 0xFFFF) {
-		bh = udf_read_ptagged(sb, fileset, 0, &ident);
+	if (fileset->logicalBlockNum == 0xFFFFFFFF &&
+	    fileset->partitionReferenceNum == 0xFFFF)
+		return -EINVAL;
 
-		if (!bh) {
-			return 1;
-		} else if (ident != TAG_IDENT_FSD) {
-			brelse(bh);
-			return 1;
-		}
-
-		udf_debug("Fileset at block=%u, partition=%u\n",
-			  fileset->logicalBlockNum,
-			  fileset->partitionReferenceNum);
-
-		UDF_SB(sb)->s_partition = fileset->partitionReferenceNum;
-		udf_load_fileset(sb, bh, root);
+	bh = udf_read_ptagged(sb, fileset, 0, &ident);
+	if (!bh)
+		return -EIO;
+	if (ident != TAG_IDENT_FSD) {
 		brelse(bh);
-		return 0;
+		return -EINVAL;
 	}
-	return 1;
+
+	udf_debug("Fileset at block=%u, partition=%u\n",
+		  fileset->logicalBlockNum, fileset->partitionReferenceNum);
+
+	UDF_SB(sb)->s_partition = fileset->partitionReferenceNum;
+	ret = udf_load_fileset(sb, (struct fileSetDesc *)bh->b_data, root);
+	brelse(bh);
+	return ret;
 }
 
 /*
@@ -799,9 +855,7 @@
 	struct buffer_head *bh;
 	uint16_t ident;
 	int ret = -ENOMEM;
-#ifdef UDFFS_DEBUG
 	struct timestamp *ts;
-#endif
 
 	outstr = kmalloc(128, GFP_NOFS);
 	if (!outstr)
@@ -822,13 +876,10 @@
 
 	udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
 			      pvoldesc->recordingDateAndTime);
-#ifdef UDFFS_DEBUG
 	ts = &pvoldesc->recordingDateAndTime;
 	udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
 		  le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
 		  ts->minute, le16_to_cpu(ts->typeAndTimezone));
-#endif
-
 
 	ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
 	if (ret < 0) {
@@ -944,21 +995,6 @@
 	return 0;
 }
 
-static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
-			     struct kernel_lb_addr *root)
-{
-	struct fileSetDesc *fset;
-
-	fset = (struct fileSetDesc *)bh->b_data;
-
-	*root = lelb_to_cpu(fset->rootDirectoryICB.extLocation);
-
-	UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum);
-
-	udf_debug("Rootdir at block=%u, partition=%u\n",
-		  root->logicalBlockNum, root->partitionReferenceNum);
-}
-
 int udf_compute_nr_groups(struct super_block *sb, u32 partition)
 {
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
@@ -989,12 +1025,62 @@
 	return bitmap;
 }
 
+static int check_partition_desc(struct super_block *sb,
+				struct partitionDesc *p,
+				struct udf_part_map *map)
+{
+	bool umap, utable, fmap, ftable;
+	struct partitionHeaderDesc *phd;
+
+	switch (le32_to_cpu(p->accessType)) {
+	case PD_ACCESS_TYPE_READ_ONLY:
+	case PD_ACCESS_TYPE_WRITE_ONCE:
+	case PD_ACCESS_TYPE_REWRITABLE:
+	case PD_ACCESS_TYPE_NONE:
+		goto force_ro;
+	}
+
+	/* No Partition Header Descriptor? */
+	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+		goto force_ro;
+
+	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+	utable = phd->unallocSpaceTable.extLength;
+	umap = phd->unallocSpaceBitmap.extLength;
+	ftable = phd->freedSpaceTable.extLength;
+	fmap = phd->freedSpaceBitmap.extLength;
+
+	/* No allocation info? */
+	if (!utable && !umap && !ftable && !fmap)
+		goto force_ro;
+
+	/* We don't support blocks that require erasing before overwrite */
+	if (ftable || fmap)
+		goto force_ro;
+	/* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */
+	if (utable && umap)
+		goto force_ro;
+
+	if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+	    map->s_partition_type == UDF_VIRTUAL_MAP20)
+		goto force_ro;
+
+	return 0;
+force_ro:
+	if (!sb_rdonly(sb))
+		return -EACCES;
+	UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+	return 0;
+}
+
 static int udf_fill_partdesc_info(struct super_block *sb,
 		struct partitionDesc *p, int p_index)
 {
 	struct udf_part_map *map;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct partitionHeaderDesc *phd;
+	int err;
 
 	map = &sbi->s_partmaps[p_index];
 
@@ -1014,8 +1100,16 @@
 		  p_index, map->s_partition_type,
 		  map->s_partition_root, map->s_partition_len);
 
-	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
-	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+	err = check_partition_desc(sb, p, map);
+	if (err)
+		return err;
+
+	/*
+	 * Skip loading allocation info if we cannot ever write to the fs.
+	 * This is needed for correctness: we may have forced a read-only
+	 * mount precisely to avoid allocation info we don't support.
+	 */
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
 		return 0;
 
 	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
@@ -1051,40 +1145,6 @@
 			  p_index, bitmap->s_extPosition);
 	}
 
-	if (phd->partitionIntegrityTable.extLength)
-		udf_debug("partitionIntegrityTable (part %d)\n", p_index);
-
-	if (phd->freedSpaceTable.extLength) {
-		struct kernel_lb_addr loc = {
-			.logicalBlockNum = le32_to_cpu(
-				phd->freedSpaceTable.extPosition),
-			.partitionReferenceNum = p_index,
-		};
-		struct inode *inode;
-
-		inode = udf_iget_special(sb, &loc);
-		if (IS_ERR(inode)) {
-			udf_debug("cannot load freedSpaceTable (part %d)\n",
-				  p_index);
-			return PTR_ERR(inode);
-		}
-		map->s_fspace.s_table = inode;
-		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
-		udf_debug("freedSpaceTable (part %d) @ %lu\n",
-			  p_index, map->s_fspace.s_table->i_ino);
-	}
-
-	if (phd->freedSpaceBitmap.extLength) {
-		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
-		if (!bitmap)
-			return -ENOMEM;
-		map->s_fspace.s_bitmap = bitmap;
-		bitmap->s_extPosition = le32_to_cpu(
-				phd->freedSpaceBitmap.extPosition);
-		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
-		udf_debug("freedSpaceBitmap (part %d) @ %u\n",
-			  p_index, bitmap->s_extPosition);
-	}
 	return 0;
 }
 
@@ -1219,9 +1279,7 @@
 	 * PHYSICAL partitions are already set up
 	 */
 	type1_idx = i;
-#ifdef UDFFS_DEBUG
 	map = NULL; /* suppress 'maybe used uninitialized' warning */
-#endif
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 
@@ -1345,6 +1403,10 @@
 		goto out_bh;
 	}
 
+	ret = udf_verify_domain_identifier(sb, &lvd->domainIdent,
+					   "logical volume");
+	if (ret)
+		goto out_bh;
 	ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
 	if (ret)
 		goto out_bh;
@@ -1454,6 +1516,17 @@
 	if (lvd->integritySeqExt.extLength)
 		udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
 	ret = 0;
+
+	if (!sbi->s_lvid_bh) {
+		/* We can't generate unique IDs without a valid LVID */
+		if (sb_rdonly(sb)) {
+			UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+		} else {
+			udf_warn(sb, "Damaged or missing LVID, forcing "
+				     "readonly mount\n");
+			ret = -EACCES;
+		}
+	}
 out_bh:
 	brelse(bh);
 	return ret;
@@ -1885,7 +1958,7 @@
 			int silent, struct kernel_lb_addr *fileset)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
-	loff_t nsr_off;
+	int nsr = 0;
 	int ret;
 
 	if (!sb_set_blocksize(sb, uopt->blocksize)) {
@@ -1896,13 +1969,13 @@
 	sbi->s_last_block = uopt->lastblock;
 	if (!uopt->novrs) {
 		/* Check that it is NSR02 compliant */
-		nsr_off = udf_check_vsd(sb);
-		if (!nsr_off) {
+		nsr = udf_check_vsd(sb);
+		if (!nsr) {
 			if (!silent)
 				udf_warn(sb, "No VRS found\n");
 			return -EINVAL;
 		}
-		if (nsr_off == -1)
+		if (nsr == -1)
 			udf_debug("Failed to read sector at offset %d. "
 				  "Assuming open disc. Skipping validity "
 				  "check\n", VSD_FIRST_SECTOR_OFFSET);
@@ -1923,13 +1996,24 @@
 	return 0;
 }
 
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+	struct timespec64 ts;
+
+	ktime_get_real_ts64(&ts);
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+	lvid->descTag.descCRC = cpu_to_le16(
+		crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+			le16_to_cpu(lvid->descTag.descCRCLength)));
+	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
 static void udf_open_lvid(struct super_block *sb)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
 	struct logicalVolIntegrityDescImpUse *lvidiu;
-	struct timespec64 ts;
 
 	if (!bh)
 		return;
@@ -1941,18 +2025,12 @@
 	mutex_lock(&sbi->s_alloc_mutex);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-	ktime_get_real_ts64(&ts);
-	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
 	if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
 		lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
 	else
 		UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
 
-	lvid->descTag.descCRC = cpu_to_le16(
-		crc_itu_t(0, (char *)lvid + sizeof(struct tag),
-			le16_to_cpu(lvid->descTag.descCRCLength)));
-
-	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+	udf_finalize_lvid(lvid);
 	mark_buffer_dirty(bh);
 	sbi->s_lvid_dirty = 0;
 	mutex_unlock(&sbi->s_alloc_mutex);
@@ -1966,7 +2044,6 @@
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
 	struct logicalVolIntegrityDescImpUse *lvidiu;
-	struct timespec64 ts;
 
 	if (!bh)
 		return;
@@ -1978,8 +2055,6 @@
 	mutex_lock(&sbi->s_alloc_mutex);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-	ktime_get_real_ts64(&ts);
-	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
 	if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
 		lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
 	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -1989,17 +2064,13 @@
 	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
 		lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
 
-	lvid->descTag.descCRC = cpu_to_le16(
-			crc_itu_t(0, (char *)lvid + sizeof(struct tag),
-				le16_to_cpu(lvid->descTag.descCRCLength)));
-
-	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
 	/*
 	 * We set buffer uptodate unconditionally here to avoid spurious
 	 * warnings from mark_buffer_dirty() when previous EIO has marked
 	 * the buffer as !uptodate
 	 */
 	set_buffer_uptodate(bh);
+	udf_finalize_lvid(lvid);
 	mark_buffer_dirty(bh);
 	sbi->s_lvid_dirty = 0;
 	mutex_unlock(&sbi->s_alloc_mutex);
@@ -2028,8 +2099,8 @@
 	if (!(++uniqueID & 0xFFFFFFFF))
 		uniqueID += 16;
 	lvhd->uniqueID = cpu_to_le64(uniqueID);
+	udf_updated_lvid(sb);
 	mutex_unlock(&sbi->s_alloc_mutex);
-	mark_buffer_dirty(bh);
 
 	return ret;
 }
@@ -2188,9 +2259,9 @@
 		UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
 	}
 
-	if (udf_find_fileset(sb, &fileset, &rootdir)) {
+	ret = udf_find_fileset(sb, &fileset, &rootdir);
+	if (ret < 0) {
 		udf_warn(sb, "No fileset found\n");
-		ret = -EINVAL;
 		goto error_out;
 	}
 
@@ -2300,11 +2371,17 @@
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	if (sbi->s_lvid_dirty) {
+		struct buffer_head *bh = sbi->s_lvid_bh;
+		struct logicalVolIntegrityDesc *lvid;
+
+		lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+		udf_finalize_lvid(lvid);
+
 		/*
 		 * Blockdevice will be synced later so we don't have to submit
 		 * the buffer for IO
 		 */
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		mark_buffer_dirty(bh);
 		sbi->s_lvid_dirty = 0;
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
@@ -2439,10 +2516,6 @@
 		accum += udf_count_free_bitmap(sb,
 					       map->s_uspace.s_bitmap);
 	}
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		accum += udf_count_free_bitmap(sb,
-					       map->s_fspace.s_bitmap);
-	}
 	if (accum)
 		return accum;
 
@@ -2450,11 +2523,6 @@
 		accum += udf_count_free_table(sb,
 					      map->s_uspace.s_table);
 	}
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		accum += udf_count_free_table(sb,
-					      map->s_fspace.s_table);
-	}
-
 	return accum;
 }
 
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index b647f0b..63a47f1 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -199,7 +199,7 @@
  * for making file shorter. For making file longer, udf_extend_file() has to
  * be used.
  */
-void udf_truncate_extents(struct inode *inode)
+int udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
 	struct kernel_lb_addr eloc, neloc = {};
@@ -224,7 +224,7 @@
 	if (etype == -1) {
 		/* We should extend the file? */
 		WARN_ON(byte_offset);
-		return;
+		return 0;
 	}
 	epos.offset -= adsize;
 	extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
@@ -260,6 +260,9 @@
 			epos.block = eloc;
 			epos.bh = udf_tread(sb,
 					udf_get_lb_pblock(sb, &eloc, 0));
+			/* Error reading indirect block? */
+			if (!epos.bh)
+				return -EIO;
 			if (elen)
 				indirect_ext_len =
 					(elen + sb->s_blocksize - 1) >>
@@ -283,4 +286,5 @@
 	iinfo->i_lenExtents = inode->i_size;
 
 	brelse(epos.bh);
+	return 0;
 }
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 2ef0e21..4245d1f 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -38,16 +38,20 @@
 	__u32			i_next_alloc_block;
 	__u32			i_next_alloc_goal;
 	__u32			i_checkpoint;
+	__u32			i_extraPerms;
 	unsigned		i_alloc_type : 3;
 	unsigned		i_efe : 1;	/* extendedFileEntry */
 	unsigned		i_use : 1;	/* unallocSpaceEntry */
 	unsigned		i_strat4096 : 1;
-	unsigned		reserved : 26;
+	unsigned		i_streamdir : 1;
+	unsigned		reserved : 25;
 	union {
 		struct short_ad	*i_sad;
 		struct long_ad		*i_lad;
 		__u8		*i_data;
 	} i_ext;
+	struct kernel_lb_addr	i_locStreamdir;
+	__u64			i_lenStreams;
 	struct rw_semaphore	i_data_sem;
 	struct udf_ext_cache cached_extent;
 	/* Spinlock for protecting extent cache */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d12e507..3d83be5 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -35,8 +35,6 @@
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
-#define UDF_PART_FLAG_FREED_BITMAP	0x0004
-#define UDF_PART_FLAG_FREED_TABLE	0x0008
 #define UDF_PART_FLAG_READ_ONLY		0x0010
 #define UDF_PART_FLAG_WRITE_ONCE	0x0020
 #define UDF_PART_FLAG_REWRITABLE	0x0040
@@ -52,8 +50,6 @@
 
 #define UDF_INVALID_MODE		((umode_t)-1)
 
-#pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
-
 #define MF_DUPLICATE_MD		0x01
 #define MF_MIRROR_FE_LOADED	0x02
 
@@ -95,10 +91,6 @@
 		struct udf_bitmap	*s_bitmap;
 		struct inode		*s_table;
 	} s_uspace;
-	union {
-		struct udf_bitmap	*s_bitmap;
-		struct inode		*s_table;
-	} s_fspace;
 	__u32	s_partition_root;
 	__u32	s_partition_len;
 	__u16	s_partition_type;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ee24676..9dd0814 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -31,16 +31,8 @@
 #define udf_info(fmt, ...)					\
 	pr_info("INFO " fmt, ##__VA_ARGS__)
 
-#undef UDFFS_DEBUG
-
-#ifdef UDFFS_DEBUG
 #define udf_debug(fmt, ...)					\
-	printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt),		\
-	       __FILE__, __LINE__, __func__, ##__VA_ARGS__)
-#else
-#define udf_debug(fmt, ...)					\
-	no_printk(fmt, ##__VA_ARGS__)
-#endif
+	pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__)
 
 #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
 #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
@@ -178,6 +170,7 @@
 			    struct kernel_lb_addr *, uint32_t *, int);
 extern int8_t udf_current_aext(struct inode *, struct extent_position *,
 			       struct kernel_lb_addr *, uint32_t *, int);
+extern void udf_update_extra_perms(struct inode *inode, umode_t mode);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *sb,
@@ -235,7 +228,7 @@
 /* truncate.c */
 extern void udf_truncate_tail_extent(struct inode *);
 extern void udf_discard_prealloc(struct inode *);
-extern void udf_truncate_extents(struct inode *);
+extern int udf_truncate_extents(struct inode *);
 
 /* balloc.c */
 extern void udf_free_blocks(struct super_block *, struct inode *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 0bf6e16..6d30adb 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config UFS_FS
 	tristate "UFS file system support (read only)"
 	depends on BLOCK
@@ -8,7 +9,7 @@
 	  this file system as well. Saying Y here will allow you to read from
 	  these partitions; if you also want to write to them, say Y to the
 	  experimental "UFS file system write support", below. Please read the
-	  file <file:Documentation/filesystems/ufs.txt> for more information.
+	  file <file:Documentation/admin-guide/ufs.rst> for more information.
 
           The recently released UFS2 variant (used in FreeBSD 5.x) is
           READ-ONLY supported.
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index ec4a6b4..042344c 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Linux ufs filesystem routines.
 #
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index a4e07e9..1da0be6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/ufs/super.c
  *
@@ -842,6 +843,10 @@
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
+	sb->s_time_gran = NSEC_PER_SEC;
+	sb->s_time_min = S32_MIN;
+	sb->s_time_max = S32_MAX;
+
 	switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
 	case UFS_MOUNT_UFSTYPE_44BSD:
 		UFSD("ufstype=44bsd\n");
@@ -860,6 +865,9 @@
 		uspi->s_fshift = 9;
 		uspi->s_sbsize = super_block_size = 1536;
 		uspi->s_sbbase =  0;
+		sb->s_time_gran = 1;
+		sb->s_time_min = S64_MIN;
+		sb->s_time_max = S64_MAX;
 		flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
 		break;
 		
@@ -1406,11 +1414,9 @@
 	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
 	unsigned  flags = UFS_SB(sb)->s_flags;
-	struct ufs_super_block_third *usb3;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	mutex_lock(&UFS_SB(sb)->s_lock);
-	usb3 = ubh_get_usb_third(uspi);
 	
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		buf->f_type = UFS2_MAGIC;
@@ -1449,17 +1455,11 @@
 	return &ei->vfs_inode;
 }
 
-static void ufs_i_callback(struct rcu_head *head)
+static void ufs_free_in_core_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
-static void ufs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, ufs_i_callback);
-}
-
 static void init_once(void *foo)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
@@ -1494,7 +1494,7 @@
 
 static const struct super_operations ufs_super_ops = {
 	.alloc_inode	= ufs_alloc_inode,
-	.destroy_inode	= ufs_destroy_inode,
+	.free_inode	= ufs_free_in_core_inode,
 	.write_inode	= ufs_write_inode,
 	.evict_inode	= ufs_evict_inode,
 	.put_super	= ufs_put_super,
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1fd3011..e1f1b2e 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -42,7 +42,7 @@
 	case UFS_ST_SUNOS:
 		if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT)
 			return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state);
-		/* Fall Through to UFS_ST_SUN */
+		/* Fall Through - to UFS_ST_SUN */
 	case UFS_ST_SUN:
 		return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
 	case UFS_ST_SUNx86:
@@ -63,7 +63,7 @@
 			usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value);
 			break;
 		}
-		/* Fall Through to UFS_ST_SUN */
+		/* Fall Through - to UFS_ST_SUN */
 	case UFS_ST_SUN:
 		usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
 		break;
@@ -229,7 +229,7 @@
 	case UFS_UID_44BSD:
 		return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid);
 	case UFS_UID_EFT:
-		if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
+		if (inode->ui_u1.oldids.ui_sgid == 0xFFFF)
 			return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid);
 		/* Fall through */
 	default:
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore
new file mode 100644
index 0000000..0381e22
--- /dev/null
+++ b/fs/unicode/.gitignore
@@ -0,0 +1,2 @@
+mkutf8data
+utf8data.h
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
new file mode 100644
index 0000000..2c27b9a
--- /dev/null
+++ b/fs/unicode/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# UTF-8 normalization
+#
+config UNICODE
+	bool "UTF-8 normalization and casefolding support"
+	help
+	  Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
+	  support.
+
+config UNICODE_NORMALIZATION_SELFTEST
+	tristate "Test UTF-8 normalization support"
+	depends on UNICODE
+	default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
new file mode 100644
index 0000000..d46e9ba
--- /dev/null
+++ b/fs/unicode/Makefile
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
+
+unicode-y := utf8-norm.o utf8-core.o
+
+$(obj)/utf8-norm.o: $(obj)/utf8data.h
+
+# In the normal build, utf8data.h is just copied from the checked-in
+# utf8data.h_shipped file.
+#
+# To generate utf8data.h from UCD, put *.txt files in this directory
+# and pass REGENERATE_UTF8DATA=1 from the command line.
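+#
+# For example, from the top of the kernel tree (the same invocation
+# documented in README.utf8data):
+#
+#	make REGENERATE_UTF8DATA=1 fs/unicode/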
+ifdef REGENERATE_UTF8DATA
+
+quiet_cmd_utf8data = GEN     $@
+      cmd_utf8data = $< \
+		-a $(srctree)/$(src)/DerivedAge.txt \
+		-c $(srctree)/$(src)/DerivedCombiningClass.txt \
+		-p $(srctree)/$(src)/DerivedCoreProperties.txt \
+		-d $(srctree)/$(src)/UnicodeData.txt \
+		-f $(srctree)/$(src)/CaseFolding.txt \
+		-n $(srctree)/$(src)/NormalizationCorrections.txt \
+		-t $(srctree)/$(src)/NormalizationTest.txt \
+		-o $@
+
+$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
+	$(call if_changed,utf8data)
+
+else
+
+$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
+	$(call if_changed,shipped)
+
+endif
+
+targets += utf8data.h
+hostprogs-y += mkutf8data
diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data
new file mode 100644
index 0000000..c737868
--- /dev/null
+++ b/fs/unicode/README.utf8data
@@ -0,0 +1,57 @@
+The utf8data.h file in this directory is generated from the Unicode
+Character Database for version 12.1.0 of the Unicode standard.
+
+The full set of files can be found here:
+
+  http://www.unicode.org/Public/12.1.0/ucd/
+
+Individual source links:
+
+  https://www.unicode.org/Public/12.1.0/ucd/CaseFolding.txt
+  https://www.unicode.org/Public/12.1.0/ucd/DerivedAge.txt
+  https://www.unicode.org/Public/12.1.0/ucd/extracted/DerivedCombiningClass.txt
+  https://www.unicode.org/Public/12.1.0/ucd/DerivedCoreProperties.txt
+  https://www.unicode.org/Public/12.1.0/ucd/NormalizationCorrections.txt
+  https://www.unicode.org/Public/12.1.0/ucd/NormalizationTest.txt
+  https://www.unicode.org/Public/12.1.0/ucd/UnicodeData.txt
+
+md5sums (verify by running "md5sum -c README.utf8data"):
+
+  900e76da1d822a160fd6b8c0b1d70094  CaseFolding.txt
+  131256380bff4fea8ad4a851616f2f10  DerivedAge.txt
+  e731a4089b30002144e107e3d6f8d1fa  DerivedCombiningClass.txt
+  a47c9fbd7ff92a9b261ba9831e68778a  DerivedCoreProperties.txt
+  fcab6dad15e440879d92f315978f93d3  NormalizationCorrections.txt
+  f9ff1c55a60decf436100f791b44aa98  NormalizationTest.txt
+  755f6af699f8c8d2d958da411f78f6c6  UnicodeData.txt
+
+sha1sums (verify by running "sha1sum -c README.utf8data"):
+
+  dc9245f6803c4ac99555c361f5052e0b13eb779b  CaseFolding.txt
+  3281104f237184cdb5d869e86eb8573678ada7da  DerivedAge.txt
+  2f5f995ccb96e0fa84b15151b35d5e2681535175  DerivedCombiningClass.txt
+  5b8698a3fcd5018e1987f296b02e2c17e696415e  DerivedCoreProperties.txt
+  cd83935fbc012345d8792d2c704f69497e753835  NormalizationCorrections.txt
+  ea419aae505b337b0d99a83fa83fe58ddff7c19f  NormalizationTest.txt
+  dc973c0fc93d6f09d9ab9f70d1c9f89c447f0526  UnicodeData.txt
+
+
+To update to a newer version of the Unicode standard, the latest
+released version of the UCD can be found here:
+
+  http://www.unicode.org/Public/UCD/latest/
+
+Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1:
+
+	make REGENERATE_UTF8DATA=1 fs/unicode/
+
+After sanity-checking the newly generated utf8data.h file (the
+version generated from the 12.1.0 UCD should be 4,109 lines long, and
+have a total size of 324k) and/or comparing it with the older version
+of utf8data.h_shipped, rename it to utf8data.h_shipped.
+
+If you are a kernel developer updating to a newer version of the
+Unicode Character Database, please update this README.utf8data file
+with the version of the UCD that was used and the md5sums and
+sha1sums of the *.txt files before checking in the new versions of
+the utf8data.h_shipped and README.utf8data files.
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
new file mode 100644
index 0000000..ff2025a
--- /dev/null
+++ b/fs/unicode/mkutf8data.c
@@ -0,0 +1,3419 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME	"DerivedAge.txt"
+#define CCC_NAME	"DerivedCombiningClass.txt"
+#define PROP_NAME	"DerivedCoreProperties.txt"
+#define DATA_NAME	"UnicodeData.txt"
+#define FOLD_NAME	"CaseFolding.txt"
+#define NORM_NAME	"NormalizationCorrections.txt"
+#define TEST_NAME	"NormalizationTest.txt"
+#define UTF8_NAME	"utf8data.h"
+
+const char	*age_name  = AGE_NAME;
+const char	*ccc_name  = CCC_NAME;
+const char	*prop_name = PROP_NAME;
+const char	*data_name = DATA_NAME;
+const char	*fold_name = FOLD_NAME;
+const char	*norm_name = NORM_NAME;
+const char	*test_name = TEST_NAME;
+const char	*utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE	1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision.  These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly; instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT		(16)
+#define UNICODE_MIN_SHIFT		(8)
+
+#define UNICODE_MAJ_MAX			((unsigned short)-1)
+#define UNICODE_MIN_MAX			((unsigned char)-1)
+#define UNICODE_REV_MAX			((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV)			\
+	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
+	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
+	 ((unsigned int)(REV)))
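+
+/*
+ * For example, UNICODE_AGE(12, 1, 0) packs to
+ * (12 << 16) | (1 << 8) | 0 == 0x0c0100.  A hypothetical helper to
+ * unpack such a value (a sketch for illustration; the generator does
+ * not need or define it):
+ */
+#if 0
+static void unicode_age_unpack(unsigned int age, unsigned int *maj,
+			       unsigned int *min, unsigned int *rev)
+{
+	*maj = age >> UNICODE_MAJ_SHIFT;
+	*min = (age >> UNICODE_MIN_SHIFT) & UNICODE_MIN_MAX;
+	*rev = age & UNICODE_REV_MAX;
+}
+#endif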
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+static int age_valid(unsigned int major, unsigned int minor,
+		     unsigned int revision)
+{
+	if (major > UNICODE_MAJ_MAX)
+		return 0;
+	if (minor > UNICODE_MIN_MAX)
+		return 0;
+	if (revision > UNICODE_REV_MAX)
+		return 0;
+	return 1;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree.  The first byte contains the
+ * following information:
+ *  NEXTBYTE  - flag        - advance to next byte if set
+ *  BITNUM    - 3 bit field - the bit number to be tested
+ *  OFFLEN    - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ *  RIGHTPATH - 1 bit field - set if the following node is for the
+ *                            right-hand path (tested bit is set)
+ *  TRIENODE  - 1 bit field - set if the following node is an internal
+ *                            node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
+ *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a right-hand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM		0x07
+#define NEXTBYTE	0x08
+#define OFFLEN		0x30
+#define OFFLEN_SHIFT	4
+#define RIGHTPATH	0x40
+#define TRIENODE	0x80
+#define RIGHTNODE	0x40
+#define LEFTNODE	0x80
+
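+/*
+ * Worked example of a branching node's first byte (illustration
+ * only): 0xd6 == LEFTNODE | RIGHTNODE | (1 << OFFLEN_SHIFT) | 6,
+ * i.e. test bit 6 of the current key byte, both children are
+ * internal nodes, and a one-byte offset to the right-hand subtree
+ * follows the node byte.
+ */
+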
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so use the
+ * same underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ *          an index into utf8agetab[].  With this we can filter code
+ *          points based on the unicode version in which they were
+ *          defined.  The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ *          to do a stable sort into ascending order of all characters
+ *          with a non-zero CCC that occur between two characters with
+ *          a CCC of 0, or at the beginning or end of a string.
+ *          The unicode standard guarantees that all CCC values are
+ *          between 0 and 254 inclusive, which leaves 255 available as
+ *          a special value.
+ *          Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ *          start of a NUL-terminated string that is the decomposition
+ *          of the character.
+ *          The CCC of a decomposable character is the same as the CCC
+ *          of the first character of its decomposition.
+ *          Some characters decompose as the empty string: these are
+ *          characters with the Default_Ignorable_Code_Point property.
+ *          These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF)	((LEAF)[0])
+#define LEAF_CCC(LEAF)	((LEAF)[1])
+#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
+
+#define MAXGEN		(255)
+
+#define MINCCC		(0)
+#define MAXCCC		(254)
+#define STOPPER		(0)
+#define DECOMPOSE	(255)
+#define HANGUL		((char)(255))
+
+#define UTF8HANGULLEAF	(12)
+
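+/*
+ * Worked leaf example (illustration only): a leaf for U+00C9 (LATIN
+ * CAPITAL LETTER E WITH ACUTE) would look like
+ *	{ gen, DECOMPOSE, 'E', 0xcc, 0x81, 0x00 }
+ * where 0xcc 0x81 is the UTF-8 encoding of U+0301 COMBINING ACUTE
+ * ACCENT and gen is the index into utf8agetab[].
+ */
+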
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
+			       const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfdi;
+utf8trie_t *nfdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used.  A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values.  This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ *          0 -     0x7f: 0                     0x7f
+ *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
+ *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
+ *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character.  This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ *    Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ *    http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS     0xC0
+#define UTF8_3_BITS     0xE0
+#define UTF8_4_BITS     0xF0
+#define UTF8_N_BITS     0x80
+#define UTF8_2_MASK     0xE0
+#define UTF8_3_MASK     0xF0
+#define UTF8_4_MASK     0xF8
+#define UTF8_N_MASK     0xC0
+#define UTF8_V_MASK     0x3F
+#define UTF8_V_SHIFT    6
+
+static int utf8encode(char *str, unsigned int val)
+{
+	int len;
+
+	if (val < 0x80) {
+		str[0] = val;
+		len = 1;
+	} else if (val < 0x800) {
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_2_BITS;
+		len = 2;
+	} else if (val < 0x10000) {
+		str[2] = val & UTF8_V_MASK;
+		str[2] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_3_BITS;
+		len = 3;
+	} else if (val < 0x110000) {
+		str[3] = val & UTF8_V_MASK;
+		str[3] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[2] = val & UTF8_V_MASK;
+		str[2] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[1] = val & UTF8_V_MASK;
+		str[1] |= UTF8_N_BITS;
+		val >>= UTF8_V_SHIFT;
+		str[0] = val;
+		str[0] |= UTF8_4_BITS;
+		len = 4;
+	} else {
+		printf("%#x: illegal val\n", val);
+		len = 0;
+	}
+	return len;
+}
+
+static unsigned int utf8decode(const char *str)
+{
+	const unsigned char *s = (const unsigned char*)str;
+	unsigned int unichar = 0;
+
+	if (*s < 0x80) {
+		unichar = *s;
+	} else if (*s < UTF8_3_BITS) {
+		unichar = *s++ & 0x1F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	} else if (*s < UTF8_4_BITS) {
+		unichar = *s++ & 0x0F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	} else {
+		unichar = *s++ & 0x0F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s++ & 0x3F;
+		unichar <<= UTF8_V_SHIFT;
+		unichar |= *s & 0x3F;
+	}
+	return unichar;
+}
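+
+/*
+ * Round-trip sketch (illustration only):
+ *
+ *	char buf[4];
+ *	int len = utf8encode(buf, 0xe9);  (writes 0xc3 0xa9, len == 2)
+ *	utf8decode(buf) == 0xe9
+ */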
+
+static int utf32valid(unsigned int unichar)
+{
+	return unichar < 0x110000;
+}
+
+#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
+
+#define NODE 1
+#define LEAF 0
+
+struct tree {
+	void *root;
+	int childnode;
+	const char *type;
+	unsigned int maxage;
+	struct tree *next;
+	int (*leaf_equal)(void *, void *);
+	void (*leaf_print)(void *, int);
+	int (*leaf_mark)(void *);
+	int (*leaf_size)(void *);
+	int *(*leaf_index)(struct tree *, void *);
+	unsigned char *(*leaf_emit)(void *, unsigned char *);
+	int leafindex[0x110000];
+	int index;
+};
+
+struct node {
+	int index;
+	int offset;
+	int mark;
+	int size;
+	struct node *parent;
+	void *left;
+	void *right;
+	unsigned char bitnum;
+	unsigned char nextbyte;
+	unsigned char leftnode;
+	unsigned char rightnode;
+	unsigned int keybits;
+	unsigned int keymask;
+};
+
+/*
+ * Example lookup function for a tree.
+ */
+static void *lookup(struct tree *tree, const char *key)
+{
+	struct node *node;
+	void *leaf = NULL;
+
+	node = tree->root;
+	while (!leaf && node) {
+		if (node->nextbyte)
+			key++;
+		if (*key & (1 << (node->bitnum & 7))) {
+			/* Right leg */
+			if (node->rightnode == NODE) {
+				node = node->right;
+			} else if (node->rightnode == LEAF) {
+				leaf = node->right;
+			} else {
+				node = NULL;
+			}
+		} else {
+			/* Left leg */
+			if (node->leftnode == NODE) {
+				node = node->left;
+			} else if (node->leftnode == LEAF) {
+				leaf = node->left;
+			} else {
+				node = NULL;
+			}
+		}
+	}
+
+	return leaf;
+}
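+
+/*
+ * Usage sketch (illustration only; the trees built by this generator
+ * store struct unicode_data leaves):
+ *
+ *	struct unicode_data *u = lookup(nfdi_tree, utf8key);
+ *
+ * This walks the key bit by bit and returns the leaf reached for the
+ * UTF-8 sequence starting at utf8key, or NULL if none is reachable.
+ */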
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ */
+static void tree_walk(struct tree *tree)
+{
+	struct node *node;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int indent = 1;
+	int nodes, singletons, leaves;
+
+	nodes = singletons = leaves = 0;
+
+	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+	if (tree->childnode == LEAF) {
+		assert(tree->root);
+		tree->leaf_print(tree->root, indent);
+		leaves = 1;
+	} else {
+		assert(tree->childnode == NODE);
+		node = tree->root;
+		leftmask = rightmask = 0;
+		while (node) {
+			printf("%*snode @ %p bitnum %d nextbyte %d"
+			       " left %p right %p mask %x bits %x\n",
+				indent, "", node,
+				node->bitnum, node->nextbyte,
+				node->left, node->right,
+				node->keymask, node->keybits);
+			nodes += 1;
+			if (!(node->left && node->right))
+				singletons += 1;
+
+			while (node) {
+				bitmask = 1 << node->bitnum;
+				if ((leftmask & bitmask) == 0) {
+					leftmask |= bitmask;
+					if (node->leftnode == LEAF) {
+						assert(node->left);
+						tree->leaf_print(node->left,
+								 indent+1);
+						leaves += 1;
+					} else if (node->left) {
+						assert(node->leftnode == NODE);
+						indent += 1;
+						node = node->left;
+						break;
+					}
+				}
+				if ((rightmask & bitmask) == 0) {
+					rightmask |= bitmask;
+					if (node->rightnode == LEAF) {
+						assert(node->right);
+						tree->leaf_print(node->right,
+								 indent+1);
+						leaves += 1;
+					} else if (node->right) {
+						assert(node->rightnode == NODE);
+						indent += 1;
+						node = node->right;
+						break;
+					}
+				}
+				leftmask &= ~bitmask;
+				rightmask &= ~bitmask;
+				node = node->parent;
+				indent -= 1;
+			}
+		}
+	}
+	printf("nodes %d leaves %d singletons %d\n",
+	       nodes, leaves, singletons);
+}
+
+/*
+ * Allocate and initialize a new internal node.
+ */
+static struct node *alloc_node(struct node *parent)
+{
+	struct node *node;
+	int bitnum;
+
+	node = malloc(sizeof(*node));
+	node->left = node->right = NULL;
+	node->parent = parent;
+	node->leftnode = NODE;
+	node->rightnode = NODE;
+	node->keybits = 0;
+	node->keymask = 0;
+	node->mark = 0;
+	node->index = 0;
+	node->offset = -1;
+	node->size = 4;
+
+	if (node->parent) {
+		bitnum = parent->bitnum;
+		if ((bitnum & 7) == 0) {
+			node->bitnum = bitnum + 7 + 8;
+			node->nextbyte = 1;
+		} else {
+			node->bitnum = bitnum - 1;
+			node->nextbyte = 0;
+		}
+	} else {
+		node->bitnum = 7;
+		node->nextbyte = 0;
+	}
+
+	return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ */
+static int insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+	struct node *node;
+	struct node *parent;
+	void **cursor;
+	int keybits;
+
+	assert(keylen >= 1 && keylen <= 4);
+
+	node = NULL;
+	cursor = &tree->root;
+	keybits = 8 * keylen;
+
+	/* Insert, creating path along the way. */
+	while (keybits) {
+		if (!*cursor)
+			*cursor = alloc_node(node);
+		node = *cursor;
+		if (node->nextbyte)
+			key++;
+		if (*key & (1 << (node->bitnum & 7)))
+			cursor = &node->right;
+		else
+			cursor = &node->left;
+		keybits--;
+	}
+	*cursor = leaf;
+
+	/* Merge subtrees if possible. */
+	while (node) {
+		if (*key & (1 << (node->bitnum & 7)))
+			node->rightnode = LEAF;
+		else
+			node->leftnode = LEAF;
+		if (node->nextbyte)
+			break;
+		if (node->leftnode == NODE || node->rightnode == NODE)
+			break;
+		assert(node->left);
+		assert(node->right);
+		/* Compare */
+		if (! tree->leaf_equal(node->left, node->right))
+			break;
+		/* Keep left, drop right leaf. */
+		leaf = node->left;
+		/* Check in parent */
+		parent = node->parent;
+		if (!parent) {
+			/* root of tree! */
+			tree->root = leaf;
+			tree->childnode = LEAF;
+		} else if (parent->left == node) {
+			parent->left = leaf;
+			parent->leftnode = LEAF;
+			if (parent->right) {
+				parent->keymask = 0;
+				parent->keybits = 0;
+			} else {
+				parent->keymask |= (1 << node->bitnum);
+			}
+		} else if (parent->right == node) {
+			parent->right = leaf;
+			parent->rightnode = LEAF;
+			if (parent->left) {
+				parent->keymask = 0;
+				parent->keybits = 0;
+			} else {
+				parent->keymask |= (1 << node->bitnum);
+				parent->keybits |= (1 << node->bitnum);
+			}
+		} else {
+			/* internal tree error */
+			assert(0);
+		}
+		free(node);
+		node = parent;
+	}
+
+	/* Propagate keymasks up along singleton chains. */
+	while (node) {
+		parent = node->parent;
+		if (!parent)
+			break;
+		/* Nix the mask for parents with two children. */
+		if (node->keymask == 0) {
+			parent->keymask = 0;
+			parent->keybits = 0;
+		} else if (parent->left && parent->right) {
+			parent->keymask = 0;
+			parent->keybits = 0;
+		} else {
+			assert((parent->keymask & node->keymask) == 0);
+			parent->keymask |= node->keymask;
+			parent->keymask |= (1 << parent->bitnum);
+			parent->keybits |= node->keybits;
+			if (parent->right)
+				parent->keybits |= (1 << parent->bitnum);
+		}
+		node = parent;
+	}
+
+	return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed.  There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves.  The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains.  When they are identical for the left and right
+ * branch of a node, and the two leaves compare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity.  Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ */
+static void prune(struct tree *tree)
+{
+	struct node *node;
+	struct node *left;
+	struct node *right;
+	struct node *parent;
+	void *leftleaf;
+	void *rightleaf;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int count;
+
+	if (verbose > 0)
+		printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+	count = 0;
+	if (tree->childnode == LEAF)
+		return;
+	if (!tree->root)
+		return;
+
+	leftmask = rightmask = 0;
+	node = tree->root;
+	while (node) {
+		if (node->nextbyte)
+			goto advance;
+		if (node->leftnode == LEAF)
+			goto advance;
+		if (node->rightnode == LEAF)
+			goto advance;
+		if (!node->left)
+			goto advance;
+		if (!node->right)
+			goto advance;
+		left = node->left;
+		right = node->right;
+		if (left->keymask == 0)
+			goto advance;
+		if (right->keymask == 0)
+			goto advance;
+		if (left->keymask != right->keymask)
+			goto advance;
+		if (left->keybits != right->keybits)
+			goto advance;
+		leftleaf = NULL;
+		while (!leftleaf) {
+			assert(left->left || left->right);
+			if (left->leftnode == LEAF)
+				leftleaf = left->left;
+			else if (left->rightnode == LEAF)
+				leftleaf = left->right;
+			else if (left->left)
+				left = left->left;
+			else if (left->right)
+				left = left->right;
+			else
+				assert(0);
+		}
+		rightleaf = NULL;
+		while (!rightleaf) {
+			assert(right->left || right->right);
+			if (right->leftnode == LEAF)
+				rightleaf = right->left;
+			else if (right->rightnode == LEAF)
+				rightleaf = right->right;
+			else if (right->left)
+				right = right->left;
+			else if (right->right)
+				right = right->right;
+			else
+				assert(0);
+		}
+		if (! tree->leaf_equal(leftleaf, rightleaf))
+			goto advance;
+		/*
+		 * This node has identical singleton-only subtrees.
+		 * Remove it.
+		 */
+		parent = node->parent;
+		left = node->left;
+		right = node->right;
+		if (parent->left == node)
+			parent->left = left;
+		else if (parent->right == node)
+			parent->right = left;
+		else
+			assert(0);
+		left->parent = parent;
+		left->keymask |= (1 << node->bitnum);
+		node->left = NULL;
+		while (node) {
+			bitmask = 1 << node->bitnum;
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			if (node->leftnode == NODE && node->left) {
+				left = node->left;
+				free(node);
+				count++;
+				node = left;
+			} else if (node->rightnode == NODE && node->right) {
+				right = node->right;
+				free(node);
+				count++;
+				node = right;
+			} else {
+				node = NULL;
+			}
+		}
+		/* Propagate keymasks up along singleton chains. */
+		node = parent;
+		/* Force re-check */
+		bitmask = 1 << node->bitnum;
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		for (;;) {
+			if (node->left && node->right)
+				break;
+			if (node->left) {
+				left = node->left;
+				node->keymask |= left->keymask;
+				node->keybits |= left->keybits;
+			}
+			if (node->right) {
+				right = node->right;
+				node->keymask |= right->keymask;
+				node->keybits |= right->keybits;
+			}
+			node->keymask |= (1 << node->bitnum);
+			node = node->parent;
+			/* Force re-check */
+			bitmask = 1 << node->bitnum;
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+		}
+	advance:
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0 &&
+		    node->leftnode == NODE &&
+		    node->left) {
+			leftmask |= bitmask;
+			node = node->left;
+		} else if ((rightmask & bitmask) == 0 &&
+			   node->rightnode == NODE &&
+			   node->right) {
+			rightmask |= bitmask;
+			node = node->right;
+		} else {
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			node = node->parent;
+		}
+	}
+	if (verbose > 0)
+		printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ */
+static void mark_nodes(struct tree *tree)
+{
+	struct node *node;
+	struct node *n;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int marked;
+
+	marked = 0;
+	if (verbose > 0)
+		printf("Marking %s_%x\n", tree->type, tree->maxage);
+	if (tree->childnode == LEAF)
+		goto done;
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0) {
+			leftmask |= bitmask;
+			if (node->leftnode == LEAF) {
+				assert(node->left);
+				if (tree->leaf_mark(node->left)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->left) {
+				assert(node->leftnode == NODE);
+				node = node->left;
+				continue;
+			}
+		}
+		if ((rightmask & bitmask) == 0) {
+			rightmask |= bitmask;
+			if (node->rightnode == LEAF) {
+				assert(node->right);
+				if (tree->leaf_mark(node->right)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->right) {
+				assert(node->rightnode == NODE);
+				node = node->right;
+				continue;
+			}
+		}
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		node = node->parent;
+	}
+
+	/* second pass: left siblings and singletons */
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		bitmask = 1 << node->bitnum;
+		if ((leftmask & bitmask) == 0) {
+			leftmask |= bitmask;
+			if (node->leftnode == LEAF) {
+				assert(node->left);
+				if (tree->leaf_mark(node->left)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->left) {
+				assert(node->leftnode == NODE);
+				node = node->left;
+				if (!node->mark && node->parent->mark) {
+					marked++;
+					node->mark = 1;
+				}
+				continue;
+			}
+		}
+		if ((rightmask & bitmask) == 0) {
+			rightmask |= bitmask;
+			if (node->rightnode == LEAF) {
+				assert(node->right);
+				if (tree->leaf_mark(node->right)) {
+					n = node;
+					while (n && !n->mark) {
+						marked++;
+						n->mark = 1;
+						n = n->parent;
+					}
+				}
+			} else if (node->right) {
+				assert(node->rightnode == NODE);
+				node = node->right;
+				if (!node->mark && node->parent->mark &&
+				    !node->parent->left) {
+					marked++;
+					node->mark = 1;
+				}
+				continue;
+			}
+		}
+		leftmask &= ~bitmask;
+		rightmask &= ~bitmask;
+		node = node->parent;
+	}
+done:
+	if (verbose > 0)
+		printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie.  These values must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ */
+static int index_nodes(struct tree *tree, int index)
+{
+	struct node *node;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int count;
+	int indent;
+
+	/* Align to a cache line (or half a cache line?). */
+	while (index % 64)
+		index++;
+	tree->index = index;
+	indent = 1;
+	count = 0;
+
+	if (verbose > 0)
+		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
+	if (tree->childnode == LEAF) {
+		index += tree->leaf_size(tree->root);
+		goto done;
+	}
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		if (!node->mark)
+			goto skip;
+		count++;
+		if (node->index != index)
+			node->index = index;
+		index += node->size;
+skip:
+		while (node) {
+			bitmask = 1 << node->bitnum;
+			if (node->mark && (leftmask & bitmask) == 0) {
+				leftmask |= bitmask;
+				if (node->leftnode == LEAF) {
+					assert(node->left);
+					*tree->leaf_index(tree, node->left) =
+									index;
+					index += tree->leaf_size(node->left);
+					count++;
+				} else if (node->left) {
+					assert(node->leftnode == NODE);
+					indent += 1;
+					node = node->left;
+					break;
+				}
+			}
+			if (node->mark && (rightmask & bitmask) == 0) {
+				rightmask |= bitmask;
+				if (node->rightnode == LEAF) {
+					assert(node->right);
+					*tree->leaf_index(tree, node->right) = index;
+					index += tree->leaf_size(node->right);
+					count++;
+				} else if (node->right) {
+					assert(node->rightnode == NODE);
+					indent += 1;
+					node = node->right;
+					break;
+				}
+			}
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			node = node->parent;
+			indent -= 1;
+		}
+	}
+done:
+	/* Round up to a multiple of 16 */
+	while (index % 16)
+		index++;
+	if (verbose > 0)
+		printf("Final index %d\n", index);
+	return index;
+}
+
+/*
+ * Mark the nodes in a subtree, helper for size_nodes().
+ */
+static int mark_subtree(struct node *node)
+{
+	int changed;
+
+	if (!node || node->mark)
+		return 0;
+	node->mark = 1;
+	node->index = node->parent->index;
+	changed = 1;
+	if (node->leftnode == NODE)
+		changed += mark_subtree(node->left);
+	if (node->rightnode == NODE)
+		changed += mark_subtree(node->right);
+	return changed;
+}
+
+/*
+ * Compute the size of nodes and leaves. We start by assuming that
+ * each node needs to store a three-byte offset. The indexes of the
+ * nodes are calculated based on that, and then this function is
+ * called to see if the sizes of some nodes can be reduced.  This is
+ * repeated until no more changes are seen.
+ */
+static int size_nodes(struct tree *tree)
+{
+	struct tree *next;
+	struct node *node;
+	struct node *right;
+	struct node *n;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	unsigned int pathbits;
+	unsigned int pathmask;
+	unsigned int nbit;
+	int changed;
+	int offset;
+	int size;
+	int indent;
+
+	indent = 1;
+	changed = 0;
+	size = 0;
+
+	if (verbose > 0)
+		printf("Sizing %s_%x\n", tree->type, tree->maxage);
+	if (tree->childnode == LEAF)
+		goto done;
+
+	assert(tree->childnode == NODE);
+	pathbits = 0;
+	pathmask = 0;
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		if (!node->mark)
+			goto skip;
+		offset = 0;
+		if (!node->left || !node->right) {
+			size = 1;
+		} else {
+			if (node->rightnode == NODE) {
+				/*
+				 * If the right node is not marked,
+				 * look for a corresponding node in
+				 * the next tree.  Such a node need
+				 * not exist.
+				 */
+				right = node->right;
+				next = tree->next;
+				while (!right->mark) {
+					assert(next);
+					n = next->root;
+					while (n->bitnum != node->bitnum) {
+						nbit = 1 << n->bitnum;
+						if (!(pathmask & nbit))
+							break;
+						if (pathbits & nbit) {
+							if (n->rightnode == LEAF)
+								break;
+							n = n->right;
+						} else {
+							if (n->leftnode == LEAF)
+								break;
+							n = n->left;
+						}
+					}
+					if (n->bitnum != node->bitnum)
+						break;
+					n = n->right;
+					right = n;
+					next = next->next;
+				}
+				/* Make sure the right node is marked. */
+				if (!right->mark)
+					changed += mark_subtree(right);
+				offset = right->index - node->index;
+			} else {
+				offset = *tree->leaf_index(tree, node->right);
+				offset -= node->index;
+			}
+			assert(offset >= 0);
+			assert(offset <= 0xffffff);
+			if (offset <= 0xff) {
+				size = 2;
+			} else if (offset <= 0xffff) {
+				size = 3;
+			} else { /* offset <= 0xffffff */
+				size = 4;
+			}
+		}
+		if (node->size != size || node->offset != offset) {
+			node->size = size;
+			node->offset = offset;
+			changed++;
+		}
+skip:
+		while (node) {
+			bitmask = 1 << node->bitnum;
+			pathmask |= bitmask;
+			if (node->mark && (leftmask & bitmask) == 0) {
+				leftmask |= bitmask;
+				if (node->leftnode == LEAF) {
+					assert(node->left);
+				} else if (node->left) {
+					assert(node->leftnode == NODE);
+					indent += 1;
+					node = node->left;
+					break;
+				}
+			}
+			if (node->mark && (rightmask & bitmask) == 0) {
+				rightmask |= bitmask;
+				pathbits |= bitmask;
+				if (node->rightnode == LEAF) {
+					assert(node->right);
+				} else if (node->right) {
+					assert(node->rightnode == NODE);
+					indent += 1;
+					node = node->right;
+					break;
+				}
+			}
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			pathmask &= ~bitmask;
+			pathbits &= ~bitmask;
+			node = node->parent;
+			indent -= 1;
+		}
+	}
+done:
+	if (verbose > 0)
+		printf("Found %d changes\n", changed);
+	return changed;
+}
+
+/*
+ * Emit a trie for the given tree into the data array.
+ */
+static void emit(struct tree *tree, unsigned char *data)
+{
+	struct node *node;
+	unsigned int leftmask;
+	unsigned int rightmask;
+	unsigned int bitmask;
+	int offlen;
+	int offset;
+	int index;
+	int indent;
+	int size;
+	int bytes;
+	int leaves;
+	int nodes[4];
+	unsigned char byte;
+
+	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
+	leaves = 0;
+	bytes = 0;
+	index = tree->index;
+	data += index;
+	indent = 1;
+	if (verbose > 0)
+		printf("Emitting %s_%x\n", tree->type, tree->maxage);
+	if (tree->childnode == LEAF) {
+		assert(tree->root);
+		tree->leaf_emit(tree->root, data);
+		size = tree->leaf_size(tree->root);
+		index += size;
+		leaves++;
+		goto done;
+	}
+
+	assert(tree->childnode == NODE);
+	node = tree->root;
+	leftmask = rightmask = 0;
+	while (node) {
+		if (!node->mark)
+			goto skip;
+		assert(node->offset != -1);
+		assert(node->index == index);
+
+		byte = 0;
+		if (node->nextbyte)
+			byte |= NEXTBYTE;
+		byte |= (node->bitnum & BITNUM);
+		if (node->left && node->right) {
+			if (node->leftnode == NODE)
+				byte |= LEFTNODE;
+			if (node->rightnode == NODE)
+				byte |= RIGHTNODE;
+			if (node->offset <= 0xff)
+				offlen = 1;
+			else if (node->offset <= 0xffff)
+				offlen = 2;
+			else
+				offlen = 3;
+			nodes[offlen]++;
+			offset = node->offset;
+			byte |= offlen << OFFLEN_SHIFT;
+			*data++ = byte;
+			index++;
+			while (offlen--) {
+				*data++ = offset & 0xff;
+				index++;
+				offset >>= 8;
+			}
+		} else if (node->left) {
+			if (node->leftnode == NODE)
+				byte |= TRIENODE;
+			nodes[0]++;
+			*data++ = byte;
+			index++;
+		} else if (node->right) {
+			byte |= RIGHTNODE;
+			if (node->rightnode == NODE)
+				byte |= TRIENODE;
+			nodes[0]++;
+			*data++ = byte;
+			index++;
+		} else {
+			assert(0);
+		}
+skip:
+		while (node) {
+			bitmask = 1 << node->bitnum;
+			if (node->mark && (leftmask & bitmask) == 0) {
+				leftmask |= bitmask;
+				if (node->leftnode == LEAF) {
+					assert(node->left);
+					data = tree->leaf_emit(node->left,
+							       data);
+					size = tree->leaf_size(node->left);
+					index += size;
+					bytes += size;
+					leaves++;
+				} else if (node->left) {
+					assert(node->leftnode == NODE);
+					indent += 1;
+					node = node->left;
+					break;
+				}
+			}
+			if (node->mark && (rightmask & bitmask) == 0) {
+				rightmask |= bitmask;
+				if (node->rightnode == LEAF) {
+					assert(node->right);
+					data = tree->leaf_emit(node->right,
+							       data);
+					size = tree->leaf_size(node->right);
+					index += size;
+					bytes += size;
+					leaves++;
+				} else if (node->right) {
+					assert(node->rightnode == NODE);
+					indent += 1;
+					node = node->right;
+					break;
+				}
+			}
+			leftmask &= ~bitmask;
+			rightmask &= ~bitmask;
+			node = node->parent;
+			indent -= 1;
+		}
+	}
+done:
+	if (verbose > 0) {
+		printf("Emitted %d (%d) leaves",
+			leaves, bytes);
+		printf(" %d (%d+%d+%d+%d) nodes",
+			nodes[0] + nodes[1] + nodes[2] + nodes[3],
+			nodes[0], nodes[1], nodes[2], nodes[3]);
+		printf(" %d total\n", index - tree->index);
+	}
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode data.
+ *
+ * We need to keep track of the Canonical Combining Class, the Age,
+ * and decompositions for a code point.
+ *
+ * For the Age, we store the index into the ages table.  Effectively
+ * this is a generation number that the table maps to a unicode
+ * version.
+ *
+ * The correction field is used to indicate that this entry is in the
+ * corrections array, which contains decompositions that were
+ * corrected in later revisions.  The value of the correction field is
+ * the Unicode version in which the mapping was corrected.
+ */
+struct unicode_data {
+	unsigned int code;
+	int ccc;
+	int gen;
+	int correction;
+	unsigned int *utf32nfdi;
+	unsigned int *utf32nfdicf;
+	char *utf8nfdi;
+	char *utf8nfdicf;
+};
+
+struct unicode_data unicode_data[0x110000];
+struct unicode_data *corrections;
+int    corrections_count;
+
+struct tree *nfdi_tree;
+struct tree *nfdicf_tree;
+
+struct tree *trees;
+int          trees_count;
+
+/*
+ * Check the corrections array to see if this entry was corrected at
+ * some point.
+ */
+static struct unicode_data *corrections_lookup(struct unicode_data *u)
+{
+	int i;
+
+	for (i = 0; i != corrections_count; i++)
+		if (u->code == corrections[i].code)
+			return &corrections[i];
+	return u;
+}
+
+static int nfdi_equal(void *l, void *r)
+{
+	struct unicode_data *left = l;
+	struct unicode_data *right = r;
+
+	if (left->gen != right->gen)
+		return 0;
+	if (left->ccc != right->ccc)
+		return 0;
+	if (left->utf8nfdi && right->utf8nfdi &&
+	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
+		return 1;
+	if (left->utf8nfdi || right->utf8nfdi)
+		return 0;
+	return 1;
+}
+
+static int nfdicf_equal(void *l, void *r)
+{
+	struct unicode_data *left = l;
+	struct unicode_data *right = r;
+
+	if (left->gen != right->gen)
+		return 0;
+	if (left->ccc != right->ccc)
+		return 0;
+	if (left->utf8nfdicf && right->utf8nfdicf &&
+	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
+		return 1;
+	if (left->utf8nfdicf && right->utf8nfdicf)
+		return 0;
+	if (left->utf8nfdicf || right->utf8nfdicf)
+		return 0;
+	if (left->utf8nfdi && right->utf8nfdi &&
+	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
+		return 1;
+	if (left->utf8nfdi || right->utf8nfdi)
+		return 0;
+	return 1;
+}
+
+static void nfdi_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+
+	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
+		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
+	else if (leaf->utf8nfdi)
+		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
+
+	printf("\n");
+}
+
+static void nfdicf_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+
+	if (leaf->utf8nfdicf)
+		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
+	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
+		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
+	else if (leaf->utf8nfdi)
+		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
+	printf("\n");
+}
+
+static int nfdi_mark(void *l)
+{
+	return 1;
+}
+
+static int nfdicf_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	if (leaf->utf8nfdicf)
+		return 1;
+	return 0;
+}
+
+static int correction_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return leaf->correction;
+}
+
+static int nfdi_size(void *l)
+{
+	struct unicode_data *leaf = l;
+	int size = 2;
+
+	if (HANGUL_SYLLABLE(leaf->code))
+		size += 1;
+	else if (leaf->utf8nfdi)
+		size += strlen(leaf->utf8nfdi) + 1;
+	return size;
+}
+
+static int nfdicf_size(void *l)
+{
+	struct unicode_data *leaf = l;
+	int size = 2;
+
+	if (HANGUL_SYLLABLE(leaf->code))
+		size += 1;
+	else if (leaf->utf8nfdicf)
+		size += strlen(leaf->utf8nfdicf) + 1;
+	else if (leaf->utf8nfdi)
+		size += strlen(leaf->utf8nfdi) + 1;
+	return size;
+}
+
+static int *nfdi_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return &tree->leafindex[leaf->code];
+}
+
+static int *nfdicf_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return &tree->leafindex[leaf->code];
+}
+
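+/*
+ * Leaf layout written by the emit callbacks below: one generation
+ * byte, then either the CCC value, or the DECOMPOSE marker followed
+ * by a NUL-terminated UTF-8 decomposition (a single HANGUL cookie
+ * byte for hangul syllables).
+ */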
+static unsigned char *nfdi_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *s;
+
+	*data++ = leaf->gen;
+
+	if (HANGUL_SYLLABLE(leaf->code)) {
+		*data++ = DECOMPOSE;
+		*data++ = HANGUL;
+	} else if (leaf->utf8nfdi) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char *)leaf->utf8nfdi;
+		while ((*data++ = *s++) != 0)
+			;
+	} else {
+		*data++ = leaf->ccc;
+	}
+	return data;
+}
+
+static unsigned char *nfdicf_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *s;
+
+	*data++ = leaf->gen;
+
+	if (HANGUL_SYLLABLE(leaf->code)) {
+		*data++ = DECOMPOSE;
+		*data++ = HANGUL;
+	} else if (leaf->utf8nfdicf) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char *)leaf->utf8nfdicf;
+		while ((*data++ = *s++) != 0)
+			;
+	} else if (leaf->utf8nfdi) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char *)leaf->utf8nfdi;
+		while ((*data++ = *s++) != 0)
+			;
+	} else {
+		*data++ = leaf->ccc;
+	}
+	return data;
+}
+
+static void utf8_create(struct unicode_data *data)
+{
+	char utf[18*4+1];
+	char *u;
+	unsigned int *um;
+	int i;
+
+	if (data->utf8nfdi) {
+		assert(data->utf8nfdi[0] == HANGUL);
+		return;
+	}
+
+	u = utf;
+	um = data->utf32nfdi;
+	if (um) {
+		for (i = 0; um[i]; i++)
+			u += utf8encode(u, um[i]);
+		*u = '\0';
+		data->utf8nfdi = strdup(utf);
+	}
+	u = utf;
+	um = data->utf32nfdicf;
+	if (um) {
+		for (i = 0; um[i]; i++)
+			u += utf8encode(u, um[i]);
+		*u = '\0';
+		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
+			data->utf8nfdicf = strdup(utf);
+	}
+}
+
+static void utf8_init(void)
+{
+	unsigned int unichar;
+	int i;
+
+	for (unichar = 0; unichar != 0x110000; unichar++)
+		utf8_create(&unicode_data[unichar]);
+
+	for (i = 0; i != corrections_count; i++)
+		utf8_create(&corrections[i]);
+}
+
+static void trees_init(void)
+{
+	struct unicode_data *data;
+	unsigned int maxage;
+	unsigned int nextage;
+	int count;
+	int i;
+	int j;
+
+	/* Count the number of different ages. */
+	count = 0;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		nextage = 0;
+		for (i = 0; i != corrections_count; i++) {
+			data = &corrections[i];
+			if (nextage < data->correction &&
+			    data->correction < maxage)
+				nextage = data->correction;
+		}
+		count++;
+	} while (nextage);
+
+	/* Two trees per age: nfdi and nfdicf */
+	trees_count = count * 2;
+	trees = calloc(trees_count, sizeof(struct tree));
+
+	/* Assign ages to the trees. */
+	count = trees_count;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		trees[--count].maxage = maxage;
+		trees[--count].maxage = maxage;
+		nextage = 0;
+		for (i = 0; i != corrections_count; i++) {
+			data = &corrections[i];
+			if (nextage < data->correction &&
+			    data->correction < maxage)
+				nextage = data->correction;
+		}
+	} while (nextage);
+
+	/* The ages assigned above are off by one. */
+	for (i = 0; i != trees_count; i++) {
+		j = 0;
+		while (ages[j] < trees[i].maxage)
+			j++;
+		trees[i].maxage = ages[j-1];
+	}
+
+	/* Set up the forwarding between trees. */
+	trees[trees_count-2].next = &trees[trees_count-1];
+	trees[trees_count-1].leaf_mark = nfdi_mark;
+	trees[trees_count-2].leaf_mark = nfdicf_mark;
+	for (i = 0; i != trees_count-2; i += 2) {
+		trees[i].next = &trees[trees_count-2];
+		trees[i].leaf_mark = correction_mark;
+		trees[i+1].next = &trees[trees_count-1];
+		trees[i+1].leaf_mark = correction_mark;
+	}
+
+	/* Assign the callouts. */
+	for (i = 0; i != trees_count; i += 2) {
+		trees[i].type = "nfdicf";
+		trees[i].leaf_equal = nfdicf_equal;
+		trees[i].leaf_print = nfdicf_print;
+		trees[i].leaf_size = nfdicf_size;
+		trees[i].leaf_index = nfdicf_index;
+		trees[i].leaf_emit = nfdicf_emit;
+
+		trees[i+1].type = "nfdi";
+		trees[i+1].leaf_equal = nfdi_equal;
+		trees[i+1].leaf_print = nfdi_print;
+		trees[i+1].leaf_size = nfdi_size;
+		trees[i+1].leaf_index = nfdi_index;
+		trees[i+1].leaf_emit = nfdi_emit;
+	}
+
+	/* Finish init. */
+	for (i = 0; i != trees_count; i++)
+		trees[i].childnode = NODE;
+}
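+
+/*
+ * To summarize the layout: trees[] holds one nfdicf/nfdi pair per
+ * age, oldest first.  Only the final pair carries the complete data;
+ * the older pairs store just the entries affected by corrections
+ * (leaf_mark == correction_mark) and forward all other lookups to
+ * the final pair through ->next.
+ */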
+
+static void trees_populate(void)
+{
+	struct unicode_data *data;
+	unsigned int unichar;
+	char keyval[4];
+	int keylen;
+	int i;
+
+	for (i = 0; i != trees_count; i++) {
+		if (verbose > 0) {
+			printf("Populating %s_%x\n",
+				trees[i].type, trees[i].maxage);
+		}
+		for (unichar = 0; unichar != 0x110000; unichar++) {
+			if (unicode_data[unichar].gen < 0)
+				continue;
+			keylen = utf8encode(keyval, unichar);
+			data = corrections_lookup(&unicode_data[unichar]);
+			if (data->correction <= trees[i].maxage)
+				data = &unicode_data[unichar];
+			insert(&trees[i], keyval, keylen, data);
+		}
+	}
+}
+
+static void trees_reduce(void)
+{
+	int i;
+	int size;
+	int changed;
+
+	for (i = 0; i != trees_count; i++)
+		prune(&trees[i]);
+	for (i = 0; i != trees_count; i++)
+		mark_nodes(&trees[i]);
+	do {
+		size = 0;
+		for (i = 0; i != trees_count; i++)
+			size = index_nodes(&trees[i], size);
+		changed = 0;
+		for (i = 0; i != trees_count; i++)
+			changed += size_nodes(&trees[i]);
+	} while (changed);
+
+	utf8data = calloc(size, 1);
+	utf8data_size = size;
+	for (i = 0; i != trees_count; i++)
+		emit(&trees[i], utf8data);
+
+	if (verbose > 0) {
+		for (i = 0; i != trees_count; i++) {
+			printf("%s_%x idx %d\n",
+				trees[i].type, trees[i].maxage, trees[i].index);
+		}
+	}
+
+	nfdi = utf8data + trees[trees_count-1].index;
+	nfdicf = utf8data + trees[trees_count-2].index;
+
+	nfdi_tree = &trees[trees_count-1];
+	nfdicf_tree = &trees[trees_count-2];
+}
+
+static void verify(struct tree *tree)
+{
+	struct unicode_data *data;
+	utf8leaf_t	*leaf;
+	unsigned int	unichar;
+	char		key[4];
+	unsigned char	hangul[UTF8HANGULLEAF];
+	int		report;
+	int		nocf;
+
+	if (verbose > 0)
+		printf("Verifying %s_%x\n", tree->type, tree->maxage);
+	nocf = strcmp(tree->type, "nfdicf");
+
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		report = 0;
+		data = corrections_lookup(&unicode_data[unichar]);
+		if (data->correction <= tree->maxage)
+			data = &unicode_data[unichar];
+		utf8encode(key, unichar);
+		leaf = utf8lookup(tree, hangul, key);
+
+		if (!leaf) {
+			if (data->gen != -1)
+				report++;
+			if (unichar < 0xd800 || unichar > 0xdfff)
+				report++;
+		} else {
+			if (unichar >= 0xd800 && unichar <= 0xdfff)
+				report++;
+			if (data->gen == -1)
+				report++;
+			if (data->gen != LEAF_GEN(leaf))
+				report++;
+			if (LEAF_CCC(leaf) == DECOMPOSE) {
+				if (HANGUL_SYLLABLE(data->code)) {
+					if (data->utf8nfdi[0] != HANGUL)
+						report++;
+				} else if (nocf) {
+					if (!data->utf8nfdi) {
+						report++;
+					} else if (strcmp(data->utf8nfdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				} else {
+					if (!data->utf8nfdicf &&
+					    !data->utf8nfdi) {
+						report++;
+					} else if (data->utf8nfdicf) {
+						if (strcmp(data->utf8nfdicf,
+							   LEAF_STR(leaf)))
+							report++;
+					} else if (strcmp(data->utf8nfdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				}
+			} else if (data->ccc != LEAF_CCC(leaf)) {
+				report++;
+			}
+		}
+		if (report) {
+			printf("%X code %X gen %d ccc %d"
+				" nfdi -> \"%s\"",
+				unichar, data->code, data->gen,
+				data->ccc,
+				data->utf8nfdi);
+			if (leaf) {
+				printf(" gen %d ccc %d"
+					" nfdi -> \"%s\"",
+					LEAF_GEN(leaf),
+					LEAF_CCC(leaf),
+					LEAF_CCC(leaf) == DECOMPOSE ?
+						LEAF_STR(leaf) : "");
+			}
+			printf("\n");
+		}
+	}
+}
+
+static void trees_verify(void)
+{
+	int i;
+
+	for (i = 0; i != trees_count; i++)
+		verify(&trees[i]);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void help(void)
+{
+	printf("Usage: %s [options]\n", argv0);
+	printf("\n");
+	printf("This program creates an a data trie used for parsing and\n");
+	printf("normalization of UTF-8 strings. The trie is derived from\n");
+	printf("a set of input files from the Unicode character database\n");
+	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
+	printf("\n");
+	printf("The generated tree supports two normalization forms:\n");
+	printf("\n");
+	printf("\tnfdi:\n");
+	printf("\t- Apply unicode normalization form NFD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\n");
+	printf("\tnfdicf:\n");
+	printf("\t- Apply unicode normalization form NFD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\t- Apply a full casefold (C + F).\n");
+	printf("\n");
+	printf("These forms were chosen as being most useful when dealing\n");
+	printf("with file names: NFD catches most cases where characters\n");
+	printf("should be considered equivalent. The ignorables are mostly\n");
+	printf("invisible, making names hard to type.\n");
+	printf("\n");
+	printf("The options to specify the files to be used are listed\n");
+	printf("below with their default values, which are the names used\n");
+	printf("by version 11.0.0 of the Unicode Character Database.\n");
+	printf("\n");
+	printf("The input files:\n");
+	printf("\t-a %s\n", AGE_NAME);
+	printf("\t-c %s\n", CCC_NAME);
+	printf("\t-p %s\n", PROP_NAME);
+	printf("\t-d %s\n", DATA_NAME);
+	printf("\t-f %s\n", FOLD_NAME);
+	printf("\t-n %s\n", NORM_NAME);
+	printf("\n");
+	printf("Additionally, the generated tables are tested using:\n");
+	printf("\t-t %s\n", TEST_NAME);
+	printf("\n");
+	printf("Finally, the output file:\n");
+	printf("\t-o %s\n", UTF8_NAME);
+	printf("\n");
+}
+
+static void usage(void)
+{
+	help();
+	exit(1);
+}
+
+static void open_fail(const char *name, int error)
+{
+	printf("Error %d opening %s: %s\n", error, name, strerror(error));
+	exit(1);
+}
+
+static void file_fail(const char *filename)
+{
+	printf("Error parsing %s\n", filename);
+	exit(1);
+}
+
+static void line_fail(const char *filename, const char *line)
+{
+	printf("Error parsing %s:%s\n", filename, line);
+	exit(1);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void print_utf32(unsigned int *utf32str)
+{
+	int	i;
+
+	for (i = 0; utf32str[i]; i++)
+		printf(" %X", utf32str[i]);
+}
+
+static void print_utf32nfdi(unsigned int unichar)
+{
+	printf(" %X ->", unichar);
+	print_utf32(unicode_data[unichar].utf32nfdi);
+	printf("\n");
+}
+
+static void print_utf32nfdicf(unsigned int unichar)
+{
+	printf(" %X ->", unichar);
+	print_utf32(unicode_data[unichar].utf32nfdicf);
+	printf("\n");
+}
+
+/* ------------------------------------------------------------------ */
+
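+/*
+ * Parse DerivedAge.txt.  The relevant lines are the "# Age=V..."
+ * version headers and the assignments, for example (sample lines
+ * shown for illustration):
+ *
+ *   # Age=V1_1
+ *   0041..005A    ; 1.1 #  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+ */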
+static void age_init(void)
+{
+	FILE *file;
+	unsigned int first;
+	unsigned int last;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int revision;
+	int gen;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", age_name);
+
+	file = fopen(age_name, "r");
+	if (!file)
+		open_fail(age_name, errno);
+	count = 0;
+
+	gen = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "# Age=V%d_%d_%d",
+				&major, &minor, &revision);
+		if (ret == 3) {
+			ages_count++;
+			if (verbose > 1)
+				printf(" Age V%d_%d_%d\n",
+					major, minor, revision);
+			if (!age_valid(major, minor, revision))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+		if (ret == 2) {
+			ages_count++;
+			if (verbose > 1)
+				printf(" Age V%d_%d\n", major, minor);
+			if (!age_valid(major, minor, 0))
+				line_fail(age_name, line);
+			continue;
+		}
+	}
+
+	/* We must have found something above. */
+	if (verbose > 1)
+		printf("%d age entries\n", ages_count);
+	if (ages_count == 0 || ages_count > MAXGEN)
+		file_fail(age_name);
+
+	/* There is a 0 entry. */
+	ages_count++;
+	ages = calloc(ages_count + 1, sizeof(*ages));
+	/* And a guard entry. */
+	ages[ages_count] = (unsigned int)-1;
+
+	rewind(file);
+	count = 0;
+	gen = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "# Age=V%d_%d_%d",
+				&major, &minor, &revision);
+		if (ret == 3) {
+			ages[++gen] =
+				UNICODE_AGE(major, minor, revision);
+			if (verbose > 1)
+				printf(" Age V%d_%d_%d = gen %d\n",
+					major, minor, revision, gen);
+			if (!age_valid(major, minor, revision))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+		if (ret == 2) {
+			ages[++gen] = UNICODE_AGE(major, minor, 0);
+			if (verbose > 1)
+				printf(" Age V%d_%d = %d\n",
+					major, minor, gen);
+			if (!age_valid(major, minor, 0))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "%X..%X ; %d.%d #",
+			     &first, &last, &major, &minor);
+		if (ret == 4) {
+			for (unichar = first; unichar <= last; unichar++)
+				unicode_data[unichar].gen = gen;
+			count += 1 + last - first;
+			if (verbose > 1)
+				printf("  %X..%X gen %d\n", first, last, gen);
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(age_name, line);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
+		if (ret == 3) {
+			unicode_data[unichar].gen = gen;
+			count++;
+			if (verbose > 1)
+				printf("  %X gen %d\n", unichar, gen);
+			if (!utf32valid(unichar))
+				line_fail(age_name, line);
+			continue;
+		}
+	}
+	unicode_maxage = ages[gen];
+	fclose(file);
+
+	/* Nix surrogate block */
+	if (verbose > 1)
+		printf(" Removing surrogate block D800..DFFF\n");
+	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
+		unicode_data[unichar].gen = -1;
+
+	if (verbose > 0)
+	        printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(age_name);
+}
+
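+/*
+ * Parse DerivedCombiningClass.txt, which assigns a canonical
+ * combining class to code points or ranges, e.g. (illustrative
+ * sample line):
+ *
+ *   0300..0314    ; 230 # ...
+ */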
+static void ccc_init(void)
+{
+	FILE *file;
+	unsigned int first;
+	unsigned int last;
+	unsigned int unichar;
+	unsigned int value;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", ccc_name);
+
+	file = fopen(ccc_name, "r");
+	if (!file)
+		open_fail(ccc_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
+		if (ret == 3) {
+			for (unichar = first; unichar <= last; unichar++) {
+				unicode_data[unichar].ccc = value;
+				count++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X ccc %d\n", first, last, value);
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(ccc_name, line);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %d #", &unichar, &value);
+		if (ret == 2) {
+			unicode_data[unichar].ccc = value;
+			count++;
+			if (verbose > 1)
+				printf(" %X ccc %d\n", unichar, value);
+			if (!utf32valid(unichar))
+				line_fail(ccc_name, line);
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(ccc_name);
+}
+
+static int ignore_compatibility_form(char *type)
+{
+	int i;
+	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
+				 "final", "isolated", "circle", "super",
+				 "sub", "vertical", "wide", "narrow",
+				 "small", "square", "fraction", "compat"};
+
+	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
+		if (strcmp(type, ignored_types[i]) == 0)
+			return 1;
+	return 0;
+}
+
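+/*
+ * Parse UnicodeData.txt; field 5 carries the decomposition.  A sample
+ * line, for illustration:
+ *
+ *   00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;...
+ *
+ * Decompositions prefixed by a <tag> are compatibility forms; most of
+ * those are skipped via ignore_compatibility_form() above.
+ */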
+static void nfdi_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char *s;
+	char *type;
+	unsigned int *um;
+	int count;
+	int i;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", data_name);
+	file = fopen(data_name, "r");
+	if (!file)
+		open_fail(data_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+			     &unichar, buf0);
+		if (ret != 2)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(data_name, line);
+
+		s = buf0;
+		/* skip over <tag> */
+		if (*s == '<') {
+			type = ++s;
+			while (*++s != '>')
+				;
+			*s++ = '\0';
+			if (ignore_compatibility_form(type))
+				continue;
+		}
+		/* decode the decomposition into UTF-32 */
+		i = 0;
+		while (*s) {
+			mapping[i] = strtoul(s, &s, 16);
+			if (!utf32valid(mapping[i]))
+				line_fail(data_name, line);
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfdi = um;
+
+		if (verbose > 1)
+			print_utf32nfdi(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(data_name);
+}
+
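+/*
+ * Parse CaseFolding.txt; only the C (common) and F (full) folds are
+ * used.  A sample line, for illustration:
+ *
+ *   0041; C; 0061; # LATIN CAPITAL LETTER A
+ */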
+static void nfdicf_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char status;
+	char *s;
+	unsigned int *um;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", fold_name);
+	file = fopen(fold_name, "r");
+	if (!file)
+		open_fail(fold_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
+		if (ret != 3)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(fold_name, line);
+		/* Use the C+F casefold. */
+		if (status != 'C' && status != 'F')
+			continue;
+		s = buf0;
+		if (*s == '<')
+			while (*s++ != ' ')
+				;
+		i = 0;
+		while (*s) {
+			mapping[i] = strtoul(s, &s, 16);
+			if (!utf32valid(mapping[i]))
+				line_fail(fold_name, line);
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfdicf = um;
+
+		if (verbose > 1)
+			print_utf32nfdicf(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(fold_name);
+}
+
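+/*
+ * Parse DerivedCoreProperties.txt for Default_Ignorable_Code_Point
+ * entries, e.g. (illustrative sample line):
+ *
+ *   00AD          ; Default_Ignorable_Code_Point # Cf       SOFT HYPHEN
+ *
+ * Each such code point is given an empty decomposition so that it
+ * vanishes from normalized strings.
+ */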
+static void ignore_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int first;
+	unsigned int last;
+	unsigned int *um;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", prop_name);
+	file = fopen(prop_name, "r");
+	if (!file)
+		open_fail(prop_name, errno);
+	assert(file);
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
+		if (ret == 3) {
+			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+				continue;
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(prop_name, line);
+			for (unichar = first; unichar <= last; unichar++) {
+				free(unicode_data[unichar].utf32nfdi);
+				um = malloc(sizeof(unsigned int));
+				*um = 0;
+				unicode_data[unichar].utf32nfdi = um;
+				free(unicode_data[unichar].utf32nfdicf);
+				um = malloc(sizeof(unsigned int));
+				*um = 0;
+				unicode_data[unichar].utf32nfdicf = um;
+				count++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X Default_Ignorable_Code_Point\n",
+					first, last);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
+		if (ret == 2) {
+			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+				continue;
+			if (!utf32valid(unichar))
+				line_fail(prop_name, line);
+			free(unicode_data[unichar].utf32nfdi);
+			um = malloc(sizeof(unsigned int));
+			*um = 0;
+			unicode_data[unichar].utf32nfdi = um;
+			free(unicode_data[unichar].utf32nfdicf);
+			um = malloc(sizeof(unsigned int));
+			*um = 0;
+			unicode_data[unichar].utf32nfdicf = um;
+			if (verbose > 1)
+				printf(" %X Default_Ignorable_Code_Point\n",
+					unichar);
+			count++;
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(prop_name);
+}
+
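+/*
+ * Parse NormalizationCorrections.txt.  Lines have the form
+ *
+ *   <code>;<original decomposition>;<corrected decomposition>;<version>
+ *
+ * Note that buf0, the original (pre-correction) decomposition, is
+ * what gets stored here: trees whose maxage predates the correction
+ * must still serve the old mapping.
+ */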
+static void corrections_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int revision;
+	unsigned int age;
+	unsigned int *um;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char *s;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", norm_name);
+	file = fopen(norm_name, "r");
+	if (!file)
+		open_fail(norm_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		count++;
+	}
+	corrections = calloc(count, sizeof(struct unicode_data));
+	corrections_count = count;
+	rewind(file);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		corrections[count] = unicode_data[unichar];
+		assert(corrections[count].code == unichar);
+		age = UNICODE_AGE(major, minor, revision);
+		corrections[count].correction = age;
+
+		i = 0;
+		s = buf0;
+		while (*s) {
+			mapping[i] = strtoul(s, &s, 16);
+			if (!utf32valid(mapping[i]))
+				line_fail(norm_name, line);
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		corrections[count].utf32nfdi = um;
+
+		if (verbose > 1)
+			printf(" %X -> %s -> %s V%d_%d_%d\n",
+				unichar, buf0, buf1, major, minor, revision);
+		count++;
+	}
+	fclose(file);
+
+	if (verbose > 0)
+	        printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(norm_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ *   SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ *   LVIndex = (SIndex / TCount) * TCount
+ *   TIndex = (SIndex % TCount)
+ *   LVPart = SBase + LVIndex
+ *   TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   TIndex = (SIndex % TCount)
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *   if (TIndex == 0) {
+ *          d = <LPart, VPart>
+ *   } else {
+ *          TPart = TBase + TIndex
+ *          d = <LPart, VPart, TPart>
+ *   }
+ *
+ */
+
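+/*
+ * Worked example: for U+D55C, SIndex = 0xD55C - SBase = 10588, giving
+ * LIndex = 18, VIndex = 0, TIndex = 4, and thus the full decomposition
+ * <U+1112, U+1161, U+11AB>.
+ */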
+static void hangul_decompose(void)
+{
+	unsigned int sb = 0xAC00;
+	unsigned int lb = 0x1100;
+	unsigned int vb = 0x1161;
+	unsigned int tb = 0x11a7;
+	/* unsigned int lc = 19; */
+	unsigned int vc = 21;
+	unsigned int tc = 28;
+	unsigned int nc = (vc * tc);
+	/* unsigned int sc = (lc * nc); */
+	unsigned int unichar;
+	unsigned int mapping[4];
+	unsigned int *um;
+	int count;
+	int i;
+
+	if (verbose > 0)
+		printf("Decomposing hangul\n");
+	/* Hangul */
+	count = 0;
+	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
+		unsigned int si = unichar - sb;
+		unsigned int li = si / nc;
+		unsigned int vi = (si % nc) / tc;
+		unsigned int ti = si % tc;
+
+		i = 0;
+		mapping[i++] = lb + li;
+		mapping[i++] = vb + vi;
+		if (ti)
+			mapping[i++] = tb + ti;
+		mapping[i++] = 0;
+
+		assert(!unicode_data[unichar].utf32nfdi);
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfdi = um;
+
+		assert(!unicode_data[unichar].utf32nfdicf);
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfdicf = um;
+
+		/*
+		 * Add a cookie as a reminder that the hangul syllable
+		 * decompositions must not be stored in the generated
+		 * trie.
+		 */
+		unicode_data[unichar].utf8nfdi = malloc(2);
+		unicode_data[unichar].utf8nfdi[0] = HANGUL;
+		unicode_data[unichar].utf8nfdi[1] = '\0';
+
+		if (verbose > 1)
+			print_utf32nfdi(unichar);
+
+		count++;
+	}
+	if (verbose > 0)
+		printf("Created %d entries\n", count);
+}
+
+static void nfdi_decompose(void)
+{
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	unsigned int *um;
+	unsigned int *dc;
+	int count;
+	int i;
+	int j;
+	int ret;
+
+	if (verbose > 0)
+		printf("Decomposing nfdi\n");
+
+	count = 0;
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		if (!unicode_data[unichar].utf32nfdi)
+			continue;
+		for (;;) {
+			ret = 1;
+			i = 0;
+			um = unicode_data[unichar].utf32nfdi;
+			while (*um) {
+				dc = unicode_data[*um].utf32nfdi;
+				if (dc) {
+					for (j = 0; dc[j]; j++)
+						mapping[i++] = dc[j];
+					ret = 0;
+				} else {
+					mapping[i++] = *um;
+				}
+				um++;
+			}
+			mapping[i++] = 0;
+			if (ret)
+				break;
+			free(unicode_data[unichar].utf32nfdi);
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfdi = um;
+		}
+		/* Add this decomposition to nfdicf if there is no entry. */
+		if (!unicode_data[unichar].utf32nfdicf) {
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfdicf = um;
+		}
+		if (verbose > 1)
+			print_utf32nfdi(unichar);
+		count++;
+	}
+	if (verbose > 0)
+		printf("Processed %d entries\n", count);
+}
+
+static void nfdicf_decompose(void)
+{
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	unsigned int *um;
+	unsigned int *dc;
+	int count;
+	int i;
+	int j;
+	int ret;
+
+	if (verbose > 0)
+		printf("Decomposing nfdicf\n");
+	count = 0;
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		if (!unicode_data[unichar].utf32nfdicf)
+			continue;
+		for (;;) {
+			ret = 1;
+			i = 0;
+			um = unicode_data[unichar].utf32nfdicf;
+			while (*um) {
+				dc = unicode_data[*um].utf32nfdicf;
+				if (dc) {
+					for (j = 0; dc[j]; j++)
+						mapping[i++] = dc[j];
+					ret = 0;
+				} else {
+					mapping[i++] = *um;
+				}
+				um++;
+			}
+			mapping[i++] = 0;
+			if (ret)
+				break;
+			free(unicode_data[unichar].utf32nfdicf);
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfdicf = um;
+		}
+		if (verbose > 1)
+			print_utf32nfdicf(unichar);
+		count++;
+	}
+	if (verbose > 0)
+		printf("Processed %d entries\n", count);
+}
+
+/* ------------------------------------------------------------------ */
+
+int utf8agemax(struct tree *, const char *);
+int utf8nagemax(struct tree *, const char *, size_t);
+int utf8agemin(struct tree *, const char *);
+int utf8nagemin(struct tree *, const char *, size_t);
+ssize_t utf8len(struct tree *, const char *);
+ssize_t utf8nlen(struct tree *, const char *, size_t);
+struct utf8cursor;
+int utf8cursor(struct utf8cursor *, struct tree *, const char *);
+int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
+int utf8byte(struct utf8cursor *);
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ *   SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ *   LVIndex = (SIndex / TCount) * TCount
+ *   TIndex = (SIndex % TCount)
+ *   LVPart = SBase + LVIndex
+ *   TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   TIndex = (SIndex % TCount)
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *   if (TIndex == 0) {
+ *          d = <LPart, VPart>
+ *   } else {
+ *          TPart = TBase + TIndex
+ *          d = <LPart, VPart, TPart>
+ *   }
+ */
+
+/* Constants */
+#define SB	(0xAC00)
+#define LB	(0x1100)
+#define VB	(0x1161)
+#define TB	(0x11A7)
+#define LC	(19)
+#define VC	(21)
+#define TC	(28)
+#define NC	(VC * TC)
+#define SC	(LC * NC)
+
+/* Algorithmic decomposition of hangul syllable. */
+static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
+{
+	unsigned int	si;
+	unsigned int	li;
+	unsigned int	vi;
+	unsigned int	ti;
+	unsigned char	*h;
+
+	/* Calculate the SI, LI, VI, and TI values. */
+	si = utf8decode(str) - SB;
+	li = si / NC;
+	vi = (si % NC) / TC;
+	ti = si % TC;
+
+	/* Fill in base of leaf. */
+	h = hangul;
+	LEAF_GEN(h) = 2;
+	LEAF_CCC(h) = DECOMPOSE;
+	h += 2;
+
+	/* Add LPart, a 3-byte UTF-8 sequence. */
+	h += utf8encode((char *)h, li + LB);
+
+	/* Add VPart, a 3-byte UTF-8 sequence. */
+	h += utf8encode((char *)h, vi + VB);
+
+	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
+	if (ti)
+		h += utf8encode((char *)h, ti + TB);
+
+	/* Terminate string. */
+	h[0] = '\0';
+
+	return hangul;
+}
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point.  The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
+			       const char *s, size_t len)
+{
+	utf8trie_t	*trie;
+	int		offlen;
+	int		offset;
+	int		mask;
+	int		node;
+
+	if (!tree)
+		return NULL;
+	if (len == 0)
+		return NULL;
+	node = 1;
+	trie = utf8data + tree->index;
+	while (node) {
+		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+		if (*trie & NEXTBYTE) {
+			if (--len == 0)
+				return NULL;
+			s++;
+		}
+		mask = 1 << (*trie & BITNUM);
+		if (*s & mask) {
+			/* Right leg */
+			if (offlen) {
+				/* Right node at offset of trie */
+				node = (*trie & RIGHTNODE);
+				offset = trie[offlen];
+				while (--offlen) {
+					offset <<= 8;
+					offset |= trie[offlen];
+				}
+				trie += offset;
+			} else if (*trie & RIGHTPATH) {
+				/* Right node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			} else {
+				/* No right node. */
+				return NULL;
+			}
+		} else {
+			/* Left leg */
+			if (offlen) {
+				/* Left node after this node. */
+				node = (*trie & LEFTNODE);
+				trie += offlen + 1;
+			} else if (*trie & RIGHTPATH) {
+				/* No left node. */
+				return NULL;
+			} else {
+				/* Left node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			}
+		}
+	}
+	/*
+	 * Hangul decomposition is done algorithmically. These are the
+	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
+	 * always 3 bytes long, so s has been advanced twice, and the
+	 * start of the sequence is at s-2.
+	 */
+	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
+		trie = utf8hangul(s - 2, hangul);
+	return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to utf8nlookup().
+ */
+static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
+			      const char *s)
+{
+	return utf8nlookup(tree, hangul, s, (size_t)-1);
+}
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
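+ *
+ * For example, lead bytes 0x61 ('a'), 0xC3, 0xEA and 0xF0 imply
+ * lengths of 1, 2, 3 and 4 bytes respectively.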
+ */
+static inline int utf8clen(const char *s)
+{
+	unsigned char c = *s;
+	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int utf8agemax(struct tree *tree, const char *s)
+{
+	utf8leaf_t	*leaf;
+	int		age = 0;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+
+	while (*s) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age > age)
+			age = leaf_age;
+		s += utf8clen(s);
+	}
+	return age;
+}
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int utf8agemin(struct tree *tree, const char *s)
+{
+	utf8leaf_t	*leaf;
+	int		age;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+	age = tree->maxage;
+	while (*s) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age < age)
+			age = leaf_age;
+		s += utf8clen(s);
+	}
+	return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemax(struct tree *tree, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	int		age = 0;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age > age)
+			age = leaf_age;
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return age;
+}
+
+/*
+ * Minimum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemin(struct tree *tree, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	int		leaf_age;
+	int		age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+	age = tree->maxage;
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = ages[LEAF_GEN(leaf)];
+		if (leaf_age <= tree->maxage && leaf_age < age)
+			age = leaf_age;
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return age;
+}
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
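+ *
+ * For example, with an nfdi tree the 2-byte input U+00C5 (0xC3 0x85)
+ * normalizes to U+0041 U+030A, so utf8len() returns 3 (1 + 2 bytes).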
+ */
+ssize_t utf8len(struct tree *tree, const char *s)
+{
+	utf8leaf_t	*leaf;
+	size_t		ret = 0;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+	while (*s) {
+		leaf = utf8lookup(tree, hangul, s);
+		if (!leaf)
+			return -1;
+		if (ages[LEAF_GEN(leaf)] > tree->maxage)
+			ret += utf8clen(s);
+		else if (LEAF_CCC(leaf) == DECOMPOSE)
+			ret += strlen(LEAF_STR(leaf));
+		else
+			ret += utf8clen(s);
+		s += utf8clen(s);
+	}
+	return ret;
+}
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	size_t		ret = 0;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!tree)
+		return -1;
+	while (len && *s) {
+		leaf = utf8nlookup(tree, hangul, s, len);
+		if (!leaf)
+			return -1;
+		if (ages[LEAF_GEN(leaf)] > tree->maxage)
+			ret += utf8clen(s);
+		else if (LEAF_CCC(leaf) == DECOMPOSE)
+			ret += strlen(LEAF_STR(leaf));
+		else
+			ret += utf8clen(s);
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return ret;
+}
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+	struct tree	*tree;
+	const char	*s;
+	const char	*p;
+	const char	*ss;
+	const char	*sp;
+	unsigned int	len;
+	unsigned int	slen;
+	short int	ccc;
+	short int	nccc;
+	unsigned int	unichar;
+	unsigned char	hangul[UTF8HANGULLEAF];
+};
+
+/*
+ * Set up a utf8cursor for use by utf8byte().
+ *
+ *   s      : string.
+ *   len    : length of s.
+ *   u8c    : pointer to cursor.
+ *   trie   : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
+		size_t len)
+{
+	if (!tree)
+		return -1;
+	if (!s)
+		return -1;
+	u8c->tree = tree;
+	u8c->s = s;
+	u8c->p = NULL;
+	u8c->ss = NULL;
+	u8c->sp = NULL;
+	u8c->len = len;
+	u8c->slen = 0;
+	u8c->ccc = STOPPER;
+	u8c->nccc = STOPPER;
+	u8c->unichar = 0;
+	/* Check we didn't clobber the maximum length. */
+	if (u8c->len != len)
+		return -1;
+	/* The first byte of s must not be a UTF-8 continuation byte. */
+	if (len > 0 && (*s & 0xC0) == 0x80)
+		return -1;
+	return 0;
+}
+
+/*
+ * Set up a utf8cursor for use by utf8byte().
+ *
+ *   s      : NUL-terminated string.
+ *   u8c    : pointer to cursor.
+ *   trie   : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
+{
+	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
+}
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on success, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string.  The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan.  The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ *  u8c->p  != NULL -> a decomposition is being scanned.
+ *  u8c->ss != NULL -> this is a repeating scan.
+ *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
+ */
+int utf8byte(struct utf8cursor *u8c)
+{
+	utf8leaf_t *leaf;
+	int ccc;
+
+	for (;;) {
+		/* Check for the end of a decomposed character. */
+		if (u8c->p && *u8c->s == '\0') {
+			u8c->s = u8c->p;
+			u8c->p = NULL;
+		}
+
+		/* Check for end-of-string. */
+		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+			/* There is no next byte. */
+			if (u8c->ccc == STOPPER)
+				return 0;
+			/* End-of-string during a scan counts as a stopper. */
+			ccc = STOPPER;
+			goto ccc_mismatch;
+		} else if ((*u8c->s & 0xC0) == 0x80) {
+			/* This is a continuation of the current character. */
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Look up the data for the current character. */
+		if (u8c->p) {
+			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+		} else {
+			leaf = utf8nlookup(u8c->tree, u8c->hangul,
+					   u8c->s, u8c->len);
+		}
+
+		/* No leaf found implies that the input is a binary blob. */
+		if (!leaf)
+			return -1;
+
+		/* Characters that are too new have CCC 0. */
+		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
+			ccc = STOPPER;
+		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
+			u8c->len -= utf8clen(u8c->s);
+			u8c->p = u8c->s + utf8clen(u8c->s);
+			u8c->s = LEAF_STR(leaf);
+			/* Empty decomposition implies CCC 0. */
+			if (*u8c->s == '\0') {
+				if (u8c->ccc == STOPPER)
+					continue;
+				ccc = STOPPER;
+				goto ccc_mismatch;
+			}
+			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+			ccc = LEAF_CCC(leaf);
+		}
+		u8c->unichar = utf8decode(u8c->s);
+
+		/*
+		 * If this is not a stopper, then see if it updates
+		 * the next canonical class to be emitted.
+		 */
+		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+			u8c->nccc = ccc;
+
+		/*
+		 * Return the current byte if this is the current
+		 * combining class.
+		 */
+		if (ccc == u8c->ccc) {
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Current combining class mismatch. */
+	ccc_mismatch:
+		if (u8c->nccc == STOPPER) {
+			/*
+			 * Scan forward for the first canonical class
+			 * to be emitted.  Save the position from
+			 * which to restart.
+			 */
+			assert(u8c->ccc == STOPPER);
+			u8c->ccc = MINCCC - 1;
+			u8c->nccc = ccc;
+			u8c->sp = u8c->p;
+			u8c->ss = u8c->s;
+			u8c->slen = u8c->len;
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (ccc != STOPPER) {
+			/* Not a stopper, and not the ccc we're emitting. */
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (u8c->nccc != MAXCCC + 1) {
+			/* At a stopper, restart for next ccc. */
+			u8c->ccc = u8c->nccc;
+			u8c->nccc = MAXCCC + 1;
+			u8c->s = u8c->ss;
+			u8c->p = u8c->sp;
+			u8c->len = u8c->slen;
+		} else {
+			/* All done, proceed from here. */
+			u8c->ccc = STOPPER;
+			u8c->nccc = STOPPER;
+			u8c->sp = NULL;
+			u8c->ss = NULL;
+			u8c->slen = 0;
+		}
+	}
+}
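+
+/*
+ * Worked example of the reordering above: for the input sequence
+ * U+0061 U+0307 U+0323 ("a", dot above with CCC 230, dot below with
+ * CCC 220) the bytes are emitted as U+0061 U+0323 U+0307, i.e. the
+ * combining marks come out sorted by canonical combining class.
+ */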
+
+/* ------------------------------------------------------------------ */
+
+static int normalize_line(struct tree *tree)
+{
+	char *s;
+	char *t;
+	int c;
+	struct utf8cursor u8c;
+
+	/* First test: null-terminated string. */
+	s = buf2;
+	t = buf3;
+	if (utf8cursor(&u8c, tree, s))
+		return -1;
+	while ((c = utf8byte(&u8c)) > 0)
+		if (c != (unsigned char)*t++)
+			return -1;
+	if (c < 0)
+		return -1;
+	if (*t != 0)
+		return -1;
+
+	/* Second test: length-limited string. */
+	s = buf2;
+	/* Plant a byte after the NUL that will cause an error if read. */
+	s[strlen(s) + 1] = -1;
+	t = buf3;
+	if (utf8ncursor(&u8c, tree, s, strlen(s)))
+		return -1;
+	while ((c = utf8byte(&u8c)) > 0)
+		if (c != (unsigned char)*t++)
+			return -1;
+	if (c < 0)
+		return -1;
+	if (*t != 0)
+		return -1;
+
+	return 0;
+}
+
+static void normalization_test(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	struct unicode_data *data;
+	char *s;
+	char *t;
+	int ret;
+	int ignorables;
+	int tests = 0;
+	int failures = 0;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", test_name);
+	/* Step one, read data from file. */
+	file = fopen(test_name, "r");
+	if (!file)
+		open_fail(test_name, errno);
+
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
+			     buf0, buf1);
+		if (ret != 2 || *line == '#')
+			continue;
+		s = buf0;
+		t = buf2;
+		while (*s) {
+			unichar = strtoul(s, &s, 16);
+			t += utf8encode(t, unichar);
+		}
+		*t = '\0';
+
+		ignorables = 0;
+		s = buf1;
+		t = buf3;
+		while (*s) {
+			unichar = strtoul(s, &s, 16);
+			data = &unicode_data[unichar];
+			if (data->utf8nfdi && !*data->utf8nfdi)
+				ignorables = 1;
+			else
+				t += utf8encode(t, unichar);
+		}
+		*t = '\0';
+
+		tests++;
+		if (normalize_line(nfdi_tree) < 0) {
+			printf("Line %s -> %s", buf0, buf1);
+			if (ignorables)
+				printf(" (ignorables removed)");
+			printf(" failure\n");
+			failures++;
+		}
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Ran %d tests with %d failures\n", tests, failures);
+	if (failures)
+		file_fail(test_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void write_file(void)
+{
+	FILE *file;
+	int i;
+	int j;
+	int t;
+	int gen;
+
+	if (verbose > 0)
+		printf("Writing %s\n", utf8_name);
+	file = fopen(utf8_name, "w");
+	if (!file)
+		open_fail(utf8_name, errno);
+
+	fprintf(file, "/* This file is generated code, do not edit. */\n");
+	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
+	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
+	fprintf(file, "#endif\n");
+	fprintf(file, "\n");
+	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
+		unicode_maxage);
+	fprintf(file, "\n");
+	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
+	for (i = 0; i != ages_count; i++)
+		fprintf(file, "\t%#x%s\n", ages[i],
+			ages[i] == unicode_maxage ? "" : ",");
+	fprintf(file, "};\n");
+	fprintf(file, "\n");
+	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
+	t = 0;
+	for (gen = 0; gen < ages_count; gen++) {
+		fprintf(file, "\t{ %#x, %d }%s\n",
+			ages[gen], trees[t].index,
+			ages[gen] == unicode_maxage ? "" : ",");
+		if (trees[t].maxage == ages[gen])
+			t += 2;
+	}
+	fprintf(file, "};\n");
+	fprintf(file, "\n");
+	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
+	t = 1;
+	for (gen = 0; gen < ages_count; gen++) {
+		fprintf(file, "\t{ %#x, %d }%s\n",
+			ages[gen], trees[t].index,
+			ages[gen] == unicode_maxage ? "" : ",");
+		if (trees[t].maxage == ages[gen])
+			t += 2;
+	}
+	fprintf(file, "};\n");
+	fprintf(file, "\n");
+	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
+		utf8data_size);
+	t = 0;
+	for (i = 0; i != utf8data_size; i += 16) {
+		if (i == trees[t].index) {
+			fprintf(file, "\t/* %s_%x */\n",
+				trees[t].type, trees[t].maxage);
+			if (t < trees_count-1)
+				t++;
+		}
+		fprintf(file, "\t");
+		for (j = i; j != i + 16; j++)
+			fprintf(file, "0x%.2x%s", utf8data[j],
+				(j < utf8data_size - 1 ? "," : ""));
+		fprintf(file, "\n");
+	}
+	fprintf(file, "};\n");
+	fclose(file);
+}
+
+/* ------------------------------------------------------------------ */
+
+int main(int argc, char *argv[])
+{
+	unsigned int unichar;
+	int opt;
+
+	argv0 = argv[0];
+
+	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
+		switch (opt) {
+		case 'a':
+			age_name = optarg;
+			break;
+		case 'c':
+			ccc_name = optarg;
+			break;
+		case 'd':
+			data_name = optarg;
+			break;
+		case 'f':
+			fold_name = optarg;
+			break;
+		case 'n':
+			norm_name = optarg;
+			break;
+		case 'o':
+			utf8_name = optarg;
+			break;
+		case 'p':
+			prop_name = optarg;
+			break;
+		case 't':
+			test_name = optarg;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'h':
+			help();
+			exit(0);
+		default:
+			usage();
+		}
+	}
+
+	if (verbose > 1)
+		help();
+	for (unichar = 0; unichar != 0x110000; unichar++)
+		unicode_data[unichar].code = unichar;
+	age_init();
+	ccc_init();
+	nfdi_init();
+	nfdicf_init();
+	ignore_init();
+	corrections_init();
+	hangul_decompose();
+	nfdi_decompose();
+	nfdicf_decompose();
+	utf8_init();
+	trees_init();
+	trees_populate();
+	trees_reduce();
+	trees_verify();
+	/* Prevent "unused function" warning. */
+	(void)lookup(nfdi_tree, " ");
+	if (verbose > 2)
+		tree_walk(nfdi_tree);
+	if (verbose > 2)
+		tree_walk(nfdicf_tree);
+	normalization_test();
+	write_file();
+
+	return 0;
+}
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 0000000..2a878b7
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+
+#include "utf8n.h"
+
+int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+
+	if (utf8nlen(data, str->name, str->len) < 0)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(utf8_validate);
+
+int utf8_strncmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncmp);
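+
+/*
+ * Because both strings pass through the normalizer, utf8_strncmp()
+ * treats canonically equivalent forms as equal: for example, U+00C5
+ * and the sequence U+0041 U+030A compare equal, as both normalize to
+ * the same NFD byte sequence.
+ */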
+
+int utf8_strncasecmp(const struct unicode_map *um,
+		     const struct qstr *s1, const struct qstr *s2)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp);
+
+/*
+ * String cf is expected to be a valid UTF-8 casefolded string.
+ */
+int utf8_strncasecmp_folded(const struct unicode_map *um,
+			    const struct qstr *cf,
+			    const struct qstr *s1)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur1;
+	int c1, c2;
+	int i = 0;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = cf->name[i++];
+		if (c1 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp_folded);
+
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+		  unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur;
+	size_t nlen = 0;
+
+	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		int c = utf8byte(&cur);
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+		if (c == -1)
+			break;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(utf8_casefold);
+
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+		   unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+	struct utf8cursor cur;
+	ssize_t nlen = 0;
+
+	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		int c = utf8byte(&cur);
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+		if (c == -1)
+			break;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(utf8_normalize);
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	char version_string[12];
+	static const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+
+	/* strscpy() NUL-terminates and reports over-long input. */
+	if (strscpy(version_string, version, sizeof(version_string)) < 0)
+		return -EINVAL;
+
+	if (match_token(version_string, token, args) != 1)
+		return -EINVAL;
+
+	if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+	    match_int(&args[2], rev))
+		return -EINVAL;
+
+	return 0;
+}
+
+struct unicode_map *utf8_load(const char *version)
+{
+	struct unicode_map *um = NULL;
+	int unicode_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		unicode_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		unicode_version = utf8version_latest();
+		printk(KERN_WARNING "UTF-8 version not specified. "
+		       "Assuming latest supported version (%d.%d.%d).",
+		       (unicode_version >> 16) & 0xff,
+		       (unicode_version >> 8) & 0xff,
+		       (unicode_version & 0xff));
+	}
+
+	um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+	if (!um)
+		return ERR_PTR(-ENOMEM);
+
+	um->charset = "UTF-8";
+	um->version = unicode_version;
+
+	return um;
+}
+EXPORT_SYMBOL(utf8_load);
+
+void utf8_unload(struct unicode_map *um)
+{
+	kfree(um);
+}
+EXPORT_SYMBOL(utf8_unload);
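+
+/*
+ * Typical usage, as a sketch (the version string is only an example):
+ *
+ *	struct unicode_map *um = utf8_load("12.1.0");
+ *
+ *	if (IS_ERR(um))
+ *		return PTR_ERR(um);
+ *	...
+ *	utf8_unload(um);
+ */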
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
new file mode 100644
index 0000000..1d2d2e5
--- /dev/null
+++ b/fs/unicode/utf8-norm.c
@@ -0,0 +1,792 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ */
+
+#include "utf8n.h"
+
+struct utf8data {
+	unsigned int maxage;
+	unsigned int offset;
+};
+
+#define __INCLUDED_FROM_UTF8NORM_C__
+#include "utf8data.h"
+#undef __INCLUDED_FROM_UTF8NORM_C__
+
+int utf8version_is_supported(u8 maj, u8 min, u8 rev)
+{
+	int i = ARRAY_SIZE(utf8agetab) - 1;
+	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
+
+	while (i >= 0 && utf8agetab[i] != 0) {
+		if (sb_utf8version == utf8agetab[i])
+			return 1;
+		i--;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(utf8version_is_supported);
+
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
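+
+/*
+ * Ages are packed one byte per component (major, minor, revision) by
+ * UNICODE_AGE(): 12.1.0, for example, is stored as 0x0c0100.  The
+ * utf8agetab[] and utf8vers values above use this encoding.
+ */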
+
+/*
+ * UTF-8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32-bit value is to be used.  A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values.  This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ *          0 -     0x7F: 0                   - 0x7F
+ *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
+ *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
+ *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
+ *
+ * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same as a single UTF-32 character.  This makes the UTF-8
+ * representation of Unicode never larger than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ *    Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ *    http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
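+/*
+ * Example of the shortest-form rule: U+0020 (SPACE) must be encoded as
+ * the single byte 0x20.  The two-byte sequence 0xC0 0xA0 carries the
+ * same value but is overlong, so a decoder must reject it.
+ */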
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
+ */
+static inline int utf8clen(const char *s)
+{
+	unsigned char c = *s;
+
+	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
+}
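+/*
+ * For example, a leading byte of 0xE4 satisfies c >= 0xC0 and c >= 0xE0
+ * but not c >= 0xF0, so utf8clen() returns 1 + 1 + 1 + 0 == 3, matching
+ * the three-byte rows in the table above.
+ */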
+
+/*
+ * Decode a 3-byte UTF-8 sequence.
+ */
+static unsigned int
+utf8decode3(const char *str)
+{
+	unsigned int		uc;
+
+	uc = *str++ & 0x0F;
+	uc <<= 6;
+	uc |= *str++ & 0x3F;
+	uc <<= 6;
+	uc |= *str++ & 0x3F;
+
+	return uc;
+}
+
+/*
+ * Encode a 3-byte UTF-8 sequence.
+ */
+static int
+utf8encode3(char *str, unsigned int val)
+{
+	str[2] = (val & 0x3F) | 0x80;
+	val >>= 6;
+	str[1] = (val & 0x3F) | 0x80;
+	val >>= 6;
+	str[0] = val | 0xE0;
+
+	return 3;
+}
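+/*
+ * Example round trip: utf8encode3(str, 0x1100) stores 0xE1 0x84 0x80
+ * (HANGUL CHOSEONG KIYEOK), and utf8decode3() on those three bytes
+ * yields 0x1100 again.
+ */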
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree.  The first byte contains the
+ * following information:
+ *  NEXTBYTE  - flag        - advance to next byte if set
+ *  BITNUM    - 3 bit field - the bit number to be tested
+ *  OFFLEN    - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ *  RIGHTPATH - 1 bit field - set if the following node is for the
+ *                            right-hand path (tested bit is set)
+ *  TRIENODE  - 1 bit field - set if the following node is an internal
+ *                            node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
+ *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef const unsigned char utf8trie_t;
+#define BITNUM		0x07
+#define NEXTBYTE	0x08
+#define OFFLEN		0x30
+#define OFFLEN_SHIFT	4
+#define RIGHTPATH	0x40
+#define TRIENODE	0x80
+#define RIGHTNODE	0x40
+#define LEFTNODE	0x80
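+
+/*
+ * Worked example: a node byte of 0x8B has BITNUM == 3, NEXTBYTE set and
+ * OFFLEN == 0, i.e. a non-branching node that advances to the next input
+ * byte and tests its bit 3; RIGHTPATH is clear and TRIENODE is set, so
+ * the node that follows serves the left-hand path and is itself an
+ * internal node.  (Illustrative decoding only; the real nodes live in
+ * the generated utf8data[] array.)
+ */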
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so share the
+ * same underlying datatype: unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ *          an index into utf8agetab[].  With this we can filter code
+ *          points based on the unicode version in which they were
+ *          defined.  The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ *          to do a stable sort into ascending order of all characters
+ *          with a non-zero CCC that occur between two characters with
+ *          a CCC of 0, or at the beginning or end of a string.
+ *          The unicode standard guarantees that all CCC values are
+ *          between 0 and 254 inclusive, which leaves 255 available as
+ *          a special value.
+ *          Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ *          start of a NUL-terminated string that is the decomposition
+ *          of the character.
+ *          The CCC of a decomposable character is the same as the CCC
+ *          of the first character of its decomposition.
+ *          Some characters decompose as the empty string: these are
+ *          characters with the Default_Ignorable_Code_Point property.
+ *          These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded, with the
+ * exception of Hangul syllables, which are decomposed algorithmically.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ *
+ * The trie is constructed in such a way that leaves exist for all
+ * UTF-8 sequences that match the criteria from the "UTF-8 valid
+ * ranges" comment above, and only for those sequences.  Therefore a
+ * lookup in the trie can be used to validate the UTF-8 input.
+ */
+typedef const unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF)	((LEAF)[0])
+#define LEAF_CCC(LEAF)	((LEAF)[1])
+#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))
+
+#define MINCCC		(0)
+#define MAXCCC		(254)
+#define STOPPER		(0)
+#define DECOMPOSE	(255)
+
+/* Marker for hangul syllable decomposition. */
+#define HANGUL		((char)(255))
+/* Size of the synthesized leaf used for Hangul syllable decomposition. */
+#define UTF8HANGULLEAF	(12)
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ *   SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ *   LVIndex = (SIndex / TCount) * TCount
+ *   TIndex = (SIndex % TCount)
+ *   LVPart = SBase + LVIndex
+ *   TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (SIndex % NCount) / TCount
+ *   TIndex = (SIndex % TCount)
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *   if (TIndex == 0) {
+ *          d = <LPart, VPart>
+ *   } else {
+ *          TPart = TBase + TIndex
+ *          d = <LPart, TPart, VPart>
+ *   }
+ */
+
+/* Constants */
+#define SB	(0xAC00)
+#define LB	(0x1100)
+#define VB	(0x1161)
+#define TB	(0x11A7)
+#define LC	(19)
+#define VC	(21)
+#define TC	(28)
+#define NC	(VC * TC)
+#define SC	(LC * NC)
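+
+/*
+ * Worked example for U+AC01 (HANGUL SYLLABLE GAG): si = 0xAC01 - SB = 1,
+ * so li = 1 / NC = 0, vi = (1 % NC) / TC = 0 and ti = 1 % TC = 1, giving
+ * the decomposition U+1100 (LB + 0), U+1161 (VB + 0), U+11A8 (TB + 1).
+ */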
+
+/* Algorithmic decomposition of hangul syllable. */
+static utf8leaf_t *
+utf8hangul(const char *str, unsigned char *hangul)
+{
+	unsigned int	si;
+	unsigned int	li;
+	unsigned int	vi;
+	unsigned int	ti;
+	unsigned char	*h;
+
+	/* Calculate the SI, LI, VI, and TI values. */
+	si = utf8decode3(str) - SB;
+	li = si / NC;
+	vi = (si % NC) / TC;
+	ti = si % TC;
+
+	/* Fill in base of leaf. */
+	h = hangul;
+	LEAF_GEN(h) = 2;
+	LEAF_CCC(h) = DECOMPOSE;
+	h += 2;
+
+	/* Add LPart, a 3-byte UTF-8 sequence. */
+	h += utf8encode3((char *)h, li + LB);
+
+	/* Add VPart, a 3-byte UTF-8 sequence. */
+	h += utf8encode3((char *)h, vi + VB);
+
+	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
+	if (ti)
+		h += utf8encode3((char *)h, ti + TB);
+
+	/* Terminate string. */
+	h[0] = '\0';
+
+	return hangul;
+}
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point.  The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+static utf8leaf_t *utf8nlookup(const struct utf8data *data,
+			       unsigned char *hangul, const char *s, size_t len)
+{
+	utf8trie_t	*trie = NULL;
+	int		offlen;
+	int		offset;
+	int		mask;
+	int		node;
+
+	if (!data)
+		return NULL;
+	if (len == 0)
+		return NULL;
+
+	trie = utf8data + data->offset;
+	node = 1;
+	while (node) {
+		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+		if (*trie & NEXTBYTE) {
+			if (--len == 0)
+				return NULL;
+			s++;
+		}
+		mask = 1 << (*trie & BITNUM);
+		if (*s & mask) {
+			/* Right leg */
+			if (offlen) {
+				/* Right node at offset of trie */
+				node = (*trie & RIGHTNODE);
+				offset = trie[offlen];
+				while (--offlen) {
+					offset <<= 8;
+					offset |= trie[offlen];
+				}
+				trie += offset;
+			} else if (*trie & RIGHTPATH) {
+				/* Right node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			} else {
+				/* No right node. */
+				return NULL;
+			}
+		} else {
+			/* Left leg */
+			if (offlen) {
+				/* Left node after this node. */
+				node = (*trie & LEFTNODE);
+				trie += offlen + 1;
+			} else if (*trie & RIGHTPATH) {
+				/* No left node. */
+				return NULL;
+			} else {
+				/* Left node after this node */
+				node = (*trie & TRIENODE);
+				trie++;
+			}
+		}
+	}
+	/*
+	 * Hangul decomposition is done algorithmically. These are the
+	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
+	 * always 3 bytes long, so s has been advanced twice, and the
+	 * start of the sequence is at s-2.
+	 */
+	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
+		trie = utf8hangul(s - 2, hangul);
+	return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to utf8nlookup().
+ */
+static utf8leaf_t *utf8lookup(const struct utf8data *data,
+			      unsigned char *hangul, const char *s)
+{
+	return utf8nlookup(data, hangul, s, (size_t)-1);
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int utf8agemax(const struct utf8data *data, const char *s)
+{
+	utf8leaf_t	*leaf;
+	int		age = 0;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+
+	while (*s) {
+		leaf = utf8lookup(data, hangul, s);
+		if (!leaf)
+			return -1;
+
+		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		if (leaf_age <= data->maxage && leaf_age > age)
+			age = leaf_age;
+		s += utf8clen(s);
+	}
+	return age;
+}
+EXPORT_SYMBOL(utf8agemax);
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int utf8agemin(const struct utf8data *data, const char *s)
+{
+	utf8leaf_t	*leaf;
+	int		age;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+	age = data->maxage;
+	while (*s) {
+		leaf = utf8lookup(data, hangul, s);
+		if (!leaf)
+			return -1;
+		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		if (leaf_age <= data->maxage && leaf_age < age)
+			age = leaf_age;
+		s += utf8clen(s);
+	}
+	return age;
+}
+EXPORT_SYMBOL(utf8agemin);
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	int		age = 0;
+	int		leaf_age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+
+	while (len && *s) {
+		leaf = utf8nlookup(data, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		if (leaf_age <= data->maxage && leaf_age > age)
+			age = leaf_age;
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return age;
+}
+EXPORT_SYMBOL(utf8nagemax);
+
+/*
+ * Minimum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	int		leaf_age;
+	int		age;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+	age = data->maxage;
+	while (len && *s) {
+		leaf = utf8nlookup(data, hangul, s, len);
+		if (!leaf)
+			return -1;
+		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		if (leaf_age <= data->maxage && leaf_age < age)
+			age = leaf_age;
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return age;
+}
+EXPORT_SYMBOL(utf8nagemin);
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
+ */
+ssize_t utf8len(const struct utf8data *data, const char *s)
+{
+	utf8leaf_t	*leaf;
+	size_t		ret = 0;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+	while (*s) {
+		leaf = utf8lookup(data, hangul, s);
+		if (!leaf)
+			return -1;
+		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+			ret += utf8clen(s);
+		else if (LEAF_CCC(leaf) == DECOMPOSE)
+			ret += strlen(LEAF_STR(leaf));
+		else
+			ret += utf8clen(s);
+		s += utf8clen(s);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(utf8len);
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
+{
+	utf8leaf_t	*leaf;
+	size_t		ret = 0;
+	unsigned char	hangul[UTF8HANGULLEAF];
+
+	if (!data)
+		return -1;
+	while (len && *s) {
+		leaf = utf8nlookup(data, hangul, s, len);
+		if (!leaf)
+			return -1;
+		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+			ret += utf8clen(s);
+		else if (LEAF_CCC(leaf) == DECOMPOSE)
+			ret += strlen(LEAF_STR(leaf));
+		else
+			ret += utf8clen(s);
+		len -= utf8clen(s);
+		s += utf8clen(s);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(utf8nlen);
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   u8c    : pointer to cursor.
+ *   data   : const struct utf8data to use for normalization.
+ *   s      : string.
+ *   len    : length of s.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
+		const char *s, size_t len)
+{
+	if (!data)
+		return -1;
+	if (!s)
+		return -1;
+	u8c->data = data;
+	u8c->s = s;
+	u8c->p = NULL;
+	u8c->ss = NULL;
+	u8c->sp = NULL;
+	u8c->len = len;
+	u8c->slen = 0;
+	u8c->ccc = STOPPER;
+	u8c->nccc = STOPPER;
+	/* Check that len was not truncated when stored in the cursor. */
+	if (u8c->len != len)
+		return -1;
+	/* The first byte of s must not be a UTF-8 continuation byte. */
+	if (len > 0 && (*s & 0xC0) == 0x80)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(utf8ncursor);
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   u8c    : pointer to cursor.
+ *   data   : const struct utf8data to use for normalization.
+ *   s      : NUL-terminated string.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
+	       const char *s)
+{
+	return utf8ncursor(u8c, data, s, (unsigned int)-1);
+}
+EXPORT_SYMBOL(utf8cursor);
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on success, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string.  The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan.  The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ *  u8c->p  != NULL -> a decomposition is being scanned.
+ *  u8c->ss != NULL -> this is a repeating scan.
+ *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
+ */
+int utf8byte(struct utf8cursor *u8c)
+{
+	utf8leaf_t *leaf;
+	int ccc;
+
+	for (;;) {
+		/* Check for the end of a decomposed character. */
+		if (u8c->p && *u8c->s == '\0') {
+			u8c->s = u8c->p;
+			u8c->p = NULL;
+		}
+
+		/* Check for end-of-string. */
+		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+			/* There is no next byte. */
+			if (u8c->ccc == STOPPER)
+				return 0;
+			/* End-of-string during a scan counts as a stopper. */
+			ccc = STOPPER;
+			goto ccc_mismatch;
+		} else if ((*u8c->s & 0xC0) == 0x80) {
+			/* This is a continuation of the current character. */
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Look up the data for the current character. */
+		if (u8c->p) {
+			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
+		} else {
+			leaf = utf8nlookup(u8c->data, u8c->hangul,
+					   u8c->s, u8c->len);
+		}
+
+		/* No leaf found implies that the input is a binary blob. */
+		if (!leaf)
+			return -1;
+
+		ccc = LEAF_CCC(leaf);
+		/* Characters that are too new have CCC 0. */
+		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
+			ccc = STOPPER;
+		} else if (ccc == DECOMPOSE) {
+			u8c->len -= utf8clen(u8c->s);
+			u8c->p = u8c->s + utf8clen(u8c->s);
+			u8c->s = LEAF_STR(leaf);
+			/* Empty decomposition implies CCC 0. */
+			if (*u8c->s == '\0') {
+				if (u8c->ccc == STOPPER)
+					continue;
+				ccc = STOPPER;
+				goto ccc_mismatch;
+			}
+
+			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
+			if (!leaf)
+				return -1;
+			ccc = LEAF_CCC(leaf);
+		}
+
+		/*
+		 * If this is not a stopper, then see if it updates
+		 * the next canonical class to be emitted.
+		 */
+		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+			u8c->nccc = ccc;
+
+		/*
+		 * Return the current byte if this is the current
+		 * combining class.
+		 */
+		if (ccc == u8c->ccc) {
+			if (!u8c->p)
+				u8c->len--;
+			return (unsigned char)*u8c->s++;
+		}
+
+		/* Current combining class mismatch. */
+ccc_mismatch:
+		if (u8c->nccc == STOPPER) {
+			/*
+			 * Scan forward for the first canonical class
+			 * to be emitted.  Save the position from
+			 * which to restart.
+			 */
+			u8c->ccc = MINCCC - 1;
+			u8c->nccc = ccc;
+			u8c->sp = u8c->p;
+			u8c->ss = u8c->s;
+			u8c->slen = u8c->len;
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (ccc != STOPPER) {
+			/* Not a stopper, and not the ccc we're emitting. */
+			if (!u8c->p)
+				u8c->len -= utf8clen(u8c->s);
+			u8c->s += utf8clen(u8c->s);
+		} else if (u8c->nccc != MAXCCC + 1) {
+			/* At a stopper, restart for next ccc. */
+			u8c->ccc = u8c->nccc;
+			u8c->nccc = MAXCCC + 1;
+			u8c->s = u8c->ss;
+			u8c->p = u8c->sp;
+			u8c->len = u8c->slen;
+		} else {
+			/* All done, proceed from here. */
+			u8c->ccc = STOPPER;
+			u8c->nccc = STOPPER;
+			u8c->sp = NULL;
+			u8c->ss = NULL;
+			u8c->slen = 0;
+		}
+	}
+}
+EXPORT_SYMBOL(utf8byte);
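+
+/*
+ * Typical consumption loop (sketch only; emit() stands in for whatever
+ * the caller does with each normalized byte):
+ *
+ *	struct utf8cursor u8c;
+ *	int c;
+ *
+ *	if (utf8cursor(&u8c, data, s) < 0)
+ *		return -EINVAL;
+ *	while ((c = utf8byte(&u8c)) > 0)
+ *		emit(c);
+ *	if (c < 0)
+ *		return -EINVAL;
+ */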
+
+const struct utf8data *utf8nfdi(unsigned int maxage)
+{
+	int i = ARRAY_SIZE(utf8nfdidata) - 1;
+
+	while (maxage < utf8nfdidata[i].maxage)
+		i--;
+	if (maxage > utf8nfdidata[i].maxage)
+		return NULL;
+	return &utf8nfdidata[i];
+}
+EXPORT_SYMBOL(utf8nfdi);
+
+const struct utf8data *utf8nfdicf(unsigned int maxage)
+{
+	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
+
+	while (maxage < utf8nfdicfdata[i].maxage)
+		i--;
+	if (maxage > utf8nfdicfdata[i].maxage)
+		return NULL;
+	return &utf8nfdicfdata[i];
+}
+EXPORT_SYMBOL(utf8nfdicf);
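+
+/*
+ * Note that both lookups above require an exact table hit: for example,
+ * utf8nfdi(UNICODE_AGE(12, 1, 0)) returns the utf8nfdidata[] entry whose
+ * maxage is exactly 0xc0100, while an age that is absent from
+ * utf8agetab[] yields NULL rather than the nearest supported version.
+ */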
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
new file mode 100644
index 0000000..6fe8af7
--- /dev/null
+++ b/fs/unicode/utf8-selftest.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel module for testing utf-8 support.
+ *
+ * Copyright 2017 Collabora Ltd.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/unicode.h>
+#include <linux/dcache.h>
+
+#include "utf8n.h"
+
+unsigned int failed_tests;
+unsigned int total_tests;
+
+/* Tests will be based on this version. */
+#define latest_maj 12
+#define latest_min 1
+#define latest_rev 0
+
+#define _test(cond, func, line, fmt, ...) do {				\
+		total_tests++;						\
+		if (!(cond)) {						\
+			failed_tests++;					\
+			pr_err("test %s:%d Failed: %s%s\n",		\
+			       func, line, #cond, (fmt ? ":" : "."));	\
+			if (fmt)					\
+				pr_err(fmt, ##__VA_ARGS__);		\
+		}							\
+	} while (0)
+#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define test(cond) _test(cond, __func__, __LINE__, "")
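+/*
+ * Usage (illustrative): test(x == y) only counts the pass/fail result,
+ * while test_f(x == y, "x was %d\n", x) also logs the extra context on
+ * failure.
+ */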
+
+static const struct {
+	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
+	unsigned char str[10];
+	unsigned char dec[10];
+} nfdi_test_data[] = {
+	/* Trivial sequence */
+	{
+		/* "ABba" decomposes to itself */
+		.str = "aBba",
+		.dec = "aBba",
+	},
+	/* Simple equivalent sequences */
+	{
+		/* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
+		   'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' under
+		   canonical decomposition */
+		.str = {0xc2, 0xbc, 0x00},
+		.dec = {0xc2, 0xbc, 0x00},
+	},
+	{
+		/* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
+		   'LETTER A' + 'COMBINING DIAERESIS' */
+		.str = {0xc3, 0xa4, 0x00},
+		.dec = {0x61, 0xcc, 0x88, 0x00},
+	},
+	{
+		/* 'LATIN SMALL LETTER LJ' can't decompose to
+		   'LETTER L' + 'LETTER J' under canonical decomposition */
+		.str = {0xC7, 0x89, 0x00},
+		.dec = {0xC7, 0x89, 0x00},
+	},
+	{
+		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
+		.str = {0xCE, 0x87, 0x00},
+		.dec = {0xC2, 0xB7, 0x00}
+	},
+	/* Canonical ordering */
+	{
+		/* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
+		   to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
+		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
+		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
+	},
+	{
+		/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
+		   decomposes to
+		   'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
+		.str = {0xc3, 0xa4, 0xcc, 0xa8, 0x00},
+		.dec = {0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00},
+	},
+};
+
+static const struct {
+	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
+	unsigned char str[30];
+	unsigned char ncf[30];
+} nfdicf_test_data[] = {
+	/* Trivial sequences */
+	{
+		/* "ABba" folds to lowercase */
+		.str = {0x41, 0x42, 0x62, 0x61, 0x00},
+		.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
+	},
+	{
+		/* All ASCII folds to lower-case */
+		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
+		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
+	},
+	{
+		/* LATIN SMALL LETTER SHARP S folds to
+		   LATIN SMALL LETTER S + LATIN SMALL LETTER S */
+		.str = {0xc3, 0x9f, 0x00},
+		.ncf = {0x73, 0x73, 0x00},
+	},
+	{
+		/* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
+		   LATIN SMALL LETTER A + COMBINING RING ABOVE */
+		.str = {0xC3, 0x85, 0x00},
+		.ncf = {0x61, 0xcc, 0x8a, 0x00},
+	},
+	/* Introduced by Unicode 8.0.0. */
+	/* Cherokee letters are interesting test cases because they fold
+	   to upper-case.  Before 8.0.0 the Cherokee lowercase letters
+	   were undefined, so folding from lowercase is not stable
+	   between 7.0.0 and 8.0.0, but folding from uppercase is. */
+	{
+		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
+		.str = {0xea, 0xad, 0xb0, 0x00},
+		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
+	},
+	{
+		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
+		.str = {0xe1, 0x8f, 0xb8, 0x00},
+		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
+	},
+	{
+		/* OLD HUNGARIAN CAPITAL LETTER AMB folds to
+		   OLD HUNGARIAN SMALL LETTER AMB */
+		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
+		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
+	},
+	/* Introduced by Unicode 9.0.0. */
+	{
+		/* OSAGE CAPITAL LETTER CHA folds to
+		   OSAGE SMALL LETTER CHA */
+		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
+		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
+	},
+	{
+		/* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
+		   LATIN LETTER SMALL CAPITAL I */
+		.str = {0xea, 0x9e, 0xae, 0x00},
+		.ncf = {0xc9, 0xaa, 0x00},
+	},
+	/* Introduced by Unicode 11.0.0. */
+	{
+		/* GEORGIAN MTAVRULI CAPITAL LETTER AN folds to
+		   GEORGIAN LETTER AN */
+		.str = {0xe1, 0xb2, 0x90, 0x00},
+		.ncf = {0xe1, 0x83, 0x90, 0x00},
+	}
+};
+
+static void check_utf8_nfdi(void)
+{
+	int i;
+	struct utf8cursor u8c;
+	const struct utf8data *data;
+
+	data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
+	if (!data) {
+		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
+		       __func__, latest_maj, latest_min, latest_rev);
+		return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
+		int len = strlen(nfdi_test_data[i].str);
+		int nlen = strlen(nfdi_test_data[i].dec);
+		int j = 0;
+		int c;	/* int, so a -1 from utf8byte() ends the loop */
+
+		test((utf8len(data, nfdi_test_data[i].str) == nlen));
+		test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
+
+		if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
+			pr_err("can't create cursor\n");
+
+		while ((c = utf8byte(&u8c)) > 0) {
+			test_f((c == nfdi_test_data[i].dec[j]),
+			       "Unexpected byte 0x%x should be 0x%x\n",
+			       c, nfdi_test_data[i].dec[j]);
+			j++;
+		}
+
+		test((j == nlen));
+	}
+}
+
+static void check_utf8_nfdicf(void)
+{
+	int i;
+	struct utf8cursor u8c;
+	const struct utf8data *data;
+
+	data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
+	if (!data) {
+		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
+		       __func__, latest_maj, latest_min, latest_rev);
+		return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
+		int len = strlen(nfdicf_test_data[i].str);
+		int nlen = strlen(nfdicf_test_data[i].ncf);
+		int j = 0;
+		int c;	/* int, so a -1 from utf8byte() ends the loop */
+
+		test((utf8len(data, nfdicf_test_data[i].str) == nlen));
+		test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
+
+		if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
+			pr_err("can't create cursor\n");
+
+		while ((c = utf8byte(&u8c)) > 0) {
+			test_f((c == nfdicf_test_data[i].ncf[j]),
+			       "Unexpected byte 0x%x should be 0x%x\n",
+			       c, nfdicf_test_data[i].ncf[j]);
+			j++;
+		}
+
+		test((j == nlen));
+	}
+}
+
+static void check_utf8_comparisons(void)
+{
+	int i;
+	struct unicode_map *table = utf8_load("12.1.0");
+
+	if (IS_ERR(table)) {
+		pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
+		       __func__, latest_maj, latest_min, latest_rev);
+		return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
+		const struct qstr s1 = {.name = nfdi_test_data[i].str,
+					.len = sizeof(nfdi_test_data[i].str)};
+		const struct qstr s2 = {.name = nfdi_test_data[i].dec,
+					.len = sizeof(nfdi_test_data[i].dec)};
+
+		test_f(!utf8_strncmp(table, &s1, &s2),
+		       "%s %s comparison mismatch\n", s1.name, s2.name);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
+		const struct qstr s1 = {.name = nfdicf_test_data[i].str,
+					.len = sizeof(nfdicf_test_data[i].str)};
+		const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
+					.len = sizeof(nfdicf_test_data[i].ncf)};
+
+		test_f(!utf8_strncasecmp(table, &s1, &s2),
+		       "%s %s comparison mismatch\n", s1.name, s2.name);
+	}
+
+	utf8_unload(table);
+}
+
+static void check_supported_versions(void)
+{
+	/* Unicode 7.0.0 should be supported. */
+	test(utf8version_is_supported(7, 0, 0));
+
+	/* Unicode 9.0.0 should be supported. */
+	test(utf8version_is_supported(9, 0, 0));
+
+	/* The latest version known to the tests (12.1.0) should be supported. */
+	test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
+
+	/* Next versions don't exist. */
+	test(!utf8version_is_supported(13, 0, 0));
+	test(!utf8version_is_supported(0, 0, 0));
+	test(!utf8version_is_supported(-1, -1, -1));
+}
+
+static int __init init_test_ucd(void)
+{
+	failed_tests = 0;
+	total_tests = 0;
+
+	check_supported_versions();
+	check_utf8_nfdi();
+	check_utf8_nfdicf();
+	check_utf8_comparisons();
+
+	if (!failed_tests)
+		pr_info("All %u tests passed\n", total_tests);
+	else
+		pr_err("%u out of %u tests failed\n", failed_tests,
+		       total_tests);
+	return 0;
+}
+
+static void __exit exit_test_ucd(void)
+{
+}
+
+module_init(init_test_ucd);
+module_exit(exit_test_ucd);
+
+MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
+MODULE_LICENSE("GPL");
diff --git a/fs/unicode/utf8data.h_shipped b/fs/unicode/utf8data.h_shipped
new file mode 100644
index 0000000..76e4f0e
--- /dev/null
+++ b/fs/unicode/utf8data.h_shipped
@@ -0,0 +1,4109 @@
+/* This file is generated code, do not edit. */
+#ifndef __INCLUDED_FROM_UTF8NORM_C__
+#error Only utf8-norm.c should include this file.
+#endif
+
+static const unsigned int utf8vers = 0xc0100;
+
+static const unsigned int utf8agetab[] = {
+	0,
+	0x10100,
+	0x20000,
+	0x20100,
+	0x30000,
+	0x30100,
+	0x30200,
+	0x40000,
+	0x40100,
+	0x50000,
+	0x50100,
+	0x50200,
+	0x60000,
+	0x60100,
+	0x60200,
+	0x60300,
+	0x70000,
+	0x80000,
+	0x90000,
+	0xa0000,
+	0xb0000,
+	0xc0000,
+	0xc0100
+};
+
+static const struct utf8data utf8nfdicfdata[] = {
+	{ 0, 0 },
+	{ 0x10100, 0 },
+	{ 0x20000, 0 },
+	{ 0x20100, 0 },
+	{ 0x30000, 0 },
+	{ 0x30100, 0 },
+	{ 0x30200, 1792 },
+	{ 0x40000, 3200 },
+	{ 0x40100, 3200 },
+	{ 0x50000, 3200 },
+	{ 0x50100, 3200 },
+	{ 0x50200, 3200 },
+	{ 0x60000, 3200 },
+	{ 0x60100, 3200 },
+	{ 0x60200, 3200 },
+	{ 0x60300, 3200 },
+	{ 0x70000, 3200 },
+	{ 0x80000, 3200 },
+	{ 0x90000, 3200 },
+	{ 0xa0000, 3200 },
+	{ 0xb0000, 3200 },
+	{ 0xc0000, 3200 },
+	{ 0xc0100, 3200 }
+};
+
+static const struct utf8data utf8nfdidata[] = {
+	{ 0, 896 },
+	{ 0x10100, 896 },
+	{ 0x20000, 896 },
+	{ 0x20100, 896 },
+	{ 0x30000, 896 },
+	{ 0x30100, 896 },
+	{ 0x30200, 2496 },
+	{ 0x40000, 20736 },
+	{ 0x40100, 20736 },
+	{ 0x50000, 20736 },
+	{ 0x50100, 20736 },
+	{ 0x50200, 20736 },
+	{ 0x60000, 20736 },
+	{ 0x60100, 20736 },
+	{ 0x60200, 20736 },
+	{ 0x60300, 20736 },
+	{ 0x70000, 20736 },
+	{ 0x80000, 20736 },
+	{ 0x90000, 20736 },
+	{ 0xa0000, 20736 },
+	{ 0xb0000, 20736 },
+	{ 0xc0000, 20736 },
+	{ 0xc0100, 20736 }
+};
+
+static const unsigned char utf8data[64256] = {
+	/* nfdicf_30100 */
+	0xd7,0x07,0x66,0x84,0x0c,0x01,0x00,0xc6,0xd5,0x16,0xe4,0x99,0x1a,0xe3,0x63,0x15,
+	0xe2,0x4c,0x0e,0xc1,0xe0,0x4e,0x0d,0xcf,0x86,0x65,0x2d,0x0d,0x01,0x00,0xd4,0xb8,
+	0xd3,0x27,0xe2,0x89,0xa3,0xe1,0xce,0x35,0xe0,0x2c,0x22,0xcf,0x86,0xc5,0xe4,0x15,
+	0x6d,0xe3,0x60,0x68,0xe2,0xf6,0x65,0xe1,0x29,0x65,0xe0,0xee,0x64,0xcf,0x86,0xe5,
+	0xb3,0x64,0x64,0x96,0x64,0x0b,0x00,0xd2,0x0e,0xe1,0xb5,0x3c,0xe0,0xba,0xa3,0xcf,
+	0x86,0xcf,0x06,0x01,0x00,0xd1,0x0c,0xe0,0x1e,0xa9,0xcf,0x86,0xcf,0x06,0x02,0xff,
+	0xff,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x01,
+	0x00,0xe4,0xe1,0x45,0xe3,0x3b,0x45,0xd2,0x06,0xcf,0x06,0x01,0x00,0xe1,0x87,0xad,
+	0xd0,0x21,0xcf,0x86,0xe5,0x81,0xaa,0xe4,0x00,0xaa,0xe3,0xbf,0xa9,0xe2,0x9e,0xa9,
+	0xe1,0x8d,0xa9,0x10,0x08,0x01,0xff,0xe8,0xb1,0x88,0x00,0x01,0xff,0xe6,0x9b,0xb4,
+	0x00,0xcf,0x86,0xe5,0x63,0xac,0xd4,0x19,0xe3,0xa2,0xab,0xe2,0x81,0xab,0xe1,0x70,
+	0xab,0x10,0x08,0x01,0xff,0xe9,0xb9,0xbf,0x00,0x01,0xff,0xe8,0xab,0x96,0x00,0xe3,
+	0x09,0xac,0xe2,0xe8,0xab,0xe1,0xd7,0xab,0x10,0x08,0x01,0xff,0xe7,0xb8,0xb7,0x00,
+	0x01,0xff,0xe9,0x9b,0xbb,0x00,0x83,0xe2,0x19,0xfa,0xe1,0xf2,0xf6,0xe0,0x6f,0xf5,
+	0xcf,0x86,0xd5,0x31,0xc4,0xe3,0x54,0x4e,0xe2,0xf5,0x4c,0xe1,0xa4,0xcc,0xe0,0x9c,
+	0x4b,0xcf,0x86,0xe5,0x8e,0x49,0xe4,0xaf,0x46,0xe3,0x11,0xbd,0xe2,0x68,0xbc,0xe1,
+	0x43,0xbc,0xe0,0x1c,0xbc,0xcf,0x86,0xe5,0xe9,0xbb,0x94,0x07,0x63,0xd4,0xbb,0x07,
+	0x00,0x07,0x00,0xe4,0xdb,0xf4,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd2,0x0b,
+	0xe1,0xea,0xe1,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd1,0x0e,0xe0,0xd9,0xe2,0xcf,0x86,
+	0xe5,0x9e,0xe2,0xcf,0x06,0x11,0x00,0xd0,0x0b,0xcf,0x86,0xe5,0xd9,0xe2,0xcf,0x06,
+	0x13,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xe4,0x74,0xf4,0xe3,0x5d,0xf3,
+	0xd2,0xa0,0xe1,0x13,0xe7,0xd0,0x21,0xcf,0x86,0xe5,0x14,0xe4,0xe4,0x90,0xe3,0xe3,
+	0x4e,0xe3,0xe2,0x2d,0xe3,0xe1,0x1b,0xe3,0x10,0x08,0x05,0xff,0xe4,0xb8,0xbd,0x00,
+	0x05,0xff,0xe4,0xb8,0xb8,0x00,0xcf,0x86,0xd5,0x1c,0xe4,0x70,0xe5,0xe3,0x2f,0xe5,
+	0xe2,0x0e,0xe5,0xe1,0xfd,0xe4,0x10,0x08,0x05,0xff,0xe5,0x92,0xa2,0x00,0x05,0xff,
+	0xe5,0x93,0xb6,0x00,0xd4,0x34,0xd3,0x18,0xe2,0xf7,0xe5,0xe1,0xe6,0xe5,0x10,0x09,
+	0x05,0xff,0xf0,0xa1,0x9a,0xa8,0x00,0x05,0xff,0xf0,0xa1,0x9b,0xaa,0x00,0xe2,0x17,
+	0xe6,0x91,0x11,0x10,0x09,0x05,0xff,0xf0,0xa1,0x8d,0xaa,0x00,0x05,0xff,0xe5,0xac,
+	0x88,0x00,0x05,0xff,0xe5,0xac,0xbe,0x00,0xe3,0x5d,0xe6,0xd2,0x14,0xe1,0x2c,0xe6,
+	0x10,0x08,0x05,0xff,0xe5,0xaf,0xb3,0x00,0x05,0xff,0xf0,0xa1,0xac,0x98,0x00,0xe1,
+	0x38,0xe6,0x10,0x08,0x05,0xff,0xe5,0xbc,0xb3,0x00,0x05,0xff,0xe5,0xb0,0xa2,0x00,
+	0xd1,0xd5,0xd0,0x6a,0xcf,0x86,0xe5,0x8d,0xeb,0xd4,0x19,0xe3,0xc6,0xea,0xe2,0xa4,
+	0xea,0xe1,0x93,0xea,0x10,0x08,0x05,0xff,0xe6,0xb4,0xbe,0x00,0x05,0xff,0xe6,0xb5,
+	0xb7,0x00,0xd3,0x18,0xe2,0x10,0xeb,0xe1,0xff,0xea,0x10,0x09,0x05,0xff,0xf0,0xa3,
+	0xbd,0x9e,0x00,0x05,0xff,0xf0,0xa3,0xbe,0x8e,0x00,0xd2,0x13,0xe1,0x28,0xeb,0x10,
+	0x08,0x05,0xff,0xe7,0x81,0xbd,0x00,0x05,0xff,0xe7,0x81,0xb7,0x00,0xd1,0x11,0x10,
+	0x08,0x05,0xff,0xe7,0x85,0x85,0x00,0x05,0xff,0xf0,0xa4,0x89,0xa3,0x00,0x10,0x08,
+	0x05,0xff,0xe7,0x86,0x9c,0x00,0x05,0xff,0xe4,0x8e,0xab,0x00,0xcf,0x86,0xe5,0x2a,
+	0xed,0xd4,0x1a,0xe3,0x62,0xec,0xe2,0x48,0xec,0xe1,0x35,0xec,0x10,0x08,0x05,0xff,
+	0xe7,0x9b,0xb4,0x00,0x05,0xff,0xf0,0xa5,0x83,0xb3,0x00,0xd3,0x16,0xe2,0xaa,0xec,
+	0xe1,0x98,0xec,0x10,0x08,0x05,0xff,0xe7,0xa3,0x8c,0x00,0x05,0xff,0xe4,0x83,0xa3,
+	0x00,0xd2,0x13,0xe1,0xc6,0xec,0x10,0x08,0x05,0xff,0xe4,0x84,0xaf,0x00,0x05,0xff,
+	0xe7,0xa9,0x80,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0xa5,0xbc,0x00,0x05,
+	0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x05,
+	0xff,0xe7,0xaa,0xae,0x00,0xe0,0xdc,0xef,0xcf,0x86,0xd5,0x1d,0xe4,0x51,0xee,0xe3,
+	0x0d,0xee,0xe2,0xeb,0xed,0xe1,0xda,0xed,0x10,0x09,0x05,0xff,0xf0,0xa3,0x8d,0x9f,
+	0x00,0x05,0xff,0xe4,0x8f,0x95,0x00,0xd4,0x19,0xe3,0xf8,0xee,0xe2,0xd4,0xee,0xe1,
+	0xc3,0xee,0x10,0x08,0x05,0xff,0xe8,0x8d,0x93,0x00,0x05,0xff,0xe8,0x8f,0x8a,0x00,
+	0xd3,0x18,0xe2,0x43,0xef,0xe1,0x32,0xef,0x10,0x09,0x05,0xff,0xf0,0xa6,0xbe,0xb1,
+	0x00,0x05,0xff,0xf0,0xa7,0x83,0x92,0x00,0xd2,0x13,0xe1,0x5b,0xef,0x10,0x08,0x05,
+	0xff,0xe8,0x9a,0x88,0x00,0x05,0xff,0xe8,0x9c,0x8e,0x00,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe8,0x9c,0xa8,0x00,0x05,0xff,0xe8,0x9d,0xab,0x00,0x10,0x08,0x05,0xff,0xe8,
+	0x9e,0x86,0x00,0x05,0xff,0xe4,0xb5,0x97,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	/* nfdi_30100 */
+	0x57,0x04,0x01,0x00,0xc6,0xd5,0x16,0xe4,0xc2,0x59,0xe3,0xfb,0x54,0xe2,0x74,0x4f,
+	0xc1,0xe0,0xa0,0x4d,0xcf,0x86,0x65,0x84,0x4d,0x01,0x00,0xd4,0xb8,0xd3,0x27,0xe2,
+	0x0c,0xa0,0xe1,0xdf,0x8d,0xe0,0x39,0x71,0xcf,0x86,0xc5,0xe4,0x98,0x69,0xe3,0xe3,
+	0x64,0xe2,0x79,0x62,0xe1,0xac,0x61,0xe0,0x71,0x61,0xcf,0x86,0xe5,0x36,0x61,0x64,
+	0x19,0x61,0x0b,0x00,0xd2,0x0e,0xe1,0xc2,0xa0,0xe0,0x3d,0xa0,0xcf,0x86,0xcf,0x06,
+	0x01,0x00,0xd1,0x0c,0xe0,0xa1,0xa5,0xcf,0x86,0xcf,0x06,0x02,0xff,0xff,0xd0,0x08,
+	0xcf,0x86,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x01,0x00,0xe4,0x9e,
+	0xb6,0xe3,0x18,0xae,0xd2,0x06,0xcf,0x06,0x01,0x00,0xe1,0x0a,0xaa,0xd0,0x21,0xcf,
+	0x86,0xe5,0x04,0xa7,0xe4,0x83,0xa6,0xe3,0x42,0xa6,0xe2,0x21,0xa6,0xe1,0x10,0xa6,
+	0x10,0x08,0x01,0xff,0xe8,0xb1,0x88,0x00,0x01,0xff,0xe6,0x9b,0xb4,0x00,0xcf,0x86,
+	0xe5,0xe6,0xa8,0xd4,0x19,0xe3,0x25,0xa8,0xe2,0x04,0xa8,0xe1,0xf3,0xa7,0x10,0x08,
+	0x01,0xff,0xe9,0xb9,0xbf,0x00,0x01,0xff,0xe8,0xab,0x96,0x00,0xe3,0x8c,0xa8,0xe2,
+	0x6b,0xa8,0xe1,0x5a,0xa8,0x10,0x08,0x01,0xff,0xe7,0xb8,0xb7,0x00,0x01,0xff,0xe9,
+	0x9b,0xbb,0x00,0x83,0xe2,0x9c,0xf6,0xe1,0x75,0xf3,0xe0,0xf2,0xf1,0xcf,0x86,0xd5,
+	0x31,0xc4,0xe3,0x6d,0xcc,0xe2,0x46,0xca,0xe1,0x27,0xc9,0xe0,0xb7,0xbf,0xcf,0x86,
+	0xe5,0xaa,0xbb,0xe4,0xa3,0xba,0xe3,0x94,0xb9,0xe2,0xeb,0xb8,0xe1,0xc6,0xb8,0xe0,
+	0x9f,0xb8,0xcf,0x86,0xe5,0x6c,0xb8,0x94,0x07,0x63,0x57,0xb8,0x07,0x00,0x07,0x00,
+	0xe4,0x5e,0xf1,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd2,0x0b,0xe1,0x6d,0xde,
+	0xcf,0x86,0xcf,0x06,0x05,0x00,0xd1,0x0e,0xe0,0x5c,0xdf,0xcf,0x86,0xe5,0x21,0xdf,
+	0xcf,0x06,0x11,0x00,0xd0,0x0b,0xcf,0x86,0xe5,0x5c,0xdf,0xcf,0x06,0x13,0x00,0xcf,
+	0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xe4,0xf7,0xf0,0xe3,0xe0,0xef,0xd2,0xa0,0xe1,
+	0x96,0xe3,0xd0,0x21,0xcf,0x86,0xe5,0x97,0xe0,0xe4,0x13,0xe0,0xe3,0xd1,0xdf,0xe2,
+	0xb0,0xdf,0xe1,0x9e,0xdf,0x10,0x08,0x05,0xff,0xe4,0xb8,0xbd,0x00,0x05,0xff,0xe4,
+	0xb8,0xb8,0x00,0xcf,0x86,0xd5,0x1c,0xe4,0xf3,0xe1,0xe3,0xb2,0xe1,0xe2,0x91,0xe1,
+	0xe1,0x80,0xe1,0x10,0x08,0x05,0xff,0xe5,0x92,0xa2,0x00,0x05,0xff,0xe5,0x93,0xb6,
+	0x00,0xd4,0x34,0xd3,0x18,0xe2,0x7a,0xe2,0xe1,0x69,0xe2,0x10,0x09,0x05,0xff,0xf0,
+	0xa1,0x9a,0xa8,0x00,0x05,0xff,0xf0,0xa1,0x9b,0xaa,0x00,0xe2,0x9a,0xe2,0x91,0x11,
+	0x10,0x09,0x05,0xff,0xf0,0xa1,0x8d,0xaa,0x00,0x05,0xff,0xe5,0xac,0x88,0x00,0x05,
+	0xff,0xe5,0xac,0xbe,0x00,0xe3,0xe0,0xe2,0xd2,0x14,0xe1,0xaf,0xe2,0x10,0x08,0x05,
+	0xff,0xe5,0xaf,0xb3,0x00,0x05,0xff,0xf0,0xa1,0xac,0x98,0x00,0xe1,0xbb,0xe2,0x10,
+	0x08,0x05,0xff,0xe5,0xbc,0xb3,0x00,0x05,0xff,0xe5,0xb0,0xa2,0x00,0xd1,0xd5,0xd0,
+	0x6a,0xcf,0x86,0xe5,0x10,0xe8,0xd4,0x19,0xe3,0x49,0xe7,0xe2,0x27,0xe7,0xe1,0x16,
+	0xe7,0x10,0x08,0x05,0xff,0xe6,0xb4,0xbe,0x00,0x05,0xff,0xe6,0xb5,0xb7,0x00,0xd3,
+	0x18,0xe2,0x93,0xe7,0xe1,0x82,0xe7,0x10,0x09,0x05,0xff,0xf0,0xa3,0xbd,0x9e,0x00,
+	0x05,0xff,0xf0,0xa3,0xbe,0x8e,0x00,0xd2,0x13,0xe1,0xab,0xe7,0x10,0x08,0x05,0xff,
+	0xe7,0x81,0xbd,0x00,0x05,0xff,0xe7,0x81,0xb7,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,
+	0xe7,0x85,0x85,0x00,0x05,0xff,0xf0,0xa4,0x89,0xa3,0x00,0x10,0x08,0x05,0xff,0xe7,
+	0x86,0x9c,0x00,0x05,0xff,0xe4,0x8e,0xab,0x00,0xcf,0x86,0xe5,0xad,0xe9,0xd4,0x1a,
+	0xe3,0xe5,0xe8,0xe2,0xcb,0xe8,0xe1,0xb8,0xe8,0x10,0x08,0x05,0xff,0xe7,0x9b,0xb4,
+	0x00,0x05,0xff,0xf0,0xa5,0x83,0xb3,0x00,0xd3,0x16,0xe2,0x2d,0xe9,0xe1,0x1b,0xe9,
+	0x10,0x08,0x05,0xff,0xe7,0xa3,0x8c,0x00,0x05,0xff,0xe4,0x83,0xa3,0x00,0xd2,0x13,
+	0xe1,0x49,0xe9,0x10,0x08,0x05,0xff,0xe4,0x84,0xaf,0x00,0x05,0xff,0xe7,0xa9,0x80,
+	0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0xa5,0xbc,0x00,0x05,0xff,0xf0,0xa5,
+	0xaa,0xa7,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x05,0xff,0xe7,0xaa,
+	0xae,0x00,0xe0,0x5f,0xec,0xcf,0x86,0xd5,0x1d,0xe4,0xd4,0xea,0xe3,0x90,0xea,0xe2,
+	0x6e,0xea,0xe1,0x5d,0xea,0x10,0x09,0x05,0xff,0xf0,0xa3,0x8d,0x9f,0x00,0x05,0xff,
+	0xe4,0x8f,0x95,0x00,0xd4,0x19,0xe3,0x7b,0xeb,0xe2,0x57,0xeb,0xe1,0x46,0xeb,0x10,
+	0x08,0x05,0xff,0xe8,0x8d,0x93,0x00,0x05,0xff,0xe8,0x8f,0x8a,0x00,0xd3,0x18,0xe2,
+	0xc6,0xeb,0xe1,0xb5,0xeb,0x10,0x09,0x05,0xff,0xf0,0xa6,0xbe,0xb1,0x00,0x05,0xff,
+	0xf0,0xa7,0x83,0x92,0x00,0xd2,0x13,0xe1,0xde,0xeb,0x10,0x08,0x05,0xff,0xe8,0x9a,
+	0x88,0x00,0x05,0xff,0xe8,0x9c,0x8e,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x9c,
+	0xa8,0x00,0x05,0xff,0xe8,0x9d,0xab,0x00,0x10,0x08,0x05,0xff,0xe8,0x9e,0x86,0x00,
+	0x05,0xff,0xe4,0xb5,0x97,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	/* nfdicf_30200 */
+	0xd7,0x07,0x66,0x84,0x05,0x01,0x00,0xc6,0xd5,0x16,0xe4,0x99,0x13,0xe3,0x63,0x0e,
+	0xe2,0x4c,0x07,0xc1,0xe0,0x4e,0x06,0xcf,0x86,0x65,0x2d,0x06,0x01,0x00,0xd4,0x2a,
+	0xe3,0xd0,0x35,0xe2,0x88,0x9c,0xe1,0xcd,0x2e,0xe0,0x2b,0x1b,0xcf,0x86,0xc5,0xe4,
+	0x14,0x66,0xe3,0x5f,0x61,0xe2,0xf5,0x5e,0xe1,0x28,0x5e,0xe0,0xed,0x5d,0xcf,0x86,
+	0xe5,0xb2,0x5d,0x64,0x95,0x5d,0x0b,0x00,0x83,0xe2,0xa7,0xf3,0xe1,0x80,0xf0,0xe0,
+	0xfd,0xee,0xcf,0x86,0xd5,0x31,0xc4,0xe3,0xe2,0x47,0xe2,0x83,0x46,0xe1,0x32,0xc6,
+	0xe0,0x2a,0x45,0xcf,0x86,0xe5,0x1c,0x43,0xe4,0x3d,0x40,0xe3,0x9f,0xb6,0xe2,0xf6,
+	0xb5,0xe1,0xd1,0xb5,0xe0,0xaa,0xb5,0xcf,0x86,0xe5,0x77,0xb5,0x94,0x07,0x63,0x62,
+	0xb5,0x07,0x00,0x07,0x00,0xe4,0x69,0xee,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,
+	0xd2,0x0b,0xe1,0x78,0xdb,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd1,0x0e,0xe0,0x67,0xdc,
+	0xcf,0x86,0xe5,0x2c,0xdc,0xcf,0x06,0x11,0x00,0xd0,0x0b,0xcf,0x86,0xe5,0x67,0xdc,
+	0xcf,0x06,0x13,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xe4,0x02,0xee,0xe3,
+	0xeb,0xec,0xd2,0xa0,0xe1,0xa1,0xe0,0xd0,0x21,0xcf,0x86,0xe5,0xa2,0xdd,0xe4,0x1e,
+	0xdd,0xe3,0xdc,0xdc,0xe2,0xbb,0xdc,0xe1,0xa9,0xdc,0x10,0x08,0x05,0xff,0xe4,0xb8,
+	0xbd,0x00,0x05,0xff,0xe4,0xb8,0xb8,0x00,0xcf,0x86,0xd5,0x1c,0xe4,0xfe,0xde,0xe3,
+	0xbd,0xde,0xe2,0x9c,0xde,0xe1,0x8b,0xde,0x10,0x08,0x05,0xff,0xe5,0x92,0xa2,0x00,
+	0x05,0xff,0xe5,0x93,0xb6,0x00,0xd4,0x34,0xd3,0x18,0xe2,0x85,0xdf,0xe1,0x74,0xdf,
+	0x10,0x09,0x05,0xff,0xf0,0xa1,0x9a,0xa8,0x00,0x05,0xff,0xf0,0xa1,0x9b,0xaa,0x00,
+	0xe2,0xa5,0xdf,0x91,0x11,0x10,0x09,0x05,0xff,0xf0,0xa1,0x8d,0xaa,0x00,0x05,0xff,
+	0xe5,0xac,0x88,0x00,0x05,0xff,0xe5,0xac,0xbe,0x00,0xe3,0xeb,0xdf,0xd2,0x14,0xe1,
+	0xba,0xdf,0x10,0x08,0x05,0xff,0xe5,0xaf,0xb3,0x00,0x05,0xff,0xf0,0xa1,0xac,0x98,
+	0x00,0xe1,0xc6,0xdf,0x10,0x08,0x05,0xff,0xe5,0xbc,0xb3,0x00,0x05,0xff,0xe5,0xb0,
+	0xa2,0x00,0xd1,0xd5,0xd0,0x6a,0xcf,0x86,0xe5,0x1b,0xe5,0xd4,0x19,0xe3,0x54,0xe4,
+	0xe2,0x32,0xe4,0xe1,0x21,0xe4,0x10,0x08,0x05,0xff,0xe6,0xb4,0xbe,0x00,0x05,0xff,
+	0xe6,0xb5,0xb7,0x00,0xd3,0x18,0xe2,0x9e,0xe4,0xe1,0x8d,0xe4,0x10,0x09,0x05,0xff,
+	0xf0,0xa3,0xbd,0x9e,0x00,0x05,0xff,0xf0,0xa3,0xbe,0x8e,0x00,0xd2,0x13,0xe1,0xb6,
+	0xe4,0x10,0x08,0x05,0xff,0xe7,0x81,0xbd,0x00,0x05,0xff,0xe7,0x81,0xb7,0x00,0xd1,
+	0x11,0x10,0x08,0x05,0xff,0xe7,0x85,0x85,0x00,0x05,0xff,0xf0,0xa4,0x89,0xa3,0x00,
+	0x10,0x08,0x05,0xff,0xe7,0x86,0x9c,0x00,0x05,0xff,0xe4,0x8e,0xab,0x00,0xcf,0x86,
+	0xe5,0xb8,0xe6,0xd4,0x1a,0xe3,0xf0,0xe5,0xe2,0xd6,0xe5,0xe1,0xc3,0xe5,0x10,0x08,
+	0x05,0xff,0xe7,0x9b,0xb4,0x00,0x05,0xff,0xf0,0xa5,0x83,0xb3,0x00,0xd3,0x16,0xe2,
+	0x38,0xe6,0xe1,0x26,0xe6,0x10,0x08,0x05,0xff,0xe7,0xa3,0x8c,0x00,0x05,0xff,0xe4,
+	0x83,0xa3,0x00,0xd2,0x13,0xe1,0x54,0xe6,0x10,0x08,0x05,0xff,0xe4,0x84,0xaf,0x00,
+	0x05,0xff,0xe7,0xa9,0x80,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0xa5,0xbc,
+	0x00,0x05,0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0xaa,0xa7,
+	0x00,0x05,0xff,0xe7,0xaa,0xae,0x00,0xe0,0x6a,0xe9,0xcf,0x86,0xd5,0x1d,0xe4,0xdf,
+	0xe7,0xe3,0x9b,0xe7,0xe2,0x79,0xe7,0xe1,0x68,0xe7,0x10,0x09,0x05,0xff,0xf0,0xa3,
+	0x8d,0x9f,0x00,0x05,0xff,0xe4,0x8f,0x95,0x00,0xd4,0x19,0xe3,0x86,0xe8,0xe2,0x62,
+	0xe8,0xe1,0x51,0xe8,0x10,0x08,0x05,0xff,0xe8,0x8d,0x93,0x00,0x05,0xff,0xe8,0x8f,
+	0x8a,0x00,0xd3,0x18,0xe2,0xd1,0xe8,0xe1,0xc0,0xe8,0x10,0x09,0x05,0xff,0xf0,0xa6,
+	0xbe,0xb1,0x00,0x05,0xff,0xf0,0xa7,0x83,0x92,0x00,0xd2,0x13,0xe1,0xe9,0xe8,0x10,
+	0x08,0x05,0xff,0xe8,0x9a,0x88,0x00,0x05,0xff,0xe8,0x9c,0x8e,0x00,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe8,0x9c,0xa8,0x00,0x05,0xff,0xe8,0x9d,0xab,0x00,0x10,0x08,0x05,
+	0xff,0xe8,0x9e,0x86,0x00,0x05,0xff,0xe4,0xb5,0x97,0x00,0x00,0x00,0x00,0x00,0x00,
+	/* nfdi_30200 */
+	0x57,0x04,0x01,0x00,0xc6,0xd5,0x16,0xe4,0x82,0x53,0xe3,0xbb,0x4e,0xe2,0x34,0x49,
+	0xc1,0xe0,0x60,0x47,0xcf,0x86,0x65,0x44,0x47,0x01,0x00,0xd4,0x2a,0xe3,0x1c,0x9a,
+	0xe2,0xcb,0x99,0xe1,0x9e,0x87,0xe0,0xf8,0x6a,0xcf,0x86,0xc5,0xe4,0x57,0x63,0xe3,
+	0xa2,0x5e,0xe2,0x38,0x5c,0xe1,0x6b,0x5b,0xe0,0x30,0x5b,0xcf,0x86,0xe5,0xf5,0x5a,
+	0x64,0xd8,0x5a,0x0b,0x00,0x83,0xe2,0xea,0xf0,0xe1,0xc3,0xed,0xe0,0x40,0xec,0xcf,
+	0x86,0xd5,0x31,0xc4,0xe3,0xbb,0xc6,0xe2,0x94,0xc4,0xe1,0x75,0xc3,0xe0,0x05,0xba,
+	0xcf,0x86,0xe5,0xf8,0xb5,0xe4,0xf1,0xb4,0xe3,0xe2,0xb3,0xe2,0x39,0xb3,0xe1,0x14,
+	0xb3,0xe0,0xed,0xb2,0xcf,0x86,0xe5,0xba,0xb2,0x94,0x07,0x63,0xa5,0xb2,0x07,0x00,
+	0x07,0x00,0xe4,0xac,0xeb,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd2,0x0b,0xe1,
+	0xbb,0xd8,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd1,0x0e,0xe0,0xaa,0xd9,0xcf,0x86,0xe5,
+	0x6f,0xd9,0xcf,0x06,0x11,0x00,0xd0,0x0b,0xcf,0x86,0xe5,0xaa,0xd9,0xcf,0x06,0x13,
+	0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xe4,0x45,0xeb,0xe3,0x2e,0xea,0xd2,
+	0xa0,0xe1,0xe4,0xdd,0xd0,0x21,0xcf,0x86,0xe5,0xe5,0xda,0xe4,0x61,0xda,0xe3,0x1f,
+	0xda,0xe2,0xfe,0xd9,0xe1,0xec,0xd9,0x10,0x08,0x05,0xff,0xe4,0xb8,0xbd,0x00,0x05,
+	0xff,0xe4,0xb8,0xb8,0x00,0xcf,0x86,0xd5,0x1c,0xe4,0x41,0xdc,0xe3,0x00,0xdc,0xe2,
+	0xdf,0xdb,0xe1,0xce,0xdb,0x10,0x08,0x05,0xff,0xe5,0x92,0xa2,0x00,0x05,0xff,0xe5,
+	0x93,0xb6,0x00,0xd4,0x34,0xd3,0x18,0xe2,0xc8,0xdc,0xe1,0xb7,0xdc,0x10,0x09,0x05,
+	0xff,0xf0,0xa1,0x9a,0xa8,0x00,0x05,0xff,0xf0,0xa1,0x9b,0xaa,0x00,0xe2,0xe8,0xdc,
+	0x91,0x11,0x10,0x09,0x05,0xff,0xf0,0xa1,0x8d,0xaa,0x00,0x05,0xff,0xe5,0xac,0x88,
+	0x00,0x05,0xff,0xe5,0xac,0xbe,0x00,0xe3,0x2e,0xdd,0xd2,0x14,0xe1,0xfd,0xdc,0x10,
+	0x08,0x05,0xff,0xe5,0xaf,0xb3,0x00,0x05,0xff,0xf0,0xa1,0xac,0x98,0x00,0xe1,0x09,
+	0xdd,0x10,0x08,0x05,0xff,0xe5,0xbc,0xb3,0x00,0x05,0xff,0xe5,0xb0,0xa2,0x00,0xd1,
+	0xd5,0xd0,0x6a,0xcf,0x86,0xe5,0x5e,0xe2,0xd4,0x19,0xe3,0x97,0xe1,0xe2,0x75,0xe1,
+	0xe1,0x64,0xe1,0x10,0x08,0x05,0xff,0xe6,0xb4,0xbe,0x00,0x05,0xff,0xe6,0xb5,0xb7,
+	0x00,0xd3,0x18,0xe2,0xe1,0xe1,0xe1,0xd0,0xe1,0x10,0x09,0x05,0xff,0xf0,0xa3,0xbd,
+	0x9e,0x00,0x05,0xff,0xf0,0xa3,0xbe,0x8e,0x00,0xd2,0x13,0xe1,0xf9,0xe1,0x10,0x08,
+	0x05,0xff,0xe7,0x81,0xbd,0x00,0x05,0xff,0xe7,0x81,0xb7,0x00,0xd1,0x11,0x10,0x08,
+	0x05,0xff,0xe7,0x85,0x85,0x00,0x05,0xff,0xf0,0xa4,0x89,0xa3,0x00,0x10,0x08,0x05,
+	0xff,0xe7,0x86,0x9c,0x00,0x05,0xff,0xe4,0x8e,0xab,0x00,0xcf,0x86,0xe5,0xfb,0xe3,
+	0xd4,0x1a,0xe3,0x33,0xe3,0xe2,0x19,0xe3,0xe1,0x06,0xe3,0x10,0x08,0x05,0xff,0xe7,
+	0x9b,0xb4,0x00,0x05,0xff,0xf0,0xa5,0x83,0xb3,0x00,0xd3,0x16,0xe2,0x7b,0xe3,0xe1,
+	0x69,0xe3,0x10,0x08,0x05,0xff,0xe7,0xa3,0x8c,0x00,0x05,0xff,0xe4,0x83,0xa3,0x00,
+	0xd2,0x13,0xe1,0x97,0xe3,0x10,0x08,0x05,0xff,0xe4,0x84,0xaf,0x00,0x05,0xff,0xe7,
+	0xa9,0x80,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0xa5,0xbc,0x00,0x05,0xff,
+	0xf0,0xa5,0xaa,0xa7,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x05,0xff,
+	0xe7,0xaa,0xae,0x00,0xe0,0xad,0xe6,0xcf,0x86,0xd5,0x1d,0xe4,0x22,0xe5,0xe3,0xde,
+	0xe4,0xe2,0xbc,0xe4,0xe1,0xab,0xe4,0x10,0x09,0x05,0xff,0xf0,0xa3,0x8d,0x9f,0x00,
+	0x05,0xff,0xe4,0x8f,0x95,0x00,0xd4,0x19,0xe3,0xc9,0xe5,0xe2,0xa5,0xe5,0xe1,0x94,
+	0xe5,0x10,0x08,0x05,0xff,0xe8,0x8d,0x93,0x00,0x05,0xff,0xe8,0x8f,0x8a,0x00,0xd3,
+	0x18,0xe2,0x14,0xe6,0xe1,0x03,0xe6,0x10,0x09,0x05,0xff,0xf0,0xa6,0xbe,0xb1,0x00,
+	0x05,0xff,0xf0,0xa7,0x83,0x92,0x00,0xd2,0x13,0xe1,0x2c,0xe6,0x10,0x08,0x05,0xff,
+	0xe8,0x9a,0x88,0x00,0x05,0xff,0xe8,0x9c,0x8e,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,
+	0xe8,0x9c,0xa8,0x00,0x05,0xff,0xe8,0x9d,0xab,0x00,0x10,0x08,0x05,0xff,0xe8,0x9e,
+	0x86,0x00,0x05,0xff,0xe4,0xb5,0x97,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	/* nfdicf_c0100 */
+	0xd7,0xb0,0x56,0x04,0x01,0x00,0x95,0xa8,0xd4,0x5e,0xd3,0x2e,0xd2,0x16,0xd1,0x0a,
+	0x10,0x04,0x01,0x00,0x01,0xff,0x61,0x00,0x10,0x06,0x01,0xff,0x62,0x00,0x01,0xff,
+	0x63,0x00,0xd1,0x0c,0x10,0x06,0x01,0xff,0x64,0x00,0x01,0xff,0x65,0x00,0x10,0x06,
+	0x01,0xff,0x66,0x00,0x01,0xff,0x67,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x06,0x01,0xff,
+	0x68,0x00,0x01,0xff,0x69,0x00,0x10,0x06,0x01,0xff,0x6a,0x00,0x01,0xff,0x6b,0x00,
+	0xd1,0x0c,0x10,0x06,0x01,0xff,0x6c,0x00,0x01,0xff,0x6d,0x00,0x10,0x06,0x01,0xff,
+	0x6e,0x00,0x01,0xff,0x6f,0x00,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x06,0x01,0xff,
+	0x70,0x00,0x01,0xff,0x71,0x00,0x10,0x06,0x01,0xff,0x72,0x00,0x01,0xff,0x73,0x00,
+	0xd1,0x0c,0x10,0x06,0x01,0xff,0x74,0x00,0x01,0xff,0x75,0x00,0x10,0x06,0x01,0xff,
+	0x76,0x00,0x01,0xff,0x77,0x00,0x92,0x16,0xd1,0x0c,0x10,0x06,0x01,0xff,0x78,0x00,
+	0x01,0xff,0x79,0x00,0x10,0x06,0x01,0xff,0x7a,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0xc6,0xe5,0xf9,0x14,0xe4,0x6f,0x0d,0xe3,0x39,0x08,0xe2,0x22,0x01,0xc1,0xd0,0x24,
+	0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x07,0x63,0xd8,0x43,0x01,0x00,0x93,0x13,0x52,
+	0x04,0x01,0x00,0x91,0x0b,0x10,0x04,0x01,0x00,0x01,0xff,0xce,0xbc,0x00,0x01,0x00,
+	0x01,0x00,0xcf,0x86,0xe5,0xb3,0x44,0xd4,0x7f,0xd3,0x3f,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x61,0xcc,0x80,0x00,0x01,0xff,0x61,0xcc,0x81,0x00,0x10,0x08,0x01,
+	0xff,0x61,0xcc,0x82,0x00,0x01,0xff,0x61,0xcc,0x83,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x61,0xcc,0x88,0x00,0x01,0xff,0x61,0xcc,0x8a,0x00,0x10,0x07,0x01,0xff,0xc3,
+	0xa6,0x00,0x01,0xff,0x63,0xcc,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x65,0xcc,0x80,0x00,0x01,0xff,0x65,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,
+	0x82,0x00,0x01,0xff,0x65,0xcc,0x88,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,
+	0x80,0x00,0x01,0xff,0x69,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x69,0xcc,0x82,0x00,
+	0x01,0xff,0x69,0xcc,0x88,0x00,0xd3,0x3b,0xd2,0x1f,0xd1,0x0f,0x10,0x07,0x01,0xff,
+	0xc3,0xb0,0x00,0x01,0xff,0x6e,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x80,
+	0x00,0x01,0xff,0x6f,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x82,
+	0x00,0x01,0xff,0x6f,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x88,0x00,0x01,
+	0x00,0xd2,0x1f,0xd1,0x0f,0x10,0x07,0x01,0xff,0xc3,0xb8,0x00,0x01,0xff,0x75,0xcc,
+	0x80,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0x81,0x00,0x01,0xff,0x75,0xcc,0x82,0x00,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,0x88,0x00,0x01,0xff,0x79,0xcc,0x81,0x00,
+	0x10,0x07,0x01,0xff,0xc3,0xbe,0x00,0x01,0xff,0x73,0x73,0x00,0xe1,0xd4,0x03,0xe0,
+	0xeb,0x01,0xcf,0x86,0xd5,0xfb,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0x61,0xcc,0x84,0x00,0x01,0xff,0x61,0xcc,0x84,0x00,0x10,0x08,0x01,0xff,
+	0x61,0xcc,0x86,0x00,0x01,0xff,0x61,0xcc,0x86,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x61,0xcc,0xa8,0x00,0x01,0xff,0x61,0xcc,0xa8,0x00,0x10,0x08,0x01,0xff,0x63,0xcc,
+	0x81,0x00,0x01,0xff,0x63,0xcc,0x81,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x63,0xcc,0x82,0x00,0x01,0xff,0x63,0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x63,0xcc,
+	0x87,0x00,0x01,0xff,0x63,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x63,0xcc,
+	0x8c,0x00,0x01,0xff,0x63,0xcc,0x8c,0x00,0x10,0x08,0x01,0xff,0x64,0xcc,0x8c,0x00,
+	0x01,0xff,0x64,0xcc,0x8c,0x00,0xd3,0x3b,0xd2,0x1b,0xd1,0x0b,0x10,0x07,0x01,0xff,
+	0xc4,0x91,0x00,0x01,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,0x84,0x00,0x01,0xff,0x65,
+	0xcc,0x84,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x65,0xcc,0x86,0x00,0x01,0xff,0x65,
+	0xcc,0x86,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,0x87,0x00,0x01,0xff,0x65,0xcc,0x87,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x65,0xcc,0xa8,0x00,0x01,0xff,0x65,
+	0xcc,0xa8,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,0x8c,0x00,0x01,0xff,0x65,0xcc,0x8c,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x67,0xcc,0x82,0x00,0x01,0xff,0x67,0xcc,0x82,
+	0x00,0x10,0x08,0x01,0xff,0x67,0xcc,0x86,0x00,0x01,0xff,0x67,0xcc,0x86,0x00,0xd4,
+	0x7b,0xd3,0x3b,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x67,0xcc,0x87,0x00,0x01,
+	0xff,0x67,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x67,0xcc,0xa7,0x00,0x01,0xff,0x67,
+	0xcc,0xa7,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x68,0xcc,0x82,0x00,0x01,0xff,0x68,
+	0xcc,0x82,0x00,0x10,0x07,0x01,0xff,0xc4,0xa7,0x00,0x01,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0x69,0xcc,0x83,0x00,0x01,0xff,0x69,0xcc,0x83,0x00,0x10,0x08,
+	0x01,0xff,0x69,0xcc,0x84,0x00,0x01,0xff,0x69,0xcc,0x84,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0x69,0xcc,0x86,0x00,0x01,0xff,0x69,0xcc,0x86,0x00,0x10,0x08,0x01,0xff,
+	0x69,0xcc,0xa8,0x00,0x01,0xff,0x69,0xcc,0xa8,0x00,0xd3,0x37,0xd2,0x17,0xd1,0x0c,
+	0x10,0x08,0x01,0xff,0x69,0xcc,0x87,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xc4,0xb3,
+	0x00,0x01,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6a,0xcc,0x82,0x00,0x01,0xff,0x6a,
+	0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x6b,0xcc,0xa7,0x00,0x01,0xff,0x6b,0xcc,0xa7,
+	0x00,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x6c,0xcc,0x81,0x00,0x10,
+	0x08,0x01,0xff,0x6c,0xcc,0x81,0x00,0x01,0xff,0x6c,0xcc,0xa7,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x6c,0xcc,0xa7,0x00,0x01,0xff,0x6c,0xcc,0x8c,0x00,0x10,0x08,0x01,
+	0xff,0x6c,0xcc,0x8c,0x00,0x01,0xff,0xc5,0x80,0x00,0xcf,0x86,0xd5,0xed,0xd4,0x72,
+	0xd3,0x37,0xd2,0x17,0xd1,0x0b,0x10,0x04,0x01,0x00,0x01,0xff,0xc5,0x82,0x00,0x10,
+	0x04,0x01,0x00,0x01,0xff,0x6e,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6e,
+	0xcc,0x81,0x00,0x01,0xff,0x6e,0xcc,0xa7,0x00,0x10,0x08,0x01,0xff,0x6e,0xcc,0xa7,
+	0x00,0x01,0xff,0x6e,0xcc,0x8c,0x00,0xd2,0x1b,0xd1,0x10,0x10,0x08,0x01,0xff,0x6e,
+	0xcc,0x8c,0x00,0x01,0xff,0xca,0xbc,0x6e,0x00,0x10,0x07,0x01,0xff,0xc5,0x8b,0x00,
+	0x01,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x84,0x00,0x01,0xff,0x6f,0xcc,
+	0x84,0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x86,0x00,0x01,0xff,0x6f,0xcc,0x86,0x00,
+	0xd3,0x3b,0xd2,0x1b,0xd1,0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x8b,0x00,0x01,0xff,
+	0x6f,0xcc,0x8b,0x00,0x10,0x07,0x01,0xff,0xc5,0x93,0x00,0x01,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x72,0xcc,0x81,0x00,0x01,0xff,0x72,0xcc,0x81,0x00,0x10,0x08,0x01,
+	0xff,0x72,0xcc,0xa7,0x00,0x01,0xff,0x72,0xcc,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x72,0xcc,0x8c,0x00,0x01,0xff,0x72,0xcc,0x8c,0x00,0x10,0x08,0x01,
+	0xff,0x73,0xcc,0x81,0x00,0x01,0xff,0x73,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x73,0xcc,0x82,0x00,0x01,0xff,0x73,0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x73,
+	0xcc,0xa7,0x00,0x01,0xff,0x73,0xcc,0xa7,0x00,0xd4,0x7b,0xd3,0x3b,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x73,0xcc,0x8c,0x00,0x01,0xff,0x73,0xcc,0x8c,0x00,0x10,
+	0x08,0x01,0xff,0x74,0xcc,0xa7,0x00,0x01,0xff,0x74,0xcc,0xa7,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x74,0xcc,0x8c,0x00,0x01,0xff,0x74,0xcc,0x8c,0x00,0x10,0x07,0x01,
+	0xff,0xc5,0xa7,0x00,0x01,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,
+	0x83,0x00,0x01,0xff,0x75,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0x84,0x00,
+	0x01,0xff,0x75,0xcc,0x84,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,0x86,0x00,
+	0x01,0xff,0x75,0xcc,0x86,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0x8a,0x00,0x01,0xff,
+	0x75,0xcc,0x8a,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,
+	0x8b,0x00,0x01,0xff,0x75,0xcc,0x8b,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0xa8,0x00,
+	0x01,0xff,0x75,0xcc,0xa8,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0x82,0x00,
+	0x01,0xff,0x77,0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x79,0xcc,0x82,0x00,0x01,0xff,
+	0x79,0xcc,0x82,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x79,0xcc,0x88,0x00,
+	0x01,0xff,0x7a,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x7a,0xcc,0x81,0x00,0x01,0xff,
+	0x7a,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x7a,0xcc,0x87,0x00,0x01,0xff,
+	0x7a,0xcc,0x8c,0x00,0x10,0x08,0x01,0xff,0x7a,0xcc,0x8c,0x00,0x01,0xff,0x73,0x00,
+	0xe0,0x65,0x01,0xcf,0x86,0xd5,0xb4,0xd4,0x5a,0xd3,0x2f,0xd2,0x16,0xd1,0x0b,0x10,
+	0x04,0x01,0x00,0x01,0xff,0xc9,0x93,0x00,0x10,0x07,0x01,0xff,0xc6,0x83,0x00,0x01,
+	0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xc6,0x85,0x00,0x01,0x00,0x10,0x07,0x01,0xff,
+	0xc9,0x94,0x00,0x01,0xff,0xc6,0x88,0x00,0xd2,0x19,0xd1,0x0b,0x10,0x04,0x01,0x00,
+	0x01,0xff,0xc9,0x96,0x00,0x10,0x07,0x01,0xff,0xc9,0x97,0x00,0x01,0xff,0xc6,0x8c,
+	0x00,0x51,0x04,0x01,0x00,0x10,0x07,0x01,0xff,0xc7,0x9d,0x00,0x01,0xff,0xc9,0x99,
+	0x00,0xd3,0x32,0xd2,0x19,0xd1,0x0e,0x10,0x07,0x01,0xff,0xc9,0x9b,0x00,0x01,0xff,
+	0xc6,0x92,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xc9,0xa0,0x00,0xd1,0x0b,0x10,0x07,
+	0x01,0xff,0xc9,0xa3,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xc9,0xa9,0x00,0x01,0xff,
+	0xc9,0xa8,0x00,0xd2,0x0f,0x91,0x0b,0x10,0x07,0x01,0xff,0xc6,0x99,0x00,0x01,0x00,
+	0x01,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xc9,0xaf,0x00,0x01,0xff,0xc9,0xb2,0x00,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xc9,0xb5,0x00,0xd4,0x5d,0xd3,0x34,0xd2,0x1b,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x9b,0x00,0x01,0xff,0x6f,0xcc,0x9b,0x00,0x10,
+	0x07,0x01,0xff,0xc6,0xa3,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xc6,0xa5,
+	0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xca,0x80,0x00,0x01,0xff,0xc6,0xa8,0x00,0xd2,
+	0x0f,0x91,0x0b,0x10,0x04,0x01,0x00,0x01,0xff,0xca,0x83,0x00,0x01,0x00,0xd1,0x0b,
+	0x10,0x07,0x01,0xff,0xc6,0xad,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xca,0x88,0x00,
+	0x01,0xff,0x75,0xcc,0x9b,0x00,0xd3,0x33,0xd2,0x1d,0xd1,0x0f,0x10,0x08,0x01,0xff,
+	0x75,0xcc,0x9b,0x00,0x01,0xff,0xca,0x8a,0x00,0x10,0x07,0x01,0xff,0xca,0x8b,0x00,
+	0x01,0xff,0xc6,0xb4,0x00,0xd1,0x0b,0x10,0x04,0x01,0x00,0x01,0xff,0xc6,0xb6,0x00,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xca,0x92,0x00,0xd2,0x0f,0x91,0x0b,0x10,0x07,0x01,
+	0xff,0xc6,0xb9,0x00,0x01,0x00,0x01,0x00,0x91,0x0b,0x10,0x07,0x01,0xff,0xc6,0xbd,
+	0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0xd4,0xd4,0x44,0xd3,0x16,0x52,0x04,0x01,
+	0x00,0x51,0x07,0x01,0xff,0xc7,0x86,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xc7,0x89,
+	0x00,0xd2,0x12,0x91,0x0b,0x10,0x07,0x01,0xff,0xc7,0x89,0x00,0x01,0x00,0x01,0xff,
+	0xc7,0x8c,0x00,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x61,0xcc,0x8c,0x00,0x10,
+	0x08,0x01,0xff,0x61,0xcc,0x8c,0x00,0x01,0xff,0x69,0xcc,0x8c,0x00,0xd3,0x46,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,0x8c,0x00,0x01,0xff,0x6f,0xcc,0x8c,
+	0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x8c,0x00,0x01,0xff,0x75,0xcc,0x8c,0x00,0xd1,
+	0x12,0x10,0x08,0x01,0xff,0x75,0xcc,0x8c,0x00,0x01,0xff,0x75,0xcc,0x88,0xcc,0x84,
+	0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,0xcc,0x84,0x00,0x01,0xff,0x75,0xcc,0x88,
+	0xcc,0x81,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,0xcc,0x81,
+	0x00,0x01,0xff,0x75,0xcc,0x88,0xcc,0x8c,0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,
+	0xcc,0x8c,0x00,0x01,0xff,0x75,0xcc,0x88,0xcc,0x80,0x00,0xd1,0x0e,0x10,0x0a,0x01,
+	0xff,0x75,0xcc,0x88,0xcc,0x80,0x00,0x01,0x00,0x10,0x0a,0x01,0xff,0x61,0xcc,0x88,
+	0xcc,0x84,0x00,0x01,0xff,0x61,0xcc,0x88,0xcc,0x84,0x00,0xd4,0x87,0xd3,0x41,0xd2,
+	0x26,0xd1,0x14,0x10,0x0a,0x01,0xff,0x61,0xcc,0x87,0xcc,0x84,0x00,0x01,0xff,0x61,
+	0xcc,0x87,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xc3,0xa6,0xcc,0x84,0x00,0x01,0xff,
+	0xc3,0xa6,0xcc,0x84,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xc7,0xa5,0x00,0x01,0x00,
+	0x10,0x08,0x01,0xff,0x67,0xcc,0x8c,0x00,0x01,0xff,0x67,0xcc,0x8c,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x6b,0xcc,0x8c,0x00,0x01,0xff,0x6b,0xcc,0x8c,0x00,
+	0x10,0x08,0x01,0xff,0x6f,0xcc,0xa8,0x00,0x01,0xff,0x6f,0xcc,0xa8,0x00,0xd1,0x14,
+	0x10,0x0a,0x01,0xff,0x6f,0xcc,0xa8,0xcc,0x84,0x00,0x01,0xff,0x6f,0xcc,0xa8,0xcc,
+	0x84,0x00,0x10,0x09,0x01,0xff,0xca,0x92,0xcc,0x8c,0x00,0x01,0xff,0xca,0x92,0xcc,
+	0x8c,0x00,0xd3,0x38,0xd2,0x1a,0xd1,0x0f,0x10,0x08,0x01,0xff,0x6a,0xcc,0x8c,0x00,
+	0x01,0xff,0xc7,0xb3,0x00,0x10,0x07,0x01,0xff,0xc7,0xb3,0x00,0x01,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0x67,0xcc,0x81,0x00,0x01,0xff,0x67,0xcc,0x81,0x00,0x10,0x07,
+	0x04,0xff,0xc6,0x95,0x00,0x04,0xff,0xc6,0xbf,0x00,0xd2,0x24,0xd1,0x10,0x10,0x08,
+	0x04,0xff,0x6e,0xcc,0x80,0x00,0x04,0xff,0x6e,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,
+	0x61,0xcc,0x8a,0xcc,0x81,0x00,0x01,0xff,0x61,0xcc,0x8a,0xcc,0x81,0x00,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xc3,0xa6,0xcc,0x81,0x00,0x01,0xff,0xc3,0xa6,0xcc,0x81,0x00,
+	0x10,0x09,0x01,0xff,0xc3,0xb8,0xcc,0x81,0x00,0x01,0xff,0xc3,0xb8,0xcc,0x81,0x00,
+	0xe2,0x31,0x02,0xe1,0xc3,0x44,0xe0,0xc8,0x01,0xcf,0x86,0xd5,0xfb,0xd4,0x80,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x61,0xcc,0x8f,0x00,0x01,0xff,0x61,
+	0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x61,0xcc,0x91,0x00,0x01,0xff,0x61,0xcc,0x91,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x65,0xcc,0x8f,0x00,0x01,0xff,0x65,0xcc,0x8f,
+	0x00,0x10,0x08,0x01,0xff,0x65,0xcc,0x91,0x00,0x01,0xff,0x65,0xcc,0x91,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,0x8f,0x00,0x01,0xff,0x69,0xcc,0x8f,
+	0x00,0x10,0x08,0x01,0xff,0x69,0xcc,0x91,0x00,0x01,0xff,0x69,0xcc,0x91,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x8f,0x00,0x01,0xff,0x6f,0xcc,0x8f,0x00,0x10,
+	0x08,0x01,0xff,0x6f,0xcc,0x91,0x00,0x01,0xff,0x6f,0xcc,0x91,0x00,0xd3,0x40,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x72,0xcc,0x8f,0x00,0x01,0xff,0x72,0xcc,0x8f,
+	0x00,0x10,0x08,0x01,0xff,0x72,0xcc,0x91,0x00,0x01,0xff,0x72,0xcc,0x91,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x75,0xcc,0x8f,0x00,0x01,0xff,0x75,0xcc,0x8f,0x00,0x10,
+	0x08,0x01,0xff,0x75,0xcc,0x91,0x00,0x01,0xff,0x75,0xcc,0x91,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x04,0xff,0x73,0xcc,0xa6,0x00,0x04,0xff,0x73,0xcc,0xa6,0x00,0x10,
+	0x08,0x04,0xff,0x74,0xcc,0xa6,0x00,0x04,0xff,0x74,0xcc,0xa6,0x00,0xd1,0x0b,0x10,
+	0x07,0x04,0xff,0xc8,0x9d,0x00,0x04,0x00,0x10,0x08,0x04,0xff,0x68,0xcc,0x8c,0x00,
+	0x04,0xff,0x68,0xcc,0x8c,0x00,0xd4,0x79,0xd3,0x31,0xd2,0x16,0xd1,0x0b,0x10,0x07,
+	0x06,0xff,0xc6,0x9e,0x00,0x07,0x00,0x10,0x07,0x04,0xff,0xc8,0xa3,0x00,0x04,0x00,
+	0xd1,0x0b,0x10,0x07,0x04,0xff,0xc8,0xa5,0x00,0x04,0x00,0x10,0x08,0x04,0xff,0x61,
+	0xcc,0x87,0x00,0x04,0xff,0x61,0xcc,0x87,0x00,0xd2,0x24,0xd1,0x10,0x10,0x08,0x04,
+	0xff,0x65,0xcc,0xa7,0x00,0x04,0xff,0x65,0xcc,0xa7,0x00,0x10,0x0a,0x04,0xff,0x6f,
+	0xcc,0x88,0xcc,0x84,0x00,0x04,0xff,0x6f,0xcc,0x88,0xcc,0x84,0x00,0xd1,0x14,0x10,
+	0x0a,0x04,0xff,0x6f,0xcc,0x83,0xcc,0x84,0x00,0x04,0xff,0x6f,0xcc,0x83,0xcc,0x84,
+	0x00,0x10,0x08,0x04,0xff,0x6f,0xcc,0x87,0x00,0x04,0xff,0x6f,0xcc,0x87,0x00,0xd3,
+	0x27,0xe2,0x21,0x43,0xd1,0x14,0x10,0x0a,0x04,0xff,0x6f,0xcc,0x87,0xcc,0x84,0x00,
+	0x04,0xff,0x6f,0xcc,0x87,0xcc,0x84,0x00,0x10,0x08,0x04,0xff,0x79,0xcc,0x84,0x00,
+	0x04,0xff,0x79,0xcc,0x84,0x00,0xd2,0x13,0x51,0x04,0x08,0x00,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0xa5,0x00,0x08,0xff,0xc8,0xbc,0x00,0xd1,0x0b,0x10,0x04,0x08,0x00,0x08,
+	0xff,0xc6,0x9a,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,0xa6,0x00,0x08,0x00,0xcf,0x86,
+	0x95,0x5f,0x94,0x5b,0xd3,0x2f,0xd2,0x16,0xd1,0x0b,0x10,0x04,0x08,0x00,0x08,0xff,
+	0xc9,0x82,0x00,0x10,0x04,0x09,0x00,0x09,0xff,0xc6,0x80,0x00,0xd1,0x0e,0x10,0x07,
+	0x09,0xff,0xca,0x89,0x00,0x09,0xff,0xca,0x8c,0x00,0x10,0x07,0x09,0xff,0xc9,0x87,
+	0x00,0x09,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x09,0xff,0xc9,0x89,0x00,0x09,0x00,
+	0x10,0x07,0x09,0xff,0xc9,0x8b,0x00,0x09,0x00,0xd1,0x0b,0x10,0x07,0x09,0xff,0xc9,
+	0x8d,0x00,0x09,0x00,0x10,0x07,0x09,0xff,0xc9,0x8f,0x00,0x09,0x00,0x01,0x00,0x01,
+	0x00,0xd1,0x8b,0xd0,0x0c,0xcf,0x86,0xe5,0x10,0x43,0x64,0xef,0x42,0x01,0xe6,0xcf,
+	0x86,0xd5,0x2a,0xe4,0x99,0x43,0xe3,0x7f,0x43,0xd2,0x11,0xe1,0x5e,0x43,0x10,0x07,
+	0x01,0xff,0xcc,0x80,0x00,0x01,0xff,0xcc,0x81,0x00,0xe1,0x65,0x43,0x10,0x09,0x01,
+	0xff,0xcc,0x88,0xcc,0x81,0x00,0x01,0xff,0xce,0xb9,0x00,0xd4,0x0f,0x93,0x0b,0x92,
+	0x07,0x61,0xab,0x43,0x01,0xea,0x06,0xe6,0x06,0xe6,0xd3,0x2c,0xd2,0x16,0xd1,0x0b,
+	0x10,0x07,0x0a,0xff,0xcd,0xb1,0x00,0x0a,0x00,0x10,0x07,0x0a,0xff,0xcd,0xb3,0x00,
+	0x0a,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xca,0xb9,0x00,0x01,0x00,0x10,0x07,0x0a,
+	0xff,0xcd,0xb7,0x00,0x0a,0x00,0xd2,0x07,0x61,0x97,0x43,0x00,0x00,0x51,0x04,0x09,
+	0x00,0x10,0x06,0x01,0xff,0x3b,0x00,0x10,0xff,0xcf,0xb3,0x00,0xe0,0x31,0x01,0xcf,
+	0x86,0xd5,0xd3,0xd4,0x5f,0xd3,0x21,0x52,0x04,0x00,0x00,0xd1,0x0d,0x10,0x04,0x01,
+	0x00,0x01,0xff,0xc2,0xa8,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x81,
+	0x00,0x01,0xff,0xc2,0xb7,0x00,0xd2,0x1f,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb5,
+	0xcc,0x81,0x00,0x01,0xff,0xce,0xb7,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,0xb9,
+	0xcc,0x81,0x00,0x00,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xce,0xbf,0xcc,0x81,0x00,
+	0x00,0x00,0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x81,0x00,0x01,0xff,0xcf,0x89,0xcc,
+	0x81,0x00,0xd3,0x3c,0xd2,0x20,0xd1,0x12,0x10,0x0b,0x01,0xff,0xce,0xb9,0xcc,0x88,
+	0xcc,0x81,0x00,0x01,0xff,0xce,0xb1,0x00,0x10,0x07,0x01,0xff,0xce,0xb2,0x00,0x01,
+	0xff,0xce,0xb3,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xce,0xb4,0x00,0x01,0xff,0xce,
+	0xb5,0x00,0x10,0x07,0x01,0xff,0xce,0xb6,0x00,0x01,0xff,0xce,0xb7,0x00,0xd2,0x1c,
+	0xd1,0x0e,0x10,0x07,0x01,0xff,0xce,0xb8,0x00,0x01,0xff,0xce,0xb9,0x00,0x10,0x07,
+	0x01,0xff,0xce,0xba,0x00,0x01,0xff,0xce,0xbb,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,
+	0xce,0xbc,0x00,0x01,0xff,0xce,0xbd,0x00,0x10,0x07,0x01,0xff,0xce,0xbe,0x00,0x01,
+	0xff,0xce,0xbf,0x00,0xe4,0x85,0x43,0xd3,0x35,0xd2,0x19,0xd1,0x0e,0x10,0x07,0x01,
+	0xff,0xcf,0x80,0x00,0x01,0xff,0xcf,0x81,0x00,0x10,0x04,0x00,0x00,0x01,0xff,0xcf,
+	0x83,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xcf,0x84,0x00,0x01,0xff,0xcf,0x85,0x00,
+	0x10,0x07,0x01,0xff,0xcf,0x86,0x00,0x01,0xff,0xcf,0x87,0x00,0xe2,0x2b,0x43,0xd1,
+	0x0e,0x10,0x07,0x01,0xff,0xcf,0x88,0x00,0x01,0xff,0xcf,0x89,0x00,0x10,0x09,0x01,
+	0xff,0xce,0xb9,0xcc,0x88,0x00,0x01,0xff,0xcf,0x85,0xcc,0x88,0x00,0xcf,0x86,0xd5,
+	0x94,0xd4,0x3c,0xd3,0x13,0x92,0x0f,0x51,0x04,0x01,0x00,0x10,0x07,0x01,0xff,0xcf,
+	0x83,0x00,0x01,0x00,0x01,0x00,0xd2,0x07,0x61,0x3a,0x43,0x01,0x00,0xd1,0x12,0x10,
+	0x09,0x01,0xff,0xce,0xbf,0xcc,0x81,0x00,0x01,0xff,0xcf,0x85,0xcc,0x81,0x00,0x10,
+	0x09,0x01,0xff,0xcf,0x89,0xcc,0x81,0x00,0x0a,0xff,0xcf,0x97,0x00,0xd3,0x2c,0xd2,
+	0x11,0xe1,0x46,0x43,0x10,0x07,0x01,0xff,0xce,0xb2,0x00,0x01,0xff,0xce,0xb8,0x00,
+	0xd1,0x10,0x10,0x09,0x01,0xff,0xcf,0x92,0xcc,0x88,0x00,0x01,0xff,0xcf,0x86,0x00,
+	0x10,0x07,0x01,0xff,0xcf,0x80,0x00,0x04,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x06,
+	0xff,0xcf,0x99,0x00,0x06,0x00,0x10,0x07,0x01,0xff,0xcf,0x9b,0x00,0x04,0x00,0xd1,
+	0x0b,0x10,0x07,0x01,0xff,0xcf,0x9d,0x00,0x04,0x00,0x10,0x07,0x01,0xff,0xcf,0x9f,
+	0x00,0x04,0x00,0xd4,0x58,0xd3,0x2c,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xcf,
+	0xa1,0x00,0x04,0x00,0x10,0x07,0x01,0xff,0xcf,0xa3,0x00,0x01,0x00,0xd1,0x0b,0x10,
+	0x07,0x01,0xff,0xcf,0xa5,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xcf,0xa7,0x00,0x01,
+	0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xcf,0xa9,0x00,0x01,0x00,0x10,0x07,
+	0x01,0xff,0xcf,0xab,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xcf,0xad,0x00,
+	0x01,0x00,0x10,0x07,0x01,0xff,0xcf,0xaf,0x00,0x01,0x00,0xd3,0x2b,0xd2,0x12,0x91,
+	0x0e,0x10,0x07,0x01,0xff,0xce,0xba,0x00,0x01,0xff,0xcf,0x81,0x00,0x01,0x00,0xd1,
+	0x0e,0x10,0x07,0x05,0xff,0xce,0xb8,0x00,0x05,0xff,0xce,0xb5,0x00,0x10,0x04,0x06,
+	0x00,0x07,0xff,0xcf,0xb8,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x04,0x07,0x00,0x07,0xff,
+	0xcf,0xb2,0x00,0x10,0x07,0x07,0xff,0xcf,0xbb,0x00,0x07,0x00,0xd1,0x0b,0x10,0x04,
+	0x08,0x00,0x08,0xff,0xcd,0xbb,0x00,0x10,0x07,0x08,0xff,0xcd,0xbc,0x00,0x08,0xff,
+	0xcd,0xbd,0x00,0xe3,0xed,0x46,0xe2,0x3d,0x05,0xe1,0x27,0x02,0xe0,0x66,0x01,0xcf,
+	0x86,0xd5,0xf0,0xd4,0x7e,0xd3,0x40,0xd2,0x22,0xd1,0x12,0x10,0x09,0x04,0xff,0xd0,
+	0xb5,0xcc,0x80,0x00,0x01,0xff,0xd0,0xb5,0xcc,0x88,0x00,0x10,0x07,0x01,0xff,0xd1,
+	0x92,0x00,0x01,0xff,0xd0,0xb3,0xcc,0x81,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd1,
+	0x94,0x00,0x01,0xff,0xd1,0x95,0x00,0x10,0x07,0x01,0xff,0xd1,0x96,0x00,0x01,0xff,
+	0xd1,0x96,0xcc,0x88,0x00,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd1,0x98,0x00,
+	0x01,0xff,0xd1,0x99,0x00,0x10,0x07,0x01,0xff,0xd1,0x9a,0x00,0x01,0xff,0xd1,0x9b,
+	0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,0xba,0xcc,0x81,0x00,0x04,0xff,0xd0,0xb8,
+	0xcc,0x80,0x00,0x10,0x09,0x01,0xff,0xd1,0x83,0xcc,0x86,0x00,0x01,0xff,0xd1,0x9f,
+	0x00,0xd3,0x38,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd0,0xb0,0x00,0x01,0xff,
+	0xd0,0xb1,0x00,0x10,0x07,0x01,0xff,0xd0,0xb2,0x00,0x01,0xff,0xd0,0xb3,0x00,0xd1,
+	0x0e,0x10,0x07,0x01,0xff,0xd0,0xb4,0x00,0x01,0xff,0xd0,0xb5,0x00,0x10,0x07,0x01,
+	0xff,0xd0,0xb6,0x00,0x01,0xff,0xd0,0xb7,0x00,0xd2,0x1e,0xd1,0x10,0x10,0x07,0x01,
+	0xff,0xd0,0xb8,0x00,0x01,0xff,0xd0,0xb8,0xcc,0x86,0x00,0x10,0x07,0x01,0xff,0xd0,
+	0xba,0x00,0x01,0xff,0xd0,0xbb,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd0,0xbc,0x00,
+	0x01,0xff,0xd0,0xbd,0x00,0x10,0x07,0x01,0xff,0xd0,0xbe,0x00,0x01,0xff,0xd0,0xbf,
+	0x00,0xe4,0x25,0x42,0xd3,0x38,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd1,0x80,
+	0x00,0x01,0xff,0xd1,0x81,0x00,0x10,0x07,0x01,0xff,0xd1,0x82,0x00,0x01,0xff,0xd1,
+	0x83,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd1,0x84,0x00,0x01,0xff,0xd1,0x85,0x00,
+	0x10,0x07,0x01,0xff,0xd1,0x86,0x00,0x01,0xff,0xd1,0x87,0x00,0xd2,0x1c,0xd1,0x0e,
+	0x10,0x07,0x01,0xff,0xd1,0x88,0x00,0x01,0xff,0xd1,0x89,0x00,0x10,0x07,0x01,0xff,
+	0xd1,0x8a,0x00,0x01,0xff,0xd1,0x8b,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd1,0x8c,
+	0x00,0x01,0xff,0xd1,0x8d,0x00,0x10,0x07,0x01,0xff,0xd1,0x8e,0x00,0x01,0xff,0xd1,
+	0x8f,0x00,0xcf,0x86,0xd5,0x07,0x64,0xcf,0x41,0x01,0x00,0xd4,0x58,0xd3,0x2c,0xd2,
+	0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd1,0xa1,0x00,0x01,0x00,0x10,0x07,0x01,0xff,
+	0xd1,0xa3,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd1,0xa5,0x00,0x01,0x00,
+	0x10,0x07,0x01,0xff,0xd1,0xa7,0x00,0x01,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,
+	0xff,0xd1,0xa9,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd1,0xab,0x00,0x01,0x00,0xd1,
+	0x0b,0x10,0x07,0x01,0xff,0xd1,0xad,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd1,0xaf,
+	0x00,0x01,0x00,0xd3,0x33,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd1,0xb1,0x00,
+	0x01,0x00,0x10,0x07,0x01,0xff,0xd1,0xb3,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,
+	0xff,0xd1,0xb5,0x00,0x01,0x00,0x10,0x09,0x01,0xff,0xd1,0xb5,0xcc,0x8f,0x00,0x01,
+	0xff,0xd1,0xb5,0xcc,0x8f,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd1,0xb9,
+	0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd1,0xbb,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,
+	0x01,0xff,0xd1,0xbd,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd1,0xbf,0x00,0x01,0x00,
+	0xe0,0x41,0x01,0xcf,0x86,0xd5,0x8e,0xd4,0x36,0xd3,0x11,0xe2,0x91,0x41,0xe1,0x88,
+	0x41,0x10,0x07,0x01,0xff,0xd2,0x81,0x00,0x01,0x00,0xd2,0x0f,0x51,0x04,0x04,0x00,
+	0x10,0x07,0x06,0xff,0xd2,0x8b,0x00,0x06,0x00,0xd1,0x0b,0x10,0x07,0x04,0xff,0xd2,
+	0x8d,0x00,0x04,0x00,0x10,0x07,0x04,0xff,0xd2,0x8f,0x00,0x04,0x00,0xd3,0x2c,0xd2,
+	0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,0x91,0x00,0x01,0x00,0x10,0x07,0x01,0xff,
+	0xd2,0x93,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,0x95,0x00,0x01,0x00,
+	0x10,0x07,0x01,0xff,0xd2,0x97,0x00,0x01,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,
+	0xff,0xd2,0x99,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0x9b,0x00,0x01,0x00,0xd1,
+	0x0b,0x10,0x07,0x01,0xff,0xd2,0x9d,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0x9f,
+	0x00,0x01,0x00,0xd4,0x58,0xd3,0x2c,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,
+	0xa1,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xa3,0x00,0x01,0x00,0xd1,0x0b,0x10,
+	0x07,0x01,0xff,0xd2,0xa5,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xa7,0x00,0x01,
+	0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,0xa9,0x00,0x01,0x00,0x10,0x07,
+	0x01,0xff,0xd2,0xab,0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,0xad,0x00,
+	0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xaf,0x00,0x01,0x00,0xd3,0x2c,0xd2,0x16,0xd1,
+	0x0b,0x10,0x07,0x01,0xff,0xd2,0xb1,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xb3,
+	0x00,0x01,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,0xb5,0x00,0x01,0x00,0x10,0x07,
+	0x01,0xff,0xd2,0xb7,0x00,0x01,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd2,
+	0xb9,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xbb,0x00,0x01,0x00,0xd1,0x0b,0x10,
+	0x07,0x01,0xff,0xd2,0xbd,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xd2,0xbf,0x00,0x01,
+	0x00,0xcf,0x86,0xd5,0xdc,0xd4,0x5a,0xd3,0x36,0xd2,0x20,0xd1,0x10,0x10,0x07,0x01,
+	0xff,0xd3,0x8f,0x00,0x01,0xff,0xd0,0xb6,0xcc,0x86,0x00,0x10,0x09,0x01,0xff,0xd0,
+	0xb6,0xcc,0x86,0x00,0x01,0xff,0xd3,0x84,0x00,0xd1,0x0b,0x10,0x04,0x01,0x00,0x06,
+	0xff,0xd3,0x86,0x00,0x10,0x04,0x06,0x00,0x01,0xff,0xd3,0x88,0x00,0xd2,0x16,0xd1,
+	0x0b,0x10,0x04,0x01,0x00,0x06,0xff,0xd3,0x8a,0x00,0x10,0x04,0x06,0x00,0x01,0xff,
+	0xd3,0x8c,0x00,0xe1,0x69,0x40,0x10,0x04,0x01,0x00,0x06,0xff,0xd3,0x8e,0x00,0xd3,
+	0x41,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,0xb0,0xcc,0x86,0x00,0x01,0xff,
+	0xd0,0xb0,0xcc,0x86,0x00,0x10,0x09,0x01,0xff,0xd0,0xb0,0xcc,0x88,0x00,0x01,0xff,
+	0xd0,0xb0,0xcc,0x88,0x00,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd3,0x95,0x00,0x01,0x00,
+	0x10,0x09,0x01,0xff,0xd0,0xb5,0xcc,0x86,0x00,0x01,0xff,0xd0,0xb5,0xcc,0x86,0x00,
+	0xd2,0x1d,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd3,0x99,0x00,0x01,0x00,0x10,0x09,0x01,
+	0xff,0xd3,0x99,0xcc,0x88,0x00,0x01,0xff,0xd3,0x99,0xcc,0x88,0x00,0xd1,0x12,0x10,
+	0x09,0x01,0xff,0xd0,0xb6,0xcc,0x88,0x00,0x01,0xff,0xd0,0xb6,0xcc,0x88,0x00,0x10,
+	0x09,0x01,0xff,0xd0,0xb7,0xcc,0x88,0x00,0x01,0xff,0xd0,0xb7,0xcc,0x88,0x00,0xd4,
+	0x82,0xd3,0x41,0xd2,0x1d,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd3,0xa1,0x00,0x01,0x00,
+	0x10,0x09,0x01,0xff,0xd0,0xb8,0xcc,0x84,0x00,0x01,0xff,0xd0,0xb8,0xcc,0x84,0x00,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,0xb8,0xcc,0x88,0x00,0x01,0xff,0xd0,0xb8,0xcc,
+	0x88,0x00,0x10,0x09,0x01,0xff,0xd0,0xbe,0xcc,0x88,0x00,0x01,0xff,0xd0,0xbe,0xcc,
+	0x88,0x00,0xd2,0x1d,0xd1,0x0b,0x10,0x07,0x01,0xff,0xd3,0xa9,0x00,0x01,0x00,0x10,
+	0x09,0x01,0xff,0xd3,0xa9,0xcc,0x88,0x00,0x01,0xff,0xd3,0xa9,0xcc,0x88,0x00,0xd1,
+	0x12,0x10,0x09,0x04,0xff,0xd1,0x8d,0xcc,0x88,0x00,0x04,0xff,0xd1,0x8d,0xcc,0x88,
+	0x00,0x10,0x09,0x01,0xff,0xd1,0x83,0xcc,0x84,0x00,0x01,0xff,0xd1,0x83,0xcc,0x84,
+	0x00,0xd3,0x41,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd1,0x83,0xcc,0x88,0x00,
+	0x01,0xff,0xd1,0x83,0xcc,0x88,0x00,0x10,0x09,0x01,0xff,0xd1,0x83,0xcc,0x8b,0x00,
+	0x01,0xff,0xd1,0x83,0xcc,0x8b,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd1,0x87,0xcc,
+	0x88,0x00,0x01,0xff,0xd1,0x87,0xcc,0x88,0x00,0x10,0x07,0x08,0xff,0xd3,0xb7,0x00,
+	0x08,0x00,0xd2,0x1d,0xd1,0x12,0x10,0x09,0x01,0xff,0xd1,0x8b,0xcc,0x88,0x00,0x01,
+	0xff,0xd1,0x8b,0xcc,0x88,0x00,0x10,0x07,0x09,0xff,0xd3,0xbb,0x00,0x09,0x00,0xd1,
+	0x0b,0x10,0x07,0x09,0xff,0xd3,0xbd,0x00,0x09,0x00,0x10,0x07,0x09,0xff,0xd3,0xbf,
+	0x00,0x09,0x00,0xe1,0x26,0x02,0xe0,0x78,0x01,0xcf,0x86,0xd5,0xb0,0xd4,0x58,0xd3,
+	0x2c,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x06,0xff,0xd4,0x81,0x00,0x06,0x00,0x10,0x07,
+	0x06,0xff,0xd4,0x83,0x00,0x06,0x00,0xd1,0x0b,0x10,0x07,0x06,0xff,0xd4,0x85,0x00,
+	0x06,0x00,0x10,0x07,0x06,0xff,0xd4,0x87,0x00,0x06,0x00,0xd2,0x16,0xd1,0x0b,0x10,
+	0x07,0x06,0xff,0xd4,0x89,0x00,0x06,0x00,0x10,0x07,0x06,0xff,0xd4,0x8b,0x00,0x06,
+	0x00,0xd1,0x0b,0x10,0x07,0x06,0xff,0xd4,0x8d,0x00,0x06,0x00,0x10,0x07,0x06,0xff,
+	0xd4,0x8f,0x00,0x06,0x00,0xd3,0x2c,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x09,0xff,0xd4,
+	0x91,0x00,0x09,0x00,0x10,0x07,0x09,0xff,0xd4,0x93,0x00,0x09,0x00,0xd1,0x0b,0x10,
+	0x07,0x0a,0xff,0xd4,0x95,0x00,0x0a,0x00,0x10,0x07,0x0a,0xff,0xd4,0x97,0x00,0x0a,
+	0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x0a,0xff,0xd4,0x99,0x00,0x0a,0x00,0x10,0x07,
+	0x0a,0xff,0xd4,0x9b,0x00,0x0a,0x00,0xd1,0x0b,0x10,0x07,0x0a,0xff,0xd4,0x9d,0x00,
+	0x0a,0x00,0x10,0x07,0x0a,0xff,0xd4,0x9f,0x00,0x0a,0x00,0xd4,0x58,0xd3,0x2c,0xd2,
+	0x16,0xd1,0x0b,0x10,0x07,0x0a,0xff,0xd4,0xa1,0x00,0x0a,0x00,0x10,0x07,0x0a,0xff,
+	0xd4,0xa3,0x00,0x0a,0x00,0xd1,0x0b,0x10,0x07,0x0b,0xff,0xd4,0xa5,0x00,0x0b,0x00,
+	0x10,0x07,0x0c,0xff,0xd4,0xa7,0x00,0x0c,0x00,0xd2,0x16,0xd1,0x0b,0x10,0x07,0x10,
+	0xff,0xd4,0xa9,0x00,0x10,0x00,0x10,0x07,0x10,0xff,0xd4,0xab,0x00,0x10,0x00,0xd1,
+	0x0b,0x10,0x07,0x10,0xff,0xd4,0xad,0x00,0x10,0x00,0x10,0x07,0x10,0xff,0xd4,0xaf,
+	0x00,0x10,0x00,0xd3,0x35,0xd2,0x19,0xd1,0x0b,0x10,0x04,0x00,0x00,0x01,0xff,0xd5,
+	0xa1,0x00,0x10,0x07,0x01,0xff,0xd5,0xa2,0x00,0x01,0xff,0xd5,0xa3,0x00,0xd1,0x0e,
+	0x10,0x07,0x01,0xff,0xd5,0xa4,0x00,0x01,0xff,0xd5,0xa5,0x00,0x10,0x07,0x01,0xff,
+	0xd5,0xa6,0x00,0x01,0xff,0xd5,0xa7,0x00,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,
+	0xd5,0xa8,0x00,0x01,0xff,0xd5,0xa9,0x00,0x10,0x07,0x01,0xff,0xd5,0xaa,0x00,0x01,
+	0xff,0xd5,0xab,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd5,0xac,0x00,0x01,0xff,0xd5,
+	0xad,0x00,0x10,0x07,0x01,0xff,0xd5,0xae,0x00,0x01,0xff,0xd5,0xaf,0x00,0xcf,0x86,
+	0xe5,0x08,0x3f,0xd4,0x70,0xd3,0x38,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd5,
+	0xb0,0x00,0x01,0xff,0xd5,0xb1,0x00,0x10,0x07,0x01,0xff,0xd5,0xb2,0x00,0x01,0xff,
+	0xd5,0xb3,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd5,0xb4,0x00,0x01,0xff,0xd5,0xb5,
+	0x00,0x10,0x07,0x01,0xff,0xd5,0xb6,0x00,0x01,0xff,0xd5,0xb7,0x00,0xd2,0x1c,0xd1,
+	0x0e,0x10,0x07,0x01,0xff,0xd5,0xb8,0x00,0x01,0xff,0xd5,0xb9,0x00,0x10,0x07,0x01,
+	0xff,0xd5,0xba,0x00,0x01,0xff,0xd5,0xbb,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd5,
+	0xbc,0x00,0x01,0xff,0xd5,0xbd,0x00,0x10,0x07,0x01,0xff,0xd5,0xbe,0x00,0x01,0xff,
+	0xd5,0xbf,0x00,0xe3,0x87,0x3e,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd6,0x80,
+	0x00,0x01,0xff,0xd6,0x81,0x00,0x10,0x07,0x01,0xff,0xd6,0x82,0x00,0x01,0xff,0xd6,
+	0x83,0x00,0xd1,0x0e,0x10,0x07,0x01,0xff,0xd6,0x84,0x00,0x01,0xff,0xd6,0x85,0x00,
+	0x10,0x07,0x01,0xff,0xd6,0x86,0x00,0x00,0x00,0xe0,0x2f,0x3f,0xcf,0x86,0xe5,0xc0,
+	0x3e,0xe4,0x97,0x3e,0xe3,0x76,0x3e,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x01,0xff,0xd5,0xa5,0xd6,0x82,0x00,0xe4,0x3e,0x25,0xe3,0xc3,0x1a,
+	0xe2,0x7b,0x81,0xe1,0xc0,0x13,0xd0,0x1e,0xcf,0x86,0xc5,0xe4,0x08,0x4b,0xe3,0x53,
+	0x46,0xe2,0xe9,0x43,0xe1,0x1c,0x43,0xe0,0xe1,0x42,0xcf,0x86,0xe5,0xa6,0x42,0x64,
+	0x89,0x42,0x0b,0x00,0xcf,0x86,0xe5,0xfa,0x01,0xe4,0x03,0x56,0xe3,0x76,0x01,0xe2,
+	0x8e,0x53,0xd1,0x0c,0xe0,0xef,0x52,0xcf,0x86,0x65,0x8d,0x52,0x04,0x00,0xe0,0x0d,
+	0x01,0xcf,0x86,0xd5,0x0a,0xe4,0x10,0x53,0x63,0xff,0x52,0x0a,0x00,0xd4,0x80,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x80,0x00,0x01,0xff,0xe2,
+	0xb4,0x81,0x00,0x10,0x08,0x01,0xff,0xe2,0xb4,0x82,0x00,0x01,0xff,0xe2,0xb4,0x83,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x84,0x00,0x01,0xff,0xe2,0xb4,0x85,
+	0x00,0x10,0x08,0x01,0xff,0xe2,0xb4,0x86,0x00,0x01,0xff,0xe2,0xb4,0x87,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x88,0x00,0x01,0xff,0xe2,0xb4,0x89,
+	0x00,0x10,0x08,0x01,0xff,0xe2,0xb4,0x8a,0x00,0x01,0xff,0xe2,0xb4,0x8b,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x8c,0x00,0x01,0xff,0xe2,0xb4,0x8d,0x00,0x10,
+	0x08,0x01,0xff,0xe2,0xb4,0x8e,0x00,0x01,0xff,0xe2,0xb4,0x8f,0x00,0xd3,0x40,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x90,0x00,0x01,0xff,0xe2,0xb4,0x91,
+	0x00,0x10,0x08,0x01,0xff,0xe2,0xb4,0x92,0x00,0x01,0xff,0xe2,0xb4,0x93,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x94,0x00,0x01,0xff,0xe2,0xb4,0x95,0x00,0x10,
+	0x08,0x01,0xff,0xe2,0xb4,0x96,0x00,0x01,0xff,0xe2,0xb4,0x97,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0x98,0x00,0x01,0xff,0xe2,0xb4,0x99,0x00,0x10,
+	0x08,0x01,0xff,0xe2,0xb4,0x9a,0x00,0x01,0xff,0xe2,0xb4,0x9b,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe2,0xb4,0x9c,0x00,0x01,0xff,0xe2,0xb4,0x9d,0x00,0x10,0x08,0x01,
+	0xff,0xe2,0xb4,0x9e,0x00,0x01,0xff,0xe2,0xb4,0x9f,0x00,0xcf,0x86,0xe5,0x42,0x52,
+	0x94,0x50,0xd3,0x3c,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0xa0,0x00,
+	0x01,0xff,0xe2,0xb4,0xa1,0x00,0x10,0x08,0x01,0xff,0xe2,0xb4,0xa2,0x00,0x01,0xff,
+	0xe2,0xb4,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0xb4,0xa4,0x00,0x01,0xff,
+	0xe2,0xb4,0xa5,0x00,0x10,0x04,0x00,0x00,0x0d,0xff,0xe2,0xb4,0xa7,0x00,0x52,0x04,
+	0x00,0x00,0x91,0x0c,0x10,0x04,0x00,0x00,0x0d,0xff,0xe2,0xb4,0xad,0x00,0x00,0x00,
+	0x01,0x00,0xd2,0x1b,0xe1,0xfc,0x52,0xe0,0xad,0x52,0xcf,0x86,0x95,0x0f,0x94,0x0b,
+	0x93,0x07,0x62,0x92,0x52,0x04,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0xd1,0x13,0xe0,
+	0xd3,0x53,0xcf,0x86,0x95,0x0a,0xe4,0xa8,0x53,0x63,0x97,0x53,0x04,0x00,0x04,0x00,
+	0xd0,0x0d,0xcf,0x86,0x95,0x07,0x64,0x22,0x54,0x08,0x00,0x04,0x00,0xcf,0x86,0x55,
+	0x04,0x04,0x00,0x54,0x04,0x04,0x00,0xd3,0x07,0x62,0x2f,0x54,0x04,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8f,0xb0,0x00,0x11,0xff,0xe1,0x8f,0xb1,0x00,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0xb2,0x00,0x11,0xff,0xe1,0x8f,0xb3,0x00,0x91,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0xb4,0x00,0x11,0xff,0xe1,0x8f,0xb5,0x00,0x00,0x00,
+	0xd4,0x1c,0xe3,0xe0,0x56,0xe2,0x17,0x56,0xe1,0xda,0x55,0xe0,0xbb,0x55,0xcf,0x86,
+	0x95,0x0a,0xe4,0xa4,0x55,0x63,0x88,0x55,0x04,0x00,0x04,0x00,0xe3,0xd2,0x01,0xe2,
+	0x2b,0x5a,0xd1,0x0c,0xe0,0x4c,0x59,0xcf,0x86,0x65,0x25,0x59,0x0a,0x00,0xe0,0x9c,
+	0x59,0xcf,0x86,0xd5,0xc5,0xd4,0x45,0xd3,0x31,0xd2,0x1c,0xd1,0x0e,0x10,0x07,0x12,
+	0xff,0xd0,0xb2,0x00,0x12,0xff,0xd0,0xb4,0x00,0x10,0x07,0x12,0xff,0xd0,0xbe,0x00,
+	0x12,0xff,0xd1,0x81,0x00,0x51,0x07,0x12,0xff,0xd1,0x82,0x00,0x10,0x07,0x12,0xff,
+	0xd1,0x8a,0x00,0x12,0xff,0xd1,0xa3,0x00,0x92,0x10,0x91,0x0c,0x10,0x08,0x12,0xff,
+	0xea,0x99,0x8b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x14,0xff,0xe1,0x83,0x90,0x00,0x14,0xff,0xe1,0x83,0x91,0x00,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0x92,0x00,0x14,0xff,0xe1,0x83,0x93,0x00,0xd1,0x10,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0x94,0x00,0x14,0xff,0xe1,0x83,0x95,0x00,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0x96,0x00,0x14,0xff,0xe1,0x83,0x97,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0x98,0x00,0x14,0xff,0xe1,0x83,0x99,0x00,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0x9a,0x00,0x14,0xff,0xe1,0x83,0x9b,0x00,0xd1,0x10,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0x9c,0x00,0x14,0xff,0xe1,0x83,0x9d,0x00,0x10,0x08,0x14,0xff,0xe1,0x83,
+	0x9e,0x00,0x14,0xff,0xe1,0x83,0x9f,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x14,0xff,0xe1,0x83,0xa0,0x00,0x14,0xff,0xe1,0x83,0xa1,0x00,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0xa2,0x00,0x14,0xff,0xe1,0x83,0xa3,0x00,0xd1,0x10,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0xa4,0x00,0x14,0xff,0xe1,0x83,0xa5,0x00,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xa6,0x00,0x14,0xff,0xe1,0x83,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0xa8,0x00,0x14,0xff,0xe1,0x83,0xa9,0x00,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xaa,0x00,0x14,0xff,0xe1,0x83,0xab,0x00,0xd1,0x10,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xac,0x00,0x14,0xff,0xe1,0x83,0xad,0x00,0x10,0x08,0x14,0xff,0xe1,0x83,
+	0xae,0x00,0x14,0xff,0xe1,0x83,0xaf,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x14,0xff,0xe1,0x83,0xb0,0x00,0x14,0xff,0xe1,0x83,0xb1,0x00,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xb2,0x00,0x14,0xff,0xe1,0x83,0xb3,0x00,0xd1,0x10,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xb4,0x00,0x14,0xff,0xe1,0x83,0xb5,0x00,0x10,0x08,0x14,0xff,0xe1,0x83,
+	0xb6,0x00,0x14,0xff,0xe1,0x83,0xb7,0x00,0xd2,0x1c,0xd1,0x10,0x10,0x08,0x14,0xff,
+	0xe1,0x83,0xb8,0x00,0x14,0xff,0xe1,0x83,0xb9,0x00,0x10,0x08,0x14,0xff,0xe1,0x83,
+	0xba,0x00,0x00,0x00,0xd1,0x0c,0x10,0x04,0x00,0x00,0x14,0xff,0xe1,0x83,0xbd,0x00,
+	0x10,0x08,0x14,0xff,0xe1,0x83,0xbe,0x00,0x14,0xff,0xe1,0x83,0xbf,0x00,0xe2,0x9d,
+	0x08,0xe1,0x48,0x04,0xe0,0x1c,0x02,0xcf,0x86,0xe5,0x11,0x01,0xd4,0x84,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x61,0xcc,0xa5,0x00,0x01,0xff,0x61,0xcc,
+	0xa5,0x00,0x10,0x08,0x01,0xff,0x62,0xcc,0x87,0x00,0x01,0xff,0x62,0xcc,0x87,0x00,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x62,0xcc,0xa3,0x00,0x01,0xff,0x62,0xcc,0xa3,0x00,
+	0x10,0x08,0x01,0xff,0x62,0xcc,0xb1,0x00,0x01,0xff,0x62,0xcc,0xb1,0x00,0xd2,0x24,
+	0xd1,0x14,0x10,0x0a,0x01,0xff,0x63,0xcc,0xa7,0xcc,0x81,0x00,0x01,0xff,0x63,0xcc,
+	0xa7,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x64,0xcc,0x87,0x00,0x01,0xff,0x64,0xcc,
+	0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x64,0xcc,0xa3,0x00,0x01,0xff,0x64,0xcc,
+	0xa3,0x00,0x10,0x08,0x01,0xff,0x64,0xcc,0xb1,0x00,0x01,0xff,0x64,0xcc,0xb1,0x00,
+	0xd3,0x48,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x64,0xcc,0xa7,0x00,0x01,0xff,
+	0x64,0xcc,0xa7,0x00,0x10,0x08,0x01,0xff,0x64,0xcc,0xad,0x00,0x01,0xff,0x64,0xcc,
+	0xad,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x65,0xcc,0x84,0xcc,0x80,0x00,0x01,0xff,
+	0x65,0xcc,0x84,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x65,0xcc,0x84,0xcc,0x81,0x00,
+	0x01,0xff,0x65,0xcc,0x84,0xcc,0x81,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x65,0xcc,0xad,0x00,0x01,0xff,0x65,0xcc,0xad,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,
+	0xb0,0x00,0x01,0xff,0x65,0xcc,0xb0,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x65,0xcc,
+	0xa7,0xcc,0x86,0x00,0x01,0xff,0x65,0xcc,0xa7,0xcc,0x86,0x00,0x10,0x08,0x01,0xff,
+	0x66,0xcc,0x87,0x00,0x01,0xff,0x66,0xcc,0x87,0x00,0xd4,0x84,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x67,0xcc,0x84,0x00,0x01,0xff,0x67,0xcc,0x84,0x00,
+	0x10,0x08,0x01,0xff,0x68,0xcc,0x87,0x00,0x01,0xff,0x68,0xcc,0x87,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0x68,0xcc,0xa3,0x00,0x01,0xff,0x68,0xcc,0xa3,0x00,0x10,0x08,
+	0x01,0xff,0x68,0xcc,0x88,0x00,0x01,0xff,0x68,0xcc,0x88,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0x68,0xcc,0xa7,0x00,0x01,0xff,0x68,0xcc,0xa7,0x00,0x10,0x08,
+	0x01,0xff,0x68,0xcc,0xae,0x00,0x01,0xff,0x68,0xcc,0xae,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0x69,0xcc,0xb0,0x00,0x01,0xff,0x69,0xcc,0xb0,0x00,0x10,0x0a,0x01,0xff,
+	0x69,0xcc,0x88,0xcc,0x81,0x00,0x01,0xff,0x69,0xcc,0x88,0xcc,0x81,0x00,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x6b,0xcc,0x81,0x00,0x01,0xff,0x6b,0xcc,
+	0x81,0x00,0x10,0x08,0x01,0xff,0x6b,0xcc,0xa3,0x00,0x01,0xff,0x6b,0xcc,0xa3,0x00,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x6b,0xcc,0xb1,0x00,0x01,0xff,0x6b,0xcc,0xb1,0x00,
+	0x10,0x08,0x01,0xff,0x6c,0xcc,0xa3,0x00,0x01,0xff,0x6c,0xcc,0xa3,0x00,0xd2,0x24,
+	0xd1,0x14,0x10,0x0a,0x01,0xff,0x6c,0xcc,0xa3,0xcc,0x84,0x00,0x01,0xff,0x6c,0xcc,
+	0xa3,0xcc,0x84,0x00,0x10,0x08,0x01,0xff,0x6c,0xcc,0xb1,0x00,0x01,0xff,0x6c,0xcc,
+	0xb1,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6c,0xcc,0xad,0x00,0x01,0xff,0x6c,0xcc,
+	0xad,0x00,0x10,0x08,0x01,0xff,0x6d,0xcc,0x81,0x00,0x01,0xff,0x6d,0xcc,0x81,0x00,
+	0xcf,0x86,0xe5,0x15,0x01,0xd4,0x88,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x6d,0xcc,0x87,0x00,0x01,0xff,0x6d,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x6d,
+	0xcc,0xa3,0x00,0x01,0xff,0x6d,0xcc,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6e,
+	0xcc,0x87,0x00,0x01,0xff,0x6e,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x6e,0xcc,0xa3,
+	0x00,0x01,0xff,0x6e,0xcc,0xa3,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x6e,
+	0xcc,0xb1,0x00,0x01,0xff,0x6e,0xcc,0xb1,0x00,0x10,0x08,0x01,0xff,0x6e,0xcc,0xad,
+	0x00,0x01,0xff,0x6e,0xcc,0xad,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x83,
+	0xcc,0x81,0x00,0x01,0xff,0x6f,0xcc,0x83,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x6f,
+	0xcc,0x83,0xcc,0x88,0x00,0x01,0xff,0x6f,0xcc,0x83,0xcc,0x88,0x00,0xd3,0x48,0xd2,
+	0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x84,0xcc,0x80,0x00,0x01,0xff,0x6f,
+	0xcc,0x84,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x84,0xcc,0x81,0x00,0x01,
+	0xff,0x6f,0xcc,0x84,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x70,0xcc,0x81,
+	0x00,0x01,0xff,0x70,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x70,0xcc,0x87,0x00,0x01,
+	0xff,0x70,0xcc,0x87,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x72,0xcc,0x87,
+	0x00,0x01,0xff,0x72,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x72,0xcc,0xa3,0x00,0x01,
+	0xff,0x72,0xcc,0xa3,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x72,0xcc,0xa3,0xcc,0x84,
+	0x00,0x01,0xff,0x72,0xcc,0xa3,0xcc,0x84,0x00,0x10,0x08,0x01,0xff,0x72,0xcc,0xb1,
+	0x00,0x01,0xff,0x72,0xcc,0xb1,0x00,0xd4,0x8c,0xd3,0x48,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x73,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x87,0x00,0x10,0x08,0x01,
+	0xff,0x73,0xcc,0xa3,0x00,0x01,0xff,0x73,0xcc,0xa3,0x00,0xd1,0x14,0x10,0x0a,0x01,
+	0xff,0x73,0xcc,0x81,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x81,0xcc,0x87,0x00,0x10,
+	0x0a,0x01,0xff,0x73,0xcc,0x8c,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x8c,0xcc,0x87,
+	0x00,0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,0x73,0xcc,0xa3,0xcc,0x87,0x00,0x01,
+	0xff,0x73,0xcc,0xa3,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x74,0xcc,0x87,0x00,0x01,
+	0xff,0x74,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x74,0xcc,0xa3,0x00,0x01,
+	0xff,0x74,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x74,0xcc,0xb1,0x00,0x01,0xff,0x74,
+	0xcc,0xb1,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x74,0xcc,0xad,
+	0x00,0x01,0xff,0x74,0xcc,0xad,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0xa4,0x00,0x01,
+	0xff,0x75,0xcc,0xa4,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,0xb0,0x00,0x01,
+	0xff,0x75,0xcc,0xb0,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0xad,0x00,0x01,0xff,0x75,
+	0xcc,0xad,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x75,0xcc,0x83,0xcc,0x81,
+	0x00,0x01,0xff,0x75,0xcc,0x83,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x84,
+	0xcc,0x88,0x00,0x01,0xff,0x75,0xcc,0x84,0xcc,0x88,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x76,0xcc,0x83,0x00,0x01,0xff,0x76,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x76,
+	0xcc,0xa3,0x00,0x01,0xff,0x76,0xcc,0xa3,0x00,0xe0,0x11,0x02,0xcf,0x86,0xd5,0xe2,
+	0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0x80,0x00,
+	0x01,0xff,0x77,0xcc,0x80,0x00,0x10,0x08,0x01,0xff,0x77,0xcc,0x81,0x00,0x01,0xff,
+	0x77,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0x88,0x00,0x01,0xff,
+	0x77,0xcc,0x88,0x00,0x10,0x08,0x01,0xff,0x77,0xcc,0x87,0x00,0x01,0xff,0x77,0xcc,
+	0x87,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0xa3,0x00,0x01,0xff,
+	0x77,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x78,0xcc,0x87,0x00,0x01,0xff,0x78,0xcc,
+	0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x78,0xcc,0x88,0x00,0x01,0xff,0x78,0xcc,
+	0x88,0x00,0x10,0x08,0x01,0xff,0x79,0xcc,0x87,0x00,0x01,0xff,0x79,0xcc,0x87,0x00,
+	0xd3,0x33,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x7a,0xcc,0x82,0x00,0x01,0xff,
+	0x7a,0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x7a,0xcc,0xa3,0x00,0x01,0xff,0x7a,0xcc,
+	0xa3,0x00,0xe1,0x12,0x59,0x10,0x08,0x01,0xff,0x7a,0xcc,0xb1,0x00,0x01,0xff,0x7a,
+	0xcc,0xb1,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0x8a,0x00,0x01,
+	0xff,0x79,0xcc,0x8a,0x00,0x10,0x08,0x01,0xff,0x61,0xca,0xbe,0x00,0x02,0xff,0x73,
+	0xcc,0x87,0x00,0x51,0x04,0x0a,0x00,0x10,0x07,0x0a,0xff,0x73,0x73,0x00,0x0a,0x00,
+	0xd4,0x98,0xd3,0x48,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x61,0xcc,0xa3,0x00,
+	0x01,0xff,0x61,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x61,0xcc,0x89,0x00,0x01,0xff,
+	0x61,0xcc,0x89,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x61,0xcc,0x82,0xcc,0x81,0x00,
+	0x01,0xff,0x61,0xcc,0x82,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x61,0xcc,0x82,0xcc,
+	0x80,0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,0x80,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,
+	0x01,0xff,0x61,0xcc,0x82,0xcc,0x89,0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,0x89,0x00,
+	0x10,0x0a,0x01,0xff,0x61,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,
+	0x83,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x61,0xcc,0xa3,0xcc,0x82,0x00,0x01,0xff,
+	0x61,0xcc,0xa3,0xcc,0x82,0x00,0x10,0x0a,0x01,0xff,0x61,0xcc,0x86,0xcc,0x81,0x00,
+	0x01,0xff,0x61,0xcc,0x86,0xcc,0x81,0x00,0xd3,0x50,0xd2,0x28,0xd1,0x14,0x10,0x0a,
+	0x01,0xff,0x61,0xcc,0x86,0xcc,0x80,0x00,0x01,0xff,0x61,0xcc,0x86,0xcc,0x80,0x00,
+	0x10,0x0a,0x01,0xff,0x61,0xcc,0x86,0xcc,0x89,0x00,0x01,0xff,0x61,0xcc,0x86,0xcc,
+	0x89,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x61,0xcc,0x86,0xcc,0x83,0x00,0x01,0xff,
+	0x61,0xcc,0x86,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x61,0xcc,0xa3,0xcc,0x86,0x00,
+	0x01,0xff,0x61,0xcc,0xa3,0xcc,0x86,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x65,0xcc,0xa3,0x00,0x01,0xff,0x65,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,
+	0x89,0x00,0x01,0xff,0x65,0xcc,0x89,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x65,0xcc,
+	0x83,0x00,0x01,0xff,0x65,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x65,0xcc,0x82,0xcc,
+	0x81,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x81,0x00,0xcf,0x86,0xe5,0x31,0x01,0xd4,
+	0x90,0xd3,0x50,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x65,0xcc,0x82,0xcc,0x80,
+	0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x65,0xcc,0x82,
+	0xcc,0x89,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x89,0x00,0xd1,0x14,0x10,0x0a,0x01,
+	0xff,0x65,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x83,0x00,0x10,
+	0x0a,0x01,0xff,0x65,0xcc,0xa3,0xcc,0x82,0x00,0x01,0xff,0x65,0xcc,0xa3,0xcc,0x82,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,0x89,0x00,0x01,0xff,0x69,
+	0xcc,0x89,0x00,0x10,0x08,0x01,0xff,0x69,0xcc,0xa3,0x00,0x01,0xff,0x69,0xcc,0xa3,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0xa3,0x00,0x01,0xff,0x6f,0xcc,0xa3,
+	0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x89,0x00,0xd3,
+	0x50,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x81,0x00,0x01,
+	0xff,0x6f,0xcc,0x82,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x80,
+	0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x80,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,
+	0xcc,0x82,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x89,0x00,0x10,0x0a,0x01,
+	0xff,0x6f,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x83,0x00,0xd2,
+	0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0xa3,0xcc,0x82,0x00,0x01,0xff,0x6f,
+	0xcc,0xa3,0xcc,0x82,0x00,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x81,0x00,0x01,
+	0xff,0x6f,0xcc,0x9b,0xcc,0x81,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x9b,
+	0xcc,0x80,0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x6f,
+	0xcc,0x9b,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x89,0x00,0xd4,0x98,0xd3,
+	0x48,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x83,0x00,0x01,
+	0xff,0x6f,0xcc,0x9b,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0xa3,
+	0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,
+	0xcc,0xa3,0x00,0x01,0xff,0x75,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0x89,
+	0x00,0x01,0xff,0x75,0xcc,0x89,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x75,
+	0xcc,0x9b,0xcc,0x81,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x81,0x00,0x10,0x0a,0x01,
+	0xff,0x75,0xcc,0x9b,0xcc,0x80,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x80,0x00,0xd1,
+	0x14,0x10,0x0a,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x89,0x00,0x01,0xff,0x75,0xcc,0x9b,
+	0xcc,0x89,0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x83,0x00,0x01,0xff,0x75,
+	0xcc,0x9b,0xcc,0x83,0x00,0xd3,0x44,0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,0x75,
+	0xcc,0x9b,0xcc,0xa3,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0xa3,0x00,0x10,0x08,0x01,
+	0xff,0x79,0xcc,0x80,0x00,0x01,0xff,0x79,0xcc,0x80,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x79,0xcc,0xa3,0x00,0x01,0xff,0x79,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x79,
+	0xcc,0x89,0x00,0x01,0xff,0x79,0xcc,0x89,0x00,0xd2,0x1c,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x79,0xcc,0x83,0x00,0x01,0xff,0x79,0xcc,0x83,0x00,0x10,0x08,0x0a,0xff,0xe1,
+	0xbb,0xbb,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xe1,0xbb,0xbd,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xe1,0xbb,0xbf,0x00,0x0a,0x00,0xe1,0xbf,0x02,0xe0,0xa1,
+	0x01,0xcf,0x86,0xd5,0xc6,0xd4,0x6c,0xd3,0x18,0xe2,0x0e,0x59,0xe1,0xf7,0x58,0x10,
+	0x09,0x01,0xff,0xce,0xb1,0xcc,0x93,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0x00,0xd2,
+	0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x93,0x00,0x01,0xff,0xce,0xb1,
+	0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,
+	0xce,0xb1,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,
+	0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,
+	0xff,0xce,0xb1,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcd,0x82,
+	0x00,0xd3,0x18,0xe2,0x4a,0x59,0xe1,0x33,0x59,0x10,0x09,0x01,0xff,0xce,0xb5,0xcc,
+	0x93,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,
+	0xff,0xce,0xb5,0xcc,0x93,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,0x00,0x10,0x0b,0x01,
+	0xff,0xce,0xb5,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,0xcc,0x80,
+	0x00,0x91,0x16,0x10,0x0b,0x01,0xff,0xce,0xb5,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,
+	0xce,0xb5,0xcc,0x94,0xcc,0x81,0x00,0x00,0x00,0xd4,0x6c,0xd3,0x18,0xe2,0x74,0x59,
+	0xe1,0x5d,0x59,0x10,0x09,0x01,0xff,0xce,0xb7,0xcc,0x93,0x00,0x01,0xff,0xce,0xb7,
+	0xcc,0x94,0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb7,0xcc,0x93,0x00,
+	0x01,0xff,0xce,0xb7,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,
+	0x80,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,
+	0xff,0xce,0xb7,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x81,
+	0x00,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xb7,
+	0xcc,0x94,0xcd,0x82,0x00,0xd3,0x18,0xe2,0xb0,0x59,0xe1,0x99,0x59,0x10,0x09,0x01,
+	0xff,0xce,0xb9,0xcc,0x93,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,0x00,0xd2,0x28,0xd1,
+	0x12,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x93,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,
+	0x00,0x10,0x0b,0x01,0xff,0xce,0xb9,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xb9,
+	0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb9,0xcc,0x93,0xcc,
+	0x81,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,0xce,
+	0xb9,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,0xcd,0x82,0x00,0xcf,
+	0x86,0xd5,0xac,0xd4,0x5a,0xd3,0x18,0xe2,0xed,0x59,0xe1,0xd6,0x59,0x10,0x09,0x01,
+	0xff,0xce,0xbf,0xcc,0x93,0x00,0x01,0xff,0xce,0xbf,0xcc,0x94,0x00,0xd2,0x28,0xd1,
+	0x12,0x10,0x09,0x01,0xff,0xce,0xbf,0xcc,0x93,0x00,0x01,0xff,0xce,0xbf,0xcc,0x94,
+	0x00,0x10,0x0b,0x01,0xff,0xce,0xbf,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xbf,
+	0xcc,0x94,0xcc,0x80,0x00,0x91,0x16,0x10,0x0b,0x01,0xff,0xce,0xbf,0xcc,0x93,0xcc,
+	0x81,0x00,0x01,0xff,0xce,0xbf,0xcc,0x94,0xcc,0x81,0x00,0x00,0x00,0xd3,0x18,0xe2,
+	0x17,0x5a,0xe1,0x00,0x5a,0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x93,0x00,0x01,0xff,
+	0xcf,0x85,0xcc,0x94,0x00,0xd2,0x1c,0xd1,0x0d,0x10,0x04,0x00,0x00,0x01,0xff,0xcf,
+	0x85,0xcc,0x94,0x00,0x10,0x04,0x00,0x00,0x01,0xff,0xcf,0x85,0xcc,0x94,0xcc,0x80,
+	0x00,0xd1,0x0f,0x10,0x04,0x00,0x00,0x01,0xff,0xcf,0x85,0xcc,0x94,0xcc,0x81,0x00,
+	0x10,0x04,0x00,0x00,0x01,0xff,0xcf,0x85,0xcc,0x94,0xcd,0x82,0x00,0xe4,0xd3,0x5a,
+	0xd3,0x18,0xe2,0x52,0x5a,0xe1,0x3b,0x5a,0x10,0x09,0x01,0xff,0xcf,0x89,0xcc,0x93,
+	0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,
+	0xcf,0x89,0xcc,0x93,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,
+	0xcf,0x89,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,0x80,0x00,
+	0xd1,0x16,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xcf,
+	0x89,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcd,0x82,
+	0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcd,0x82,0x00,0xe0,0xd9,0x02,0xcf,0x86,0xe5,
+	0x91,0x01,0xd4,0xc8,0xd3,0x64,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb1,
+	0xcc,0x93,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xce,0xb9,0x00,0x10,0x0d,
+	0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xcc,
+	0x94,0xcc,0x80,0xce,0xb9,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,
+	0xcc,0x81,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcc,0x81,0xce,0xb9,0x00,
+	0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcd,0x82,0xce,0xb9,0x00,0x01,0xff,0xce,
+	0xb1,0xcc,0x94,0xcd,0x82,0xce,0xb9,0x00,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,
+	0xce,0xb1,0xcc,0x93,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xce,0xb9,0x00,
+	0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xce,
+	0xb1,0xcc,0x94,0xcc,0x80,0xce,0xb9,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xce,0xb1,
+	0xcc,0x93,0xcc,0x81,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcc,0x81,0xce,
+	0xb9,0x00,0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcd,0x82,0xce,0xb9,0x00,0x01,
+	0xff,0xce,0xb1,0xcc,0x94,0xcd,0x82,0xce,0xb9,0x00,0xd3,0x64,0xd2,0x30,0xd1,0x16,
+	0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xcc,
+	0x94,0xce,0xb9,0x00,0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x80,0xce,0xb9,
+	0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x80,0xce,0xb9,0x00,0xd1,0x1a,0x10,0x0d,
+	0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x81,0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xcc,
+	0x94,0xcc,0x81,0xce,0xb9,0x00,0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcd,0x82,
+	0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcd,0x82,0xce,0xb9,0x00,0xd2,0x30,
+	0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xce,0xb9,0x00,0x01,0xff,0xce,
+	0xb7,0xcc,0x94,0xce,0xb9,0x00,0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x80,
+	0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x80,0xce,0xb9,0x00,0xd1,0x1a,
+	0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x81,0xce,0xb9,0x00,0x01,0xff,0xce,
+	0xb7,0xcc,0x94,0xcc,0x81,0xce,0xb9,0x00,0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,
+	0xcd,0x82,0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcd,0x82,0xce,0xb9,0x00,
+	0xd4,0xc8,0xd3,0x64,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x93,
+	0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xce,0xb9,0x00,0x10,0x0d,0x01,0xff,
+	0xcf,0x89,0xcc,0x93,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,
+	0x80,0xce,0xb9,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x81,
+	0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,0x81,0xce,0xb9,0x00,0x10,0x0d,
+	0x01,0xff,0xcf,0x89,0xcc,0x93,0xcd,0x82,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,
+	0x94,0xcd,0x82,0xce,0xb9,0x00,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,0xcf,0x89,
+	0xcc,0x93,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xce,0xb9,0x00,0x10,0x0d,
+	0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,
+	0x94,0xcc,0x80,0xce,0xb9,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xcf,0x89,0xcc,0x93,
+	0xcc,0x81,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,0x81,0xce,0xb9,0x00,
+	0x10,0x0d,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcd,0x82,0xce,0xb9,0x00,0x01,0xff,0xcf,
+	0x89,0xcc,0x94,0xcd,0x82,0xce,0xb9,0x00,0xd3,0x49,0xd2,0x26,0xd1,0x12,0x10,0x09,
+	0x01,0xff,0xce,0xb1,0xcc,0x86,0x00,0x01,0xff,0xce,0xb1,0xcc,0x84,0x00,0x10,0x0b,
+	0x01,0xff,0xce,0xb1,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xce,0xb1,0xce,0xb9,0x00,
+	0xd1,0x0f,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x81,0xce,0xb9,0x00,0x00,0x00,0x10,
+	0x09,0x01,0xff,0xce,0xb1,0xcd,0x82,0x00,0x01,0xff,0xce,0xb1,0xcd,0x82,0xce,0xb9,
+	0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x86,0x00,0x01,0xff,
+	0xce,0xb1,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x80,0x00,0x01,0xff,
+	0xce,0xb1,0xcc,0x81,0x00,0xe1,0xf3,0x5a,0x10,0x09,0x01,0xff,0xce,0xb1,0xce,0xb9,
+	0x00,0x01,0x00,0xcf,0x86,0xd5,0xbd,0xd4,0x7e,0xd3,0x44,0xd2,0x21,0xd1,0x0d,0x10,
+	0x04,0x01,0x00,0x01,0xff,0xc2,0xa8,0xcd,0x82,0x00,0x10,0x0b,0x01,0xff,0xce,0xb7,
+	0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xce,0xb7,0xce,0xb9,0x00,0xd1,0x0f,0x10,0x0b,
+	0x01,0xff,0xce,0xb7,0xcc,0x81,0xce,0xb9,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xce,
+	0xb7,0xcd,0x82,0x00,0x01,0xff,0xce,0xb7,0xcd,0x82,0xce,0xb9,0x00,0xd2,0x24,0xd1,
+	0x12,0x10,0x09,0x01,0xff,0xce,0xb5,0xcc,0x80,0x00,0x01,0xff,0xce,0xb5,0xcc,0x81,
+	0x00,0x10,0x09,0x01,0xff,0xce,0xb7,0xcc,0x80,0x00,0x01,0xff,0xce,0xb7,0xcc,0x81,
+	0x00,0xe1,0x02,0x5b,0x10,0x09,0x01,0xff,0xce,0xb7,0xce,0xb9,0x00,0x01,0xff,0xe1,
+	0xbe,0xbf,0xcc,0x80,0x00,0xd3,0x18,0xe2,0x28,0x5b,0xe1,0x11,0x5b,0x10,0x09,0x01,
+	0xff,0xce,0xb9,0xcc,0x86,0x00,0x01,0xff,0xce,0xb9,0xcc,0x84,0x00,0xe2,0x4c,0x5b,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x86,0x00,0x01,0xff,0xce,0xb9,0xcc,
+	0x84,0x00,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x80,0x00,0x01,0xff,0xce,0xb9,0xcc,
+	0x81,0x00,0xd4,0x51,0xd3,0x18,0xe2,0x6f,0x5b,0xe1,0x58,0x5b,0x10,0x09,0x01,0xff,
+	0xcf,0x85,0xcc,0x86,0x00,0x01,0xff,0xcf,0x85,0xcc,0x84,0x00,0xd2,0x24,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x86,0x00,0x01,0xff,0xcf,0x85,0xcc,0x84,0x00,
+	0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x80,0x00,0x01,0xff,0xcf,0x85,0xcc,0x81,0x00,
+	0xe1,0x8f,0x5b,0x10,0x09,0x01,0xff,0xcf,0x81,0xcc,0x94,0x00,0x01,0xff,0xc2,0xa8,
+	0xcc,0x80,0x00,0xd3,0x3b,0xd2,0x18,0x51,0x04,0x00,0x00,0x10,0x0b,0x01,0xff,0xcf,
+	0x89,0xcc,0x80,0xce,0xb9,0x00,0x01,0xff,0xcf,0x89,0xce,0xb9,0x00,0xd1,0x0f,0x10,
+	0x0b,0x01,0xff,0xcf,0x89,0xcc,0x81,0xce,0xb9,0x00,0x00,0x00,0x10,0x09,0x01,0xff,
+	0xcf,0x89,0xcd,0x82,0x00,0x01,0xff,0xcf,0x89,0xcd,0x82,0xce,0xb9,0x00,0xd2,0x24,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xbf,0xcc,0x80,0x00,0x01,0xff,0xce,0xbf,0xcc,
+	0x81,0x00,0x10,0x09,0x01,0xff,0xcf,0x89,0xcc,0x80,0x00,0x01,0xff,0xcf,0x89,0xcc,
+	0x81,0x00,0xe1,0x99,0x5b,0x10,0x09,0x01,0xff,0xcf,0x89,0xce,0xb9,0x00,0x01,0xff,
+	0xc2,0xb4,0x00,0xe0,0x0c,0x68,0xcf,0x86,0xe5,0x23,0x02,0xe4,0x25,0x01,0xe3,0x85,
+	0x5e,0xd2,0x2a,0xe1,0x5f,0x5c,0xe0,0xdd,0x5b,0xcf,0x86,0xe5,0xbb,0x5b,0x94,0x1b,
+	0xe3,0xa4,0x5b,0x92,0x14,0x91,0x10,0x10,0x08,0x01,0xff,0xe2,0x80,0x82,0x00,0x01,
+	0xff,0xe2,0x80,0x83,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd1,0xd6,0xd0,0x46,0xcf,
+	0x86,0x55,0x04,0x01,0x00,0xd4,0x29,0xd3,0x13,0x52,0x04,0x01,0x00,0x51,0x04,0x01,
+	0x00,0x10,0x07,0x01,0xff,0xcf,0x89,0x00,0x01,0x00,0x92,0x12,0x51,0x04,0x01,0x00,
+	0x10,0x06,0x01,0xff,0x6b,0x00,0x01,0xff,0x61,0xcc,0x8a,0x00,0x01,0x00,0xe3,0x25,
+	0x5d,0x92,0x10,0x51,0x04,0x01,0x00,0x10,0x08,0x01,0xff,0xe2,0x85,0x8e,0x00,0x01,
+	0x00,0x01,0x00,0xcf,0x86,0xd5,0x0a,0xe4,0x42,0x5d,0x63,0x2d,0x5d,0x06,0x00,0x94,
+	0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x85,0xb0,0x00,0x01,
+	0xff,0xe2,0x85,0xb1,0x00,0x10,0x08,0x01,0xff,0xe2,0x85,0xb2,0x00,0x01,0xff,0xe2,
+	0x85,0xb3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x85,0xb4,0x00,0x01,0xff,0xe2,
+	0x85,0xb5,0x00,0x10,0x08,0x01,0xff,0xe2,0x85,0xb6,0x00,0x01,0xff,0xe2,0x85,0xb7,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x85,0xb8,0x00,0x01,0xff,0xe2,
+	0x85,0xb9,0x00,0x10,0x08,0x01,0xff,0xe2,0x85,0xba,0x00,0x01,0xff,0xe2,0x85,0xbb,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x85,0xbc,0x00,0x01,0xff,0xe2,0x85,0xbd,
+	0x00,0x10,0x08,0x01,0xff,0xe2,0x85,0xbe,0x00,0x01,0xff,0xe2,0x85,0xbf,0x00,0x01,
+	0x00,0xe0,0x34,0x5d,0xcf,0x86,0xe5,0x13,0x5d,0xe4,0xf2,0x5c,0xe3,0xe1,0x5c,0xe2,
+	0xd4,0x5c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x04,0xff,0xe2,0x86,0x84,0x00,
+	0xe3,0x23,0x61,0xe2,0xf0,0x60,0xd1,0x0c,0xe0,0x9d,0x60,0xcf,0x86,0x65,0x7e,0x60,
+	0x01,0x00,0xd0,0x62,0xcf,0x86,0x55,0x04,0x01,0x00,0x54,0x04,0x01,0x00,0xd3,0x18,
+	0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x08,0x01,0xff,0xe2,0x93,0x90,0x00,
+	0x01,0xff,0xe2,0x93,0x91,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x93,
+	0x92,0x00,0x01,0xff,0xe2,0x93,0x93,0x00,0x10,0x08,0x01,0xff,0xe2,0x93,0x94,0x00,
+	0x01,0xff,0xe2,0x93,0x95,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe2,0x93,0x96,0x00,
+	0x01,0xff,0xe2,0x93,0x97,0x00,0x10,0x08,0x01,0xff,0xe2,0x93,0x98,0x00,0x01,0xff,
+	0xe2,0x93,0x99,0x00,0xcf,0x86,0xe5,0x57,0x60,0x94,0x80,0xd3,0x40,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe2,0x93,0x9a,0x00,0x01,0xff,0xe2,0x93,0x9b,0x00,0x10,
+	0x08,0x01,0xff,0xe2,0x93,0x9c,0x00,0x01,0xff,0xe2,0x93,0x9d,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe2,0x93,0x9e,0x00,0x01,0xff,0xe2,0x93,0x9f,0x00,0x10,0x08,0x01,
+	0xff,0xe2,0x93,0xa0,0x00,0x01,0xff,0xe2,0x93,0xa1,0x00,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe2,0x93,0xa2,0x00,0x01,0xff,0xe2,0x93,0xa3,0x00,0x10,0x08,0x01,
+	0xff,0xe2,0x93,0xa4,0x00,0x01,0xff,0xe2,0x93,0xa5,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe2,0x93,0xa6,0x00,0x01,0xff,0xe2,0x93,0xa7,0x00,0x10,0x08,0x01,0xff,0xe2,
+	0x93,0xa8,0x00,0x01,0xff,0xe2,0x93,0xa9,0x00,0x01,0x00,0xd4,0x0c,0xe3,0x33,0x62,
+	0xe2,0x2c,0x62,0xcf,0x06,0x04,0x00,0xe3,0x0c,0x65,0xe2,0xff,0x63,0xe1,0x2e,0x02,
+	0xe0,0x84,0x01,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x08,0xff,0xe2,0xb0,0xb0,0x00,0x08,0xff,0xe2,0xb0,0xb1,0x00,0x10,0x08,
+	0x08,0xff,0xe2,0xb0,0xb2,0x00,0x08,0xff,0xe2,0xb0,0xb3,0x00,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe2,0xb0,0xb4,0x00,0x08,0xff,0xe2,0xb0,0xb5,0x00,0x10,0x08,0x08,0xff,
+	0xe2,0xb0,0xb6,0x00,0x08,0xff,0xe2,0xb0,0xb7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe2,0xb0,0xb8,0x00,0x08,0xff,0xe2,0xb0,0xb9,0x00,0x10,0x08,0x08,0xff,
+	0xe2,0xb0,0xba,0x00,0x08,0xff,0xe2,0xb0,0xbb,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe2,0xb0,0xbc,0x00,0x08,0xff,0xe2,0xb0,0xbd,0x00,0x10,0x08,0x08,0xff,0xe2,0xb0,
+	0xbe,0x00,0x08,0xff,0xe2,0xb0,0xbf,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe2,0xb1,0x80,0x00,0x08,0xff,0xe2,0xb1,0x81,0x00,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x82,0x00,0x08,0xff,0xe2,0xb1,0x83,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x84,0x00,0x08,0xff,0xe2,0xb1,0x85,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x86,0x00,0x08,0xff,0xe2,0xb1,0x87,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x88,0x00,0x08,0xff,0xe2,0xb1,0x89,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x8a,0x00,0x08,0xff,0xe2,0xb1,0x8b,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x8c,0x00,0x08,0xff,0xe2,0xb1,0x8d,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,0x8e,0x00,
+	0x08,0xff,0xe2,0xb1,0x8f,0x00,0x94,0x7c,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe2,0xb1,0x90,0x00,0x08,0xff,0xe2,0xb1,0x91,0x00,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x92,0x00,0x08,0xff,0xe2,0xb1,0x93,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x94,0x00,0x08,0xff,0xe2,0xb1,0x95,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x96,0x00,0x08,0xff,0xe2,0xb1,0x97,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe2,0xb1,0x98,0x00,0x08,0xff,0xe2,0xb1,0x99,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x9a,0x00,0x08,0xff,0xe2,0xb1,0x9b,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe2,0xb1,
+	0x9c,0x00,0x08,0xff,0xe2,0xb1,0x9d,0x00,0x10,0x08,0x08,0xff,0xe2,0xb1,0x9e,0x00,
+	0x00,0x00,0x08,0x00,0xcf,0x86,0xd5,0x07,0x64,0xef,0x61,0x08,0x00,0xd4,0x63,0xd3,
+	0x32,0xd2,0x1b,0xd1,0x0c,0x10,0x08,0x09,0xff,0xe2,0xb1,0xa1,0x00,0x09,0x00,0x10,
+	0x07,0x09,0xff,0xc9,0xab,0x00,0x09,0xff,0xe1,0xb5,0xbd,0x00,0xd1,0x0b,0x10,0x07,
+	0x09,0xff,0xc9,0xbd,0x00,0x09,0x00,0x10,0x04,0x09,0x00,0x09,0xff,0xe2,0xb1,0xa8,
+	0x00,0xd2,0x18,0xd1,0x0c,0x10,0x04,0x09,0x00,0x09,0xff,0xe2,0xb1,0xaa,0x00,0x10,
+	0x04,0x09,0x00,0x09,0xff,0xe2,0xb1,0xac,0x00,0xd1,0x0b,0x10,0x04,0x09,0x00,0x0a,
+	0xff,0xc9,0x91,0x00,0x10,0x07,0x0a,0xff,0xc9,0xb1,0x00,0x0a,0xff,0xc9,0x90,0x00,
+	0xd3,0x27,0xd2,0x17,0xd1,0x0b,0x10,0x07,0x0b,0xff,0xc9,0x92,0x00,0x0a,0x00,0x10,
+	0x08,0x0a,0xff,0xe2,0xb1,0xb3,0x00,0x0a,0x00,0x91,0x0c,0x10,0x04,0x09,0x00,0x09,
+	0xff,0xe2,0xb1,0xb6,0x00,0x09,0x00,0x52,0x04,0x0a,0x00,0x51,0x04,0x0a,0x00,0x10,
+	0x07,0x0b,0xff,0xc8,0xbf,0x00,0x0b,0xff,0xc9,0x80,0x00,0xe0,0x83,0x01,0xcf,0x86,
+	0xd5,0xc0,0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,
+	0x81,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x83,0x00,0x08,0x00,0xd1,0x0c,
+	0x10,0x08,0x08,0xff,0xe2,0xb2,0x85,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,
+	0x87,0x00,0x08,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0x89,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x8b,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,
+	0x08,0xff,0xe2,0xb2,0x8d,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x8f,0x00,
+	0x08,0x00,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0x91,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x93,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,
+	0x08,0xff,0xe2,0xb2,0x95,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x97,0x00,
+	0x08,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0x99,0x00,0x08,0x00,
+	0x10,0x08,0x08,0xff,0xe2,0xb2,0x9b,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,0x08,0xff,
+	0xe2,0xb2,0x9d,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0x9f,0x00,0x08,0x00,
+	0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0xa1,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0xa3,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,
+	0x08,0xff,0xe2,0xb2,0xa5,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0xa7,0x00,
+	0x08,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0xa9,0x00,0x08,0x00,
+	0x10,0x08,0x08,0xff,0xe2,0xb2,0xab,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,0x08,0xff,
+	0xe2,0xb2,0xad,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0xaf,0x00,0x08,0x00,
+	0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0xb1,0x00,0x08,0x00,
+	0x10,0x08,0x08,0xff,0xe2,0xb2,0xb3,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,0x08,0xff,
+	0xe2,0xb2,0xb5,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0xb7,0x00,0x08,0x00,
+	0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,0xb9,0x00,0x08,0x00,0x10,0x08,
+	0x08,0xff,0xe2,0xb2,0xbb,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb2,
+	0xbd,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb2,0xbf,0x00,0x08,0x00,0xcf,0x86,
+	0xd5,0xc0,0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb3,
+	0x81,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x83,0x00,0x08,0x00,0xd1,0x0c,
+	0x10,0x08,0x08,0xff,0xe2,0xb3,0x85,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,
+	0x87,0x00,0x08,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb3,0x89,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x8b,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,
+	0x08,0xff,0xe2,0xb3,0x8d,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x8f,0x00,
+	0x08,0x00,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb3,0x91,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x93,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,
+	0x08,0xff,0xe2,0xb3,0x95,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x97,0x00,
+	0x08,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb3,0x99,0x00,0x08,0x00,
+	0x10,0x08,0x08,0xff,0xe2,0xb3,0x9b,0x00,0x08,0x00,0xd1,0x0c,0x10,0x08,0x08,0xff,
+	0xe2,0xb3,0x9d,0x00,0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0x9f,0x00,0x08,0x00,
+	0xd4,0x3b,0xd3,0x1c,0x92,0x18,0xd1,0x0c,0x10,0x08,0x08,0xff,0xe2,0xb3,0xa1,0x00,
+	0x08,0x00,0x10,0x08,0x08,0xff,0xe2,0xb3,0xa3,0x00,0x08,0x00,0x08,0x00,0xd2,0x10,
+	0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x0b,0xff,0xe2,0xb3,0xac,0x00,0xe1,0x3b,
+	0x5f,0x10,0x04,0x0b,0x00,0x0b,0xff,0xe2,0xb3,0xae,0x00,0xe3,0x40,0x5f,0x92,0x10,
+	0x51,0x04,0x0b,0xe6,0x10,0x08,0x0d,0xff,0xe2,0xb3,0xb3,0x00,0x0d,0x00,0x00,0x00,
+	0xe2,0x98,0x08,0xd1,0x0b,0xe0,0x11,0x67,0xcf,0x86,0xcf,0x06,0x01,0x00,0xe0,0x65,
+	0x6c,0xcf,0x86,0xe5,0xa7,0x05,0xd4,0x06,0xcf,0x06,0x04,0x00,0xd3,0x0c,0xe2,0xf8,
+	0x67,0xe1,0x8f,0x67,0xcf,0x06,0x04,0x00,0xe2,0xdb,0x01,0xe1,0x26,0x01,0xd0,0x09,
+	0xcf,0x86,0x65,0xf4,0x67,0x0a,0x00,0xcf,0x86,0xd5,0xc0,0xd4,0x60,0xd3,0x30,0xd2,
+	0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x81,0x00,0x0a,0x00,0x10,0x08,0x0a,
+	0xff,0xea,0x99,0x83,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x85,
+	0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x99,0x87,0x00,0x0a,0x00,0xd2,0x18,0xd1,
+	0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x89,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,
+	0x99,0x8b,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x8d,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x99,0x8f,0x00,0x0a,0x00,0xd3,0x30,0xd2,0x18,0xd1,
+	0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x91,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,
+	0x99,0x93,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x95,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x99,0x97,0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,
+	0x08,0x0a,0xff,0xea,0x99,0x99,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x99,0x9b,
+	0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0x9d,0x00,0x0a,0x00,0x10,
+	0x08,0x0a,0xff,0xea,0x99,0x9f,0x00,0x0a,0x00,0xe4,0x5d,0x67,0xd3,0x30,0xd2,0x18,
+	0xd1,0x0c,0x10,0x08,0x0c,0xff,0xea,0x99,0xa1,0x00,0x0c,0x00,0x10,0x08,0x0a,0xff,
+	0xea,0x99,0xa3,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x99,0xa5,0x00,
+	0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x99,0xa7,0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,
+	0x10,0x08,0x0a,0xff,0xea,0x99,0xa9,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x99,
+	0xab,0x00,0x0a,0x00,0xe1,0x0c,0x67,0x10,0x08,0x0a,0xff,0xea,0x99,0xad,0x00,0x0a,
+	0x00,0xe0,0x35,0x67,0xcf,0x86,0x95,0xab,0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,
+	0x10,0x08,0x0a,0xff,0xea,0x9a,0x81,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9a,
+	0x83,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9a,0x85,0x00,0x0a,0x00,
+	0x10,0x08,0x0a,0xff,0xea,0x9a,0x87,0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,
+	0x0a,0xff,0xea,0x9a,0x89,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9a,0x8b,0x00,
+	0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9a,0x8d,0x00,0x0a,0x00,0x10,0x08,
+	0x0a,0xff,0xea,0x9a,0x8f,0x00,0x0a,0x00,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,
+	0x0a,0xff,0xea,0x9a,0x91,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9a,0x93,0x00,
+	0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9a,0x95,0x00,0x0a,0x00,0x10,0x08,
+	0x0a,0xff,0xea,0x9a,0x97,0x00,0x0a,0x00,0xe2,0x92,0x66,0xd1,0x0c,0x10,0x08,0x10,
+	0xff,0xea,0x9a,0x99,0x00,0x10,0x00,0x10,0x08,0x10,0xff,0xea,0x9a,0x9b,0x00,0x10,
+	0x00,0x0b,0x00,0xe1,0x10,0x02,0xd0,0xb9,0xcf,0x86,0xd5,0x07,0x64,0x9e,0x66,0x08,
+	0x00,0xd4,0x58,0xd3,0x28,0xd2,0x10,0x51,0x04,0x09,0x00,0x10,0x08,0x0a,0xff,0xea,
+	0x9c,0xa3,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9c,0xa5,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x9c,0xa7,0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,
+	0x08,0x0a,0xff,0xea,0x9c,0xa9,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9c,0xab,
+	0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9c,0xad,0x00,0x0a,0x00,0x10,
+	0x08,0x0a,0xff,0xea,0x9c,0xaf,0x00,0x0a,0x00,0xd3,0x28,0xd2,0x10,0x51,0x04,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x9c,0xb3,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,
+	0xff,0xea,0x9c,0xb5,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9c,0xb7,0x00,0x0a,
+	0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9c,0xb9,0x00,0x0a,0x00,0x10,
+	0x08,0x0a,0xff,0xea,0x9c,0xbb,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,
+	0x9c,0xbd,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9c,0xbf,0x00,0x0a,0x00,0xcf,
+	0x86,0xd5,0xc0,0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,
+	0x9d,0x81,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x83,0x00,0x0a,0x00,0xd1,
+	0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0x85,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,
+	0x9d,0x87,0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0x89,
+	0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x8b,0x00,0x0a,0x00,0xd1,0x0c,0x10,
+	0x08,0x0a,0xff,0xea,0x9d,0x8d,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x8f,
+	0x00,0x0a,0x00,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0x91,
+	0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x93,0x00,0x0a,0x00,0xd1,0x0c,0x10,
+	0x08,0x0a,0xff,0xea,0x9d,0x95,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x97,
+	0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0x99,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x9b,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,
+	0xff,0xea,0x9d,0x9d,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0x9f,0x00,0x0a,
+	0x00,0xd4,0x60,0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0xa1,
+	0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0xa3,0x00,0x0a,0x00,0xd1,0x0c,0x10,
+	0x08,0x0a,0xff,0xea,0x9d,0xa5,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0xa7,
+	0x00,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9d,0xa9,0x00,0x0a,
+	0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0xab,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,
+	0xff,0xea,0x9d,0xad,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0xaf,0x00,0x0a,
+	0x00,0x53,0x04,0x0a,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x04,0x0a,0x00,0x0a,0xff,0xea,
+	0x9d,0xba,0x00,0x10,0x04,0x0a,0x00,0x0a,0xff,0xea,0x9d,0xbc,0x00,0xd1,0x0c,0x10,
+	0x04,0x0a,0x00,0x0a,0xff,0xe1,0xb5,0xb9,0x00,0x10,0x08,0x0a,0xff,0xea,0x9d,0xbf,
+	0x00,0x0a,0x00,0xe0,0x71,0x01,0xcf,0x86,0xd5,0xa6,0xd4,0x4e,0xd3,0x30,0xd2,0x18,
+	0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9e,0x81,0x00,0x0a,0x00,0x10,0x08,0x0a,0xff,
+	0xea,0x9e,0x83,0x00,0x0a,0x00,0xd1,0x0c,0x10,0x08,0x0a,0xff,0xea,0x9e,0x85,0x00,
+	0x0a,0x00,0x10,0x08,0x0a,0xff,0xea,0x9e,0x87,0x00,0x0a,0x00,0xd2,0x10,0x51,0x04,
+	0x0a,0x00,0x10,0x04,0x0a,0x00,0x0a,0xff,0xea,0x9e,0x8c,0x00,0xe1,0x9a,0x64,0x10,
+	0x04,0x0a,0x00,0x0c,0xff,0xc9,0xa5,0x00,0xd3,0x28,0xd2,0x18,0xd1,0x0c,0x10,0x08,
+	0x0c,0xff,0xea,0x9e,0x91,0x00,0x0c,0x00,0x10,0x08,0x0d,0xff,0xea,0x9e,0x93,0x00,
+	0x0d,0x00,0x51,0x04,0x10,0x00,0x10,0x08,0x10,0xff,0xea,0x9e,0x97,0x00,0x10,0x00,
+	0xd2,0x18,0xd1,0x0c,0x10,0x08,0x10,0xff,0xea,0x9e,0x99,0x00,0x10,0x00,0x10,0x08,
+	0x10,0xff,0xea,0x9e,0x9b,0x00,0x10,0x00,0xd1,0x0c,0x10,0x08,0x10,0xff,0xea,0x9e,
+	0x9d,0x00,0x10,0x00,0x10,0x08,0x10,0xff,0xea,0x9e,0x9f,0x00,0x10,0x00,0xd4,0x63,
+	0xd3,0x30,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x0c,0xff,0xea,0x9e,0xa1,0x00,0x0c,0x00,
+	0x10,0x08,0x0c,0xff,0xea,0x9e,0xa3,0x00,0x0c,0x00,0xd1,0x0c,0x10,0x08,0x0c,0xff,
+	0xea,0x9e,0xa5,0x00,0x0c,0x00,0x10,0x08,0x0c,0xff,0xea,0x9e,0xa7,0x00,0x0c,0x00,
+	0xd2,0x1a,0xd1,0x0c,0x10,0x08,0x0c,0xff,0xea,0x9e,0xa9,0x00,0x0c,0x00,0x10,0x07,
+	0x0d,0xff,0xc9,0xa6,0x00,0x10,0xff,0xc9,0x9c,0x00,0xd1,0x0e,0x10,0x07,0x10,0xff,
+	0xc9,0xa1,0x00,0x10,0xff,0xc9,0xac,0x00,0x10,0x07,0x12,0xff,0xc9,0xaa,0x00,0x14,
+	0x00,0xd3,0x35,0xd2,0x1d,0xd1,0x0e,0x10,0x07,0x10,0xff,0xca,0x9e,0x00,0x10,0xff,
+	0xca,0x87,0x00,0x10,0x07,0x11,0xff,0xca,0x9d,0x00,0x11,0xff,0xea,0xad,0x93,0x00,
+	0xd1,0x0c,0x10,0x08,0x11,0xff,0xea,0x9e,0xb5,0x00,0x11,0x00,0x10,0x08,0x11,0xff,
+	0xea,0x9e,0xb7,0x00,0x11,0x00,0xd2,0x18,0xd1,0x0c,0x10,0x08,0x14,0xff,0xea,0x9e,
+	0xb9,0x00,0x14,0x00,0x10,0x08,0x15,0xff,0xea,0x9e,0xbb,0x00,0x15,0x00,0xd1,0x0c,
+	0x10,0x08,0x15,0xff,0xea,0x9e,0xbd,0x00,0x15,0x00,0x10,0x08,0x15,0xff,0xea,0x9e,
+	0xbf,0x00,0x15,0x00,0xcf,0x86,0xe5,0xd4,0x63,0x94,0x2f,0x93,0x2b,0xd2,0x10,0x51,
+	0x04,0x00,0x00,0x10,0x08,0x15,0xff,0xea,0x9f,0x83,0x00,0x15,0x00,0xd1,0x0f,0x10,
+	0x08,0x15,0xff,0xea,0x9e,0x94,0x00,0x15,0xff,0xca,0x82,0x00,0x10,0x08,0x15,0xff,
+	0xe1,0xb6,0x8e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xe4,0xb4,0x66,0xd3,0x1d,0xe2,
+	0x5b,0x64,0xe1,0x0a,0x64,0xe0,0xf7,0x63,0xcf,0x86,0xe5,0xd8,0x63,0x94,0x0b,0x93,
+	0x07,0x62,0xc3,0x63,0x08,0x00,0x08,0x00,0x08,0x00,0xd2,0x0f,0xe1,0x5a,0x65,0xe0,
+	0x27,0x65,0xcf,0x86,0x65,0x0c,0x65,0x0a,0x00,0xd1,0xab,0xd0,0x1a,0xcf,0x86,0xe5,
+	0x17,0x66,0xe4,0xfa,0x65,0xe3,0xe1,0x65,0xe2,0xd4,0x65,0x91,0x08,0x10,0x04,0x00,
+	0x00,0x0c,0x00,0x0c,0x00,0xcf,0x86,0x55,0x04,0x10,0x00,0xd4,0x0b,0x93,0x07,0x62,
+	0x27,0x66,0x11,0x00,0x00,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x11,0xff,
+	0xe1,0x8e,0xa0,0x00,0x11,0xff,0xe1,0x8e,0xa1,0x00,0x10,0x08,0x11,0xff,0xe1,0x8e,
+	0xa2,0x00,0x11,0xff,0xe1,0x8e,0xa3,0x00,0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,
+	0xa4,0x00,0x11,0xff,0xe1,0x8e,0xa5,0x00,0x10,0x08,0x11,0xff,0xe1,0x8e,0xa6,0x00,
+	0x11,0xff,0xe1,0x8e,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,
+	0xa8,0x00,0x11,0xff,0xe1,0x8e,0xa9,0x00,0x10,0x08,0x11,0xff,0xe1,0x8e,0xaa,0x00,
+	0x11,0xff,0xe1,0x8e,0xab,0x00,0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,0xac,0x00,
+	0x11,0xff,0xe1,0x8e,0xad,0x00,0x10,0x08,0x11,0xff,0xe1,0x8e,0xae,0x00,0x11,0xff,
+	0xe1,0x8e,0xaf,0x00,0xe0,0xb2,0x65,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,0xb0,0x00,0x11,0xff,0xe1,0x8e,
+	0xb1,0x00,0x10,0x08,0x11,0xff,0xe1,0x8e,0xb2,0x00,0x11,0xff,0xe1,0x8e,0xb3,0x00,
+	0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,0xb4,0x00,0x11,0xff,0xe1,0x8e,0xb5,0x00,
+	0x10,0x08,0x11,0xff,0xe1,0x8e,0xb6,0x00,0x11,0xff,0xe1,0x8e,0xb7,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8e,0xb8,0x00,0x11,0xff,0xe1,0x8e,0xb9,0x00,
+	0x10,0x08,0x11,0xff,0xe1,0x8e,0xba,0x00,0x11,0xff,0xe1,0x8e,0xbb,0x00,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8e,0xbc,0x00,0x11,0xff,0xe1,0x8e,0xbd,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8e,0xbe,0x00,0x11,0xff,0xe1,0x8e,0xbf,0x00,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8f,0x80,0x00,0x11,0xff,0xe1,0x8f,0x81,0x00,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x82,0x00,0x11,0xff,0xe1,0x8f,0x83,0x00,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x84,0x00,0x11,0xff,0xe1,0x8f,0x85,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x86,0x00,0x11,0xff,0xe1,0x8f,0x87,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x88,0x00,0x11,0xff,0xe1,0x8f,0x89,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x8a,0x00,0x11,0xff,0xe1,0x8f,0x8b,0x00,0xd1,0x10,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x8c,0x00,0x11,0xff,0xe1,0x8f,0x8d,0x00,0x10,0x08,0x11,0xff,
+	0xe1,0x8f,0x8e,0x00,0x11,0xff,0xe1,0x8f,0x8f,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x11,0xff,0xe1,0x8f,0x90,0x00,0x11,0xff,0xe1,0x8f,0x91,0x00,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x92,0x00,0x11,0xff,0xe1,0x8f,0x93,0x00,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x94,0x00,0x11,0xff,0xe1,0x8f,0x95,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x96,0x00,0x11,0xff,0xe1,0x8f,0x97,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0x98,0x00,0x11,0xff,0xe1,0x8f,0x99,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x9a,0x00,0x11,0xff,0xe1,0x8f,0x9b,0x00,0xd1,0x10,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0x9c,0x00,0x11,0xff,0xe1,0x8f,0x9d,0x00,0x10,0x08,0x11,0xff,
+	0xe1,0x8f,0x9e,0x00,0x11,0xff,0xe1,0x8f,0x9f,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x11,0xff,0xe1,0x8f,0xa0,0x00,0x11,0xff,0xe1,0x8f,0xa1,0x00,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0xa2,0x00,0x11,0xff,0xe1,0x8f,0xa3,0x00,0xd1,0x10,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0xa4,0x00,0x11,0xff,0xe1,0x8f,0xa5,0x00,0x10,0x08,0x11,0xff,
+	0xe1,0x8f,0xa6,0x00,0x11,0xff,0xe1,0x8f,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x11,0xff,0xe1,0x8f,0xa8,0x00,0x11,0xff,0xe1,0x8f,0xa9,0x00,0x10,0x08,0x11,0xff,
+	0xe1,0x8f,0xaa,0x00,0x11,0xff,0xe1,0x8f,0xab,0x00,0xd1,0x10,0x10,0x08,0x11,0xff,
+	0xe1,0x8f,0xac,0x00,0x11,0xff,0xe1,0x8f,0xad,0x00,0x10,0x08,0x11,0xff,0xe1,0x8f,
+	0xae,0x00,0x11,0xff,0xe1,0x8f,0xaf,0x00,0xd1,0x0c,0xe0,0xeb,0x63,0xcf,0x86,0xcf,
+	0x06,0x02,0xff,0xff,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x06,
+	0xcf,0x06,0x01,0x00,0xd4,0xae,0xd3,0x09,0xe2,0x54,0x64,0xcf,0x06,0x01,0x00,0xd2,
+	0x27,0xe1,0x1f,0x70,0xe0,0x26,0x6e,0xcf,0x86,0xe5,0x3f,0x6d,0xe4,0xce,0x6c,0xe3,
+	0x99,0x6c,0xe2,0x78,0x6c,0xe1,0x67,0x6c,0x10,0x08,0x01,0xff,0xe5,0x88,0x87,0x00,
+	0x01,0xff,0xe5,0xba,0xa6,0x00,0xe1,0x74,0x74,0xe0,0xe8,0x73,0xcf,0x86,0xe5,0x22,
+	0x73,0xd4,0x3b,0x93,0x37,0xd2,0x1d,0xd1,0x0e,0x10,0x07,0x01,0xff,0x66,0x66,0x00,
+	0x01,0xff,0x66,0x69,0x00,0x10,0x07,0x01,0xff,0x66,0x6c,0x00,0x01,0xff,0x66,0x66,
+	0x69,0x00,0xd1,0x0f,0x10,0x08,0x01,0xff,0x66,0x66,0x6c,0x00,0x01,0xff,0x73,0x74,
+	0x00,0x10,0x07,0x01,0xff,0x73,0x74,0x00,0x00,0x00,0x00,0x00,0xe3,0xc8,0x72,0xd2,
+	0x11,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0xff,0xd5,0xb4,0xd5,0xb6,0x00,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xd5,0xb4,0xd5,0xa5,0x00,0x01,0xff,0xd5,0xb4,0xd5,
+	0xab,0x00,0x10,0x09,0x01,0xff,0xd5,0xbe,0xd5,0xb6,0x00,0x01,0xff,0xd5,0xb4,0xd5,
+	0xad,0x00,0xd3,0x09,0xe2,0x40,0x74,0xcf,0x06,0x01,0x00,0xd2,0x13,0xe1,0x30,0x75,
+	0xe0,0xc1,0x74,0xcf,0x86,0xe5,0x9e,0x74,0x64,0x8d,0x74,0x06,0xff,0x00,0xe1,0x96,
+	0x75,0xe0,0x63,0x75,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x7c,
+	0xd3,0x3c,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0xef,0xbd,0x81,0x00,
+	0x10,0x08,0x01,0xff,0xef,0xbd,0x82,0x00,0x01,0xff,0xef,0xbd,0x83,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xef,0xbd,0x84,0x00,0x01,0xff,0xef,0xbd,0x85,0x00,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x86,0x00,0x01,0xff,0xef,0xbd,0x87,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xef,0xbd,0x88,0x00,0x01,0xff,0xef,0xbd,0x89,0x00,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x8a,0x00,0x01,0xff,0xef,0xbd,0x8b,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x8c,0x00,0x01,0xff,0xef,0xbd,0x8d,0x00,0x10,0x08,0x01,0xff,
+	0xef,0xbd,0x8e,0x00,0x01,0xff,0xef,0xbd,0x8f,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xef,0xbd,0x90,0x00,0x01,0xff,0xef,0xbd,0x91,0x00,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x92,0x00,0x01,0xff,0xef,0xbd,0x93,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x94,0x00,0x01,0xff,0xef,0xbd,0x95,0x00,0x10,0x08,0x01,0xff,
+	0xef,0xbd,0x96,0x00,0x01,0xff,0xef,0xbd,0x97,0x00,0x92,0x1c,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xef,0xbd,0x98,0x00,0x01,0xff,0xef,0xbd,0x99,0x00,0x10,0x08,0x01,0xff,
+	0xef,0xbd,0x9a,0x00,0x01,0x00,0x01,0x00,0x83,0xe2,0x87,0xb3,0xe1,0x60,0xb0,0xe0,
+	0xdd,0xae,0xcf,0x86,0xe5,0x81,0x9b,0xc4,0xe3,0xc1,0x07,0xe2,0x62,0x06,0xe1,0x11,
+	0x86,0xe0,0x09,0x05,0xcf,0x86,0xe5,0xfb,0x02,0xd4,0x1c,0xe3,0x7f,0x76,0xe2,0xd6,
+	0x75,0xe1,0xb1,0x75,0xe0,0x8a,0x75,0xcf,0x86,0xe5,0x57,0x75,0x94,0x07,0x63,0x42,
+	0x75,0x07,0x00,0x07,0x00,0xe3,0x2b,0x78,0xe2,0xf0,0x77,0xe1,0x77,0x01,0xe0,0x88,
+	0x77,0xcf,0x86,0xe5,0x21,0x01,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,
+	0x05,0xff,0xf0,0x90,0x90,0xa8,0x00,0x05,0xff,0xf0,0x90,0x90,0xa9,0x00,0x10,0x09,
+	0x05,0xff,0xf0,0x90,0x90,0xaa,0x00,0x05,0xff,0xf0,0x90,0x90,0xab,0x00,0xd1,0x12,
+	0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xac,0x00,0x05,0xff,0xf0,0x90,0x90,0xad,0x00,
+	0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xae,0x00,0x05,0xff,0xf0,0x90,0x90,0xaf,0x00,
+	0xd2,0x24,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xb0,0x00,0x05,0xff,0xf0,
+	0x90,0x90,0xb1,0x00,0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xb2,0x00,0x05,0xff,0xf0,
+	0x90,0x90,0xb3,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xb4,0x00,0x05,
+	0xff,0xf0,0x90,0x90,0xb5,0x00,0x10,0x09,0x05,0xff,0xf0,0x90,0x90,0xb6,0x00,0x05,
+	0xff,0xf0,0x90,0x90,0xb7,0x00,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x05,0xff,
+	0xf0,0x90,0x90,0xb8,0x00,0x05,0xff,0xf0,0x90,0x90,0xb9,0x00,0x10,0x09,0x05,0xff,
+	0xf0,0x90,0x90,0xba,0x00,0x05,0xff,0xf0,0x90,0x90,0xbb,0x00,0xd1,0x12,0x10,0x09,
+	0x05,0xff,0xf0,0x90,0x90,0xbc,0x00,0x05,0xff,0xf0,0x90,0x90,0xbd,0x00,0x10,0x09,
+	0x05,0xff,0xf0,0x90,0x90,0xbe,0x00,0x05,0xff,0xf0,0x90,0x90,0xbf,0x00,0xd2,0x24,
+	0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0x90,0x91,0x80,0x00,0x05,0xff,0xf0,0x90,0x91,
+	0x81,0x00,0x10,0x09,0x05,0xff,0xf0,0x90,0x91,0x82,0x00,0x05,0xff,0xf0,0x90,0x91,
+	0x83,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0x90,0x91,0x84,0x00,0x05,0xff,0xf0,
+	0x90,0x91,0x85,0x00,0x10,0x09,0x05,0xff,0xf0,0x90,0x91,0x86,0x00,0x05,0xff,0xf0,
+	0x90,0x91,0x87,0x00,0x94,0x4c,0x93,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x05,0xff,
+	0xf0,0x90,0x91,0x88,0x00,0x05,0xff,0xf0,0x90,0x91,0x89,0x00,0x10,0x09,0x05,0xff,
+	0xf0,0x90,0x91,0x8a,0x00,0x05,0xff,0xf0,0x90,0x91,0x8b,0x00,0xd1,0x12,0x10,0x09,
+	0x05,0xff,0xf0,0x90,0x91,0x8c,0x00,0x05,0xff,0xf0,0x90,0x91,0x8d,0x00,0x10,0x09,
+	0x07,0xff,0xf0,0x90,0x91,0x8e,0x00,0x07,0xff,0xf0,0x90,0x91,0x8f,0x00,0x05,0x00,
+	0x05,0x00,0xd0,0xa0,0xcf,0x86,0xd5,0x07,0x64,0x30,0x76,0x07,0x00,0xd4,0x07,0x63,
+	0x3d,0x76,0x07,0x00,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x90,
+	0x93,0x98,0x00,0x12,0xff,0xf0,0x90,0x93,0x99,0x00,0x10,0x09,0x12,0xff,0xf0,0x90,
+	0x93,0x9a,0x00,0x12,0xff,0xf0,0x90,0x93,0x9b,0x00,0xd1,0x12,0x10,0x09,0x12,0xff,
+	0xf0,0x90,0x93,0x9c,0x00,0x12,0xff,0xf0,0x90,0x93,0x9d,0x00,0x10,0x09,0x12,0xff,
+	0xf0,0x90,0x93,0x9e,0x00,0x12,0xff,0xf0,0x90,0x93,0x9f,0x00,0xd2,0x24,0xd1,0x12,
+	0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xa0,0x00,0x12,0xff,0xf0,0x90,0x93,0xa1,0x00,
+	0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xa2,0x00,0x12,0xff,0xf0,0x90,0x93,0xa3,0x00,
+	0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xa4,0x00,0x12,0xff,0xf0,0x90,0x93,
+	0xa5,0x00,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xa6,0x00,0x12,0xff,0xf0,0x90,0x93,
+	0xa7,0x00,0xcf,0x86,0xe5,0xc6,0x75,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,
+	0x09,0x12,0xff,0xf0,0x90,0x93,0xa8,0x00,0x12,0xff,0xf0,0x90,0x93,0xa9,0x00,0x10,
+	0x09,0x12,0xff,0xf0,0x90,0x93,0xaa,0x00,0x12,0xff,0xf0,0x90,0x93,0xab,0x00,0xd1,
+	0x12,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xac,0x00,0x12,0xff,0xf0,0x90,0x93,0xad,
+	0x00,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xae,0x00,0x12,0xff,0xf0,0x90,0x93,0xaf,
+	0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xb0,0x00,0x12,0xff,
+	0xf0,0x90,0x93,0xb1,0x00,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xb2,0x00,0x12,0xff,
+	0xf0,0x90,0x93,0xb3,0x00,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xb4,0x00,
+	0x12,0xff,0xf0,0x90,0x93,0xb5,0x00,0x10,0x09,0x12,0xff,0xf0,0x90,0x93,0xb6,0x00,
+	0x12,0xff,0xf0,0x90,0x93,0xb7,0x00,0x93,0x28,0x92,0x24,0xd1,0x12,0x10,0x09,0x12,
+	0xff,0xf0,0x90,0x93,0xb8,0x00,0x12,0xff,0xf0,0x90,0x93,0xb9,0x00,0x10,0x09,0x12,
+	0xff,0xf0,0x90,0x93,0xba,0x00,0x12,0xff,0xf0,0x90,0x93,0xbb,0x00,0x00,0x00,0x12,
+	0x00,0xd4,0x1f,0xe3,0xdf,0x76,0xe2,0x6a,0x76,0xe1,0x09,0x76,0xe0,0xea,0x75,0xcf,
+	0x86,0xe5,0xb7,0x75,0x94,0x0a,0xe3,0xa2,0x75,0x62,0x99,0x75,0x07,0x00,0x07,0x00,
+	0xe3,0xde,0x78,0xe2,0xaf,0x78,0xd1,0x09,0xe0,0x4c,0x78,0xcf,0x06,0x0b,0x00,0xe0,
+	0x7f,0x78,0xcf,0x86,0xe5,0x21,0x01,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0x80,0x00,0x11,0xff,0xf0,0x90,0xb3,0x81,0x00,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0x82,0x00,0x11,0xff,0xf0,0x90,0xb3,0x83,0x00,0xd1,
+	0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x84,0x00,0x11,0xff,0xf0,0x90,0xb3,0x85,
+	0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x86,0x00,0x11,0xff,0xf0,0x90,0xb3,0x87,
+	0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x88,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0x89,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x8a,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0x8b,0x00,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x8c,0x00,
+	0x11,0xff,0xf0,0x90,0xb3,0x8d,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x8e,0x00,
+	0x11,0xff,0xf0,0x90,0xb3,0x8f,0x00,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x11,
+	0xff,0xf0,0x90,0xb3,0x90,0x00,0x11,0xff,0xf0,0x90,0xb3,0x91,0x00,0x10,0x09,0x11,
+	0xff,0xf0,0x90,0xb3,0x92,0x00,0x11,0xff,0xf0,0x90,0xb3,0x93,0x00,0xd1,0x12,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0x94,0x00,0x11,0xff,0xf0,0x90,0xb3,0x95,0x00,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0x96,0x00,0x11,0xff,0xf0,0x90,0xb3,0x97,0x00,0xd2,
+	0x24,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x98,0x00,0x11,0xff,0xf0,0x90,
+	0xb3,0x99,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x9a,0x00,0x11,0xff,0xf0,0x90,
+	0xb3,0x9b,0x00,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x9c,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0x9d,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0x9e,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0x9f,0x00,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x11,
+	0xff,0xf0,0x90,0xb3,0xa0,0x00,0x11,0xff,0xf0,0x90,0xb3,0xa1,0x00,0x10,0x09,0x11,
+	0xff,0xf0,0x90,0xb3,0xa2,0x00,0x11,0xff,0xf0,0x90,0xb3,0xa3,0x00,0xd1,0x12,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0xa4,0x00,0x11,0xff,0xf0,0x90,0xb3,0xa5,0x00,0x10,
+	0x09,0x11,0xff,0xf0,0x90,0xb3,0xa6,0x00,0x11,0xff,0xf0,0x90,0xb3,0xa7,0x00,0xd2,
+	0x24,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0xa8,0x00,0x11,0xff,0xf0,0x90,
+	0xb3,0xa9,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0xaa,0x00,0x11,0xff,0xf0,0x90,
+	0xb3,0xab,0x00,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0xac,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0xad,0x00,0x10,0x09,0x11,0xff,0xf0,0x90,0xb3,0xae,0x00,0x11,0xff,
+	0xf0,0x90,0xb3,0xaf,0x00,0x93,0x23,0x92,0x1f,0xd1,0x12,0x10,0x09,0x11,0xff,0xf0,
+	0x90,0xb3,0xb0,0x00,0x11,0xff,0xf0,0x90,0xb3,0xb1,0x00,0x10,0x09,0x11,0xff,0xf0,
+	0x90,0xb3,0xb2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x15,0xe4,0x91,
+	0x7b,0xe3,0x9b,0x79,0xe2,0x94,0x78,0xe1,0xe4,0x77,0xe0,0x9d,0x77,0xcf,0x06,0x0c,
+	0x00,0xe4,0xeb,0x7e,0xe3,0x44,0x7e,0xe2,0xed,0x7d,0xd1,0x0c,0xe0,0xb2,0x7d,0xcf,
+	0x86,0x65,0x93,0x7d,0x14,0x00,0xe0,0xb6,0x7d,0xcf,0x86,0x55,0x04,0x00,0x00,0xd4,
+	0x90,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x80,0x00,
+	0x10,0xff,0xf0,0x91,0xa3,0x81,0x00,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x82,0x00,
+	0x10,0xff,0xf0,0x91,0xa3,0x83,0x00,0xd1,0x12,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,
+	0x84,0x00,0x10,0xff,0xf0,0x91,0xa3,0x85,0x00,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,
+	0x86,0x00,0x10,0xff,0xf0,0x91,0xa3,0x87,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x10,
+	0xff,0xf0,0x91,0xa3,0x88,0x00,0x10,0xff,0xf0,0x91,0xa3,0x89,0x00,0x10,0x09,0x10,
+	0xff,0xf0,0x91,0xa3,0x8a,0x00,0x10,0xff,0xf0,0x91,0xa3,0x8b,0x00,0xd1,0x12,0x10,
+	0x09,0x10,0xff,0xf0,0x91,0xa3,0x8c,0x00,0x10,0xff,0xf0,0x91,0xa3,0x8d,0x00,0x10,
+	0x09,0x10,0xff,0xf0,0x91,0xa3,0x8e,0x00,0x10,0xff,0xf0,0x91,0xa3,0x8f,0x00,0xd3,
+	0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x90,0x00,0x10,0xff,
+	0xf0,0x91,0xa3,0x91,0x00,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x92,0x00,0x10,0xff,
+	0xf0,0x91,0xa3,0x93,0x00,0xd1,0x12,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x94,0x00,
+	0x10,0xff,0xf0,0x91,0xa3,0x95,0x00,0x10,0x09,0x10,0xff,0xf0,0x91,0xa3,0x96,0x00,
+	0x10,0xff,0xf0,0x91,0xa3,0x97,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x10,0xff,0xf0,
+	0x91,0xa3,0x98,0x00,0x10,0xff,0xf0,0x91,0xa3,0x99,0x00,0x10,0x09,0x10,0xff,0xf0,
+	0x91,0xa3,0x9a,0x00,0x10,0xff,0xf0,0x91,0xa3,0x9b,0x00,0xd1,0x12,0x10,0x09,0x10,
+	0xff,0xf0,0x91,0xa3,0x9c,0x00,0x10,0xff,0xf0,0x91,0xa3,0x9d,0x00,0x10,0x09,0x10,
+	0xff,0xf0,0x91,0xa3,0x9e,0x00,0x10,0xff,0xf0,0x91,0xa3,0x9f,0x00,0xd1,0x11,0xe0,
+	0x12,0x81,0xcf,0x86,0xe5,0x09,0x81,0xe4,0xd2,0x80,0xcf,0x06,0x00,0x00,0xe0,0xdb,
+	0x82,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x09,0xe3,0x10,0x81,0xcf,0x06,
+	0x0c,0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xe2,0x3b,0x82,0xe1,0x16,0x82,0xd0,0x06,
+	0xcf,0x06,0x00,0x00,0xcf,0x86,0xa5,0x21,0x01,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,
+	0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xa0,0x00,0x14,0xff,0xf0,0x96,0xb9,0xa1,
+	0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xa2,0x00,0x14,0xff,0xf0,0x96,0xb9,0xa3,
+	0x00,0xd1,0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xa4,0x00,0x14,0xff,0xf0,0x96,
+	0xb9,0xa5,0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xa6,0x00,0x14,0xff,0xf0,0x96,
+	0xb9,0xa7,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xa8,0x00,
+	0x14,0xff,0xf0,0x96,0xb9,0xa9,0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xaa,0x00,
+	0x14,0xff,0xf0,0x96,0xb9,0xab,0x00,0xd1,0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,
+	0xac,0x00,0x14,0xff,0xf0,0x96,0xb9,0xad,0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,
+	0xae,0x00,0x14,0xff,0xf0,0x96,0xb9,0xaf,0x00,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,
+	0x09,0x14,0xff,0xf0,0x96,0xb9,0xb0,0x00,0x14,0xff,0xf0,0x96,0xb9,0xb1,0x00,0x10,
+	0x09,0x14,0xff,0xf0,0x96,0xb9,0xb2,0x00,0x14,0xff,0xf0,0x96,0xb9,0xb3,0x00,0xd1,
+	0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xb4,0x00,0x14,0xff,0xf0,0x96,0xb9,0xb5,
+	0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xb6,0x00,0x14,0xff,0xf0,0x96,0xb9,0xb7,
+	0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xb8,0x00,0x14,0xff,
+	0xf0,0x96,0xb9,0xb9,0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xba,0x00,0x14,0xff,
+	0xf0,0x96,0xb9,0xbb,0x00,0xd1,0x12,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xbc,0x00,
+	0x14,0xff,0xf0,0x96,0xb9,0xbd,0x00,0x10,0x09,0x14,0xff,0xf0,0x96,0xb9,0xbe,0x00,
+	0x14,0xff,0xf0,0x96,0xb9,0xbf,0x00,0x14,0x00,0xd2,0x14,0xe1,0x25,0x82,0xe0,0x1c,
+	0x82,0xcf,0x86,0xe5,0xdd,0x81,0xe4,0x9a,0x81,0xcf,0x06,0x12,0x00,0xd1,0x0b,0xe0,
+	0x51,0x83,0xcf,0x86,0xcf,0x06,0x00,0x00,0xe0,0x95,0x8b,0xcf,0x86,0xd5,0x22,0xe4,
+	0xd0,0x88,0xe3,0x93,0x88,0xe2,0x38,0x88,0xe1,0x31,0x88,0xe0,0x2a,0x88,0xcf,0x86,
+	0xe5,0xfb,0x87,0xe4,0xe2,0x87,0x93,0x07,0x62,0xd1,0x87,0x12,0xe6,0x12,0xe6,0xe4,
+	0x36,0x89,0xe3,0x2f,0x89,0xd2,0x09,0xe1,0xb8,0x88,0xcf,0x06,0x10,0x00,0xe1,0x1f,
+	0x89,0xe0,0xec,0x88,0xcf,0x86,0xe5,0x21,0x01,0xd4,0x90,0xd3,0x48,0xd2,0x24,0xd1,
+	0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xa2,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xa3,
+	0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xa4,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xa5,
+	0x00,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xa6,0x00,0x12,0xff,0xf0,0x9e,
+	0xa4,0xa7,0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xa8,0x00,0x12,0xff,0xf0,0x9e,
+	0xa4,0xa9,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xaa,0x00,
+	0x12,0xff,0xf0,0x9e,0xa4,0xab,0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xac,0x00,
+	0x12,0xff,0xf0,0x9e,0xa4,0xad,0x00,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,
+	0xae,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xaf,0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,
+	0xb0,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xb1,0x00,0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,
+	0x09,0x12,0xff,0xf0,0x9e,0xa4,0xb2,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xb3,0x00,0x10,
+	0x09,0x12,0xff,0xf0,0x9e,0xa4,0xb4,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xb5,0x00,0xd1,
+	0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xb6,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xb7,
+	0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xb8,0x00,0x12,0xff,0xf0,0x9e,0xa4,0xb9,
+	0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xba,0x00,0x12,0xff,
+	0xf0,0x9e,0xa4,0xbb,0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xbc,0x00,0x12,0xff,
+	0xf0,0x9e,0xa4,0xbd,0x00,0xd1,0x12,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa4,0xbe,0x00,
+	0x12,0xff,0xf0,0x9e,0xa4,0xbf,0x00,0x10,0x09,0x12,0xff,0xf0,0x9e,0xa5,0x80,0x00,
+	0x12,0xff,0xf0,0x9e,0xa5,0x81,0x00,0x94,0x1e,0x93,0x1a,0x92,0x16,0x91,0x12,0x10,
+	0x09,0x12,0xff,0xf0,0x9e,0xa5,0x82,0x00,0x12,0xff,0xf0,0x9e,0xa5,0x83,0x00,0x12,
+	0x00,0x12,0x00,0x12,0x00,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	/* nfdi_c0100 */
+	0x57,0x04,0x01,0x00,0xc6,0xe5,0xac,0x13,0xe4,0x41,0x0c,0xe3,0x7a,0x07,0xe2,0xf3,
+	0x01,0xc1,0xd0,0x1f,0xcf,0x86,0x55,0x04,0x01,0x00,0x94,0x15,0x53,0x04,0x01,0x00,
+	0x52,0x04,0x01,0x00,0x91,0x09,0x10,0x04,0x01,0x00,0x01,0xff,0x00,0x01,0x00,0x01,
+	0x00,0xcf,0x86,0xd5,0xe4,0xd4,0x7c,0xd3,0x3c,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x41,0xcc,0x80,0x00,0x01,0xff,0x41,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x41,
+	0xcc,0x82,0x00,0x01,0xff,0x41,0xcc,0x83,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x41,
+	0xcc,0x88,0x00,0x01,0xff,0x41,0xcc,0x8a,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0x43,
+	0xcc,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x45,0xcc,0x80,0x00,0x01,
+	0xff,0x45,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x45,0xcc,0x82,0x00,0x01,0xff,0x45,
+	0xcc,0x88,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x49,0xcc,0x80,0x00,0x01,0xff,0x49,
+	0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x49,0xcc,0x82,0x00,0x01,0xff,0x49,0xcc,0x88,
+	0x00,0xd3,0x38,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x4e,0xcc,0x83,
+	0x00,0x10,0x08,0x01,0xff,0x4f,0xcc,0x80,0x00,0x01,0xff,0x4f,0xcc,0x81,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x4f,0xcc,0x82,0x00,0x01,0xff,0x4f,0xcc,0x83,0x00,0x10,
+	0x08,0x01,0xff,0x4f,0xcc,0x88,0x00,0x01,0x00,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,
+	0x00,0x01,0xff,0x55,0xcc,0x80,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0x81,0x00,0x01,
+	0xff,0x55,0xcc,0x82,0x00,0x91,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0x88,0x00,0x01,
+	0xff,0x59,0xcc,0x81,0x00,0x01,0x00,0xd4,0x7c,0xd3,0x3c,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x61,0xcc,0x80,0x00,0x01,0xff,0x61,0xcc,0x81,0x00,0x10,0x08,0x01,
+	0xff,0x61,0xcc,0x82,0x00,0x01,0xff,0x61,0xcc,0x83,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x61,0xcc,0x88,0x00,0x01,0xff,0x61,0xcc,0x8a,0x00,0x10,0x04,0x01,0x00,0x01,
+	0xff,0x63,0xcc,0xa7,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x65,0xcc,0x80,
+	0x00,0x01,0xff,0x65,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x65,0xcc,0x82,0x00,0x01,
+	0xff,0x65,0xcc,0x88,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,0x80,0x00,0x01,
+	0xff,0x69,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x69,0xcc,0x82,0x00,0x01,0xff,0x69,
+	0xcc,0x88,0x00,0xd3,0x38,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x6e,
+	0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x80,0x00,0x01,0xff,0x6f,0xcc,0x81,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x6f,0xcc,0x82,0x00,0x01,0xff,0x6f,0xcc,0x83,
+	0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x88,0x00,0x01,0x00,0xd2,0x1c,0xd1,0x0c,0x10,
+	0x04,0x01,0x00,0x01,0xff,0x75,0xcc,0x80,0x00,0x10,0x08,0x01,0xff,0x75,0xcc,0x81,
+	0x00,0x01,0xff,0x75,0xcc,0x82,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x75,0xcc,0x88,
+	0x00,0x01,0xff,0x79,0xcc,0x81,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0x79,0xcc,0x88,
+	0x00,0xe1,0x9a,0x03,0xe0,0xd3,0x01,0xcf,0x86,0xd5,0xf4,0xd4,0x80,0xd3,0x40,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x41,0xcc,0x84,0x00,0x01,0xff,0x61,0xcc,0x84,
+	0x00,0x10,0x08,0x01,0xff,0x41,0xcc,0x86,0x00,0x01,0xff,0x61,0xcc,0x86,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x41,0xcc,0xa8,0x00,0x01,0xff,0x61,0xcc,0xa8,0x00,0x10,
+	0x08,0x01,0xff,0x43,0xcc,0x81,0x00,0x01,0xff,0x63,0xcc,0x81,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x43,0xcc,0x82,0x00,0x01,0xff,0x63,0xcc,0x82,0x00,0x10,
+	0x08,0x01,0xff,0x43,0xcc,0x87,0x00,0x01,0xff,0x63,0xcc,0x87,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x43,0xcc,0x8c,0x00,0x01,0xff,0x63,0xcc,0x8c,0x00,0x10,0x08,0x01,
+	0xff,0x44,0xcc,0x8c,0x00,0x01,0xff,0x64,0xcc,0x8c,0x00,0xd3,0x34,0xd2,0x14,0x51,
+	0x04,0x01,0x00,0x10,0x08,0x01,0xff,0x45,0xcc,0x84,0x00,0x01,0xff,0x65,0xcc,0x84,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x45,0xcc,0x86,0x00,0x01,0xff,0x65,0xcc,0x86,
+	0x00,0x10,0x08,0x01,0xff,0x45,0xcc,0x87,0x00,0x01,0xff,0x65,0xcc,0x87,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x45,0xcc,0xa8,0x00,0x01,0xff,0x65,0xcc,0xa8,
+	0x00,0x10,0x08,0x01,0xff,0x45,0xcc,0x8c,0x00,0x01,0xff,0x65,0xcc,0x8c,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x47,0xcc,0x82,0x00,0x01,0xff,0x67,0xcc,0x82,0x00,0x10,
+	0x08,0x01,0xff,0x47,0xcc,0x86,0x00,0x01,0xff,0x67,0xcc,0x86,0x00,0xd4,0x74,0xd3,
+	0x34,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x47,0xcc,0x87,0x00,0x01,0xff,0x67,
+	0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x47,0xcc,0xa7,0x00,0x01,0xff,0x67,0xcc,0xa7,
+	0x00,0x91,0x10,0x10,0x08,0x01,0xff,0x48,0xcc,0x82,0x00,0x01,0xff,0x68,0xcc,0x82,
+	0x00,0x01,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x49,0xcc,0x83,0x00,0x01,
+	0xff,0x69,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x49,0xcc,0x84,0x00,0x01,0xff,0x69,
+	0xcc,0x84,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x49,0xcc,0x86,0x00,0x01,0xff,0x69,
+	0xcc,0x86,0x00,0x10,0x08,0x01,0xff,0x49,0xcc,0xa8,0x00,0x01,0xff,0x69,0xcc,0xa8,
+	0x00,0xd3,0x30,0xd2,0x10,0x91,0x0c,0x10,0x08,0x01,0xff,0x49,0xcc,0x87,0x00,0x01,
+	0x00,0x01,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x4a,0xcc,0x82,0x00,0x01,0xff,0x6a,
+	0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x4b,0xcc,0xa7,0x00,0x01,0xff,0x6b,0xcc,0xa7,
+	0x00,0xd2,0x1c,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x4c,0xcc,0x81,0x00,0x10,
+	0x08,0x01,0xff,0x6c,0xcc,0x81,0x00,0x01,0xff,0x4c,0xcc,0xa7,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x6c,0xcc,0xa7,0x00,0x01,0xff,0x4c,0xcc,0x8c,0x00,0x10,0x08,0x01,
+	0xff,0x6c,0xcc,0x8c,0x00,0x01,0x00,0xcf,0x86,0xd5,0xd4,0xd4,0x60,0xd3,0x30,0xd2,
+	0x10,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0x4e,0xcc,0x81,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x6e,0xcc,0x81,0x00,0x01,0xff,0x4e,0xcc,0xa7,0x00,0x10,
+	0x08,0x01,0xff,0x6e,0xcc,0xa7,0x00,0x01,0xff,0x4e,0xcc,0x8c,0x00,0xd2,0x10,0x91,
+	0x0c,0x10,0x08,0x01,0xff,0x6e,0xcc,0x8c,0x00,0x01,0x00,0x01,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x4f,0xcc,0x84,0x00,0x01,0xff,0x6f,0xcc,0x84,0x00,0x10,0x08,0x01,
+	0xff,0x4f,0xcc,0x86,0x00,0x01,0xff,0x6f,0xcc,0x86,0x00,0xd3,0x34,0xd2,0x14,0x91,
+	0x10,0x10,0x08,0x01,0xff,0x4f,0xcc,0x8b,0x00,0x01,0xff,0x6f,0xcc,0x8b,0x00,0x01,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x52,0xcc,0x81,0x00,0x01,0xff,0x72,0xcc,0x81,
+	0x00,0x10,0x08,0x01,0xff,0x52,0xcc,0xa7,0x00,0x01,0xff,0x72,0xcc,0xa7,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x52,0xcc,0x8c,0x00,0x01,0xff,0x72,0xcc,0x8c,
+	0x00,0x10,0x08,0x01,0xff,0x53,0xcc,0x81,0x00,0x01,0xff,0x73,0xcc,0x81,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x53,0xcc,0x82,0x00,0x01,0xff,0x73,0xcc,0x82,0x00,0x10,
+	0x08,0x01,0xff,0x53,0xcc,0xa7,0x00,0x01,0xff,0x73,0xcc,0xa7,0x00,0xd4,0x74,0xd3,
+	0x34,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x53,0xcc,0x8c,0x00,0x01,0xff,0x73,
+	0xcc,0x8c,0x00,0x10,0x08,0x01,0xff,0x54,0xcc,0xa7,0x00,0x01,0xff,0x74,0xcc,0xa7,
+	0x00,0x91,0x10,0x10,0x08,0x01,0xff,0x54,0xcc,0x8c,0x00,0x01,0xff,0x74,0xcc,0x8c,
+	0x00,0x01,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0x83,0x00,0x01,
+	0xff,0x75,0xcc,0x83,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0x84,0x00,0x01,0xff,0x75,
+	0xcc,0x84,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0x86,0x00,0x01,0xff,0x75,
+	0xcc,0x86,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0x8a,0x00,0x01,0xff,0x75,0xcc,0x8a,
+	0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0x8b,0x00,0x01,
+	0xff,0x75,0xcc,0x8b,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0xa8,0x00,0x01,0xff,0x75,
+	0xcc,0xa8,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x57,0xcc,0x82,0x00,0x01,0xff,0x77,
+	0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x59,0xcc,0x82,0x00,0x01,0xff,0x79,0xcc,0x82,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x59,0xcc,0x88,0x00,0x01,0xff,0x5a,
+	0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x7a,0xcc,0x81,0x00,0x01,0xff,0x5a,0xcc,0x87,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x7a,0xcc,0x87,0x00,0x01,0xff,0x5a,0xcc,0x8c,
+	0x00,0x10,0x08,0x01,0xff,0x7a,0xcc,0x8c,0x00,0x01,0x00,0xd0,0x4a,0xcf,0x86,0x55,
+	0x04,0x01,0x00,0xd4,0x2c,0xd3,0x18,0x92,0x14,0x91,0x10,0x10,0x08,0x01,0xff,0x4f,
+	0xcc,0x9b,0x00,0x01,0xff,0x6f,0xcc,0x9b,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,
+	0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0x55,0xcc,0x9b,0x00,0x93,
+	0x14,0x92,0x10,0x91,0x0c,0x10,0x08,0x01,0xff,0x75,0xcc,0x9b,0x00,0x01,0x00,0x01,
+	0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0xb4,0xd4,0x24,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0x41,0xcc,0x8c,0x00,0x10,
+	0x08,0x01,0xff,0x61,0xcc,0x8c,0x00,0x01,0xff,0x49,0xcc,0x8c,0x00,0xd3,0x46,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x69,0xcc,0x8c,0x00,0x01,0xff,0x4f,0xcc,0x8c,
+	0x00,0x10,0x08,0x01,0xff,0x6f,0xcc,0x8c,0x00,0x01,0xff,0x55,0xcc,0x8c,0x00,0xd1,
+	0x12,0x10,0x08,0x01,0xff,0x75,0xcc,0x8c,0x00,0x01,0xff,0x55,0xcc,0x88,0xcc,0x84,
+	0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,0xcc,0x84,0x00,0x01,0xff,0x55,0xcc,0x88,
+	0xcc,0x81,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,0xcc,0x81,
+	0x00,0x01,0xff,0x55,0xcc,0x88,0xcc,0x8c,0x00,0x10,0x0a,0x01,0xff,0x75,0xcc,0x88,
+	0xcc,0x8c,0x00,0x01,0xff,0x55,0xcc,0x88,0xcc,0x80,0x00,0xd1,0x0e,0x10,0x0a,0x01,
+	0xff,0x75,0xcc,0x88,0xcc,0x80,0x00,0x01,0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x88,
+	0xcc,0x84,0x00,0x01,0xff,0x61,0xcc,0x88,0xcc,0x84,0x00,0xd4,0x80,0xd3,0x3a,0xd2,
+	0x26,0xd1,0x14,0x10,0x0a,0x01,0xff,0x41,0xcc,0x87,0xcc,0x84,0x00,0x01,0xff,0x61,
+	0xcc,0x87,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xc3,0x86,0xcc,0x84,0x00,0x01,0xff,
+	0xc3,0xa6,0xcc,0x84,0x00,0x51,0x04,0x01,0x00,0x10,0x08,0x01,0xff,0x47,0xcc,0x8c,
+	0x00,0x01,0xff,0x67,0xcc,0x8c,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x4b,
+	0xcc,0x8c,0x00,0x01,0xff,0x6b,0xcc,0x8c,0x00,0x10,0x08,0x01,0xff,0x4f,0xcc,0xa8,
+	0x00,0x01,0xff,0x6f,0xcc,0xa8,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,0xa8,
+	0xcc,0x84,0x00,0x01,0xff,0x6f,0xcc,0xa8,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xc6,
+	0xb7,0xcc,0x8c,0x00,0x01,0xff,0xca,0x92,0xcc,0x8c,0x00,0xd3,0x24,0xd2,0x10,0x91,
+	0x0c,0x10,0x08,0x01,0xff,0x6a,0xcc,0x8c,0x00,0x01,0x00,0x01,0x00,0x91,0x10,0x10,
+	0x08,0x01,0xff,0x47,0xcc,0x81,0x00,0x01,0xff,0x67,0xcc,0x81,0x00,0x04,0x00,0xd2,
+	0x24,0xd1,0x10,0x10,0x08,0x04,0xff,0x4e,0xcc,0x80,0x00,0x04,0xff,0x6e,0xcc,0x80,
+	0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x8a,0xcc,0x81,0x00,0x01,0xff,0x61,0xcc,0x8a,
+	0xcc,0x81,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xc3,0x86,0xcc,0x81,0x00,0x01,0xff,
+	0xc3,0xa6,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xc3,0x98,0xcc,0x81,0x00,0x01,0xff,
+	0xc3,0xb8,0xcc,0x81,0x00,0xe2,0x07,0x02,0xe1,0xae,0x01,0xe0,0x93,0x01,0xcf,0x86,
+	0xd5,0xf4,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x41,0xcc,
+	0x8f,0x00,0x01,0xff,0x61,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x41,0xcc,0x91,0x00,
+	0x01,0xff,0x61,0xcc,0x91,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x45,0xcc,0x8f,0x00,
+	0x01,0xff,0x65,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x45,0xcc,0x91,0x00,0x01,0xff,
+	0x65,0xcc,0x91,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x49,0xcc,0x8f,0x00,
+	0x01,0xff,0x69,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x49,0xcc,0x91,0x00,0x01,0xff,
+	0x69,0xcc,0x91,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x4f,0xcc,0x8f,0x00,0x01,0xff,
+	0x6f,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x4f,0xcc,0x91,0x00,0x01,0xff,0x6f,0xcc,
+	0x91,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x52,0xcc,0x8f,0x00,
+	0x01,0xff,0x72,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x52,0xcc,0x91,0x00,0x01,0xff,
+	0x72,0xcc,0x91,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0x8f,0x00,0x01,0xff,
+	0x75,0xcc,0x8f,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0x91,0x00,0x01,0xff,0x75,0xcc,
+	0x91,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x04,0xff,0x53,0xcc,0xa6,0x00,0x04,0xff,
+	0x73,0xcc,0xa6,0x00,0x10,0x08,0x04,0xff,0x54,0xcc,0xa6,0x00,0x04,0xff,0x74,0xcc,
+	0xa6,0x00,0x51,0x04,0x04,0x00,0x10,0x08,0x04,0xff,0x48,0xcc,0x8c,0x00,0x04,0xff,
+	0x68,0xcc,0x8c,0x00,0xd4,0x68,0xd3,0x20,0xd2,0x0c,0x91,0x08,0x10,0x04,0x06,0x00,
+	0x07,0x00,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x08,0x04,0xff,0x41,0xcc,0x87,0x00,
+	0x04,0xff,0x61,0xcc,0x87,0x00,0xd2,0x24,0xd1,0x10,0x10,0x08,0x04,0xff,0x45,0xcc,
+	0xa7,0x00,0x04,0xff,0x65,0xcc,0xa7,0x00,0x10,0x0a,0x04,0xff,0x4f,0xcc,0x88,0xcc,
+	0x84,0x00,0x04,0xff,0x6f,0xcc,0x88,0xcc,0x84,0x00,0xd1,0x14,0x10,0x0a,0x04,0xff,
+	0x4f,0xcc,0x83,0xcc,0x84,0x00,0x04,0xff,0x6f,0xcc,0x83,0xcc,0x84,0x00,0x10,0x08,
+	0x04,0xff,0x4f,0xcc,0x87,0x00,0x04,0xff,0x6f,0xcc,0x87,0x00,0x93,0x30,0xd2,0x24,
+	0xd1,0x14,0x10,0x0a,0x04,0xff,0x4f,0xcc,0x87,0xcc,0x84,0x00,0x04,0xff,0x6f,0xcc,
+	0x87,0xcc,0x84,0x00,0x10,0x08,0x04,0xff,0x59,0xcc,0x84,0x00,0x04,0xff,0x79,0xcc,
+	0x84,0x00,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,0x08,0x00,0x08,0x00,0xcf,0x86,
+	0x95,0x14,0x94,0x10,0x93,0x0c,0x92,0x08,0x11,0x04,0x08,0x00,0x09,0x00,0x09,0x00,
+	0x09,0x00,0x01,0x00,0x01,0x00,0xd0,0x22,0xcf,0x86,0x55,0x04,0x01,0x00,0x94,0x18,
+	0x53,0x04,0x01,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x04,0x00,0x04,0x00,
+	0x11,0x04,0x04,0x00,0x07,0x00,0x01,0x00,0xcf,0x86,0xd5,0x18,0x54,0x04,0x01,0x00,
+	0x53,0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,
+	0x04,0x00,0x94,0x18,0x53,0x04,0x01,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x04,0x00,
+	0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x07,0x00,0x07,0x00,0xe1,0x35,0x01,0xd0,
+	0x72,0xcf,0x86,0xd5,0x24,0x54,0x04,0x01,0xe6,0xd3,0x10,0x52,0x04,0x01,0xe6,0x91,
+	0x08,0x10,0x04,0x01,0xe6,0x01,0xe8,0x01,0xdc,0x92,0x0c,0x51,0x04,0x01,0xdc,0x10,
+	0x04,0x01,0xe8,0x01,0xd8,0x01,0xdc,0xd4,0x2c,0xd3,0x1c,0xd2,0x10,0xd1,0x08,0x10,
+	0x04,0x01,0xdc,0x01,0xca,0x10,0x04,0x01,0xca,0x01,0xdc,0x51,0x04,0x01,0xdc,0x10,
+	0x04,0x01,0xdc,0x01,0xca,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0xca,0x01,0xdc,0x01,
+	0xdc,0x01,0xdc,0xd3,0x08,0x12,0x04,0x01,0xdc,0x01,0x01,0xd2,0x0c,0x91,0x08,0x10,
+	0x04,0x01,0x01,0x01,0xdc,0x01,0xdc,0x91,0x08,0x10,0x04,0x01,0xdc,0x01,0xe6,0x01,
+	0xe6,0xcf,0x86,0xd5,0x7f,0xd4,0x47,0xd3,0x2e,0xd2,0x19,0xd1,0x0e,0x10,0x07,0x01,
+	0xff,0xcc,0x80,0x00,0x01,0xff,0xcc,0x81,0x00,0x10,0x04,0x01,0xe6,0x01,0xff,0xcc,
+	0x93,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xcc,0x88,0xcc,0x81,0x00,0x01,0xf0,0x10,
+	0x04,0x04,0xe6,0x04,0xdc,0xd2,0x08,0x11,0x04,0x04,0xdc,0x04,0xe6,0xd1,0x08,0x10,
+	0x04,0x04,0xe6,0x04,0xdc,0x10,0x04,0x04,0xdc,0x06,0xff,0x00,0xd3,0x18,0xd2,0x0c,
+	0x51,0x04,0x07,0xe6,0x10,0x04,0x07,0xe6,0x07,0xdc,0x51,0x04,0x07,0xdc,0x10,0x04,
+	0x07,0xdc,0x07,0xe6,0xd2,0x10,0xd1,0x08,0x10,0x04,0x08,0xe8,0x08,0xdc,0x10,0x04,
+	0x08,0xdc,0x08,0xe6,0xd1,0x08,0x10,0x04,0x08,0xe9,0x07,0xea,0x10,0x04,0x07,0xea,
+	0x07,0xe9,0xd4,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x01,0xea,0x10,0x04,0x04,0xe9,
+	0x06,0xe6,0x06,0xe6,0x06,0xe6,0xd3,0x13,0x52,0x04,0x0a,0x00,0x91,0x0b,0x10,0x07,
+	0x01,0xff,0xca,0xb9,0x00,0x01,0x00,0x0a,0x00,0xd2,0x0c,0x51,0x04,0x00,0x00,0x10,
+	0x04,0x01,0x00,0x09,0x00,0x51,0x04,0x09,0x00,0x10,0x06,0x01,0xff,0x3b,0x00,0x10,
+	0x00,0xd0,0xe1,0xcf,0x86,0xd5,0x7a,0xd4,0x5f,0xd3,0x21,0x52,0x04,0x00,0x00,0xd1,
+	0x0d,0x10,0x04,0x01,0x00,0x01,0xff,0xc2,0xa8,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,
+	0xce,0x91,0xcc,0x81,0x00,0x01,0xff,0xc2,0xb7,0x00,0xd2,0x1f,0xd1,0x12,0x10,0x09,
+	0x01,0xff,0xce,0x95,0xcc,0x81,0x00,0x01,0xff,0xce,0x97,0xcc,0x81,0x00,0x10,0x09,
+	0x01,0xff,0xce,0x99,0xcc,0x81,0x00,0x00,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xce,
+	0x9f,0xcc,0x81,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xce,0xa5,0xcc,0x81,0x00,0x01,
+	0xff,0xce,0xa9,0xcc,0x81,0x00,0x93,0x17,0x92,0x13,0x91,0x0f,0x10,0x0b,0x01,0xff,
+	0xce,0xb9,0xcc,0x88,0xcc,0x81,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,
+	0x4a,0xd3,0x10,0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x01,
+	0x00,0xd2,0x16,0x51,0x04,0x01,0x00,0x10,0x09,0x01,0xff,0xce,0x99,0xcc,0x88,0x00,
+	0x01,0xff,0xce,0xa5,0xcc,0x88,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,
+	0x81,0x00,0x01,0xff,0xce,0xb5,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,0xb7,0xcc,
+	0x81,0x00,0x01,0xff,0xce,0xb9,0xcc,0x81,0x00,0x93,0x17,0x92,0x13,0x91,0x0f,0x10,
+	0x0b,0x01,0xff,0xcf,0x85,0xcc,0x88,0xcc,0x81,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0xcf,0x86,0xd5,0x7b,0xd4,0x39,0x53,0x04,0x01,0x00,0xd2,0x16,0x51,0x04,
+	0x01,0x00,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x88,0x00,0x01,0xff,0xcf,0x85,0xcc,
+	0x88,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xbf,0xcc,0x81,0x00,0x01,0xff,0xcf,
+	0x85,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xcf,0x89,0xcc,0x81,0x00,0x0a,0x00,0xd3,
+	0x26,0xd2,0x11,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xcf,0x92,0xcc,
+	0x81,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xcf,0x92,0xcc,0x88,0x00,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x04,0x00,0xd2,0x0c,0x51,0x04,0x06,0x00,0x10,0x04,0x01,0x00,0x04,
+	0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x04,0x00,0x10,0x04,0x01,0x00,0x04,0x00,0xd4,
+	0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x04,0x00,0x01,0x00,0x01,
+	0x00,0x01,0x00,0xd3,0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x05,0x00,0x10,0x04,0x06,
+	0x00,0x07,0x00,0x12,0x04,0x07,0x00,0x08,0x00,0xe3,0x47,0x04,0xe2,0xbe,0x02,0xe1,
+	0x07,0x01,0xd0,0x8b,0xcf,0x86,0xd5,0x6c,0xd4,0x53,0xd3,0x30,0xd2,0x1f,0xd1,0x12,
+	0x10,0x09,0x04,0xff,0xd0,0x95,0xcc,0x80,0x00,0x01,0xff,0xd0,0x95,0xcc,0x88,0x00,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xd0,0x93,0xcc,0x81,0x00,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x01,0xff,0xd0,0x86,0xcc,0x88,0x00,0x52,0x04,0x01,0x00,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xd0,0x9a,0xcc,0x81,0x00,0x04,0xff,0xd0,0x98,0xcc,0x80,0x00,
+	0x10,0x09,0x01,0xff,0xd0,0xa3,0xcc,0x86,0x00,0x01,0x00,0x53,0x04,0x01,0x00,0x92,
+	0x11,0x91,0x0d,0x10,0x04,0x01,0x00,0x01,0xff,0xd0,0x98,0xcc,0x86,0x00,0x01,0x00,
+	0x01,0x00,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0x92,0x11,0x91,0x0d,0x10,0x04,
+	0x01,0x00,0x01,0xff,0xd0,0xb8,0xcc,0x86,0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,
+	0x57,0x54,0x04,0x01,0x00,0xd3,0x30,0xd2,0x1f,0xd1,0x12,0x10,0x09,0x04,0xff,0xd0,
+	0xb5,0xcc,0x80,0x00,0x01,0xff,0xd0,0xb5,0xcc,0x88,0x00,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xd0,0xb3,0xcc,0x81,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,
+	0xd1,0x96,0xcc,0x88,0x00,0x52,0x04,0x01,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,
+	0xba,0xcc,0x81,0x00,0x04,0xff,0xd0,0xb8,0xcc,0x80,0x00,0x10,0x09,0x01,0xff,0xd1,
+	0x83,0xcc,0x86,0x00,0x01,0x00,0x54,0x04,0x01,0x00,0x93,0x1a,0x52,0x04,0x01,0x00,
+	0x51,0x04,0x01,0x00,0x10,0x09,0x01,0xff,0xd1,0xb4,0xcc,0x8f,0x00,0x01,0xff,0xd1,
+	0xb5,0xcc,0x8f,0x00,0x01,0x00,0xd0,0x2e,0xcf,0x86,0x95,0x28,0x94,0x24,0xd3,0x18,
+	0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xe6,0x51,0x04,0x01,0xe6,
+	0x10,0x04,0x01,0xe6,0x0a,0xe6,0x92,0x08,0x11,0x04,0x04,0x00,0x06,0x00,0x04,0x00,
+	0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0xbe,0xd4,0x4a,0xd3,0x2a,0xd2,0x1a,0xd1,0x0d,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xd0,0x96,0xcc,0x86,0x00,0x10,0x09,0x01,0xff,0xd0,
+	0xb6,0xcc,0x86,0x00,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x06,0x00,0x10,0x04,
+	0x06,0x00,0x01,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x06,0x00,0x10,0x04,
+	0x06,0x00,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x06,0x00,0x10,0x04,0x06,0x00,
+	0x09,0x00,0xd3,0x3a,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,0x90,0xcc,0x86,
+	0x00,0x01,0xff,0xd0,0xb0,0xcc,0x86,0x00,0x10,0x09,0x01,0xff,0xd0,0x90,0xcc,0x88,
+	0x00,0x01,0xff,0xd0,0xb0,0xcc,0x88,0x00,0x51,0x04,0x01,0x00,0x10,0x09,0x01,0xff,
+	0xd0,0x95,0xcc,0x86,0x00,0x01,0xff,0xd0,0xb5,0xcc,0x86,0x00,0xd2,0x16,0x51,0x04,
+	0x01,0x00,0x10,0x09,0x01,0xff,0xd3,0x98,0xcc,0x88,0x00,0x01,0xff,0xd3,0x99,0xcc,
+	0x88,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd0,0x96,0xcc,0x88,0x00,0x01,0xff,0xd0,
+	0xb6,0xcc,0x88,0x00,0x10,0x09,0x01,0xff,0xd0,0x97,0xcc,0x88,0x00,0x01,0xff,0xd0,
+	0xb7,0xcc,0x88,0x00,0xd4,0x74,0xd3,0x3a,0xd2,0x16,0x51,0x04,0x01,0x00,0x10,0x09,
+	0x01,0xff,0xd0,0x98,0xcc,0x84,0x00,0x01,0xff,0xd0,0xb8,0xcc,0x84,0x00,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xd0,0x98,0xcc,0x88,0x00,0x01,0xff,0xd0,0xb8,0xcc,0x88,0x00,
+	0x10,0x09,0x01,0xff,0xd0,0x9e,0xcc,0x88,0x00,0x01,0xff,0xd0,0xbe,0xcc,0x88,0x00,
+	0xd2,0x16,0x51,0x04,0x01,0x00,0x10,0x09,0x01,0xff,0xd3,0xa8,0xcc,0x88,0x00,0x01,
+	0xff,0xd3,0xa9,0xcc,0x88,0x00,0xd1,0x12,0x10,0x09,0x04,0xff,0xd0,0xad,0xcc,0x88,
+	0x00,0x04,0xff,0xd1,0x8d,0xcc,0x88,0x00,0x10,0x09,0x01,0xff,0xd0,0xa3,0xcc,0x84,
+	0x00,0x01,0xff,0xd1,0x83,0xcc,0x84,0x00,0xd3,0x3a,0xd2,0x24,0xd1,0x12,0x10,0x09,
+	0x01,0xff,0xd0,0xa3,0xcc,0x88,0x00,0x01,0xff,0xd1,0x83,0xcc,0x88,0x00,0x10,0x09,
+	0x01,0xff,0xd0,0xa3,0xcc,0x8b,0x00,0x01,0xff,0xd1,0x83,0xcc,0x8b,0x00,0x91,0x12,
+	0x10,0x09,0x01,0xff,0xd0,0xa7,0xcc,0x88,0x00,0x01,0xff,0xd1,0x87,0xcc,0x88,0x00,
+	0x08,0x00,0x92,0x16,0x91,0x12,0x10,0x09,0x01,0xff,0xd0,0xab,0xcc,0x88,0x00,0x01,
+	0xff,0xd1,0x8b,0xcc,0x88,0x00,0x09,0x00,0x09,0x00,0xd1,0x74,0xd0,0x36,0xcf,0x86,
+	0xd5,0x10,0x54,0x04,0x06,0x00,0x93,0x08,0x12,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,
+	0xd4,0x10,0x93,0x0c,0x52,0x04,0x0a,0x00,0x11,0x04,0x0b,0x00,0x0c,0x00,0x10,0x00,
+	0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0xcf,0x86,0xd5,0x24,0x54,0x04,0x01,0x00,0xd3,0x10,0x52,0x04,0x01,0x00,
+	0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x14,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd0,0xba,
+	0xcf,0x86,0xd5,0x4c,0xd4,0x24,0x53,0x04,0x01,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,
+	0x14,0x00,0x01,0x00,0x10,0x04,0x04,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,
+	0x10,0x00,0x10,0x04,0x10,0x00,0x0d,0x00,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,
+	0x00,0x00,0x02,0xdc,0x02,0xe6,0x51,0x04,0x02,0xe6,0x10,0x04,0x02,0xdc,0x02,0xe6,
+	0x92,0x0c,0x51,0x04,0x02,0xe6,0x10,0x04,0x02,0xde,0x02,0xdc,0x02,0xe6,0xd4,0x2c,
+	0xd3,0x10,0x92,0x0c,0x51,0x04,0x02,0xe6,0x10,0x04,0x08,0xdc,0x02,0xdc,0x02,0xdc,
+	0xd2,0x0c,0x51,0x04,0x02,0xe6,0x10,0x04,0x02,0xdc,0x02,0xe6,0xd1,0x08,0x10,0x04,
+	0x02,0xe6,0x02,0xde,0x10,0x04,0x02,0xe4,0x02,0xe6,0xd3,0x20,0xd2,0x10,0xd1,0x08,
+	0x10,0x04,0x01,0x0a,0x01,0x0b,0x10,0x04,0x01,0x0c,0x01,0x0d,0xd1,0x08,0x10,0x04,
+	0x01,0x0e,0x01,0x0f,0x10,0x04,0x01,0x10,0x01,0x11,0xd2,0x10,0xd1,0x08,0x10,0x04,
+	0x01,0x12,0x01,0x13,0x10,0x04,0x09,0x13,0x01,0x14,0xd1,0x08,0x10,0x04,0x01,0x15,
+	0x01,0x16,0x10,0x04,0x01,0x00,0x01,0x17,0xcf,0x86,0xd5,0x28,0x94,0x24,0x93,0x20,
+	0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x01,0x18,0x10,0x04,0x01,0x19,0x01,0x00,
+	0xd1,0x08,0x10,0x04,0x02,0xe6,0x08,0xdc,0x10,0x04,0x08,0x00,0x08,0x12,0x00,0x00,
+	0x01,0x00,0xd4,0x1c,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,
+	0x01,0x00,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x14,0x00,0x93,0x10,
+	0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xe2,0xfb,0x01,0xe1,0x2b,0x01,0xd0,0xa8,0xcf,0x86,0xd5,0x55,0xd4,0x28,0xd3,0x10,
+	0x52,0x04,0x07,0x00,0x91,0x08,0x10,0x04,0x0d,0x00,0x10,0x00,0x0a,0x00,0xd2,0x0c,
+	0x51,0x04,0x0a,0x00,0x10,0x04,0x0a,0x00,0x08,0x00,0x91,0x08,0x10,0x04,0x01,0x00,
+	0x07,0x00,0x07,0x00,0xd3,0x0c,0x52,0x04,0x07,0xe6,0x11,0x04,0x07,0xe6,0x0a,0xe6,
+	0xd2,0x10,0xd1,0x08,0x10,0x04,0x0a,0x1e,0x0a,0x1f,0x10,0x04,0x0a,0x20,0x01,0x00,
+	0xd1,0x09,0x10,0x05,0x0f,0xff,0x00,0x00,0x00,0x10,0x04,0x08,0x00,0x01,0x00,0xd4,
+	0x3d,0x93,0x39,0xd2,0x1a,0xd1,0x08,0x10,0x04,0x0c,0x00,0x01,0x00,0x10,0x09,0x01,
+	0xff,0xd8,0xa7,0xd9,0x93,0x00,0x01,0xff,0xd8,0xa7,0xd9,0x94,0x00,0xd1,0x12,0x10,
+	0x09,0x01,0xff,0xd9,0x88,0xd9,0x94,0x00,0x01,0xff,0xd8,0xa7,0xd9,0x95,0x00,0x10,
+	0x09,0x01,0xff,0xd9,0x8a,0xd9,0x94,0x00,0x01,0x00,0x01,0x00,0x53,0x04,0x01,0x00,
+	0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x0a,0x00,0x0a,0x00,0xcf,0x86,
+	0xd5,0x5c,0xd4,0x20,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,
+	0x01,0x00,0x01,0x1b,0xd1,0x08,0x10,0x04,0x01,0x1c,0x01,0x1d,0x10,0x04,0x01,0x1e,
+	0x01,0x1f,0xd3,0x20,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x20,0x01,0x21,0x10,0x04,
+	0x01,0x22,0x04,0xe6,0xd1,0x08,0x10,0x04,0x04,0xe6,0x04,0xdc,0x10,0x04,0x07,0xdc,
+	0x07,0xe6,0xd2,0x0c,0x91,0x08,0x10,0x04,0x07,0xe6,0x08,0xe6,0x08,0xe6,0xd1,0x08,
+	0x10,0x04,0x08,0xdc,0x08,0xe6,0x10,0x04,0x08,0xe6,0x0c,0xdc,0xd4,0x10,0x53,0x04,
+	0x01,0x00,0x52,0x04,0x01,0x00,0x11,0x04,0x01,0x00,0x06,0x00,0x93,0x10,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x01,0x23,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd0,0x22,
+	0xcf,0x86,0x55,0x04,0x01,0x00,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x08,
+	0x11,0x04,0x04,0x00,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x04,0x00,
+	0xcf,0x86,0xd5,0x5b,0xd4,0x2e,0xd3,0x1e,0x92,0x1a,0xd1,0x0d,0x10,0x09,0x01,0xff,
+	0xdb,0x95,0xd9,0x94,0x00,0x01,0x00,0x10,0x09,0x01,0xff,0xdb,0x81,0xd9,0x94,0x00,
+	0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,
+	0x04,0x00,0xd3,0x19,0xd2,0x11,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,
+	0xdb,0x92,0xd9,0x94,0x00,0x11,0x04,0x01,0x00,0x01,0xe6,0x52,0x04,0x01,0xe6,0xd1,
+	0x08,0x10,0x04,0x01,0xe6,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xe6,0xd4,0x38,0xd3,
+	0x1c,0xd2,0x0c,0x51,0x04,0x01,0xe6,0x10,0x04,0x01,0xe6,0x01,0xdc,0xd1,0x08,0x10,
+	0x04,0x01,0xe6,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xe6,0xd2,0x10,0xd1,0x08,0x10,
+	0x04,0x01,0xe6,0x01,0x00,0x10,0x04,0x01,0xdc,0x01,0xe6,0x91,0x08,0x10,0x04,0x01,
+	0xe6,0x01,0xdc,0x07,0x00,0x53,0x04,0x01,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x04,
+	0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x07,0x00,0xd1,0xc8,0xd0,0x76,0xcf,
+	0x86,0xd5,0x28,0xd4,0x14,0x53,0x04,0x04,0x00,0x52,0x04,0x04,0x00,0x51,0x04,0x04,
+	0x00,0x10,0x04,0x00,0x00,0x04,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x04,
+	0x00,0x04,0x24,0x04,0x00,0x04,0x00,0x04,0x00,0xd4,0x14,0x53,0x04,0x04,0x00,0x52,
+	0x04,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x07,0x00,0x07,0x00,0xd3,0x1c,0xd2,
+	0x0c,0x91,0x08,0x10,0x04,0x04,0xe6,0x04,0xdc,0x04,0xe6,0xd1,0x08,0x10,0x04,0x04,
+	0xdc,0x04,0xe6,0x10,0x04,0x04,0xe6,0x04,0xdc,0xd2,0x0c,0x51,0x04,0x04,0xdc,0x10,
+	0x04,0x04,0xe6,0x04,0xdc,0xd1,0x08,0x10,0x04,0x04,0xdc,0x04,0xe6,0x10,0x04,0x04,
+	0xdc,0x04,0xe6,0xcf,0x86,0xd5,0x3c,0x94,0x38,0xd3,0x1c,0xd2,0x0c,0x51,0x04,0x04,
+	0xe6,0x10,0x04,0x04,0xdc,0x04,0xe6,0xd1,0x08,0x10,0x04,0x04,0xdc,0x04,0xe6,0x10,
+	0x04,0x04,0xdc,0x04,0xe6,0xd2,0x10,0xd1,0x08,0x10,0x04,0x04,0xdc,0x04,0xe6,0x10,
+	0x04,0x04,0xe6,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x07,0x00,0x07,0x00,0x08,
+	0x00,0x94,0x10,0x53,0x04,0x08,0x00,0x52,0x04,0x08,0x00,0x11,0x04,0x08,0x00,0x0a,
+	0x00,0x0a,0x00,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x04,0x00,0x54,0x04,0x04,0x00,0x93,
+	0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,0x06,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0xcf,0x86,0x55,0x04,0x09,0x00,0xd4,0x14,0x53,0x04,0x09,0x00,0x92,0x0c,0x51,
+	0x04,0x09,0x00,0x10,0x04,0x09,0x00,0x09,0xe6,0x09,0xe6,0xd3,0x10,0x92,0x0c,0x51,
+	0x04,0x09,0xe6,0x10,0x04,0x09,0xdc,0x09,0xe6,0x09,0x00,0xd2,0x0c,0x51,0x04,0x09,
+	0x00,0x10,0x04,0x09,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x14,0xdc,0x14,
+	0x00,0xe4,0xf8,0x57,0xe3,0x45,0x3f,0xe2,0xf4,0x3e,0xe1,0xc7,0x2c,0xe0,0x21,0x10,
+	0xcf,0x86,0xc5,0xe4,0x80,0x08,0xe3,0xcb,0x03,0xe2,0x61,0x01,0xd1,0x94,0xd0,0x5a,
+	0xcf,0x86,0xd5,0x20,0x54,0x04,0x0b,0x00,0xd3,0x0c,0x52,0x04,0x0b,0x00,0x11,0x04,
+	0x0b,0x00,0x0b,0xe6,0x92,0x0c,0x51,0x04,0x0b,0xe6,0x10,0x04,0x0b,0x00,0x0b,0xe6,
+	0x0b,0xe6,0xd4,0x24,0xd3,0x10,0x52,0x04,0x0b,0xe6,0x91,0x08,0x10,0x04,0x0b,0x00,
+	0x0b,0xe6,0x0b,0xe6,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0b,0x00,0x0b,0xe6,0x0b,0xe6,
+	0x11,0x04,0x0b,0xe6,0x00,0x00,0x53,0x04,0x0b,0x00,0x52,0x04,0x0b,0x00,0x51,0x04,
+	0x0b,0x00,0x10,0x04,0x0b,0x00,0x00,0x00,0xcf,0x86,0xd5,0x20,0x54,0x04,0x0c,0x00,
+	0x53,0x04,0x0c,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x0c,0xdc,0x0c,0xdc,
+	0x51,0x04,0x00,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,0x94,0x14,0x53,0x04,0x13,0x00,
+	0x92,0x0c,0x51,0x04,0x13,0x00,0x10,0x04,0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xd0,0x4a,0xcf,0x86,0x55,0x04,0x00,0x00,0xd4,0x20,0xd3,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0d,0x00,0x10,0x00,0x0d,0x00,0x0d,0x00,0x52,0x04,0x0d,0x00,0x91,0x08,
+	0x10,0x04,0x0d,0x00,0x10,0x00,0x10,0x00,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x10,0x00,
+	0x10,0x04,0x10,0x00,0x11,0x00,0x91,0x08,0x10,0x04,0x11,0x00,0x00,0x00,0x12,0x00,
+	0x52,0x04,0x12,0x00,0x11,0x04,0x12,0x00,0x00,0x00,0xcf,0x86,0xd5,0x18,0x54,0x04,
+	0x00,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x14,0xdc,
+	0x12,0xe6,0x12,0xe6,0xd4,0x30,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x12,0xe6,0x10,0x04,
+	0x12,0x00,0x11,0xdc,0x51,0x04,0x0d,0xe6,0x10,0x04,0x0d,0xdc,0x0d,0xe6,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x0d,0xe6,0x0d,0xdc,0x0d,0xe6,0x91,0x08,0x10,0x04,0x0d,0xe6,
+	0x0d,0xdc,0x0d,0xdc,0xd3,0x1c,0xd2,0x10,0xd1,0x08,0x10,0x04,0x0d,0x1b,0x0d,0x1c,
+	0x10,0x04,0x0d,0x1d,0x0d,0xe6,0x51,0x04,0x0d,0xe6,0x10,0x04,0x0d,0xdc,0x0d,0xe6,
+	0xd2,0x10,0xd1,0x08,0x10,0x04,0x0d,0xe6,0x0d,0xdc,0x10,0x04,0x0d,0xdc,0x0d,0xe6,
+	0x51,0x04,0x0d,0xe6,0x10,0x04,0x0d,0xe6,0x10,0xe6,0xe1,0x3a,0x01,0xd0,0x77,0xcf,
+	0x86,0xd5,0x20,0x94,0x1c,0x93,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0b,0x00,0x01,
+	0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x07,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,
+	0x00,0xd4,0x1b,0x53,0x04,0x01,0x00,0x92,0x13,0x91,0x0f,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xe0,0xa4,0xa8,0xe0,0xa4,0xbc,0x00,0x01,0x00,0x01,0x00,0xd3,0x26,0xd2,0x13,
+	0x91,0x0f,0x10,0x04,0x01,0x00,0x01,0xff,0xe0,0xa4,0xb0,0xe0,0xa4,0xbc,0x00,0x01,
+	0x00,0x91,0x0f,0x10,0x0b,0x01,0xff,0xe0,0xa4,0xb3,0xe0,0xa4,0xbc,0x00,0x01,0x00,
+	0x01,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x0c,0x00,0x91,0x08,0x10,0x04,0x01,0x07,
+	0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0x8c,0xd4,0x18,0x53,0x04,0x01,0x00,0x52,0x04,
+	0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x01,0x09,0x10,0x04,0x0b,0x00,0x0c,0x00,
+	0xd3,0x1c,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x01,0xe6,0x10,0x04,0x01,0xdc,
+	0x01,0xe6,0x91,0x08,0x10,0x04,0x01,0xe6,0x0b,0x00,0x0c,0x00,0xd2,0x2c,0xd1,0x16,
+	0x10,0x0b,0x01,0xff,0xe0,0xa4,0x95,0xe0,0xa4,0xbc,0x00,0x01,0xff,0xe0,0xa4,0x96,
+	0xe0,0xa4,0xbc,0x00,0x10,0x0b,0x01,0xff,0xe0,0xa4,0x97,0xe0,0xa4,0xbc,0x00,0x01,
+	0xff,0xe0,0xa4,0x9c,0xe0,0xa4,0xbc,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xe0,0xa4,
+	0xa1,0xe0,0xa4,0xbc,0x00,0x01,0xff,0xe0,0xa4,0xa2,0xe0,0xa4,0xbc,0x00,0x10,0x0b,
+	0x01,0xff,0xe0,0xa4,0xab,0xe0,0xa4,0xbc,0x00,0x01,0xff,0xe0,0xa4,0xaf,0xe0,0xa4,
+	0xbc,0x00,0x54,0x04,0x01,0x00,0xd3,0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,
+	0x0a,0x00,0x10,0x04,0x0a,0x00,0x0c,0x00,0x0c,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,
+	0x10,0x00,0x0b,0x00,0x10,0x04,0x0b,0x00,0x09,0x00,0x91,0x08,0x10,0x04,0x09,0x00,
+	0x08,0x00,0x09,0x00,0xd0,0x86,0xcf,0x86,0xd5,0x44,0xd4,0x2c,0xd3,0x18,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x10,0x00,0x01,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,
+	0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,
+	0x10,0x04,0x00,0x00,0x01,0x00,0x93,0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,
+	0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,
+	0xd3,0x18,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,
+	0x91,0x08,0x10,0x04,0x01,0x07,0x07,0x00,0x01,0x00,0xcf,0x86,0xd5,0x7b,0xd4,0x42,
+	0xd3,0x14,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x01,0x00,0xd2,0x17,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x01,0xff,0xe0,0xa7,0x87,0xe0,0xa6,0xbe,0x00,0xd1,0x0f,0x10,0x0b,0x01,
+	0xff,0xe0,0xa7,0x87,0xe0,0xa7,0x97,0x00,0x01,0x09,0x10,0x04,0x08,0x00,0x00,0x00,
+	0xd3,0x10,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,
+	0x52,0x04,0x00,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xe0,0xa6,0xa1,0xe0,0xa6,0xbc,
+	0x00,0x01,0xff,0xe0,0xa6,0xa2,0xe0,0xa6,0xbc,0x00,0x10,0x04,0x00,0x00,0x01,0xff,
+	0xe0,0xa6,0xaf,0xe0,0xa6,0xbc,0x00,0xd4,0x10,0x93,0x0c,0x52,0x04,0x01,0x00,0x11,
+	0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x01,0x00,0x0b,0x00,0x51,0x04,0x13,0x00,0x10,0x04,0x14,0xe6,0x00,
+	0x00,0xe2,0x48,0x02,0xe1,0x4f,0x01,0xd0,0xa4,0xcf,0x86,0xd5,0x4c,0xd4,0x34,0xd3,
+	0x1c,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x07,0x00,0x10,0x04,0x01,0x00,0x07,
+	0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,
+	0x00,0x93,0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x00,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,0x00,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,0x2e,0xd2,0x17,0xd1,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xe0,0xa8,0xb2,
+	0xe0,0xa8,0xbc,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,
+	0xe0,0xa8,0xb8,0xe0,0xa8,0xbc,0x00,0x00,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x00,
+	0x00,0x91,0x08,0x10,0x04,0x01,0x07,0x00,0x00,0x01,0x00,0xcf,0x86,0xd5,0x80,0xd4,
+	0x34,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x51,
+	0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,
+	0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x01,
+	0x09,0x00,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0a,0x00,0x00,
+	0x00,0x00,0x00,0xd2,0x25,0xd1,0x0f,0x10,0x04,0x00,0x00,0x01,0xff,0xe0,0xa8,0x96,
+	0xe0,0xa8,0xbc,0x00,0x10,0x0b,0x01,0xff,0xe0,0xa8,0x97,0xe0,0xa8,0xbc,0x00,0x01,
+	0xff,0xe0,0xa8,0x9c,0xe0,0xa8,0xbc,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,
+	0x10,0x0b,0x01,0xff,0xe0,0xa8,0xab,0xe0,0xa8,0xbc,0x00,0x00,0x00,0xd4,0x10,0x93,
+	0x0c,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x93,0x14,0x52,
+	0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x0a,0x00,0x10,0x04,0x14,0x00,0x00,
+	0x00,0x00,0x00,0xd0,0x82,0xcf,0x86,0xd5,0x40,0xd4,0x2c,0xd3,0x18,0xd2,0x0c,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,
+	0x00,0x01,0x00,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x07,0x00,0x01,0x00,0x10,
+	0x04,0x00,0x00,0x01,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x00,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,0x00,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,0x18,0xd2,0x0c,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,
+	0x00,0x01,0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x01,
+	0x07,0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0x3c,0xd4,0x28,0xd3,0x10,0x52,0x04,0x01,
+	0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x01,0x09,0x00,
+	0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0xd4,0x18,0x93,0x14,0xd2,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x07,
+	0x00,0x07,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,0x10,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x0d,0x00,0x07,0x00,0x00,0x00,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x11,0x00,0x13,0x00,0x13,0x00,0xe1,0x24,0x01,0xd0,0x86,0xcf,0x86,
+	0xd5,0x44,0xd4,0x2c,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,
+	0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,
+	0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x93,0x14,
+	0x92,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,
+	0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,
+	0x01,0x00,0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x07,0x00,0x01,0x00,
+	0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x01,0x07,0x01,0x00,
+	0x01,0x00,0xcf,0x86,0xd5,0x73,0xd4,0x45,0xd3,0x14,0x52,0x04,0x01,0x00,0xd1,0x08,
+	0x10,0x04,0x0a,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0xd2,0x1e,0xd1,0x0f,
+	0x10,0x0b,0x01,0xff,0xe0,0xad,0x87,0xe0,0xad,0x96,0x00,0x00,0x00,0x10,0x04,0x00,
+	0x00,0x01,0xff,0xe0,0xad,0x87,0xe0,0xac,0xbe,0x00,0x91,0x0f,0x10,0x0b,0x01,0xff,
+	0xe0,0xad,0x87,0xe0,0xad,0x97,0x00,0x01,0x09,0x00,0x00,0xd3,0x0c,0x52,0x04,0x00,
+	0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x52,0x04,0x00,0x00,0xd1,0x16,0x10,0x0b,0x01,
+	0xff,0xe0,0xac,0xa1,0xe0,0xac,0xbc,0x00,0x01,0xff,0xe0,0xac,0xa2,0xe0,0xac,0xbc,
+	0x00,0x10,0x04,0x00,0x00,0x01,0x00,0xd4,0x14,0x93,0x10,0xd2,0x08,0x11,0x04,0x01,
+	0x00,0x0a,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x93,0x10,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x07,0x00,0x0c,0x00,0x0c,0x00,0x00,0x00,0xd0,0xb1,0xcf,
+	0x86,0xd5,0x63,0xd4,0x28,0xd3,0x14,0xd2,0x08,0x11,0x04,0x00,0x00,0x01,0x00,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0xd3,0x1f,0xd2,0x0c,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x91,0x0f,0x10,0x0b,0x01,0xff,0xe0,
+	0xae,0x92,0xe0,0xaf,0x97,0x00,0x01,0x00,0x00,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,
+	0x00,0x00,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x01,0x00,0xd4,0x2c,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0xd2,0x0c,
+	0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,
+	0xd3,0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x08,0x00,0x01,0x00,
+	0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0xcf,0x86,
+	0xd5,0x61,0xd4,0x45,0xd3,0x14,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0xd2,0x1e,0xd1,0x08,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x10,0x0b,0x01,0xff,0xe0,0xaf,0x86,0xe0,0xae,0xbe,0x00,0x01,0xff,0xe0,
+	0xaf,0x87,0xe0,0xae,0xbe,0x00,0x91,0x0f,0x10,0x0b,0x01,0xff,0xe0,0xaf,0x86,0xe0,
+	0xaf,0x97,0x00,0x01,0x09,0x00,0x00,0x93,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0a,
+	0x00,0x00,0x00,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0x00,
+	0x00,0xd4,0x14,0x93,0x10,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x08,
+	0x00,0x01,0x00,0x01,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x07,0x00,0x07,0x00,0x92,0x0c,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,0x00,
+	0x00,0x00,0x00,0xe3,0x1c,0x04,0xe2,0x1a,0x02,0xd1,0xf3,0xd0,0x76,0xcf,0x86,0xd5,
+	0x3c,0xd4,0x28,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,0x01,0x00,0x01,
+	0x00,0x91,0x08,0x10,0x04,0x14,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0x91,
+	0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,
+	0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,
+	0x10,0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,0x10,0x00,0x01,0x00,0x01,0x00,0xd2,
+	0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x0a,0x00,0x01,
+	0x00,0xcf,0x86,0xd5,0x53,0xd4,0x2f,0xd3,0x10,0x52,0x04,0x01,0x00,0x91,0x08,0x10,
+	0x04,0x01,0x00,0x00,0x00,0x01,0x00,0xd2,0x13,0x91,0x0f,0x10,0x0b,0x01,0xff,0xe0,
+	0xb1,0x86,0xe0,0xb1,0x96,0x00,0x00,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,
+	0x01,0x09,0x00,0x00,0xd3,0x14,0x52,0x04,0x00,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,
+	0x01,0x54,0x10,0x04,0x01,0x5b,0x00,0x00,0x92,0x0c,0x51,0x04,0x0a,0x00,0x10,0x04,
+	0x11,0x00,0x00,0x00,0x00,0x00,0xd4,0x14,0x93,0x10,0xd2,0x08,0x11,0x04,0x01,0x00,
+	0x0a,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x93,0x10,0x52,0x04,0x00,0x00,
+	0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x15,0x00,0x0a,0x00,0xd0,0x76,0xcf,0x86,
+	0xd5,0x3c,0xd4,0x28,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x12,0x00,0x10,0x00,
+	0x01,0x00,0x91,0x08,0x10,0x04,0x14,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,
+	0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,
+	0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,
+	0xd3,0x10,0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,
+	0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x07,0x07,0x07,0x00,
+	0x01,0x00,0xcf,0x86,0xd5,0x82,0xd4,0x5e,0xd3,0x2a,0xd2,0x13,0x91,0x0f,0x10,0x0b,
+	0x01,0xff,0xe0,0xb2,0xbf,0xe0,0xb3,0x95,0x00,0x01,0x00,0x01,0x00,0xd1,0x08,0x10,
+	0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xe0,0xb3,0x86,0xe0,0xb3,
+	0x95,0x00,0xd2,0x28,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe0,0xb3,0x86,0xe0,0xb3,0x96,
+	0x00,0x00,0x00,0x10,0x0b,0x01,0xff,0xe0,0xb3,0x86,0xe0,0xb3,0x82,0x00,0x01,0xff,
+	0xe0,0xb3,0x86,0xe0,0xb3,0x82,0xe0,0xb3,0x95,0x00,0x91,0x08,0x10,0x04,0x01,0x00,
+	0x01,0x09,0x00,0x00,0xd3,0x14,0x52,0x04,0x00,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,
+	0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,
+	0x10,0x04,0x01,0x00,0x00,0x00,0xd4,0x14,0x93,0x10,0xd2,0x08,0x11,0x04,0x01,0x00,
+	0x09,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x93,0x14,0x92,0x10,0xd1,0x08,
+	0x10,0x04,0x00,0x00,0x09,0x00,0x10,0x04,0x09,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xe1,0x06,0x01,0xd0,0x6e,0xcf,0x86,0xd5,0x3c,0xd4,0x28,0xd3,0x18,0xd2,0x0c,0x91,
+	0x08,0x10,0x04,0x13,0x00,0x10,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x01,
+	0x00,0x01,0x00,0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,
+	0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,
+	0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,
+	0x00,0x0c,0x00,0x01,0x00,0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x0c,0x00,0x13,0x09,0x91,0x08,0x10,0x04,0x13,0x09,0x0a,0x00,0x01,
+	0x00,0xcf,0x86,0xd5,0x65,0xd4,0x45,0xd3,0x10,0x52,0x04,0x01,0x00,0x91,0x08,0x10,
+	0x04,0x0a,0x00,0x00,0x00,0x01,0x00,0xd2,0x1e,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,
+	0x00,0x10,0x0b,0x01,0xff,0xe0,0xb5,0x86,0xe0,0xb4,0xbe,0x00,0x01,0xff,0xe0,0xb5,
+	0x87,0xe0,0xb4,0xbe,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe0,0xb5,0x86,0xe0,0xb5,
+	0x97,0x00,0x01,0x09,0x10,0x04,0x0c,0x00,0x12,0x00,0xd3,0x10,0x52,0x04,0x00,0x00,
+	0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x01,0x00,0x52,0x04,0x12,0x00,0x51,0x04,
+	0x12,0x00,0x10,0x04,0x12,0x00,0x11,0x00,0xd4,0x14,0x93,0x10,0xd2,0x08,0x11,0x04,
+	0x01,0x00,0x0a,0x00,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,0x0c,0x52,0x04,
+	0x0a,0x00,0x11,0x04,0x0a,0x00,0x12,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x12,0x00,
+	0x0a,0x00,0x0a,0x00,0x0a,0x00,0xd0,0x5a,0xcf,0x86,0xd5,0x34,0xd4,0x18,0x93,0x14,
+	0xd2,0x08,0x11,0x04,0x00,0x00,0x04,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x04,0x00,
+	0x04,0x00,0x04,0x00,0xd3,0x10,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,
+	0x04,0x00,0x00,0x00,0x92,0x08,0x11,0x04,0x00,0x00,0x04,0x00,0x04,0x00,0x54,0x04,
+	0x04,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x04,0x00,0x10,0x04,0x00,0x00,0x04,0x00,
+	0x04,0x00,0x52,0x04,0x04,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x04,0x00,0x00,0x00,
+	0xcf,0x86,0xd5,0x77,0xd4,0x28,0xd3,0x10,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,
+	0x10,0x04,0x04,0x00,0x00,0x00,0xd2,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x04,0x09,
+	0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x04,0x00,0xd3,0x14,0x52,0x04,
+	0x04,0x00,0xd1,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x10,0x04,0x04,0x00,0x00,0x00,
+	0xd2,0x13,0x51,0x04,0x04,0x00,0x10,0x0b,0x04,0xff,0xe0,0xb7,0x99,0xe0,0xb7,0x8a,
+	0x00,0x04,0x00,0xd1,0x19,0x10,0x0b,0x04,0xff,0xe0,0xb7,0x99,0xe0,0xb7,0x8f,0x00,
+	0x04,0xff,0xe0,0xb7,0x99,0xe0,0xb7,0x8f,0xe0,0xb7,0x8a,0x00,0x10,0x0b,0x04,0xff,
+	0xe0,0xb7,0x99,0xe0,0xb7,0x9f,0x00,0x04,0x00,0xd4,0x10,0x93,0x0c,0x52,0x04,0x00,
+	0x00,0x11,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x93,0x14,0xd2,0x08,0x11,0x04,0x00,
+	0x00,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xe2,
+	0x31,0x01,0xd1,0x58,0xd0,0x3a,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x67,0x10,0x04,
+	0x01,0x09,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x01,0x00,0xcf,0x86,
+	0x95,0x18,0xd4,0x0c,0x53,0x04,0x01,0x00,0x12,0x04,0x01,0x6b,0x01,0x00,0x53,0x04,
+	0x01,0x00,0x12,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0xd0,0x9e,0xcf,0x86,0xd5,0x54,
+	0xd4,0x3c,0xd3,0x20,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x10,0x04,
+	0x01,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x15,0x00,
+	0x01,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x15,0x00,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x91,0x08,0x10,0x04,0x15,0x00,0x01,0x00,0x15,0x00,0xd3,0x08,0x12,0x04,
+	0x15,0x00,0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x15,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0xd4,0x30,0xd3,0x1c,0xd2,0x0c,0x91,0x08,0x10,0x04,0x15,0x00,0x01,0x00,
+	0x01,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x10,0x04,0x00,0x00,0x01,0x00,
+	0xd2,0x08,0x11,0x04,0x15,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x15,0x00,0x01,0x00,
+	0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x76,0x10,0x04,0x15,0x09,
+	0x01,0x00,0x11,0x04,0x01,0x00,0x00,0x00,0xcf,0x86,0x95,0x34,0xd4,0x20,0xd3,0x14,
+	0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x52,0x04,0x01,0x7a,0x11,0x04,0x01,0x00,0x00,0x00,0x53,0x04,0x01,0x00,
+	0xd2,0x08,0x11,0x04,0x01,0x00,0x00,0x00,0x11,0x04,0x01,0x00,0x0d,0x00,0x00,0x00,
+	0xe1,0x2b,0x01,0xd0,0x3e,0xcf,0x86,0xd5,0x14,0x54,0x04,0x02,0x00,0x53,0x04,0x02,
+	0x00,0x92,0x08,0x11,0x04,0x02,0xdc,0x02,0x00,0x02,0x00,0x54,0x04,0x02,0x00,0xd3,
+	0x14,0x52,0x04,0x02,0x00,0xd1,0x08,0x10,0x04,0x02,0x00,0x02,0xdc,0x10,0x04,0x02,
+	0x00,0x02,0xdc,0x92,0x0c,0x91,0x08,0x10,0x04,0x02,0x00,0x02,0xd8,0x02,0x00,0x02,
+	0x00,0xcf,0x86,0xd5,0x73,0xd4,0x36,0xd3,0x17,0x92,0x13,0x51,0x04,0x02,0x00,0x10,
+	0x04,0x02,0x00,0x02,0xff,0xe0,0xbd,0x82,0xe0,0xbe,0xb7,0x00,0x02,0x00,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x00,0x00,0x02,0x00,0x02,0x00,0x91,0x0f,0x10,0x04,0x02,0x00,
+	0x02,0xff,0xe0,0xbd,0x8c,0xe0,0xbe,0xb7,0x00,0x02,0x00,0xd3,0x26,0xd2,0x13,0x51,
+	0x04,0x02,0x00,0x10,0x0b,0x02,0xff,0xe0,0xbd,0x91,0xe0,0xbe,0xb7,0x00,0x02,0x00,
+	0x51,0x04,0x02,0x00,0x10,0x04,0x02,0x00,0x02,0xff,0xe0,0xbd,0x96,0xe0,0xbe,0xb7,
+	0x00,0x52,0x04,0x02,0x00,0x91,0x0f,0x10,0x0b,0x02,0xff,0xe0,0xbd,0x9b,0xe0,0xbe,
+	0xb7,0x00,0x02,0x00,0x02,0x00,0xd4,0x27,0x53,0x04,0x02,0x00,0xd2,0x17,0xd1,0x0f,
+	0x10,0x04,0x02,0x00,0x02,0xff,0xe0,0xbd,0x80,0xe0,0xbe,0xb5,0x00,0x10,0x04,0x04,
+	0x00,0x0a,0x00,0x91,0x08,0x10,0x04,0x0a,0x00,0x00,0x00,0x00,0x00,0xd3,0x35,0xd2,
+	0x17,0xd1,0x08,0x10,0x04,0x00,0x00,0x02,0x81,0x10,0x04,0x02,0x82,0x02,0xff,0xe0,
+	0xbd,0xb1,0xe0,0xbd,0xb2,0x00,0xd1,0x0f,0x10,0x04,0x02,0x84,0x02,0xff,0xe0,0xbd,
+	0xb1,0xe0,0xbd,0xb4,0x00,0x10,0x0b,0x02,0xff,0xe0,0xbe,0xb2,0xe0,0xbe,0x80,0x00,
+	0x02,0x00,0xd2,0x13,0x91,0x0f,0x10,0x0b,0x02,0xff,0xe0,0xbe,0xb3,0xe0,0xbe,0x80,
+	0x00,0x02,0x00,0x02,0x82,0x11,0x04,0x02,0x82,0x02,0x00,0xd0,0xd3,0xcf,0x86,0xd5,
+	0x65,0xd4,0x27,0xd3,0x1f,0xd2,0x13,0x91,0x0f,0x10,0x04,0x02,0x82,0x02,0xff,0xe0,
+	0xbd,0xb1,0xe0,0xbe,0x80,0x00,0x02,0xe6,0x91,0x08,0x10,0x04,0x02,0x09,0x02,0x00,
+	0x02,0xe6,0x12,0x04,0x02,0x00,0x0c,0x00,0xd3,0x1f,0xd2,0x13,0x51,0x04,0x02,0x00,
+	0x10,0x04,0x02,0x00,0x02,0xff,0xe0,0xbe,0x92,0xe0,0xbe,0xb7,0x00,0x51,0x04,0x02,
+	0x00,0x10,0x04,0x04,0x00,0x02,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x02,
+	0x00,0x02,0x00,0x91,0x0f,0x10,0x04,0x02,0x00,0x02,0xff,0xe0,0xbe,0x9c,0xe0,0xbe,
+	0xb7,0x00,0x02,0x00,0xd4,0x3d,0xd3,0x26,0xd2,0x13,0x51,0x04,0x02,0x00,0x10,0x0b,
+	0x02,0xff,0xe0,0xbe,0xa1,0xe0,0xbe,0xb7,0x00,0x02,0x00,0x51,0x04,0x02,0x00,0x10,
+	0x04,0x02,0x00,0x02,0xff,0xe0,0xbe,0xa6,0xe0,0xbe,0xb7,0x00,0x52,0x04,0x02,0x00,
+	0x91,0x0f,0x10,0x0b,0x02,0xff,0xe0,0xbe,0xab,0xe0,0xbe,0xb7,0x00,0x02,0x00,0x04,
+	0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,0x02,0x00,0x02,0x00,0x02,
+	0x00,0xd2,0x13,0x91,0x0f,0x10,0x04,0x04,0x00,0x02,0xff,0xe0,0xbe,0x90,0xe0,0xbe,
+	0xb5,0x00,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0xcf,0x86,
+	0x95,0x4c,0xd4,0x24,0xd3,0x10,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,
+	0x04,0xdc,0x04,0x00,0x52,0x04,0x04,0x00,0xd1,0x08,0x10,0x04,0x04,0x00,0x00,0x00,
+	0x10,0x04,0x0a,0x00,0x04,0x00,0xd3,0x14,0xd2,0x08,0x11,0x04,0x08,0x00,0x0a,0x00,
+	0x91,0x08,0x10,0x04,0x0a,0x00,0x0b,0x00,0x0b,0x00,0x92,0x10,0xd1,0x08,0x10,0x04,
+	0x0b,0x00,0x0c,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,
+	0xe5,0xf7,0x04,0xe4,0x79,0x03,0xe3,0x7b,0x01,0xe2,0x04,0x01,0xd1,0x7f,0xd0,0x65,
+	0xcf,0x86,0x55,0x04,0x04,0x00,0xd4,0x33,0xd3,0x1f,0xd2,0x0c,0x51,0x04,0x04,0x00,
+	0x10,0x04,0x0a,0x00,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x0b,0x04,0xff,0xe1,0x80,
+	0xa5,0xe1,0x80,0xae,0x00,0x04,0x00,0x92,0x10,0xd1,0x08,0x10,0x04,0x0a,0x00,0x04,
+	0x00,0x10,0x04,0x04,0x00,0x0a,0x00,0x04,0x00,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x04,
+	0x00,0x10,0x04,0x04,0x00,0x0a,0x00,0x51,0x04,0x0a,0x00,0x10,0x04,0x04,0x00,0x04,
+	0x07,0x92,0x10,0xd1,0x08,0x10,0x04,0x04,0x00,0x04,0x09,0x10,0x04,0x0a,0x09,0x0a,
+	0x00,0x0a,0x00,0xcf,0x86,0x95,0x14,0x54,0x04,0x04,0x00,0x53,0x04,0x04,0x00,0x92,
+	0x08,0x11,0x04,0x04,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,0xd0,0x2e,0xcf,0x86,0x95,
+	0x28,0xd4,0x14,0x53,0x04,0x0a,0x00,0x52,0x04,0x0a,0x00,0x91,0x08,0x10,0x04,0x0a,
+	0x00,0x0a,0xdc,0x0a,0x00,0x53,0x04,0x0a,0x00,0xd2,0x08,0x11,0x04,0x0a,0x00,0x0b,
+	0x00,0x11,0x04,0x0b,0x00,0x0a,0x00,0x01,0x00,0xcf,0x86,0xd5,0x24,0x94,0x20,0xd3,
+	0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0x52,
+	0x04,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x54,
+	0x04,0x01,0x00,0xd3,0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x06,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x06,0x00,0x08,0x00,0x10,0x04,0x08,
+	0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x08,0x00,0x0d,0x00,0x0d,0x00,0xd1,0x3e,0xd0,
+	0x06,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x1d,0x54,0x04,0x01,0x00,0x53,0x04,0x01,
+	0x00,0xd2,0x08,0x11,0x04,0x01,0x00,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,
+	0x00,0x01,0xff,0x00,0x94,0x15,0x93,0x11,0x92,0x0d,0x91,0x09,0x10,0x05,0x01,0xff,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd0,0x1e,0xcf,0x86,0x55,
+	0x04,0x01,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x0b,0x00,0x0b,0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0x54,
+	0x04,0x01,0x00,0x53,0x04,0x01,0x00,0x92,0x08,0x11,0x04,0x01,0x00,0x0b,0x00,0x0b,
+	0x00,0xe2,0x21,0x01,0xd1,0x6c,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x94,0x14,0x93,0x10,
+	0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0x04,0x00,
+	0x04,0x00,0x04,0x00,0xcf,0x86,0x95,0x48,0xd4,0x24,0xd3,0x10,0x52,0x04,0x04,0x00,
+	0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,
+	0x04,0x00,0x00,0x00,0x04,0x00,0x11,0x04,0x04,0x00,0x00,0x00,0xd3,0x10,0x52,0x04,
+	0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x00,0x00,0xd2,0x0c,0x91,0x08,
+	0x10,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0x11,0x04,0x04,0x00,0x00,0x00,0x04,0x00,
+	0xd0,0x62,0xcf,0x86,0xd5,0x28,0x94,0x24,0xd3,0x10,0x52,0x04,0x04,0x00,0x51,0x04,
+	0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,
+	0x00,0x00,0x04,0x00,0x11,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0xd4,0x14,0x53,0x04,
+	0x04,0x00,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,
+	0xd3,0x14,0xd2,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0x11,0x04,
+	0x04,0x00,0x00,0x00,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,
+	0x00,0x00,0xcf,0x86,0xd5,0x38,0xd4,0x24,0xd3,0x14,0xd2,0x0c,0x91,0x08,0x10,0x04,
+	0x04,0x00,0x00,0x00,0x04,0x00,0x11,0x04,0x04,0x00,0x00,0x00,0x52,0x04,0x04,0x00,
+	0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0x93,0x10,0x52,0x04,0x04,0x00,
+	0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0x94,0x14,0x53,0x04,
+	0x04,0x00,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,
+	0x04,0x00,0xd1,0x9c,0xd0,0x3e,0xcf,0x86,0x95,0x38,0xd4,0x14,0x53,0x04,0x04,0x00,
+	0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0xd3,0x14,
+	0xd2,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x04,0x00,0x11,0x04,0x04,0x00,
+	0x00,0x00,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,
+	0x04,0x00,0xcf,0x86,0xd5,0x34,0xd4,0x14,0x93,0x10,0x52,0x04,0x04,0x00,0x51,0x04,
+	0x04,0x00,0x10,0x04,0x04,0x00,0x08,0x00,0x04,0x00,0x53,0x04,0x04,0x00,0xd2,0x0c,
+	0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,
+	0x0c,0xe6,0x10,0x04,0x0c,0xe6,0x08,0xe6,0xd4,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x08,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0x53,0x04,0x04,0x00,
+	0x52,0x04,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0xd0,0x1a,
+	0xcf,0x86,0x95,0x14,0x54,0x04,0x08,0x00,0x53,0x04,0x08,0x00,0x92,0x08,0x11,0x04,
+	0x08,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0xcf,0x86,0x55,0x04,0x04,0x00,0x54,0x04,
+	0x04,0x00,0xd3,0x10,0x52,0x04,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x11,0x00,
+	0x00,0x00,0x52,0x04,0x11,0x00,0x11,0x04,0x11,0x00,0x00,0x00,0xd3,0x30,0xd2,0x2a,
+	0xd1,0x24,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0b,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0x04,0x00,
+	0xcf,0x06,0x04,0x00,0xcf,0x06,0x04,0x00,0xcf,0x06,0x04,0x00,0xd2,0x6c,0xd1,0x24,
+	0xd0,0x06,0xcf,0x06,0x04,0x00,0xcf,0x86,0x55,0x04,0x04,0x00,0x54,0x04,0x04,0x00,
+	0x93,0x10,0x52,0x04,0x04,0x00,0x51,0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x0b,0x00,
+	0x0b,0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,0x04,0x00,0x53,0x04,0x04,0x00,
+	0x52,0x04,0x04,0x00,0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0x04,0x00,
+	0xcf,0x86,0x55,0x04,0x04,0x00,0x54,0x04,0x04,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x04,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd1,0x80,0xd0,0x46,0xcf,0x86,0xd5,0x28,
+	0xd4,0x14,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0x91,0x08,0x10,0x04,0x06,0x00,
+	0x00,0x00,0x06,0x00,0x93,0x10,0x52,0x04,0x06,0x00,0x91,0x08,0x10,0x04,0x06,0x09,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x54,0x04,0x06,0x00,0x93,0x14,0x52,0x04,0x06,0x00,
+	0xd1,0x08,0x10,0x04,0x06,0x09,0x06,0x00,0x10,0x04,0x06,0x00,0x00,0x00,0x00,0x00,
+	0xcf,0x86,0xd5,0x10,0x54,0x04,0x06,0x00,0x93,0x08,0x12,0x04,0x06,0x00,0x00,0x00,
+	0x00,0x00,0xd4,0x14,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0x91,0x08,0x10,0x04,
+	0x06,0x00,0x00,0x00,0x06,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x06,0x00,
+	0x00,0x00,0x06,0x00,0x00,0x00,0x00,0x00,0xd0,0x1b,0xcf,0x86,0x55,0x04,0x04,0x00,
+	0x54,0x04,0x04,0x00,0x93,0x0d,0x52,0x04,0x04,0x00,0x11,0x05,0x04,0xff,0x00,0x04,
+	0x00,0x04,0x00,0xcf,0x86,0xd5,0x24,0x54,0x04,0x04,0x00,0xd3,0x10,0x92,0x0c,0x51,
+	0x04,0x04,0x00,0x10,0x04,0x04,0x09,0x04,0x00,0x04,0x00,0x52,0x04,0x04,0x00,0x91,
+	0x08,0x10,0x04,0x04,0x00,0x07,0xe6,0x00,0x00,0xd4,0x10,0x53,0x04,0x04,0x00,0x92,
+	0x08,0x11,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x07,0x00,0x92,0x08,0x11,
+	0x04,0x07,0x00,0x00,0x00,0x00,0x00,0xe4,0xb7,0x03,0xe3,0x58,0x01,0xd2,0x8f,0xd1,
+	0x53,0xd0,0x35,0xcf,0x86,0x95,0x2f,0xd4,0x1f,0x53,0x04,0x04,0x00,0xd2,0x0d,0x51,
+	0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x04,0xff,0x00,0x51,0x05,0x04,0xff,0x00,0x10,
+	0x05,0x04,0xff,0x00,0x00,0x00,0x53,0x04,0x04,0x00,0x92,0x08,0x11,0x04,0x04,0x00,
+	0x00,0x00,0x00,0x00,0x04,0x00,0xcf,0x86,0x55,0x04,0x04,0x00,0x54,0x04,0x04,0x00,
+	0x53,0x04,0x04,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0xd0,0x22,0xcf,0x86,0x55,0x04,0x04,0x00,0x94,0x18,0x53,0x04,0x04,0x00,
+	0x92,0x10,0xd1,0x08,0x10,0x04,0x04,0x00,0x04,0xe4,0x10,0x04,0x0a,0x00,0x00,0x00,
+	0x00,0x00,0x0b,0x00,0xcf,0x86,0x55,0x04,0x0b,0x00,0x54,0x04,0x0b,0x00,0x93,0x0c,
+	0x52,0x04,0x0b,0x00,0x11,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0xd1,0x80,0xd0,0x42,
+	0xcf,0x86,0xd5,0x1c,0x54,0x04,0x07,0x00,0x53,0x04,0x07,0x00,0x52,0x04,0x07,0x00,
+	0xd1,0x08,0x10,0x04,0x07,0x00,0x10,0x00,0x10,0x04,0x10,0x00,0x00,0x00,0xd4,0x0c,
+	0x53,0x04,0x07,0x00,0x12,0x04,0x07,0x00,0x00,0x00,0x53,0x04,0x07,0x00,0x92,0x10,
+	0xd1,0x08,0x10,0x04,0x07,0x00,0x07,0xde,0x10,0x04,0x07,0xe6,0x07,0xdc,0x00,0x00,
+	0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x07,0x00,
+	0x00,0x00,0x00,0x00,0x07,0x00,0x07,0x00,0x07,0x00,0xd4,0x10,0x53,0x04,0x07,0x00,
+	0x52,0x04,0x07,0x00,0x11,0x04,0x07,0x00,0x00,0x00,0x93,0x10,0x52,0x04,0x07,0x00,
+	0x91,0x08,0x10,0x04,0x07,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd0,0x1a,0xcf,0x86,
+	0x55,0x04,0x08,0x00,0x94,0x10,0x53,0x04,0x08,0x00,0x92,0x08,0x11,0x04,0x08,0x00,
+	0x0b,0x00,0x00,0x00,0x08,0x00,0xcf,0x86,0x95,0x28,0xd4,0x10,0x53,0x04,0x08,0x00,
+	0x92,0x08,0x11,0x04,0x08,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x08,0x00,0xd2,0x0c,
+	0x51,0x04,0x08,0x00,0x10,0x04,0x0b,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x08,0x00,
+	0x07,0x00,0xd2,0xe4,0xd1,0x80,0xd0,0x2e,0xcf,0x86,0x95,0x28,0x54,0x04,0x08,0x00,
+	0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x08,0xe6,
+	0xd2,0x0c,0x91,0x08,0x10,0x04,0x08,0xdc,0x08,0x00,0x08,0x00,0x11,0x04,0x00,0x00,
+	0x08,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x18,0x54,0x04,0x0b,0x00,0x53,0x04,0x0b,0x00,
+	0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,0x00,0x00,0xd4,0x14,
+	0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0b,0x09,0x0b,0x00,0x0b,0x00,0x0b,0x00,
+	0x0b,0x00,0xd3,0x10,0x52,0x04,0x0b,0x00,0x91,0x08,0x10,0x04,0x0b,0x00,0x0b,0xe6,
+	0x0b,0xe6,0x52,0x04,0x0b,0xe6,0xd1,0x08,0x10,0x04,0x0b,0xe6,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x0b,0xdc,0xd0,0x5e,0xcf,0x86,0xd5,0x20,0xd4,0x10,0x53,0x04,0x0b,0x00,
+	0x92,0x08,0x11,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x0b,0x00,0x92,0x08,
+	0x11,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0xd4,0x10,0x53,0x04,0x0b,0x00,0x52,0x04,
+	0x0b,0x00,0x11,0x04,0x0b,0x00,0x00,0x00,0xd3,0x10,0x52,0x04,0x10,0xe6,0x91,0x08,
+	0x10,0x04,0x10,0xe6,0x10,0xdc,0x10,0xdc,0xd2,0x0c,0x51,0x04,0x10,0xdc,0x10,0x04,
+	0x10,0xdc,0x10,0xe6,0xd1,0x08,0x10,0x04,0x10,0xe6,0x10,0xdc,0x10,0x04,0x10,0x00,
+	0x00,0x00,0xcf,0x06,0x00,0x00,0xe1,0x1e,0x01,0xd0,0xaa,0xcf,0x86,0xd5,0x6e,0xd4,
+	0x53,0xd3,0x17,0x52,0x04,0x09,0x00,0x51,0x04,0x09,0x00,0x10,0x0b,0x09,0xff,0xe1,
+	0xac,0x85,0xe1,0xac,0xb5,0x00,0x09,0x00,0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x09,0xff,
+	0xe1,0xac,0x87,0xe1,0xac,0xb5,0x00,0x09,0x00,0x10,0x0b,0x09,0xff,0xe1,0xac,0x89,
+	0xe1,0xac,0xb5,0x00,0x09,0x00,0xd1,0x0f,0x10,0x0b,0x09,0xff,0xe1,0xac,0x8b,0xe1,
+	0xac,0xb5,0x00,0x09,0x00,0x10,0x0b,0x09,0xff,0xe1,0xac,0x8d,0xe1,0xac,0xb5,0x00,
+	0x09,0x00,0x93,0x17,0x92,0x13,0x51,0x04,0x09,0x00,0x10,0x0b,0x09,0xff,0xe1,0xac,
+	0x91,0xe1,0xac,0xb5,0x00,0x09,0x00,0x09,0x00,0x09,0x00,0x54,0x04,0x09,0x00,0xd3,
+	0x10,0x52,0x04,0x09,0x00,0x91,0x08,0x10,0x04,0x09,0x07,0x09,0x00,0x09,0x00,0xd2,
+	0x13,0x51,0x04,0x09,0x00,0x10,0x04,0x09,0x00,0x09,0xff,0xe1,0xac,0xba,0xe1,0xac,
+	0xb5,0x00,0x91,0x0f,0x10,0x04,0x09,0x00,0x09,0xff,0xe1,0xac,0xbc,0xe1,0xac,0xb5,
+	0x00,0x09,0x00,0xcf,0x86,0xd5,0x3d,0x94,0x39,0xd3,0x31,0xd2,0x25,0xd1,0x16,0x10,
+	0x0b,0x09,0xff,0xe1,0xac,0xbe,0xe1,0xac,0xb5,0x00,0x09,0xff,0xe1,0xac,0xbf,0xe1,
+	0xac,0xb5,0x00,0x10,0x04,0x09,0x00,0x09,0xff,0xe1,0xad,0x82,0xe1,0xac,0xb5,0x00,
+	0x91,0x08,0x10,0x04,0x09,0x09,0x09,0x00,0x09,0x00,0x12,0x04,0x09,0x00,0x00,0x00,
+	0x09,0x00,0xd4,0x1c,0x53,0x04,0x09,0x00,0xd2,0x0c,0x51,0x04,0x09,0x00,0x10,0x04,
+	0x09,0x00,0x09,0xe6,0x91,0x08,0x10,0x04,0x09,0xdc,0x09,0xe6,0x09,0xe6,0xd3,0x08,
+	0x12,0x04,0x09,0xe6,0x09,0x00,0x52,0x04,0x09,0x00,0x91,0x08,0x10,0x04,0x09,0x00,
+	0x00,0x00,0x00,0x00,0xd0,0x2e,0xcf,0x86,0x55,0x04,0x0a,0x00,0xd4,0x18,0x53,0x04,
+	0x0a,0x00,0xd2,0x0c,0x51,0x04,0x0a,0x00,0x10,0x04,0x0a,0x09,0x0d,0x09,0x11,0x04,
+	0x0d,0x00,0x0a,0x00,0x53,0x04,0x0a,0x00,0x92,0x08,0x11,0x04,0x0a,0x00,0x0d,0x00,
+	0x0d,0x00,0xcf,0x86,0x55,0x04,0x0c,0x00,0xd4,0x14,0x93,0x10,0x52,0x04,0x0c,0x00,
+	0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x07,0x0c,0x00,0x0c,0x00,0xd3,0x0c,0x92,0x08,
+	0x11,0x04,0x0c,0x00,0x0c,0x09,0x00,0x00,0x12,0x04,0x00,0x00,0x0c,0x00,0xe3,0xb2,
+	0x01,0xe2,0x09,0x01,0xd1,0x4c,0xd0,0x2a,0xcf,0x86,0x55,0x04,0x0a,0x00,0x54,0x04,
+	0x0a,0x00,0xd3,0x10,0x52,0x04,0x0a,0x00,0x51,0x04,0x0a,0x00,0x10,0x04,0x0a,0x00,
+	0x0a,0x07,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0a,0x00,0x0a,0x00,
+	0xcf,0x86,0x95,0x1c,0x94,0x18,0x53,0x04,0x0a,0x00,0xd2,0x08,0x11,0x04,0x0a,0x00,
+	0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,
+	0xd0,0x3a,0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x12,0x00,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x14,0x00,0x54,0x04,0x14,0x00,
+	0x53,0x04,0x14,0x00,0xd2,0x0c,0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,
+	0x91,0x08,0x10,0x04,0x00,0x00,0x14,0x00,0x14,0x00,0xcf,0x86,0xd5,0x2c,0xd4,0x08,
+	0x13,0x04,0x0d,0x00,0x00,0x00,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x0b,0xe6,0x10,0x04,
+	0x0b,0xe6,0x0b,0x00,0x91,0x08,0x10,0x04,0x0b,0x01,0x0b,0xdc,0x0b,0xdc,0x92,0x08,
+	0x11,0x04,0x0b,0xdc,0x0b,0xe6,0x0b,0xdc,0xd4,0x28,0xd3,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0b,0xe6,0x0b,0x00,0x0b,0x01,0x0b,0x01,0xd2,0x0c,0x91,0x08,0x10,0x04,
+	0x0b,0x01,0x0b,0x00,0x0b,0x00,0x91,0x08,0x10,0x04,0x0b,0x00,0x0b,0xdc,0x0b,0x00,
+	0xd3,0x1c,0xd2,0x0c,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,0x0d,0x00,0xd1,0x08,
+	0x10,0x04,0x0d,0xe6,0x0d,0x00,0x10,0x04,0x0d,0x00,0x13,0x00,0x92,0x0c,0x51,0x04,
+	0x10,0xe6,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0xd1,0x1c,0xd0,0x06,0xcf,0x06,
+	0x07,0x00,0xcf,0x86,0x55,0x04,0x07,0x00,0x94,0x0c,0x53,0x04,0x07,0x00,0x12,0x04,
+	0x07,0x00,0x08,0x00,0x08,0x00,0xd0,0x06,0xcf,0x06,0x08,0x00,0xcf,0x86,0xd5,0x40,
+	0xd4,0x2c,0xd3,0x10,0x92,0x0c,0x51,0x04,0x08,0xe6,0x10,0x04,0x08,0xdc,0x08,0xe6,
+	0x09,0xe6,0xd2,0x0c,0x51,0x04,0x09,0xe6,0x10,0x04,0x09,0xdc,0x0a,0xe6,0xd1,0x08,
+	0x10,0x04,0x0a,0xe6,0x0a,0xea,0x10,0x04,0x0a,0xd6,0x0a,0xdc,0x93,0x10,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x0a,0xca,0x0a,0xe6,0x0a,0xe6,0x0a,0xe6,0x0a,0xe6,0xd4,0x14,
+	0x93,0x10,0x52,0x04,0x0a,0xe6,0x51,0x04,0x0a,0xe6,0x10,0x04,0x0a,0xe6,0x10,0xe6,
+	0x10,0xe6,0xd3,0x10,0x52,0x04,0x10,0xe6,0x51,0x04,0x10,0xe6,0x10,0x04,0x13,0xe8,
+	0x13,0xe4,0xd2,0x10,0xd1,0x08,0x10,0x04,0x13,0xe4,0x13,0xdc,0x10,0x04,0x00,0x00,
+	0x12,0xe6,0xd1,0x08,0x10,0x04,0x0c,0xe9,0x0b,0xdc,0x10,0x04,0x09,0xe6,0x09,0xdc,
+	0xe2,0x80,0x08,0xe1,0x48,0x04,0xe0,0x1c,0x02,0xcf,0x86,0xe5,0x11,0x01,0xd4,0x84,
+	0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x41,0xcc,0xa5,0x00,0x01,0xff,
+	0x61,0xcc,0xa5,0x00,0x10,0x08,0x01,0xff,0x42,0xcc,0x87,0x00,0x01,0xff,0x62,0xcc,
+	0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x42,0xcc,0xa3,0x00,0x01,0xff,0x62,0xcc,
+	0xa3,0x00,0x10,0x08,0x01,0xff,0x42,0xcc,0xb1,0x00,0x01,0xff,0x62,0xcc,0xb1,0x00,
+	0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,0x43,0xcc,0xa7,0xcc,0x81,0x00,0x01,0xff,
+	0x63,0xcc,0xa7,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x44,0xcc,0x87,0x00,0x01,0xff,
+	0x64,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x44,0xcc,0xa3,0x00,0x01,0xff,
+	0x64,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x44,0xcc,0xb1,0x00,0x01,0xff,0x64,0xcc,
+	0xb1,0x00,0xd3,0x48,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x44,0xcc,0xa7,0x00,
+	0x01,0xff,0x64,0xcc,0xa7,0x00,0x10,0x08,0x01,0xff,0x44,0xcc,0xad,0x00,0x01,0xff,
+	0x64,0xcc,0xad,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x45,0xcc,0x84,0xcc,0x80,0x00,
+	0x01,0xff,0x65,0xcc,0x84,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x45,0xcc,0x84,0xcc,
+	0x81,0x00,0x01,0xff,0x65,0xcc,0x84,0xcc,0x81,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0x45,0xcc,0xad,0x00,0x01,0xff,0x65,0xcc,0xad,0x00,0x10,0x08,0x01,0xff,
+	0x45,0xcc,0xb0,0x00,0x01,0xff,0x65,0xcc,0xb0,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,
+	0x45,0xcc,0xa7,0xcc,0x86,0x00,0x01,0xff,0x65,0xcc,0xa7,0xcc,0x86,0x00,0x10,0x08,
+	0x01,0xff,0x46,0xcc,0x87,0x00,0x01,0xff,0x66,0xcc,0x87,0x00,0xd4,0x84,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x47,0xcc,0x84,0x00,0x01,0xff,0x67,0xcc,
+	0x84,0x00,0x10,0x08,0x01,0xff,0x48,0xcc,0x87,0x00,0x01,0xff,0x68,0xcc,0x87,0x00,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x48,0xcc,0xa3,0x00,0x01,0xff,0x68,0xcc,0xa3,0x00,
+	0x10,0x08,0x01,0xff,0x48,0xcc,0x88,0x00,0x01,0xff,0x68,0xcc,0x88,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0x48,0xcc,0xa7,0x00,0x01,0xff,0x68,0xcc,0xa7,0x00,
+	0x10,0x08,0x01,0xff,0x48,0xcc,0xae,0x00,0x01,0xff,0x68,0xcc,0xae,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0x49,0xcc,0xb0,0x00,0x01,0xff,0x69,0xcc,0xb0,0x00,0x10,0x0a,
+	0x01,0xff,0x49,0xcc,0x88,0xcc,0x81,0x00,0x01,0xff,0x69,0xcc,0x88,0xcc,0x81,0x00,
+	0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x4b,0xcc,0x81,0x00,0x01,0xff,
+	0x6b,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x4b,0xcc,0xa3,0x00,0x01,0xff,0x6b,0xcc,
+	0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x4b,0xcc,0xb1,0x00,0x01,0xff,0x6b,0xcc,
+	0xb1,0x00,0x10,0x08,0x01,0xff,0x4c,0xcc,0xa3,0x00,0x01,0xff,0x6c,0xcc,0xa3,0x00,
+	0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4c,0xcc,0xa3,0xcc,0x84,0x00,0x01,0xff,
+	0x6c,0xcc,0xa3,0xcc,0x84,0x00,0x10,0x08,0x01,0xff,0x4c,0xcc,0xb1,0x00,0x01,0xff,
+	0x6c,0xcc,0xb1,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x4c,0xcc,0xad,0x00,0x01,0xff,
+	0x6c,0xcc,0xad,0x00,0x10,0x08,0x01,0xff,0x4d,0xcc,0x81,0x00,0x01,0xff,0x6d,0xcc,
+	0x81,0x00,0xcf,0x86,0xe5,0x15,0x01,0xd4,0x88,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x4d,0xcc,0x87,0x00,0x01,0xff,0x6d,0xcc,0x87,0x00,0x10,0x08,0x01,
+	0xff,0x4d,0xcc,0xa3,0x00,0x01,0xff,0x6d,0xcc,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x4e,0xcc,0x87,0x00,0x01,0xff,0x6e,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x4e,
+	0xcc,0xa3,0x00,0x01,0xff,0x6e,0xcc,0xa3,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x4e,0xcc,0xb1,0x00,0x01,0xff,0x6e,0xcc,0xb1,0x00,0x10,0x08,0x01,0xff,0x4e,
+	0xcc,0xad,0x00,0x01,0xff,0x6e,0xcc,0xad,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,
+	0xcc,0x83,0xcc,0x81,0x00,0x01,0xff,0x6f,0xcc,0x83,0xcc,0x81,0x00,0x10,0x0a,0x01,
+	0xff,0x4f,0xcc,0x83,0xcc,0x88,0x00,0x01,0xff,0x6f,0xcc,0x83,0xcc,0x88,0x00,0xd3,
+	0x48,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x84,0xcc,0x80,0x00,0x01,
+	0xff,0x6f,0xcc,0x84,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x84,0xcc,0x81,
+	0x00,0x01,0xff,0x6f,0xcc,0x84,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x50,
+	0xcc,0x81,0x00,0x01,0xff,0x70,0xcc,0x81,0x00,0x10,0x08,0x01,0xff,0x50,0xcc,0x87,
+	0x00,0x01,0xff,0x70,0xcc,0x87,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x52,
+	0xcc,0x87,0x00,0x01,0xff,0x72,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x52,0xcc,0xa3,
+	0x00,0x01,0xff,0x72,0xcc,0xa3,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x52,0xcc,0xa3,
+	0xcc,0x84,0x00,0x01,0xff,0x72,0xcc,0xa3,0xcc,0x84,0x00,0x10,0x08,0x01,0xff,0x52,
+	0xcc,0xb1,0x00,0x01,0xff,0x72,0xcc,0xb1,0x00,0xd4,0x8c,0xd3,0x48,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0x53,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x87,0x00,0x10,
+	0x08,0x01,0xff,0x53,0xcc,0xa3,0x00,0x01,0xff,0x73,0xcc,0xa3,0x00,0xd1,0x14,0x10,
+	0x0a,0x01,0xff,0x53,0xcc,0x81,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x81,0xcc,0x87,
+	0x00,0x10,0x0a,0x01,0xff,0x53,0xcc,0x8c,0xcc,0x87,0x00,0x01,0xff,0x73,0xcc,0x8c,
+	0xcc,0x87,0x00,0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,0x53,0xcc,0xa3,0xcc,0x87,
+	0x00,0x01,0xff,0x73,0xcc,0xa3,0xcc,0x87,0x00,0x10,0x08,0x01,0xff,0x54,0xcc,0x87,
+	0x00,0x01,0xff,0x74,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x54,0xcc,0xa3,
+	0x00,0x01,0xff,0x74,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x54,0xcc,0xb1,0x00,0x01,
+	0xff,0x74,0xcc,0xb1,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x54,
+	0xcc,0xad,0x00,0x01,0xff,0x74,0xcc,0xad,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0xa4,
+	0x00,0x01,0xff,0x75,0xcc,0xa4,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x55,0xcc,0xb0,
+	0x00,0x01,0xff,0x75,0xcc,0xb0,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,0xad,0x00,0x01,
+	0xff,0x75,0xcc,0xad,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x55,0xcc,0x83,
+	0xcc,0x81,0x00,0x01,0xff,0x75,0xcc,0x83,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x55,
+	0xcc,0x84,0xcc,0x88,0x00,0x01,0xff,0x75,0xcc,0x84,0xcc,0x88,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0x56,0xcc,0x83,0x00,0x01,0xff,0x76,0xcc,0x83,0x00,0x10,0x08,0x01,
+	0xff,0x56,0xcc,0xa3,0x00,0x01,0xff,0x76,0xcc,0xa3,0x00,0xe0,0x10,0x02,0xcf,0x86,
+	0xd5,0xe1,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x57,0xcc,
+	0x80,0x00,0x01,0xff,0x77,0xcc,0x80,0x00,0x10,0x08,0x01,0xff,0x57,0xcc,0x81,0x00,
+	0x01,0xff,0x77,0xcc,0x81,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x57,0xcc,0x88,0x00,
+	0x01,0xff,0x77,0xcc,0x88,0x00,0x10,0x08,0x01,0xff,0x57,0xcc,0x87,0x00,0x01,0xff,
+	0x77,0xcc,0x87,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x57,0xcc,0xa3,0x00,
+	0x01,0xff,0x77,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x58,0xcc,0x87,0x00,0x01,0xff,
+	0x78,0xcc,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x58,0xcc,0x88,0x00,0x01,0xff,
+	0x78,0xcc,0x88,0x00,0x10,0x08,0x01,0xff,0x59,0xcc,0x87,0x00,0x01,0xff,0x79,0xcc,
+	0x87,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x5a,0xcc,0x82,0x00,
+	0x01,0xff,0x7a,0xcc,0x82,0x00,0x10,0x08,0x01,0xff,0x5a,0xcc,0xa3,0x00,0x01,0xff,
+	0x7a,0xcc,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x5a,0xcc,0xb1,0x00,0x01,0xff,
+	0x7a,0xcc,0xb1,0x00,0x10,0x08,0x01,0xff,0x68,0xcc,0xb1,0x00,0x01,0xff,0x74,0xcc,
+	0x88,0x00,0x92,0x1d,0xd1,0x10,0x10,0x08,0x01,0xff,0x77,0xcc,0x8a,0x00,0x01,0xff,
+	0x79,0xcc,0x8a,0x00,0x10,0x04,0x01,0x00,0x02,0xff,0xc5,0xbf,0xcc,0x87,0x00,0x0a,
+	0x00,0xd4,0x98,0xd3,0x48,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x41,0xcc,0xa3,
+	0x00,0x01,0xff,0x61,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x41,0xcc,0x89,0x00,0x01,
+	0xff,0x61,0xcc,0x89,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x41,0xcc,0x82,0xcc,0x81,
+	0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x82,
+	0xcc,0x80,0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,0x80,0x00,0xd2,0x28,0xd1,0x14,0x10,
+	0x0a,0x01,0xff,0x41,0xcc,0x82,0xcc,0x89,0x00,0x01,0xff,0x61,0xcc,0x82,0xcc,0x89,
+	0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x61,0xcc,0x82,
+	0xcc,0x83,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x41,0xcc,0xa3,0xcc,0x82,0x00,0x01,
+	0xff,0x61,0xcc,0xa3,0xcc,0x82,0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x86,0xcc,0x81,
+	0x00,0x01,0xff,0x61,0xcc,0x86,0xcc,0x81,0x00,0xd3,0x50,0xd2,0x28,0xd1,0x14,0x10,
+	0x0a,0x01,0xff,0x41,0xcc,0x86,0xcc,0x80,0x00,0x01,0xff,0x61,0xcc,0x86,0xcc,0x80,
+	0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0x86,0xcc,0x89,0x00,0x01,0xff,0x61,0xcc,0x86,
+	0xcc,0x89,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x41,0xcc,0x86,0xcc,0x83,0x00,0x01,
+	0xff,0x61,0xcc,0x86,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x41,0xcc,0xa3,0xcc,0x86,
+	0x00,0x01,0xff,0x61,0xcc,0xa3,0xcc,0x86,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0x45,0xcc,0xa3,0x00,0x01,0xff,0x65,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x45,
+	0xcc,0x89,0x00,0x01,0xff,0x65,0xcc,0x89,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x45,
+	0xcc,0x83,0x00,0x01,0xff,0x65,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x45,0xcc,0x82,
+	0xcc,0x81,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x81,0x00,0xcf,0x86,0xe5,0x31,0x01,
+	0xd4,0x90,0xd3,0x50,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x45,0xcc,0x82,0xcc,
+	0x80,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0x45,0xcc,
+	0x82,0xcc,0x89,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x89,0x00,0xd1,0x14,0x10,0x0a,
+	0x01,0xff,0x45,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x65,0xcc,0x82,0xcc,0x83,0x00,
+	0x10,0x0a,0x01,0xff,0x45,0xcc,0xa3,0xcc,0x82,0x00,0x01,0xff,0x65,0xcc,0xa3,0xcc,
+	0x82,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0x49,0xcc,0x89,0x00,0x01,0xff,
+	0x69,0xcc,0x89,0x00,0x10,0x08,0x01,0xff,0x49,0xcc,0xa3,0x00,0x01,0xff,0x69,0xcc,
+	0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0x4f,0xcc,0xa3,0x00,0x01,0xff,0x6f,0xcc,
+	0xa3,0x00,0x10,0x08,0x01,0xff,0x4f,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x89,0x00,
+	0xd3,0x50,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x82,0xcc,0x81,0x00,
+	0x01,0xff,0x6f,0xcc,0x82,0xcc,0x81,0x00,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x82,0xcc,
+	0x80,0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x80,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,
+	0x4f,0xcc,0x82,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x89,0x00,0x10,0x0a,
+	0x01,0xff,0x4f,0xcc,0x82,0xcc,0x83,0x00,0x01,0xff,0x6f,0xcc,0x82,0xcc,0x83,0x00,
+	0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,0xa3,0xcc,0x82,0x00,0x01,0xff,
+	0x6f,0xcc,0xa3,0xcc,0x82,0x00,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x9b,0xcc,0x81,0x00,
+	0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x81,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,
+	0x9b,0xcc,0x80,0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,
+	0x4f,0xcc,0x9b,0xcc,0x89,0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x89,0x00,0xd4,0x98,
+	0xd3,0x48,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x9b,0xcc,0x83,0x00,
+	0x01,0xff,0x6f,0xcc,0x9b,0xcc,0x83,0x00,0x10,0x0a,0x01,0xff,0x4f,0xcc,0x9b,0xcc,
+	0xa3,0x00,0x01,0xff,0x6f,0xcc,0x9b,0xcc,0xa3,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0x55,0xcc,0xa3,0x00,0x01,0xff,0x75,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,0x55,0xcc,
+	0x89,0x00,0x01,0xff,0x75,0xcc,0x89,0x00,0xd2,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,
+	0x55,0xcc,0x9b,0xcc,0x81,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x81,0x00,0x10,0x0a,
+	0x01,0xff,0x55,0xcc,0x9b,0xcc,0x80,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0x80,0x00,
+	0xd1,0x14,0x10,0x0a,0x01,0xff,0x55,0xcc,0x9b,0xcc,0x89,0x00,0x01,0xff,0x75,0xcc,
+	0x9b,0xcc,0x89,0x00,0x10,0x0a,0x01,0xff,0x55,0xcc,0x9b,0xcc,0x83,0x00,0x01,0xff,
+	0x75,0xcc,0x9b,0xcc,0x83,0x00,0xd3,0x44,0xd2,0x24,0xd1,0x14,0x10,0x0a,0x01,0xff,
+	0x55,0xcc,0x9b,0xcc,0xa3,0x00,0x01,0xff,0x75,0xcc,0x9b,0xcc,0xa3,0x00,0x10,0x08,
+	0x01,0xff,0x59,0xcc,0x80,0x00,0x01,0xff,0x79,0xcc,0x80,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0x59,0xcc,0xa3,0x00,0x01,0xff,0x79,0xcc,0xa3,0x00,0x10,0x08,0x01,0xff,
+	0x59,0xcc,0x89,0x00,0x01,0xff,0x79,0xcc,0x89,0x00,0x92,0x14,0x91,0x10,0x10,0x08,
+	0x01,0xff,0x59,0xcc,0x83,0x00,0x01,0xff,0x79,0xcc,0x83,0x00,0x0a,0x00,0x0a,0x00,
+	0xe1,0xc0,0x04,0xe0,0x80,0x02,0xcf,0x86,0xe5,0x2d,0x01,0xd4,0xa8,0xd3,0x54,0xd2,
+	0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x93,0x00,0x01,0xff,0xce,0xb1,
+	0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,
+	0xce,0xb1,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,
+	0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,
+	0xff,0xce,0xb1,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcd,0x82,
+	0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0x91,0xcc,0x93,0x00,0x01,0xff,
+	0xce,0x91,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0x91,0xcc,0x93,0xcc,0x80,0x00,
+	0x01,0xff,0xce,0x91,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,
+	0x91,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0x91,0xcc,0x94,0xcc,0x81,0x00,0x10,
+	0x0b,0x01,0xff,0xce,0x91,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0x91,0xcc,0x94,
+	0xcd,0x82,0x00,0xd3,0x42,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb5,0xcc,
+	0x93,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xb5,0xcc,
+	0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,0xcc,0x80,0x00,0x91,0x16,0x10,
+	0x0b,0x01,0xff,0xce,0xb5,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0xb5,0xcc,0x94,
+	0xcc,0x81,0x00,0x00,0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0x95,0xcc,
+	0x93,0x00,0x01,0xff,0xce,0x95,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0x95,0xcc,
+	0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0x95,0xcc,0x94,0xcc,0x80,0x00,0x91,0x16,0x10,
+	0x0b,0x01,0xff,0xce,0x95,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0x95,0xcc,0x94,
+	0xcc,0x81,0x00,0x00,0x00,0xd4,0xa8,0xd3,0x54,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,
+	0xff,0xce,0xb7,0xcc,0x93,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0x00,0x10,0x0b,0x01,
+	0xff,0xce,0xb7,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x80,
+	0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,
+	0xce,0xb7,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcd,
+	0x82,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcd,0x82,0x00,0xd2,0x28,0xd1,0x12,0x10,
+	0x09,0x01,0xff,0xce,0x97,0xcc,0x93,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,0x00,0x10,
+	0x0b,0x01,0xff,0xce,0x97,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,
+	0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0x97,0xcc,0x93,0xcc,0x81,0x00,
+	0x01,0xff,0xce,0x97,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,0xce,0x97,0xcc,
+	0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,0xcd,0x82,0x00,0xd3,0x54,0xd2,
+	0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x93,0x00,0x01,0xff,0xce,0xb9,
+	0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xb9,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,
+	0xce,0xb9,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb9,0xcc,
+	0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,
+	0xff,0xce,0xb9,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xb9,0xcc,0x94,0xcd,0x82,
+	0x00,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0x99,0xcc,0x93,0x00,0x01,0xff,
+	0xce,0x99,0xcc,0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0x99,0xcc,0x93,0xcc,0x80,0x00,
+	0x01,0xff,0xce,0x99,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,
+	0x99,0xcc,0x93,0xcc,0x81,0x00,0x01,0xff,0xce,0x99,0xcc,0x94,0xcc,0x81,0x00,0x10,
+	0x0b,0x01,0xff,0xce,0x99,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0x99,0xcc,0x94,
+	0xcd,0x82,0x00,0xcf,0x86,0xe5,0x13,0x01,0xd4,0x84,0xd3,0x42,0xd2,0x28,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xce,0xbf,0xcc,0x93,0x00,0x01,0xff,0xce,0xbf,0xcc,0x94,0x00,
+	0x10,0x0b,0x01,0xff,0xce,0xbf,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0xbf,0xcc,
+	0x94,0xcc,0x80,0x00,0x91,0x16,0x10,0x0b,0x01,0xff,0xce,0xbf,0xcc,0x93,0xcc,0x81,
+	0x00,0x01,0xff,0xce,0xbf,0xcc,0x94,0xcc,0x81,0x00,0x00,0x00,0xd2,0x28,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xce,0x9f,0xcc,0x93,0x00,0x01,0xff,0xce,0x9f,0xcc,0x94,0x00,
+	0x10,0x0b,0x01,0xff,0xce,0x9f,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,0x9f,0xcc,
+	0x94,0xcc,0x80,0x00,0x91,0x16,0x10,0x0b,0x01,0xff,0xce,0x9f,0xcc,0x93,0xcc,0x81,
+	0x00,0x01,0xff,0xce,0x9f,0xcc,0x94,0xcc,0x81,0x00,0x00,0x00,0xd3,0x54,0xd2,0x28,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x93,0x00,0x01,0xff,0xcf,0x85,0xcc,
+	0x94,0x00,0x10,0x0b,0x01,0xff,0xcf,0x85,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xcf,
+	0x85,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xcf,0x85,0xcc,0x93,
+	0xcc,0x81,0x00,0x01,0xff,0xcf,0x85,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,
+	0xcf,0x85,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xcf,0x85,0xcc,0x94,0xcd,0x82,0x00,
+	0xd2,0x1c,0xd1,0x0d,0x10,0x04,0x00,0x00,0x01,0xff,0xce,0xa5,0xcc,0x94,0x00,0x10,
+	0x04,0x00,0x00,0x01,0xff,0xce,0xa5,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x0f,0x10,0x04,
+	0x00,0x00,0x01,0xff,0xce,0xa5,0xcc,0x94,0xcc,0x81,0x00,0x10,0x04,0x00,0x00,0x01,
+	0xff,0xce,0xa5,0xcc,0x94,0xcd,0x82,0x00,0xd4,0xa8,0xd3,0x54,0xd2,0x28,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xcf,0x89,0xcc,0x93,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0x00,
+	0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xcf,0x89,0xcc,
+	0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x81,
+	0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,0xcf,0x89,
+	0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcd,0x82,0x00,0xd2,0x28,
+	0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xa9,0xcc,0x93,0x00,0x01,0xff,0xce,0xa9,0xcc,
+	0x94,0x00,0x10,0x0b,0x01,0xff,0xce,0xa9,0xcc,0x93,0xcc,0x80,0x00,0x01,0xff,0xce,
+	0xa9,0xcc,0x94,0xcc,0x80,0x00,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xa9,0xcc,0x93,
+	0xcc,0x81,0x00,0x01,0xff,0xce,0xa9,0xcc,0x94,0xcc,0x81,0x00,0x10,0x0b,0x01,0xff,
+	0xce,0xa9,0xcc,0x93,0xcd,0x82,0x00,0x01,0xff,0xce,0xa9,0xcc,0x94,0xcd,0x82,0x00,
+	0xd3,0x48,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x80,0x00,0x01,
+	0xff,0xce,0xb1,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,0xb5,0xcc,0x80,0x00,0x01,
+	0xff,0xce,0xb5,0xcc,0x81,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb7,0xcc,0x80,
+	0x00,0x01,0xff,0xce,0xb7,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,0xb9,0xcc,0x80,
+	0x00,0x01,0xff,0xce,0xb9,0xcc,0x81,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,
+	0xce,0xbf,0xcc,0x80,0x00,0x01,0xff,0xce,0xbf,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,
+	0xcf,0x85,0xcc,0x80,0x00,0x01,0xff,0xcf,0x85,0xcc,0x81,0x00,0x91,0x12,0x10,0x09,
+	0x01,0xff,0xcf,0x89,0xcc,0x80,0x00,0x01,0xff,0xcf,0x89,0xcc,0x81,0x00,0x00,0x00,
+	0xe0,0xe1,0x02,0xcf,0x86,0xe5,0x91,0x01,0xd4,0xc8,0xd3,0x64,0xd2,0x30,0xd1,0x16,
+	0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcd,0x85,0x00,0x01,0xff,0xce,0xb1,0xcc,
+	0x94,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x80,0xcd,0x85,
+	0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcc,0x80,0xcd,0x85,0x00,0xd1,0x1a,0x10,0x0d,
+	0x01,0xff,0xce,0xb1,0xcc,0x93,0xcc,0x81,0xcd,0x85,0x00,0x01,0xff,0xce,0xb1,0xcc,
+	0x94,0xcc,0x81,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0xb1,0xcc,0x93,0xcd,0x82,
+	0xcd,0x85,0x00,0x01,0xff,0xce,0xb1,0xcc,0x94,0xcd,0x82,0xcd,0x85,0x00,0xd2,0x30,
+	0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0x91,0xcc,0x93,0xcd,0x85,0x00,0x01,0xff,0xce,
+	0x91,0xcc,0x94,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0x91,0xcc,0x93,0xcc,0x80,
+	0xcd,0x85,0x00,0x01,0xff,0xce,0x91,0xcc,0x94,0xcc,0x80,0xcd,0x85,0x00,0xd1,0x1a,
+	0x10,0x0d,0x01,0xff,0xce,0x91,0xcc,0x93,0xcc,0x81,0xcd,0x85,0x00,0x01,0xff,0xce,
+	0x91,0xcc,0x94,0xcc,0x81,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0x91,0xcc,0x93,
+	0xcd,0x82,0xcd,0x85,0x00,0x01,0xff,0xce,0x91,0xcc,0x94,0xcd,0x82,0xcd,0x85,0x00,
+	0xd3,0x64,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcd,0x85,
+	0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0xb7,
+	0xcc,0x93,0xcc,0x80,0xcd,0x85,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x80,0xcd,
+	0x85,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xce,0xb7,0xcc,0x93,0xcc,0x81,0xcd,0x85,
+	0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcc,0x81,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,
+	0xce,0xb7,0xcc,0x93,0xcd,0x82,0xcd,0x85,0x00,0x01,0xff,0xce,0xb7,0xcc,0x94,0xcd,
+	0x82,0xcd,0x85,0x00,0xd2,0x30,0xd1,0x16,0x10,0x0b,0x01,0xff,0xce,0x97,0xcc,0x93,
+	0xcd,0x85,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,
+	0xce,0x97,0xcc,0x93,0xcc,0x80,0xcd,0x85,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,0xcc,
+	0x80,0xcd,0x85,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,0xce,0x97,0xcc,0x93,0xcc,0x81,
+	0xcd,0x85,0x00,0x01,0xff,0xce,0x97,0xcc,0x94,0xcc,0x81,0xcd,0x85,0x00,0x10,0x0d,
+	0x01,0xff,0xce,0x97,0xcc,0x93,0xcd,0x82,0xcd,0x85,0x00,0x01,0xff,0xce,0x97,0xcc,
+	0x94,0xcd,0x82,0xcd,0x85,0x00,0xd4,0xc8,0xd3,0x64,0xd2,0x30,0xd1,0x16,0x10,0x0b,
+	0x01,0xff,0xcf,0x89,0xcc,0x93,0xcd,0x85,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcd,
+	0x85,0x00,0x10,0x0d,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcc,0x80,0xcd,0x85,0x00,0x01,
+	0xff,0xcf,0x89,0xcc,0x94,0xcc,0x80,0xcd,0x85,0x00,0xd1,0x1a,0x10,0x0d,0x01,0xff,
+	0xcf,0x89,0xcc,0x93,0xcc,0x81,0xcd,0x85,0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcc,
+	0x81,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xcf,0x89,0xcc,0x93,0xcd,0x82,0xcd,0x85,
+	0x00,0x01,0xff,0xcf,0x89,0xcc,0x94,0xcd,0x82,0xcd,0x85,0x00,0xd2,0x30,0xd1,0x16,
+	0x10,0x0b,0x01,0xff,0xce,0xa9,0xcc,0x93,0xcd,0x85,0x00,0x01,0xff,0xce,0xa9,0xcc,
+	0x94,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0xa9,0xcc,0x93,0xcc,0x80,0xcd,0x85,
+	0x00,0x01,0xff,0xce,0xa9,0xcc,0x94,0xcc,0x80,0xcd,0x85,0x00,0xd1,0x1a,0x10,0x0d,
+	0x01,0xff,0xce,0xa9,0xcc,0x93,0xcc,0x81,0xcd,0x85,0x00,0x01,0xff,0xce,0xa9,0xcc,
+	0x94,0xcc,0x81,0xcd,0x85,0x00,0x10,0x0d,0x01,0xff,0xce,0xa9,0xcc,0x93,0xcd,0x82,
+	0xcd,0x85,0x00,0x01,0xff,0xce,0xa9,0xcc,0x94,0xcd,0x82,0xcd,0x85,0x00,0xd3,0x49,
+	0xd2,0x26,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0xb1,0xcc,0x86,0x00,0x01,0xff,0xce,
+	0xb1,0xcc,0x84,0x00,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x80,0xcd,0x85,0x00,0x01,
+	0xff,0xce,0xb1,0xcd,0x85,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xce,0xb1,0xcc,0x81,
+	0xcd,0x85,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xce,0xb1,0xcd,0x82,0x00,0x01,0xff,
+	0xce,0xb1,0xcd,0x82,0xcd,0x85,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,
+	0x91,0xcc,0x86,0x00,0x01,0xff,0xce,0x91,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xce,
+	0x91,0xcc,0x80,0x00,0x01,0xff,0xce,0x91,0xcc,0x81,0x00,0xd1,0x0d,0x10,0x09,0x01,
+	0xff,0xce,0x91,0xcd,0x85,0x00,0x01,0x00,0x10,0x07,0x01,0xff,0xce,0xb9,0x00,0x01,
+	0x00,0xcf,0x86,0xe5,0x16,0x01,0xd4,0x8f,0xd3,0x44,0xd2,0x21,0xd1,0x0d,0x10,0x04,
+	0x01,0x00,0x01,0xff,0xc2,0xa8,0xcd,0x82,0x00,0x10,0x0b,0x01,0xff,0xce,0xb7,0xcc,
+	0x80,0xcd,0x85,0x00,0x01,0xff,0xce,0xb7,0xcd,0x85,0x00,0xd1,0x0f,0x10,0x0b,0x01,
+	0xff,0xce,0xb7,0xcc,0x81,0xcd,0x85,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xce,0xb7,
+	0xcd,0x82,0x00,0x01,0xff,0xce,0xb7,0xcd,0x82,0xcd,0x85,0x00,0xd2,0x24,0xd1,0x12,
+	0x10,0x09,0x01,0xff,0xce,0x95,0xcc,0x80,0x00,0x01,0xff,0xce,0x95,0xcc,0x81,0x00,
+	0x10,0x09,0x01,0xff,0xce,0x97,0xcc,0x80,0x00,0x01,0xff,0xce,0x97,0xcc,0x81,0x00,
+	0xd1,0x13,0x10,0x09,0x01,0xff,0xce,0x97,0xcd,0x85,0x00,0x01,0xff,0xe1,0xbe,0xbf,
+	0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0xe1,0xbe,0xbf,0xcc,0x81,0x00,0x01,0xff,0xe1,
+	0xbe,0xbf,0xcd,0x82,0x00,0xd3,0x40,0xd2,0x28,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,
+	0xb9,0xcc,0x86,0x00,0x01,0xff,0xce,0xb9,0xcc,0x84,0x00,0x10,0x0b,0x01,0xff,0xce,
+	0xb9,0xcc,0x88,0xcc,0x80,0x00,0x01,0xff,0xce,0xb9,0xcc,0x88,0xcc,0x81,0x00,0x51,
+	0x04,0x00,0x00,0x10,0x09,0x01,0xff,0xce,0xb9,0xcd,0x82,0x00,0x01,0xff,0xce,0xb9,
+	0xcc,0x88,0xcd,0x82,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,0x99,0xcc,
+	0x86,0x00,0x01,0xff,0xce,0x99,0xcc,0x84,0x00,0x10,0x09,0x01,0xff,0xce,0x99,0xcc,
+	0x80,0x00,0x01,0xff,0xce,0x99,0xcc,0x81,0x00,0xd1,0x0e,0x10,0x04,0x00,0x00,0x01,
+	0xff,0xe1,0xbf,0xbe,0xcc,0x80,0x00,0x10,0x0a,0x01,0xff,0xe1,0xbf,0xbe,0xcc,0x81,
+	0x00,0x01,0xff,0xe1,0xbf,0xbe,0xcd,0x82,0x00,0xd4,0x93,0xd3,0x4e,0xd2,0x28,0xd1,
+	0x12,0x10,0x09,0x01,0xff,0xcf,0x85,0xcc,0x86,0x00,0x01,0xff,0xcf,0x85,0xcc,0x84,
+	0x00,0x10,0x0b,0x01,0xff,0xcf,0x85,0xcc,0x88,0xcc,0x80,0x00,0x01,0xff,0xcf,0x85,
+	0xcc,0x88,0xcc,0x81,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xcf,0x81,0xcc,0x93,0x00,
+	0x01,0xff,0xcf,0x81,0xcc,0x94,0x00,0x10,0x09,0x01,0xff,0xcf,0x85,0xcd,0x82,0x00,
+	0x01,0xff,0xcf,0x85,0xcc,0x88,0xcd,0x82,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,
+	0xff,0xce,0xa5,0xcc,0x86,0x00,0x01,0xff,0xce,0xa5,0xcc,0x84,0x00,0x10,0x09,0x01,
+	0xff,0xce,0xa5,0xcc,0x80,0x00,0x01,0xff,0xce,0xa5,0xcc,0x81,0x00,0xd1,0x12,0x10,
+	0x09,0x01,0xff,0xce,0xa1,0xcc,0x94,0x00,0x01,0xff,0xc2,0xa8,0xcc,0x80,0x00,0x10,
+	0x09,0x01,0xff,0xc2,0xa8,0xcc,0x81,0x00,0x01,0xff,0x60,0x00,0xd3,0x3b,0xd2,0x18,
+	0x51,0x04,0x00,0x00,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x80,0xcd,0x85,0x00,0x01,
+	0xff,0xcf,0x89,0xcd,0x85,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xcf,0x89,0xcc,0x81,
+	0xcd,0x85,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xcf,0x89,0xcd,0x82,0x00,0x01,0xff,
+	0xcf,0x89,0xcd,0x82,0xcd,0x85,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xce,
+	0x9f,0xcc,0x80,0x00,0x01,0xff,0xce,0x9f,0xcc,0x81,0x00,0x10,0x09,0x01,0xff,0xce,
+	0xa9,0xcc,0x80,0x00,0x01,0xff,0xce,0xa9,0xcc,0x81,0x00,0xd1,0x10,0x10,0x09,0x01,
+	0xff,0xce,0xa9,0xcd,0x85,0x00,0x01,0xff,0xc2,0xb4,0x00,0x10,0x04,0x01,0x00,0x00,
+	0x00,0xe0,0x7e,0x0c,0xcf,0x86,0xe5,0xbb,0x08,0xe4,0x14,0x06,0xe3,0xf7,0x02,0xe2,
+	0xbd,0x01,0xd1,0xd0,0xd0,0x4f,0xcf,0x86,0xd5,0x2e,0x94,0x2a,0xd3,0x18,0x92,0x14,
+	0x91,0x10,0x10,0x08,0x01,0xff,0xe2,0x80,0x82,0x00,0x01,0xff,0xe2,0x80,0x83,0x00,
+	0x01,0x00,0x01,0x00,0x92,0x0d,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,
+	0x00,0x01,0xff,0x00,0x01,0x00,0x94,0x1b,0x53,0x04,0x01,0x00,0xd2,0x09,0x11,0x04,
+	0x01,0x00,0x01,0xff,0x00,0x51,0x05,0x01,0xff,0x00,0x10,0x05,0x01,0xff,0x00,0x04,
+	0x00,0x01,0x00,0xcf,0x86,0xd5,0x48,0xd4,0x1c,0xd3,0x10,0x52,0x04,0x01,0x00,0x51,
+	0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x06,0x00,0x52,0x04,0x04,0x00,0x11,0x04,0x04,
+	0x00,0x06,0x00,0xd3,0x1c,0xd2,0x0c,0x51,0x04,0x06,0x00,0x10,0x04,0x06,0x00,0x07,
+	0x00,0xd1,0x08,0x10,0x04,0x07,0x00,0x08,0x00,0x10,0x04,0x08,0x00,0x06,0x00,0x52,
+	0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x06,0x00,0xd4,0x23,0xd3,
+	0x14,0x52,0x05,0x06,0xff,0x00,0x91,0x0a,0x10,0x05,0x0a,0xff,0x00,0x00,0xff,0x00,
+	0x0f,0xff,0x00,0x92,0x0a,0x11,0x05,0x0f,0xff,0x00,0x01,0xff,0x00,0x01,0xff,0x00,
+	0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x06,0x00,0x00,0x00,0x01,0x00,
+	0x01,0x00,0xd0,0x7e,0xcf,0x86,0xd5,0x34,0xd4,0x14,0x53,0x04,0x01,0x00,0x52,0x04,
+	0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0xd3,0x10,0x52,0x04,
+	0x08,0x00,0x91,0x08,0x10,0x04,0x08,0x00,0x0c,0x00,0x0c,0x00,0x52,0x04,0x0c,0x00,
+	0x91,0x08,0x10,0x04,0x0c,0x00,0x00,0x00,0x00,0x00,0xd4,0x1c,0x53,0x04,0x01,0x00,
+	0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x02,0x00,0x91,0x08,0x10,0x04,
+	0x03,0x00,0x04,0x00,0x04,0x00,0xd3,0x10,0xd2,0x08,0x11,0x04,0x06,0x00,0x08,0x00,
+	0x11,0x04,0x08,0x00,0x0b,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x0b,0x00,0x0c,0x00,
+	0x10,0x04,0x0e,0x00,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x11,0x00,0x13,0x00,
+	0xcf,0x86,0xd5,0x28,0x54,0x04,0x00,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,0x01,0xe6,
+	0x01,0x01,0x01,0xe6,0xd2,0x0c,0x51,0x04,0x01,0x01,0x10,0x04,0x01,0x01,0x01,0xe6,
+	0x91,0x08,0x10,0x04,0x01,0xe6,0x01,0x00,0x01,0x00,0xd4,0x30,0xd3,0x1c,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x01,0x00,0x01,0xe6,0x04,0x00,0xd1,0x08,0x10,0x04,0x06,0x00,
+	0x06,0x01,0x10,0x04,0x06,0x01,0x06,0xe6,0x92,0x10,0xd1,0x08,0x10,0x04,0x06,0xdc,
+	0x06,0xe6,0x10,0x04,0x06,0x01,0x08,0x01,0x09,0xdc,0x93,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0a,0xe6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd1,0x81,0xd0,0x4f,
+	0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x29,0xd3,0x13,0x52,0x04,0x01,0x00,0x51,0x04,
+	0x01,0x00,0x10,0x07,0x01,0xff,0xce,0xa9,0x00,0x01,0x00,0x92,0x12,0x51,0x04,0x01,
+	0x00,0x10,0x06,0x01,0xff,0x4b,0x00,0x01,0xff,0x41,0xcc,0x8a,0x00,0x01,0x00,0x53,
+	0x04,0x01,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x01,0x00,0x04,0x00,0x10,0x04,0x04,
+	0x00,0x07,0x00,0x91,0x08,0x10,0x04,0x08,0x00,0x06,0x00,0x06,0x00,0xcf,0x86,0x95,
+	0x2c,0xd4,0x18,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0xd1,0x08,0x10,0x04,0x08,
+	0x00,0x09,0x00,0x10,0x04,0x09,0x00,0x0a,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x0b,
+	0x00,0x10,0x04,0x0b,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd0,0x68,0xcf,
+	0x86,0xd5,0x48,0xd4,0x28,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x04,0x00,0x91,0x08,0x10,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x0a,0x00,0x0b,0x00,0x11,0x00,0x00,0x00,0x53,0x04,0x01,0x00,0x92,
+	0x18,0x51,0x04,0x01,0x00,0x10,0x0a,0x01,0xff,0xe2,0x86,0x90,0xcc,0xb8,0x00,0x01,
+	0xff,0xe2,0x86,0x92,0xcc,0xb8,0x00,0x01,0x00,0x94,0x1a,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x0a,0x01,0xff,0xe2,0x86,0x94,0xcc,0xb8,
+	0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0x2e,0x94,0x2a,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0xd1,0x0e,0x10,0x04,0x01,0x00,0x01,0xff,0xe2,0x87,0x90,0xcc,0xb8,
+	0x00,0x10,0x0a,0x01,0xff,0xe2,0x87,0x94,0xcc,0xb8,0x00,0x01,0xff,0xe2,0x87,0x92,
+	0xcc,0xb8,0x00,0x01,0x00,0xd4,0x14,0x53,0x04,0x01,0x00,0x92,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x01,0x00,0x04,0x00,0x04,0x00,0x93,0x08,0x12,0x04,0x04,0x00,0x06,
+	0x00,0x06,0x00,0xe2,0x38,0x02,0xe1,0x3f,0x01,0xd0,0x68,0xcf,0x86,0xd5,0x3e,0x94,
+	0x3a,0xd3,0x16,0x52,0x04,0x01,0x00,0x91,0x0e,0x10,0x0a,0x01,0xff,0xe2,0x88,0x83,
+	0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0xd2,0x12,0x91,0x0e,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xe2,0x88,0x88,0xcc,0xb8,0x00,0x01,0x00,0x91,0x0e,0x10,0x0a,0x01,0xff,0xe2,
+	0x88,0x8b,0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x94,0x24,0x93,0x20,0x52,
+	0x04,0x01,0x00,0xd1,0x0e,0x10,0x0a,0x01,0xff,0xe2,0x88,0xa3,0xcc,0xb8,0x00,0x01,
+	0x00,0x10,0x0a,0x01,0xff,0xe2,0x88,0xa5,0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0x01,
+	0x00,0xcf,0x86,0xd5,0x48,0x94,0x44,0xd3,0x2e,0xd2,0x12,0x91,0x0e,0x10,0x04,0x01,
+	0x00,0x01,0xff,0xe2,0x88,0xbc,0xcc,0xb8,0x00,0x01,0x00,0xd1,0x0e,0x10,0x0a,0x01,
+	0xff,0xe2,0x89,0x83,0xcc,0xb8,0x00,0x01,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xe2,
+	0x89,0x85,0xcc,0xb8,0x00,0x92,0x12,0x91,0x0e,0x10,0x04,0x01,0x00,0x01,0xff,0xe2,
+	0x89,0x88,0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x40,0xd3,0x1e,0x92,
+	0x1a,0xd1,0x0c,0x10,0x08,0x01,0xff,0x3d,0xcc,0xb8,0x00,0x01,0x00,0x10,0x0a,0x01,
+	0xff,0xe2,0x89,0xa1,0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0xd1,
+	0x0e,0x10,0x04,0x01,0x00,0x01,0xff,0xe2,0x89,0x8d,0xcc,0xb8,0x00,0x10,0x08,0x01,
+	0xff,0x3c,0xcc,0xb8,0x00,0x01,0xff,0x3e,0xcc,0xb8,0x00,0xd3,0x30,0xd2,0x18,0x91,
+	0x14,0x10,0x0a,0x01,0xff,0xe2,0x89,0xa4,0xcc,0xb8,0x00,0x01,0xff,0xe2,0x89,0xa5,
+	0xcc,0xb8,0x00,0x01,0x00,0x91,0x14,0x10,0x0a,0x01,0xff,0xe2,0x89,0xb2,0xcc,0xb8,
+	0x00,0x01,0xff,0xe2,0x89,0xb3,0xcc,0xb8,0x00,0x01,0x00,0x92,0x18,0x91,0x14,0x10,
+	0x0a,0x01,0xff,0xe2,0x89,0xb6,0xcc,0xb8,0x00,0x01,0xff,0xe2,0x89,0xb7,0xcc,0xb8,
+	0x00,0x01,0x00,0x01,0x00,0xd0,0x86,0xcf,0x86,0xd5,0x50,0x94,0x4c,0xd3,0x30,0xd2,
+	0x18,0x91,0x14,0x10,0x0a,0x01,0xff,0xe2,0x89,0xba,0xcc,0xb8,0x00,0x01,0xff,0xe2,
+	0x89,0xbb,0xcc,0xb8,0x00,0x01,0x00,0x91,0x14,0x10,0x0a,0x01,0xff,0xe2,0x8a,0x82,
+	0xcc,0xb8,0x00,0x01,0xff,0xe2,0x8a,0x83,0xcc,0xb8,0x00,0x01,0x00,0x92,0x18,0x91,
+	0x14,0x10,0x0a,0x01,0xff,0xe2,0x8a,0x86,0xcc,0xb8,0x00,0x01,0xff,0xe2,0x8a,0x87,
+	0xcc,0xb8,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x94,0x30,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0xd1,0x14,0x10,0x0a,0x01,0xff,0xe2,0x8a,0xa2,0xcc,0xb8,0x00,0x01,
+	0xff,0xe2,0x8a,0xa8,0xcc,0xb8,0x00,0x10,0x0a,0x01,0xff,0xe2,0x8a,0xa9,0xcc,0xb8,
+	0x00,0x01,0xff,0xe2,0x8a,0xab,0xcc,0xb8,0x00,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,
+	0x00,0xd4,0x5c,0xd3,0x2c,0x92,0x28,0xd1,0x14,0x10,0x0a,0x01,0xff,0xe2,0x89,0xbc,
+	0xcc,0xb8,0x00,0x01,0xff,0xe2,0x89,0xbd,0xcc,0xb8,0x00,0x10,0x0a,0x01,0xff,0xe2,
+	0x8a,0x91,0xcc,0xb8,0x00,0x01,0xff,0xe2,0x8a,0x92,0xcc,0xb8,0x00,0x01,0x00,0xd2,
+	0x18,0x51,0x04,0x01,0x00,0x10,0x0a,0x01,0xff,0xe2,0x8a,0xb2,0xcc,0xb8,0x00,0x01,
+	0xff,0xe2,0x8a,0xb3,0xcc,0xb8,0x00,0x91,0x14,0x10,0x0a,0x01,0xff,0xe2,0x8a,0xb4,
+	0xcc,0xb8,0x00,0x01,0xff,0xe2,0x8a,0xb5,0xcc,0xb8,0x00,0x01,0x00,0x93,0x0c,0x92,
+	0x08,0x11,0x04,0x01,0x00,0x06,0x00,0x06,0x00,0x06,0x00,0xd1,0x64,0xd0,0x3e,0xcf,
+	0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,0x00,0x04,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x94,0x20,0x53,0x04,0x01,0x00,0x92,
+	0x18,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x80,0x88,0x00,0x10,0x08,0x01,
+	0xff,0xe3,0x80,0x89,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,
+	0x00,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x04,0x00,0x91,0x08,0x10,0x04,0x06,0x00,0x04,0x00,0x04,0x00,0xd0,
+	0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,0x04,0x00,0x53,0x04,0x04,0x00,0x92,0x0c,0x51,
+	0x04,0x04,0x00,0x10,0x04,0x04,0x00,0x06,0x00,0x06,0x00,0x06,0x00,0xcf,0x86,0xd5,
+	0x2c,0xd4,0x14,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0x51,0x04,0x06,0x00,0x10,
+	0x04,0x06,0x00,0x07,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x07,0x00,0x08,
+	0x00,0x08,0x00,0x08,0x00,0x12,0x04,0x08,0x00,0x09,0x00,0xd4,0x14,0x53,0x04,0x09,
+	0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x0b,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0xd3,
+	0x08,0x12,0x04,0x0c,0x00,0x10,0x00,0xd2,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,0x10,
+	0x00,0x12,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x13,0x00,0xd3,0xa6,0xd2,
+	0x74,0xd1,0x40,0xd0,0x22,0xcf,0x86,0x55,0x04,0x01,0x00,0x94,0x18,0x93,0x14,0x52,
+	0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x04,0x00,0x10,0x04,0x04,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,0x53,0x04,0x01,0x00,0x92,
+	0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+	0x00,0xd0,0x06,0xcf,0x06,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x14,0x53,
+	0x04,0x01,0x00,0x92,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x06,0x00,0x06,
+	0x00,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0x51,0x04,0x06,0x00,0x10,0x04,0x06,
+	0x00,0x07,0x00,0xd1,0x06,0xcf,0x06,0x01,0x00,0xd0,0x1a,0xcf,0x86,0x95,0x14,0x54,
+	0x04,0x01,0x00,0x93,0x0c,0x52,0x04,0x01,0x00,0x11,0x04,0x01,0x00,0x06,0x00,0x06,
+	0x00,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0x54,0x04,0x01,0x00,0x13,0x04,0x04,
+	0x00,0x06,0x00,0xd2,0xdc,0xd1,0x48,0xd0,0x26,0xcf,0x86,0x95,0x20,0x54,0x04,0x01,
+	0x00,0xd3,0x0c,0x52,0x04,0x01,0x00,0x11,0x04,0x07,0x00,0x06,0x00,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x08,0x00,0x04,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0x55,
+	0x04,0x01,0x00,0x54,0x04,0x01,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,0x04,0x00,0x06,
+	0x00,0x06,0x00,0x52,0x04,0x06,0x00,0x11,0x04,0x06,0x00,0x08,0x00,0xd0,0x5e,0xcf,
+	0x86,0xd5,0x2c,0xd4,0x10,0x53,0x04,0x06,0x00,0x92,0x08,0x11,0x04,0x06,0x00,0x07,
+	0x00,0x07,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,0x07,0x00,0x08,0x00,0x08,0x00,0x52,
+	0x04,0x08,0x00,0x91,0x08,0x10,0x04,0x08,0x00,0x0a,0x00,0x0b,0x00,0xd4,0x10,0x93,
+	0x0c,0x92,0x08,0x11,0x04,0x07,0x00,0x08,0x00,0x08,0x00,0x08,0x00,0xd3,0x10,0x92,
+	0x0c,0x51,0x04,0x08,0x00,0x10,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0x52,0x04,0x0a,
+	0x00,0x91,0x08,0x10,0x04,0x0a,0x00,0x0b,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x1c,0x94,
+	0x18,0xd3,0x08,0x12,0x04,0x0a,0x00,0x0b,0x00,0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,
+	0x00,0x10,0x04,0x0c,0x00,0x0b,0x00,0x0b,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x51,
+	0x04,0x0b,0x00,0x10,0x04,0x0c,0x00,0x0b,0x00,0x0c,0x00,0x0b,0x00,0x0b,0x00,0xd1,
+	0xa8,0xd0,0x42,0xcf,0x86,0xd5,0x28,0x94,0x24,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,
+	0x04,0x10,0x00,0x01,0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x0c,0x00,0x01,
+	0x00,0x92,0x08,0x11,0x04,0x01,0x00,0x0c,0x00,0x01,0x00,0x01,0x00,0x94,0x14,0x53,
+	0x04,0x01,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x01,0x00,0x01,0x00,0x01,
+	0x00,0x01,0x00,0xcf,0x86,0xd5,0x40,0xd4,0x18,0x53,0x04,0x01,0x00,0x52,0x04,0x01,
+	0x00,0xd1,0x08,0x10,0x04,0x0c,0x00,0x01,0x00,0x10,0x04,0x0c,0x00,0x01,0x00,0xd3,
+	0x18,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x0c,0x00,0x51,0x04,0x0c,
+	0x00,0x10,0x04,0x01,0x00,0x0b,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x0c,0x00,0xd4,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x06,0x00,0x93,0x0c,0x52,0x04,0x06,0x00,0x11,
+	0x04,0x06,0x00,0x01,0x00,0x01,0x00,0xd0,0x3e,0xcf,0x86,0xd5,0x18,0x54,0x04,0x01,
+	0x00,0x93,0x10,0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x0c,0x00,0x0c,
+	0x00,0x01,0x00,0x54,0x04,0x01,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,
+	0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,
+	0x04,0x01,0x00,0x0c,0x00,0xcf,0x86,0xd5,0x2c,0x94,0x28,0xd3,0x10,0x52,0x04,0x08,
+	0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x09,0x00,0xd2,0x0c,0x51,0x04,0x09,
+	0x00,0x10,0x04,0x09,0x00,0x0d,0x00,0x91,0x08,0x10,0x04,0x0a,0x00,0x0d,0x00,0x0c,
+	0x00,0x06,0x00,0x94,0x0c,0x53,0x04,0x06,0x00,0x12,0x04,0x06,0x00,0x0a,0x00,0x06,
+	0x00,0xe4,0x39,0x01,0xd3,0x0c,0xd2,0x06,0xcf,0x06,0x04,0x00,0xcf,0x06,0x06,0x00,
+	0xd2,0x30,0xd1,0x06,0xcf,0x06,0x06,0x00,0xd0,0x06,0xcf,0x06,0x06,0x00,0xcf,0x86,
+	0x95,0x1e,0x54,0x04,0x06,0x00,0x53,0x04,0x06,0x00,0x52,0x04,0x06,0x00,0x91,0x0e,
+	0x10,0x0a,0x06,0xff,0xe2,0xab,0x9d,0xcc,0xb8,0x00,0x06,0x00,0x06,0x00,0x06,0x00,
+	0xd1,0x80,0xd0,0x3a,0xcf,0x86,0xd5,0x28,0xd4,0x10,0x53,0x04,0x07,0x00,0x52,0x04,
+	0x07,0x00,0x11,0x04,0x07,0x00,0x08,0x00,0xd3,0x08,0x12,0x04,0x08,0x00,0x09,0x00,
+	0x92,0x0c,0x51,0x04,0x09,0x00,0x10,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0x94,0x0c,
+	0x93,0x08,0x12,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,0xcf,0x86,0xd5,0x30,
+	0xd4,0x14,0x53,0x04,0x0a,0x00,0x52,0x04,0x0a,0x00,0x91,0x08,0x10,0x04,0x0a,0x00,
+	0x10,0x00,0x10,0x00,0xd3,0x10,0x52,0x04,0x0a,0x00,0x91,0x08,0x10,0x04,0x0a,0x00,
+	0x0b,0x00,0x0b,0x00,0x92,0x08,0x11,0x04,0x0b,0x00,0x10,0x00,0x10,0x00,0x54,0x04,
+	0x10,0x00,0x93,0x0c,0x52,0x04,0x10,0x00,0x11,0x04,0x00,0x00,0x10,0x00,0x10,0x00,
+	0xd0,0x32,0xcf,0x86,0xd5,0x14,0x54,0x04,0x10,0x00,0x93,0x0c,0x52,0x04,0x10,0x00,
+	0x11,0x04,0x10,0x00,0x00,0x00,0x10,0x00,0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,
+	0xd2,0x08,0x11,0x04,0x10,0x00,0x14,0x00,0x91,0x08,0x10,0x04,0x14,0x00,0x10,0x00,
+	0x10,0x00,0xcf,0x86,0xd5,0x28,0xd4,0x14,0x53,0x04,0x10,0x00,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x10,0x00,0x15,0x00,0x10,0x00,0x10,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,
+	0x10,0x00,0x10,0x04,0x13,0x00,0x14,0x00,0x14,0x00,0x14,0x00,0xd4,0x0c,0x53,0x04,
+	0x14,0x00,0x12,0x04,0x14,0x00,0x11,0x00,0x53,0x04,0x14,0x00,0x52,0x04,0x14,0x00,
+	0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x15,0x00,0xe3,0xb9,0x01,0xd2,0xac,0xd1,
+	0x68,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x08,0x00,0x94,0x14,0x53,0x04,0x08,0x00,0x52,
+	0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,0x00,0x08,0x00,0xcf,
+	0x86,0xd5,0x18,0x54,0x04,0x08,0x00,0x53,0x04,0x08,0x00,0x52,0x04,0x08,0x00,0x51,
+	0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,0x00,0xd4,0x14,0x53,0x04,0x09,0x00,0x52,
+	0x04,0x09,0x00,0x91,0x08,0x10,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0xd3,0x10,0x92,
+	0x0c,0x91,0x08,0x10,0x04,0x0b,0x00,0x0a,0x00,0x0a,0x00,0x09,0x00,0x52,0x04,0x0a,
+	0x00,0x11,0x04,0x0a,0x00,0x0b,0x00,0xd0,0x06,0xcf,0x06,0x08,0x00,0xcf,0x86,0x55,
+	0x04,0x08,0x00,0xd4,0x1c,0x53,0x04,0x08,0x00,0xd2,0x0c,0x51,0x04,0x08,0x00,0x10,
+	0x04,0x08,0x00,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,0x0b,0xe6,0xd3,
+	0x0c,0x92,0x08,0x11,0x04,0x0b,0xe6,0x0d,0x00,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x08,0x00,0x08,0x00,0x08,0x00,0xd1,0x6c,0xd0,0x2a,0xcf,0x86,0x55,
+	0x04,0x08,0x00,0x94,0x20,0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,
+	0x04,0x00,0x00,0x0d,0x00,0x52,0x04,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x0d,
+	0x00,0x00,0x00,0x08,0x00,0xcf,0x86,0x55,0x04,0x08,0x00,0xd4,0x1c,0xd3,0x0c,0x52,
+	0x04,0x08,0x00,0x11,0x04,0x08,0x00,0x0d,0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,
+	0x00,0x10,0x04,0x00,0x00,0x08,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,
+	0x04,0x00,0x00,0x0c,0x09,0xd0,0x5a,0xcf,0x86,0xd5,0x18,0x54,0x04,0x08,0x00,0x93,
+	0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,0x00,0x00,
+	0x00,0xd4,0x20,0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,
+	0x00,0x00,0x00,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,
+	0x00,0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,
+	0x00,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,0x00,0xcf,
+	0x86,0x95,0x40,0xd4,0x20,0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,
+	0x04,0x08,0x00,0x00,0x00,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,
+	0x00,0x00,0x00,0xd3,0x10,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,
+	0x00,0x00,0x00,0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x00,
+	0x00,0x0a,0xe6,0xd2,0x9c,0xd1,0x68,0xd0,0x32,0xcf,0x86,0xd5,0x14,0x54,0x04,0x08,
+	0x00,0x53,0x04,0x08,0x00,0x52,0x04,0x0a,0x00,0x11,0x04,0x08,0x00,0x0a,0x00,0x54,
+	0x04,0x0a,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0a,0x00,0x0b,0x00,0x0d,
+	0x00,0x0d,0x00,0x12,0x04,0x0d,0x00,0x10,0x00,0xcf,0x86,0x95,0x30,0x94,0x2c,0xd3,
+	0x18,0xd2,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x12,0x00,0x91,0x08,0x10,
+	0x04,0x12,0x00,0x13,0x00,0x13,0x00,0xd2,0x08,0x11,0x04,0x13,0x00,0x14,0x00,0x51,
+	0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x15,0x00,0x00,0x00,0x00,0x00,0xd0,0x1e,0xcf,
+	0x86,0x95,0x18,0x54,0x04,0x04,0x00,0x53,0x04,0x04,0x00,0x92,0x0c,0x51,0x04,0x04,
+	0x00,0x10,0x04,0x00,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0xcf,0x86,0x55,0x04,0x04,
+	0x00,0x54,0x04,0x04,0x00,0x93,0x08,0x12,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0xd1,
+	0x06,0xcf,0x06,0x04,0x00,0xd0,0x06,0xcf,0x06,0x04,0x00,0xcf,0x86,0xd5,0x14,0x54,
+	0x04,0x04,0x00,0x93,0x0c,0x52,0x04,0x04,0x00,0x11,0x04,0x04,0x00,0x00,0x00,0x00,
+	0x00,0x54,0x04,0x00,0x00,0x53,0x04,0x04,0x00,0x12,0x04,0x04,0x00,0x00,0x00,0xcf,
+	0x86,0xe5,0xa6,0x05,0xe4,0x9f,0x05,0xe3,0x96,0x04,0xe2,0xe4,0x03,0xe1,0xc0,0x01,
+	0xd0,0x3e,0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x1c,0x53,0x04,0x01,0x00,0xd2,0x0c,
+	0x51,0x04,0x01,0x00,0x10,0x04,0x01,0xda,0x01,0xe4,0x91,0x08,0x10,0x04,0x01,0xe8,
+	0x01,0xde,0x01,0xe0,0x53,0x04,0x01,0x00,0xd2,0x0c,0x51,0x04,0x04,0x00,0x10,0x04,
+	0x04,0x00,0x06,0x00,0x51,0x04,0x06,0x00,0x10,0x04,0x04,0x00,0x01,0x00,0xcf,0x86,
+	0xd5,0xaa,0xd4,0x32,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,
+	0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,
+	0x8b,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0x8d,0xe3,0x82,
+	0x99,0x00,0x01,0x00,0xd3,0x3c,0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,
+	0x8f,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0x91,0xe3,0x82,
+	0x99,0x00,0x01,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,0x93,0xe3,0x82,0x99,
+	0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0x95,0xe3,0x82,0x99,0x00,0x01,0x00,
+	0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,0x97,0xe3,0x82,0x99,0x00,0x01,
+	0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0x99,0xe3,0x82,0x99,0x00,0x01,0x00,0xd1,0x0f,
+	0x10,0x0b,0x01,0xff,0xe3,0x81,0x9b,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,0x01,
+	0xff,0xe3,0x81,0x9d,0xe3,0x82,0x99,0x00,0x01,0x00,0xd4,0x53,0xd3,0x3c,0xd2,0x1e,
+	0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,0x9f,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,
+	0x0b,0x01,0xff,0xe3,0x81,0xa1,0xe3,0x82,0x99,0x00,0x01,0x00,0xd1,0x0f,0x10,0x04,
+	0x01,0x00,0x01,0xff,0xe3,0x81,0xa4,0xe3,0x82,0x99,0x00,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xe3,0x81,0xa6,0xe3,0x82,0x99,0x00,0x92,0x13,0x91,0x0f,0x10,0x04,0x01,0x00,
+	0x01,0xff,0xe3,0x81,0xa8,0xe3,0x82,0x99,0x00,0x01,0x00,0x01,0x00,0xd3,0x4a,0xd2,
+	0x25,0xd1,0x16,0x10,0x0b,0x01,0xff,0xe3,0x81,0xaf,0xe3,0x82,0x99,0x00,0x01,0xff,
+	0xe3,0x81,0xaf,0xe3,0x82,0x9a,0x00,0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x81,0xb2,
+	0xe3,0x82,0x99,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x81,0xb2,0xe3,0x82,0x9a,
+	0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0xb5,0xe3,0x82,0x99,0x00,0x01,0xff,
+	0xe3,0x81,0xb5,0xe3,0x82,0x9a,0x00,0xd2,0x1e,0xd1,0x0f,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xe3,0x81,0xb8,0xe3,0x82,0x99,0x00,0x10,0x0b,0x01,0xff,0xe3,0x81,0xb8,0xe3,
+	0x82,0x9a,0x00,0x01,0x00,0x91,0x16,0x10,0x0b,0x01,0xff,0xe3,0x81,0xbb,0xe3,0x82,
+	0x99,0x00,0x01,0xff,0xe3,0x81,0xbb,0xe3,0x82,0x9a,0x00,0x01,0x00,0xd0,0xee,0xcf,
+	0x86,0xd5,0x42,0x54,0x04,0x01,0x00,0xd3,0x1b,0x52,0x04,0x01,0x00,0xd1,0x0f,0x10,
+	0x0b,0x01,0xff,0xe3,0x81,0x86,0xe3,0x82,0x99,0x00,0x06,0x00,0x10,0x04,0x06,0x00,
+	0x00,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x01,0x08,0x10,0x04,0x01,0x08,
+	0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x82,0x9d,0xe3,0x82,0x99,
+	0x00,0x06,0x00,0xd4,0x32,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x06,0x00,0x01,
+	0x00,0x01,0x00,0x01,0x00,0x52,0x04,0x01,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,
+	0x82,0xab,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x82,0xad,0xe3,
+	0x82,0x99,0x00,0x01,0x00,0xd3,0x3c,0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,
+	0x82,0xaf,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x82,0xb1,0xe3,
+	0x82,0x99,0x00,0x01,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x82,0xb3,0xe3,0x82,
+	0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x82,0xb5,0xe3,0x82,0x99,0x00,0x01,
+	0x00,0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x82,0xb7,0xe3,0x82,0x99,0x00,
+	0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x82,0xb9,0xe3,0x82,0x99,0x00,0x01,0x00,0xd1,
+	0x0f,0x10,0x0b,0x01,0xff,0xe3,0x82,0xbb,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x0b,
+	0x01,0xff,0xe3,0x82,0xbd,0xe3,0x82,0x99,0x00,0x01,0x00,0xcf,0x86,0xd5,0xd5,0xd4,
+	0x53,0xd3,0x3c,0xd2,0x1e,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,0x82,0xbf,0xe3,0x82,
+	0x99,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x83,0x81,0xe3,0x82,0x99,0x00,0x01,
+	0x00,0xd1,0x0f,0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x83,0x84,0xe3,0x82,0x99,0x00,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x83,0x86,0xe3,0x82,0x99,0x00,0x92,0x13,0x91,
+	0x0f,0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x83,0x88,0xe3,0x82,0x99,0x00,0x01,0x00,
+	0x01,0x00,0xd3,0x4a,0xd2,0x25,0xd1,0x16,0x10,0x0b,0x01,0xff,0xe3,0x83,0x8f,0xe3,
+	0x82,0x99,0x00,0x01,0xff,0xe3,0x83,0x8f,0xe3,0x82,0x9a,0x00,0x10,0x04,0x01,0x00,
+	0x01,0xff,0xe3,0x83,0x92,0xe3,0x82,0x99,0x00,0xd1,0x0f,0x10,0x0b,0x01,0xff,0xe3,
+	0x83,0x92,0xe3,0x82,0x9a,0x00,0x01,0x00,0x10,0x0b,0x01,0xff,0xe3,0x83,0x95,0xe3,
+	0x82,0x99,0x00,0x01,0xff,0xe3,0x83,0x95,0xe3,0x82,0x9a,0x00,0xd2,0x1e,0xd1,0x0f,
+	0x10,0x04,0x01,0x00,0x01,0xff,0xe3,0x83,0x98,0xe3,0x82,0x99,0x00,0x10,0x0b,0x01,
+	0xff,0xe3,0x83,0x98,0xe3,0x82,0x9a,0x00,0x01,0x00,0x91,0x16,0x10,0x0b,0x01,0xff,
+	0xe3,0x83,0x9b,0xe3,0x82,0x99,0x00,0x01,0xff,0xe3,0x83,0x9b,0xe3,0x82,0x9a,0x00,
+	0x01,0x00,0x54,0x04,0x01,0x00,0xd3,0x22,0x52,0x04,0x01,0x00,0xd1,0x0f,0x10,0x0b,
+	0x01,0xff,0xe3,0x82,0xa6,0xe3,0x82,0x99,0x00,0x01,0x00,0x10,0x04,0x01,0x00,0x01,
+	0xff,0xe3,0x83,0xaf,0xe3,0x82,0x99,0x00,0xd2,0x25,0xd1,0x16,0x10,0x0b,0x01,0xff,
+	0xe3,0x83,0xb0,0xe3,0x82,0x99,0x00,0x01,0xff,0xe3,0x83,0xb1,0xe3,0x82,0x99,0x00,
+	0x10,0x0b,0x01,0xff,0xe3,0x83,0xb2,0xe3,0x82,0x99,0x00,0x01,0x00,0x51,0x04,0x01,
+	0x00,0x10,0x0b,0x01,0xff,0xe3,0x83,0xbd,0xe3,0x82,0x99,0x00,0x06,0x00,0xd1,0x65,
+	0xd0,0x46,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x52,0x04,0x00,0x00,0x91,0x08,
+	0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd4,0x18,0x53,0x04,
+	0x01,0x00,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x0a,0x00,0x10,0x04,
+	0x13,0x00,0x14,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,
+	0x01,0x00,0x01,0x00,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0x94,0x15,0x93,0x11,
+	0x52,0x04,0x01,0x00,0x91,0x09,0x10,0x05,0x01,0xff,0x00,0x01,0x00,0x01,0x00,0x01,
+	0x00,0x01,0x00,0xd0,0x32,0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x54,
+	0x04,0x04,0x00,0x53,0x04,0x04,0x00,0x92,0x0c,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x08,0x14,0x04,0x08,0x00,0x0a,0x00,0x94,
+	0x0c,0x93,0x08,0x12,0x04,0x0a,0x00,0x00,0x00,0x00,0x00,0x06,0x00,0xd2,0xa4,0xd1,
+	0x5c,0xd0,0x22,0xcf,0x86,0x95,0x1c,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0x52,
+	0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x07,0x00,0x10,0x04,0x07,0x00,0x00,
+	0x00,0x01,0x00,0xcf,0x86,0xd5,0x20,0xd4,0x0c,0x93,0x08,0x12,0x04,0x01,0x00,0x0b,
+	0x00,0x0b,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x07,0x00,0x06,0x00,0x06,
+	0x00,0x06,0x00,0x06,0x00,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,0x52,0x04,0x01,
+	0x00,0x51,0x04,0x07,0x00,0x10,0x04,0x08,0x00,0x01,0x00,0xd0,0x1e,0xcf,0x86,0x55,
+	0x04,0x01,0x00,0x54,0x04,0x01,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x01,
+	0x00,0x06,0x00,0x06,0x00,0x06,0x00,0x06,0x00,0xcf,0x86,0xd5,0x10,0x94,0x0c,0x53,
+	0x04,0x01,0x00,0x12,0x04,0x01,0x00,0x07,0x00,0x01,0x00,0x54,0x04,0x01,0x00,0x53,
+	0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x16,
+	0x00,0xd1,0x30,0xd0,0x06,0xcf,0x06,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0x54,
+	0x04,0x01,0x00,0xd3,0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x07,0x00,0x92,0x0c,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,0x01,0x00,0x01,
+	0x00,0xd0,0x06,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x14,0x54,0x04,0x01,0x00,0x53,
+	0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x11,0x04,0x01,0x00,0x07,0x00,0x54,0x04,0x01,
+	0x00,0x53,0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x07,0x00,0xcf,0x06,0x04,0x00,0xcf,0x06,0x04,0x00,0xd1,0x48,0xd0,0x40,0xcf,
+	0x86,0xd5,0x06,0xcf,0x06,0x04,0x00,0xd4,0x06,0xcf,0x06,0x04,0x00,0xd3,0x2c,0xd2,
+	0x06,0xcf,0x06,0x04,0x00,0xd1,0x06,0xcf,0x06,0x04,0x00,0xd0,0x1a,0xcf,0x86,0x55,
+	0x04,0x04,0x00,0x54,0x04,0x04,0x00,0x93,0x0c,0x52,0x04,0x04,0x00,0x11,0x04,0x04,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x07,0x00,0xcf,0x06,0x01,0x00,0xcf,0x86,0xcf,
+	0x06,0x01,0x00,0xcf,0x86,0xcf,0x06,0x01,0x00,0xe2,0x71,0x05,0xd1,0x8c,0xd0,0x08,
+	0xcf,0x86,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x01,0x00,0xd4,0x06,
+	0xcf,0x06,0x01,0x00,0xd3,0x06,0xcf,0x06,0x01,0x00,0xd2,0x06,0xcf,0x06,0x01,0x00,
+	0xd1,0x06,0xcf,0x06,0x01,0x00,0xd0,0x22,0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x10,
+	0x93,0x0c,0x52,0x04,0x01,0x00,0x11,0x04,0x01,0x00,0x08,0x00,0x08,0x00,0x53,0x04,
+	0x08,0x00,0x12,0x04,0x08,0x00,0x0a,0x00,0xcf,0x86,0xd5,0x28,0xd4,0x18,0xd3,0x08,
+	0x12,0x04,0x0a,0x00,0x0b,0x00,0x52,0x04,0x0b,0x00,0x91,0x08,0x10,0x04,0x0d,0x00,
+	0x11,0x00,0x11,0x00,0x93,0x0c,0x52,0x04,0x11,0x00,0x11,0x04,0x11,0x00,0x13,0x00,
+	0x13,0x00,0x94,0x14,0x53,0x04,0x13,0x00,0x92,0x0c,0x51,0x04,0x13,0x00,0x10,0x04,
+	0x13,0x00,0x14,0x00,0x14,0x00,0x00,0x00,0xe0,0xdb,0x04,0xcf,0x86,0xe5,0xdf,0x01,
+	0xd4,0x06,0xcf,0x06,0x04,0x00,0xd3,0x74,0xd2,0x6e,0xd1,0x06,0xcf,0x06,0x04,0x00,
+	0xd0,0x3e,0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x04,0x00,0x52,0x04,0x04,0x00,
+	0x91,0x08,0x10,0x04,0x04,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0xd4,0x10,0x93,0x0c,
+	0x92,0x08,0x11,0x04,0x04,0x00,0x06,0x00,0x04,0x00,0x04,0x00,0x93,0x10,0x52,0x04,
+	0x04,0x00,0x91,0x08,0x10,0x04,0x06,0x00,0x04,0x00,0x04,0x00,0x04,0x00,0xcf,0x86,
+	0x95,0x24,0x94,0x20,0x93,0x1c,0xd2,0x0c,0x91,0x08,0x10,0x04,0x04,0x00,0x06,0x00,
+	0x04,0x00,0xd1,0x08,0x10,0x04,0x04,0x00,0x06,0x00,0x10,0x04,0x04,0x00,0x00,0x00,
+	0x00,0x00,0x0b,0x00,0x0b,0x00,0xcf,0x06,0x0a,0x00,0xd2,0x84,0xd1,0x4c,0xd0,0x16,
+	0xcf,0x86,0x55,0x04,0x0a,0x00,0x94,0x0c,0x53,0x04,0x0a,0x00,0x12,0x04,0x0a,0x00,
+	0x00,0x00,0x00,0x00,0xcf,0x86,0x55,0x04,0x0a,0x00,0xd4,0x1c,0xd3,0x0c,0x92,0x08,
+	0x11,0x04,0x0c,0x00,0x0a,0x00,0x0a,0x00,0x52,0x04,0x0a,0x00,0x51,0x04,0x0a,0x00,
+	0x10,0x04,0x0a,0x00,0x0a,0xe6,0xd3,0x08,0x12,0x04,0x0a,0x00,0x0d,0xe6,0x52,0x04,
+	0x0d,0xe6,0x11,0x04,0x0a,0xe6,0x0a,0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,
+	0x0a,0x00,0x53,0x04,0x0a,0x00,0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,
+	0x11,0xe6,0x0d,0xe6,0x0b,0x00,0xcf,0x86,0x55,0x04,0x0b,0x00,0x54,0x04,0x0b,0x00,
+	0x93,0x0c,0x92,0x08,0x11,0x04,0x0b,0xe6,0x0b,0x00,0x0b,0x00,0x00,0x00,0xd1,0x40,
+	0xd0,0x3a,0xcf,0x86,0xd5,0x24,0x54,0x04,0x08,0x00,0xd3,0x10,0x52,0x04,0x08,0x00,
+	0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x09,0x00,0x92,0x0c,0x51,0x04,0x09,0x00,
+	0x10,0x04,0x09,0x00,0x0a,0x00,0x0a,0x00,0x94,0x10,0x93,0x0c,0x92,0x08,0x11,0x04,
+	0x09,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,0x0a,0x00,0xcf,0x06,0x0a,0x00,0xd0,0x5e,
+	0xcf,0x86,0xd5,0x28,0xd4,0x18,0x53,0x04,0x0a,0x00,0x52,0x04,0x0a,0x00,0xd1,0x08,
+	0x10,0x04,0x0a,0x00,0x0c,0x00,0x10,0x04,0x0c,0x00,0x11,0x00,0x93,0x0c,0x92,0x08,
+	0x11,0x04,0x0c,0x00,0x0d,0x00,0x10,0x00,0x10,0x00,0xd4,0x1c,0x53,0x04,0x0c,0x00,
+	0xd2,0x0c,0x51,0x04,0x0c,0x00,0x10,0x04,0x0d,0x00,0x10,0x00,0x51,0x04,0x10,0x00,
+	0x10,0x04,0x12,0x00,0x14,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,0x10,0x00,0x11,0x00,
+	0x11,0x00,0x92,0x08,0x11,0x04,0x14,0x00,0x15,0x00,0x15,0x00,0xcf,0x86,0xd5,0x1c,
+	0x94,0x18,0x93,0x14,0xd2,0x08,0x11,0x04,0x00,0x00,0x15,0x00,0x51,0x04,0x15,0x00,
+	0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x54,0x04,0x00,0x00,0xd3,0x10,
+	0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x92,0x0c,
+	0x51,0x04,0x0d,0x00,0x10,0x04,0x0c,0x00,0x0a,0x00,0x0a,0x00,0xe4,0xf2,0x02,0xe3,
+	0x65,0x01,0xd2,0x98,0xd1,0x48,0xd0,0x36,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,
+	0x52,0x04,0x08,0x00,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x09,0x08,0x00,0x08,0x00,
+	0x08,0x00,0xd4,0x0c,0x53,0x04,0x08,0x00,0x12,0x04,0x08,0x00,0x00,0x00,0x53,0x04,
+	0x0b,0x00,0x92,0x08,0x11,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0x55,0x04,
+	0x09,0x00,0x54,0x04,0x09,0x00,0x13,0x04,0x09,0x00,0x00,0x00,0xd0,0x06,0xcf,0x06,
+	0x0a,0x00,0xcf,0x86,0xd5,0x2c,0xd4,0x1c,0xd3,0x10,0x52,0x04,0x0a,0x00,0x91,0x08,
+	0x10,0x04,0x0a,0x09,0x12,0x00,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,
+	0x0a,0x00,0x53,0x04,0x0a,0x00,0x92,0x08,0x11,0x04,0x0a,0x00,0x00,0x00,0x00,0x00,
+	0x54,0x04,0x0b,0xe6,0xd3,0x0c,0x92,0x08,0x11,0x04,0x0b,0xe6,0x0b,0x00,0x0b,0x00,
+	0x52,0x04,0x0b,0x00,0x11,0x04,0x11,0x00,0x14,0x00,0xd1,0x60,0xd0,0x22,0xcf,0x86,
+	0x55,0x04,0x0a,0x00,0x94,0x18,0x53,0x04,0x0a,0x00,0xd2,0x0c,0x51,0x04,0x0a,0x00,
+	0x10,0x04,0x0a,0x00,0x0a,0xdc,0x11,0x04,0x0a,0xdc,0x0a,0x00,0x0a,0x00,0xcf,0x86,
+	0xd5,0x24,0x54,0x04,0x0a,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x0a,0x00,0x10,0x04,
+	0x0a,0x00,0x0a,0x09,0x00,0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x0a,0x00,0x54,0x04,0x0b,0x00,0x53,0x04,0x0b,0x00,0x52,0x04,0x0b,0x00,
+	0x91,0x08,0x10,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0xd0,0x1e,0xcf,0x86,0x55,0x04,
+	0x0b,0x00,0x54,0x04,0x0b,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x0b,0x00,0x10,0x04,
+	0x0b,0x00,0x0b,0x07,0x0b,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x34,0xd4,0x20,0xd3,0x10,
+	0x92,0x0c,0x91,0x08,0x10,0x04,0x0b,0x09,0x0b,0x00,0x0b,0x00,0x0b,0x00,0x52,0x04,
+	0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x00,0x00,0x0b,0x00,0x53,0x04,0x0b,0x00,
+	0xd2,0x08,0x11,0x04,0x0b,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x0b,0x00,0x54,0x04,
+	0x10,0x00,0x53,0x04,0x10,0x00,0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,
+	0x10,0x00,0x00,0x00,0xd2,0xd0,0xd1,0x50,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x0a,0x00,
+	0x54,0x04,0x0a,0x00,0x93,0x10,0x52,0x04,0x0a,0x00,0x51,0x04,0x0a,0x00,0x10,0x04,
+	0x0a,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x20,0xd4,0x10,0x53,0x04,0x0a,0x00,
+	0x52,0x04,0x0a,0x00,0x11,0x04,0x0a,0x00,0x00,0x00,0x53,0x04,0x0a,0x00,0x92,0x08,
+	0x11,0x04,0x0a,0x00,0x00,0x00,0x0a,0x00,0x54,0x04,0x0b,0x00,0x53,0x04,0x0b,0x00,
+	0x12,0x04,0x0b,0x00,0x10,0x00,0xd0,0x3a,0xcf,0x86,0x55,0x04,0x0b,0x00,0x54,0x04,
+	0x0b,0x00,0xd3,0x1c,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0b,0xe6,0x0b,0x00,0x0b,0xe6,
+	0xd1,0x08,0x10,0x04,0x0b,0xdc,0x0b,0x00,0x10,0x04,0x0b,0x00,0x0b,0xe6,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x0b,0xe6,0x0b,0x00,0x0b,0x00,0x11,0x04,0x0b,0x00,0x0b,0xe6,
+	0xcf,0x86,0xd5,0x2c,0xd4,0x18,0x93,0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x0b,0x00,
+	0x0b,0xe6,0x10,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x00,0x00,
+	0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0b,0x00,0x0b,0x00,0x54,0x04,
+	0x0d,0x00,0x93,0x10,0x52,0x04,0x0d,0x00,0x51,0x04,0x0d,0x00,0x10,0x04,0x0d,0x09,
+	0x00,0x00,0x00,0x00,0xd1,0x8c,0xd0,0x72,0xcf,0x86,0xd5,0x4c,0xd4,0x30,0xd3,0x18,
+	0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,0x0c,0x00,0x51,0x04,0x0c,0x00,
+	0x10,0x04,0x0c,0x00,0x00,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,
+	0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,0x93,0x18,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,
+	0x0c,0x00,0x00,0x00,0x00,0x00,0x94,0x20,0xd3,0x10,0x52,0x04,0x0c,0x00,0x51,0x04,
+	0x0c,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,0x52,0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,
+	0x10,0x04,0x0c,0x00,0x00,0x00,0x10,0x00,0xcf,0x86,0x55,0x04,0x10,0x00,0x94,0x10,
+	0x93,0x0c,0x52,0x04,0x11,0x00,0x11,0x04,0x10,0x00,0x15,0x00,0x00,0x00,0x11,0x00,
+	0xd0,0x06,0xcf,0x06,0x11,0x00,0xcf,0x86,0x55,0x04,0x0b,0x00,0xd4,0x14,0x53,0x04,
+	0x0b,0x00,0x52,0x04,0x0b,0x00,0x91,0x08,0x10,0x04,0x0b,0x00,0x0b,0x09,0x00,0x00,
+	0x53,0x04,0x0b,0x00,0x92,0x08,0x11,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,
+	0x02,0xff,0xff,0xcf,0x86,0xcf,0x06,0x02,0xff,0xff,0xd1,0x76,0xd0,0x09,0xcf,0x86,
+	0xcf,0x06,0x02,0xff,0xff,0xcf,0x86,0x85,0xd4,0x07,0xcf,0x06,0x02,0xff,0xff,0xd3,
+	0x07,0xcf,0x06,0x02,0xff,0xff,0xd2,0x07,0xcf,0x06,0x02,0xff,0xff,0xd1,0x07,0xcf,
+	0x06,0x02,0xff,0xff,0xd0,0x18,0xcf,0x86,0x55,0x05,0x02,0xff,0xff,0x94,0x0d,0x93,
+	0x09,0x12,0x05,0x02,0xff,0xff,0x00,0x00,0x00,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x24,
+	0x94,0x20,0xd3,0x10,0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,
+	0x00,0x00,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0b,0x00,0x0b,0x00,
+	0x0b,0x00,0x54,0x04,0x0b,0x00,0x53,0x04,0x0b,0x00,0x12,0x04,0x0b,0x00,0x00,0x00,
+	0xd0,0x08,0xcf,0x86,0xcf,0x06,0x01,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x01,0x00,
+	0xe4,0x9c,0x10,0xe3,0x16,0x08,0xd2,0x06,0xcf,0x06,0x01,0x00,0xe1,0x08,0x04,0xe0,
+	0x04,0x02,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe8,0xb1,0x88,0x00,0x01,0xff,0xe6,0x9b,0xb4,0x00,0x10,0x08,0x01,
+	0xff,0xe8,0xbb,0x8a,0x00,0x01,0xff,0xe8,0xb3,0x88,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe6,0xbb,0x91,0x00,0x01,0xff,0xe4,0xb8,0xb2,0x00,0x10,0x08,0x01,0xff,0xe5,
+	0x8f,0xa5,0x00,0x01,0xff,0xe9,0xbe,0x9c,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe9,0xbe,0x9c,0x00,0x01,0xff,0xe5,0xa5,0x91,0x00,0x10,0x08,0x01,0xff,0xe9,
+	0x87,0x91,0x00,0x01,0xff,0xe5,0x96,0x87,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,
+	0xa5,0x88,0x00,0x01,0xff,0xe6,0x87,0xb6,0x00,0x10,0x08,0x01,0xff,0xe7,0x99,0xa9,
+	0x00,0x01,0xff,0xe7,0xbe,0x85,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe8,0x98,0xbf,0x00,0x01,0xff,0xe8,0x9e,0xba,0x00,0x10,0x08,0x01,0xff,0xe8,
+	0xa3,0xb8,0x00,0x01,0xff,0xe9,0x82,0x8f,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,
+	0xa8,0x82,0x00,0x01,0xff,0xe6,0xb4,0x9b,0x00,0x10,0x08,0x01,0xff,0xe7,0x83,0x99,
+	0x00,0x01,0xff,0xe7,0x8f,0x9e,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,
+	0x90,0xbd,0x00,0x01,0xff,0xe9,0x85,0xaa,0x00,0x10,0x08,0x01,0xff,0xe9,0xa7,0xb1,
+	0x00,0x01,0xff,0xe4,0xba,0x82,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,0x8d,0xb5,
+	0x00,0x01,0xff,0xe6,0xac,0x84,0x00,0x10,0x08,0x01,0xff,0xe7,0x88,0x9b,0x00,0x01,
+	0xff,0xe8,0x98,0xad,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe9,0xb8,0x9e,0x00,0x01,0xff,0xe5,0xb5,0x90,0x00,0x10,0x08,0x01,0xff,0xe6,
+	0xbf,0xab,0x00,0x01,0xff,0xe8,0x97,0x8d,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,
+	0xa5,0xa4,0x00,0x01,0xff,0xe6,0x8b,0x89,0x00,0x10,0x08,0x01,0xff,0xe8,0x87,0x98,
+	0x00,0x01,0xff,0xe8,0xa0,0x9f,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,
+	0xbb,0x8a,0x00,0x01,0xff,0xe6,0x9c,0x97,0x00,0x10,0x08,0x01,0xff,0xe6,0xb5,0xaa,
+	0x00,0x01,0xff,0xe7,0x8b,0xbc,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe9,0x83,0x8e,
+	0x00,0x01,0xff,0xe4,0xbe,0x86,0x00,0x10,0x08,0x01,0xff,0xe5,0x86,0xb7,0x00,0x01,
+	0xff,0xe5,0x8b,0x9e,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,
+	0x93,0x84,0x00,0x01,0xff,0xe6,0xab,0x93,0x00,0x10,0x08,0x01,0xff,0xe7,0x88,0x90,
+	0x00,0x01,0xff,0xe7,0x9b,0xa7,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,0x80,0x81,
+	0x00,0x01,0xff,0xe8,0x98,0x86,0x00,0x10,0x08,0x01,0xff,0xe8,0x99,0x9c,0x00,0x01,
+	0xff,0xe8,0xb7,0xaf,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe9,0x9c,0xb2,
+	0x00,0x01,0xff,0xe9,0xad,0xaf,0x00,0x10,0x08,0x01,0xff,0xe9,0xb7,0xba,0x00,0x01,
+	0xff,0xe7,0xa2,0x8c,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe7,0xa5,0xbf,0x00,0x01,
+	0xff,0xe7,0xb6,0xa0,0x00,0x10,0x08,0x01,0xff,0xe8,0x8f,0x89,0x00,0x01,0xff,0xe9,
+	0x8c,0x84,0x00,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe9,0xb9,0xbf,0x00,0x01,0xff,0xe8,0xab,0x96,0x00,0x10,0x08,
+	0x01,0xff,0xe5,0xa3,0x9f,0x00,0x01,0xff,0xe5,0xbc,0x84,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe7,0xb1,0xa0,0x00,0x01,0xff,0xe8,0x81,0xbe,0x00,0x10,0x08,0x01,0xff,
+	0xe7,0x89,0xa2,0x00,0x01,0xff,0xe7,0xa3,0x8a,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe8,0xb3,0x82,0x00,0x01,0xff,0xe9,0x9b,0xb7,0x00,0x10,0x08,0x01,0xff,
+	0xe5,0xa3,0x98,0x00,0x01,0xff,0xe5,0xb1,0xa2,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe6,0xa8,0x93,0x00,0x01,0xff,0xe6,0xb7,0x9a,0x00,0x10,0x08,0x01,0xff,0xe6,0xbc,
+	0x8f,0x00,0x01,0xff,0xe7,0xb4,0xaf,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe7,0xb8,0xb7,0x00,0x01,0xff,0xe9,0x99,0x8b,0x00,0x10,0x08,0x01,0xff,
+	0xe5,0x8b,0x92,0x00,0x01,0xff,0xe8,0x82,0x8b,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe5,0x87,0x9c,0x00,0x01,0xff,0xe5,0x87,0x8c,0x00,0x10,0x08,0x01,0xff,0xe7,0xa8,
+	0x9c,0x00,0x01,0xff,0xe7,0xb6,0xbe,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe8,0x8f,0xb1,0x00,0x01,0xff,0xe9,0x99,0xb5,0x00,0x10,0x08,0x01,0xff,0xe8,0xae,
+	0x80,0x00,0x01,0xff,0xe6,0x8b,0x8f,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,0xa8,
+	0x82,0x00,0x01,0xff,0xe8,0xab,0xbe,0x00,0x10,0x08,0x01,0xff,0xe4,0xb8,0xb9,0x00,
+	0x01,0xff,0xe5,0xaf,0xa7,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe6,0x80,0x92,0x00,0x01,0xff,0xe7,0x8e,0x87,0x00,0x10,0x08,0x01,0xff,
+	0xe7,0x95,0xb0,0x00,0x01,0xff,0xe5,0x8c,0x97,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe7,0xa3,0xbb,0x00,0x01,0xff,0xe4,0xbe,0xbf,0x00,0x10,0x08,0x01,0xff,0xe5,0xbe,
+	0xa9,0x00,0x01,0xff,0xe4,0xb8,0x8d,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe6,0xb3,0x8c,0x00,0x01,0xff,0xe6,0x95,0xb8,0x00,0x10,0x08,0x01,0xff,0xe7,0xb4,
+	0xa2,0x00,0x01,0xff,0xe5,0x8f,0x83,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,0xa1,
+	0x9e,0x00,0x01,0xff,0xe7,0x9c,0x81,0x00,0x10,0x08,0x01,0xff,0xe8,0x91,0x89,0x00,
+	0x01,0xff,0xe8,0xaa,0xaa,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe6,0xae,0xba,0x00,0x01,0xff,0xe8,0xbe,0xb0,0x00,0x10,0x08,0x01,0xff,0xe6,0xb2,
+	0x88,0x00,0x01,0xff,0xe6,0x8b,0xbe,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,0x8b,
+	0xa5,0x00,0x01,0xff,0xe6,0x8e,0xa0,0x00,0x10,0x08,0x01,0xff,0xe7,0x95,0xa5,0x00,
+	0x01,0xff,0xe4,0xba,0xae,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,0x85,
+	0xa9,0x00,0x01,0xff,0xe5,0x87,0x89,0x00,0x10,0x08,0x01,0xff,0xe6,0xa2,0x81,0x00,
+	0x01,0xff,0xe7,0xb3,0xa7,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,0x89,0xaf,0x00,
+	0x01,0xff,0xe8,0xab,0x92,0x00,0x10,0x08,0x01,0xff,0xe9,0x87,0x8f,0x00,0x01,0xff,
+	0xe5,0x8b,0xb5,0x00,0xe0,0x04,0x02,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe5,0x91,0x82,0x00,0x01,0xff,0xe5,0xa5,
+	0xb3,0x00,0x10,0x08,0x01,0xff,0xe5,0xbb,0xac,0x00,0x01,0xff,0xe6,0x97,0x85,0x00,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,0xbf,0xbe,0x00,0x01,0xff,0xe7,0xa4,0xaa,0x00,
+	0x10,0x08,0x01,0xff,0xe9,0x96,0xad,0x00,0x01,0xff,0xe9,0xa9,0xaa,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0xe9,0xba,0x97,0x00,0x01,0xff,0xe9,0xbb,0x8e,0x00,
+	0x10,0x08,0x01,0xff,0xe5,0x8a,0x9b,0x00,0x01,0xff,0xe6,0x9b,0x86,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe6,0xad,0xb7,0x00,0x01,0xff,0xe8,0xbd,0xa2,0x00,0x10,0x08,
+	0x01,0xff,0xe5,0xb9,0xb4,0x00,0x01,0xff,0xe6,0x86,0x90,0x00,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,0x88,0x80,0x00,0x01,0xff,0xe6,0x92,0x9a,0x00,
+	0x10,0x08,0x01,0xff,0xe6,0xbc,0xa3,0x00,0x01,0xff,0xe7,0x85,0x89,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe7,0x92,0x89,0x00,0x01,0xff,0xe7,0xa7,0x8a,0x00,0x10,0x08,
+	0x01,0xff,0xe7,0xb7,0xb4,0x00,0x01,0xff,0xe8,0x81,0xaf,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe8,0xbc,0xa6,0x00,0x01,0xff,0xe8,0x93,0xae,0x00,0x10,0x08,
+	0x01,0xff,0xe9,0x80,0xa3,0x00,0x01,0xff,0xe9,0x8d,0x8a,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe5,0x88,0x97,0x00,0x01,0xff,0xe5,0x8a,0xa3,0x00,0x10,0x08,0x01,0xff,
+	0xe5,0x92,0xbd,0x00,0x01,0xff,0xe7,0x83,0x88,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0xe8,0xa3,0x82,0x00,0x01,0xff,0xe8,0xaa,0xaa,0x00,
+	0x10,0x08,0x01,0xff,0xe5,0xbb,0x89,0x00,0x01,0xff,0xe5,0xbf,0xb5,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe6,0x8d,0xbb,0x00,0x01,0xff,0xe6,0xae,0xae,0x00,0x10,0x08,
+	0x01,0xff,0xe7,0xb0,0xbe,0x00,0x01,0xff,0xe7,0x8d,0xb5,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe4,0xbb,0xa4,0x00,0x01,0xff,0xe5,0x9b,0xb9,0x00,0x10,0x08,
+	0x01,0xff,0xe5,0xaf,0xa7,0x00,0x01,0xff,0xe5,0xb6,0xba,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe6,0x80,0x9c,0x00,0x01,0xff,0xe7,0x8e,0xb2,0x00,0x10,0x08,0x01,0xff,
+	0xe7,0x91,0xa9,0x00,0x01,0xff,0xe7,0xbe,0x9a,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe8,0x81,0x86,0x00,0x01,0xff,0xe9,0x88,0xb4,0x00,0x10,0x08,
+	0x01,0xff,0xe9,0x9b,0xb6,0x00,0x01,0xff,0xe9,0x9d,0x88,0x00,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe9,0xa0,0x98,0x00,0x01,0xff,0xe4,0xbe,0x8b,0x00,0x10,0x08,0x01,0xff,
+	0xe7,0xa6,0xae,0x00,0x01,0xff,0xe9,0x86,0xb4,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe9,0x9a,0xb8,0x00,0x01,0xff,0xe6,0x83,0xa1,0x00,0x10,0x08,0x01,0xff,
+	0xe4,0xba,0x86,0x00,0x01,0xff,0xe5,0x83,0x9a,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe5,0xaf,0xae,0x00,0x01,0xff,0xe5,0xb0,0xbf,0x00,0x10,0x08,0x01,0xff,0xe6,0x96,
+	0x99,0x00,0x01,0xff,0xe6,0xa8,0x82,0x00,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe7,0x87,0x8e,0x00,0x01,0xff,0xe7,
+	0x99,0x82,0x00,0x10,0x08,0x01,0xff,0xe8,0x93,0xbc,0x00,0x01,0xff,0xe9,0x81,0xbc,
+	0x00,0xd1,0x10,0x10,0x08,0x01,0xff,0xe9,0xbe,0x8d,0x00,0x01,0xff,0xe6,0x9a,0x88,
+	0x00,0x10,0x08,0x01,0xff,0xe9,0x98,0xae,0x00,0x01,0xff,0xe5,0x8a,0x89,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,0x9d,0xbb,0x00,0x01,0xff,0xe6,0x9f,0xb3,
+	0x00,0x10,0x08,0x01,0xff,0xe6,0xb5,0x81,0x00,0x01,0xff,0xe6,0xba,0x9c,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe7,0x90,0x89,0x00,0x01,0xff,0xe7,0x95,0x99,0x00,0x10,
+	0x08,0x01,0xff,0xe7,0xa1,0xab,0x00,0x01,0xff,0xe7,0xb4,0x90,0x00,0xd3,0x40,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe9,0xa1,0x9e,0x00,0x01,0xff,0xe5,0x85,0xad,
+	0x00,0x10,0x08,0x01,0xff,0xe6,0x88,0xae,0x00,0x01,0xff,0xe9,0x99,0xb8,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe5,0x80,0xab,0x00,0x01,0xff,0xe5,0xb4,0x99,0x00,0x10,
+	0x08,0x01,0xff,0xe6,0xb7,0xaa,0x00,0x01,0xff,0xe8,0xbc,0xaa,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe5,0xbe,0x8b,0x00,0x01,0xff,0xe6,0x85,0x84,0x00,0x10,
+	0x08,0x01,0xff,0xe6,0xa0,0x97,0x00,0x01,0xff,0xe7,0x8e,0x87,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe9,0x9a,0x86,0x00,0x01,0xff,0xe5,0x88,0xa9,0x00,0x10,0x08,0x01,
+	0xff,0xe5,0x90,0x8f,0x00,0x01,0xff,0xe5,0xb1,0xa5,0x00,0xd4,0x80,0xd3,0x40,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x01,0xff,0xe6,0x98,0x93,0x00,0x01,0xff,0xe6,0x9d,0x8e,
+	0x00,0x10,0x08,0x01,0xff,0xe6,0xa2,0xa8,0x00,0x01,0xff,0xe6,0xb3,0xa5,0x00,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe7,0x90,0x86,0x00,0x01,0xff,0xe7,0x97,0xa2,0x00,0x10,
+	0x08,0x01,0xff,0xe7,0xbd,0xb9,0x00,0x01,0xff,0xe8,0xa3,0x8f,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe8,0xa3,0xa1,0x00,0x01,0xff,0xe9,0x87,0x8c,0x00,0x10,
+	0x08,0x01,0xff,0xe9,0x9b,0xa2,0x00,0x01,0xff,0xe5,0x8c,0xbf,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe6,0xba,0xba,0x00,0x01,0xff,0xe5,0x90,0x9d,0x00,0x10,0x08,0x01,
+	0xff,0xe7,0x87,0x90,0x00,0x01,0xff,0xe7,0x92,0x98,0x00,0xd3,0x40,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x01,0xff,0xe8,0x97,0xba,0x00,0x01,0xff,0xe9,0x9a,0xa3,0x00,0x10,
+	0x08,0x01,0xff,0xe9,0xb1,0x97,0x00,0x01,0xff,0xe9,0xba,0x9f,0x00,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe6,0x9e,0x97,0x00,0x01,0xff,0xe6,0xb7,0x8b,0x00,0x10,0x08,0x01,
+	0xff,0xe8,0x87,0xa8,0x00,0x01,0xff,0xe7,0xab,0x8b,0x00,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x01,0xff,0xe7,0xac,0xa0,0x00,0x01,0xff,0xe7,0xb2,0x92,0x00,0x10,0x08,0x01,
+	0xff,0xe7,0x8b,0x80,0x00,0x01,0xff,0xe7,0x82,0x99,0x00,0xd1,0x10,0x10,0x08,0x01,
+	0xff,0xe8,0xad,0x98,0x00,0x01,0xff,0xe4,0xbb,0x80,0x00,0x10,0x08,0x01,0xff,0xe8,
+	0x8c,0xb6,0x00,0x01,0xff,0xe5,0x88,0xba,0x00,0xe2,0xad,0x06,0xe1,0xc4,0x03,0xe0,
+	0xcb,0x01,0xcf,0x86,0xd5,0xe4,0xd4,0x74,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x01,0xff,0xe5,0x88,0x87,0x00,0x01,0xff,0xe5,0xba,0xa6,0x00,0x10,0x08,0x01,0xff,
+	0xe6,0x8b,0x93,0x00,0x01,0xff,0xe7,0xb3,0x96,0x00,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe5,0xae,0x85,0x00,0x01,0xff,0xe6,0xb4,0x9e,0x00,0x10,0x08,0x01,0xff,0xe6,0x9a,
+	0xb4,0x00,0x01,0xff,0xe8,0xbc,0xbb,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x01,0xff,
+	0xe8,0xa1,0x8c,0x00,0x01,0xff,0xe9,0x99,0x8d,0x00,0x10,0x08,0x01,0xff,0xe8,0xa6,
+	0x8b,0x00,0x01,0xff,0xe5,0xbb,0x93,0x00,0x91,0x10,0x10,0x08,0x01,0xff,0xe5,0x85,
+	0x80,0x00,0x01,0xff,0xe5,0x97,0x80,0x00,0x01,0x00,0xd3,0x34,0xd2,0x18,0xd1,0x0c,
+	0x10,0x08,0x01,0xff,0xe5,0xa1,0x9a,0x00,0x01,0x00,0x10,0x08,0x01,0xff,0xe6,0x99,
+	0xb4,0x00,0x01,0x00,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0xe5,0x87,0x9e,0x00,
+	0x10,0x08,0x01,0xff,0xe7,0x8c,0xaa,0x00,0x01,0xff,0xe7,0x9b,0x8a,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x01,0xff,0xe7,0xa4,0xbc,0x00,0x01,0xff,0xe7,0xa5,0x9e,0x00,
+	0x10,0x08,0x01,0xff,0xe7,0xa5,0xa5,0x00,0x01,0xff,0xe7,0xa6,0x8f,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe9,0x9d,0x96,0x00,0x01,0xff,0xe7,0xb2,0xbe,0x00,0x10,0x08,
+	0x01,0xff,0xe7,0xbe,0xbd,0x00,0x01,0x00,0xd4,0x64,0xd3,0x30,0xd2,0x18,0xd1,0x0c,
+	0x10,0x08,0x01,0xff,0xe8,0x98,0x92,0x00,0x01,0x00,0x10,0x08,0x01,0xff,0xe8,0xab,
+	0xb8,0x00,0x01,0x00,0xd1,0x0c,0x10,0x04,0x01,0x00,0x01,0xff,0xe9,0x80,0xb8,0x00,
+	0x10,0x08,0x01,0xff,0xe9,0x83,0xbd,0x00,0x01,0x00,0xd2,0x14,0x51,0x04,0x01,0x00,
+	0x10,0x08,0x01,0xff,0xe9,0xa3,0xaf,0x00,0x01,0xff,0xe9,0xa3,0xbc,0x00,0xd1,0x10,
+	0x10,0x08,0x01,0xff,0xe9,0xa4,0xa8,0x00,0x01,0xff,0xe9,0xb6,0xb4,0x00,0x10,0x08,
+	0x0d,0xff,0xe9,0x83,0x9e,0x00,0x0d,0xff,0xe9,0x9a,0xb7,0x00,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x06,0xff,0xe4,0xbe,0xae,0x00,0x06,0xff,0xe5,0x83,0xa7,0x00,
+	0x10,0x08,0x06,0xff,0xe5,0x85,0x8d,0x00,0x06,0xff,0xe5,0x8b,0x89,0x00,0xd1,0x10,
+	0x10,0x08,0x06,0xff,0xe5,0x8b,0xa4,0x00,0x06,0xff,0xe5,0x8d,0x91,0x00,0x10,0x08,
+	0x06,0xff,0xe5,0x96,0x9d,0x00,0x06,0xff,0xe5,0x98,0x86,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x06,0xff,0xe5,0x99,0xa8,0x00,0x06,0xff,0xe5,0xa1,0x80,0x00,0x10,0x08,
+	0x06,0xff,0xe5,0xa2,0xa8,0x00,0x06,0xff,0xe5,0xb1,0xa4,0x00,0xd1,0x10,0x10,0x08,
+	0x06,0xff,0xe5,0xb1,0xae,0x00,0x06,0xff,0xe6,0x82,0x94,0x00,0x10,0x08,0x06,0xff,
+	0xe6,0x85,0xa8,0x00,0x06,0xff,0xe6,0x86,0x8e,0x00,0xcf,0x86,0xe5,0x01,0x01,0xd4,
+	0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe6,0x87,0xb2,0x00,0x06,
+	0xff,0xe6,0x95,0x8f,0x00,0x10,0x08,0x06,0xff,0xe6,0x97,0xa2,0x00,0x06,0xff,0xe6,
+	0x9a,0x91,0x00,0xd1,0x10,0x10,0x08,0x06,0xff,0xe6,0xa2,0x85,0x00,0x06,0xff,0xe6,
+	0xb5,0xb7,0x00,0x10,0x08,0x06,0xff,0xe6,0xb8,0x9a,0x00,0x06,0xff,0xe6,0xbc,0xa2,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe7,0x85,0xae,0x00,0x06,0xff,0xe7,
+	0x88,0xab,0x00,0x10,0x08,0x06,0xff,0xe7,0x90,0xa2,0x00,0x06,0xff,0xe7,0xa2,0x91,
+	0x00,0xd1,0x10,0x10,0x08,0x06,0xff,0xe7,0xa4,0xbe,0x00,0x06,0xff,0xe7,0xa5,0x89,
+	0x00,0x10,0x08,0x06,0xff,0xe7,0xa5,0x88,0x00,0x06,0xff,0xe7,0xa5,0x90,0x00,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe7,0xa5,0x96,0x00,0x06,0xff,0xe7,
+	0xa5,0x9d,0x00,0x10,0x08,0x06,0xff,0xe7,0xa6,0x8d,0x00,0x06,0xff,0xe7,0xa6,0x8e,
+	0x00,0xd1,0x10,0x10,0x08,0x06,0xff,0xe7,0xa9,0x80,0x00,0x06,0xff,0xe7,0xaa,0x81,
+	0x00,0x10,0x08,0x06,0xff,0xe7,0xaf,0x80,0x00,0x06,0xff,0xe7,0xb7,0xb4,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe7,0xb8,0x89,0x00,0x06,0xff,0xe7,0xb9,0x81,
+	0x00,0x10,0x08,0x06,0xff,0xe7,0xbd,0xb2,0x00,0x06,0xff,0xe8,0x80,0x85,0x00,0xd1,
+	0x10,0x10,0x08,0x06,0xff,0xe8,0x87,0xad,0x00,0x06,0xff,0xe8,0x89,0xb9,0x00,0x10,
+	0x08,0x06,0xff,0xe8,0x89,0xb9,0x00,0x06,0xff,0xe8,0x91,0x97,0x00,0xd4,0x75,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe8,0xa4,0x90,0x00,0x06,0xff,0xe8,
+	0xa6,0x96,0x00,0x10,0x08,0x06,0xff,0xe8,0xac,0x81,0x00,0x06,0xff,0xe8,0xac,0xb9,
+	0x00,0xd1,0x10,0x10,0x08,0x06,0xff,0xe8,0xb3,0x93,0x00,0x06,0xff,0xe8,0xb4,0x88,
+	0x00,0x10,0x08,0x06,0xff,0xe8,0xbe,0xb6,0x00,0x06,0xff,0xe9,0x80,0xb8,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x06,0xff,0xe9,0x9b,0xa3,0x00,0x06,0xff,0xe9,0x9f,0xbf,
+	0x00,0x10,0x08,0x06,0xff,0xe9,0xa0,0xbb,0x00,0x0b,0xff,0xe6,0x81,0xb5,0x00,0x91,
+	0x11,0x10,0x09,0x0b,0xff,0xf0,0xa4,0x8b,0xae,0x00,0x0b,0xff,0xe8,0x88,0x98,0x00,
+	0x00,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,0xe4,0xb8,0xa6,0x00,
+	0x08,0xff,0xe5,0x86,0xb5,0x00,0x10,0x08,0x08,0xff,0xe5,0x85,0xa8,0x00,0x08,0xff,
+	0xe4,0xbe,0x80,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe5,0x85,0x85,0x00,0x08,0xff,
+	0xe5,0x86,0x80,0x00,0x10,0x08,0x08,0xff,0xe5,0x8b,0x87,0x00,0x08,0xff,0xe5,0x8b,
+	0xba,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,0xe5,0x96,0x9d,0x00,0x08,0xff,
+	0xe5,0x95,0x95,0x00,0x10,0x08,0x08,0xff,0xe5,0x96,0x99,0x00,0x08,0xff,0xe5,0x97,
+	0xa2,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe5,0xa1,0x9a,0x00,0x08,0xff,0xe5,0xa2,
+	0xb3,0x00,0x10,0x08,0x08,0xff,0xe5,0xa5,0x84,0x00,0x08,0xff,0xe5,0xa5,0x94,0x00,
+	0xe0,0x04,0x02,0xcf,0x86,0xe5,0x01,0x01,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x08,0xff,0xe5,0xa9,0xa2,0x00,0x08,0xff,0xe5,0xac,0xa8,0x00,0x10,0x08,
+	0x08,0xff,0xe5,0xbb,0x92,0x00,0x08,0xff,0xe5,0xbb,0x99,0x00,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe5,0xbd,0xa9,0x00,0x08,0xff,0xe5,0xbe,0xad,0x00,0x10,0x08,0x08,0xff,
+	0xe6,0x83,0x98,0x00,0x08,0xff,0xe6,0x85,0x8e,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe6,0x84,0x88,0x00,0x08,0xff,0xe6,0x86,0x8e,0x00,0x10,0x08,0x08,0xff,
+	0xe6,0x85,0xa0,0x00,0x08,0xff,0xe6,0x87,0xb2,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe6,0x88,0xb4,0x00,0x08,0xff,0xe6,0x8f,0x84,0x00,0x10,0x08,0x08,0xff,0xe6,0x90,
+	0x9c,0x00,0x08,0xff,0xe6,0x91,0x92,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe6,0x95,0x96,0x00,0x08,0xff,0xe6,0x99,0xb4,0x00,0x10,0x08,0x08,0xff,
+	0xe6,0x9c,0x97,0x00,0x08,0xff,0xe6,0x9c,0x9b,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe6,0x9d,0x96,0x00,0x08,0xff,0xe6,0xad,0xb9,0x00,0x10,0x08,0x08,0xff,0xe6,0xae,
+	0xba,0x00,0x08,0xff,0xe6,0xb5,0x81,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe6,0xbb,0x9b,0x00,0x08,0xff,0xe6,0xbb,0x8b,0x00,0x10,0x08,0x08,0xff,0xe6,0xbc,
+	0xa2,0x00,0x08,0xff,0xe7,0x80,0x9e,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe7,0x85,
+	0xae,0x00,0x08,0xff,0xe7,0x9e,0xa7,0x00,0x10,0x08,0x08,0xff,0xe7,0x88,0xb5,0x00,
+	0x08,0xff,0xe7,0x8a,0xaf,0x00,0xd4,0x80,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe7,0x8c,0xaa,0x00,0x08,0xff,0xe7,0x91,0xb1,0x00,0x10,0x08,0x08,0xff,
+	0xe7,0x94,0x86,0x00,0x08,0xff,0xe7,0x94,0xbb,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe7,0x98,0x9d,0x00,0x08,0xff,0xe7,0x98,0x9f,0x00,0x10,0x08,0x08,0xff,0xe7,0x9b,
+	0x8a,0x00,0x08,0xff,0xe7,0x9b,0x9b,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe7,0x9b,0xb4,0x00,0x08,0xff,0xe7,0x9d,0x8a,0x00,0x10,0x08,0x08,0xff,0xe7,0x9d,
+	0x80,0x00,0x08,0xff,0xe7,0xa3,0x8c,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe7,0xaa,
+	0xb1,0x00,0x08,0xff,0xe7,0xaf,0x80,0x00,0x10,0x08,0x08,0xff,0xe7,0xb1,0xbb,0x00,
+	0x08,0xff,0xe7,0xb5,0x9b,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe7,0xb7,0xb4,0x00,0x08,0xff,0xe7,0xbc,0xbe,0x00,0x10,0x08,0x08,0xff,0xe8,0x80,
+	0x85,0x00,0x08,0xff,0xe8,0x8d,0x92,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe8,0x8f,
+	0xaf,0x00,0x08,0xff,0xe8,0x9d,0xb9,0x00,0x10,0x08,0x08,0xff,0xe8,0xa5,0x81,0x00,
+	0x08,0xff,0xe8,0xa6,0x86,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x08,0xff,0xe8,0xa6,
+	0x96,0x00,0x08,0xff,0xe8,0xaa,0xbf,0x00,0x10,0x08,0x08,0xff,0xe8,0xab,0xb8,0x00,
+	0x08,0xff,0xe8,0xab,0x8b,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,0xe8,0xac,0x81,0x00,
+	0x08,0xff,0xe8,0xab,0xbe,0x00,0x10,0x08,0x08,0xff,0xe8,0xab,0xad,0x00,0x08,0xff,
+	0xe8,0xac,0xb9,0x00,0xcf,0x86,0x95,0xde,0xd4,0x81,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x08,0xff,0xe8,0xae,0x8a,0x00,0x08,0xff,0xe8,0xb4,0x88,0x00,0x10,0x08,
+	0x08,0xff,0xe8,0xbc,0xb8,0x00,0x08,0xff,0xe9,0x81,0xb2,0x00,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe9,0x86,0x99,0x00,0x08,0xff,0xe9,0x89,0xb6,0x00,0x10,0x08,0x08,0xff,
+	0xe9,0x99,0xbc,0x00,0x08,0xff,0xe9,0x9b,0xa3,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x08,0xff,0xe9,0x9d,0x96,0x00,0x08,0xff,0xe9,0x9f,0x9b,0x00,0x10,0x08,0x08,0xff,
+	0xe9,0x9f,0xbf,0x00,0x08,0xff,0xe9,0xa0,0x8b,0x00,0xd1,0x10,0x10,0x08,0x08,0xff,
+	0xe9,0xa0,0xbb,0x00,0x08,0xff,0xe9,0xac,0x92,0x00,0x10,0x08,0x08,0xff,0xe9,0xbe,
+	0x9c,0x00,0x08,0xff,0xf0,0xa2,0xa1,0x8a,0x00,0xd3,0x45,0xd2,0x22,0xd1,0x12,0x10,
+	0x09,0x08,0xff,0xf0,0xa2,0xa1,0x84,0x00,0x08,0xff,0xf0,0xa3,0x8f,0x95,0x00,0x10,
+	0x08,0x08,0xff,0xe3,0xae,0x9d,0x00,0x08,0xff,0xe4,0x80,0x98,0x00,0xd1,0x11,0x10,
+	0x08,0x08,0xff,0xe4,0x80,0xb9,0x00,0x08,0xff,0xf0,0xa5,0x89,0x89,0x00,0x10,0x09,
+	0x08,0xff,0xf0,0xa5,0xb3,0x90,0x00,0x08,0xff,0xf0,0xa7,0xbb,0x93,0x00,0x92,0x14,
+	0x91,0x10,0x10,0x08,0x08,0xff,0xe9,0xbd,0x83,0x00,0x08,0xff,0xe9,0xbe,0x8e,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0xe1,0x94,0x01,0xe0,0x08,0x01,0xcf,0x86,0xd5,0x42,
+	0xd4,0x14,0x93,0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,
+	0x00,0x00,0x00,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,
+	0x01,0x00,0x01,0x00,0x52,0x04,0x00,0x00,0xd1,0x0d,0x10,0x04,0x00,0x00,0x04,0xff,
+	0xd7,0x99,0xd6,0xb4,0x00,0x10,0x04,0x01,0x1a,0x01,0xff,0xd7,0xb2,0xd6,0xb7,0x00,
+	0xd4,0x42,0x53,0x04,0x01,0x00,0xd2,0x16,0x51,0x04,0x01,0x00,0x10,0x09,0x01,0xff,
+	0xd7,0xa9,0xd7,0x81,0x00,0x01,0xff,0xd7,0xa9,0xd7,0x82,0x00,0xd1,0x16,0x10,0x0b,
+	0x01,0xff,0xd7,0xa9,0xd6,0xbc,0xd7,0x81,0x00,0x01,0xff,0xd7,0xa9,0xd6,0xbc,0xd7,
+	0x82,0x00,0x10,0x09,0x01,0xff,0xd7,0x90,0xd6,0xb7,0x00,0x01,0xff,0xd7,0x90,0xd6,
+	0xb8,0x00,0xd3,0x43,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd7,0x90,0xd6,0xbc,
+	0x00,0x01,0xff,0xd7,0x91,0xd6,0xbc,0x00,0x10,0x09,0x01,0xff,0xd7,0x92,0xd6,0xbc,
+	0x00,0x01,0xff,0xd7,0x93,0xd6,0xbc,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd7,0x94,
+	0xd6,0xbc,0x00,0x01,0xff,0xd7,0x95,0xd6,0xbc,0x00,0x10,0x09,0x01,0xff,0xd7,0x96,
+	0xd6,0xbc,0x00,0x00,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd7,0x98,0xd6,
+	0xbc,0x00,0x01,0xff,0xd7,0x99,0xd6,0xbc,0x00,0x10,0x09,0x01,0xff,0xd7,0x9a,0xd6,
+	0xbc,0x00,0x01,0xff,0xd7,0x9b,0xd6,0xbc,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xd7,
+	0x9c,0xd6,0xbc,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xd7,0x9e,0xd6,0xbc,0x00,0x00,
+	0x00,0xcf,0x86,0x95,0x85,0x94,0x81,0xd3,0x3e,0xd2,0x1f,0xd1,0x12,0x10,0x09,0x01,
+	0xff,0xd7,0xa0,0xd6,0xbc,0x00,0x01,0xff,0xd7,0xa1,0xd6,0xbc,0x00,0x10,0x04,0x00,
+	0x00,0x01,0xff,0xd7,0xa3,0xd6,0xbc,0x00,0xd1,0x0d,0x10,0x09,0x01,0xff,0xd7,0xa4,
+	0xd6,0xbc,0x00,0x00,0x00,0x10,0x09,0x01,0xff,0xd7,0xa6,0xd6,0xbc,0x00,0x01,0xff,
+	0xd7,0xa7,0xd6,0xbc,0x00,0xd2,0x24,0xd1,0x12,0x10,0x09,0x01,0xff,0xd7,0xa8,0xd6,
+	0xbc,0x00,0x01,0xff,0xd7,0xa9,0xd6,0xbc,0x00,0x10,0x09,0x01,0xff,0xd7,0xaa,0xd6,
+	0xbc,0x00,0x01,0xff,0xd7,0x95,0xd6,0xb9,0x00,0xd1,0x12,0x10,0x09,0x01,0xff,0xd7,
+	0x91,0xd6,0xbf,0x00,0x01,0xff,0xd7,0x9b,0xd6,0xbf,0x00,0x10,0x09,0x01,0xff,0xd7,
+	0xa4,0xd6,0xbf,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd0,0x1a,0xcf,0x86,0x55,0x04,
+	0x01,0x00,0x54,0x04,0x01,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x01,0x00,0x0c,0x00,
+	0x0c,0x00,0x0c,0x00,0xcf,0x86,0x95,0x24,0xd4,0x10,0x93,0x0c,0x92,0x08,0x11,0x04,
+	0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x00,0x00,
+	0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0xd3,0x5a,0xd2,0x06,
+	0xcf,0x06,0x01,0x00,0xd1,0x14,0xd0,0x06,0xcf,0x06,0x01,0x00,0xcf,0x86,0x95,0x08,
+	0x14,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd0,0x1a,0xcf,0x86,0x95,0x14,0x54,0x04,
+	0x01,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0xcf,0x86,0xd5,0x0c,0x94,0x08,0x13,0x04,0x01,0x00,0x00,0x00,0x05,0x00,
+	0x54,0x04,0x05,0x00,0x53,0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x91,0x08,0x10,0x04,
+	0x06,0x00,0x07,0x00,0x00,0x00,0xd2,0xce,0xd1,0xa5,0xd0,0x37,0xcf,0x86,0xd5,0x15,
+	0x54,0x05,0x06,0xff,0x00,0x53,0x04,0x08,0x00,0x92,0x08,0x11,0x04,0x08,0x00,0x00,
+	0x00,0x00,0x00,0x94,0x1c,0xd3,0x10,0x52,0x04,0x01,0xe6,0x51,0x04,0x0a,0xe6,0x10,
+	0x04,0x0a,0xe6,0x10,0xdc,0x52,0x04,0x10,0xdc,0x11,0x04,0x10,0xdc,0x11,0xe6,0x01,
+	0x00,0xcf,0x86,0xd5,0x38,0xd4,0x24,0xd3,0x14,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,
+	0x04,0x01,0x00,0x06,0x00,0x10,0x04,0x06,0x00,0x07,0x00,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x07,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,0x01,
+	0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd4,0x18,0xd3,0x10,0x52,
+	0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x12,0x04,0x01,
+	0x00,0x00,0x00,0x93,0x18,0xd2,0x0c,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x06,
+	0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x01,0x00,0x01,0x00,0xd0,0x06,0xcf,
+	0x06,0x01,0x00,0xcf,0x86,0x55,0x04,0x01,0x00,0x54,0x04,0x01,0x00,0x53,0x04,0x01,
+	0x00,0x52,0x04,0x01,0x00,0xd1,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x10,0x04,0x00,
+	0x00,0x01,0xff,0x00,0xd1,0x50,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x94,0x14,0x93,0x10,
+	0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0x01,0x00,0xcf,0x86,0xd5,0x18,0x54,0x04,0x01,0x00,0x53,0x04,0x01,0x00,
+	0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x06,0x00,0x94,0x14,
+	0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x06,0x00,0x01,0x00,0x01,0x00,0x01,0x00,
+	0x01,0x00,0x01,0x00,0xd0,0x2f,0xcf,0x86,0x55,0x04,0x01,0x00,0xd4,0x15,0x93,0x11,
+	0x92,0x0d,0x91,0x09,0x10,0x05,0x01,0xff,0x00,0x01,0x00,0x01,0x00,0x01,0x00,0x01,
+	0x00,0x53,0x04,0x01,0x00,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,
+	0x00,0x00,0x00,0xcf,0x86,0xd5,0x38,0xd4,0x18,0xd3,0x0c,0x92,0x08,0x11,0x04,0x00,
+	0x00,0x01,0x00,0x01,0x00,0x92,0x08,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd3,
+	0x0c,0x92,0x08,0x11,0x04,0x00,0x00,0x01,0x00,0x01,0x00,0xd2,0x08,0x11,0x04,0x00,
+	0x00,0x01,0x00,0x91,0x08,0x10,0x04,0x01,0x00,0x00,0x00,0x00,0x00,0xd4,0x20,0xd3,
+	0x10,0x52,0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x52,
+	0x04,0x01,0x00,0x51,0x04,0x01,0x00,0x10,0x04,0x01,0x00,0x00,0x00,0x53,0x05,0x00,
+	0xff,0x00,0xd2,0x0d,0x91,0x09,0x10,0x05,0x00,0xff,0x00,0x04,0x00,0x04,0x00,0x91,
+	0x08,0x10,0x04,0x03,0x00,0x01,0x00,0x01,0x00,0x83,0xe2,0x46,0x3e,0xe1,0x1f,0x3b,
+	0xe0,0x9c,0x39,0xcf,0x86,0xe5,0x40,0x26,0xc4,0xe3,0x16,0x14,0xe2,0xef,0x11,0xe1,
+	0xd0,0x10,0xe0,0x60,0x07,0xcf,0x86,0xe5,0x53,0x03,0xe4,0x4c,0x02,0xe3,0x3d,0x01,
+	0xd2,0x94,0xd1,0x70,0xd0,0x4a,0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x07,0x00,
+	0x52,0x04,0x07,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x07,0x00,0x07,0x00,0x07,0x00,
+	0xd4,0x14,0x93,0x10,0x52,0x04,0x07,0x00,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,
+	0x00,0x00,0x07,0x00,0x53,0x04,0x07,0x00,0xd2,0x0c,0x51,0x04,0x07,0x00,0x10,0x04,
+	0x07,0x00,0x00,0x00,0x51,0x04,0x07,0x00,0x10,0x04,0x00,0x00,0x07,0x00,0xcf,0x86,
+	0x95,0x20,0xd4,0x10,0x53,0x04,0x07,0x00,0x52,0x04,0x07,0x00,0x11,0x04,0x07,0x00,
+	0x00,0x00,0x53,0x04,0x07,0x00,0x52,0x04,0x07,0x00,0x11,0x04,0x07,0x00,0x00,0x00,
+	0x00,0x00,0xd0,0x06,0xcf,0x06,0x07,0x00,0xcf,0x86,0x55,0x04,0x07,0x00,0x54,0x04,
+	0x07,0x00,0x53,0x04,0x07,0x00,0x92,0x0c,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,
+	0x00,0x00,0x00,0x00,0xd1,0x40,0xd0,0x3a,0xcf,0x86,0xd5,0x20,0x94,0x1c,0x93,0x18,
+	0xd2,0x0c,0x51,0x04,0x07,0x00,0x10,0x04,0x07,0x00,0x00,0x00,0x51,0x04,0x00,0x00,
+	0x10,0x04,0x00,0x00,0x07,0x00,0x07,0x00,0x07,0x00,0x54,0x04,0x07,0x00,0x93,0x10,
+	0x52,0x04,0x07,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x07,0x00,0x07,0x00,
+	0xcf,0x06,0x08,0x00,0xd0,0x46,0xcf,0x86,0xd5,0x2c,0xd4,0x20,0x53,0x04,0x08,0x00,
+	0xd2,0x0c,0x51,0x04,0x08,0x00,0x10,0x04,0x08,0x00,0x10,0x00,0xd1,0x08,0x10,0x04,
+	0x10,0x00,0x12,0x00,0x10,0x04,0x12,0x00,0x00,0x00,0x53,0x04,0x0a,0x00,0x12,0x04,
+	0x0a,0x00,0x00,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x08,0x14,0x04,
+	0x00,0x00,0x0a,0x00,0x54,0x04,0x0a,0x00,0x53,0x04,0x0a,0x00,0x52,0x04,0x0a,0x00,
+	0x91,0x08,0x10,0x04,0x0a,0x00,0x0a,0xdc,0x00,0x00,0xd2,0x5e,0xd1,0x06,0xcf,0x06,
+	0x00,0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,0x0a,0x00,0x53,0x04,0x0a,0x00,
+	0x52,0x04,0x0a,0x00,0x91,0x08,0x10,0x04,0x0a,0x00,0x00,0x00,0x00,0x00,0x0a,0x00,
+	0xcf,0x86,0xd5,0x18,0x54,0x04,0x0a,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x0a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd4,0x14,0x93,0x10,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x10,0xdc,0x10,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x53,0x04,
+	0x10,0x00,0x12,0x04,0x10,0x00,0x00,0x00,0xd1,0x70,0xd0,0x36,0xcf,0x86,0xd5,0x18,
+	0x54,0x04,0x05,0x00,0x53,0x04,0x05,0x00,0x52,0x04,0x05,0x00,0x51,0x04,0x05,0x00,
+	0x10,0x04,0x05,0x00,0x10,0x00,0x94,0x18,0xd3,0x08,0x12,0x04,0x05,0x00,0x00,0x00,
+	0x52,0x04,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x13,0x00,0x13,0x00,0x05,0x00,
+	0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x05,0x00,0x92,0x0c,0x51,0x04,0x05,0x00,
+	0x10,0x04,0x05,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x54,0x04,0x10,0x00,0xd3,0x0c,
+	0x52,0x04,0x10,0x00,0x11,0x04,0x10,0x00,0x10,0xe6,0x92,0x0c,0x51,0x04,0x10,0xe6,
+	0x10,0x04,0x10,0xe6,0x00,0x00,0x00,0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,
+	0x07,0x00,0x53,0x04,0x07,0x00,0x52,0x04,0x07,0x00,0x51,0x04,0x07,0x00,0x10,0x04,
+	0x00,0x00,0x07,0x00,0x08,0x00,0xcf,0x86,0x95,0x1c,0xd4,0x0c,0x93,0x08,0x12,0x04,
+	0x08,0x00,0x00,0x00,0x08,0x00,0x93,0x0c,0x52,0x04,0x08,0x00,0x11,0x04,0x08,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0xd3,0xba,0xd2,0x80,0xd1,0x34,0xd0,0x1a,0xcf,0x86,
+	0x55,0x04,0x05,0x00,0x94,0x10,0x93,0x0c,0x52,0x04,0x05,0x00,0x11,0x04,0x05,0x00,
+	0x07,0x00,0x05,0x00,0x05,0x00,0xcf,0x86,0x95,0x14,0x94,0x10,0x53,0x04,0x05,0x00,
+	0x52,0x04,0x05,0x00,0x11,0x04,0x05,0x00,0x07,0x00,0x07,0x00,0x07,0x00,0xd0,0x2a,
+	0xcf,0x86,0xd5,0x14,0x54,0x04,0x07,0x00,0x53,0x04,0x07,0x00,0x52,0x04,0x07,0x00,
+	0x11,0x04,0x07,0x00,0x00,0x00,0x94,0x10,0x53,0x04,0x07,0x00,0x92,0x08,0x11,0x04,
+	0x07,0x00,0x00,0x00,0x00,0x00,0x12,0x00,0xcf,0x86,0xd5,0x10,0x54,0x04,0x12,0x00,
+	0x93,0x08,0x12,0x04,0x12,0x00,0x00,0x00,0x12,0x00,0x54,0x04,0x12,0x00,0x53,0x04,
+	0x12,0x00,0x12,0x04,0x12,0x00,0x00,0x00,0xd1,0x34,0xd0,0x12,0xcf,0x86,0x55,0x04,
+	0x10,0x00,0x94,0x08,0x13,0x04,0x10,0x00,0x00,0x00,0x10,0x00,0xcf,0x86,0x55,0x04,
+	0x10,0x00,0x94,0x18,0xd3,0x08,0x12,0x04,0x10,0x00,0x00,0x00,0x52,0x04,0x00,0x00,
+	0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,
+	0xd2,0x06,0xcf,0x06,0x10,0x00,0xd1,0x40,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x10,0x00,
+	0x54,0x04,0x10,0x00,0x93,0x10,0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,
+	0x10,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x14,0x54,0x04,0x10,0x00,0x93,0x0c,
+	0x52,0x04,0x10,0x00,0x11,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x94,0x08,0x13,0x04,
+	0x10,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xe4,0xce,0x02,0xe3,0x45,0x01,
+	0xd2,0xd0,0xd1,0x70,0xd0,0x52,0xcf,0x86,0xd5,0x20,0x94,0x1c,0xd3,0x0c,0x52,0x04,
+	0x07,0x00,0x11,0x04,0x07,0x00,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x07,0x00,
+	0x00,0x00,0x07,0x00,0x07,0x00,0x07,0x00,0x54,0x04,0x07,0x00,0xd3,0x10,0x52,0x04,
+	0x07,0x00,0x51,0x04,0x07,0x00,0x10,0x04,0x00,0x00,0x07,0x00,0xd2,0x0c,0x91,0x08,
+	0x10,0x04,0x07,0x00,0x00,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x07,0x00,0x00,0x00,
+	0x10,0x04,0x00,0x00,0x07,0x00,0xcf,0x86,0x95,0x18,0x54,0x04,0x0b,0x00,0x93,0x10,
+	0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x00,0x00,0x0b,0x00,0x0b,0x00,
+	0x10,0x00,0xd0,0x32,0xcf,0x86,0xd5,0x18,0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,
+	0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x00,0x00,0x94,0x14,
+	0x93,0x10,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,
+	0x10,0x00,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x11,0x00,0xd3,0x14,
+	0xd2,0x0c,0x51,0x04,0x11,0x00,0x10,0x04,0x11,0x00,0x00,0x00,0x11,0x04,0x11,0x00,
+	0x00,0x00,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x11,0x00,0x11,0x00,
+	0xd1,0x40,0xd0,0x3a,0xcf,0x86,0xd5,0x1c,0x54,0x04,0x09,0x00,0x53,0x04,0x09,0x00,
+	0xd2,0x08,0x11,0x04,0x09,0x00,0x0b,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,
+	0x09,0x00,0x54,0x04,0x0a,0x00,0x53,0x04,0x0a,0x00,0xd2,0x08,0x11,0x04,0x0a,0x00,
+	0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0a,0x00,0xcf,0x06,0x00,0x00,
+	0xd0,0x1a,0xcf,0x86,0x55,0x04,0x0d,0x00,0x54,0x04,0x0d,0x00,0x53,0x04,0x0d,0x00,
+	0x52,0x04,0x00,0x00,0x11,0x04,0x11,0x00,0x0d,0x00,0xcf,0x86,0x95,0x14,0x54,0x04,
+	0x11,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x00,0x00,0x11,0x00,0x11,0x00,0x11,0x00,
+	0x11,0x00,0xd2,0xec,0xd1,0xa4,0xd0,0x76,0xcf,0x86,0xd5,0x48,0xd4,0x28,0xd3,0x14,
+	0x52,0x04,0x08,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,0x08,0x00,0x10,0x04,0x08,0x00,
+	0x00,0x00,0x52,0x04,0x00,0x00,0xd1,0x08,0x10,0x04,0x08,0x00,0x08,0xdc,0x10,0x04,
+	0x08,0x00,0x08,0xe6,0xd3,0x10,0x52,0x04,0x08,0x00,0x91,0x08,0x10,0x04,0x00,0x00,
+	0x08,0x00,0x08,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x08,0x00,0x08,0x00,
+	0x08,0x00,0x54,0x04,0x08,0x00,0xd3,0x0c,0x52,0x04,0x08,0x00,0x11,0x04,0x14,0x00,
+	0x00,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x08,0xe6,0x08,0x01,0x10,0x04,0x08,0xdc,
+	0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x08,0x09,0xcf,0x86,0x95,0x28,
+	0xd4,0x14,0x53,0x04,0x08,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x53,0x04,0x08,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x08,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x0b,0x00,0xd0,0x0a,0xcf,0x86,0x15,0x04,0x10,0x00,
+	0x00,0x00,0xcf,0x86,0x55,0x04,0x10,0x00,0xd4,0x24,0xd3,0x14,0x52,0x04,0x10,0x00,
+	0xd1,0x08,0x10,0x04,0x10,0x00,0x10,0xe6,0x10,0x04,0x10,0xdc,0x00,0x00,0x92,0x0c,
+	0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x93,0x10,0x52,0x04,
+	0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd1,0x54,
+	0xd0,0x26,0xcf,0x86,0x55,0x04,0x0b,0x00,0x54,0x04,0x0b,0x00,0xd3,0x0c,0x52,0x04,
+	0x0b,0x00,0x11,0x04,0x0b,0x00,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,
+	0x0b,0x00,0x0b,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x14,0x54,0x04,0x0b,0x00,0x93,0x0c,
+	0x52,0x04,0x0b,0x00,0x11,0x04,0x0b,0x00,0x00,0x00,0x0b,0x00,0x54,0x04,0x0b,0x00,
+	0x93,0x10,0x92,0x0c,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,
+	0x0b,0x00,0xd0,0x42,0xcf,0x86,0xd5,0x28,0x54,0x04,0x10,0x00,0xd3,0x0c,0x92,0x08,
+	0x11,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,
+	0x10,0x00,0x10,0x00,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x94,0x14,
+	0x53,0x04,0x00,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,
+	0x10,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd3,0x96,0xd2,0x68,0xd1,0x24,0xd0,0x06,
+	0xcf,0x06,0x0b,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,0x53,0x04,0x0b,0x00,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xd0,0x1e,0xcf,0x86,0x55,0x04,0x11,0x00,0x54,0x04,0x11,0x00,0x93,0x10,0x92,0x0c,
+	0x51,0x04,0x11,0x00,0x10,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,
+	0x55,0x04,0x11,0x00,0x54,0x04,0x11,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x11,0x00,
+	0x10,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0x92,0x08,0x11,0x04,0x00,0x00,0x11,0x00,
+	0x11,0x00,0xd1,0x28,0xd0,0x22,0xcf,0x86,0x55,0x04,0x14,0x00,0xd4,0x0c,0x93,0x08,
+	0x12,0x04,0x14,0x00,0x14,0xe6,0x00,0x00,0x53,0x04,0x14,0x00,0x92,0x08,0x11,0x04,
+	0x14,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xd2,0x2a,
+	0xd1,0x24,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,
+	0x0b,0x00,0x53,0x04,0x0b,0x00,0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,
+	0x0b,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,0x58,0xd0,0x12,0xcf,0x86,0x55,0x04,
+	0x14,0x00,0x94,0x08,0x13,0x04,0x14,0x00,0x00,0x00,0x14,0x00,0xcf,0x86,0x95,0x40,
+	0xd4,0x24,0xd3,0x0c,0x52,0x04,0x14,0x00,0x11,0x04,0x14,0x00,0x14,0xdc,0xd2,0x0c,
+	0x51,0x04,0x14,0xe6,0x10,0x04,0x14,0xe6,0x14,0xdc,0x91,0x08,0x10,0x04,0x14,0xe6,
+	0x14,0xdc,0x14,0xdc,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0xdc,0x14,0x00,
+	0x14,0x00,0x14,0x00,0x92,0x08,0x11,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x15,0x00,
+	0x93,0x10,0x52,0x04,0x15,0x00,0x51,0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,0x00,
+	0x00,0x00,0xcf,0x86,0xe5,0x0f,0x06,0xe4,0xf8,0x03,0xe3,0x02,0x02,0xd2,0xfb,0xd1,
+	0x4c,0xd0,0x06,0xcf,0x06,0x0c,0x00,0xcf,0x86,0xd5,0x2c,0xd4,0x1c,0xd3,0x10,0x52,
+	0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x09,0x0c,0x00,0x52,0x04,0x0c,
+	0x00,0x11,0x04,0x0c,0x00,0x00,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x00,0x00,0x0c,
+	0x00,0x0c,0x00,0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,
+	0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x09,0xd0,0x69,0xcf,0x86,0xd5,
+	0x32,0x54,0x04,0x0b,0x00,0x53,0x04,0x0b,0x00,0xd2,0x15,0x51,0x04,0x0b,0x00,0x10,
+	0x0d,0x0b,0xff,0xf0,0x91,0x82,0x99,0xf0,0x91,0x82,0xba,0x00,0x0b,0x00,0x91,0x11,
+	0x10,0x0d,0x0b,0xff,0xf0,0x91,0x82,0x9b,0xf0,0x91,0x82,0xba,0x00,0x0b,0x00,0x0b,
+	0x00,0xd4,0x1d,0x53,0x04,0x0b,0x00,0x92,0x15,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,
+	0x00,0x0b,0xff,0xf0,0x91,0x82,0xa5,0xf0,0x91,0x82,0xba,0x00,0x0b,0x00,0x53,0x04,
+	0x0b,0x00,0x92,0x10,0xd1,0x08,0x10,0x04,0x0b,0x00,0x0b,0x09,0x10,0x04,0x0b,0x07,
+	0x0b,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x20,0x94,0x1c,0xd3,0x0c,0x92,0x08,0x11,0x04,
+	0x0b,0x00,0x00,0x00,0x00,0x00,0x52,0x04,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,
+	0x14,0x00,0x00,0x00,0x0d,0x00,0xd4,0x14,0x53,0x04,0x0d,0x00,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x0d,0x00,0x92,0x08,
+	0x11,0x04,0x0d,0x00,0x00,0x00,0x00,0x00,0xd1,0x96,0xd0,0x5c,0xcf,0x86,0xd5,0x18,
+	0x94,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x0d,0xe6,0x10,0x04,0x0d,0xe6,0x0d,0x00,
+	0x0d,0x00,0x0d,0x00,0x0d,0x00,0xd4,0x26,0x53,0x04,0x0d,0x00,0x52,0x04,0x0d,0x00,
+	0x51,0x04,0x0d,0x00,0x10,0x0d,0x0d,0xff,0xf0,0x91,0x84,0xb1,0xf0,0x91,0x84,0xa7,
+	0x00,0x0d,0xff,0xf0,0x91,0x84,0xb2,0xf0,0x91,0x84,0xa7,0x00,0x93,0x18,0xd2,0x0c,
+	0x51,0x04,0x0d,0x00,0x10,0x04,0x0d,0x00,0x0d,0x09,0x91,0x08,0x10,0x04,0x0d,0x09,
+	0x00,0x00,0x0d,0x00,0x0d,0x00,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x52,0x04,
+	0x0d,0x00,0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x10,0x00,
+	0x54,0x04,0x10,0x00,0x93,0x18,0xd2,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,
+	0x10,0x07,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd0,0x06,
+	0xcf,0x06,0x0d,0x00,0xcf,0x86,0xd5,0x40,0xd4,0x2c,0xd3,0x10,0x92,0x0c,0x91,0x08,
+	0x10,0x04,0x0d,0x09,0x0d,0x00,0x0d,0x00,0x0d,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,
+	0x0d,0x00,0x11,0x00,0x10,0x04,0x11,0x07,0x11,0x00,0x91,0x08,0x10,0x04,0x11,0x00,
+	0x10,0x00,0x00,0x00,0x53,0x04,0x0d,0x00,0x92,0x0c,0x51,0x04,0x0d,0x00,0x10,0x04,
+	0x10,0x00,0x11,0x00,0x11,0x00,0xd4,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x00,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x93,0x10,0x52,0x04,0x10,0x00,
+	0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd2,0xc8,0xd1,0x48,
+	0xd0,0x42,0xcf,0x86,0xd5,0x18,0x54,0x04,0x10,0x00,0x93,0x10,0x92,0x0c,0x51,0x04,
+	0x10,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x54,0x04,0x10,0x00,
+	0xd3,0x14,0x52,0x04,0x10,0x00,0xd1,0x08,0x10,0x04,0x10,0x00,0x10,0x09,0x10,0x04,
+	0x10,0x07,0x10,0x00,0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x12,0x00,
+	0x00,0x00,0xcf,0x06,0x00,0x00,0xd0,0x52,0xcf,0x86,0xd5,0x3c,0xd4,0x28,0xd3,0x10,
+	0x52,0x04,0x11,0x00,0x51,0x04,0x11,0x00,0x10,0x04,0x11,0x00,0x00,0x00,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x11,0x00,0x00,0x00,0x11,0x00,0x51,0x04,0x11,0x00,0x10,0x04,
+	0x00,0x00,0x11,0x00,0x53,0x04,0x11,0x00,0x52,0x04,0x11,0x00,0x51,0x04,0x11,0x00,
+	0x10,0x04,0x00,0x00,0x11,0x00,0x94,0x10,0x53,0x04,0x11,0x00,0x92,0x08,0x11,0x04,
+	0x11,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0xcf,0x86,0x55,0x04,0x10,0x00,0xd4,0x18,
+	0x53,0x04,0x10,0x00,0x92,0x10,0xd1,0x08,0x10,0x04,0x10,0x00,0x10,0x07,0x10,0x04,
+	0x10,0x09,0x00,0x00,0x00,0x00,0x53,0x04,0x10,0x00,0x92,0x08,0x11,0x04,0x10,0x00,
+	0x00,0x00,0x00,0x00,0xe1,0x27,0x01,0xd0,0x8a,0xcf,0x86,0xd5,0x44,0xd4,0x2c,0xd3,
+	0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x11,0x00,0x10,0x00,0x10,0x00,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x52,0x04,0x10,0x00,0xd1,0x08,0x10,0x04,0x10,
+	0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x93,0x14,0x92,0x10,0xd1,0x08,0x10,
+	0x04,0x10,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0xd4,
+	0x14,0x53,0x04,0x10,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x10,
+	0x00,0x10,0x00,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x10,
+	0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0xd2,0x0c,0x51,0x04,0x10,
+	0x00,0x10,0x04,0x00,0x00,0x14,0x07,0x91,0x08,0x10,0x04,0x10,0x07,0x10,0x00,0x10,
+	0x00,0xcf,0x86,0xd5,0x6a,0xd4,0x42,0xd3,0x14,0x52,0x04,0x10,0x00,0xd1,0x08,0x10,
+	0x04,0x10,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0xd2,0x19,0xd1,0x08,0x10,
+	0x04,0x10,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0xff,0xf0,0x91,0x8d,0x87,0xf0,
+	0x91,0x8c,0xbe,0x00,0x91,0x11,0x10,0x0d,0x10,0xff,0xf0,0x91,0x8d,0x87,0xf0,0x91,
+	0x8d,0x97,0x00,0x10,0x09,0x00,0x00,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x11,
+	0x00,0x00,0x00,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x52,
+	0x04,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0xd4,0x1c,0xd3,
+	0x0c,0x52,0x04,0x10,0x00,0x11,0x04,0x00,0x00,0x10,0xe6,0x52,0x04,0x10,0xe6,0x91,
+	0x08,0x10,0x04,0x10,0xe6,0x00,0x00,0x00,0x00,0x93,0x10,0x52,0x04,0x10,0xe6,0x91,
+	0x08,0x10,0x04,0x10,0xe6,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xe3,
+	0x30,0x01,0xd2,0xb7,0xd1,0x48,0xd0,0x06,0xcf,0x06,0x12,0x00,0xcf,0x86,0x95,0x3c,
+	0xd4,0x1c,0x93,0x18,0xd2,0x0c,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x09,0x12,0x00,
+	0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x07,0x12,0x00,0x12,0x00,0x53,0x04,0x12,0x00,
+	0xd2,0x0c,0x51,0x04,0x12,0x00,0x10,0x04,0x00,0x00,0x12,0x00,0xd1,0x08,0x10,0x04,
+	0x00,0x00,0x12,0x00,0x10,0x04,0x14,0xe6,0x15,0x00,0x00,0x00,0xd0,0x45,0xcf,0x86,
+	0x55,0x04,0x10,0x00,0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,0xd2,0x15,0x51,0x04,
+	0x10,0x00,0x10,0x04,0x10,0x00,0x10,0xff,0xf0,0x91,0x92,0xb9,0xf0,0x91,0x92,0xba,
+	0x00,0xd1,0x11,0x10,0x0d,0x10,0xff,0xf0,0x91,0x92,0xb9,0xf0,0x91,0x92,0xb0,0x00,
+	0x10,0x00,0x10,0x0d,0x10,0xff,0xf0,0x91,0x92,0xb9,0xf0,0x91,0x92,0xbd,0x00,0x10,
+	0x00,0xcf,0x86,0x95,0x24,0xd4,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,
+	0x04,0x10,0x09,0x10,0x07,0x10,0x00,0x00,0x00,0x53,0x04,0x10,0x00,0x92,0x08,0x11,
+	0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,
+	0x40,0xcf,0x86,0x55,0x04,0x10,0x00,0x54,0x04,0x10,0x00,0xd3,0x0c,0x52,0x04,0x10,
+	0x00,0x11,0x04,0x10,0x00,0x00,0x00,0xd2,0x1e,0x51,0x04,0x10,0x00,0x10,0x0d,0x10,
+	0xff,0xf0,0x91,0x96,0xb8,0xf0,0x91,0x96,0xaf,0x00,0x10,0xff,0xf0,0x91,0x96,0xb9,
+	0xf0,0x91,0x96,0xaf,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x10,0x09,0xcf,
+	0x86,0x95,0x2c,0xd4,0x1c,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x10,0x07,0x10,
+	0x00,0x10,0x00,0x10,0x00,0x92,0x08,0x11,0x04,0x10,0x00,0x11,0x00,0x11,0x00,0x53,
+	0x04,0x11,0x00,0x52,0x04,0x11,0x00,0x11,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0xd2,
+	0xa0,0xd1,0x5c,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x10,0x00,0x54,0x04,0x10,0x00,0x53,
+	0x04,0x10,0x00,0x52,0x04,0x10,0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x10,
+	0x09,0xcf,0x86,0xd5,0x24,0xd4,0x14,0x93,0x10,0x52,0x04,0x10,0x00,0x91,0x08,0x10,
+	0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x10,0x00,0x92,0x08,0x11,
+	0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x94,0x14,0x53,0x04,0x12,0x00,0x52,0x04,0x12,
+	0x00,0x91,0x08,0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd0,0x2a,0xcf,
+	0x86,0x55,0x04,0x0d,0x00,0x54,0x04,0x0d,0x00,0xd3,0x10,0x52,0x04,0x0d,0x00,0x51,
+	0x04,0x0d,0x00,0x10,0x04,0x0d,0x09,0x0d,0x07,0x92,0x0c,0x91,0x08,0x10,0x04,0x15,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0x95,0x14,0x94,0x10,0x53,0x04,0x0d,
+	0x00,0x92,0x08,0x11,0x04,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd1,
+	0x40,0xd0,0x3a,0xcf,0x86,0xd5,0x20,0x54,0x04,0x11,0x00,0x53,0x04,0x11,0x00,0xd2,
+	0x0c,0x51,0x04,0x11,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x00,
+	0x00,0x11,0x00,0x11,0x00,0x94,0x14,0x53,0x04,0x11,0x00,0x92,0x0c,0x51,0x04,0x11,
+	0x00,0x10,0x04,0x11,0x00,0x11,0x09,0x00,0x00,0x11,0x00,0xcf,0x06,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xe4,0x59,0x01,0xd3,0xb2,0xd2,0x5c,0xd1,0x28,0xd0,0x22,0xcf,0x86,
+	0x55,0x04,0x14,0x00,0x54,0x04,0x14,0x00,0x53,0x04,0x14,0x00,0x92,0x10,0xd1,0x08,
+	0x10,0x04,0x14,0x00,0x14,0x09,0x10,0x04,0x14,0x07,0x14,0x00,0x00,0x00,0xcf,0x06,
+	0x00,0x00,0xd0,0x0a,0xcf,0x86,0x15,0x04,0x00,0x00,0x10,0x00,0xcf,0x86,0x55,0x04,
+	0x10,0x00,0x54,0x04,0x10,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,
+	0x10,0x00,0x00,0x00,0x00,0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,
+	0x00,0x00,0x10,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x1a,0xcf,0x86,0x55,0x04,
+	0x00,0x00,0x94,0x10,0x53,0x04,0x15,0x00,0x92,0x08,0x11,0x04,0x00,0x00,0x15,0x00,
+	0x15,0x00,0x15,0x00,0xcf,0x86,0xd5,0x14,0x54,0x04,0x15,0x00,0x53,0x04,0x15,0x00,
+	0x92,0x08,0x11,0x04,0x00,0x00,0x15,0x00,0x15,0x00,0x94,0x1c,0x93,0x18,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x15,0x09,0x15,0x00,0x15,0x00,0x91,0x08,0x10,0x04,0x15,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd2,0xa0,0xd1,0x3c,0xd0,0x1e,0xcf,0x86,
+	0x55,0x04,0x13,0x00,0x54,0x04,0x13,0x00,0x93,0x10,0x52,0x04,0x13,0x00,0x91,0x08,
+	0x10,0x04,0x13,0x09,0x13,0x00,0x13,0x00,0x13,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,
+	0x93,0x10,0x52,0x04,0x13,0x00,0x51,0x04,0x13,0x00,0x10,0x04,0x13,0x00,0x13,0x09,
+	0x00,0x00,0x13,0x00,0x13,0x00,0xd0,0x46,0xcf,0x86,0xd5,0x2c,0xd4,0x10,0x93,0x0c,
+	0x52,0x04,0x13,0x00,0x11,0x04,0x15,0x00,0x13,0x00,0x13,0x00,0x53,0x04,0x13,0x00,
+	0xd2,0x0c,0x91,0x08,0x10,0x04,0x13,0x00,0x13,0x09,0x13,0x00,0x91,0x08,0x10,0x04,
+	0x13,0x00,0x14,0x00,0x13,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x13,0x00,
+	0x10,0x04,0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0x55,0x04,
+	0x10,0x00,0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xe3,0xa9,0x01,0xd2,
+	0xb0,0xd1,0x6c,0xd0,0x3e,0xcf,0x86,0xd5,0x18,0x94,0x14,0x53,0x04,0x12,0x00,0x92,
+	0x0c,0x91,0x08,0x10,0x04,0x12,0x00,0x00,0x00,0x12,0x00,0x12,0x00,0x12,0x00,0x54,
+	0x04,0x12,0x00,0xd3,0x10,0x52,0x04,0x12,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,
+	0x00,0x00,0x00,0x52,0x04,0x12,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x12,
+	0x09,0xcf,0x86,0xd5,0x14,0x94,0x10,0x93,0x0c,0x52,0x04,0x12,0x00,0x11,0x04,0x12,
+	0x00,0x00,0x00,0x00,0x00,0x12,0x00,0x94,0x14,0x53,0x04,0x12,0x00,0x52,0x04,0x12,
+	0x00,0x91,0x08,0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,0x12,0x00,0xd0,0x3e,0xcf,
+	0x86,0xd5,0x14,0x54,0x04,0x12,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x00,0x00,0x12,
+	0x00,0x12,0x00,0x12,0x00,0xd4,0x14,0x53,0x04,0x12,0x00,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x12,0x00,0x12,0x00,0x12,0x00,0x93,0x10,0x52,0x04,0x12,0x00,0x51,
+	0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,
+	0xa0,0xd0,0x52,0xcf,0x86,0xd5,0x24,0x94,0x20,0xd3,0x10,0x52,0x04,0x13,0x00,0x51,
+	0x04,0x13,0x00,0x10,0x04,0x13,0x00,0x00,0x00,0x92,0x0c,0x51,0x04,0x13,0x00,0x10,
+	0x04,0x00,0x00,0x13,0x00,0x13,0x00,0x13,0x00,0x54,0x04,0x13,0x00,0xd3,0x10,0x52,
+	0x04,0x13,0x00,0x51,0x04,0x13,0x00,0x10,0x04,0x13,0x00,0x00,0x00,0xd2,0x0c,0x51,
+	0x04,0x00,0x00,0x10,0x04,0x13,0x00,0x00,0x00,0x51,0x04,0x13,0x00,0x10,0x04,0x00,
+	0x00,0x13,0x00,0xcf,0x86,0xd5,0x28,0xd4,0x18,0x93,0x14,0xd2,0x0c,0x51,0x04,0x13,
+	0x00,0x10,0x04,0x13,0x07,0x13,0x00,0x11,0x04,0x13,0x09,0x13,0x00,0x00,0x00,0x53,
+	0x04,0x13,0x00,0x92,0x08,0x11,0x04,0x13,0x00,0x00,0x00,0x00,0x00,0x94,0x20,0xd3,
+	0x10,0x52,0x04,0x14,0x00,0x51,0x04,0x14,0x00,0x10,0x04,0x00,0x00,0x14,0x00,0x92,
+	0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,0x14,0x00,0x14,0x00,0x14,0x00,0xd0,
+	0x52,0xcf,0x86,0xd5,0x3c,0xd4,0x14,0x53,0x04,0x14,0x00,0x52,0x04,0x14,0x00,0x51,
+	0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x14,
+	0x00,0x10,0x04,0x00,0x00,0x14,0x00,0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x14,
+	0x09,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x94,
+	0x10,0x53,0x04,0x14,0x00,0x92,0x08,0x11,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xd2,0x2a,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x14,0x00,0x53,0x04,0x14,
+	0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd1,
+	0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x15,
+	0x00,0x54,0x04,0x15,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,0x15,0x00,0x00,0x00,0x00,
+	0x00,0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x15,0x00,0xd0,
+	0xca,0xcf,0x86,0xd5,0xc2,0xd4,0x54,0xd3,0x06,0xcf,0x06,0x09,0x00,0xd2,0x06,0xcf,
+	0x06,0x09,0x00,0xd1,0x24,0xd0,0x06,0xcf,0x06,0x09,0x00,0xcf,0x86,0x55,0x04,0x09,
+	0x00,0x94,0x14,0x53,0x04,0x09,0x00,0x52,0x04,0x09,0x00,0x51,0x04,0x09,0x00,0x10,
+	0x04,0x09,0x00,0x10,0x00,0x10,0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,0x10,
+	0x00,0x53,0x04,0x10,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,0x11,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd3,0x68,0xd2,0x46,0xd1,0x40,0xd0,
+	0x06,0xcf,0x06,0x09,0x00,0xcf,0x86,0x55,0x04,0x09,0x00,0xd4,0x20,0xd3,0x10,0x92,
+	0x0c,0x51,0x04,0x09,0x00,0x10,0x04,0x09,0x00,0x10,0x00,0x10,0x00,0x52,0x04,0x10,
+	0x00,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x00,0x00,0x93,0x10,0x52,0x04,0x09,
+	0x00,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x11,
+	0x00,0xd1,0x1c,0xd0,0x06,0xcf,0x06,0x11,0x00,0xcf,0x86,0x95,0x10,0x94,0x0c,0x93,
+	0x08,0x12,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x86,0xd5,0x4c,0xd4,0x06,0xcf,
+	0x06,0x0b,0x00,0xd3,0x40,0xd2,0x3a,0xd1,0x34,0xd0,0x2e,0xcf,0x86,0x55,0x04,0x0b,
+	0x00,0xd4,0x14,0x53,0x04,0x0b,0x00,0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,
+	0x04,0x0b,0x00,0x00,0x00,0x53,0x04,0x15,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x15,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,0x4c,0xd0,0x44,0xcf,
+	0x86,0xd5,0x3c,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,0x11,0x00,0xd2,
+	0x2a,0xd1,0x24,0xd0,0x06,0xcf,0x06,0x11,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,0x93,
+	0x10,0x52,0x04,0x11,0x00,0x51,0x04,0x11,0x00,0x10,0x04,0x11,0x00,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,
+	0x00,0xcf,0x86,0xcf,0x06,0x00,0x00,0xe0,0xd2,0x01,0xcf,0x86,0xd5,0x06,0xcf,0x06,
+	0x00,0x00,0xe4,0x0b,0x01,0xd3,0x06,0xcf,0x06,0x0c,0x00,0xd2,0x84,0xd1,0x50,0xd0,
+	0x1e,0xcf,0x86,0x55,0x04,0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x0c,0x00,0x92,
+	0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,
+	0x18,0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,0x52,0x04,0x10,0x00,0x51,0x04,0x10,
+	0x00,0x10,0x04,0x10,0x00,0x00,0x00,0x94,0x14,0x53,0x04,0x10,0x00,0xd2,0x08,0x11,
+	0x04,0x10,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x10,0x00,0x00,0x00,0xd0,0x06,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0xd5,0x08,0x14,0x04,0x00,0x00,0x10,0x00,0xd4,0x10,0x53,
+	0x04,0x10,0x00,0x52,0x04,0x10,0x00,0x11,0x04,0x10,0x00,0x00,0x00,0x93,0x10,0x52,
+	0x04,0x10,0x01,0x91,0x08,0x10,0x04,0x10,0x01,0x10,0x00,0x00,0x00,0x00,0x00,0xd1,
+	0x6c,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x10,0x00,0x54,0x04,0x10,0x00,0x93,0x10,0x52,
+	0x04,0x10,0xe6,0x51,0x04,0x10,0xe6,0x10,0x04,0x10,0xe6,0x10,0x00,0x10,0x00,0xcf,
+	0x86,0xd5,0x24,0xd4,0x10,0x93,0x0c,0x52,0x04,0x10,0x00,0x11,0x04,0x10,0x00,0x00,
+	0x00,0x00,0x00,0x53,0x04,0x10,0x00,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,0x00,
+	0x00,0x10,0x00,0x10,0x00,0xd4,0x14,0x93,0x10,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,
+	0x04,0x00,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x53,0x04,0x10,0x00,0x52,0x04,0x00,
+	0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x10,0x00,0x10,0x00,0xd0,0x0e,0xcf,0x86,0x95,
+	0x08,0x14,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,
+	0x06,0x00,0x00,0xd2,0x30,0xd1,0x0c,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x06,0x14,
+	0x00,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,0x14,0x00,0x53,0x04,0x14,0x00,0x92,
+	0x0c,0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xd1,0x4c,0xd0,0x06,0xcf,0x06,0x0d,0x00,0xcf,0x86,0xd5,0x2c,0x94,
+	0x28,0xd3,0x10,0x52,0x04,0x0d,0x00,0x91,0x08,0x10,0x04,0x0d,0x00,0x15,0x00,0x15,
+	0x00,0xd2,0x0c,0x51,0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,0x00,0x51,0x04,0x00,
+	0x00,0x10,0x04,0x00,0x00,0x15,0x00,0x0d,0x00,0x54,0x04,0x0d,0x00,0x53,0x04,0x0d,
+	0x00,0x52,0x04,0x0d,0x00,0x51,0x04,0x0d,0x00,0x10,0x04,0x0d,0x00,0x15,0x00,0xd0,
+	0x1e,0xcf,0x86,0x95,0x18,0x94,0x14,0x53,0x04,0x15,0x00,0x52,0x04,0x00,0x00,0x51,
+	0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0x00,0x00,0xcf,0x86,0x55,
+	0x04,0x00,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x12,0x00,0x13,
+	0x00,0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xcf,0x06,0x12,0x00,0xe2,
+	0xc6,0x01,0xd1,0x8e,0xd0,0x86,0xcf,0x86,0xd5,0x48,0xd4,0x06,0xcf,0x06,0x12,0x00,
+	0xd3,0x06,0xcf,0x06,0x12,0x00,0xd2,0x06,0xcf,0x06,0x12,0x00,0xd1,0x06,0xcf,0x06,
+	0x12,0x00,0xd0,0x06,0xcf,0x06,0x12,0x00,0xcf,0x86,0x55,0x04,0x12,0x00,0xd4,0x14,
+	0x53,0x04,0x12,0x00,0x52,0x04,0x12,0x00,0x91,0x08,0x10,0x04,0x12,0x00,0x14,0x00,
+	0x14,0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x14,0x00,0x15,0x00,0x15,0x00,0x00,0x00,
+	0xd4,0x36,0xd3,0x06,0xcf,0x06,0x12,0x00,0xd2,0x2a,0xd1,0x06,0xcf,0x06,0x12,0x00,
+	0xd0,0x06,0xcf,0x06,0x12,0x00,0xcf,0x86,0x55,0x04,0x12,0x00,0x54,0x04,0x12,0x00,
+	0x93,0x10,0x92,0x0c,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,
+	0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,0xd5,0xa2,0xd4,0x9c,0xd3,0x74,
+	0xd2,0x26,0xd1,0x20,0xd0,0x1a,0xcf,0x86,0x95,0x14,0x94,0x10,0x93,0x0c,0x92,0x08,
+	0x11,0x04,0x0c,0x00,0x13,0x00,0x13,0x00,0x13,0x00,0x13,0x00,0x13,0x00,0xcf,0x06,
+	0x13,0x00,0xcf,0x06,0x13,0x00,0xd1,0x48,0xd0,0x1e,0xcf,0x86,0x95,0x18,0x54,0x04,
+	0x13,0x00,0x53,0x04,0x13,0x00,0x52,0x04,0x13,0x00,0x51,0x04,0x13,0x00,0x10,0x04,
+	0x13,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0xd5,0x18,0x54,0x04,0x00,0x00,0x93,0x10,
+	0x92,0x0c,0x51,0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0x94,0x0c,0x93,0x08,0x12,0x04,0x00,0x00,0x15,0x00,0x00,0x00,0x13,0x00,0xcf,0x06,
+	0x13,0x00,0xd2,0x22,0xd1,0x06,0xcf,0x06,0x13,0x00,0xd0,0x06,0xcf,0x06,0x13,0x00,
+	0xcf,0x86,0x55,0x04,0x13,0x00,0x54,0x04,0x13,0x00,0x53,0x04,0x13,0x00,0x12,0x04,
+	0x13,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,
+	0x00,0x00,0xd3,0x7f,0xd2,0x79,0xd1,0x34,0xd0,0x06,0xcf,0x06,0x10,0x00,0xcf,0x86,
+	0x55,0x04,0x10,0x00,0xd4,0x14,0x53,0x04,0x10,0x00,0x92,0x0c,0x51,0x04,0x10,0x00,
+	0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x10,0x00,0x52,0x04,0x10,0x00,
+	0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd0,0x3f,0xcf,0x86,0xd5,0x2c,
+	0xd4,0x14,0x53,0x04,0x10,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x53,0x04,0x10,0x00,0xd2,0x08,0x11,0x04,0x10,0x00,0x00,0x00,
+	0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x01,0x10,0x00,0x94,0x0d,0x93,0x09,0x12,0x05,
+	0x10,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xe1,0x96,0x04,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xcf,0x86,0xe5,0x33,0x04,0xe4,0x83,0x02,0xe3,0xf8,0x01,0xd2,0x26,0xd1,0x06,0xcf,
+	0x06,0x05,0x00,0xd0,0x06,0xcf,0x06,0x05,0x00,0xcf,0x86,0x55,0x04,0x05,0x00,0x54,
+	0x04,0x05,0x00,0x93,0x0c,0x52,0x04,0x05,0x00,0x11,0x04,0x05,0x00,0x00,0x00,0x00,
+	0x00,0xd1,0xef,0xd0,0x2a,0xcf,0x86,0x55,0x04,0x05,0x00,0x94,0x20,0xd3,0x10,0x52,
+	0x04,0x05,0x00,0x51,0x04,0x05,0x00,0x10,0x04,0x05,0x00,0x00,0x00,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x0a,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0xcf,0x86,0xd5,
+	0x2a,0x54,0x04,0x05,0x00,0x53,0x04,0x05,0x00,0x52,0x04,0x05,0x00,0x51,0x04,0x05,
+	0x00,0x10,0x0d,0x05,0xff,0xf0,0x9d,0x85,0x97,0xf0,0x9d,0x85,0xa5,0x00,0x05,0xff,
+	0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0x00,0xd4,0x75,0xd3,0x61,0xd2,0x44,0xd1,
+	0x22,0x10,0x11,0x05,0xff,0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0xf0,0x9d,0x85,
+	0xae,0x00,0x05,0xff,0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0xf0,0x9d,0x85,0xaf,
+	0x00,0x10,0x11,0x05,0xff,0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0xf0,0x9d,0x85,
+	0xb0,0x00,0x05,0xff,0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0xf0,0x9d,0x85,0xb1,
+	0x00,0xd1,0x15,0x10,0x11,0x05,0xff,0xf0,0x9d,0x85,0x98,0xf0,0x9d,0x85,0xa5,0xf0,
+	0x9d,0x85,0xb2,0x00,0x05,0xd8,0x10,0x04,0x05,0xd8,0x05,0x01,0xd2,0x08,0x11,0x04,
+	0x05,0x01,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x05,0xe2,0x05,0xd8,0xd3,0x12,
+	0x92,0x0d,0x51,0x04,0x05,0xd8,0x10,0x04,0x05,0xd8,0x05,0xff,0x00,0x05,0xff,0x00,
+	0x92,0x0e,0x51,0x05,0x05,0xff,0x00,0x10,0x05,0x05,0xff,0x00,0x05,0xdc,0x05,0xdc,
+	0xd0,0x97,0xcf,0x86,0xd5,0x28,0x94,0x24,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x05,0xdc,
+	0x10,0x04,0x05,0xdc,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x05,0xe6,0x05,0xe6,
+	0x92,0x08,0x11,0x04,0x05,0xe6,0x05,0xdc,0x05,0x00,0x05,0x00,0xd4,0x14,0x53,0x04,
+	0x05,0x00,0xd2,0x08,0x11,0x04,0x05,0x00,0x05,0xe6,0x11,0x04,0x05,0xe6,0x05,0x00,
+	0x53,0x04,0x05,0x00,0xd2,0x15,0x51,0x04,0x05,0x00,0x10,0x04,0x05,0x00,0x05,0xff,
+	0xf0,0x9d,0x86,0xb9,0xf0,0x9d,0x85,0xa5,0x00,0xd1,0x1e,0x10,0x0d,0x05,0xff,0xf0,
+	0x9d,0x86,0xba,0xf0,0x9d,0x85,0xa5,0x00,0x05,0xff,0xf0,0x9d,0x86,0xb9,0xf0,0x9d,
+	0x85,0xa5,0xf0,0x9d,0x85,0xae,0x00,0x10,0x11,0x05,0xff,0xf0,0x9d,0x86,0xba,0xf0,
+	0x9d,0x85,0xa5,0xf0,0x9d,0x85,0xae,0x00,0x05,0xff,0xf0,0x9d,0x86,0xb9,0xf0,0x9d,
+	0x85,0xa5,0xf0,0x9d,0x85,0xaf,0x00,0xcf,0x86,0xd5,0x31,0xd4,0x21,0x93,0x1d,0x92,
+	0x19,0x91,0x15,0x10,0x11,0x05,0xff,0xf0,0x9d,0x86,0xba,0xf0,0x9d,0x85,0xa5,0xf0,
+	0x9d,0x85,0xaf,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0x53,0x04,0x05,0x00,
+	0x52,0x04,0x05,0x00,0x11,0x04,0x05,0x00,0x11,0x00,0x94,0x14,0x53,0x04,0x11,0x00,
+	0x92,0x0c,0x91,0x08,0x10,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	0xd2,0x44,0xd1,0x28,0xd0,0x06,0xcf,0x06,0x08,0x00,0xcf,0x86,0x95,0x1c,0x94,0x18,
+	0x93,0x14,0xd2,0x08,0x11,0x04,0x08,0x00,0x08,0xe6,0x91,0x08,0x10,0x04,0x08,0xe6,
+	0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,
+	0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x14,0x00,0x93,0x08,0x12,0x04,0x14,0x00,
+	0x00,0x00,0x00,0x00,0xd1,0x40,0xd0,0x06,0xcf,0x06,0x07,0x00,0xcf,0x86,0xd5,0x18,
+	0x54,0x04,0x07,0x00,0x93,0x10,0x52,0x04,0x07,0x00,0x51,0x04,0x07,0x00,0x10,0x04,
+	0x07,0x00,0x00,0x00,0x00,0x00,0x54,0x04,0x09,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,
+	0x09,0x00,0x14,0x00,0x14,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xe3,0x5f,0x01,0xd2,0xb4,0xd1,0x24,0xd0,
+	0x06,0xcf,0x06,0x05,0x00,0xcf,0x86,0x95,0x18,0x54,0x04,0x05,0x00,0x93,0x10,0x52,
+	0x04,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,0x00,0x05,0x00,0x05,
+	0x00,0xd0,0x6a,0xcf,0x86,0xd5,0x18,0x54,0x04,0x05,0x00,0x53,0x04,0x05,0x00,0x52,
+	0x04,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,0x00,0xd4,0x34,0xd3,
+	0x1c,0xd2,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x05,0x00,0x00,0x00,0xd1,0x08,0x10,
+	0x04,0x00,0x00,0x05,0x00,0x10,0x04,0x05,0x00,0x00,0x00,0xd2,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x05,0x00,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,
+	0x00,0x53,0x04,0x05,0x00,0xd2,0x0c,0x51,0x04,0x05,0x00,0x10,0x04,0x00,0x00,0x05,
+	0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x05,0x00,0x05,0x00,0xcf,0x86,0x95,0x20,0x94,
+	0x1c,0x93,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x05,0x00,0x07,0x00,0x05,0x00,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0x05,0x00,0xd1,
+	0xa4,0xd0,0x6a,0xcf,0x86,0xd5,0x48,0xd4,0x28,0xd3,0x10,0x52,0x04,0x05,0x00,0x51,
+	0x04,0x05,0x00,0x10,0x04,0x00,0x00,0x05,0x00,0xd2,0x0c,0x51,0x04,0x05,0x00,0x10,
+	0x04,0x05,0x00,0x00,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x05,0x00,0x05,0x00,0xd3,
+	0x10,0x52,0x04,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,0x00,0x52,
+	0x04,0x05,0x00,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,0x00,0x54,0x04,0x05,
+	0x00,0x53,0x04,0x05,0x00,0xd2,0x0c,0x51,0x04,0x05,0x00,0x10,0x04,0x00,0x00,0x05,
+	0x00,0x51,0x04,0x05,0x00,0x10,0x04,0x05,0x00,0x00,0x00,0xcf,0x86,0x95,0x34,0xd4,
+	0x20,0xd3,0x14,0x52,0x04,0x05,0x00,0xd1,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x10,
+	0x04,0x05,0x00,0x00,0x00,0x92,0x08,0x11,0x04,0x00,0x00,0x05,0x00,0x05,0x00,0x93,
+	0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x05,0x00,0x00,0x00,0x05,0x00,0x05,0x00,0x05,
+	0x00,0x05,0x00,0xcf,0x06,0x05,0x00,0xd2,0x26,0xd1,0x06,0xcf,0x06,0x05,0x00,0xd0,
+	0x1a,0xcf,0x86,0x55,0x04,0x05,0x00,0x94,0x10,0x93,0x0c,0x52,0x04,0x05,0x00,0x11,
+	0x04,0x08,0x00,0x00,0x00,0x05,0x00,0x05,0x00,0xcf,0x06,0x05,0x00,0xd1,0x06,0xcf,
+	0x06,0x05,0x00,0xd0,0x06,0xcf,0x06,0x05,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,0x53,
+	0x04,0x05,0x00,0xd2,0x08,0x11,0x04,0x05,0x00,0x09,0x00,0x11,0x04,0x00,0x00,0x05,
+	0x00,0x05,0x00,0x05,0x00,0xd4,0x52,0xd3,0x06,0xcf,0x06,0x11,0x00,0xd2,0x46,0xd1,
+	0x06,0xcf,0x06,0x11,0x00,0xd0,0x3a,0xcf,0x86,0xd5,0x20,0xd4,0x0c,0x53,0x04,0x11,
+	0x00,0x12,0x04,0x11,0x00,0x00,0x00,0x53,0x04,0x00,0x00,0x92,0x0c,0x51,0x04,0x00,
+	0x00,0x10,0x04,0x00,0x00,0x11,0x00,0x11,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x11,0x00,0x11,0x00,0x11,0x00,0x11,0x00,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xe0,0xc2,0x03,0xcf,0x86,
+	0xe5,0x03,0x01,0xd4,0xfc,0xd3,0xc0,0xd2,0x66,0xd1,0x60,0xd0,0x5a,0xcf,0x86,0xd5,
+	0x2c,0xd4,0x14,0x93,0x10,0x52,0x04,0x12,0xe6,0x51,0x04,0x12,0xe6,0x10,0x04,0x12,
+	0xe6,0x00,0x00,0x12,0xe6,0x53,0x04,0x12,0xe6,0x92,0x10,0xd1,0x08,0x10,0x04,0x12,
+	0xe6,0x00,0x00,0x10,0x04,0x00,0x00,0x12,0xe6,0x12,0xe6,0x94,0x28,0xd3,0x18,0xd2,
+	0x0c,0x51,0x04,0x12,0xe6,0x10,0x04,0x00,0x00,0x12,0xe6,0x91,0x08,0x10,0x04,0x12,
+	0xe6,0x00,0x00,0x12,0xe6,0x92,0x0c,0x51,0x04,0x12,0xe6,0x10,0x04,0x12,0xe6,0x00,
+	0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,0x54,0xd0,
+	0x36,0xcf,0x86,0x55,0x04,0x15,0x00,0xd4,0x14,0x53,0x04,0x15,0x00,0x52,0x04,0x15,
+	0x00,0x91,0x08,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0xd3,0x10,0x52,0x04,0x15,
+	0xe6,0x51,0x04,0x15,0xe6,0x10,0x04,0x15,0xe6,0x15,0x00,0x52,0x04,0x15,0x00,0x11,
+	0x04,0x15,0x00,0x00,0x00,0xcf,0x86,0x95,0x18,0x94,0x14,0x53,0x04,0x15,0x00,0xd2,
+	0x08,0x11,0x04,0x15,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x15,0x00,0x00,0x00,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xd2,0x36,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x15,0x00,0xd4,0x0c,0x53,0x04,0x15,0x00,0x12,
+	0x04,0x15,0x00,0x15,0xe6,0x53,0x04,0x15,0x00,0xd2,0x08,0x11,0x04,0x15,0x00,0x00,
+	0x00,0x51,0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x15,0x00,0xcf,0x06,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xd4,0x82,0xd3,0x7c,0xd2,0x3e,0xd1,0x06,0xcf,0x06,0x10,0x00,0xd0,
+	0x06,0xcf,0x06,0x10,0x00,0xcf,0x86,0x95,0x2c,0xd4,0x18,0x93,0x14,0x52,0x04,0x10,
+	0x00,0xd1,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x10,0x00,0x10,
+	0x00,0x93,0x10,0x52,0x04,0x10,0xdc,0x51,0x04,0x10,0xdc,0x10,0x04,0x10,0xdc,0x00,
+	0x00,0x00,0x00,0x00,0x00,0xd1,0x38,0xd0,0x06,0xcf,0x06,0x12,0x00,0xcf,0x86,0x95,
+	0x2c,0xd4,0x18,0xd3,0x08,0x12,0x04,0x12,0x00,0x12,0xe6,0x92,0x0c,0x51,0x04,0x12,
+	0xe6,0x10,0x04,0x12,0x07,0x15,0x00,0x00,0x00,0x53,0x04,0x12,0x00,0xd2,0x08,0x11,
+	0x04,0x12,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x12,0x00,0x00,0x00,0xcf,0x06,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xd3,0x82,0xd2,0x48,0xd1,0x24,0xd0,0x06,0xcf,0x06,0x00,
+	0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x93,0x10,0x92,0x0c,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x14,0x00,0x14,0x00,0x14,0x00,0x14,0x00,0xd0,0x1e,0xcf,
+	0x86,0x55,0x04,0x14,0x00,0x54,0x04,0x14,0x00,0x93,0x10,0x52,0x04,0x14,0x00,0x91,
+	0x08,0x10,0x04,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,
+	0x34,0xd0,0x2e,0xcf,0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x15,0x00,0x15,0x00,0x15,0x00,0x15,0x00,0x15,0x00,0x54,0x04,0x15,
+	0x00,0x53,0x04,0x15,0x00,0x52,0x04,0x15,0x00,0x11,0x04,0x15,0x00,0x00,0x00,0xcf,
+	0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xe2,0xb2,0x01,0xe1,0x41,0x01,0xd0,0x6e,0xcf,
+	0x86,0xd5,0x18,0x94,0x14,0x93,0x10,0x52,0x04,0x0d,0x00,0x91,0x08,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0x0d,0x00,0x0d,0x00,0x0d,0x00,0xd4,0x30,0xd3,0x20,0xd2,0x10,0xd1,
+	0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0xd1,0x08,0x10,
+	0x04,0x0d,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0x92,0x0c,0x91,0x08,0x10,
+	0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0x0d,0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x0d,
+	0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0x0d,0x00,0x92,0x10,0xd1,0x08,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0x00,0x00,0xcf,0x86,0xd5,0x74,0xd4,
+	0x34,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x00,0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0x51,
+	0x04,0x00,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0x10,0x04,0x00,0x00,0x0d,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x0d,
+	0x00,0x0d,0x00,0xd3,0x20,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x10,
+	0x04,0x0d,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x0d,0x00,0x00,0x00,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0xd1,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x10,0x04,0x00,0x00,0x0d,
+	0x00,0xd4,0x30,0xd3,0x20,0xd2,0x10,0xd1,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x10,
+	0x04,0x0d,0x00,0x00,0x00,0xd1,0x08,0x10,0x04,0x0d,0x00,0x00,0x00,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0x92,0x0c,0x51,0x04,0x0d,0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0x0d,
+	0x00,0xd3,0x10,0x92,0x0c,0x51,0x04,0x0d,0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0x0d,
+	0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0xd1,0x08,0x10,
+	0x04,0x0d,0x00,0x00,0x00,0x10,0x04,0x0d,0x00,0x00,0x00,0xd0,0x56,0xcf,0x86,0xd5,
+	0x20,0xd4,0x14,0x53,0x04,0x0d,0x00,0x92,0x0c,0x51,0x04,0x0d,0x00,0x10,0x04,0x00,
+	0x00,0x0d,0x00,0x0d,0x00,0x53,0x04,0x0d,0x00,0x12,0x04,0x0d,0x00,0x00,0x00,0xd4,
+	0x28,0xd3,0x18,0xd2,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0x92,0x0c,0x51,0x04,0x0d,0x00,0x10,
+	0x04,0x00,0x00,0x0d,0x00,0x0d,0x00,0x53,0x04,0x0d,0x00,0x12,0x04,0x0d,0x00,0x00,
+	0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x93,0x0c,0x92,0x08,0x11,
+	0x04,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x86,0xe5,
+	0x96,0x05,0xe4,0x28,0x03,0xe3,0xed,0x01,0xd2,0xa0,0xd1,0x1c,0xd0,0x16,0xcf,0x86,
+	0x55,0x04,0x0a,0x00,0x94,0x0c,0x53,0x04,0x0a,0x00,0x12,0x04,0x0a,0x00,0x00,0x00,
+	0x0a,0x00,0xcf,0x06,0x0a,0x00,0xd0,0x46,0xcf,0x86,0xd5,0x10,0x54,0x04,0x0a,0x00,
+	0x93,0x08,0x12,0x04,0x0a,0x00,0x00,0x00,0x00,0x00,0xd4,0x14,0x53,0x04,0x0c,0x00,
+	0x52,0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,0xd3,0x10,
+	0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0x52,0x04,
+	0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x10,0x00,0xcf,0x86,0xd5,0x28,
+	0xd4,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,0x0c,0x00,
+	0x0c,0x00,0x0c,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x00,0x00,0x0c,0x00,
+	0x0c,0x00,0x0c,0x00,0x0c,0x00,0x54,0x04,0x10,0x00,0x93,0x0c,0x52,0x04,0x10,0x00,
+	0x11,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd1,0xe4,0xd0,0x5a,0xcf,0x86,0xd5,0x20,
+	0x94,0x1c,0x53,0x04,0x0b,0x00,0xd2,0x0c,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,
+	0x10,0x00,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x0b,0x00,0xd4,0x14,
+	0x53,0x04,0x0b,0x00,0x52,0x04,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,0x0b,0x00,
+	0x14,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x0b,0x00,0x0c,0x00,
+	0x0c,0x00,0x52,0x04,0x0c,0x00,0xd1,0x08,0x10,0x04,0x0c,0x00,0x0b,0x00,0x10,0x04,
+	0x0c,0x00,0x0b,0x00,0xcf,0x86,0xd5,0x4c,0xd4,0x2c,0xd3,0x18,0xd2,0x0c,0x51,0x04,
+	0x0c,0x00,0x10,0x04,0x0b,0x00,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0b,0x00,
+	0x0c,0x00,0xd2,0x08,0x11,0x04,0x0c,0x00,0x0b,0x00,0x51,0x04,0x0b,0x00,0x10,0x04,
+	0x0b,0x00,0x0c,0x00,0xd3,0x10,0x52,0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,
+	0x0c,0x00,0x0b,0x00,0x52,0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,
+	0x0b,0x00,0xd4,0x18,0x53,0x04,0x0c,0x00,0xd2,0x08,0x11,0x04,0x0c,0x00,0x0d,0x00,
+	0x91,0x08,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0x53,0x04,0x0c,0x00,0xd2,0x10,
+	0xd1,0x08,0x10,0x04,0x0c,0x00,0x0b,0x00,0x10,0x04,0x0c,0x00,0x0b,0x00,0xd1,0x08,
+	0x10,0x04,0x0b,0x00,0x0c,0x00,0x10,0x04,0x0c,0x00,0x0b,0x00,0xd0,0x4e,0xcf,0x86,
+	0xd5,0x34,0xd4,0x14,0x53,0x04,0x0c,0x00,0xd2,0x08,0x11,0x04,0x0c,0x00,0x0b,0x00,
+	0x11,0x04,0x0b,0x00,0x0c,0x00,0xd3,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0b,0x00,
+	0x0c,0x00,0x0c,0x00,0x0c,0x00,0x92,0x0c,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,
+	0x12,0x00,0x12,0x00,0x94,0x14,0x53,0x04,0x12,0x00,0x52,0x04,0x12,0x00,0x91,0x08,
+	0x10,0x04,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,
+	0x94,0x10,0x93,0x0c,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x0c,0x00,0x0c,0x00,
+	0x0c,0x00,0xd2,0x7e,0xd1,0x78,0xd0,0x3e,0xcf,0x86,0xd5,0x1c,0x94,0x18,0x93,0x14,
+	0x92,0x10,0xd1,0x08,0x10,0x04,0x0b,0x00,0x0c,0x00,0x10,0x04,0x0c,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0x0b,0x00,0x54,0x04,0x0b,0x00,0xd3,0x0c,0x92,0x08,0x11,0x04,
+	0x0b,0x00,0x0c,0x00,0x0c,0x00,0x92,0x0c,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,
+	0x12,0x00,0x00,0x00,0xcf,0x86,0xd5,0x24,0xd4,0x14,0x53,0x04,0x0b,0x00,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x0b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x93,0x0c,0x92,0x08,
+	0x11,0x04,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x94,0x10,0x93,0x0c,0x52,0x04,
+	0x13,0x00,0x11,0x04,0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,
+	0xd1,0x58,0xd0,0x3a,0xcf,0x86,0x55,0x04,0x0c,0x00,0xd4,0x20,0xd3,0x10,0x92,0x0c,
+	0x91,0x08,0x10,0x04,0x0c,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x52,0x04,0x10,0x00,
+	0x91,0x08,0x10,0x04,0x10,0x00,0x11,0x00,0x11,0x00,0x93,0x10,0x52,0x04,0x0c,0x00,
+	0x51,0x04,0x0c,0x00,0x10,0x04,0x10,0x00,0x0c,0x00,0x0c,0x00,0xcf,0x86,0x55,0x04,
+	0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x0c,0x00,0x52,0x04,0x0c,0x00,0x91,0x08,
+	0x10,0x04,0x0c,0x00,0x10,0x00,0x11,0x00,0xd0,0x16,0xcf,0x86,0x95,0x10,0x54,0x04,
+	0x0c,0x00,0x93,0x08,0x12,0x04,0x0c,0x00,0x10,0x00,0x10,0x00,0x0c,0x00,0xcf,0x86,
+	0xd5,0x34,0xd4,0x28,0xd3,0x10,0x52,0x04,0x0c,0x00,0x91,0x08,0x10,0x04,0x0c,0x00,
+	0x10,0x00,0x0c,0x00,0xd2,0x0c,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x10,0x00,
+	0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x11,0x00,0x93,0x08,0x12,0x04,0x11,0x00,
+	0x10,0x00,0x10,0x00,0x54,0x04,0x0c,0x00,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,
+	0x0c,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x11,0x00,0xd3,0xfc,0xd2,0x6c,0xd1,0x3c,
+	0xd0,0x1e,0xcf,0x86,0x55,0x04,0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x0c,0x00,
+	0x52,0x04,0x0c,0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x10,0x00,0xcf,0x86,
+	0x95,0x18,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x10,0x00,
+	0x0c,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0xd0,0x06,0xcf,0x06,0x0c,0x00,
+	0xcf,0x86,0x55,0x04,0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x0c,0x00,0xd2,0x0c,
+	0x91,0x08,0x10,0x04,0x10,0x00,0x0c,0x00,0x0c,0x00,0xd1,0x08,0x10,0x04,0x0c,0x00,
+	0x10,0x00,0x10,0x04,0x10,0x00,0x11,0x00,0xd1,0x54,0xd0,0x1a,0xcf,0x86,0x55,0x04,
+	0x0c,0x00,0x54,0x04,0x0c,0x00,0x53,0x04,0x0c,0x00,0x52,0x04,0x0c,0x00,0x11,0x04,
+	0x0c,0x00,0x10,0x00,0xcf,0x86,0xd5,0x1c,0x94,0x18,0xd3,0x08,0x12,0x04,0x0d,0x00,
+	0x10,0x00,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,0x10,0x00,0x11,0x00,0x11,0x00,
+	0x0c,0x00,0xd4,0x08,0x13,0x04,0x0c,0x00,0x10,0x00,0x53,0x04,0x10,0x00,0x92,0x0c,
+	0x51,0x04,0x10,0x00,0x10,0x04,0x12,0x00,0x10,0x00,0x10,0x00,0xd0,0x1e,0xcf,0x86,
+	0x55,0x04,0x10,0x00,0x94,0x14,0x93,0x10,0x52,0x04,0x10,0x00,0x91,0x08,0x10,0x04,
+	0x12,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0x10,0x00,0xcf,0x86,0x55,0x04,0x10,0x00,
+	0x54,0x04,0x10,0x00,0x53,0x04,0x10,0x00,0x92,0x0c,0x51,0x04,0x10,0x00,0x10,0x04,
+	0x10,0x00,0x0c,0x00,0x0c,0x00,0xe2,0x19,0x01,0xd1,0xa8,0xd0,0x7e,0xcf,0x86,0xd5,
+	0x4c,0xd4,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x0d,0x00,0x0c,0x00,0x0c,
+	0x00,0x0c,0x00,0x0c,0x00,0xd3,0x1c,0xd2,0x0c,0x91,0x08,0x10,0x04,0x0c,0x00,0x0d,
+	0x00,0x0c,0x00,0xd1,0x08,0x10,0x04,0x0c,0x00,0x0d,0x00,0x10,0x04,0x0c,0x00,0x0d,
+	0x00,0xd2,0x10,0xd1,0x08,0x10,0x04,0x0c,0x00,0x0d,0x00,0x10,0x04,0x0c,0x00,0x0d,
+	0x00,0x51,0x04,0x0c,0x00,0x10,0x04,0x0c,0x00,0x0d,0x00,0xd4,0x1c,0xd3,0x0c,0x52,
+	0x04,0x0c,0x00,0x11,0x04,0x0c,0x00,0x0d,0x00,0x52,0x04,0x0c,0x00,0x91,0x08,0x10,
+	0x04,0x0d,0x00,0x0c,0x00,0x0d,0x00,0x93,0x10,0x52,0x04,0x0c,0x00,0x91,0x08,0x10,
+	0x04,0x0d,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0xcf,0x86,0x95,0x24,0x94,0x20,0x93,
+	0x1c,0xd2,0x10,0xd1,0x08,0x10,0x04,0x0c,0x00,0x10,0x00,0x10,0x04,0x10,0x00,0x11,
+	0x00,0x91,0x08,0x10,0x04,0x11,0x00,0x0c,0x00,0x0c,0x00,0x0c,0x00,0x10,0x00,0x10,
+	0x00,0xd0,0x06,0xcf,0x06,0x0c,0x00,0xcf,0x86,0xd5,0x30,0xd4,0x10,0x93,0x0c,0x52,
+	0x04,0x0c,0x00,0x11,0x04,0x0c,0x00,0x10,0x00,0x10,0x00,0x93,0x1c,0xd2,0x10,0xd1,
+	0x08,0x10,0x04,0x11,0x00,0x12,0x00,0x10,0x04,0x12,0x00,0x13,0x00,0x91,0x08,0x10,
+	0x04,0x13,0x00,0x15,0x00,0x00,0x00,0x00,0x00,0xd4,0x14,0x53,0x04,0x10,0x00,0x52,
+	0x04,0x10,0x00,0x91,0x08,0x10,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0xd3,0x10,0x52,
+	0x04,0x10,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x13,0x00,0x92,0x10,0xd1,
+	0x08,0x10,0x04,0x13,0x00,0x14,0x00,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0xd1,
+	0x1c,0xd0,0x06,0xcf,0x06,0x0c,0x00,0xcf,0x86,0x55,0x04,0x0c,0x00,0x54,0x04,0x0c,
+	0x00,0x93,0x08,0x12,0x04,0x0c,0x00,0x00,0x00,0x00,0x00,0xd0,0x06,0xcf,0x06,0x10,
+	0x00,0xcf,0x86,0xd5,0x24,0x54,0x04,0x10,0x00,0xd3,0x10,0x52,0x04,0x10,0x00,0x91,
+	0x08,0x10,0x04,0x10,0x00,0x14,0x00,0x14,0x00,0x92,0x0c,0x91,0x08,0x10,0x04,0x14,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x94,0x0c,0x53,0x04,0x15,0x00,0x12,0x04,0x15,
+	0x00,0x00,0x00,0x00,0x00,0xe4,0x40,0x02,0xe3,0xc9,0x01,0xd2,0x5c,0xd1,0x34,0xd0,
+	0x16,0xcf,0x86,0x95,0x10,0x94,0x0c,0x53,0x04,0x10,0x00,0x12,0x04,0x10,0x00,0x00,
+	0x00,0x10,0x00,0x10,0x00,0xcf,0x86,0x95,0x18,0xd4,0x08,0x13,0x04,0x10,0x00,0x00,
+	0x00,0x53,0x04,0x10,0x00,0x92,0x08,0x11,0x04,0x10,0x00,0x00,0x00,0x00,0x00,0x10,
+	0x00,0xd0,0x22,0xcf,0x86,0xd5,0x0c,0x94,0x08,0x13,0x04,0x10,0x00,0x00,0x00,0x10,
+	0x00,0x94,0x10,0x53,0x04,0x10,0x00,0x52,0x04,0x10,0x00,0x11,0x04,0x10,0x00,0x00,
+	0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xd1,0xc0,0xd0,0x5e,0xcf,0x86,0xd5,0x30,0xd4,
+	0x14,0x53,0x04,0x13,0x00,0x52,0x04,0x13,0x00,0x91,0x08,0x10,0x04,0x00,0x00,0x15,
+	0x00,0x15,0x00,0x53,0x04,0x11,0x00,0xd2,0x0c,0x91,0x08,0x10,0x04,0x11,0x00,0x12,
+	0x00,0x12,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x13,0x00,0xd4,0x08,0x13,
+	0x04,0x12,0x00,0x13,0x00,0xd3,0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x12,0x00,0x13,
+	0x00,0x10,0x04,0x13,0x00,0x12,0x00,0x12,0x00,0x52,0x04,0x12,0x00,0x51,0x04,0x12,
+	0x00,0x10,0x04,0x12,0x00,0x15,0x00,0xcf,0x86,0xd5,0x28,0xd4,0x14,0x53,0x04,0x12,
+	0x00,0x52,0x04,0x12,0x00,0x91,0x08,0x10,0x04,0x13,0x00,0x14,0x00,0x14,0x00,0x53,
+	0x04,0x12,0x00,0x52,0x04,0x12,0x00,0x51,0x04,0x12,0x00,0x10,0x04,0x12,0x00,0x13,
+	0x00,0xd4,0x0c,0x53,0x04,0x13,0x00,0x12,0x04,0x13,0x00,0x14,0x00,0xd3,0x1c,0xd2,
+	0x10,0xd1,0x08,0x10,0x04,0x14,0x00,0x15,0x00,0x10,0x04,0x00,0x00,0x14,0x00,0x51,
+	0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0x92,0x0c,0x51,0x04,0x00,0x00,0x10,
+	0x04,0x14,0x00,0x15,0x00,0x14,0x00,0xd0,0x62,0xcf,0x86,0xd5,0x24,0xd4,0x14,0x93,
+	0x10,0x52,0x04,0x11,0x00,0x91,0x08,0x10,0x04,0x11,0x00,0x12,0x00,0x12,0x00,0x12,
+	0x00,0x93,0x0c,0x92,0x08,0x11,0x04,0x12,0x00,0x13,0x00,0x13,0x00,0x14,0x00,0xd4,
+	0x2c,0xd3,0x18,0xd2,0x0c,0x51,0x04,0x14,0x00,0x10,0x04,0x14,0x00,0x00,0x00,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x15,0x00,0x15,0x00,0xd2,0x0c,0x51,0x04,0x15,0x00,0x10,
+	0x04,0x15,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x15,0x00,0x53,0x04,0x14,0x00,0x92,
+	0x08,0x11,0x04,0x14,0x00,0x15,0x00,0x15,0x00,0xcf,0x86,0xd5,0x30,0x94,0x2c,0xd3,
+	0x14,0x92,0x10,0xd1,0x08,0x10,0x04,0x11,0x00,0x14,0x00,0x10,0x04,0x14,0x00,0x15,
+	0x00,0x15,0x00,0xd2,0x0c,0x51,0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,0x00,0x91,
+	0x08,0x10,0x04,0x00,0x00,0x15,0x00,0x15,0x00,0x13,0x00,0x94,0x14,0x93,0x10,0x52,
+	0x04,0x13,0x00,0x51,0x04,0x13,0x00,0x10,0x04,0x13,0x00,0x14,0x00,0x14,0x00,0x14,
+	0x00,0xd2,0x70,0xd1,0x40,0xd0,0x06,0xcf,0x06,0x15,0x00,0xcf,0x86,0xd5,0x10,0x54,
+	0x04,0x15,0x00,0x93,0x08,0x12,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0xd4,0x10,0x53,
+	0x04,0x14,0x00,0x52,0x04,0x14,0x00,0x11,0x04,0x14,0x00,0x00,0x00,0xd3,0x08,0x12,
+	0x04,0x15,0x00,0x00,0x00,0x92,0x0c,0x51,0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,
+	0x00,0x00,0x00,0xd0,0x2a,0xcf,0x86,0x95,0x24,0xd4,0x14,0x93,0x10,0x92,0x0c,0x51,
+	0x04,0x15,0x00,0x10,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x93,0x0c,0x52,
+	0x04,0x15,0x00,0x11,0x04,0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,
+	0x00,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,
+	0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,
+	0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,
+	0x04,0x00,0x00,0x02,0x00,0xe4,0xf9,0x12,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,
+	0xd2,0xc2,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x05,0x00,0xd0,0x44,0xcf,0x86,0xd5,0x3c,
+	0xd4,0x06,0xcf,0x06,0x05,0x00,0xd3,0x06,0xcf,0x06,0x05,0x00,0xd2,0x2a,0xd1,0x06,
+	0xcf,0x06,0x05,0x00,0xd0,0x06,0xcf,0x06,0x05,0x00,0xcf,0x86,0x95,0x18,0x54,0x04,
+	0x05,0x00,0x93,0x10,0x52,0x04,0x05,0x00,0x51,0x04,0x05,0x00,0x10,0x04,0x05,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x0b,0x00,0xcf,0x06,0x0b,0x00,0xcf,0x86,
+	0xd5,0x3c,0xd4,0x06,0xcf,0x06,0x0b,0x00,0xd3,0x06,0xcf,0x06,0x0b,0x00,0xd2,0x06,
+	0xcf,0x06,0x0b,0x00,0xd1,0x24,0xd0,0x1e,0xcf,0x86,0x55,0x04,0x0b,0x00,0x54,0x04,
+	0x0b,0x00,0x93,0x10,0x52,0x04,0x0b,0x00,0x91,0x08,0x10,0x04,0x0b,0x00,0x00,0x00,
+	0x00,0x00,0x00,0x00,0xcf,0x06,0x0c,0x00,0xcf,0x06,0x0c,0x00,0xd4,0x32,0xd3,0x2c,
+	0xd2,0x26,0xd1,0x20,0xd0,0x1a,0xcf,0x86,0x95,0x14,0x54,0x04,0x0c,0x00,0x53,0x04,
+	0x0c,0x00,0x52,0x04,0x0c,0x00,0x11,0x04,0x0c,0x00,0x00,0x00,0x11,0x00,0xcf,0x06,
+	0x11,0x00,0xcf,0x06,0x11,0x00,0xcf,0x06,0x11,0x00,0xcf,0x06,0x11,0x00,0xcf,0x06,
+	0x11,0x00,0xd1,0x48,0xd0,0x40,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x11,0x00,0xd4,0x06,
+	0xcf,0x06,0x11,0x00,0xd3,0x06,0xcf,0x06,0x11,0x00,0xd2,0x26,0xd1,0x06,0xcf,0x06,
+	0x11,0x00,0xd0,0x1a,0xcf,0x86,0x55,0x04,0x11,0x00,0x94,0x10,0x93,0x0c,0x92,0x08,
+	0x11,0x04,0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x13,0x00,0xcf,0x06,0x13,0x00,
+	0xcf,0x06,0x13,0x00,0xcf,0x86,0xcf,0x06,0x13,0x00,0xd0,0x44,0xcf,0x86,0xd5,0x06,
+	0xcf,0x06,0x13,0x00,0xd4,0x36,0xd3,0x06,0xcf,0x06,0x13,0x00,0xd2,0x06,0xcf,0x06,
+	0x13,0x00,0xd1,0x06,0xcf,0x06,0x13,0x00,0xd0,0x06,0xcf,0x06,0x13,0x00,0xcf,0x86,
+	0x55,0x04,0x13,0x00,0x94,0x14,0x93,0x10,0x92,0x0c,0x91,0x08,0x10,0x04,0x13,0x00,
+	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x86,
+	0xd5,0x06,0xcf,0x06,0x00,0x00,0xe4,0x68,0x11,0xe3,0x51,0x10,0xe2,0x17,0x08,0xe1,
+	0x06,0x04,0xe0,0x03,0x02,0xcf,0x86,0xe5,0x06,0x01,0xd4,0x82,0xd3,0x41,0xd2,0x21,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe4,0xb8,0xbd,0x00,0x05,0xff,0xe4,0xb8,0xb8,0x00,
+	0x10,0x08,0x05,0xff,0xe4,0xb9,0x81,0x00,0x05,0xff,0xf0,0xa0,0x84,0xa2,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe4,0xbd,0xa0,0x00,0x05,0xff,0xe4,0xbe,0xae,0x00,0x10,
+	0x08,0x05,0xff,0xe4,0xbe,0xbb,0x00,0x05,0xff,0xe5,0x80,0x82,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe5,0x81,0xba,0x00,0x05,0xff,0xe5,0x82,0x99,0x00,0x10,
+	0x08,0x05,0xff,0xe5,0x83,0xa7,0x00,0x05,0xff,0xe5,0x83,0x8f,0x00,0xd1,0x11,0x10,
+	0x08,0x05,0xff,0xe3,0x92,0x9e,0x00,0x05,0xff,0xf0,0xa0,0x98,0xba,0x00,0x10,0x08,
+	0x05,0xff,0xe5,0x85,0x8d,0x00,0x05,0xff,0xe5,0x85,0x94,0x00,0xd3,0x42,0xd2,0x21,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x85,0xa4,0x00,0x05,0xff,0xe5,0x85,0xb7,0x00,
+	0x10,0x09,0x05,0xff,0xf0,0xa0,0x94,0x9c,0x00,0x05,0xff,0xe3,0x92,0xb9,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe5,0x85,0xa7,0x00,0x05,0xff,0xe5,0x86,0x8d,0x00,0x10,
+	0x09,0x05,0xff,0xf0,0xa0,0x95,0x8b,0x00,0x05,0xff,0xe5,0x86,0x97,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x86,0xa4,0x00,0x05,0xff,0xe4,0xbb,0x8c,0x00,
+	0x10,0x08,0x05,0xff,0xe5,0x86,0xac,0x00,0x05,0xff,0xe5,0x86,0xb5,0x00,0xd1,0x11,
+	0x10,0x09,0x05,0xff,0xf0,0xa9,0x87,0x9f,0x00,0x05,0xff,0xe5,0x87,0xb5,0x00,0x10,
+	0x08,0x05,0xff,0xe5,0x88,0x83,0x00,0x05,0xff,0xe3,0x93,0x9f,0x00,0xd4,0x80,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x88,0xbb,0x00,0x05,0xff,0xe5,
+	0x89,0x86,0x00,0x10,0x08,0x05,0xff,0xe5,0x89,0xb2,0x00,0x05,0xff,0xe5,0x89,0xb7,
+	0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe3,0x94,0x95,0x00,0x05,0xff,0xe5,0x8b,0x87,
+	0x00,0x10,0x08,0x05,0xff,0xe5,0x8b,0x89,0x00,0x05,0xff,0xe5,0x8b,0xa4,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x8b,0xba,0x00,0x05,0xff,0xe5,0x8c,0x85,
+	0x00,0x10,0x08,0x05,0xff,0xe5,0x8c,0x86,0x00,0x05,0xff,0xe5,0x8c,0x97,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe5,0x8d,0x89,0x00,0x05,0xff,0xe5,0x8d,0x91,0x00,0x10,
+	0x08,0x05,0xff,0xe5,0x8d,0x9a,0x00,0x05,0xff,0xe5,0x8d,0xb3,0x00,0xd3,0x39,0xd2,
+	0x18,0x91,0x10,0x10,0x08,0x05,0xff,0xe5,0x8d,0xbd,0x00,0x05,0xff,0xe5,0x8d,0xbf,
+	0x00,0x05,0xff,0xe5,0x8d,0xbf,0x00,0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa0,0xa8,
+	0xac,0x00,0x05,0xff,0xe7,0x81,0xb0,0x00,0x10,0x08,0x05,0xff,0xe5,0x8f,0x8a,0x00,
+	0x05,0xff,0xe5,0x8f,0x9f,0x00,0xd2,0x21,0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa0,
+	0xad,0xa3,0x00,0x05,0xff,0xe5,0x8f,0xab,0x00,0x10,0x08,0x05,0xff,0xe5,0x8f,0xb1,
+	0x00,0x05,0xff,0xe5,0x90,0x86,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x92,0x9e,
+	0x00,0x05,0xff,0xe5,0x90,0xb8,0x00,0x10,0x08,0x05,0xff,0xe5,0x91,0x88,0x00,0x05,
+	0xff,0xe5,0x91,0xa8,0x00,0xcf,0x86,0xe5,0x02,0x01,0xd4,0x80,0xd3,0x40,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0x92,0xa2,0x00,0x05,0xff,0xe5,0x93,0xb6,0x00,
+	0x10,0x08,0x05,0xff,0xe5,0x94,0x90,0x00,0x05,0xff,0xe5,0x95,0x93,0x00,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe5,0x95,0xa3,0x00,0x05,0xff,0xe5,0x96,0x84,0x00,0x10,0x08,
+	0x05,0xff,0xe5,0x96,0x84,0x00,0x05,0xff,0xe5,0x96,0x99,0x00,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe5,0x96,0xab,0x00,0x05,0xff,0xe5,0x96,0xb3,0x00,0x10,0x08,
+	0x05,0xff,0xe5,0x97,0x82,0x00,0x05,0xff,0xe5,0x9c,0x96,0x00,0xd1,0x10,0x10,0x08,
+	0x05,0xff,0xe5,0x98,0x86,0x00,0x05,0xff,0xe5,0x9c,0x97,0x00,0x10,0x08,0x05,0xff,
+	0xe5,0x99,0x91,0x00,0x05,0xff,0xe5,0x99,0xb4,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe5,0x88,0x87,0x00,0x05,0xff,0xe5,0xa3,0xae,0x00,0x10,0x08,
+	0x05,0xff,0xe5,0x9f,0x8e,0x00,0x05,0xff,0xe5,0x9f,0xb4,0x00,0xd1,0x10,0x10,0x08,
+	0x05,0xff,0xe5,0xa0,0x8d,0x00,0x05,0xff,0xe5,0x9e,0x8b,0x00,0x10,0x08,0x05,0xff,
+	0xe5,0xa0,0xb2,0x00,0x05,0xff,0xe5,0xa0,0xb1,0x00,0xd2,0x21,0xd1,0x11,0x10,0x08,
+	0x05,0xff,0xe5,0xa2,0xac,0x00,0x05,0xff,0xf0,0xa1,0x93,0xa4,0x00,0x10,0x08,0x05,
+	0xff,0xe5,0xa3,0xb2,0x00,0x05,0xff,0xe5,0xa3,0xb7,0x00,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe5,0xa4,0x86,0x00,0x05,0xff,0xe5,0xa4,0x9a,0x00,0x10,0x08,0x05,0xff,0xe5,
+	0xa4,0xa2,0x00,0x05,0xff,0xe5,0xa5,0xa2,0x00,0xd4,0x7b,0xd3,0x42,0xd2,0x22,0xd1,
+	0x12,0x10,0x09,0x05,0xff,0xf0,0xa1,0x9a,0xa8,0x00,0x05,0xff,0xf0,0xa1,0x9b,0xaa,
+	0x00,0x10,0x08,0x05,0xff,0xe5,0xa7,0xac,0x00,0x05,0xff,0xe5,0xa8,0x9b,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe5,0xa8,0xa7,0x00,0x05,0xff,0xe5,0xa7,0x98,0x00,0x10,
+	0x08,0x05,0xff,0xe5,0xa9,0xa6,0x00,0x05,0xff,0xe3,0x9b,0xae,0x00,0xd2,0x18,0x91,
+	0x10,0x10,0x08,0x05,0xff,0xe3,0x9b,0xbc,0x00,0x05,0xff,0xe5,0xac,0x88,0x00,0x05,
+	0xff,0xe5,0xac,0xbe,0x00,0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa1,0xa7,0x88,0x00,
+	0x05,0xff,0xe5,0xaf,0x83,0x00,0x10,0x08,0x05,0xff,0xe5,0xaf,0x98,0x00,0x05,0xff,
+	0xe5,0xaf,0xa7,0x00,0xd3,0x41,0xd2,0x21,0xd1,0x11,0x10,0x08,0x05,0xff,0xe5,0xaf,
+	0xb3,0x00,0x05,0xff,0xf0,0xa1,0xac,0x98,0x00,0x10,0x08,0x05,0xff,0xe5,0xaf,0xbf,
+	0x00,0x05,0xff,0xe5,0xb0,0x86,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0xbd,0x93,
+	0x00,0x05,0xff,0xe5,0xb0,0xa2,0x00,0x10,0x08,0x05,0xff,0xe3,0x9e,0x81,0x00,0x05,
+	0xff,0xe5,0xb1,0xa0,0x00,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0xb1,0xae,
+	0x00,0x05,0xff,0xe5,0xb3,0x80,0x00,0x10,0x08,0x05,0xff,0xe5,0xb2,0x8d,0x00,0x05,
+	0xff,0xf0,0xa1,0xb7,0xa4,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe5,0xb5,0x83,0x00,
+	0x05,0xff,0xf0,0xa1,0xb7,0xa6,0x00,0x10,0x08,0x05,0xff,0xe5,0xb5,0xae,0x00,0x05,
+	0xff,0xe5,0xb5,0xab,0x00,0xe0,0x04,0x02,0xcf,0x86,0xd5,0xfe,0xd4,0x82,0xd3,0x40,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0xb5,0xbc,0x00,0x05,0xff,0xe5,0xb7,
+	0xa1,0x00,0x10,0x08,0x05,0xff,0xe5,0xb7,0xa2,0x00,0x05,0xff,0xe3,0xa0,0xaf,0x00,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0xb7,0xbd,0x00,0x05,0xff,0xe5,0xb8,0xa8,0x00,
+	0x10,0x08,0x05,0xff,0xe5,0xb8,0xbd,0x00,0x05,0xff,0xe5,0xb9,0xa9,0x00,0xd2,0x21,
+	0xd1,0x11,0x10,0x08,0x05,0xff,0xe3,0xa1,0xa2,0x00,0x05,0xff,0xf0,0xa2,0x86,0x83,
+	0x00,0x10,0x08,0x05,0xff,0xe3,0xa1,0xbc,0x00,0x05,0xff,0xe5,0xba,0xb0,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe5,0xba,0xb3,0x00,0x05,0xff,0xe5,0xba,0xb6,0x00,0x10,
+	0x08,0x05,0xff,0xe5,0xbb,0x8a,0x00,0x05,0xff,0xf0,0xaa,0x8e,0x92,0x00,0xd3,0x3b,
+	0xd2,0x22,0xd1,0x11,0x10,0x08,0x05,0xff,0xe5,0xbb,0xbe,0x00,0x05,0xff,0xf0,0xa2,
+	0x8c,0xb1,0x00,0x10,0x09,0x05,0xff,0xf0,0xa2,0x8c,0xb1,0x00,0x05,0xff,0xe8,0x88,
+	0x81,0x00,0x51,0x08,0x05,0xff,0xe5,0xbc,0xa2,0x00,0x10,0x08,0x05,0xff,0xe3,0xa3,
+	0x87,0x00,0x05,0xff,0xf0,0xa3,0x8a,0xb8,0x00,0xd2,0x21,0xd1,0x11,0x10,0x09,0x05,
+	0xff,0xf0,0xa6,0x87,0x9a,0x00,0x05,0xff,0xe5,0xbd,0xa2,0x00,0x10,0x08,0x05,0xff,
+	0xe5,0xbd,0xab,0x00,0x05,0xff,0xe3,0xa3,0xa3,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,
+	0xe5,0xbe,0x9a,0x00,0x05,0xff,0xe5,0xbf,0x8d,0x00,0x10,0x08,0x05,0xff,0xe5,0xbf,
+	0x97,0x00,0x05,0xff,0xe5,0xbf,0xb9,0x00,0xd4,0x81,0xd3,0x41,0xd2,0x20,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe6,0x82,0x81,0x00,0x05,0xff,0xe3,0xa4,0xba,0x00,0x10,0x08,
+	0x05,0xff,0xe3,0xa4,0x9c,0x00,0x05,0xff,0xe6,0x82,0x94,0x00,0xd1,0x11,0x10,0x09,
+	0x05,0xff,0xf0,0xa2,0x9b,0x94,0x00,0x05,0xff,0xe6,0x83,0x87,0x00,0x10,0x08,0x05,
+	0xff,0xe6,0x85,0x88,0x00,0x05,0xff,0xe6,0x85,0x8c,0x00,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe6,0x85,0x8e,0x00,0x05,0xff,0xe6,0x85,0x8c,0x00,0x10,0x08,0x05,
+	0xff,0xe6,0x85,0xba,0x00,0x05,0xff,0xe6,0x86,0x8e,0x00,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe6,0x86,0xb2,0x00,0x05,0xff,0xe6,0x86,0xa4,0x00,0x10,0x08,0x05,0xff,0xe6,
+	0x86,0xaf,0x00,0x05,0xff,0xe6,0x87,0x9e,0x00,0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe6,0x87,0xb2,0x00,0x05,0xff,0xe6,0x87,0xb6,0x00,0x10,0x08,0x05,
+	0xff,0xe6,0x88,0x90,0x00,0x05,0xff,0xe6,0x88,0x9b,0x00,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe6,0x89,0x9d,0x00,0x05,0xff,0xe6,0x8a,0xb1,0x00,0x10,0x08,0x05,0xff,0xe6,
+	0x8b,0x94,0x00,0x05,0xff,0xe6,0x8d,0x90,0x00,0xd2,0x21,0xd1,0x11,0x10,0x09,0x05,
+	0xff,0xf0,0xa2,0xac,0x8c,0x00,0x05,0xff,0xe6,0x8c,0xbd,0x00,0x10,0x08,0x05,0xff,
+	0xe6,0x8b,0xbc,0x00,0x05,0xff,0xe6,0x8d,0xa8,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,
+	0xe6,0x8e,0x83,0x00,0x05,0xff,0xe6,0x8f,0xa4,0x00,0x10,0x09,0x05,0xff,0xf0,0xa2,
+	0xaf,0xb1,0x00,0x05,0xff,0xe6,0x90,0xa2,0x00,0xcf,0x86,0xe5,0x03,0x01,0xd4,0x81,
+	0xd3,0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x8f,0x85,0x00,0x05,0xff,
+	0xe6,0x8e,0xa9,0x00,0x10,0x08,0x05,0xff,0xe3,0xa8,0xae,0x00,0x05,0xff,0xe6,0x91,
+	0xa9,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x91,0xbe,0x00,0x05,0xff,0xe6,0x92,
+	0x9d,0x00,0x10,0x08,0x05,0xff,0xe6,0x91,0xb7,0x00,0x05,0xff,0xe3,0xa9,0xac,0x00,
+	0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x95,0x8f,0x00,0x05,0xff,0xe6,0x95,
+	0xac,0x00,0x10,0x09,0x05,0xff,0xf0,0xa3,0x80,0x8a,0x00,0x05,0xff,0xe6,0x97,0xa3,
+	0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x9b,0xb8,0x00,0x05,0xff,0xe6,0x99,0x89,
+	0x00,0x10,0x08,0x05,0xff,0xe3,0xac,0x99,0x00,0x05,0xff,0xe6,0x9a,0x91,0x00,0xd3,
+	0x40,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe3,0xac,0x88,0x00,0x05,0xff,0xe3,
+	0xab,0xa4,0x00,0x10,0x08,0x05,0xff,0xe5,0x86,0x92,0x00,0x05,0xff,0xe5,0x86,0x95,
+	0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x9c,0x80,0x00,0x05,0xff,0xe6,0x9a,0x9c,
+	0x00,0x10,0x08,0x05,0xff,0xe8,0x82,0xad,0x00,0x05,0xff,0xe4,0x8f,0x99,0x00,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x9c,0x97,0x00,0x05,0xff,0xe6,0x9c,0x9b,
+	0x00,0x10,0x08,0x05,0xff,0xe6,0x9c,0xa1,0x00,0x05,0xff,0xe6,0x9d,0x9e,0x00,0xd1,
+	0x11,0x10,0x08,0x05,0xff,0xe6,0x9d,0x93,0x00,0x05,0xff,0xf0,0xa3,0x8f,0x83,0x00,
+	0x10,0x08,0x05,0xff,0xe3,0xad,0x89,0x00,0x05,0xff,0xe6,0x9f,0xba,0x00,0xd4,0x82,
+	0xd3,0x41,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0x9e,0x85,0x00,0x05,0xff,
+	0xe6,0xa1,0x92,0x00,0x10,0x08,0x05,0xff,0xe6,0xa2,0x85,0x00,0x05,0xff,0xf0,0xa3,
+	0x91,0xad,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xa2,0x8e,0x00,0x05,0xff,0xe6,
+	0xa0,0x9f,0x00,0x10,0x08,0x05,0xff,0xe6,0xa4,0x94,0x00,0x05,0xff,0xe3,0xae,0x9d,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xa5,0x82,0x00,0x05,0xff,0xe6,
+	0xa6,0xa3,0x00,0x10,0x08,0x05,0xff,0xe6,0xa7,0xaa,0x00,0x05,0xff,0xe6,0xaa,0xa8,
+	0x00,0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa3,0x9a,0xa3,0x00,0x05,0xff,0xe6,0xab,
+	0x9b,0x00,0x10,0x08,0x05,0xff,0xe3,0xb0,0x98,0x00,0x05,0xff,0xe6,0xac,0xa1,0x00,
+	0xd3,0x42,0xd2,0x21,0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa3,0xa2,0xa7,0x00,0x05,
+	0xff,0xe6,0xad,0x94,0x00,0x10,0x08,0x05,0xff,0xe3,0xb1,0x8e,0x00,0x05,0xff,0xe6,
+	0xad,0xb2,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xae,0x9f,0x00,0x05,0xff,0xe6,
+	0xae,0xba,0x00,0x10,0x08,0x05,0xff,0xe6,0xae,0xbb,0x00,0x05,0xff,0xf0,0xa3,0xaa,
+	0x8d,0x00,0xd2,0x23,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa1,0xb4,0x8b,0x00,0x05,
+	0xff,0xf0,0xa3,0xab,0xba,0x00,0x10,0x08,0x05,0xff,0xe6,0xb1,0x8e,0x00,0x05,0xff,
+	0xf0,0xa3,0xb2,0xbc,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xb2,0xbf,0x00,0x05,
+	0xff,0xe6,0xb3,0x8d,0x00,0x10,0x08,0x05,0xff,0xe6,0xb1,0xa7,0x00,0x05,0xff,0xe6,
+	0xb4,0x96,0x00,0xe1,0x1d,0x04,0xe0,0x0c,0x02,0xcf,0x86,0xe5,0x08,0x01,0xd4,0x82,
+	0xd3,0x41,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xb4,0xbe,0x00,0x05,0xff,
+	0xe6,0xb5,0xb7,0x00,0x10,0x08,0x05,0xff,0xe6,0xb5,0x81,0x00,0x05,0xff,0xe6,0xb5,
+	0xa9,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xb5,0xb8,0x00,0x05,0xff,0xe6,0xb6,
+	0x85,0x00,0x10,0x09,0x05,0xff,0xf0,0xa3,0xb4,0x9e,0x00,0x05,0xff,0xe6,0xb4,0xb4,
+	0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe6,0xb8,0xaf,0x00,0x05,0xff,0xe6,
+	0xb9,0xae,0x00,0x10,0x08,0x05,0xff,0xe3,0xb4,0xb3,0x00,0x05,0xff,0xe6,0xbb,0x8b,
+	0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe6,0xbb,0x87,0x00,0x05,0xff,0xf0,0xa3,0xbb,
+	0x91,0x00,0x10,0x08,0x05,0xff,0xe6,0xb7,0xb9,0x00,0x05,0xff,0xe6,0xbd,0xae,0x00,
+	0xd3,0x42,0xd2,0x22,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa3,0xbd,0x9e,0x00,0x05,
+	0xff,0xf0,0xa3,0xbe,0x8e,0x00,0x10,0x08,0x05,0xff,0xe6,0xbf,0x86,0x00,0x05,0xff,
+	0xe7,0x80,0xb9,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0x80,0x9e,0x00,0x05,0xff,
+	0xe7,0x80,0x9b,0x00,0x10,0x08,0x05,0xff,0xe3,0xb6,0x96,0x00,0x05,0xff,0xe7,0x81,
+	0x8a,0x00,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0x81,0xbd,0x00,0x05,0xff,
+	0xe7,0x81,0xb7,0x00,0x10,0x08,0x05,0xff,0xe7,0x82,0xad,0x00,0x05,0xff,0xf0,0xa0,
+	0x94,0xa5,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe7,0x85,0x85,0x00,0x05,0xff,0xf0,
+	0xa4,0x89,0xa3,0x00,0x10,0x08,0x05,0xff,0xe7,0x86,0x9c,0x00,0x05,0xff,0xf0,0xa4,
+	0x8e,0xab,0x00,0xd4,0x7b,0xd3,0x43,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,
+	0x88,0xa8,0x00,0x05,0xff,0xe7,0x88,0xb5,0x00,0x10,0x08,0x05,0xff,0xe7,0x89,0x90,
+	0x00,0x05,0xff,0xf0,0xa4,0x98,0x88,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0x8a,
+	0x80,0x00,0x05,0xff,0xe7,0x8a,0x95,0x00,0x10,0x09,0x05,0xff,0xf0,0xa4,0x9c,0xb5,
+	0x00,0x05,0xff,0xf0,0xa4,0xa0,0x94,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,
+	0xe7,0x8d,0xba,0x00,0x05,0xff,0xe7,0x8e,0x8b,0x00,0x10,0x08,0x05,0xff,0xe3,0xba,
+	0xac,0x00,0x05,0xff,0xe7,0x8e,0xa5,0x00,0x51,0x08,0x05,0xff,0xe3,0xba,0xb8,0x00,
+	0x10,0x08,0x05,0xff,0xe7,0x91,0x87,0x00,0x05,0xff,0xe7,0x91,0x9c,0x00,0xd3,0x42,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0x91,0xb1,0x00,0x05,0xff,0xe7,0x92,
+	0x85,0x00,0x10,0x08,0x05,0xff,0xe7,0x93,0x8a,0x00,0x05,0xff,0xe3,0xbc,0x9b,0x00,
+	0xd1,0x11,0x10,0x08,0x05,0xff,0xe7,0x94,0xa4,0x00,0x05,0xff,0xf0,0xa4,0xb0,0xb6,
+	0x00,0x10,0x08,0x05,0xff,0xe7,0x94,0xbe,0x00,0x05,0xff,0xf0,0xa4,0xb2,0x92,0x00,
+	0xd2,0x22,0xd1,0x11,0x10,0x08,0x05,0xff,0xe7,0x95,0xb0,0x00,0x05,0xff,0xf0,0xa2,
+	0x86,0x9f,0x00,0x10,0x08,0x05,0xff,0xe7,0x98,0x90,0x00,0x05,0xff,0xf0,0xa4,0xbe,
+	0xa1,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa4,0xbe,0xb8,0x00,0x05,0xff,0xf0,
+	0xa5,0x81,0x84,0x00,0x10,0x08,0x05,0xff,0xe3,0xbf,0xbc,0x00,0x05,0xff,0xe4,0x80,
+	0x88,0x00,0xcf,0x86,0xe5,0x04,0x01,0xd4,0x7d,0xd3,0x3c,0xd2,0x23,0xd1,0x11,0x10,
+	0x08,0x05,0xff,0xe7,0x9b,0xb4,0x00,0x05,0xff,0xf0,0xa5,0x83,0xb3,0x00,0x10,0x09,
+	0x05,0xff,0xf0,0xa5,0x83,0xb2,0x00,0x05,0xff,0xf0,0xa5,0x84,0x99,0x00,0x91,0x11,
+	0x10,0x09,0x05,0xff,0xf0,0xa5,0x84,0xb3,0x00,0x05,0xff,0xe7,0x9c,0x9e,0x00,0x05,
+	0xff,0xe7,0x9c,0x9f,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0x9d,0x8a,
+	0x00,0x05,0xff,0xe4,0x80,0xb9,0x00,0x10,0x08,0x05,0xff,0xe7,0x9e,0x8b,0x00,0x05,
+	0xff,0xe4,0x81,0x86,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe4,0x82,0x96,0x00,0x05,
+	0xff,0xf0,0xa5,0x90,0x9d,0x00,0x10,0x08,0x05,0xff,0xe7,0xa1,0x8e,0x00,0x05,0xff,
+	0xe7,0xa2,0x8c,0x00,0xd3,0x43,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0xa3,
+	0x8c,0x00,0x05,0xff,0xe4,0x83,0xa3,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0x98,0xa6,
+	0x00,0x05,0xff,0xe7,0xa5,0x96,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0x9a,
+	0x9a,0x00,0x05,0xff,0xf0,0xa5,0x9b,0x85,0x00,0x10,0x08,0x05,0xff,0xe7,0xa6,0x8f,
+	0x00,0x05,0xff,0xe7,0xa7,0xab,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe4,
+	0x84,0xaf,0x00,0x05,0xff,0xe7,0xa9,0x80,0x00,0x10,0x08,0x05,0xff,0xe7,0xa9,0x8a,
+	0x00,0x05,0xff,0xe7,0xa9,0x8f,0x00,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa5,0xa5,
+	0xbc,0x00,0x05,0xff,0xf0,0xa5,0xaa,0xa7,0x00,0x10,0x09,0x05,0xff,0xf0,0xa5,0xaa,
+	0xa7,0x00,0x05,0xff,0xe7,0xab,0xae,0x00,0xd4,0x83,0xd3,0x42,0xd2,0x21,0xd1,0x11,
+	0x10,0x08,0x05,0xff,0xe4,0x88,0x82,0x00,0x05,0xff,0xf0,0xa5,0xae,0xab,0x00,0x10,
+	0x08,0x05,0xff,0xe7,0xaf,0x86,0x00,0x05,0xff,0xe7,0xaf,0x89,0x00,0xd1,0x11,0x10,
+	0x08,0x05,0xff,0xe4,0x88,0xa7,0x00,0x05,0xff,0xf0,0xa5,0xb2,0x80,0x00,0x10,0x08,
+	0x05,0xff,0xe7,0xb3,0x92,0x00,0x05,0xff,0xe4,0x8a,0xa0,0x00,0xd2,0x21,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe7,0xb3,0xa8,0x00,0x05,0xff,0xe7,0xb3,0xa3,0x00,0x10,0x08,
+	0x05,0xff,0xe7,0xb4,0x80,0x00,0x05,0xff,0xf0,0xa5,0xbe,0x86,0x00,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe7,0xb5,0xa3,0x00,0x05,0xff,0xe4,0x8c,0x81,0x00,0x10,0x08,0x05,
+	0xff,0xe7,0xb7,0x87,0x00,0x05,0xff,0xe7,0xb8,0x82,0x00,0xd3,0x44,0xd2,0x22,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe7,0xb9,0x85,0x00,0x05,0xff,0xe4,0x8c,0xb4,0x00,0x10,
+	0x09,0x05,0xff,0xf0,0xa6,0x88,0xa8,0x00,0x05,0xff,0xf0,0xa6,0x89,0x87,0x00,0xd1,
+	0x11,0x10,0x08,0x05,0xff,0xe4,0x8d,0x99,0x00,0x05,0xff,0xf0,0xa6,0x8b,0x99,0x00,
+	0x10,0x08,0x05,0xff,0xe7,0xbd,0xba,0x00,0x05,0xff,0xf0,0xa6,0x8c,0xbe,0x00,0xd2,
+	0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe7,0xbe,0x95,0x00,0x05,0xff,0xe7,0xbf,0xba,
+	0x00,0x10,0x08,0x05,0xff,0xe8,0x80,0x85,0x00,0x05,0xff,0xf0,0xa6,0x93,0x9a,0x00,
+	0xd1,0x11,0x10,0x09,0x05,0xff,0xf0,0xa6,0x94,0xa3,0x00,0x05,0xff,0xe8,0x81,0xa0,
+	0x00,0x10,0x09,0x05,0xff,0xf0,0xa6,0x96,0xa8,0x00,0x05,0xff,0xe8,0x81,0xb0,0x00,
+	0xe0,0x11,0x02,0xcf,0x86,0xe5,0x07,0x01,0xd4,0x85,0xd3,0x42,0xd2,0x21,0xd1,0x11,
+	0x10,0x09,0x05,0xff,0xf0,0xa3,0x8d,0x9f,0x00,0x05,0xff,0xe4,0x8f,0x95,0x00,0x10,
+	0x08,0x05,0xff,0xe8,0x82,0xb2,0x00,0x05,0xff,0xe8,0x84,0x83,0x00,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe4,0x90,0x8b,0x00,0x05,0xff,0xe8,0x84,0xbe,0x00,0x10,0x08,0x05,
+	0xff,0xe5,0xaa,0xb5,0x00,0x05,0xff,0xf0,0xa6,0x9e,0xa7,0x00,0xd2,0x23,0xd1,0x12,
+	0x10,0x09,0x05,0xff,0xf0,0xa6,0x9e,0xb5,0x00,0x05,0xff,0xf0,0xa3,0x8e,0x93,0x00,
+	0x10,0x09,0x05,0xff,0xf0,0xa3,0x8e,0x9c,0x00,0x05,0xff,0xe8,0x88,0x81,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe8,0x88,0x84,0x00,0x05,0xff,0xe8,0xbe,0x9e,0x00,0x10,
+	0x08,0x05,0xff,0xe4,0x91,0xab,0x00,0x05,0xff,0xe8,0x8a,0x91,0x00,0xd3,0x41,0xd2,
+	0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x8a,0x8b,0x00,0x05,0xff,0xe8,0x8a,0x9d,
+	0x00,0x10,0x08,0x05,0xff,0xe5,0x8a,0xb3,0x00,0x05,0xff,0xe8,0x8a,0xb1,0x00,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe8,0x8a,0xb3,0x00,0x05,0xff,0xe8,0x8a,0xbd,0x00,0x10,
+	0x08,0x05,0xff,0xe8,0x8b,0xa6,0x00,0x05,0xff,0xf0,0xa6,0xac,0xbc,0x00,0xd2,0x20,
+	0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x8b,0xa5,0x00,0x05,0xff,0xe8,0x8c,0x9d,0x00,
+	0x10,0x08,0x05,0xff,0xe8,0x8d,0xa3,0x00,0x05,0xff,0xe8,0x8e,0xad,0x00,0xd1,0x10,
+	0x10,0x08,0x05,0xff,0xe8,0x8c,0xa3,0x00,0x05,0xff,0xe8,0x8e,0xbd,0x00,0x10,0x08,
+	0x05,0xff,0xe8,0x8f,0xa7,0x00,0x05,0xff,0xe8,0x91,0x97,0x00,0xd4,0x85,0xd3,0x43,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x8d,0x93,0x00,0x05,0xff,0xe8,0x8f,
+	0x8a,0x00,0x10,0x08,0x05,0xff,0xe8,0x8f,0x8c,0x00,0x05,0xff,0xe8,0x8f,0x9c,0x00,
+	0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa6,0xb0,0xb6,0x00,0x05,0xff,0xf0,0xa6,0xb5,
+	0xab,0x00,0x10,0x09,0x05,0xff,0xf0,0xa6,0xb3,0x95,0x00,0x05,0xff,0xe4,0x94,0xab,
+	0x00,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x93,0xb1,0x00,0x05,0xff,0xe8,
+	0x93,0xb3,0x00,0x10,0x08,0x05,0xff,0xe8,0x94,0x96,0x00,0x05,0xff,0xf0,0xa7,0x8f,
+	0x8a,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe8,0x95,0xa4,0x00,0x05,0xff,0xf0,0xa6,
+	0xbc,0xac,0x00,0x10,0x08,0x05,0xff,0xe4,0x95,0x9d,0x00,0x05,0xff,0xe4,0x95,0xa1,
+	0x00,0xd3,0x42,0xd2,0x22,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa6,0xbe,0xb1,0x00,
+	0x05,0xff,0xf0,0xa7,0x83,0x92,0x00,0x10,0x08,0x05,0xff,0xe4,0x95,0xab,0x00,0x05,
+	0xff,0xe8,0x99,0x90,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x99,0x9c,0x00,0x05,
+	0xff,0xe8,0x99,0xa7,0x00,0x10,0x08,0x05,0xff,0xe8,0x99,0xa9,0x00,0x05,0xff,0xe8,
+	0x9a,0xa9,0x00,0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x9a,0x88,0x00,0x05,
+	0xff,0xe8,0x9c,0x8e,0x00,0x10,0x08,0x05,0xff,0xe8,0x9b,0xa2,0x00,0x05,0xff,0xe8,
+	0x9d,0xb9,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe8,0x9c,0xa8,0x00,0x05,0xff,0xe8,
+	0x9d,0xab,0x00,0x10,0x08,0x05,0xff,0xe8,0x9e,0x86,0x00,0x05,0xff,0xe4,0x97,0x97,
+	0x00,0xcf,0x86,0xe5,0x08,0x01,0xd4,0x83,0xd3,0x41,0xd2,0x20,0xd1,0x10,0x10,0x08,
+	0x05,0xff,0xe8,0x9f,0xa1,0x00,0x05,0xff,0xe8,0xa0,0x81,0x00,0x10,0x08,0x05,0xff,
+	0xe4,0x97,0xb9,0x00,0x05,0xff,0xe8,0xa1,0xa0,0x00,0xd1,0x11,0x10,0x08,0x05,0xff,
+	0xe8,0xa1,0xa3,0x00,0x05,0xff,0xf0,0xa7,0x99,0xa7,0x00,0x10,0x08,0x05,0xff,0xe8,
+	0xa3,0x97,0x00,0x05,0xff,0xe8,0xa3,0x9e,0x00,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe4,0x98,0xb5,0x00,0x05,0xff,0xe8,0xa3,0xba,0x00,0x10,0x08,0x05,0xff,0xe3,
+	0x92,0xbb,0x00,0x05,0xff,0xf0,0xa7,0xa2,0xae,0x00,0xd1,0x11,0x10,0x09,0x05,0xff,
+	0xf0,0xa7,0xa5,0xa6,0x00,0x05,0xff,0xe4,0x9a,0xbe,0x00,0x10,0x08,0x05,0xff,0xe4,
+	0x9b,0x87,0x00,0x05,0xff,0xe8,0xaa,0xa0,0x00,0xd3,0x41,0xd2,0x21,0xd1,0x10,0x10,
+	0x08,0x05,0xff,0xe8,0xab,0xad,0x00,0x05,0xff,0xe8,0xae,0x8a,0x00,0x10,0x08,0x05,
+	0xff,0xe8,0xb1,0x95,0x00,0x05,0xff,0xf0,0xa7,0xb2,0xa8,0x00,0xd1,0x10,0x10,0x08,
+	0x05,0xff,0xe8,0xb2,0xab,0x00,0x05,0xff,0xe8,0xb3,0x81,0x00,0x10,0x08,0x05,0xff,
+	0xe8,0xb4,0x9b,0x00,0x05,0xff,0xe8,0xb5,0xb7,0x00,0xd2,0x22,0xd1,0x12,0x10,0x09,
+	0x05,0xff,0xf0,0xa7,0xbc,0xaf,0x00,0x05,0xff,0xf0,0xa0,0xa0,0x84,0x00,0x10,0x08,
+	0x05,0xff,0xe8,0xb7,0x8b,0x00,0x05,0xff,0xe8,0xb6,0xbc,0x00,0xd1,0x11,0x10,0x08,
+	0x05,0xff,0xe8,0xb7,0xb0,0x00,0x05,0xff,0xf0,0xa0,0xa3,0x9e,0x00,0x10,0x08,0x05,
+	0xff,0xe8,0xbb,0x94,0x00,0x05,0xff,0xe8,0xbc,0xb8,0x00,0xd4,0x84,0xd3,0x43,0xd2,
+	0x22,0xd1,0x12,0x10,0x09,0x05,0xff,0xf0,0xa8,0x97,0x92,0x00,0x05,0xff,0xf0,0xa8,
+	0x97,0xad,0x00,0x10,0x08,0x05,0xff,0xe9,0x82,0x94,0x00,0x05,0xff,0xe9,0x83,0xb1,
+	0x00,0xd1,0x11,0x10,0x08,0x05,0xff,0xe9,0x84,0x91,0x00,0x05,0xff,0xf0,0xa8,0x9c,
+	0xae,0x00,0x10,0x08,0x05,0xff,0xe9,0x84,0x9b,0x00,0x05,0xff,0xe9,0x88,0xb8,0x00,
+	0xd2,0x20,0xd1,0x10,0x10,0x08,0x05,0xff,0xe9,0x8b,0x97,0x00,0x05,0xff,0xe9,0x8b,
+	0x98,0x00,0x10,0x08,0x05,0xff,0xe9,0x89,0xbc,0x00,0x05,0xff,0xe9,0x8f,0xb9,0x00,
+	0xd1,0x11,0x10,0x08,0x05,0xff,0xe9,0x90,0x95,0x00,0x05,0xff,0xf0,0xa8,0xaf,0xba,
+	0x00,0x10,0x08,0x05,0xff,0xe9,0x96,0x8b,0x00,0x05,0xff,0xe4,0xa6,0x95,0x00,0xd3,
+	0x43,0xd2,0x21,0xd1,0x11,0x10,0x08,0x05,0xff,0xe9,0x96,0xb7,0x00,0x05,0xff,0xf0,
+	0xa8,0xb5,0xb7,0x00,0x10,0x08,0x05,0xff,0xe4,0xa7,0xa6,0x00,0x05,0xff,0xe9,0x9b,
+	0x83,0x00,0xd1,0x10,0x10,0x08,0x05,0xff,0xe5,0xb6,0xb2,0x00,0x05,0xff,0xe9,0x9c,
+	0xa3,0x00,0x10,0x09,0x05,0xff,0xf0,0xa9,0x85,0x85,0x00,0x05,0xff,0xf0,0xa9,0x88,
+	0x9a,0x00,0xd2,0x21,0xd1,0x10,0x10,0x08,0x05,0xff,0xe4,0xa9,0xae,0x00,0x05,0xff,
+	0xe4,0xa9,0xb6,0x00,0x10,0x08,0x05,0xff,0xe9,0x9f,0xa0,0x00,0x05,0xff,0xf0,0xa9,
+	0x90,0x8a,0x00,0x91,0x11,0x10,0x08,0x05,0xff,0xe4,0xaa,0xb2,0x00,0x05,0xff,0xf0,
+	0xa9,0x92,0x96,0x00,0x05,0xff,0xe9,0xa0,0x8b,0x00,0xe2,0x10,0x01,0xe1,0x09,0x01,
+	0xe0,0x02,0x01,0xcf,0x86,0x95,0xfb,0xd4,0x82,0xd3,0x41,0xd2,0x21,0xd1,0x11,0x10,
+	0x08,0x05,0xff,0xe9,0xa0,0xa9,0x00,0x05,0xff,0xf0,0xa9,0x96,0xb6,0x00,0x10,0x08,
+	0x05,0xff,0xe9,0xa3,0xa2,0x00,0x05,0xff,0xe4,0xac,0xb3,0x00,0xd1,0x10,0x10,0x08,
+	0x05,0xff,0xe9,0xa4,0xa9,0x00,0x05,0xff,0xe9,0xa6,0xa7,0x00,0x10,0x08,0x05,0xff,
+	0xe9,0xa7,0x82,0x00,0x05,0xff,0xe9,0xa7,0xbe,0x00,0xd2,0x21,0xd1,0x11,0x10,0x08,
+	0x05,0xff,0xe4,0xaf,0x8e,0x00,0x05,0xff,0xf0,0xa9,0xac,0xb0,0x00,0x10,0x08,0x05,
+	0xff,0xe9,0xac,0x92,0x00,0x05,0xff,0xe9,0xb1,0x80,0x00,0xd1,0x10,0x10,0x08,0x05,
+	0xff,0xe9,0xb3,0xbd,0x00,0x05,0xff,0xe4,0xb3,0x8e,0x00,0x10,0x08,0x05,0xff,0xe4,
+	0xb3,0xad,0x00,0x05,0xff,0xe9,0xb5,0xa7,0x00,0xd3,0x44,0xd2,0x23,0xd1,0x11,0x10,
+	0x09,0x05,0xff,0xf0,0xaa,0x83,0x8e,0x00,0x05,0xff,0xe4,0xb3,0xb8,0x00,0x10,0x09,
+	0x05,0xff,0xf0,0xaa,0x84,0x85,0x00,0x05,0xff,0xf0,0xaa,0x88,0x8e,0x00,0xd1,0x11,
+	0x10,0x09,0x05,0xff,0xf0,0xaa,0x8a,0x91,0x00,0x05,0xff,0xe9,0xba,0xbb,0x00,0x10,
+	0x08,0x05,0xff,0xe4,0xb5,0x96,0x00,0x05,0xff,0xe9,0xbb,0xb9,0x00,0xd2,0x20,0xd1,
+	0x10,0x10,0x08,0x05,0xff,0xe9,0xbb,0xbe,0x00,0x05,0xff,0xe9,0xbc,0x85,0x00,0x10,
+	0x08,0x05,0xff,0xe9,0xbc,0x8f,0x00,0x05,0xff,0xe9,0xbc,0x96,0x00,0x91,0x11,0x10,
+	0x08,0x05,0xff,0xe9,0xbc,0xbb,0x00,0x05,0xff,0xf0,0xaa,0x98,0x80,0x00,0x00,0x00,
+	0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xcf,0x06,0x00,0x00,0xd3,0x06,
+	0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,
+	0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,
+	0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xd3,0x08,
+	0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd1,0x08,
+	0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,
+	0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,
+	0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,
+	0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x53,0x04,
+	0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xd5,0xc0,
+	0xd4,0x60,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,
+	0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,
+	0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,
+	0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,
+	0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,
+	0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,
+	0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,
+	0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,
+	0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,
+	0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xd4,0x60,
+	0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,
+	0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,
+	0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,
+	0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,
+	0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xd3,0x08,
+	0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd1,0x08,
+	0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,
+	0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,
+	0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,
+	0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x53,0x04,
+	0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xe0,0x83,0x01,0xcf,
+	0x86,0xd5,0xc0,0xd4,0x60,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,
+	0x86,0xcf,0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,
+	0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,
+	0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,
+	0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,
+	0x00,0x54,0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,
+	0x00,0x02,0x00,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,
+	0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,
+	0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,
+	0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,
+	0x00,0xd4,0x60,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,
+	0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,
+	0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,
+	0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,
+	0x00,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,
+	0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,
+	0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,
+	0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,
+	0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,
+	0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,
+	0x86,0xd5,0xc0,0xd4,0x60,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,
+	0x86,0xcf,0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,
+	0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,
+	0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,
+	0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,
+	0x00,0x54,0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,
+	0x00,0x02,0x00,0xd3,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,
+	0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,
+	0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,
+	0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,
+	0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,
+	0x00,0xd4,0xd9,0xd3,0x81,0xd2,0x79,0xd1,0x71,0xd0,0x69,0xcf,0x86,0xd5,0x60,0xd4,
+	0x59,0xd3,0x52,0xd2,0x33,0xd1,0x2c,0xd0,0x25,0xcf,0x86,0x95,0x1e,0x94,0x19,0x93,
+	0x14,0x92,0x0f,0x91,0x0a,0x10,0x05,0x00,0xff,0x00,0x05,0xff,0x00,0x00,0xff,0x00,
+	0x00,0xff,0x00,0x00,0xff,0x00,0x00,0xff,0x00,0x05,0xff,0x00,0xcf,0x06,0x05,0xff,
+	0x00,0xcf,0x06,0x00,0xff,0x00,0xd1,0x07,0xcf,0x06,0x07,0xff,0x00,0xd0,0x07,0xcf,
+	0x06,0x07,0xff,0x00,0xcf,0x86,0x55,0x05,0x07,0xff,0x00,0x14,0x05,0x07,0xff,0x00,
+	0x00,0xff,0x00,0xcf,0x06,0x00,0xff,0x00,0xcf,0x06,0x00,0xff,0x00,0xcf,0x06,0x00,
+	0xff,0x00,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,
+	0xcf,0x06,0x00,0x00,0xd2,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xd1,0x08,0xcf,0x86,
+	0xcf,0x06,0x00,0x00,0xd0,0x08,0xcf,0x86,0xcf,0x06,0x00,0x00,0xcf,0x86,0xd5,0x06,
+	0xcf,0x06,0x00,0x00,0xd4,0x06,0xcf,0x06,0x00,0x00,0xd3,0x06,0xcf,0x06,0x00,0x00,
+	0xd2,0x06,0xcf,0x06,0x00,0x00,0xd1,0x06,0xcf,0x06,0x00,0x00,0xd0,0x06,0xcf,0x06,
+	0x00,0x00,0xcf,0x86,0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x53,0x04,0x00,0x00,
+	0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00,
+	0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00
+};
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
new file mode 100644
index 0000000..0acd530
--- /dev/null
+++ b/fs/unicode/utf8n.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ */
+
+#ifndef UTF8NORM_H
+#define UTF8NORM_H
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+/* Encoding a unicode version number as a single unsigned int. */
+#define UNICODE_MAJ_SHIFT		(16)
+#define UNICODE_MIN_SHIFT		(8)
+
+#define UNICODE_AGE(MAJ, MIN, REV)			\
+	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
+	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
+	 ((unsigned int)(REV)))
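+
+/*
+ * For example, UNICODE_AGE(12, 1, 0) encodes Unicode 12.1.0 as
+ * (12 << 16) | (1 << 8) | 0 == 0x000c0100, a value suitable as the
+ * 'maxage' argument to utf8nfdi() and utf8nfdicf() below.
+ */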
+
+/* Highest unicode version supported by the data tables. */
+extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
+
+/*
+ * Look for the correct const struct utf8data for a unicode version.
+ * Returns NULL if the version requested is too new.
+ *
+ * Two normalization forms are supported: nfdi and nfdicf.
+ *
+ * nfdi:
+ *  - Apply unicode normalization form NFD.
+ *  - Remove any Default_Ignorable_Code_Point.
+ *
+ * nfdicf:
+ *  - Apply unicode normalization form NFD.
+ *  - Remove any Default_Ignorable_Code_Point.
+ *  - Apply a full casefold (C + F).
+ */
+extern const struct utf8data *utf8nfdi(unsigned int maxage);
+extern const struct utf8data *utf8nfdicf(unsigned int maxage);
+
+/*
+ * Determine the maximum age of any unicode character in the string.
+ * Returns 0 if only unassigned code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern int utf8agemax(const struct utf8data *data, const char *s);
+extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
+
+/*
+ * Determine the minimum age of any unicode character in the string.
+ * Returns 0 if any unassigned code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern int utf8agemin(const struct utf8data *data, const char *s);
+extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
+
+/*
+ * Determine the length of the normalized form of the string,
+ * excluding any terminating NULL byte.
+ * Returns 0 if only ignorable code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern ssize_t utf8len(const struct utf8data *data, const char *s);
+extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
+
+/* Needed in struct utf8cursor below. */
+#define UTF8HANGULLEAF	(12)
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+	const struct utf8data	*data;	/* decomposition trie in use */
+	const char	*s;	/* string being normalized */
+	const char	*p;	/* decomposition currently being emitted, or NULL */
+	const char	*ss;	/* saved value of s (see utf8byte()) */
+	const char	*sp;	/* saved value of p (see utf8byte()) */
+	unsigned int	len;	/* bytes of s not yet consumed */
+	unsigned int	slen;	/* saved value of len */
+	short int	ccc;	/* canonical combining class of current char */
+	short int	nccc;	/* canonical combining class of next char */
+	unsigned char	hangul[UTF8HANGULLEAF];	/* Hangul decomposition buffer */
+};
+
+/*
+ * Initialize a utf8cursor to normalize a string.
+ * Returns 0 on success.
+ * Returns -1 on failure.
+ */
+extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
+		      const char *s);
+extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
+		       const char *s, size_t len);
+
+/*
+ * Get the next byte in the normalization.
+ * Returns a value greater than 0 and less than 256 on success.
+ * Returns 0 when the end of the normalization is reached.
+ * Returns -1 if the string being normalized is not valid UTF-8.
+ */
+extern int utf8byte(struct utf8cursor *u8c);
+
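+/*
+ * Usage sketch (illustrative only; this helper is not part of the
+ * interface, and its name and buffer handling are invented for the
+ * example): normalize 's' into 'buf' using the nfdi form at the latest
+ * supported Unicode version, via the cursor API declared above.
+ */
+static inline ssize_t utf8_normalize_sketch(const char *s, char *buf,
+					    size_t bufsize)
+{
+	const struct utf8data *data = utf8nfdi(utf8version_latest());
+	struct utf8cursor cur;
+	size_t n = 0;
+	int c;
+
+	if (!data || utf8cursor(&cur, data, s))
+		return -1;	/* version unsupported or cursor init failed */
+	while ((c = utf8byte(&cur)) > 0) {
+		if (n + 1 >= bufsize)
+			return -1;	/* output buffer too small */
+		buf[n++] = c;
+	}
+	if (c < 0)
+		return -1;	/* input was not valid UTF-8 */
+	buf[n] = '\0';
+	return n;
+}
+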
+#endif /* UTF8NORM_H */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7a85e60..f9fd186 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  fs/userfaultfd.c
  *
@@ -5,9 +6,6 @@
  *  Copyright (C) 2008-2009 Red Hat, Inc.
  *  Copyright (C) 2015  Red Hat, Inc.
  *
- *  This work is licensed under the terms of the GNU GPL, version 2. See
- *  the COPYING file in the top-level directory.
- *
  *  Some part derived from fs/eventfd.c (anon inode setup) and
  *  mm/ksm.c (mm hashing).
  */
@@ -30,6 +28,8 @@
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 
+/* Toggled via the vm.unprivileged_userfaultfd sysctl; 1 allows unprivileged use */
+int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
 enum userfaultfd_state {
@@ -40,6 +40,16 @@
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
+ *
+ * Locking order:
+ *	fd_wqh.lock
+ *		fault_pending_wqh.lock
+ *			fault_wqh.lock
+ *		event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
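+ *
+ * So, for example, waking pending faults uses the IRQ-disabling lock
+ * variants throughout this file:
+ *
+ *	spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ *	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
+ *	spin_unlock_irq(&ctx->fault_pending_wqh.lock);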
  */
 struct userfaultfd_ctx {
 	/* waitqueue head for the pending (i.e. not read) userfaults */
@@ -53,7 +63,7 @@
 	/* a refile sequence protected by fault_pending_wqh lock */
 	struct seqcount refile_seq;
 	/* pseudo fd refcounting */
-	atomic_t refcount;
+	refcount_t refcount;
 	/* userfaultfd syscall flags */
 	unsigned int flags;
 	/* features requested from the userspace */
@@ -140,8 +150,7 @@
  */
 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
 {
-	if (!atomic_inc_not_zero(&ctx->refcount))
-		BUG();
+	refcount_inc(&ctx->refcount);
 }
 
 /**
@@ -154,7 +163,7 @@
  */
 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 {
-	if (atomic_dec_and_test(&ctx->refcount)) {
+	if (refcount_dec_and_test(&ctx->refcount)) {
 		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
@@ -459,7 +468,7 @@
 	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
 			 TASK_KILLABLE;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -471,7 +480,7 @@
 	 * __add_wait_queue.
 	 */
 	set_current_state(blocking_state);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	if (!is_vm_hugetlb_page(vmf->vma))
 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -553,13 +562,13 @@
 	 * kernel stack can be released after the list_del_init.
 	 */
 	if (!list_empty_careful(&uwq.wq.entry)) {
-		spin_lock(&ctx->fault_pending_wqh.lock);
+		spin_lock_irq(&ctx->fault_pending_wqh.lock);
 		/*
 		 * No need of list_del_init(), the uwq on the stack
 		 * will be freed shortly anyway.
 		 */
 		list_del(&uwq.wq.entry);
-		spin_unlock(&ctx->fault_pending_wqh.lock);
+		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 	}
 
 	/*
@@ -584,7 +593,7 @@
 	init_waitqueue_entry(&ewq->wq, current);
 	release_new_ctx = NULL;
 
-	spin_lock(&ctx->event_wqh.lock);
+	spin_lock_irq(&ctx->event_wqh.lock);
 	/*
 	 * After the __add_wait_queue the uwq is visible to userland
 	 * through poll/read().
@@ -614,15 +623,15 @@
 			break;
 		}
 
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 
 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
 
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->event_wqh.lock);
+	spin_unlock_irq(&ctx->event_wqh.lock);
 
 	if (release_new_ctx) {
 		struct vm_area_struct *vma;
@@ -630,6 +639,8 @@
 
 		/* the various vma->vm_userfaultfd_ctx still points to it */
 		down_write(&mm->mmap_sem);
+		/* no task can run (and in turn coredump) yet */
+		VM_WARN_ON(!mmget_still_valid(mm));
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -686,7 +697,7 @@
 			return -ENOMEM;
 		}
 
-		atomic_set(&ctx->refcount, 1);
+		refcount_set(&ctx->refcount, 1);
 		ctx->flags = octx->flags;
 		ctx->state = UFFD_STATE_RUNNING;
 		ctx->features = octx->features;
@@ -736,10 +747,18 @@
 	struct userfaultfd_ctx *ctx;
 
 	ctx = vma->vm_userfaultfd_ctx.ctx;
-	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+
+	if (!ctx)
+		return;
+
+	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
 		WRITE_ONCE(ctx->mmap_changing, true);
+	} else {
+		/* Drop uffd context if remap feature not enabled */
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
 	}
 }
 
@@ -861,6 +880,7 @@
 	/* len == 0 means wake all */
 	struct userfaultfd_wake_range range = { .len = 0, };
 	unsigned long new_flags;
+	bool still_valid;
 
 	WRITE_ONCE(ctx->released, true);
 
@@ -876,6 +896,7 @@
 	 * taking the mmap_sem for writing.
 	 */
 	down_write(&mm->mmap_sem);
+	still_valid = mmget_still_valid(mm);
 	prev = NULL;
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
@@ -886,15 +907,17 @@
 			continue;
 		}
 		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
-		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
-				 new_flags, vma->anon_vma,
-				 vma->vm_file, vma->vm_pgoff,
-				 vma_policy(vma),
-				 NULL_VM_UFFD_CTX);
-		if (prev)
-			vma = prev;
-		else
-			prev = vma;
+		if (still_valid) {
+			prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+					 new_flags, vma->anon_vma,
+					 vma->vm_file, vma->vm_pgoff,
+					 vma_policy(vma),
+					 NULL_VM_UFFD_CTX);
+			if (prev)
+				vma = prev;
+			else
+				prev = vma;
+		}
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
@@ -906,10 +929,10 @@
 	 * the last page faults that may have been already waiting on
 	 * the fault_*wqh.
 	 */
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/* Flush pending events that may still wait on event_wqh */
 	wake_up_all(&ctx->event_wqh);
@@ -926,7 +949,7 @@
 	wait_queue_entry_t *wq;
 	struct userfaultfd_wait_queue *uwq;
 
-	VM_BUG_ON(!spin_is_locked(&wqh->lock));
+	lockdep_assert_held(&wqh->lock);
 
 	uwq = NULL;
 	if (!waitqueue_active(wqh))
@@ -1122,7 +1145,7 @@
 
 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
-		spin_lock(&ctx->event_wqh.lock);
+		spin_lock_irq(&ctx->event_wqh.lock);
 		if (!list_empty(&fork_event)) {
 			/*
 			 * The fork thread didn't abort, so we can
@@ -1168,7 +1191,7 @@
 			if (ret)
 				userfaultfd_ctx_put(fork_nctx);
 		}
-		spin_unlock(&ctx->event_wqh.lock);
+		spin_unlock_irq(&ctx->event_wqh.lock);
 	}
 
 	return ret;
@@ -1207,14 +1230,14 @@
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
 			     struct userfaultfd_wake_range *range)
 {
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 }
 
 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
@@ -1249,21 +1272,23 @@
 }
 
 static __always_inline int validate_range(struct mm_struct *mm,
-					  __u64 start, __u64 len)
+					  __u64 *start, __u64 len)
 {
 	__u64 task_size = mm->task_size;
 
-	if (start & ~PAGE_MASK)
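+	/* Strip any architecture tag bits (e.g. arm64 tagged pointers) */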
+	*start = untagged_addr(*start);
+
+	if (*start & ~PAGE_MASK)
 		return -EINVAL;
 	if (len & ~PAGE_MASK)
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
-	if (start < mmap_min_addr)
+	if (*start < mmap_min_addr)
 		return -EINVAL;
-	if (start >= task_size)
+	if (*start >= task_size)
 		return -EINVAL;
-	if (len > task_size - start)
+	if (len > task_size - *start)
 		return -EINVAL;
 	return 0;
 }
@@ -1313,7 +1338,7 @@
 		goto out;
 	}
 
-	ret = validate_range(mm, uffdio_register.range.start,
+	ret = validate_range(mm, &uffdio_register.range.start,
 			     uffdio_register.range.len);
 	if (ret)
 		goto out;
@@ -1326,6 +1351,8 @@
 		goto out;
 
 	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
@@ -1500,7 +1527,7 @@
 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
 		goto out;
 
-	ret = validate_range(mm, uffdio_unregister.start,
+	ret = validate_range(mm, &uffdio_unregister.start,
 			     uffdio_unregister.len);
 	if (ret)
 		goto out;
@@ -1513,6 +1540,8 @@
 		goto out;
 
 	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
@@ -1649,7 +1678,7 @@
 	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+	ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
 	if (ret)
 		goto out;
 
@@ -1689,7 +1718,7 @@
 			   sizeof(uffdio_copy)-sizeof(__s64)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+	ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
 	if (ret)
 		goto out;
 	/*
@@ -1745,7 +1774,7 @@
 			   sizeof(uffdio_zeropage)-sizeof(__s64)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+	ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
 			     uffdio_zeropage.range.len);
 	if (ret)
 		goto out;
@@ -1865,7 +1894,7 @@
 	wait_queue_entry_t *wq;
 	unsigned long pending = 0, total = 0;
 
-	spin_lock(&ctx->fault_pending_wqh.lock);
+	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
 		pending++;
 		total++;
@@ -1873,7 +1902,7 @@
 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
 		total++;
 	}
-	spin_unlock(&ctx->fault_pending_wqh.lock);
+	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	/*
 	 * If more protocols will be added, there will be all shown
@@ -1914,6 +1943,9 @@
 	struct userfaultfd_ctx *ctx;
 	int fd;
 
+	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+
 	BUG_ON(!current->mm);
 
 	/* Check the UFFD_* constants for consistency.  */
@@ -1927,7 +1959,7 @@
 	if (!ctx)
 		return -ENOMEM;
 
-	atomic_set(&ctx->refcount, 1);
+	refcount_set(&ctx->refcount, 1);
 	ctx->flags = flags;
 	ctx->features = 0;
 	ctx->state = UFFD_STATE_WAIT_API;
diff --git a/fs/utimes.c b/fs/utimes.c
index 69d4b6b..1ba3f78 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -8,35 +8,6 @@
 #include <linux/compat.h>
 #include <asm/unistd.h>
 
-#ifdef __ARCH_WANT_SYS_UTIME
-
-/*
- * sys_utime() can be implemented in user-level using sys_utimes().
- * Is this for backwards compatibility?  If so, why not move it
- * into the appropriate arch directory (for those architectures that
- * need it).
- */
-
-/* If times==NULL, set access and modification to current time,
- * must be owner or have write permission.
- * Else, update from *times, must be owner or super user.
- */
-SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
-{
-	struct timespec64 tv[2];
-
-	if (times) {
-		if (get_user(tv[0].tv_sec, &times->actime) ||
-		    get_user(tv[1].tv_sec, &times->modtime))
-			return -EFAULT;
-		tv[0].tv_nsec = 0;
-		tv[1].tv_nsec = 0;
-	}
-	return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0);
-}
-
-#endif
-
 static bool nsec_valid(long nsec)
 {
 	if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
@@ -65,16 +36,14 @@
 		if (times[0].tv_nsec == UTIME_OMIT)
 			newattrs.ia_valid &= ~ATTR_ATIME;
 		else if (times[0].tv_nsec != UTIME_NOW) {
-			newattrs.ia_atime.tv_sec = times[0].tv_sec;
-			newattrs.ia_atime.tv_nsec = times[0].tv_nsec;
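+			/* Clamp to the timestamp range and granularity of the fs */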
+			newattrs.ia_atime = timestamp_truncate(times[0], inode);
 			newattrs.ia_valid |= ATTR_ATIME_SET;
 		}
 
 		if (times[1].tv_nsec == UTIME_OMIT)
 			newattrs.ia_valid &= ~ATTR_MTIME;
 		else if (times[1].tv_nsec != UTIME_NOW) {
-			newattrs.ia_mtime.tv_sec = times[1].tv_sec;
-			newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
+			newattrs.ia_mtime = timestamp_truncate(times[1], inode);
 			newattrs.ia_valid |= ATTR_MTIME_SET;
 		}
 		/*
@@ -166,7 +135,7 @@
 }
 
 SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
-		struct timespec __user *, utimes, int, flags)
+		struct __kernel_timespec __user *, utimes, int, flags)
 {
 	struct timespec64 tstimes[2];
 
@@ -184,6 +153,13 @@
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
+#ifdef __ARCH_WANT_SYS_UTIME
+/*
+ * futimesat(), utimes() and utime() are older versions of utimensat()
+ * that are provided for compatibility with traditional C libraries.
+ * On modern architectures, we always use libc wrappers around
+ * utimensat() instead.
+ */
 static long do_futimesat(int dfd, const char __user *filename,
 			 struct timeval __user *utimes)
 {
@@ -225,13 +201,29 @@
 	return do_futimesat(AT_FDCWD, filename, utimes);
 }
 
-#ifdef CONFIG_COMPAT
+SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
+{
+	struct timespec64 tv[2];
+
+	if (times) {
+		if (get_user(tv[0].tv_sec, &times->actime) ||
+		    get_user(tv[1].tv_sec, &times->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0);
+}
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
-		       struct compat_utimbuf __user *, t)
+#ifdef __ARCH_WANT_SYS_UTIME32
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+		struct old_utimbuf32 __user *, t)
 {
 	struct timespec64 tv[2];
 
@@ -244,14 +236,15 @@
 	}
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
+#endif
 
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
 {
 	struct timespec64 tv[2];
 
 	if  (t) {
-		if (compat_get_timespec64(&tv[0], &t[0]) ||
-		    compat_get_timespec64(&tv[1], &t[1]))
+		if (get_old_timespec32(&tv[0], &t[0]) ||
+		    get_old_timespec32(&tv[1], &t[1]))
 			return -EFAULT;
 
 		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -260,8 +253,9 @@
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
+#ifdef __ARCH_WANT_SYS_UTIME32
 static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
-				struct compat_timeval __user *t)
+				struct old_timeval32 __user *t)
 {
 	struct timespec64 tv[2];
 
@@ -280,15 +274,16 @@
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
 		       const char __user *, filename,
-		       struct compat_timeval __user *, t)
+		       struct old_timeval32 __user *, t)
 {
 	return do_compat_futimesat(dfd, filename, t);
 }
 
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
 {
 	return do_compat_futimesat(AT_FDCWD, filename, t);
 }
 #endif
+#endif
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
new file mode 100644
index 0000000..88fb251
--- /dev/null
+++ b/fs/verity/Kconfig
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config FS_VERITY
+	bool "FS Verity (read-only file-based authenticity protection)"
+	select CRYPTO
+	# SHA-256 is selected as it's intended to be the default hash algorithm.
+	# To avoid bloat, other wanted algorithms must be selected explicitly.
+	select CRYPTO_SHA256
+	help
+	  This option enables fs-verity.  fs-verity is the dm-verity
+	  mechanism implemented at the file level.  On supported
+	  filesystems (currently EXT4 and F2FS), userspace can use an
+	  ioctl to enable verity for a file, which causes the filesystem
+	  to build a Merkle tree for the file.  The filesystem will then
+	  transparently verify any data read from the file against the
+	  Merkle tree.  The file is also made read-only.
+
+	  This serves as an integrity check, but the availability of the
+	  Merkle tree root hash also allows efficiently supporting
+	  various use cases where normally the whole file would need to
+	  be hashed at once, such as: (a) auditing (logging the file's
+	  hash), or (b) authenticity verification (comparing the hash
+	  against a known good value, e.g. from a digital signature).
+
+	  fs-verity is especially useful on large files where not all
+	  the contents may actually be needed.  Also, fs-verity verifies
+	  data each time it is paged back in, which provides better
+	  protection against malicious disks than an ahead-of-time hash
+	  of the whole file would.
+
+	  If unsure, say N.
+
+config FS_VERITY_DEBUG
+	bool "FS Verity debugging"
+	depends on FS_VERITY
+	help
+	  Enable debugging messages related to fs-verity by default.
+
+	  Say N unless you are an fs-verity developer.
+
+config FS_VERITY_BUILTIN_SIGNATURES
+	bool "FS Verity builtin signature support"
+	depends on FS_VERITY
+	select SYSTEM_DATA_VERIFICATION
+	help
+	  Support verifying signatures of verity files against the X.509
+	  certificates that have been loaded into the ".fs-verity"
+	  kernel keyring.
+
+	  This is meant as a relatively simple mechanism that can be
+	  used to provide an authenticity guarantee for verity files, as
+	  an alternative to IMA appraisal.  Userspace programs still
+	  need to check that the verity bit is set in order to get an
+	  authenticity guarantee.
+
+	  If unsure, say N.
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
new file mode 100644
index 0000000..570e913
--- /dev/null
+++ b/fs/verity/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_FS_VERITY) += enable.o \
+			   hash_algs.o \
+			   init.o \
+			   measure.o \
+			   open.o \
+			   verify.o
+
+obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
new file mode 100644
index 0000000..eabc6ac
--- /dev/null
+++ b/fs/verity/enable.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/enable.c: ioctl to enable verity on a file
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+static int build_merkle_tree_level(struct inode *inode, unsigned int level,
+				   u64 num_blocks_to_hash,
+				   const struct merkle_tree_params *params,
+				   u8 *pending_hashes,
+				   struct ahash_request *req)
+{
+	const struct fsverity_operations *vops = inode->i_sb->s_vop;
+	unsigned int pending_size = 0;
+	u64 dst_block_num;
+	u64 i;
+	int err;
+
+	if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */
+		return -EINVAL;
+
+	if (level < params->num_levels) {
+		dst_block_num = params->level_start[level];
+	} else {
+		if (WARN_ON(num_blocks_to_hash != 1))
+			return -EINVAL;
+		dst_block_num = 0; /* unused */
+	}
+
+	for (i = 0; i < num_blocks_to_hash; i++) {
+		struct page *src_page;
+
+		if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash)
+			pr_debug("Hashing block %llu of %llu for level %u\n",
+				 i + 1, num_blocks_to_hash, level);
+
+		if (level == 0) {
+			/* Leaf: hashing a data block */
+			src_page = read_mapping_page(inode->i_mapping, i, NULL);
+			if (IS_ERR(src_page)) {
+				err = PTR_ERR(src_page);
+				fsverity_err(inode,
+					     "Error %d reading data page %llu",
+					     err, i);
+				return err;
+			}
+		} else {
+			/* Non-leaf: hashing hash block from level below */
+			src_page = vops->read_merkle_tree_page(inode,
+					params->level_start[level - 1] + i);
+			if (IS_ERR(src_page)) {
+				err = PTR_ERR(src_page);
+				fsverity_err(inode,
+					     "Error %d reading Merkle tree page %llu",
+					     err, params->level_start[level - 1] + i);
+				return err;
+			}
+		}
+
+		err = fsverity_hash_page(params, inode, req, src_page,
+					 &pending_hashes[pending_size]);
+		put_page(src_page);
+		if (err)
+			return err;
+		pending_size += params->digest_size;
+
+		if (level == params->num_levels) /* Root hash? */
+			return 0;
+
+		if (pending_size + params->digest_size > params->block_size ||
+		    i + 1 == num_blocks_to_hash) {
+			/* Flush the pending hash block */
+			memset(&pending_hashes[pending_size], 0,
+			       params->block_size - pending_size);
+			err = vops->write_merkle_tree_block(inode,
+					pending_hashes,
+					dst_block_num,
+					params->log_blocksize);
+			if (err) {
+				fsverity_err(inode,
+					     "Error %d writing Merkle tree block %llu",
+					     err, dst_block_num);
+				return err;
+			}
+			dst_block_num++;
+			pending_size = 0;
+		}
+
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * Build the Merkle tree for the given inode using the given parameters, and
+ * return the root hash in @root_hash.
+ *
+ * The tree is written to a filesystem-specific location as determined by the
+ * ->write_merkle_tree_block() method.  However, the blocks that comprise the
+ * tree are the same for all filesystems.
+ */
+static int build_merkle_tree(struct inode *inode,
+			     const struct merkle_tree_params *params,
+			     u8 *root_hash)
+{
+	u8 *pending_hashes;
+	struct ahash_request *req;
+	u64 blocks;
+	unsigned int level;
+	int err = -ENOMEM;
+
+	if (inode->i_size == 0) {
+		/* Empty file is a special case; root hash is all 0's */
+		memset(root_hash, 0, params->digest_size);
+		return 0;
+	}
+
+	pending_hashes = kmalloc(params->block_size, GFP_KERNEL);
+	req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL);
+	if (!pending_hashes || !req)
+		goto out;
+
+	/*
+	 * Build each level of the Merkle tree, starting at the leaf level
+	 * (level 0) and ascending to the root node (level 'num_levels - 1').
+	 * Then at the end (level 'num_levels'), calculate the root hash.
+	 */
+	blocks = (inode->i_size + params->block_size - 1) >>
+		 params->log_blocksize;
+	for (level = 0; level <= params->num_levels; level++) {
+		err = build_merkle_tree_level(inode, level, blocks, params,
+					      pending_hashes, req);
+		if (err)
+			goto out;
+		blocks = (blocks + params->hashes_per_block - 1) >>
+			 params->log_arity;
+	}
+	memcpy(root_hash, pending_hashes, params->digest_size);
+	err = 0;
+out:
+	kfree(pending_hashes);
+	ahash_request_free(req);
+	return err;
+}
+
+static int enable_verity(struct file *filp,
+			 const struct fsverity_enable_arg *arg)
+{
+	struct inode *inode = file_inode(filp);
+	const struct fsverity_operations *vops = inode->i_sb->s_vop;
+	struct merkle_tree_params params = { };
+	struct fsverity_descriptor *desc;
+	size_t desc_size = sizeof(*desc) + arg->sig_size;
+	struct fsverity_info *vi;
+	int err;
+
+	/* Start initializing the fsverity_descriptor */
+	desc = kzalloc(desc_size, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+	desc->version = 1;
+	desc->hash_algorithm = arg->hash_algorithm;
+	desc->log_blocksize = ilog2(arg->block_size);
+
+	/* Get the salt if the user provided one */
+	if (arg->salt_size &&
+	    copy_from_user(desc->salt,
+			   (const u8 __user *)(uintptr_t)arg->salt_ptr,
+			   arg->salt_size)) {
+		err = -EFAULT;
+		goto out;
+	}
+	desc->salt_size = arg->salt_size;
+
+	/* Get the signature if the user provided one */
+	if (arg->sig_size &&
+	    copy_from_user(desc->signature,
+			   (const u8 __user *)(uintptr_t)arg->sig_ptr,
+			   arg->sig_size)) {
+		err = -EFAULT;
+		goto out;
+	}
+	desc->sig_size = cpu_to_le32(arg->sig_size);
+
+	desc->data_size = cpu_to_le64(inode->i_size);
+
+	/* Prepare the Merkle tree parameters */
+	err = fsverity_init_merkle_tree_params(&params, inode,
+					       arg->hash_algorithm,
+					       desc->log_blocksize,
+					       desc->salt, desc->salt_size);
+	if (err)
+		goto out;
+
+	/*
+	 * Start enabling verity on this file, serialized by the inode lock.
+	 * Fail if verity is already enabled or is already being enabled.
+	 */
+	inode_lock(inode);
+	if (IS_VERITY(inode))
+		err = -EEXIST;
+	else
+		err = vops->begin_enable_verity(filp);
+	inode_unlock(inode);
+	if (err)
+		goto out;
+
+	/*
+	 * Build the Merkle tree.  Don't hold the inode lock during this, since
+	 * on huge files this may take a very long time and we don't want to
+	 * force unrelated syscalls like chown() to block forever.  We don't
+	 * need the inode lock here because deny_write_access() already prevents
+	 * the file from being written to or truncated, and we still serialize
+	 * ->begin_enable_verity() and ->end_enable_verity() using the inode
+	 * lock and only allow one process to be here at a time on a given file.
+	 */
+	pr_debug("Building Merkle tree...\n");
+	BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE);
+	err = build_merkle_tree(inode, &params, desc->root_hash);
+	if (err) {
+		fsverity_err(inode, "Error %d building Merkle tree", err);
+		goto rollback;
+	}
+	pr_debug("Done building Merkle tree.  Root hash is %s:%*phN\n",
+		 params.hash_alg->name, params.digest_size, desc->root_hash);
+
+	/*
+	 * Create the fsverity_info.  Don't bother trying to save work by
+	 * reusing the merkle_tree_params from above.  Instead, just create the
+	 * fsverity_info from the fsverity_descriptor as if it were just loaded
+	 * from disk.  This is simpler, and it serves as an extra check that the
+	 * metadata we're writing is valid before actually enabling verity.
+	 */
+	vi = fsverity_create_info(inode, desc, desc_size);
+	if (IS_ERR(vi)) {
+		err = PTR_ERR(vi);
+		goto rollback;
+	}
+
+	if (arg->sig_size)
+		pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n",
+			 arg->sig_size);
+
+	/*
+	 * Tell the filesystem to finish enabling verity on the file.
+	 * Serialized with ->begin_enable_verity() by the inode lock.
+	 */
+	inode_lock(inode);
+	err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
+	inode_unlock(inode);
+	if (err) {
+		fsverity_err(inode, "%ps() failed with err %d",
+			     vops->end_enable_verity, err);
+		fsverity_free_info(vi);
+	} else if (WARN_ON(!IS_VERITY(inode))) {
+		err = -EINVAL;
+		fsverity_free_info(vi);
+	} else {
+		/* Successfully enabled verity */
+
+		/*
+		 * Readers can start using ->i_verity_info immediately, so it
+		 * can't be rolled back once set.  So don't set it until just
+		 * after the filesystem has successfully enabled verity.
+		 */
+		fsverity_set_info(inode, vi);
+	}
+out:
+	kfree(params.hashstate);
+	kfree(desc);
+	return err;
+
+rollback:
+	inode_lock(inode);
+	(void)vops->end_enable_verity(filp, NULL, 0, params.tree_size);
+	inode_unlock(inode);
+	goto out;
+}
+
+/**
+ * fsverity_ioctl_enable() - enable verity on a file
+ *
+ * Enable fs-verity on a file.  See the "FS_IOC_ENABLE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
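+ *
+ * Userspace usage sketch (illustrative only; error handling trimmed, and
+ * the UAPI names come from <linux/fsverity.h>).  Note that block_size
+ * must equal the system page size, as enforced below:
+ *
+ *	struct fsverity_enable_arg arg = {
+ *		.version = 1,
+ *		.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
+ *		.block_size = 4096,
+ *	};
+ *	int fd = open("file", O_RDONLY);
+ *
+ *	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0)
+ *		perror("FS_IOC_ENABLE_VERITY");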
+ */
+int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	struct fsverity_enable_arg arg;
+	int err;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.version != 1)
+		return -EINVAL;
+
+	if (arg.__reserved1 ||
+	    memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
+		return -EINVAL;
+
+	if (arg.block_size != PAGE_SIZE)
+		return -EINVAL;
+
+	if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt))
+		return -EMSGSIZE;
+
+	if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)
+		return -EMSGSIZE;
+
+	/*
+	 * Require a regular file with write access.  But the actual fd must
+	 * still be readonly so that we can lock out all writers.  This is
+	 * needed to guarantee that no writable fds exist to the file once it
+	 * has verity enabled, and to stabilize the data being hashed.
+	 */
+
+	err = inode_permission(inode, MAY_WRITE);
+	if (err)
+		return err;
+
+	if (IS_APPEND(inode))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	err = mnt_want_write_file(filp);
+	if (err) /* -EROFS */
+		return err;
+
+	err = deny_write_access(filp);
+	if (err) /* -ETXTBSY */
+		goto out_drop_write;
+
+	err = enable_verity(filp, &arg);
+	if (err)
+		goto out_allow_write_access;
+
+	/*
+	 * Some pages of the file may have been evicted from pagecache after
+	 * being used in the Merkle tree construction, then read into pagecache
+	 * again by another process reading from the file concurrently.  Since
+	 * these pages didn't undergo verification against the file measurement
+	 * which fs-verity now claims to be enforcing, we have to wipe the
+	 * pagecache to ensure that all future reads are verified.
+	 */
+	filemap_write_and_wait(inode->i_mapping);
+	invalidate_inode_pages2(inode->i_mapping);
+
+	/*
+	 * allow_write_access() is needed to pair with deny_write_access().
+	 * Regardless, the filesystem won't allow writing to verity files.
+	 */
+out_allow_write_access:
+	allow_write_access(filp);
+out_drop_write:
+	mnt_drop_write_file(filp);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_enable);
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
new file mode 100644
index 0000000..e74c79b
--- /dev/null
+++ b/fs/verity/fsverity_private.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs-verity: read-only file-based authenticity protection
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef _FSVERITY_PRIVATE_H
+#define _FSVERITY_PRIVATE_H
+
+#ifdef CONFIG_FS_VERITY_DEBUG
+#define DEBUG
+#endif
+
+#define pr_fmt(fmt) "fs-verity: " fmt
+
+#include <crypto/sha.h>
+#include <linux/fsverity.h>
+
+struct ahash_request;
+
+/*
+ * Implementation limit: maximum depth of the Merkle tree.  For now 8 is plenty;
+ * it's enough for over U64_MAX bytes of data using SHA-256 and 4K blocks.
+ */
+#define FS_VERITY_MAX_LEVELS		8
+
+/*
+ * Largest digest size among all hash algorithms supported by fs-verity.
+ * Currently assumed to be <= size of fsverity_descriptor::root_hash.
+ */
+#define FS_VERITY_MAX_DIGEST_SIZE	SHA512_DIGEST_SIZE
+
+/* A hash algorithm supported by fs-verity */
+struct fsverity_hash_alg {
+	struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
+	const char *name;	  /* crypto API name, e.g. sha256 */
+	unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
+	unsigned int block_size;  /* block size in bytes, e.g. 64 for SHA-256 */
+};
+
+/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
+struct merkle_tree_params {
+	const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
+	const u8 *hashstate;		/* initial hash state or NULL */
+	unsigned int digest_size;	/* same as hash_alg->digest_size */
+	unsigned int block_size;	/* size of data and tree blocks */
+	unsigned int hashes_per_block;	/* number of hashes per tree block */
+	unsigned int log_blocksize;	/* log2(block_size) */
+	unsigned int log_arity;		/* log2(hashes_per_block) */
+	unsigned int num_levels;	/* number of levels in Merkle tree */
+	u64 tree_size;			/* Merkle tree size in bytes */
+
+	/*
+	 * Starting block index for each tree level, ordered from leaf level (0)
+	 * to root level ('num_levels - 1')
+	 */
+	u64 level_start[FS_VERITY_MAX_LEVELS];
+};
+
+/**
+ * fsverity_info - cached verity metadata for an inode
+ *
+ * When a verity file is first opened, an instance of this struct is allocated
+ * and stored in ->i_verity_info; it remains until the inode is evicted.  It
+ * caches information about the Merkle tree that's needed to efficiently verify
+ * data read from the file.  It also caches the file measurement.  The Merkle
+ * tree pages themselves are not cached here, but the filesystem may cache them.
+ */
+struct fsverity_info {
+	struct merkle_tree_params tree_params;
+	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+	const struct inode *inode;
+};
+
+/*
+ * Merkle tree properties.  The file measurement is the hash of this structure
+ * excluding the signature and with the sig_size field set to 0.
+ */
+struct fsverity_descriptor {
+	__u8 version;		/* must be 1 */
+	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
+	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
+	__u8 salt_size;		/* size of salt in bytes; 0 if none */
+	__le32 sig_size;	/* size of signature in bytes; 0 if none */
+	__le64 data_size;	/* size of file the Merkle tree is built over */
+	__u8 root_hash[64];	/* Merkle tree root hash */
+	__u8 salt[32];		/* salt prepended to each hashed block */
+	__u8 __reserved[144];	/* must be 0's */
+	__u8 signature[];	/* optional PKCS#7 signature */
+};
+
+/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
+#define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
+
+#define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
+					 sizeof(struct fsverity_descriptor))
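For reference, the fixed part of the descriptor above totals 1 + 1 + 1 + 1 + 4 + 8 + 64 + 32 + 144 = 256 bytes, so the signature limit works out to 16384 - 256 = 16128 bytes.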
+
+/*
+ * Format in which verity file measurements are signed.  This is the same as
+ * 'struct fsverity_digest', except here some magic bytes are prepended to
+ * provide some context about what is being signed in case the same key is used
+ * for non-fsverity purposes, and here the fields have fixed endianness.
+ */
+struct fsverity_signed_digest {
+	char magic[8];			/* must be "FSVerity" */
+	__le16 digest_algorithm;
+	__le16 digest_size;
+	__u8 digest[];
+};
+
+/* hash_algs.c */
+
+extern struct fsverity_hash_alg fsverity_hash_algs[];
+
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+						      unsigned int num);
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+				      const u8 *salt, size_t salt_size);
+int fsverity_hash_page(const struct merkle_tree_params *params,
+		       const struct inode *inode,
+		       struct ahash_request *req, struct page *page, u8 *out);
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+			 const void *data, size_t size, u8 *out);
+void __init fsverity_check_hash_algs(void);
+
+/* init.c */
+
+extern void __printf(3, 4) __cold
+fsverity_msg(const struct inode *inode, const char *level,
+	     const char *fmt, ...);
+
+#define fsverity_warn(inode, fmt, ...)		\
+	fsverity_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fsverity_err(inode, fmt, ...)		\
+	fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+
+/* open.c */
+
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size);
+
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   void *desc, size_t desc_size);
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
+
+void fsverity_free_info(struct fsverity_info *vi);
+
+int __init fsverity_init_info_cache(void);
+void __init fsverity_exit_info_cache(void);
+
+/* signature.c */
+
+#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+int fsverity_verify_signature(const struct fsverity_info *vi,
+			      const struct fsverity_descriptor *desc,
+			      size_t desc_size);
+
+int __init fsverity_init_signature(void);
+#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+static inline int
+fsverity_verify_signature(const struct fsverity_info *vi,
+			  const struct fsverity_descriptor *desc,
+			  size_t desc_size)
+{
+	return 0;
+}
+
+static inline int fsverity_init_signature(void)
+{
+	return 0;
+}
+#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+
+/* verify.c */
+
+int __init fsverity_init_workqueue(void);
+void __init fsverity_exit_workqueue(void);
+
+#endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
new file mode 100644
index 0000000..31e6d7d
--- /dev/null
+++ b/fs/verity/hash_algs.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/hash_algs.c: fs-verity hash algorithms
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/scatterlist.h>
+
+/* The hash algorithms supported by fs-verity */
+struct fsverity_hash_alg fsverity_hash_algs[] = {
+	[FS_VERITY_HASH_ALG_SHA256] = {
+		.name = "sha256",
+		.digest_size = SHA256_DIGEST_SIZE,
+		.block_size = SHA256_BLOCK_SIZE,
+	},
+	[FS_VERITY_HASH_ALG_SHA512] = {
+		.name = "sha512",
+		.digest_size = SHA512_DIGEST_SIZE,
+		.block_size = SHA512_BLOCK_SIZE,
+	},
+};
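The array indices above are the UAPI algorithm numbers (FS_VERITY_HASH_ALG_SHA256 == 1, FS_VERITY_HASH_ALG_SHA512 == 2), which is why later code in this patch can recover an algorithm number from a pointer by plain pointer arithmetic, e.g. in fsverity_ioctl_measure():

	arg.digest_algorithm = hash_alg - fsverity_hash_algs;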
+
+/**
+ * fsverity_get_hash_alg() - validate and prepare a hash algorithm
+ * @inode: optional inode for logging purposes
+ * @num: the hash algorithm number
+ *
+ * Get the struct fsverity_hash_alg for the given hash algorithm number, and
+ * ensure it has a hash transform ready to go.  The hash transforms are
+ * allocated on-demand so that we don't waste resources unnecessarily, and
+ * because the crypto modules may be initialized later than fs/verity/.
+ *
+ * Return: pointer to the hash alg on success, else an ERR_PTR()
+ */
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+						      unsigned int num)
+{
+	struct fsverity_hash_alg *alg;
+	struct crypto_ahash *tfm;
+	int err;
+
+	if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
+	    !fsverity_hash_algs[num].name) {
+		fsverity_warn(inode, "Unknown hash algorithm number: %u", num);
+		return ERR_PTR(-EINVAL);
+	}
+	alg = &fsverity_hash_algs[num];
+
+	/* pairs with cmpxchg() below */
+	tfm = READ_ONCE(alg->tfm);
+	if (likely(tfm != NULL))
+		return alg;
+	/*
+	 * Using the shash API would make things a bit simpler, but the ahash
+	 * API is preferable as it allows the use of crypto accelerators.
+	 */
+	tfm = crypto_alloc_ahash(alg->name, 0, 0);
+	if (IS_ERR(tfm)) {
+		if (PTR_ERR(tfm) == -ENOENT) {
+			fsverity_warn(inode,
+				      "Missing crypto API support for hash algorithm \"%s\"",
+				      alg->name);
+			return ERR_PTR(-ENOPKG);
+		}
+		fsverity_err(inode,
+			     "Error allocating hash algorithm \"%s\": %ld",
+			     alg->name, PTR_ERR(tfm));
+		return ERR_CAST(tfm);
+	}
+
+	err = -EINVAL;
+	if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm)))
+		goto err_free_tfm;
+	if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm)))
+		goto err_free_tfm;
+
+	pr_info("%s using implementation \"%s\"\n",
+		alg->name, crypto_ahash_driver_name(tfm));
+
+	/* pairs with READ_ONCE() above */
+	if (cmpxchg(&alg->tfm, NULL, tfm) != NULL)
+		crypto_free_ahash(tfm);
+
+	return alg;
+
+err_free_tfm:
+	crypto_free_ahash(tfm);
+	return ERR_PTR(err);
+}
+
+/**
+ * fsverity_prepare_hash_state() - precompute the initial hash state
+ * @alg: hash algorithm
+ * @salt: a salt which is to be prepended to all data to be hashed
+ * @salt_size: salt size in bytes, possibly 0
+ *
+ * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
+ *	   initial hash state on success or an ERR_PTR() on failure.
+ */
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+				      const u8 *salt, size_t salt_size)
+{
+	u8 *hashstate = NULL;
+	struct ahash_request *req = NULL;
+	u8 *padded_salt = NULL;
+	size_t padded_salt_size;
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	if (salt_size == 0)
+		return NULL;
+
+	hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL);
+	if (!hashstate)
+		return ERR_PTR(-ENOMEM);
+
+	req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+	if (!req) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	/*
+	 * Zero-pad the salt to the next multiple of the input size of the hash
+	 * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128
+	 * bytes for SHA-512.  This ensures that the hash algorithm won't have
+	 * any bytes buffered internally after processing the salt, thus making
+	 * salted hashing just as fast as unsalted hashing.
+	 */
+	padded_salt_size = round_up(salt_size, alg->block_size);
+	padded_salt = kzalloc(padded_salt_size, GFP_KERNEL);
+	if (!padded_salt) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+	memcpy(padded_salt, salt, salt_size);
+
+	sg_init_one(&sg, padded_salt, padded_salt_size);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, NULL, padded_salt_size);
+
+	err = crypto_wait_req(crypto_ahash_init(req), &wait);
+	if (err)
+		goto err_free;
+
+	err = crypto_wait_req(crypto_ahash_update(req), &wait);
+	if (err)
+		goto err_free;
+
+	err = crypto_ahash_export(req, hashstate);
+	if (err)
+		goto err_free;
+out:
+	ahash_request_free(req);
+	kfree(padded_salt);
+	return hashstate;
+
+err_free:
+	kfree(hashstate);
+	hashstate = ERR_PTR(err);
+	goto out;
+}
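For example, a 4-byte salt with SHA-256 (block_size = 64) is padded to 64 bytes, so after the crypto_ahash_update() above the exported state contains no partially-filled block, and every later import resumes exactly on a compression-function boundary.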
+
+/**
+ * fsverity_hash_page() - hash a single data or hash page
+ * @params: the Merkle tree's parameters
+ * @inode: inode for which the hashing is being done
+ * @req: preallocated hash request
+ * @page: the page to hash
+ * @out: output digest, size 'params->digest_size' bytes
+ *
+ * Hash a single data or hash block, assuming block_size == PAGE_SIZE.
+ * The hash is salted if a salt is specified in the Merkle tree parameters.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_page(const struct merkle_tree_params *params,
+		       const struct inode *inode,
+		       struct ahash_request *req, struct page *page, u8 *out)
+{
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	if (WARN_ON(params->block_size != PAGE_SIZE))
+		return -EINVAL;
+
+	sg_init_table(&sg, 1);
+	sg_set_page(&sg, page, PAGE_SIZE, 0);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, out, PAGE_SIZE);
+
+	if (params->hashstate) {
+		err = crypto_ahash_import(req, params->hashstate);
+		if (err) {
+			fsverity_err(inode,
+				     "Error %d importing hash state", err);
+			return err;
+		}
+		err = crypto_ahash_finup(req);
+	} else {
+		err = crypto_ahash_digest(req);
+	}
+
+	err = crypto_wait_req(err, &wait);
+	if (err)
+		fsverity_err(inode, "Error %d computing page hash", err);
+	return err;
+}
+
+/**
+ * fsverity_hash_buffer() - hash some data
+ * @alg: the hash algorithm to use
+ * @data: the data to hash
+ * @size: size of data to hash, in bytes
+ * @out: output digest, size 'alg->digest_size' bytes
+ *
+ * Hash some data which is located in physically contiguous memory (i.e. memory
+ * allocated by kmalloc(), not by vmalloc()).  No salt is used.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+			 const void *data, size_t size, u8 *out)
+{
+	struct ahash_request *req;
+	struct scatterlist sg;
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	sg_init_one(&sg, data, size);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &wait);
+	ahash_request_set_crypt(req, &sg, out, size);
+
+	err = crypto_wait_req(crypto_ahash_digest(req), &wait);
+
+	ahash_request_free(req);
+	return err;
+}
+
+void __init fsverity_check_hash_algs(void)
+{
+	size_t i;
+
+	/*
+	 * Sanity check the hash algorithms (could be a build-time check, but
+	 * they're in an array)
+	 */
+	for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) {
+		const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i];
+
+		if (!alg->name)
+			continue;
+
+		BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE);
+
+		/*
+		 * For efficiency, the implementation currently assumes the
+		 * digest and block sizes are powers of 2.  This limitation can
+		 * be lifted if the code is updated to handle other values.
+		 */
+		BUG_ON(!is_power_of_2(alg->digest_size));
+		BUG_ON(!is_power_of_2(alg->block_size));
+	}
+}
diff --git a/fs/verity/init.c b/fs/verity/init.c
new file mode 100644
index 0000000..94c104e
--- /dev/null
+++ b/fs/verity/init.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/init.c: fs-verity module initialization and logging
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/ratelimit.h>
+
+void fsverity_msg(const struct inode *inode, const char *level,
+		  const char *fmt, ...)
+{
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct va_format vaf;
+	va_list args;
+
+	if (!__ratelimit(&rs))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (inode)
+		printk("%sfs-verity (%s, inode %lu): %pV\n",
+		       level, inode->i_sb->s_id, inode->i_ino, &vaf);
+	else
+		printk("%sfs-verity: %pV\n", level, &vaf);
+	va_end(args);
+}
+
+static int __init fsverity_init(void)
+{
+	int err;
+
+	fsverity_check_hash_algs();
+
+	err = fsverity_init_info_cache();
+	if (err)
+		return err;
+
+	err = fsverity_init_workqueue();
+	if (err)
+		goto err_exit_info_cache;
+
+	err = fsverity_init_signature();
+	if (err)
+		goto err_exit_workqueue;
+
+	pr_debug("Initialized fs-verity\n");
+	return 0;
+
+err_exit_workqueue:
+	fsverity_exit_workqueue();
+err_exit_info_cache:
+	fsverity_exit_info_cache();
+	return err;
+}
+late_initcall(fsverity_init)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
new file mode 100644
index 0000000..05049b6
--- /dev/null
+++ b/fs/verity/measure.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/measure.c: ioctl to get a verity file's measurement
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/uaccess.h>
+
+/**
+ * fsverity_ioctl_measure() - get a verity file's measurement
+ *
+ * Retrieve the file measurement that the kernel is enforcing for reads from a
+ * verity file.  See the "FS_IOC_MEASURE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
+{
+	const struct inode *inode = file_inode(filp);
+	struct fsverity_digest __user *uarg = _uarg;
+	const struct fsverity_info *vi;
+	const struct fsverity_hash_alg *hash_alg;
+	struct fsverity_digest arg;
+
+	vi = fsverity_get_info(inode);
+	if (!vi)
+		return -ENODATA; /* not a verity file */
+	hash_alg = vi->tree_params.hash_alg;
+
+	/*
+	 * The user specifies the digest_size their buffer has space for; we can
+	 * return the digest if it fits in the available space.  We write back
+	 * the actual size, which may be shorter than the user-specified size.
+	 */
+
+	if (get_user(arg.digest_size, &uarg->digest_size))
+		return -EFAULT;
+	if (arg.digest_size < hash_alg->digest_size)
+		return -EOVERFLOW;
+
+	memset(&arg, 0, sizeof(arg));
+	arg.digest_algorithm = hash_alg - fsverity_hash_algs;
+	arg.digest_size = hash_alg->digest_size;
+
+	if (copy_to_user(uarg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
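A matching userspace sketch for reading back the measurement (again assuming <linux/fsverity.h>; the 64-byte buffer is sized for the largest supported digest, SHA-512):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>

/* Sketch only: print the measurement of an already-open verity file. */
static int print_measurement_example(int fd)
{
	struct fsverity_digest *d;
	int i;

	d = malloc(sizeof(*d) + 64);
	if (!d)
		return -1;
	d->digest_size = 64;	/* space available; the kernel writes back the actual size */
	if (ioctl(fd, FS_IOC_MEASURE_VERITY, d) != 0) {
		free(d);
		return -1;
	}
	for (i = 0; i < d->digest_size; i++)
		printf("%02x", d->digest[i]);
	printf("\n");
	free(d);
	return 0;
}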
diff --git a/fs/verity/open.c b/fs/verity/open.c
new file mode 100644
index 0000000..63d1004
--- /dev/null
+++ b/fs/verity/open.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/open.c: opening fs-verity files
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/slab.h>
+
+static struct kmem_cache *fsverity_info_cachep;
+
+/**
+ * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters
+ * @params: the parameters struct to initialize
+ * @inode: the inode for which the Merkle tree is being built
+ * @hash_algorithm: number of hash algorithm to use
+ * @log_blocksize: log base 2 of block size to use
+ * @salt: pointer to salt (optional)
+ * @salt_size: size of salt, possibly 0
+ *
+ * Validate the hash algorithm and block size, then compute the tree topology
+ * (num levels, num blocks in each level, etc.) and initialize @params.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+				     const struct inode *inode,
+				     unsigned int hash_algorithm,
+				     unsigned int log_blocksize,
+				     const u8 *salt, size_t salt_size)
+{
+	const struct fsverity_hash_alg *hash_alg;
+	int err;
+	u64 blocks;
+	u64 offset;
+	int level;
+
+	memset(params, 0, sizeof(*params));
+
+	hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
+	if (IS_ERR(hash_alg))
+		return PTR_ERR(hash_alg);
+	params->hash_alg = hash_alg;
+	params->digest_size = hash_alg->digest_size;
+
+	params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
+							salt_size);
+	if (IS_ERR(params->hashstate)) {
+		err = PTR_ERR(params->hashstate);
+		params->hashstate = NULL;
+		fsverity_err(inode, "Error %d preparing hash state", err);
+		goto out_err;
+	}
+
+	if (log_blocksize != PAGE_SHIFT) {
+		fsverity_warn(inode, "Unsupported log_blocksize: %u",
+			      log_blocksize);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_blocksize = log_blocksize;
+	params->block_size = 1 << log_blocksize;
+
+	if (WARN_ON(!is_power_of_2(params->digest_size))) {
+		err = -EINVAL;
+		goto out_err;
+	}
+	if (params->block_size < 2 * params->digest_size) {
+		fsverity_warn(inode,
+			      "Merkle tree block size (%u) too small for hash algorithm \"%s\"",
+			      params->block_size, hash_alg->name);
+		err = -EINVAL;
+		goto out_err;
+	}
+	params->log_arity = params->log_blocksize - ilog2(params->digest_size);
+	params->hashes_per_block = 1 << params->log_arity;
+
+	pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n",
+		 hash_alg->name, params->block_size, params->hashes_per_block,
+		 (int)salt_size, salt);
+
+	/*
+	 * Compute the number of levels in the Merkle tree and create a map from
+	 * level to the starting block of that level.  Level 'num_levels - 1' is
+	 * the root and is stored first.  Level 0 is the level directly "above"
+	 * the data blocks and is stored last.
+	 */
+
+	/* Compute number of levels and the number of blocks in each level */
+	blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+	pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
+	while (blocks > 1) {
+		if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
+			fsverity_err(inode, "Too many levels in Merkle tree");
+			err = -EINVAL;
+			goto out_err;
+		}
+		blocks = (blocks + params->hashes_per_block - 1) >>
+			 params->log_arity;
+		/* temporarily using level_start[] to store blocks in level */
+		params->level_start[params->num_levels++] = blocks;
+	}
+
+	/* Compute the starting block of each level */
+	offset = 0;
+	for (level = (int)params->num_levels - 1; level >= 0; level--) {
+		blocks = params->level_start[level];
+		params->level_start[level] = offset;
+		pr_debug("Level %d is %llu blocks starting at index %llu\n",
+			 level, blocks, offset);
+		offset += blocks;
+	}
+
+	params->tree_size = offset << log_blocksize;
+	return 0;
+
+out_err:
+	kfree(params->hashstate);
+	memset(params, 0, sizeof(*params));
+	return err;
+}
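To make the topology concrete, a worked example with SHA-256 and 4096-byte blocks: digest_size = 32, so log_arity = 12 - 5 = 7 and hashes_per_block = 128. A 1 GiB file has 262144 data blocks, so level 0 needs 2048 hash blocks, level 1 needs 16, and level 2 is the single root block; stored root-first, level_start[] ends up {17, 1, 0} and tree_size = (1 + 16 + 2048) * 4096 = 8458240 bytes.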
+
+/*
+ * Compute the file measurement by hashing the fsverity_descriptor excluding the
+ * signature and with the sig_size field set to 0.
+ */
+static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
+				    struct fsverity_descriptor *desc,
+				    u8 *measurement)
+{
+	__le32 sig_size = desc->sig_size;
+	int err;
+
+	desc->sig_size = 0;
+	err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+	desc->sig_size = sig_size;
+
+	return err;
+}
+
+/*
+ * Validate the given fsverity_descriptor and create a new fsverity_info from
+ * it.  The signature (if present) is also checked.
+ */
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+					   void *_desc, size_t desc_size)
+{
+	struct fsverity_descriptor *desc = _desc;
+	struct fsverity_info *vi;
+	int err;
+
+	if (desc_size < sizeof(*desc)) {
+		fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
+			     desc_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->version != 1) {
+		fsverity_err(inode, "Unrecognized descriptor version: %u",
+			     desc->version);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+		fsverity_err(inode, "Reserved bits set in descriptor");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (desc->salt_size > sizeof(desc->salt)) {
+		fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (le64_to_cpu(desc->data_size) != inode->i_size) {
+		fsverity_err(inode,
+			     "Wrong data_size: %llu (desc) != %lld (inode)",
+			     le64_to_cpu(desc->data_size), inode->i_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+	vi->inode = inode;
+
+	err = fsverity_init_merkle_tree_params(&vi->tree_params, inode,
+					       desc->hash_algorithm,
+					       desc->log_blocksize,
+					       desc->salt, desc->salt_size);
+	if (err) {
+		fsverity_err(inode,
+			     "Error %d initializing Merkle tree parameters",
+			     err);
+		goto out;
+	}
+
+	memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
+
+	err = compute_file_measurement(vi->tree_params.hash_alg, desc,
+				       vi->measurement);
+	if (err) {
+		fsverity_err(inode, "Error %d computing file measurement", err);
+		goto out;
+	}
+	pr_debug("Computed file measurement: %s:%*phN\n",
+		 vi->tree_params.hash_alg->name,
+		 vi->tree_params.digest_size, vi->measurement);
+
+	err = fsverity_verify_signature(vi, desc, desc_size);
+out:
+	if (err) {
+		fsverity_free_info(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
+{
+	/*
+	 * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
+	 * This pairs with the READ_ONCE() in fsverity_get_info().
+	 */
+	if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
+		fsverity_free_info(vi);
+}
+
+void fsverity_free_info(struct fsverity_info *vi)
+{
+	if (!vi)
+		return;
+	kfree(vi->tree_params.hashstate);
+	kmem_cache_free(fsverity_info_cachep, vi);
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+	struct fsverity_info *vi = fsverity_get_info(inode);
+	struct fsverity_descriptor *desc;
+	int res;
+
+	if (vi)
+		return 0;
+
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
+	if (res < 0) {
+		fsverity_err(inode,
+			     "Error %d getting verity descriptor size", res);
+		return res;
+	}
+	if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+		fsverity_err(inode, "Verity descriptor is too large (%d bytes)",
+			     res);
+		return -EMSGSIZE;
+	}
+	desc = kmalloc(res, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+	res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
+	if (res < 0) {
+		fsverity_err(inode, "Error %d reading verity descriptor", res);
+		goto out_free_desc;
+	}
+
+	vi = fsverity_create_info(inode, desc, res);
+	if (IS_ERR(vi)) {
+		res = PTR_ERR(vi);
+		goto out_free_desc;
+	}
+
+	fsverity_set_info(inode, vi);
+	res = 0;
+out_free_desc:
+	kfree(desc);
+	return res;
+}
+
+/**
+ * fsverity_file_open() - prepare to open a verity file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, deny the open if it is for writing.  Otherwise,
+ * set up the inode's ->i_verity_info if not already done.
+ *
+ * When combined with fscrypt, this must be called after fscrypt_file_open().
+ * Otherwise, we won't have the key set up to decrypt the verity metadata.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+	if (!IS_VERITY(inode))
+		return 0;
+
+	if (filp->f_mode & FMODE_WRITE) {
+		pr_debug("Denying opening verity file (ino %lu) for write\n",
+			 inode->i_ino);
+		return -EPERM;
+	}
+
+	return ensure_verity_info(inode);
+}
+EXPORT_SYMBOL_GPL(fsverity_file_open);
+
+/**
+ * fsverity_prepare_setattr() - prepare to change a verity inode's attributes
+ * @dentry: dentry through which the inode is being changed
+ * @attr: attributes to change
+ *
+ * Verity files are immutable, so deny truncates.  This isn't covered by the
+ * open-time check because sys_truncate() takes a path, not a file descriptor.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) {
+		pr_debug("Denying truncate of verity file (ino %lu)\n",
+			 d_inode(dentry)->i_ino);
+		return -EPERM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_prepare_setattr);
+
+/**
+ * fsverity_cleanup_inode() - free the inode's verity info, if present
+ *
+ * Filesystems must call this on inode eviction to free ->i_verity_info.
+ */
+void fsverity_cleanup_inode(struct inode *inode)
+{
+	fsverity_free_info(inode->i_verity_info);
+	inode->i_verity_info = NULL;
+}
+EXPORT_SYMBOL_GPL(fsverity_cleanup_inode);
+
+int __init fsverity_init_info_cache(void)
+{
+	fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
+						   SLAB_RECLAIM_ACCOUNT,
+						   measurement);
+	if (!fsverity_info_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fsverity_exit_info_cache(void)
+{
+	kmem_cache_destroy(fsverity_info_cachep);
+	fsverity_info_cachep = NULL;
+}
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
new file mode 100644
index 0000000..c8b2552
--- /dev/null
+++ b/fs/verity/signature.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/signature.c: verification of builtin signatures
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/cred.h>
+#include <linux/key.h>
+#include <linux/slab.h>
+#include <linux/verification.h>
+
+/*
+ * /proc/sys/fs/verity/require_signatures
+ * If 1, all verity files must have a valid builtin signature.
+ */
+static int fsverity_require_signatures;
+
+/*
+ * Keyring that contains the trusted X.509 certificates.
+ *
+ * Only root (kuid=0) can modify this.  Also, root may use
+ * keyctl_restrict_keyring() to prevent any more additions.
+ */
+static struct key *fsverity_keyring;
+
+/**
+ * fsverity_verify_signature() - check a verity file's signature
+ *
+ * If the file's fs-verity descriptor includes a signature of the file
+ * measurement, verify it against the certificates in the fs-verity keyring.
+ *
+ * Return: 0 on success (signature valid or not required); -errno on failure
+ */
+int fsverity_verify_signature(const struct fsverity_info *vi,
+			      const struct fsverity_descriptor *desc,
+			      size_t desc_size)
+{
+	const struct inode *inode = vi->inode;
+	const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
+	const u32 sig_size = le32_to_cpu(desc->sig_size);
+	struct fsverity_signed_digest *d;
+	int err;
+
+	if (sig_size == 0) {
+		if (fsverity_require_signatures) {
+			fsverity_err(inode,
+				     "require_signatures=1, rejecting unsigned file!");
+			return -EPERM;
+		}
+		return 0;
+	}
+
+	if (sig_size > desc_size - sizeof(*desc)) {
+		fsverity_err(inode, "Signature overflows verity descriptor");
+		return -EBADMSG;
+	}
+
+	d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+	memcpy(d->magic, "FSVerity", 8);
+	d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
+	d->digest_size = cpu_to_le16(hash_alg->digest_size);
+	memcpy(d->digest, vi->measurement, hash_alg->digest_size);
+
+	err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
+				     desc->signature, sig_size,
+				     fsverity_keyring,
+				     VERIFYING_UNSPECIFIED_SIGNATURE,
+				     NULL, NULL);
+	kfree(d);
+
+	if (err) {
+		if (err == -ENOKEY)
+			fsverity_err(inode,
+				     "File's signing cert isn't in the fs-verity keyring");
+		else if (err == -EKEYREJECTED)
+			fsverity_err(inode, "Incorrect file signature");
+		else if (err == -EBADMSG)
+			fsverity_err(inode, "Malformed file signature");
+		else
+			fsverity_err(inode, "Error %d verifying file signature",
+				     err);
+		return err;
+	}
+
+	pr_debug("Valid signature for file measurement %s:%*phN\n",
+		 hash_alg->name, hash_alg->digest_size, vi->measurement);
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fsverity_sysctl_header;
+
+static const struct ctl_path fsverity_sysctl_path[] = {
+	{ .procname = "fs", },
+	{ .procname = "verity", },
+	{ }
+};
+
+static struct ctl_table fsverity_sysctl_table[] = {
+	{
+		.procname       = "require_signatures",
+		.data           = &fsverity_require_signatures,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{ }
+};
+
+static int __init fsverity_sysctl_init(void)
+{
+	fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path,
+						       fsverity_sysctl_table);
+	if (!fsverity_sysctl_header) {
+		pr_err("sysctl registration failed!\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+#else /* !CONFIG_SYSCTL */
+static inline int __init fsverity_sysctl_init(void)
+{
+	return 0;
+}
+#endif /* !CONFIG_SYSCTL */
+
+int __init fsverity_init_signature(void)
+{
+	struct key *ring;
+	int err;
+
+	ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
+			     current_cred(), KEY_POS_SEARCH |
+				KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE |
+				KEY_USR_SEARCH | KEY_USR_SETATTR,
+			     KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(ring))
+		return PTR_ERR(ring);
+
+	err = fsverity_sysctl_init();
+	if (err)
+		goto err_put_ring;
+
+	fsverity_keyring = ring;
+	return 0;
+
+err_put_ring:
+	key_put(ring);
+	return err;
+}
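Certificates reach this keyring from userspace; the fsverity.rst documentation describes loading a DER-encoded X.509 certificate with, for example, keyctl padd asymmetric "" %keyring:.fs-verity < cert.der (root only, and the keyring can subsequently be locked down with keyctl_restrict_keyring() as noted above).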
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
new file mode 100644
index 0000000..3e8f2de
--- /dev/null
+++ b/fs/verity/verify.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/bio.h>
+#include <linux/ratelimit.h>
+
+static struct workqueue_struct *fsverity_read_workqueue;
+
+/**
+ * hash_at_level() - compute the location of the block's hash at the given level
+ *
+ * @params:	(in) the Merkle tree parameters
+ * @dindex:	(in) the index of the data block being verified
+ * @level:	(in) the level of hash we want (0 is leaf level)
+ * @hindex:	(out) the index of the hash block containing the wanted hash
+ * @hoffset:	(out) the byte offset to the wanted hash within the hash block
+ */
+static void hash_at_level(const struct merkle_tree_params *params,
+			  pgoff_t dindex, unsigned int level, pgoff_t *hindex,
+			  unsigned int *hoffset)
+{
+	pgoff_t position;
+
+	/* Offset of the hash within the level's region, in hashes */
+	position = dindex >> (level * params->log_arity);
+
+	/* Index of the hash block in the tree overall */
+	*hindex = params->level_start[level] + (position >> params->log_arity);
+
+	/* Offset of the wanted hash (in bytes) within the hash block */
+	*hoffset = (position & ((1 << params->log_arity) - 1)) <<
+		   (params->log_blocksize - params->log_arity);
+}
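Continuing the SHA-256/4K worked example (log_arity = 7, level_start[] = {17, 1, 0}): for data block dindex = 1000 at level 0, position = 1000, so *hindex = 17 + (1000 >> 7) = 24 and *hoffset = (1000 & 127) << 5 = 3328; at level 1, position = 1000 >> 7 = 7, giving *hindex = 1 and *hoffset = 224.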
+
+/* Extract a hash from a hash page */
+static void extract_hash(struct page *hpage, unsigned int hoffset,
+			 unsigned int hsize, u8 *out)
+{
+	void *virt = kmap_atomic(hpage);
+
+	memcpy(out, virt + hoffset, hsize);
+	kunmap_atomic(virt);
+}
+
+static inline int cmp_hashes(const struct fsverity_info *vi,
+			     const u8 *want_hash, const u8 *real_hash,
+			     pgoff_t index, int level)
+{
+	const unsigned int hsize = vi->tree_params.digest_size;
+
+	if (memcmp(want_hash, real_hash, hsize) == 0)
+		return 0;
+
+	fsverity_err(vi->inode,
+		     "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+		     index, level,
+		     vi->tree_params.hash_alg->name, hsize, want_hash,
+		     vi->tree_params.hash_alg->name, hsize, real_hash);
+	return -EBADMSG;
+}
+
+/*
+ * Verify a single data page against the file's Merkle tree.
+ *
+ * In principle, we need to verify the entire path to the root node.  However,
+ * for efficiency the filesystem may cache the hash pages.  Therefore we need
+ * only ascend the tree until an already-verified page is seen, as indicated by
+ * the PageChecked bit being set; then verify the path to that page.
+ *
+ * This code currently only supports the case where the verity block size is
+ * equal to PAGE_SIZE.  Doing otherwise would be possible but tricky, since we
+ * wouldn't be able to use the PageChecked bit.
+ *
+ * Note that multiple processes may race to verify a hash page and mark it
+ * Checked, but it doesn't matter; the result will be the same either way.
+ *
+ * Return: true if the page is valid, else false.
+ */
+static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
+			struct ahash_request *req, struct page *data_page)
+{
+	const struct merkle_tree_params *params = &vi->tree_params;
+	const unsigned int hsize = params->digest_size;
+	const pgoff_t index = data_page->index;
+	int level;
+	u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	const u8 *want_hash;
+	u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	struct page *hpages[FS_VERITY_MAX_LEVELS];
+	unsigned int hoffsets[FS_VERITY_MAX_LEVELS];
+	int err;
+
+	if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page)))
+		return false;
+
+	pr_debug_ratelimited("Verifying data page %lu...\n", index);
+
+	/*
+	 * Starting at the leaf level, ascend the tree saving hash pages along
+	 * the way until we find a verified hash page, indicated by PageChecked;
+	 * or until we reach the root.
+	 */
+	for (level = 0; level < params->num_levels; level++) {
+		pgoff_t hindex;
+		unsigned int hoffset;
+		struct page *hpage;
+
+		hash_at_level(params, index, level, &hindex, &hoffset);
+
+		pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n",
+				     level, hindex, hoffset);
+
+		hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
+								  hindex);
+		if (IS_ERR(hpage)) {
+			err = PTR_ERR(hpage);
+			fsverity_err(inode,
+				     "Error %d reading Merkle tree page %lu",
+				     err, hindex);
+			goto out;
+		}
+
+		if (PageChecked(hpage)) {
+			extract_hash(hpage, hoffset, hsize, _want_hash);
+			want_hash = _want_hash;
+			put_page(hpage);
+			pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
+					     params->hash_alg->name,
+					     hsize, want_hash);
+			goto descend;
+		}
+		pr_debug_ratelimited("Hash page not yet checked\n");
+		hpages[level] = hpage;
+		hoffsets[level] = hoffset;
+	}
+
+	want_hash = vi->root_hash;
+	pr_debug("Want root hash: %s:%*phN\n",
+		 params->hash_alg->name, hsize, want_hash);
+descend:
+	/* Descend the tree verifying hash pages */
+	for (; level > 0; level--) {
+		struct page *hpage = hpages[level - 1];
+		unsigned int hoffset = hoffsets[level - 1];
+
+		err = fsverity_hash_page(params, inode, req, hpage, real_hash);
+		if (err)
+			goto out;
+		err = cmp_hashes(vi, want_hash, real_hash, index, level - 1);
+		if (err)
+			goto out;
+		SetPageChecked(hpage);
+		extract_hash(hpage, hoffset, hsize, _want_hash);
+		want_hash = _want_hash;
+		put_page(hpage);
+		pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
+			 level - 1, params->hash_alg->name, hsize, want_hash);
+	}
+
+	/* Finally, verify the data page */
+	err = fsverity_hash_page(params, inode, req, data_page, real_hash);
+	if (err)
+		goto out;
+	err = cmp_hashes(vi, want_hash, real_hash, index, -1);
+out:
+	for (; level > 0; level--)
+		put_page(hpages[level - 1]);
+
+	return err == 0;
+}
+
+/**
+ * fsverity_verify_page() - verify a data page
+ *
+ * Verify a page that has just been read from a verity file.  The page must be a
+ * pagecache page that is still locked and not yet uptodate.
+ *
+ * Return: true if the page is valid, else false.
+ */
+bool fsverity_verify_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	const struct fsverity_info *vi = inode->i_verity_info;
+	struct ahash_request *req;
+	bool valid;
+
+	req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+	if (unlikely(!req))
+		return false;
+
+	valid = verify_page(inode, vi, req, page);
+
+	ahash_request_free(req);
+
+	return valid;
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_page);
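For filesystems that populate the pagecache without bios, a hypothetical read-completion hook might look like this (fsverity_active() is the <linux/fsverity.h> helper that tests ->i_verity_info):

/* Sketch only: verify a just-read page before marking it uptodate. */
static void example_finish_read(struct inode *inode, struct page *page)
{
	if (fsverity_active(inode) && !fsverity_verify_page(page))
		SetPageError(page);
	else
		SetPageUptodate(page);
	unlock_page(page);
}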
+
+#ifdef CONFIG_BLOCK
+/**
+ * fsverity_verify_bio() - verify a 'read' bio that has just completed
+ *
+ * Verify a set of pages that have just been read from a verity file.  The pages
+ * must be pagecache pages that are still locked and not yet uptodate.  Pages
+ * that fail verification are set to the Error state.  Verification is skipped
+ * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ *
+ * This is a helper function for use by the ->readpages() method of filesystems
+ * that issue bios to read data directly into the page cache.  Filesystems that
+ * populate the page cache without issuing bios (e.g. non block-based
+ * filesystems) must instead call fsverity_verify_page() directly on each page.
+ * All filesystems must also call fsverity_verify_page() on holes.
+ */
+void fsverity_verify_bio(struct bio *bio)
+{
+	struct inode *inode = bio_first_page_all(bio)->mapping->host;
+	const struct fsverity_info *vi = inode->i_verity_info;
+	struct ahash_request *req;
+	struct bio_vec *bv;
+	struct bvec_iter_all iter_all;
+
+	req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+	if (unlikely(!req)) {
+		bio_for_each_segment_all(bv, bio, iter_all)
+			SetPageError(bv->bv_page);
+		return;
+	}
+
+	bio_for_each_segment_all(bv, bio, iter_all) {
+		struct page *page = bv->bv_page;
+
+		if (!PageError(page) && !verify_page(inode, vi, req, page))
+			SetPageError(page);
+	}
+
+	ahash_request_free(req);
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_bio);
+#endif /* CONFIG_BLOCK */
+
+/**
+ * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue
+ *
+ * Enqueue verification work for asynchronous processing.
+ */
+void fsverity_enqueue_verify_work(struct work_struct *work)
+{
+	queue_work(fsverity_read_workqueue, work);
+}
+EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
+
+int __init fsverity_init_workqueue(void)
+{
+	/*
+	 * Use an unbound workqueue to allow bios to be verified in parallel
+	 * even when they happen to complete on the same CPU.  This sacrifices
+	 * locality, but it's worthwhile since hashing is CPU-intensive.
+	 *
+	 * Also use a high-priority workqueue to prioritize verification work,
+	 * which blocks reads from completing, over regular application tasks.
+	 */
+	fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
+						  WQ_UNBOUND | WQ_HIGHPRI,
+						  num_online_cpus());
+	if (!fsverity_read_workqueue)
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fsverity_exit_workqueue(void)
+{
+	destroy_workqueue(fsverity_read_workqueue);
+	fsverity_read_workqueue = NULL;
+}
diff --git a/fs/xattr.c b/fs/xattr.c
index 0d6a6a4..90dd78f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
   File: fs/xattr.c
 
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 457ac9f..e685299 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
-	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
 	select FS_IOMAP
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7f96bda..06b68b6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -4,8 +4,8 @@
 # All Rights Reserved.
 #
 
-ccflags-y += -I$(src)			# needed for trace events
-ccflags-y += -I$(src)/libxfs
+ccflags-y += -I $(srctree)/$(src)		# needed for trace events
+ccflags-y += -I $(srctree)/$(src)/libxfs
 
 ccflags-$(CONFIG_XFS_DEBUG) += -g
 
@@ -49,6 +49,7 @@
 				   xfs_refcount_btree.o \
 				   xfs_sb.o \
 				   xfs_symlink_remote.o \
+				   xfs_trans_inode.o \
 				   xfs_trans_resv.o \
 				   xfs_types.o \
 				   )
@@ -62,6 +63,7 @@
 				   xfs_attr_inactive.o \
 				   xfs_attr_list.o \
 				   xfs_bmap_util.o \
+				   xfs_bio_io.o \
 				   xfs_buf.o \
 				   xfs_dir2_readdir.o \
 				   xfs_discard.o \
@@ -73,15 +75,18 @@
 				   xfs_fsmap.o \
 				   xfs_fsops.o \
 				   xfs_globals.o \
+				   xfs_health.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
 				   xfs_inode.o \
 				   xfs_itable.o \
+				   xfs_iwalk.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_pwork.o \
 				   xfs_reflink.o \
 				   xfs_stats.o \
 				   xfs_super.o \
@@ -103,12 +108,7 @@
 				   xfs_rmap_item.o \
 				   xfs_log_recover.o \
 				   xfs_trans_ail.o \
-				   xfs_trans_bmap.o \
-				   xfs_trans_buf.o \
-				   xfs_trans_extfree.o \
-				   xfs_trans_inode.o \
-				   xfs_trans_refcount.o \
-				   xfs_trans_rmap.o \
+				   xfs_trans_buf.o
 
 # optional features
 xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
@@ -142,6 +142,8 @@
 				   common.o \
 				   dabtree.o \
 				   dir.o \
+				   fscounters.o \
+				   health.o \
 				   ialloc.o \
 				   inode.o \
 				   parent.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index fdd9d6e..da031b9 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,15 +3,10 @@
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/mm.h>
-#include <linux/sched/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
+#include "xfs.h"
 #include <linux/backing-dev.h>
-#include "kmem.h"
 #include "xfs_message.h"
+#include "xfs_trace.h"
 
 void *
 kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -20,9 +15,11 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_alloc(size, flags, _RET_IP_);
+
 	do {
 		ptr = kmalloc(size, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -33,28 +30,24 @@
 	} while (1);
 }
 
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxiliary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
 {
 	unsigned nofs_flag = 0;
 	void	*ptr;
-	gfp_t	lflags;
+	gfp_t	lflags = kmem_flags_convert(flags);
 
-	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
-	if (ptr)
-		return ptr;
-
-	/*
-	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
-	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
-	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
-	 * the filesystem here and potentially deadlocking.
-	 */
 	if (flags & KM_NOFS)
 		nofs_flag = memalloc_nofs_save();
 
-	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags, PAGE_KERNEL);
 
 	if (flags & KM_NOFS)
@@ -63,6 +56,44 @@
 	return ptr;
 }
 
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size; requests
+ * for larger alignment are clamped to page size. vmalloc always returns a
+ * PAGE_SIZE aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+	if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+		align_mask = PAGE_SIZE - 1;
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr) {
+		if (!((uintptr_t)ptr & align_mask))
+			return ptr;
+		kfree(ptr);
+	}
+	return __kmem_vmalloc(size, flags);
+}
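A hypothetical caller, with the size and flags purely illustrative: a buffer destined for a device with 512-byte sectors would pass the sector size minus one as the mask:

	/* Sketch only: 4 KiB zeroed buffer aligned to 512-byte sectors. */
	void *buf = kmem_alloc_io(4096, 511, KM_NOFS | KM_ZERO);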
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+	return __kmem_vmalloc(size, flags);
+}
+
 void *
 kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
@@ -70,9 +101,11 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_realloc(newsize, flags, _RET_IP_);
+
 	do {
 		ptr = krealloc(old, newsize, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -90,9 +123,10 @@
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8e6b3ba..8170d95 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
  */
 
 typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
 #define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+	BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
 
-	if (flags & KM_NOSLEEP) {
-		lflags = GFP_ATOMIC | __GFP_NOWARN;
-	} else {
-		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (flags & KM_NOFS)
-			lflags &= ~__GFP_FS;
-	}
+	lflags = GFP_KERNEL | __GFP_NOWARN;
+	if (flags & KM_NOFS)
+		lflags &= ~__GFP_FS;
 
 	/*
 	 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@
 }
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)
@@ -124,4 +119,12 @@
 	return kmem_zone_alloc(zone, flags | KM_ZERO);
 }
 
+static inline struct page *
+kmem_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 #endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 9345802..14fbdf2 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -10,6 +10,7 @@
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
@@ -19,18 +20,19 @@
 #include "xfs_ialloc.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
 
 static struct xfs_buf *
 xfs_get_aghdr_buf(
 	struct xfs_mount	*mp,
 	xfs_daddr_t		blkno,
 	size_t			numblks,
-	int			flags,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
-	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
 	if (!bp)
 		return NULL;
 
@@ -42,6 +44,12 @@
 	return bp;
 }
 
+static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
+{
+	return mp->m_sb.sb_logstart > 0 &&
+	       id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
 /*
  * Generic btree root block init function
  */
@@ -51,7 +59,62 @@
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+}
+
+/* Finish initializing a free space btree. */
+static void
+xfs_freesp_init_recs(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_alloc_rec	*arec;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+
+	if (is_log_ag(mp, id)) {
+		struct xfs_alloc_rec	*nrec;
+		xfs_agblock_t		start = XFS_FSB_TO_AGBNO(mp,
+							mp->m_sb.sb_logstart);
+
+		ASSERT(start >= mp->m_ag_prealloc_blocks);
+		if (start != mp->m_ag_prealloc_blocks) {
+			/*
+			 * Modify first record to pad stripe align of log
+			 */
+			arec->ar_blockcount = cpu_to_be32(start -
+						mp->m_ag_prealloc_blocks);
+			nrec = arec + 1;
+
+			/*
+			 * Insert second record at start of internal log
+			 * which then gets trimmed.
+			 */
+			nrec->ar_startblock = cpu_to_be32(
+					be32_to_cpu(arec->ar_startblock) +
+					be32_to_cpu(arec->ar_blockcount));
+			arec = nrec;
+			be16_add_cpu(&block->bb_numrecs, 1);
+		}
+		/*
+		 * Change record start to after the internal log
+		 */
+		be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
+	}
+
+	/*
+	 * Calculate the record block count and check for the case where
+	 * the log might have consumed all available space in the AG. If
+	 * so, reset the record count to 0 to avoid exposure of an invalid
+	 * record start block.
+	 */
+	arec->ar_blockcount = cpu_to_be32(id->agsize -
+					  be32_to_cpu(arec->ar_startblock));
+	if (!arec->ar_blockcount)
+		block->bb_numrecs = 0;
 }
 
 /*
@@ -63,13 +126,8 @@
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	struct xfs_alloc_rec	*arec;
-
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(id->agsize -
-					  be32_to_cpu(arec->ar_startblock));
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+	xfs_freesp_init_recs(mp, bp, id);
 }
 
 static void
@@ -78,13 +136,8 @@
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	struct xfs_alloc_rec	*arec;
-
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(id->agsize -
-					  be32_to_cpu(arec->ar_startblock));
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+	xfs_freesp_init_recs(mp, bp, id);
 }
 
 /*
@@ -99,7 +152,7 @@
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_rmap_rec	*rrec;
 
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
 
 	/*
 	 * mark the AG header regions as static metadata. The BNO
@@ -147,6 +200,18 @@
 		rrec->rm_offset = 0;
 		be16_add_cpu(&block->bb_numrecs, 1);
 	}
+
+	/* account for the log space */
+	if (is_log_ag(mp, id)) {
+		rrec = XFS_RMAP_REC_ADDR(block,
+				be16_to_cpu(block->bb_numrecs) + 1);
+		rrec->rm_startblock = cpu_to_be32(
+				XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
+		rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+	}
 }
 
 /*
@@ -207,6 +272,14 @@
 		agf->agf_refcount_level = cpu_to_be32(1);
 		agf->agf_refcount_blocks = cpu_to_be32(1);
 	}
+
+	if (is_log_ag(mp, id)) {
+		int64_t	logblocks = mp->m_sb.sb_logblocks;
+
+		be32_add_cpu(&agf->agf_freeblks, -logblocks);
+		agf->agf_longest = cpu_to_be32(id->agsize -
+			XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
+	}
 }
 
 static void
@@ -271,7 +344,7 @@
 {
 	struct xfs_buf		*bp;
 
-	bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+	bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
 	if (!bp)
 		return -ENOMEM;
 
@@ -339,14 +412,14 @@
 	{ /* BNO root block */
 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_allocbt_buf_ops,
+		.ops = &xfs_bnobt_buf_ops,
 		.work = &xfs_bnoroot_init,
 		.need_init = true
 	},
 	{ /* CNT root block */
 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_allocbt_buf_ops,
+		.ops = &xfs_cntbt_buf_ops,
 		.work = &xfs_cntroot_init,
 		.need_init = true
 	},
@@ -361,7 +434,7 @@
 	{ /* FINO root block */
 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_inobt_buf_ops,
+		.ops = &xfs_finobt_buf_ops,
 		.work = &xfs_btroot_init,
 		.type = XFS_BTNUM_FINO,
 		.need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
@@ -414,7 +487,6 @@
 	struct aghdr_init_data	*id,
 	xfs_extlen_t		len)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_buf		*bp;
 	struct xfs_agi		*agi;
 	struct xfs_agf		*agf;
@@ -448,17 +520,69 @@
 	/*
 	 * Free the new space.
 	 *
-	 * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+	 * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
 	 * this doesn't actually exist in the rmap btree.
 	 */
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
 	error = xfs_rmap_free(tp, bp, id->agno,
 				be32_to_cpu(agf->agf_length) - len,
-				len, &oinfo);
+				len, &XFS_RMAP_OINFO_SKIP_UPDATE);
 	if (error)
 		return error;
 
 	return  xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
 					be32_to_cpu(agf->agf_length) - len),
-				len, &oinfo, XFS_AG_RESV_NONE);
+				len, &XFS_RMAP_OINFO_SKIP_UPDATE,
+				XFS_AG_RESV_NONE);
+}
+
+/* Retrieve AG geometry. */
+int
+xfs_ag_get_geometry(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	struct xfs_ag_geometry	*ageo)
+{
+	struct xfs_buf		*agi_bp;
+	struct xfs_buf		*agf_bp;
+	struct xfs_agi		*agi;
+	struct xfs_agf		*agf;
+	struct xfs_perag	*pag;
+	unsigned int		freeblks;
+	int			error;
+
+	if (agno >= mp->m_sb.sb_agcount)
+		return -EINVAL;
+
+	/* Lock the AG headers. */
+	error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+	if (error)
+		return error;
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+	if (error)
+		goto out_agi;
+	pag = xfs_perag_get(mp, agno);
+
+	/* Fill out form. */
+	memset(ageo, 0, sizeof(*ageo));
+	ageo->ag_number = agno;
+
+	agi = XFS_BUF_TO_AGI(agi_bp);
+	ageo->ag_icount = be32_to_cpu(agi->agi_count);
+	ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
+
+	agf = XFS_BUF_TO_AGF(agf_bp);
+	ageo->ag_length = be32_to_cpu(agf->agf_length);
+	freeblks = pag->pagf_freeblks +
+		   pag->pagf_flcount +
+		   pag->pagf_btreeblks -
+		   xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
+	ageo->ag_freeblks = freeblks;
+	xfs_ag_geom_health(pag, ageo);
+
+	/* Release resources. */
+	xfs_perag_put(pag);
+	xfs_buf_relse(agf_bp);
+out_agi:
+	xfs_buf_relse(agi_bp);
+	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 412702e..5166322 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -26,5 +26,7 @@
 int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
 int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
 			struct aghdr_init_data *id, xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
+			struct xfs_ag_geometry *ageo);
 
 #endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc..87a9747 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -9,20 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_alloc.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
@@ -281,7 +273,7 @@
 			 */
 			ask = used = 0;
 
-			mp->m_inotbt_nores = true;
+			mp->m_finobt_nores = true;
 
 			error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
 					&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e1c0c0d..533b04a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -13,7 +13,6 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_alloc_btree.h"
@@ -21,7 +20,6 @@
 #include "xfs_extent_busy.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
@@ -41,8 +39,6 @@
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
 
 /*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
@@ -555,7 +551,7 @@
 xfs_agfl_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
 	int		i;
 
@@ -568,9 +564,9 @@
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return NULL;
 
-	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
 		return __this_address;
-	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
 		return __this_address;
 	/*
 	 * during growfs operations, the perag is not fully initialised,
@@ -596,7 +592,7 @@
 xfs_agfl_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
 
 	/*
@@ -621,7 +617,7 @@
 xfs_agfl_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	xfs_failaddr_t		fa;
 
@@ -643,6 +639,7 @@
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
 	.name = "xfs_agfl",
+	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
 	.verify_read = xfs_agfl_read_verify,
 	.verify_write = xfs_agfl_write_verify,
 	.verify_struct = xfs_agfl_verify,
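The new .magic array pairs the pre-CRC and CRC on-disk magic values, and the
verifiers above now call xfs_verify_magic() to pick the right slot based on
the superblock feature bits. A sketch of that helper, lightly simplified from
the in-tree implementation:

/* Index 0 holds the pre-CRC magic, index 1 the CRC-enabled one. */
bool
xfs_verify_magic(
	struct xfs_buf		*bp,
	__be32			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	idx = xfs_sb_version_hascrc(&mp->m_sb);
	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
		return false;
	return dmagic == bp->b_ops->magic[idx];
}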
@@ -699,6 +696,107 @@
  */
 
 /*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_small(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
+	xfs_agblock_t		*fbnop,	/* result block number */
+	xfs_extlen_t		*flenp,	/* result length */
+	int			*stat)	/* status: 0-freelist, 1-normal/none */
+{
+	int			error = 0;
+	xfs_agblock_t		fbno = NULLAGBLOCK;
+	xfs_extlen_t		flen = 0;
+	int			i = 0;
+
+	/*
+	 * If a cntbt cursor is provided, try to allocate the largest record in
+	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+	 * allocation. Make sure to respect minleft even when pulling from the
+	 * freelist.
+	 */
+	if (ccur)
+		error = xfs_btree_decrement(ccur, 0, &i);
+	if (error)
+		goto error;
+	if (i) {
+		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+		goto out;
+	}
+
+	if (args->minlen != 1 || args->alignment != 1 ||
+	    args->resv == XFS_AG_RESV_AGFL ||
+	    (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
+	     args->minleft))
+		goto out;
+
+	error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+	if (error)
+		goto error;
+	if (fbno == NULLAGBLOCK)
+		goto out;
+
+	xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+			      xfs_alloc_allow_busy_reuse(args->datatype));
+
+	if (xfs_alloc_is_userdata(args->datatype)) {
+		struct xfs_buf	*bp;
+
+		bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
+		if (!bp) {
+			error = -EFSCORRUPTED;
+			goto error;
+		}
+		xfs_trans_binval(args->tp, bp);
+	}
+	*fbnop = args->agbno = fbno;
+	*flenp = args->len = 1;
+	XFS_WANT_CORRUPTED_GOTO(args->mp,
+		fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+		error);
+	args->wasfromfl = 1;
+	trace_xfs_alloc_small_freelist(args);
+
+	/*
+	 * If we're feeding an AGFL block to something that doesn't live in the
+	 * free space, we need to clear out the OWN_AG rmap.
+	 */
+	error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+			      &XFS_RMAP_OINFO_AG);
+	if (error)
+		goto error;
+
+	*stat = 0;
+	return 0;
+
+out:
+	/*
+	 * Can't do the allocation, give up.
+	 */
+	if (flen < args->minlen) {
+		args->agbno = NULLAGBLOCK;
+		trace_xfs_alloc_small_notenough(args);
+		flen = 0;
+	}
+	*fbnop = fbno;
+	*flenp = flen;
+	*stat = 1;
+	trace_xfs_alloc_small_done(args);
+	return 0;
+
+error:
+	trace_xfs_alloc_small_error(args);
+	return error;
+}
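For reference, a hypothetical caller fragment showing the *stat contract
documented in the comment above (the real callers are the exact/near/size
allocation paths later in this file):

	error = xfs_alloc_ag_vextent_small(args, ccur, &fbno, &flen, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;	/* one block taken from the AGFL; args finished */
	if (fbno == NULLAGBLOCK)
		return 0;	/* nothing usable; args->agbno flags the failure */
	/* i == 1 with valid fbno/flen: the last, small freespace record */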
+
+/*
  * Allocate a variable extent in the allocation group agno.
  * Type and bno are used to determine where in the allocation group the
  * extent will start.
@@ -1582,140 +1680,32 @@
 }
 
 /*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
- */
-STATIC int			/* error */
-xfs_alloc_ag_vextent_small(
-	xfs_alloc_arg_t	*args,	/* allocation argument structure */
-	xfs_btree_cur_t	*ccur,	/* by-size cursor */
-	xfs_agblock_t	*fbnop,	/* result block number */
-	xfs_extlen_t	*flenp,	/* result length */
-	int		*stat)	/* status: 0-freelist, 1-normal/none */
-{
-	struct xfs_owner_info	oinfo;
-	int		error;
-	xfs_agblock_t	fbno;
-	xfs_extlen_t	flen;
-	int		i;
-
-	if ((error = xfs_btree_decrement(ccur, 0, &i)))
-		goto error0;
-	if (i) {
-		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-	}
-	/*
-	 * Nothing in the btree, try the freelist.  Make sure
-	 * to respect minleft even when pulling from the
-	 * freelist.
-	 */
-	else if (args->minlen == 1 && args->alignment == 1 &&
-		 args->resv != XFS_AG_RESV_AGFL &&
-		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
-		  > args->minleft)) {
-		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
-		if (error)
-			goto error0;
-		if (fbno != NULLAGBLOCK) {
-			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
-			      xfs_alloc_allow_busy_reuse(args->datatype));
-
-			if (xfs_alloc_is_userdata(args->datatype)) {
-				xfs_buf_t	*bp;
-
-				bp = xfs_btree_get_bufs(args->mp, args->tp,
-					args->agno, fbno, 0);
-				if (!bp) {
-					error = -EFSCORRUPTED;
-					goto error0;
-				}
-				xfs_trans_binval(args->tp, bp);
-			}
-			args->len = 1;
-			args->agbno = fbno;
-			XFS_WANT_CORRUPTED_GOTO(args->mp,
-				args->agbno + args->len <=
-				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
-				error0);
-			args->wasfromfl = 1;
-			trace_xfs_alloc_small_freelist(args);
-
-			/*
-			 * If we're feeding an AGFL block to something that
-			 * doesn't live in the free space, we need to clear
-			 * out the OWN_AG rmap.
-			 */
-			xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
-			error = xfs_rmap_free(args->tp, args->agbp, args->agno,
-					fbno, 1, &oinfo);
-			if (error)
-				goto error0;
-
-			*stat = 0;
-			return 0;
-		}
-		/*
-		 * Nothing in the freelist.
-		 */
-		else
-			flen = 0;
-	}
-	/*
-	 * Can't allocate from the freelist for some reason.
-	 */
-	else {
-		fbno = NULLAGBLOCK;
-		flen = 0;
-	}
-	/*
-	 * Can't do the allocation, give up.
-	 */
-	if (flen < args->minlen) {
-		args->agbno = NULLAGBLOCK;
-		trace_xfs_alloc_small_notenough(args);
-		flen = 0;
-	}
-	*fbnop = fbno;
-	*flenp = flen;
-	*stat = 1;
-	trace_xfs_alloc_small_done(args);
-	return 0;
-
-error0:
-	trace_xfs_alloc_small_error(args);
-	return error;
-}
-
-/*
  * Free the extent starting at agno/bno for length.
  */
 STATIC int
 xfs_free_ag_extent(
-	xfs_trans_t		*tp,
-	xfs_buf_t		*agbp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo,
-	enum xfs_ag_resv_type	type)
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agnumber_t			agno,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type)
 {
-	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
-	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
-	int		error;		/* error return value */
-	xfs_agblock_t	gtbno;		/* start of right neighbor block */
-	xfs_extlen_t	gtlen;		/* length of right neighbor block */
-	int		haveleft;	/* have a left neighbor block */
-	int		haveright;	/* have a right neighbor block */
-	int		i;		/* temp, result code */
-	xfs_agblock_t	ltbno;		/* start of left neighbor block */
-	xfs_extlen_t	ltlen;		/* length of left neighbor block */
-	xfs_mount_t	*mp;		/* mount point struct for filesystem */
-	xfs_agblock_t	nbno;		/* new starting block of freespace */
-	xfs_extlen_t	nlen;		/* new length of freespace */
-	xfs_perag_t	*pag;		/* per allocation group data */
+	struct xfs_mount		*mp;
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*bno_cur;
+	struct xfs_btree_cur		*cnt_cur;
+	xfs_agblock_t			gtbno; /* start of right neighbor */
+	xfs_extlen_t			gtlen; /* length of right neighbor */
+	xfs_agblock_t			ltbno; /* start of left neighbor */
+	xfs_extlen_t			ltlen; /* length of left neighbor */
+	xfs_agblock_t			nbno; /* new starting block of freesp */
+	xfs_extlen_t			nlen; /* new length of freespace */
+	int				haveleft; /* have a left neighbor */
+	int				haveright; /* have a right neighbor */
+	int				i;
+	int				error;
 
 	bno_cur = cnt_cur = NULL;
 	mp = tp->t_mountp;
@@ -2043,6 +2033,7 @@
 	xfs_extlen_t		alloc_len, longest;
 	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
+	xfs_extlen_t		agflcount;
 
 	if (flags & XFS_ALLOC_FLAG_FREEING)
 		return true;
@@ -2055,8 +2046,13 @@
 	if (longest < alloc_len)
 		return false;
 
-	/* do we have enough free space remaining for the allocation? */
-	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+	/*
+	 * Do we have enough free space remaining for the allocation? Don't
+	 * account extra AGFL blocks because we are about to defer freeing them,
+	 * making them unavailable until the current transaction commits.
+	 */
+	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
+	available = (int)(pag->pagf_freeblks + agflcount -
 			  reservation - min_free - args->minleft);
 	if (available < (int)max(args->total, alloc_len))
 		return false;
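Worked numbers for the clamp, as a standalone check with hypothetical AG
counters (pagf_flcount = 10, min_free = 4: only four AGFL blocks stay
countable, since the six excess ones are about to be defer-freed):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t freeblks = 100, flcount = 10;	/* hypothetical AG state */
	uint32_t min_free = 4, resv = 20, minleft = 2;

	uint32_t agflcount = flcount < min_free ? flcount : min_free;
	int available = (int)(freeblks + agflcount - resv - min_free - minleft);

	assert(agflcount == 4);		/* excess AGFL blocks not counted */
	assert(available == 78);	/* 100 + 4 - 20 - 4 - 2 */
	return 0;
}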
@@ -2090,7 +2086,7 @@
 	if (error)
 		return error;
 
-	bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+	bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
 	if (!bp)
 		return -EFSCORRUPTED;
 	xfs_trans_binval(tp, bp);
@@ -2209,7 +2205,7 @@
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
@@ -2238,6 +2234,9 @@
 	xfs_extlen_t		need;	/* total blocks needed in freelist */
 	int			error = 0;
 
+	/* deferred ops (AGFL block frees) require permanent transactions */
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
 	if (!pag->pagf_init) {
 		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
 		if (error)
@@ -2314,10 +2313,11 @@
 	 * repair/rmap.c in xfsprogs for details.
 	 */
 	memset(&targs, 0, sizeof(targs));
+	/* struct copy below */
 	if (flags & XFS_ALLOC_FLAG_NORMAP)
-		xfs_rmap_skip_owner_update(&targs.oinfo);
+		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
 	else
-		xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+		targs.oinfo = XFS_RMAP_OINFO_AG;
 	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
 		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
 		if (error)
@@ -2435,7 +2435,6 @@
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
-	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -2443,6 +2442,7 @@
 		pag->pagf_btreeblks++;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
@@ -2577,7 +2577,7 @@
 xfs_agf_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -2588,8 +2588,10 @@
 			return __this_address;
 	}
 
-	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+	if (!xfs_verify_magic(bp, agf->agf_magicnum))
+		return __this_address;
+
+	if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
 	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
 	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
 	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2633,7 +2635,7 @@
 xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2650,7 +2652,7 @@
 xfs_agf_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	xfs_failaddr_t		fa;
 
@@ -2671,6 +2673,7 @@
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
 	.name = "xfs_agf",
+	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
 	.verify_read = xfs_agf_read_verify,
 	.verify_write = xfs_agf_write_verify,
 	.verify_struct = xfs_agf_verify,
@@ -3008,21 +3011,21 @@
  * Just break up the extent address and hand off to xfs_free_ag_extent
  * after fixing up the freelist.
  */
-int				/* error */
+int
 __xfs_free_extent(
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_fsblock_t		bno,	/* starting block number of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo,	/* extent owner */
-	enum xfs_ag_resv_type	type,	/* block reservation type */
-	bool			skip_discard)
+	struct xfs_trans		*tp,
+	xfs_fsblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type,
+	bool				skip_discard)
 {
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agbp;
-	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, bno);
-	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, bno);
-	int			error;
-	unsigned int		busy_flags = 0;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_buf			*agbp;
+	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
+	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	int				error;
+	unsigned int			busy_flags = 0;
 
 	ASSERT(len != 0);
 	ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3134,7 +3137,7 @@
 
 /*
  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ * error code or XFS_ITER_*.
  */
 int
 xfs_agfl_walk(
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 00cd5ec..d6ed5d2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -182,7 +182,7 @@
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo,	/* extent owner */
+	const struct xfs_owner_info	*oinfo,	/* extent owner */
 	enum xfs_ag_resv_type	type,	/* block reservation type */
 	bool			skip_discard);
 
@@ -191,7 +191,7 @@
 	struct xfs_trans	*tp,
 	xfs_fsblock_t		bno,
 	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo,
+	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type	type)
 {
 	return __xfs_free_extent(tp, bno, len, oinfo, type, false);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8..2a94543 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -17,7 +17,6 @@
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 
 
@@ -292,53 +291,39 @@
 xfs_allocbt_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_perag	*pag = bp->b_pag;
 	xfs_failaddr_t		fa;
 	unsigned int		level;
+	xfs_btnum_t		btnum = XFS_BTNUM_BNOi;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		fa = xfs_btree_sblock_v5hdr_verify(bp);
+		if (fa)
+			return fa;
+	}
 
 	/*
-	 * magic number and level verification
+	 * The perag may not be attached during grow operations or fully
+	 * initialized from the AGF during log recovery. Therefore we can only
+	 * check against the maximum tree depth in those contexts.
 	 *
-	 * During growfs operations, we can't verify the exact level or owner as
-	 * the perag is not fully initialised and hence not attached to the
-	 * buffer.  In this case, check against the maximum tree depth.
-	 *
-	 * Similarly, during log recovery we will have a perag structure
-	 * attached, but the agf information will not yet have been initialised
-	 * from the on disk AGF. Again, we can only check against maximum limits
-	 * in this case.
+	 * Otherwise check against the per-tree limit. Peek at one of the
+	 * verifier magic values to determine the type of tree we're verifying
+	 * against.
 	 */
 	level = be16_to_cpu(block->bb_level);
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-		fa = xfs_btree_sblock_v5hdr_verify(bp);
-		if (fa)
-			return fa;
-		/* fall through */
-	case cpu_to_be32(XFS_ABTB_MAGIC):
-		if (pag && pag->pagf_init) {
-			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
-				return __this_address;
-		} else if (level >= mp->m_ag_maxlevels)
+	if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+		btnum = XFS_BTNUM_CNTi;
+	if (pag && pag->pagf_init) {
+		if (level >= pag->pagf_levels[btnum])
 			return __this_address;
-		break;
-	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-		fa = xfs_btree_sblock_v5hdr_verify(bp);
-		if (fa)
-			return fa;
-		/* fall through */
-	case cpu_to_be32(XFS_ABTC_MAGIC):
-		if (pag && pag->pagf_init) {
-			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
-				return __this_address;
-		} else if (level >= mp->m_ag_maxlevels)
-			return __this_address;
-		break;
-	default:
+	} else if (level >= mp->m_ag_maxlevels)
 		return __this_address;
-	}
 
 	return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
 }
@@ -377,13 +362,23 @@
 
 }
 
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
-	.name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+	.name = "xfs_bnobt",
+	.magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+		   cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
 	.verify_read = xfs_allocbt_read_verify,
 	.verify_write = xfs_allocbt_write_verify,
 	.verify_struct = xfs_allocbt_verify,
 };
 
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+	.name = "xfs_cntbt",
+	.magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+		   cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+	.verify_struct = xfs_allocbt_verify,
+};
 
 STATIC int
 xfs_bnobt_keys_inorder(
@@ -448,7 +443,7 @@
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_bnobt_key_diff,
-	.buf_ops		= &xfs_allocbt_buf_ops,
+	.buf_ops		= &xfs_bnobt_buf_ops,
 	.diff_two_keys		= xfs_bnobt_diff_two_keys,
 	.keys_inorder		= xfs_bnobt_keys_inorder,
 	.recs_inorder		= xfs_bnobt_recs_inorder,
@@ -470,7 +465,7 @@
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_cntbt_key_diff,
-	.buf_ops		= &xfs_allocbt_buf_ops,
+	.buf_ops		= &xfs_cntbt_buf_ops,
 	.diff_two_keys		= xfs_cntbt_diff_two_keys,
 	.keys_inorder		= xfs_cntbt_keys_inorder,
 	.recs_inorder		= xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index c6299f8..510ca69 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -9,23 +9,18 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr_sf.h"
 #include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
-#include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
@@ -102,7 +97,10 @@
  * Overall external interface routines.
  *========================================================================*/
 
-/* Retrieve an extended attribute and its value.  Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value.  Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
 int
 xfs_attr_get_ilocked(
 	struct xfs_inode	*ip,
@@ -120,12 +118,28 @@
 		return xfs_attr_node_get(args);
 }
 
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists.  The size is returned in @valuelenp.
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined.  On success, return that
+ * buffer to the caller, who must free it.  On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
 int
 xfs_attr_get(
 	struct xfs_inode	*ip,
 	const unsigned char	*name,
-	unsigned char		*value,
+	unsigned char		**value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -133,6 +147,8 @@
 	uint			lock_mode;
 	int			error;
 
+	ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
 	XFS_STATS_INC(ip->i_mount, xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -142,17 +158,29 @@
 	if (error)
 		return error;
 
-	args.value = value;
-	args.valuelen = *valuelenp;
 	/* Entirely possible to look up a name which doesn't exist */
 	args.op_flags = XFS_DA_OP_OKNOENT;
+	if (flags & ATTR_ALLOC)
+		args.op_flags |= XFS_DA_OP_ALLOCVAL;
+	else
+		args.value = *value;
+	args.valuelen = *valuelenp;
 
 	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_ilocked(ip, &args);
 	xfs_iunlock(ip, lock_mode);
-
 	*valuelenp = args.valuelen;
-	return error == -EEXIST ? 0 : error;
+
+	/* on error, we have to clean up allocated value buffers */
+	if (error) {
+		if (flags & ATTR_ALLOC) {
+			kmem_free(args.value);
+			*value = NULL;
+		}
+		return error;
+	}
+	*value = args.value;
+	return 0;
 }
 
 /*
@@ -191,6 +219,121 @@
 	return nblks;
 }
 
+STATIC int
+xfs_attr_try_sf_addname(
+	struct xfs_inode	*dp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			error, error2;
+
+	error = xfs_attr_shortform_addname(args);
+	if (error == -ENOSPC)
+		return error;
+
+	/*
+	 * Commit the shortform mods, and we're done.
+	 * NOTE: this is also the error path (EEXIST, etc).
+	 */
+	if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(args->trans);
+
+	error2 = xfs_trans_commit(args->trans);
+	args->trans = NULL;
+	return error ? error : error2;
+}
+
+/*
+ * Set the attribute specified in @args.
+ */
+int
+xfs_attr_set_args(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf          *leaf_bp = NULL;
+	int			error;
+
+	/*
+	 * If the attribute list is non-existent or a shortform list,
+	 * upgrade it to a single-leaf-block attribute list.
+	 */
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+
+		/*
+		 * Build initial attribute list (if required).
+		 */
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+			xfs_attr_shortform_create(args);
+
+		/*
+		 * Try to add the attr to the attribute list in the inode.
+		 */
+		error = xfs_attr_try_sf_addname(dp, args);
+		if (error != -ENOSPC)
+			return error;
+
+		/*
+		 * It won't fit in the shortform, transform to a leaf block.
+		 * GROT: another possible req'mt for a double-split btree op.
+		 */
+		error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+		if (error)
+			return error;
+
+		/*
+		 * Prevent the leaf buffer from being unlocked so that a
+		 * concurrent AIL push cannot grab the half-baked leaf
+		 * buffer and run into problems with the write verifier.
+		 * Once we're done rolling the transaction we can release
+		 * the hold and add the attr to the leaf.
+		 */
+		xfs_trans_bhold(args->trans, leaf_bp);
+		error = xfs_defer_finish(&args->trans);
+		xfs_trans_bhold_release(args->trans, leaf_bp);
+		if (error) {
+			xfs_trans_brelse(args->trans, leaf_bp);
+			return error;
+		}
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_addname(args);
+	else
+		error = xfs_attr_node_addname(args);
+	return error;
+}
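The hold/roll/release sequence in the middle of xfs_attr_set_args() is the
core trick here; isolated as a sketch (hypothetical tp/bp, error handling as
in the function above):

	xfs_trans_bhold(tp, bp);	  /* keep bp locked across the roll */
	error = xfs_defer_finish(&tp);	  /* rolls tp; bp stays joined */
	xfs_trans_bhold_release(tp, bp);  /* drop the extra hold */
	if (error)
		xfs_trans_brelse(tp, bp); /* failure: we still own the lock */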
+
+/*
+ * Remove the attribute specified in @args.
+ */
+int
+xfs_attr_remove_args(
+	struct xfs_da_args      *args)
+{
+	struct xfs_inode	*dp = args->dp;
+	int			error;
+
+	if (!xfs_inode_hasattr(dp)) {
+		error = -ENOATTR;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		error = xfs_attr_shortform_remove(args);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_removename(args);
+	} else {
+		error = xfs_attr_node_removename(args);
+	}
+
+	return error;
+}
+
 int
 xfs_attr_set(
 	struct xfs_inode	*dp,
@@ -200,11 +343,10 @@
 	int			flags)
 {
 	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_buf		*leaf_bp = NULL;
 	struct xfs_da_args	args;
 	struct xfs_trans_res	tres;
 	int			rsvd = (flags & ATTR_ROOT) != 0;
-	int			error, err2, local;
+	int			error, local;
 
 	XFS_STATS_INC(mp, xs_attr_set);
 
@@ -255,93 +397,17 @@
 	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
 				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 				       XFS_QMOPT_RES_REGBLKS);
-	if (error) {
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(args.trans);
-		return error;
-	}
+	if (error)
+		goto out_trans_cancel;
 
 	xfs_trans_ijoin(args.trans, dp, 0);
-
-	/*
-	 * If the attribute list is non-existent or a shortform list,
-	 * upgrade it to a single-leaf-block attribute list.
-	 */
-	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
-
-		/*
-		 * Build initial attribute list (if required).
-		 */
-		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
-			xfs_attr_shortform_create(&args);
-
-		/*
-		 * Try to add the attr to the attribute list in
-		 * the inode.
-		 */
-		error = xfs_attr_shortform_addname(&args);
-		if (error != -ENOSPC) {
-			/*
-			 * Commit the shortform mods, and we're done.
-			 * NOTE: this is also the error path (EEXIST, etc).
-			 */
-			ASSERT(args.trans != NULL);
-
-			/*
-			 * If this is a synchronous mount, make sure that
-			 * the transaction goes to disk before returning
-			 * to the user.
-			 */
-			if (mp->m_flags & XFS_MOUNT_WSYNC)
-				xfs_trans_set_sync(args.trans);
-
-			if (!error && (flags & ATTR_KERNOTIME) == 0) {
-				xfs_trans_ichgtime(args.trans, dp,
-							XFS_ICHGTIME_CHG);
-			}
-			err2 = xfs_trans_commit(args.trans);
-			xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-			return error ? error : err2;
-		}
-
-		/*
-		 * It won't fit in the shortform, transform to a leaf block.
-		 * GROT: another possible req'mt for a double-split btree op.
-		 */
-		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
-		if (error)
-			goto out;
-		/*
-		 * Prevent the leaf buffer from being unlocked so that a
-		 * concurrent AIL push cannot grab the half-baked leaf
-		 * buffer and run into problems with the write verifier.
-		 */
-		xfs_trans_bhold(args.trans, leaf_bp);
-		error = xfs_defer_finish(&args.trans);
-		if (error)
-			goto out;
-
-		/*
-		 * Commit the leaf transformation.  We'll need another (linked)
-		 * transaction to add the new attribute to the leaf, which
-		 * means that we have to hold & join the leaf buffer here too.
-		 */
-		error = xfs_trans_roll_inode(&args.trans, dp);
-		if (error)
-			goto out;
-		xfs_trans_bjoin(args.trans, leaf_bp);
-		leaf_bp = NULL;
-	}
-
-	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-		error = xfs_attr_leaf_addname(&args);
-	else
-		error = xfs_attr_node_addname(&args);
+	error = xfs_attr_set_args(&args);
 	if (error)
-		goto out;
+		goto out_trans_cancel;
+	if (!args.trans) {
+		/* shortform attribute has already been committed */
+		goto out_unlock;
+	}
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -358,17 +424,14 @@
 	 */
 	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
 	error = xfs_trans_commit(args.trans);
+out_unlock:
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
 	return error;
 
-out:
-	if (leaf_bp)
-		xfs_trans_brelse(args.trans, leaf_bp);
+out_trans_cancel:
 	if (args.trans)
 		xfs_trans_cancel(args.trans);
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
+	goto out_unlock;
 }
 
 /*
@@ -423,17 +486,7 @@
 	 */
 	xfs_trans_ijoin(args.trans, dp, 0);
 
-	if (!xfs_inode_hasattr(dp)) {
-		error = -ENOATTR;
-	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
-		error = xfs_attr_shortform_remove(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-		error = xfs_attr_leaf_removename(&args);
-	} else {
-		error = xfs_attr_node_removename(&args);
-	}
-
+	error = xfs_attr_remove_args(&args);
 	if (error)
 		goto out;
 
@@ -748,6 +801,8 @@
  *
  * This leaf block cannot have a "remote" value, we only call this routine
  * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -769,9 +824,6 @@
 	}
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
-	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-		error = xfs_attr_rmtval_get(args);
-	}
 	return error;
 }
 
@@ -1248,11 +1300,13 @@
 }
 
 /*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
  *
  * This routine gets called for any attribute fork that has more than one
  * block, ie: both true Btree attr lists and for single-leaf-blocks with
  * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_node_get(xfs_da_args_t *args)
@@ -1274,24 +1328,21 @@
 	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
-	} else if (retval == -EEXIST) {
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->bp != NULL);
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-		/*
-		 * Get the value, local or "remote"
-		 */
-		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-		if (!retval && (args->rmtblkno > 0)
-		    && !(args->flags & ATTR_KERNOVAL)) {
-			retval = xfs_attr_rmtval_get(args);
-		}
+		goto out_release;
 	}
+	if (retval != -EEXIST)
+		goto out_release;
+
+	/*
+	 * Get the value, local or "remote"
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	retval = xfs_attr3_leaf_getvalue(blk->bp, args);
 
 	/*
 	 * If not in a transaction, we have to release all the buffers.
 	 */
+out_release:
 	for (i = 0; i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;
@@ -1300,3 +1351,20 @@
 	xfs_da_state_free(state);
 	return retval;
 }
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+	const void	*name,
+	size_t		length)
+{
+	/*
+	 * MAXNAMELEN includes the trailing null, but (name/length) leave it
+	 * out, so use >= for the length check.
+	 */
+	if (length >= MAXNAMELEN)
+		return false;
+
+	/* There shouldn't be any nulls here */
+	return !memchr(name, 0, length);
+}
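A hypothetical guard built on the new predicate, rejecting a malformed
on-disk name before doing any lookup work:

	if (!xfs_attr_namecheck(name, namelen))
		return -EFSCORRUPTED;	/* corrupt or oversized attr name */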
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
similarity index 91%
rename from fs/xfs/xfs_attr.h
rename to fs/xfs/libxfs/xfs_attr.h
index 033ff8c..94badfa 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
 #define ATTR_INCOMPLETE	0x4000	/* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC	0x8000	/* allocate xattr buffer on demand */
 
 #define XFS_ATTR_FLAGS \
 	{ ATTR_DONTFOLLOW, 	"DONTFOLLOW" }, \
@@ -47,7 +48,8 @@
 	{ ATTR_REPLACE,		"REPLACE" }, \
 	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
 	{ ATTR_KERNOVAL,	"KERNOVAL" }, \
-	{ ATTR_INCOMPLETE,	"INCOMPLETE" }
+	{ ATTR_INCOMPLETE,	"INCOMPLETE" }, \
+	{ ATTR_ALLOC,		"ALLOC" }
 
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
@@ -112,7 +114,13 @@
 	struct xfs_inode		*dp;		/* inode */
 	struct attrlist_cursor_kern	*cursor;	/* position in list */
 	char				*alist;		/* output buffer */
-	int				seen_enough;	/* T/F: seen enough of list? */
+
+	/*
+	 * Abort attribute list iteration if non-zero.  Can be used to pass
+	 * error values to the xfs_attr_list caller.
+	 */
+	int				seen_enough;
+
 	ssize_t				count;		/* num used entries */
 	int				dupcnt;		/* count dup hashvals seen */
 	int				bufsize;	/* total buffer size */
@@ -137,12 +145,14 @@
 int xfs_inode_hasattr(struct xfs_inode *ip);
 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		 unsigned char *value, int *valuelenp, int flags);
+		 unsigned char **value, int *valuelenp, int flags);
 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 		 unsigned char *value, int valuelen, int flags);
+int xfs_attr_set_args(struct xfs_da_args *args);
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove_args(struct xfs_da_args *args);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		  int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
 
 #endif	/* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6fc5425..f0089e8 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -10,14 +10,12 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_attr_sf.h"
@@ -27,7 +25,6 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_buf_item.h"
-#include "xfs_cksum.h"
 #include "xfs_dir2.h"
 #include "xfs_log.h"
 
@@ -240,30 +237,19 @@
 	struct xfs_buf			*bp)
 {
 	struct xfs_attr3_icleaf_hdr	ichdr;
-	struct xfs_mount		*mp = bp->b_target->bt_mount;
+	struct xfs_mount		*mp = bp->b_mount;
 	struct xfs_attr_leafblock	*leaf = bp->b_addr;
 	struct xfs_attr_leaf_entry	*entries;
-	uint16_t			end;
+	uint32_t			end;	/* must be 32-bit - see below */
 	int				i;
+	xfs_failaddr_t			fa;
 
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+	fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+	if (fa)
+		return fa;
 
-		if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
-			return __this_address;
-
-		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-			return __this_address;
-		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-			return __this_address;
-		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-			return __this_address;
-	} else {
-		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
-			return __this_address;
-	}
 	/*
 	 * In recovery there is a transient state where count == 0 is valid
 	 * because we may have transitioned an empty shortform attr to a leaf
@@ -293,6 +279,11 @@
 	/*
 	 * Quickly check the freemap information.  Attribute data has to be
 	 * aligned to 4-byte boundaries, and likewise for the free space.
+	 *
+	 * Note that for 64k block size filesystems, the freemap entries cannot
+	 * overflow as they are only be16 fields.  However, when checking the
+	 * end pointer of the freemap, we have to be careful to detect
+	 * overflows, and so use uint32_t for those checks.
 	 */
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
 		if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
@@ -303,7 +294,9 @@
 			return __this_address;
 		if (ichdr.freemap[i].size & 0x3)
 			return __this_address;
-		end = ichdr.freemap[i].base + ichdr.freemap[i].size;
+
+	/* beware of 16-bit overflows here */
+		end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
 		if (end < ichdr.freemap[i].base)
 			return __this_address;
 		if (end > mp->m_attr_geo->blksize)
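A standalone demonstration of the wrap the widened type catches, using
hypothetical freemap values on a 64k attr block:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t blksize = 65536;		/* 64k attr geometry */
	uint16_t base = 0x8000, size = 0x8000;	/* free space to block end */

	uint16_t end16 = (uint16_t)(base + size);	/* wraps to 0 */
	uint32_t end32 = (uint32_t)base + size;		/* 0x10000 */

	assert(end16 < base);		/* u16 math misreads a valid map */
	assert(end32 <= blksize);	/* u32 math accepts it correctly */
	return 0;
}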
@@ -317,7 +310,7 @@
 xfs_attr3_leaf_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -347,7 +340,7 @@
 xfs_attr3_leaf_read_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -362,6 +355,8 @@
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
 	.name = "xfs_attr3_leaf",
+	.magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+		     cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
 	.verify_read = xfs_attr3_leaf_read_verify,
 	.verify_write = xfs_attr3_leaf_write_verify,
 	.verify_struct = xfs_attr3_leaf_verify,
@@ -398,6 +393,50 @@
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
 }
 
+static int
+xfs_attr_copy_value(
+	struct xfs_da_args	*args,
+	unsigned char		*value,
+	int			valuelen)
+{
+	/*
+	 * No copy if all we have to do is get the length
+	 */
+	if (args->flags & ATTR_KERNOVAL) {
+		args->valuelen = valuelen;
+		return 0;
+	}
+
+	/*
+	 * No copy if the length of the existing buffer is too small
+	 */
+	if (args->valuelen < valuelen) {
+		args->valuelen = valuelen;
+		return -ERANGE;
+	}
+
+	if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+		args->value = kmem_alloc_large(valuelen, 0);
+		if (!args->value)
+			return -ENOMEM;
+	}
+	args->valuelen = valuelen;
+
+	/* remote block xattr requires IO for copy-in */
+	if (args->rmtblkno)
+		return xfs_attr_rmtval_get(args);
+
+	/*
+	 * This is to prevent a GCC warning because the remote xattr case
+	 * doesn't have a value to pass in. In that case, we never reach here,
+	 * but GCC can't work that out and so throws a "passing NULL to
+	 * memcpy" warning.
+	 */
+	if (!value)
+		return -EINVAL;
+	memcpy(args->value, value, valuelen);
+	return 0;
+}
 
 /*========================================================================
  * External routines when attribute fork size < XFS_LITINO(mp).
@@ -725,15 +764,19 @@
 }
 
 /*
- * Look up a name in a shortform attribute list structure.
+ * Retrieve the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
-/*ARGSUSED*/
 int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	int			i;
 
 	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -746,18 +789,8 @@
 			continue;
 		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = sfe->valuelen;
-			return -EEXIST;
-		}
-		if (args->valuelen < sfe->valuelen) {
-			args->valuelen = sfe->valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = sfe->valuelen;
-		memcpy(args->value, &sfe->nameval[args->namelen],
-						    args->valuelen);
-		return -EEXIST;
+		return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+						sfe->valuelen);
 	}
 	return -ENOATTR;
 }
@@ -787,38 +820,23 @@
 	ifp = dp->i_afp;
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
 	size = be16_to_cpu(sf->hdr.totsize);
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	tmpbuffer = kmem_alloc(size, 0);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
 	sf = (xfs_attr_shortform_t *)tmpbuffer;
 
 	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
-	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+	xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
 
 	bp = NULL;
 	error = xfs_da_grow_inode(args, &blkno);
-	if (error) {
-		/*
-		 * If we hit an IO error middle of the transaction inside
-		 * grow_inode(), we may have inconsistent data. Bail out.
-		 */
-		if (error == -EIO)
-			goto out;
-		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
-		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+	if (error)
 		goto out;
-	}
 
 	ASSERT(blkno == 0);
 	error = xfs_attr3_leaf_create(args, blkno, &bp);
-	if (error) {
-		/* xfs_attr3_leaf_create may not have instantiated a block */
-		if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
-			goto out;
-		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
-		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+	if (error)
 		goto out;
-	}
 
 	memset((char *)&nargs, 0, sizeof(nargs));
 	nargs.dp = dp;
@@ -867,7 +885,7 @@
 	struct xfs_attr3_icleaf_hdr leafhdr;
 	int			bytes;
 	int			i;
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
@@ -990,7 +1008,7 @@
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	if (!tmpbuffer)
 		return -ENOMEM;
 
@@ -1453,7 +1471,7 @@
 
 	trace_xfs_attr_leaf_compact(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 	memset(bp->b_addr, 0, args->geo->blksize);
 	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1527,7 +1545,7 @@
 {
 	struct xfs_attr3_icleaf_hdr ichdr1;
 	struct xfs_attr3_icleaf_hdr ichdr2;
-	struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
+	struct xfs_mount *mp = leaf1_bp->b_mount;
 
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
@@ -2172,7 +2190,7 @@
 		struct xfs_attr_leafblock *tmp_leaf;
 		struct xfs_attr3_icleaf_hdr tmphdr;
 
-		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+		tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
 
 		/*
 		 * Copy the header into the temp leaf so that all the stuff
@@ -2355,6 +2373,10 @@
 /*
  * Get the value associated with an attribute name from a leaf attribute
  * list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
 int
 xfs_attr3_leaf_getvalue(
@@ -2366,7 +2388,6 @@
 	struct xfs_attr_leaf_entry *entry;
 	struct xfs_attr_leaf_name_local *name_loc;
 	struct xfs_attr_leaf_name_remote *name_rmt;
-	int			valuelen;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2378,36 +2399,19 @@
 		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-		valuelen = be16_to_cpu(name_loc->valuelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
-			return 0;
-		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = valuelen;
-		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		ASSERT(name_rmt->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       args->rmtvaluelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = args->rmtvaluelen;
-			return 0;
-		}
-		if (args->valuelen < args->rmtvaluelen) {
-			args->valuelen = args->rmtvaluelen;
-			return -ERANGE;
-		}
-		args->valuelen = args->rmtvaluelen;
+		return xfs_attr_copy_value(args,
+					&name_loc->nameval[args->namelen],
+					be16_to_cpu(name_loc->valuelen));
 	}
-	return 0;
+
+	name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+	ASSERT(name_rmt->namelen == args->namelen);
+	ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+	args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+	args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+	args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+					       args->rmtvaluelen);
+	return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
 }
 
 /*========================================================================
@@ -2570,7 +2574,7 @@
 {
 	struct xfs_attr3_icleaf_hdr ichdr;
 	struct xfs_attr_leaf_entry *entries;
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
 	entries = xfs_attr3_leaf_entryp(bp->b_addr);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c..3e39b7d 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -16,18 +16,10 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_trans_space.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
 #include "xfs_error.h"
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
@@ -79,6 +71,7 @@
 static xfs_failaddr_t
 xfs_attr3_rmt_verify(
 	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
 	void			*ptr,
 	int			fsbsize,
 	xfs_daddr_t		bno)
@@ -87,7 +80,7 @@
 
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return __this_address;
-	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+	if (!xfs_verify_magic(bp, rmt->rm_magic))
 		return __this_address;
 	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
 		return __this_address;
@@ -110,7 +103,7 @@
 	bool		check_crc,
 	xfs_failaddr_t	*failaddr)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	char		*ptr;
 	int		len;
 	xfs_daddr_t	bno;
@@ -131,7 +124,7 @@
 			*failaddr = __this_address;
 			return -EFSBADCRC;
 		}
-		*failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+		*failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
 		if (*failaddr)
 			return -EFSCORRUPTED;
 		len -= blksize;
@@ -174,7 +167,7 @@
 xfs_attr3_rmt_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
 	int		blksize = mp->m_attr_geo->blksize;
 	char		*ptr;
@@ -193,7 +186,7 @@
 	while (len > 0) {
 		struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
 
-		fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+		fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
 		if (fa) {
 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 			return;
@@ -220,6 +213,7 @@
 
 const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
 	.name = "xfs_attr3_rmt",
+	.magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
 	.verify_read = xfs_attr3_rmt_read_verify,
 	.verify_write = xfs_attr3_rmt_write_verify,
 	.verify_struct = xfs_attr3_rmt_verify_struct,
@@ -364,6 +358,8 @@
 /*
  * Read the value associated with an attribute from the out-of-line buffer
  * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 int
 xfs_attr_rmtval_get(
@@ -533,7 +529,7 @@
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt);
 		if (!bp)
 			return -ENOMEM;
 		bp->b_ops = &xfs_attr3_rmt_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 40ce5f3..7071ff9 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -5,7 +5,6 @@
  */
 #include "xfs.h"
 #include "xfs_log_format.h"
-#include "xfs_bit.h"
 
 /*
  * XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a476703..02469d5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -13,14 +13,10 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -32,7 +28,6 @@
 #include "xfs_trans_space.h"
 #include "xfs_buf_item.h"
 #include "xfs_trace.h"
-#include "xfs_symlink.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
@@ -370,7 +365,7 @@
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
 		if (!bp) {
 			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+			error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
 						XFS_BMAP_BTREE_REF,
 						&xfs_bmbt_buf_ops);
 			if (error)
@@ -454,7 +449,7 @@
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
 		if (!bp) {
 			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+			error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
 						XFS_BMAP_BTREE_REF,
 						&xfs_bmbt_buf_ops);
 			if (error)
@@ -536,7 +531,7 @@
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
-	struct xfs_owner_info		*oinfo,
+	const struct xfs_owner_info	*oinfo,
 	bool				skip_discard)
 {
 	struct xfs_extent_free_item	*new;		/* new element */
@@ -558,13 +553,13 @@
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
 		new->xefi_oinfo = *oinfo;
 	else
-		xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+		new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
 	new->xefi_skip_discard = skip_discard;
 	trace_xfs_bmap_free_defer(tp->t_mountp,
 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
@@ -577,47 +572,49 @@
  */
 
 /*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
  */
 STATIC int				/* error */
 xfs_bmap_btree_to_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_btree_cur_t		*cur,	/* btree cursor */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	struct xfs_inode	*ip,	/* incore inode pointer */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork)  /* data or attr fork */
 {
-	/* REFERENCED */
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_btree_block	*rblock = ifp->if_broot;
 	struct xfs_btree_block	*cblock;/* child btree block */
 	xfs_fsblock_t		cbno;	/* child block number */
 	xfs_buf_t		*cbp;	/* child block's buffer */
 	int			error;	/* error return value */
-	struct xfs_ifork	*ifp;	/* inode fork data */
-	xfs_mount_t		*mp;	/* mount point structure */
 	__be64			*pp;	/* ptr to block address */
-	struct xfs_btree_block	*rblock;/* root btree block */
 	struct xfs_owner_info	oinfo;
 
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/* check if we actually need the extent format first: */
+	if (!xfs_bmap_wants_extents(ip, whichfork))
+		return 0;
+
+	ASSERT(cur);
 	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-	rblock = ifp->if_broot;
 	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
 	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
-	*logflagsp = 0;
 #ifdef DEBUG
 	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 			xfs_btree_check_lptr(cur, cbno, 1));
 #endif
-	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+	error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
 				&xfs_bmbt_buf_ops);
 	if (error)
 		return error;
@@ -635,7 +632,7 @@
 	ASSERT(ifp->if_broot == NULL);
 	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+	*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 	return 0;
 }
 
@@ -730,7 +727,7 @@
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
-	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
 	if (!abp) {
 		error = -EFSCORRUPTED;
 		goto out_unreserve_dquot;
@@ -795,6 +792,7 @@
  */
 void
 xfs_bmap_local_to_extents_empty(
+	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	int			whichfork)
 {
@@ -811,6 +809,7 @@
 	ifp->if_u1.if_root = NULL;
 	ifp->if_height = 0;
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
 
@@ -843,7 +842,7 @@
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
 
 	if (!ifp->if_bytes) {
-		xfs_bmap_local_to_extents_empty(ip, whichfork);
+		xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
 		flags = XFS_ILOG_CORE;
 		goto done;
 	}
@@ -876,7 +875,7 @@
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(args.len == 1);
 	tp->t_firstblock = args.fsbno;
-	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno);
 
 	/*
 	 * Initialize the block, copy the data and log the remote buffer.
@@ -890,7 +889,7 @@
 
 	/* account for the change in fork size */
 	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-	xfs_bmap_local_to_extents_empty(ip, whichfork);
+	xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
 	flags |= XFS_ILOG_CORE;
 
 	ifp->if_u1.if_root = NULL;
@@ -1019,6 +1018,34 @@
 	return -EFSCORRUPTED;
 }
 
+/* Set an inode attr fork off based on the format */
+int
+xfs_bmap_set_attrforkoff(
+	struct xfs_inode	*ip,
+	int			size,
+	int			*version)
+{
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+		if (!ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+		else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+			*version = 2;
+		break;
+	default:
+		ASSERT(0);
+		return -EINVAL;
+	}
+
+	return 0;
+}
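
For reference, di_forkoff counts 8-byte units rather than bytes, which is why the byte offsets above are shifted right by 3 before being stored. A minimal userspace sketch of the device-inode case (not kernel code; ROUNDUP stands in for the kernel's roundup(), and 4 for sizeof(xfs_dev_t)):

#include <stdio.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned int dev_size = 4;	/* on-disk size of xfs_dev_t */
	unsigned int forkoff = ROUNDUP(dev_size, 8) >> 3;

	/* prints "di_forkoff = 1 (8 bytes)": the attr fork starts 8 bytes in */
	printf("di_forkoff = %u (%u bytes)\n", forkoff, forkoff << 3);
	return 0;
}
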
+
 /*
  * Convert inode from non-attributed to attributed.
  * Must not be in a transaction, ip must not be locked.
@@ -1070,28 +1097,11 @@
 
 	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_DEV:
-		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
-		if (!ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
-		else if (mp->m_flags & XFS_MOUNT_ATTR2)
-			version = 2;
-		break;
-	default:
-		ASSERT(0);
-		error = -EINVAL;
+	error = xfs_bmap_set_attrforkoff(ip, size, &version);
+	if (error)
 		goto trans_cancel;
-	}
-
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
 	switch (ip->i_d.di_format) {
@@ -1178,7 +1188,10 @@
 	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
 	 */
 	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
+	if (unlikely(level == 0)) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 
@@ -1187,7 +1200,7 @@
 	 * pointer (leftmost) at each level.
 	 */
 	while (level-- > 0) {
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+		error = xfs_btree_read_bufl(mp, tp, bno, &bp,
 				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
 		if (error)
 			goto out;
@@ -1260,7 +1273,7 @@
 		 */
 		if (bno == NULLFSBLOCK)
 			break;
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+		error = xfs_btree_read_bufl(mp, tp, bno, &bp,
 				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
 		if (error)
 			goto out;
@@ -1683,10 +1696,13 @@
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
-		 * The right neighbor is contiguous, the left is not.
+		 * The right neighbor is contiguous, the left is not. Take care
+		 * with delay -> unwritten extent allocation here because the
+		 * delalloc record we are overwriting is always written.
 		 */
 		PREV.br_startblock = new->br_startblock;
 		PREV.br_blockcount += RIGHT.br_blockcount;
+		PREV.br_state = new->br_state;
 
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_remove(bma->ip, &bma->icur, state);
@@ -1971,11 +1987,8 @@
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(bma->flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(bma->flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1990,6 +2003,9 @@
 			goto done;
 	}
 
+	if (da_new != da_old)
+		xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+
 	if (bma->cur) {
 		da_new += bma->cur->bc_private.b.allocated;
 		bma->cur->bc_private.b.allocated = 0;
@@ -2015,7 +2031,7 @@
 /*
  * Convert an unwritten allocation to a real allocation or vice versa.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_bmap_add_extent_unwritten_real(
 	struct xfs_trans	*tp,
 	xfs_inode_t		*ip,	/* incore inode pointer */
@@ -2454,9 +2470,7 @@
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
-	if (error)
-		goto done;
+	xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2621,6 +2635,7 @@
 		/*
 		 * Nothing to do for disk quota accounting here.
 		 */
+		xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
 	}
 }
 
@@ -2814,11 +2829,8 @@
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(tp, ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -3333,8 +3345,10 @@
 		 * already have quota reservation and there's nothing to do
 		 * yet.
 		 */
-		if (ap->wasdel)
+		if (ap->wasdel) {
+			xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
 			return;
+		}
 
 		/*
 		 * Otherwise, we've allocated blocks in a hole. The transaction
@@ -3353,8 +3367,10 @@
 	/* data/attr fork only */
 	ap->ip->i_d.di_nblocks += args->len;
 	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-	if (ap->wasdel)
+	if (ap->wasdel) {
 		ap->ip->i_delayed_blks -= args->len;
+		xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+	}
 	xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 		ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
 		args->len);
@@ -3439,7 +3455,7 @@
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
-	xfs_rmap_skip_owner_update(&args.oinfo);
+	args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = min(ap->length, mp->m_ag_max_usable);
@@ -3671,17 +3687,6 @@
 	}
 }
 
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
-	struct xfs_bmbt_irec	*irec,
-	struct xfs_inode	*ip)
-
-{
-	xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
-					      i_size_read(VFS_I(ip))));
-}
-
 /*
  * Trim the returned map to the required bounds
  */
@@ -3824,15 +3829,28 @@
 	XFS_STATS_INC(mp, xs_blk_mapr);
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!ifp) {
+		/* No CoW fork?  Return a hole. */
+		if (whichfork == XFS_COW_FORK) {
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount = len;
+			mval->br_state = XFS_EXT_NORM;
+			*nmap = 1;
+			return 0;
+		}
 
-	/* No CoW fork?  Return a hole. */
-	if (whichfork == XFS_COW_FORK && !ifp) {
-		mval->br_startoff = bno;
-		mval->br_startblock = HOLESTARTBLOCK;
-		mval->br_blockcount = len;
-		mval->br_state = XFS_EXT_NORM;
-		*nmap = 1;
-		return 0;
+		/*
+		 * A missing attr ifork implies that the inode says we're in
+		 * extents or btree format but the fork failed to pass the
+		 * verifier while being loaded.  Treat that as file
+		 * corruption too.
+		 */
+#ifdef DEBUG
+		xfs_alert(mp, "%s: inode %llu missing fork %d",
+				__func__, ip->i_ino, whichfork);
+#endif /* DEBUG */
+		return -EFSCORRUPTED;
 	}
 
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -3961,6 +3979,7 @@
 
 
 	ip->i_delayed_blks += alen;
+	xfs_mod_delalloc(ip->i_mount, alen + indlen);
 
 	got->br_startoff = aoff;
 	got->br_startblock = nullstartblock(indlen);
@@ -4081,8 +4100,7 @@
 	 * extents to real extents when we're about to write the data.
 	 */
 	if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
-	    (bma->flags & XFS_BMAPI_PREALLOC) &&
-	    xfs_sb_version_hasextflgbit(&mp->m_sb))
+	    (bma->flags & XFS_BMAPI_PREALLOC))
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
 	if (bma->wasdel)
@@ -4190,6 +4208,44 @@
 	return 0;
 }
 
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			fork)
+{
+	if (tp && tp->t_firstblock != NULLFSBLOCK)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+		return 1;
+	return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
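
A short worked example of the helper above (a sketch of the reasoning, not text from this patch): minleft is the headroom of free blocks an AG must retain so a follow-up bmap btree change cannot fail mid-transaction.

/*
 * If the transaction has already allocated its first block (t_firstblock
 * is set), no extra headroom is needed and the helper returns 0.  An
 * extents-format fork may need one block to convert to btree format, so
 * it returns 1.  A btree-format fork with a root at bb_level == 2 may
 * need roughly one new block per level on a full split, so the helper
 * returns 2 + 1 = 3.
 */
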
+
+/*
+ * Log whatever the flags say, even on error.  Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged, so we
+ * don't shut down when we should.  Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+	struct xfs_bmalloca	*bma,
+	int			whichfork,
+	int			error)
+{
+	if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+	    XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		bma->logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+		 XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+	if (bma->logflags)
+		xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+	if (bma->cur)
+		xfs_btree_del_cursor(bma->cur, error);
+}
+
 /*
  * Map file blocks to filesystem blocks, and allocate blocks or convert the
  * extent state if necessary.  Details behaviour is controlled by the flags
@@ -4207,9 +4263,13 @@
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
 	int			*nmap)		/* i/o: mval size/count */
 {
+	struct xfs_bmalloca	bma = {
+		.tp		= tp,
+		.ip		= ip,
+		.total		= total,
+	};
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp;
-	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
 	xfs_fileoff_t		end;		/* end of mapped file region */
 	bool			eof = false;	/* after the end of extents */
 	int			error;		/* error return */
@@ -4234,9 +4294,7 @@
 
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(tp != NULL ||
-	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
-			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+	ASSERT(tp != NULL);
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4269,34 +4327,21 @@
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (!tp || tp->t_firstblock == NULLFSBLOCK) {
-		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		else
-			bma.minleft = 1;
-	} else {
-		bma.minleft = 0;
-	}
-
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(tp, ip, whichfork);
 		if (error)
 			goto error0;
 	}
 
-	n = 0;
-	end = bno + len;
-	obno = bno;
-
 	if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
 		eof = true;
 	if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
 		bma.prev.br_startoff = NULLFILEOFF;
-	bma.tp = tp;
-	bma.ip = ip;
-	bma.total = total;
-	bma.datatype = 0;
+	bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
 
+	n = 0;
+	end = bno + len;
+	obno = bno;
 	while (bno < end && n < *nmap) {
 		bool			need_alloc = false, wasdelay = false;
 
@@ -4310,26 +4355,7 @@
 			ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
 			         (flags & XFS_BMAPI_COWFORK)));
 
-			if (flags & XFS_BMAPI_DELALLOC) {
-				/*
-				 * For the COW fork we can reasonably get a
-				 * request for converting an extent that races
-				 * with other threads already having converted
-				 * part of it, as there converting COW to
-				 * regular blocks is not protected using the
-				 * IOLOCK.
-				 */
-				ASSERT(flags & XFS_BMAPI_COWFORK);
-				if (!(flags & XFS_BMAPI_COWFORK)) {
-					error = -EIO;
-					goto error0;
-				}
-
-				if (eof || bno >= end)
-					break;
-			} else {
-				need_alloc = true;
-			}
+			need_alloc = true;
 		} else if (isnullstartblock(bma.got.br_startblock)) {
 			wasdelay = true;
 		}
@@ -4338,8 +4364,7 @@
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if ((need_alloc || wasdelay) &&
-		    !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+		if (need_alloc || wasdelay) {
 			bma.eof = eof;
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
@@ -4370,12 +4395,9 @@
 			 * If this is a CoW allocation, record the data in
 			 * the refcount btree for orphan recovery.
 			 */
-			if (whichfork == XFS_COW_FORK) {
-				error = xfs_refcount_alloc_cow_extent(tp,
-						bma.blkno, bma.length);
-				if (error)
-					goto error0;
-			}
+			if (whichfork == XFS_COW_FORK)
+				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+						bma.length);
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4407,49 +4429,126 @@
 	}
 	*nmap = n;
 
-	/*
-	 * Transform from btree to extents, give it cur.
-	 */
-	if (xfs_bmap_wants_extents(ip, whichfork)) {
-		int		tmp_logflags = 0;
-
-		ASSERT(bma.cur);
-		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
-			&tmp_logflags, whichfork);
-		bma.logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-	}
+	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+			whichfork);
+	if (error)
+		goto error0;
 
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) >
 		XFS_IFORK_MAXEXT(ip, whichfork));
-	error = 0;
+	xfs_bmapi_finish(&bma, whichfork, 0);
+	xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+		orig_nmap, *nmap);
+	return 0;
 error0:
-	/*
-	 * Log everything.  Do this after conversion, there's no point in
-	 * logging the extent records if we've converted to btree format.
-	 */
-	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		bma.logflags &= ~xfs_ilog_fext(whichfork);
-	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
-		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
-	/*
-	 * Log whatever the flags say, even if error.  Otherwise we might miss
-	 * detecting a case where the data is changed, there's an error,
-	 * and it's not logged so we don't shutdown when we should.
-	 */
-	if (bma.logflags)
-		xfs_trans_log_inode(tp, ip, bma.logflags);
+	xfs_bmapi_finish(&bma, whichfork, error);
+	return error;
+}
 
-	if (bma.cur) {
-		xfs_btree_del_cursor(bma.cur, error);
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to convert the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_fileoff_t		offset_fsb,
+	struct xfs_bmbt_irec	*imap,
+	unsigned int		*seq)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmalloca	bma = { NULL };
+	struct xfs_trans	*tp;
+	int			error;
+
+	/*
+	 * Space for the extent and indirect blocks was reserved when the
+	 * delalloc extent was created so there's no need to do so here.
+	 */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+				XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+	    bma.got.br_startoff > offset_fsb) {
+		/*
+		 * No extent found in the range we are trying to convert.  This
+		 * should only happen for the COW fork, where another thread
+		 * might have moved the extent to the data fork in the meantime.
+		 */
+		WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+		error = -EAGAIN;
+		goto out_trans_cancel;
 	}
-	if (!error)
-		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-			orig_nmap, *nmap);
+
+	/*
+	 * If we find a real extent here we raced with another thread converting
+	 * the extent.  Just return the real extent at this offset.
+	 */
+	if (!isnullstartblock(bma.got.br_startblock)) {
+		*imap = bma.got;
+		*seq = READ_ONCE(ifp->if_seq);
+		goto out_trans_cancel;
+	}
+
+	bma.tp = tp;
+	bma.ip = ip;
+	bma.wasdel = true;
+	bma.offset = bma.got.br_startoff;
+	bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+	bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+	if (whichfork == XFS_COW_FORK)
+		bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+
+	if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+		bma.prev.br_startoff = NULLFILEOFF;
+
+	error = xfs_bmapi_allocate(&bma);
+	if (error)
+		goto out_finish;
+
+	error = -ENOSPC;
+	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+		goto out_finish;
+	error = -EFSCORRUPTED;
+	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+		goto out_finish;
+
+	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+	XFS_STATS_INC(mp, xs_xstrat_quick);
+
+	ASSERT(!isnullstartblock(bma.got.br_startblock));
+	*imap = bma.got;
+	*seq = READ_ONCE(ifp->if_seq);
+
+	if (whichfork == XFS_COW_FORK)
+		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+
+	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+			whichfork);
+	if (error)
+		goto out_finish;
+
+	xfs_bmapi_finish(&bma, whichfork, 0);
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+out_finish:
+	xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
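
Because a single call may convert only part of a large delalloc extent, callers are expected to loop until the target offset is covered. A hedged sketch of such a loop (convert_range() is hypothetical; the real writeback caller also revalidates against *seq and handles -EAGAIN by retrying the extent lookup):

static int
convert_range(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fileoff_t		offset_fsb)
{
	struct xfs_bmbt_irec	imap;
	unsigned int		seq;
	int			error;

	do {
		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset_fsb,
				&imap, &seq);
		if (error)
			return error;
		/* keep going until the mapping covers the target offset */
	} while (imap.br_startoff + imap.br_blockcount <= offset_fsb);

	return 0;
}
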
 
@@ -4523,13 +4622,7 @@
 	if (error)
 		goto error0;
 
-	if (xfs_bmap_wants_extents(ip, whichfork)) {
-		int		tmp_logflags = 0;
-
-		error = xfs_bmap_btree_to_extents(tp, ip, cur,
-			&tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
-	}
+	error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
 
 error0:
 	if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -4751,8 +4844,10 @@
 	da_diff = da_old - da_new;
 	if (!isrt)
 		da_diff += del->br_blockcount;
-	if (da_diff)
+	if (da_diff) {
 		xfs_mod_fdblocks(mp, da_diff, false);
+		xfs_mod_delalloc(mp, -da_diff);
+	}
 	return error;
 }
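
Several hunks in this patch thread reservation deltas through xfs_mod_delalloc(), whose body is not part of this diff. A plausible sketch of what it maintains, assuming a per-cpu counter of delalloc-reserved blocks (the m_delalloc_blks field and XFS_DELALLOC_BATCH constant are assumptions, not taken from this patch):

void
xfs_mod_delalloc(
	struct xfs_mount	*mp,
	int64_t			delta)
{
	/* batched update keeps the global counter cheap on hot paths */
	percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
			XFS_DELALLOC_BATCH);
}
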
 
@@ -5041,18 +5136,14 @@
 	}
 
 	/* remove reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
-	if (error)
-		goto done;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, del);
 
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
 	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			error = xfs_refcount_decrease_extent(tp, del);
-			if (error)
-				goto done;
+			xfs_refcount_decrease_extent(tp, del);
 		} else {
 			__xfs_bmap_add_free(tp, del->br_startblock,
 					del->br_blockcount, NULL,
@@ -5245,8 +5336,7 @@
 			 * unmapping part of it.  But we can't really
 			 * get rid of part of a realtime extent.
 			 */
-			if (del.br_state == XFS_EXT_UNWRITTEN ||
-			    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+			if (del.br_state == XFS_EXT_UNWRITTEN) {
 				/*
 				 * This piece is unwritten, or we're not
 				 * using unwritten extents.  Skip over it.
@@ -5296,10 +5386,9 @@
 				del.br_blockcount -= mod;
 				del.br_startoff += mod;
 				del.br_startblock += mod;
-			} else if ((del.br_startoff == start &&
-				    (del.br_state == XFS_EXT_UNWRITTEN ||
-				     tp->t_blk_res == 0)) ||
-				   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+			} else if (del.br_startoff == start &&
+				   (del.br_state == XFS_EXT_UNWRITTEN ||
+				    tp->t_blk_res == 0)) {
 				/*
 				 * Can't make it unwritten.  There isn't
 				 * a full extent here so just skip it.
@@ -5395,24 +5484,11 @@
 		error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
 				&tmp_logflags, whichfork);
 		logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-	}
-	/*
-	 * transform from btree to extents, give it cur
-	 */
-	else if (xfs_bmap_wants_extents(ip, whichfork)) {
-		ASSERT(cur != NULL);
-		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+	} else {
+		error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
 			whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto error0;
 	}
-	/*
-	 * transform from extents to local?
-	 */
-	error = 0;
+
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5551,6 +5627,11 @@
 	if (error)
 		return error;
 
+	/* change to extent format if required after extent removal */
+	error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+	if (error)
+		return error;
+
 done:
 	xfs_iext_remove(ip, icur, 0);
 	xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
@@ -5558,12 +5639,11 @@
 			&new);
 
 	/* update reverse mapping. rmap functions merge the rmaps for us */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
-	if (error)
-		return error;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, got);
 	memcpy(&new, got, sizeof(new));
 	new.br_startoff = left->br_startoff + left->br_blockcount;
-	return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	return 0;
 }
 
 static int
@@ -5602,10 +5682,9 @@
 			got);
 
 	/* update reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
-	if (error)
-		return error;
-	return xfs_rmap_map_extent(tp, ip, whichfork, got);
+	xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+	xfs_rmap_map_extent(tp, ip, whichfork, got);
+	return 0;
 }
 
 int
@@ -6001,7 +6080,7 @@
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&bi->bi_list);
 	bi->bi_type = type;
 	bi->bi_owner = ip;
@@ -6013,29 +6092,29 @@
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_bmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_bmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /*
@@ -6114,11 +6193,7 @@
 		    XFS_FSB_TO_AGNO(mp, endfsb))
 			return __this_address;
 	}
-	if (irec->br_state != XFS_EXT_NORM) {
-		if (whichfork != XFS_DATA_FORK)
-			return __this_address;
-		if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
-			return __this_address;
-	}
+	if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+		return __this_address;
 	return NULL;
 }
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b6e9b63..e2798c6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK	0x200
 
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC	0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY	0x800
-
 /* Skip online discard of freed extents */
 #define XFS_BMAPI_NODISCARD	0x1000
 
@@ -117,8 +111,6 @@
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
 	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
-	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }, \
-	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
 	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }, \
 	{ XFS_BMAPI_NORMAP,	"NORMAP" }
 
@@ -179,13 +171,21 @@
 		!isnullstartblock(irec->br_startblock);
 }
 
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+	((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
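
Typical use mirrors the check added in xfs_bmapi_convert_delalloc() above; block 0 of the data device holds the superblock, so it can never be a valid mapped block for a non-realtime inode:

	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, imap->br_startblock)))
		return -EFSCORRUPTED;
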
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
-void	xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
-void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+int	xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
+void	xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+		struct xfs_inode *ip, int whichfork);
 void	__xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
-		xfs_filblks_t len, struct xfs_owner_info *oinfo,
+		xfs_filblks_t len, const struct xfs_owner_info *oinfo,
 		bool skip_discard);
 void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -227,13 +227,20 @@
 		xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
 		struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
 		int eof);
+int	xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+		unsigned int *seq);
+int	xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+		struct xfs_inode *ip, int whichfork,
+		struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+		struct xfs_bmbt_irec *new, int *logflagsp);
 
 static inline void
 xfs_bmap_add_free(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
-	struct xfs_owner_info		*oinfo)
+	const struct xfs_owner_info	*oinfo)
 {
 	__xfs_bmap_add_free(tp, bno, len, oinfo, false);
 }
@@ -255,9 +262,9 @@
 		enum xfs_bmap_intent_type type, int whichfork,
 		xfs_fileoff_t startoff, xfs_fsblock_t startblock,
 		xfs_filblks_t *blockcount, xfs_exntst_t state);
-int	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
-int	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 
 static inline int xfs_bmap_fork_to_state(int whichfork)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2..ffe608d 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -11,10 +11,8 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_bmap_btree.h"
@@ -22,7 +20,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_rmap.h"
 
 /*
@@ -403,21 +400,35 @@
 	union xfs_btree_key	*k1,
 	union xfs_btree_key	*k2)
 {
-	return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
-			  be64_to_cpu(k2->bmbt.br_startoff);
+	uint64_t		a = be64_to_cpu(k1->bmbt.br_startoff);
+	uint64_t		b = be64_to_cpu(k2->bmbt.br_startoff);
+
+	/*
+	 * Note: This routine previously cast a and b to int64 and subtracted
+	 * them to generate a result.  This led to problems if b was the
+	 * "maximum" key value (all ones), which would be interpreted as a
+	 * negative number; hence this somewhat less efficient version.
+	 */
+	if (a > b)
+		return 1;
+	if (b > a)
+		return -1;
+	return 0;
 }
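
A userspace sketch demonstrating the overflow described in the comment above (comparing 64-bit keys by subtraction mis-sorts the all-ones key):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t a = 0;
	uint64_t b = UINT64_MAX;	/* the "maximum" key, all ones */

	/* old style: 0 - ~0ULL wraps to 1, so a appears greater than b (wrong) */
	printf("subtraction: %lld\n", (long long)(int64_t)(a - b));

	/* new style: explicit comparison correctly reports -1, i.e. a < b */
	printf("comparison:  %d\n", a > b ? 1 : (b > a ? -1 : 0));
	return 0;
}
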
 
 static xfs_failaddr_t
 xfs_bmbt_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	xfs_failaddr_t		fa;
 	unsigned int		level;
 
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		/*
 		 * XXX: need a better way of verifying the owner here. Right now
 		 * just make sure there has been one set.
@@ -425,11 +436,6 @@
 		fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
 		if (fa)
 			return fa;
-		/* fall through */
-	case cpu_to_be32(XFS_BMAP_MAGIC):
-		break;
-	default:
-		return __this_address;
 	}
 
 	/*
@@ -481,6 +487,8 @@
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
 	.name = "xfs_bmbt",
+	.magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+		   cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
 	.verify_read = xfs_bmbt_read_verify,
 	.verify_write = xfs_bmbt_write_verify,
 	.verify_struct = xfs_bmbt_verify,
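
The new .magic pairs let a shared helper pick the expected magic from the superblock feature bits instead of an open-coded switch per verifier. A hedged sketch of how that helper presumably works (the real one is xfs_verify_magic(), which is not part of this hunk):

bool
xfs_verify_magic(
	struct xfs_buf		*bp,
	__be32			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	/* slot 0 holds the v4 magic, slot 1 the v5 (CRC) magic */
	idx = xfs_sb_version_hascrc(&mp->m_sb);
	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
		return false;
	return dmagic == bp->b_ops->magic[idx];
}
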
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 34c6d7b..71de937 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -11,16 +11,13 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_buf_item.h"
 #include "xfs_btree.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_alloc.h"
 #include "xfs_log.h"
 
@@ -276,7 +273,7 @@
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 
-	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 		return;
 	if (bip)
 		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -288,7 +285,7 @@
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
@@ -314,7 +311,7 @@
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 
-	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 		return;
 	if (bip)
 		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -326,11 +323,11 @@
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
-			return __this_address;
+			return false;
 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 	}
 
@@ -691,14 +688,13 @@
 xfs_btree_get_bufl(
 	xfs_mount_t	*mp,		/* file system mount point */
 	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_fsblock_t	fsbno,		/* file system block number */
-	uint		lock)		/* lock flags for get_buf */
+	xfs_fsblock_t	fsbno)		/* file system block number */
 {
 	xfs_daddr_t		d;		/* real disk block address */
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 }
 
 /*
@@ -710,15 +706,14 @@
 	xfs_mount_t	*mp,		/* file system mount point */
 	xfs_trans_t	*tp,		/* transaction pointer */
 	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	uint		lock)		/* lock flags for get_buf */
+	xfs_agblock_t	agbno)		/* allocation group block number */
 {
 	xfs_daddr_t		d;		/* real disk block address */
 
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 }
 
 /*
@@ -845,7 +840,6 @@
 	struct xfs_mount	*mp,		/* file system mount point */
 	struct xfs_trans	*tp,		/* transaction pointer */
 	xfs_fsblock_t		fsbno,		/* file system block number */
-	uint			lock,		/* lock flags for read_buf */
 	struct xfs_buf		**bpp,		/* buffer for fsbno */
 	int			refval,		/* ref count value for buffer */
 	const struct xfs_buf_ops *ops)
@@ -858,7 +852,7 @@
 		return -EFSCORRUPTED;
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, lock, &bp, ops);
+				   mp->m_bsize, 0, &bp, ops);
 	if (error)
 		return error;
 	if (bp)
@@ -1185,11 +1179,10 @@
 	xfs_btnum_t	btnum,
 	__u16		level,
 	__u16		numrecs,
-	__u64		owner,
-	unsigned int	flags)
+	__u64		owner)
 {
 	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-				 btnum, level, numrecs, owner, flags);
+				 btnum, level, numrecs, owner, 0);
 }
 
 STATIC void
@@ -1288,7 +1281,6 @@
 xfs_btree_get_buf_block(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*ptr,
-	int			flags,
 	struct xfs_btree_block	**block,
 	struct xfs_buf		**bpp)
 {
@@ -1296,14 +1288,11 @@
 	xfs_daddr_t		d;
 	int			error;
 
-	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XBF_TRYLOCK));
-
 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
 	if (error)
 		return error;
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
-				 mp->m_bsize, flags);
+				 mp->m_bsize, 0);
 
 	if (!*bpp)
 		return -ENOMEM;
@@ -2706,7 +2695,7 @@
 	XFS_BTREE_STATS_INC(cur, alloc);
 
 	/* Set up the new block as "right". */
-	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
 	if (error)
 		goto error0;
 
@@ -2961,7 +2950,7 @@
 	XFS_BTREE_STATS_INC(cur, alloc);
 
 	/* Copy the root into a real block. */
-	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
 	if (error)
 		goto error0;
 
@@ -3058,7 +3047,7 @@
 	XFS_BTREE_STATS_INC(cur, alloc);
 
 	/* Set up the new block. */
-	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
 	if (error)
 		goto error0;
 
@@ -4433,7 +4422,7 @@
 	struct xfs_buf		*bp,
 	uint64_t		owner)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -4454,7 +4443,7 @@
 	struct xfs_buf		*bp,
 	unsigned int		max_recs)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 
 	/* numrecs verification */
@@ -4477,14 +4466,12 @@
  *				      btree block
  *
  * @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
  */
 xfs_failaddr_t
 xfs_btree_sblock_v5hdr_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_perag	*pag = bp->b_pag;
 
@@ -4510,7 +4497,7 @@
 	struct xfs_buf		*bp,
 	unsigned int		max_recs)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	xfs_agblock_t		agno;
 
@@ -4611,7 +4598,7 @@
 
 		/* Callback */
 		error = fn(cur, recp, priv);
-		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+		if (error)
 			break;
 
 advloop:
@@ -4713,8 +4700,7 @@
 			 */
 			if (ldiff >= 0 && hdiff >= 0) {
 				error = fn(cur, recp, priv);
-				if (error < 0 ||
-				    error == XFS_BTREE_QUERY_RANGE_ABORT)
+				if (error)
 					break;
 			} else if (hdiff < 0) {
 				/* Record is larger than high key; pop. */
@@ -4785,8 +4771,7 @@
  * Query a btree for all records overlapping a given interval of keys.  The
- * supplied function will be called with each record found; return one of the
- * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * supplied function will be called with each record found; return 0 to
+ * continue iterating, -ECANCELED to stop iterating, or the usual negative
+ * error code.  This function returns -ECANCELED, zero, or a negative error
+ * code.
  */
 int
 xfs_btree_query_range(
@@ -4902,7 +4887,7 @@
 	union xfs_btree_rec		*rec,
 	void				*priv)
 {
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /* Is there a record covering a given range of keys? */
@@ -4917,7 +4902,7 @@
 
 	error = xfs_btree_query_range(cur, low, high,
 			&xfs_btree_has_record_helper, NULL);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		*exists = true;
 		return 0;
 	}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index e3b3e9d..ced1e65 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -301,8 +301,7 @@
 xfs_btree_get_bufl(
 	struct xfs_mount	*mp,	/* file system mount point */
 	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_fsblock_t		fsbno,	/* file system block number */
-	uint			lock);	/* lock flags for get_buf */
+	xfs_fsblock_t		fsbno);	/* file system block number */
 
 /*
  * Get a buffer for the block, return it with no data read.
@@ -313,8 +312,7 @@
 	struct xfs_mount	*mp,	/* file system mount point */
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_agnumber_t		agno,	/* allocation group number */
-	xfs_agblock_t		agbno,	/* allocation group block number */
-	uint			lock);	/* lock flags for get_buf */
+	xfs_agblock_t		agbno);	/* allocation group block number */
 
 /*
  * Check for the cursor referring to the last block at the given level.
@@ -345,7 +343,6 @@
 	struct xfs_mount	*mp,	/* file system mount point */
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		fsbno,	/* file system block number */
-	uint			lock,	/* lock flags for read_buf */
 	struct xfs_buf		**bpp,	/* buffer for fsbno */
 	int			refval,	/* ref count value for buffer */
 	const struct xfs_buf_ops *ops);
@@ -383,8 +380,7 @@
 	xfs_btnum_t	btnum,
 	__u16		level,
 	__u16		numrecs,
-	__u64		owner,
-	unsigned int	flags);
+	__u64		owner);
 
 void
 xfs_btree_init_block_int(
@@ -468,9 +464,13 @@
 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
 
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE	0	/* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT	1	/* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating.  Any non-zero value will be
+ * passed up to the _query_range caller.  The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
 		union xfs_btree_rec *rec, void *priv);
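
Under this convention a callback returns 0 to keep iterating or -ECANCELED to stop early, as xfs_btree_has_record_helper earlier in this patch does. An illustrative callback (the count_ctx structure and helper are hypothetical):

struct count_ctx {
	unsigned int		seen;
	unsigned int		target;
};

static int
count_records_helper(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	void			*priv)
{
	struct count_ctx	*ctx = priv;

	if (++ctx->seen >= ctx->target)
		return -ECANCELED;	/* stop; callers must not treat this as failure */
	return 0;			/* keep iterating */
}
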
 
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee9..4fd1223 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -12,20 +12,14 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
 #include "xfs_bmap.h"
-#include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 
@@ -116,35 +110,52 @@
 	kmem_zone_free(xfs_da_state_zone, state);
 }
 
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+	struct xfs_buf		*bp,
+	struct xfs_da3_blkinfo	*hdr3)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_da_blkinfo	*hdr = &hdr3->hdr;
+
+	if (!xfs_verify_magic16(bp, hdr->magic))
+		return __this_address;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+			return __this_address;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return __this_address;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+			return __this_address;
+	}
+
+	return NULL;
+}
+
 static xfs_failaddr_t
 xfs_da3_node_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_da_intnode	*hdr = bp->b_addr;
 	struct xfs_da3_icnode_hdr ichdr;
 	const struct xfs_dir_ops *ops;
+	xfs_failaddr_t		fa;
 
 	ops = xfs_dir_get_ops(mp, NULL);
 
 	ops->node_hdr_from_disk(&ichdr, hdr);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+	fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+	if (fa)
+		return fa;
 
-		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
-			return __this_address;
-
-		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-			return __this_address;
-		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-			return __this_address;
-		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-			return __this_address;
-	} else {
-		if (ichdr.magic != XFS_DA_NODE_MAGIC)
-			return __this_address;
-	}
 	if (ichdr.level == 0)
 		return __this_address;
 	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -169,7 +180,7 @@
 xfs_da3_node_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -257,6 +268,8 @@
 
 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
 	.name = "xfs_da3_node",
+	.magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+		     cpu_to_be16(XFS_DA3_NODE_MAGIC) },
 	.verify_read = xfs_da3_node_read_verify,
 	.verify_write = xfs_da3_node_write_verify,
 	.verify_struct = xfs_da3_node_verify_struct,
@@ -474,10 +487,8 @@
 	ASSERT(state->path.active == 0);
 	oldblk = &state->path.blk[0];
 	error = xfs_da3_root_split(state, oldblk, addblk);
-	if (error) {
-		addblk->bp = NULL;
-		return error;	/* GROT: dir is inconsistent */
-	}
+	if (error)
+		goto out;
 
 	/*
 	 * Update pointers to the node which used to be block 0 and just got
@@ -492,7 +503,10 @@
 	 */
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.forw) {
-		ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+		if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+			error = -EFSCORRUPTED;
+			goto out;
+		}
 		node = addblk->bp->b_addr;
 		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
 		xfs_trans_log_buf(state->args->trans, addblk->bp,
@@ -501,15 +515,19 @@
 	}
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.back) {
-		ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+		if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+			error = -EFSCORRUPTED;
+			goto out;
+		}
 		node = addblk->bp->b_addr;
 		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
 		xfs_trans_log_buf(state->args->trans, addblk->bp,
 				  XFS_DA_LOGRANGE(node, &node->hdr.info,
 				  sizeof(node->hdr.info)));
 	}
+out:
 	addblk->bp = NULL;
-	return 0;
+	return error;
 }
 
 /*
@@ -2080,7 +2098,7 @@
 		 * If we didn't get it and the block might work if fragmented,
 		 * try without the CONTIG flag.  Loop until we get it all.
 		 */
-		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		mapp = kmem_alloc(sizeof(*mapp) * count, 0);
 		for (b = *bno, mapi = 0; b < *bno + count; ) {
 			nmap = min(XFS_BMAP_MAX_NMAP, count);
 			c = (int)(*bno + count - b);
@@ -2462,7 +2480,7 @@
 
 	if (nirecs > 1) {
 		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-				  KM_SLEEP | KM_NOFS);
+				  KM_NOFS);
 		if (!map)
 			return -ENOMEM;
 		*mapp = map;
@@ -2521,7 +2539,7 @@
 		 */
 		if (nfsb != 1)
 			irecs = kmem_zalloc(sizeof(irec) * nfsb,
-					    KM_SLEEP | KM_NOFS);
+					    KM_NOFS);
 
 		nirecs = nfsb;
 		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865..ae0bbd2 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@
 #define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
 #define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
 #define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL	0x0020	/* lookup to alloc buffer if found  */
 
 #define XFS_DA_OP_FLAGS \
 	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
 	{ XFS_DA_OP_RENAME,	"RENAME" }, \
 	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
 	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
-	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
+	{ XFS_DA_OP_ALLOCVAL,	"ALLOCVAL" }
 
 /*
  * Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index b39053d..b1ae572 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -11,11 +11,8 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
 
 /*
  * Shortform directory ops
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3b..ae654e0 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@
 	return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
 }
 
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+				      struct xfs_da3_blkinfo *hdr3);
+
 #endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index e792b16..2255752 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -9,8 +9,6 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_trans.h"
@@ -172,7 +170,13 @@
  * reoccur.
  */
 
-static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+static const struct xfs_defer_op_type *defer_op_types[] = {
+	[XFS_DEFER_OPS_TYPE_BMAP]	= &xfs_bmap_update_defer_type,
+	[XFS_DEFER_OPS_TYPE_REFCOUNT]	= &xfs_refcount_update_defer_type,
+	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
+	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
+	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
+};
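
Building the table at compile time replaces the runtime registration API (xfs_defer_init_op_type, removed below): the table is const, there is no init-ordering hazard, and dispatch becomes a plain array index, e.g.:

	const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];

	ops->cancel_item(pwi);	/* indirect call through the per-type vtable */
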
 
 /*
  * For each pending item in the intake list, log its intent item and the
@@ -185,15 +189,15 @@
 {
 	struct list_head		*li;
 	struct xfs_defer_pending	*dfp;
+	const struct xfs_defer_op_type	*ops;
 
 	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
-		dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
-				dfp->dfp_count);
+		ops = defer_op_types[dfp->dfp_type];
+		dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
 		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
-		list_sort(tp->t_mountp, &dfp->dfp_work,
-				dfp->dfp_type->diff_items);
+		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
 		list_for_each(li, &dfp->dfp_work)
-			dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+			ops->log_item(tp, dfp->dfp_intent, li);
 	}
 }
 
@@ -204,14 +208,16 @@
 	struct list_head		*dop_pending)
 {
 	struct xfs_defer_pending	*dfp;
+	const struct xfs_defer_op_type	*ops;
 
 	trace_xfs_defer_trans_abort(tp, _RET_IP_);
 
 	/* Abort intent items that don't have a done item. */
 	list_for_each_entry(dfp, dop_pending, dfp_list) {
+		ops = defer_op_types[dfp->dfp_type];
 		trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
 		if (dfp->dfp_intent && !dfp->dfp_done) {
-			dfp->dfp_type->abort_intent(dfp->dfp_intent);
+			ops->abort_intent(dfp->dfp_intent);
 			dfp->dfp_intent = NULL;
 		}
 	}
@@ -266,13 +272,15 @@
 
 	trace_xfs_defer_trans_roll(tp, _RET_IP_);
 
-	/* Roll the transaction. */
+	/*
+	 * Roll the transaction.  Rolling always gives a new transaction (even
+	 * if committing the old one fails!) to hand back to the caller, so we
+	 * join the held resources to the new transaction so that we always
+	 * return with the held resources joined to @tpp, no matter what
+	 * happened.
+	 */
 	error = xfs_trans_roll(tpp);
 	tp = *tpp;
-	if (error) {
-		trace_xfs_defer_trans_roll_error(tp, error);
-		return error;
-	}
 
 	/* Rejoin the joined inodes. */
 	for (i = 0; i < ipcount; i++)
@@ -284,6 +292,8 @@
 		xfs_trans_bhold(tp, bplist[i]);
 	}
 
+	if (error)
+		trace_xfs_defer_trans_roll_error(tp, error);
 	return error;
 }
 
@@ -315,18 +325,20 @@
 	struct xfs_defer_pending	*pli;
 	struct list_head		*pwi;
 	struct list_head		*n;
+	const struct xfs_defer_op_type	*ops;
 
 	/*
 	 * Free the pending items.  Caller should already have arranged
 	 * for the intent items to be released.
 	 */
 	list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
+		ops = defer_op_types[dfp->dfp_type];
 		trace_xfs_defer_cancel_list(mp, dfp);
 		list_del(&dfp->dfp_list);
 		list_for_each_safe(pwi, n, &dfp->dfp_work) {
 			list_del(pwi);
 			dfp->dfp_count--;
-			dfp->dfp_type->cancel_item(pwi);
+			ops->cancel_item(pwi);
 		}
 		ASSERT(dfp->dfp_count == 0);
 		kmem_free(dfp);
@@ -350,7 +362,7 @@
 	struct list_head		*n;
 	void				*state;
 	int				error = 0;
-	void				(*cleanup_fn)(struct xfs_trans *, void *, int);
+	const struct xfs_defer_op_type	*ops;
 	LIST_HEAD(dop_pending);
 
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -373,18 +385,18 @@
 		/* Log an intent-done item for the first pending item. */
 		dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
 				       dfp_list);
+		ops = defer_op_types[dfp->dfp_type];
 		trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
-		dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+		dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
 				dfp->dfp_count);
-		cleanup_fn = dfp->dfp_type->finish_cleanup;
 
 		/* Finish the work items. */
 		state = NULL;
 		list_for_each_safe(li, n, &dfp->dfp_work) {
 			list_del(li);
 			dfp->dfp_count--;
-			error = dfp->dfp_type->finish_item(*tp, li,
-					dfp->dfp_done, &state);
+			error = ops->finish_item(*tp, li, dfp->dfp_done,
+					&state);
 			if (error == -EAGAIN) {
 				/*
 				 * Caller wants a fresh transaction;
@@ -400,8 +412,8 @@
 				 * xfs_defer_cancel will take care of freeing
 				 * all these lists and stuff.
 				 */
-				if (cleanup_fn)
-					cleanup_fn(*tp, state, error);
+				if (ops->finish_cleanup)
+					ops->finish_cleanup(*tp, state, error);
 				goto out;
 			}
 		}
@@ -413,20 +425,19 @@
 			 * a Fresh Transaction while Finishing
 			 * Deferred Work" above.
 			 */
-			dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+			dfp->dfp_intent = ops->create_intent(*tp,
 					dfp->dfp_count);
 			dfp->dfp_done = NULL;
 			list_for_each(li, &dfp->dfp_work)
-				dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
-						li);
+				ops->log_item(*tp, dfp->dfp_intent, li);
 		} else {
 			/* Done with the dfp, free it. */
 			list_del(&dfp->dfp_list);
 			kmem_free(dfp);
 		}
 
-		if (cleanup_fn)
-			cleanup_fn(*tp, state, error);
+		if (ops->finish_cleanup)
+			ops->finish_cleanup(*tp, state, error);
 	}
 
 out:
@@ -486,8 +497,10 @@
 	struct list_head		*li)
 {
 	struct xfs_defer_pending	*dfp = NULL;
+	const struct xfs_defer_op_type	*ops;
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+	BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
 
 	/*
 	 * Add the item to a pending item at the end of the intake list.
@@ -497,15 +510,15 @@
 	if (!list_empty(&tp->t_dfops)) {
 		dfp = list_last_entry(&tp->t_dfops,
 				struct xfs_defer_pending, dfp_list);
-		if (dfp->dfp_type->type != type ||
-		    (dfp->dfp_type->max_items &&
-		     dfp->dfp_count >= dfp->dfp_type->max_items))
+		ops = defer_op_types[dfp->dfp_type];
+		if (dfp->dfp_type != type ||
+		    (ops->max_items && dfp->dfp_count >= ops->max_items))
 			dfp = NULL;
 	}
 	if (!dfp) {
 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
-				KM_SLEEP | KM_NOFS);
-		dfp->dfp_type = defer_op_types[type];
+				KM_NOFS);
+		dfp->dfp_type = type;
 		dfp->dfp_intent = NULL;
 		dfp->dfp_done = NULL;
 		dfp->dfp_count = 0;
@@ -517,14 +530,6 @@
 	dfp->dfp_count++;
 }
 
-/* Initialize a deferred operation list. */
-void
-xfs_defer_init_op_type(
-	const struct xfs_defer_op_type	*type)
-{
-	defer_op_types[type->type] = type;
-}
-
 /*
  * Move deferred ops from one transaction to another and reset the source to
  * initial state. This is primarily used to carry state forward across
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 2584a5b..7c28d76 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -9,20 +9,6 @@
 struct xfs_defer_op_type;
 
 /*
- * Save a log intent item and a list of extents, so that we can replay
- * whatever action had to happen to the extent list and file the log done
- * item.
- */
-struct xfs_defer_pending {
-	const struct xfs_defer_op_type	*dfp_type;	/* function pointers */
-	struct list_head		dfp_list;	/* pending items */
-	void				*dfp_intent;	/* log intent item */
-	void				*dfp_done;	/* log done item */
-	struct list_head		dfp_work;	/* work items */
-	unsigned int			dfp_count;	/* # extent items */
-};
-
-/*
  * Header for deferred operation list.
  */
 enum xfs_defer_ops_type {
@@ -34,6 +20,20 @@
 	XFS_DEFER_OPS_TYPE_MAX,
 };
 
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+	struct list_head		dfp_list;	/* pending items */
+	struct list_head		dfp_work;	/* work items */
+	void				*dfp_intent;	/* log intent item */
+	void				*dfp_done;	/* log done item */
+	unsigned int			dfp_count;	/* # extent items */
+	enum xfs_defer_ops_type		dfp_type;
+};
+
 void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
 		struct list_head *h);
 int xfs_defer_finish_noroll(struct xfs_trans **tp);
@@ -43,8 +43,6 @@
 
 /* Description of a deferred type. */
 struct xfs_defer_op_type {
-	enum xfs_defer_ops_type	type;
-	unsigned int		max_items;
 	void (*abort_intent)(void *);
 	void *(*create_done)(struct xfs_trans *, void *, unsigned int);
 	int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
@@ -54,8 +52,13 @@
 	int (*diff_items)(void *, struct list_head *, struct list_head *);
 	void *(*create_intent)(struct xfs_trans *, uint);
 	void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+	unsigned int		max_items;
 };
 
-void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
 
 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152c..867c5de 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -5,20 +5,16 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -114,9 +110,9 @@
 
 	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
 	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_SLEEP | KM_MAYFAIL);
+				    KM_MAYFAIL);
 	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_SLEEP | KM_MAYFAIL);
+				     KM_MAYFAIL);
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
 		kmem_free(mp->m_dir_geo);
 		kmem_free(mp->m_attr_geo);
@@ -221,7 +217,7 @@
 	if (error)
 		return error;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -258,7 +254,7 @@
 		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -357,7 +353,7 @@
 	 * lockdep annotations. Doing this avoids having to add a bunch of lockdep class
 	 * annotations into the reclaim path for the ilock.
 	 */
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
 	args->namelen = name->len;
@@ -426,7 +422,7 @@
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -487,7 +483,7 @@
 	if (rval)
 		return rval;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -703,3 +699,20 @@
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	return 0;
 }
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+	const void	*name,
+	size_t		length)
+{
+	/*
+	 * MAXNAMELEN includes the trailing null, but (name/length) leave it
+	 * out, so use >= for the length check.
+	 */
+	if (length >= MAXNAMELEN)
+		return false;
+
+	/* There shouldn't be any slashes or nulls here */
+	return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
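
The new helper gives on-disk name validation a single home; a plausible
verifier fragment using it (the surrounding entry walk is elided, and dep
stands for a struct xfs_dir2_data_entry being checked):

	/* Reject a corrupt entry name while verifying the buffer. */
	if (!xfs_dir2_namecheck(dep->name, dep->namelen))
		return __this_address;

With MAXNAMELEN at 256, a 255-byte name passes the length test and a 256-byte
name fails it, because the on-disk limit counts the trailing null that the
(name, length) pair leaves out.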
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b..f542447 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@
 unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
 void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
 		struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed591..49e4bc3 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -6,22 +6,19 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_log.h"
 
 /*
@@ -50,21 +47,19 @@
 xfs_dir3_block_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
+	if (!xfs_verify_magic(bp, hdr3->magic))
+		return __this_address;
+
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
-			return __this_address;
 		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return __this_address;
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
 			return __this_address;
-	} else {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-			return __this_address;
 	}
 	return __xfs_dir3_data_check(NULL, bp);
 }
@@ -73,7 +68,7 @@
 xfs_dir3_block_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -90,7 +85,7 @@
 xfs_dir3_block_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -112,6 +107,8 @@
 
 const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
 	.name = "xfs_dir3_block",
+	.magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+		   cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
 	.verify_read = xfs_dir3_block_read_verify,
 	.verify_write = xfs_dir3_block_write_verify,
 	.verify_struct = xfs_dir3_block_verify,
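
The rewritten verifier works because ->magic[] carries the v4 magic in slot 0
and the v5 (CRC) magic in slot 1, so one table lookup replaces the per-branch
magic comparisons. A simplified sketch of the helper this relies on, assuming
the mainline shape of xfs_verify_magic():

static bool example_verify_magic(struct xfs_buf *bp, __be32 dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	/* CRC-enabled (v5) filesystems select the second table slot. */
	idx = xfs_sb_version_hascrc(&mp->m_sb) ? 1 : 0;
	return dmagic == bp->b_ops->magic[idx];
}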
@@ -1095,11 +1092,11 @@
 	 * Copy the directory into a temporary buffer.
 	 * Then pitch the incore inode data so we can make extents.
 	 */
-	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	sfp = kmem_alloc(ifp->if_bytes, 0);
 	memcpy(sfp, oldsfp, ifp->if_bytes);
 
 	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
-	xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+	xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
 	dp->i_d.di_size = 0;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c6..2c79be4 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -6,19 +6,16 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
-#include "xfs_cksum.h"
 #include "xfs_log.h"
 
 static xfs_failaddr_t xfs_dir2_data_freefind_verify(
@@ -50,14 +47,13 @@
 	int			i;		/* leaf index */
 	int			lastfree;	/* last entry was unused */
 	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
-	xfs_mount_t		*mp;		/* filesystem mount point */
+	struct xfs_mount	*mp = bp->b_mount;
 	char			*p;		/* current data position */
 	int			stale;		/* count of stale leaves */
 	struct xfs_name		name;
 	const struct xfs_dir_ops *ops;
 	struct xfs_da_geometry	*geo;
 
-	mp = bp->b_target->bt_mount;
 	geo = mp->m_dir_geo;
 
 	/*
@@ -249,21 +245,19 @@
 xfs_dir3_data_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
+	if (!xfs_verify_magic(bp, hdr3->magic))
+		return __this_address;
+
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-			return __this_address;
 		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return __this_address;
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
 			return __this_address;
-	} else {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
-			return __this_address;
 	}
 	return __xfs_dir3_data_check(NULL, bp);
 }
@@ -300,7 +294,7 @@
 xfs_dir3_data_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -317,7 +311,7 @@
 xfs_dir3_data_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -339,6 +333,8 @@
 
 const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
 	.name = "xfs_dir3_data",
+	.magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+		   cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
 	.verify_read = xfs_dir3_data_read_verify,
 	.verify_write = xfs_dir3_data_write_verify,
 	.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +342,8 @@
 
 static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
 	.name = "xfs_dir3_data_reada",
+	.magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+		   cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
 	.verify_read = xfs_dir3_data_reada_verify,
 	.verify_write = xfs_dir3_data_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e..a53e458 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -6,12 +6,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
@@ -20,8 +19,6 @@
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-#include "xfs_log.h"
 
 /*
  * Local function declarations.
@@ -142,66 +139,46 @@
  */
 static xfs_failaddr_t
 xfs_dir3_leaf_verify(
-	struct xfs_buf		*bp,
-	uint16_t		magic)
+	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	xfs_failaddr_t		fa;
 
-	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-		uint16_t		magic3;
-
-		magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
-							 : XFS_DIR3_LEAFN_MAGIC;
-
-		if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
-			return __this_address;
-		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
-			return __this_address;
-		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-			return __this_address;
-		if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
-			return __this_address;
-	} else {
-		if (leaf->hdr.info.magic != cpu_to_be16(magic))
-			return __this_address;
-	}
+	fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+	if (fa)
+		return fa;
 
 	return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
 }
 
 static void
-__read_verify(
-	struct xfs_buf  *bp,
-	uint16_t	magic)
+xfs_dir3_leaf_read_verify(
+	struct xfs_buf  *bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	     !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
-		fa = xfs_dir3_leaf_verify(bp, magic);
+		fa = xfs_dir3_leaf_verify(bp);
 		if (fa)
 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 	}
 }
 
 static void
-__write_verify(
-	struct xfs_buf  *bp,
-	uint16_t	magic)
+xfs_dir3_leaf_write_verify(
+	struct xfs_buf  *bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
 
-	fa = xfs_dir3_leaf_verify(bp, magic);
+	fa = xfs_dir3_leaf_verify(bp);
 	if (fa) {
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
@@ -216,60 +193,22 @@
 	xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
 }
 
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
-	struct xfs_buf	*bp)
-{
-	return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
-	struct xfs_buf	*bp)
-{
-	__read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
-	struct xfs_buf	*bp)
-{
-	__write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
-	struct xfs_buf	*bp)
-{
-	return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
-	struct xfs_buf	*bp)
-{
-	__read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
-	struct xfs_buf	*bp)
-{
-	__write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
 const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
 	.name = "xfs_dir3_leaf1",
-	.verify_read = xfs_dir3_leaf1_read_verify,
-	.verify_write = xfs_dir3_leaf1_write_verify,
-	.verify_struct = xfs_dir3_leaf1_verify,
+	.magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+		     cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+	.verify_read = xfs_dir3_leaf_read_verify,
+	.verify_write = xfs_dir3_leaf_write_verify,
+	.verify_struct = xfs_dir3_leaf_verify,
 };
 
 const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
 	.name = "xfs_dir3_leafn",
-	.verify_read = xfs_dir3_leafn_read_verify,
-	.verify_write = xfs_dir3_leafn_write_verify,
-	.verify_struct = xfs_dir3_leafn_verify,
+	.magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+		     cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+	.verify_read = xfs_dir3_leaf_read_verify,
+	.verify_write = xfs_dir3_leaf_write_verify,
+	.verify_struct = xfs_dir3_leaf_verify,
 };
 
 int
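
The leaf1/leafn wrapper removal works the same way: the block's own magic16
identifies the format, and the common header checks move into
xfs_da3_blkinfo_verify().  A hedged reconstruction of that shared helper,
close to but not necessarily identical to the version in xfs_da_btree.c:

xfs_failaddr_t
example_da3_blkinfo_verify(
	struct xfs_buf		*bp,
	struct xfs_da3_blkinfo	*hdr3)
{
	struct xfs_mount	*mp = bp->b_mount;

	if (!xfs_verify_magic16(bp, hdr3->hdr.magic))
		return __this_address;

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
			return __this_address;
	}
	return NULL;
}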
@@ -621,43 +560,40 @@
  */
 int						/* error */
 xfs_dir2_leaf_addname(
-	xfs_da_args_t		*args)		/* operation arguments */
+	struct xfs_da_args	*args)		/* operation arguments */
 {
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_trans	*tp = args->trans;
 	__be16			*bestsp;	/* freespace table in leaf */
-	int			compact;	/* need to compact leaves */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	__be16			*tagp;		/* end of data entry */
 	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
+	struct xfs_buf		*lbp;		/* leaf's buffer */
+	struct xfs_dir2_leaf	*leaf;		/* leaf structure */
+	struct xfs_inode	*dp = args->dp;	/* incore directory inode */
+	struct xfs_dir2_data_hdr *hdr;		/* data block header */
+	struct xfs_dir2_data_entry *dep;	/* data block entry */
+	struct xfs_dir2_leaf_entry *lep;	/* leaf entry table pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir2_data_unused *dup;	/* data unused entry */
+	struct xfs_dir2_leaf_tail *ltp;		/* leaf tail pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	int			compact;	/* need to compact leaves */
 	int			error;		/* error return value */
 	int			grown;		/* allocated new data block */
-	int			highstale;	/* index of next stale leaf */
+	int			highstale = 0;	/* index of next stale leaf */
 	int			i;		/* temporary, index */
 	int			index;		/* leaf table position */
-	struct xfs_buf		*lbp;		/* leaf's buffer */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
 	int			length;		/* length of new entry */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
 	int			lfloglow;	/* low leaf logging index */
 	int			lfloghigh;	/* high leaf logging index */
-	int			lowstale;	/* index of prev stale leaf */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	int			lowstale = 0;	/* index of prev stale leaf */
 	int			needbytes;	/* leaf block bytes needed */
 	int			needlog;	/* need to log data header */
 	int			needscan;	/* need to rescan data free */
-	__be16			*tagp;		/* end of data entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
 	xfs_dir2_db_t		use_block;	/* data block number */
-	struct xfs_dir2_data_free *bf;		/* bestfree table */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	trace_xfs_dir2_leaf_addname(args);
 
-	dp = args->dp;
-	tp = args->trans;
-
 	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
 	if (error)
 		return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb343..705c4f5 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -6,12 +6,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
@@ -20,7 +19,6 @@
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
-#include "xfs_cksum.h"
 #include "xfs_log.h"
 
 /*
@@ -34,8 +32,6 @@
 static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
 				 int index, xfs_da_state_blk_t *dblk,
 				 int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
-				     xfs_da_state_blk_t *fblk);
 
 /*
  * Check internal consistency of a leafn block.
@@ -84,23 +80,21 @@
 xfs_dir3_free_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
 
+	if (!xfs_verify_magic(bp, hdr->magic))
+		return __this_address;
+
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
-			return __this_address;
 		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return __this_address;
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
 			return __this_address;
-	} else {
-		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
-			return __this_address;
 	}
 
 	/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -112,7 +106,7 @@
 xfs_dir3_free_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -129,7 +123,7 @@
 xfs_dir3_free_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -151,6 +145,8 @@
 
 const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
 	.name = "xfs_dir3_free",
+	.magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+		   cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
 	.verify_read = xfs_dir3_free_read_verify,
 	.verify_write = xfs_dir3_free_write_verify,
 	.verify_struct = xfs_dir3_free_verify,
@@ -426,24 +422,22 @@
 static int					/* error */
 xfs_dir2_leafn_add(
 	struct xfs_buf		*bp,		/* leaf buffer */
-	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_da_args	*args,		/* operation arguments */
 	int			index)		/* insertion pt for new entry */
 {
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *lep;
+	struct xfs_dir2_leaf_entry *ents;
 	int			compact;	/* compacting stale leaves */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			highstale;	/* next stale entry */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			highstale = 0;	/* next stale entry */
 	int			lfloghigh;	/* high leaf entry logging */
 	int			lfloglow;	/* low leaf entry logging */
-	int			lowstale;	/* previous stale entry */
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_dir2_leaf_entry *ents;
+	int			lowstale = 0;	/* previous stale entry */
 
 	trace_xfs_dir2_leafn_add(args, index);
 
-	dp = args->dp;
-	leaf = bp->b_addr;
 	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
@@ -745,7 +739,8 @@
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	xfs_dir3_leaf_check(dp, bp);
-	ASSERT(leafhdr.count > 0);
+	if (leafhdr.count <= 0)
+		return -EFSCORRUPTED;
 
 	/*
 	 * Look up the hash value in the leaf entries.
@@ -1614,6 +1609,353 @@
 }
 
 /*
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
+ */
+static int
+xfs_dir2_node_add_datablk(
+	struct xfs_da_args	*args,
+	struct xfs_da_state_blk	*fblk,
+	xfs_dir2_db_t		*dbno,
+	struct xfs_buf		**dbpp,
+	struct xfs_buf		**fbpp,
+	int			*findex)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_data_hdr *hdr;
+	struct xfs_dir2_free	*free = NULL;
+	xfs_dir2_db_t		fbno;
+	struct xfs_buf		*fbp;
+	struct xfs_buf		*dbp;
+	__be16			*bests = NULL;
+	int			error;
+
+	/* Not allowed to allocate, return failure. */
+	if (args->total == 0)
+		return -ENOSPC;
+
+	/* Allocate and initialize the new data block.  */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+	if (error)
+		return error;
+	error = xfs_dir3_data_init(args, *dbno, &dbp);
+	if (error)
+		return error;
+
+	/*
+	 * Get the freespace block corresponding to the data block
+	 * that was just allocated.
+	 */
+	fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+	error = xfs_dir2_free_try_read(tp, dp,
+			       xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+	if (error)
+		return error;
+
+	/*
+	 * If there wasn't a freespace block, the read will
+	 * return a NULL fbp.  Allocate and initialize a new one.
+	 */
+	if (!fbp) {
+		error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+		if (error)
+			return error;
+
+		if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+			xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+				__func__, (unsigned long long)dp->i_ino,
+				(long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+				(long long)*dbno, (long long)fbno);
+			if (fblk) {
+				xfs_alert(mp,
+			" fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+					fblk, (unsigned long long)fblk->blkno,
+					fblk->index, fblk->magic);
+			} else {
+				xfs_alert(mp, " ... fblk is NULL");
+			}
+			XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+			return -EFSCORRUPTED;
+		}
+
+		/* Get a buffer for the new block. */
+		error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+		if (error)
+			return error;
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/* Remember the first slot as our empty slot. */
+		freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET)) *
+				dp->d_ops->free_max_bests(args->geo);
+	} else {
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	}
+
+	/* Set the freespace block index from the data block number. */
+	*findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+
+	/* Extend the freespace table if the new data block is off the end. */
+	if (*findex >= freehdr.nvalid) {
+		ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
+		freehdr.nvalid = *findex + 1;
+		bests[*findex] = cpu_to_be16(NULLDATAOFF);
+	}
+
+	/*
+	 * If this entry was for an empty data block (this should always be
+	 * true) then update the header.
+	 */
+	if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+		freehdr.nused++;
+		dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+		xfs_dir2_free_log_header(args, fbp);
+	}
+
+	/* Update the freespace value for the new block in the table. */
+	hdr = dbp->b_addr;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bests[*findex] = bf[0].length;
+
+	*dbpp = dbp;
+	*fbpp = fbp;
+	return 0;
+}
+
+static int
+xfs_dir2_node_find_freeblk(
+	struct xfs_da_args	*args,
+	struct xfs_da_state_blk	*fblk,
+	xfs_dir2_db_t		*dbnop,
+	struct xfs_buf		**fbpp,
+	int			*findexp,
+	int			length)
+{
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_free	*free = NULL;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_buf		*fbp = NULL;
+	xfs_dir2_db_t		firstfbno;
+	xfs_dir2_db_t		lastfbno;
+	xfs_dir2_db_t		ifbno = -1;
+	xfs_dir2_db_t		dbno = -1;
+	xfs_dir2_db_t		fbno;
+	xfs_fileoff_t		fo;
+	__be16			*bests = NULL;
+	int			findex = 0;
+	int			error;
+
+	/*
+	 * If we came in with a freespace block that means that lookup
+	 * found an entry with our hash value.  This is the freespace
+	 * block for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		free = fbp->b_addr;
+		findex = fblk->index;
+		if (findex >= 0) {
+			/* caller already found the freespace for us. */
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+			ASSERT(findex < freehdr.nvalid);
+			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+			ASSERT(be16_to_cpu(bests[findex]) >= length);
+			dbno = freehdr.firstdb + findex;
+			goto found_block;
+		}
+
+		/*
+		 * The data block looked at didn't have enough room.
+		 * We'll start at the beginning of the freespace entries.
+		 */
+		ifbno = fblk->blkno;
+		xfs_trans_brelse(tp, fbp);
+		fbp = NULL;
+		fblk->bp = NULL;
+	}
+
+	/*
+	 * If we don't have a data block yet, we're going to scan the freespace
+	 * data for a data block with enough free space in it.
+	 */
+	error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+	if (error)
+		return error;
+	lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+	firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
+
+	for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+		/* If it's ifbno we already looked at it. */
+		if (fbno == ifbno)
+			continue;
+
+		/*
+		 * Read the block.  There can be holes in the freespace blocks,
+		 * so this might not succeed.  This should be really rare, so
+		 * there's no reason to avoid it.
+		 */
+		error = xfs_dir2_free_try_read(tp, dp,
+				xfs_dir2_db_to_da(args->geo, fbno),
+				&fbp);
+		if (error)
+			return error;
+		if (!fbp)
+			continue;
+
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/* Scan the free entry array for a large enough free space. */
+		for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
+			if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+			    be16_to_cpu(bests[findex]) >= length) {
+				dbno = freehdr.firstdb + findex;
+				goto found_block;
+			}
+		}
+
+		/* Didn't find free space, go on to next free block */
+		xfs_trans_brelse(tp, fbp);
+	}
+
+found_block:
+	*dbnop = dbno;
+	*fbpp = fbp;
+	*findexp = findex;
+	return 0;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+	struct xfs_da_args	*args,		/* operation arguments */
+	struct xfs_da_state_blk	*fblk)		/* optional freespace block */
+{
+	struct xfs_dir2_data_unused *dup;	/* data unused entry pointer */
+	struct xfs_dir2_data_entry *dep;	/* data entry pointer */
+	struct xfs_dir2_data_hdr *hdr;		/* data block header */
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_free	*free = NULL;	/* freespace block structure */
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*dbp;		/* data block buffer */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	xfs_dir2_data_aoff_t	aoff;
+	xfs_dir2_db_t		dbno;		/* data block number */
+	int			error;		/* error return value */
+	int			findex;		/* freespace entry index */
+	int			length;		/* length of the new entry */
+	int			logfree = 0;	/* need to log free entry */
+	int			needlog = 0;	/* need to log data header */
+	int			needscan = 0;	/* need to rescan data frees */
+	__be16			*tagp;		/* data entry tag pointer */
+	__be16			*bests;
+
+	length = dp->d_ops->data_entsize(args->namelen);
+	error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
+					   length);
+	if (error)
+		return error;
+
+	/*
+	 * Now we know if we must allocate blocks, so if we are checking whether
+	 * we can insert without allocation then we can return now.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		if (dbno == -1)
+			return -ENOSPC;
+		return 0;
+	}
+
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (dbno == -1) {
+		/* we're going to have to log the free block index later */
+		logfree = 1;
+		error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+						  &findex);
+	} else {
+		/* Read the data block in. */
+		error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, dbno),
+					   -1, &dbp);
+	}
+	if (error)
+		return error;
+
+	/* Set up the data block now. */
+	hdr = dbp->b_addr;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	ASSERT(be16_to_cpu(bf[0].length) >= length);
+
+	/* Point to the existing unused space. */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+
+	/* Mark the first part of the unused space, inuse for us. */
+	aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+	error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+			&needlog, &needscan);
+	if (error) {
+		xfs_trans_brelse(tp, dbp);
+		return error;
+	}
+
+	/* Fill in the new entry and log it. */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+
+	/* Rescan the freespace and log the data block if needed. */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+
+	/* If the freespace block entry is now wrong, update it. */
+	free = fbp->b_addr;
+	bests = dp->d_ops->free_bests_p(free);
+	if (bests[findex] != bf[0].length) {
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+
+	/* Log the freespace entry if needed. */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+
+	/* Return the data block and offset in args. */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = be16_to_cpu(*tagp);
+	return 0;
+}
+
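One behavioral point falls out of the split: XFS_DA_OP_JUSTCHECK is now
answered directly from xfs_dir2_node_find_freeblk()'s result, before anything
is allocated, rather than part-way through the old combined scan.  A
caller-side sketch of that contract (illustrative only):

	args->op_flags |= XFS_DA_OP_JUSTCHECK;	/* probe, do not allocate */
	error = xfs_dir2_node_addname(args);
	/* 0       -> the entry would fit in an existing data block */
	/* -ENOSPC -> the insert would require growing the directory */
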
+/*
  * Top-level node form directory addname routine.
  */
 int						/* error */
@@ -1683,383 +2025,6 @@
 }
 
 /*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int					/* error */
-xfs_dir2_node_addname_int(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_db_t		dbno;		/* data block number */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
-	int			error;		/* error return value */
-	xfs_dir2_db_t		fbno;		/* freespace block number */
-	struct xfs_buf		*fbp;		/* freespace buffer */
-	int			findex;		/* freespace entry index */
-	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
-	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
-	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
-	int			length;		/* length of the new entry */
-	int			logfree;	/* need to log free entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data frees */
-	__be16			*tagp;		/* data entry tag pointer */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	__be16			*bests;
-	struct xfs_dir3_icfree_hdr freehdr;
-	struct xfs_dir2_data_free *bf;
-	xfs_dir2_data_aoff_t	aoff;
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	length = dp->d_ops->data_entsize(args->namelen);
-	/*
-	 * If we came in with a freespace block that means that lookup
-	 * found an entry with our hash value.  This is the freespace
-	 * block for that data entry.
-	 */
-	if (fblk) {
-		fbp = fblk->bp;
-		/*
-		 * Remember initial freespace block number.
-		 */
-		ifbno = fblk->blkno;
-		free = fbp->b_addr;
-		findex = fblk->index;
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-		/*
-		 * This means the free entry showed that the data block had
-		 * space for our entry, so we remembered it.
-		 * Use that data block.
-		 */
-		if (findex >= 0) {
-			ASSERT(findex < freehdr.nvalid);
-			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
-			ASSERT(be16_to_cpu(bests[findex]) >= length);
-			dbno = freehdr.firstdb + findex;
-		} else {
-			/*
-			 * The data block looked at didn't have enough room.
-			 * We'll start at the beginning of the freespace entries.
-			 */
-			dbno = -1;
-			findex = 0;
-		}
-	} else {
-		/*
-		 * Didn't come in with a freespace block, so no data block.
-		 */
-		ifbno = dbno = -1;
-		fbp = NULL;
-		findex = 0;
-	}
-
-	/*
-	 * If we don't have a data block yet, we're going to scan the
-	 * freespace blocks looking for one.  Figure out what the
-	 * highest freespace block number is.
-	 */
-	if (dbno == -1) {
-		xfs_fileoff_t	fo;		/* freespace block number */
-
-		if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
-			return error;
-		lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
-		fbno = ifbno;
-	}
-	/*
-	 * While we haven't identified a data block, search the freeblock
-	 * data for a good data block.  If we find a null freeblock entry,
-	 * indicating a hole in the data blocks, remember that.
-	 */
-	while (dbno == -1) {
-		/*
-		 * If we don't have a freeblock in hand, get the next one.
-		 */
-		if (fbp == NULL) {
-			/*
-			 * Happens the first time through unless lookup gave
-			 * us a freespace block to start with.
-			 */
-			if (++fbno == 0)
-				fbno = xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET);
-			/*
-			 * If it's ifbno we already looked at it.
-			 */
-			if (fbno == ifbno)
-				fbno++;
-			/*
-			 * If it's off the end we're done.
-			 */
-			if (fbno >= lastfbno)
-				break;
-			/*
-			 * Read the block.  There can be holes in the
-			 * freespace blocks, so this might not succeed.
-			 * This should be really rare, so there's no reason
-			 * to avoid it.
-			 */
-			error = xfs_dir2_free_try_read(tp, dp,
-					xfs_dir2_db_to_da(args->geo, fbno),
-					&fbp);
-			if (error)
-				return error;
-			if (!fbp)
-				continue;
-			free = fbp->b_addr;
-			findex = 0;
-		}
-		/*
-		 * Look at the current free entry.  Is it good enough?
-		 *
-		 * The bests initialisation should be where the buffer is read in
-		 * the above branch. But gcc is too stupid to realise that bests
-		 * and the freehdr are actually initialised if they are placed
-		 * there, so we have to do it here to avoid warnings. Blech.
-		 */
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-		    be16_to_cpu(bests[findex]) >= length)
-			dbno = freehdr.firstdb + findex;
-		else {
-			/*
-			 * Are we done with the freeblock?
-			 */
-			if (++findex == freehdr.nvalid) {
-				/*
-				 * Drop the block.
-				 */
-				xfs_trans_brelse(tp, fbp);
-				fbp = NULL;
-				if (fblk && fblk->bp)
-					fblk->bp = NULL;
-			}
-		}
-	}
-	/*
-	 * If we don't have a data block, we need to allocate one and make
-	 * the freespace entries refer to it.
-	 */
-	if (unlikely(dbno == -1)) {
-		/*
-		 * Not allowed to allocate, return failure.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return -ENOSPC;
-
-		/*
-		 * Allocate and initialize the new data block.
-		 */
-		if (unlikely((error = xfs_dir2_grow_inode(args,
-							 XFS_DIR2_DATA_SPACE,
-							 &dbno)) ||
-		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
-			return error;
-
-		/*
-		 * If (somehow) we have a freespace block, get rid of it.
-		 */
-		if (fbp)
-			xfs_trans_brelse(tp, fbp);
-		if (fblk && fblk->bp)
-			fblk->bp = NULL;
-
-		/*
-		 * Get the freespace block corresponding to the data block
-		 * that was just allocated.
-		 */
-		fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
-		error = xfs_dir2_free_try_read(tp, dp,
-				       xfs_dir2_db_to_da(args->geo, fbno),
-				       &fbp);
-		if (error)
-			return error;
-
-		/*
-		 * If there wasn't a freespace block, the read will
-		 * return a NULL fbp.  Allocate and initialize a new one.
-		 */
-		if (!fbp) {
-			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
-						    &fbno);
-			if (error)
-				return error;
-
-			if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
-				xfs_alert(mp,
-"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
-					__func__, (unsigned long long)dp->i_ino,
-					(long long)dp->d_ops->db_to_fdb(
-								args->geo, dbno),
-					(long long)dbno, (long long)fbno,
-					(unsigned long long)ifbno, lastfbno);
-				if (fblk) {
-					xfs_alert(mp,
-				" fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
-						fblk,
-						(unsigned long long)fblk->blkno,
-						fblk->index,
-						fblk->magic);
-				} else {
-					xfs_alert(mp, " ... fblk is NULL");
-				}
-				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
-						 XFS_ERRLEVEL_LOW, mp);
-				return -EFSCORRUPTED;
-			}
-
-			/*
-			 * Get a buffer for the new block.
-			 */
-			error = xfs_dir3_free_get_buf(args, fbno, &fbp);
-			if (error)
-				return error;
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-			/*
-			 * Remember the first slot as our empty slot.
-			 */
-			freehdr.firstdb =
-				(fbno - xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET)) *
-					dp->d_ops->free_max_bests(args->geo);
-		} else {
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		}
-
-		/*
-		 * Set the freespace block index from the data block number.
-		 */
-		findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
-		/*
-		 * If it's after the end of the current entries in the
-		 * freespace block, extend that table.
-		 */
-		if (findex >= freehdr.nvalid) {
-			ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
-			freehdr.nvalid = findex + 1;
-			/*
-			 * Tag new entry so nused will go up.
-			 */
-			bests[findex] = cpu_to_be16(NULLDATAOFF);
-		}
-		/*
-		 * If this entry was for an empty data block
-		 * (this should always be true) then update the header.
-		 */
-		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
-			freehdr.nused++;
-			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-			xfs_dir2_free_log_header(args, fbp);
-		}
-		/*
-		 * Update the real value in the table.
-		 * We haven't allocated the data entry yet so this will
-		 * change again.
-		 */
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * We had a data block so we don't have to make a new one.
-	 */
-	else {
-		/*
-		 * If just checking, we succeeded.
-		 */
-		if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-			return 0;
-
-		/*
-		 * Read the data block in.
-		 */
-		error = xfs_dir3_data_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, dbno),
-					   -1, &dbp);
-		if (error)
-			return error;
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		logfree = 0;
-	}
-	ASSERT(be16_to_cpu(bf[0].length) >= length);
-	/*
-	 * Point to the existing unused space.
-	 */
-	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(bf[0].offset));
-	needscan = needlog = 0;
-	/*
-	 * Mark the first part of the unused space, inuse for us.
-	 */
-	aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-	error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
-			&needlog, &needscan);
-	if (error) {
-		xfs_trans_brelse(tp, dbp);
-		return error;
-	}
-	/*
-	 * Fill in the new entry and log it.
-	 */
-	dep = (xfs_dir2_data_entry_t *)dup;
-	dep->inumber = cpu_to_be64(args->inumber);
-	dep->namelen = args->namelen;
-	memcpy(dep->name, args->name, dep->namelen);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(args, dbp, dep);
-	/*
-	 * Rescan the block for bestfree if needed.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	/*
-	 * Log the data block header if needed.
-	 */
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	/*
-	 * If the freespace entry is now wrong, update it.
-	 */
-	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
-	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * Log the freespace entry if needed.
-	 */
-	if (logfree)
-		xfs_dir2_free_log_bests(args, fbp, findex, findex);
-	/*
-	 * Return the data block and offset in args, then drop the data block.
-	 */
-	args->blkno = (xfs_dablk_t)dbno;
-	args->index = be16_to_cpu(*tagp);
-	return 0;
-}
-
-/*
  * Lookup an entry in a node-format directory.
  * All the real work happens in xfs_da3_node_lookup_int.
  * The only real output is the inode number of the entry.
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 585dfdb..85f14fc 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -5,16 +5,13 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_error.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_trace.h"
@@ -167,7 +164,7 @@
 	 * can free the block and copy the formatted data into the inode literal
 	 * area.
 	 */
-	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
 	hdr = bp->b_addr;
 
 	/*
@@ -439,7 +436,7 @@
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	old_isize = (int)dp->i_d.di_size;
-	buf = kmem_alloc(old_isize, KM_SLEEP);
+	buf = kmem_alloc(old_isize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
 	memcpy(oldsfp, sfp, old_isize);
 	/*
@@ -1099,7 +1096,7 @@
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
+	buf = kmem_alloc(oldsize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	ASSERT(oldsfp->i8count == 1);
 	memcpy(buf, oldsfp, oldsize);
@@ -1172,7 +1169,7 @@
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
+	buf = kmem_alloc(oldsize, 0);
 	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	ASSERT(oldsfp->i8count == 0);
 	memcpy(buf, oldsfp, oldsize);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f37..e8bd688 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -16,8 +16,6 @@
 #include "xfs_trans.h"
 #include "xfs_qm.h"
 #include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
 
 int
 xfs_calc_dquots_per_chunk(
@@ -110,7 +108,7 @@
 /*
  * Do some primitive error checking on ondisk dquot data structures.
  */
-int
+void
 xfs_dqblk_repair(
 	struct xfs_mount	*mp,
 	struct xfs_dqblk	*dqb,
@@ -133,8 +131,6 @@
 		xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
 				 XFS_DQUOT_CRC_OFF);
 	}
-
-	return 0;
 }
 
 STATIC bool
@@ -226,7 +222,7 @@
 xfs_dquot_buf_verify_struct(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	return xfs_dquot_buf_verify(mp, bp, false);
 }
@@ -235,7 +231,7 @@
 xfs_dquot_buf_read_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	if (!xfs_dquot_buf_verify_crc(mp, bp, false))
 		return;
@@ -252,7 +248,7 @@
 xfs_dquot_buf_readahead_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
 	    xfs_dquot_buf_verify(mp, bp, true) != NULL) {
@@ -270,13 +266,15 @@
 xfs_dquot_buf_write_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	xfs_dquot_buf_verify(mp, bp, false);
 }
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
 	.name = "xfs_dquot",
+	.magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+		     cpu_to_be16(XFS_DQUOT_MAGIC) },
 	.verify_read = xfs_dquot_buf_read_verify,
 	.verify_write = xfs_dquot_buf_write_verify,
 	.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +282,8 @@
 
 const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
 	.name = "xfs_dquot_ra",
+	.magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+		     cpu_to_be16(XFS_DQUOT_MAGIC) },
 	.verify_read = xfs_dquot_buf_readahead_verify,
 	.verify_write = xfs_dquot_buf_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a1..79e6c4f 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
 #define XFS_ERRTAG_BUF_LRU_REF				31
 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR			32
 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC			33
-#define XFS_ERRTAG_MAX					34
+#define XFS_ERRTAG_IUNLINK_FALLBACK			34
+#define XFS_ERRTAG_MAX					35
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
 #define XFS_RANDOM_BUF_LRU_REF				2
 #define XFS_RANDOM_FORCE_SCRUB_REPAIR			1
 #define XFS_RANDOM_FORCE_SUMMARY_RECALC			1
+#define XFS_RANDOM_IUNLINK_FALLBACK			(XFS_RANDOM_DEFAULT/10)
 
 #endif /* __XFS_ERRORTAG_H_ */
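
With the mainline default of XFS_RANDOM_DEFAULT == 100, the new tag's factor
works out to 10, so the iunlink fallback path triggers on roughly one in ten
evaluations.  A userspace stand-in for the frequency check (the kernel uses
its own PRNG helper; this models only the arithmetic):

#include <stdbool.h>
#include <stdlib.h>

/* Fires on average once per 'randfactor' calls; 1 means always. */
static bool errortag_fires(unsigned int randfactor)
{
	return randfactor && (rand() % randfactor) == 0;
}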
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index afbe336..c968b60 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -287,6 +287,8 @@
 {
 	if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
 		return false;
+	if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+		return false;
 
 	/* check for unknown features in the fs */
 	if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
@@ -357,12 +359,6 @@
 	       (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
 }
 
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-	       (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
 static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
 {
 	return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
@@ -920,6 +916,9 @@
 
 /*
  * Values for di_format
+ *
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
  */
 typedef enum xfs_dinode_fmt {
 	XFS_DINODE_FMT_DEV,		/* xfs_dev_t */
@@ -929,6 +928,13 @@
 	XFS_DINODE_FMT_UUID		/* added long ago, but never used */
 } xfs_dinode_fmt_t;
 
+#define XFS_INODE_FORMAT_STR \
+	{ XFS_DINODE_FMT_DEV,		"dev" }, \
+	{ XFS_DINODE_FMT_LOCAL,		"local" }, \
+	{ XFS_DINODE_FMT_EXTENTS,	"extent" }, \
+	{ XFS_DINODE_FMT_BTREE,		"btree" }, \
+	{ XFS_DINODE_FMT_UUID,		"uuid" }
+
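
The string table feeds __print_symbolic() so tracepoints can render di_format
as text, which is why the comment above asks for the TRACE_DEFINE_ENUMs to be
kept in sync.  A fragment of the usual pattern, with the TRACE_EVENT
boilerplate elided and the field name illustrative:

	TP_printk("ino 0x%llx format %s", __entry->ino,
		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR))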
 /*
  * Inode minimum and maximum sizes.
  */
@@ -1065,7 +1071,7 @@
 #define	XFS_INO_MASK(k)			(uint32_t)((1ULL << (k)) - 1)
 #define	XFS_INO_OFFSET_BITS(mp)		(mp)->m_sb.sb_inopblog
 #define	XFS_INO_AGBNO_BITS(mp)		(mp)->m_sb.sb_agblklog
-#define	XFS_INO_AGINO_BITS(mp)		(mp)->m_agino_log
+#define	XFS_INO_AGINO_BITS(mp)		((mp)->m_ino_geo.agino_log)
 #define	XFS_INO_AGNO_BITS(mp)		(mp)->m_agno_log
 #define	XFS_INO_BITS(mp)		\
 	XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
@@ -1087,6 +1093,8 @@
 	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
 #define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
 	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#define	XFS_FSB_TO_INO(mp, b)	((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+#define	XFS_AGB_TO_AGINO(mp, b)	((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
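
Both new helpers are plain shifts by log2(inodes per block).  Worked example,
assuming 4096-byte blocks with 512-byte inodes so that sb_inopblog == 3:

	/* 8 inodes per block: AG block 10 holds AG inodes 80..87, so */
	XFS_AGB_TO_AGINO(mp, 10);	/* == 10 << 3 == 80 */
	XFS_FSB_TO_INO(mp, 10);		/* == 80 for fs block 10 in AG 0 */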
 
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
 #define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index f3aa593..e9371a8 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -97,7 +97,7 @@
  * For use by backup and restore programs to set the XFS on-disk inode
  * fields di_dmevmask and di_dmstate.  These must be set to exactly and
  * only values previously obtained via xfs_bulkstat!  (Specifically the
- * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
  */
 #ifndef HAVE_FSDMIDATA
 struct fsdmidata {
@@ -124,7 +124,7 @@
 /*
  * Output for XFS_IOC_FSGEOMETRY_V1
  */
-typedef struct xfs_fsop_geom_v1 {
+struct xfs_fsop_geom_v1 {
 	__u32		blocksize;	/* filesystem (data) block size */
 	__u32		rtextsize;	/* realtime extent size		*/
 	__u32		agblocks;	/* fsblocks in an AG		*/
@@ -145,12 +145,39 @@
 	__u32		logsectsize;	/* log sector size, bytes	*/
 	__u32		rtsectsize;	/* realtime sector size, bytes	*/
 	__u32		dirblocksize;	/* directory block size, bytes	*/
-} xfs_fsop_geom_v1_t;
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V4
+ */
+struct xfs_fsop_geom_v4 {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+	__u32		logsunit;	/* log stripe unit, bytes	*/
+};
 
 /*
  * Output for XFS_IOC_FSGEOMETRY
  */
-typedef struct xfs_fsop_geom {
+struct xfs_fsop_geom {
 	__u32		blocksize;	/* filesystem (data) block size */
 	__u32		rtextsize;	/* realtime extent size		*/
 	__u32		agblocks;	/* fsblocks in an AG		*/
@@ -171,8 +198,18 @@
 	__u32		logsectsize;	/* log sector size, bytes	*/
 	__u32		rtsectsize;	/* realtime sector size, bytes	*/
 	__u32		dirblocksize;	/* directory block size, bytes	*/
-	__u32		logsunit;	/* log stripe unit, bytes */
-} xfs_fsop_geom_t;
+	__u32		logsunit;	/* log stripe unit, bytes	*/
+	uint32_t	sick;		/* o: unhealthy fs & rt metadata */
+	uint32_t	checked;	/* o: checked fs & rt metadata	*/
+	__u64		reserved[17];	/* reserved space		*/
+};
+
+#define XFS_FSOP_GEOM_SICK_COUNTERS	(1 << 0)  /* summary counters */
+#define XFS_FSOP_GEOM_SICK_UQUOTA	(1 << 1)  /* user quota */
+#define XFS_FSOP_GEOM_SICK_GQUOTA	(1 << 2)  /* group quota */
+#define XFS_FSOP_GEOM_SICK_PQUOTA	(1 << 3)  /* project quota */
+#define XFS_FSOP_GEOM_SICK_RT_BITMAP	(1 << 4)  /* realtime bitmap */
+#define XFS_FSOP_GEOM_SICK_RT_SUMMARY	(1 << 5)  /* realtime summary */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
@@ -188,28 +225,30 @@
 	__u64  resblks_avail;
 } xfs_fsop_resblks_t;
 
-#define XFS_FSOP_GEOM_VERSION	0
+#define XFS_FSOP_GEOM_VERSION		0
+#define XFS_FSOP_GEOM_VERSION_V5	5
 
-#define XFS_FSOP_GEOM_FLAGS_ATTR	0x0001	/* attributes in use	*/
-#define XFS_FSOP_GEOM_FLAGS_NLINK	0x0002	/* 32-bit nlink values	*/
-#define XFS_FSOP_GEOM_FLAGS_QUOTA	0x0004	/* quotas enabled	*/
-#define XFS_FSOP_GEOM_FLAGS_IALIGN	0x0008	/* inode alignment	*/
-#define XFS_FSOP_GEOM_FLAGS_DALIGN	0x0010	/* large data alignment */
-#define XFS_FSOP_GEOM_FLAGS_SHARED	0x0020	/* read-only shared	*/
-#define XFS_FSOP_GEOM_FLAGS_EXTFLG	0x0040	/* special extent flag	*/
-#define XFS_FSOP_GEOM_FLAGS_DIRV2	0x0080	/* directory version 2	*/
-#define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
-#define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
-#define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
-#define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
-#define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
-#define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
-#define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
-#define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks	*/
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* reverse mapping btree */
-#define XFS_FSOP_GEOM_FLAGS_REFLINK	0x100000 /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_ATTR	(1 << 0)  /* attributes in use	   */
+#define XFS_FSOP_GEOM_FLAGS_NLINK	(1 << 1)  /* 32-bit nlink values   */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA	(1 << 2)  /* quotas enabled	   */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN	(1 << 3)  /* inode alignment	   */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN	(1 << 4)  /* large data alignment  */
+#define XFS_FSOP_GEOM_FLAGS_SHARED	(1 << 5)  /* read-only shared	   */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG	(1 << 6)  /* special extent flag   */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2	(1 << 7)  /* directory version 2   */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2	(1 << 8)  /* log format version 2  */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR	(1 << 9)  /* sector sizes >1BB	   */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2	(1 << 10) /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	(1 << 11) /* 32-bit project IDs	   */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	(1 << 12) /* ASCII only CI names   */
+	/*  -- Do not use --		(1 << 13)    SGI parent pointers   */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB	(1 << 14) /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB	(1 << 15) /* version 5 superblock  */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE	(1 << 16) /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT	(1 << 17) /* free inode btree	   */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES	(1 << 18) /* sparse inode chunks   */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT	(1 << 19) /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK	(1 << 20) /* files can share blocks */
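
Userspace reads these bits back through the geometry ioctl; a minimal sketch,
assuming fd is an open descriptor on an XFS mount and the uapi definitions
above are in scope:

	struct xfs_fsop_geom	geo = { 0 };

	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) == 0 &&
	    (geo.flags & XFS_FSOP_GEOM_FLAGS_REFLINK))
		printf("reflink capable\n");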
 
 /*
  * Minimum and maximum sizes need for growth checks.
@@ -238,6 +277,31 @@
 			 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
 
 /*
+ * Output for XFS_IOC_AG_GEOMETRY
+ */
+struct xfs_ag_geometry {
+	uint32_t	ag_number;	/* i/o: AG number */
+	uint32_t	ag_length;	/* o: length in blocks */
+	uint32_t	ag_freeblks;	/* o: free space */
+	uint32_t	ag_icount;	/* o: inodes allocated */
+	uint32_t	ag_ifree;	/* o: inodes free */
+	uint32_t	ag_sick;	/* o: sick things in ag */
+	uint32_t	ag_checked;	/* o: checked metadata in ag */
+	uint32_t	ag_flags;	/* i/o: flags for this ag */
+	uint64_t	ag_reserved[12];/* o: zero */
+};
+#define XFS_AG_GEOM_SICK_SB	(1 << 0)  /* superblock */
+#define XFS_AG_GEOM_SICK_AGF	(1 << 1)  /* AGF header */
+#define XFS_AG_GEOM_SICK_AGFL	(1 << 2)  /* AGFL header */
+#define XFS_AG_GEOM_SICK_AGI	(1 << 3)  /* AGI header */
+#define XFS_AG_GEOM_SICK_BNOBT	(1 << 4)  /* free space by block */
+#define XFS_AG_GEOM_SICK_CNTBT	(1 << 5)  /* free space by length */
+#define XFS_AG_GEOM_SICK_INOBT	(1 << 6)  /* inode index */
+#define XFS_AG_GEOM_SICK_FINOBT	(1 << 7)  /* free inode index */
+#define XFS_AG_GEOM_SICK_RMAPBT	(1 << 8)  /* reverse mappings */
+#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9)  /* reference counts */
+
+/*
  * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
  */
 typedef struct xfs_growfs_data {
@@ -264,7 +328,7 @@
 	__s32		tv_nsec;	/* and nanoseconds	*/
 } xfs_bstime_t;
 
-typedef struct xfs_bstat {
+struct xfs_bstat {
 	__u64		bs_ino;		/* inode number			*/
 	__u16		bs_mode;	/* type and mode		*/
 	__u16		bs_nlink;	/* number of links		*/
@@ -285,12 +349,70 @@
 #define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
 	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
 	__u16		bs_projid_hi;	/* higher part of project id	*/
-	unsigned char	bs_pad[6];	/* pad space, unused		*/
+	uint16_t	bs_sick;	/* sick inode metadata		*/
+	uint16_t	bs_checked;	/* checked inode metadata	*/
+	unsigned char	bs_pad[2];	/* pad space, unused		*/
 	__u32		bs_cowextsize;	/* cow extent size		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
-} xfs_bstat_t;
+};
+
+/* New bulkstat structure that reports v5 features and fixes padding issues */
+struct xfs_bulkstat {
+	uint64_t	bs_ino;		/* inode number			*/
+	uint64_t	bs_size;	/* file size			*/
+
+	uint64_t	bs_blocks;	/* number of blocks		*/
+	uint64_t	bs_xflags;	/* extended flags		*/
+
+	int64_t		bs_atime;	/* access time, seconds		*/
+	int64_t		bs_mtime;	/* modify time, seconds		*/
+
+	int64_t		bs_ctime;	/* inode change time, seconds	*/
+	int64_t		bs_btime;	/* creation time, seconds	*/
+
+	uint32_t	bs_gen;		/* generation count		*/
+	uint32_t	bs_uid;		/* user id			*/
+	uint32_t	bs_gid;		/* group id			*/
+	uint32_t	bs_projectid;	/* project id			*/
+
+	uint32_t	bs_atime_nsec;	/* access time, nanoseconds	*/
+	uint32_t	bs_mtime_nsec;	/* modify time, nanoseconds	*/
+	uint32_t	bs_ctime_nsec;	/* inode change time, nanoseconds */
+	uint32_t	bs_btime_nsec;	/* creation time, nanoseconds	*/
+
+	uint32_t	bs_blksize;	/* block size			*/
+	uint32_t	bs_rdev;	/* device value			*/
+	uint32_t	bs_cowextsize_blks; /* cow extent size hint, blocks */
+	uint32_t	bs_extsize_blks; /* extent size hint, blocks	*/
+
+	uint32_t	bs_nlink;	/* number of links		*/
+	uint32_t	bs_extents;	/* number of extents		*/
+	uint32_t	bs_aextents;	/* attribute number of extents	*/
+	uint16_t	bs_version;	/* structure version		*/
+	uint16_t	bs_forkoff;	/* inode fork offset in bytes	*/
+
+	uint16_t	bs_sick;	/* sick inode metadata		*/
+	uint16_t	bs_checked;	/* checked inode metadata	*/
+	uint16_t	bs_mode;	/* type and mode		*/
+	uint16_t	bs_pad2;	/* zeroed			*/
+
+	uint64_t	bs_pad[7];	/* zeroed			*/
+};
+
+#define XFS_BULKSTAT_VERSION_V1	(1)
+#define XFS_BULKSTAT_VERSION_V5	(5)
+
+/* bs_sick flags */
+#define XFS_BS_SICK_INODE	(1 << 0)  /* inode core */
+#define XFS_BS_SICK_BMBTD	(1 << 1)  /* data fork */
+#define XFS_BS_SICK_BMBTA	(1 << 2)  /* attr fork */
+#define XFS_BS_SICK_BMBTC	(1 << 3)  /* cow fork */
+#define XFS_BS_SICK_DIR		(1 << 4)  /* directory */
+#define XFS_BS_SICK_XATTR	(1 << 5)  /* extended attributes */
+#define XFS_BS_SICK_SYMLINK	(1 << 6)  /* symbolic link remote target */
+#define XFS_BS_SICK_PARENT	(1 << 7)  /* parent pointers */
 
 /*
  * Project quota id helpers (previously projid was 16bit only
@@ -298,7 +420,7 @@
  * to retain compatibility with "old" filesystems).
  */
 static inline uint32_t
-bstat_get_projid(struct xfs_bstat *bs)
+bstat_get_projid(const struct xfs_bstat *bs)
 {
 	return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
 }
@@ -306,23 +428,79 @@
 /*
  * The user-level BulkStat Request interface structure.
  */
-typedef struct xfs_fsop_bulkreq {
+struct xfs_fsop_bulkreq {
 	__u64		__user *lastip;	/* last inode # pointer		*/
 	__s32		icount;		/* count of entries in buffer	*/
 	void		__user *ubuffer;/* user buffer for inode desc.	*/
 	__s32		__user *ocount;	/* output count pointer		*/
-} xfs_fsop_bulkreq_t;
-
+};
 
 /*
  * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
  */
-typedef struct xfs_inogrp {
+struct xfs_inogrp {
 	__u64		xi_startino;	/* starting inode number	*/
 	__s32		xi_alloccount;	/* # bits set in allocmask	*/
 	__u64		xi_allocmask;	/* mask of allocated inodes	*/
-} xfs_inogrp_t;
+};
 
+/* New inumbers structure that reports v5 features and fixes padding issues */
+struct xfs_inumbers {
+	uint64_t	xi_startino;	/* starting inode number	*/
+	uint64_t	xi_allocmask;	/* mask of allocated inodes	*/
+	uint8_t		xi_alloccount;	/* # bits set in allocmask	*/
+	uint8_t		xi_version;	/* version			*/
+	uint8_t		xi_padding[6];	/* zero				*/
+};
+
+#define XFS_INUMBERS_VERSION_V1	(1)
+#define XFS_INUMBERS_VERSION_V5	(5)
+
+/* Header for bulk inode requests. */
+struct xfs_bulk_ireq {
+	uint64_t	ino;		/* I/O: start with this inode	*/
+	uint32_t	flags;		/* I/O: operation flags		*/
+	uint32_t	icount;		/* I: count of entries in buffer */
+	uint32_t	ocount;		/* O: count of entries filled out */
+	uint32_t	agno;		/* I: see comment for IREQ_AGNO	*/
+	uint64_t	reserved[5];	/* must be zero			*/
+};
+
+/*
+ * Only return results from the specified @agno.  If @ino is zero, start
+ * with the first inode of @agno.
+ */
+#define XFS_BULK_IREQ_AGNO	(1 << 0)
+
+/*
+ * Return bulkstat information for a single inode, where @ino value is a
+ * special value, not a literal inode number.  See the XFS_BULK_IREQ_SPECIAL_*
+ * values below.  Not compatible with XFS_BULK_IREQ_AGNO.
+ */
+#define XFS_BULK_IREQ_SPECIAL	(1 << 1)
+
+#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
+				 XFS_BULK_IREQ_SPECIAL)
+
+/* Operate on the root directory inode. */
+#define XFS_BULK_IREQ_SPECIAL_ROOT	(1)
+
+/*
+ * ioctl structures for v5 bulkstat and inumbers requests
+ */
+struct xfs_bulkstat_req {
+	struct xfs_bulk_ireq	hdr;
+	struct xfs_bulkstat	bulkstat[];
+};
+#define XFS_BULKSTAT_REQ_SIZE(nr)	(sizeof(struct xfs_bulkstat_req) + \
+					 (nr) * sizeof(struct xfs_bulkstat))
+
+struct xfs_inumbers_req {
+	struct xfs_bulk_ireq	hdr;
+	struct xfs_inumbers	inumbers[];
+};
+#define XFS_INUMBERS_REQ_SIZE(nr)	(sizeof(struct xfs_inumbers_req) + \
+					 (nr) * sizeof(struct xfs_inumbers))
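
The flexible-array requests above are sized with these helpers; a userspace
caller might build one like this (a sketch — bulkstat_req_alloc() is a
hypothetical helper, and calloc() stands in for whatever allocator is in use;
its zeroing also satisfies the rule that hdr.flags and hdr.reserved[] must be
zero):

	#include <stdint.h>
	#include <stdlib.h>

	static struct xfs_bulkstat_req *
	bulkstat_req_alloc(uint32_t nr)
	{
		struct xfs_bulkstat_req	*req;

		req = calloc(1, XFS_BULKSTAT_REQ_SIZE(nr));
		if (!req)
			return NULL;
		req->hdr.ino = 0;	/* start from the first inode */
		req->hdr.icount = nr;	/* capacity of bulkstat[] */
		return req;
	}
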
 
 /*
  * Error injection.
@@ -453,7 +631,7 @@
 	xfs_off_t	sx_offset;	/* offset into file */
 	xfs_off_t	sx_length;	/* length from offset */
 	char		sx_pad[16];	/* pad space, unused */
-	xfs_bstat_t	sx_stat;	/* stat of target b4 copy */
+	struct xfs_bstat sx_stat;	/* stat of target b4 copy */
 } xfs_swapext_t;
 
 /*
@@ -502,9 +680,10 @@
 #define XFS_SCRUB_TYPE_UQUOTA	21	/* user quotas */
 #define XFS_SCRUB_TYPE_GQUOTA	22	/* group quotas */
 #define XFS_SCRUB_TYPE_PQUOTA	23	/* project quotas */
+#define XFS_SCRUB_TYPE_FSCOUNTERS 24	/* fs summary counters */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	24
+#define XFS_SCRUB_TYPE_NR	25
 
 /* i: Repair this metadata. */
 #define XFS_SCRUB_IFLAG_REPAIR		(1 << 0)
@@ -590,6 +769,7 @@
 #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_fs_eofblocks)
 /*	XFS_IOC_GETFSMAP ------ hoisted 59         */
 #define XFS_IOC_SCRUB_METADATA	_IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_AG_GEOMETRY	_IOWR('X', 61, struct xfs_ag_geometry)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
@@ -620,8 +800,11 @@
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
-#define XFS_IOC_FSGEOMETRY	     _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_FSGEOMETRY_V4	     _IOR ('X', 124, struct xfs_fsop_geom_v4)
 #define XFS_IOC_GOINGDOWN	     _IOR ('X', 125, uint32_t)
+#define XFS_IOC_FSGEOMETRY	     _IOR ('X', 126, struct xfs_fsop_geom)
+#define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
+#define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
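
Tying the new interfaces together, a v5 bulkstat walk might look like the
sketch below; it assumes the request was built as in the earlier helper and
that the kernel advances hdr.ino past the last inode returned, so the caller
simply reissues the ioctl until ocount comes back zero:

	for (;;) {
		if (ioctl(fd, XFS_IOC_BULKSTAT, req) < 0)
			break;		/* real code would inspect errno */
		if (req->hdr.ocount == 0)
			break;		/* no more inodes */
		for (uint32_t i = 0; i < req->hdr.ocount; i++)
			printf("ino %llu size %llu\n",
			       (unsigned long long)req->bulkstat[i].bs_ino,
			       (unsigned long long)req->bulkstat[i].bs_size);
	}
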
 
 
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 0000000..272005a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs.  Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful.  If
+ * problems remain at unmount time, we can also request manual intervention by
+ * logging a notice to run xfs_repair.
+ *
+ * Each health tracking group uses a pair of fields for reporting.  The
+ * "checked" field tell us if a given piece of metadata has ever been examined,
+ * and the "sick" field tells us if that piece was found to need repairs.
+ * Therefore we can conclude that for a given sick flag value:
+ *
+ *  - checked && sick  => metadata needs repair
+ *  - checked && !sick => metadata is ok
+ *  - !checked         => has not been examined since mount
+ */
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+struct xfs_fsop_geom;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS	(1 << 0)  /* summary counters */
+#define XFS_SICK_FS_UQUOTA	(1 << 1)  /* user quota */
+#define XFS_SICK_FS_GQUOTA	(1 << 2)  /* group quota */
+#define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
+#define XFS_SICK_AG_AGF		(1 << 1)  /* AGF header */
+#define XFS_SICK_AG_AGFL	(1 << 2)  /* AGFL header */
+#define XFS_SICK_AG_AGI		(1 << 3)  /* AGI header */
+#define XFS_SICK_AG_BNOBT	(1 << 4)  /* free space by block */
+#define XFS_SICK_AG_CNTBT	(1 << 5)  /* free space by length */
+#define XFS_SICK_AG_INOBT	(1 << 6)  /* inode index */
+#define XFS_SICK_AG_FINOBT	(1 << 7)  /* free inode index */
+#define XFS_SICK_AG_RMAPBT	(1 << 8)  /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT	(1 << 9)  /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE	(1 << 0)  /* inode core */
+#define XFS_SICK_INO_BMBTD	(1 << 1)  /* data fork */
+#define XFS_SICK_INO_BMBTA	(1 << 2)  /* attr fork */
+#define XFS_SICK_INO_BMBTC	(1 << 3)  /* cow fork */
+#define XFS_SICK_INO_DIR	(1 << 4)  /* directory */
+#define XFS_SICK_INO_XATTR	(1 << 5)  /* extended attributes */
+#define XFS_SICK_INO_SYMLINK	(1 << 6)  /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT	(1 << 7)  /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY	(XFS_SICK_FS_COUNTERS | \
+				 XFS_SICK_FS_UQUOTA | \
+				 XFS_SICK_FS_GQUOTA | \
+				 XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
+				 XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
+				 XFS_SICK_AG_AGF | \
+				 XFS_SICK_AG_AGFL | \
+				 XFS_SICK_AG_AGI | \
+				 XFS_SICK_AG_BNOBT | \
+				 XFS_SICK_AG_CNTBT | \
+				 XFS_SICK_AG_INOBT | \
+				 XFS_SICK_AG_FINOBT | \
+				 XFS_SICK_AG_RMAPBT | \
+				 XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY	(XFS_SICK_INO_CORE | \
+				 XFS_SICK_INO_BMBTD | \
+				 XFS_SICK_INO_BMBTA | \
+				 XFS_SICK_INO_BMBTC | \
+				 XFS_SICK_INO_DIR | \
+				 XFS_SICK_INO_XATTR | \
+				 XFS_SICK_INO_SYMLINK | \
+				 XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_health_unmount(struct xfs_mount *mp);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_fs_measure_sickness(mp, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_rt_measure_sickness(mp, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_ag_measure_sickness(pag, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_inode_measure_sickness(ip, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+	return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+	return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+	return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+	return !xfs_inode_has_sickness(ip, -1U);
+}
+
+void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
+void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+
+#endif	/* __XFS_HEALTH_H__ */
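
As a consumer-side illustration of the checked/sick contract described above
(a sketch only — xfs_health_warn() is a hypothetical reporting hook, not a
function added by this patch):

	static void
	example_warn_if_unfixed(
		struct xfs_mount	*mp)
	{
		unsigned int		sick, checked;

		xfs_fs_measure_sickness(mp, &sick, &checked);

		/* checked && sick => needs repair; !checked => never examined */
		if (sick & checked & XFS_SICK_FS_COUNTERS)
			xfs_health_warn(mp, "summary counters need repair");
	}
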
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a8f6db7..588d446 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -12,17 +12,14 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_icreate_item.h"
@@ -31,20 +28,6 @@
 #include "xfs_log.h"
 #include "xfs_rmap.h"
 
-
-/*
- * Allocation group level functions.
- */
-int
-xfs_ialloc_cluster_alignment(
-	struct xfs_mount	*mp)
-{
-	if (xfs_sb_version_hasalign(&mp->m_sb) &&
-	    mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
-		return mp->m_sb.sb_inoalignmt;
-	return 1;
-}
-
 /*
  * Lookup a record by ino in the btree given by cur.
  */
@@ -288,7 +271,7 @@
 {
 	struct xfs_buf		*fbuf;
 	struct xfs_dinode	*free;
-	int			nbufs, blks_per_cluster, inodes_per_cluster;
+	int			nbufs;
 	int			version;
 	int			i, j;
 	xfs_daddr_t		d;
@@ -299,9 +282,7 @@
 	 * sizes, manipulate the inodes in buffers which are multiples of the
 	 * block size.
 	 */
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
-	nbufs = length / blks_per_cluster;
+	nbufs = length / M_IGEO(mp)->blocks_per_cluster;
 
 	/*
 	 * Figure out what version number to use in the inodes we create.  If
@@ -312,7 +293,7 @@
 	 *
 	 * For v3 inodes, we also need to write the inode number into the inode,
 	 * so calculate the first inode number of the chunk here as
-	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
 	 * across multiple filesystem blocks (such as a cluster) and so cannot
 	 * be used in the cluster buffer loop below.
 	 *
@@ -324,8 +305,7 @@
 	 */
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		version = 3;
-		ino = XFS_AGINO_TO_INO(mp, agno,
-				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
 
 		/*
 		 * log the initialisation that is about to take place as an
@@ -345,9 +325,11 @@
 		/*
 		 * Get the block.
 		 */
-		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
+				(j * M_IGEO(mp)->blocks_per_cluster));
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster,
+					 mp->m_bsize *
+					 M_IGEO(mp)->blocks_per_cluster,
 					 XBF_UNMAPPED);
 		if (!fbuf)
 			return -ENOMEM;
@@ -355,7 +337,7 @@
 		/* Initialize the inode buffers and log them appropriately. */
 		fbuf->b_ops = &xfs_inode_buf_ops;
 		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
-		for (i = 0; i < inodes_per_cluster; i++) {
+		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
 			uint	isize = xfs_dinode_size(version);
 
@@ -445,7 +427,7 @@
 		return;
 
 	/* calculate the inode offset and align startino */
-	offset = mod << mp->m_sb.sb_inopblog;
+	offset = XFS_AGB_TO_AGINO(mp, mod);
 	*startino -= offset;
 
 	/*
@@ -618,35 +600,37 @@
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
-STATIC int				/* error code or 0 */
+STATIC int
 xfs_ialloc_ag_alloc(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_buf_t	*agbp,		/* alloc group buffer */
-	int		*alloc)
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	int			*alloc)
 {
-	xfs_agi_t	*agi;		/* allocation group header */
-	xfs_alloc_arg_t	args;		/* allocation argument structure */
-	xfs_agnumber_t	agno;
-	int		error;
-	xfs_agino_t	newino;		/* new first inode's number */
-	xfs_agino_t	newlen;		/* new number of inodes */
-	int		isaligned = 0;	/* inode allocation at stripe unit */
-					/* boundary */
-	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_agi		*agi;
+	struct xfs_alloc_arg	args;
+	xfs_agnumber_t		agno;
+	int			error;
+	xfs_agino_t		newino;		/* new first inode's number */
+	xfs_agino_t		newlen;		/* new number of inodes */
+	int			isaligned = 0;	/* inode allocation at stripe
+						 * unit boundary */
+	uint16_t		allocmask = (uint16_t) -1; /* init. to full chunk */
 	struct xfs_inobt_rec_incore rec;
-	struct xfs_perag *pag;
-	int		do_sparse = 0;
+	struct xfs_perag	*pag;
+	struct xfs_ino_geometry	*igeo = M_IGEO(tp->t_mountp);
+	int			do_sparse = 0;
 
 	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
 	args.fsbno = NULLFSBLOCK;
-	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
+	args.oinfo = XFS_RMAP_OINFO_INODES;
 
 #ifdef DEBUG
 	/* randomly do sparse inode allocations */
 	if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
-	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+	    igeo->ialloc_min_blks < igeo->ialloc_blks)
 		do_sparse = prandom_u32() & 1;
 #endif
 
@@ -654,12 +638,12 @@
 	 * Locking will ensure that we don't have two callers in here
 	 * at one time.
 	 */
-	newlen = args.mp->m_ialloc_inos;
-	if (args.mp->m_maxicount &&
+	newlen = igeo->ialloc_inos;
+	if (igeo->maxicount &&
 	    percpu_counter_read_positive(&args.mp->m_icount) + newlen >
-							args.mp->m_maxicount)
+							igeo->maxicount)
 		return -ENOSPC;
-	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+	args.minlen = args.maxlen = igeo->ialloc_blks;
 	/*
 	 * First try to allocate inodes contiguous with the last-allocated
 	 * chunk of inodes.  If the filesystem is striped, this will fill
@@ -669,7 +653,7 @@
 	newino = be32_to_cpu(agi->agi_newino);
 	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
-		     args.mp->m_ialloc_blks;
+		     igeo->ialloc_blks;
 	if (do_sparse)
 		goto sparse_alloc;
 	if (likely(newino != NULLAGINO &&
@@ -692,10 +676,10 @@
 		 * but not to use them in the actual exact allocation.
 		 */
 		args.alignment = 1;
-		args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
+		args.minalignslop = igeo->cluster_align - 1;
 
 		/* Allow space for the inode btree to split. */
-		args.minleft = args.mp->m_in_maxlevels - 1;
+		args.minleft = igeo->inobt_maxlevels - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 
@@ -722,12 +706,12 @@
 		 * pieces, so don't need alignment anyway.
 		 */
 		isaligned = 0;
-		if (args.mp->m_sinoalign) {
+		if (igeo->ialloc_align) {
 			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
 			args.alignment = args.mp->m_dalign;
 			isaligned = 1;
 		} else
-			args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+			args.alignment = igeo->cluster_align;
 		/*
 		 * Need to figure out where to allocate the inode blocks.
 		 * Ideally they should be spaced out through the a.g.
@@ -743,7 +727,7 @@
 		/*
 		 * Allow space for the inode btree to split.
 		 */
-		args.minleft = args.mp->m_in_maxlevels - 1;
+		args.minleft = igeo->inobt_maxlevels - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
@@ -756,7 +740,7 @@
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.agbno = be32_to_cpu(agi->agi_root);
 		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-		args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+		args.alignment = igeo->cluster_align;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
@@ -766,7 +750,7 @@
 	 * the sparse allocation length is smaller than a full chunk.
 	 */
 	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
-	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    igeo->ialloc_min_blks < igeo->ialloc_blks &&
 	    args.fsbno == NULLFSBLOCK) {
 sparse_alloc:
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -775,7 +759,7 @@
 		args.alignment = args.mp->m_sb.sb_spino_align;
 		args.prod = 1;
 
-		args.minlen = args.mp->m_ialloc_min_blks;
+		args.minlen = igeo->ialloc_min_blks;
 		args.maxlen = args.minlen;
 
 		/*
@@ -791,13 +775,13 @@
 		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
 		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
 					    args.mp->m_sb.sb_inoalignmt) -
-				 args.mp->m_ialloc_blks;
+				 igeo->ialloc_blks;
 
 		error = xfs_alloc_vextent(&args);
 		if (error)
 			return error;
 
-		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
 		ASSERT(newlen <= XFS_INODES_PER_CHUNK);
 		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
 	}
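
As a worked pass through the sparse branch above: on a filesystem with 16
inodes per block (sb_inopblog == 4), a successful 2-block allocation gives
newlen = XFS_AGB_TO_AGINO(mp, 2) = 32 inodes; with the usual 4 inodes per
holemask bit that is 32 / 4 = 8 bits, so allocmask = (1 << 8) - 1 = 0x00ff,
marking the first half of the 64-inode chunk as allocatable.
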
@@ -825,7 +809,7 @@
 	/*
 	 * Convert the results.
 	 */
-	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+	newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
 
 	if (xfs_inobt_issparse(~allocmask)) {
 		/*
@@ -1008,7 +992,7 @@
 		 * space needed for alignment of inode chunks when checking the
 		 * longest contiguous free space in the AG - this prevents us
 		 * from getting ENOSPC because we have free space larger than
-		 * m_ialloc_blks but alignment constraints prevent us from using
+		 * ialloc_blks but alignment constraints prevent us from using
 		 * it.
 		 *
 		 * If we can't find an AG with space for full alignment slack to
@@ -1017,9 +1001,9 @@
 		 * if we fail allocation due to alignment issues then it is most
 		 * likely a real ENOSPC condition.
 		 */
-		ineed = mp->m_ialloc_min_blks;
+		ineed = M_IGEO(mp)->ialloc_min_blks;
 		if (flags && ineed > 1)
-			ineed += xfs_ialloc_cluster_alignment(mp);
+			ineed += M_IGEO(mp)->cluster_align;
 		longest = pag->pagf_longest;
 		if (!longest)
 			longest = pag->pagf_flcount > 0;
@@ -1705,6 +1689,7 @@
 	int			noroom = 0;
 	xfs_agnumber_t		start_agno;
 	struct xfs_perag	*pag;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	int			okalloc = 1;
 
 	if (*IO_agbp) {
@@ -1735,9 +1720,9 @@
 	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
 	 * which will sacrifice the preciseness but improve the performance.
 	 */
-	if (mp->m_maxicount &&
-	    percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
-							> mp->m_maxicount) {
+	if (igeo->maxicount &&
+	    percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
+							> igeo->maxicount) {
 		noroom = 1;
 		okalloc = 0;
 	}
@@ -1849,14 +1834,13 @@
 	int				nextbit;
 	xfs_agblock_t			agbno;
 	int				contigblk;
-	struct xfs_owner_info		oinfo;
 	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
 
 	if (!xfs_inobt_issparse(rec->ir_holemask)) {
 		/* not sparse, calculate extent info directly */
 		xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
-				  mp->m_ialloc_blks, &oinfo);
+				  M_IGEO(mp)->ialloc_blks,
+				  &XFS_RMAP_OINFO_INODES);
 		return;
 	}
 
@@ -1900,7 +1884,7 @@
 		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
 		xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
-				  contigblk, &oinfo);
+				  contigblk, &XFS_RMAP_OINFO_INODES);
 
 		/* reset range to current bit and carry on... */
 		startidx = endidx = nextbit;
@@ -2265,7 +2249,7 @@
 
 	/* check that the returned record contains the required inode */
 	if (rec.ir_startino > agino ||
-	    rec.ir_startino + mp->m_ialloc_inos <= agino)
+	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
 		return -EINVAL;
 
 	/* for untrusted inodes check it is allocated first */
@@ -2292,7 +2276,6 @@
 	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
 	xfs_agino_t	agino;	/* inode number within alloc group */
 	xfs_agnumber_t	agno;	/* allocation group number */
-	int		blks_per_cluster; /* num blocks per inode cluster */
 	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
 	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
 	int		error;	/* error code */
@@ -2338,8 +2321,6 @@
 		return -EINVAL;
 	}
 
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-
 	/*
 	 * For bulkstat and handle lookups, we have an untrusted inode number
 	 * that we have to verify is valid. We cannot do this just by reading
@@ -2359,7 +2340,7 @@
 	 * If the inode cluster size is the same as the blocksize or
 	 * smaller, we get to the buffer by simple arithmetic.
 	 */
-	if (blks_per_cluster == 1) {
+	if (M_IGEO(mp)->blocks_per_cluster == 1) {
 		offset = XFS_INO_TO_OFFSET(mp, ino);
 		ASSERT(offset < mp->m_sb.sb_inopblock);
 
@@ -2375,8 +2356,8 @@
 	 * find the location. Otherwise we have to do a btree
 	 * lookup to find the location.
 	 */
-	if (mp->m_inoalign_mask) {
-		offset_agbno = agbno & mp->m_inoalign_mask;
+	if (M_IGEO(mp)->inoalign_mask) {
+		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
 		chunk_agbno = agbno - offset_agbno;
 	} else {
 		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
@@ -2388,12 +2369,13 @@
 out_map:
 	ASSERT(agbno >= chunk_agbno);
 	cluster_agbno = chunk_agbno +
-		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
+		 M_IGEO(mp)->blocks_per_cluster);
 	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
 		XFS_INO_TO_OFFSET(mp, ino);
 
 	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
-	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
 	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
 
 	/*
@@ -2415,20 +2397,6 @@
 }
 
 /*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
-	xfs_mount_t	*mp)		/* file system mount structure */
-{
-	uint		inodes;
-
-	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
-	mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr,
-							 inodes);
-}
-
-/*
  * Log specified fields for the ag hdr (inode section). The growth of the agi
  * structure over time requires that we interpret the buffer as two logical
  * regions delineated by the end of the unlinked list. This is due to the size
@@ -2499,7 +2467,7 @@
 xfs_agi_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
 	int		i;
 
@@ -2514,7 +2482,7 @@
 	/*
 	 * Validate the magic number of the agi block.
 	 */
-	if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+	if (!xfs_verify_magic(bp, agi->agi_magicnum))
 		return __this_address;
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return __this_address;
@@ -2551,7 +2519,7 @@
 xfs_agi_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2568,7 +2536,7 @@
 xfs_agi_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	xfs_failaddr_t		fa;
 
@@ -2588,6 +2556,7 @@
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
 	.name = "xfs_agi",
+	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
 	.verify_read = xfs_agi_read_verify,
 	.verify_write = xfs_agi_write_verify,
 	.verify_struct = xfs_agi_verify,
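
The new two-entry .magic table pairs the non-CRC (v4) magic with the
CRC-enabled (v5) one; the AGI repeats the same value because its magic never
changed across versions.  A sketch of how xfs_verify_magic() is assumed to
consume the table:

	bool
	xfs_verify_magic(
		struct xfs_buf		*bp,
		__be32			dmagic)
	{
		struct xfs_mount	*mp = bp->b_mount;
		int			idx;

		idx = xfs_sb_version_hascrc(&mp->m_sb);
		if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
			return false;
		return dmagic == bp->b_ops->magic[idx];
	}
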
@@ -2726,8 +2695,8 @@
 	xfs_agino_t		low;
 	xfs_agino_t		high;
 
-	low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0);
-	high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1;
+	low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+	high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
 
 	return xfs_ialloc_has_inode_record(cur, low, high, exists);
 }
@@ -2773,3 +2742,115 @@
 	*freecount = ci.freecount;
 	return 0;
 }
+
+/*
+ * Initialize inode-related geometry information.
+ *
+ * Compute the inode btree min and max levels and set maxicount.
+ *
+ * Set the inode cluster size.  This may still be overridden by the file
+ * system block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to keep a
+ * constant ratio of inode per cluster buffer, but only if mkfs has set the
+ * inode alignment value appropriately for larger cluster sizes.
+ *
+ * Then compute the inode cluster alignment information.
+ */
+void
+xfs_ialloc_setup_geometry(
+	struct xfs_mount	*mp)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	uint64_t		icount;
+	uint			inodes;
+
+	/* Compute inode btree geometry. */
+	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
+	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
+
+	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
+			sbp->sb_inopblock);
+	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
+
+	if (sbp->sb_spino_align)
+		igeo->ialloc_min_blks = sbp->sb_spino_align;
+	else
+		igeo->ialloc_min_blks = igeo->ialloc_blks;
+
+	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
+	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
+			inodes);
+
+	/*
+	 * Set the maximum inode count for this filesystem, being careful not
+	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
+	 * users should never get here due to failing sb verification, but
+	 * certain users (xfs_db) need to be usable even with corrupt metadata.
+	 */
+	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
+		/*
+		 * Make sure the maximum inode count is a multiple
+		 * of the units we allocate inodes in.
+		 */
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		do_div(icount, igeo->ialloc_blks);
+		igeo->maxicount = XFS_FSB_TO_INO(mp,
+				icount * igeo->ialloc_blks);
+	} else {
+		igeo->maxicount = 0;
+	}
+
+	/*
+	 * Compute the desired inode cluster buffer size, which starts at
+	 * 8K and (on v5 filesystems) scales up with larger inode
+	 * sizes.
+	 *
+	 * Preserve the desired inode cluster size because the sparse inodes
+	 * feature uses that desired size (not the actual size) to compute the
+	 * sparse inode alignment.  The mount code validates this value, so we
+	 * cannot change the behavior.
+	 */
+	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int	new_size = igeo->inode_cluster_size_raw;
+
+		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+			igeo->inode_cluster_size_raw = new_size;
+	}
+
+	/* Calculate inode cluster ratios. */
+	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
+		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
+				igeo->inode_cluster_size_raw);
+	else
+		igeo->blocks_per_cluster = 1;
+	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
+	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
+
+	/* Calculate inode cluster alignment. */
+	if (xfs_sb_version_hasalign(&mp->m_sb) &&
+	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
+		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
+	else
+		igeo->cluster_align = 1;
+	igeo->inoalign_mask = igeo->cluster_align - 1;
+	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
+
+	/*
+	 * If we are using stripe alignment, check whether
+	 * the stripe unit is a multiple of the inode alignment
+	 */
+	if (mp->m_dalign && igeo->inoalign_mask &&
+	    !(mp->m_dalign & igeo->inoalign_mask))
+		igeo->ialloc_align = mp->m_dalign;
+	else
+		igeo->ialloc_align = 0;
+}
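
For concreteness, a worked pass through the cluster logic above, assuming a
v5 filesystem with 4096-byte blocks, 512-byte inodes, and sb_inoalignmt of 4
blocks (M_IGEO(mp) is taken to resolve to the mount's struct xfs_ino_geometry):

 - ialloc_inos = max(XFS_INODES_PER_CHUNK = 64, sb_inopblock = 8) = 64, so
   ialloc_blks = 64 >> 3 = 8 blocks per chunk.
 - inode_cluster_size_raw starts at 8192; v5 scaling gives
   8192 * (512 / 256) = 16384 bytes = 4 blocks, and since sb_inoalignmt is at
   least 4 blocks the scaled value sticks.
 - blocks_per_cluster = 16384 / 4096 = 4, inodes_per_cluster = 4 * 8 = 32,
   cluster_align = 4, and inoalign_mask = 3.
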
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index e936b7c..323592d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -23,16 +23,6 @@
 					 * sparse chunks */
 };
 
-/* Calculate and return the number of filesystem blocks per inode cluster */
-static inline int
-xfs_icluster_size_fsb(
-	struct xfs_mount	*mp)
-{
-	if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
-		return 1;
-	return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
-}
-
 /*
  * Make an inode pointer out of the buffer/offset.
  */
@@ -96,13 +86,6 @@
 	uint		flags);		/* flags for inode btree lookup */
 
 /*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
-	struct xfs_mount *mp);		/* file system mount structure */
-
-/*
  * Log specified fields for the ag hdr (inode section)
  */
 void
@@ -168,5 +151,6 @@
 		int *stat);
 
 int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 86c5020..b82992f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -11,14 +11,12 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_rmap.h"
 
@@ -28,7 +26,7 @@
 	struct xfs_btree_cur	*cur,
 	int			level)
 {
-	return cur->bc_mp->m_inobt_mnr[level != 0];
+	return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
 }
 
 STATIC struct xfs_btree_cur *
@@ -84,7 +82,7 @@
 	memset(&args, 0, sizeof(args));
 	args.tp = cur->bc_tp;
 	args.mp = cur->bc_mp;
-	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
+	args.oinfo = XFS_RMAP_OINFO_INOBT;
 	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
 	args.minlen = 1;
 	args.maxlen = 1;
@@ -124,7 +122,7 @@
 	union xfs_btree_ptr	*new,
 	int			*stat)
 {
-	if (cur->bc_mp->m_inotbt_nores)
+	if (cur->bc_mp->m_finobt_nores)
 		return xfs_inobt_alloc_block(cur, start, new, stat);
 	return __xfs_inobt_alloc_block(cur, start, new, stat,
 			XFS_AG_RESV_METADATA);
@@ -136,12 +134,9 @@
 	struct xfs_buf		*bp,
 	enum xfs_ag_resv_type	resv)
 {
-	struct xfs_owner_info	oinfo;
-
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
 	return xfs_free_extent(cur->bc_tp,
 			XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
-			&oinfo, resv);
+			&XFS_RMAP_OINFO_INOBT, resv);
 }
 
 STATIC int
@@ -157,7 +152,7 @@
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
 {
-	if (cur->bc_mp->m_inotbt_nores)
+	if (cur->bc_mp->m_finobt_nores)
 		return xfs_inobt_free_block(cur, bp);
 	return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
 }
@@ -167,7 +162,7 @@
 	struct xfs_btree_cur	*cur,
 	int			level)
 {
-	return cur->bc_mp->m_inobt_mxr[level != 0];
+	return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
 }
 
 STATIC void
@@ -258,11 +253,14 @@
 xfs_inobt_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	xfs_failaddr_t		fa;
 	unsigned int		level;
 
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
 	/*
 	 * During growfs operations, we can't verify the exact owner as the
 	 * perag is not fully initialised and hence not attached to the buffer.
@@ -273,26 +271,19 @@
 	 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
 	 * ever do.
 	 */
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
-	case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		fa = xfs_btree_sblock_v5hdr_verify(bp);
 		if (fa)
 			return fa;
-		/* fall through */
-	case cpu_to_be32(XFS_IBT_MAGIC):
-	case cpu_to_be32(XFS_FIBT_MAGIC):
-		break;
-	default:
-		return __this_address;
 	}
 
 	/* level verification */
 	level = be16_to_cpu(block->bb_level);
-	if (level >= mp->m_in_maxlevels)
+	if (level >= M_IGEO(mp)->inobt_maxlevels)
 		return __this_address;
 
-	return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
+	return xfs_btree_sblock_verify(bp,
+			M_IGEO(mp)->inobt_mxr[level != 0]);
 }
 
 static void
@@ -331,6 +322,16 @@
 
 const struct xfs_buf_ops xfs_inobt_buf_ops = {
 	.name = "xfs_inobt",
+	.magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+	.verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+	.name = "xfs_finobt",
+	.magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+		   cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
 	.verify_read = xfs_inobt_read_verify,
 	.verify_write = xfs_inobt_write_verify,
 	.verify_struct = xfs_inobt_verify,
@@ -392,7 +393,7 @@
 	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_finobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
-	.buf_ops		= &xfs_inobt_buf_ops,
+	.buf_ops		= &xfs_finobt_buf_ops,
 	.diff_two_keys		= xfs_inobt_diff_two_keys,
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
@@ -538,15 +539,57 @@
 
 static xfs_extlen_t
 xfs_inobt_max_size(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
 {
+	xfs_agblock_t		agblocks = xfs_ag_block_count(mp, agno);
+
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
-	if (mp->m_inobt_mxr[0] == 0)
+	if (M_IGEO(mp)->inobt_mxr[0] == 0)
 		return 0;
 
-	return xfs_btree_calc_size(mp->m_inobt_mnr,
-		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
-				XFS_INODES_PER_CHUNK);
+	/*
+	 * The log is permanently allocated, so the space it occupies will
+	 * never be available for the kinds of things that would require btree
+	 * expansion.  We therefore can pretend the space isn't there.
+	 */
+	if (mp->m_sb.sb_logstart &&
+	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+		agblocks -= mp->m_sb.sb_logblocks;
+
+	return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
+				(uint64_t)agblocks * mp->m_sb.sb_inopblock /
+					XFS_INODES_PER_CHUNK);
+}
+
+/* Read AGI and create inobt cursor. */
+int
+xfs_inobt_cur(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		which,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp)
+{
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	ASSERT(*agi_bpp == NULL);
+	ASSERT(*curpp == NULL);
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which);
+	if (!cur) {
+		xfs_trans_brelse(tp, *agi_bpp);
+		*agi_bpp = NULL;
+		return -ENOMEM;
+	}
+	*curpp = cur;
+	return 0;
 }
 
 static int
@@ -557,15 +600,14 @@
 	xfs_btnum_t		btnum,
 	xfs_extlen_t		*tree_blocks)
 {
-	struct xfs_buf		*agbp;
-	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp = NULL;
+	struct xfs_btree_cur	*cur = NULL;
 	int			error;
 
-	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp);
 	if (error)
 		return error;
 
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
 	error = xfs_btree_count_blocks(cur, tree_blocks);
 	xfs_btree_del_cursor(cur, error);
 	xfs_trans_brelse(tp, agbp);
@@ -594,7 +636,7 @@
 	if (error)
 		return error;
 
-	*ask += xfs_inobt_max_size(mp);
+	*ask += xfs_inobt_max_size(mp, agno);
 	*used += tree_len;
 	return 0;
 }
@@ -605,5 +647,5 @@
 	struct xfs_mount	*mp,
 	unsigned long long	len)
 {
-	return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+	return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
 }
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index ebdd0c6..951305e 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -64,5 +64,8 @@
 		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
 extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+		xfs_agnumber_t agno, xfs_btnum_t btnum,
+		struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
 
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd07..7bc8740 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -3,18 +3,14 @@
  * Copyright (c) 2017 Christoph Hellwig.
  */
 
-#include <linux/cache.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_inode.h"
-#include "xfs_inode_fork.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_bmap.h"
 #include "xfs_trace.h"
 
 /*
@@ -614,16 +610,15 @@
 }
 
 /*
- * Increment the sequence counter if we are on a COW fork.  This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed.  We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
  */
-static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
 {
-	if (state & BMAP_COWFORK)
-		WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+	WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
 }
 
 void
@@ -638,7 +633,7 @@
 	struct xfs_iext_leaf	*new = NULL;
 	int			nr_entries, i;
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	if (ifp->if_height == 0)
 		xfs_iext_alloc_root(ifp, cur);
@@ -880,7 +875,7 @@
 	ASSERT(ifp->if_u1.if_root != NULL);
 	ASSERT(xfs_iext_valid(ifp, cur));
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
 	for (i = cur->pos; i < nr_entries; i++)
@@ -988,7 +983,7 @@
 {
 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
 
-	xfs_iext_inc_seq(ifp, state);
+	xfs_iext_inc_seq(ifp);
 
 	if (cur->pos == 0) {
 		struct xfs_bmbt_irec	old;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8c..28ab3c5 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,11 +10,9 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
-#include "xfs_cksum.h"
 #include "xfs_icache.h"
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
@@ -33,12 +31,9 @@
 	xfs_buf_t	*bp)
 {
 	int		i;
-	int		j;
 	xfs_dinode_t	*dip;
 
-	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
-	for (i = 0; i < j; i++) {
+	for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
 		dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
 		if (!dip->di_next_unlinked)  {
 			xfs_alert(mp,
@@ -80,7 +75,7 @@
 	struct xfs_buf	*bp,
 	bool		readahead)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_agnumber_t	agno;
 	int		i;
 	int		ni;
@@ -97,10 +92,9 @@
 
 		dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
 		unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
-		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+		di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
 			xfs_dinode_good_version(mp, dip->di_version) &&
-			(unlinked_ino == NULLAGINO ||
-			 xfs_verify_agino(mp, agno, unlinked_ino));
+			xfs_verify_agino_or_null(mp, agno, unlinked_ino);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP))) {
 			if (readahead) {
@@ -147,12 +141,16 @@
 
 const struct xfs_buf_ops xfs_inode_buf_ops = {
 	.name = "xfs_inode",
+	.magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+		     cpu_to_be16(XFS_DINODE_MAGIC) },
 	.verify_read = xfs_inode_buf_read_verify,
 	.verify_write = xfs_inode_buf_write_verify,
 };
 
 const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
-	.name = "xxfs_inode_ra",
+	.name = "xfs_inode_ra",
+	.magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+		     cpu_to_be16(XFS_DINODE_MAGIC) },
 	.verify_read = xfs_inode_buf_readahead_verify,
 	.verify_write = xfs_inode_buf_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f9acf1d..c643bee 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -3,10 +3,10 @@
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/log2.h>
 
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -19,12 +19,10 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_attr_sf.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_attr_leaf.h"
-#include "xfs_shared.h"
 
 kmem_zone_t *xfs_ifork_zone;
 
@@ -96,7 +94,7 @@
 		return 0;
 
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
 
 	switch (dip->di_aformat) {
 	case XFS_DINODE_FMT_LOCAL:
@@ -149,7 +147,7 @@
 
 	if (size) {
 		real_size = roundup(mem_size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
 		memcpy(ifp->if_u1.if_data, data, size);
 		if (zero_terminate)
 			ifp->if_u1.if_data[size] = '\0';
@@ -304,7 +302,7 @@
 	}
 
 	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+	ifp->if_broot = kmem_alloc(size, KM_NOFS);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -369,7 +367,7 @@
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
 		}
@@ -384,7 +382,7 @@
 		new_max = cur_max + rec_diff;
 		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-				KM_SLEEP | KM_NOFS);
+				KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -410,7 +408,7 @@
 	else
 		new_size = 0;
 	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+		new_broot = kmem_alloc(new_size, KM_NOFS);
 		/*
 		 * First copy over the btree block header.
 		 */
@@ -494,7 +492,7 @@
 	 * We enforce that here.
 	 */
 	ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
-			roundup(new_size, 4), KM_SLEEP | KM_NOFS);
+			roundup(new_size, 4), KM_NOFS);
 	ifp->if_bytes = new_size;
 }
 
@@ -685,7 +683,7 @@
 		return;
 
 	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
-				       KM_SLEEP | KM_NOFS);
+				       KM_NOFS);
 	ip->i_cowfp->if_flags = XFS_IFEXTENTS;
 	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
 	ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2..00c62ce 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@
  */
 struct xfs_ifork {
 	int			if_bytes;	/* bytes in if_u1 */
-	unsigned int		if_seq;		/* cow fork mod counter */
+	unsigned int		if_seq;		/* fork mod counter */
 	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
 	short			if_broot_bytes;	/* bytes allocated for root */
 	unsigned char		if_flags;	/* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 1b542ec..7f55eb3 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -12,9 +12,7 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_trans_space.h"
-#include "xfs_inode.h"
 #include "xfs_da_btree.h"
-#include "xfs_attr_leaf.h"
 #include "xfs_bmap_btree.h"
 
 /*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 4bfdd5f..b2113b1 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -142,7 +142,7 @@
 extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
 		struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
+extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
 		xfs_dqid_t id, uint type);
 
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 542aa14..9a7fadb 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -9,7 +9,6 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_btree.h"
@@ -19,7 +18,6 @@
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_bit.h"
 #include "xfs_refcount.h"
@@ -1176,7 +1174,7 @@
 /*
  * Record a refcount intent for later processing.
  */
-static int
+static void
 __xfs_refcount_add(
 	struct xfs_trans		*tp,
 	enum xfs_refcount_intent_type	type,
@@ -1191,44 +1189,43 @@
 			blockcount);
 
 	ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
-			KM_SLEEP | KM_NOFS);
+			KM_NOFS);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_startblock = startblock;
 	ri->ri_blockcount = blockcount;
 
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
-	return 0;
 }
 
 /*
  * Increase the reference count of the blocks backing a file's extent.
  */
-int
+void
 xfs_refcount_increase_extent(
 	struct xfs_trans		*tp,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
-		return 0;
+		return;
 
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE,
-			PREV->br_startblock, PREV->br_blockcount);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+			PREV->br_blockcount);
 }
 
 /*
  * Decrease the reference count of the blocks backing a file's extent.
  */
-int
+void
 xfs_refcount_decrease_extent(
 	struct xfs_trans		*tp,
 	struct xfs_bmbt_irec		*PREV)
 {
 	if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
-		return 0;
+		return;
 
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE,
-			PREV->br_startblock, PREV->br_blockcount);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+			PREV->br_blockcount);
 }
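
Both wrappers can drop their return value because __xfs_refcount_add() no
longer has a failure path: kmem_alloc() without KM_MAYFAIL blocks until the
allocation succeeds, and xfs_defer_add() returns void, so propagating an
error code up this stack had become dead weight.
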
 
 /*
@@ -1543,47 +1540,40 @@
 }
 
 /* Record a CoW staging extent in the refcount btree. */
-int
+void
 xfs_refcount_alloc_cow_extent(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	int				error;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return 0;
+		return;
 
-	error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
-	if (error)
-		return error;
+	__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
 
 	/* Add rmap entry */
-	return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+	xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
 			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
-int
+void
 xfs_refcount_free_cow_extent(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
-	int				error;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return 0;
+		return;
 
 	/* Remove rmap entry */
-	error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+	xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
 			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
-	if (error)
-		return error;
-
-	return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+	__xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
 }
 
 struct xfs_refcount_recovery {
@@ -1604,7 +1594,7 @@
 	if (be32_to_cpu(rec->refc.rc_refcount) != 1)
 		return -EFSCORRUPTED;
 
-	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
 	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
 	list_add_tail(&rr->rr_list, debris);
 
@@ -1681,10 +1671,8 @@
 		/* Free the orphan record */
 		agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
 		fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
-		error = xfs_refcount_free_cow_extent(tp, fsb,
+		xfs_refcount_free_cow_extent(tp, fsb,
 				rr->rr_rrec.rc_blockcount);
-		if (error)
-			goto out_trans;
 
 		/* Free the block. */
 		xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518..2097955 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@
 	xfs_extlen_t				ri_blockcount;
 };
 
-extern int xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
 		struct xfs_bmbt_irec *irec);
 
 extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@
 		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
 		xfs_extlen_t *flen, bool find_end_of_shared);
 
-extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp,
-		xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp,
-		xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
 		xfs_agnumber_t agno);
 
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1aaa01c..38529db 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -12,12 +12,10 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
-#include "xfs_bmap.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_bit.h"
 #include "xfs_rmap.h"
@@ -70,7 +68,7 @@
 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
 			xfs_refc_block(args.mp));
-	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+	args.oinfo = XFS_RMAP_OINFO_REFC;
 	args.minlen = args.maxlen = args.prod = 1;
 	args.resv = XFS_AG_RESV_METADATA;
 
@@ -106,15 +104,13 @@
 	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
-	struct xfs_owner_info	oinfo;
 	int			error;
 
 	trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
 			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
 	be32_add_cpu(&agf->agf_refcount_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
-	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
 			XFS_AG_RESV_METADATA);
 	if (error)
 		return error;
@@ -205,13 +201,13 @@
 xfs_refcountbt_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_perag	*pag = bp->b_pag;
 	xfs_failaddr_t		fa;
 	unsigned int		level;
 
-	if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+	if (!xfs_verify_magic(bp, block->bb_magic))
 		return __this_address;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -266,6 +262,7 @@
 
 const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
 	.name			= "xfs_refcountbt",
+	.magic			= { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
 	.verify_read		= xfs_refcountbt_read_verify,
 	.verify_write		= xfs_refcountbt_write_verify,
 	.verify_struct		= xfs_refcountbt_verify,
@@ -428,6 +425,15 @@
 	tree_len = be32_to_cpu(agf->agf_refcount_blocks);
 	xfs_trans_brelse(tp, agbp);
 
+	/*
+	 * The log is permanently allocated, so the space it occupies will
+	 * never be available for the kinds of things that would require btree
+	 * expansion.  We therefore can pretend the space isn't there.
+	 */
+	if (mp->m_sb.sb_logstart &&
+	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+		agblocks -= mp->m_sb.sb_logblocks;
+
 	*ask += xfs_refcountbt_max_size(mp, agblocks);
 	*used += tree_len;
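
The new guard only matters for the AG that physically contains the internal log; a short worked example, with hypothetical geometry, of what it changes:

/*
 * Hypothetical AG of 262144 fsblocks hosting a 4096-block internal log
 * (sb_logstart falls in this AG).  The log can never be freed, so the
 * reservation is sized as if those blocks did not exist:
 *
 *	agblocks = 262144 - 4096;			// 258048
 *	*ask += xfs_refcountbt_max_size(mp, 258048);	// not 262144
 */

The identical adjustment appears again below for the rmap btree reservation.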
 
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 245af45..38e9414 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -10,24 +10,17 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_btree.h"
 #include "xfs_trans.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
-#include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
-#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 
 /*
  * Lookup the first record less than or equal to [bno, len, owner, offset]
@@ -175,7 +168,6 @@
 	union xfs_btree_rec	*rec,
 	struct xfs_rmap_irec	*irec)
 {
-	irec->rm_flags = 0;
 	irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
 	irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
 	irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -261,15 +253,15 @@
 			rec->rm_flags);
 
 	if (rec->rm_owner != info->high.rm_owner)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
 	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
 	    rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 
 	*info->irec = *rec;
 	*info->stat = 1;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
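
This hunk and several below retire the private XFS_BTREE_QUERY_RANGE_CONTINUE/ABORT values in favor of plain 0 and -ECANCELED, so a range-query callback signals early termination with a standard errno and the caller squashes it afterwards. A minimal userspace sketch of the convention (the walker and callback names here are illustrative, not kernel API):

#include <errno.h>
#include <stdio.h>

/* Visit each item until the callback returns nonzero. */
static int
walk(const int *items, int n, int (*fn)(int item, void *priv), void *priv)
{
	for (int i = 0; i < n; i++) {
		int error = fn(items[i], priv);

		if (error)
			return error;	/* -ECANCELED or a real error */
	}
	return 0;
}

/* Record the first even item, then cancel the rest of the walk. */
static int
find_first_even(int item, void *priv)
{
	if (item % 2)
		return 0;		/* keep going */
	*(int *)priv = item;
	return -ECANCELED;		/* found it; stop iterating */
}

int
main(void)
{
	int items[] = { 3, 5, 8, 10 };
	int found = -1;
	int error = walk(items, 4, find_first_even, &found);

	if (error == -ECANCELED)	/* early exit was intentional */
		error = 0;
	printf("found=%d error=%d\n", found, error);	/* found=8 error=0 */
	return 0;
}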
 
 /*
@@ -312,7 +304,7 @@
 
 	error = xfs_rmap_query_range(cur, &info.high, &info.high,
 			xfs_rmap_find_left_neighbor_helper, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 	if (*stat)
 		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -337,16 +329,16 @@
 			rec->rm_flags);
 
 	if (rec->rm_owner != info->high.rm_owner)
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
 	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
 	    (rec->rm_offset > info->high.rm_offset ||
 	     rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 
 	*info->irec = *rec;
 	*info->stat = 1;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /*
@@ -384,7 +376,7 @@
 			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
 	error = xfs_rmap_query_range(cur, &info.high, &info.high,
 			xfs_rmap_lookup_le_range_helper, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 	if (*stat)
 		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -458,21 +450,21 @@
  */
 STATIC int
 xfs_rmap_unmap(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	ltrec;
-	uint64_t		ltoff;
-	int			error = 0;
-	int			i;
-	uint64_t		owner;
-	uint64_t		offset;
-	unsigned int		flags;
-	bool			ignore_off;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		ltrec;
+	uint64_t			ltoff;
+	int				error = 0;
+	int				i;
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags;
+	bool				ignore_off;
 
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -653,16 +645,16 @@
  */
 int
 xfs_rmap_free(
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agnumber_t			agno,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_btree_cur	*cur;
-	int			error;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*cur;
+	int				error;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
 		return 0;
@@ -710,23 +702,23 @@
  */
 STATIC int
 xfs_rmap_map(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	ltrec;
-	struct xfs_rmap_irec	gtrec;
-	int			have_gt;
-	int			have_lt;
-	int			error = 0;
-	int			i;
-	uint64_t		owner;
-	uint64_t		offset;
-	unsigned int		flags = 0;
-	bool			ignore_off;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		ltrec;
+	struct xfs_rmap_irec		gtrec;
+	int				have_gt;
+	int				have_lt;
+	int				error = 0;
+	int				i;
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags = 0;
+	bool				ignore_off;
 
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	ASSERT(owner != 0);
@@ -890,16 +882,16 @@
  */
 int
 xfs_rmap_alloc(
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agnumber_t			agno,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_btree_cur	*cur;
-	int			error;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*cur;
+	int				error;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
 		return 0;
@@ -929,16 +921,16 @@
  */
 STATIC int
 xfs_rmap_convert(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	r[4];	/* neighbor extent entries */
-					/* left is 0, right is 1, prev is 2 */
-					/* new is 3 */
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		r[4];	/* neighbor extent entries */
+						/* left is 0, right is 1, */
+						/* prev is 2, new is 3 */
 	uint64_t		owner;
 	uint64_t		offset;
 	uint64_t		new_endoff;
@@ -1354,16 +1346,16 @@
  */
 STATIC int
 xfs_rmap_convert_shared(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	r[4];	/* neighbor extent entries */
-					/* left is 0, right is 1, prev is 2 */
-					/* new is 3 */
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		r[4];	/* neighbor extent entries */
+						/* left is 0, right is 1, */
+						/* prev is 2, new is 3 */
 	uint64_t		owner;
 	uint64_t		offset;
 	uint64_t		new_endoff;
@@ -1743,20 +1735,20 @@
  */
 STATIC int
 xfs_rmap_unmap_shared(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	ltrec;
-	uint64_t		ltoff;
-	int			error = 0;
-	int			i;
-	uint64_t		owner;
-	uint64_t		offset;
-	unsigned int		flags;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		ltrec;
+	uint64_t			ltoff;
+	int				error = 0;
+	int				i;
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags;
 
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
@@ -1905,22 +1897,22 @@
  */
 STATIC int
 xfs_rmap_map_shared(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	bool			unwritten,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_rmap_irec	ltrec;
-	struct xfs_rmap_irec	gtrec;
-	int			have_gt;
-	int			have_lt;
-	int			error = 0;
-	int			i;
-	uint64_t		owner;
-	uint64_t		offset;
-	unsigned int		flags = 0;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		ltrec;
+	struct xfs_rmap_irec		gtrec;
+	int				have_gt;
+	int				have_lt;
+	int				error = 0;
+	int				i;
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags = 0;
 
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	if (unwritten)
@@ -2275,7 +2267,7 @@
  * Record a rmap intent; the list is kept sorted first by AG and then by
  * increasing age.
  */
-static int
+static void
 __xfs_rmap_add(
 	struct xfs_trans		*tp,
 	enum xfs_rmap_intent_type	type,
@@ -2294,7 +2286,7 @@
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);
+	ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_owner = owner;
@@ -2302,11 +2294,10 @@
 	ri->ri_bmap = *bmap;
 
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
-	return 0;
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_rmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
@@ -2314,15 +2305,15 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
 			whichfork, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_rmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
@@ -2330,9 +2321,9 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
 			whichfork, PREV);
 }
@@ -2343,7 +2334,7 @@
  * Note that tp can be NULL here as no transaction is used for COW fork
  * unwritten conversion.
  */
-int
+void
 xfs_rmap_convert_extent(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -2352,15 +2343,15 @@
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_rmap_update_is_needed(mp, whichfork))
-		return 0;
+		return;
 
-	return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+	__xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
 			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
 			whichfork, PREV);
 }
 
 /* Schedule the creation of an rmap for non-file data. */
-int
+void
 xfs_rmap_alloc_extent(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
@@ -2371,18 +2362,18 @@
 	struct xfs_bmbt_irec	bmap;
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
-		return 0;
+		return;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
 }
 
 /* Schedule the deletion of an rmap for non-file data. */
-int
+void
 xfs_rmap_free_extent(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
@@ -2393,14 +2384,14 @@
 	struct xfs_bmbt_irec	bmap;
 
 	if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
-		return 0;
+		return;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
 	bmap.br_blockcount = len;
 	bmap.br_startoff = 0;
 	bmap.br_state = XFS_EXT_NORM;
 
-	return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+	__xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
 }
 
 /* Compare rmap records.  Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2459,18 +2450,18 @@
  */
 int
 xfs_rmap_record_exists(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo,
-	bool			*has_rmap)
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	bool				*has_rmap)
 {
-	uint64_t		owner;
-	uint64_t		offset;
-	unsigned int		flags;
-	int			has_record;
-	struct xfs_rmap_irec	irec;
-	int			error;
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags;
+	int				has_record;
+	struct xfs_rmap_irec		irec;
+	int				error;
 
 	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
 	ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -2518,7 +2509,7 @@
 	    ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
 		return 0;
 	rks->has_rmap = true;
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /*
@@ -2530,7 +2521,7 @@
 	struct xfs_btree_cur		*cur,
 	xfs_agblock_t			bno,
 	xfs_extlen_t			len,
-	struct xfs_owner_info		*oinfo,
+	const struct xfs_owner_info	*oinfo,
 	bool				*has_rmap)
 {
 	struct xfs_rmap_irec		low = {0};
@@ -2547,6 +2538,37 @@
 
 	error = xfs_rmap_query_range(cur, &low, &high,
 			xfs_rmap_has_other_keys_helper, &rks);
+	if (error < 0)
+		return error;
+
 	*has_rmap = rks.has_rmap;
-	return error;
+	return 0;
 }
+
+const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+	.oi_owner = XFS_RMAP_OWN_NULL,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = {
+	.oi_owner = XFS_RMAP_OWN_UNKNOWN,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_FS = {
+	.oi_owner = XFS_RMAP_OWN_FS,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_LOG = {
+	.oi_owner = XFS_RMAP_OWN_LOG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
+	.oi_owner = XFS_RMAP_OWN_AG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = {
+	.oi_owner = XFS_RMAP_OWN_INOBT,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INODES = {
+	.oi_owner = XFS_RMAP_OWN_INODES,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
+	.oi_owner = XFS_RMAP_OWN_REFC,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
+	.oi_owner = XFS_RMAP_OWN_COW,
+};
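
These shared const objects replace the per-call stack initializers deleted from xfs_rmap.h below; the before/after shape, taken from the xfs_refcountbt_free_block() hunk earlier in this patch:

	/* Before: build owner info on the stack for every call. */
	struct xfs_owner_info	oinfo;

	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
			XFS_AG_RESV_METADATA);

	/* After: point at the shared const object; nothing to initialize. */
	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
			XFS_AG_RESV_METADATA);

This is also what lets the rmap entry points take const struct xfs_owner_info pointers, as the signature changes above show.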
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 157dc72..abe6334 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,16 +7,6 @@
 #define __XFS_RMAP_H__
 
 static inline void
-xfs_rmap_ag_owner(
-	struct xfs_owner_info	*oi,
-	uint64_t		owner)
-{
-	oi->oi_owner = owner;
-	oi->oi_offset = 0;
-	oi->oi_flags = 0;
-}
-
-static inline void
 xfs_rmap_ino_bmbt_owner(
 	struct xfs_owner_info	*oi,
 	xfs_ino_t		ino,
@@ -43,27 +33,13 @@
 		oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
 }
 
-static inline void
-xfs_rmap_skip_owner_update(
-	struct xfs_owner_info	*oi)
-{
-	xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL);
-}
-
 static inline bool
 xfs_rmap_should_skip_owner_update(
-	struct xfs_owner_info	*oi)
+	const struct xfs_owner_info	*oi)
 {
 	return oi->oi_owner == XFS_RMAP_OWN_NULL;
 }
 
-static inline void
-xfs_rmap_any_owner_update(
-	struct xfs_owner_info	*oi)
-{
-	xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN);
-}
-
 /* Reverse mapping functions. */
 
 struct xfs_buf;
@@ -92,6 +68,7 @@
 	if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
 		return -EFSCORRUPTED;
 	irec->rm_offset = XFS_RMAP_OFF(offset);
+	irec->rm_flags = 0;
 	if (offset & XFS_RMAP_OFF_ATTR_FORK)
 		irec->rm_flags |= XFS_RMAP_ATTR_FORK;
 	if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -103,12 +80,12 @@
 
 static inline void
 xfs_owner_info_unpack(
-	struct xfs_owner_info	*oinfo,
-	uint64_t		*owner,
-	uint64_t		*offset,
-	unsigned int		*flags)
+	const struct xfs_owner_info	*oinfo,
+	uint64_t			*owner,
+	uint64_t			*offset,
+	unsigned int			*flags)
 {
-	unsigned int		r = 0;
+	unsigned int			r = 0;
 
 	*owner = oinfo->oi_owner;
 	*offset = oinfo->oi_offset;
@@ -137,10 +114,10 @@
 
 int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
 		   xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
-		   struct xfs_owner_info *oinfo);
+		   const struct xfs_owner_info *oinfo);
 int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
 		  xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
-		  struct xfs_owner_info *oinfo);
+		  const struct xfs_owner_info *oinfo);
 
 int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, uint64_t owner, uint64_t offset,
@@ -185,16 +162,16 @@
 };
 
 /* functions for updating the rmapbt based on bmbt map/unmap operations */
-int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
 		struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *imap);
-int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
 		xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
 
 void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
@@ -218,11 +195,21 @@
 int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, bool *exists);
 int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, struct xfs_owner_info *oinfo,
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo,
 		bool *has_rmap);
 int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, struct xfs_owner_info *oinfo,
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo,
 		bool *has_rmap);
 int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
 
+extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_FS;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_AG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf04..fc78efa 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -9,18 +9,14 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
 #include "xfs_ag_resv.h"
@@ -292,7 +288,7 @@
 xfs_rmapbt_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_perag	*pag = bp->b_pag;
 	xfs_failaddr_t		fa;
@@ -310,7 +306,7 @@
 	 * from the on disk AGF. Again, we can only check against maximum limits
 	 * in this case.
 	 */
-	if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+	if (!xfs_verify_magic(bp, block->bb_magic))
 		return __this_address;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +361,7 @@
 
 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
 	.name			= "xfs_rmapbt",
+	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
 	.verify_read		= xfs_rmapbt_read_verify,
 	.verify_write		= xfs_rmapbt_write_verify,
 	.verify_struct		= xfs_rmapbt_verify,
@@ -577,6 +574,15 @@
 	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
 	xfs_trans_brelse(tp, agbp);
 
+	/*
+	 * The log is permanently allocated, so the space it occupies will
+	 * never be available for the kinds of things that would require btree
+	 * expansion.  We therefore can pretend the space isn't there.
+	 */
+	if (mp->m_sb.sb_logstart &&
+	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+		agblocks -= mp->m_sb.sb_logblocks;
+
 	/* Reserve 1% of the AG or enough for 1 block per record. */
 	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
 	*used += tree_len;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index b228c82..8ea1efc 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -13,15 +13,7 @@
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
-#include "xfs_icache.h"
 #include "xfs_rtalloc.h"
 
 
@@ -505,6 +497,12 @@
 		uint first = (uint)((char *)sp - (char *)bp->b_addr);
 
 		*sp += delta;
+		if (mp->m_rsum_cache) {
+			if (*sp == 0 && log == mp->m_rsum_cache[bbno])
+				mp->m_rsum_cache[bbno]++;
+			if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+				mp->m_rsum_cache[bbno] = log;
+		}
 		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
 	}
 	if (sum)
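
Reading the new hunk: m_rsum_cache[bbno] maintains a conservative lower bound on the smallest summary level with a nonzero count for realtime bitmap block bbno, so the allocator can skip levels known to be empty. A standalone simulation of the two update rules (types and names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define LEVELS	8

static uint32_t counts[LEVELS];	/* free-extent counts per log2 size */
static uint8_t  rsum_cache;	/* every level below this has count 0 */

/* Mirror of the hunk: keep the invariant as one count changes. */
static void
update(int log, int delta)
{
	counts[log] += delta;
	if (counts[log] == 0 && log == rsum_cache)
		rsum_cache++;	/* conservative: may lag the true bound */
	if (counts[log] != 0 && log < rsum_cache)
		rsum_cache = log;
}

int
main(void)
{
	update(0, +1);				/* a size-1 extent exists */
	update(2, +1);				/* and a size-4 extent */
	printf("cache=%u\n", rsum_cache);	/* 0 */
	update(0, -1);				/* size-1 extent consumed */
	printf("cache=%u\n", rsum_cache);	/* 1: level 0 now empty */
	update(0, +1);				/* small extent freed again */
	printf("cache=%u\n", rsum_cache);	/* back to 0 */
	return 0;
}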
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 081f46e..ac6cdca 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -10,26 +10,20 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_log.h"
 #include "xfs_rmap_btree.h"
-#include "xfs_bmap.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_da_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_health.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -225,10 +219,11 @@
 	struct xfs_buf		*bp,
 	struct xfs_sb		*sbp)
 {
+	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
 	uint32_t		agcount = 0;
 	uint32_t		rem;
 
-	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+	if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
 		xfs_warn(mp, "bad magic number");
 		return -EWRONGFS;
 	}
@@ -684,7 +679,7 @@
 	struct xfs_buf		*bp)
 {
 	struct xfs_sb		sb;
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
 	int			error;
 
@@ -750,7 +745,7 @@
 	struct xfs_buf		*bp)
 {
 	struct xfs_sb		sb;
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	int			error;
 
@@ -781,12 +776,14 @@
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
 	.name = "xfs_sb",
+	.magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
 	.verify_read = xfs_sb_read_verify,
 	.verify_write = xfs_sb_write_verify,
 };
 
 const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
 	.name = "xfs_sb_quiet",
+	.magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
 	.verify_read = xfs_sb_quiet_read_verify,
 	.verify_write = xfs_sb_write_verify,
 };
@@ -796,12 +793,14 @@
  *
  * Mount initialization code establishing various mount
  * fields from the superblock associated with the given
- * mount structure
+ * mount structure.
+ *
+ * Inode geometry is calculated in xfs_ialloc_setup_geometry.
  */
 void
 xfs_sb_mount_common(
-	struct xfs_mount *mp,
-	struct xfs_sb	*sbp)
+	struct xfs_mount	*mp,
+	struct xfs_sb		*sbp)
 {
 	mp->m_agfrotor = mp->m_agirotor = 0;
 	mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -809,7 +808,6 @@
 	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
 	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
-	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
 	mp->m_blockmask = sbp->sb_blocksize - 1;
 	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
 	mp->m_blockwmask = mp->m_blockwsize - 1;
@@ -819,11 +817,6 @@
 	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
 	mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
 
-	mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-	mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
-	mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
-	mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
 	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
 	mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
@@ -840,14 +833,6 @@
 	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
 
 	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
-	mp->m_ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
-					sbp->sb_inopblock);
-	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-
-	if (sbp->sb_spino_align)
-		mp->m_ialloc_min_blks = sbp->sb_spino_align;
-	else
-		mp->m_ialloc_min_blks = mp->m_ialloc_blks;
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 	mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
 }
@@ -874,7 +859,7 @@
 	uint64_t	bfreelst = 0;
 	uint64_t	btree = 0;
 	uint64_t	fdblocks;
-	int		error;
+	int		error = 0;
 
 	for (index = 0; index < agcount; index++) {
 		/*
@@ -902,7 +887,7 @@
 	/*
 	 * If the new summary counts are obviously incorrect, fail the
 	 * mount operation because that implies the AGFs are also corrupt.
-	 * Clear BAD_SUMMARY so that we don't unmount with a dirty log, which
+	 * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
 	 * will prevent xfs_repair from fixing anything.
 	 */
 	if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
@@ -920,7 +905,7 @@
 
 	xfs_reinit_percpu_counters(mp);
 out:
-	mp->m_flags &= ~XFS_MOUNT_BAD_SUMMARY;
+	xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
 	return error;
 }
 
@@ -935,7 +920,7 @@
 	struct xfs_trans	*tp)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*bp = xfs_trans_getsb(tp, mp, 0);
+	struct xfs_buf		*bp = xfs_trans_getsb(tp, mp);
 
 	mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
 	mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
@@ -943,7 +928,7 @@
 
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
 }
 
 /*
@@ -1001,7 +986,7 @@
 
 		bp = xfs_buf_get(mp->m_ddev_targp,
 				 XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
-				 XFS_FSS_TO_BB(mp, 1), 0);
+				 XFS_FSS_TO_BB(mp, 1));
 		/*
 		 * If we get an error reading or writing alternate superblocks,
 		 * continue.  xfs_repair chooses the "best" superblock based
@@ -1065,7 +1050,7 @@
 	if (error)
 		return error;
 
-	bp = xfs_trans_getsb(tp, mp, 0);
+	bp = xfs_trans_getsb(tp, mp);
 	xfs_log_sb(tp);
 	xfs_trans_bhold(tp, bp);
 	xfs_trans_set_sync(tp);
@@ -1081,7 +1066,7 @@
 	return error;
 }
 
-int
+void
 xfs_fs_geometry(
 	struct xfs_sb		*sbp,
 	struct xfs_fsop_geom	*geo,
@@ -1105,17 +1090,18 @@
 	memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid));
 
 	if (struct_version < 2)
-		return 0;
+		return;
 
 	geo->sunit = sbp->sb_unit;
 	geo->swidth = sbp->sb_width;
 
 	if (struct_version < 3)
-		return 0;
+		return;
 
 	geo->version = XFS_FSOP_GEOM_VERSION;
 	geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
-		     XFS_FSOP_GEOM_FLAGS_DIRV2;
+		     XFS_FSOP_GEOM_FLAGS_DIRV2 |
+		     XFS_FSOP_GEOM_FLAGS_EXTFLG;
 	if (xfs_sb_version_hasattr(sbp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
 	if (xfs_sb_version_hasquota(sbp))
@@ -1124,8 +1110,6 @@
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
 	if (xfs_sb_version_hasdalign(sbp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
-	if (xfs_sb_version_hasextflgbit(sbp))
-		geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG;
 	if (xfs_sb_version_hassector(sbp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
 	if (xfs_sb_version_hasasciici(sbp))
@@ -1156,14 +1140,17 @@
 	geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
 
 	if (struct_version < 4)
-		return 0;
+		return;
 
 	if (xfs_sb_version_haslogv2(sbp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
 
 	geo->logsunit = sbp->sb_logsunit;
 
-	return 0;
+	if (struct_version < 5)
+		return;
+
+	geo->version = XFS_FSOP_GEOM_VERSION_V5;
 }
 
 /* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 13564d6..92465a9 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -33,7 +33,7 @@
 extern int	xfs_update_secondary_sbs(struct xfs_mount *mp);
 
 #define XFS_FS_GEOM_MAX_STRUCT_VER	(4)
-extern int	xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
+extern void	xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
 				int struct_version);
 extern int	xfs_sb_read_secondary(struct xfs_mount *mp,
 				struct xfs_trans *tp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe..c45acbd 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
@@ -63,7 +65,6 @@
 #define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
 #define XFS_TRANS_NO_WRITECOUNT 0x40	/* do not elevate SB writecount */
-#define XFS_TRANS_NOFS		0x80	/* pass KM_NOFS to kmem_alloc */
 /*
  * LOWMODE is used by the allocator to activate the lowspace algorithm - when
  * free space is running low the extent allocator may choose to allocate an
@@ -134,4 +135,46 @@
 				 struct xfs_inode *ip, struct xfs_ifork *ifp);
 xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
 
+/* Computed inode geometry for the filesystem. */
+struct xfs_ino_geometry {
+	/* Maximum inode count in this filesystem. */
+	uint64_t	maxicount;
+
+	/* Actual inode cluster buffer size, in bytes. */
+	unsigned int	inode_cluster_size;
+
+	/*
+	 * Desired inode cluster buffer size, in bytes.  This value is not
+	 * rounded up to at least one filesystem block; the unrounded value
+	 * exists solely to validate sb_spino_align.  Runtime code must only
+	 * ever use inode_cluster_size.
+	 */
+	unsigned int	inode_cluster_size_raw;
+
+	/* Inode cluster sizes, adjusted to be at least 1 fsb. */
+	unsigned int	inodes_per_cluster;
+	unsigned int	blocks_per_cluster;
+
+	/* Inode cluster alignment. */
+	unsigned int	cluster_align;
+	unsigned int	cluster_align_inodes;
+	unsigned int	inoalign_mask;	/* mask sb_inoalignmt if used */
+
+	unsigned int	inobt_mxr[2]; /* max inobt btree records */
+	unsigned int	inobt_mnr[2]; /* min inobt btree records */
+	unsigned int	inobt_maxlevels; /* max inobt btree levels. */
+
+	/* Size of inode allocations under normal operation. */
+	unsigned int	ialloc_inos;
+	unsigned int	ialloc_blks;
+
+	/* Minimum inode blocks for a sparse allocation. */
+	unsigned int	ialloc_min_blks;
+
+	/* stripe unit inode alignment */
+	unsigned int	ialloc_align;
+
+	unsigned int	agino_log;	/* #bits for agino in inum */
+};
+
 #endif /* __XFS_SHARED_H__ */
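
Later hunks read this geometry through an M_IGEO() accessor (see xfs_trans_resv.c and xfs_trans_space.h below). A sketch of the expected hookup, assuming the structure is embedded in the mount as the rest of this series implies:

	/* In struct xfs_mount (sketch): */
	struct xfs_ino_geometry	m_ino_geo;

	#define M_IGEO(M)	(&(M)->m_ino_geo)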
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 95374ab..3b8260c 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -11,12 +11,8 @@
 #include "xfs_shared.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
@@ -90,12 +86,12 @@
 xfs_symlink_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
 
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return __this_address;
-	if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+	if (!xfs_verify_magic(bp, dsl->sl_magic))
 		return __this_address;
 	if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
 		return __this_address;
@@ -116,7 +112,7 @@
 xfs_symlink_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
 
 	/* no verification of non-crc buffers */
@@ -136,7 +132,7 @@
 xfs_symlink_write_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_mount *mp = bp->b_mount;
 	struct xfs_buf_log_item	*bip = bp->b_log_item;
 	xfs_failaddr_t		fa;
 
@@ -159,6 +155,7 @@
 
 const struct xfs_buf_ops xfs_symlink_buf_ops = {
 	.name = "xfs_symlink",
+	.magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
 	.verify_read = xfs_symlink_read_verify,
 	.verify_write = xfs_symlink_write_verify,
 	.verify_struct = xfs_symlink_verify,
@@ -199,7 +196,10 @@
 					ifp->if_bytes - 1);
 }
 
-/* Verify the consistency of an inline symlink. */
+/*
+ * Verify the in-memory consistency of an inline symlink data fork. This
+ * does not do on-disk format checks.
+ */
 xfs_failaddr_t
 xfs_symlink_shortform_verify(
 	struct xfs_inode	*ip)
@@ -215,9 +215,12 @@
 	size = ifp->if_bytes;
 	endp = sfp + size;
 
-	/* Zero length symlinks can exist while we're deleting a remote one. */
-	if (size == 0)
-		return NULL;
+	/*
+	 * Zero length symlinks should never occur in memory as they are
+	 * never allowed to exist on disk.
+	 */
+	if (!size)
+		return __this_address;
 
 	/* No negative sizes or overly long symlink targets. */
 	if (size < 0 || size > XFS_SYMLINK_MAXLEN)
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
similarity index 96%
rename from fs/xfs/xfs_trans_inode.c
rename to fs/xfs/libxfs/xfs_trans_inode.c
index 5429273..a9ad909 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,13 +8,10 @@
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
-#include "xfs_trace.h"
 
 #include <linux/iversion.h>
 
@@ -69,6 +66,10 @@
 		inode->i_mtime = tv;
 	if (flags & XFS_ICHGTIME_CHG)
 		inode->i_ctime = tv;
+	if (flags & XFS_ICHGTIME_CREATE) {
+		ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
+		ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+	}
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f99a7ae..d12bbd5 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -15,12 +15,10 @@
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
 #include "xfs_qm.h"
 #include "xfs_trans_space.h"
-#include "xfs_trace.h"
 
 #define _ALLOC	true
 #define _FREE	false
@@ -136,9 +134,10 @@
 xfs_calc_inobt_res(
 	struct xfs_mount	*mp)
 {
-	return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
-				 XFS_FSB_TO_B(mp, 1));
+	return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
+			XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+			XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -167,7 +166,7 @@
  * includes:
  *
  * the allocation btrees: 2 trees * (max depth - 1) * block size
- * the inode chunk: m_ialloc_blks * N
+ * the inode chunk: m_ino_geo.ialloc_blks * N
  *
  * The size N of the inode chunk reservation depends on whether it is for
  * allocation or free and which type of create transaction is in use. An inode
@@ -193,7 +192,7 @@
 		size = XFS_FSB_TO_B(mp, 1);
 	}
 
-	res += xfs_calc_buf_res(mp->m_ialloc_blks, size);
+	res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size);
 	return res;
 }
 
@@ -307,7 +306,7 @@
 	struct xfs_mount        *mp)
 {
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-	       2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+	       2 * M_IGEO(mp)->inode_cluster_size;
 }
 
 /*
@@ -345,7 +344,7 @@
 xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
 {
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+			M_IGEO(mp)->inode_cluster_size;
 }
 
 /*
@@ -876,9 +875,13 @@
 	resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
 	resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
 
+	/* growdata requires permanent res; it can free space to the last AG */
+	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+	resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+	resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
 	/* The following transactions are logged in logical format */
 	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
-	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
 	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
 	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
 	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index a62fb95..88221c7 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -56,9 +56,9 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	((mp)->m_ialloc_blks + \
+	(M_IGEO(mp)->ialloc_blks + \
 	 (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
-	  ((mp)->m_in_maxlevels - 1)))
+	  (M_IGEO(mp)->inobt_maxlevels - 1)))
 
 /*
  * Space reservation values for various transactions.
@@ -94,7 +94,8 @@
 #define	XFS_SYMLINK_SPACE_RES(mp,nl,b)	\
 	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
 #define XFS_IFREE_SPACE_RES(mp)		\
-	(xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+	(xfs_sb_version_hasfinobt(&mp->m_sb) ? \
+			M_IGEO(mp)->inobt_maxlevels : 0)
 
 
 #endif	/* __XFS_TRANS_SPACE_H__ */
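
One pre-existing subtlety survives the XFS_IALLOC_SPACE_RES() rewrite: C precedence binds the trailing multiplication into the ternary's else arm, so the (inobt_maxlevels - 1) term contributes only when the filesystem lacks a finobt. A standalone check of the parse:

#include <stdio.h>

int
main(void)
{
	int hasfinobt, maxlevels = 4;

	/* Parses as: hasfinobt ? 2 : (1 * (maxlevels - 1)) */
	hasfinobt = 1;
	printf("%d\n", hasfinobt ? 2 : 1 * (maxlevels - 1));	/* 2 */
	hasfinobt = 0;
	printf("%d\n", hasfinobt ? 2 : 1 * (maxlevels - 1));	/* 3 */
	return 0;
}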
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 33a5ca3..4f59554 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -7,19 +7,10 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_log_format.h"
 #include "xfs_shared.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
 
 /* Find the size of the AG, in blocks. */
 xfs_agblock_t
@@ -87,16 +78,15 @@
 	 * Calculate the first inode, which will be in the first
 	 * cluster-aligned block after the AGFL.
 	 */
-	bno = round_up(XFS_AGFL_BLOCK(mp) + 1,
-			xfs_ialloc_cluster_alignment(mp));
-	*first = XFS_OFFBNO_TO_AGINO(mp, bno, 0);
+	bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+	*first = XFS_AGB_TO_AGINO(mp, bno);
 
 	/*
 	 * Calculate the last inode, which will be at the end of the
 	 * last (aligned) cluster that can be allocated in the AG.
 	 */
-	bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp));
-	*last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1;
+	bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+	*last = XFS_AGB_TO_AGINO(mp, bno) - 1;
 }
 
 /*
@@ -117,6 +107,19 @@
 }
 
 /*
+ * Verify that an AG inode number pointer is either NULLAGINO or else
+ * points neither outside the AG nor at static metadata.
+ */
+bool
+xfs_verify_agino_or_null(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino)
+{
+	return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
  * Verify that an FS inode number pointer neither points outside the
  * filesystem nor points at static AG metadata.
  */
@@ -173,7 +176,7 @@
 }
 
 /* Calculate the range of valid icount values. */
-static void
+void
 xfs_icount_range(
 	struct xfs_mount	*mp,
 	unsigned long long	*min,
@@ -205,3 +208,14 @@
 	xfs_icount_range(mp, &min, &max);
 	return icount >= min && icount <= max;
 }
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+	struct xfs_mount	*mp,
+	xfs_fileoff_t		dabno)
+{
+	xfs_dablk_t		max_dablk = -1U;
+
+	return dabno <= max_dablk;
+}
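
The new check leans on unsigned conversion: -1U stored in the 32-bit xfs_dablk_t becomes the largest representable dir/attr block number, so comparing the 64-bit file offset against it rejects anything past 2^32 - 1. A standalone confirmation (the typedefs are stand-ins for the kernel's):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t xfs_fileoff_t;	/* 64-bit dir/attr file offset */
typedef uint32_t xfs_dablk_t;	/* 32-bit dir/attr block number */

int
main(void)
{
	xfs_dablk_t	max_dablk = -1U;	/* 0xffffffff */
	xfs_fileoff_t	ok = 0xffffffffULL;
	xfs_fileoff_t	bad = 0x100000000ULL;

	printf("%d %d\n", ok <= max_dablk, bad <= max_dablk);	/* 1 0 */
	return 0;
}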
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b9e6c89..300b3e9 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -100,15 +100,37 @@
  */
 #define MAXNAMELEN	256
 
+/*
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
 typedef enum {
 	XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
 } xfs_lookup_t;
 
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+	{ XFS_LOOKUP_EQi,	"eq" }, \
+	{ XFS_LOOKUP_LEi,	"le" }, \
+	{ XFS_LOOKUP_GEi,	"ge" }
+
+/*
+ * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
+ * please keep the TRACE_DEFINE_ENUMs for it up to date.
+ */
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
 	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
+#define XFS_BTNUM_STRINGS \
+	{ XFS_BTNUM_BNOi,	"bnobt" }, \
+	{ XFS_BTNUM_CNTi,	"cntbt" }, \
+	{ XFS_BTNUM_RMAPi,	"rmapbt" }, \
+	{ XFS_BTNUM_BMAPi,	"bmbt" }, \
+	{ XFS_BTNUM_INOi,	"inobt" }, \
+	{ XFS_BTNUM_FINOi,	"finobt" }, \
+	{ XFS_BTNUM_REFCi,	"refcbt" }
+
 struct xfs_name {
 	const unsigned char	*name;
 	int			len;
@@ -147,6 +169,14 @@
 	xfs_exntst_t	br_state;	/* extent state */
 } xfs_bmbt_irec_t;
 
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+	XFS_AG_RESV_NONE = 0,
+	XFS_AG_RESV_AGFL,
+	XFS_AG_RESV_METADATA,
+	XFS_AG_RESV_RMAPBT,
+};
+
 /*
  * Type verifier functions
  */
@@ -161,10 +191,15 @@
 		xfs_agino_t *first, xfs_agino_t *last);
 bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agino_t agino);
 bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
 bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
+void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
+		unsigned long long *max);
 
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 3068a93..ba0f747 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -9,20 +9,13 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 
 /* Superblock */
 
@@ -32,7 +25,6 @@
 	struct xfs_scrub	*sc,
 	struct xfs_buf		*bp)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_agnumber_t		agno = sc->sm->sm_agno;
 	xfs_agblock_t		agbno;
@@ -49,8 +41,7 @@
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
-	xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 
 	/* scrub teardown will take care of sc->sa for us */
@@ -401,7 +392,7 @@
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
 		return;
 	if (!have) {
-		if (agf->agf_freeblks != be32_to_cpu(0))
+		if (agf->agf_freeblks != cpu_to_be32(0))
 			xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
 		return;
 	}
@@ -484,7 +475,6 @@
 xchk_agf_xref(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_agblock_t		agbno;
 	int			error;
@@ -502,8 +492,7 @@
 	xchk_agf_xref_freeblks(sc);
 	xchk_agf_xref_cntbt(sc);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
-	xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_agf_xref_btreeblks(sc);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 	xchk_agf_xref_refcblks(sc);
@@ -518,6 +507,7 @@
 {
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_agf		*agf;
+	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	xfs_agblock_t		eoag;
@@ -590,6 +580,16 @@
 	if (agfl_count != 0 && fl_count != agfl_count)
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
+	/* Do the incore counters match? */
+	pag = xfs_perag_get(mp, agno);
+	if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
+		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+	if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
+		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+	if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+	xfs_perag_put(pag);
+
 	xchk_agf_xref(sc);
 out:
 	return error;
@@ -598,7 +598,6 @@
 /* AGFL */
 
 struct xchk_agfl_info {
-	struct xfs_owner_info	oinfo;
 	unsigned int		sz_entries;
 	unsigned int		nr_entries;
 	xfs_agblock_t		*entries;
@@ -609,15 +608,14 @@
 STATIC void
 xchk_agfl_block_xref(
 	struct xfs_scrub	*sc,
-	xfs_agblock_t		agbno,
-	struct xfs_owner_info	*oinfo)
+	xfs_agblock_t		agbno)
 {
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		return;
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xchk_xref_is_owned_by(sc, agbno, 1, oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 }
 
@@ -638,10 +636,10 @@
 	else
 		xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
 
-	xchk_agfl_block_xref(sc, agbno, priv);
+	xchk_agfl_block_xref(sc, agbno);
 
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return XFS_BTREE_QUERY_RANGE_ABORT;
+		return -ECANCELED;
 
 	return 0;
 }
@@ -662,7 +660,6 @@
 xchk_agfl_xref(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_agblock_t		agbno;
 	int			error;
@@ -678,8 +675,7 @@
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
-	xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 
 	/*
@@ -732,10 +728,9 @@
 	}
 
 	/* Check the blocks in the AGFL. */
-	xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
 	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
 			sc->sa.agfl_bp, xchk_agfl_block, &sai);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		error = 0;
 		goto out_free;
 	}
@@ -791,7 +786,6 @@
 xchk_agi_xref(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_agblock_t		agbno;
 	int			error;
@@ -808,8 +802,7 @@
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_xref_is_not_inode_chunk(sc, agbno, 1);
 	xchk_agi_xref_icounts(sc);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
-	xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 
 	/* scrub teardown will take care of sc->sa for us */
@@ -822,6 +815,7 @@
 {
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_agi		*agi;
+	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	xfs_agblock_t		eoag;
@@ -875,25 +869,31 @@
 
 	/* Check inode pointers */
 	agino = be32_to_cpu(agi->agi_newino);
-	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(mp, agno, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	agino = be32_to_cpu(agi->agi_dirino);
-	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(mp, agno, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	/* Check unlinked inode buckets */
 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
 		agino = be32_to_cpu(agi->agi_unlinked[i]);
-		if (agino == NULLAGINO)
-			continue;
-		if (!xfs_verify_agino(mp, agno, agino))
+		if (!xfs_verify_agino_or_null(mp, agno, agino))
 			xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 	}
 
 	if (agi->agi_pad32 != cpu_to_be32(0))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
+	/* Do the incore counters match? */
+	pag = xfs_perag_get(mp, agno);
+	if (pag->pagi_count != be32_to_cpu(agi->agi_count))
+		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+	if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
+		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+	xfs_perag_put(pag);
+
 	xchk_agi_xref(sc);
 out:
 	return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index f7568a4..7a1a38b 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -9,22 +9,17 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -341,23 +336,19 @@
 	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
 		[XREP_AGF_BNOBT] = {
 			.rmap_owner = XFS_RMAP_OWN_AG,
-			.buf_ops = &xfs_allocbt_buf_ops,
-			.magic = XFS_ABTB_CRC_MAGIC,
+			.buf_ops = &xfs_bnobt_buf_ops,
 		},
 		[XREP_AGF_CNTBT] = {
 			.rmap_owner = XFS_RMAP_OWN_AG,
-			.buf_ops = &xfs_allocbt_buf_ops,
-			.magic = XFS_ABTC_CRC_MAGIC,
+			.buf_ops = &xfs_cntbt_buf_ops,
 		},
 		[XREP_AGF_RMAPBT] = {
 			.rmap_owner = XFS_RMAP_OWN_AG,
 			.buf_ops = &xfs_rmapbt_buf_ops,
-			.magic = XFS_RMAP_CRC_MAGIC,
 		},
 		[XREP_AGF_REFCOUNTBT] = {
 			.rmap_owner = XFS_RMAP_OWN_REFC,
 			.buf_ops = &xfs_refcountbt_buf_ops,
-			.magic = XFS_REFC_CRC_MAGIC,
 		},
 		[XREP_AGF_END] = {
 			.buf_ops = NULL,
@@ -646,7 +637,6 @@
 xrep_agfl(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_bitmap	agfl_extents;
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_buf		*agf_bp;
@@ -708,8 +698,8 @@
 		goto err;
 
 	/* Dump any AGFL overflow. */
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
-	return xrep_reap_extents(sc, &agfl_extents, &oinfo, XFS_AG_RESV_AGFL);
+	return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+			XFS_AG_RESV_AGFL);
 err:
 	xfs_bitmap_destroy(&agfl_extents);
 	return error;
@@ -876,12 +866,10 @@
 		[XREP_AGI_INOBT] = {
 			.rmap_owner = XFS_RMAP_OWN_INOBT,
 			.buf_ops = &xfs_inobt_buf_ops,
-			.magic = XFS_IBT_CRC_MAGIC,
 		},
 		[XREP_AGI_FINOBT] = {
 			.rmap_owner = XFS_RMAP_OWN_INOBT,
-			.buf_ops = &xfs_inobt_buf_ops,
-			.magic = XFS_FIBT_CRC_MAGIC,
+			.buf_ops = &xfs_finobt_buf_ops,
 		},
 		[XREP_AGI_END] = {
 			.buf_ops = NULL
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 376bcb5..5533e48 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,19 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "scrub/trace.h"
 
 /*
  * Set us up to scrub free space btrees.
@@ -104,7 +97,6 @@
 	xfs_agnumber_t		agno = bs->cur->bc_private.a.agno;
 	xfs_agblock_t		bno;
 	xfs_extlen_t		len;
-	int			error = 0;
 
 	bno = be32_to_cpu(rec->alloc.ar_startblock);
 	len = be32_to_cpu(rec->alloc.ar_blockcount);
@@ -116,7 +108,7 @@
 
 	xchk_allocbt_xref(bs->sc, bno, len);
 
-	return error;
+	return 0;
 }
 
 /* Scrub the freespace btrees for some AG. */
@@ -125,12 +117,10 @@
 	struct xfs_scrub	*sc,
 	xfs_btnum_t		which)
 {
-	struct xfs_owner_info	oinfo;
 	struct xfs_btree_cur	*cur;
 
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 	cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
-	return xchk_btree(sc, cur, xchk_allocbt_rec, &oinfo, NULL);
+	return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
 }
 
 int
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90..0edc7f8 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -9,26 +9,62 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
-#include "scrub/trace.h"
+#include "scrub/attr.h"
 
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
+/*
+ * Allocate enough memory to hold an attr value and attr block bitmaps,
+ * reallocating the buffer if necessary.  Buffer contents are not preserved
+ * across a reallocation.
+ */
+int
+xchk_setup_xattr_buf(
+	struct xfs_scrub	*sc,
+	size_t			value_size,
+	xfs_km_flags_t		flags)
+{
+	size_t			sz;
+	struct xchk_xattr_buf	*ab = sc->buf;
+
+	/*
+	 * We need enough space to read an xattr value from the file or enough
+	 * space to hold three copies of the xattr free space bitmap.  We don't
+	 * need the buffer space for both purposes at the same time.
+	 */
+	sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+	sz = max_t(size_t, sz, value_size);
+
+	/*
+	 * If there's already a buffer, figure out if we need to reallocate it
+	 * to accommodate a larger size.
+	 */
+	if (ab) {
+		if (sz <= ab->sz)
+			return 0;
+		kmem_free(ab);
+		sc->buf = NULL;
+	}
+
+	/*
+	 * Don't zero the buffer upon allocation to avoid runtime overhead.
+	 * All users must be careful never to read uninitialized contents.
+	 */
+	ab = kmem_alloc_large(sizeof(*ab) + sz, flags);
+	if (!ab)
+		return -ENOMEM;
+
+	ab->sz = sz;
+	sc->buf = ab;
+	return 0;
+}
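
The hunks below reach the value buffer and the three bitmaps through helpers from the new scrub/attr.h. A sketch of the layout those helpers imply: one flat allocation whose front doubles as the value buffer or as the used/free/dst bitmaps, since xchk_setup_xattr_buf() sizes it for whichever use is larger and the two uses never overlap (the exact header is an assumption consistent with the code above):

struct xchk_xattr_buf {
	size_t		sz;	/* size of @buf, in bytes */
	uint8_t		buf[];	/* value buffer / bitmap space */
};

static inline uint8_t *
xchk_xattr_valuebuf(struct xfs_scrub *sc)
{
	struct xchk_xattr_buf	*ab = sc->buf;

	return ab->buf;
}

static inline unsigned long *
xchk_xattr_usedmap(struct xfs_scrub *sc)
{
	struct xchk_xattr_buf	*ab = sc->buf;

	return (unsigned long *)ab->buf;
}

static inline unsigned long *
xchk_xattr_freemap(struct xfs_scrub *sc)
{
	return xchk_xattr_usedmap(sc) +
			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
}

static inline unsigned long *
xchk_xattr_dstmap(struct xfs_scrub *sc)
{
	return xchk_xattr_freemap(sc) +
			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
}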
 
 /* Set us up to scrub an inode's extended attributes. */
 int
@@ -36,19 +72,18 @@
 	struct xfs_scrub	*sc,
 	struct xfs_inode	*ip)
 {
-	size_t			sz;
+	int			error;
 
 	/*
-	 * Allocate the buffer without the inode lock held.  We need enough
-	 * space to read every xattr value in the file or enough space to
-	 * hold three copies of the xattr free space bitmap.  (Not both at
-	 * the same time.)
+	 * We failed to get memory while checking attrs, so this time try to
+	 * get all the memory we're ever going to need.  Allocate the buffer
+	 * without the inode lock held, which means we can sleep.
 	 */
-	sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) *
-			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize));
-	sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
-	if (!sc->buf)
-		return -ENOMEM;
+	if (sc->flags & XCHK_TRY_HARDER) {
+		error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
+		if (error)
+			return error;
+	}
 
 	return xchk_setup_inode_contents(sc, ip, 0);
 }
@@ -82,12 +117,36 @@
 
 	sx = container_of(context, struct xchk_xattr, context);
 
+	if (xchk_should_terminate(sx->sc, &error)) {
+		context->seen_enough = error;
+		return;
+	}
+
 	if (flags & XFS_ATTR_INCOMPLETE) {
 		/* Incomplete attr key, just mark the inode for preening. */
 		xchk_ino_set_preen(sx->sc, context->dp->i_ino);
 		return;
 	}
 
+	/* Does this name make sense? */
+	if (!xfs_attr_namecheck(name, namelen)) {
+		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+		return;
+	}
+
+	/*
+	 * Try to allocate enough memory to extract the attr value.  If that
+	 * doesn't work, we overload the seen_enough variable to convey
+	 * the error code back to the main scrub function.
+	 */
+	error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+	if (error == -ENOMEM)
+		error = -EDEADLOCK;
+	if (error) {
+		context->seen_enough = error;
+		return;
+	}
+
 	args.flags = ATTR_KERNOTIME;
 	if (flags & XFS_ATTR_ROOT)
 		args.flags |= ATTR_ROOT;
@@ -100,12 +159,10 @@
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(args.name, args.namelen);
 	args.trans = context->tp;
-	args.value = sx->sc->buf;
-	args.valuelen = XATTR_SIZE_MAX;
+	args.value = xchk_xattr_valuebuf(sx->sc);
+	args.valuelen = valuelen;
 
 	error = xfs_attr_get_ilocked(context->dp, &args);
-	if (error == -EEXIST)
-		error = 0;
 	if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
 			&error))
 		goto fail_xref;
@@ -159,13 +216,12 @@
 	unsigned long			*map,
 	struct xfs_attr3_icleaf_hdr	*leafhdr)
 {
-	unsigned long			*freemap;
-	unsigned long			*dstmap;
+	unsigned long			*freemap = xchk_xattr_freemap(sc);
+	unsigned long			*dstmap = xchk_xattr_dstmap(sc);
 	unsigned int			mapsize = sc->mp->m_attr_geo->blksize;
 	int				i;
 
 	/* Construct bitmap of freemap contents. */
-	freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
 	bitmap_zero(freemap, mapsize);
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
 		if (!xchk_xattr_set_map(sc, freemap,
@@ -175,7 +231,6 @@
 	}
 
 	/* Look for bits that are set in freemap and are marked in use. */
-	dstmap = freemap + BITS_TO_LONGS(mapsize);
 	return bitmap_and(dstmap, freemap, map, mapsize) == 0;
 }
 
@@ -190,13 +245,13 @@
 	char				*buf_end,
 	struct xfs_attr_leafblock	*leaf,
 	struct xfs_attr3_icleaf_hdr	*leafhdr,
-	unsigned long			*usedmap,
 	struct xfs_attr_leaf_entry	*ent,
 	int				idx,
 	unsigned int			*usedbytes,
 	__u32				*last_hashval)
 {
 	struct xfs_mount		*mp = ds->state->mp;
+	unsigned long			*usedmap = xchk_xattr_usedmap(ds->sc);
 	char				*name_end;
 	struct xfs_attr_leaf_name_local	*lentry;
 	struct xfs_attr_leaf_name_remote *rentry;
@@ -256,16 +311,26 @@
 	struct xfs_attr_leafblock	*leaf = bp->b_addr;
 	struct xfs_attr_leaf_entry	*ent;
 	struct xfs_attr_leaf_entry	*entries;
-	unsigned long			*usedmap = ds->sc->buf;
+	unsigned long			*usedmap;
 	char				*buf_end;
 	size_t				off;
 	__u32				last_hashval = 0;
 	unsigned int			usedbytes = 0;
 	unsigned int			hdrsize;
 	int				i;
+	int				error;
 
 	if (*last_checked == blk->blkno)
 		return 0;
+
+	/* Allocate memory for block usage checking. */
+	error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+	if (error == -ENOMEM)
+		return -EDEADLOCK;
+	if (error)
+		return error;
+	usedmap = xchk_xattr_usedmap(ds->sc);
+
 	*last_checked = blk->blkno;
 	bitmap_zero(usedmap, mp->m_attr_geo->blksize);
 
@@ -313,7 +378,7 @@
 
 		/* Check the entry and nameval. */
 		xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
-				usedmap, ent, i, &usedbytes, &last_hashval);
+				ent, i, &usedbytes, &last_hashval);
 
 		if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 			goto out;
@@ -453,6 +518,10 @@
 	error = xfs_attr_list_int_ilocked(&sx.context);
 	if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
 		goto out;
+
+	/* Did our listent function try to return any errors? */
+	if (sx.context.seen_enough < 0)
+		error = sx.context.seen_enough;
 out:
 	return error;
 }
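
xchk_setup_xattr_buf() above implements a grow-only scratch buffer: reuse the existing allocation when it is already large enough, otherwise free and reallocate, with contents deliberately not preserved across the regrow. A compilable userspace sketch of that pattern, under invented names:

#include <stdlib.h>

/* Illustrative names; not the kernel API. */
struct scratch_buf {
	size_t		sz;	/* usable bytes in buf[] */
	unsigned char	buf[];	/* flexible array member */
};

/* Reuse the buffer when it's big enough; otherwise free and reallocate. */
static int setup_scratch(struct scratch_buf **bufp, size_t want)
{
	struct scratch_buf *b = *bufp;

	if (b && want <= b->sz)
		return 0;		/* existing buffer suffices */

	free(b);			/* contents need not survive a regrow */
	b = malloc(sizeof(*b) + want);
	if (!b) {
		*bufp = NULL;
		return -1;		/* the scrubber maps this to a retry */
	}
	b->sz = want;
	*bufp = b;
	return 0;
}

int main(void)
{
	struct scratch_buf *buf = NULL;

	if (setup_scratch(&buf, 4096) || setup_scratch(&buf, 1024))
		return 1;	/* second call reuses the 4096-byte buffer */
	free(buf);
	return 0;
}
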
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
new file mode 100644
index 0000000..13a1d2e
--- /dev/null
+++ b/fs/xfs/scrub/attr.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ATTR_H__
+#define __XFS_SCRUB_ATTR_H__
+
+/*
+ * Temporary storage for online scrub and repair of extended attributes.
+ */
+struct xchk_xattr_buf {
+	/* Size of @buf, in bytes. */
+	size_t			sz;
+
+	/*
+	 * Memory buffer -- either used for extracting attr values while
+	 * walking the attributes; or for computing attr block bitmaps when
+	 * checking the attribute tree.
+	 *
+	 * Each bitmap contains enough bits to track every byte in an attr
+	 * block (rounded up to the size of an unsigned long).  The attr block
+	 * used space bitmap starts at the beginning of the buffer; the free
+	 * space bitmap follows immediately after; and we have a third buffer
+	 * for storing intermediate bitmap results.
+	 */
+	uint8_t			buf[0];
+};
+
+/* A place to store attribute values. */
+static inline uint8_t *
+xchk_xattr_valuebuf(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_xattr_buf	*ab = sc->buf;
+
+	return ab->buf;
+}
+
+/* A bitmap of space usage computed by walking an attr leaf block. */
+static inline unsigned long *
+xchk_xattr_usedmap(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_xattr_buf	*ab = sc->buf;
+
+	return (unsigned long *)ab->buf;
+}
+
+/* A bitmap of free space computed by walking attr leaf block free info. */
+static inline unsigned long *
+xchk_xattr_freemap(
+	struct xfs_scrub	*sc)
+{
+	return xchk_xattr_usedmap(sc) +
+			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+/* A bitmap used to hold temporary results. */
+static inline unsigned long *
+xchk_xattr_dstmap(
+	struct xfs_scrub	*sc)
+{
+	return xchk_xattr_freemap(sc) +
+			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size,
+		xfs_km_flags_t flags);
+
+#endif	/* __XFS_SCRUB_ATTR_H__ */
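
The accessors above carve a single allocation into three equal bitmaps laid end to end, each wide enough to track every byte of an attr block. A small userspace model of the same carve-up; BITS_TO_LONGS is reimplemented here and the block size is only an example:

#include <limits.h>
#include <stdlib.h>

#define BITS_PER_LONG		(sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Three equal bitmaps laid end to end in one allocation. */
struct xattr_maps {
	unsigned long	*usedmap;	/* start of the buffer */
	unsigned long	*freemap;	/* immediately after usedmap */
	unsigned long	*dstmap;	/* scratch for intermediate results */
};

static int xattr_maps_init(struct xattr_maps *m, unsigned int blksize)
{
	size_t	words = BITS_TO_LONGS(blksize);	/* one bit per block byte */

	m->usedmap = calloc(3 * words, sizeof(unsigned long));
	if (!m->usedmap)
		return -1;
	m->freemap = m->usedmap + words;
	m->dstmap = m->freemap + words;
	return 0;
}

int main(void)
{
	struct xattr_maps m;

	if (xattr_maps_init(&m, 4096))	/* e.g. a 4k attr block */
		return 1;
	free(m.usedmap);		/* one free() releases all three maps */
	return 0;
}
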
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index fdadc9e..3d47d11 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,11 +10,6 @@
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
-#include "scrub/xfs_scrub.h"
-#include "scrub/scrub.h"
-#include "scrub/common.h"
-#include "scrub/trace.h"
-#include "scrub/repair.h"
 #include "scrub/bitmap.h"
 
 /*
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3..fa6ea64 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -9,27 +9,19 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
 #include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_inode_fork.h"
 #include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "scrub/trace.h"
 
 /* Set us up with an inode's bmap. */
 int
@@ -83,6 +75,7 @@
 	xfs_fileoff_t		lastoff;
 	bool			is_rt;
 	bool			is_shared;
+	bool			was_loaded;
 	int			whichfork;
 };
 
@@ -221,25 +214,20 @@
 
 /* Cross-reference a single rtdev extent record. */
 STATIC void
-xchk_bmap_rt_extent_xref(
-	struct xchk_bmap_info	*info,
+xchk_bmap_rt_iextent_xref(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
+	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
-	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return;
-
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
 }
 
 /* Cross-reference a single datadev extent record. */
 STATIC void
-xchk_bmap_extent_xref(
-	struct xchk_bmap_info	*info,
+xchk_bmap_iextent_xref(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
+	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
 	struct xfs_mount	*mp = info->sc->mp;
@@ -248,9 +236,6 @@
 	xfs_extlen_t		len;
 	int			error;
 
-	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return;
-
 	agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
 	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
 	len = irec->br_blockcount;
@@ -281,22 +266,42 @@
 	xchk_ag_free(info->sc, &info->sc->sa);
 }
 
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+	struct xfs_inode	*ip,
+	struct xchk_bmap_info	*info,
+	struct xfs_bmbt_irec	*irec)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		off;
+
+	if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+		return;
+
+	if (!xfs_verify_dablk(mp, irec->br_startoff))
+		xchk_fblock_set_corrupt(info->sc, info->whichfork,
+				irec->br_startoff);
+
+	off = irec->br_startoff + irec->br_blockcount - 1;
+	if (!xfs_verify_dablk(mp, off))
+		xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
 /* Scrub a single extent record. */
 STATIC int
-xchk_bmap_extent(
+xchk_bmap_iextent(
 	struct xfs_inode	*ip,
-	struct xfs_btree_cur	*cur,
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
 	struct xfs_mount	*mp = info->sc->mp;
-	struct xfs_buf		*bp = NULL;
 	xfs_filblks_t		end;
 	int			error = 0;
 
-	if (cur)
-		xfs_btree_get_block(cur, 0, &bp);
-
 	/*
 	 * Check for out-of-order extents.  This record could have come
 	 * from the incore list, for which there is no ordering check.
@@ -305,6 +310,8 @@
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
+	xchk_bmap_dirattr_extent(ip, info, irec);
+
 	/* There should never be a "hole" extent in either extent list. */
 	if (irec->br_startblock == HOLESTARTBLOCK)
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -345,10 +352,13 @@
 		xchk_fblock_set_corrupt(info->sc, info->whichfork,
 				irec->br_startoff);
 
+	if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
 	if (info->is_rt)
-		xchk_bmap_rt_extent_xref(info, ip, cur, irec);
+		xchk_bmap_rt_iextent_xref(ip, info, irec);
 	else
-		xchk_bmap_extent_xref(info, ip, cur, irec);
+		xchk_bmap_iextent_xref(ip, info, irec);
 
 	info->lastoff = irec->br_startoff + irec->br_blockcount;
 	return error;
@@ -361,10 +371,13 @@
 	union xfs_btree_rec	*rec)
 {
 	struct xfs_bmbt_irec	irec;
+	struct xfs_bmbt_irec	iext_irec;
+	struct xfs_iext_cursor	icur;
 	struct xchk_bmap_info	*info = bs->private;
 	struct xfs_inode	*ip = bs->cur->bc_private.b.ip;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_btree_block	*block;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, info->whichfork);
 	uint64_t		owner;
 	int			i;
 
@@ -383,9 +396,26 @@
 		}
 	}
 
-	/* Set up the in-core record and scrub it. */
+	/*
+	 * Check that the incore extent tree contains an extent that matches
+	 * this one exactly.  We validate those cached bmaps later, so we don't
+	 * need to check them here.  If the incore extent tree was just loaded
+	 * from disk by the scrubber, we assume that its contents match what's
+	 * on disk (we still hold the ILOCK) and skip the equivalence check.
+	 */
+	if (!info->was_loaded)
+		return 0;
+
 	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
-	return xchk_bmap_extent(ip, bs->cur, info, &irec);
+	if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
+				&iext_irec) ||
+	    irec.br_startoff != iext_irec.br_startoff ||
+	    irec.br_startblock != iext_irec.br_startblock ||
+	    irec.br_blockcount != iext_irec.br_blockcount ||
+	    irec.br_state != iext_irec.br_state)
+		xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+				irec.br_startoff);
+	return 0;
 }
 
 /* Scan the btree records. */
@@ -396,15 +426,26 @@
 	struct xchk_bmap_info	*info)
 {
 	struct xfs_owner_info	oinfo;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, whichfork);
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*ip = sc->ip;
 	struct xfs_btree_cur	*cur;
 	int			error;
 
+	/* Load the incore bmap cache if it's not loaded. */
+	info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
+	if (!info->was_loaded) {
+		error = xfs_iread_extents(sc->tp, ip, whichfork);
+		if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+			goto out;
+	}
+
+	/* Check the btree structure. */
 	cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
 	error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
 	xfs_btree_del_cursor(cur, error);
+out:
 	return error;
 }
 
@@ -481,7 +522,7 @@
 
 out:
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		return XFS_BTREE_QUERY_RANGE_ABORT;
+		return -ECANCELED;
 	return 0;
 }
 
@@ -510,7 +551,7 @@
 	sbcri.sc = sc;
 	sbcri.whichfork = whichfork;
 	error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+	if (error == -ECANCELED)
 		error = 0;
 
 	xfs_btree_del_cursor(cur, error);
@@ -652,13 +693,6 @@
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		goto out;
 
-	/* Now try to scrub the in-memory extent list. */
-        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(sc->tp, ip, whichfork);
-		if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
-			goto out;
-	}
-
 	/* Find the offset of the last extent in the mapping. */
 	error = xfs_bmap_last_offset(ip, &endoff, whichfork);
 	if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -670,7 +704,7 @@
 	for_each_xfs_iext(ifp, &icur, &irec) {
 		if (xchk_should_terminate(sc, &error) ||
 		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
-			break;
+			goto out;
 		if (isnullstartblock(irec.br_startblock))
 			continue;
 		if (irec.br_startoff >= endoff) {
@@ -678,7 +712,7 @@
 					irec.br_startoff);
 			goto out;
 		}
-		error = xchk_bmap_extent(ip, NULL, &info, &irec);
+		error = xchk_bmap_iextent(ip, &info, &irec);
 		if (error)
 			goto out;
 	}
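
xchk_bmapbt_rec() above now treats a bmbt record as valid only if the incore extent tree holds a field-for-field identical mapping. A userspace sketch of that equivalence test; the struct mimics xfs_bmbt_irec but is not the kernel type:

#include <stdbool.h>
#include <stdint.h>

/* Mimics xfs_bmbt_irec for illustration; not the kernel structure. */
struct mapping {
	uint64_t	startoff;	/* file offset, in blocks */
	uint64_t	startblock;	/* disk block */
	uint64_t	blockcount;	/* extent length */
	int		state;		/* written vs. unwritten */
};

/* A cached mapping counts only if every field matches the disk record. */
static bool mappings_equal(const struct mapping *disk,
			   const struct mapping *cached)
{
	return disk->startoff == cached->startoff &&
	       disk->startblock == cached->startblock &&
	       disk->blockcount == cached->blockcount &&
	       disk->state == cached->state;
}

int main(void)
{
	struct mapping disk = { 0, 100, 8, 0 };
	struct mapping cached = { 0, 100, 8, 0 };

	return mappings_equal(&disk, &cached) ? 0 : 1;
}
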
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 4ae959f..f52a7b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -9,14 +9,7 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -415,8 +408,17 @@
 	struct xfs_btree_cur	*cur = bs->cur;
 	struct check_owner	*co;
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+	/*
+	 * In theory, xfs_btree_get_block should only give us a null buffer
+	 * pointer for the root of a root-in-inode btree type, but we need
+	 * to check defensively here in case the cursor state is also screwed
+	 * up.
+	 */
+	if (bp == NULL) {
+		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
 		return 0;
+	}
 
 	/*
 	 * We want to cross-reference each btree block with the bnobt
@@ -583,31 +585,32 @@
  */
 int
 xchk_btree(
-	struct xfs_scrub	*sc,
-	struct xfs_btree_cur	*cur,
-	xchk_btree_rec_fn	scrub_fn,
-	struct xfs_owner_info	*oinfo,
-	void			*private)
+	struct xfs_scrub		*sc,
+	struct xfs_btree_cur		*cur,
+	xchk_btree_rec_fn		scrub_fn,
+	const struct xfs_owner_info	*oinfo,
+	void				*private)
 {
-	struct xchk_btree	bs = { NULL };
-	union xfs_btree_ptr	ptr;
-	union xfs_btree_ptr	*pp;
-	union xfs_btree_rec	*recp;
-	struct xfs_btree_block	*block;
-	int			level;
-	struct xfs_buf		*bp;
-	struct check_owner	*co;
-	struct check_owner	*n;
-	int			i;
-	int			error = 0;
+	struct xchk_btree		bs = {
+		.cur			= cur,
+		.scrub_rec		= scrub_fn,
+		.oinfo			= oinfo,
+		.firstrec		= true,
+		.private		= private,
+		.sc			= sc,
+	};
+	union xfs_btree_ptr		ptr;
+	union xfs_btree_ptr		*pp;
+	union xfs_btree_rec		*recp;
+	struct xfs_btree_block		*block;
+	int				level;
+	struct xfs_buf			*bp;
+	struct check_owner		*co;
+	struct check_owner		*n;
+	int				i;
+	int				error = 0;
 
 	/* Initialize scrub state */
-	bs.cur = cur;
-	bs.scrub_rec = scrub_fn;
-	bs.oinfo = oinfo;
-	bs.firstrec = true;
-	bs.private = private;
-	bs.sc = sc;
 	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
 		bs.firstkey[i] = true;
 	INIT_LIST_HEAD(&bs.to_check);
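
The xchk_btree() conversion above leans on a C guarantee: members omitted from a designated initializer are zero-initialized, so the old `{ NULL }` plus manual field assignments collapse into one initializer. A self-contained demonstration with a made-up struct, not the kernel's xchk_btree:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct walker {
	void	*cur;
	bool	first;
	int	depth;		/* not named below, so guaranteed zero */
	void	*private;	/* likewise guaranteed NULL */
};

int main(void)
{
	int	cursor;
	struct walker w = {
		.cur	= &cursor,
		.first	= true,
	};

	/* C11 6.7.9: unnamed members are initialized as if static. */
	assert(w.depth == 0 && w.private == NULL);
	return 0;
}
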
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index aada763..5572e47 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -31,21 +31,21 @@
 
 struct xchk_btree {
 	/* caller-provided scrub state */
-	struct xfs_scrub	*sc;
-	struct xfs_btree_cur	*cur;
-	xchk_btree_rec_fn	scrub_rec;
-	struct xfs_owner_info	*oinfo;
-	void			*private;
+	struct xfs_scrub		*sc;
+	struct xfs_btree_cur		*cur;
+	xchk_btree_rec_fn		scrub_rec;
+	const struct xfs_owner_info	*oinfo;
+	void				*private;
 
 	/* internal scrub state */
-	union xfs_btree_rec	lastrec;
-	bool			firstrec;
-	union xfs_btree_key	lastkey[XFS_BTREE_MAXLEVELS];
-	bool			firstkey[XFS_BTREE_MAXLEVELS];
-	struct list_head	to_check;
+	union xfs_btree_rec		lastrec;
+	bool				firstrec;
+	union xfs_btree_key		lastkey[XFS_BTREE_MAXLEVELS];
+	bool				firstkey[XFS_BTREE_MAXLEVELS];
+	struct list_head		to_check;
 };
 int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
-		xchk_btree_rec_fn scrub_fn, struct xfs_owner_info *oinfo,
+		xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
 		void *private);
 
 #endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 346b02a..1887605 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -9,22 +9,16 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_icache.h"
-#include "xfs_itable.h"
 #include "xfs_alloc.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
@@ -32,12 +26,11 @@
 #include "xfs_trans_priv.h"
 #include "xfs_attr.h"
 #include "xfs_reflink.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
-#include "scrub/btree.h"
 #include "scrub/repair.h"
+#include "scrub/health.h"
 
 /* Common code for the metadata scrubbers. */
 
@@ -208,6 +201,15 @@
 	trace_xchk_ino_preen(sc, ino, __return_address);
 }
 
+/* Record a corruption that doesn't necessarily correspond to a disk location. */
+void
+xchk_set_corrupt(
+	struct xfs_scrub	*sc)
+{
+	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+	trace_xchk_fs_error(sc, 0, __return_address);
+}
+
 /* Record a corrupt block. */
 void
 xchk_block_set_corrupt(
@@ -313,8 +315,8 @@
  */
 
 struct xchk_rmap_ownedby_info {
-	struct xfs_owner_info	*oinfo;
-	xfs_filblks_t		*blocks;
+	const struct xfs_owner_info	*oinfo;
+	xfs_filblks_t			*blocks;
 };
 
 STATIC int
@@ -347,15 +349,15 @@
 xchk_count_rmap_ownedby_ag(
 	struct xfs_scrub		*sc,
 	struct xfs_btree_cur		*cur,
-	struct xfs_owner_info		*oinfo,
+	const struct xfs_owner_info	*oinfo,
 	xfs_filblks_t			*blocks)
 {
-	struct xchk_rmap_ownedby_info	sroi;
+	struct xchk_rmap_ownedby_info	sroi = {
+		.oinfo			= oinfo,
+		.blocks			= blocks,
+	};
 
-	sroi.oinfo = oinfo;
 	*blocks = 0;
-	sroi.blocks = blocks;
-
 	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
 			&sroi);
 }
@@ -458,13 +460,18 @@
 	struct xfs_mount	*mp = sc->mp;
 	xfs_agnumber_t		agno = sa->agno;
 
-	if (sa->agf_bp) {
+	xchk_perag_get(sc->mp, sa);
+	if (sa->agf_bp &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
 		/* Set up a bnobt cursor for cross-referencing. */
 		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 				agno, XFS_BTNUM_BNO);
 		if (!sa->bno_cur)
 			goto err;
+	}
 
+	if (sa->agf_bp &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
 		/* Set up a cntbt cursor for cross-referencing. */
 		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 				agno, XFS_BTNUM_CNT);
@@ -473,7 +480,8 @@
 	}
 
 	/* Set up a inobt cursor for cross-referencing. */
-	if (sa->agi_bp) {
+	if (sa->agi_bp &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
 		sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
 					agno, XFS_BTNUM_INO);
 		if (!sa->ino_cur)
@@ -481,7 +489,8 @@
 	}
 
 	/* Set up a finobt cursor for cross-referencing. */
-	if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) {
+	if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
 		sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
 				agno, XFS_BTNUM_FINO);
 		if (!sa->fino_cur)
@@ -489,7 +498,8 @@
 	}
 
 	/* Set up a rmapbt cursor for cross-referencing. */
-	if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+	if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
 		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
 				agno);
 		if (!sa->rmap_cur)
@@ -497,7 +507,8 @@
 	}
 
 	/* Set up a refcountbt cursor for cross-referencing. */
-	if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) {
+	if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
+	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
 		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 				sa->agf_bp, agno);
 		if (!sa->refc_cur)
@@ -884,3 +895,21 @@
 	}
 	return -EDEADLOCK;
 }
+
+/* Pause background reaping of resources. */
+void
+xchk_stop_reaping(
+	struct xfs_scrub	*sc)
+{
+	sc->flags |= XCHK_REAPING_DISABLED;
+	xfs_stop_block_reaping(sc->mp);
+}
+
+/* Restart background reaping of resources. */
+void
+xchk_start_reaping(
+	struct xfs_scrub	*sc)
+{
+	xfs_start_block_reaping(sc->mp);
+	sc->flags &= ~XCHK_REAPING_DISABLED;
+}
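
xchk_ag_btcur_init() above now consults the in-core health state before building each cross-reference cursor, skipping btrees already known to be sick and recording a cross-referencing failure instead. A sketch of that gating logic under invented names:

#include <stdbool.h>

enum btree_id { BT_BNO, BT_CNT, BT_INO, BT_MAX };

struct ag_state {
	unsigned int	sick_mask;		/* one bit per btree */
	bool		have_cursor[BT_MAX];
	bool		xfail;			/* cross-referencing skipped */
};

static bool btree_healthy_enough(struct ag_state *ag, enum btree_id bt,
				 enum btree_id scrub_target)
{
	if (bt == scrub_target)
		return true;	/* always examine the structure under test */
	if (ag->sick_mask & (1u << bt)) {
		ag->xfail = true;	/* note the skipped cross-reference */
		return false;
	}
	return true;
}

static void setup_cursors(struct ag_state *ag, enum btree_id scrub_target)
{
	for (enum btree_id bt = BT_BNO; bt < BT_MAX; bt++)
		ag->have_cursor[bt] = btree_healthy_enough(ag, bt,
							   scrub_target);
}

int main(void)
{
	struct ag_state ag = { .sick_mask = 1u << BT_CNT };

	setup_cursors(&ag, BT_BNO);	/* cntbt is sick: skipped, xfail set */
	return ag.have_cursor[BT_CNT] ? 1 : 0;
}
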
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 2d4324d..003a772 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -39,6 +39,7 @@
 		struct xfs_buf *bp);
 void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino);
 
+void xchk_set_corrupt(struct xfs_scrub *sc);
 void xchk_block_set_corrupt(struct xfs_scrub *sc,
 		struct xfs_buf *bp);
 void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
@@ -105,6 +106,7 @@
 	return -ENOENT;
 }
 #endif
+int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip);
 
 void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
 int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -116,7 +118,7 @@
 void xchk_ag_btcur_free(struct xchk_ag *sa);
 int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
 int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
-		struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
+		const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
 
 int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip,
 		bool force_log);
@@ -137,5 +139,7 @@
 
 int xchk_metadata_inode_forks(struct xfs_scrub *sc);
 int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+void xchk_stop_reaping(struct xfs_scrub *sc);
+void xchk_start_reaping(struct xfs_scrub *sc);
 
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4..77ff9f9 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -9,20 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -286,7 +278,11 @@
 	/* Compare upper level pointer to sibling pointer. */
 	if (ds->state->altpath.blk[level].blkno != sibling)
 		xchk_da_set_corrupt(ds, level);
-	xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+	if (ds->state->altpath.blk[level].bp) {
+		xfs_trans_brelse(ds->dargs.trans,
+				ds->state->altpath.blk[level].bp);
+		ds->state->altpath.blk[level].bp = NULL;
+	}
 out:
 	return error;
 }
@@ -574,6 +570,11 @@
 		/* Drill another level deeper. */
 		blkno = be32_to_cpu(key->before);
 		level++;
+		if (level >= XFS_DA_NODE_MAXDEPTH) {
+			/* Too deep! */
+			xchk_da_set_corrupt(&ds, level - 1);
+			break;
+		}
 		ds.tree_level--;
 		error = xchk_da_btree_block(&ds, level, blkno);
 		if (error)
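
The new XFS_DA_NODE_MAXDEPTH check above bounds the descent so a corrupted btree with a child-pointer cycle cannot loop forever. The same guard on a toy iterative walk; MAXDEPTH and the node layout are made up:

#include <stdio.h>

#define MAXDEPTH	5	/* stands in for XFS_DA_NODE_MAXDEPTH */

struct node {
	struct node	*child;	/* NULL at a leaf */
};

static int walk(struct node *root)
{
	int	level = 0;

	for (struct node *n = root; n; n = n->child) {
		if (++level > MAXDEPTH) {
			fprintf(stderr, "tree too deep: corrupt\n");
			return -1;	/* analogous to xchk_da_set_corrupt */
		}
	}
	return 0;
}

int main(void)
{
	struct node leaf = { .child = NULL };
	struct node cyclic = { .child = &cyclic };	/* self-cycle */

	return (walk(&leaf) == 0 && walk(&cyclic) == -1) ? 0 : 1;
}
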
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d7..1e2e117 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -9,24 +9,14 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 #include "scrub/dabtree.h"
 
 /* Set us up to scrub directories. */
@@ -129,6 +119,12 @@
 		goto out;
 	}
 
+	/* Does this name make sense? */
+	if (!xfs_dir2_namecheck(name, namelen)) {
+		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+		goto out;
+	}
+
 	if (!strncmp(".", name, namelen)) {
 		/* If this is "." then check that the inum matches the dir. */
 		if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
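
Directory entries are now rejected early when xfs_dir2_namecheck() fails. As an illustration only (the kernel's exact rules live in xfs_dir2_namecheck), a name gate of this kind typically bounds the length and bans bytes that can never appear in a path component:

#include <stdbool.h>
#include <stddef.h>

#define NAME_MAXLEN	255	/* example bound, not the XFS constant */

/* Illustrative only; not the kernel's validation routine. */
static bool name_ok(const unsigned char *name, size_t len)
{
	if (len == 0 || len > NAME_MAXLEN)
		return false;
	for (size_t i = 0; i < len; i++) {
		if (name[i] == '/' || name[i] == '\0')
			return false;	/* never legal in a component */
	}
	return true;
}

int main(void)
{
	return (name_ok((const unsigned char *)"ok_name", 7) &&
		!name_ok((const unsigned char *)"", 0)) ? 0 : 1;
}
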
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
new file mode 100644
index 0000000..98f82d7
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * The basics of filesystem summary counter checking are that we iterate the
+ * AGs counting the number of free blocks, free space btree blocks, per-AG
+ * reservations, inodes, delayed allocation reservations, and free inodes.
+ * Then we compare what we computed against the in-core counters.
+ *
+ * However, the reality is that summary counters are a tricky beast to check.
+ * While we /could/ freeze the filesystem and scramble around the AGs counting
+ * the free blocks, in practice we prefer not to do that for a scan because
+ * freezing is costly.  To get around this, we added a per-cpu counter of the
+ * delalloc reservations so that we can rotor around the AGs relatively
+ * quickly, and we allow the counts to be slightly off because we're not taking
+ * any locks while we do this.
+ *
+ * So the first thing we do is warm up the buffer cache in the setup routine by
+ * walking all the AGs to make sure the incore per-AG structure has been
+ * initialized.  The expected value calculation then iterates the incore per-AG
+ * structures as quickly as it can.  We snapshot the percpu counters before and
+ * after this operation and use the difference in counter values to guess at
+ * our tolerance for mismatch between expected and actual counter values.
+ */
+
+/*
+ * Since the expected value computation is lockless but only browses incore
+ * values, the percpu counters should be fairly close to each other.  However,
+ * we'll allow ourselves to be off by at least this (arbitrary) amount.
+ */
+#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
+
+/*
+ * Make sure the per-AG structure has been initialized from the on-disk header
+ * contents and trust that the incore counters match the ondisk counters.  (The
+ * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
+ * summary counters after checking all AG headers).  Do this from the setup
+ * function so that the inner AG aggregation loop runs as quickly as possible.
+ *
+ * This function runs during the setup phase /before/ we start checking any
+ * metadata.
+ */
+STATIC int
+xchk_fscount_warmup(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agi_bp = NULL;
+	struct xfs_buf		*agf_bp = NULL;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno;
+	int			error = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+
+		if (pag->pagi_init && pag->pagf_init)
+			goto next_loop_perag;
+
+		/* Lock both AG headers. */
+		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+		if (error)
+			break;
+		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+		if (error)
+			break;
+		error = -ENOMEM;
+		if (!agf_bp || !agi_bp)
+			break;
+
+		/*
+		 * These are supposed to be initialized by the header read
+		 * function.
+		 */
+		error = -EFSCORRUPTED;
+		if (!pag->pagi_init || !pag->pagf_init)
+			break;
+
+		xfs_buf_relse(agf_bp);
+		agf_bp = NULL;
+		xfs_buf_relse(agi_bp);
+		agi_bp = NULL;
+next_loop_perag:
+		xfs_perag_put(pag);
+		pag = NULL;
+		error = 0;
+
+		if (fatal_signal_pending(current))
+			break;
+	}
+
+	if (agf_bp)
+		xfs_buf_relse(agf_bp);
+	if (agi_bp)
+		xfs_buf_relse(agi_bp);
+	if (pag)
+		xfs_perag_put(pag);
+	return error;
+}
+
+int
+xchk_setup_fscounters(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	struct xchk_fscounters	*fsc;
+	int			error;
+
+	sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+	if (!sc->buf)
+		return -ENOMEM;
+	fsc = sc->buf;
+
+	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
+
+	/* We must get the incore counters set up before we can proceed. */
+	error = xchk_fscount_warmup(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Pause background reclaim while we're scrubbing to reduce the
+	 * likelihood of background perturbations to the counters throwing off
+	 * our calculations.
+	 */
+	xchk_stop_reaping(sc);
+
+	return xchk_trans_alloc(sc, 0);
+}
+
+/*
+ * Calculate what the global in-core counters ought to be from the incore
+ * per-AG structure.  Callers can compare this to the actual in-core counters
+ * to estimate by how much both in-core and on-disk counters need to be
+ * adjusted.
+ */
+STATIC int
+xchk_fscount_aggregate_agcounts(
+	struct xfs_scrub	*sc,
+	struct xchk_fscounters	*fsc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_perag	*pag;
+	uint64_t		delayed;
+	xfs_agnumber_t		agno;
+	int			tries = 8;
+
+retry:
+	fsc->icount = 0;
+	fsc->ifree = 0;
+	fsc->fdblocks = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+
+		/* This somehow got unset since the warmup? */
+		if (!pag->pagi_init || !pag->pagf_init) {
+			xfs_perag_put(pag);
+			return -EFSCORRUPTED;
+		}
+
+		/* Count all the inodes */
+		fsc->icount += pag->pagi_count;
+		fsc->ifree += pag->pagi_freecount;
+
+		/* Add up the free/freelist/bnobt/cntbt blocks */
+		fsc->fdblocks += pag->pagf_freeblks;
+		fsc->fdblocks += pag->pagf_flcount;
+		fsc->fdblocks += pag->pagf_btreeblks;
+
+		/*
+		 * Per-AG reservations are taken out of the incore counters,
+		 * so they must be left out of the free blocks computation.
+		 */
+		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
+		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
+
+		xfs_perag_put(pag);
+
+		if (fatal_signal_pending(current))
+			break;
+	}
+
+	/*
+	 * The global incore space reservation is taken from the incore
+	 * counters, so leave that out of the computation.
+	 */
+	fsc->fdblocks -= mp->m_resblks_avail;
+
+	/*
+	 * Delayed allocation reservations are taken out of the incore counters
+	 * but not recorded on disk, so leave them and their indlen blocks out
+	 * of the computation.
+	 */
+	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
+	fsc->fdblocks -= delayed;
+
+	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
+			delayed);
+
+	/* Bail out if the values we compute are totally nonsense. */
+	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
+	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
+	    fsc->ifree > fsc->icount_max)
+		return -EFSCORRUPTED;
+
+	/*
+	 * If ifree > icount then we probably had some perturbation in the
+	 * counters while we were calculating things.  We'll try a few times
+	 * to maintain ifree <= icount before giving up.
+	 */
+	if (fsc->ifree > fsc->icount) {
+		if (tries--)
+			goto retry;
+		xchk_set_incomplete(sc);
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Is the @counter reasonably close to the @expected value?
+ *
+ * We neither locked nor froze anything in the filesystem while aggregating the
+ * per-AG data to compute the @expected value, which means that the counter
+ * could have changed.  We know the @old_value of the summation of the counter
+ * before the aggregation, and we re-sum the counter now.  If the expected
+ * value falls between the two summations, we're ok.
+ *
+ * Otherwise, we /might/ have a problem.  If the change in the summations is
+ * more than we want to tolerate, the filesystem is probably busy and we should
+ * just send back INCOMPLETE and see if userspace will try again.
+ */
+static inline bool
+xchk_fscount_within_range(
+	struct xfs_scrub	*sc,
+	const int64_t		old_value,
+	struct percpu_counter	*counter,
+	uint64_t		expected)
+{
+	int64_t			min_value, max_value;
+	int64_t			curr_value = percpu_counter_sum(counter);
+
+	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
+			old_value);
+
+	/* Negative values are always wrong. */
+	if (curr_value < 0)
+		return false;
+
+	/* Exact matches are always ok. */
+	if (curr_value == expected)
+		return true;
+
+	min_value = min(old_value, curr_value);
+	max_value = max(old_value, curr_value);
+
+	/* Within the before-and-after range is ok. */
+	if (expected >= min_value && expected <= max_value)
+		return true;
+
+	/*
+	 * If the difference between the two summations is too large, the fs
+	 * might just be busy and so we'll mark the scrub incomplete.  Return
+	 * true here so that we don't mark the counter corrupt.
+	 *
+	 * XXX: In the future when userspace can grant scrub permission to
+	 * quiesce the filesystem to solve the outsized variance problem, this
+	 * check should be moved up and the return code changed to signal to
+	 * userspace that we need quiesce permission.
+	 */
+	if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
+		xchk_set_incomplete(sc);
+		return true;
+	}
+
+	return false;
+}
+
+/* Check the superblock counters. */
+int
+xchk_fscounters(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xchk_fscounters	*fsc = sc->buf;
+	int64_t			icount, ifree, fdblocks;
+	int			error;
+
+	/* Snapshot the percpu counters. */
+	icount = percpu_counter_sum(&mp->m_icount);
+	ifree = percpu_counter_sum(&mp->m_ifree);
+	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+	/* No negative values, please! */
+	if (icount < 0 || ifree < 0 || fdblocks < 0)
+		xchk_set_corrupt(sc);
+
+	/* See if icount is obviously wrong. */
+	if (icount < fsc->icount_min || icount > fsc->icount_max)
+		xchk_set_corrupt(sc);
+
+	/* See if fdblocks is obviously wrong. */
+	if (fdblocks > mp->m_sb.sb_dblocks)
+		xchk_set_corrupt(sc);
+
+	/*
+	 * If ifree exceeds icount by more than the minimum variance then
+	 * something's probably wrong with the counters.
+	 */
+	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
+		xchk_set_corrupt(sc);
+
+	/* Walk the incore AG headers to calculate the expected counters. */
+	error = xchk_fscount_aggregate_agcounts(sc, fsc);
+	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+		return error;
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/* Compare the in-core counters with whatever we counted. */
+	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
+		xchk_set_corrupt(sc);
+
+	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
+		xchk_set_corrupt(sc);
+
+	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
+			fsc->fdblocks))
+		xchk_set_corrupt(sc);
+
+	return 0;
+}
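
xchk_fscount_within_range() above accepts an expected value that lands anywhere between the counter summations taken before and after the lockless aggregation, and punts when the window itself is wider than the variance limit. A userspace model of that decision, recast as a tri-state verdict; the constant and names are illustrative:

#include <stdint.h>

#define MIN_VARIANCE	512	/* stands in for XCHK_FSCOUNT_MIN_VARIANCE */

enum verdict { V_OK, V_CORRUPT, V_INCOMPLETE };

static enum verdict counter_check(int64_t old_sum, int64_t curr_sum,
				  int64_t expected)
{
	int64_t	lo = old_sum < curr_sum ? old_sum : curr_sum;
	int64_t	hi = old_sum < curr_sum ? curr_sum : old_sum;

	if (curr_sum < 0)
		return V_CORRUPT;	/* negative is always wrong */
	if (expected >= lo && expected <= hi)
		return V_OK;		/* inside the before/after window */
	if (hi - lo >= MIN_VARIANCE)
		return V_INCOMPLETE;	/* fs too busy to judge; try again */
	return V_CORRUPT;
}

int main(void)
{
	/* Counter drifted from 1000 to 1010 during the sum; 1004 is ok. */
	return counter_check(1000, 1010, 1004) == V_OK ? 0 : 1;
}
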
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
new file mode 100644
index 0000000..b2f6028
--- /dev/null
+++ b/fs/xfs/scrub/health.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_btree.h"
+#include "xfs_sb.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+
+/*
+ * Scrub and In-Core Filesystem Health Assessments
+ * ===============================================
+ *
+ * Online scrub and repair have the time and the ability to perform stronger
+ * checks than we can do from the metadata verifiers, because they can
+ * cross-reference records between data structures.  Therefore, scrub is in a
+ * good position to update the online filesystem health assessments to reflect
+ * the good/bad state of the data structure.
+ *
+ * We therefore extend scrub in the following ways to achieve this:
+ *
+ * 1. Create a "sick_mask" field in the scrub context.  When we're setting up a
+ * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
+ * scrub type (call it A).  Scrub and repair functions can override the default
+ * sick_mask value if they choose.
+ *
+ * 2. If the scrubber returns a runtime error code, we exit making no changes
+ * to the incore sick state.
+ *
+ * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
+ * sick flags before exiting.
+ *
+ * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
+ * sick flags.  If the user didn't want to repair then we exit, leaving the
+ * metadata structure unfixed and the sick flag set.
+ *
+ * 5. Now we know that A is corrupt and the user wants to repair, so run the
+ * repairer.  If the repairer returns an error code, we exit with that error
+ * code, having made no further changes to the incore sick state.
+ *
+ * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
+ * use sick_mask to clear the incore sick flags.  This should have the effect
+ * that A is no longer marked sick.
+ *
+ * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
+ * use sick_mask to set the incore sick flags.  This should have no externally
+ * visible effect since we already set them in step (4).
+ *
+ * There are some complications to this story, however.  For certain types of
+ * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
+ * both structures at the same time.  The following principles apply to this
+ * type of repair strategy:
+ *
+ * 8. Any repair function that rebuilds multiple structures should update
+ * sick_mask_visible to reflect whatever other structures are rebuilt, and
+ * verify that all the rebuilt structures can pass a scrub check.  The outcomes
+ * of 5-7 still apply, but with a sick_mask that covers everything being
+ * rebuilt.
+ */
+
+/* Map our scrub type to a sick mask and a set of health update functions. */
+
+enum xchk_health_group {
+	XHG_FS = 1,
+	XHG_RT,
+	XHG_AG,
+	XHG_INO,
+};
+
+struct xchk_health_map {
+	enum xchk_health_group	group;
+	unsigned int		sick_mask;
+};
+
+static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+	[XFS_SCRUB_TYPE_SB]		= { XHG_AG,  XFS_SICK_AG_SB },
+	[XFS_SCRUB_TYPE_AGF]		= { XHG_AG,  XFS_SICK_AG_AGF },
+	[XFS_SCRUB_TYPE_AGFL]		= { XHG_AG,  XFS_SICK_AG_AGFL },
+	[XFS_SCRUB_TYPE_AGI]		= { XHG_AG,  XFS_SICK_AG_AGI },
+	[XFS_SCRUB_TYPE_BNOBT]		= { XHG_AG,  XFS_SICK_AG_BNOBT },
+	[XFS_SCRUB_TYPE_CNTBT]		= { XHG_AG,  XFS_SICK_AG_CNTBT },
+	[XFS_SCRUB_TYPE_INOBT]		= { XHG_AG,  XFS_SICK_AG_INOBT },
+	[XFS_SCRUB_TYPE_FINOBT]		= { XHG_AG,  XFS_SICK_AG_FINOBT },
+	[XFS_SCRUB_TYPE_RMAPBT]		= { XHG_AG,  XFS_SICK_AG_RMAPBT },
+	[XFS_SCRUB_TYPE_REFCNTBT]	= { XHG_AG,  XFS_SICK_AG_REFCNTBT },
+	[XFS_SCRUB_TYPE_INODE]		= { XHG_INO, XFS_SICK_INO_CORE },
+	[XFS_SCRUB_TYPE_BMBTD]		= { XHG_INO, XFS_SICK_INO_BMBTD },
+	[XFS_SCRUB_TYPE_BMBTA]		= { XHG_INO, XFS_SICK_INO_BMBTA },
+	[XFS_SCRUB_TYPE_BMBTC]		= { XHG_INO, XFS_SICK_INO_BMBTC },
+	[XFS_SCRUB_TYPE_DIR]		= { XHG_INO, XFS_SICK_INO_DIR },
+	[XFS_SCRUB_TYPE_XATTR]		= { XHG_INO, XFS_SICK_INO_XATTR },
+	[XFS_SCRUB_TYPE_SYMLINK]	= { XHG_INO, XFS_SICK_INO_SYMLINK },
+	[XFS_SCRUB_TYPE_PARENT]		= { XHG_INO, XFS_SICK_INO_PARENT },
+	[XFS_SCRUB_TYPE_RTBITMAP]	= { XHG_RT,  XFS_SICK_RT_BITMAP },
+	[XFS_SCRUB_TYPE_RTSUM]		= { XHG_RT,  XFS_SICK_RT_SUMMARY },
+	[XFS_SCRUB_TYPE_UQUOTA]		= { XHG_FS,  XFS_SICK_FS_UQUOTA },
+	[XFS_SCRUB_TYPE_GQUOTA]		= { XHG_FS,  XFS_SICK_FS_GQUOTA },
+	[XFS_SCRUB_TYPE_PQUOTA]		= { XHG_FS,  XFS_SICK_FS_PQUOTA },
+	[XFS_SCRUB_TYPE_FSCOUNTERS]	= { XHG_FS,  XFS_SICK_FS_COUNTERS },
+};
+
+/* Return the health status mask for this scrub type. */
+unsigned int
+xchk_health_mask_for_scrub_type(
+	__u32			scrub_type)
+{
+	return type_to_health_flag[scrub_type].sick_mask;
+}
+
+/*
+ * Update filesystem health assessments based on what we found and did.
+ *
+ * If the scrubber finds errors, we mark sick whatever's mentioned in
+ * sick_mask, no matter whether this is a first scan or an
+ * evaluation of repair effectiveness.
+ *
+ * Otherwise, no direct corruption was found, so mark whatever's in
+ * sick_mask as healthy.
+ */
+void
+xchk_update_health(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_perag	*pag;
+	bool			bad;
+
+	if (!sc->sick_mask)
+		return;
+
+	bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT);
+	switch (type_to_health_flag[sc->sm->sm_type].group) {
+	case XHG_AG:
+		pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
+		if (bad)
+			xfs_ag_mark_sick(pag, sc->sick_mask);
+		else
+			xfs_ag_mark_healthy(pag, sc->sick_mask);
+		xfs_perag_put(pag);
+		break;
+	case XHG_INO:
+		if (!sc->ip)
+			return;
+		if (bad)
+			xfs_inode_mark_sick(sc->ip, sc->sick_mask);
+		else
+			xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+		break;
+	case XHG_FS:
+		if (bad)
+			xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+		else
+			xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+		break;
+	case XHG_RT:
+		if (bad)
+			xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+		else
+			xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/* Is the given per-AG btree healthy enough for scanning? */
+bool
+xchk_ag_btree_healthy_enough(
+	struct xfs_scrub	*sc,
+	struct xfs_perag	*pag,
+	xfs_btnum_t		btnum)
+{
+	unsigned int		mask = 0;
+
+	/*
+	 * We always want the cursor if it's the same type as whatever we're
+	 * scrubbing, even if we already know the structure is corrupt.
+	 *
+	 * Otherwise, we're only interested in the btree for cross-referencing.
+	 * If we know the btree is bad then don't bother, just set XFAIL.
+	 */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
+			return true;
+		mask = XFS_SICK_AG_BNOBT;
+		break;
+	case XFS_BTNUM_CNT:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
+			return true;
+		mask = XFS_SICK_AG_CNTBT;
+		break;
+	case XFS_BTNUM_INO:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+			return true;
+		mask = XFS_SICK_AG_INOBT;
+		break;
+	case XFS_BTNUM_FINO:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
+			return true;
+		mask = XFS_SICK_AG_FINOBT;
+		break;
+	case XFS_BTNUM_RMAP:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
+			return true;
+		mask = XFS_SICK_AG_RMAPBT;
+		break;
+	case XFS_BTNUM_REFC:
+		if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
+			return true;
+		mask = XFS_SICK_AG_REFCNTBT;
+		break;
+	default:
+		ASSERT(0);
+		return true;
+	}
+
+	if (xfs_ag_has_sickness(pag, mask)) {
+		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+		return false;
+	}
+
+	return true;
+}
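
The update path above is table-driven: the scrub type indexes a { group, sick_mask } pair, and the post-scrub result either sets or clears that mask, matching steps 3-7 of the comment at the top of the file. A compact userspace model; enum values and masks are invented for illustration:

#include <stdbool.h>

enum group { G_FS, G_AG, G_INO };	/* invented, like XHG_* above */

struct health_map {
	enum group	group;
	unsigned int	mask;
};

/* Scrub type indexes a { group, mask } pair, as type_to_health_flag[] does. */
static const struct health_map type_to_mask[] = {
	[0] = { G_AG,  1u << 0 },	/* e.g. superblock */
	[1] = { G_AG,  1u << 1 },	/* e.g. AGF */
	[2] = { G_INO, 1u << 0 },	/* e.g. inode core */
};

/* A clean scrub clears the mask; a corrupt one sets it. */
static void update_health(unsigned int *state, unsigned int type, bool bad)
{
	unsigned int mask = type_to_mask[type].mask;

	if (bad)
		*state |= mask;		/* mark sick */
	else
		*state &= ~mask;	/* mark healthy */
}

int main(void)
{
	unsigned int ag_state = 0;

	update_health(&ag_state, 1, true);	/* scrub found corruption */
	update_health(&ag_state, 1, false);	/* after repair + re-scrub */
	return ag_state;			/* 0: mask cleared again */
}
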
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
new file mode 100644
index 0000000..d0b938d
--- /dev/null
+++ b/fs/xfs/scrub/health.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_HEALTH_H__
+#define __XFS_SCRUB_HEALTH_H__
+
+unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
+void xchk_update_health(struct xfs_scrub *sc);
+bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
+		xfs_btnum_t btnum);
+
+#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 224dba9..6817587 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -9,21 +9,14 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_icache.h"
 #include "xfs_rmap.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -39,11 +32,22 @@
 	struct xfs_scrub	*sc,
 	struct xfs_inode	*ip)
 {
-	return xchk_setup_ag_btree(sc, ip, sc->try_harder);
+	return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER);
 }
 
 /* Inode btree scrubber. */
 
+struct xchk_iallocbt {
+	/* Number of inodes we see while scanning inobt. */
+	unsigned long long	inodes;
+
+	/* Expected next startino, for big block filesystems. */
+	xfs_agino_t		next_startino;
+
+	/* Expected end of the current inode cluster. */
+	xfs_agino_t		next_cluster_ino;
+};
+
 /*
  * If we're checking the finobt, cross-reference with the inobt.
  * Otherwise we're checking the inobt; if there is an finobt, make sure
@@ -82,15 +86,12 @@
 	xfs_agblock_t			agbno,
 	xfs_extlen_t			len)
 {
-	struct xfs_owner_info		oinfo;
-
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		return;
 
 	xchk_xref_is_used_space(sc, agbno, len);
 	xchk_iallocbt_chunk_xref_other(sc, irec, agino);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
-	xchk_xref_is_owned_by(sc, agbno, len, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
 	xchk_xref_is_not_shared(sc, agbno, len);
 }
 
@@ -126,42 +127,58 @@
 	return hweight64(freemask);
 }
 
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record.  First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
 STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
 	struct xchk_btree		*bs,
-	xfs_ino_t			fsino,
-	xfs_agino_t			chunkino,
-	xfs_agino_t			clusterino,
 	struct xfs_inobt_rec_incore	*irec,
-	struct xfs_buf			*bp)
+	unsigned int			irec_ino,
+	struct xfs_dinode		*dip)
 {
-	struct xfs_dinode		*dip;
 	struct xfs_mount		*mp = bs->cur->bc_mp;
-	bool				inode_is_free = false;
+	xfs_ino_t			fsino;
+	xfs_agino_t			agino;
+	bool				irec_free;
+	bool				ino_inuse;
 	bool				freemask_ok;
-	bool				inuse;
 	int				error = 0;
 
 	if (xchk_should_terminate(bs->sc, &error))
 		return error;
 
-	dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+	/*
+	 * Given an inobt record and the offset of an inode from the start of
+	 * the record, compute which fs inode we're talking about.
+	 */
+	agino = irec->ir_startino + irec_ino;
+	fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+	irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-	    (dip->di_version >= 3 &&
-	     be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+	    (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 		goto out;
 	}
 
-	if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
-		inode_is_free = true;
-	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
-			fsino + clusterino, &inuse);
+	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+			&ino_inuse);
 	if (error == -ENODATA) {
 		/* Not cached, just read the disk buffer */
-		freemask_ok = inode_is_free ^ !!(dip->di_mode);
-		if (!bs->sc->try_harder && !freemask_ok)
+		freemask_ok = irec_free ^ !!(dip->di_mode);
+		if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok)
 			return -EDEADLOCK;
 	} else if (error < 0) {
 		/*
@@ -172,7 +189,7 @@
 		goto out;
 	} else {
 		/* Inode is all there. */
-		freemask_ok = inode_is_free ^ inuse;
+		freemask_ok = irec_free ^ ino_inuse;
 	}
 	if (!freemask_ok)
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -180,91 +197,225 @@
 	return 0;
 }
 
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk.  If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
 STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
 	struct xchk_btree		*bs,
-	struct xfs_inobt_rec_incore	*irec)
+	struct xfs_inobt_rec_incore	*irec,
+	unsigned int			cluster_base)
 {
-	struct xfs_owner_info		oinfo;
 	struct xfs_imap			imap;
 	struct xfs_mount		*mp = bs->cur->bc_mp;
 	struct xfs_dinode		*dip;
-	struct xfs_buf			*bp;
-	xfs_ino_t			fsino;
-	xfs_agino_t			nr_inodes;
-	xfs_agino_t			agino;
-	xfs_agino_t			chunkino;
-	xfs_agino_t			clusterino;
+	struct xfs_buf			*cluster_bp;
+	unsigned int			nr_inodes;
+	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
 	xfs_agblock_t			agbno;
-	int				blks_per_cluster;
-	uint16_t			holemask;
+	unsigned int			cluster_index;
+	uint16_t			cluster_mask = 0;
 	uint16_t			ir_holemask;
 	int				error = 0;
 
-	/* Make sure the freemask matches the inode records. */
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+	nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+			M_IGEO(mp)->inodes_per_cluster);
 
-	for (agino = irec->ir_startino;
-	     agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
-	     agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
-		fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
-		chunkino = agino - irec->ir_startino;
-		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	/* Map this inode cluster */
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
 
-		/* Compute the holemask mask for this cluster. */
-		for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
-		     clusterino += XFS_INODES_PER_HOLEMASK_BIT)
-			holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
-					XFS_INODES_PER_HOLEMASK_BIT);
+	/* Compute a bitmask for this cluster that can be used for holemask. */
+	for (cluster_index = 0;
+	     cluster_index < nr_inodes;
+	     cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+		cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+				XFS_INODES_PER_HOLEMASK_BIT);
 
-		/* The whole cluster must be a hole or not a hole. */
-		ir_holemask = (irec->ir_holemask & holemask);
-		if (ir_holemask != holemask && ir_holemask != 0) {
+	/*
+	 * Map the first inode of this cluster to a buffer and offset.
+	 * Be careful about inobt records that don't align with the start of
+	 * the inode buffer when block sizes are large enough to hold multiple
+	 * inode chunks.  When this happens, cluster_base will be zero but
+	 * ir_startino can be large enough to make im_boffset nonzero.
+	 */
+	ir_holemask = (irec->ir_holemask & cluster_mask);
+	imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+	imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
+			mp->m_sb.sb_inodelog;
+
+	if (imap.im_boffset != 0 && cluster_base != 0) {
+		ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+			imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+			cluster_mask, ir_holemask,
+			XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+					  cluster_base));
+
+	/* The whole cluster must be a hole or not a hole. */
+	if (ir_holemask != cluster_mask && ir_holemask != 0) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	/* If any part of this is a hole, skip it. */
+	if (ir_holemask) {
+		xchk_xref_is_not_owned_by(bs->sc, agbno,
+				M_IGEO(mp)->blocks_per_cluster,
+				&XFS_RMAP_OINFO_INODES);
+		return 0;
+	}
+
+	xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+			&XFS_RMAP_OINFO_INODES);
+
+	/* Grab the inode cluster buffer. */
+	error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+			0, 0);
+	if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+		return error;
+
+	/* Check free status of each inode within this cluster. */
+	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+		struct xfs_dinode	*dip;
+
+		if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
 			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-			continue;
+			break;
 		}
 
-		/* If any part of this is a hole, skip it. */
-		if (ir_holemask) {
-			xchk_xref_is_not_owned_by(bs->sc, agbno,
-					blks_per_cluster, &oinfo);
-			continue;
-		}
+		dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+		error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+				cluster_base + cluster_index, dip);
+		if (error)
+			break;
+		imap.im_boffset += mp->m_sb.sb_inodesize;
+	}
 
-		xchk_xref_is_owned_by(bs->sc, agbno, blks_per_cluster,
-				&oinfo);
+	xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+	return error;
+}
 
-		/* Grab the inode cluster buffer. */
-		imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
-				agbno);
-		imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-		imap.im_boffset = 0;
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+	struct xchk_btree		*bs,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	unsigned int			cluster_base;
+	int				error = 0;
 
-		error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
-				&dip, &bp, 0, 0);
-		if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
-				&error))
-			continue;
-
-		/* Which inodes are free? */
-		for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
-			error = xchk_iallocbt_check_cluster_freemask(bs,
-					fsino, chunkino, clusterino, irec, bp);
-			if (error) {
-				xfs_trans_brelse(bs->cur->bc_tp, bp);
-				return error;
-			}
-		}
-
-		xfs_trans_brelse(bs->cur->bc_tp, bp);
+	/*
+	 * For the common case where this inobt record maps to multiple inode
+	 * clusters this will call _check_cluster for each cluster.
+	 *
+	 * For the case that multiple inobt records map to a single cluster,
+	 * this will call _check_cluster once.
+	 */
+	for (cluster_base = 0;
+	     cluster_base < XFS_INODES_PER_CHUNK;
+	     cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) {
+		error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+		if (error)
+			break;
 	}
 
 	return error;
 }
 
+/*
+ * Make sure this inode btree record is aligned properly.  Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk.  This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+	struct xchk_btree		*bs,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	struct xfs_mount		*mp = bs->sc->mp;
+	struct xchk_iallocbt		*iabt = bs->private;
+	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
+
+	/*
+	 * finobt records have different positioning requirements than inobt
+	 * records: each finobt record must have a corresponding inobt record.
+	 * That is checked in the xref function, so for now we only catch the
+	 * obvious case where the record isn't at all aligned properly.
+	 *
+	 * Note that if a fs block contains more than a single chunk of inodes,
+	 * we will have finobt records only for those chunks containing free
+	 * inodes, and therefore expect chunk alignment of finobt records.
+	 * Otherwise, we expect that the finobt record is aligned to the
+	 * cluster alignment as told by the superblock.
+	 */
+	if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+		unsigned int	imask;
+
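+		/* These ir_startino bits must be zero in an aligned record. */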
+		imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+				igeo->cluster_align_inodes) - 1;
+		if (irec->ir_startino & imask)
+			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
+
+	if (iabt->next_startino != NULLAGINO) {
+		/*
+		 * We're midway through a cluster of inodes that is mapped by
+		 * multiple inobt records.  Did we get the record for the next
+		 * irec in the sequence?
+		 */
+		if (irec->ir_startino != iabt->next_startino) {
+			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+			return;
+		}
+
+		iabt->next_startino += XFS_INODES_PER_CHUNK;
+
+		/* Are we done with the cluster? */
+		if (iabt->next_startino >= iabt->next_cluster_ino) {
+			iabt->next_startino = NULLAGINO;
+			iabt->next_cluster_ino = NULLAGINO;
+		}
+		return;
+	}
+
+	/* inobt records must be aligned to cluster and inode alignment size. */
+	if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
+
+	if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
+
+	if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+		return;
+
+	/*
+	 * If this is the start of an inode cluster that can be mapped by
+	 * multiple inobt records, the next inobt record must follow exactly
+	 * after this one.
+	 */
+	iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+	iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
+}
+
 /* Scrub an inobt/finobt record. */
 STATIC int
 xchk_iallocbt_rec(
@@ -272,12 +423,11 @@
 	union xfs_btree_rec		*rec)
 {
 	struct xfs_mount		*mp = bs->cur->bc_mp;
-	xfs_filblks_t			*inode_blocks = bs->private;
+	struct xchk_iallocbt		*iabt = bs->private;
 	struct xfs_inobt_rec_incore	irec;
 	uint64_t			holes;
 	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
 	xfs_agino_t			agino;
-	xfs_agblock_t			agbno;
 	xfs_extlen_t			len;
 	int				holecount;
 	int				i;
@@ -304,14 +454,11 @@
 		goto out;
 	}
 
-	/* Make sure this record is aligned to cluster and inoalignmnt size. */
-	agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
-	if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
-	    (agbno & (xfs_icluster_size_fsb(mp) - 1)))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	xchk_iallocbt_rec_alignment(bs, &irec);
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
 
-	*inode_blocks += XFS_B_TO_FSB(mp,
-			irec.ir_count * mp->m_sb.sb_inodesize);
+	iabt->inodes += irec.ir_count;
 
 	/* Handle non-sparse inodes */
 	if (!xfs_inobt_issparse(irec.ir_holemask)) {
@@ -322,7 +469,7 @@
 
 		if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
 			goto out;
-		goto check_freemask;
+		goto check_clusters;
 	}
 
 	/* Check each chunk of a sparse inode cluster. */
@@ -348,8 +495,8 @@
 	    holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-check_freemask:
-	error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+	error = xchk_iallocbt_check_clusters(bs, &irec);
 	if (error)
 		goto out;
 
@@ -366,7 +513,6 @@
 	struct xfs_scrub	*sc,
 	int			which)
 {
-	struct xfs_owner_info	oinfo;
 	xfs_filblks_t		blocks;
 	xfs_extlen_t		inobt_blocks = 0;
 	xfs_extlen_t		finobt_blocks = 0;
@@ -388,9 +534,8 @@
 			return;
 	}
 
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
-	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
-			&blocks);
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+			&XFS_RMAP_OINFO_INOBT, &blocks);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
 	if (blocks != inobt_blocks + finobt_blocks)
@@ -405,21 +550,21 @@
 xchk_iallocbt_xref_rmap_inodes(
 	struct xfs_scrub	*sc,
 	int			which,
-	xfs_filblks_t		inode_blocks)
+	unsigned long long	inodes)
 {
-	struct xfs_owner_info	oinfo;
 	xfs_filblks_t		blocks;
+	xfs_filblks_t		inode_blocks;
 	int			error;
 
 	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 		return;
 
 	/* Check that we saw as many inode blocks as the rmap knows about. */
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
-	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
-			&blocks);
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+			&XFS_RMAP_OINFO_INODES, &blocks);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
+	inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize);
 	if (blocks != inode_blocks)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
 }
@@ -431,14 +576,16 @@
 	xfs_btnum_t		which)
 {
 	struct xfs_btree_cur	*cur;
-	struct xfs_owner_info	oinfo;
-	xfs_filblks_t		inode_blocks = 0;
+	struct xchk_iallocbt	iabt = {
+		.inodes		= 0,
+		.next_startino	= NULLAGINO,
+		.next_cluster_ino = NULLAGINO,
+	};
 	int			error;
 
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
 	cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
-	error = xchk_btree(sc, cur, xchk_iallocbt_rec, &oinfo,
-			&inode_blocks);
+	error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
+			&iabt);
 	if (error)
 		return error;
 
@@ -452,7 +599,7 @@
 	 * to inode chunks with free inodes.
 	 */
 	if (which == XFS_BTNUM_INO)
-		xchk_iallocbt_xref_rmap_inodes(sc, which, inode_blocks);
+		xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
 
 	return error;
 }
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index e386c9b..6d483ab 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -9,27 +9,17 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_inode_buf.h"
-#include "xfs_inode_fork.h"
 #include "xfs_ialloc.h"
 #include "xfs_da_format.h"
 #include "xfs_reflink.h"
 #include "xfs_rmap.h"
-#include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "scrub/trace.h"
 
 /*
  * Grab total control of the inode metadata.  It doesn't matter here if
@@ -509,7 +499,6 @@
 	xfs_ino_t		ino,
 	struct xfs_dinode	*dip)
 {
-	struct xfs_owner_info	oinfo;
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	int			error;
@@ -526,8 +515,7 @@
 
 	xchk_xref_is_used_space(sc, agbno, 1);
 	xchk_inode_xref_finobt(sc, ino);
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
-	xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
 	xchk_xref_is_not_shared(sc, agbno, 1);
 	xchk_inode_xref_bmap(sc, dip);
 
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1c9d7c7..c962bd5 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -9,21 +9,13 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_icache.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 
 /* Set us up to scrub parents. */
 int
@@ -320,7 +312,7 @@
 	 * If we failed to lock the parent inode even after a retry, just mark
 	 * this scrub incomplete and return.
 	 */
-	if (sc->try_harder && error == -EDEADLOCK) {
+	if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
 		error = 0;
 		xchk_set_incomplete(sc);
 	}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 782d582..0a33b44 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -9,24 +9,13 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
-#include "xfs_dquot.h"
-#include "xfs_dquot_item.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 
 /* Convert a scrub type code to a DQ flag, or return 0 if error. */
 static inline uint
@@ -60,7 +49,7 @@
 	dqtype = xchk_quota_to_dqtype(sc);
 	if (dqtype == 0)
 		return -EINVAL;
-	sc->has_quotaofflock = true;
+	sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
 	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
 	if (!xfs_this_quota_on(sc->mp, dqtype))
 		return -ENOENT;
@@ -144,7 +133,7 @@
 	if (bsoft > bhard)
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 
-	if (ihard > mp->m_maxicount)
+	if (ihard > M_IGEO(mp)->maxicount)
 		xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
 	if (isoft > ihard)
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index e8c82b0..0cab11a 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,22 +7,12 @@
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "scrub/trace.h"
 
 /*
  * Set us up to scrub reference count btrees.
@@ -351,7 +341,6 @@
 	xfs_extlen_t		len;
 	xfs_nlink_t		refcount;
 	bool			has_cowflag;
-	int			error = 0;
 
 	bno = be32_to_cpu(rec->refc.rc_startblock);
 	len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -376,14 +365,13 @@
 
 	xchk_refcountbt_xref(bs->sc, bno, len, refcount);
 
-	return error;
+	return 0;
 }
 
 /* Make sure we have as many refc blocks as the rmap says. */
 STATIC void
 xchk_refcount_xref_rmap(
 	struct xfs_scrub	*sc,
-	struct xfs_owner_info	*oinfo,
 	xfs_filblks_t		cow_blocks)
 {
 	xfs_extlen_t		refcbt_blocks = 0;
@@ -397,17 +385,16 @@
 	error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
 	if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
 		return;
-	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
-			&blocks);
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+			&XFS_RMAP_OINFO_REFC, &blocks);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
 	if (blocks != refcbt_blocks)
 		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
 
 	/* Check that we saw as many cow blocks as the rmap knows about. */
-	xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW);
-	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
-			&blocks);
+	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+			&XFS_RMAP_OINFO_COW, &blocks);
 	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 		return;
 	if (blocks != cow_blocks)
@@ -419,17 +406,15 @@
 xchk_refcountbt(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
 	xfs_agblock_t		cow_blocks = 0;
 	int			error;
 
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
 	error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
-			&oinfo, &cow_blocks);
+			&XFS_RMAP_OINFO_REFC, &cow_blocks);
 	if (error)
 		return error;
 
-	xchk_refcount_xref_rmap(sc, &oinfo, cow_blocks);
+	xchk_refcount_xref_rmap(sc, cow_blocks);
 
 	return 0;
 }
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 9f08dd9..b70a88b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -9,27 +9,21 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_icache.h"
 #include "xfs_alloc.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_extent_busy.h"
 #include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
 #include "xfs_quota.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -44,8 +38,7 @@
 int
 xrep_attempt(
 	struct xfs_inode	*ip,
-	struct xfs_scrub	*sc,
-	bool			*fixed)
+	struct xfs_scrub	*sc)
 {
 	int			error = 0;
 
@@ -64,13 +57,13 @@
 		 * scrub so that we can tell userspace if we fixed the problem.
 		 */
 		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
-		*fixed = true;
+		sc->flags |= XREP_ALREADY_FIXED;
 		return -EAGAIN;
 	case -EDEADLOCK:
 	case -EAGAIN:
 		/* Tell the caller to try again having grabbed all the locks. */
-		if (!sc->try_harder) {
-			sc->try_harder = true;
+		if (!(sc->flags & XCHK_TRY_HARDER)) {
+			sc->flags |= XCHK_TRY_HARDER;
 			return -EAGAIN;
 		}
 		/*
@@ -135,10 +128,16 @@
 	if (sc->sa.agfl_bp)
 		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
 
-	/* Roll the transaction. */
+	/*
+	 * Roll the transaction.  We still own the buffer and the buffer lock
+	 * regardless of whether or not the roll succeeds.  If the roll fails,
+	 * the buffers will be released during teardown on our way out of the
+	 * kernel.  If it succeeds, we join them to the new transaction and
+	 * move on.
+	 */
 	error = xfs_trans_roll(&sc->tp);
 	if (error)
-		goto out_release;
+		return error;
 
 	/* Join AG headers to the new transaction. */
 	if (sc->sa.agi_bp)
@@ -149,21 +148,6 @@
 		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
 
 	return 0;
-
-out_release:
-	/*
-	 * Rolling failed, so release the hold on the buffers.  The
-	 * buffers will be released during teardown on our way out
-	 * of the kernel.
-	 */
-	if (sc->sa.agi_bp)
-		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
-	if (sc->sa.agf_bp)
-		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
-	if (sc->sa.agfl_bp)
-		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
-
-	return error;
 }
 
 /*
@@ -297,14 +281,14 @@
 /* Allocate a block in an AG. */
 int
 xrep_alloc_ag_block(
-	struct xfs_scrub	*sc,
-	struct xfs_owner_info	*oinfo,
-	xfs_fsblock_t		*fsbno,
-	enum xfs_ag_resv_type	resv)
+	struct xfs_scrub		*sc,
+	const struct xfs_owner_info	*oinfo,
+	xfs_fsblock_t			*fsbno,
+	enum xfs_ag_resv_type		resv)
 {
-	struct xfs_alloc_arg	args = {0};
-	xfs_agblock_t		bno;
-	int			error;
+	struct xfs_alloc_arg		args = {0};
+	xfs_agblock_t			bno;
+	int				error;
 
 	switch (resv) {
 	case XFS_AG_RESV_AGFL:
@@ -365,9 +349,9 @@
 	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
 			XFS_FSB_TO_BB(mp, 1), 0);
 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
-	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
 	bp->b_ops = ops;
 	*bpp = bp;
 
@@ -503,7 +487,6 @@
 	struct xfs_scrub	*sc,
 	xfs_agblock_t		agbno)
 {
-	struct xfs_owner_info	oinfo;
 	int			error;
 
 	/* Make sure there's space on the freelist. */
@@ -516,9 +499,8 @@
 	 * create an rmap for the block prior to merging it or else other
 	 * parts will break.
 	 */
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
-			&oinfo);
+			&XFS_RMAP_OINFO_AG);
 	if (error)
 		return error;
 
@@ -536,17 +518,17 @@
 /* Dispose of a single block. */
 STATIC int
 xrep_reap_block(
-	struct xfs_scrub	*sc,
-	xfs_fsblock_t		fsbno,
-	struct xfs_owner_info	*oinfo,
-	enum xfs_ag_resv_type	resv)
+	struct xfs_scrub		*sc,
+	xfs_fsblock_t			fsbno,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		resv)
 {
-	struct xfs_btree_cur	*cur;
-	struct xfs_buf		*agf_bp = NULL;
-	xfs_agnumber_t		agno;
-	xfs_agblock_t		agbno;
-	bool			has_other_rmap;
-	int			error;
+	struct xfs_btree_cur		*cur;
+	struct xfs_buf			*agf_bp = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	bool				has_other_rmap;
+	int				error;
 
 	agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
 	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
@@ -610,15 +592,15 @@
 /* Dispose of every block of every extent in the bitmap. */
 int
 xrep_reap_extents(
-	struct xfs_scrub	*sc,
-	struct xfs_bitmap	*bitmap,
-	struct xfs_owner_info	*oinfo,
-	enum xfs_ag_resv_type	type)
+	struct xfs_scrub		*sc,
+	struct xfs_bitmap		*bitmap,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type)
 {
-	struct xfs_bitmap_range	*bmr;
-	struct xfs_bitmap_range	*n;
-	xfs_fsblock_t		fsbno;
-	int			error = 0;
+	struct xfs_bitmap_range		*bmr;
+	struct xfs_bitmap_range		*n;
+	xfs_fsblock_t			fsbno;
+	int				error = 0;
 
 	ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
 
@@ -682,7 +664,7 @@
 {
 	xfs_agblock_t		*agbno = priv;
 
-	return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+	return (*agbno == bno) ? -ECANCELED : 0;
 }
 
 /* Does this block match the btree information passed in? */
@@ -692,13 +674,14 @@
 	struct xrep_find_ag_btree	*fab,
 	uint64_t			owner,
 	xfs_agblock_t			agbno,
-	bool				*found_it)
+	bool				*done_with_block)
 {
 	struct xfs_mount		*mp = ri->sc->mp;
 	struct xfs_buf			*bp;
 	struct xfs_btree_block		*btblock;
 	xfs_daddr_t			daddr;
-	int				error;
+	int				block_level;
+	int				error = 0;
 
 	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
 
@@ -711,42 +694,123 @@
 	if (owner == XFS_RMAP_OWN_AG) {
 		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 				xrep_findroot_agfl_walk, &agbno);
-		if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+		if (error == -ECANCELED)
 			return 0;
 		if (error)
 			return error;
 	}
 
+	/*
+	 * Read the buffer into memory so that we can see if it's a match for
+	 * our btree type.  We have no clue if it is beforehand, and we want to
+	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+	 * will cause needless disk reads in subsequent calls to this function)
+	 * and logging metadata verifier failures.
+	 *
+	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
+	 * memory from some other caller it will already have b_ops assigned.
+	 * If it was in memory from a previous unsuccessful findroot_block
+	 * call, the buffer won't have b_ops but it should be clean and ready
+	 * for us to try to verify if the read call succeeds.  The same applies
+	 * if the buffer wasn't in memory at all.
+	 *
+	 * Note: If we never match a btree type with this buffer, it will be
+	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
+	 * the buffer gets written.
+	 */
 	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
 			mp->m_bsize, 0, &bp, NULL);
 	if (error)
 		return error;
 
-	/*
-	 * Does this look like a block matching our fs and higher than any
-	 * other block we've found so far?  If so, reattach buffer verifiers
-	 * so the AIL won't complain if the buffer is also dirty.
-	 */
+	/* Ensure the block magic matches the btree type we're looking for. */
 	btblock = XFS_BUF_TO_BLOCK(bp);
-	if (be32_to_cpu(btblock->bb_magic) != fab->magic)
-		goto out;
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
-		goto out;
-	bp->b_ops = fab->buf_ops;
-
-	/* Ignore this block if it's lower in the tree than we've seen. */
-	if (fab->root != NULLAGBLOCK &&
-	    xfs_btree_get_level(btblock) < fab->height)
+	ASSERT(fab->buf_ops->magic[1] != 0);
+	if (btblock->bb_magic != fab->buf_ops->magic[1])
 		goto out;
 
-	/* Make sure we pass the verifiers. */
-	bp->b_ops->verify_read(bp);
-	if (bp->b_error)
+	/*
+	 * If the buffer already has ops applied and they're not the ones for
+	 * this btree type, we know this block doesn't match the btree and we
+	 * can bail out.
+	 *
+	 * If the buffer ops match ours, someone else has already validated
+	 * the block for us, so we can move on to checking if this is a root
+	 * block candidate.
+	 *
+	 * If the buffer does not have ops, nobody has successfully validated
+	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
+	 * and structure match this btree type then we'll move on to checking
+	 * if it's a root block candidate.  If there is no match, bail out.
+	 */
+	if (bp->b_ops) {
+		if (bp->b_ops != fab->buf_ops)
+			goto out;
+	} else {
+		ASSERT(!xfs_trans_buf_is_dirty(bp));
+		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+				&mp->m_sb.sb_meta_uuid))
+			goto out;
+		/*
+		 * Read verifiers can reference b_ops, so we set the pointer
+		 * here.  If the verifier fails we'll reset the buffer state
+		 * to what it was before we touched the buffer.
+		 */
+		bp->b_ops = fab->buf_ops;
+		fab->buf_ops->verify_read(bp);
+		if (bp->b_error) {
+			bp->b_ops = NULL;
+			bp->b_error = 0;
+			goto out;
+		}
+
+		/*
+		 * Some read verifiers will (re)set b_ops, so we must be
+		 * careful not to change b_ops after running the verifier.
+		 */
+	}
+
+	/*
+	 * This block passes the magic/uuid and verifier tests for this btree
+	 * type.  We don't need the caller to try the other tree types.
+	 */
+	*done_with_block = true;
+
+	/*
+	 * Compare this btree block's level to the height of the current
+	 * candidate root block.
+	 *
+	 * If the level matches the root we found previously, throw away both
+	 * blocks because there can't be two candidate roots.
+	 *
+	 * If level is lower in the tree than the root we found previously,
+	 * ignore this block.
+	 */
+	block_level = xfs_btree_get_level(btblock);
+	if (block_level + 1 == fab->height) {
+		fab->root = NULLAGBLOCK;
 		goto out;
-	fab->root = agbno;
-	fab->height = xfs_btree_get_level(btblock) + 1;
-	*found_it = true;
+	} else if (block_level < fab->height) {
+		goto out;
+	}
+
+	/*
+	 * This is the highest block in the tree that we've found so far.
+	 * Update the btree height to reflect what we've learned from this
+	 * block.
+	 */
+	fab->height = block_level + 1;
+
+	/*
+	 * If this block doesn't have sibling pointers, then it's the new root
+	 * block candidate.  Otherwise, the root will be found farther up the
+	 * tree.
+	 */
+	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+		fab->root = agbno;
+	else
+		fab->root = NULLAGBLOCK;
 
 	trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
 			be32_to_cpu(btblock->bb_magic), fab->height - 1);
@@ -768,7 +832,7 @@
 	struct xrep_findroot		*ri = priv;
 	struct xrep_find_ag_btree	*fab;
 	xfs_agblock_t			b;
-	bool				found_it;
+	bool				done;
 	int				error = 0;
 
 	/* Ignore anything that isn't AG metadata. */
@@ -777,16 +841,16 @@
 
 	/* Otherwise scan each block + btree type. */
 	for (b = 0; b < rec->rm_blockcount; b++) {
-		found_it = false;
+		done = false;
 		for (fab = ri->btree_info; fab->buf_ops; fab++) {
 			if (rec->rm_owner != fab->rmap_owner)
 				continue;
 			error = xrep_findroot_block(ri, fab,
 					rec->rm_owner, rec->rm_startblock + b,
-					&found_it);
+					&done);
 			if (error)
 				return error;
-			if (found_it)
+			if (done)
 				break;
 		}
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9de321e..60c61d7 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -15,14 +15,15 @@
 
 /* Repair helpers */
 
-int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed);
+int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc);
 void xrep_failure(struct xfs_mount *mp);
 int xrep_roll_ag_trans(struct xfs_scrub *sc);
 bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
 		enum xfs_ag_resv_type type);
 xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc, struct xfs_owner_info *oinfo,
-		xfs_fsblock_t *fsbno, enum xfs_ag_resv_type resv);
+int xrep_alloc_ag_block(struct xfs_scrub *sc,
+		const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+		enum xfs_ag_resv_type resv);
 int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
 		struct xfs_buf **bpp, xfs_btnum_t btnum,
 		const struct xfs_buf_ops *ops);
@@ -32,7 +33,7 @@
 int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
 int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
 int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
-		struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+		const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
 
 struct xrep_find_ag_btree {
 	/* in: rmap owner of the btree we're looking for */
@@ -41,9 +42,6 @@
 	/* in: buffer ops */
 	const struct xfs_buf_ops	*buf_ops;
 
-	/* in: magic number of the btree */
-	uint32_t			magic;
-
 	/* out: the highest btree block found and the tree height */
 	xfs_agblock_t			root;
 	unsigned int			height;
@@ -66,8 +64,7 @@
 
 static inline int xrep_attempt(
 	struct xfs_inode	*ip,
-	struct xfs_scrub	*sc,
-	bool			*fixed)
+	struct xfs_scrub	*sc)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 5e293c1..8d4cefd 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -9,21 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
 #include "xfs_rmap.h"
 #include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
-#include "scrub/trace.h"
 
 /*
  * Set us up to scrub reverse mapping btrees.
@@ -174,24 +165,21 @@
 xchk_rmapbt(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_owner_info	oinfo;
-
-	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 	return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
-			&oinfo, NULL);
+			&XFS_RMAP_OINFO_AG, NULL);
 }
 
 /* xref check that the extent is owned by a given owner */
 static inline void
 xchk_xref_check_owner(
-	struct xfs_scrub	*sc,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo,
-	bool			should_have_rmap)
+	struct xfs_scrub		*sc,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	bool				should_have_rmap)
 {
-	bool			has_rmap;
-	int			error;
+	bool				has_rmap;
+	int				error;
 
 	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 		return;
@@ -207,10 +195,10 @@
 /* xref check that the extent is owned by a given owner */
 void
 xchk_xref_is_owned_by(
-	struct xfs_scrub	*sc,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_scrub		*sc,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo)
 {
 	xchk_xref_check_owner(sc, bno, len, oinfo, true);
 }
@@ -218,10 +206,10 @@
 /* xref check that the extent is not owned by a given owner */
 void
 xchk_xref_is_not_owned_by(
-	struct xfs_scrub	*sc,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_scrub		*sc,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo)
 {
 	xchk_xref_check_owner(sc, bno, len, oinfo, false);
 }
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bb..c642bc2 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,19 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_inode.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 
 /* Set us up with the realtime metadata locked. */
 int
@@ -141,9 +134,8 @@
 	startext = fsbno;
 	endext = fsbno + len - 1;
 	do_div(startext, sc->mp->m_sb.sb_rextsize);
-	if (do_div(endext, sc->mp->m_sb.sb_rextsize))
-		endext++;
-	extcount = endext - startext;
+	do_div(endext, sc->mp->m_sb.sb_rextsize);
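+	/* Both are now rt extent numbers, so count extents inclusively. */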
+	extcount = endext - startext + 1;
 	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
 	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
 			&is_free);
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4bfae1e..15c8c5f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -9,37 +9,18 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_alloc.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
-#include "xfs_refcount_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_rmap_btree.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
-#include "scrub/btree.h"
 #include "scrub/repair.h"
+#include "scrub/health.h"
 
 /*
  * Online Scrub and Repair
@@ -186,8 +167,12 @@
 			xfs_irele(sc->ip);
 		sc->ip = NULL;
 	}
-	if (sc->has_quotaofflock)
+	if (sc->flags & XCHK_REAPING_DISABLED)
+		xchk_start_reaping(sc);
+	if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
 		mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+		sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
+	}
 	if (sc->buf) {
 		kmem_free(sc->buf);
 		sc->buf = NULL;
@@ -347,6 +332,12 @@
 		.scrub	= xchk_quota,
 		.repair	= xrep_notsupported,
 	},
+	[XFS_SCRUB_TYPE_FSCOUNTERS] = {	/* fs summary counters */
+		.type	= ST_FS,
+		.setup	= xchk_setup_fscounters,
+		.scrub	= xchk_fscounters,
+		.repair	= xrep_notsupported,
+	},
 };
 
 /* This isn't a stable feature, warn once per day. */
@@ -412,19 +403,6 @@
 		goto out;
 	}
 
-	error = -EOPNOTSUPP;
-	/*
-	 * We won't scrub any filesystem that doesn't have the ability
-	 * to record unwritten extents.  The option was made default in
-	 * 2003, removed from mkfs in 2007, and cannot be disabled in
-	 * v5, so if we find a filesystem without this flag it's either
-	 * really old or totally unsupported.  Avoid it either way.
-	 * We also don't support v1-v3 filesystems, which aren't
-	 * mountable.
-	 */
-	if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
-		goto out;
-
 	/*
 	 * We only want to repair read-write v5+ filesystems.  Defer the check
 	 * for ops->repair until after our scrub confirms that we need to
@@ -479,10 +457,14 @@
 	struct xfs_inode		*ip,
 	struct xfs_scrub_metadata	*sm)
 {
-	struct xfs_scrub		sc;
+	struct xfs_scrub		sc = {
+		.mp			= ip->i_mount,
+		.sm			= sm,
+		.sa			= {
+			.agno		= NULLAGNUMBER,
+		},
+	};
 	struct xfs_mount		*mp = ip->i_mount;
-	bool				try_harder = false;
-	bool				already_fixed = false;
 	int				error = 0;
 
 	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -504,21 +486,17 @@
 
 	xchk_experimental_warning(mp);
 
+	sc.ops = &meta_scrub_ops[sm->sm_type];
+	sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
 retry_op:
 	/* Set up for the operation. */
-	memset(&sc, 0, sizeof(sc));
-	sc.mp = ip->i_mount;
-	sc.sm = sm;
-	sc.ops = &meta_scrub_ops[sm->sm_type];
-	sc.try_harder = try_harder;
-	sc.sa.agno = NULLAGNUMBER;
 	error = sc.ops->setup(&sc, ip);
 	if (error)
 		goto out_teardown;
 
 	/* Scrub for errors. */
 	error = sc.ops->scrub(&sc);
-	if (!try_harder && error == -EDEADLOCK) {
+	if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
 		/*
 		 * Scrubbers return -EDEADLOCK to mean 'try harder'.
 		 * Tear down everything we hold, then set up again with
@@ -527,12 +505,15 @@
 		error = xchk_teardown(&sc, ip, 0);
 		if (error)
 			goto out;
-		try_harder = true;
+		sc.flags |= XCHK_TRY_HARDER;
 		goto retry_op;
 	} else if (error)
 		goto out_teardown;
 
-	if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+	xchk_update_health(&sc);
+
+	if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+	    !(sc.flags & XREP_ALREADY_FIXED)) {
 		bool needs_fix;
 
 		/* Let debug users force us into the repair routines. */
@@ -555,10 +536,13 @@
 		 * If it's broken, userspace wants us to fix it, and we haven't
 		 * already tried to fix it, then attempt a repair.
 		 */
-		error = xrep_attempt(ip, &sc, &already_fixed);
+		error = xrep_attempt(ip, &sc);
 		if (error == -EAGAIN) {
-			if (sc.try_harder)
-				try_harder = true;
+			/*
+			 * Either the repair function succeeded or it couldn't
+			 * get all the resources it needs; either way, we go
+			 * back to the beginning and call the scrub function.
+			 */
 			error = xchk_teardown(&sc, ip, 0);
 			if (error) {
 				xrep_failure(mp);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index af323b2..ad1ceb4 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -62,13 +62,27 @@
 	struct xfs_inode		*ip;
 	void				*buf;
 	uint				ilock_flags;
-	bool				try_harder;
-	bool				has_quotaofflock;
+
+	/* See the XCHK/XREP state flags below. */
+	unsigned int			flags;
+
+	/*
+	 * The XFS_SICK_* flags that correspond to the metadata being scrubbed
+	 * or repaired.  We will use this mask to update the in-core fs health
+	 * status with whatever we find.
+	 */
+	unsigned int			sick_mask;
 
 	/* State tracking for single-AG operations. */
 	struct xchk_ag			sa;
 };
 
+/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
+#define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
+#define XCHK_HAS_QUOTAOFFLOCK	(1 << 1)  /* we hold the quotaoff lock */
+#define XCHK_REAPING_DISABLED	(1 << 2)  /* background block reaping paused */
+#define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
+
 /* Metadata scrubbers */
 int xchk_tester(struct xfs_scrub *sc);
 int xchk_superblock(struct xfs_scrub *sc);
@@ -113,6 +127,7 @@
 	return -ENOENT;
 }
 #endif
+int xchk_fscounters(struct xfs_scrub *sc);
 
 /* cross-referencing helpers */
 void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
@@ -122,9 +137,9 @@
 void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len);
 void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
-		xfs_extlen_t len, struct xfs_owner_info *oinfo);
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
 void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
-		xfs_extlen_t len, struct xfs_owner_info *oinfo);
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
 void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno,
 		xfs_extlen_t len);
 void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
@@ -138,4 +153,12 @@
 # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
 #endif
 
+struct xchk_fscounters {
+	uint64_t		icount;
+	uint64_t		ifree;
+	uint64_t		fdblocks;
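+	/* lower and upper bounds for a plausible inode count */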
+	unsigned long long	icount_min;
+	unsigned long long	icount_max;
+};
+
 #endif	/* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index f7ebaa9..5641ae5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -9,19 +9,11 @@
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
 #include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
 #include "xfs_inode.h"
-#include "xfs_inode_fork.h"
 #include "xfs_symlink.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
-#include "scrub/trace.h"
 
 /* Set us up to scrub a symbolic link. */
 int
@@ -30,7 +22,7 @@
 	struct xfs_inode	*ip)
 {
 	/* Allocate the buffer without the inode lock held. */
-	sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+	sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
 	if (!sc->buf)
 		return -ENOMEM;
 
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 96feaf8..9eaab2e 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -10,15 +10,9 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
-#include "scrub/common.h"
 
 /* Figure out which block the btree cursor was pointing to. */
 static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4e20f0e..3362bae 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -12,6 +12,73 @@
 #include <linux/tracepoint.h>
 #include "xfs_bit.h"
 
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+
+#define XFS_SCRUB_TYPE_STRINGS \
+	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
+	{ XFS_SCRUB_TYPE_SB,		"sb" }, \
+	{ XFS_SCRUB_TYPE_AGF,		"agf" }, \
+	{ XFS_SCRUB_TYPE_AGFL,		"agfl" }, \
+	{ XFS_SCRUB_TYPE_AGI,		"agi" }, \
+	{ XFS_SCRUB_TYPE_BNOBT,		"bnobt" }, \
+	{ XFS_SCRUB_TYPE_CNTBT,		"cntbt" }, \
+	{ XFS_SCRUB_TYPE_INOBT,		"inobt" }, \
+	{ XFS_SCRUB_TYPE_FINOBT,	"finobt" }, \
+	{ XFS_SCRUB_TYPE_RMAPBT,	"rmapbt" }, \
+	{ XFS_SCRUB_TYPE_REFCNTBT,	"refcountbt" }, \
+	{ XFS_SCRUB_TYPE_INODE,		"inode" }, \
+	{ XFS_SCRUB_TYPE_BMBTD,		"bmapbtd" }, \
+	{ XFS_SCRUB_TYPE_BMBTA,		"bmapbta" }, \
+	{ XFS_SCRUB_TYPE_BMBTC,		"bmapbtc" }, \
+	{ XFS_SCRUB_TYPE_DIR,		"directory" }, \
+	{ XFS_SCRUB_TYPE_XATTR,		"xattr" }, \
+	{ XFS_SCRUB_TYPE_SYMLINK,	"symlink" }, \
+	{ XFS_SCRUB_TYPE_PARENT,	"parent" }, \
+	{ XFS_SCRUB_TYPE_RTBITMAP,	"rtbitmap" }, \
+	{ XFS_SCRUB_TYPE_RTSUM,		"rtsummary" }, \
+	{ XFS_SCRUB_TYPE_UQUOTA,	"usrquota" }, \
+	{ XFS_SCRUB_TYPE_GQUOTA,	"grpquota" }, \
+	{ XFS_SCRUB_TYPE_PQUOTA,	"prjquota" }, \
+	{ XFS_SCRUB_TYPE_FSCOUNTERS,	"fscounters" }
+
 DECLARE_EVENT_CLASS(xchk_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
 		 int error),
@@ -36,10 +103,10 @@
 		__entry->flags = sm->sm_flags;
 		__entry->error = error;
 	),
-	TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d",
+	TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->agno,
 		  __entry->inum,
 		  __entry->gen,
@@ -78,9 +145,9 @@
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS",
+	TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->agno,
 		  __entry->bno,
 		  __entry->error,
@@ -109,11 +176,11 @@
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->whichfork,
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->offset,
 		  __entry->error,
 		  __entry->ret_ip)
@@ -144,9 +211,9 @@
 		__entry->bno = bno;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS",
+	TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->agno,
 		  __entry->bno,
 		  __entry->ret_ip)
@@ -158,6 +225,7 @@
 		 void *ret_ip), \
 	TP_ARGS(sc, daddr, ret_ip))
 
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error);
 DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error);
 DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen);
 
@@ -176,10 +244,10 @@
 		__entry->type = sc->sm->sm_type;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->ret_ip)
 )
 
@@ -213,11 +281,11 @@
 		__entry->offset = offset;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->whichfork,
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->offset,
 		  __entry->ret_ip)
 );
@@ -244,9 +312,9 @@
 		__entry->type = sc->sm->sm_type;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u ret_ip %pS",
+	TP_printk("dev %d:%d type %s ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->ret_ip)
 );
 
@@ -278,10 +346,10 @@
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+	TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
-		  __entry->btnum,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -321,12 +389,12 @@
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->whichfork,
-		  __entry->type,
-		  __entry->btnum,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -360,10 +428,10 @@
 		__entry->ptr = cur->bc_ptrs[level];
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+	TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
-		  __entry->btnum,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -400,12 +468,12 @@
 		__entry->ptr = cur->bc_ptrs[level];
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->whichfork,
-		  __entry->type,
-		  __entry->btnum,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -439,10 +507,10 @@
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->ptr = cur->bc_ptrs[level];
 	),
-	TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+	TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
-		  __entry->btnum,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->agno,
 		  __entry->bno,
 		  __entry->level,
@@ -473,13 +541,116 @@
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %u xref error %d ret_ip %pF",
+	TP_printk("dev %d:%d type %s xref error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->type,
+		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
 		  __entry->error,
 		  __entry->ret_ip)
 );
 
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t startino, xfs_daddr_t map_daddr,
+		 unsigned short map_len, unsigned int chunk_ino,
+		 unsigned int nr_inodes, uint16_t cluster_mask,
+		 uint16_t holemask, unsigned int cluster_ino),
+	TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+		cluster_mask, holemask, cluster_ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+		__field(xfs_daddr_t, map_daddr)
+		__field(unsigned short, map_len)
+		__field(unsigned int, chunk_ino)
+		__field(unsigned int, nr_inodes)
+		__field(unsigned int, cluster_ino)
+		__field(uint16_t, cluster_mask)
+		__field(uint16_t, holemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = startino;
+		__entry->map_daddr = map_daddr;
+		__entry->map_len = map_len;
+		__entry->chunk_ino = chunk_ino;
+		__entry->nr_inodes = nr_inodes;
+		__entry->cluster_mask = cluster_mask;
+		__entry->holemask = holemask;
+		__entry->cluster_ino = cluster_ino;
+	),
+	TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startino,
+		  __entry->map_daddr,
+		  __entry->map_len,
+		  __entry->chunk_ino,
+		  __entry->nr_inodes,
+		  __entry->cluster_mask,
+		  __entry->holemask,
+		  __entry->cluster_ino)
+)
+
+TRACE_EVENT(xchk_fscounters_calc,
+	TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
+		 uint64_t fdblocks, uint64_t delalloc),
+	TP_ARGS(mp, icount, ifree, fdblocks, delalloc),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int64_t, icount_sb)
+		__field(uint64_t, icount_calculated)
+		__field(int64_t, ifree_sb)
+		__field(uint64_t, ifree_calculated)
+		__field(int64_t, fdblocks_sb)
+		__field(uint64_t, fdblocks_calculated)
+		__field(uint64_t, delalloc)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->icount_sb = mp->m_sb.sb_icount;
+		__entry->icount_calculated = icount;
+		__entry->ifree_sb = mp->m_sb.sb_ifree;
+		__entry->ifree_calculated = ifree;
+		__entry->fdblocks_sb = mp->m_sb.sb_fdblocks;
+		__entry->fdblocks_calculated = fdblocks;
+		__entry->delalloc = delalloc;
+	),
+	TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->icount_sb,
+		  __entry->icount_calculated,
+		  __entry->ifree_sb,
+		  __entry->ifree_calculated,
+		  __entry->fdblocks_sb,
+		  __entry->fdblocks_calculated,
+		  __entry->delalloc)
+)
+
+TRACE_EVENT(xchk_fscounters_within_range,
+	TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value,
+		 int64_t old_value),
+	TP_ARGS(mp, expected, curr_value, old_value),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(uint64_t, expected)
+		__field(int64_t, curr_value)
+		__field(int64_t, old_value)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->expected = expected;
+		__entry->curr_value = curr_value;
+		__entry->old_value = old_value;
+	),
+	TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->expected,
+		  __entry->curr_value,
+		  __entry->old_value)
+)
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
@@ -598,11 +769,11 @@
 		__entry->agbno = agbno;
 		__entry->btnum = btnum;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+	TP_printk("dev %d:%d agno %u agbno %u btree %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
-		  __entry->btnum)
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
 )
 TRACE_EVENT(xrep_findroot_block,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8039e35..96d7071 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -4,16 +4,14 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_trace.h"
-#include <linux/slab.h>
-#include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
 
@@ -114,7 +112,7 @@
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	struct posix_acl *acl = NULL;
-	struct xfs_acl *xfs_acl;
+	struct xfs_acl *xfs_acl = NULL;
 	unsigned char *ea_name;
 	int error;
 	int len;
@@ -137,12 +135,8 @@
 	 * go out to the disk.
 	 */
 	len = XFS_ACL_MAX_SIZE(ip->i_mount);
-	xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
-	if (!xfs_acl)
-		return ERR_PTR(-ENOMEM);
-
-	error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
-							&len, ATTR_ROOT);
+	error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
+				ATTR_ALLOC | ATTR_ROOT);
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
@@ -153,8 +147,8 @@
 	} else  {
 		acl = xfs_acl_from_disk(xfs_acl, len,
 					XFS_ACL_MAX_ENTRIES(ip->i_mount));
+		kmem_free(xfs_acl);
 	}
-	kmem_free(xfs_acl);
 	return acl;
 }
 
@@ -182,7 +176,7 @@
 		struct xfs_acl *xfs_acl;
 		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
 
-		xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+		xfs_acl = kmem_zalloc_large(len, 0);
 		if (!xfs_acl)
 			return -ENOMEM;
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 49f5f58..f16d5f1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -12,23 +12,19 @@
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include <linux/writeback.h>
 
 /*
  * structure owned by writepages passed to individual writepage calls
  */
 struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec    imap;
-	unsigned int		io_type;
+	int			fork;
+	unsigned int		data_seq;
 	unsigned int		cow_seq;
 	struct xfs_ioend	*ioend;
 };
@@ -62,7 +58,7 @@
 static void
 xfs_finish_page_writeback(
 	struct inode		*inode,
-	struct bio_vec		*bvec,
+	struct bio_vec	*bvec,
 	int			error)
 {
 	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);
@@ -97,7 +93,7 @@
 
 	for (bio = &ioend->io_inline_bio; bio; bio = next) {
 		struct bio_vec	*bvec;
-		int		i;
+		struct bvec_iter_all iter_all;
 
 		/*
 		 * For the last bio, bi_private points to the ioend, so we
@@ -109,7 +105,7 @@
 			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
-		bio_for_each_segment_all(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, iter_all)
 			xfs_finish_page_writeback(inode, bvec, error);
 		bio_put(bio);
 	}
@@ -137,8 +133,7 @@
 	struct xfs_trans	*tp;
 	int			error;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
-				XFS_TRANS_NOFS, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 	if (error)
 		return error;
 
@@ -232,17 +227,24 @@
  * IO write completion.
  */
 STATIC void
-xfs_end_io(
-	struct work_struct *work)
+xfs_end_ioend(
+	struct xfs_ioend	*ioend)
 {
-	struct xfs_ioend	*ioend =
-		container_of(work, struct xfs_ioend, io_work);
+	struct list_head	ioend_list;
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
+	unsigned int		nofs_flag;
 	int			error;
 
 	/*
+	 * We can allocate memory here while doing writeback on behalf of
+	 * memory reclaim.  To avoid memory allocation deadlocks, set the
+	 * task-wide nofs context for the following operations.
+	 */
+	nofs_flag = memalloc_nofs_save();
+
+	/*
 	 * Just clean up the in-memory structures if the fs has been shut down.
 	 */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -255,35 +257,140 @@
 	 */
 	error = blk_status_to_errno(ioend->io_bio->bi_status);
 	if (unlikely(error)) {
-		switch (ioend->io_type) {
-		case XFS_IO_COW:
+		if (ioend->io_fork == XFS_COW_FORK)
 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
-			break;
-		}
-
 		goto done;
 	}
 
 	/*
-	 * Success:  commit the COW or unwritten blocks if needed.
+	 * Success: commit the COW or unwritten blocks if needed.
 	 */
-	switch (ioend->io_type) {
-	case XFS_IO_COW:
+	if (ioend->io_fork == XFS_COW_FORK)
 		error = xfs_reflink_end_cow(ip, offset, size);
-		break;
-	case XFS_IO_UNWRITTEN:
-		/* writeback should never update isize */
+	else if (ioend->io_state == XFS_EXT_UNWRITTEN)
 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
-		break;
-	default:
+	else
 		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-		break;
-	}
 
 done:
 	if (ioend->io_append_trans)
 		error = xfs_setfilesize_ioend(ioend, error);
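+	/* Complete any ioends merged onto our list with the same error. */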
+	list_replace_init(&ioend->io_list, &ioend_list);
 	xfs_destroy_ioend(ioend, error);
+
+	while (!list_empty(&ioend_list)) {
+		ioend = list_first_entry(&ioend_list, struct xfs_ioend,
+				io_list);
+		list_del_init(&ioend->io_list);
+		xfs_destroy_ioend(ioend, error);
+	}
+
+	memalloc_nofs_restore(nofs_flag);
+}
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+xfs_ioend_can_merge(
+	struct xfs_ioend	*ioend,
+	struct xfs_ioend	*next)
+{
+	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+		return false;
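+	/* A COW fork ioend cannot merge with a data fork ioend. */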
+	if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
+		return false;
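+	/* Both must need unwritten extent conversion, or neither. */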
+	if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
+	    (next->io_state == XFS_EXT_UNWRITTEN))
+		return false;
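+	/* The two ranges must be adjacent in file offset space. */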
+	if (ioend->io_offset + ioend->io_size != next->io_offset)
+		return false;
+	return true;
+}
+
+/*
+ * If the to-be-merged ioend has a preallocated transaction for file
+ * size updates, we need to ensure the ioend it is merged into also
+ * has one.  If it already has one, we can simply cancel the transaction
+ * as it is guaranteed to be clean.
+ */
+static void
+xfs_ioend_merge_append_transactions(
+	struct xfs_ioend	*ioend,
+	struct xfs_ioend	*next)
+{
+	if (!ioend->io_append_trans) {
+		ioend->io_append_trans = next->io_append_trans;
+		next->io_append_trans = NULL;
+	} else {
+		xfs_setfilesize_ioend(next, -ECANCELED);
+	}
+}
+
+/* Try to merge adjacent completions. */
+STATIC void
+xfs_ioend_try_merge(
+	struct xfs_ioend	*ioend,
+	struct list_head	*more_ioends)
+{
+	struct xfs_ioend	*next_ioend;
+
+	while (!list_empty(more_ioends)) {
+		next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
+				io_list);
+		if (!xfs_ioend_can_merge(ioend, next_ioend))
+			break;
+		list_move_tail(&next_ioend->io_list, &ioend->io_list);
+		ioend->io_size += next_ioend->io_size;
+		if (next_ioend->io_append_trans)
+			xfs_ioend_merge_append_transactions(ioend, next_ioend);
+	}
+}
+
+/* list_sort compare function for ioends */
+static int
+xfs_ioend_compare(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	struct xfs_ioend	*ia;
+	struct xfs_ioend	*ib;
+
+	ia = container_of(a, struct xfs_ioend, io_list);
+	ib = container_of(b, struct xfs_ioend, io_list);
+	if (ia->io_offset < ib->io_offset)
+		return -1;
+	else if (ia->io_offset > ib->io_offset)
+		return 1;
+	return 0;
+}
+
+/* Finish all pending io completions. */
+void
+xfs_end_io(
+	struct work_struct	*work)
+{
+	struct xfs_inode	*ip;
+	struct xfs_ioend	*ioend;
+	struct list_head	completion_list;
+	unsigned long		flags;
+
+	ip = container_of(work, struct xfs_inode, i_ioend_work);
+
+	spin_lock_irqsave(&ip->i_ioend_lock, flags);
+	list_replace_init(&ip->i_ioend_list, &completion_list);
+	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+
+	list_sort(NULL, &completion_list, xfs_ioend_compare);
+
+	while (!list_empty(&completion_list)) {
+		ioend = list_first_entry(&completion_list, struct xfs_ioend,
+				io_list);
+		list_del_init(&ioend->io_list);
+		xfs_ioend_try_merge(ioend, &completion_list);
+		xfs_end_ioend(ioend);
+	}
 }
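
The queueing side (xfs_end_bio below) adds each ioend to the inode's i_ioend_list and only schedules i_ioend_work when the list was empty; xfs_end_io then drains the whole batch, sorts it by file offset, and lets xfs_ioend_try_merge fold physically contiguous ioends with matching fork/state/status into one, so the transaction work in xfs_end_ioend runs once per merged range. A minimal userspace sketch of that sort-and-merge shape (hypothetical completion struct, arrays instead of list_heads):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for an ioend: one completed write range. */
struct completion {
	unsigned long	offset;
	unsigned long	size;
	int		status;
};

static int cmp_offset(const void *a, const void *b)
{
	const struct completion *ca = a, *cb = b;

	if (ca->offset < cb->offset)
		return -1;
	return ca->offset > cb->offset;
}

/* Sort by offset, then fold each adjacent same-status range into its
 * predecessor; returns the number of merged ranges. */
static int merge_completions(struct completion *c, int n)
{
	int	i, out = 0;

	qsort(c, n, sizeof(*c), cmp_offset);
	for (i = 1; i < n; i++) {
		if (c[i].status == c[out].status &&
		    c[out].offset + c[out].size == c[i].offset)
			c[out].size += c[i].size;	/* merge */
		else
			c[++out] = c[i];		/* new range */
	}
	return n ? out + 1 : 0;
}

int main(void)
{
	struct completion c[] = {
		{ 8192, 4096, 0 }, { 0, 4096, 0 }, { 4096, 4096, 0 },
	};
	int n = merge_completions(c, 3);

	/* prints "1 range(s) covering 12288 bytes" */
	printf("%d range(s) covering %lu bytes\n", n, c[0].size);
	return 0;
}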
 
 STATIC void
@@ -291,16 +398,92 @@
 	struct bio		*bio)
 {
 	struct xfs_ioend	*ioend = bio->bi_private;
-	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	unsigned long		flags;
 
-	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
-		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-	else if (ioend->io_append_trans)
-		queue_work(mp->m_data_workqueue, &ioend->io_work);
-	else
+	if (ioend->io_fork == XFS_COW_FORK ||
+	    ioend->io_state == XFS_EXT_UNWRITTEN ||
+	    ioend->io_append_trans != NULL) {
+		spin_lock_irqsave(&ip->i_ioend_lock, flags);
+		if (list_empty(&ip->i_ioend_list))
+			WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
+						 &ip->i_ioend_work));
+		list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+		spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+	} else
 		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+	struct xfs_writepage_ctx	*wpc,
+	struct xfs_inode		*ip,
+	xfs_fileoff_t			offset_fsb)
+{
+	if (offset_fsb < wpc->imap.br_startoff ||
+	    offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+		return false;
+	/*
+	 * If this is a COW mapping, it is sufficient to check that the mapping
+	 * covers the offset. Be careful to check this first because the caller
+	 * can revalidate a COW mapping without updating the data seqno.
+	 */
+	if (wpc->fork == XFS_COW_FORK)
+		return true;
+
+	/*
+	 * This is not a COW mapping. Check the sequence number of the data fork
+	 * because concurrent changes could have invalidated the extent. Check
+	 * the COW fork because concurrent changes since the last time we
+	 * checked (and found nothing at this offset) could have added
+	 * overlapping blocks.
+	 */
+	if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+		return false;
+	if (xfs_inode_has_cow_data(ip) &&
+	    wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+		return false;
+	return true;
+}
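+
xfs_imap_valid is a sequence-counter cache check: the writeback context records if_seq when it caches a mapping, every extent-tree change bumps that counter, and a mismatch forces a fresh lookup. The pattern in a self-contained sketch (hypothetical tree/cache types; the kernel reads the counter with READ_ONCE, elided here):

#include <stdbool.h>
#include <stdio.h>

struct extent { unsigned long start, len; };

/* The authoritative tree bumps `tree_seq` on every modification. */
static unsigned int tree_seq;

struct cached_map {
	struct extent	ext;
	unsigned int	seq;	/* tree_seq observed at lookup time */
};

static void modify_tree(void)
{
	tree_seq++;		/* any change invalidates cached mappings */
}

static bool cache_valid(const struct cached_map *c, unsigned long off)
{
	if (off < c->ext.start || off >= c->ext.start + c->ext.len)
		return false;		/* offset outside cached extent */
	return c->seq == tree_seq;	/* extent unchanged since lookup */
}

int main(void)
{
	struct cached_map c = { { 100, 50 }, tree_seq };

	printf("%d\n", cache_valid(&c, 120));	/* 1: in range, seq matches */
	modify_tree();
	printf("%d\n", cache_valid(&c, 120));	/* 0: tree changed */
	return 0;
}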
+
+/*
+ * Pass in a delalloc extent and convert it to real extents; return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+	struct xfs_writepage_ctx *wpc,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb)
+{
+	int			error;
+
+	/*
+	 * Attempt to allocate whatever delalloc extent currently backs
+	 * offset_fsb and put the result into wpc->imap.  Allocate in a loop
+	 * because it may take several attempts to allocate real blocks for a
+	 * contiguous delalloc extent if free space is sufficiently fragmented.
+	 */
+	do {
+		error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+				&wpc->imap, wpc->fork == XFS_COW_FORK ?
+					&wpc->cow_seq : &wpc->data_seq);
+		if (error)
+			return error;
+	} while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+	return 0;
+}
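+
As the comment in xfs_convert_blocks says, one xfs_bmapi_convert_delalloc call may convert only a prefix of the delalloc extent when free space is fragmented, so the helper loops until the returned mapping actually covers offset_fsb. The same shape, runnable, with a stub allocator that hands out fixed 4-block chunks:

#include <stdio.h>

struct map { unsigned long start, len; };

/* Stub conversion: each call "allocates" at most 4 blocks starting at
 * `start`, mimicking an allocator limited by fragmented free space. */
static int convert_some(unsigned long start, struct map *m)
{
	m->start = start;
	m->len = 4;
	return 0;
}

static int convert_until_covered(unsigned long target, struct map *m)
{
	unsigned long	start = 0;
	int		error;

	do {
		error = convert_some(start, m);
		if (error)
			return error;
		start = m->start + m->len;	/* next unconverted block */
	} while (m->start + m->len <= target);

	return 0;
}

int main(void)
{
	struct map m;

	convert_until_covered(10, &m);	/* 3 calls: [0,4) [4,8) [8,12) */
	printf("covered by [%lu,%lu)\n", m.start, m.start + m.len);
	return 0;
}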
+
 STATIC int
 xfs_map_blocks(
 	struct xfs_writepage_ctx *wpc,
@@ -310,26 +493,16 @@
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			count = i_blocksize(inode);
-	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
 	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
 	struct xfs_bmbt_irec	imap;
-	int			whichfork = XFS_DATA_FORK;
 	struct xfs_iext_cursor	icur;
-	bool			imap_valid;
+	int			retries = 0;
 	int			error = 0;
 
-	/*
-	 * We have to make sure the cached mapping is within EOF to protect
-	 * against eofblocks trimming on file release leaving us with a stale
-	 * mapping. Otherwise, a page for a subsequent file extending buffered
-	 * write could get picked up by this writeback cycle and written to the
-	 * wrong blocks.
-	 *
-	 * Note that what we really want here is a generic mapping invalidation
-	 * mechanism to protect us from arbitrary extent modifying contexts, not
-	 * just eofblocks.
-	 */
-	xfs_trim_extent_eof(&wpc->imap, ip);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
 
 	/*
 	 * COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +519,19 @@
 	 * against concurrent updates and provides a memory barrier on the way
 	 * out that ensures that we always see the current value.
 	 */
-	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
-		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
-	if (imap_valid &&
-	    (!xfs_inode_has_cow_data(ip) ||
-	     wpc->io_type == XFS_IO_COW ||
-	     wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+	if (xfs_imap_valid(wpc, ip, offset_fsb))
 		return 0;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
 	/*
 	 * If we don't have a valid map, now it's time to get a new one for this
 	 * offset.  This will convert delayed allocations (including COW ones)
 	 * into real extents.  If we return without a valid map, it means we
 	 * landed in a hole and we skip the block.
 	 */
+retry:
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
-	ASSERT(offset <= mp->m_super->s_maxbytes);
-
-	if (offset > mp->m_super->s_maxbytes - count)
-		count = mp->m_super->s_maxbytes - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 
 	/*
 	 * Check if this offset is covered by a COW extent, and if so use
@@ -382,30 +543,16 @@
 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
 		wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		/*
-		 * Truncate can race with writeback since writeback doesn't
-		 * take the iolock and truncate decreases the file size before
-		 * it starts truncating the pages between new_size and old_size.
-		 * Therefore, we can end up in the situation where writeback
-		 * gets a CoW fork mapping but the truncate makes the mapping
-		 * invalid and we end up in here trying to get a new mapping.
-		 * bail out here so that we simply never get a valid mapping
-		 * and so we drop the write altogether.  The page truncation
-		 * will kill the contents anyway.
-		 */
-		if (offset > i_size_read(inode)) {
-			wpc->io_type = XFS_IO_HOLE;
-			return 0;
-		}
-		whichfork = XFS_COW_FORK;
-		wpc->io_type = XFS_IO_COW;
+
+		wpc->fork = XFS_COW_FORK;
 		goto allocate_blocks;
 	}
 
 	/*
-	 * Map valid and no COW extent in the way?  We're done.
+	 * No COW extent overlap. Revalidate now that we may have updated
+	 * ->cow_seq. If the data mapping is still valid, we're done.
 	 */
-	if (imap_valid) {
+	if (xfs_imap_valid(wpc, ip, offset_fsb)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return 0;
 	}
@@ -417,49 +564,65 @@
 	 */
 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
+	wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
+	wpc->fork = XFS_DATA_FORK;
+
+	/* landed in a hole or beyond EOF? */
 	if (imap.br_startoff > offset_fsb) {
-		/* landed in a hole or beyond EOF */
 		imap.br_blockcount = imap.br_startoff - offset_fsb;
 		imap.br_startoff = offset_fsb;
 		imap.br_startblock = HOLESTARTBLOCK;
-		wpc->io_type = XFS_IO_HOLE;
-	} else {
-		/*
-		 * Truncate to the next COW extent if there is one.  This is the
-		 * only opportunity to do this because we can skip COW fork
-		 * lookups for the subsequent blocks in the mapping; however,
-		 * the requirement to treat the COW range separately remains.
-		 */
-		if (cow_fsb != NULLFILEOFF &&
-		    cow_fsb < imap.br_startoff + imap.br_blockcount)
-			imap.br_blockcount = cow_fsb - imap.br_startoff;
-
-		if (isnullstartblock(imap.br_startblock)) {
-			/* got a delalloc extent */
-			wpc->io_type = XFS_IO_DELALLOC;
-			goto allocate_blocks;
-		}
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
-			wpc->io_type = XFS_IO_UNWRITTEN;
-		else
-			wpc->io_type = XFS_IO_OVERWRITE;
+		imap.br_state = XFS_EXT_NORM;
 	}
 
+	/*
+	 * Truncate to the next COW extent if there is one.  This is the only
+	 * opportunity to do this because we can skip COW fork lookups for the
+	 * subsequent blocks in the mapping; however, the requirement to treat
+	 * the COW range separately remains.
+	 */
+	if (cow_fsb != NULLFILEOFF &&
+	    cow_fsb < imap.br_startoff + imap.br_blockcount)
+		imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+	/* got a delalloc extent? */
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    isnullstartblock(imap.br_startblock))
+		goto allocate_blocks;
+
 	wpc->imap = imap;
-	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+	trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
 	return 0;
 allocate_blocks:
-	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
-			&wpc->cow_seq);
-	if (error)
+	error = xfs_convert_blocks(wpc, ip, offset_fsb);
+	if (error) {
+		/*
+		 * If we failed to find the extent in the COW fork we might have
+		 * raced with a COW to data fork conversion or truncate.
+		 * Restart the lookup to catch the extent in the data fork for
+		 * the former case, but prevent additional retries to avoid
+		 * looping forever for the latter case.
+		 */
+		if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+			goto retry;
+		ASSERT(error != -EAGAIN);
 		return error;
-	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
-	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
-	wpc->imap = imap;
-	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+	}
+
+	/*
+	 * Due to merging, the returned real extent might be larger than the
+	 * original delalloc one.  Trim the returned extent to the next COW
+	 * boundary again to force a re-lookup.
+	 */
+	if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+	    cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+		wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+	ASSERT(wpc->imap.br_startoff <= offset_fsb);
+	ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
 	return 0;
 }
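
The -EAGAIN handling at the end of xfs_map_blocks is a bounded retry: a COW-fork lookup that raced with a COW-to-data conversion is retried exactly once against the data fork, while a second failure (the truncate case) is treated as a hard error. Distilled into a runnable stub:

#include <errno.h>
#include <stdio.h>

/* Stub racy operation: fails with -EAGAIN on the first call only. */
static int lookup_once(void)
{
	static int calls;

	return calls++ ? 0 : -EAGAIN;
}

static int lookup(void)
{
	int	retries = 0;
	int	error;

retry:
	error = lookup_once();
	if (error == -EAGAIN && !retries++)
		goto retry;		/* one retry, then give up */
	return error;
}

int main(void)
{
	printf("%d\n", lookup());	/* 0: succeeded on the retry */
	return 0;
}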
 
@@ -471,7 +634,7 @@
  * reference to the ioend to ensure that the ioend completion is only done once
  * all bios have been submitted and the ioend is really done.
  *
- * If @fail is non-zero, it means that we have a situation where some part of
+ * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
  * and unlocked them. In this situation, we need to fail the bio and ioend
  * rather than submit it to IO. This typically only happens on a filesystem
@@ -483,33 +646,33 @@
 	struct xfs_ioend	*ioend,
 	int			status)
 {
-	/* Convert CoW extents to regular */
-	if (!status && ioend->io_type == XFS_IO_COW) {
-		/*
-		 * Yuk. This can do memory allocation, but is not a
-		 * transactional operation so everything is done in GFP_KERNEL
-		 * context. That can deadlock, because we hold pages in
-		 * writeback state and GFP_KERNEL allocations can block on them.
-		 * Hence we must operate in nofs conditions here.
-		 */
-		unsigned nofs_flag;
+	unsigned int		nofs_flag;
 
-		nofs_flag = memalloc_nofs_save();
+	/*
+	 * We can allocate memory here while doing writeback on behalf of
+	 * memory reclaim.  To avoid memory allocation deadlocks set the
+	 * task-wide nofs context for the following operations.
+	 */
+	nofs_flag = memalloc_nofs_save();
+
+	/* Convert CoW extents to regular */
+	if (!status && ioend->io_fork == XFS_COW_FORK) {
 		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 				ioend->io_offset, ioend->io_size);
-		memalloc_nofs_restore(nofs_flag);
 	}
 
 	/* Reserve log space if we might write beyond the on-disk inode size. */
 	if (!status &&
-	    ioend->io_type != XFS_IO_UNWRITTEN &&
+	    (ioend->io_fork == XFS_COW_FORK ||
+	     ioend->io_state != XFS_EXT_UNWRITTEN) &&
 	    xfs_ioend_is_append(ioend) &&
 	    !ioend->io_append_trans)
 		status = xfs_setfilesize_trans_alloc(ioend);
 
+	memalloc_nofs_restore(nofs_flag);
+
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
-	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
@@ -523,7 +686,6 @@
 		return status;
 	}
 
-	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 	submit_bio(ioend->io_bio);
 	return 0;
 }
@@ -531,10 +693,12 @@
 static struct xfs_ioend *
 xfs_alloc_ioend(
 	struct inode		*inode,
-	unsigned int		type,
+	int			fork,
+	xfs_exntst_t		state,
 	xfs_off_t		offset,
 	struct block_device	*bdev,
-	sector_t		sector)
+	sector_t		sector,
+	struct writeback_control *wbc)
 {
 	struct xfs_ioend	*ioend;
 	struct bio		*bio;
@@ -542,14 +706,17 @@
 	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
 	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = sector;
+	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+	bio->bi_write_hint = inode->i_write_hint;
+	wbc_init_bio(wbc, bio);
 
 	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 	INIT_LIST_HEAD(&ioend->io_list);
-	ioend->io_type = type;
+	ioend->io_fork = fork;
+	ioend->io_state = state;
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = offset;
-	INIT_WORK(&ioend->io_work, xfs_end_io);
 	ioend->io_append_trans = NULL;
 	ioend->io_bio = bio;
 	return ioend;
@@ -562,24 +729,22 @@
  * so that the bi_private linkage is set up in the right direction for the
  * traversal in xfs_destroy_ioend().
  */
-static void
+static struct bio *
 xfs_chain_bio(
-	struct xfs_ioend	*ioend,
-	struct writeback_control *wbc,
-	struct block_device	*bdev,
-	sector_t		sector)
+	struct bio		*prev)
 {
 	struct bio *new;
 
 	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
-	bio_set_dev(new, bdev);
-	new->bi_iter.bi_sector = sector;
-	bio_chain(ioend->io_bio, new);
-	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
-	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
-	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
-	submit_bio(ioend->io_bio);
-	ioend->io_bio = new;
+	bio_copy_dev(new, prev);	/* also copies over blkcg information */
+	new->bi_iter.bi_sector = bio_end_sector(prev);
+	new->bi_opf = prev->bi_opf;
+	new->bi_write_hint = prev->bi_write_hint;
+
+	bio_chain(prev, new);
+	bio_get(prev);		/* for xfs_destroy_ioend */
+	submit_bio(prev);
+	return new;
 }
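
Read together with its caller in xfs_add_to_ioend, xfs_chain_bio keeps one logical ioend streaming across multiple bios: the full bio is chained to a fresh tail and submitted immediately, io_bio is repointed at the tail, and only the tail gets bi_end_io/bi_private in xfs_submit_ioend, so completion fires once after every bio in the chain is done. As a comment-only trace of that flow:

/*
 * io_bio = A                      bio_alloc_bioset() in xfs_alloc_ioend()
 * ... bio_add_page() fills A ...
 * A full: B = xfs_chain_bio(A)    bio_chain(A, B); bio_get(A); submit_bio(A)
 * io_bio = B                      pages keep streaming into B
 * ...
 * xfs_submit_ioend():             B->bi_private = ioend;
 *                                 B->bi_end_io = xfs_end_bio; submit_bio(B)
 * -> xfs_end_bio() runs once, after both A and B complete (bio_chain
 *    makes B's completion wait for A's); the extra reference taken by
 *    bio_get(A) keeps A alive for the xfs_destroy_ioend() traversal.
 */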
 
 /*
@@ -601,29 +766,37 @@
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 	unsigned		len = i_blocksize(inode);
 	unsigned		poff = offset & (PAGE_SIZE - 1);
+	bool			merged, same_page = false;
 	sector_t		sector;
 
 	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
 		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 
-	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+	if (!wpc->ioend ||
+	    wpc->fork != wpc->ioend->io_fork ||
+	    wpc->imap.br_state != wpc->ioend->io_state ||
 	    sector != bio_end_sector(wpc->ioend->io_bio) ||
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
-				bdev, sector);
+		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+				wpc->imap.br_state, offset, bdev, sector, wbc);
 	}
 
-	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
-		if (iop)
-			atomic_inc(&iop->write_count);
-		if (bio_full(wpc->ioend->io_bio))
-			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
+	merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+			&same_page);
+
+	if (iop && !same_page)
+		atomic_inc(&iop->write_count);
+
+	if (!merged) {
+		if (bio_full(wpc->ioend->io_bio, len))
+			wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
+		bio_add_page(wpc->ioend->io_bio, page, len, poff);
 	}
 
 	wpc->ioend->io_size += len;
+	wbc_account_cgroup_owner(wbc, page, len);
 }
 
 STATIC void
@@ -721,7 +894,7 @@
 		error = xfs_map_blocks(wpc, inode, file_offset);
 		if (error)
 			break;
-		if (wpc->io_type == XFS_IO_HOLE)
+		if (wpc->imap.br_startblock == HOLESTARTBLOCK)
 			continue;
 		xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
 				 &submit_list);
@@ -916,9 +1089,7 @@
 	struct page		*page,
 	struct writeback_control *wbc)
 {
-	struct xfs_writepage_ctx wpc = {
-		.io_type = XFS_IO_INVALID,
-	};
+	struct xfs_writepage_ctx wpc = { };
 	int			ret;
 
 	ret = xfs_do_writepage(page, wbc, &wpc);
@@ -932,9 +1103,7 @@
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
-	struct xfs_writepage_ctx wpc = {
-		.io_type = XFS_IO_INVALID,
-	};
+	struct xfs_writepage_ctx wpc = { };
 	int			ret;
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -981,7 +1150,7 @@
 	 * Since we don't pass back blockdev info, we can't return bmap
 	 * information for rt files either.
 	 */
-	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 		return 0;
 	return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 9af8679..45a1ea2 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,35 +9,15 @@
 extern struct bio_set xfs_ioend_bioset;
 
 /*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-	XFS_IO_INVALID,		/* initial state */
-	XFS_IO_DELALLOC,	/* covers delalloc region */
-	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
-	XFS_IO_OVERWRITE,	/* covers already allocated extent */
-	XFS_IO_COW,		/* covers copy-on-write extent */
-	XFS_IO_HOLE,		/* covers region without any block allocation */
-};
-
-#define XFS_IO_TYPES \
-	{ XFS_IO_INVALID,		"invalid" }, \
-	{ XFS_IO_DELALLOC,		"delalloc" }, \
-	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
-	{ XFS_IO_OVERWRITE,		"overwrite" }, \
-	{ XFS_IO_COW,			"CoW" }, \
-	{ XFS_IO_HOLE,			"hole" }
-
-/*
  * Structure for buffered I/O completions.
  */
 struct xfs_ioend {
 	struct list_head	io_list;	/* next ioend in chain */
-	unsigned int		io_type;	/* delalloc / unwritten */
+	int			io_fork;	/* inode fork written back */
+	xfs_exntst_t		io_state;	/* extent state */
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
-	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
 	struct bio		*io_bio;	/* bio being built */
 	struct bio		io_inline_bio;	/* MUST BE LAST! */
@@ -48,7 +28,6 @@
 
 int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
-extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
 
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 228821b..a640a28 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -15,18 +15,13 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "xfs_attr_remote.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-#include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_trace.h"
 #include "xfs_dir2.h"
-#include "xfs_defer.h"
 
 /*
  * Look at all the extents for this logical region,
@@ -121,7 +116,7 @@
 	int			size;
 	int			tmp;
 	int			i;
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -152,7 +147,7 @@
 	 * Allocate storage for a list of all the "remote" value extents.
 	 */
 	size = count * sizeof(xfs_attr_inactive_list_t);
-	list = kmem_alloc(size, KM_SLEEP);
+	list = kmem_alloc(size, 0);
 
 	/*
 	 * Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a580340..00758fd 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -6,25 +6,20 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
 #include "xfs_dir2.h"
 
 STATIC int
@@ -114,7 +109,7 @@
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
 	sbsize = sf->hdr.count * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+	sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
 
 	/*
 	 * Scan the attribute list for the rest of the entries, storing
@@ -555,6 +550,7 @@
 	attrlist_ent_t *aep;
 	int arraytop;
 
+	ASSERT(!context->seen_enough);
 	ASSERT(!(context->flags & ATTR_KERNOVAL));
 	ASSERT(context->count >= 0);
 	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
new file mode 100644
index 0000000..e2148f2
--- /dev/null
+++ b/fs/xfs/xfs_bio_io.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ */
+#include "xfs.h"
+
+static inline unsigned int bio_max_vecs(unsigned int count)
+{
+	return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+}
+
+int
+xfs_rw_bdev(
+	struct block_device	*bdev,
+	sector_t		sector,
+	unsigned int		count,
+	char			*data,
+	unsigned int		op)
+
+{
+	unsigned int		is_vmalloc = is_vmalloc_addr(data);
+	unsigned int		left = count;
+	int			error;
+	struct bio		*bio;
+
+	if (is_vmalloc && op == REQ_OP_WRITE)
+		flush_kernel_vmap_range(data, count);
+
+	bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+	bio_set_dev(bio, bdev);
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_opf = op | REQ_META | REQ_SYNC;
+
+	do {
+		struct page	*page = kmem_to_page(data);
+		unsigned int	off = offset_in_page(data);
+		unsigned int	len = min_t(unsigned, left, PAGE_SIZE - off);
+
+		while (bio_add_page(bio, page, len, off) != len) {
+			struct bio	*prev = bio;
+
+			bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+			bio_copy_dev(bio, prev);
+			bio->bi_iter.bi_sector = bio_end_sector(prev);
+			bio->bi_opf = prev->bi_opf;
+			bio_chain(prev, bio);
+
+			submit_bio(prev);
+		}
+
+		data += len;
+		left -= len;
+	} while (left > 0);
+
+	error = submit_bio_wait(bio);
+	bio_put(bio);
+
+	if (is_vmalloc && op == REQ_OP_READ)
+		invalidate_kernel_vmap_range(data, count);
+	return error;
+}
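
The new xfs_rw_bdev helper does synchronous, chained I/O straight to a block device from either a kmalloc or vmalloc buffer, taking care of the vmap cache flush/invalidate itself. A hypothetical caller (the function name, daddr and count are illustrative, not from this patch) might look like:

/* Read `count` bytes of metadata at sector `daddr` from `bdev` into a
 * vmalloc'ed scratch buffer; xfs_rw_bdev() invalidates the vmap range
 * after the read completes. */
static int read_scratch(struct block_device *bdev, sector_t daddr,
		unsigned int count)
{
	char	*scratch = vmalloc(count);
	int	error;

	if (!scratch)
		return -ENOMEM;
	error = xfs_rw_bdev(bdev, daddr, count, scratch, REQ_OP_READ);
	/* ... consume the data on success ... */
	vfree(scratch);
	return error;
}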
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ce45f06..83d24e9 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -9,17 +9,16 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
+#include "xfs_shared.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_bmap_item.h"
 #include "xfs_log.h"
 #include "xfs_bmap.h"
 #include "xfs_icache.h"
-#include "xfs_trace.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 
@@ -96,15 +95,6 @@
 }
 
 /*
- * Pinning has no meaning for an bui item, so just return.
- */
-STATIC void
-xfs_bui_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The unpin operation is the last place an BUI is manipulated in the log. It is
  * either inserted in the AIL or aborted in the event of a log I/O error. In
  * either case, the BUI transaction has been successfully committed to make it
@@ -123,71 +113,22 @@
 }
 
 /*
- * BUI items have no locking or pushing.  However, since BUIs are pulled from
- * the AIL when their corresponding BUDs are committed to disk, their situation
- * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log.  This should help in getting the BUI out of
- * the AIL.
- */
-STATIC uint
-xfs_bui_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The BUI has been either committed or aborted if the transaction has been
  * cancelled. If the transaction was cancelled, an BUD isn't going to be
  * constructed and thus we free the BUI here directly.
  */
 STATIC void
-xfs_bui_item_unlock(
+xfs_bui_item_release(
 	struct xfs_log_item	*lip)
 {
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
-		xfs_bui_release(BUI_ITEM(lip));
+	xfs_bui_release(BUI_ITEM(lip));
 }
 
-/*
- * The BUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_bui_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	return lsn;
-}
-
-/*
- * The BUI dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_bui_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all bui log items.
- */
 static const struct xfs_item_ops xfs_bui_item_ops = {
 	.iop_size	= xfs_bui_item_size,
 	.iop_format	= xfs_bui_item_format,
-	.iop_pin	= xfs_bui_item_pin,
 	.iop_unpin	= xfs_bui_item_unpin,
-	.iop_unlock	= xfs_bui_item_unlock,
-	.iop_committed	= xfs_bui_item_committed,
-	.iop_push	= xfs_bui_item_push,
-	.iop_committing = xfs_bui_item_committing,
+	.iop_release	= xfs_bui_item_release,
 };
 
 /*
@@ -200,7 +141,7 @@
 {
 	struct xfs_bui_log_item		*buip;
 
-	buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+	buip = kmem_zone_zalloc(xfs_bui_zone, 0);
 
 	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
 	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -249,127 +190,242 @@
 }
 
 /*
- * Pinning has no meaning for an bud item, so just return.
- */
-STATIC void
-xfs_bud_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an bud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_bud_item_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an bud item.  It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_bud_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The BUD is either committed or aborted if the transaction is cancelled. If
  * the transaction is cancelled, drop our reference to the BUI and free the
  * BUD.
  */
 STATIC void
-xfs_bud_item_unlock(
+xfs_bud_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
 
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
-		xfs_bui_release(budp->bud_buip);
-		kmem_zone_free(xfs_bud_zone, budp);
-	}
-}
-
-/*
- * When the bud item is committed to disk, all we need to do is delete our
- * reference to our partner bui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_bud_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
-
-	/*
-	 * Drop the BUI reference regardless of whether the BUD has been
-	 * aborted. Once the BUD transaction is constructed, it is the sole
-	 * responsibility of the BUD to release the BUI (even if the BUI is
-	 * aborted due to log I/O error).
-	 */
 	xfs_bui_release(budp->bud_buip);
 	kmem_zone_free(xfs_bud_zone, budp);
-
-	return (xfs_lsn_t)-1;
 }
 
-/*
- * The BUD dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_bud_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all bud log items.
- */
 static const struct xfs_item_ops xfs_bud_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
 	.iop_size	= xfs_bud_item_size,
 	.iop_format	= xfs_bud_item_format,
-	.iop_pin	= xfs_bud_item_pin,
-	.iop_unpin	= xfs_bud_item_unpin,
-	.iop_unlock	= xfs_bud_item_unlock,
-	.iop_committed	= xfs_bud_item_committed,
-	.iop_push	= xfs_bud_item_push,
-	.iop_committing = xfs_bud_item_committing,
+	.iop_release	= xfs_bud_item_release,
 };
 
-/*
- * Allocate and initialize an bud item with the given number of extents.
- */
-struct xfs_bud_log_item *
-xfs_bud_init(
-	struct xfs_mount		*mp,
+static struct xfs_bud_log_item *
+xfs_trans_get_bud(
+	struct xfs_trans		*tp,
 	struct xfs_bui_log_item		*buip)
-
 {
-	struct xfs_bud_log_item	*budp;
+	struct xfs_bud_log_item		*budp;
 
-	budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
-	xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
+	budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+	xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+			  &xfs_bud_item_ops);
 	budp->bud_buip = buip;
 	budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
 
+	xfs_trans_add_item(tp, &budp->bud_item);
 	return budp;
 }
 
 /*
+ * Finish a bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails, in order to support the BUI/BUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_bmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_bud_log_item		*budp,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			*blockcount,
+	xfs_exntst_t			state)
+{
+	int				error;
+
+	error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
+			startblock, blockcount, state);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the BUI and frees the BUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+	return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_bmap_intent		*ba;
+	struct xfs_bmap_intent		*bb;
+
+	ba = container_of(a, struct xfs_bmap_intent, bi_list);
+	bb = container_of(b, struct xfs_bmap_intent, bi_list);
+	/* avoid 64-to-32 bit truncation when comparing inode numbers */
+	if (ba->bi_owner->i_ino < bb->bi_owner->i_ino)
+		return -1;
+	if (ba->bi_owner->i_ino > bb->bi_owner->i_ino)
+		return 1;
+	return 0;
+}
+
+/* Get a BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+	ASSERT(tp != NULL);
+
+	buip = xfs_bui_init(tp->t_mountp);
+	ASSERT(buip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &buip->bui_item);
+	return buip;
+}
+
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+	struct xfs_map_extent		*bmap,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	bmap->me_flags = 0;
+	switch (type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		bmap->me_flags = type;
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (state == XFS_EXT_UNWRITTEN)
+		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+	if (whichfork == XFS_ATTR_FORK)
+		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_bui_log_item		*buip = intent;
+	struct xfs_bmap_intent		*bmap;
+	uint				next_extent;
+	struct xfs_map_extent		*map;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+	ASSERT(next_extent < buip->bui_format.bui_nextents);
+	map = &buip->bui_format.bui_extents[next_extent];
+	map->me_owner = bmap->bi_owner->i_ino;
+	map->me_startblock = bmap->bi_bmap.br_startblock;
+	map->me_startoff = bmap->bi_bmap.br_startoff;
+	map->me_len = bmap->bi_bmap.br_blockcount;
+	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+			bmap->bi_bmap.br_state);
+}
+
+/* Get a BUD so we can process all the deferred bmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred bmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_bmap_intent		*bmap;
+	xfs_filblks_t			count;
+	int				error;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	count = bmap->bi_bmap.br_blockcount;
+	error = xfs_trans_log_finish_bmap_update(tp, done_item,
+			bmap->bi_type,
+			bmap->bi_owner, bmap->bi_whichfork,
+			bmap->bi_bmap.br_startoff,
+			bmap->bi_bmap.br_startblock,
+			&count,
+			bmap->bi_bmap.br_state);
+	if (!error && count > 0) {
+		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+		bmap->bi_bmap.br_blockcount = count;
+		return -EAGAIN;
+	}
+	kmem_free(bmap);
+	return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_bui_release(intent);
+}
+
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_bmap_intent		*bmap;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	kmem_free(bmap);
+}
+
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_bmap_update_diff_items,
+	.create_intent	= xfs_bmap_update_create_intent,
+	.abort_intent	= xfs_bmap_update_abort_intent,
+	.log_item	= xfs_bmap_update_log_item,
+	.create_done	= xfs_bmap_update_create_done,
+	.finish_item	= xfs_bmap_update_finish_item,
+	.cancel_item	= xfs_bmap_update_cancel_item,
+};
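+
The ops vector above plugs bmap updates into the generic deferred-operation machinery. The call order below is inferred from the callback names and the BUI/BUD comments in this file; the actual driver loop lives in xfs_defer.c and is not shown in this patch:

/*
 * Rough lifecycle of one deferred bmap update, as driven by the defer
 * framework (sketch, not the literal xfs_defer.c code):
 *
 *	intent = ops->create_intent(tp, count);		// log the BUI
 *	ops->log_item(tp, intent, item);		// record each extent
 *	... roll the transaction ...
 *	done = ops->create_done(tp, intent, count);	// log the BUD
 *	error = ops->finish_item(tp, item, done, &state);
 *	// -EAGAIN: blockcount was trimmed, item is requeued
 *	// on abort: ops->abort_intent(intent), ops->cancel_item(item)
 */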
+
+/*
  * Process a bmap update intent item that was recovered from the log.
  * We need to update some inode's bmbt.
  */
@@ -486,9 +542,7 @@
 		irec.br_blockcount = count;
 		irec.br_startoff = bmap->me_startoff;
 		irec.br_state = state;
-		error = xfs_bmap_unmap_extent(tp, ip, &irec);
-		if (error)
-			goto err_inode;
+		xfs_bmap_unmap_extent(tp, ip, &irec);
 	}
 
 	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 89e043a..ad479cc 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -75,8 +75,6 @@
 extern struct kmem_zone	*xfs_bud_zone;
 
 struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
-struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
-		struct xfs_bui_log_item *);
 void xfs_bui_item_free(struct xfs_bui_log_item *);
 void xfs_bui_release(struct xfs_bui_log_item *);
 int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6de8d90..4f44370 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -12,12 +12,10 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
 #include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_trans.h"
-#include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -28,11 +26,8 @@
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_log.h"
-#include "xfs_rmap_btree.h"
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
-#include "xfs_refcount.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -44,9 +39,9 @@
 xfs_daddr_t
 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 {
-	return (XFS_IS_REALTIME_INODE(ip) ? \
-		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
-		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+	if (XFS_IS_REALTIME_INODE(ip))
+		return XFS_FSB_TO_BB(ip->i_mount, fsb);
+	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
 }
 
 /*
@@ -276,7 +271,7 @@
 	struct xfs_btree_block	*block, *nextblock;
 	int			numrecs;
 
-	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+	error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF,
 						&xfs_bmbt_buf_ops);
 	if (error)
 		return error;
@@ -287,7 +282,7 @@
 		/* Not at node above leaves, count this level of nodes */
 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		while (nextbno != NULLFSBLOCK) {
-			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+			error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp,
 						XFS_BMAP_BTREE_REF,
 						&xfs_bmbt_buf_ops);
 			if (error)
@@ -321,7 +316,7 @@
 			if (nextbno == NULLFSBLOCK)
 				break;
 			bno = nextbno;
-			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+			error = xfs_btree_read_bufl(mp, tp, bno, &bp,
 						XFS_BMAP_BTREE_REF,
 						&xfs_bmbt_buf_ops);
 			if (error)
@@ -406,10 +401,10 @@
 	struct xfs_bmbt_irec	*got)
 {
 	struct kgetbmap		*p = out + bmv->bmv_entries;
-	bool			shared = false, trimmed = false;
+	bool			shared = false;
 	int			error;
 
-	error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+	error = xfs_reflink_trim_around_shared(ip, got, &shared);
 	if (error)
 		return error;
 
@@ -869,6 +864,7 @@
 	xfs_filblks_t		allocatesize_fsb;
 	xfs_extlen_t		extsz, temp;
 	xfs_fileoff_t		startoffset_fsb;
+	xfs_fileoff_t		endoffset_fsb;
 	int			nimaps;
 	int			quota_flag;
 	int			rt;
@@ -896,7 +892,8 @@
 	imapp = &imaps[0];
 	nimaps = 1;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
-	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+	allocatesize_fsb = endoffset_fsb - startoffset_fsb;
 
 	/*
 	 * Allocate file space until done or until there is an error
@@ -1042,45 +1039,7 @@
 	goto out_unlock;
 }
 
-static int
-xfs_adjust_extent_unmap_boundaries(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		*startoffset_fsb,
-	xfs_fileoff_t		*endoffset_fsb)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_bmbt_irec	imap;
-	int			nimap, error;
-	xfs_extlen_t		mod = 0;
-
-	nimap = 1;
-	error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
-	if (error)
-		return error;
-
-	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod);
-		if (mod)
-			*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-	}
-
-	nimap = 1;
-	error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
-	if (error)
-		return error;
-
-	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		mod++;
-		if (mod && mod != mp->m_sb.sb_rextsize)
-			*endoffset_fsb -= mod;
-	}
-
-	return 0;
-}
-
-static int
+int
 xfs_flush_unmap_range(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
@@ -1133,19 +1092,8 @@
 	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
 	/*
-	 * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
-	 * and we can't use unwritten extents then we actually need to ensure
-	 * to zero the whole extent, otherwise we just need to take of block
-	 * boundaries, and xfs_bunmapi will handle the rest.
+	 * Need to zero the stuff we're not freeing, on disk.
 	 */
-	if (XFS_IS_REALTIME_INODE(ip) &&
-	    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-		error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
-				&endoffset_fsb);
-		if (error)
-			return error;
-	}
-
 	if (endoffset_fsb > startoffset_fsb) {
 		while (!done) {
 			error = xfs_unmap_extent(ip, startoffset_fsb,
@@ -1175,9 +1123,9 @@
 	 * page could be mmap'd and iomap_zero_range doesn't do that for us.
 	 * Writeback of the eof page will do this, albeit clumsily.
 	 */
-	if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) {
+	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-				(offset + len) & ~PAGE_MASK, LLONG_MAX);
+				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
 	}
 
 	return error;
@@ -1211,16 +1159,13 @@
 	 * by virtue of the hole punch.
 	 */
 	error = xfs_free_file_space(ip, offset, len);
-	if (error)
-		goto out;
+	if (error || xfs_is_always_cow_inode(ip))
+		return error;
 
-	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+	return xfs_alloc_file_space(ip, round_down(offset, blksize),
 				     round_up(offset + len, blksize) -
 				     round_down(offset, blksize),
 				     XFS_BMAPI_PREALLOC);
-out:
-	return error;
-
 }
 
 static int
@@ -1244,11 +1189,7 @@
 	 * Writeback and invalidate cache for the remainder of the file as we're
 	 * about to shift down every extent from offset to EOF.
 	 */
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
-	if (error)
-		return error;
-	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-					offset >> PAGE_SHIFT, -1);
+	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
 	if (error)
 		return error;
 
@@ -1593,24 +1534,16 @@
 			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
 
 			/* Remove the mapping from the donor file. */
-			error = xfs_bmap_unmap_extent(tp, tip, &uirec);
-			if (error)
-				goto out;
+			xfs_bmap_unmap_extent(tp, tip, &uirec);
 
 			/* Remove the mapping from the source file. */
-			error = xfs_bmap_unmap_extent(tp, ip, &irec);
-			if (error)
-				goto out;
+			xfs_bmap_unmap_extent(tp, ip, &irec);
 
 			/* Map the donor file's blocks into the source file. */
-			error = xfs_bmap_map_extent(tp, ip, &uirec);
-			if (error)
-				goto out;
+			xfs_bmap_map_extent(tp, ip, &uirec);
 
 			/* Map the source file's blocks into the donor file. */
-			error = xfs_bmap_map_extent(tp, tip, &irec);
-			if (error)
-				goto out;
+			xfs_bmap_map_extent(tp, tip, &irec);
 
 			error = xfs_defer_finish(tpp);
 			tp = *tpp;
@@ -1824,6 +1757,12 @@
 	if (error)
 		goto out_unlock;
 
+	if (xfs_inode_has_cow_data(tip)) {
+		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+		if (error)
+			return error;
+	}
+
 	/*
 	 * Extent "swapping" with rmap requires a permanent reservation and
 	 * a block reservation because it's really just a remap operation
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 87363d1..7a78229 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@
 			  int whichfork, xfs_extnum_t *nextents,
 			  xfs_filblks_t *count);
 
+int	xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+			      xfs_off_t len);
+
 #endif	/* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e839907..0abba17 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -4,24 +4,9 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include <linux/migrate.h>
 #include <linux/backing-dev.h>
-#include <linux/freezer.h>
 
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -37,6 +22,32 @@
 #define xb_to_gfp(flags) \
 	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
 
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ *	b_sema (caller holds)
+ *	  b_lock
+ *
+ * xfs_buf_stale:
+ *	b_sema (caller holds)
+ *	  b_lock
+ *	    lru_lock
+ *
+ * xfs_buf_rele:
+ *	b_lock
+ *	  pag_buf_lock
+ *	    lru_lock
+ *
+ * xfs_buftarg_wait_rele:
+ *	lru_lock
+ *	  b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate:
+ *	lru_lock
+ *	  b_lock (trylock due to inversion)
+ */
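+
The documented orders make b_lock nest inside lru_lock in the buftarg walkers even though the release path nests them the other way around; taking b_lock with a trylock under lru_lock is what keeps that inversion deadlock-free. A runnable pthread sketch of the pattern (hypothetical mutexes standing in for lru_lock and b_lock):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b_lock = PTHREAD_MUTEX_INITIALIZER;

/* Walker path: holds lru_lock, wants b_lock -- opposite of the release
 * path's b_lock -> lru_lock order, so it must not block on b_lock. */
static int isolate_one(void)
{
	int	isolated = 0;

	pthread_mutex_lock(&lru_lock);
	if (pthread_mutex_trylock(&b_lock) == 0) {
		isolated = 1;		/* got both, safe to work */
		pthread_mutex_unlock(&b_lock);
	}				/* else: skip, retry later */
	pthread_mutex_unlock(&lru_lock);
	return isolated;
}

int main(void)
{
	printf("isolated: %d\n", isolate_one());
	return 0;
}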
 
 static inline int
 xfs_buf_is_vmapped(
@@ -187,7 +198,7 @@
 	}
 }
 
-struct xfs_buf *
+static struct xfs_buf *
 _xfs_buf_alloc(
 	struct xfs_buftarg	*target,
 	struct xfs_buf_map	*map,
@@ -217,6 +228,7 @@
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
+	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
 
 	/*
@@ -237,12 +249,11 @@
 		bp->b_maps[i].bm_len = map[i].bm_len;
 		bp->b_length += map[i].bm_len;
 	}
-	bp->b_io_length = bp->b_length;
 
 	atomic_set(&bp->b_pin_count, 0);
 	init_waitqueue_head(&bp->b_waiters);
 
-	XFS_STATS_INC(target->bt_mount, xb_create);
+	XFS_STATS_INC(bp->b_mount, xb_create);
 	trace_xfs_buf_init(bp, _RET_IP_);
 
 	return bp;
@@ -334,6 +345,15 @@
 	unsigned short		page_count, i;
 	xfs_off_t		start, end;
 	int			error;
+	xfs_km_flags_t		kmflag_mask = 0;
+
+	/*
+	 * Ensure a zeroed buffer for non-read cases.
+	 */
+	if (!(flags & XBF_READ)) {
+		kmflag_mask |= KM_ZERO;
+		gfp_mask |= __GFP_ZERO;
+	}
 
 	/*
 	 * for buffers that are contained within a single page, just allocate
@@ -342,7 +362,9 @@
 	 */
 	size = BBTOB(bp->b_length);
 	if (size < PAGE_SIZE) {
-		bp->b_addr = kmem_alloc(size, KM_NOFS);
+		int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+		bp->b_addr = kmem_alloc_io(size, align_mask,
+					   KM_NOFS | kmflag_mask);
 		if (!bp->b_addr) {
 			/* low memory - use alloc_page loop instead */
 			goto use_alloc_page;
@@ -357,7 +379,7 @@
 		}
 		bp->b_offset = offset_in_page(bp->b_addr);
 		bp->b_pages = bp->b_page_array;
-		bp->b_pages[0] = virt_to_page(bp->b_addr);
+		bp->b_pages[0] = kmem_to_page(bp->b_addr);
 		bp->b_page_count = 1;
 		bp->b_flags |= _XBF_KMEM;
 		return 0;
@@ -399,12 +421,12 @@
 					current->comm, current->pid,
 					__func__, gfp_mask);
 
-			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
+			XFS_STATS_INC(bp->b_mount, xb_page_retries);
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
 
-		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
+		XFS_STATS_INC(bp->b_mount, xb_page_found);
 
 		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
 		size -= nbytes;
@@ -749,6 +771,41 @@
 	return xfs_buf_submit(bp);
 }
 
+/*
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type.  If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
+ */
+int
+xfs_buf_reverify(
+	struct xfs_buf		*bp,
+	const struct xfs_buf_ops *ops)
+{
+	ASSERT(bp->b_flags & XBF_DONE);
+	ASSERT(bp->b_error == 0);
+
+	if (!ops || bp->b_ops)
+		return 0;
+
+	bp->b_ops = ops;
+	bp->b_ops->verify_read(bp);
+	if (bp->b_error)
+		bp->b_flags &= ~XBF_DONE;
+	return bp->b_error;
+}
+
 xfs_buf_t *
 xfs_buf_read_map(
 	struct xfs_buftarg	*target,
@@ -762,26 +819,32 @@
 	flags |= XBF_READ;
 
 	bp = xfs_buf_get_map(target, map, nmaps, flags);
-	if (bp) {
-		trace_xfs_buf_read(bp, flags, _RET_IP_);
+	if (!bp)
+		return NULL;
 
-		if (!(bp->b_flags & XBF_DONE)) {
-			XFS_STATS_INC(target->bt_mount, xb_get_read);
-			bp->b_ops = ops;
-			_xfs_buf_read(bp, flags);
-		} else if (flags & XBF_ASYNC) {
-			/*
-			 * Read ahead call which is already satisfied,
-			 * drop the buffer
-			 */
-			xfs_buf_relse(bp);
-			return NULL;
-		} else {
-			/* We do not want read in the flags */
-			bp->b_flags &= ~XBF_READ;
-		}
+	trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+	if (!(bp->b_flags & XBF_DONE)) {
+		XFS_STATS_INC(target->bt_mount, xb_get_read);
+		bp->b_ops = ops;
+		_xfs_buf_read(bp, flags);
+		return bp;
 	}
 
+	xfs_buf_reverify(bp, ops);
+
+	if (flags & XBF_ASYNC) {
+		/*
+		 * Read ahead call which is already satisfied,
+		 * drop the buffer
+		 */
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+
+	/* We do not want read in the flags */
+	bp->b_flags &= ~XBF_READ;
+	ASSERT(bp->b_ops != NULL || ops == NULL);
 	return bp;
 }
 
@@ -842,83 +905,6 @@
 	return 0;
 }
 
-/*
- * Return a buffer allocated as an empty buffer and associated to external
- * memory via xfs_buf_associate_memory() back to it's empty state.
- */
-void
-xfs_buf_set_empty(
-	struct xfs_buf		*bp,
-	size_t			numblks)
-{
-	if (bp->b_pages)
-		_xfs_buf_free_pages(bp);
-
-	bp->b_pages = NULL;
-	bp->b_page_count = 0;
-	bp->b_addr = NULL;
-	bp->b_length = numblks;
-	bp->b_io_length = numblks;
-
-	ASSERT(bp->b_map_count == 1);
-	bp->b_bn = XFS_BUF_DADDR_NULL;
-	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
-	bp->b_maps[0].bm_len = bp->b_length;
-}
-
-static inline struct page *
-mem_to_page(
-	void			*addr)
-{
-	if ((!is_vmalloc_addr(addr))) {
-		return virt_to_page(addr);
-	} else {
-		return vmalloc_to_page(addr);
-	}
-}
-
-int
-xfs_buf_associate_memory(
-	xfs_buf_t		*bp,
-	void			*mem,
-	size_t			len)
-{
-	int			rval;
-	int			i = 0;
-	unsigned long		pageaddr;
-	unsigned long		offset;
-	size_t			buflen;
-	int			page_count;
-
-	pageaddr = (unsigned long)mem & PAGE_MASK;
-	offset = (unsigned long)mem - pageaddr;
-	buflen = PAGE_ALIGN(len + offset);
-	page_count = buflen >> PAGE_SHIFT;
-
-	/* Free any previous set of page pointers */
-	if (bp->b_pages)
-		_xfs_buf_free_pages(bp);
-
-	bp->b_pages = NULL;
-	bp->b_addr = mem;
-
-	rval = _xfs_buf_get_pages(bp, page_count);
-	if (rval)
-		return rval;
-
-	bp->b_offset = offset;
-
-	for (i = 0; i < bp->b_page_count; i++) {
-		bp->b_pages[i] = mem_to_page((void *)pageaddr);
-		pageaddr += PAGE_SIZE;
-	}
-
-	bp->b_io_length = BTOBB(len);
-	bp->b_length = BTOBB(buflen);
-
-	return 0;
-}
-
 xfs_buf_t *
 xfs_buf_get_uncached(
 	struct xfs_buftarg	*target,
@@ -1006,8 +992,18 @@
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 
-	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+	/*
+	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
+	 * calls. The pag_buf_lock being taken on the last reference only
+	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+	 * to last reference we drop here is not serialised against the last
+	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
+	 * first, the last "release" reference can win the race to the lock and
+	 * free the buffer before the second-to-last reference is processed,
+	 * leading to a use-after-free scenario.
+	 */
 	spin_lock(&bp->b_lock);
+	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
 	if (!release) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
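
Spelled out, this is the interleaving that the reordered spin_lock(&bp->b_lock) closes (two threads dropping the last two references to the same buffer):

/*
 * b_hold == 2, old order (decrement before taking b_lock):
 *
 *   T1: atomic_dec_and_lock() -> 1, not last
 *                                    T2: atomic_dec_and_lock() -> 0, last
 *                                    T2: spin_lock(&bp->b_lock)
 *                                    T2: ... frees bp ...
 *   T1: spin_lock(&bp->b_lock)       <- use-after-free
 *
 * New order: both threads take b_lock before decrementing, so the last
 * release cannot free bp until the second-to-last release has finished
 * its b_lock critical section.
 */
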
@@ -1103,7 +1099,7 @@
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-		xfs_log_force(bp->b_target->bt_mount, 0);
+		xfs_log_force(bp->b_mount, 0);
 	down(&bp->b_sema);
 
 	trace_xfs_buf_lock_done(bp, _RET_IP_);
@@ -1192,7 +1188,7 @@
 	struct xfs_buf	*bp)
 {
 	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
-	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
+	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
 }
 
 void
@@ -1211,7 +1207,7 @@
 	struct xfs_buf		*bp,
 	const char		*func)
 {
-	xfs_alert(bp->b_target->bt_mount,
+	xfs_alert(bp->b_mount,
 "metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
 			func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
 			-bp->b_error);
@@ -1230,10 +1226,8 @@
 			 XBF_WRITE_FAIL | XBF_DONE);
 
 	error = xfs_buf_submit(bp);
-	if (error) {
-		xfs_force_shutdown(bp->b_target->bt_mount,
-				   SHUTDOWN_META_IO_ERROR);
-	}
+	if (error)
+		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
 	return error;
 }
 
@@ -1359,21 +1353,8 @@
 	 */
 	bp->b_error = 0;
 
-	/*
-	 * Initialize the I/O completion workqueue if we haven't yet or the
-	 * submitter has not opted to specify a custom one.
-	 */
-	if (!bp->b_ioend_wq)
-		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
-
 	if (bp->b_flags & XBF_WRITE) {
 		op = REQ_OP_WRITE;
-		if (bp->b_flags & XBF_SYNCIO)
-			op_flags = REQ_SYNC;
-		if (bp->b_flags & XBF_FUA)
-			op_flags |= REQ_FUA;
-		if (bp->b_flags & XBF_FLUSH)
-			op_flags |= REQ_PREFLUSH;
 
 		/*
 		 * Run the write verifier callback function if it exists. If
@@ -1383,12 +1364,12 @@
 		if (bp->b_ops) {
 			bp->b_ops->verify_write(bp);
 			if (bp->b_error) {
-				xfs_force_shutdown(bp->b_target->bt_mount,
+				xfs_force_shutdown(bp->b_mount,
 						   SHUTDOWN_CORRUPT_INCORE);
 				return;
 			}
 		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
-			struct xfs_mount *mp = bp->b_target->bt_mount;
+			struct xfs_mount *mp = bp->b_mount;
 
 			/*
 			 * non-crc filesystems don't attach verifiers during
@@ -1420,7 +1401,7 @@
 	 * subsequent call.
 	 */
 	offset = bp->b_offset;
-	size = BBTOB(bp->b_io_length);
+	size = BBTOB(bp->b_length);
 	blk_start_plug(&plug);
 	for (i = 0; i < bp->b_map_count; i++) {
 		xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
@@ -1466,12 +1447,11 @@
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	/* on shutdown we stale and complete the buffer immediately */
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
 		xfs_buf_ioerror(bp, -EIO);
 		bp->b_flags &= ~XBF_DONE;
 		xfs_buf_stale(bp);
-		if (bp->b_flags & XBF_ASYNC)
-			xfs_buf_ioend(bp);
+		xfs_buf_ioend(bp);
 		return -EIO;
 	}
 
@@ -1537,16 +1517,11 @@
 	return page_address(page) + (offset & (PAGE_SIZE-1));
 }
 
-/*
- *	Move data into or out of a buffer.
- */
 void
-xfs_buf_iomove(
-	xfs_buf_t		*bp,	/* buffer to process		*/
-	size_t			boff,	/* starting buffer offset	*/
-	size_t			bsize,	/* length to copy		*/
-	void			*data,	/* data address			*/
-	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
+xfs_buf_zero(
+	struct xfs_buf		*bp,
+	size_t			boff,
+	size_t			bsize)
 {
 	size_t			bend;
 
@@ -1559,23 +1534,13 @@
 		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
 		page = bp->b_pages[page_index];
 		csize = min_t(size_t, PAGE_SIZE - page_offset,
-				      BBTOB(bp->b_io_length) - boff);
+				      BBTOB(bp->b_length) - boff);
 
 		ASSERT((csize + page_offset) <= PAGE_SIZE);
 
-		switch (mode) {
-		case XBRW_ZERO:
-			memset(page_address(page) + page_offset, 0, csize);
-			break;
-		case XBRW_READ:
-			memcpy(data, page_address(page) + page_offset, csize);
-			break;
-		case XBRW_WRITE:
-			memcpy(page_address(page) + page_offset, data, csize);
-		}
+		memset(page_address(page) + page_offset, 0, csize);
 
 		boff += csize;
-		data += csize;
 	}
 }
 
@@ -1787,7 +1752,7 @@
 {
 	xfs_buftarg_t		*btp;
 
-	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
 
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
@@ -1926,7 +1891,6 @@
 	struct list_head	*wait_list)
 {
 	struct xfs_buf		*bp, *n;
-	LIST_HEAD		(submit_list);
 	int			pinned = 0;
 	struct blk_plug		plug;
 
@@ -1989,6 +1953,13 @@
  * is only safely useable for callers that can track I/O completion by higher
  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
  * function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
  */
 int
 xfs_buf_delwri_submit_nowait(
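Given the note above, this is a sketch of what a well-behaved caller of the nowait variant might look like. It is a hypothetical helper, not code from this patch; it assumes xfs_buf_delwri_cancel() for the shutdown path and ignores the submit function's return value for brevity:

static void
example_push_buffer_list(
	struct xfs_mount	*mp,
	struct list_head	*buffer_list)
{
	/*
	 * Keep the list moving until it is empty; buffers the nowait
	 * submit would block on are left on @buffer_list for the next
	 * pass. A real caller would wait or push the AIL between passes
	 * rather than spin.
	 */
	while (!list_empty(buffer_list)) {
		if (XFS_FORCED_SHUTDOWN(mp)) {
			/* cancel, don't leak, whatever is still queued */
			xfs_buf_delwri_cancel(buffer_list);
			return;
		}
		xfs_buf_delwri_submit_nowait(buffer_list);
	}
}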
@@ -2116,9 +2087,45 @@
 	 * This allows userspace to disrupt buffer caching for debug/testing
 	 * purposes.
 	 */
-	if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
-			   XFS_ERRTAG_BUF_LRU_REF))
+	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
 		lru_ref = 0;
 
 	atomic_set(&bp->b_lru_ref, lru_ref);
 }
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+	struct xfs_buf		*bp,
+	__be32			dmagic)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	int			idx;
+
+	idx = xfs_sb_version_hascrc(&mp->m_sb);
+	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
+		return false;
+	return dmagic == bp->b_ops->magic[idx];
+}
+
+/*
+ * Same as xfs_verify_magic(), but for the 16-bit magic values used by some
+ * on-disk structures. The verifier magic is in disk byte order so the caller
+ * is expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+	struct xfs_buf		*bp,
+	__be16			dmagic)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	int			idx;
+
+	idx = xfs_sb_version_hascrc(&mp->m_sb);
+	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
+		return false;
+	return dmagic == bp->b_ops->magic16[idx];
+}
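For context, this is roughly how the new magic tables are consumed by a verifier. The ops structure, header layout, and magic constants below are hypothetical; real verifiers in the tree follow the same shape, with magic[0] used on v4 filesystems and magic[1] on v5 (CRC) filesystems:

/* Hypothetical on-disk header carrying a 32-bit magic in disk order. */
struct example_ondisk_hdr {
	__be32			eh_magic;
};

static void
example_buf_verify_read(
	struct xfs_buf		*bp)
{
	struct example_ondisk_hdr	*hdr = bp->b_addr;

	/*
	 * xfs_verify_magic() picks magic[0] or magic[1] based on the
	 * superblock version, so the verifier itself stays format-agnostic.
	 */
	if (!xfs_verify_magic(bp, hdr->eh_magic))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

static const struct xfs_buf_ops example_buf_ops = {
	.name		= "example",
	.magic		= { cpu_to_be32(0x45584D31),	/* made-up v4 magic */
			    cpu_to_be32(0x45584D35) },	/* made-up v5 magic */
	.verify_read	= example_buf_verify_read,
};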
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4e3171a..f6ce17d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -21,12 +21,6 @@
 
 #define XFS_BUF_DADDR_NULL	((xfs_daddr_t) (-1LL))
 
-typedef enum {
-	XBRW_READ = 1,			/* transfer into target memory */
-	XBRW_WRITE = 2,			/* transfer from target memory */
-	XBRW_ZERO = 3,			/* Zero target memory */
-} xfs_buf_rw_t;
-
 #define XBF_READ	 (1 << 0) /* buffer intended for reading from device */
 #define XBF_WRITE	 (1 << 1) /* buffer intended for writing to device */
 #define XBF_READ_AHEAD	 (1 << 2) /* asynchronous read-ahead */
@@ -34,12 +28,7 @@
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */
-
-/* I/O hints for the BIO layer */
-#define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA		 (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH	 (1 << 12)/* flush the disk cache before a write */
+#define XBF_WRITE_FAIL	 (1 << 7) /* async writes have failed on this buffer */
 
 /* flags used only as arguments to access routines */
 #define XBF_TRYLOCK	 (1 << 16)/* lock requested, but do not wait */
@@ -49,7 +38,6 @@
 #define _XBF_PAGES	 (1 << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
-#define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -62,15 +50,11 @@
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
-	{ XBF_SYNCIO,		"SYNCIO" }, \
-	{ XBF_FUA,		"FUA" }, \
-	{ XBF_FLUSH,		"FLUSH" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" },	/* should never be set */\
 	{ XBF_UNMAPPED,		"UNMAPPED" },	/* ditto */\
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
-	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_COMPOUND,	"COMPOUND" }
+	{ _XBF_DELWRI_Q,	"DELWRI_Q" }
 
 
 /*
@@ -125,6 +109,10 @@
 
 struct xfs_buf_ops {
 	char *name;
+	union {
+		__be32 magic[2];	/* v4 and v5 on disk magic values */
+		__be16 magic16[2];	/* v4 and v5 on disk magic values */
+	};
 	void (*verify_read)(struct xfs_buf *);
 	void (*verify_write)(struct xfs_buf *);
 	xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -157,13 +145,13 @@
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
+	struct xfs_mount	*b_mount;
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_ioend_work;
-	struct workqueue_struct	*b_ioend_wq;	/* I/O completion wq */
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
-	void			*b_log_item;
+	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
 	struct page		**b_pages;	/* array of page pointers */
@@ -171,7 +159,6 @@
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
-	int			b_io_length;	/* IO size in BBs */
 	atomic_t		b_pin_count;	/* pin count */
 	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	unsigned int		b_page_count;	/* size of page array */
@@ -205,21 +192,6 @@
 			   xfs_daddr_t blkno, size_t numblks,
 			   xfs_buf_flags_t flags);
 
-struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
-			       struct xfs_buf_map *map, int nmaps,
-			       xfs_buf_flags_t flags);
-
-static inline struct xfs_buf *
-xfs_buf_alloc(
-	struct xfs_buftarg	*target,
-	xfs_daddr_t		blkno,
-	size_t			numblks,
-	xfs_buf_flags_t		flags)
-{
-	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return _xfs_buf_alloc(target, &map, 1, flags);
-}
-
 struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
 			       xfs_buf_flags_t flags);
@@ -235,11 +207,10 @@
 xfs_buf_get(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		blkno,
-	size_t			numblks,
-	xfs_buf_flags_t		flags)
+	size_t			numblks)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return xfs_buf_get_map(target, &map, 1, flags);
+	return xfs_buf_get_map(target, &map, 1, 0);
 }
 
 static inline struct xfs_buf *
@@ -265,9 +236,6 @@
 	return xfs_buf_readahead_map(target, &map, 1, ops);
 }
 
-void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
-int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
-
 struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 				int flags);
 int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
@@ -301,10 +269,7 @@
 	return __xfs_buf_submit(bp, wait);
 }
 
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
-				xfs_buf_rw_t);
-#define xfs_buf_zero(bp, off, len) \
-	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
 
 /* Buffer Utility Routines */
 extern void *xfs_buf_offset(struct xfs_buf *, size_t);
@@ -385,4 +350,14 @@
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
 
+static inline int
+xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
+{
+	return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
+}
+
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+
 #endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455..d74fbd1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -5,19 +5,17 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
-#include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
-#include "xfs_inode.h"
 
 
 kmem_zone_t	*xfs_buf_item_zone;
@@ -520,7 +518,7 @@
 	/* has a previous flush failed due to IO errors? */
 	if ((bp->b_flags & XBF_WRITE_FAIL) &&
 	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
-		xfs_warn(bp->b_target->bt_mount,
+		xfs_warn(bp->b_mount,
 "Failing async write on buffer block 0x%llx. Retrying async write.",
 			 (long long)bp->b_bn);
 	}
@@ -594,7 +592,7 @@
  * free the item.
  */
 STATIC void
-xfs_buf_item_unlock(
+xfs_buf_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
@@ -605,9 +603,11 @@
 #if defined(DEBUG) || defined(XFS_WARN)
 	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
 	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
+	bool			aborted = test_bit(XFS_LI_ABORTED,
+						   &lip->li_flags);
 #endif
 
-	trace_xfs_buf_item_unlock(bip);
+	trace_xfs_buf_item_release(bip);
 
 	/*
 	 * The bli dirty state should match whether the blf has logged segments
@@ -633,10 +633,18 @@
 	released = xfs_buf_item_put(bip);
 	if (hold || (stale && !released))
 		return;
-	ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags));
+	ASSERT(!stale || aborted);
 	xfs_buf_relse(bp);
 }
 
+STATIC void
+xfs_buf_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn)
+{
+	return xfs_buf_item_release(lip);
+}
+
 /*
  * This is called to find out where the oldest active copy of the
  * buf log item in the on disk log resides now that the last log
@@ -669,25 +677,15 @@
 	return lsn;
 }
 
-STATIC void
-xfs_buf_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		commit_lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
 static const struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_size	= xfs_buf_item_size,
 	.iop_format	= xfs_buf_item_format,
 	.iop_pin	= xfs_buf_item_pin,
 	.iop_unpin	= xfs_buf_item_unpin,
-	.iop_unlock	= xfs_buf_item_unlock,
+	.iop_release	= xfs_buf_item_release,
+	.iop_committing	= xfs_buf_item_committing,
 	.iop_committed	= xfs_buf_item_committed,
 	.iop_push	= xfs_buf_item_push,
-	.iop_committing = xfs_buf_item_committing
 };
 
 STATIC int
@@ -704,7 +702,7 @@
 	}
 
 	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
-				KM_SLEEP);
+				0);
 	if (!bip->bli_formats)
 		return -ENOMEM;
 	return 0;
@@ -741,7 +739,7 @@
 	 * this buffer. If we do already have one, there is
 	 * nothing to do here so return.
 	 */
-	ASSERT(bp->b_target->bt_mount == mp);
+	ASSERT(bp->b_mount == mp);
 	if (bip) {
 		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 		ASSERT(!bp->b_transp);
@@ -749,7 +747,7 @@
 		return 0;
 	}
 
-	bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+	bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
 	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 	bip->bli_buf = bp;
 
@@ -978,9 +976,9 @@
  */
 void
 xfs_buf_attach_iodone(
-	xfs_buf_t	*bp,
-	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
-	xfs_log_item_t	*lip)
+	struct xfs_buf		*bp,
+	void			(*cb)(struct xfs_buf *, struct xfs_log_item *),
+	struct xfs_log_item	*lip)
 {
 	ASSERT(xfs_buf_islocked(bp));
 
@@ -1233,9 +1231,23 @@
 }
 
 /*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
  *
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active references to the buffer may
+ * be held by the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
  */
 bool
 xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1255,16 @@
 	struct list_head	*buffer_list)
 {
 	struct xfs_log_item	*lip;
+	bool			ret;
+
+	ret = xfs_buf_delwri_queue(bp, buffer_list);
 
 	/*
-	 * Clear XFS_LI_FAILED flag from all items before resubmit
-	 *
-	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller  this
+	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
 	 * function already has it acquired
 	 */
 	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
 		xfs_clear_li_failed(lip);
 
-	/* Add this buffer back to the delayed write list */
-	return xfs_buf_delwri_queue(bp, buffer_list);
+	return ret;
 }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 90f65f8..4a054b1 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -39,7 +39,7 @@
  * locked, and which 128 byte chunks of the buffer are dirty.
  */
 struct xfs_buf_log_item {
-	xfs_log_item_t		bli_item;	/* common item structure */
+	struct xfs_log_item	bli_item;	/* common item structure */
 	struct xfs_buf		*bli_buf;	/* real buffer pointer */
 	unsigned int		bli_flags;	/* misc flags */
 	unsigned int		bli_recur;	/* lock recursion count */
@@ -55,8 +55,8 @@
 void	xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
 bool	xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
 void	xfs_buf_attach_iodone(struct xfs_buf *,
-			      void(*)(struct xfs_buf *, xfs_log_item_t *),
-			      xfs_log_item_t *);
+			      void(*)(struct xfs_buf *, struct xfs_log_item *),
+			      struct xfs_log_item *);
 void	xfs_buf_iodone_callbacks(struct xfs_buf *);
 void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
 bool	xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 5142e64..283df89 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -6,17 +6,14 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
-#include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07ed..8ec7aab 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -4,19 +4,17 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
-#include "xfs_discard.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
 
@@ -161,9 +159,19 @@
 		return -EPERM;
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
+
+	/*
+	 * We haven't recovered the log, so we cannot use our bnobt-guided
+	 * storage zapping commands.
+	 */
+	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+		return -EROFS;
+
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -EFAULT;
 
+	range.minlen = max_t(u64, granularity, range.minlen);
+	minlen = BTOBB(range.minlen);
 	/*
 	 * Truncating down the len isn't actually quite correct, but using
 	 * BBTOB would mean we trivially get overflows for values
@@ -178,7 +186,6 @@
 
 	start = BTOBB(range.start);
 	end = start + BTOBBT(range.len) - 1;
-	minlen = BTOBB(max_t(u64, granularity, range.minlen));
 
 	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
 		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 87e6dd5..aeb95e7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -14,16 +14,12 @@
 #include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_alloc.h"
 #include "xfs_quota.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
@@ -277,7 +273,8 @@
 
 /*
  * Ensure that the given in-core dquot has a buffer on disk backing it, and
- * return the buffer. This is called when the bmapi finds a hole.
+ * return the buffer locked and held. This is called when the bmapi finds a
+ * hole.
  */
 STATIC int
 xfs_dquot_disk_alloc(
@@ -355,13 +352,14 @@
 	 * If everything succeeds, the caller of this function is returned a
 	 * buffer that is locked and held to the transaction.  The caller
 	 * is responsible for unlocking any buffer passed back, either
-	 * manually or by committing the transaction.
+	 * manually or by committing the transaction.  On error, the buffer is
+	 * released and not passed back.
 	 */
 	xfs_trans_bhold(tp, bp);
 	error = xfs_defer_finish(tpp);
-	tp = *tpp;
 	if (error) {
-		xfs_buf_relse(bp);
+		xfs_trans_bhold_release(*tpp, bp);
+		xfs_trans_brelse(*tpp, bp);
 		return error;
 	}
 	*bpp = bp;
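The error handling above is an instance of the hold-a-buffer-across-a-transaction-roll pattern. A condensed sketch of the pattern, using only the calls that appear in this hunk (the wrapper function itself is hypothetical):

static int
example_hold_buf_across_roll(
	struct xfs_trans	**tpp,
	struct xfs_buf		*bp)
{
	int			error;

	/* keep @bp locked and held when the transaction rolls */
	xfs_trans_bhold(*tpp, bp);
	error = xfs_defer_finish(tpp);
	if (error) {
		/* we still own the hold; undo it and release the buffer */
		xfs_trans_bhold_release(*tpp, bp);
		xfs_trans_brelse(*tpp, bp);
		return error;
	}
	/* success: @bp is still locked, joined to the rolled transaction */
	return 0;
}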
@@ -442,7 +440,7 @@
 {
 	struct xfs_dquot	*dqp;
 
-	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+	dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
 
 	dqp->dq_flags = type;
 	dqp->q_core.d_id = cpu_to_be32(id);
@@ -521,7 +519,6 @@
 	struct xfs_buf		**bpp)
 {
 	struct xfs_trans	*tp;
-	struct xfs_buf		*bp;
 	int			error;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
@@ -529,7 +526,7 @@
 	if (error)
 		goto err;
 
-	error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+	error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
 	if (error)
 		goto err_cancel;
 
@@ -539,10 +536,10 @@
 		 * Buffer was held to the transaction, so we have to unlock it
 		 * manually here because we're not passing it back.
 		 */
-		xfs_buf_relse(bp);
+		xfs_buf_relse(*bpp);
+		*bpp = NULL;
 		goto err;
 	}
-	*bpp = bp;
 	return 0;
 
 err_cancel:
@@ -1242,7 +1239,7 @@
 /*
  * Iterate every dquot of a particular type.  The caller must ensure that the
  * particular quota type is active.  iter_fn can return negative error codes,
- * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ * or -ECANCELED to indicate that it wants to stop iterating.
  */
 int
 xfs_qm_dqiterate(
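A sketch of a callback honouring that convention. The callback signature is assumed from the dqiterate interface described above and may differ in detail:

/*
 * Stop after visiting a fixed number of dquots. Returning -ECANCELED
 * ends the walk early without being treated as a failure; any other
 * negative return is propagated as a real error.
 */
static int
example_dquot_walk_fn(
	struct xfs_dquot	*dqp,
	uint			dqtype,
	void			*priv)
{
	unsigned int		*budget = priv;

	if (*budget == 0)
		return -ECANCELED;
	(*budget)--;
	return 0;
}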
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 64bd864..4fe8570 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -34,7 +34,6 @@
 	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
 	struct list_head q_lru;		/* global free list of dquots */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
-	struct xfs_trans*q_transp;	/* trans this belongs to currently */
 	uint		 q_nrefs;	/* # active refs from inodes */
 	xfs_daddr_t	 q_blkno;	/* blkno of dquot buffer */
 	int		 q_bufoffset;	/* off of dq in buffer (# dquots) */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 7dedd17..d60647d 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -5,13 +5,13 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_quota.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -94,18 +94,6 @@
 		wake_up(&dqp->q_pinwait);
 }
 
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	/*
-	 * We always re-log the entire dquot when it becomes dirty,
-	 * so, the latest copy _is_ the only one that matters.
-	 */
-	return lsn;
-}
-
 /*
  * This is called to wait for the given dquot to be unpinned.
  * Most of these pin/unpin routines are plagiarized from inode code.
@@ -209,14 +197,8 @@
 	return rval;
 }
 
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction.  If the
- * hold flags is set, do not unlock the dquot.
- */
 STATIC void
-xfs_qm_dquot_logitem_unlock(
+xfs_qm_dquot_logitem_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
@@ -224,11 +206,6 @@
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	/*
-	 * Clear the transaction pointer in the dquot
-	 */
-	dqp->q_transp = NULL;
-
-	/*
 	 * dquots are never 'held' from getting unlocked at the end of
 	 * a transaction.  Their locking and unlocking is hidden inside the
 	 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
@@ -237,30 +214,22 @@
 	xfs_dqunlock(dqp);
 }
 
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
 STATIC void
 xfs_qm_dquot_logitem_committing(
 	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
+	xfs_lsn_t		commit_lsn)
 {
+	return xfs_qm_dquot_logitem_release(lip);
 }
 
-/*
- * This is the ops vector for dquots
- */
 static const struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_size	= xfs_qm_dquot_logitem_size,
 	.iop_format	= xfs_qm_dquot_logitem_format,
 	.iop_pin	= xfs_qm_dquot_logitem_pin,
 	.iop_unpin	= xfs_qm_dquot_logitem_unpin,
-	.iop_unlock	= xfs_qm_dquot_logitem_unlock,
-	.iop_committed	= xfs_qm_dquot_logitem_committed,
+	.iop_release	= xfs_qm_dquot_logitem_release,
+	.iop_committing	= xfs_qm_dquot_logitem_committing,
 	.iop_push	= xfs_qm_dquot_logitem_push,
-	.iop_committing = xfs_qm_dquot_logitem_committing,
 	.iop_error	= xfs_dquot_item_error
 };
 
@@ -320,26 +289,6 @@
 }
 
 /*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-STATIC void
-xfs_qm_qoff_logitem_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-STATIC void
-xfs_qm_qoff_logitem_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-/*
  * There isn't much you can do to push a quotaoff item.  It is simply
  * stuck waiting for the log to be flushed to disk.
  */
@@ -351,28 +300,6 @@
 	return XFS_ITEM_LOCKED;
 }
 
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-STATIC void
-xfs_qm_qoff_logitem_unlock(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	return lsn;
-}
-
 STATIC xfs_lsn_t
 xfs_qm_qoffend_logitem_committed(
 	struct xfs_log_item	*lip,
@@ -396,50 +323,17 @@
 	return (xfs_lsn_t)-1;
 }
 
-/*
- * XXX rcc - don't know quite what to do with this.  I think we can
- * just ignore it.  The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen?  Also, do we need different routines for
- * quotaoff start and quotaoff end?  I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable).  If we do something else,
- * then maybe we don't need two.
- */
-STATIC void
-xfs_qm_qoff_logitem_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		commit_lsn)
-{
-}
-
 static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_size	= xfs_qm_qoff_logitem_size,
 	.iop_format	= xfs_qm_qoff_logitem_format,
-	.iop_pin	= xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
-	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
 	.iop_committed	= xfs_qm_qoffend_logitem_committed,
 	.iop_push	= xfs_qm_qoff_logitem_push,
-	.iop_committing = xfs_qm_qoff_logitem_committing
 };
 
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
 static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_size	= xfs_qm_qoff_logitem_size,
 	.iop_format	= xfs_qm_qoff_logitem_format,
-	.iop_pin	= xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
-	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
-	.iop_committed	= xfs_qm_qoff_logitem_committed,
 	.iop_push	= xfs_qm_qoff_logitem_push,
-	.iop_committing = xfs_qm_qoff_logitem_committing
 };
 
 /*
@@ -453,7 +347,7 @@
 {
 	struct xfs_qoff_logitem	*qf;
 
-	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
 
 	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
 			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index db9df71..1aed34c 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -12,13 +12,13 @@
 struct xfs_qoff_logitem;
 
 typedef struct xfs_dq_logitem {
-	xfs_log_item_t		 qli_item;	   /* common portion */
+	struct xfs_log_item	 qli_item;	   /* common portion */
 	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
 } xfs_dq_logitem_t;
 
 typedef struct xfs_qoff_logitem {
-	xfs_log_item_t		 qql_item;	/* common portion */
+	struct xfs_log_item	 qql_item;	/* common portion */
 	struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
 	unsigned int		qql_flags;
 } xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f54..849fd44 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -4,6 +4,7 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_fs.h"
 #include "xfs_log_format.h"
@@ -51,6 +52,7 @@
 	XFS_RANDOM_BUF_LRU_REF,
 	XFS_RANDOM_FORCE_SCRUB_REPAIR,
 	XFS_RANDOM_FORCE_SUMMARY_RECALC,
+	XFS_RANDOM_IUNLINK_FALLBACK,
 };
 
 struct xfs_errortag_attr {
@@ -159,6 +161,7 @@
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +198,7 @@
 	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
 	XFS_ERRORTAG_ATTR_LIST(force_repair),
 	XFS_ERRORTAG_ATTR_LIST(bad_summary),
+	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
 	NULL,
 };
 
@@ -209,7 +213,7 @@
 	struct xfs_mount	*mp)
 {
 	mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
-			KM_SLEEP | KM_MAYFAIL);
+			KM_MAYFAIL);
 	if (!mp->m_errortag)
 		return -ENOMEM;
 
@@ -350,14 +354,15 @@
 	size_t			bufsz,
 	xfs_failaddr_t		failaddr)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_mount	*mp = bp->b_mount;
 	xfs_failaddr_t		fa;
 	int			sz;
 
 	fa = failaddr ? failaddr : __return_address;
 	__xfs_buf_ioerror(bp, error, fa);
 
-	xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+	xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+		  "Metadata %s detected at %pS, %s block 0x%llx %s",
 		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
 		  fa, bp->b_ops->name, bp->b_bn, name);
 
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e9..602aa7d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@
 #define		XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
 #define		XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
 #define		XFS_PTAG_FSBLOCK_ZERO		0x00000080
+#define		XFS_PTAG_VERIFIER_ERROR		0x00000100
 
 #endif	/* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f2284ce..f1372f9 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -4,18 +4,16 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_export.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
 #include "xfs_pnfs.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed6837..2183d87 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@
 	struct rb_node		**rbp;
 	struct rb_node		*parent = NULL;
 
-	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d9da66c..e44efc4 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -9,14 +9,18 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
+#include "xfs_shared.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_log.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
 
 
 kmem_zone_t	*xfs_efi_zone;
@@ -107,15 +111,6 @@
 
 
 /*
- * Pinning has no meaning for an efi item, so just return.
- */
-STATIC void
-xfs_efi_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The unpin operation is the last place an EFI is manipulated in the log. It is
  * either inserted in the AIL or aborted in the event of a log I/O error. In
  * either case, the EFI transaction has been successfully committed to make it
@@ -133,71 +128,22 @@
 }
 
 /*
- * Efi items have no locking or pushing.  However, since EFIs are pulled from
- * the AIL when their corresponding EFDs are committed to disk, their situation
- * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log.  This should help in getting the EFI out of
- * the AIL.
- */
-STATIC uint
-xfs_efi_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The EFI has been either committed or aborted if the transaction has been
  * cancelled. If the transaction was cancelled, an EFD isn't going to be
  * constructed and thus we free the EFI here directly.
  */
 STATIC void
-xfs_efi_item_unlock(
+xfs_efi_item_release(
 	struct xfs_log_item	*lip)
 {
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
-		xfs_efi_release(EFI_ITEM(lip));
+	xfs_efi_release(EFI_ITEM(lip));
 }
 
-/*
- * The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_efi_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	return lsn;
-}
-
-/*
- * The EFI dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_efi_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all efi log items.
- */
 static const struct xfs_item_ops xfs_efi_item_ops = {
 	.iop_size	= xfs_efi_item_size,
 	.iop_format	= xfs_efi_item_format,
-	.iop_pin	= xfs_efi_item_pin,
 	.iop_unpin	= xfs_efi_item_unpin,
-	.iop_unlock	= xfs_efi_item_unlock,
-	.iop_committed	= xfs_efi_item_committed,
-	.iop_push	= xfs_efi_item_push,
-	.iop_committing = xfs_efi_item_committing
+	.iop_release	= xfs_efi_item_release,
 };
 
 
@@ -217,9 +163,9 @@
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, 0);
 	} else {
-		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, 0);
 	}
 
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -349,137 +295,299 @@
 }
 
 /*
- * Pinning has no meaning for an efd item, so just return.
- */
-STATIC void
-xfs_efd_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an efd item, unpinning does
- * not either.
- */
-STATIC void
-xfs_efd_item_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an efd item.  It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_efd_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The EFD is either committed or aborted if the transaction is cancelled. If
  * the transaction is cancelled, drop our reference to the EFI and free the EFD.
  */
 STATIC void
-xfs_efd_item_unlock(
+xfs_efd_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
 
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
-		xfs_efi_release(efdp->efd_efip);
-		xfs_efd_item_free(efdp);
-	}
-}
-
-/*
- * When the efd item is committed to disk, all we need to do is delete our
- * reference to our partner efi item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from further
- * referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_efd_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
-
-	/*
-	 * Drop the EFI reference regardless of whether the EFD has been
-	 * aborted. Once the EFD transaction is constructed, it is the sole
-	 * responsibility of the EFD to release the EFI (even if the EFI is
-	 * aborted due to log I/O error).
-	 */
 	xfs_efi_release(efdp->efd_efip);
 	xfs_efd_item_free(efdp);
-
-	return (xfs_lsn_t)-1;
 }
 
-/*
- * The EFD dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_efd_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all efd log items.
- */
 static const struct xfs_item_ops xfs_efd_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
 	.iop_size	= xfs_efd_item_size,
 	.iop_format	= xfs_efd_item_format,
-	.iop_pin	= xfs_efd_item_pin,
-	.iop_unpin	= xfs_efd_item_unpin,
-	.iop_unlock	= xfs_efd_item_unlock,
-	.iop_committed	= xfs_efd_item_committed,
-	.iop_push	= xfs_efd_item_push,
-	.iop_committing = xfs_efd_item_committing
+	.iop_release	= xfs_efd_item_release,
 };
 
 /*
- * Allocate and initialize an efd item with the given number of extents.
+ * Allocate an "extent free done" log item that will hold nextents worth of
+ * extents.  The caller must use all nextents extents, because we are not
+ * flexible about this at all.
  */
-struct xfs_efd_log_item *
-xfs_efd_init(
-	struct xfs_mount	*mp,
-	struct xfs_efi_log_item	*efip,
-	uint			nextents)
-
+static struct xfs_efd_log_item *
+xfs_trans_get_efd(
+	struct xfs_trans		*tp,
+	struct xfs_efi_log_item		*efip,
+	unsigned int			nextents)
 {
-	struct xfs_efd_log_item	*efdp;
-	uint			size;
+	struct xfs_efd_log_item		*efdp;
 
 	ASSERT(nextents > 0);
+
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
-		size = (uint)(sizeof(xfs_efd_log_item_t) +
-			((nextents - 1) * sizeof(xfs_extent_t)));
-		efdp = kmem_zalloc(size, KM_SLEEP);
+		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
+				(nextents - 1) * sizeof(struct xfs_extent),
+				0);
 	} else {
-		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
 	}
 
-	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
+	xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+			  &xfs_efd_item_ops);
 	efdp->efd_efip = efip;
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
 
+	xfs_trans_add_item(tp, &efdp->efd_item);
 	return efdp;
 }
 
 /*
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails, in order to
+ * support the EFI/EFD lifecycle rules.
+ */
+static int
+xfs_trans_free_extent(
+	struct xfs_trans		*tp,
+	struct xfs_efd_log_item		*efdp,
+	xfs_fsblock_t			start_block,
+	xfs_extlen_t			ext_len,
+	const struct xfs_owner_info	*oinfo,
+	bool				skip_discard)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_extent		*extp;
+	uint				next_extent;
+	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, start_block);
+	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp,
+								start_block);
+	int				error;
+
+	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+	error = __xfs_free_extent(tp, start_block, ext_len,
+				  oinfo, XFS_AG_RESV_NONE, skip_discard);
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the EFI and frees the EFD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+
+	return error;
+}
+
+/* Sort extent free items by AG. */
+static int
+xfs_extent_free_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_extent_free_item	*ra;
+	struct xfs_extent_free_item	*rb;
+
+	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}
+
+/* Get an EFI. */
+STATIC void *
+xfs_extent_free_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_efi_log_item		*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, count);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Add the EFI to the transaction so it is logged along with it.
+	 */
+	xfs_trans_add_item(tp, &efip->efi_item);
+	return efip;
+}
+
+/* Log a free extent to the intent item. */
+STATIC void
+xfs_extent_free_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_efi_log_item		*efip = intent;
+	struct xfs_extent_free_item	*free;
+	uint				next_extent;
+	struct xfs_extent		*extp;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &efip->efi_format.efi_extents[next_extent];
+	extp->ext_start = free->xefi_startblock;
+	extp->ext_len = free->xefi_blockcount;
+}
+
+/* Get an EFD so we can process all the free extents. */
+STATIC void *
+xfs_extent_free_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_efd(tp, intent, count);
+}
+
+/* Process a free extent. */
+STATIC int
+xfs_extent_free_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_extent_free_item	*free;
+	int				error;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+	error = xfs_trans_free_extent(tp, done_item,
+			free->xefi_startblock,
+			free->xefi_blockcount,
+			&free->xefi_oinfo, free->xefi_skip_discard);
+	kmem_free(free);
+	return error;
+}
+
+/* Abort all pending EFIs. */
+STATIC void
+xfs_extent_free_abort_intent(
+	void				*intent)
+{
+	xfs_efi_release(intent);
+}
+
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_extent_free_item	*free;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+	kmem_free(free);
+}
+
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_extent_free_diff_items,
+	.create_intent	= xfs_extent_free_create_intent,
+	.abort_intent	= xfs_extent_free_abort_intent,
+	.log_item	= xfs_extent_free_log_item,
+	.create_done	= xfs_extent_free_create_done,
+	.finish_item	= xfs_extent_free_finish_item,
+	.cancel_item	= xfs_extent_free_cancel_item,
+};
+
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_efd_log_item		*efdp = done_item;
+	struct xfs_extent_free_item	*free;
+	struct xfs_extent		*extp;
+	struct xfs_buf			*agbp;
+	int				error;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	uint				next_extent;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+	ASSERT(free->xefi_blockcount == 1);
+	agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+	agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+	trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+	if (!error)
+		error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+					    &free->xefi_oinfo);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the EFI and frees the EFD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = free->xefi_startblock;
+	extp->ext_len = free->xefi_blockcount;
+	efdp->efd_next_extent++;
+
+	kmem_free(free);
+	return error;
+}
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_extent_free_diff_items,
+	.create_intent	= xfs_extent_free_create_intent,
+	.abort_intent	= xfs_extent_free_abort_intent,
+	.log_item	= xfs_extent_free_log_item,
+	.create_done	= xfs_extent_free_create_done,
+	.finish_item	= xfs_agfl_free_finish_item,
+	.cancel_item	= xfs_extent_free_cancel_item,
+};
+
+/*
  * Process an extent free intent item that was recovered from
  * the log.  We need to free the extents that it describes.
  */
@@ -494,7 +602,6 @@
 	int			error = 0;
 	xfs_extent_t		*extp;
 	xfs_fsblock_t		startblock_fsb;
-	struct xfs_owner_info	oinfo;
 
 	ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
 
@@ -526,11 +633,11 @@
 		return error;
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
-	xfs_rmap_any_owner_update(&oinfo);
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
 		extp = &efip->efi_format.efi_extents[i];
 		error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
-					      extp->ext_len, &oinfo, false);
+					      extp->ext_len,
+					      &XFS_RMAP_OINFO_ANY_OWNER, false);
 		if (error)
 			goto abort_error;
 
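Taken together, the two defer op type tables above fix the order in which the deferred-operation framework drives these callbacks. A hedged summary of that lifecycle for one deferred extent free, inferred from this hunk rather than quoted from the framework:

/*
 * create_intent(tp, count)   - allocate an EFI and add it to @tp
 * log_item(tp, efip, item)   - copy one queued extent into the EFI
 *       ... the transaction commits; xfs_defer_finish() rolls it ...
 * create_done(tp, efip, n)   - allocate an EFD bound to the EFI
 * finish_item(tp, item, efdp, state)
 *                            - free the extent and log it to the EFD
 * cancel_item(item)          - drop a queued item that will never run
 * abort_intent(efip)         - release the EFI if the chain is aborted
 */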
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2a6a895..16aaab0 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -51,7 +51,7 @@
  * AIL, so at this point both the EFI and EFD are freed.
  */
 typedef struct xfs_efi_log_item {
-	xfs_log_item_t		efi_item;
+	struct xfs_log_item	efi_item;
 	atomic_t		efi_refcount;
 	atomic_t		efi_next_extent;
 	unsigned long		efi_flags;	/* misc flags */
@@ -64,7 +64,7 @@
  * have been freed.
  */
 typedef struct xfs_efd_log_item {
-	xfs_log_item_t		efd_item;
+	struct xfs_log_item	efd_item;
 	xfs_efi_log_item_t	*efd_efip;
 	uint			efd_next_extent;
 	xfs_efd_log_format_t	efd_format;
@@ -79,8 +79,6 @@
 extern struct kmem_zone	*xfs_efd_zone;
 
 xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
-xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
-				      uint);
 int			xfs_efi_copy_format(xfs_log_iovec_t *buf,
 					    xfs_efi_log_format_t *dst_efi_fmt);
 void			xfs_efi_item_free(xfs_efi_log_item_t *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 61a5ad2..1ffb179 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -10,14 +10,11 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_error.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
@@ -28,11 +25,10 @@
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
 
-#include <linux/dcache.h>
 #include <linux/falloc.h>
-#include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/mman.h>
+#include <linux/fadvise.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -367,40 +363,30 @@
 	 * lock above.  Eventually we should look into a way to avoid
 	 * the pointless lock roundtrip.
 	 */
-	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
-		error = file_update_time(file);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * If we're writing the file then make sure to clear the setuid and
-	 * setgid bits if the process is not being run by root.  This keeps
-	 * people from modifying setuid and setgid binaries.
-	 */
-	if (!IS_NOSEC(inode))
-		return file_remove_privs(file);
-	return 0;
+	return file_modified(file);
 }
 
 static int
 xfs_dio_write_end_io(
 	struct kiocb		*iocb,
 	ssize_t			size,
+	int			error,
 	unsigned		flags)
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			offset = iocb->ki_pos;
-	int			error = 0;
+	unsigned int		nofs_flag;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (size <= 0)
-		return size;
+	if (error)
+		return error;
+	if (!size)
+		return 0;
 
 	/*
 	 * Capture amount written on completion as we can't reliably account
@@ -408,10 +394,17 @@
 	 */
 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
 
+	/*
+	 * We can allocate memory here while doing writeback on behalf of
+	 * memory reclaim.  To avoid memory allocation deadlocks set the
+	 * task-wide nofs context for the following operations.
+	 */
+	nofs_flag = memalloc_nofs_save();
+
 	if (flags & IOMAP_DIO_COW) {
 		error = xfs_reflink_end_cow(ip, offset, size);
 		if (error)
-			return error;
+			goto out;
 	}
 
 	/*
@@ -420,8 +413,10 @@
 	 * earlier allows a racing dio read to find unwritten extents before
 	 * they are converted.
 	 */
-	if (flags & IOMAP_DIO_UNWRITTEN)
-		return xfs_iomap_write_unwritten(ip, offset, size, true);
+	if (flags & IOMAP_DIO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, offset, size, true);
+		goto out;
+	}
 
 	/*
 	 * We need to update the in-core inode size here so that we don't end up
@@ -443,9 +438,15 @@
 		spin_unlock(&ip->i_flags_lock);
 	}
 
+out:
+	memalloc_nofs_restore(nofs_flag);
 	return error;
 }
 
+static const struct iomap_dio_ops xfs_dio_write_ops = {
+	.end_io		= xfs_dio_write_end_io,
+};
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
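The memalloc_nofs_save()/memalloc_nofs_restore() pair added above is the task-wide way to force GFP_NOFS behaviour across a whole call chain. A minimal sketch of the idiom; the helper being protected is hypothetical:

#include <linux/sched/mm.h>

/* Hypothetical stand-in for work that allocates memory. */
static int example_allocating_helper(void)
{
	return 0;
}

static int
example_do_completion_work(void)
{
	unsigned int	nofs_flag;
	int		error;

	/*
	 * Between save and restore, every allocation in this task behaves
	 * as if it were GFP_NOFS, so direct reclaim cannot re-enter the
	 * filesystem while we complete I/O on its behalf.
	 */
	nofs_flag = memalloc_nofs_save();
	error = example_allocating_helper();
	memalloc_nofs_restore(nofs_flag);
	return error;
}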
@@ -507,7 +508,7 @@
 		 * We can't properly handle unaligned direct I/O to reflink
 		 * files yet, as we can't unshare a partial block.
 		 */
-		if (xfs_is_reflink_inode(ip)) {
+		if (xfs_is_cow_inode(ip)) {
 			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
 			return -EREMCHG;
 		}
@@ -517,6 +518,9 @@
 	}
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
+		/* unaligned dio always waits, bail */
+		if (unaligned_io)
+			return -EAGAIN;
 		if (!xfs_ilock_nowait(ip, iolock))
 			return -EAGAIN;
 	} else {
@@ -529,25 +533,29 @@
 	count = iov_iter_count(from);
 
 	/*
-	 * If we are doing unaligned IO, wait for all other IO to drain,
-	 * otherwise demote the lock if we had to take the exclusive lock
-	 * for other reasons in xfs_file_aio_write_checks.
+	 * If we are doing unaligned IO, we can't allow any other overlapping IO
+	 * in-flight at the same time or we risk data corruption. Wait for all
+	 * other IO to drain before we submit. If the IO is aligned, demote the
+	 * iolock if we had to take the exclusive lock in
+	 * xfs_file_aio_write_checks() for other reasons.
 	 */
 	if (unaligned_io) {
-		/* If we are going to wait for other DIO to finish, bail */
-		if (iocb->ki_flags & IOCB_NOWAIT) {
-			if (atomic_read(&inode->i_dio_count))
-				return -EAGAIN;
-		} else {
-			inode_dio_wait(inode);
-		}
+		inode_dio_wait(inode);
 	} else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
+
+	/*
+	 * If unaligned, this is the only IO in-flight. If it has not yet
+	 * completed, wait on it before we release the iolock to prevent
+	 * subsequent overlapping IO.
+	 */
+	if (ret == -EIOCBQUEUED && unaligned_io)
+		inode_dio_wait(inode);
 out:
 	xfs_iunlock(ip, iolock);
 
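To see why sub-block direct I/O needs this exclusion: a write whose offset or length is not a multiple of the filesystem block size forces a read-modify-write of the containing block, so two such writes landing in the same block concurrently can each overwrite the other's bytes. A small standalone sketch of the alignment test itself (the 4096-byte block size and the exact mask expression are assumptions for illustration):

#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_BLOCKSIZE	4096UL	/* illustrative fs block size */

/*
 * Assumed shape of the kernel's test: any sub-block offset or length
 * makes the I/O "unaligned" and forces exclusive treatment.
 */
static bool dio_is_unaligned(unsigned long long pos, unsigned long long count)
{
	return (pos | count) & (EXAMPLE_BLOCKSIZE - 1);
}

int main(void)
{
	printf("pos=8192 count=4096 -> %s\n",
	       dio_is_unaligned(8192, 4096) ? "unaligned" : "aligned");
	printf("pos=8192 count=512  -> %s\n",
	       dio_is_unaligned(8192, 512) ? "unaligned" : "aligned");
	printf("pos=8704 count=4096 -> %s\n",
	       dio_is_unaligned(8704, 4096) ? "unaligned" : "aligned");
	return 0;
}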
@@ -872,14 +880,27 @@
 				goto out_unlock;
 		}
 
-		if (mode & FALLOC_FL_ZERO_RANGE)
+		if (mode & FALLOC_FL_ZERO_RANGE) {
 			error = xfs_zero_file_space(ip, offset, len);
-		else {
-			if (mode & FALLOC_FL_UNSHARE_RANGE) {
-				error = xfs_reflink_unshare(ip, offset, len);
-				if (error)
-					goto out_unlock;
+		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+			error = xfs_reflink_unshare(ip, offset, len);
+			if (error)
+				goto out_unlock;
+
+			if (!xfs_is_always_cow_inode(ip)) {
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
 			}
+		} else {
+			/*
+			 * If always_cow mode we can't use preallocations and
+			 * thus should not create them.
+			 */
+			if (xfs_is_always_cow_inode(ip)) {
+				error = -EOPNOTSUPP;
+				goto out_unlock;
+			}
+
 			error = xfs_alloc_file_space(ip, offset, len,
 						     XFS_BMAPI_PREALLOC);
 		}
@@ -920,27 +941,90 @@
 }
 
 STATIC int
-xfs_file_clone_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	u64		len)
+xfs_file_fadvise(
+	struct file	*file,
+	loff_t		start,
+	loff_t		end,
+	int		advice)
 {
-	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-				     len, false);
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	int ret;
+	int lockflags = 0;
+
+	/*
+	 * Operations creating pages in page cache need protection from hole
+	 * punching and similar ops.
+	 */
+	if (advice == POSIX_FADV_WILLNEED) {
+		lockflags = XFS_IOLOCK_SHARED;
+		xfs_ilock(ip, lockflags);
+	}
+	ret = generic_fadvise(file, start, end, advice);
+	if (lockflags)
+		xfs_iunlock(ip, lockflags);
+	return ret;
 }
 
-STATIC int
-xfs_file_dedupe_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	u64		len)
+STATIC loff_t
+xfs_file_remap_range(
+	struct file		*file_in,
+	loff_t			pos_in,
+	struct file		*file_out,
+	loff_t			pos_out,
+	loff_t			len,
+	unsigned int		remap_flags)
 {
-	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-				     len, true);
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	struct xfs_mount	*mp = src->i_mount;
+	loff_t			remapped = 0;
+	xfs_extlen_t		cowextsize;
+	int			ret;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Prepare and then clone file data. */
+	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+			&len, remap_flags);
+	if (ret < 0 || len == 0)
+		return ret;
+
+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+			&remapped);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Carry the cowextsize hint from src to dest if we're sharing the
+	 * entire source file to the entire destination file, the source file
+	 * has a cowextsize hint, and the destination file does not.
+	 */
+	cowextsize = 0;
+	if (pos_in == 0 && len == i_size_read(inode_in) &&
+	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    pos_out == 0 && len >= i_size_read(inode_out) &&
+	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		cowextsize = src->i_d.di_cowextsize;
+
+	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+			remap_flags);
+
+out_unlock:
+	xfs_reflink_remap_unlock(file_in, file_out);
+	if (ret)
+		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+	return remapped > 0 ? remapped : ret;
 }
 
 STATIC int
@@ -1029,10 +1113,10 @@
 	default:
 		return generic_file_llseek(file, offset, whence);
 	case SEEK_HOLE:
-		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	case SEEK_DATA:
-		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	}
 
@@ -1144,11 +1228,14 @@
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
+	struct dax_device	*dax_dev;
+
+	dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
 	/*
-	 * We don't support synchronous mappings for non-DAX files. At least
-	 * until someone comes with a sensible use case.
+	 * We don't support synchronous mappings for non-DAX files, nor
+	 * for DAX files whose underlying dax_device is not synchronous.
 	 */
-	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(filp);
@@ -1164,6 +1251,7 @@
 	.write_iter	= xfs_file_write_iter,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
@@ -1175,8 +1263,8 @@
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
-	.clone_file_range = xfs_file_clone_range,
-	.dedupe_file_range = xfs_file_dedupe_range,
+	.fadvise	= xfs_file_fadvise,
+	.remap_file_range = xfs_file_remap_range,
 };
 
 const struct file_operations xfs_dir_file_operations = {
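
For illustration (this code is not in the patch): the deleted clone and dedupe
methods above are folded into one ->remap_file_range handler, with dedupe
selected by REMAP_FILE_DEDUP and the length clamped by
xfs_reflink_remap_prep(). A minimal userspace sketch that drives this path
through the FICLONERANGE ioctl; it assumes both files live on the same
reflink-enabled XFS and fails with EOPNOTSUPP otherwise, mirroring the
hasreflink check above:

	/* clone.c: share all of src's blocks into dst via FICLONERANGE */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FICLONERANGE, struct file_clone_range */

	int main(int argc, char **argv)
	{
		struct file_clone_range fcr;
		int src, dst;

		if (argc != 3) {
			fprintf(stderr, "usage: %s src dst\n", argv[0]);
			return 1;
		}
		src = open(argv[1], O_RDONLY);
		dst = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (src < 0 || dst < 0) {
			perror("open");
			return 1;
		}
		fcr.src_fd = src;
		fcr.src_offset = 0;
		fcr.src_length = 0;	/* zero length means "to EOF" */
		fcr.dest_offset = 0;
		if (ioctl(dst, FICLONERANGE, &fcr) < 0) {
			perror("FICLONERANGE");
			return 1;
		}
		return 0;
	}

FIDEDUPERANGE takes the same kernel route with REMAP_FILE_DEDUP set.
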
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 1825013..574a7a8 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -5,22 +5,19 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_alloc.h"
 #include "xfs_mru_cache.h"
-#include "xfs_filestream.h"
 #include "xfs_trace.h"
 #include "xfs_ag_resv.h"
 #include "xfs_trans.h"
-#include "xfs_shared.h"
 
 struct xfs_fstrm_item {
 	struct xfs_mru_cache_elem	mru;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3d76a9e..d082143 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -9,16 +9,12 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_trace.h"
-#include "xfs_log.h"
 #include "xfs_rmap.h"
 #include "xfs_alloc.h"
 #include "xfs_bit.h"
@@ -254,7 +250,7 @@
 		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 		if (info->next_daddr < rec_daddr)
 			info->next_daddr = rec_daddr;
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	}
 
 	/* Are we just counting mappings? */
@@ -263,14 +259,14 @@
 			info->head->fmh_entries++;
 
 		if (info->last)
-			return XFS_BTREE_QUERY_RANGE_CONTINUE;
+			return 0;
 
 		info->head->fmh_entries++;
 
 		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 		if (info->next_daddr < rec_daddr)
 			info->next_daddr = rec_daddr;
-		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+		return 0;
 	}
 
 	/*
@@ -280,7 +276,7 @@
 	 */
 	if (rec_daddr > info->next_daddr) {
 		if (info->head->fmh_entries >= info->head->fmh_count)
-			return XFS_BTREE_QUERY_RANGE_ABORT;
+			return -ECANCELED;
 
 		fmr.fmr_device = info->dev;
 		fmr.fmr_physical = info->next_daddr;
@@ -299,7 +295,7 @@
 
 	/* Fill out the extent we found */
 	if (info->head->fmh_entries >= info->head->fmh_count)
-		return XFS_BTREE_QUERY_RANGE_ABORT;
+		return -ECANCELED;
 
 	trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
 
@@ -332,7 +328,7 @@
 	rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
 	if (info->next_daddr < rec_daddr)
 		info->next_daddr = rec_daddr;
-	return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	return 0;
 }
 
 /* Transform a rmapbt irec into a fsmap */
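
The hunks above are the visible edge of a tree-wide cleanup: btree range-query
callbacks drop the private XFS_BTREE_QUERY_RANGE_CONTINUE/ABORT values for the
usual kernel convention of returning 0 to continue and -ECANCELED to stop
early, with the caller translating an expected -ECANCELED back into success.
A standalone miniature of that convention (illustrative names only, nothing
here is kernel API):

	#include <errno.h>
	#include <stdio.h>

	struct rec { int key; };

	/* 0 continues the walk; any nonzero error stops it */
	typedef int (*walk_fn)(const struct rec *rec, void *priv);

	static int walk_records(const struct rec *recs, int nr,
				walk_fn fn, void *priv)
	{
		for (int i = 0; i < nr; i++) {
			int error = fn(&recs[i], priv);

			if (error)
				return error;
		}
		return 0;
	}

	/* Stop once the caller's buffer is full, like the fsmap helper. */
	static int emit_up_to(const struct rec *rec, void *priv)
	{
		int *budget = priv;

		if (*budget == 0)
			return -ECANCELED;
		(*budget)--;
		printf("key %d\n", rec->key);
		return 0;
	}

	int main(void)
	{
		struct rec recs[] = { {1}, {2}, {3}, {4} };
		int budget = 2;
		int error = walk_records(recs, 4, emit_up_to, &budget);

		if (error == -ECANCELED)	/* early stop is success */
			error = 0;
		return error ? 1 : 0;
	}
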
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7c00b8b..3e61d0c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -11,15 +11,11 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_error.h"
-#include "xfs_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_fsops.h"
 #include "xfs_trans_space.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
@@ -40,7 +36,6 @@
 	xfs_rfsblock_t		new;
 	xfs_agnumber_t		oagcount;
 	xfs_trans_t		*tp;
-	LIST_HEAD		(buffer_list);
 	struct aghdr_init_data	id = {};
 
 	nb = in->newblocks;
@@ -252,9 +247,9 @@
 	if (mp->m_sb.sb_imax_pct) {
 		uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
 		do_div(icount, 100);
-		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+		M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
 	} else
-		mp->m_maxicount = 0;
+		M_IGEO(mp)->maxicount = 0;
 
 	/* Update secondary superblocks now the physical grow has completed */
 	error = xfs_update_secondary_sbs(mp);
@@ -290,7 +285,7 @@
  * exported through ioctl XFS_IOC_FSCOUNTS
  */
 
-int
+void
 xfs_fs_counts(
 	xfs_mount_t		*mp,
 	xfs_fsop_counts_t	*cnt)
@@ -303,7 +298,6 @@
 	spin_lock(&mp->m_sb_lock);
 	cnt->freertx = mp->m_sb.sb_frextents;
 	spin_unlock(&mp->m_sb_lock);
-	return 0;
 }
 
 /*
@@ -470,20 +464,13 @@
  */
 void
 xfs_do_force_shutdown(
-	xfs_mount_t	*mp,
+	struct xfs_mount *mp,
 	int		flags,
 	char		*fname,
 	int		lnnum)
 {
-	int		logerror;
+	bool		logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 
-	logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-
-	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-		xfs_notice(mp,
-	"%s(0x%x) called from line %d of file %s.  Return address = "PTR_FMT,
-			__func__, flags, lnnum, fname, __return_address);
-	}
 	/*
 	 * No need to duplicate efforts.
 	 */
@@ -499,27 +486,34 @@
 	if (xfs_log_force_umount(mp, logerror))
 		return;
 
+	if (flags & SHUTDOWN_FORCE_UMOUNT) {
+		xfs_alert(mp,
+"User initiated shutdown received. Shutting down filesystem");
+		return;
+	}
+
+	xfs_notice(mp,
+"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
+		__func__, flags, lnnum, fname, __return_address);
+
 	if (flags & SHUTDOWN_CORRUPT_INCORE) {
 		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-    "Corruption of in-memory data detected.  Shutting down filesystem");
+"Corruption of in-memory data detected.  Shutting down filesystem");
 		if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
 			xfs_stack_trace();
-	} else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-		if (logerror) {
-			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
-		"Log I/O Error Detected.  Shutting down filesystem");
-		} else if (flags & SHUTDOWN_DEVICE_REQ) {
-			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
-		"All device paths lost.  Shutting down filesystem");
-		} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
-			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
-		"I/O Error Detected. Shutting down filesystem");
-		}
+	} else if (logerror) {
+		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+			"Log I/O Error Detected. Shutting down filesystem");
+	} else if (flags & SHUTDOWN_DEVICE_REQ) {
+		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+			"All device paths lost. Shutting down filesystem");
+	} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+			"I/O Error Detected. Shutting down filesystem");
 	}
-	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-		xfs_alert(mp,
-	"Please umount the filesystem and rectify the problem(s)");
-	}
+
+	xfs_alert(mp,
+		"Please unmount the filesystem and rectify the problem(s)");
 }
 
 /*
@@ -534,6 +528,7 @@
 	int			error = 0;
 	int			err2;
 
+	mp->m_finobt_nores = false;
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		pag = xfs_perag_get(mp, agno);
 		err2 = xfs_ag_resv_init(pag, NULL);
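
The reordered shutdown reporting above gives user-requested shutdowns their
own early exit and alert. For context, not as part of the patch: the
SHUTDOWN_FORCE_UMOUNT branch is normally reached from userspace through the
XFS_IOC_GOINGDOWN ioctl, roughly as sketched below. This assumes the xfsprogs
development headers are installed, and it must only be pointed at a scratch
filesystem, since it genuinely shuts the fs down until unmount:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* XFS_IOC_GOINGDOWN, XFS_FSOP_GOING_FLAGS_* */

	int main(int argc, char **argv)
	{
		uint32_t flags = XFS_FSOP_GOING_FLAGS_LOGFLUSH;
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s mountpoint\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY | O_DIRECTORY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, XFS_IOC_GOINGDOWN, &flags) < 0) {
			perror("XFS_IOC_GOINGDOWN");
			return 1;
		}
		puts("shut down; dmesg should show the new user-shutdown alert");
		return 0;
	}
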
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index d023db0..92869f6 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,7 @@
 
 extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
 extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
-extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84..fa55ab8 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -4,7 +4,6 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include "xfs_sysctl.h"
 
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
@@ -16,7 +15,7 @@
 			  /*	MIN		DFLT		MAX	*/
 	.sgid_inherit	= {	0,		0,		1	},
 	.symlink_mode	= {	0,		0,		1	},
-	.panic_mask	= {	0,		0,		255	},
+	.panic_mask	= {	0,		0,		256	},
 	.error_level	= {	0,		3,		11	},
 	.syncd_timer	= {	1*100,		30*100,		7200*100},
 	.stats_clear	= {	0,		0,		1	},
@@ -41,4 +40,7 @@
 #else
 	.bug_on_assert		=	false,	/* assert failures WARN() */
 #endif
+#ifdef DEBUG
+	.pwork_threads		=	-1,	/* automatic thread detection */
+#endif
 };
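
Raising the panic_mask ceiling from 255 to 256 admits one more tag bit
(0x100); the tag behind it is not shown in this hunk, so treat any naming of
it as an assumption. Setting the tunable from a program is just a procfs
write, equivalent to sysctl fs.xfs.panic_mask=256:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/fs/xfs/panic_mask", "w");

		if (!f) {
			perror("fopen");	/* needs root */
			return 1;
		}
		/* 0x100 is the bit newly admitted by the max of 256 */
		fprintf(f, "%d\n", 0x100);
		return fclose(f) ? 1 : 0;
	}
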
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
new file mode 100644
index 0000000..8e0cb05
--- /dev/null
+++ b/fs/xfs/xfs_health.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_health.h"
+
+/*
+ * Warn about metadata corruption that we detected but haven't fixed, and
+ * make sure we're not sitting on anything that would get in the way of
+ * recovery.
+ */
+void
+xfs_health_unmount(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	unsigned int		sick = 0;
+	unsigned int		checked = 0;
+	bool			warn = false;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return;
+
+	/* Measure AG corruption levels. */
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+		xfs_ag_measure_sickness(pag, &sick, &checked);
+		if (sick) {
+			trace_xfs_ag_unfixed_corruption(mp, agno, sick);
+			warn = true;
+		}
+		xfs_perag_put(pag);
+	}
+
+	/* Measure realtime volume corruption levels. */
+	xfs_rt_measure_sickness(mp, &sick, &checked);
+	if (sick) {
+		trace_xfs_rt_unfixed_corruption(mp, sick);
+		warn = true;
+	}
+
+	/*
+	 * Measure fs corruption and keep the sample around for the warning.
+	 * See the note below for why we exempt FS_COUNTERS.
+	 */
+	xfs_fs_measure_sickness(mp, &sick, &checked);
+	if (sick & ~XFS_SICK_FS_COUNTERS) {
+		trace_xfs_fs_unfixed_corruption(mp, sick);
+		warn = true;
+	}
+
+	if (warn) {
+		xfs_warn(mp,
+"Uncorrected metadata errors detected; please run xfs_repair.");
+
+		/*
+		 * We discovered uncorrected metadata problems at some point
+		 * during this filesystem mount and have advised the
+		 * administrator to run repair once the unmount completes.
+		 *
+		 * However, we must be careful -- when FSCOUNTERS are flagged
+		 * unhealthy, the unmount procedure omits writing the clean
+		 * unmount record to the log so that the next mount will run
+		 * recovery and recompute the summary counters.  In other
+		 * words, we leave a dirty log to get the counters fixed.
+		 *
+		 * Unfortunately, xfs_repair cannot recover dirty logs, so if
+		 * there were filesystem problems, FSCOUNTERS was flagged, and
+		 * the administrator takes our advice to run xfs_repair,
+		 * they'll have to zap the log before repairing structures.
+		 * We don't really want to encourage this, so we mark the
+		 * FSCOUNTERS healthy so that a subsequent repair run won't see
+		 * a dirty log.
+		 */
+		if (sick & XFS_SICK_FS_COUNTERS)
+			xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+	}
+}
+
+/* Mark unhealthy per-fs metadata. */
+void
+xfs_fs_mark_sick(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	trace_xfs_fs_mark_sick(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_fs_sick |= mask;
+	mp->m_fs_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a per-fs metadata healed. */
+void
+xfs_fs_mark_healthy(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	trace_xfs_fs_mark_healthy(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_fs_sick &= ~mask;
+	mp->m_fs_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which per-fs metadata are unhealthy. */
+void
+xfs_fs_measure_sickness(
+	struct xfs_mount	*mp,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&mp->m_sb_lock);
+	*sick = mp->m_fs_sick;
+	*checked = mp->m_fs_checked;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy realtime metadata. */
+void
+xfs_rt_mark_sick(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	trace_xfs_rt_mark_sick(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_rt_sick |= mask;
+	mp->m_rt_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a realtime metadata healed. */
+void
+xfs_rt_mark_healthy(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	trace_xfs_rt_mark_healthy(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_rt_sick &= ~mask;
+	mp->m_rt_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which realtime metadata are unhealthy. */
+void
+xfs_rt_measure_sickness(
+	struct xfs_mount	*mp,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&mp->m_sb_lock);
+	*sick = mp->m_rt_sick;
+	*checked = mp->m_rt_checked;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy per-ag metadata. */
+void
+xfs_ag_mark_sick(
+	struct xfs_perag	*pag,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+
+	spin_lock(&pag->pag_state_lock);
+	pag->pag_sick |= mask;
+	pag->pag_checked |= mask;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata ok. */
+void
+xfs_ag_mark_healthy(
+	struct xfs_perag	*pag,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
+
+	spin_lock(&pag->pag_state_lock);
+	pag->pag_sick &= ~mask;
+	pag->pag_checked |= mask;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Sample which per-ag metadata are unhealthy. */
+void
+xfs_ag_measure_sickness(
+	struct xfs_perag	*pag,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&pag->pag_state_lock);
+	*sick = pag->pag_sick;
+	*checked = pag->pag_checked;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark the unhealthy parts of an inode. */
+void
+xfs_inode_mark_sick(
+	struct xfs_inode	*ip,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+	trace_xfs_inode_mark_sick(ip, mask);
+
+	spin_lock(&ip->i_flags_lock);
+	ip->i_sick |= mask;
+	ip->i_checked |= mask;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mark parts of an inode healed. */
+void
+xfs_inode_mark_healthy(
+	struct xfs_inode	*ip,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+	trace_xfs_inode_mark_healthy(ip, mask);
+
+	spin_lock(&ip->i_flags_lock);
+	ip->i_sick &= ~mask;
+	ip->i_checked |= mask;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+/* Sample which parts of an inode are unhealthy. */
+void
+xfs_inode_measure_sickness(
+	struct xfs_inode	*ip,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&ip->i_flags_lock);
+	*sick = ip->i_sick;
+	*checked = ip->i_checked;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mappings between internal sick masks and ioctl sick masks. */
+
+struct ioctl_sick_map {
+	unsigned int		sick_mask;
+	unsigned int		ioctl_mask;
+};
+
+static const struct ioctl_sick_map fs_map[] = {
+	{ XFS_SICK_FS_COUNTERS,	XFS_FSOP_GEOM_SICK_COUNTERS },
+	{ XFS_SICK_FS_UQUOTA,	XFS_FSOP_GEOM_SICK_UQUOTA },
+	{ XFS_SICK_FS_GQUOTA,	XFS_FSOP_GEOM_SICK_GQUOTA },
+	{ XFS_SICK_FS_PQUOTA,	XFS_FSOP_GEOM_SICK_PQUOTA },
+	{ 0, 0 },
+};
+
+static const struct ioctl_sick_map rt_map[] = {
+	{ XFS_SICK_RT_BITMAP,	XFS_FSOP_GEOM_SICK_RT_BITMAP },
+	{ XFS_SICK_RT_SUMMARY,	XFS_FSOP_GEOM_SICK_RT_SUMMARY },
+	{ 0, 0 },
+};
+
+static inline void
+xfgeo_health_tick(
+	struct xfs_fsop_geom		*geo,
+	unsigned int			sick,
+	unsigned int			checked,
+	const struct ioctl_sick_map	*m)
+{
+	if (checked & m->sick_mask)
+		geo->checked |= m->ioctl_mask;
+	if (sick & m->sick_mask)
+		geo->sick |= m->ioctl_mask;
+}
+
+/* Fill out fs geometry health info. */
+void
+xfs_fsop_geom_health(
+	struct xfs_mount		*mp,
+	struct xfs_fsop_geom		*geo)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			sick;
+	unsigned int			checked;
+
+	geo->sick = 0;
+	geo->checked = 0;
+
+	xfs_fs_measure_sickness(mp, &sick, &checked);
+	for (m = fs_map; m->sick_mask; m++)
+		xfgeo_health_tick(geo, sick, checked, m);
+
+	xfs_rt_measure_sickness(mp, &sick, &checked);
+	for (m = rt_map; m->sick_mask; m++)
+		xfgeo_health_tick(geo, sick, checked, m);
+}
+
+static const struct ioctl_sick_map ag_map[] = {
+	{ XFS_SICK_AG_SB,	XFS_AG_GEOM_SICK_SB },
+	{ XFS_SICK_AG_AGF,	XFS_AG_GEOM_SICK_AGF },
+	{ XFS_SICK_AG_AGFL,	XFS_AG_GEOM_SICK_AGFL },
+	{ XFS_SICK_AG_AGI,	XFS_AG_GEOM_SICK_AGI },
+	{ XFS_SICK_AG_BNOBT,	XFS_AG_GEOM_SICK_BNOBT },
+	{ XFS_SICK_AG_CNTBT,	XFS_AG_GEOM_SICK_CNTBT },
+	{ XFS_SICK_AG_INOBT,	XFS_AG_GEOM_SICK_INOBT },
+	{ XFS_SICK_AG_FINOBT,	XFS_AG_GEOM_SICK_FINOBT },
+	{ XFS_SICK_AG_RMAPBT,	XFS_AG_GEOM_SICK_RMAPBT },
+	{ XFS_SICK_AG_REFCNTBT,	XFS_AG_GEOM_SICK_REFCNTBT },
+	{ 0, 0 },
+};
+
+/* Fill out ag geometry health info. */
+void
+xfs_ag_geom_health(
+	struct xfs_perag		*pag,
+	struct xfs_ag_geometry		*ageo)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			sick;
+	unsigned int			checked;
+
+	ageo->ag_sick = 0;
+	ageo->ag_checked = 0;
+
+	xfs_ag_measure_sickness(pag, &sick, &checked);
+	for (m = ag_map; m->sick_mask; m++) {
+		if (checked & m->sick_mask)
+			ageo->ag_checked |= m->ioctl_mask;
+		if (sick & m->sick_mask)
+			ageo->ag_sick |= m->ioctl_mask;
+	}
+}
+
+static const struct ioctl_sick_map ino_map[] = {
+	{ XFS_SICK_INO_CORE,	XFS_BS_SICK_INODE },
+	{ XFS_SICK_INO_BMBTD,	XFS_BS_SICK_BMBTD },
+	{ XFS_SICK_INO_BMBTA,	XFS_BS_SICK_BMBTA },
+	{ XFS_SICK_INO_BMBTC,	XFS_BS_SICK_BMBTC },
+	{ XFS_SICK_INO_DIR,	XFS_BS_SICK_DIR },
+	{ XFS_SICK_INO_XATTR,	XFS_BS_SICK_XATTR },
+	{ XFS_SICK_INO_SYMLINK,	XFS_BS_SICK_SYMLINK },
+	{ XFS_SICK_INO_PARENT,	XFS_BS_SICK_PARENT },
+	{ 0, 0 },
+};
+
+/* Fill out bulkstat health info. */
+void
+xfs_bulkstat_health(
+	struct xfs_inode		*ip,
+	struct xfs_bulkstat		*bs)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			sick;
+	unsigned int			checked;
+
+	bs->bs_sick = 0;
+	bs->bs_checked = 0;
+
+	xfs_inode_measure_sickness(ip, &sick, &checked);
+	for (m = ino_map; m->sick_mask; m++) {
+		if (checked & m->sick_mask)
+			bs->bs_checked |= m->ioctl_mask;
+		if (sick & m->sick_mask)
+			bs->bs_sick |= m->ioctl_mask;
+	}
+}
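
Since xfs_health.c is entirely new, the pattern it establishes is worth
restating outside the kernel: every piece of metadata gets a "checked" bit
(we examined it) and a "sick" bit (we found damage), marking sick implies
checked, and sentinel-terminated maps translate the internal bits into the
ioctl-visible namespace. A self-contained miniature with illustrative names:

	#include <stdio.h>

	#define SICK_COUNTERS	(1u << 0)
	#define SICK_UQUOTA	(1u << 1)

	#define OUT_COUNTERS	(1u << 0)
	#define OUT_UQUOTA	(1u << 1)

	struct sick_map {
		unsigned int sick_mask;		/* internal bit */
		unsigned int out_mask;		/* ioctl-visible bit */
	};

	static const struct sick_map fs_map[] = {
		{ SICK_COUNTERS, OUT_COUNTERS },
		{ SICK_UQUOTA,	 OUT_UQUOTA },
		{ 0, 0 },			/* sentinel ends the walk */
	};

	struct health { unsigned int sick, checked; };

	static void mark_sick(struct health *h, unsigned int mask)
	{
		h->sick |= mask;
		h->checked |= mask;	/* finding damage implies we looked */
	}

	static void report(const struct health *h, unsigned int *out_sick,
			   unsigned int *out_checked)
	{
		*out_sick = 0;
		*out_checked = 0;
		for (const struct sick_map *m = fs_map; m->sick_mask; m++) {
			if (h->checked & m->sick_mask)
				*out_checked |= m->out_mask;
			if (h->sick & m->sick_mask)
				*out_sick |= m->out_mask;
		}
	}

	int main(void)
	{
		struct health h = { 0, 0 };
		unsigned int sick, checked;

		mark_sick(&h, SICK_COUNTERS);
		report(&h, &sick, &checked);
		printf("sick %#x checked %#x\n", sick, checked);
		return 0;
	}
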
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 245483c..944add5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -5,13 +5,13 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
@@ -23,8 +23,6 @@
 #include "xfs_dquot.h"
 #include "xfs_reflink.h"
 
-#include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/iversion.h>
 
 /*
@@ -42,7 +40,7 @@
 	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
 	 * code up to do this anyway.
 	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	ip = kmem_zone_alloc(xfs_inode_zone, 0);
 	if (!ip)
 		return NULL;
 	if (inode_init_always(mp->m_super, VFS_I(ip))) {
@@ -70,6 +68,11 @@
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(ip->i_d));
+	ip->i_sick = 0;
+	ip->i_checked = 0;
+	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
+	INIT_LIST_HEAD(&ip->i_ioend_list);
+	spin_lock_init(&ip->i_ioend_lock);
 
 	return ip;
 }
@@ -446,6 +449,8 @@
 		ip->i_flags |= XFS_INEW;
 		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 		inode->i_state = I_NEW;
+		ip->i_sick = 0;
+		ip->i_checked = 0;
 
 		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 		init_rwsem(&inode->i_rwsem);
@@ -1815,7 +1820,7 @@
 
 /* Disable post-EOF and CoW block auto-reclamation. */
 void
-xfs_icache_disable_reclaim(
+xfs_stop_block_reaping(
 	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_eofblocks_work);
@@ -1824,7 +1829,7 @@
 
 /* Enable post-EOF and CoW block auto-reclamation. */
 void
-xfs_icache_enable_reclaim(
+xfs_start_block_reaping(
 	struct xfs_mount	*mp)
 {
 	xfs_queue_eofblocks(mp);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 26c0626..48f1fd2 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -119,7 +119,7 @@
 int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
 				  xfs_ino_t ino, bool *inuse);
 
-void xfs_icache_disable_reclaim(struct xfs_mount *mp);
-void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+void xfs_stop_block_reaping(struct xfs_mount *mp);
+void xfs_start_block_reaping(struct xfs_mount *mp);
 
 #endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 8381d34..3ebd1b7 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -6,14 +6,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_format.h"
 #include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_error.h"
 #include "xfs_icreate_item.h"
 #include "xfs_log.h"
 
@@ -56,80 +51,18 @@
 			sizeof(struct xfs_icreate_log));
 }
 
-
-/* Pinning has no meaning for the create item, so just return. */
 STATIC void
-xfs_icreate_item_pin(
+xfs_icreate_item_release(
 	struct xfs_log_item	*lip)
 {
+	kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip));
 }
 
-
-/* pinning has no meaning for the create item, so just return. */
-STATIC void
-xfs_icreate_item_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-STATIC void
-xfs_icreate_item_unlock(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
-
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
-		kmem_zone_free(xfs_icreate_zone, icp);
-	return;
-}
-
-/*
- * Because we have ordered buffers being tracked in the AIL for the inode
- * creation, we don't need the create item after this. Hence we can free
- * the log item and return -1 to tell the caller we're done with the item.
- */
-STATIC xfs_lsn_t
-xfs_icreate_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
-
-	kmem_zone_free(xfs_icreate_zone, icp);
-	return (xfs_lsn_t)-1;
-}
-
-/* item can never get into the AIL */
-STATIC uint
-xfs_icreate_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	ASSERT(0);
-	return XFS_ITEM_SUCCESS;
-}
-
-/* Ordered buffers do the dependency tracking here, so this does nothing. */
-STATIC void
-xfs_icreate_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
 static const struct xfs_item_ops xfs_icreate_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
 	.iop_size	= xfs_icreate_item_size,
 	.iop_format	= xfs_icreate_item_format,
-	.iop_pin	= xfs_icreate_item_pin,
-	.iop_unpin	= xfs_icreate_item_unpin,
-	.iop_push	= xfs_icreate_item_push,
-	.iop_unlock	= xfs_icreate_item_unlock,
-	.iop_committed	= xfs_icreate_item_committed,
-	.iop_committing = xfs_icreate_item_committing,
+	.iop_release	= xfs_icreate_item_release,
 };
 
 
@@ -156,7 +89,7 @@
 {
 	struct xfs_icreate_item	*icp;
 
-	icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+	icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
 
 	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
 			  &xfs_icreate_item_ops);
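
The net effect of the icreate-item hunk is that six stub callbacks collapse
into one XFS_ITEM_RELEASE_WHEN_COMMITTED flag plus a single .iop_release
hook, with the generic commit code keying off the flag. The shape of that
refactor in miniature (illustrative names, not the kernel's log-item API):

	#include <stdio.h>
	#include <stdlib.h>

	#define ITEM_RELEASE_WHEN_COMMITTED	(1u << 0)

	struct log_item;

	struct item_ops {
		unsigned int flags;
		void (*release)(struct log_item *item);
	};

	struct log_item {
		const struct item_ops *ops;
	};

	/* what the generic commit path does once the transaction commits */
	static void item_committed(struct log_item *item)
	{
		if (item->ops->flags & ITEM_RELEASE_WHEN_COMMITTED)
			item->ops->release(item);
		/* otherwise the item lives on, e.g. moves to the AIL */
	}

	static void icreate_release(struct log_item *item)
	{
		puts("icreate item freed at commit");
		free(item);
	}

	static const struct item_ops icreate_ops = {
		.flags	 = ITEM_RELEASE_WHEN_COMMITTED,
		.release = icreate_release,
	};

	int main(void)
	{
		struct log_item *item = malloc(sizeof(*item));

		if (!item)
			return 1;
		item->ops = &icreate_ops;
		item_committed(item);
		return 0;
	}
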
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 05db954..18f4b26 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3,7 +3,6 @@
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/log2.h>
 #include <linux/iversion.h>
 
 #include "xfs.h"
@@ -16,10 +15,7 @@
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_inode.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_attr_sf.h"
 #include "xfs_attr.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans.h"
@@ -32,7 +28,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_filestream.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
@@ -40,7 +35,6 @@
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include "xfs_dir2_priv.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -441,12 +435,12 @@
  */
 static void
 xfs_lock_inodes(
-	xfs_inode_t	**ips,
-	int		inodes,
-	uint		lock_mode)
+	struct xfs_inode	**ips,
+	int			inodes,
+	uint			lock_mode)
 {
-	int		attempts = 0, i, j, try_lock;
-	xfs_log_item_t	*lp;
+	int			attempts = 0, i, j, try_lock;
+	struct xfs_log_item	*lp;
 
 	/*
 	 * Currently supports between 2 and 5 inodes with exclusive locking.  We
@@ -485,7 +479,7 @@
 		 */
 		if (!try_lock) {
 			for (j = (i - 1); j >= 0 && !try_lock; j--) {
-				lp = (xfs_log_item_t *)ips[j]->i_itemp;
+				lp = &ips[j]->i_itemp->ili_item;
 				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
 					try_lock++;
 			}
@@ -551,7 +545,7 @@
 	struct xfs_inode	*temp;
 	uint			mode_temp;
 	int			attempts = 0;
-	xfs_log_item_t		*lp;
+	struct xfs_log_item	*lp;
 
 	ASSERT(hweight32(ip0_mode) == 1);
 	ASSERT(hweight32(ip1_mode) == 1);
@@ -585,7 +579,7 @@
 	 * the second lock. If we can't get it, we must release the first one
 	 * and try again.
 	 */
-	lp = (xfs_log_item_t *)ip0->i_itemp;
+	lp = &ip0->i_itemp->ili_item;
 	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
 		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
 			xfs_iunlock(ip0, ip0_mode);
@@ -1116,7 +1110,7 @@
 /*
  * Increment the link count on an inode & log the change.
  */
-static int
+static void
 xfs_bumplink(
 	xfs_trans_t *tp,
 	xfs_inode_t *ip)
@@ -1126,7 +1120,6 @@
 	ASSERT(ip->i_d.di_version > 1);
 	inc_nlink(VFS_I(ip));
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	return 0;
 }
 
 int
@@ -1235,9 +1228,7 @@
 		if (error)
 			goto out_trans_cancel;
 
-		error = xfs_bumplink(tp, dp);
-		if (error)
-			goto out_trans_cancel;
+		xfs_bumplink(tp, dp);
 	}
 
 	/*
@@ -1332,7 +1323,7 @@
 	if (error)
 		goto out_trans_cancel;
 
-	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1454,9 +1445,7 @@
 	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
-	error = xfs_bumplink(tp, sip);
-	if (error)
-		goto error_return;
+	xfs_bumplink(tp, sip);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -1754,7 +1743,7 @@
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	if (unlikely(mp->m_inotbt_nores)) {
+	if (unlikely(mp->m_finobt_nores)) {
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
 				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
 				&tp);
@@ -1907,86 +1896,510 @@
 }
 
 /*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk.  The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+	struct rhash_head	iu_rhash_head;
+	xfs_agino_t		iu_agino;		/* X */
+	xfs_agino_t		iu_next_unlinked;	/* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const xfs_agino_t		*key = arg->key;
+	const struct xfs_iunlink	*iu = obj;
+
+	if (iu->iu_next_unlinked != *key)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
+	.key_len		= sizeof(xfs_agino_t),
+	.key_offset		= offsetof(struct xfs_iunlink,
+					   iu_next_unlinked),
+	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+	struct xfs_perag	*pag,
+	struct xfs_iunlink	*iu)
+{
+	int			error;
+
+	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	/*
+	 * Fail loudly if there already was an entry because that's a sign of
+	 * corruption of in-memory data.  Also fail loudly if we see an error
+	 * code we didn't anticipate from the rhashtable code.  Currently we
+	 * only anticipate ENOMEM.
+	 */
+	if (error) {
+		WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+		kmem_free(iu);
+	}
+	/*
+	 * Absorb any runtime errors that aren't a result of corruption because
+	 * this is a cache and we can always fall back to bucket list scanning.
+	 */
+	if (error != 0 && error != -EEXIST)
+		error = 0;
+	return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	struct xfs_iunlink	*iu;
+
+	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+		return 0;
+
+	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
+	iu->iu_agino = prev_agino;
+	iu->iu_next_unlinked = this_agino;
+
+	return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	/* Look up the old entry; if there wasn't one then exit. */
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return 0;
+
+	/*
+	 * Remove the entry.  This shouldn't ever return an error, but if we
+	 * couldn't remove the old entry we don't want to add it again to the
+	 * hash table, and if the entry disappeared on us then someone's
+	 * violated the locking rules and we need to fail loudly.  Either way
+	 * we cannot remove the inode because internal state is or would have
+	 * been corrupt.
+	 */
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	/* If there is no new next entry just free our item and return. */
+	if (next_unlinked == NULLAGINO) {
+		kmem_free(iu);
+		return 0;
+	}
+
+	/* Update the entry and re-add it to the hash table. */
+	iu->iu_next_unlinked = next_unlinked;
+	return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+	struct xfs_perag	*pag)
+{
+	return rhashtable_init(&pag->pagi_unlinked_hash,
+			&xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+	void			*ptr,
+	void			*arg)
+{
+	struct xfs_iunlink	*iu = ptr;
+	bool			*freed_anything = arg;
+
+	*freed_anything = true;
+	kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+	struct xfs_perag	*pag)
+{
+	bool			freed_anything = false;
+
+	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+			xfs_iunlink_free_item, &freed_anything);
+
+	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		*agibp,
+	unsigned int		bucket_index,
+	xfs_agino_t		new_agino)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agibp);
+	xfs_agino_t		old_value;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+			old_value, new_agino);
+
+	/*
+	 * We should never find the head of the list already set to the value
+	 * passed in because either we're adding or removing ourselves from the
+	 * head of the list.
+	 */
+	if (old_value == new_agino)
+		return -EFSCORRUPTED;
+
+	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+	offset = offsetof(struct xfs_agi, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	struct xfs_buf		*ibp,
+	struct xfs_dinode	*dip,
+	struct xfs_imap		*imap,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			offset;
+
+	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+	trace_xfs_iunlink_update_dinode(mp, agno, agino,
+			be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+	dip->di_next_unlinked = cpu_to_be32(next_agino);
+	offset = imap->im_boffset +
+			offsetof(struct xfs_dinode, di_next_unlinked);
+
+	/* need to recalc the inode CRC if appropriate */
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_inode_buf(tp, ibp);
+	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		next_agino,
+	xfs_agino_t		*old_next_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*ibp;
+	xfs_agino_t		old_value;
+	int			error;
+
+	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+	if (error)
+		return error;
+
+	/* Make sure the old pointer isn't garbage. */
+	old_value = be32_to_cpu(dip->di_next_unlinked);
+	if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/*
+	 * Since we're updating a linked list, we should never find that the
+	 * current pointer is the same as the new value, unless we're
+	 * terminating the list.
+	 */
+	*old_next_agino = old_value;
+	if (old_value == next_agino) {
+		if (next_agino != NULLAGINO)
+			error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Ok, update the new pointer. */
+	xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+			ibp, dip, &ip->i_imap, next_agino);
+	return 0;
+out:
+	xfs_trans_brelse(tp, ibp);
+	return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
  *
  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
  * list when the inode is freed.
  */
 STATIC int
 xfs_iunlink(
-	struct xfs_trans *tp,
-	struct xfs_inode *ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_mount_t	*mp = tp->t_mountp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agino_t	agino;
-	short		bucket_index;
-	int		offset;
-	int		error;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_buf		*agibp;
+	xfs_agino_t		next_agino;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
 
+	ASSERT(VFS_I(ip)->i_nlink == 0);
 	ASSERT(VFS_I(ip)->i_mode != 0);
+	trace_xfs_iunlink(ip);
 
-	/*
-	 * Get the agi buffer first.  It ensures lock ordering
-	 * on the list.
-	 */
-	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
+	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	ASSERT(agino != 0);
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	ASSERT(agi->agi_unlinked[bucket_index]);
-	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    !xfs_verify_agino_or_null(mp, agno, next_agino))
+		return -EFSCORRUPTED;
 
-	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
+	if (next_agino != NULLAGINO) {
+		struct xfs_perag	*pag;
+		xfs_agino_t		old_agino;
+
 		/*
-		 * There is already another inode in the bucket we need
-		 * to add ourselves to.  Add us at the front of the list.
-		 * Here we put the head pointer into our next pointer,
-		 * and then we fall through to point the head at us.
+		 * There is already another inode in the bucket, so point this
+		 * inode to the current head of the list.
 		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
+		error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+				&old_agino);
+		if (error)
+			return error;
+		ASSERT(old_agino == NULLAGINO);
+
+		/*
+		 * agino has been unlinked; add a backref from the next inode
+		 * back to agino.
+		 */
+		pag = xfs_perag_get(mp, agno);
+		error = xfs_iunlink_add_backref(pag, agino, next_agino);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
+	}
+
+	/* Point the head of the list at this inode. */
+	return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
+
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	int			error;
+
+	imap->im_blkno = 0;
+	error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+				__func__, error);
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino.  Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		head_agino,
+	xfs_agino_t		target_agino,
+	xfs_agino_t		*agino,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp,
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_agino_t		next_agino;
+	int			error;
+
+	ASSERT(head_agino != target_agino);
+	*bpp = NULL;
+
+	/* See if our backref cache can find it faster. */
+	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
+	if (*agino != NULLAGINO) {
+		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
 		if (error)
 			return error;
 
-		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
-		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-		offset = ip->i_imap.im_boffset +
-			offsetof(xfs_dinode_t, di_next_unlinked);
+		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+			return 0;
 
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, dip);
-
-		xfs_trans_inode_buf(tp, ibp);
-		xfs_trans_log_buf(tp, ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, ibp);
+		/*
+		 * If we get here the cache contents were corrupt, so drop the
+		 * buffer and fall back to walking the bucket list.
+		 */
+		xfs_trans_brelse(tp, *bpp);
+		*bpp = NULL;
+		WARN_ON_ONCE(1);
 	}
 
-	/*
-	 * Point the bucket head pointer at the inode being inserted.
-	 */
-	ASSERT(agino != 0);
-	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-	offset = offsetof(xfs_agi_t, agi_unlinked) +
-		(sizeof(xfs_agino_t) * bucket_index);
-	xfs_trans_log_buf(tp, agibp, offset,
-			  (offset + sizeof(xfs_agino_t) - 1));
+	trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+	/* Otherwise, walk the entire bucket until we find it. */
+	next_agino = head_agino;
+	while (next_agino != target_agino) {
+		xfs_agino_t	unlinked_agino;
+
+		if (*bpp)
+			xfs_trans_brelse(tp, *bpp);
+
+		*agino = next_agino;
+		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+				bpp);
+		if (error)
+			return error;
+
+		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+		/*
+		 * Make sure this pointer is valid and isn't an obvious
+		 * infinite loop.
+		 */
+		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+		    next_agino == unlinked_agino) {
+			XFS_CORRUPTION_ERROR(__func__,
+					XFS_ERRLEVEL_LOW, mp,
+					*dipp, sizeof(**dipp));
+			error = -EFSCORRUPTED;
+			return error;
+		}
+		next_agino = unlinked_agino;
+	}
+
 	return 0;
 }
 
@@ -1995,181 +2408,106 @@
  */
 STATIC int
 xfs_iunlink_remove(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
-	xfs_ino_t	next_ino;
-	xfs_mount_t	*mp;
-	xfs_agi_t	*agi;
-	xfs_dinode_t	*dip;
-	xfs_buf_t	*agibp;
-	xfs_buf_t	*ibp;
-	xfs_agnumber_t	agno;
-	xfs_agino_t	agino;
-	xfs_agino_t	next_agino;
-	xfs_buf_t	*last_ibp;
-	xfs_dinode_t	*last_dip = NULL;
-	short		bucket_index;
-	int		offset, last_offset = 0;
-	int		error;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi;
+	struct xfs_buf		*agibp;
+	struct xfs_buf		*last_ibp;
+	struct xfs_dinode	*last_dip = NULL;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	xfs_agino_t		next_agino;
+	xfs_agino_t		head_agino;
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	int			error;
 
-	mp = tp->t_mountp;
-	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	trace_xfs_iunlink_remove(ip);
 
-	/*
-	 * Get the agi buffer first.  It ensures lock ordering
-	 * on the list.
-	 */
+	/* Get the agi buffer first.  It ensures lock ordering on the list. */
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
 		return error;
-
 	agi = XFS_BUF_TO_AGI(agibp);
 
 	/*
-	 * Get the index into the agi hash table for the
-	 * list this inode will go on.
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
 	 */
-	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	if (!xfs_verify_agino(mp, agno, agino))
-		return -EFSCORRUPTED;
-	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-	if (!xfs_verify_agino(mp, agno,
-			be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(mp, agno, head_agino)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
 		return -EFSCORRUPTED;
 	}
 
-	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
-		/*
-		 * We're at the head of the list.  Get the inode's on-disk
-		 * buffer to see if there is anyone after us on the list.
-		 * Only modify our next pointer if it is not already NULLAGINO.
-		 * This saves us the overhead of dealing with the buffer when
-		 * there is no need to change it.
-		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-				__func__, error);
-			return error;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
+	/*
+	 * Set our inode's next_unlinked pointer to NULLAGINO and then return
+	 * the old pointer value so that we can update whatever was previous
+	 * to us in the list to point to whatever was next in the list.
+	 */
+	error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+	if (error)
+		return error;
 
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
-		/*
-		 * Point the bucket head pointer at the next inode.
-		 */
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
-		offset = offsetof(xfs_agi_t, agi_unlinked) +
-			(sizeof(xfs_agino_t) * bucket_index);
-		xfs_trans_log_buf(tp, agibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-	} else {
-		/*
-		 * We need to search the list for the inode being freed.
-		 */
-		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-		last_ibp = NULL;
-		while (next_agino != agino) {
-			struct xfs_imap	imap;
-
-			if (last_ibp)
-				xfs_trans_brelse(tp, last_ibp);
-
-			imap.im_blkno = 0;
-			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
-
-			error = xfs_imap(mp, tp, next_ino, &imap, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap returned error %d.",
-					 __func__, error);
-				return error;
-			}
-
-			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
-					       &last_ibp, 0, 0);
-			if (error) {
-				xfs_warn(mp,
-	"%s: xfs_imap_to_bp returned error %d.",
-					__func__, error);
-				return error;
-			}
-
-			last_offset = imap.im_boffset;
-			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
-			if (!xfs_verify_agino(mp, agno, next_agino)) {
-				XFS_CORRUPTION_ERROR(__func__,
-						XFS_ERRLEVEL_LOW, mp,
-						last_dip, sizeof(*last_dip));
-				return -EFSCORRUPTED;
-			}
-		}
-
-		/*
-		 * Now last_ibp points to the buffer previous to us on the
-		 * unlinked list.  Pull us from the list.
-		 */
-		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-				       0, 0);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
-				__func__, error);
-			return error;
-		}
-		next_agino = be32_to_cpu(dip->di_next_unlinked);
-		ASSERT(next_agino != 0);
-		ASSERT(next_agino != agino);
-		if (next_agino != NULLAGINO) {
-			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			offset = ip->i_imap.im_boffset +
-				offsetof(xfs_dinode_t, di_next_unlinked);
-
-			/* need to recalc the inode CRC if appropriate */
-			xfs_dinode_calc_crc(mp, dip);
-
-			xfs_trans_inode_buf(tp, ibp);
-			xfs_trans_log_buf(tp, ibp, offset,
-					  (offset + sizeof(xfs_agino_t) - 1));
-			xfs_inobp_check(mp, ibp);
-		} else {
-			xfs_trans_brelse(tp, ibp);
-		}
-		/*
-		 * Point the previous inode on the list to the next inode.
-		 */
-		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
-		ASSERT(next_agino != 0);
-		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
-		/* need to recalc the inode CRC if appropriate */
-		xfs_dinode_calc_crc(mp, last_dip);
-
-		xfs_trans_inode_buf(tp, last_ibp);
-		xfs_trans_log_buf(tp, last_ibp, offset,
-				  (offset + sizeof(xfs_agino_t) - 1));
-		xfs_inobp_check(mp, last_ibp);
+	/*
+	 * If there was a backref pointing from the next inode back to this
+	 * one, remove it because we've removed this inode from the list.
+	 *
+	 * Later, if this inode was in the middle of the list, we'll update
+	 * the backref that pointed to this inode to point at its old next
+	 * inode instead.
+	 */
+	if (next_agino != NULLAGINO) {
+		pag = xfs_perag_get(mp, agno);
+		error = xfs_iunlink_change_backref(pag, next_agino,
+				NULLAGINO);
+		if (error)
+			goto out;
 	}
-	return 0;
+
+	if (head_agino == agino) {
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+				next_agino);
+		if (error)
+			goto out;
+	} else {
+		struct xfs_imap	imap;
+		xfs_agino_t	prev_agino;
+
+		if (!pag)
+			pag = xfs_perag_get(mp, agno);
+
+		/* We need to search the list for the inode being freed. */
+		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+				&prev_agino, &imap, &last_dip, &last_ibp,
+				pag);
+		if (error)
+			goto out;
+
+		/* Point the previous inode on the list to the next inode. */
+		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+				last_dip, &imap, next_agino);
+
+		/*
+		 * Now we deal with the backref for this inode.  If this inode
+		 * pointed at a real inode, change the backref that pointed to
+		 * us to point to our old next.  If this inode was the end of
+		 * the list, delete the backref that pointed to us.  Note that
+		 * change_backref takes care of deleting the backref if
+		 * next_agino is NULLAGINO.
+		 */
+		error = xfs_iunlink_change_backref(pag, agino, next_agino);
+		if (error)
+			goto out;
+	}
+
+out:
+	if (pag)
+		xfs_perag_put(pag);
+	return error;
 }
 
 /*
@@ -2184,8 +2522,6 @@
 	struct xfs_icluster	*xic)
 {
 	xfs_mount_t		*mp = free_ip->i_mount;
-	int			blks_per_cluster;
-	int			inodes_per_cluster;
 	int			nbufs;
 	int			i, j;
 	int			ioffset;
@@ -2195,15 +2531,14 @@
 	xfs_inode_log_item_t	*iip;
 	struct xfs_log_item	*lip;
 	struct xfs_perag	*pag;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	xfs_ino_t		inum;
 
 	inum = xic->first_ino;
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
-	nbufs = mp->m_ialloc_blks / blks_per_cluster;
+	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
 
-	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
 		/*
 		 * The allocation bitmap tells us which inodes of the chunk were
 		 * physically allocated. Skip the cluster if an inode falls into
@@ -2211,7 +2546,7 @@
 		 */
 		ioffset = inum - xic->first_ino;
 		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
-			ASSERT(ioffset % inodes_per_cluster == 0);
+			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
 			continue;
 		}
 
@@ -2227,7 +2562,7 @@
 		 * to mark all the active inodes on the buffer stale.
 		 */
 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-					mp->m_bsize * blks_per_cluster,
+					mp->m_bsize * igeo->blocks_per_cluster,
 					XBF_UNMAPPED);
 
 		if (!bp)
@@ -2242,7 +2577,7 @@
 		 * want it to fail. We can acheive this by adding a write
 		 * verifier to the buffer.
 		 */
-		 bp->b_ops = &xfs_inode_buf_ops;
+		bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
@@ -2274,7 +2609,7 @@
 		 * transaction stale above, which means there is no point in
 		 * even trying to lock them.
 		 */
-		for (i = 0; i < inodes_per_cluster; i++) {
+		for (i = 0; i < igeo->inodes_per_cluster; i++) {
 retry:
 			rcu_read_lock();
 			ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2752,9 +3087,7 @@
 				error = xfs_droplink(tp, dp2);
 				if (error)
 					goto out_trans_abort;
-				error = xfs_bumplink(tp, dp1);
-				if (error)
-					goto out_trans_abort;
+				xfs_bumplink(tp, dp1);
 			}
 
 			/*
@@ -2778,9 +3111,7 @@
 				error = xfs_droplink(tp, dp1);
 				if (error)
 					goto out_trans_abort;
-				error = xfs_bumplink(tp, dp2);
-				if (error)
-					goto out_trans_abort;
+				xfs_bumplink(tp, dp2);
 			}
 
 			/*
@@ -2837,11 +3168,9 @@
 
 	/*
 	 * Prepare the tmpfile inode as if it were created through the VFS.
-	 * Otherwise, the link increment paths will complain about nlink 0->1.
-	 * Drop the link count as done by d_tmpfile(), complete the inode setup
-	 * and flag it as linkable.
+	 * Complete the inode setup and flag it as linkable.  nlink is already
+	 * zero, so we can skip the drop_nlink.
 	 */
-	drop_nlink(VFS_I(tmpfile));
 	xfs_setup_iops(tmpfile);
 	xfs_finish_inode_setup(tmpfile);
 	VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2953,7 +3282,8 @@
 					spaceres);
 
 	/*
-	 * Set up the target.
+	 * Check for expected errors before we dirty the transaction
+	 * so we can return an error without a transaction abort.
 	 */
 	if (target_ip == NULL) {
 		/*
@@ -2965,6 +3295,46 @@
 			if (error)
 				goto out_trans_cancel;
 		}
+	} else {
+		/*
+		 * If target exists and it's a directory, check whether it
+		 * can be destroyed.
+		 */
+		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+		    (!xfs_dir_isempty(target_ip) ||
+		     (VFS_I(target_ip)->i_nlink > 2))) {
+			error = -EEXIST;
+			goto out_trans_cancel;
+		}
+	}
+
+	/*
+	 * Directory entry creation below may acquire the AGF. Remove
+	 * the whiteout from the unlinked list first to preserve correct
+	 * AGI/AGF locking order. This dirties the transaction so failures
+	 * after this point will abort and log recovery will clean up the
+	 * mess.
+	 *
+	 * For whiteouts, we need to bump the link count on the whiteout
+	 * inode. After this point we have a real link, so clear the tmpfile
+	 * state flag from the inode to keep it from accidentally being
+	 * misused in the future.
+	 */
+	if (wip) {
+		ASSERT(VFS_I(wip)->i_nlink == 0);
+		error = xfs_iunlink_remove(tp, wip);
+		if (error)
+			goto out_trans_cancel;
+
+		xfs_bumplink(tp, wip);
+		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+		VFS_I(wip)->i_state &= ~I_LINKABLE;
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
 		/*
 		 * If target does not exist and the rename crosses
 		 * directories, adjust the target directory link count
@@ -2979,28 +3349,10 @@
 					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		if (new_parent && src_is_directory) {
-			error = xfs_bumplink(tp, target_dp);
-			if (error)
-				goto out_trans_cancel;
+			xfs_bumplink(tp, target_dp);
 		}
 	} else { /* target_ip != NULL */
 		/*
-		 * If target exists and it's a directory, check that both
-		 * target and source are directories and that target can be
-		 * destroyed, or that neither is a directory.
-		 */
-		if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
-			/*
-			 * Make sure target dir is empty.
-			 */
-			if (!(xfs_dir_isempty(target_ip)) ||
-			    (VFS_I(target_ip)->i_nlink > 2)) {
-				error = -EEXIST;
-				goto out_trans_cancel;
-			}
-		}
-
-		/*
 		 * Link the source inode under the target name.
 		 * If the source inode is a directory and we are moving
 		 * it across directories, its ".." entry will be
@@ -3090,32 +3442,6 @@
 	if (error)
 		goto out_trans_cancel;
 
-	/*
-	 * For whiteouts, we need to bump the link count on the whiteout inode.
-	 * This means that failures all the way up to this point leave the inode
-	 * on the unlinked list and so cleanup is a simple matter of dropping
-	 * the remaining reference to it. If we fail here after bumping the link
-	 * count, we're shutting down the filesystem so we'll never see the
-	 * intermediate state on disk.
-	 */
-	if (wip) {
-		ASSERT(VFS_I(wip)->i_nlink == 0);
-		error = xfs_bumplink(tp, wip);
-		if (error)
-			goto out_trans_cancel;
-		error = xfs_iunlink_remove(tp, wip);
-		if (error)
-			goto out_trans_cancel;
-		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
-
-		/*
-		 * Now we have a real link, clear the "I'm a tmpfile" state
-		 * flag from the inode so it doesn't accidentally get misused in
-		 * future.
-		 */
-		VFS_I(wip)->i_state &= ~I_LINKABLE;
-	}
-
 	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
 	if (new_parent)
@@ -3142,28 +3468,27 @@
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
 	unsigned long		first_index, mask;
-	unsigned long		inodes_per_cluster;
 	int			cilist_size;
 	struct xfs_inode	**cilist;
 	struct xfs_inode	*cip;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	int			nr_found;
 	int			clcount = 0;
 	int			i;
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
-	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-	cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+	cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
 	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
 	if (!cilist)
 		goto out_put;
 
-	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
+	mask = ~(igeo->inodes_per_cluster - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
 	rcu_read_lock();
 	/* really need a gang lookup range call here */
 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-					first_index, inodes_per_cluster);
+					first_index, igeo->inodes_per_cluster);
 	if (nr_found == 0)
 		goto out_free;
 
@@ -3271,7 +3596,6 @@
 	 * inode buffer and shut down the filesystem.
 	 */
 	rcu_read_unlock();
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 
 	/*
 	 * We'll always have an inode attached to the buffer for completion
@@ -3281,11 +3605,14 @@
 	 * xfs_buf_submit().
 	 */
 	ASSERT(bp->b_iodone);
+	bp->b_flags |= XBF_ASYNC;
 	bp->b_flags &= ~XBF_DONE;
 	xfs_buf_stale(bp);
 	xfs_buf_ioerror(bp, -EIO);
 	xfs_buf_ioend(bp);
 
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
 	/* abort the corrupt inode, as it was not attached to the buffer */
 	xfs_iflush_abort(cip, false);
 	kmem_free(cilist);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be20145..558173f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -45,10 +45,18 @@
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
+
+	/*
+	 * Bitsets of inode metadata that have been checked and/or are sick.
+	 * Callers must hold i_flags_lock before accessing this field.
+	 */
+	uint16_t		i_checked;
+	uint16_t		i_sick;
+
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
-	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
+	uint64_t		i_delayed_blks;	/* count of delay alloc blks */
 
 	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
@@ -57,6 +65,11 @@
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
+
+	/* pending io completions */
+	spinlock_t		i_ioend_lock;
+	struct work_struct	i_ioend_work;
+	struct list_head	i_ioend_list;
 } xfs_inode_t;
 
 /* Convert from vfs inode to xfs inode */
@@ -500,4 +513,9 @@
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
+void xfs_end_io(struct work_struct *work);
+
 #endif	/* __XFS_INODE_H__ */
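
The new i_checked/i_sick fields follow a two-bitset health pattern: one bitset records which metadata classes have been examined, the other which of those were found bad, both serialized by i_flags_lock. A standalone model of the pattern, with a pthread mutex in place of i_flags_lock and hypothetical metadata classes:

```c
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define HEALTH_BMAP	(1U << 0)	/* hypothetical metadata class */
#define HEALTH_XATTR	(1U << 1)

struct inode_health {
	pthread_mutex_t	lock;		/* stands in for i_flags_lock */
	uint16_t	checked;	/* classes that have been scanned */
	uint16_t	sick;		/* classes found corrupt */
};

/* Record a scan result; sick bits only mean anything once checked. */
static void mark_checked(struct inode_health *h, uint16_t mask, int bad)
{
	pthread_mutex_lock(&h->lock);
	h->checked |= mask;
	if (bad)
		h->sick |= mask;
	else
		h->sick &= ~mask;
	pthread_mutex_unlock(&h->lock);
}

int main(void)
{
	struct inode_health h = { .lock = PTHREAD_MUTEX_INITIALIZER };

	mark_checked(&h, HEALTH_BMAP, 0);
	mark_checked(&h, HEALTH_XATTR, 1);
	printf("checked=%#x sick=%#x\n", h.checked, h.sick);
	return 0;
}
```
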
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fa1c4fe..bb8f076 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -5,6 +5,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -12,7 +13,6 @@
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
-#include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
@@ -565,7 +565,7 @@
  * Unlock the inode associated with the inode log item.
  */
 STATIC void
-xfs_inode_item_unlock(
+xfs_inode_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
@@ -621,23 +621,21 @@
 STATIC void
 xfs_inode_item_committing(
 	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
+	xfs_lsn_t		commit_lsn)
 {
-	INODE_ITEM(lip)->ili_last_lsn = lsn;
+	INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+	return xfs_inode_item_release(lip);
 }
 
-/*
- * This is the ops vector shared by all buf log items.
- */
 static const struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_size	= xfs_inode_item_size,
 	.iop_format	= xfs_inode_item_format,
 	.iop_pin	= xfs_inode_item_pin,
 	.iop_unpin	= xfs_inode_item_unpin,
-	.iop_unlock	= xfs_inode_item_unlock,
+	.iop_release	= xfs_inode_item_release,
 	.iop_committed	= xfs_inode_item_committed,
 	.iop_push	= xfs_inode_item_push,
-	.iop_committing = xfs_inode_item_committing,
+	.iop_committing	= xfs_inode_item_committing,
 	.iop_error	= xfs_inode_item_error
 };
 
@@ -653,7 +651,7 @@
 	struct xfs_inode_log_item *iip;
 
 	ASSERT(ip->i_itemp == NULL);
-	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
 
 	iip->ili_inode = ip;
 	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 27081eb..07a60e7 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -14,7 +14,7 @@
 struct xfs_mount;
 
 typedef struct xfs_inode_log_item {
-	xfs_log_item_t		ili_item;	   /* common portion */
+	struct xfs_log_item	ili_item;	   /* common portion */
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0ef5ece..d58f0d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -11,9 +11,8 @@
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_ioctl.h"
-#include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
+#include "xfs_iwalk.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_attr.h"
@@ -25,7 +24,6 @@
 #include "xfs_export.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_symlink.h"
 #include "xfs_trans.h"
 #include "xfs_acl.h"
 #include "xfs_btree.h"
@@ -33,15 +31,11 @@
 #include "xfs_fsmap.h"
 #include "scrub/xfs_scrub.h"
 #include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
 
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/exportfs.h>
 
 /*
  * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -73,7 +67,7 @@
 			return -EBADF;
 		inode = file_inode(f.file);
 	} else {
-		error = user_lpath((const char __user *)hreq->path, &path);
+		error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
 		if (error)
 			return error;
 		inode = d_inode(path.dentry);
@@ -402,7 +396,7 @@
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
 	if (!kbuf)
 		goto out_dput;
 
@@ -440,11 +434,11 @@
 
 	if (*len > XFS_XATTR_SIZE_MAX)
 		return -EINVAL;
-	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+	kbuf = kmem_zalloc_large(*len, 0);
 	if (!kbuf)
 		return -ENOMEM;
 
-	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+	error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
 	if (error)
 		goto out_kfree;
 
@@ -604,14 +598,6 @@
 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	int			error;
 
-	/*
-	 * Only allow the sys admin to reserve space unless
-	 * unwritten extents are enabled.
-	 */
-	if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
 		return -EPERM;
 
@@ -727,16 +713,45 @@
 	return error;
 }
 
+/* Return 0 on success or negative error */
+int
+xfs_fsbulkstat_one_fmt(
+	struct xfs_ibulk		*breq,
+	const struct xfs_bulkstat	*bstat)
+{
+	struct xfs_bstat		bs1;
+
+	xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+	if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1)))
+		return -EFAULT;
+	return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat));
+}
+
+int
+xfs_fsinumbers_fmt(
+	struct xfs_ibulk		*breq,
+	const struct xfs_inumbers	*igrp)
+{
+	struct xfs_inogrp		ig1;
+
+	xfs_inumbers_to_inogrp(&ig1, igrp);
+	if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp)))
+		return -EFAULT;
+	return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp));
+}
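
Both helpers above are instances of the bulk-request formatter pattern: the iteration core produces records and delegates copy-out plus buffer advance to a per-format callback, which is how one walker can serve the v1, v5 and compat ABIs. A simplified self-contained model of that split (every name here is hypothetical, not the kernel's):

```c
#include <stdio.h>
#include <string.h>

struct record { unsigned long ino; };

struct bulk_req {
	char	*ubuffer;	/* output cursor */
	int	ocount;		/* records emitted so far */
};

/* Formatter: copy one record out and advance the cursor. */
typedef int (*record_fmt)(struct bulk_req *req, const struct record *rec);

static int fmt_native(struct bulk_req *req, const struct record *rec)
{
	memcpy(req->ubuffer, rec, sizeof(*rec));
	req->ubuffer += sizeof(*rec);
	req->ocount++;
	return 0;
}

/* Iteration core: layout-agnostic, delegates all output to the formatter. */
static int walk_records(struct bulk_req *req, record_fmt fmt, int nr)
{
	struct record rec;
	int i, error;

	for (i = 0; i < nr; i++) {
		rec.ino = 128 + i;
		error = fmt(req, &rec);
		if (error)
			return error;
	}
	return 0;
}

int main(void)
{
	char buf[4 * sizeof(struct record)];
	struct bulk_req req = { .ubuffer = buf };

	walk_records(&req, fmt_native, 4);
	printf("emitted %d records\n", req.ocount);
	return 0;
}
```
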
+
 STATIC int
-xfs_ioc_bulkstat(
+xfs_ioc_fsbulkstat(
 	xfs_mount_t		*mp,
 	unsigned int		cmd,
 	void			__user *arg)
 {
-	xfs_fsop_bulkreq_t	bulkreq;
-	int			count;	/* # of records returned */
-	xfs_ino_t		inlast;	/* last inode number */
-	int			done;
+	struct xfs_fsop_bulkreq	bulkreq;
+	struct xfs_ibulk	breq = {
+		.mp		= mp,
+		.ocount		= 0,
+	};
+	xfs_ino_t		lastino;
 	int			error;
 
 	/* done = 1 if there are more stats to get and if bulkstat */
@@ -748,79 +763,291 @@
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+	if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
 		return -EFAULT;
 
-	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+	if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
 		return -EFAULT;
 
-	if ((count = bulkreq.icount) <= 0)
+	if (bulkreq.icount <= 0)
 		return -EINVAL;
 
 	if (bulkreq.ubuffer == NULL)
 		return -EINVAL;
 
-	if (cmd == XFS_IOC_FSINUMBERS)
-		error = xfs_inumbers(mp, &inlast, &count,
-					bulkreq.ubuffer, xfs_inumbers_fmt);
-	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
-		error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
-					sizeof(xfs_bstat_t), NULL, &done);
-	else	/* XFS_IOC_FSBULKSTAT */
-		error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
-				     sizeof(xfs_bstat_t), bulkreq.ubuffer,
-				     &done);
+	breq.ubuffer = bulkreq.ubuffer;
+	breq.icount = bulkreq.icount;
+
+	/*
+	 * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+	 * that we want to stat.  However, FSINUMBERS and FSBULKSTAT expect
+	 * that *lastip contains either zero or the number of the last inode to
+	 * be examined by the previous call and return results starting with
+	 * the next inode after that.  The new bulk request back end functions
+	 * take the inode to start with, so we have to compute the startino
+	 * parameter from lastino to maintain correct function.  lastino == 0
+	 * is a special case because it has traditionally meant "first inode
+	 * in filesystem".
+	 */
+	if (cmd == XFS_IOC_FSINUMBERS) {
+		breq.startino = lastino ? lastino + 1 : 0;
+		error = xfs_inumbers(&breq, xfs_fsinumbers_fmt);
+		lastino = breq.startino - 1;
+	} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) {
+		breq.startino = lastino;
+		breq.icount = 1;
+		error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt);
+	} else {	/* XFS_IOC_FSBULKSTAT */
+		breq.startino = lastino ? lastino + 1 : 0;
+		error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt);
+		lastino = breq.startino - 1;
+	}
 
 	if (error)
 		return error;
 
-	if (bulkreq.ocount != NULL) {
-		if (copy_to_user(bulkreq.lastip, &inlast,
-						sizeof(xfs_ino_t)))
-			return -EFAULT;
+	if (bulkreq.lastip != NULL &&
+	    copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+		return -EFAULT;
 
-		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -EFAULT;
+	if (bulkreq.ocount != NULL &&
+	    copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+		return -EFAULT;
+
+	return 0;
+}
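
For reference, the lastip convention described above is what the classic user-space bulkstat loop relies on. A hedged sketch, assuming the xfsprogs <xfs/xfs.h> header (header location, and whether the types are spelled xfs_bstat_t or struct xfs_bstat, varies by version):

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>	/* xfsprogs; defines XFS_IOC_FSBULKSTAT */

int main(int argc, char **argv)
{
	struct xfs_fsop_bulkreq	bulkreq;
	struct xfs_bstat	buf[64];
	__u64			lastino = 0;	/* 0 == "first inode in fs" */
	__s32			ocount;
	int			fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	for (;;) {
		bulkreq.lastip = &lastino;	/* in: resume point; out: last inode seen */
		bulkreq.icount = 64;
		bulkreq.ubuffer = buf;
		bulkreq.ocount = &ocount;
		if (ioctl(fd, XFS_IOC_FSBULKSTAT, &bulkreq) < 0 || ocount == 0)
			break;
		printf("%d inodes, resume after %llu\n", ocount,
				(unsigned long long)lastino);
	}
	close(fd);
	return 0;
}
```
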
+
+/* Return 0 on success or negative error */
+static int
+xfs_bulkstat_fmt(
+	struct xfs_ibulk		*breq,
+	const struct xfs_bulkstat	*bstat)
+{
+	if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat)))
+		return -EFAULT;
+	return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat));
+}
+
+/*
+ * Check the incoming bulk request @hdr from userspace and initialize the
+ * internal @breq bulk request appropriately.  Returns 0 if the bulk request
+ * should proceed; -ECANCELED if there's nothing to do; or the usual
+ * negative error code.
+ */
+static int
+xfs_bulk_ireq_setup(
+	struct xfs_mount	*mp,
+	struct xfs_bulk_ireq	*hdr,
+	struct xfs_ibulk	*breq,
+	void __user		*ubuffer)
+{
+	if (hdr->icount == 0 ||
+	    (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) ||
+	    memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+		return -EINVAL;
+
+	breq->startino = hdr->ino;
+	breq->ubuffer = ubuffer;
+	breq->icount = hdr->icount;
+	breq->ocount = 0;
+	breq->flags = 0;
+
+	/*
+	 * The @ino parameter is a special value, so we must look it up here.
+	 * We're not allowed to have IREQ_AGNO, and we only return one inode
+	 * worth of data.
+	 */
+	if (hdr->flags & XFS_BULK_IREQ_SPECIAL) {
+		if (hdr->flags & XFS_BULK_IREQ_AGNO)
+			return -EINVAL;
+
+		switch (hdr->ino) {
+		case XFS_BULK_IREQ_SPECIAL_ROOT:
+			hdr->ino = mp->m_sb.sb_rootino;
+			break;
+		default:
+			return -EINVAL;
+		}
+		breq->icount = 1;
 	}
 
+	/*
+	 * The IREQ_AGNO flag means that we only want results from a given AG.
+	 * If @hdr->ino is zero, we start iterating in that AG.  If @hdr->ino is
+	 * beyond the specified AG then we return no results.
+	 */
+	if (hdr->flags & XFS_BULK_IREQ_AGNO) {
+		if (hdr->agno >= mp->m_sb.sb_agcount)
+			return -EINVAL;
+
+		if (breq->startino == 0)
+			breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0);
+		else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
+			return -EINVAL;
+
+		breq->flags |= XFS_IBULK_SAME_AG;
+
+		/* Asking for an inode past the end of the AG?  We're done! */
+		if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
+			return -ECANCELED;
+	} else if (hdr->agno)
+		return -EINVAL;
+
+	/* Asking for an inode past the end of the FS?  We're done! */
+	if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
+		return -ECANCELED;
+
+	return 0;
+}
+
+/*
+ * Update the userspace bulk request @hdr to reflect the end state of the
+ * internal bulk request @breq.
+ */
+static void
+xfs_bulk_ireq_teardown(
+	struct xfs_bulk_ireq	*hdr,
+	struct xfs_ibulk	*breq)
+{
+	hdr->ino = breq->startino;
+	hdr->ocount = breq->ocount;
+}
+
+/* Handle the v5 bulkstat ioctl. */
+STATIC int
+xfs_ioc_bulkstat(
+	struct xfs_mount		*mp,
+	unsigned int			cmd,
+	struct xfs_bulkstat_req __user	*arg)
+{
+	struct xfs_bulk_ireq		hdr;
+	struct xfs_ibulk		breq = {
+		.mp			= mp,
+	};
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+		return -EFAULT;
+
+	error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
+	if (error == -ECANCELED)
+		goto out_teardown;
+	if (error < 0)
+		return error;
+
+	error = xfs_bulkstat(&breq, xfs_bulkstat_fmt);
+	if (error)
+		return error;
+
+out_teardown:
+	xfs_bulk_ireq_teardown(&hdr, &breq);
+	if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
+		return -EFAULT;
+
 	return 0;
 }
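
The v5 request replaces the three separate user pointers with a single header-plus-array buffer that carries the cursor in hdr.ino and the result count in hdr.ocount. A hedged user-space sketch, assuming headers new enough to define the v5 structures; note the handler above gates the call on CAP_SYS_ADMIN:

```c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>	/* must define XFS_IOC_BULKSTAT and the v5 structs */

int main(int argc, char **argv)
{
	const unsigned int nr = 64;
	struct xfs_bulkstat_req *req;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	/*
	 * Header plus flexible array of result records, all zeroed:
	 * nonzero flags or reserved fields fail xfs_bulk_ireq_setup().
	 */
	req = calloc(1, sizeof(*req) + nr * sizeof(struct xfs_bulkstat));
	if (!req)
		return 1;
	req->hdr.ino = 0;	/* cursor: 0 means start of filesystem */
	req->hdr.icount = nr;
	if (ioctl(fd, XFS_IOC_BULKSTAT, req) == 0)
		printf("got %u inodes, cursor now %llu\n", req->hdr.ocount,
				(unsigned long long)req->hdr.ino);
	free(req);
	close(fd);
	return 0;
}
```
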
 
 STATIC int
-xfs_ioc_fsgeometry_v1(
-	xfs_mount_t		*mp,
-	void			__user *arg)
+xfs_inumbers_fmt(
+	struct xfs_ibulk		*breq,
+	const struct xfs_inumbers	*igrp)
 {
-	xfs_fsop_geom_t         fsgeo;
-	int			error;
+	if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers)))
+		return -EFAULT;
+	return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers));
+}
 
-	error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+/* Handle the v5 inumbers ioctl. */
+STATIC int
+xfs_ioc_inumbers(
+	struct xfs_mount		*mp,
+	unsigned int			cmd,
+	struct xfs_inumbers_req __user	*arg)
+{
+	struct xfs_bulk_ireq		hdr;
+	struct xfs_ibulk		breq = {
+		.mp			= mp,
+	};
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+		return -EFAULT;
+
+	error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
+	if (error == -ECANCELED)
+		goto out_teardown;
+	if (error < 0)
+		return error;
+
+	error = xfs_inumbers(&breq, xfs_inumbers_fmt);
 	if (error)
 		return error;
 
-	/*
-	 * Caller should have passed an argument of type
-	 * xfs_fsop_geom_v1_t.  This is a proper subset of the
-	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
-	 */
-	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+out_teardown:
+	xfs_bulk_ireq_teardown(&hdr, &breq);
+	if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
 		return -EFAULT;
+
 	return 0;
 }
 
 STATIC int
 xfs_ioc_fsgeometry(
-	xfs_mount_t		*mp,
+	struct xfs_mount	*mp,
+	void			__user *arg,
+	int			struct_version)
+{
+	struct xfs_fsop_geom	fsgeo;
+	size_t			len;
+
+	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
+
+	if (struct_version <= 3)
+		len = sizeof(struct xfs_fsop_geom_v1);
+	else if (struct_version == 4)
+		len = sizeof(struct xfs_fsop_geom_v4);
+	else {
+		xfs_fsop_geom_health(mp, &fsgeo);
+		len = sizeof(fsgeo);
+	}
+
+	if (copy_to_user(arg, &fsgeo, len))
+		return -EFAULT;
+	return 0;
+}
+
+STATIC int
+xfs_ioc_ag_geometry(
+	struct xfs_mount	*mp,
 	void			__user *arg)
 {
-	xfs_fsop_geom_t		fsgeo;
+	struct xfs_ag_geometry	ageo;
 	int			error;
 
-	error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4);
+	if (copy_from_user(&ageo, arg, sizeof(ageo)))
+		return -EFAULT;
+	if (ageo.ag_flags)
+		return -EINVAL;
+	if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
+		return -EINVAL;
+
+	error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
 	if (error)
 		return error;
 
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+	if (copy_to_user(arg, &ageo, sizeof(ageo)))
 		return -EFAULT;
 	return 0;
 }
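
Since xfs_ioc_ag_geometry copies the structure in before validating it, callers must zero ag_flags and the reserved area and set only ag_number. A hedged user-space sketch, assuming <xfs/xfs.h> exposes XFS_IOC_AG_GEOMETRY and struct xfs_ag_geometry:

```c
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>	/* must define XFS_IOC_AG_GEOMETRY */

int main(int argc, char **argv)
{
	struct xfs_ag_geometry ageo;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	memset(&ageo, 0, sizeof(ageo));	/* nonzero flags/reserved -> -EINVAL */
	ageo.ag_number = 0;		/* which AG to query */
	if (ioctl(fd, XFS_IOC_AG_GEOMETRY, &ageo) == 0)
		printf("AG 0: %u blocks, %u free\n",
				ageo.ag_length, ageo.ag_freeblks);
	close(fd);
	return 0;
}
```
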
@@ -879,6 +1106,34 @@
 	return flags;
 }
 
+static void
+xfs_fill_fsxattr(
+	struct xfs_inode	*ip,
+	bool			attr,
+	struct fsxattr		*fa)
+{
+	simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
+	fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+	fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
+			ip->i_mount->m_sb.sb_blocklog;
+	fa->fsx_projid = xfs_get_projid(ip);
+
+	if (attr) {
+		if (ip->i_afp) {
+			if (ip->i_afp->if_flags & XFS_IFEXTENTS)
+				fa->fsx_nextents = xfs_iext_count(ip->i_afp);
+			else
+				fa->fsx_nextents = ip->i_d.di_anextents;
+		} else
+			fa->fsx_nextents = 0;
+	} else {
+		if (ip->i_df.if_flags & XFS_IFEXTENTS)
+			fa->fsx_nextents = xfs_iext_count(&ip->i_df);
+		else
+			fa->fsx_nextents = ip->i_d.di_nextents;
+	}
+}
+
 STATIC int
 xfs_ioc_fsgetxattr(
 	xfs_inode_t		*ip,
@@ -887,29 +1142,8 @@
 {
 	struct fsxattr		fa;
 
-	memset(&fa, 0, sizeof(struct fsxattr));
-
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	fa.fsx_xflags = xfs_ip2xflags(ip);
-	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
-			ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_projid = xfs_get_projid(ip);
-
-	if (attr) {
-		if (ip->i_afp) {
-			if (ip->i_afp->if_flags & XFS_IFEXTENTS)
-				fa.fsx_nextents = xfs_iext_count(ip->i_afp);
-			else
-				fa.fsx_nextents = ip->i_d.di_anextents;
-		} else
-			fa.fsx_nextents = 0;
-	} else {
-		if (ip->i_df.if_flags & XFS_IFEXTENTS)
-			fa.fsx_nextents = xfs_iext_count(&ip->i_df);
-		else
-			fa.fsx_nextents = ip->i_d.di_nextents;
-	}
+	xfs_fill_fsxattr(ip, attr, &fa);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (copy_to_user(arg, &fa, sizeof(fa)))
@@ -1035,15 +1269,6 @@
 	if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
 		return -EINVAL;
 
-	/*
-	 * Can't modify an immutable/append-only file unless
-	 * we have appropriate permission.
-	 */
-	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
-	     (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
 	/* diflags2 only valid for v3 inodes. */
 	di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
 	if (di_flags2 && ip->i_d.di_version < 3)
@@ -1088,8 +1313,7 @@
 	if (fa->fsx_xflags & FS_XFLAG_DAX) {
 		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
 			return -EINVAL;
-		if (S_ISREG(inode->i_mode) &&
-		    !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+		if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
 				sb->s_blocksize))
 			return -EINVAL;
 	}
@@ -1150,7 +1374,7 @@
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
 	if (error)
-		return ERR_PTR(error);
+		goto out_unlock;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
@@ -1202,39 +1426,31 @@
 	struct fsxattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
-		return -EINVAL;
-
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-	    !S_ISDIR(VFS_I(ip)->i_mode))
-		return -EINVAL;
+	xfs_extlen_t		size;
+	xfs_fsblock_t		extsize_fsb;
 
 	if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
 	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
 		return -EINVAL;
 
-	if (fa->fsx_extsize != 0) {
-		xfs_extlen_t    size;
-		xfs_fsblock_t   extsize_fsb;
+	if (fa->fsx_extsize == 0)
+		return 0;
 
-		extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-		if (extsize_fsb > MAXEXTLEN)
+	extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+	if (extsize_fsb > MAXEXTLEN)
+		return -EINVAL;
+
+	if (XFS_IS_REALTIME_INODE(ip) ||
+	    (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
+		size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+	} else {
+		size = mp->m_sb.sb_blocksize;
+		if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
 			return -EINVAL;
+	}
 
-		if (XFS_IS_REALTIME_INODE(ip) ||
-		    (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
-			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
-		} else {
-			size = mp->m_sb.sb_blocksize;
-			if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
-				return -EINVAL;
-		}
-
-		if (fa->fsx_extsize % size)
-			return -EINVAL;
-	} else
-		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+	if (fa->fsx_extsize % size)
+		return -EINVAL;
 
 	return 0;
 }
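
Restated, the checks above reduce to three rules: a nonzero hint must fit in MAXEXTLEN, must not exceed half an AG (data-device case), and must be a whole multiple of the allocation unit. A standalone validator expressing the same arithmetic under hypothetical geometry (the realtime branch is omitted):

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical geometry for the example. */
#define BLOCKSIZE	4096u		/* bytes per filesystem block */
#define AGBLOCKS	262144u		/* blocks per allocation group */
#define MAXEXTLEN	2097151u	/* max extent length in blocks */

static bool extsize_ok(unsigned int extsize_bytes)
{
	unsigned int extsize_fsb = extsize_bytes / BLOCKSIZE;

	if (extsize_bytes == 0)
		return true;			/* no hint: nothing to check */
	if (extsize_fsb > MAXEXTLEN)
		return false;			/* too long for one extent */
	if (extsize_fsb > AGBLOCKS / 2)
		return false;			/* may not exceed half an AG */
	return extsize_bytes % BLOCKSIZE == 0;	/* whole blocks only */
}

int main(void)
{
	printf("%d %d\n", extsize_ok(65536), extsize_ok(65537));
	return 0;
}
```
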
@@ -1260,6 +1476,8 @@
 	struct fsxattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
+	xfs_extlen_t		size;
+	xfs_fsblock_t		cowextsize_fsb;
 
 	if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
 		return 0;
@@ -1268,25 +1486,19 @@
 	    ip->i_d.di_version != 3)
 		return -EINVAL;
 
-	if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
+	if (fa->fsx_cowextsize == 0)
+		return 0;
+
+	cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+	if (cowextsize_fsb > MAXEXTLEN)
 		return -EINVAL;
 
-	if (fa->fsx_cowextsize != 0) {
-		xfs_extlen_t    size;
-		xfs_fsblock_t   cowextsize_fsb;
+	size = mp->m_sb.sb_blocksize;
+	if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+		return -EINVAL;
 
-		cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
-		if (cowextsize_fsb > MAXEXTLEN)
-			return -EINVAL;
-
-		size = mp->m_sb.sb_blocksize;
-		if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
-			return -EINVAL;
-
-		if (fa->fsx_cowextsize % size)
-			return -EINVAL;
-	} else
-		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+	if (fa->fsx_cowextsize % size)
+		return -EINVAL;
 
 	return 0;
 }
@@ -1300,21 +1512,6 @@
 	if (fa->fsx_projid > (uint16_t)-1 &&
 	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
 		return -EINVAL;
-
-	/*
-	 * Project Quota ID state is only allowed to change from within the init
-	 * namespace. Enforce that restriction only if we are trying to change
-	 * the quota ID state. Everything else is allowed in user namespaces.
-	 */
-	if (current_user_ns() == &init_user_ns)
-		return 0;
-
-	if (xfs_get_projid(ip) != fa->fsx_projid)
-		return -EINVAL;
-	if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
-	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
-		return -EINVAL;
-
 	return 0;
 }
 
@@ -1323,6 +1520,7 @@
 	xfs_inode_t		*ip,
 	struct fsxattr		*fa)
 {
+	struct fsxattr		old_fa;
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	struct xfs_dquot	*udqp = NULL;
@@ -1370,7 +1568,6 @@
 		goto error_free_dquots;
 	}
 
-
 	if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
 	    xfs_get_projid(ip) != fa->fsx_projid) {
 		code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
@@ -1379,6 +1576,11 @@
 			goto error_trans_cancel;
 	}
 
+	xfs_fill_fsxattr(ip, false, &old_fa);
+	code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+	if (code)
+		goto error_trans_cancel;
+
 	code = xfs_ioctl_setattr_check_extsize(ip, fa);
 	if (code)
 		goto error_trans_cancel;
@@ -1489,6 +1691,7 @@
 {
 	struct xfs_trans	*tp;
 	struct fsxattr		fa;
+	struct fsxattr		old_fa;
 	unsigned int		flags;
 	int			join_flags = 0;
 	int			error;
@@ -1524,6 +1727,13 @@
 		goto out_drop_write;
 	}
 
+	xfs_fill_fsxattr(ip, false, &old_fa);
+	error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
+	if (error) {
+		xfs_trans_cancel(tp);
+		goto out_drop_write;
+	}
+
 	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
 	if (error) {
 		xfs_trans_cancel(tp);
@@ -1616,7 +1826,7 @@
 	error = 0;
 out_free_buf:
 	kmem_free(buf);
-	return 0;
+	return error;
 }
 
 struct getfsmap_info {
@@ -1674,7 +1884,7 @@
 	info.mp = ip->i_mount;
 	info.data = arg;
 	error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		error = 0;
 		aborted = true;
 	} else if (error)
@@ -1942,13 +2152,22 @@
 	case XFS_IOC_FSBULKSTAT_SINGLE:
 	case XFS_IOC_FSBULKSTAT:
 	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_fsbulkstat(mp, cmd, arg);
+
+	case XFS_IOC_BULKSTAT:
 		return xfs_ioc_bulkstat(mp, cmd, arg);
+	case XFS_IOC_INUMBERS:
+		return xfs_ioc_inumbers(mp, cmd, arg);
 
 	case XFS_IOC_FSGEOMETRY_V1:
-		return xfs_ioc_fsgeometry_v1(mp, arg);
-
+		return xfs_ioc_fsgeometry(mp, arg, 3);
+	case XFS_IOC_FSGEOMETRY_V4:
+		return xfs_ioc_fsgeometry(mp, arg, 4);
 	case XFS_IOC_FSGEOMETRY:
-		return xfs_ioc_fsgeometry(mp, arg);
+		return xfs_ioc_fsgeometry(mp, arg, 5);
+
+	case XFS_IOC_AG_GEOMETRY:
+		return xfs_ioc_ag_geometry(mp, arg);
 
 	case XFS_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *)arg);
@@ -2039,9 +2258,7 @@
 	case XFS_IOC_FSCOUNTS: {
 		xfs_fsop_counts_t out;
 
-		error = xfs_fs_counts(mp, &out);
-		if (error)
-			return error;
+		xfs_fs_counts(mp, &out);
 
 		if (copy_to_user(arg, &out, sizeof(out)))
 			return -EFAULT;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 4b17f67..654c0bb 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -77,4 +77,12 @@
 	uint			evmask,
 	uint16_t		state);
 
+struct xfs_ibulk;
+struct xfs_bstat;
+struct xfs_inogrp;
+
+int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
+			   const struct xfs_bulkstat *bstat);
+int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
+
 #endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fba115f..1e08bf7 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -3,23 +3,19 @@
  * Copyright (c) 2004-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/compat.h>
-#include <linux/ioctl.h>
 #include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
 #include <linux/fsmap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
+#include "xfs_iwalk.h"
 #include "xfs_itable.h"
-#include "xfs_error.h"
 #include "xfs_fsops.h"
-#include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_attr.h"
 #include "xfs_ioctl.h"
@@ -52,12 +48,9 @@
 	struct xfs_mount	  *mp,
 	compat_xfs_fsop_geom_v1_t __user *arg32)
 {
-	xfs_fsop_geom_t		  fsgeo;
-	int			  error;
+	struct xfs_fsop_geom	  fsgeo;
 
-	error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
-	if (error)
-		return error;
+	xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
 	/* The 32-bit variant simply has some padding at the end */
 	if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
 		return -EFAULT;
@@ -87,27 +80,26 @@
 }
 
 STATIC int
-xfs_inumbers_fmt_compat(
-	void			__user *ubuffer,
-	const struct xfs_inogrp	*buffer,
-	long			count,
-	long			*written)
+xfs_fsinumbers_fmt_compat(
+	struct xfs_ibulk		*breq,
+	const struct xfs_inumbers	*ig)
 {
-	compat_xfs_inogrp_t	__user *p32 = ubuffer;
-	long			i;
+	struct compat_xfs_inogrp __user	*p32 = breq->ubuffer;
+	struct xfs_inogrp		ig1;
+	struct xfs_inogrp		*igrp = &ig1;
 
-	for (i = 0; i < count; i++) {
-		if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
-		    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
-		    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-			return -EFAULT;
-	}
-	*written = count * sizeof(*p32);
-	return 0;
+	xfs_inumbers_to_inogrp(&ig1, ig);
+
+	if (put_user(igrp->xi_startino,   &p32->xi_startino) ||
+	    put_user(igrp->xi_alloccount, &p32->xi_alloccount) ||
+	    put_user(igrp->xi_allocmask,  &p32->xi_allocmask))
+		return -EFAULT;
+
+	return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_inogrp));
 }
 
 #else
-#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#define xfs_fsinumbers_fmt_compat xfs_fsinumbers_fmt
 #endif	/* BROKEN_X86_ALIGNMENT */
 
 STATIC int
@@ -124,11 +116,14 @@
 	return 0;
 }
 
-/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+/*
+ * struct xfs_bstat has differing alignment on Intel, and bstime_t sizes
+ * differ everywhere
+ */
 STATIC int
 xfs_ioctl32_bstat_copyin(
-	xfs_bstat_t		*bstat,
-	compat_xfs_bstat_t	__user *bstat32)
+	struct xfs_bstat		*bstat,
+	struct compat_xfs_bstat	__user	*bstat32)
 {
 	if (get_user(bstat->bs_ino,	&bstat32->bs_ino)	||
 	    get_user(bstat->bs_mode,	&bstat32->bs_mode)	||
@@ -174,16 +169,15 @@
 
 /* Return 0 on success or positive error (to xfs_bulkstat()) */
 STATIC int
-xfs_bulkstat_one_fmt_compat(
-	void			__user *ubuffer,
-	int			ubsize,
-	int			*ubused,
-	const xfs_bstat_t	*buffer)
+xfs_fsbulkstat_one_fmt_compat(
+	struct xfs_ibulk		*breq,
+	const struct xfs_bulkstat	*bstat)
 {
-	compat_xfs_bstat_t	__user *p32 = ubuffer;
+	struct compat_xfs_bstat	__user	*p32 = breq->ubuffer;
+	struct xfs_bstat		bs1;
+	struct xfs_bstat		*buffer = &bs1;
 
-	if (ubsize < sizeof(*p32))
-		return -ENOMEM;
+	xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
 
 	if (put_user(buffer->bs_ino,	  &p32->bs_ino)		||
 	    put_user(buffer->bs_mode,	  &p32->bs_mode)	||
@@ -208,39 +202,50 @@
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
 		return -EFAULT;
-	if (ubused)
-		*ubused = sizeof(*p32);
-	return 0;
-}
 
-STATIC int
-xfs_bulkstat_one_compat(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* buffer to place output in */
-	int		ubsize,		/* size of buffer */
-	int		*ubused,	/* bytes used by me */
-	int		*stat)		/* BULKSTAT_RV_... */
-{
-	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt_compat,
-				    ubused, stat);
+	return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_bstat));
 }
 
 /* copied from xfs_ioctl.c */
 STATIC int
-xfs_compat_ioc_bulkstat(
+xfs_compat_ioc_fsbulkstat(
 	xfs_mount_t		  *mp,
 	unsigned int		  cmd,
-	compat_xfs_fsop_bulkreq_t __user *p32)
+	struct compat_xfs_fsop_bulkreq __user *p32)
 {
 	u32			addr;
-	xfs_fsop_bulkreq_t	bulkreq;
-	int			count;	/* # of records returned */
-	xfs_ino_t		inlast;	/* last inode number */
-	int			done;
+	struct xfs_fsop_bulkreq	bulkreq;
+	struct xfs_ibulk	breq = {
+		.mp		= mp,
+		.ocount		= 0,
+	};
+	xfs_ino_t		lastino;
 	int			error;
 
+	/*
+	 * Output structure handling functions.  Depending on the command,
+	 * either the xfs_bstat or the xfs_inogrp structures are written out
+	 * to userspace memory via bulkreq.ubuffer.  Normally the compat
+	 * functions and structure size are the correct ones to use ...
+	 */
+	inumbers_fmt_pf		inumbers_func = xfs_fsinumbers_fmt_compat;
+	bulkstat_one_fmt_pf	bs_one_func = xfs_fsbulkstat_one_fmt_compat;
+
+#ifdef CONFIG_X86_X32
+	if (in_x32_syscall()) {
+		/*
+		 * ... but on x32 the input xfs_fsop_bulkreq has pointers
+		 * which must be handled in the "compat" (32-bit) way, while
+		 * the xfs_bstat and xfs_inogrp structures follow native 64-
+		 * bit layout convention.  So adjust accordingly, otherwise
+		 * the data written out in compat layout will not match what
+		 * x32 userspace expects.
+		 */
+		inumbers_func = xfs_fsinumbers_fmt;
+		bs_one_func = xfs_fsbulkstat_one_fmt;
+	}
+#endif
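
The x32 case is purely a layout question: the request struct carries 32-bit pointers while the payload records use the native 64-bit layout, so only the formatter functions are swapped. A standalone illustration of the size difference, with simplified field types that are not the kernel definitions:

```c
#include <stdint.h>
#include <stdio.h>

/* Native layout: real pointers (64-bit on x86_64). */
struct bulkreq_native {
	uint64_t	*lastip;
	int32_t		icount;
	void		*ubuffer;
	int32_t		*ocount;
};

/* Compat layout: pointers carried as 32-bit values (like compat_uptr_t). */
struct bulkreq_compat {
	uint32_t	lastip;
	int32_t		icount;
	uint32_t	ubuffer;
	uint32_t	ocount;
};

int main(void)
{
	/*
	 * On x32 the request uses the compat layout (32-bit pointers)
	 * while the records it points at follow the native layout, so
	 * the request parsing stays compat but the output does not.
	 */
	printf("native=%zu compat=%zu\n", sizeof(struct bulkreq_native),
			sizeof(struct bulkreq_compat));
	return 0;
}
```
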
+
 	/* done = 1 if there are more stats to get and if bulkstat */
 	/* should be called again (unused here, but used in dmapi) */
 
@@ -261,40 +266,55 @@
 		return -EFAULT;
 	bulkreq.ocount = compat_ptr(addr);
 
-	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+	if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
 		return -EFAULT;
 
-	if ((count = bulkreq.icount) <= 0)
+	if (bulkreq.icount <= 0)
 		return -EINVAL;
 
 	if (bulkreq.ubuffer == NULL)
 		return -EINVAL;
 
-	if (cmd == XFS_IOC_FSINUMBERS_32) {
-		error = xfs_inumbers(mp, &inlast, &count,
-				bulkreq.ubuffer, xfs_inumbers_fmt_compat);
-	} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
-		int res;
+	breq.ubuffer = bulkreq.ubuffer;
+	breq.icount = bulkreq.icount;
 
-		error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
-				sizeof(compat_xfs_bstat_t), NULL, &res);
+	/*
+	 * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+	 * that we want to stat.  However, FSINUMBERS and FSBULKSTAT expect
+	 * that *lastip contains either zero or the number of the last inode to
+	 * be examined by the previous call and return results starting with
+	 * the next inode after that.  The new bulk request back end functions
+	 * take the inode to start with, so we have to compute the startino
+	 * parameter from lastino to maintain correct function.  lastino == 0
+	 * is a special case because it has traditionally meant "first inode
+	 * in filesystem".
+	 */
+	if (cmd == XFS_IOC_FSINUMBERS_32) {
+		breq.startino = lastino ? lastino + 1 : 0;
+		error = xfs_inumbers(&breq, inumbers_func);
+		lastino = breq.startino - 1;
+	} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+		breq.startino = lastino;
+		breq.icount = 1;
+		error = xfs_bulkstat_one(&breq, bs_one_func);
+		lastino = breq.startino;
 	} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
-		error = xfs_bulkstat(mp, &inlast, &count,
-			xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
-			bulkreq.ubuffer, &done);
-	} else
+		breq.startino = lastino ? lastino + 1 : 0;
+		error = xfs_bulkstat(&breq, bs_one_func);
+		lastino = breq.startino - 1;
+	} else {
 		error = -EINVAL;
+	}
 	if (error)
 		return error;
 
-	if (bulkreq.ocount != NULL) {
-		if (copy_to_user(bulkreq.lastip, &inlast,
-						sizeof(xfs_ino_t)))
-			return -EFAULT;
+	if (bulkreq.lastip != NULL &&
+	    copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+		return -EFAULT;
 
-		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -EFAULT;
-	}
+	if (bulkreq.ocount != NULL &&
+	    copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+		return -EFAULT;
 
 	return 0;
 }
@@ -336,6 +356,7 @@
 {
 	int			error;
 	attrlist_cursor_kern_t	*cursor;
+	compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
 	compat_xfs_fsop_attrlist_handlereq_t al_hreq;
 	struct dentry		*dentry;
 	char			*kbuf;
@@ -360,7 +381,7 @@
 		return PTR_ERR(dentry);
 
 	error = -ENOMEM;
-	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
 	if (!kbuf)
 		goto out_dput;
 
@@ -370,6 +391,11 @@
 	if (error)
 		goto out_kfree;
 
+	if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+		error = -EFAULT;
+		goto out_kfree;
+	}
+
 	if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
 		error = -EFAULT;
 
@@ -521,48 +547,13 @@
 	struct inode		*inode = file_inode(filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	void			__user *arg = (void __user *)p;
+	void			__user *arg = compat_ptr(p);
 	int			error;
 
 	trace_xfs_file_compat_ioctl(ip);
 
 	switch (cmd) {
-	/* No size or alignment issues on any arch */
-	case XFS_IOC_DIOINFO:
-	case XFS_IOC_FSGEOMETRY:
-	case XFS_IOC_FSGETXATTR:
-	case XFS_IOC_FSSETXATTR:
-	case XFS_IOC_FSGETXATTRA:
-	case XFS_IOC_FSSETDM:
-	case XFS_IOC_GETBMAP:
-	case XFS_IOC_GETBMAPA:
-	case XFS_IOC_GETBMAPX:
-	case XFS_IOC_FSCOUNTS:
-	case XFS_IOC_SET_RESBLKS:
-	case XFS_IOC_GET_RESBLKS:
-	case XFS_IOC_FSGROWFSLOG:
-	case XFS_IOC_GOINGDOWN:
-	case XFS_IOC_ERROR_INJECTION:
-	case XFS_IOC_ERROR_CLEARALL:
-	case FS_IOC_GETFSMAP:
-	case XFS_IOC_SCRUB_METADATA:
-		return xfs_file_ioctl(filp, cmd, p);
-#ifndef BROKEN_X86_ALIGNMENT
-	/* These are handled fine if no alignment issues */
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64:
-	case XFS_IOC_FSGEOMETRY_V1:
-	case XFS_IOC_FSGROWFSDATA:
-	case XFS_IOC_FSGROWFSRT:
-	case XFS_IOC_ZERO_RANGE:
-		return xfs_file_ioctl(filp, cmd, p);
-#else
+#if defined(BROKEN_X86_ALIGNMENT)
 	case XFS_IOC_ALLOCSP_32:
 	case XFS_IOC_FREESP_32:
 	case XFS_IOC_ALLOCSP64_32:
@@ -631,7 +622,7 @@
 	case XFS_IOC_FSBULKSTAT_32:
 	case XFS_IOC_FSBULKSTAT_SINGLE_32:
 	case XFS_IOC_FSINUMBERS_32:
-		return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+		return xfs_compat_ioc_fsbulkstat(mp, cmd, arg);
 	case XFS_IOC_FD_TO_HANDLE_32:
 	case XFS_IOC_PATH_TO_HANDLE_32:
 	case XFS_IOC_PATH_TO_FSHANDLE_32: {
@@ -663,6 +654,7 @@
 	case XFS_IOC_FSSETDM_BY_HANDLE_32:
 		return xfs_compat_fssetdm_by_handle(filp, arg);
 	default:
-		return -ENOIOCTLCMD;
+		/* try the native version */
+		return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
 	}
 }
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index d28fa82..7985344 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -36,7 +36,7 @@
 	__s32		tv_nsec;	/* and nanoseconds	*/
 } compat_xfs_bstime_t;
 
-typedef struct compat_xfs_bstat {
+struct compat_xfs_bstat {
 	__u64		bs_ino;		/* inode number			*/
 	__u16		bs_mode;	/* type and mode		*/
 	__u16		bs_nlink;	/* number of links		*/
@@ -61,14 +61,14 @@
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
-} __compat_packed compat_xfs_bstat_t;
+} __compat_packed;
 
-typedef struct compat_xfs_fsop_bulkreq {
+struct compat_xfs_fsop_bulkreq {
 	compat_uptr_t	lastip;		/* last inode # pointer		*/
 	__s32		icount;		/* count of entries in buffer	*/
 	compat_uptr_t	ubuffer;	/* user buffer for inode desc.	*/
 	compat_uptr_t	ocount;		/* output count pointer		*/
-} compat_xfs_fsop_bulkreq_t;
+};
 
 #define XFS_IOC_FSBULKSTAT_32 \
 	_IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
@@ -106,7 +106,7 @@
 	xfs_off_t		sx_offset;	/* offset into file */
 	xfs_off_t		sx_length;	/* leng from offset */
 	char			sx_pad[16];	/* pad space, unused */
-	compat_xfs_bstat_t	sx_stat;	/* stat of target b4 copy */
+	struct compat_xfs_bstat	sx_stat;	/* stat of target b4 copy */
 } __compat_packed compat_xfs_swapext_t;
 
 #define XFS_IOC_SWAPEXT_32	_IOWR('X', 109, struct compat_xfs_swapext)
@@ -201,11 +201,11 @@
 #define XFS_IOC_FSGEOMETRY_V1_32  \
 	_IOR('X', 100, struct compat_xfs_fsop_geom_v1)
 
-typedef struct compat_xfs_inogrp {
+struct compat_xfs_inogrp {
 	__u64		xi_startino;	/* starting inode number	*/
 	__s32		xi_alloccount;	/* # bits set in allocmask	*/
 	__u64		xi_allocmask;	/* mask of allocated inodes	*/
-} __attribute__((packed)) compat_xfs_inogrp_t;
+} __attribute__((packed));
 
 /* These growfs input structures have padding on the end, so must translate */
 typedef struct compat_xfs_growfs_data {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6320aca..f780e22 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -4,7 +4,6 @@
  * Copyright (c) 2016-2018 Christoph Hellwig.
  * All Rights Reserved.
  */
-#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@@ -12,7 +11,6 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_bmap_btree.h"
@@ -25,7 +23,6 @@
 #include "xfs_inode_item.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
-#include "xfs_icache.h"
 #include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
@@ -35,18 +32,40 @@
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
 
-void
+static int
+xfs_alert_fsblock_zero(
+	xfs_inode_t	*ip,
+	xfs_bmbt_irec_t	*imap)
+{
+	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+			"Access to block zero in inode %llu "
+			"start_block: %llx start_off: %llx "
+			"blkcnt: %llx extent-state: %x",
+		(unsigned long long)ip->i_ino,
+		(unsigned long long)imap->br_startblock,
+		(unsigned long long)imap->br_startoff,
+		(unsigned long long)imap->br_blockcount,
+		imap->br_state);
+	return -EFSCORRUPTED;
+}
+
+int
 xfs_bmbt_to_iomap(
 	struct xfs_inode	*ip,
 	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
+	struct xfs_bmbt_irec	*imap,
+	bool			shared)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+		return xfs_alert_fsblock_zero(ip, imap);
+
 	if (imap->br_startblock == HOLESTARTBLOCK) {
 		iomap->addr = IOMAP_NULL_ADDR;
 		iomap->type = IOMAP_HOLE;
-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+	} else if (imap->br_startblock == DELAYSTARTBLOCK ||
+		   isnullstartblock(imap->br_startblock)) {
 		iomap->addr = IOMAP_NULL_ADDR;
 		iomap->type = IOMAP_DELALLOC;
 	} else {
@@ -60,6 +79,28 @@
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+	if (xfs_ipincount(ip) &&
+	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+		iomap->flags |= IOMAP_F_DIRTY;
+	if (shared)
+		iomap->flags |= IOMAP_F_SHARED;
+	return 0;
+}
+
+static void
+xfs_hole_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	xfs_fileoff_t		offset_fsb,
+	xfs_fileoff_t		end_fsb)
+{
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->type = IOMAP_HOLE;
+	iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+	iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
 }
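
The converters above reduce a bmap record to one of three iomap shapes: a hole, a delalloc reservation, or a real mapping. A self-contained model of that classification; the sentinel constants are declared locally here to mirror the kernel's -1/-2 encodings:

```c
#include <stdio.h>

/* Local stand-ins for the kernel's sentinel start blocks. */
#define HOLESTARTBLOCK	(-2LL)
#define DELAYSTARTBLOCK	(-1LL)

enum map_type { MAP_HOLE, MAP_DELALLOC, MAP_MAPPED };

static enum map_type classify(long long startblock)
{
	if (startblock == HOLESTARTBLOCK)
		return MAP_HOLE;	/* nothing allocated, nothing reserved */
	if (startblock == DELAYSTARTBLOCK)
		return MAP_DELALLOC;	/* space reserved, not yet placed */
	return MAP_MAPPED;		/* real disk blocks */
}

int main(void)
{
	static const char *names[] = { "hole", "delalloc", "mapped" };

	printf("%s %s %s\n", names[classify(HOLESTARTBLOCK)],
			names[classify(DELAYSTARTBLOCK)],
			names[classify(12345)]);
	return 0;
}
```
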
 
 xfs_extlen_t
@@ -123,23 +164,6 @@
 	return 0;
 }
 
-STATIC int
-xfs_alert_fsblock_zero(
-	xfs_inode_t	*ip,
-	xfs_bmbt_irec_t	*imap)
-{
-	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-			"Access to block zero in inode %llu "
-			"start_block: %llx start_off: %llx "
-			"blkcnt: %llx extent-state: %x",
-		(unsigned long long)ip->i_ino,
-		(unsigned long long)imap->br_startblock,
-		(unsigned long long)imap->br_startoff,
-		(unsigned long long)imap->br_blockcount,
-		imap->br_state);
-	return -EFSCORRUPTED;
-}
-
 int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
@@ -273,7 +297,7 @@
 		goto out_unlock;
 	}
 
-	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
@@ -368,12 +392,13 @@
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
 	struct xfs_inode	*ip,
+	int			whichfork,
 	loff_t			offset,
 	loff_t			count,
 	struct xfs_iext_cursor	*icur)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	struct xfs_bmbt_irec	prev;
 	int			shift = 0;
@@ -502,19 +527,21 @@
 	struct inode		*inode,
 	loff_t			offset,
 	loff_t			count,
+	unsigned		flags,
 	struct iomap		*iomap)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		maxbytes_fsb =
 		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	xfs_fileoff_t		end_fsb;
-	int			error = 0, eof = 0;
-	struct xfs_bmbt_irec	got;
-	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	imap, cmap;
+	struct xfs_iext_cursor	icur, ccur;
 	xfs_fsblock_t		prealloc_blocks = 0;
+	bool			eof = false, cow_eof = false, shared = false;
+	int			whichfork = XFS_DATA_FORK;
+	int			error = 0;
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 	ASSERT(!xfs_get_extsz_hint(ip));
@@ -532,48 +559,109 @@
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
 		if (error)
 			goto out_unlock;
 	}
 
-	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
-	if (!eof && got.br_startoff <= offset_fsb) {
-		if (xfs_is_reflink_inode(ip)) {
-			bool		shared;
+	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
-			end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
-					maxbytes_fsb);
-			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-			error = xfs_reflink_reserve_cow(ip, &got, &shared);
-			if (error)
-				goto out_unlock;
+	/*
+	 * Search the data fork first to look up our source mapping.  We
+	 * always need the data fork map, as we have to return it to the
+	 * iomap code so that the higher level write code can read data in to
+	 * perform read-modify-write cycles for unaligned writes.
+	 */
+	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+	if (eof)
+		imap.br_startoff = end_fsb; /* fake hole until the end */
+
+	/* We never need to allocate blocks for zeroing a hole. */
+	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+		goto out_unlock;
+	}
+
+	/*
+	 * Search the COW fork extent list even if we did not find a data fork
+	 * extent.  This serves two purposes: first, it implements the
+	 * speculative preallocation using cowextsize, so that we also unshare
+	 * blocks adjacent to shared blocks instead of just the shared blocks
+	 * themselves.  Second, the lookup in the extent list is generally faster
+	 * than going out to the shared extent tree.
+	 */
+	if (xfs_is_cow_inode(ip)) {
+		if (!ip->i_cowfp) {
+			ASSERT(!xfs_is_reflink_inode(ip));
+			xfs_ifork_init_cow(ip);
+		}
+		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+				&ccur, &cmap);
+		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+			trace_xfs_reflink_cow_found(ip, &cmap);
+			whichfork = XFS_COW_FORK;
+			goto done;
+		}
+	}
+
+	if (imap.br_startoff <= offset_fsb) {
+		/*
+		 * For reflink files we may need a delalloc reservation when
+		 * overwriting shared extents.  This includes zeroing of
+		 * existing extents that contain data.
+		 */
+		if (!xfs_is_cow_inode(ip) ||
+		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
 		}
 
-		trace_xfs_iomap_found(ip, offset, count, 0, &got);
-		goto done;
+		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+		/* Trim the mapping to the nearest shared extent boundary. */
+		error = xfs_inode_need_cow(ip, &imap, &shared);
+		if (error)
+			goto out_unlock;
+
+		/* Not shared?  Just report the (potentially capped) extent. */
+		if (!shared) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
+		}
+
+		/*
+		 * Fork all the shared blocks from our write offset until the
+		 * end of the extent.
+		 */
+		whichfork = XFS_COW_FORK;
+		end_fsb = imap.br_startoff + imap.br_blockcount;
+	} else {
+		/*
+		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done here somewhat
+		 * symmetric with the work writeback does.  This is a completely
+		 * arbitrary number pulled out of thin air; with 4 KiB pages the
+		 * 1024-page cap below works out to 4 MiB per mapping.
+		 *
+		 * Note that the value needs to be less than 32 bits wide until
+		 * the lower level functions are updated.
+		 */
+		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+		if (xfs_is_always_cow_inode(ip))
+			whichfork = XFS_COW_FORK;
 	}
 
 	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		goto out_unlock;
 
-	/*
-	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
-	 * to keep the chunks of work done where somewhat symmetric with the
-	 * work writeback does. This is a completely arbitrary number pulled
-	 * out of thin air as a best guess for initial testing.
-	 *
-	 * Note that the values needs to be less than 32-bits wide until
-	 * the lower level functions are updated.
-	 */
-	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
 	if (eof) {
-		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
-				&icur);
+		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+				count, &icur);
 		if (prealloc_blocks) {
 			xfs_extlen_t	align;
 			xfs_off_t	end_offset;
@@ -594,9 +682,11 @@
 	}
 
 retry:
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-			end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
-			eof);
+	error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+			end_fsb - offset_fsb, prealloc_blocks,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap,
+			whichfork == XFS_DATA_FORK ? &icur : &ccur,
+			whichfork == XFS_DATA_FORK ? eof : cow_eof);
 	switch (error) {
 	case 0:
 		break;
@@ -618,190 +708,26 @@
 	 * them out if the write happens to fail.
 	 */
 	iomap->flags |= IOMAP_F_NEW;
-	trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap);
 done:
-	if (isnullstartblock(got.br_startblock))
-		got.br_startblock = DELAYSTARTBLOCK;
-
-	if (!got.br_startblock) {
-		error = xfs_alert_fsblock_zero(ip, &got);
-		if (error)
+	if (whichfork == XFS_COW_FORK) {
+		if (imap.br_startoff > offset_fsb) {
+			xfs_trim_extent(&cmap, offset_fsb,
+					imap.br_startoff - offset_fsb);
+			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
 			goto out_unlock;
+		}
+		/* ensure we only report blocks we have a reservation for */
+		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+		shared = true;
 	}
-
-	xfs_bmbt_to_iomap(ip, iomap, &got);
-
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
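
Condensed, the rewritten delalloc path makes a three-way fork choice before reserving anything: an existing COW extent wins, an unshared data extent is reported as found, and shared data (or an always_cow inode) routes new reservations to the COW fork. A standalone sketch of just that decision, with booleans standing in for the extent lookups (hypothetical helper, not kernel code):

```c
#include <stdbool.h>
#include <stdio.h>

enum fork { DATA_FORK, COW_FORK };

/*
 * Mirror of the decision order above: an existing COW extent wins,
 * then an unshared data extent, then shared data falls through to
 * the COW fork; a hole goes to the COW fork only for always_cow.
 */
static enum fork pick_fork(bool cow_hit, bool data_hit, bool shared,
			   bool always_cow)
{
	if (cow_hit)
		return COW_FORK;	/* reuse the existing COW reservation */
	if (data_hit && !shared)
		return DATA_FORK;	/* plain overwrite, report as found */
	if (data_hit)
		return COW_FORK;	/* fork the shared blocks */
	return always_cow ? COW_FORK : DATA_FORK;	/* hole: new delalloc */
}

int main(void)
{
	printf("%d %d %d\n",
			pick_fork(true, false, false, false),
			pick_fork(false, true, false, false),
			pick_fork(false, true, true, false));
	return 0;
}
```
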
 
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
-	xfs_inode_t	*ip,
-	int		whichfork,
-	xfs_off_t	offset,
-	xfs_bmbt_irec_t *imap,
-	unsigned int	*cow_seq)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
-	xfs_fileoff_t	offset_fsb, last_block;
-	xfs_fileoff_t	end_fsb, map_start_fsb;
-	xfs_filblks_t	count_fsb;
-	xfs_trans_t	*tp;
-	int		nimaps;
-	int		error = 0;
-	int		flags = XFS_BMAPI_DELALLOC;
-	int		nres;
-
-	if (whichfork == XFS_COW_FORK)
-		flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
-	/*
-	 * Make sure that the dquots are there.
-	 */
-	error = xfs_qm_dqattach(ip);
-	if (error)
-		return error;
-
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = imap->br_blockcount;
-	map_start_fsb = imap->br_startoff;
-
-	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
-	while (count_fsb != 0) {
-		/*
-		 * Set up a transaction with which to allocate the
-		 * backing store for the file.  Do allocations in a
-		 * loop until we get some space in the range we are
-		 * interested in.  The other space that might be allocated
-		 * is in the delayed allocation extent on which we sit
-		 * but before our buffer starts.
-		 */
-		nimaps = 0;
-		while (nimaps == 0) {
-			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-			/*
-			 * We have already reserved space for the extent and any
-			 * indirect blocks when creating the delalloc extent,
-			 * there is no need to reserve space in this transaction
-			 * again.
-			 */
-			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
-					0, XFS_TRANS_RESERVE, &tp);
-			if (error)
-				return error;
-
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, 0);
-
-			/*
-			 * it is possible that the extents have changed since
-			 * we did the read call as we dropped the ilock for a
-			 * while. We have to be careful about truncates or hole
-			 * punchs here - we are not allowed to allocate
-			 * non-delalloc blocks here.
-			 *
-			 * The only protection against truncation is the pages
-			 * for the range we are being asked to convert are
-			 * locked and hence a truncate will block on them
-			 * first.
-			 *
-			 * As a result, if we go beyond the range we really
-			 * need and hit an delalloc extent boundary followed by
-			 * a hole while we have excess blocks in the map, we
-			 * will fill the hole incorrectly and overrun the
-			 * transaction reservation.
-			 *
-			 * Using a single map prevents this as we are forced to
-			 * check each map we look for overlap with the desired
-			 * range and abort as soon as we find it. Also, given
-			 * that we only return a single map, having one beyond
-			 * what we can return is probably a bit silly.
-			 *
-			 * We also need to check that we don't go beyond EOF;
-			 * this is a truncate optimisation as a truncate sets
-			 * the new file size before block on the pages we
-			 * currently have locked under writeback. Because they
-			 * are about to be tossed, we don't need to write them
-			 * back....
-			 */
-			nimaps = 1;
-			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-			error = xfs_bmap_last_offset(ip, &last_block,
-							XFS_DATA_FORK);
-			if (error)
-				goto trans_cancel;
-
-			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
-			if ((map_start_fsb + count_fsb) > last_block) {
-				count_fsb = last_block - map_start_fsb;
-				if (count_fsb == 0) {
-					error = -EAGAIN;
-					goto trans_cancel;
-				}
-			}
-
-			/*
-			 * From this point onwards we overwrite the imap
-			 * pointer that the caller gave to us.
-			 */
-			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, flags, nres, imap,
-						&nimaps);
-			if (error)
-				goto trans_cancel;
-
-			error = xfs_trans_commit(tp);
-			if (error)
-				goto error0;
-
-			if (whichfork == XFS_COW_FORK)
-				*cow_seq = READ_ONCE(ifp->if_seq);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-
-		/*
-		 * See if we were able to allocate an extent that
-		 * covers at least part of the callers request
-		 */
-		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_alert_fsblock_zero(ip, imap);
-
-		if ((offset_fsb >= imap->br_startoff) &&
-		    (offset_fsb < (imap->br_startoff +
-				   imap->br_blockcount))) {
-			XFS_STATS_INC(mp, xs_xstrat_quick);
-			return 0;
-		}
-
-		/*
-		 * So far we have not mapped the requested part of the
-		 * file, just surrounding data, try again.
-		 */
-		count_fsb -= imap->br_blockcount;
-		map_start_fsb = imap->br_startoff + imap->br_blockcount;
-	}
-
-trans_cancel:
-	xfs_trans_cancel(tp);
-error0:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-}
-
 int
 xfs_iomap_write_unwritten(
 	xfs_inode_t	*ip,
@@ -850,7 +776,7 @@
 		 * complete here and might deadlock on the iolock.
 		 */
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
-				XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+				XFS_TRANS_RESERVE, &tp);
 		if (error)
 			return error;
 
@@ -888,7 +814,7 @@
 		if (error)
 			return error;
 
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
 			return xfs_alert_fsblock_zero(ip, &imap);
 
 		if ((numblks_fsb = imap.br_blockcount) == 0) {
@@ -946,7 +872,7 @@
 	 * COW writes may allocate delalloc space or convert unwritten COW
 	 * extents, so we need to make sure to take the lock exclusively here.
 	 */
-	if (xfs_is_reflink_inode(ip) && is_write) {
+	if (xfs_is_cow_inode(ip) && is_write) {
 		/*
 		 * FIXME: It could still overwrite on unshared extents and not
 		 * need allocation.
@@ -980,7 +906,7 @@
 	 * check, so if we got ILOCK_SHARED for a write but we're now a
 	 * reflink inode we have to switch to ILOCK_EXCL and relock.
 	 */
-	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
 		xfs_iunlock(ip, mode);
 		mode = XFS_ILOCK_EXCL;
 		goto relock;
@@ -1003,16 +929,17 @@
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			nimaps = 1, error = 0;
-	bool			shared = false, trimmed = false;
+	bool			shared = false;
 	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
 			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		/* Reserve delalloc blocks for regular writeback. */
-		return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
+		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+				iomap);
 	}
 
 	/*
@@ -1038,8 +965,7 @@
 
 	if (flags & IOMAP_REPORT) {
 		/* Trim the mapping to the nearest shared extent boundary. */
-		error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
-				&trimmed);
+		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
 		if (error)
 			goto out_unlock;
 	}
@@ -1052,23 +978,33 @@
 	 * Break shared extents if necessary. Checks for non-blocking IO have
 	 * been done up front, so we don't need to do them here.
 	 */
-	if (xfs_is_reflink_inode(ip)) {
+	if (xfs_is_cow_inode(ip)) {
+		struct xfs_bmbt_irec	cmap;
+		bool			directio = (flags & IOMAP_DIRECT);
+
 		/* if zeroing doesn't need COW allocation, then we are done. */
 		if ((flags & IOMAP_ZERO) &&
 		    !needs_cow_for_zeroing(&imap, nimaps))
 			goto out_found;
 
-		if (flags & IOMAP_DIRECT) {
-			/* may drop and re-acquire the ilock */
-			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
-					&lockmode);
-			if (error)
-				goto out_unlock;
-		} else {
-			error = xfs_reflink_reserve_cow(ip, &imap, &shared);
-			if (error)
-				goto out_unlock;
-		}
+		/* may drop and re-acquire the ilock */
+		cmap = imap;
+		error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+				directio);
+		if (error)
+			goto out_unlock;
+
+		/*
+		 * For buffered writes we need to report the address of the
+		 * previous block (if there was any) so that the higher level
+		 * write code can perform read-modify-write operations; we
+		 * won't need the CoW fork mapping until writeback.  For direct
+		 * I/O, which must be block aligned, we need to report the
+		 * newly allocated address.  If the data fork has a hole, copy
+		 * the COW fork mapping to avoid allocating to the data fork.
+		 */
+		if (directio || imap.br_startblock == HOLESTARTBLOCK)
+			imap = cmap;
 
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1110,23 +1046,15 @@
 		return error;
 
 	iomap->flags |= IOMAP_F_NEW;
-	trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 
 out_finish:
-	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
-				& ~XFS_ILOG_TIMESTAMP))
-		iomap->flags |= IOMAP_F_DIRTY;
-
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
-
-	if (shared)
-		iomap->flags |= IOMAP_F_SHARED;
-	return 0;
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 
 out_found:
 	ASSERT(nimaps);
 	xfs_iunlock(ip, lockmode);
-	trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
 	goto out_finish;
 
 out_unlock:
@@ -1212,6 +1140,92 @@
 };
 
 static int
+xfs_seek_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
+	xfs_fileoff_t		cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	imap, cmap;
+	int			error = 0;
+	unsigned		lockmode;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	lockmode = xfs_ilock_data_map_shared(ip);
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+		/*
+		 * If we found a data extent we are done.
+		 */
+		if (imap.br_startoff <= offset_fsb)
+			goto done;
+		data_fsb = imap.br_startoff;
+	} else {
+		/*
+		 * Fake a hole until the end of the file.
+		 */
+		data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+			       XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+	}
+
+	/*
+	 * If a COW fork extent covers the hole, report it - capped to the next
+	 * data fork extent:
+	 */
+	if (xfs_inode_has_cow_data(ip) &&
+	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+		cow_fsb = cmap.br_startoff;
+	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+		if (data_fsb < cow_fsb + cmap.br_blockcount)
+			end_fsb = min(end_fsb, data_fsb);
+		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+		/*
+		 * This is a COW extent, so we must probe the page cache
+		 * because there could be dirty page cache being backed
+		 * by this extent.
+		 */
+		iomap->type = IOMAP_UNWRITTEN;
+		goto out_unlock;
+	}
+
+	/*
+	 * Else report a hole, capped to the next found data or COW extent.
+	 */
+	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+		imap.br_blockcount = cow_fsb - offset_fsb;
+	else
+		imap.br_blockcount = data_fsb - offset_fsb;
+	imap.br_startoff = offset_fsb;
+	imap.br_startblock = HOLESTARTBLOCK;
+	imap.br_state = XFS_EXT_NORM;
+done:
+	xfs_trim_extent(&imap, offset_fsb, end_fsb);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return error;
+}
+
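+/*
+ * Usage sketch (hedged: the actual wiring lives in xfs_file_llseek(), outside
+ * this hunk).  SEEK_HOLE/SEEK_DATA callers hand these ops to the generic
+ * iomap helpers, e.g.:
+ *
+ *	offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
+ *	offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
+ *
+ * so each iomap_begin call reports data, dirty-pagecache-backed COW blocks
+ * (as IOMAP_UNWRITTEN), or a hole trimmed to the next extent.
+ */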
+const struct iomap_ops xfs_seek_iomap_ops = {
+	.iomap_begin		= xfs_seek_iomap_begin,
+};
+
+static int
 xfs_xattr_iomap_begin(
 	struct inode		*inode,
 	loff_t			offset,
@@ -1244,12 +1258,10 @@
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 
-	if (!error) {
-		ASSERT(nimaps);
-		xfs_bmbt_to_iomap(ip, iomap, &imap);
-	}
-
-	return error;
+	if (error)
+		return error;
+	ASSERT(nimaps);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c617054..5c2f6aa 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
-			struct xfs_bmbt_irec *, unsigned int *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-		struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+		struct xfs_bmbt_irec *, bool shared);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 static inline xfs_filblks_t
@@ -42,6 +40,7 @@
 }
 
 extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7..fe285d1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -10,30 +10,20 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
 #include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_acl.h"
 #include "xfs_quota.h"
-#include "xfs_error.h"
 #include "xfs_attr.h"
 #include "xfs_trans.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_trans_space.h"
 #include "xfs_iomap.h"
-#include "xfs_defer.h"
 
-#include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/iomap.h>
-#include <linux/slab.h>
 #include <linux/iversion.h>
 
 /*
@@ -191,9 +181,18 @@
 
 	xfs_setup_iops(ip);
 
-	if (tmpfile)
+	if (tmpfile) {
+		/*
+		 * The VFS requires that any inode fed to d_tmpfile must have
+		 * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+		 * However, we created the temp file with nlink == 0 because
+		 * we're not allowed to put an inode with nlink > 0 on the
+		 * unlinked list.  Therefore we have to set nlink to 1 so that
+		 * d_tmpfile can immediately set it back to zero.
+		 */
+		set_nlink(inode, 1);
 		d_tmpfile(dentry, inode);
-	else
+	} else
 		d_instantiate(dentry, inode);
 
 	xfs_finish_inode_setup(ip);
@@ -522,6 +521,10 @@
 		}
 	}
 
+	/*
+	 * Note: If you add another clause to set an attribute flag, please
+	 * update attributes_mask below.
+	 */
 	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
 		stat->attributes |= STATX_ATTR_IMMUTABLE;
 	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +532,10 @@
 	if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
 		stat->attributes |= STATX_ATTR_NODUMP;
 
+	stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+				  STATX_ATTR_APPEND |
+				  STATX_ATTR_NODUMP);
+
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFBLK:
 	case S_IFCHR:
@@ -786,6 +793,7 @@
 
 out_cancel:
 	xfs_trans_cancel(tp);
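+	/* xfs_trans_cancel() does not drop the ILOCK here, so unlock it. */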
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_dqrele:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e9508ba..884950a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -14,45 +14,66 @@
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_health.h"
 
 /*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
+ * Bulk Stat
+ * =========
+ *
+ * Use the inode walking functions to fill out struct xfs_bulkstat for every
+ * allocated inode, then pass the stat information to some externally provided
+ * iteration function.
  */
-int
+
+struct xfs_bstat_chunk {
+	bulkstat_one_fmt_pf	formatter;
+	struct xfs_ibulk	*breq;
+	struct xfs_bulkstat	*buf;
+};
+
+/*
+ * Fill out the bulkstat info for a single inode and report it somewhere.
+ *
+ * bc->breq->startino is effectively the inode cursor as we walk through the
+ * filesystem.  Therefore, we update it any time we need to move the cursor
+ * forward, regardless of whether or not we're sending any bstat information
+ * back to userspace.  If the inode is internal metadata or has been freed
+ * out from under us, we simply keep going.
+ *
+ * However, if any other type of error happens we want to stop right where we
+ * are so that userspace will call back with the exact number of the bad inode
+ * we can send back an error code.
+ *
+ * Note that if the formatter tells us there's no space left in the buffer we
+ * move the cursor forward and abort the walk.
+ */
+STATIC int
 xfs_bulkstat_one_int(
-	struct xfs_mount	*mp,		/* mount point for filesystem */
-	xfs_ino_t		ino,		/* inode to get data for */
-	void __user		*buffer,	/* buffer to place output in */
-	int			ubsize,		/* size of buffer */
-	bulkstat_one_fmt_pf	formatter,	/* formatter, copy to user */
-	int			*ubused,	/* bytes used by me */
-	int			*stat)		/* BULKSTAT_RV_... */
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	struct xfs_bstat_chunk	*bc)
 {
 	struct xfs_icdinode	*dic;		/* dinode core info pointer */
 	struct xfs_inode	*ip;		/* incore inode pointer */
 	struct inode		*inode;
-	struct xfs_bstat	*buf;		/* return buffer */
-	int			error = 0;	/* error value */
+	struct xfs_bulkstat	*buf = bc->buf;
+	int			error = -EINVAL;
 
-	*stat = BULKSTAT_RV_NOTHING;
+	if (xfs_internal_inum(mp, ino))
+		goto out_advance;
 
-	if (!buffer || xfs_internal_inum(mp, ino))
-		return -EINVAL;
-
-	buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
-	if (!buf)
-		return -ENOMEM;
-
-	error = xfs_iget(mp, NULL, ino,
+	error = xfs_iget(mp, tp, ino,
 			 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
 			 XFS_ILOCK_SHARED, &ip);
+	if (error == -ENOENT || error == -EINVAL)
+		goto out_advance;
 	if (error)
-		goto out_free;
+		goto out;
 
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_imap.im_blkno != 0);
@@ -63,36 +84,35 @@
 	/* xfs_iget returns the following without needing
 	 * further change.
 	 */
-	buf->bs_projid_lo = dic->di_projid_lo;
-	buf->bs_projid_hi = dic->di_projid_hi;
+	buf->bs_projectid = xfs_get_projid(ip);
 	buf->bs_ino = ino;
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
 
 	buf->bs_nlink = inode->i_nlink;
-	buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
-	buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
-	buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
-	buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
-	buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
-	buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+	buf->bs_atime = inode->i_atime.tv_sec;
+	buf->bs_atime_nsec = inode->i_atime.tv_nsec;
+	buf->bs_mtime = inode->i_mtime.tv_sec;
+	buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
+	buf->bs_ctime = inode->i_ctime.tv_sec;
+	buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+	buf->bs_btime = dic->di_crtime.t_sec;
+	buf->bs_btime_nsec = dic->di_crtime.t_nsec;
 	buf->bs_gen = inode->i_generation;
 	buf->bs_mode = inode->i_mode;
 
 	buf->bs_xflags = xfs_ip2xflags(ip);
-	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+	buf->bs_extsize_blks = dic->di_extsize;
 	buf->bs_extents = dic->di_nextents;
-	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
-	buf->bs_dmevmask = dic->di_dmevmask;
-	buf->bs_dmstate = dic->di_dmstate;
+	xfs_bulkstat_health(ip, buf);
 	buf->bs_aextents = dic->di_anextents;
 	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+	buf->bs_version = XFS_BULKSTAT_VERSION_V5;
 
 	if (dic->di_version == 3) {
 		if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
-			buf->bs_cowextsize = dic->di_cowextsize <<
-					mp->m_sb.sb_blocklog;
+			buf->bs_cowextsize_blks = dic->di_cowextsize;
 	}
 
 	switch (dic->di_format) {
@@ -116,387 +136,121 @@
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	xfs_irele(ip);
 
-	error = formatter(buffer, ubsize, ubused, buf);
-	if (!error)
-		*stat = BULKSTAT_RV_DIDONE;
+	error = bc->formatter(bc->breq, buf);
+	if (error == -ECANCELED)
+		goto out_advance;
+	if (error)
+		goto out;
 
- out_free:
-	kmem_free(buf);
+out_advance:
+	/*
+	 * Advance the cursor to the inode that comes after the one we just
+	 * looked at.  We want the caller to move along if the bulkstat
+	 * information was copied successfully; if we tried to grab the inode
+	 * but it's no longer allocated; or if it's internal metadata.
+	 */
+	bc->breq->startino = ino + 1;
+out:
 	return error;
 }
 
-/* Return 0 on success or positive error */
-STATIC int
-xfs_bulkstat_one_fmt(
-	void			__user *ubuffer,
-	int			ubsize,
-	int			*ubused,
-	const xfs_bstat_t	*buffer)
-{
-	if (ubsize < sizeof(*buffer))
-		return -ENOMEM;
-	if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
-		return -EFAULT;
-	if (ubused)
-		*ubused = sizeof(*buffer);
-	return 0;
-}
-
+/* Bulkstat a single inode. */
 int
 xfs_bulkstat_one(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* buffer to place output in */
-	int		ubsize,		/* size of buffer */
-	int		*ubused,	/* bytes used by me */
-	int		*stat)		/* BULKSTAT_RV_... */
+	struct xfs_ibulk	*breq,
+	bulkstat_one_fmt_pf	formatter)
 {
-	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt, ubused, stat);
-}
+	struct xfs_bstat_chunk	bc = {
+		.formatter	= formatter,
+		.breq		= breq,
+	};
+	int			error;
 
-/*
- * Loop over all clusters in a chunk for a given incore inode allocation btree
- * record.  Do a readahead if there are any allocated inodes in that cluster.
- */
-STATIC void
-xfs_bulkstat_ichunk_ra(
-	struct xfs_mount		*mp,
-	xfs_agnumber_t			agno,
-	struct xfs_inobt_rec_incore	*irec)
-{
-	xfs_agblock_t			agbno;
-	struct blk_plug			plug;
-	int				blks_per_cluster;
-	int				inodes_per_cluster;
-	int				i;	/* inode chunk index */
+	ASSERT(breq->icount == 1);
 
-	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+			KM_MAYFAIL);
+	if (!bc.buf)
+		return -ENOMEM;
 
-	blk_start_plug(&plug);
-	for (i = 0; i < XFS_INODES_PER_CHUNK;
-	     i += inodes_per_cluster, agbno += blks_per_cluster) {
-		if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
-			xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
-					     &xfs_inode_buf_ops);
-		}
-	}
-	blk_finish_plug(&plug);
-}
+	error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc);
 
-/*
- * Lookup the inode chunk that the given inode lives in and then get the record
- * if we found the chunk.  If the inode was not the last in the chunk and there
- * are some left allocated, update the data for the pointed-to record as well as
- * return the count of grabbed inodes.
- */
-STATIC int
-xfs_bulkstat_grab_ichunk(
-	struct xfs_btree_cur		*cur,	/* btree cursor */
-	xfs_agino_t			agino,	/* starting inode of chunk */
-	int				*icount,/* return # of inodes grabbed */
-	struct xfs_inobt_rec_incore	*irec)	/* btree record */
-{
-	int				idx;	/* index into inode chunk */
-	int				stat;
-	int				error = 0;
-
-	/* Lookup the inode chunk that this inode lives in */
-	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
-	if (error)
-		return error;
-	if (!stat) {
-		*icount = 0;
-		return error;
-	}
-
-	/* Get the record, should always work */
-	error = xfs_inobt_get_rec(cur, irec, &stat);
-	if (error)
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
-
-	/* Check if the record contains the inode in request */
-	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
-		*icount = 0;
-		return 0;
-	}
-
-	idx = agino - irec->ir_startino + 1;
-	if (idx < XFS_INODES_PER_CHUNK &&
-	    (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
-		int	i;
-
-		/* We got a right chunk with some left inodes allocated at it.
-		 * Grab the chunk record.  Mark all the uninteresting inodes
-		 * free -- because they're before our start point.
-		 */
-		for (i = 0; i < idx; i++) {
-			if (XFS_INOBT_MASK(i) & ~irec->ir_free)
-				irec->ir_freecount++;
-		}
-
-		irec->ir_free |= xfs_inobt_maskn(0, idx);
-		*icount = irec->ir_count - irec->ir_freecount;
-	}
-
-	return 0;
-}
-
-#define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
-
-struct xfs_bulkstat_agichunk {
-	char		__user **ac_ubuffer;/* pointer into user's buffer */
-	int		ac_ubleft;	/* bytes left in user's buffer */
-	int		ac_ubelem;	/* spaces used in user's buffer */
-};
-
-/*
- * Process inodes in chunk with a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- */
-static int
-xfs_bulkstat_ag_ichunk(
-	struct xfs_mount		*mp,
-	xfs_agnumber_t			agno,
-	struct xfs_inobt_rec_incore	*irbp,
-	bulkstat_one_pf			formatter,
-	size_t				statstruct_size,
-	struct xfs_bulkstat_agichunk	*acp,
-	xfs_agino_t			*last_agino)
-{
-	char				__user **ubufp = acp->ac_ubuffer;
-	int				chunkidx;
-	int				error = 0;
-	xfs_agino_t			agino = irbp->ir_startino;
-
-	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
-	     chunkidx++, agino++) {
-		int		fmterror;
-		int		ubused;
-
-		/* inode won't fit in buffer, we are done */
-		if (acp->ac_ubleft < statstruct_size)
-			break;
-
-		/* Skip if this inode is free */
-		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
-			continue;
-
-		/* Get the inode and fill in a single buffer */
-		ubused = statstruct_size;
-		error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
-				  *ubufp, acp->ac_ubleft, &ubused, &fmterror);
-
-		if (fmterror == BULKSTAT_RV_GIVEUP ||
-		    (error && error != -ENOENT && error != -EINVAL)) {
-			acp->ac_ubleft = 0;
-			ASSERT(error);
-			break;
-		}
-
-		/* be careful not to leak error if at end of chunk */
-		if (fmterror == BULKSTAT_RV_NOTHING || error) {
-			error = 0;
-			continue;
-		}
-
-		*ubufp += ubused;
-		acp->ac_ubleft -= ubused;
-		acp->ac_ubelem++;
-	}
+	kmem_free(bc.buf);
 
 	/*
-	 * Post-update *last_agino. At this point, agino will always point one
-	 * inode past the last inode we processed successfully. Hence we
-	 * subtract that inode when setting the *last_agino cursor so that we
-	 * return the correct cookie to userspace. On the next bulkstat call,
-	 * the inode under the lastino cookie will be skipped as we have already
-	 * processed it here.
+	 * If we reported one inode to userspace then we abort because we hit
+	 * the end of the buffer.  Don't leak that back to userspace.
 	 */
-	*last_agino = agino - 1;
+	if (error == -ECANCELED)
+		error = 0;
 
 	return error;
 }
 
-/*
- * Return stat information in bulk (by-inode) for the filesystem.
- */
-int					/* error status */
-xfs_bulkstat(
-	xfs_mount_t		*mp,	/* mount point for filesystem */
-	xfs_ino_t		*lastinop, /* last inode returned */
-	int			*ubcountp, /* size of buffer/count returned */
-	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
-	size_t			statstruct_size, /* sizeof struct filling */
-	char			__user *ubuffer, /* buffer with inode stats */
-	int			*done)	/* 1 if there are more stats to get */
+static int
+xfs_bulkstat_iwalk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	void			*data)
 {
-	xfs_buf_t		*agbp;	/* agi header buffer */
-	xfs_agino_t		agino;	/* inode # in allocation group */
-	xfs_agnumber_t		agno;	/* allocation group number */
-	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
-	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
-	int			nirbuf;	/* size of irbuf */
-	int			ubcount; /* size of user's buffer */
-	struct xfs_bulkstat_agichunk ac;
-	int			error = 0;
+	int			error;
 
-	/*
-	 * Get the last inode value, see if there's nothing to do.
-	 */
-	agno = XFS_INO_TO_AGNO(mp, *lastinop);
-	agino = XFS_INO_TO_AGINO(mp, *lastinop);
-	if (agno >= mp->m_sb.sb_agcount ||
-	    *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
-		*done = 1;
-		*ubcountp = 0;
+	error = xfs_bulkstat_one_int(mp, tp, ino, data);
+	/* bulkstat just skips over missing inodes */
+	if (error == -ENOENT || error == -EINVAL)
 		return 0;
-	}
+	return error;
+}
 
-	ubcount = *ubcountp; /* statstruct's */
-	ac.ac_ubuffer = &ubuffer;
-	ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
-	ac.ac_ubelem = 0;
+/*
+ * Check the incoming startino parameter.
+ *
+ * We allow any inode value that could map to physical space inside the
+ * filesystem because if there are no inodes there, bulkstat moves on to the
+ * next chunk.  In other words, the magic agino value of zero takes us to the
+ * first chunk in the AG, and an agino value past the end of the AG takes us to
+ * the first chunk in the next AG.
+ *
+ * Therefore we can end early if the requested inode is beyond the end of the
+ * filesystem or doesn't map properly.
+ */
+static inline bool
+xfs_bulkstat_already_done(
+	struct xfs_mount	*mp,
+	xfs_ino_t		startino)
+{
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, startino);
 
-	*ubcountp = 0;
-	*done = 0;
+	return agno >= mp->m_sb.sb_agcount ||
+	       startino != XFS_AGINO_TO_INO(mp, agno, agino);
+}
 
-	irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP);
-	if (!irbuf)
+/* Return stat information in bulk (by-inode) for the filesystem. */
+int
+xfs_bulkstat(
+	struct xfs_ibulk	*breq,
+	bulkstat_one_fmt_pf	formatter)
+{
+	struct xfs_bstat_chunk	bc = {
+		.formatter	= formatter,
+		.breq		= breq,
+	};
+	int			error;
+
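+	/* Nothing to do if the cursor already points past the end of the fs. */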
+	if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+		return 0;
+
+	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+			KM_MAYFAIL);
+	if (!bc.buf)
 		return -ENOMEM;
-	nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf);
 
-	/*
-	 * Loop over the allocation groups, starting from the last
-	 * inode returned; 0 means start of the allocation group.
-	 */
-	while (agno < mp->m_sb.sb_agcount) {
-		struct xfs_inobt_rec_incore	*irbp = irbuf;
-		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
-		bool				end_of_ag = false;
-		int				icount = 0;
-		int				stat;
+	error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
+			xfs_bulkstat_iwalk, breq->icount, &bc);
 
-		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-		if (error)
-			break;
-		/*
-		 * Allocate and initialize a btree cursor for ialloc btree.
-		 */
-		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
-					    XFS_BTNUM_INO);
-		if (agino > 0) {
-			/*
-			 * In the middle of an allocation group, we need to get
-			 * the remainder of the chunk we're in.
-			 */
-			struct xfs_inobt_rec_incore	r;
-
-			error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
-			if (error)
-				goto del_cursor;
-			if (icount) {
-				irbp->ir_startino = r.ir_startino;
-				irbp->ir_holemask = r.ir_holemask;
-				irbp->ir_count = r.ir_count;
-				irbp->ir_freecount = r.ir_freecount;
-				irbp->ir_free = r.ir_free;
-				irbp++;
-			}
-			/* Increment to the next record */
-			error = xfs_btree_increment(cur, 0, &stat);
-		} else {
-			/* Start of ag.  Lookup the first inode chunk */
-			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
-		}
-		if (error || stat == 0) {
-			end_of_ag = true;
-			goto del_cursor;
-		}
-
-		/*
-		 * Loop through inode btree records in this ag,
-		 * until we run out of inodes or space in the buffer.
-		 */
-		while (irbp < irbufend && icount < ubcount) {
-			struct xfs_inobt_rec_incore	r;
-
-			error = xfs_inobt_get_rec(cur, &r, &stat);
-			if (error || stat == 0) {
-				end_of_ag = true;
-				goto del_cursor;
-			}
-
-			/*
-			 * If this chunk has any allocated inodes, save it.
-			 * Also start read-ahead now for this chunk.
-			 */
-			if (r.ir_freecount < r.ir_count) {
-				xfs_bulkstat_ichunk_ra(mp, agno, &r);
-				irbp->ir_startino = r.ir_startino;
-				irbp->ir_holemask = r.ir_holemask;
-				irbp->ir_count = r.ir_count;
-				irbp->ir_freecount = r.ir_freecount;
-				irbp->ir_free = r.ir_free;
-				irbp++;
-				icount += r.ir_count - r.ir_freecount;
-			}
-			error = xfs_btree_increment(cur, 0, &stat);
-			if (error || stat == 0) {
-				end_of_ag = true;
-				goto del_cursor;
-			}
-			cond_resched();
-		}
-
-		/*
-		 * Drop the btree buffers and the agi buffer as we can't hold any
-		 * of the locks these represent when calling iget. If there is a
-		 * pending error, then we are done.
-		 */
-del_cursor:
-		xfs_btree_del_cursor(cur, error);
-		xfs_buf_relse(agbp);
-		if (error)
-			break;
-		/*
-		 * Now format all the good inodes into the user's buffer. The
-		 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
-		 * for the next loop iteration.
-		 */
-		irbufend = irbp;
-		for (irbp = irbuf;
-		     irbp < irbufend && ac.ac_ubleft >= statstruct_size;
-		     irbp++) {
-			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
-					formatter, statstruct_size, &ac,
-					&agino);
-			if (error)
-				break;
-
-			cond_resched();
-		}
-
-		/*
-		 * If we've run out of space or had a formatting error, we
-		 * are now done
-		 */
-		if (ac.ac_ubleft < statstruct_size || error)
-			break;
-
-		if (end_of_ag) {
-			agno++;
-			agino = 0;
-		}
-	}
-	/*
-	 * Done, we're either out of filesystem or space to put the data.
-	 */
-	kmem_free(irbuf);
-	*ubcountp = ac.ac_ubelem;
+	kmem_free(bc.buf);
 
 	/*
 	 * We found some inodes, so clear the error status and return them.
@@ -505,135 +259,139 @@
 	 * triggered again and propagated to userspace as there will be no
 	 * formatted inodes in the buffer.
 	 */
-	if (ac.ac_ubelem)
+	if (breq->ocount > 0)
 		error = 0;
 
-	/*
-	 * If we ran out of filesystem, lastino will point off the end of
-	 * the filesystem so the next call will return immediately.
-	 */
-	*lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
-	if (agno >= mp->m_sb.sb_agcount)
-		*done = 1;
-
 	return error;
 }
 
-int
-xfs_inumbers_fmt(
-	void			__user *ubuffer, /* buffer to write to */
-	const struct xfs_inogrp	*buffer,	/* buffer to read from */
-	long			count,		/* # of elements to read */
-	long			*written)	/* # of bytes written */
+/* Convert bulkstat (v5) to bstat (v1). */
+void
+xfs_bulkstat_to_bstat(
+	struct xfs_mount		*mp,
+	struct xfs_bstat		*bs1,
+	const struct xfs_bulkstat	*bstat)
 {
-	if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
-		return -EFAULT;
-	*written = count * sizeof(*buffer);
-	return 0;
+	/* memset is needed here because of padding holes in the structure. */
+	memset(bs1, 0, sizeof(struct xfs_bstat));
+	bs1->bs_ino = bstat->bs_ino;
+	bs1->bs_mode = bstat->bs_mode;
+	bs1->bs_nlink = bstat->bs_nlink;
+	bs1->bs_uid = bstat->bs_uid;
+	bs1->bs_gid = bstat->bs_gid;
+	bs1->bs_rdev = bstat->bs_rdev;
+	bs1->bs_blksize = bstat->bs_blksize;
+	bs1->bs_size = bstat->bs_size;
+	bs1->bs_atime.tv_sec = bstat->bs_atime;
+	bs1->bs_mtime.tv_sec = bstat->bs_mtime;
+	bs1->bs_ctime.tv_sec = bstat->bs_ctime;
+	bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec;
+	bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec;
+	bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec;
+	bs1->bs_blocks = bstat->bs_blocks;
+	bs1->bs_xflags = bstat->bs_xflags;
+	bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
+	bs1->bs_extents = bstat->bs_extents;
+	bs1->bs_gen = bstat->bs_gen;
+	bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
+	bs1->bs_forkoff = bstat->bs_forkoff;
+	bs1->bs_projid_hi = bstat->bs_projectid >> 16;
+	bs1->bs_sick = bstat->bs_sick;
+	bs1->bs_checked = bstat->bs_checked;
+	bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks);
+	bs1->bs_dmevmask = 0;
+	bs1->bs_dmstate = 0;
+	bs1->bs_aextents = bstat->bs_aextents;
+}
+
+struct xfs_inumbers_chunk {
+	inumbers_fmt_pf		formatter;
+	struct xfs_ibulk	*breq;
+};
+
+/*
+ * INUMBERS
+ * ========
+ * This is how we export inode btree records to userspace, so that XFS tools
+ * can figure out where inodes are allocated.
+ */
+
+/*
+ * Format the inode group structure and report it somewhere.
+ *
+ * Similar to xfs_bulkstat_one_int, lastino is the inode cursor as we walk
+ * through the filesystem so we move it forward unless there was a runtime
+ * error.  If the formatter tells us the buffer is now full we also move the
+ * cursor forward and abort the walk.
+ */
+STATIC int
+xfs_inumbers_walk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	const struct xfs_inobt_rec_incore *irec,
+	void			*data)
+{
+	struct xfs_inumbers	inogrp = {
+		.xi_startino	= XFS_AGINO_TO_INO(mp, agno, irec->ir_startino),
+		.xi_alloccount	= irec->ir_count - irec->ir_freecount,
+		.xi_allocmask	= ~irec->ir_free,
+		.xi_version	= XFS_INUMBERS_VERSION_V5,
+	};
+	struct xfs_inumbers_chunk *ic = data;
+	int			error;
+
+	error = ic->formatter(ic->breq, &inogrp);
+	if (error && error != -ECANCELED)
+		return error;
+
+	ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
+			XFS_INODES_PER_CHUNK;
+	return error;
 }
 
 /*
  * Return inode number table for the filesystem.
  */
-int					/* error status */
+int
 xfs_inumbers(
-	struct xfs_mount	*mp,/* mount point for filesystem */
-	xfs_ino_t		*lastino,/* last inode returned */
-	int			*count,/* size of buffer/count returned */
-	void			__user *ubuffer,/* buffer with inode descriptions */
+	struct xfs_ibulk	*breq,
 	inumbers_fmt_pf		formatter)
 {
-	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, *lastino);
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, *lastino);
-	struct xfs_btree_cur	*cur = NULL;
-	struct xfs_buf		*agbp = NULL;
-	struct xfs_inogrp	*buffer;
-	int			bcount;
-	int			left = *count;
-	int			bufidx = 0;
+	struct xfs_inumbers_chunk ic = {
+		.formatter	= formatter,
+		.breq		= breq,
+	};
 	int			error = 0;
 
-	*count = 0;
-	if (agno >= mp->m_sb.sb_agcount ||
-	    *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
-		return error;
+	if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+		return 0;
 
-	bcount = min(left, (int)(PAGE_SIZE / sizeof(*buffer)));
-	buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP);
-	do {
-		struct xfs_inobt_rec_incore	r;
-		int				stat;
+	error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+			xfs_inumbers_walk, breq->icount, &ic);
 
-		if (!agbp) {
-			error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-			if (error)
-				break;
-
-			cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
-						    XFS_BTNUM_INO);
-			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
-						 &stat);
-			if (error)
-				break;
-			if (!stat)
-				goto next_ag;
-		}
-
-		error = xfs_inobt_get_rec(cur, &r, &stat);
-		if (error)
-			break;
-		if (!stat)
-			goto next_ag;
-
-		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino =
-			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
-		buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
-		buffer[bufidx].xi_allocmask = ~r.ir_free;
-		if (++bufidx == bcount) {
-			long	written;
-
-			error = formatter(ubuffer, buffer, bufidx, &written);
-			if (error)
-				break;
-			ubuffer += written;
-			*count += bufidx;
-			bufidx = 0;
-		}
-		if (!--left)
-			break;
-
-		error = xfs_btree_increment(cur, 0, &stat);
-		if (error)
-			break;
-		if (stat)
-			continue;
-
-next_ag:
-		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-		cur = NULL;
-		xfs_buf_relse(agbp);
-		agbp = NULL;
-		agino = 0;
-		agno++;
-	} while (agno < mp->m_sb.sb_agcount);
-
-	if (!error) {
-		if (bufidx) {
-			long	written;
-
-			error = formatter(ubuffer, buffer, bufidx, &written);
-			if (!error)
-				*count += bufidx;
-		}
-		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
-	}
-
-	kmem_free(buffer);
-	if (cur)
-		xfs_btree_del_cursor(cur, error);
-	if (agbp)
-		xfs_buf_relse(agbp);
+	/*
+	 * We found some inode groups, so clear the error status and return
+	 * them.  The lastino pointer will point directly at the inode that
+	 * triggered any error that occurred, so on the next call the error
+	 * will be triggered again and propagated to userspace as there will be
+	 * no formatted inode groups in the buffer.
+	 */
+	if (breq->ocount > 0)
+		error = 0;
 
 	return error;
 }
+
+/* Convert an inumbers (v5) struct to an inogrp (v1) struct. */
+void
+xfs_inumbers_to_inogrp(
+	struct xfs_inogrp		*ig1,
+	const struct xfs_inumbers	*ig)
+{
+	/* memset is needed here because of padding holes in the structure. */
+	memset(ig1, 0, sizeof(struct xfs_inogrp));
+	ig1->xi_startino = ig->xi_startino;
+	ig1->xi_alloccount = ig->xi_alloccount;
+	ig1->xi_allocmask = ig->xi_allocmask;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 8a82228..96a1e2a 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -5,83 +5,60 @@
 #ifndef __XFS_ITABLE_H__
 #define	__XFS_ITABLE_H__
 
-/*
- * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
- * structures (by the dmi library). This is a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
- */
-typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
-			       xfs_ino_t	ino,
-			       void		__user *buffer,
-			       int		ubsize,
-			       int		*ubused,
-			       int		*stat);
+/* In-memory representation of a userspace request for batch inode data. */
+struct xfs_ibulk {
+	struct xfs_mount	*mp;
+	void __user		*ubuffer; /* user output buffer */
+	xfs_ino_t		startino; /* start with this inode */
+	unsigned int		icount;   /* number of elements in ubuffer */
+	unsigned int		ocount;   /* number of records returned */
+	unsigned int		flags;    /* see XFS_IBULK_FLAG_* */
+};
+
+/* Only iterate within the same AG as startino */
+#define XFS_IBULK_SAME_AG	(XFS_IWALK_SAME_AG)
 
 /*
- * Values for stat return value.
+ * Advance the user buffer pointer by one record of the given size.  If the
+ * buffer is now full, return the appropriate error code.
  */
-#define BULKSTAT_RV_NOTHING	0
-#define BULKSTAT_RV_DIDONE	1
-#define BULKSTAT_RV_GIVEUP	2
+static inline int
+xfs_ibulk_advance(
+	struct xfs_ibulk	*breq,
+	size_t			bytes)
+{
+	char __user		*b = breq->ubuffer;
+
+	breq->ubuffer = b + bytes;
+	breq->ocount++;
+	return breq->ocount == breq->icount ? -ECANCELED : 0;
+}
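+
+/*
+ * For example, with icount == 2 the first record formatted advances ubuffer
+ * by one record and returns 0, while the second returns -ECANCELED to stop
+ * the walk with the buffer full.
+ */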
 
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
-int					/* error status */
-xfs_bulkstat(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	*lastino,	/* last inode returned */
-	int		*count,		/* size of buffer/count returned */
-	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
-	size_t		statstruct_size,/* sizeof struct that we're filling */
-	char		__user *ubuffer,/* buffer with inode stats */
-	int		*done);		/* 1 if there are more stats to get */
 
-typedef int (*bulkstat_one_fmt_pf)(  /* used size in bytes or negative error */
-	void			__user *ubuffer, /* buffer to write to */
-	int			ubsize,		 /* remaining user buffer sz */
-	int			*ubused,	 /* bytes used by formatter */
-	const xfs_bstat_t	*buffer);        /* buffer to read from */
+/*
+ * Return codes for the formatter function are 0 to continue iterating, and
+ * non-zero to stop iterating.  Any non-zero value will be passed up to the
+ * bulkstat/inumbers caller.  The special value -ECANCELED can be used to stop
+ * iteration, as neither bulkstat nor inumbers will ever generate that error
+ * code on their own.
+ */
 
-int
-xfs_bulkstat_one_int(
-	xfs_mount_t		*mp,
-	xfs_ino_t		ino,
-	void			__user *buffer,
-	int			ubsize,
-	bulkstat_one_fmt_pf	formatter,
-	int			*ubused,
-	int			*stat);
+typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
+		const struct xfs_bulkstat *bstat);
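+
+/*
+ * A representative formatter (a sketch; the names here are illustrative,
+ * though the v1-compat formatter in xfs_ioctl.c follows this shape) converts
+ * the v5 record, copies it out, and advances the cursor:
+ *
+ *	static int
+ *	example_bulkstat_fmt(
+ *		struct xfs_ibulk		*breq,
+ *		const struct xfs_bulkstat	*bstat)
+ *	{
+ *		struct xfs_bstat	bs1;
+ *
+ *		xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+ *		if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1)))
+ *			return -EFAULT;
+ *		return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat));
+ *	}
+ */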
 
-int
-xfs_bulkstat_one(
-	xfs_mount_t		*mp,
-	xfs_ino_t		ino,
-	void			__user *buffer,
-	int			ubsize,
-	int			*ubused,
-	int			*stat);
+int xfs_bulkstat_one(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+int xfs_bulkstat(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+void xfs_bulkstat_to_bstat(struct xfs_mount *mp, struct xfs_bstat *bs1,
+		const struct xfs_bulkstat *bstat);
 
-typedef int (*inumbers_fmt_pf)(
-	void			__user *ubuffer, /* buffer to write to */
-	const xfs_inogrp_t	*buffer,	/* buffer to read from */
-	long			count,		/* # of elements to read */
-	long			*written);	/* # of bytes written */
+typedef int (*inumbers_fmt_pf)(struct xfs_ibulk *breq,
+		const struct xfs_inumbers *igrp);
 
-int
-xfs_inumbers_fmt(
-	void			__user *ubuffer, /* buffer to write to */
-	const xfs_inogrp_t	*buffer,	/* buffer to read from */
-	long			count,		/* # of elements to read */
-	long			*written);	/* # of bytes written */
-
-int					/* error status */
-xfs_inumbers(
-	xfs_mount_t		*mp,	/* mount point for filesystem */
-	xfs_ino_t		*last,	/* last inode returned */
-	int			*count,	/* size of buffer/count returned */
-	void			__user *buffer, /* buffer with inode info */
-	inumbers_fmt_pf		formatter);
+int xfs_inumbers(struct xfs_ibulk *breq, inumbers_fmt_pf formatter);
+void xfs_inumbers_to_inogrp(struct xfs_inogrp *ig1,
+		const struct xfs_inumbers *ig);
 
 #endif	/* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 0000000..aa375cf
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+#include "xfs_pwork.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes.  For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data.  The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * -ECANCELED to stop the iteration.  This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full.  @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
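+/*
+ * A minimal walk function might look like this (illustrative only; the
+ * function name and counter are hypothetical):
+ *
+ *	static int
+ *	example_count_inodes(
+ *		struct xfs_mount	*mp,
+ *		struct xfs_trans	*tp,
+ *		xfs_ino_t		ino,
+ *		void			*data)
+ *	{
+ *		(*(uint64_t *)data)++;
+ *		return 0;
+ *	}
+ *
+ * driven by xfs_iwalk(mp, NULL, 0, 0, example_count_inodes, 0, &count).
+ */
+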
+struct xfs_iwalk_ag {
+	/* parallel work control data; will be null if single threaded */
+	struct xfs_pwork		pwork;
+
+	struct xfs_mount		*mp;
+	struct xfs_trans		*tp;
+
+	/* Where do we start the traversal? */
+	xfs_ino_t			startino;
+
+	/* Array of inobt records we cache. */
+	struct xfs_inobt_rec_incore	*recs;
+
+	/* Number of entries allocated for the @recs array. */
+	unsigned int			sz_recs;
+
+	/* Number of entries in the @recs array that are in use. */
+	unsigned int			nr_recs;
+
+	/* Inode walk function and data pointer. */
+	xfs_iwalk_fn			iwalk_fn;
+	xfs_inobt_walk_fn		inobt_walk_fn;
+	void				*data;
+
+	/*
+	 * Make it look like the inodes up to startino are free so that
+	 * bulkstat can start its inode iteration at the correct place without
+	 * needing to special case everywhere.
+	 */
+	unsigned int			trim_start:1;
+
+	/* Skip empty inobt records? */
+	unsigned int			skip_empty:1;
+};
+
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record.  Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_iwalk_ichunk_ra(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
+	xfs_agblock_t			agbno;
+	struct blk_plug			plug;
+	int				i;	/* inode chunk index */
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+
+	blk_start_plug(&plug);
+	for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
+		xfs_inofree_t	imask;
+
+		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
+		if (imask & ~irec->ir_free) {
+			xfs_btree_reada_bufs(mp, agno, agbno,
+					igeo->blocks_per_cluster,
+					&xfs_inode_buf_ops);
+		}
+		agbno += igeo->blocks_per_cluster;
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Set the bits in @irec's free mask that correspond to the inodes before
+ * @agino so that we skip them.  This is how we restart an inode walk that was
+ * interrupted in the middle of an inode record.
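+ *
+ * For example, restarting at agino == ir_startino + 3 sets the free-mask bits
+ * for the first three inodes of the chunk and bumps ir_freecount for each of
+ * them that was actually allocated, so the walk resumes at the fourth inode.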
+ */
+STATIC void
+xfs_iwalk_adjust_start(
+	xfs_agino_t			agino,	/* starting inode of chunk */
+	struct xfs_inobt_rec_incore	*irec)	/* btree record */
+{
+	int				idx;	/* index into inode chunk */
+	int				i;
+
+	idx = agino - irec->ir_startino;
+
+	/*
+	 * We have the chunk record containing @agino, but some of the inodes
+	 * in it precede our start point.  Mark those uninteresting inodes
+	 * free so that the walk skips them.
+	 */
+	for (i = 0; i < idx; i++) {
+		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+			irec->ir_freecount++;
+	}
+
+	irec->ir_free |= xfs_inobt_maskn(0, idx);
+}
+
+/* Allocate memory for a walk. */
+STATIC int
+xfs_iwalk_alloc(
+	struct xfs_iwalk_ag	*iwag)
+{
+	size_t			size;
+
+	ASSERT(iwag->recs == NULL);
+	iwag->nr_recs = 0;
+
+	/* Allocate a prefetch buffer for inobt records. */
+	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
+	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+	if (iwag->recs == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Free memory we allocated for a walk. */
+STATIC void
+xfs_iwalk_free(
+	struct xfs_iwalk_ag	*iwag)
+{
+	kmem_free(iwag->recs);
+	iwag->recs = NULL;
+}
+
+/* For each inuse inode in each cached inobt record, call our function. */
+STATIC int
+xfs_iwalk_ag_recs(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	xfs_ino_t			ino;
+	unsigned int			i, j;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	for (i = 0; i < iwag->nr_recs; i++) {
+		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];
+
+		trace_xfs_iwalk_ag_rec(mp, agno, irec);
+
+		if (xfs_pwork_want_abort(&iwag->pwork))
+			return 0;
+
+		if (iwag->inobt_walk_fn) {
+			error = iwag->inobt_walk_fn(mp, tp, agno, irec,
+					iwag->data);
+			if (error)
+				return error;
+		}
+
+		if (!iwag->iwalk_fn)
+			continue;
+
+		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+			if (xfs_pwork_want_abort(&iwag->pwork))
+				return 0;
+
+			/* Skip if this inode is free */
+			if (XFS_INOBT_MASK(j) & irec->ir_free)
+				continue;
+
+			/* Otherwise call our function. */
+			ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+			if (error)
+				return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Delete cursor and let go of AGI. */
+static inline void
+xfs_iwalk_del_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			error)
+{
+	if (*curpp) {
+		xfs_btree_del_cursor(*curpp, error);
+		*curpp = NULL;
+	}
+	if (*agi_bpp) {
+		xfs_trans_brelse(tp, *agi_bpp);
+		*agi_bpp = NULL;
+	}
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record.  This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+	struct xfs_iwalk_ag	*iwag,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			*has_more)
+{
+	struct xfs_mount	*mp = iwag->mp;
+	struct xfs_trans	*tp = iwag->tp;
+	struct xfs_inobt_rec_incore *irec;
+	int			error;
+
+	/* Set up a fresh cursor and empty the inobt cache. */
+	iwag->nr_recs = 0;
+	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	/* Starting at the beginning of the AG?  That's easy! */
+	if (agino == 0)
+		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+	/*
+	 * Otherwise, we have to grab the inobt record where we left off, stuff
+	 * the record into our cache, and then see if there are more records.
+	 * We require a lookup cache of at least two elements so that the
+	 * caller doesn't have to deal with tearing down the cursor to walk the
+	 * records.
+	 */
+	error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
+	if (error)
+		return error;
+
+	/*
+	 * If the LE lookup at @agino yields no records, jump ahead to the
+	 * inobt cursor increment to see if there are more records to process.
+	 */
+	if (!*has_more)
+		goto out_advance;
+
+	/* Get the record, should always work */
+	irec = &iwag->recs[iwag->nr_recs];
+	error = xfs_inobt_get_rec(*curpp, irec, has_more);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+
+	/*
+	 * If the LE lookup yielded an inobt record before the cursor position,
+	 * skip it and see if there's another one after it.
+	 */
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+		goto out_advance;
+
+	/*
+	 * If agino fell in the middle of the inode record, make it look like
+	 * the inodes up to agino are free so that we don't return them again.
+	 */
+	if (iwag->trim_start)
+		xfs_iwalk_adjust_start(agino, irec);
+
+	/*
+	 * The prefetch calculation is supposed to give us a large enough inobt
+	 * record cache that this lookup can stage a partial first record and
+	 * the loop body can cache a record without having to check for cache
+	 * space until after it reads an inobt record.
+	 */
+	iwag->nr_recs++;
+	ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+out_advance:
+	return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+/*
+ * The inobt record cache is full, so preserve the inobt cursor state and
+ * run callbacks on the cached inobt records.  When we're done, restore the
+ * cursor state to wherever the cursor would have been had the cache not been
+ * full (and therefore we could've just incremented the cursor) if *@has_more
+ * is true.  On exit, *@has_more will indicate whether or not the caller should
+ * try for more inode records.
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+	struct xfs_iwalk_ag		*iwag,
+	xfs_agnumber_t			agno,
+	struct xfs_btree_cur		**curpp,
+	struct xfs_buf			**agi_bpp,
+	int				*has_more)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_inobt_rec_incore	*irec;
+	xfs_agino_t			restart;
+	int				error;
+
+	ASSERT(iwag->nr_recs > 0);
+
+	/* Delete cursor but remember the last record we cached... */
+	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+	irec = &iwag->recs[iwag->nr_recs - 1];
+	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+	error = xfs_iwalk_ag_recs(iwag);
+	if (error)
+		return error;
+
+	/* ...empty the cache... */
+	iwag->nr_recs = 0;
+
+	if (!has_more)
+		return 0;
+
+	/* ...and recreate the cursor just past where we left off. */
+	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+}
+
+/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+STATIC int
+xfs_iwalk_ag(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_buf			*agi_bp = NULL;
+	struct xfs_btree_cur		*cur = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agino_t			agino;
+	int				has_more;
+	int				error = 0;
+
+	/* Set up our cursor at the right place in the inode btree. */
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+
+	while (!error && has_more) {
+		struct xfs_inobt_rec_incore	*irec;
+
+		cond_resched();
+		if (xfs_pwork_want_abort(&iwag->pwork))
+			goto out;
+
+		/* Fetch the inobt record. */
+		irec = &iwag->recs[iwag->nr_recs];
+		error = xfs_inobt_get_rec(cur, irec, &has_more);
+		if (error || !has_more)
+			break;
+
+		/* No allocated inodes in this chunk; skip it. */
+		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error)
+				break;
+			continue;
+		}
+
+		/*
+		 * Start readahead for this inode chunk in anticipation of
+		 * walking the inodes.
+		 */
+		if (iwag->iwalk_fn)
+			xfs_iwalk_ichunk_ra(mp, agno, irec);
+
+		/*
+		 * If there's space in the buffer for more records, increment
+		 * the btree cursor and grab more.
+		 */
+		if (++iwag->nr_recs < iwag->sz_recs) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error || !has_more)
+				break;
+			continue;
+		}
+
+		/*
+		 * Otherwise, we need to save cursor state and run the callback
+		 * function on the cached records.  The run_callbacks function
+		 * is supposed to return a cursor pointing to the record where
+		 * we would be if we had been able to increment like above.
+		 */
+		ASSERT(has_more);
+		error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
+				&has_more);
+	}
+
+	if (iwag->nr_recs == 0 || error)
+		goto out;
+
+	/* Walk the unprocessed records in the cache. */
+	error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+
+out:
+	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+	return error;
+}
+
+/*
+ * We experimentally determined that the reduction in ioctl call overhead
+ * diminishes when userspace asks for more than 2048 inodes, so we'll cap
+ * prefetch at this point.
+ */
+#define IWALK_MAX_INODE_PREFETCH	(2048U)
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead.  Set the maximum if @inodes == 0.
+ */
+static inline unsigned int
+xfs_iwalk_prefetch(
+	unsigned int		inodes)
+{
+	unsigned int		inobt_records;
+
+	/*
+	 * If the caller didn't tell us the number of inodes they wanted,
+	 * assume the maximum prefetch possible for best performance.
+	 * Otherwise, cap prefetch at that maximum so that we don't start an
+	 * absurd amount of prefetch.
+	 */
+	if (inodes == 0)
+		inodes = IWALK_MAX_INODE_PREFETCH;
+	inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
+
+	/* Round the inode count up to a full chunk. */
+	inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
+
+	/*
+	 * In order to convert the number of inodes to prefetch into an
+	 * estimate of the number of inobt records to cache, we require a
+	 * conversion factor that reflects our expectations of the average
+	 * loading factor of an inode chunk.  Based on data gathered, most
+	 * (but not all) filesystems manage to keep the inode chunks totally
+	 * full, so we'll underestimate slightly so that our readahead will
+	 * still deliver the performance we want on aging filesystems:
+	 *
+	 * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
+	 *
+	 * The funny math is to avoid integer division.
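+	 *
+	 * For example, a request for 1024 inodes yields (1024 * 5) / (4 * 64)
+	 * = 20 cached records, enough to cover 1280 inodes at the assumed 80%
+	 * average chunk loading.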
+	 */
+	inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
+
+	/*
+	 * Allocate enough space to prefetch at least two inobt records so that
+	 * we can cache both the record where the iwalk started and the next
+	 * record.  This simplifies the AG inode walk loop setup code.
+	 */
+	return max(inobt_records, 2U);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data.  @max_prefetch controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		startino,
+	unsigned int		flags,
+	xfs_iwalk_fn		iwalk_fn,
+	unsigned int		inode_records,
+	void			*data)
+{
+	struct xfs_iwalk_ag	iwag = {
+		.mp		= mp,
+		.tp		= tp,
+		.iwalk_fn	= iwalk_fn,
+		.data		= data,
+		.startino	= startino,
+		.sz_recs	= xfs_iwalk_prefetch(inode_records),
+		.trim_start	= 1,
+		.skip_empty	= 1,
+		.pwork		= XFS_PWORK_SINGLE_THREADED,
+	};
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+	error = xfs_iwalk_alloc(&iwag);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_iwalk_ag(&iwag);
+		if (error)
+			break;
+		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+		if (flags & XFS_IWALK_SAME_AG)
+			break;
+	}
+
+	xfs_iwalk_free(&iwag);
+	return error;
+}
+
+/* Run per-thread iwalk work. */
+static int
+xfs_iwalk_ag_work(
+	struct xfs_mount	*mp,
+	struct xfs_pwork	*pwork)
+{
+	struct xfs_iwalk_ag	*iwag;
+	int			error = 0;
+
+	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
+	if (xfs_pwork_want_abort(pwork))
+		goto out;
+
+	error = xfs_iwalk_alloc(iwag);
+	if (error)
+		goto out;
+
+	error = xfs_iwalk_ag(iwag);
+	xfs_iwalk_free(iwag);
+out:
+	kmem_free(iwag);
+	return error;
+}
+
+/*
+ * Walk all the inodes in the filesystem using multiple threads to process each
+ * AG.
+ */
+int
+xfs_iwalk_threaded(
+	struct xfs_mount	*mp,
+	xfs_ino_t		startino,
+	unsigned int		flags,
+	xfs_iwalk_fn		iwalk_fn,
+	unsigned int		inode_records,
+	bool			polled,
+	void			*data)
+{
+	struct xfs_pwork_ctl	pctl;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	unsigned int		nr_threads;
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
+	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
+			nr_threads);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_iwalk_ag	*iwag;
+
+		if (xfs_pwork_ctl_want_abort(&pctl))
+			break;
+
+		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+		iwag->mp = mp;
+		iwag->iwalk_fn = iwalk_fn;
+		iwag->data = data;
+		iwag->startino = startino;
+		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+		xfs_pwork_queue(&pctl, &iwag->pwork);
+		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+		if (flags & XFS_INOBT_WALK_SAME_AG)
+			break;
+	}
+
+	if (polled)
+		xfs_pwork_poll(&pctl);
+	return xfs_pwork_destroy(&pctl);
+}
+
+/*
+ * Allow callers to cache up to a page's worth of inobt records.  This reflects
+ * the existing inumbers prefetching behavior.  Since the inobt walk does not
+ * itself do anything with the inobt records, we can set a fairly high limit
+ * here.
+ */
+#define MAX_INOBT_WALK_PREFETCH	\
+	(PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
+
+/*
+ * Given the number of records that the user wanted, set the number of inobt
+ * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
+ */
+static inline unsigned int
+xfs_inobt_walk_prefetch(
+	unsigned int		inobt_records)
+{
+	/*
+	 * If the caller didn't tell us the number of inobt records they
+	 * wanted, assume the maximum prefetch possible for best performance.
+	 */
+	if (inobt_records == 0)
+		inobt_records = MAX_INOBT_WALK_PREFETCH;
+
+	/*
+	 * Allocate enough space to prefetch at least two inobt records so that
+	 * we can cache both the record where the iwalk started and the next
+	 * record.  This simplifies the AG inode walk loop setup code.
+	 */
+	inobt_records = max(inobt_records, 2U);
+
+	/*
+	 * Cap prefetch at that maximum so that we don't use an absurd amount
+	 * of memory.
+	 */
+	return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
+}
+
+/*
+ * Walk all inode btree records in the filesystem starting from @startino.  The
+ * @inobt_walk_fn will be called for each btree record, being passed the incore
+ * record and @data.  @inobt_records controls how many inobt records we try to
+ * cache ahead of time.
+ */
+int
+xfs_inobt_walk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		startino,
+	unsigned int		flags,
+	xfs_inobt_walk_fn	inobt_walk_fn,
+	unsigned int		inobt_records,
+	void			*data)
+{
+	struct xfs_iwalk_ag	iwag = {
+		.mp		= mp,
+		.tp		= tp,
+		.inobt_walk_fn	= inobt_walk_fn,
+		.data		= data,
+		.startino	= startino,
+		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
+		.pwork		= XFS_PWORK_SINGLE_THREADED,
+	};
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
+
+	error = xfs_iwalk_alloc(&iwag);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_iwalk_ag(&iwag);
+		if (error)
+			break;
+		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+		if (flags & XFS_INOBT_WALK_SAME_AG)
+			break;
+	}
+
+	xfs_iwalk_free(&iwag);
+	return error;
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
new file mode 100644
index 0000000..37a795f
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_IWALK_H__
+#define __XFS_IWALK_H__
+
+/*
+ * Return codes for the inode/inobt walk function are 0 to continue iterating,
+ * and non-zero to stop iterating.  Any non-zero value will be passed up to the
+ * iwalk or inobt_walk caller.  The special value -ECANCELED can be used to
+ * stop iteration, as neither iwalk nor inobt_walk will ever generate that
+ * error code on their own.
+ */
+
+/* Walk all inodes in the filesystem starting from @startino. */
+typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+			    xfs_ino_t ino, void *data);
+
+int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
+		unsigned int flags, xfs_iwalk_fn iwalk_fn,
+		unsigned int inode_records, void *data);
+int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
+		unsigned int flags, xfs_iwalk_fn iwalk_fn,
+		unsigned int inode_records, bool poll, void *data);
+
+/* Only iterate inodes within the same AG as @startino. */
+#define XFS_IWALK_SAME_AG	(0x1)
+
+#define XFS_IWALK_FLAGS_ALL	(XFS_IWALK_SAME_AG)
+
+/* Walk all inode btree records in the filesystem starting from @startino. */
+typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+				 xfs_agnumber_t agno,
+				 const struct xfs_inobt_rec_incore *irec,
+				 void *data);
+
+int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
+		xfs_ino_t startino, unsigned int flags,
+		xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records,
+		void *data);
+
+/* Only iterate inobt records within the same AG as @startino. */
+#define XFS_INOBT_WALK_SAME_AG	(XFS_IWALK_SAME_AG)
+
+#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG)
+
+#endif /* __XFS_IWALK_H__ */
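A usage sketch of the -ECANCELED convention described above (the callback and its context struct are hypothetical, not part of this patch):

/* Hypothetical walk function that stops after a fixed number of inodes. */
struct count_ctx {
	uint64_t		seen;
	uint64_t		limit;
};

static int
count_inodes_fn(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	struct count_ctx	*ctx = data;

	if (++ctx->seen >= ctx->limit)
		return -ECANCELED;	/* stop the walk; not a real error */
	return 0;
}

/*
 * error = xfs_iwalk(mp, NULL, 0, 0, count_inodes_fn, 0, &ctx);
 * Seeing -ECANCELED here means the limit was hit, since the core
 * never generates that value itself.
 */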
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index edbd5a2..ca15105 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -110,8 +110,6 @@
 #define current_restore_flags_nested(sp, f)	\
 		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 
-#define spinlock_destroy(lock)
-
 #define NBBY		8		/* number of bits per byte */
 
 /*
@@ -221,6 +219,9 @@
 	return x;
 }
 
+int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+		char *data, unsigned int op);
+
 #define ASSERT_ALWAYS(expr)	\
 	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c3b610b..641d07f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -16,13 +16,10 @@
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_log_recover.h"
-#include "xfs_inode.h"
 #include "xfs_trace.h"
-#include "xfs_fsops.h"
-#include "xfs_cksum.h"
 #include "xfs_sysfs.h"
 #include "xfs_sb.h"
+#include "xfs_health.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -44,21 +41,14 @@
 xlog_space_left(
 	struct xlog		*log,
 	atomic64_t		*head);
-STATIC int
-xlog_sync(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog);
 STATIC void
 xlog_dealloc_log(
 	struct xlog		*log);
 
 /* local state machine functions */
-STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
-STATIC void
-xlog_state_do_callback(
-	struct xlog		*log,
-	int			aborted,
-	struct xlog_in_core	*iclog);
+STATIC void xlog_state_done_syncing(
+	struct xlog_in_core	*iclog,
+	bool			aborted);
 STATIC int
 xlog_state_get_iclog_space(
 	struct xlog		*log,
@@ -106,8 +96,7 @@
 xlog_verify_iclog(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog,
-	int			count,
-	bool                    syncing);
+	int			count);
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
@@ -116,7 +105,7 @@
 #else
 #define xlog_verify_dest_ptr(a,b)
 #define xlog_verify_grant_tail(a)
-#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_iclog(a,b,c)
 #define xlog_verify_tail_lsn(a,b,c)
 #endif
 
@@ -225,15 +214,42 @@
 {
 	struct xlog_ticket	*tic;
 	int			need_bytes;
+	bool			woken_task = false;
 
 	list_for_each_entry(tic, &head->waiters, t_queue) {
+
+		/*
+		 * There is a chance that the size of the CIL checkpoints in
+		 * progress at the last AIL push target calculation resulted in
+		 * limiting the target to the log head (l_last_sync_lsn) at the
+		 * time. This may not reflect where the log head is now as the
+		 * CIL checkpoints may have completed.
+		 *
+		 * Hence when we are woken here, it may be that the head of the
+		 * log has moved rather than the tail. As the tail didn't
+		 * move, there still won't be space available for the
+		 * reservation we require.  However, if the AIL has already
+		 * pushed to the target defined by the old log head location, we
+		 * will hang here waiting for something else to update the AIL
+		 * push target.
+		 *
+		 * Therefore, if there isn't space to wake the first waiter on
+		 * the grant head, we need to push the AIL again to ensure the
+		 * target reflects both the current log tail and log head
+		 * position before we wait for the tail to move again.
+		 */
+
 		need_bytes = xlog_ticket_reservation(log, head, tic);
-		if (*free_bytes < need_bytes)
+		if (*free_bytes < need_bytes) {
+			if (!woken_task)
+				xlog_grant_push_ail(log, need_bytes);
 			return false;
+		}
 
 		*free_bytes -= need_bytes;
 		trace_xfs_log_grant_wake_up(log, tic);
 		wake_up_process(tic->t_task);
+		woken_task = true;
 	}
 
 	return true;
@@ -439,11 +455,7 @@
 	XFS_STATS_INC(mp, xs_try_logspace);
 
 	ASSERT(*ticp == NULL);
-	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
-				KM_SLEEP | KM_MAYFAIL);
-	if (!tic)
-		return -ENOMEM;
-
+	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
 	*ticp = tic;
 
 	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -540,32 +552,6 @@
 	return lsn;
 }
 
-/*
- * Attaches a new iclog I/O completion callback routine during
- * transaction commit.  If the log is in error state, a non-zero
- * return code is handed back and the caller is responsible for
- * executing the callback at an appropriate time.
- */
-int
-xfs_log_notify(
-	struct xlog_in_core	*iclog,
-	xfs_log_callback_t	*cb)
-{
-	int	abortflg;
-
-	spin_lock(&iclog->ic_callback_lock);
-	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
-	if (!abortflg) {
-		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
-			      (iclog->ic_state == XLOG_STATE_WANT_SYNC));
-		cb->cb_next = NULL;
-		*(iclog->ic_callback_tail) = cb;
-		iclog->ic_callback_tail = &(cb->cb_next);
-	}
-	spin_unlock(&iclog->ic_callback_lock);
-	return abortflg;
-}
-
 int
 xfs_log_release_iclog(
 	struct xfs_mount	*mp,
@@ -806,16 +792,12 @@
  * The mount has failed. Cancel the recovery if it hasn't completed and destroy
  * the log.
  */
-int
+void
 xfs_log_mount_cancel(
 	struct xfs_mount	*mp)
 {
-	int			error;
-
-	error = xlog_recover_cancel(mp->m_log);
+	xlog_recover_cancel(mp->m_log);
 	xfs_log_unmount(mp);
-
-	return error;
 }
 
 /*
@@ -861,7 +843,7 @@
 	 * recalculated during log recovery at next mount.  Refer to
 	 * xlog_check_unmount_rec for more details.
 	 */
-	if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp,
+	if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
 			XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
 		xfs_alert(mp, "%s: will fix summary counters at next mount",
 				__func__);
@@ -931,7 +913,7 @@
 	 * Or, if we are doing a forced umount (typically because of IO errors).
 	 */
 	if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
-	    xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+	    xfs_readonly_buftarg(log->l_targ)) {
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 		return 0;
 	}
@@ -1243,53 +1225,49 @@
 }
 
 
-/*
- * Log function which is called when an io completes.
- *
- * The log manager needs its own routine, in order to control what
- * happens with the buffer after the write completes.
- */
 static void
-xlog_iodone(xfs_buf_t *bp)
+xlog_ioend_work(
+	struct work_struct	*work)
 {
-	struct xlog_in_core	*iclog = bp->b_log_item;
-	struct xlog		*l = iclog->ic_log;
-	int			aborted = 0;
+	struct xlog_in_core     *iclog =
+		container_of(work, struct xlog_in_core, ic_end_io_work);
+	struct xlog		*log = iclog->ic_log;
+	bool			aborted = false;
+	int			error;
+
+	error = blk_status_to_errno(iclog->ic_bio.bi_status);
+#ifdef DEBUG
+	/* treat writes with injected CRC errors as failed */
+	if (iclog->ic_fail_crc)
+		error = -EIO;
+#endif
 
 	/*
-	 * Race to shutdown the filesystem if we see an error or the iclog is in
-	 * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
-	 * CRC errors into log recovery.
+	 * Race to shutdown the filesystem if we see an error.
 	 */
-	if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) ||
-	    iclog->ic_state & XLOG_STATE_IOABORT) {
-		if (iclog->ic_state & XLOG_STATE_IOABORT)
-			iclog->ic_state &= ~XLOG_STATE_IOABORT;
-
-		xfs_buf_ioerror_alert(bp, __func__);
-		xfs_buf_stale(bp);
-		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
+	if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+		xfs_alert(log->l_mp, "log I/O error %d", error);
+		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
 		/*
 		 * This flag will be propagated to the trans-committed
 		 * callback routines to let them know that the log-commit
 		 * didn't succeed.
 		 */
-		aborted = XFS_LI_ABORTED;
+		aborted = true;
 	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		aborted = XFS_LI_ABORTED;
+		aborted = true;
 	}
 
-	/* log I/O is always issued ASYNC */
-	ASSERT(bp->b_flags & XBF_ASYNC);
 	xlog_state_done_syncing(iclog, aborted);
+	bio_uninit(&iclog->ic_bio);
 
 	/*
-	 * drop the buffer lock now that we are done. Nothing references
-	 * the buffer after this, so an unmount waiting on this lock can now
-	 * tear it down safely. As such, it is unsafe to reference the buffer
-	 * (bp) after the unlock as we could race with it being freed.
+	 * Drop the lock to signal that we are done. Nothing references the
+	 * iclog after this, so an unmount waiting on this lock can now tear it
+	 * down safely. As such, it is unsafe to reference the iclog after the
+	 * unlock as we could race with it being freed.
 	 */
-	xfs_buf_unlock(bp);
+	up(&iclog->ic_sema);
 }
 
 /*
@@ -1300,65 +1278,26 @@
  * If the filesystem blocksize is too large, we may need to choose a
  * larger size since the directory code currently logs entire blocks.
  */
-
 STATIC void
 xlog_get_iclog_buffer_size(
 	struct xfs_mount	*mp,
 	struct xlog		*log)
 {
-	int size;
-	int xhdrs;
-
 	if (mp->m_logbufs <= 0)
-		log->l_iclog_bufs = XLOG_MAX_ICLOGS;
-	else
-		log->l_iclog_bufs = mp->m_logbufs;
+		mp->m_logbufs = XLOG_MAX_ICLOGS;
+	if (mp->m_logbsize <= 0)
+		mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
+
+	log->l_iclog_bufs = mp->m_logbufs;
+	log->l_iclog_size = mp->m_logbsize;
 
 	/*
-	 * Buffer size passed in from mount system call.
+	 * # headers = size / 32k - one header holds cycles from 32k of data.
 	 */
-	if (mp->m_logbsize > 0) {
-		size = log->l_iclog_size = mp->m_logbsize;
-		log->l_iclog_size_log = 0;
-		while (size != 1) {
-			log->l_iclog_size_log++;
-			size >>= 1;
-		}
-
-		if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-			/* # headers = size / 32k
-			 * one header holds cycles from 32k of data
-			 */
-
-			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
-			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
-				xhdrs++;
-			log->l_iclog_hsize = xhdrs << BBSHIFT;
-			log->l_iclog_heads = xhdrs;
-		} else {
-			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
-			log->l_iclog_hsize = BBSIZE;
-			log->l_iclog_heads = 1;
-		}
-		goto done;
-	}
-
-	/* All machines use 32kB buffers by default. */
-	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
-	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-
-	/* the default log size is 16k or 32k which is one header sector */
-	log->l_iclog_hsize = BBSIZE;
-	log->l_iclog_heads = 1;
-
-done:
-	/* are we being asked to make the sizes selected above visible? */
-	if (mp->m_logbufs == 0)
-		mp->m_logbufs = log->l_iclog_bufs;
-	if (mp->m_logbsize == 0)
-		mp->m_logbsize = log->l_iclog_size;
-}	/* xlog_get_iclog_buffer_size */
-
+	log->l_iclog_heads =
+		DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
+	log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+}
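A quick numeric check of the simplified sizing (XLOG_HEADER_CYCLE_SIZE is 32k and BBSHIFT is 9, so each 32k of payload costs one 512-byte header block):

#include <assert.h>

#define BBSHIFT			9
#define XLOG_HEADER_CYCLE_SIZE	(32 * 1024)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int logbsize = 256 * 1024;	/* e.g. logbsize=256k on the mount line */
	int heads = DIV_ROUND_UP(logbsize, XLOG_HEADER_CYCLE_SIZE);
	int hsize = heads << BBSHIFT;

	assert(heads == 8);		/* eight cycle headers */
	assert(hsize == 4096);		/* 4k of each iclog goes to headers */
	return 0;
}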
 
 void
 xfs_log_work_queue(
@@ -1421,7 +1360,6 @@
 	xlog_rec_header_t	*head;
 	xlog_in_core_t		**iclogp;
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
-	xfs_buf_t		*bp;
 	int			i;
 	int			error = -ENOMEM;
 	uint			log2_size = 0;
@@ -1479,30 +1417,6 @@
 
 	xlog_get_iclog_buffer_size(mp, log);
 
-	/*
-	 * Use a NULL block for the extra log buffer used during splits so that
-	 * it will trigger errors if we ever try to do IO on it without first
-	 * having set it up properly.
-	 */
-	error = -ENOMEM;
-	bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
-			   BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
-	if (!bp)
-		goto out_free_log;
-
-	/*
-	 * The iclogbuf buffer locks are held over IO but we are not going to do
-	 * IO yet.  Hence unlock the buffer so that the log IO path can grab it
-	 * when appropriately.
-	 */
-	ASSERT(xfs_buf_islocked(bp));
-	xfs_buf_unlock(bp);
-
-	/* use high priority wq for log I/O completion */
-	bp->b_ioend_wq = mp->m_log_workqueue;
-	bp->b_iodone = xlog_iodone;
-	log->l_xbuf = bp;
-
 	spin_lock_init(&log->l_icloglock);
 	init_waitqueue_head(&log->l_flush_wait);
 
@@ -1515,29 +1429,23 @@
 	 * xlog_in_core_t in xfs_log_priv.h for details.
 	 */
 	ASSERT(log->l_iclog_size >= 4096);
-	for (i=0; i < log->l_iclog_bufs; i++) {
-		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
-		if (!*iclogp)
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
+		size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
+				sizeof(struct bio_vec);
+
+		iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+		if (!iclog)
 			goto out_free_iclog;
 
-		iclog = *iclogp;
+		*iclogp = iclog;
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
-		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
-					  BTOBB(log->l_iclog_size),
-					  XBF_NO_IOACCT);
-		if (!bp)
+		iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
+						KM_MAYFAIL | KM_ZERO);
+		if (!iclog->ic_data)
 			goto out_free_iclog;
-
-		ASSERT(xfs_buf_islocked(bp));
-		xfs_buf_unlock(bp);
-
-		/* use high priority wq for log I/O completion */
-		bp->b_ioend_wq = mp->m_log_workqueue;
-		bp->b_iodone = xlog_iodone;
-		iclog->ic_bp = bp;
-		iclog->ic_data = bp->b_addr;
 #ifdef DEBUG
 		log->l_iclog_bak[i] = &iclog->ic_header;
 #endif
@@ -1551,36 +1459,43 @@
 		head->h_fmt = cpu_to_be32(XLOG_FMT);
 		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
 
-		iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
+		iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
 		iclog->ic_log = log;
 		atomic_set(&iclog->ic_refcnt, 0);
 		spin_lock_init(&iclog->ic_callback_lock);
-		iclog->ic_callback_tail = &(iclog->ic_callback);
+		INIT_LIST_HEAD(&iclog->ic_callbacks);
 		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
 
 		init_waitqueue_head(&iclog->ic_force_wait);
 		init_waitqueue_head(&iclog->ic_write_wait);
+		INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
+		sema_init(&iclog->ic_sema, 1);
 
 		iclogp = &iclog->ic_next;
 	}
 	*iclogp = log->l_iclog;			/* complete ring */
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
+	log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
+			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
+			mp->m_fsname);
+	if (!log->l_ioend_workqueue)
+		goto out_free_iclog;
+
 	error = xlog_cil_init(log);
 	if (error)
-		goto out_free_iclog;
+		goto out_destroy_workqueue;
 	return log;
 
+out_destroy_workqueue:
+	destroy_workqueue(log->l_ioend_workqueue);
 out_free_iclog:
 	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
 		prev_iclog = iclog->ic_next;
-		if (iclog->ic_bp)
-			xfs_buf_free(iclog->ic_bp);
+		kmem_free(iclog->ic_data);
 		kmem_free(iclog);
 	}
-	spinlock_destroy(&log->l_icloglock);
-	xfs_buf_free(log->l_xbuf);
 out_free_log:
 	kmem_free(log);
 out:
@@ -1765,42 +1680,155 @@
 	return xfs_end_cksum(crc);
 }
 
-/*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- *
- * We lock the iclogbufs here so that we can serialise against IO completion
- * during unmount. We might be processing a shutdown triggered during unmount,
- * and that can occur asynchronously to the unmount thread, and hence we need to
- * ensure that completes before tearing down the iclogbufs. Hence we need to
- * hold the buffer lock across the log IO to acheive that.
- */
-STATIC int
-xlog_bdstrat(
-	struct xfs_buf		*bp)
+static void
+xlog_bio_end_io(
+	struct bio		*bio)
 {
-	struct xlog_in_core	*iclog = bp->b_log_item;
+	struct xlog_in_core	*iclog = bio->bi_private;
 
-	xfs_buf_lock(bp);
-	if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		xfs_buf_ioerror(bp, -EIO);
-		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp);
+	queue_work(iclog->ic_log->l_ioend_workqueue,
+		   &iclog->ic_end_io_work);
+}
+
+static void
+xlog_map_iclog_data(
+	struct bio		*bio,
+	void			*data,
+	size_t			count)
+{
+	do {
+		struct page	*page = kmem_to_page(data);
+		unsigned int	off = offset_in_page(data);
+		size_t		len = min_t(size_t, count, PAGE_SIZE - off);
+
+		WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+
+		data += len;
+		count -= len;
+	} while (count);
+}
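The loop above simply cuts a virtually contiguous buffer into page-sized bio segments; a userspace model of the offset arithmetic (4k pages assumed):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL

/* how many bio_add_page() calls a (data, count) span would take */
static unsigned int map_segments(uintptr_t data, size_t count)
{
	unsigned int segs = 0;

	do {
		size_t off = data & (PAGE_SIZE - 1);	/* offset_in_page() */
		size_t len = count < PAGE_SIZE - off ? count : PAGE_SIZE - off;

		segs++;
		data += len;
		count -= len;
	} while (count);

	return segs;
}

int main(void)
{
	assert(map_segments(0x10000, 32768) == 8);	/* aligned: 8 full pages */
	assert(map_segments(0x10200, 32768) == 9);	/* misaligned: short head + tail */
	return 0;
}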
+
+STATIC void
+xlog_write_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	uint64_t		bno,
+	unsigned int		count,
+	bool			need_flush)
+{
+	ASSERT(bno < log->l_logBBsize);
+
+	/*
+	 * We lock the iclogbufs here so that we can serialise against I/O
+	 * completion during unmount.  We might be processing a shutdown
+	 * triggered during unmount, and that can occur asynchronously to the
+	 * unmount thread, and hence we need to ensure that completes before
+	 * tearing down the iclogbufs.  Hence we need to hold the buffer lock
+	 * across the log IO to achieve that.
+	 */
+	down(&iclog->ic_sema);
+	if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
-		 * doing it here. Similarly, IO completion will unlock the
-		 * buffer, so we don't do it here.
+		 * doing it here.  We kick off the state machine and unlock
+		 * the buffer manually; the code needs to be kept in sync
+		 * with the I/O completion path.
 		 */
-		return 0;
+		xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
+		up(&iclog->ic_sema);
+		return;
 	}
 
-	xfs_buf_submit(bp);
-	return 0;
+	iclog->ic_io_size = count;
+
+	bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
+	bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+	iclog->ic_bio.bi_private = iclog;
+	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+	if (need_flush)
+		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+
+	xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+	if (is_vmalloc_addr(iclog->ic_data))
+		flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+
+	/*
+	 * If this log buffer would straddle the end of the log we will have
+	 * to split it up into two bios, so that we can continue at the start.
+	 */
+	if (bno + BTOBB(count) > log->l_logBBsize) {
+		struct bio *split;
+
+		split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+				  GFP_NOIO, &fs_bio_set);
+		bio_chain(split, &iclog->ic_bio);
+		submit_bio(split);
+
+		/* restart at logical offset zero for the remainder */
+		iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+	}
+
+	submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */
+static void
+xlog_split_iclog(
+	struct xlog		*log,
+	void			*data,
+	uint64_t		bno,
+	unsigned int		count)
+{
+	unsigned int		split_offset = BBTOB(log->l_logBBsize - bno);
+	unsigned int		i;
+
+	for (i = split_offset; i < count; i += BBSIZE) {
+		uint32_t cycle = get_unaligned_be32(data + i);
+
+		if (++cycle == XLOG_HEADER_MAGIC_NUM)
+			cycle++;
+		put_unaligned_be32(cycle, data + i);
+	}
+}
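The cycle bump touches the first word of every 512-byte basic block in the wrapped portion; a standalone model of it (get/put_unaligned_be32 replaced with explicit byte shuffling), including the magic-number skip:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define BBSIZE			512
#define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe

static uint32_t get_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

static void put_be32(unsigned char *p, uint32_t v)
{
	p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

static void split_bump(unsigned char *data, unsigned int split, unsigned int count)
{
	for (unsigned int i = split; i < count; i += BBSIZE) {
		uint32_t cycle = get_be32(data + i) + 1;

		if (cycle == XLOG_HEADER_MAGIC_NUM)
			cycle++;	/* never collide with the record magic */
		put_be32(data + i, cycle);
	}
}

int main(void)
{
	unsigned char buf[2 * BBSIZE];

	memset(buf, 0, sizeof(buf));
	put_be32(buf, 41);		/* part before the split: untouched */
	put_be32(buf + BBSIZE, 41);	/* wrapped part: bumped */
	split_bump(buf, BBSIZE, sizeof(buf));
	assert(get_be32(buf) == 41);
	assert(get_be32(buf + BBSIZE) == 42);
	return 0;
}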
+
+static int
+xlog_calc_iclog_size(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	uint32_t		*roundoff)
+{
+	uint32_t		count_init, count;
+	bool			use_lsunit;
+
+	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+			log->l_mp->m_sb.sb_logsunit > 1;
+
+	/* Add for LR header */
+	count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+	/* Round out the log write size */
+	if (use_lsunit) {
+		/* we have a v2 stripe unit to use */
+		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+	} else {
+		count = BBTOB(BTOBB(count_init));
+	}
+
+	ASSERT(count >= count_init);
+	*roundoff = count - count_init;
+
+	if (use_lsunit)
+		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
+	else
+		ASSERT(*roundoff < BBTOB(1));
+	return count;
 }
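A numeric sketch of the two rounding modes (BBTOB(BTOBB(x)) rounds to 512-byte basic blocks; a v2 log with a stripe unit rounds to that instead):

#include <assert.h>

#define BBSIZE	512U

static unsigned int round_to(unsigned int x, unsigned int unit)
{
	return (x + unit - 1) / unit * unit;
}

int main(void)
{
	unsigned int count_init = 512 + 7000;	/* LR header + ic_offset = 7512 */

	/* no stripe unit: round to basic blocks, roundoff < BBTOB(1) */
	assert(round_to(count_init, BBSIZE) == 7680);
	assert(round_to(count_init, BBSIZE) - count_init == 168);

	/* v2 log, 4k stripe unit: round to the stripe unit instead */
	assert(round_to(count_init, 4096) == 8192);
	return 0;
}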
 
 /*
@@ -1823,46 +1851,23 @@
  * log will require grabbing the lock though.
  *
  * The entire log manager uses a logical block numbering scheme.  Only
- * log_sync (and then only bwrite()) know about the fact that the log may
- * not start with block zero on a given device.  The log block start offset
- * is added immediately before calling bwrite().
+ * xlog_write_iclog knows about the fact that the log may not start with
+ * block zero on a given device.
  */
-
-STATIC int
+STATIC void
 xlog_sync(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog)
 {
-	xfs_buf_t	*bp;
-	int		i;
-	uint		count;		/* byte count of bwrite */
-	uint		count_init;	/* initial count before roundup */
-	int		roundoff;       /* roundoff to BB or stripe */
-	int		split = 0;	/* split write into two regions */
-	int		error;
-	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
-	int		size;
+	unsigned int		count;		/* byte count of bwrite */
+	unsigned int		roundoff;       /* roundoff to BB or stripe */
+	uint64_t		bno;
+	unsigned int		size;
+	bool			need_flush = true, split = false;
 
-	XFS_STATS_INC(log->l_mp, xs_log_writes);
 	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 
-	/* Add for LR header */
-	count_init = log->l_iclog_hsize + iclog->ic_offset;
-
-	/* Round out the log write size */
-	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
-		/* we have a v2 stripe unit to use */
-		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
-	} else {
-		count = BBTOB(BTOBB(count_init));
-	}
-	roundoff = count - count_init;
-	ASSERT(roundoff >= 0);
-	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && 
-                roundoff < log->l_mp->m_sb.sb_logsunit)
-		|| 
-		(log->l_mp->m_sb.sb_logsunit <= 1 && 
-		 roundoff < BBTOB(1)));
+	count = xlog_calc_iclog_size(log, iclog, &roundoff);
 
 	/* move grant heads by roundoff in sync */
 	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
@@ -1873,41 +1878,19 @@
 
 	/* real byte length */
 	size = iclog->ic_offset;
-	if (v2)
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
 		size += roundoff;
 	iclog->ic_header.h_len = cpu_to_be32(size);
 
-	bp = iclog->ic_bp;
-	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
-
+	XFS_STATS_INC(log->l_mp, xs_log_writes);
 	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
 
+	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
+
 	/* Do we need to split this write into 2 parts? */
-	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
-		char		*dptr;
-
-		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
-		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
-		iclog->ic_bwritecnt = 2;
-
-		/*
-		 * Bump the cycle numbers at the start of each block in the
-		 * part of the iclog that ends up in the buffer that gets
-		 * written to the start of the log.
-		 *
-		 * Watch out for the header magic number case, though.
-		 */
-		dptr = (char *)&iclog->ic_header + count;
-		for (i = 0; i < split; i += BBSIZE) {
-			uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
-			if (++cycle == XLOG_HEADER_MAGIC_NUM)
-				cycle++;
-			*(__be32 *)dptr = cpu_to_be32(cycle);
-
-			dptr += BBSIZE;
-		}
-	} else {
-		iclog->ic_bwritecnt = 1;
+	if (bno + BTOBB(count) > log->l_logBBsize) {
+		xlog_split_iclog(log, &iclog->ic_header, bno, count);
+		split = true;
 	}
 
 	/* calculate the checksum */
@@ -1920,18 +1903,15 @@
 	 * write on I/O completion and shutdown the fs. The subsequent mount
 	 * detects the bad CRC and attempts to recover.
 	 */
+#ifdef DEBUG
 	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
 		iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
-		iclog->ic_state |= XLOG_STATE_IOABORT;
+		iclog->ic_fail_crc = true;
 		xfs_warn(log->l_mp,
 	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
 			 be64_to_cpu(iclog->ic_header.h_lsn));
 	}
-
-	bp->b_io_length = BTOBB(count);
-	bp->b_log_item = iclog;
-	bp->b_flags &= ~XBF_FLUSH;
-	bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
+#endif
 
 	/*
 	 * Flush the data device before flushing the log to make sure all meta
@@ -1941,50 +1921,14 @@
 	 * synchronously here; for an internal log we can simply use the block
 	 * layer state machine for preflushes.
 	 */
-	if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+	if (log->l_targ != log->l_mp->m_ddev_targp || split) {
 		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
-	else
-		bp->b_flags |= XBF_FLUSH;
-
-	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
-	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
-	xlog_verify_iclog(log, iclog, count, true);
-
-	/* account for log which doesn't start at block #0 */
-	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-
-	/*
-	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
-	 * is shutting down.
-	 */
-	error = xlog_bdstrat(bp);
-	if (error) {
-		xfs_buf_ioerror_alert(bp, "xlog_sync");
-		return error;
+		need_flush = false;
 	}
-	if (split) {
-		bp = iclog->ic_log->l_xbuf;
-		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
-		xfs_buf_associate_memory(bp,
-				(char *)&iclog->ic_header + count, split);
-		bp->b_log_item = iclog;
-		bp->b_flags &= ~XBF_FLUSH;
-		bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
 
-		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
-		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
-		/* account for internal log which doesn't start at block #0 */
-		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-		error = xlog_bdstrat(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
-			return error;
-		}
-	}
-	return 0;
-}	/* xlog_sync */
+	xlog_verify_iclog(log, iclog, count);
+	xlog_write_iclog(log, iclog, bno, count, need_flush);
+}
 
 /*
  * Deallocate a log structure
@@ -2004,31 +1948,21 @@
 	 */
 	iclog = log->l_iclog;
 	for (i = 0; i < log->l_iclog_bufs; i++) {
-		xfs_buf_lock(iclog->ic_bp);
-		xfs_buf_unlock(iclog->ic_bp);
+		down(&iclog->ic_sema);
+		up(&iclog->ic_sema);
 		iclog = iclog->ic_next;
 	}
 
-	/*
-	 * Always need to ensure that the extra buffer does not point to memory
-	 * owned by another log buffer before we free it. Also, cycle the lock
-	 * first to ensure we've completed IO on it.
-	 */
-	xfs_buf_lock(log->l_xbuf);
-	xfs_buf_unlock(log->l_xbuf);
-	xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
-	xfs_buf_free(log->l_xbuf);
-
 	iclog = log->l_iclog;
 	for (i = 0; i < log->l_iclog_bufs; i++) {
-		xfs_buf_free(iclog->ic_bp);
 		next_iclog = iclog->ic_next;
+		kmem_free(iclog->ic_data);
 		kmem_free(iclog);
 		iclog = next_iclog;
 	}
-	spinlock_destroy(&log->l_icloglock);
 
 	log->l_mp->m_log = NULL;
+	destroy_workqueue(log->l_ioend_workqueue);
 	kmem_free(log);
 }	/* xlog_dealloc_log */
 
@@ -2068,7 +2002,7 @@
 
 	/* match with XLOG_REG_TYPE_* in xfs_log.h */
 #define REG_TYPE_STR(type, str)	[XLOG_REG_TYPE_##type] = str
-	static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+	static char *res_type_str[] = {
 	    REG_TYPE_STR(BFORMAT, "bformat"),
 	    REG_TYPE_STR(BCHUNK, "bchunk"),
 	    REG_TYPE_STR(EFI_FORMAT, "efi_format"),
@@ -2088,8 +2022,15 @@
 	    REG_TYPE_STR(UNMOUNT, "unmount"),
 	    REG_TYPE_STR(COMMIT, "commit"),
 	    REG_TYPE_STR(TRANSHDR, "trans header"),
-	    REG_TYPE_STR(ICREATE, "inode create")
+	    REG_TYPE_STR(ICREATE, "inode create"),
+	    REG_TYPE_STR(RUI_FORMAT, "rui_format"),
+	    REG_TYPE_STR(RUD_FORMAT, "rud_format"),
+	    REG_TYPE_STR(CUI_FORMAT, "cui_format"),
+	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
+	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
+	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
 	};
+	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
 #undef REG_TYPE_STR
 
 	xfs_warn(mp, "ticket reservation summary:");
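Dropping the explicit array bound and adding BUILD_BUG_ON turns a silently short table into a compile error; the pattern in miniature (hypothetical names, using C11 _Static_assert in place of the kernel macro):

#include <stdio.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

enum { TYPE_A = 1, TYPE_B, TYPE_C, TYPE_MAX = TYPE_C };

/* unsized: the highest designated index fixes the length */
static const char *type_str[] = {
	[TYPE_A] = "a",
	[TYPE_B] = "b",
	[TYPE_C] = "c",
};

int main(void)
{
	/* fails to compile if a new enum value lacks a string */
	_Static_assert(ARRAY_SIZE(type_str) == TYPE_MAX + 1,
		       "type_str out of sync with the enum");
	printf("%s\n", type_str[TYPE_B]);
	return 0;
}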
@@ -2582,27 +2523,41 @@
  *****************************************************************************
  */
 
-/* Clean iclogs starting from the head.  This ordering must be
- * maintained, so an iclog doesn't become ACTIVE beyond one that
- * is SYNCING.  This is also required to maintain the notion that we use
- * a ordered wait queue to hold off would be writers to the log when every
- * iclog is trying to sync to disk.
+/*
+ * An iclog has just finished IO completion processing, so we need to update
+ * the iclog state and propagate that up into the overall log state. Hence we
+ * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
+ * starting from the head, and then wake up any threads that are waiting for the
+ * iclog to be marked clean.
  *
- * State Change: DIRTY -> ACTIVE
+ * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
+ * doesn't become ACTIVE beyond one that is SYNCING.  This is also required to
+ * maintain the notion that we use an ordered wait queue to hold off would-be
+ * writers to the log when every iclog is trying to sync to disk.
+ *
+ * Caller must hold the icloglock before calling us.
+ *
+ * State Change: !IOERROR -> DIRTY -> ACTIVE
  */
 STATIC void
-xlog_state_clean_log(
-	struct xlog *log)
+xlog_state_clean_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*dirty_iclog)
 {
-	xlog_in_core_t	*iclog;
-	int changed = 0;
+	struct xlog_in_core	*iclog;
+	int			changed = 0;
 
+	/* Prepare the completed iclog. */
+	if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
+		dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+	/* Walk all the iclogs to update the ordered active state. */
 	iclog = log->l_iclog;
 	do {
 		if (iclog->ic_state == XLOG_STATE_DIRTY) {
 			iclog->ic_state	= XLOG_STATE_ACTIVE;
 			iclog->ic_offset       = 0;
-			ASSERT(iclog->ic_callback == NULL);
+			ASSERT(list_empty_careful(&iclog->ic_callbacks));
 			/*
 			 * If the number of ops in this iclog indicate it just
 			 * contains the dummy transaction, we can
@@ -2634,7 +2589,13 @@
 		iclog = iclog->ic_next;
 	} while (iclog != log->l_iclog);
 
-	/* log is locked when we are called */
+
+	/*
+	 * Wake up threads waiting in xfs_log_force() for the dirty iclog
+	 * to be cleaned.
+	 */
+	wake_up_all(&dirty_iclog->ic_force_wait);
+
 	/*
 	 * Change state for the dummy log recording.
 	 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2668,56 +2629,226 @@
 			ASSERT(0);
 		}
 	}
-}	/* xlog_state_clean_log */
+}
 
 STATIC xfs_lsn_t
 xlog_get_lowest_lsn(
-	struct xlog	*log)
+	struct xlog		*log)
 {
-	xlog_in_core_t  *lsn_log;
-	xfs_lsn_t	lowest_lsn, lsn;
+	struct xlog_in_core	*iclog = log->l_iclog;
+	xfs_lsn_t		lowest_lsn = 0, lsn;
 
-	lsn_log = log->l_iclog;
-	lowest_lsn = 0;
 	do {
-	    if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
-		lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
-		if ((lsn && !lowest_lsn) ||
-		    (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+		if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+			continue;
+
+		lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+		if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
 			lowest_lsn = lsn;
-		}
-	    }
-	    lsn_log = lsn_log->ic_next;
-	} while (lsn_log != log->l_iclog);
+	} while ((iclog = iclog->ic_next) != log->l_iclog);
+
 	return lowest_lsn;
 }
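An LSN packs a 32-bit cycle number in the high word and a block number in the low word, which makes comparisons like XFS_LSN_CMP cycle-major; a sketch of the layout:

#include <assert.h>
#include <stdint.h>

typedef int64_t xfs_lsn_t;

static uint32_t cycle_lsn(xfs_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
static uint32_t block_lsn(xfs_lsn_t lsn) { return (uint32_t)lsn; }

/* cycle-major compare, mirroring XFS_LSN_CMP */
static int lsn_cmp(xfs_lsn_t a, xfs_lsn_t b)
{
	if (cycle_lsn(a) != cycle_lsn(b))
		return cycle_lsn(a) < cycle_lsn(b) ? -1 : 1;
	if (block_lsn(a) != block_lsn(b))
		return block_lsn(a) < block_lsn(b) ? -1 : 1;
	return 0;
}

int main(void)
{
	xfs_lsn_t old = ((xfs_lsn_t)3 << 32) | 900;	/* cycle 3, block 900 */
	xfs_lsn_t new = ((xfs_lsn_t)4 << 32) | 10;	/* cycle 4, block 10 */

	assert(lsn_cmp(old, new) < 0);	/* later cycle wins despite lower block */
	return 0;
}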
 
+/*
+ * Completion of an iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		header_lsn)
+{
+	iclog->ic_state = XLOG_STATE_CALLBACK;
+
+	ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+			   header_lsn) <= 0);
+
+	if (list_empty_careful(&iclog->ic_callbacks))
+		return;
+
+	atomic64_set(&log->l_last_sync_lsn, header_lsn);
+	xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	struct xlog_in_core	*completed_iclog,
+	bool			*ioerror)
+{
+	xfs_lsn_t		lowest_lsn;
+	xfs_lsn_t		header_lsn;
+
+	/* Skip all iclogs in the ACTIVE & DIRTY states */
+	if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+		return false;
+
+	/*
+	 * Between marking a filesystem SHUTDOWN and stopping the log, we do
+	 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
+	 * want things to go smoothly in case of just a SHUTDOWN w/o a
+	 * LOG_IO_ERROR.
+	 */
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		*ioerror = true;
+		return false;
+	}
+
+	/*
+	 * Can only perform callbacks in order.  Since this iclog is not in the
+	 * DONE_SYNC/DO_CALLBACK state, we skip the rest and just try to clean
+	 * up.  If we set our iclog to DO_CALLBACK, we will not process it when
+	 * we retry since a previous iclog is in the CALLBACK and the state
+	 * cannot change since we are holding the l_icloglock.
+	 */
+	if (!(iclog->ic_state &
+			(XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
+		if (completed_iclog &&
+		    (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
+			completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
+		}
+		return true;
+	}
+
+	/*
+	 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
+	 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK) were
+	 * caught by the check above and are going to be cleaned up (i.e. we
+	 * aren't running their callbacks here).
+	 *
+	 * We will do one more check here to see if we have chased our tail
+	 * around. If this is not the lowest lsn iclog, then we will leave it
+	 * for another completion to process.
+	 */
+	header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+	lowest_lsn = xlog_get_lowest_lsn(log);
+	if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+		return false;
+
+	xlog_state_set_callback(log, iclog, header_lsn);
+	return false;
+
+}
+
+/*
+ * Keep processing entries in the iclog callback list until we come around and
+ * it is empty.  We need to atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more callbacks being added.
+ *
+ * This function is called with the icloglock held and returns with it held. We
+ * drop it while running callbacks, however, as holding it over thousands of
+ * callbacks is unnecessary and causes excessive contention if we do.
+ */
+static void
+xlog_state_do_iclog_callbacks(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	bool			aborted)
+{
+	spin_unlock(&log->l_icloglock);
+	spin_lock(&iclog->ic_callback_lock);
+	while (!list_empty(&iclog->ic_callbacks)) {
+		LIST_HEAD(tmp);
+
+		list_splice_init(&iclog->ic_callbacks, &tmp);
+
+		spin_unlock(&iclog->ic_callback_lock);
+		xlog_cil_process_committed(&tmp, aborted);
+		spin_lock(&iclog->ic_callback_lock);
+	}
+
+	/*
+	 * Pick up the icloglock while still holding the callback lock so we
+	 * serialise against anyone trying to add more callbacks to this iclog
+	 * now we've finished processing.
+	 */
+	spin_lock(&log->l_icloglock);
+	spin_unlock(&iclog->ic_callback_lock);
+}
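The batch-drain shape of that loop, detach everything under the lock and process it unlocked, can be modelled standalone (toy singly-linked list, not the kernel list API):

#include <stdio.h>
#include <stdlib.h>

struct cb { int id; struct cb *next; };

static void drain(struct cb **pending)
{
	while (*pending) {
		struct cb *batch = *pending;	/* like list_splice_init() */
		*pending = NULL;

		/* "callback lock dropped" here: new entries may queue up */
		while (batch) {
			struct cb *next = batch->next;

			printf("callback %d\n", batch->id);
			free(batch);
			batch = next;
		}
		/* "lock retaken": go around again if more arrived */
	}
}

int main(void)
{
	struct cb *list = NULL;

	for (int i = 3; i >= 1; i--) {
		struct cb *c = malloc(sizeof(*c));

		c->id = i;
		c->next = list;
		list = c;
	}
	drain(&list);	/* prints callbacks 1, 2, 3 in queue order */
	return 0;
}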
+
+#ifdef DEBUG
+/*
+ * Make one last gasp attempt to see if iclogs are being left in limbo.  If the
+ * above loop finds an iclog earlier than the current iclog and in one of the
+ * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
+ * are deferred to the completion of the earlier iclog. Walk the iclogs in order
+ * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
+ * one of the syncing states.
+ *
+ * Note that SYNCING|IOERROR is a valid state so we cannot just check for
+ * ic_state == SYNCING.
+ */
+static void
+xlog_state_callback_check_state(
+	struct xlog		*log)
+{
+	struct xlog_in_core	*first_iclog = log->l_iclog;
+	struct xlog_in_core	*iclog = first_iclog;
+
+	do {
+		ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+		/*
+		 * Terminate the loop if iclogs are found in states
+		 * which will cause other threads to clean up iclogs.
+		 *
+		 * SYNCING - i/o completion will go through logs
+		 * DONE_SYNC - interrupt thread should be waiting for
+		 *              l_icloglock
+		 * IOERROR - give up hope all ye who enter here
+		 */
+		if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+		    iclog->ic_state & XLOG_STATE_SYNCING ||
+		    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+		    iclog->ic_state == XLOG_STATE_IOERROR)
+			break;
+		iclog = iclog->ic_next;
+	} while (first_iclog != iclog);
+}
+#else
+#define xlog_state_callback_check_state(l)	((void)0)
+#endif
 
 STATIC void
 xlog_state_do_callback(
 	struct xlog		*log,
-	int			aborted,
+	bool			aborted,
 	struct xlog_in_core	*ciclog)
 {
-	xlog_in_core_t	   *iclog;
-	xlog_in_core_t	   *first_iclog;	/* used to know when we've
-						 * processed all iclogs once */
-	xfs_log_callback_t *cb, *cb_next;
-	int		   flushcnt = 0;
-	xfs_lsn_t	   lowest_lsn;
-	int		   ioerrors;	/* counter: iclogs with errors */
-	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
-	int		   funcdidcallbacks; /* flag: function did callbacks */
-	int		   repeats;	/* for issuing console warnings if
-					 * looping too many times */
-	int		   wake = 0;
+	struct xlog_in_core	*iclog;
+	struct xlog_in_core	*first_iclog;
+	bool			did_callbacks = false;
+	bool			cycled_icloglock;
+	bool			ioerror;
+	int			flushcnt = 0;
+	int			repeats = 0;
 
 	spin_lock(&log->l_icloglock);
-	first_iclog = iclog = log->l_iclog;
-	ioerrors = 0;
-	funcdidcallbacks = 0;
-	repeats = 0;
-
 	do {
 		/*
 		 * Scan all iclogs starting with the one pointed to by the
@@ -2729,143 +2860,34 @@
 		 */
 		first_iclog = log->l_iclog;
 		iclog = log->l_iclog;
-		loopdidcallbacks = 0;
+		cycled_icloglock = false;
+		ioerror = false;
 		repeats++;
 
 		do {
+			if (xlog_state_iodone_process_iclog(log, iclog,
+							ciclog, &ioerror))
+				break;
 
-			/* skip all iclogs in the ACTIVE & DIRTY states */
-			if (iclog->ic_state &
-			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+			if (!(iclog->ic_state &
+			      (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
 				iclog = iclog->ic_next;
 				continue;
 			}
 
 			/*
-			 * Between marking a filesystem SHUTDOWN and stopping
-			 * the log, we do flush all iclogs to disk (if there
-			 * wasn't a log I/O error). So, we do want things to
-			 * go smoothly in case of just a SHUTDOWN  w/o a
-			 * LOG_IO_ERROR.
+			 * Running callbacks will drop the icloglock which means
+			 * we'll have to run at least one more complete loop.
 			 */
-			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
-				/*
-				 * Can only perform callbacks in order.  Since
-				 * this iclog is not in the DONE_SYNC/
-				 * DO_CALLBACK state, we skip the rest and
-				 * just try to clean up.  If we set our iclog
-				 * to DO_CALLBACK, we will not process it when
-				 * we retry since a previous iclog is in the
-				 * CALLBACK and the state cannot change since
-				 * we are holding the l_icloglock.
-				 */
-				if (!(iclog->ic_state &
-					(XLOG_STATE_DONE_SYNC |
-						 XLOG_STATE_DO_CALLBACK))) {
-					if (ciclog && (ciclog->ic_state ==
-							XLOG_STATE_DONE_SYNC)) {
-						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
-					}
-					break;
-				}
-				/*
-				 * We now have an iclog that is in either the
-				 * DO_CALLBACK or DONE_SYNC states. The other
-				 * states (WANT_SYNC, SYNCING, or CALLBACK were
-				 * caught by the above if and are going to
-				 * clean (i.e. we aren't doing their callbacks)
-				 * see the above if.
-				 */
+			cycled_icloglock = true;
+			xlog_state_do_iclog_callbacks(log, iclog, aborted);
 
-				/*
-				 * We will do one more check here to see if we
-				 * have chased our tail around.
-				 */
-
-				lowest_lsn = xlog_get_lowest_lsn(log);
-				if (lowest_lsn &&
-				    XFS_LSN_CMP(lowest_lsn,
-						be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
-					iclog = iclog->ic_next;
-					continue; /* Leave this iclog for
-						   * another thread */
-				}
-
-				iclog->ic_state = XLOG_STATE_CALLBACK;
-
-
-				/*
-				 * Completion of a iclog IO does not imply that
-				 * a transaction has completed, as transactions
-				 * can be large enough to span many iclogs. We
-				 * cannot change the tail of the log half way
-				 * through a transaction as this may be the only
-				 * transaction in the log and moving th etail to
-				 * point to the middle of it will prevent
-				 * recovery from finding the start of the
-				 * transaction. Hence we should only update the
-				 * last_sync_lsn if this iclog contains
-				 * transaction completion callbacks on it.
-				 *
-				 * We have to do this before we drop the
-				 * icloglock to ensure we are the only one that
-				 * can update it.
-				 */
-				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
-					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-				if (iclog->ic_callback)
-					atomic64_set(&log->l_last_sync_lsn,
-						be64_to_cpu(iclog->ic_header.h_lsn));
-
-			} else
-				ioerrors++;
-
-			spin_unlock(&log->l_icloglock);
-
-			/*
-			 * Keep processing entries in the callback list until
-			 * we come around and it is empty.  We need to
-			 * atomically see that the list is empty and change the
-			 * state to DIRTY so that we don't miss any more
-			 * callbacks being added.
-			 */
-			spin_lock(&iclog->ic_callback_lock);
-			cb = iclog->ic_callback;
-			while (cb) {
-				iclog->ic_callback_tail = &(iclog->ic_callback);
-				iclog->ic_callback = NULL;
-				spin_unlock(&iclog->ic_callback_lock);
-
-				/* perform callbacks in the order given */
-				for (; cb; cb = cb_next) {
-					cb_next = cb->cb_next;
-					cb->cb_func(cb->cb_arg, aborted);
-				}
-				spin_lock(&iclog->ic_callback_lock);
-				cb = iclog->ic_callback;
-			}
-
-			loopdidcallbacks++;
-			funcdidcallbacks++;
-
-			spin_lock(&log->l_icloglock);
-			ASSERT(iclog->ic_callback == NULL);
-			spin_unlock(&iclog->ic_callback_lock);
-			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
-				iclog->ic_state = XLOG_STATE_DIRTY;
-
-			/*
-			 * Transition from DIRTY to ACTIVE if applicable.
-			 * NOP if STATE_IOERROR.
-			 */
-			xlog_state_clean_log(log);
-
-			/* wake up threads waiting in xfs_log_force() */
-			wake_up_all(&iclog->ic_force_wait);
-
+			xlog_state_clean_iclog(log, iclog);
 			iclog = iclog->ic_next;
 		} while (first_iclog != iclog);
 
+		did_callbacks |= cycled_icloglock;
+
 		if (repeats > 5000) {
 			flushcnt += repeats;
 			repeats = 0;
@@ -2873,50 +2895,15 @@
 				"%s: possible infinite loop (%d iterations)",
 				__func__, flushcnt);
 		}
-	} while (!ioerrors && loopdidcallbacks);
+	} while (!ioerror && cycled_icloglock);
 
-#ifdef DEBUG
-	/*
-	 * Make one last gasp attempt to see if iclogs are being left in limbo.
-	 * If the above loop finds an iclog earlier than the current iclog and
-	 * in one of the syncing states, the current iclog is put into
-	 * DO_CALLBACK and the callbacks are deferred to the completion of the
-	 * earlier iclog. Walk the iclogs in order and make sure that no iclog
-	 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
-	 * states.
-	 *
-	 * Note that SYNCING|IOABORT is a valid state so we cannot just check
-	 * for ic_state == SYNCING.
-	 */
-	if (funcdidcallbacks) {
-		first_iclog = iclog = log->l_iclog;
-		do {
-			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
-			/*
-			 * Terminate the loop if iclogs are found in states
-			 * which will cause other threads to clean up iclogs.
-			 *
-			 * SYNCING - i/o completion will go through logs
-			 * DONE_SYNC - interrupt thread should be waiting for
-			 *              l_icloglock
-			 * IOERROR - give up hope all ye who enter here
-			 */
-			if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-			    iclog->ic_state & XLOG_STATE_SYNCING ||
-			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
-			    iclog->ic_state == XLOG_STATE_IOERROR )
-				break;
-			iclog = iclog->ic_next;
-		} while (first_iclog != iclog);
-	}
-#endif
+	if (did_callbacks)
+		xlog_state_callback_check_state(log);
 
 	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
-		wake = 1;
-	spin_unlock(&log->l_icloglock);
-
-	if (wake)
 		wake_up_all(&log->l_flush_wait);
+
+	spin_unlock(&log->l_icloglock);
 }
 
 
@@ -2935,18 +2922,16 @@
  */
 STATIC void
 xlog_state_done_syncing(
-	xlog_in_core_t	*iclog,
-	int		aborted)
+	struct xlog_in_core	*iclog,
+	bool			aborted)
 {
-	struct xlog	   *log = iclog->ic_log;
+	struct xlog		*log = iclog->ic_log;
 
 	spin_lock(&log->l_icloglock);
 
 	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
 	       iclog->ic_state == XLOG_STATE_IOERROR);
 	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
-	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
-
 
 	/*
 	 * If we got an error, either on the first buffer, or in the case of
@@ -2954,13 +2939,8 @@
 	 * and none should ever be attempted to be written to disk
 	 * again.
 	 */
-	if (iclog->ic_state != XLOG_STATE_IOERROR) {
-		if (--iclog->ic_bwritecnt == 1) {
-			spin_unlock(&log->l_icloglock);
-			return;
-		}
+	if (iclog->ic_state != XLOG_STATE_IOERROR)
 		iclog->ic_state = XLOG_STATE_DONE_SYNC;
-	}
 
 	/*
 	 * Someone could be sleeping prior to writing out the next
@@ -3229,7 +3209,7 @@
 	 * flags after this point.
 	 */
 	if (sync)
-		return xlog_sync(log, iclog);
+		xlog_sync(log, iclog);
 	return 0;
 }	/* xlog_state_release_iclog */
 
@@ -3820,8 +3800,7 @@
 xlog_verify_iclog(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog,
-	int			count,
-	bool                    syncing)
+	int			count)
 {
 	xlog_op_header_t	*ophead;
 	xlog_in_core_t		*icptr;
@@ -3865,7 +3844,7 @@
 		/* clientid is only 1 byte */
 		p = &ophead->oh_clientid;
 		field_offset = p - base_ptr;
-		if (!syncing || (field_offset & 0x1ff)) {
+		if (field_offset & 0x1ff) {
 			clientid = ophead->oh_clientid;
 		} else {
 			idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
@@ -3888,7 +3867,7 @@
 		/* check length */
 		p = &ophead->oh_len;
 		field_offset = p - base_ptr;
-		if (!syncing || (field_offset & 0x1ff)) {
+		if (field_offset & 0x1ff) {
 			op_len = be32_to_cpu(ophead->oh_len);
 		} else {
 			idx = BTOBBT((uintptr_t)&ophead->oh_len -
@@ -4024,8 +4003,10 @@
 	 * item committed callback functions will do this again under lock to
 	 * avoid races.
 	 */
+	spin_lock(&log->l_cilp->xc_push_lock);
 	wake_up_all(&log->l_cilp->xc_commit_wait);
-	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+	spin_unlock(&log->l_cilp->xc_push_lock);
+	xlog_state_do_callback(log, true, NULL);
 
 #ifdef XFSERRORDEBUG
 	{
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 73a64bf..84e0680 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -6,6 +6,8 @@
 #ifndef	__XFS_LOG_H__
 #define __XFS_LOG_H__
 
+struct xfs_cil_ctx;
+
 struct xfs_log_vec {
 	struct xfs_log_vec	*lv_next;	/* next lv in build list */
 	int			lv_niovecs;	/* number of iovecs in lv */
@@ -72,16 +74,6 @@
 }
 
 /*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
-	struct xfs_log_callback	*cb_next;
-	void			(*cb_func)(void *, int);
-	void			*cb_arg;
-} xfs_log_callback_t;
-
-/*
  * By comparing each component, we don't have to worry about extra
  * endian issues in treating two 32 bit numbers as one 64 bit number
  */
@@ -125,12 +117,10 @@
 			xfs_daddr_t		start_block,
 			int		 	num_bblocks);
 int	  xfs_log_mount_finish(struct xfs_mount *mp);
-int	xfs_log_mount_cancel(struct xfs_mount *);
+void	xfs_log_mount_cancel(struct xfs_mount *);
 xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
 xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
 void	  xfs_log_space_wake(struct xfs_mount *mp);
-int	  xfs_log_notify(struct xlog_in_core	*iclog,
-			 struct xfs_log_callback *callback_entry);
 int	  xfs_log_release_iclog(struct xfs_mount *mp,
 			 struct xlog_in_core	 *iclog);
 int	  xfs_log_reserve(struct xfs_mount *mp,
@@ -148,6 +138,7 @@
 
 void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_lsn_t *commit_lsn, bool regrant);
+void	xlog_cil_process_committed(struct list_head *list, bool aborted);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void	xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d3884e0..ef652ab 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -10,10 +10,7 @@
 #include "xfs_shared.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
-#include "xfs_discard.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
@@ -41,7 +38,7 @@
 	struct xlog_ticket *tic;
 
 	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
-				KM_SLEEP|KM_NOFS);
+				KM_NOFS);
 
 	/*
 	 * set the current reservation to zero so we know to steal the basic
@@ -189,7 +186,7 @@
 			 */
 			kmem_free(lip->li_lv_shadow);
 
-			lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
+			lv = kmem_alloc_large(buf_size, KM_NOFS);
 			memset(lv, 0, xlog_cil_iovec_space(niovecs));
 
 			lv->lv_item = lip;
@@ -246,7 +243,8 @@
 	 * shadow buffer, so update the pointer to it appropriately.
 	 */
 	if (!old_lv) {
-		lv->lv_item->li_ops->iop_pin(lv->lv_item);
+		if (lv->lv_item->li_ops->iop_pin)
+			lv->lv_item->li_ops->iop_pin(lv->lv_item);
 		lv->lv_item->li_lv_shadow = NULL;
 	} else if (old_lv != lv) {
 		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
@@ -576,12 +574,24 @@
  */
 static void
 xlog_cil_committed(
-	void	*args,
-	int	abort)
+	struct xfs_cil_ctx	*ctx,
+	bool			abort)
 {
-	struct xfs_cil_ctx	*ctx = args;
 	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
 
+	/*
+	 * If the I/O failed, we're aborting the commit and already shutdown.
+	 * Wake any commit waiters before aborting the log items so we don't
+	 * block async log pushers on callbacks. Async log pushers explicitly do
+	 * not wait on log force completion because they may be holding locks
+	 * required to unpin items.
+	 */
+	if (abort) {
+		spin_lock(&ctx->cil->xc_push_lock);
+		wake_up_all(&ctx->cil->xc_commit_wait);
+		spin_unlock(&ctx->cil->xc_push_lock);
+	}
+
 	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
 					ctx->start_lsn, abort);
 
@@ -589,15 +599,7 @@
 	xfs_extent_busy_clear(mp, &ctx->busy_extents,
 			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
 
-	/*
-	 * If we are aborting the commit, wake up anyone waiting on the
-	 * committing list.  If we don't, then a shutdown we can leave processes
-	 * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
-	 * will never happen because we aborted it.
-	 */
 	spin_lock(&ctx->cil->xc_push_lock);
-	if (abort)
-		wake_up_all(&ctx->cil->xc_commit_wait);
 	list_del(&ctx->committing);
 	spin_unlock(&ctx->cil->xc_push_lock);
 
@@ -609,6 +611,20 @@
 		kmem_free(ctx);
 }
 
+void
+xlog_cil_process_committed(
+	struct list_head	*list,
+	bool			aborted)
+{
+	struct xfs_cil_ctx	*ctx;
+
+	while ((ctx = list_first_entry_or_null(list,
+			struct xfs_cil_ctx, iclog_entry))) {
+		list_del(&ctx->iclog_entry);
+		xlog_cil_committed(ctx, aborted);
+	}
+}
+
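
xlog_cil_process_committed() drains the list by repeatedly detaching the head entry rather than iterating, which stays safe even though each call to xlog_cil_committed() frees the entry. A minimal userspace analog of that pop-until-empty shape, using an invented node type:

#include <stdlib.h>

struct node {
	struct node	*next;
	/* payload ... */
};

static void process_all(struct node **head, void (*fn)(struct node *))
{
	struct node	*n;

	/*
	 * Unlink the first entry before handing it to fn(); fn() may
	 * free it, and the list head stays valid throughout.
	 */
	while ((n = *head) != NULL) {
		*head = n->next;
		fn(n);
	}
}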
 /*
  * Push the Committed Item List to the log. If the @push_seq flag is zero, then
  * it is a background flush and so we can choose to ignore it. Otherwise, if the
@@ -644,7 +660,7 @@
 	if (!cil)
 		return 0;
 
-	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
 	down_write(&cil->xc_ctx_lock);
@@ -830,12 +846,15 @@
 	if (commit_lsn == -1)
 		goto out_abort;
 
-	/* attach all the transactions w/ busy extents to iclog */
-	ctx->log_cb.cb_func = xlog_cil_committed;
-	ctx->log_cb.cb_arg = ctx;
-	error = xfs_log_notify(commit_iclog, &ctx->log_cb);
-	if (error)
+	spin_lock(&commit_iclog->ic_callback_lock);
+	if (commit_iclog->ic_state & XLOG_STATE_IOERROR) {
+		spin_unlock(&commit_iclog->ic_callback_lock);
 		goto out_abort;
+	}
+	ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
+		      commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
+	list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
+	spin_unlock(&commit_iclog->ic_callback_lock);
 
 	/*
 	 * now the checkpoint commit is complete and we've attached the
@@ -859,7 +878,7 @@
 out_abort_free_ticket:
 	xfs_log_ticket_put(tic);
 out_abort:
-	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	xlog_cil_committed(ctx, true);
 	return -EIO;
 }
 
@@ -979,6 +998,7 @@
 {
 	struct xlog		*log = mp->m_log;
 	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_item	*lip, *next;
 	xfs_lsn_t		xc_commit_lsn;
 
 	/*
@@ -1003,7 +1023,7 @@
 
 	/*
 	 * Once all the items of the transaction have been copied to the CIL,
-	 * the items can be unlocked and freed.
+	 * the items can be unlocked and possibly freed.
 	 *
 	 * This needs to be done before we drop the CIL context lock because we
 	 * have to update state in the log items and unlock them before they go
@@ -1012,8 +1032,12 @@
 	 * the log items. This affects (at least) processing of stale buffers,
 	 * inodes and EFIs.
 	 */
-	xfs_trans_free_items(tp, xc_commit_lsn, false);
-
+	trace_xfs_trans_commit_items(tp, _RET_IP_);
+	list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+		xfs_trans_del_item(lip);
+		if (lip->li_ops->iop_committing)
+			lip->li_ops->iop_committing(lip, xc_commit_lsn);
+	}
 	xlog_cil_push_background(log);
 
 	up_read(&cil->xc_ctx_lock);
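
The open-coded commit loop above needs the _safe iterator because xfs_trans_del_item() unlinks the current entry; the next pointer must be captured before the unlink. The same idiom on a freestanding circular doubly linked list (names invented for the sketch):

struct dnode {
	struct dnode	*prev, *next;
};

/* Detach every node from the list headed at the sentinel 'head'. */
static void detach_all(struct dnode *head, void (*committing)(struct dnode *))
{
	struct dnode	*n, *next;

	for (n = head->next; n != head; n = next) {
		next = n->next;			/* capture before unlink */
		n->prev->next = n->next;
		n->next->prev = n->prev;
		n->prev = n->next = n;
		if (committing)			/* optional, like ->iop_committing */
			committing(n);
	}
}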
@@ -1155,11 +1179,11 @@
 	struct xfs_cil	*cil;
 	struct xfs_cil_ctx *ctx;
 
-	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
 	if (!cil)
 		return -ENOMEM;
 
-	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
 	if (!ctx) {
 		kmem_free(cil);
 		return -ENOMEM;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b5f82cb..b880c23 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -10,7 +10,6 @@
 struct xlog;
 struct xlog_ticket;
 struct xfs_mount;
-struct xfs_log_callback;
 
 /*
  * Flags for log structure
@@ -50,7 +49,6 @@
 #define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
 #define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
 #define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT   0x0100 /* force abort on I/O completion (debug) */
 #define XLOG_STATE_ALL	     0x7FFF /* All possible valid flags */
 #define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
 
@@ -179,11 +177,10 @@
  *	the iclog.
  * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
  * - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
  * - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- *	called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
+ * - ic_io_size is the size of the currently pending log buffer write, which
+ *	might be smaller than ic_size.
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
@@ -193,7 +190,7 @@
  * structure cacheline aligned. The following fields can be contended on
  * by independent processes:
  *
- *	- ic_callback_*
+ *	- ic_callbacks
  *	- ic_refcnt
  *	- fields protected by the global l_icloglock
  *
@@ -206,23 +203,28 @@
 	wait_queue_head_t	ic_write_wait;
 	struct xlog_in_core	*ic_next;
 	struct xlog_in_core	*ic_prev;
-	struct xfs_buf		*ic_bp;
 	struct xlog		*ic_log;
-	int			ic_size;
-	int			ic_offset;
-	int			ic_bwritecnt;
+	u32			ic_size;
+	u32			ic_io_size;
+	u32			ic_offset;
 	unsigned short		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
 
 	/* Callback structures need their own cacheline */
 	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
-	struct xfs_log_callback	*ic_callback;
-	struct xfs_log_callback	**ic_callback_tail;
+	struct list_head	ic_callbacks;
 
 	/* reference counts need their own cacheline */
 	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
 	xlog_in_core_2_t	*ic_data;
 #define ic_header	ic_data->hic_header
+#ifdef DEBUG
+	bool			ic_fail_crc : 1;
+#endif
+	struct semaphore	ic_sema;
+	struct work_struct	ic_end_io_work;
+	struct bio		ic_bio;
+	struct bio_vec		ic_bvec[];
 } xlog_in_core_t;
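
ic_bvec[] is a C99 flexible array member, so an iclog and its bio vectors now come from a single allocation sized at runtime. A self-contained sketch of that allocation pattern (types here are illustrative only):

#include <stdlib.h>

struct vec {
	void		*addr;
	unsigned int	len;
};

struct iclog_like {
	int		nvecs;
	struct vec	bvec[];		/* flexible array member */
};

static struct iclog_like *iclog_like_alloc(int nvecs)
{
	/* One allocation covers the header plus nvecs trailing vectors. */
	struct iclog_like *ic =
		calloc(1, sizeof(*ic) + nvecs * sizeof(struct vec));

	if (ic)
		ic->nvecs = nvecs;
	return ic;
}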
 
 /*
@@ -243,7 +245,7 @@
 	int			space_used;	/* aggregate size of regions */
 	struct list_head	busy_extents;	/* busy extents in chkpt */
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
-	struct xfs_log_callback	log_cb;		/* completion callback hook. */
+	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
 	struct work_struct	discard_endio_work;
 };
@@ -350,9 +352,8 @@
 	struct xfs_mount	*l_mp;	        /* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
 	struct xfs_cil		*l_cilp;	/* CIL log is working with */
-	struct xfs_buf		*l_xbuf;        /* extra buffer for log
-						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	struct workqueue_struct	*l_ioend_workqueue; /* for I/O completions */
 	struct delayed_work	l_work;		/* background flush work */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
@@ -361,7 +362,6 @@
 	int			l_iclog_heads;  /* # of iclog header sectors */
 	uint			l_sectBBsize;   /* sector size in BBs (2^n) */
 	int			l_iclog_size;	/* size of log in bytes */
-	int			l_iclog_size_log; /* log power size of log */
 	int			l_iclog_bufs;	/* number of iclog buffers */
 	xfs_daddr_t		l_logBBstart;   /* start block of log */
 	int			l_logsize;      /* size of log in bytes */
@@ -418,7 +418,7 @@
 extern int
 xlog_recover_finish(
 	struct xlog		*log);
-extern int
+extern void
 xlog_recover_cancel(struct xlog *);
 
 extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fc9e90..c1a514f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -13,8 +13,6 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_log.h"
@@ -26,7 +24,6 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_btree.h"
@@ -79,7 +76,7 @@
  * are valid, false otherwise.
  */
 static inline bool
-xlog_verify_bp(
+xlog_verify_bno(
 	struct xlog	*log,
 	xfs_daddr_t	blk_no,
 	int		bbcount)
@@ -92,22 +89,21 @@
 }
 
 /*
- * Allocate a buffer to hold log data.  The buffer needs to be able
- * to map to a range of nbblks basic blocks at any valid (basic
- * block) offset within the log.
+ * Allocate a buffer to hold log data.  The buffer needs to be able to map to
+ * a range of nbblks basic blocks at any valid offset within the log.
  */
-STATIC xfs_buf_t *
-xlog_get_bp(
+static char *
+xlog_alloc_buffer(
 	struct xlog	*log,
 	int		nbblks)
 {
-	struct xfs_buf	*bp;
+	int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
 
 	/*
 	 * Pass log block 0 since we don't have an addr yet, buffer will be
 	 * verified on read.
 	 */
-	if (!xlog_verify_bp(log, 0, nbblks)) {
+	if (!xlog_verify_bno(log, 0, nbblks)) {
 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
@@ -115,69 +111,48 @@
 	}
 
 	/*
-	 * We do log I/O in units of log sectors (a power-of-2
-	 * multiple of the basic block size), so we round up the
-	 * requested size to accommodate the basic blocks required
-	 * for complete log sectors.
+	 * We do log I/O in units of log sectors (a power-of-2 multiple of the
+	 * basic block size), so we round up the requested size to accommodate
+	 * the basic blocks required for complete log sectors.
 	 *
-	 * In addition, the buffer may be used for a non-sector-
-	 * aligned block offset, in which case an I/O of the
-	 * requested size could extend beyond the end of the
-	 * buffer.  If the requested size is only 1 basic block it
-	 * will never straddle a sector boundary, so this won't be
-	 * an issue.  Nor will this be a problem if the log I/O is
-	 * done in basic blocks (sector size 1).  But otherwise we
-	 * extend the buffer by one extra log sector to ensure
-	 * there's space to accommodate this possibility.
+	 * In addition, the buffer may be used for a non-sector-aligned block
+	 * offset, in which case an I/O of the requested size could extend
+	 * beyond the end of the buffer.  If the requested size is only 1 basic
+	 * block it will never straddle a sector boundary, so this won't be an
+	 * issue.  Nor will this be a problem if the log I/O is done in basic
+	 * blocks (sector size 1).  But otherwise we extend the buffer by one
+	 * extra log sector to ensure there's space to accommodate this
+	 * possibility.
 	 */
 	if (nbblks > 1 && log->l_sectBBsize > 1)
 		nbblks += log->l_sectBBsize;
 	nbblks = round_up(nbblks, log->l_sectBBsize);
-
-	bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
-	if (bp)
-		xfs_buf_unlock(bp);
-	return bp;
-}
-
-STATIC void
-xlog_put_bp(
-	xfs_buf_t	*bp)
-{
-	xfs_buf_free(bp);
+	return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
 }
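
The sizing rule the comment spells out — round up to whole log sectors, plus one spare sector when a multi-block read may start mis-aligned — reduces to a few lines. A sketch assuming 512-byte basic blocks and a power-of-two sector size in basic blocks (sect_bb):

/* Bytes to allocate for reading nbblks basic blocks of log data. */
static unsigned int log_buf_bytes(unsigned int nbblks, unsigned int sect_bb)
{
	if (nbblks > 1 && sect_bb > 1)
		nbblks += sect_bb;	/* worst-case sector straddle */
	nbblks = (nbblks + sect_bb - 1) & ~(sect_bb - 1);	/* round_up */
	return nbblks << 9;	/* BBTOB: basic blocks are 512 bytes */
}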
 
 /*
  * Return the address of the start of the given block number's data
  * in a log buffer.  The buffer covers a log sector-aligned region.
  */
-STATIC char *
+static inline unsigned int
 xlog_align(
 	struct xlog	*log,
-	xfs_daddr_t	blk_no,
-	int		nbblks,
-	struct xfs_buf	*bp)
+	xfs_daddr_t	blk_no)
 {
-	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
-
-	ASSERT(offset + nbblks <= bp->b_length);
-	return bp->b_addr + BBTOB(offset);
+	return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
 }
 
-
-/*
- * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
- */
-STATIC int
-xlog_bread_noalign(
-	struct xlog	*log,
-	xfs_daddr_t	blk_no,
-	int		nbblks,
-	struct xfs_buf	*bp)
+static int
+xlog_do_io(
+	struct xlog		*log,
+	xfs_daddr_t		blk_no,
+	unsigned int		nbblks,
+	char			*data,
+	unsigned int		op)
 {
-	int		error;
+	int			error;
 
-	if (!xlog_verify_bp(log, blk_no, nbblks)) {
+	if (!xlog_verify_bno(log, blk_no, nbblks)) {
 		xfs_warn(log->l_mp,
 			 "Invalid log block/length (0x%llx, 0x%x) for buffer",
 			 blk_no, nbblks);
@@ -187,107 +162,53 @@
 
 	blk_no = round_down(blk_no, log->l_sectBBsize);
 	nbblks = round_up(nbblks, log->l_sectBBsize);
-
 	ASSERT(nbblks > 0);
-	ASSERT(nbblks <= bp->b_length);
 
-	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	bp->b_flags |= XBF_READ;
-	bp->b_io_length = nbblks;
-	bp->b_error = 0;
-
-	error = xfs_buf_submit(bp);
-	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
-		xfs_buf_ioerror_alert(bp, __func__);
+	error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
+			BBTOB(nbblks), data, op);
+	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_alert(log->l_mp,
+			  "log recovery %s I/O error at daddr 0x%llx len %d error %d",
+			  op == REQ_OP_WRITE ? "write" : "read",
+			  blk_no, nbblks, error);
+	}
 	return error;
 }
 
 STATIC int
+xlog_bread_noalign(
+	struct xlog	*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	char		*data)
+{
+	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+}
+
+STATIC int
 xlog_bread(
 	struct xlog	*log,
 	xfs_daddr_t	blk_no,
 	int		nbblks,
-	struct xfs_buf	*bp,
+	char		*data,
 	char		**offset)
 {
 	int		error;
 
-	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
-	if (error)
-		return error;
-
-	*offset = xlog_align(log, blk_no, nbblks, bp);
-	return 0;
+	error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+	if (!error)
+		*offset = data + xlog_align(log, blk_no);
+	return error;
 }
 
-/*
- * Read at an offset into the buffer. Returns with the buffer in it's original
- * state regardless of the result of the read.
- */
-STATIC int
-xlog_bread_offset(
-	struct xlog	*log,
-	xfs_daddr_t	blk_no,		/* block to read from */
-	int		nbblks,		/* blocks to read */
-	struct xfs_buf	*bp,
-	char		*offset)
-{
-	char		*orig_offset = bp->b_addr;
-	int		orig_len = BBTOB(bp->b_length);
-	int		error, error2;
-
-	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
-	if (error)
-		return error;
-
-	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
-
-	/* must reset buffer pointer even on error */
-	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
-	if (error)
-		return error;
-	return error2;
-}
-
-/*
- * Write out the buffer at the given block for the given number of blocks.
- * The buffer is kept locked across the write and is returned locked.
- * This can only be used for synchronous log writes.
- */
 STATIC int
 xlog_bwrite(
 	struct xlog	*log,
 	xfs_daddr_t	blk_no,
 	int		nbblks,
-	struct xfs_buf	*bp)
+	char		*data)
 {
-	int		error;
-
-	if (!xlog_verify_bp(log, blk_no, nbblks)) {
-		xfs_warn(log->l_mp,
-			 "Invalid log block/length (0x%llx, 0x%x) for buffer",
-			 blk_no, nbblks);
-		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
-		return -EFSCORRUPTED;
-	}
-
-	blk_no = round_down(blk_no, log->l_sectBBsize);
-	nbblks = round_up(nbblks, log->l_sectBBsize);
-
-	ASSERT(nbblks > 0);
-	ASSERT(nbblks <= bp->b_length);
-
-	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	xfs_buf_hold(bp);
-	xfs_buf_lock(bp);
-	bp->b_io_length = nbblks;
-	bp->b_error = 0;
-
-	error = xfs_bwrite(bp);
-	if (error)
-		xfs_buf_ioerror_alert(bp, __func__);
-	xfs_buf_relse(bp);
-	return error;
+	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
 }
 
 #ifdef DEBUG
@@ -377,10 +298,9 @@
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
 		 */
-		if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
 			xfs_buf_ioerror_alert(bp, __func__);
-			xfs_force_shutdown(bp->b_target->bt_mount,
-						SHUTDOWN_META_IO_ERROR);
+			xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
 		}
 	}
 
@@ -405,7 +325,7 @@
 STATIC int
 xlog_find_cycle_start(
 	struct xlog	*log,
-	struct xfs_buf	*bp,
+	char		*buffer,
 	xfs_daddr_t	first_blk,
 	xfs_daddr_t	*last_blk,
 	uint		cycle)
@@ -419,7 +339,7 @@
 	end_blk = *last_blk;
 	mid_blk = BLK_AVG(first_blk, end_blk);
 	while (mid_blk != first_blk && mid_blk != end_blk) {
-		error = xlog_bread(log, mid_blk, 1, bp, &offset);
+		error = xlog_bread(log, mid_blk, 1, buffer, &offset);
 		if (error)
 			return error;
 		mid_cycle = xlog_get_cycle(offset);
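
xlog_find_cycle_start(), whose read-and-narrow loop the hunk above touches, is a binary search for the block where the cycle number changes. Abstracted away from the I/O (read_cycle() stands in for the xlog_bread()/xlog_get_cycle() pair and is an assumption of the sketch, not a real API):

static long find_cycle_start(long first, long last, unsigned int cycle,
			     unsigned int (*read_cycle)(long blk))
{
	long	mid = (first + last) / 2;

	while (mid != first && mid != last) {
		if (read_cycle(mid) == cycle)
			last = mid;	/* change point at or before mid */
		else
			first = mid;	/* change point after mid */
		mid = (first + last) / 2;
	}
	return last;
}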
@@ -455,7 +375,7 @@
 {
 	xfs_daddr_t	i, j;
 	uint		cycle;
-	xfs_buf_t	*bp;
+	char		*buffer;
 	xfs_daddr_t	bufblks;
 	char		*buf = NULL;
 	int		error = 0;
@@ -469,7 +389,7 @@
 	bufblks = 1 << ffs(nbblks);
 	while (bufblks > log->l_logBBsize)
 		bufblks >>= 1;
-	while (!(bp = xlog_get_bp(log, bufblks))) {
+	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
 		bufblks >>= 1;
 		if (bufblks < log->l_sectBBsize)
 			return -ENOMEM;
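
This retry loop halves the buffer on each allocation failure; any power-of-two size above one sector works for the scan, so the code trades speed for a better chance of success under memory pressure. The same fallback in isolation (512-byte basic blocks assumed):

#include <stdlib.h>

static char *alloc_scan_buffer(unsigned int *bufblks, unsigned int min_blks)
{
	char	*buf;

	/* Halve and retry on failure; below min_blks the scan can't work. */
	while (!(buf = malloc((size_t)*bufblks << 9))) {
		*bufblks >>= 1;
		if (*bufblks < min_blks)
			return NULL;
	}
	return buf;
}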
@@ -480,7 +400,7 @@
 
 		bcount = min(bufblks, (start_blk + nbblks - i));
 
-		error = xlog_bread(log, i, bcount, bp, &buf);
+		error = xlog_bread(log, i, bcount, buffer, &buf);
 		if (error)
 			goto out;
 
@@ -498,7 +418,7 @@
 	*new_blk = -1;
 
 out:
-	xlog_put_bp(bp);
+	kmem_free(buffer);
 	return error;
 }
 
@@ -522,7 +442,7 @@
 	int			extra_bblks)
 {
 	xfs_daddr_t		i;
-	xfs_buf_t		*bp;
+	char			*buffer;
 	char			*offset = NULL;
 	xlog_rec_header_t	*head = NULL;
 	int			error = 0;
@@ -532,12 +452,14 @@
 
 	ASSERT(start_blk != 0 || *last_blk != start_blk);
 
-	if (!(bp = xlog_get_bp(log, num_blks))) {
-		if (!(bp = xlog_get_bp(log, 1)))
+	buffer = xlog_alloc_buffer(log, num_blks);
+	if (!buffer) {
+		buffer = xlog_alloc_buffer(log, 1);
+		if (!buffer)
 			return -ENOMEM;
 		smallmem = 1;
 	} else {
-		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+		error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
 		if (error)
 			goto out;
 		offset += ((num_blks - 1) << BBSHIFT);
@@ -554,7 +476,7 @@
 		}
 
 		if (smallmem) {
-			error = xlog_bread(log, i, 1, bp, &offset);
+			error = xlog_bread(log, i, 1, buffer, &offset);
 			if (error)
 				goto out;
 		}
@@ -607,7 +529,7 @@
 		*last_blk = i;
 
 out:
-	xlog_put_bp(bp);
+	kmem_free(buffer);
 	return error;
 }
 
@@ -629,7 +551,7 @@
 	struct xlog	*log,
 	xfs_daddr_t	*return_head_blk)
 {
-	xfs_buf_t	*bp;
+	char		*buffer;
 	char		*offset;
 	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
 	int		num_scan_bblks;
@@ -659,20 +581,20 @@
 	}
 
 	first_blk = 0;			/* get cycle # of 1st block */
-	bp = xlog_get_bp(log, 1);
-	if (!bp)
+	buffer = xlog_alloc_buffer(log, 1);
+	if (!buffer)
 		return -ENOMEM;
 
-	error = xlog_bread(log, 0, 1, bp, &offset);
+	error = xlog_bread(log, 0, 1, buffer, &offset);
 	if (error)
-		goto bp_err;
+		goto out_free_buffer;
 
 	first_half_cycle = xlog_get_cycle(offset);
 
 	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
-	error = xlog_bread(log, last_blk, 1, bp, &offset);
+	error = xlog_bread(log, last_blk, 1, buffer, &offset);
 	if (error)
-		goto bp_err;
+		goto out_free_buffer;
 
 	last_half_cycle = xlog_get_cycle(offset);
 	ASSERT(last_half_cycle != 0);
@@ -740,9 +662,10 @@
 		 *                           ^ we want to locate this spot
 		 */
 		stop_on_cycle = last_half_cycle;
-		if ((error = xlog_find_cycle_start(log, bp, first_blk,
-						&head_blk, last_half_cycle)))
-			goto bp_err;
+		error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
+				last_half_cycle);
+		if (error)
+			goto out_free_buffer;
 	}
 
 	/*
@@ -762,7 +685,7 @@
 		if ((error = xlog_find_verify_cycle(log,
 						start_blk, num_scan_bblks,
 						stop_on_cycle, &new_blk)))
-			goto bp_err;
+			goto out_free_buffer;
 		if (new_blk != -1)
 			head_blk = new_blk;
 	} else {		/* need to read 2 parts of log */
@@ -799,7 +722,7 @@
 		if ((error = xlog_find_verify_cycle(log, start_blk,
 					num_scan_bblks - (int)head_blk,
 					(stop_on_cycle - 1), &new_blk)))
-			goto bp_err;
+			goto out_free_buffer;
 		if (new_blk != -1) {
 			head_blk = new_blk;
 			goto validate_head;
@@ -815,7 +738,7 @@
 		if ((error = xlog_find_verify_cycle(log,
 					start_blk, (int)head_blk,
 					stop_on_cycle, &new_blk)))
-			goto bp_err;
+			goto out_free_buffer;
 		if (new_blk != -1)
 			head_blk = new_blk;
 	}
@@ -834,13 +757,13 @@
 		if (error == 1)
 			error = -EIO;
 		if (error)
-			goto bp_err;
+			goto out_free_buffer;
 	} else {
 		start_blk = 0;
 		ASSERT(head_blk <= INT_MAX);
 		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 		if (error < 0)
-			goto bp_err;
+			goto out_free_buffer;
 		if (error == 1) {
 			/* We hit the beginning of the log during our search */
 			start_blk = log_bbnum - (num_scan_bblks - head_blk);
@@ -853,14 +776,14 @@
 			if (error == 1)
 				error = -EIO;
 			if (error)
-				goto bp_err;
+				goto out_free_buffer;
 			if (new_blk != log_bbnum)
 				head_blk = new_blk;
 		} else if (error)
-			goto bp_err;
+			goto out_free_buffer;
 	}
 
-	xlog_put_bp(bp);
+	kmem_free(buffer);
 	if (head_blk == log_bbnum)
 		*return_head_blk = 0;
 	else
@@ -873,9 +796,8 @@
 	 */
 	return 0;
 
- bp_err:
-	xlog_put_bp(bp);
-
+out_free_buffer:
+	kmem_free(buffer);
 	if (error)
 		xfs_warn(log->l_mp, "failed to find log head");
 	return error;
@@ -895,7 +817,7 @@
 	xfs_daddr_t		head_blk,
 	xfs_daddr_t		tail_blk,
 	int			count,
-	struct xfs_buf		*bp,
+	char			*buffer,
 	xfs_daddr_t		*rblk,
 	struct xlog_rec_header	**rhead,
 	bool			*wrapped)
@@ -914,7 +836,7 @@
 	 */
 	end_blk = head_blk > tail_blk ? tail_blk : 0;
 	for (i = (int) head_blk - 1; i >= end_blk; i--) {
-		error = xlog_bread(log, i, 1, bp, &offset);
+		error = xlog_bread(log, i, 1, buffer, &offset);
 		if (error)
 			goto out_error;
 
@@ -933,7 +855,7 @@
 	 */
 	if (tail_blk >= head_blk && found != count) {
 		for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
-			error = xlog_bread(log, i, 1, bp, &offset);
+			error = xlog_bread(log, i, 1, buffer, &offset);
 			if (error)
 				goto out_error;
 
@@ -969,7 +891,7 @@
 	xfs_daddr_t		head_blk,
 	xfs_daddr_t		tail_blk,
 	int			count,
-	struct xfs_buf		*bp,
+	char			*buffer,
 	xfs_daddr_t		*rblk,
 	struct xlog_rec_header	**rhead,
 	bool			*wrapped)
@@ -988,7 +910,7 @@
 	 */
 	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 	for (i = (int) tail_blk; i <= end_blk; i++) {
-		error = xlog_bread(log, i, 1, bp, &offset);
+		error = xlog_bread(log, i, 1, buffer, &offset);
 		if (error)
 			goto out_error;
 
@@ -1006,7 +928,7 @@
 	 */
 	if (tail_blk > head_blk && found != count) {
 		for (i = 0; i < (int) head_blk; i++) {
-			error = xlog_bread(log, i, 1, bp, &offset);
+			error = xlog_bread(log, i, 1, buffer, &offset);
 			if (error)
 				goto out_error;
 
@@ -1069,22 +991,22 @@
 	int			hsize)
 {
 	struct xlog_rec_header	*thead;
-	struct xfs_buf		*bp;
+	char			*buffer;
 	xfs_daddr_t		first_bad;
 	int			error = 0;
 	bool			wrapped;
 	xfs_daddr_t		tmp_tail;
 	xfs_daddr_t		orig_tail = *tail_blk;
 
-	bp = xlog_get_bp(log, 1);
-	if (!bp)
+	buffer = xlog_alloc_buffer(log, 1);
+	if (!buffer)
 		return -ENOMEM;
 
 	/*
 	 * Make sure the tail points to a record (returns positive count on
 	 * success).
 	 */
-	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
 			&tmp_tail, &thead, &wrapped);
 	if (error < 0)
 		goto out;
@@ -1113,8 +1035,8 @@
 			break;
 
 		/* skip to the next record; returns positive count on success */
-		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
-				&tmp_tail, &thead, &wrapped);
+		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
+				buffer, &tmp_tail, &thead, &wrapped);
 		if (error < 0)
 			goto out;
 
@@ -1129,7 +1051,7 @@
 		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
 			 orig_tail, *tail_blk);
 out:
-	xlog_put_bp(bp);
+	kmem_free(buffer);
 	return error;
 }
 
@@ -1151,13 +1073,13 @@
 	struct xlog		*log,
 	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
 	xfs_daddr_t		*tail_blk,	/* out: tail block */
-	struct xfs_buf		*bp,
+	char			*buffer,
 	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
 	struct xlog_rec_header	**rhead,	/* ptr to last record */
 	bool			*wrapped)	/* last rec. wraps phys. log */
 {
 	struct xlog_rec_header	*tmp_rhead;
-	struct xfs_buf		*tmp_bp;
+	char			*tmp_buffer;
 	xfs_daddr_t		first_bad;
 	xfs_daddr_t		tmp_rhead_blk;
 	int			found;
@@ -1168,15 +1090,15 @@
 	 * Check the head of the log for torn writes. Search backwards from the
 	 * head until we hit the tail or the maximum number of log record I/Os
 	 * that could have been in flight at one time. Use a temporary buffer so
-	 * we don't trash the rhead/bp pointers from the caller.
+	 * we don't trash the rhead/buffer pointers from the caller.
 	 */
-	tmp_bp = xlog_get_bp(log, 1);
-	if (!tmp_bp)
+	tmp_buffer = xlog_alloc_buffer(log, 1);
+	if (!tmp_buffer)
 		return -ENOMEM;
 	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
-				      XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
-				      &tmp_rhead, &tmp_wrapped);
-	xlog_put_bp(tmp_bp);
+				      XLOG_MAX_ICLOGS, tmp_buffer,
+				      &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
+	kmem_free(tmp_buffer);
 	if (error < 0)
 		return error;
 
@@ -1205,8 +1127,8 @@
 		 * (i.e., the records with invalid CRC) if the cycle number
 		 * matches the current cycle.
 		 */
-		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
-					      rhead_blk, rhead, wrapped);
+		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+				buffer, rhead_blk, rhead, wrapped);
 		if (found < 0)
 			return found;
 		if (found == 0)		/* XXX: right thing to do here? */
@@ -1266,7 +1188,7 @@
 	xfs_daddr_t		*tail_blk,
 	struct xlog_rec_header	*rhead,
 	xfs_daddr_t		rhead_blk,
-	struct xfs_buf		*bp,
+	char			*buffer,
 	bool			*clean)
 {
 	struct xlog_op_header	*op_head;
@@ -1309,7 +1231,7 @@
 	if (*head_blk == after_umount_blk &&
 	    be32_to_cpu(rhead->h_num_logops) == 1) {
 		umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
-		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+		error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
 		if (error)
 			return error;
 
@@ -1388,7 +1310,7 @@
 {
 	xlog_rec_header_t	*rhead;
 	char			*offset = NULL;
-	xfs_buf_t		*bp;
+	char			*buffer;
 	int			error;
 	xfs_daddr_t		rhead_blk;
 	xfs_lsn_t		tail_lsn;
@@ -1402,11 +1324,11 @@
 		return error;
 	ASSERT(*head_blk < INT_MAX);
 
-	bp = xlog_get_bp(log, 1);
-	if (!bp)
+	buffer = xlog_alloc_buffer(log, 1);
+	if (!buffer)
 		return -ENOMEM;
 	if (*head_blk == 0) {				/* special case */
-		error = xlog_bread(log, 0, 1, bp, &offset);
+		error = xlog_bread(log, 0, 1, buffer, &offset);
 		if (error)
 			goto done;
 
@@ -1422,7 +1344,7 @@
 	 * block. This wraps all the way back around to the head so something is
 	 * seriously wrong if we can't find it.
 	 */
-	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
 				      &rhead_blk, &rhead, &wrapped);
 	if (error < 0)
 		return error;
@@ -1443,7 +1365,7 @@
 	 * state to determine whether recovery is necessary.
 	 */
 	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
-				       rhead_blk, bp, &clean);
+				       rhead_blk, buffer, &clean);
 	if (error)
 		goto done;
 
@@ -1460,7 +1382,7 @@
 	if (!clean) {
 		xfs_daddr_t	orig_head = *head_blk;
 
-		error = xlog_verify_head(log, head_blk, tail_blk, bp,
+		error = xlog_verify_head(log, head_blk, tail_blk, buffer,
 					 &rhead_blk, &rhead, &wrapped);
 		if (error)
 			goto done;
@@ -1471,7 +1393,7 @@
 				       wrapped);
 			tail_lsn = atomic64_read(&log->l_tail_lsn);
 			error = xlog_check_unmount_rec(log, head_blk, tail_blk,
-						       rhead, rhead_blk, bp,
+						       rhead, rhead_blk, buffer,
 						       &clean);
 			if (error)
 				goto done;
@@ -1505,11 +1427,11 @@
 	 * But... if the -device- itself is readonly, just skip this.
 	 * We can't recover this device anyway, so it won't matter.
 	 */
-	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+	if (!xfs_readonly_buftarg(log->l_targ))
 		error = xlog_clear_stale_blocks(log, tail_lsn);
 
 done:
-	xlog_put_bp(bp);
+	kmem_free(buffer);
 
 	if (error)
 		xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1537,7 +1459,7 @@
 	struct xlog	*log,
 	xfs_daddr_t	*blk_no)
 {
-	xfs_buf_t	*bp;
+	char		*buffer;
 	char		*offset;
 	uint	        first_cycle, last_cycle;
 	xfs_daddr_t	new_blk, last_blk, start_blk;
@@ -1547,35 +1469,36 @@
 	*blk_no = 0;
 
 	/* check totally zeroed log */
-	bp = xlog_get_bp(log, 1);
-	if (!bp)
+	buffer = xlog_alloc_buffer(log, 1);
+	if (!buffer)
 		return -ENOMEM;
-	error = xlog_bread(log, 0, 1, bp, &offset);
+	error = xlog_bread(log, 0, 1, buffer, &offset);
 	if (error)
-		goto bp_err;
+		goto out_free_buffer;
 
 	first_cycle = xlog_get_cycle(offset);
 	if (first_cycle == 0) {		/* completely zeroed log */
 		*blk_no = 0;
-		xlog_put_bp(bp);
+		kmem_free(buffer);
 		return 1;
 	}
 
 	/* check partially zeroed log */
-	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+	error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
 	if (error)
-		goto bp_err;
+		goto out_free_buffer;
 
 	last_cycle = xlog_get_cycle(offset);
 	if (last_cycle != 0) {		/* log completely written to */
-		xlog_put_bp(bp);
+		kmem_free(buffer);
 		return 0;
 	}
 
 	/* we have a partially zeroed log */
 	last_blk = log_bbnum-1;
-	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
-		goto bp_err;
+	error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
+	if (error)
+		goto out_free_buffer;
 
 	/*
 	 * Validate the answer.  Because there is no way to guarantee that
@@ -1598,7 +1521,7 @@
 	 */
 	if ((error = xlog_find_verify_cycle(log, start_blk,
 					 (int)num_scan_bblks, 0, &new_blk)))
-		goto bp_err;
+		goto out_free_buffer;
 	if (new_blk != -1)
 		last_blk = new_blk;
 
@@ -1610,11 +1533,11 @@
 	if (error == 1)
 		error = -EIO;
 	if (error)
-		goto bp_err;
+		goto out_free_buffer;
 
 	*blk_no = last_blk;
-bp_err:
-	xlog_put_bp(bp);
+out_free_buffer:
+	kmem_free(buffer);
 	if (error)
 		return error;
 	return 1;
@@ -1657,7 +1580,7 @@
 	int		tail_block)
 {
 	char		*offset;
-	xfs_buf_t	*bp;
+	char		*buffer;
 	int		balign, ealign;
 	int		sectbb = log->l_sectBBsize;
 	int		end_block = start_block + blocks;
@@ -1674,7 +1597,7 @@
 	bufblks = 1 << ffs(blocks);
 	while (bufblks > log->l_logBBsize)
 		bufblks >>= 1;
-	while (!(bp = xlog_get_bp(log, bufblks))) {
+	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
 		bufblks >>= 1;
 		if (bufblks < sectbb)
 			return -ENOMEM;
@@ -1686,9 +1609,9 @@
 	 */
 	balign = round_down(start_block, sectbb);
 	if (balign != start_block) {
-		error = xlog_bread_noalign(log, start_block, 1, bp);
+		error = xlog_bread_noalign(log, start_block, 1, buffer);
 		if (error)
-			goto out_put_bp;
+			goto out_free_buffer;
 
 		j = start_block - balign;
 	}
@@ -1705,29 +1628,28 @@
 		 */
 		ealign = round_down(end_block, sectbb);
 		if (j == 0 && (start_block + endcount > ealign)) {
-			offset = bp->b_addr + BBTOB(ealign - start_block);
-			error = xlog_bread_offset(log, ealign, sectbb,
-							bp, offset);
+			error = xlog_bread_noalign(log, ealign, sectbb,
+					buffer + BBTOB(ealign - start_block));
 			if (error)
 				break;
 
 		}
 
-		offset = xlog_align(log, start_block, endcount, bp);
+		offset = buffer + xlog_align(log, start_block);
 		for (; j < endcount; j++) {
 			xlog_add_record(log, offset, cycle, i+j,
 					tail_cycle, tail_block);
 			offset += BBSIZE;
 		}
-		error = xlog_bwrite(log, start_block, endcount, bp);
+		error = xlog_bwrite(log, start_block, endcount, buffer);
 		if (error)
 			break;
 		start_block += endcount;
 		j = 0;
 	}
 
- out_put_bp:
-	xlog_put_bp(bp);
+out_free_buffer:
+	kmem_free(buffer);
 	return error;
 }
 
@@ -2040,7 +1962,7 @@
 		}
 	}
 
-	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
 	bcp->bc_blkno = buf_f->blf_blkno;
 	bcp->bc_len = buf_f->blf_len;
 	bcp->bc_refcount = 1;
@@ -2162,7 +2084,7 @@
 	if (xfs_sb_version_hascrc(&mp->m_sb))
 		bp->b_ops = &xfs_inode_buf_ops;
 
-	inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
+	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
 	for (i = 0; i < inodes_per_buf; i++) {
 		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
 			offsetof(xfs_dinode_t, di_next_unlinked);
@@ -2204,8 +2126,7 @@
 
 		ASSERT(item->ri_buf[item_index].i_addr != NULL);
 		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
-		ASSERT((reg_buf_offset + reg_buf_bytes) <=
-							BBTOB(bp->b_io_length));
+		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
 
 		/*
 		 * The current logged region contains a copy of the
@@ -2439,17 +2360,21 @@
 	case XFS_BLFT_BTREE_BUF:
 		switch (magic32) {
 		case XFS_ABTB_CRC_MAGIC:
-		case XFS_ABTC_CRC_MAGIC:
 		case XFS_ABTB_MAGIC:
+			bp->b_ops = &xfs_bnobt_buf_ops;
+			break;
+		case XFS_ABTC_CRC_MAGIC:
 		case XFS_ABTC_MAGIC:
-			bp->b_ops = &xfs_allocbt_buf_ops;
+			bp->b_ops = &xfs_cntbt_buf_ops;
 			break;
 		case XFS_IBT_CRC_MAGIC:
-		case XFS_FIBT_CRC_MAGIC:
 		case XFS_IBT_MAGIC:
-		case XFS_FIBT_MAGIC:
 			bp->b_ops = &xfs_inobt_buf_ops;
 			break;
+		case XFS_FIBT_CRC_MAGIC:
+		case XFS_FIBT_MAGIC:
+			bp->b_ops = &xfs_finobt_buf_ops;
+			break;
 		case XFS_BMAP_CRC_MAGIC:
 		case XFS_BMAP_MAGIC:
 			bp->b_ops = &xfs_bmbt_buf_ops;
@@ -2666,7 +2591,7 @@
 		ASSERT(nbits > 0);
 		ASSERT(item->ri_buf[i].i_addr != NULL);
 		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
-		ASSERT(BBTOB(bp->b_io_length) >=
+		ASSERT(BBTOB(bp->b_length) >=
 		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
 
 		/*
@@ -2878,23 +2803,22 @@
 	 *
 	 * Also make sure that only inode buffers with good sizes stay in
 	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
-	 * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
+	 * or inode_cluster_size bytes, whichever is bigger.  The inode
 	 * buffers in the log can be a different size if the log was generated
 	 * by an older kernel using unclustered inode buffers or a newer kernel
 	 * running with a different inode cluster size.  Regardless, if the
-	 * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size)
-	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
+	 * inode buffer size isn't max(blocksize, inode_cluster_size)
+	 * for *our* value of inode_cluster_size, then we need to keep
 	 * the buffer out of the buffer cache so that the buffer won't
 	 * overlap with future reads of those inodes.
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
-	    (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
-			(uint32_t)log->l_mp->m_inode_cluster_size))) {
+	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
-		ASSERT(bp->b_target->bt_mount == mp);
+		ASSERT(bp->b_mount == mp);
 		bp->b_iodone = xlog_recover_iodone;
 		xfs_buf_delwri_queue(bp, buffer_list);
 	}
@@ -3008,7 +2932,7 @@
 	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
 		in_f = item->ri_buf[0].i_addr;
 	} else {
-		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
+		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
 		need_free = 1;
 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
 		if (error)
@@ -3045,7 +2969,7 @@
 	 * Make sure the place we're flushing out to really looks
 	 * like an inode!
 	 */
-	if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+	if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
 		xfs_alert(mp,
 	"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
 			__func__, dip, bp, in_f->ilf_ino);
@@ -3256,7 +3180,7 @@
 	/* re-generate the checksum. */
 	xfs_dinode_calc_crc(log->l_mp, dip);
 
-	ASSERT(bp->b_target->bt_mount == mp);
+	ASSERT(bp->b_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
 	xfs_buf_delwri_queue(bp, buffer_list);
 
@@ -3395,7 +3319,7 @@
 	}
 
 	ASSERT(dq_f->qlf_size == 2);
-	ASSERT(bp->b_target->bt_mount == mp);
+	ASSERT(bp->b_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
 	xfs_buf_delwri_queue(bp, buffer_list);
 
@@ -3459,7 +3383,7 @@
 {
 	xfs_efd_log_format_t	*efd_formatp;
 	xfs_efi_log_item_t	*efip = NULL;
-	xfs_log_item_t		*lip;
+	struct xfs_log_item	*lip;
 	uint64_t		efi_id;
 	struct xfs_ail_cursor	cur;
 	struct xfs_ail		*ailp = log->l_ailp;
@@ -3845,12 +3769,12 @@
 {
 	struct xfs_mount	*mp = log->l_mp;
 	struct xfs_icreate_log	*icl;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	unsigned int		count;
 	unsigned int		isize;
 	xfs_agblock_t		length;
-	int			blks_per_cluster;
 	int			bb_per_cluster;
 	int			cancel_count;
 	int			nbufs;
@@ -3895,10 +3819,10 @@
 
 	/*
 	 * The inode chunk is either full or sparse and we only support
-	 * m_ialloc_min_blks sized sparse allocations at this time.
+	 * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
 	 */
-	if (length != mp->m_ialloc_blks &&
-	    length != mp->m_ialloc_min_blks) {
+	if (length != igeo->ialloc_blks &&
+	    length != igeo->ialloc_min_blks) {
 		xfs_warn(log->l_mp,
 			 "%s: unsupported chunk length", __FUNCTION__);
 		return -EINVAL;
@@ -3918,14 +3842,13 @@
 	 * buffers for cancellation so we don't overwrite anything written after
 	 * a cancellation.
 	 */
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
-	nbufs = length / blks_per_cluster;
+	bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+	nbufs = length / igeo->blocks_per_cluster;
 	for (i = 0, cancel_count = 0; i < nbufs; i++) {
 		xfs_daddr_t	daddr;
 
 		daddr = XFS_AGB_TO_DADDR(mp, agno,
-					 agbno + i * blks_per_cluster);
+				agbno + i * igeo->blocks_per_cluster);
 		if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
 			cancel_count++;
 	}
@@ -4240,7 +4163,7 @@
 {
 	xlog_recover_item_t	*item;
 
-	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
 	INIT_LIST_HEAD(&item->ri_list);
 	list_add_tail(&item->ri_list, head);
 }
@@ -4280,7 +4203,7 @@
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
+	ptr = kmem_realloc(old_ptr, len + old_len, 0);
 	memcpy(&ptr[old_len], dp, len);
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4340,7 +4263,7 @@
 		return 0;
 	}
 
-	ptr = kmem_alloc(len, KM_SLEEP);
+	ptr = kmem_alloc(len, 0);
 	memcpy(ptr, dp, len);
 	in_f = (struct xfs_inode_log_format *)ptr;
 
@@ -4368,7 +4291,7 @@
 		item->ri_total = in_f->ilf_size;
 		item->ri_buf =
 			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
-				    KM_SLEEP);
+				    0);
 	}
 	ASSERT(item->ri_total > item->ri_cnt);
 	/* Description region is ri_buf[0] */
@@ -4502,7 +4425,7 @@
 	 * This is a new transaction so allocate a new recovery container to
 	 * hold the recovery ops that will follow.
 	 */
-	trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
 	trans->r_log_tid = tid;
 	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
 	INIT_LIST_HEAD(&trans->r_itemq);
@@ -4954,12 +4877,11 @@
  * A cancel occurs when the mount has failed and we're bailing out.
  * Release all pending log intent items so they don't pin the AIL.
  */
-STATIC int
+STATIC void
 xlog_recover_cancel_intents(
 	struct xlog		*log)
 {
 	struct xfs_log_item	*lip;
-	int			error = 0;
 	struct xfs_ail_cursor	cur;
 	struct xfs_ail		*ailp;
 
@@ -4999,7 +4921,6 @@
 
 	xfs_trans_ail_cursor_done(&cur);
 	spin_unlock(&ailp->ail_lock);
-	return error;
 }
 
 /*
@@ -5103,16 +5024,27 @@
 }
 
 /*
- * xlog_iunlink_recover
+ * Recover AGI unlinked lists
  *
- * This is called during recovery to process any inodes which
- * we unlinked but not freed when the system crashed.  These
- * inodes will be on the lists in the AGI blocks.  What we do
- * here is scan all the AGIs and fully truncate and free any
- * inodes found on the lists.  Each inode is removed from the
- * lists when it has been fully truncated and is freed.  The
- * freeing of the inode and its removal from the list must be
- * atomic.
+ * This is called during recovery to process any inodes which we unlinked but
+ * not freed when the system crashed.  These inodes will be on the lists in the
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
+ * any inodes found on the lists. Each inode is removed from the lists when it
+ * has been fully truncated and is freed. The freeing of the inode and its
+ * removal from the list must be atomic.
+ *
+ * If everything we touch in the agi processing loop is already in memory, this
+ * loop can hold the cpu for a long time. It runs without lock contention,
+ * memory allocation contention, the need to wait for IO, etc, and so will run
+ * until we either run out of inodes to process, run low on memory, or run
+ * out of log space.
+ *
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
+ * and can prevent other filesystem work (such as CIL pushes) from running. This
+ * can lead to deadlocks if the recovery process runs out of log reservation
+ * space. Hence we need to yield the CPU when there is other kernel work
+ * scheduled on this CPU to ensure other scheduled work can run without undue
+ * latency.
  */
 STATIC void
 xlog_recover_process_iunlinks(
@@ -5159,13 +5091,14 @@
 			while (agino != NULLAGINO) {
 				agino = xlog_recover_process_one_iunlink(mp,
 							agno, agino, bucket);
+				cond_resched();
 			}
 		}
 		xfs_buf_rele(agibp);
 	}
 }
 
-STATIC int
+STATIC void
 xlog_unpack_data(
 	struct xlog_rec_header	*rhead,
 	char			*dp,
@@ -5188,8 +5121,6 @@
 			dp += BBSIZE;
 		}
 	}
-
-	return 0;
 }
 
 /*
@@ -5204,11 +5135,9 @@
 	int			pass,
 	struct list_head	*buffer_list)
 {
-	int			error;
 	__le32			old_crc = rhead->h_crc;
 	__le32			crc;
 
-
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 
 	/*
@@ -5247,9 +5176,7 @@
 			return -EFSCORRUPTED;
 	}
 
-	error = xlog_unpack_data(rhead, dp, log);
-	if (error)
-		return error;
+	xlog_unpack_data(rhead, dp, log);
 
 	return xlog_recover_process_data(log, rhash, rhead, dp, pass,
 					 buffer_list);
@@ -5311,7 +5238,7 @@
 	xfs_daddr_t		blk_no, rblk_no;
 	xfs_daddr_t		rhead_blk;
 	char			*offset;
-	xfs_buf_t		*hbp, *dbp;
+	char			*hbp, *dbp;
 	int			error = 0, h_size, h_len;
 	int			error2 = 0;
 	int			bblks, split_bblks;
@@ -5336,7 +5263,7 @@
 		 * iclog header and extract the header size from it.  Get a
 		 * new hbp that is the correct size.
 		 */
-		hbp = xlog_get_bp(log, 1);
+		hbp = xlog_alloc_buffer(log, 1);
 		if (!hbp)
 			return -ENOMEM;
 
@@ -5378,23 +5305,23 @@
 			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
 			if (h_size % XLOG_HEADER_CYCLE_SIZE)
 				hblks++;
-			xlog_put_bp(hbp);
-			hbp = xlog_get_bp(log, hblks);
+			kmem_free(hbp);
+			hbp = xlog_alloc_buffer(log, hblks);
 		} else {
 			hblks = 1;
 		}
 	} else {
 		ASSERT(log->l_sectBBsize == 1);
 		hblks = 1;
-		hbp = xlog_get_bp(log, 1);
+		hbp = xlog_alloc_buffer(log, 1);
 		h_size = XLOG_BIG_RECORD_BSIZE;
 	}
 
 	if (!hbp)
 		return -ENOMEM;
-	dbp = xlog_get_bp(log, BTOBB(h_size));
+	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
 	if (!dbp) {
-		xlog_put_bp(hbp);
+		kmem_free(hbp);
 		return -ENOMEM;
 	}
 
@@ -5409,7 +5336,7 @@
 			/*
 			 * Check for header wrapping around physical end-of-log
 			 */
-			offset = hbp->b_addr;
+			offset = hbp;
 			split_hblks = 0;
 			wrapped_hblks = 0;
 			if (blk_no + hblks <= log->l_logBBsize) {
@@ -5445,8 +5372,8 @@
 				 *   - order is important.
 				 */
 				wrapped_hblks = hblks - split_hblks;
-				error = xlog_bread_offset(log, 0,
-						wrapped_hblks, hbp,
+				error = xlog_bread_noalign(log, 0,
+						wrapped_hblks,
 						offset + BBTOB(split_hblks));
 				if (error)
 					goto bread_err2;
@@ -5477,7 +5404,7 @@
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
-				offset = dbp->b_addr;
+				offset = dbp;
 				split_bblks = 0;
 				if (blk_no != log->l_logBBsize) {
 					/* some data is before the physical
@@ -5506,8 +5433,8 @@
 				 *   _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
-				error = xlog_bread_offset(log, 0,
-						bblks - split_bblks, dbp,
+				error = xlog_bread_noalign(log, 0,
+						bblks - split_bblks,
 						offset + BBTOB(split_bblks));
 				if (error)
 					goto bread_err2;
@@ -5555,9 +5482,9 @@
 	}
 
  bread_err2:
-	xlog_put_bp(dbp);
+	kmem_free(dbp);
  bread_err1:
-	xlog_put_bp(hbp);
+	kmem_free(hbp);
 
 	/*
 	 * Submit buffers that have been added from the last record processed,
@@ -5614,7 +5541,7 @@
 	 */
 	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
 						 sizeof(struct list_head),
-						 KM_SLEEP);
+						 0);
 	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
 
@@ -5691,7 +5618,7 @@
 	 * Now that we've finished replaying all buffer and inode
 	 * updates, re-read in the superblock and reverify it.
 	 */
-	bp = xfs_getsb(mp, 0);
+	bp = xfs_getsb(mp);
 	bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
 	ASSERT(!(bp->b_flags & XBF_WRITE));
 	bp->b_flags |= XBF_READ;
@@ -5864,16 +5791,12 @@
 	return 0;
 }
 
-int
+void
 xlog_recover_cancel(
 	struct xlog	*log)
 {
-	int		error = 0;
-
 	if (log->l_flags & XLOG_RECOVERY_NEEDED)
-		error = xlog_recover_cancel_intents(log);
-
-	return error;
+		xlog_recover_cancel_intents(log);
 }
 
 #if defined(DEBUG)
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 576c375..9804efe 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -6,8 +6,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_error.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
-#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 
@@ -107,5 +107,5 @@
 void
 xfs_hex_dump(void *p, int length)
 {
-	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 02d1509..ba5b6f3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -12,9 +12,6 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_ialloc.h"
@@ -27,13 +24,13 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_reflink.h"
 #include "xfs_extent_busy.h"
+#include "xfs_health.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -85,7 +82,7 @@
 	if (hole < 0) {
 		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
 			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-			KM_SLEEP);
+			0);
 		hole = xfs_uuid_table_size++;
 	}
 	xfs_uuid_table[hole] = *uuid;
@@ -149,6 +146,7 @@
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -216,7 +214,7 @@
 
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-			BUG();
+			WARN_ON_ONCE(1);
 			spin_unlock(&mp->m_perag_lock);
 			radix_tree_preload_end();
 			error = -EEXIST;
@@ -227,6 +225,10 @@
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_hash_destroy;
+		spin_lock_init(&pag->pag_state_lock);
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +251,7 @@
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
@@ -423,30 +426,6 @@
 }
 
 /*
- * Set the maximum inode count for this filesystem
- */
-STATIC void
-xfs_set_maxicount(xfs_mount_t *mp)
-{
-	xfs_sb_t	*sbp = &(mp->m_sb);
-	uint64_t	icount;
-
-	if (sbp->sb_imax_pct) {
-		/*
-		 * Make sure the maximum inode count is a multiple
-		 * of the units we allocate inodes in.
-		 */
-		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
-		do_div(icount, 100);
-		do_div(icount, mp->m_ialloc_blks);
-		mp->m_maxicount = (icount * mp->m_ialloc_blks)  <<
-				   sbp->sb_inopblog;
-	} else {
-		mp->m_maxicount = 0;
-	}
-}
-
-/*
  * Set the default minimum read and write sizes unless
  * already specified in a mount option.
  * We use smaller I/O sizes when the file system
@@ -502,29 +481,6 @@
 	}
 }
 
-
-/*
- * Set whether we're using inode alignment.
- */
-STATIC void
-xfs_set_inoalignment(xfs_mount_t *mp)
-{
-	if (xfs_sb_version_hasalign(&mp->m_sb) &&
-		mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
-		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
-	else
-		mp->m_inoalign_mask = 0;
-	/*
-	 * If we are using stripe alignment, check whether
-	 * the stripe unit is a multiple of the inode alignment
-	 */
-	if (mp->m_dalign && mp->m_inoalign_mask &&
-	    !(mp->m_dalign & mp->m_inoalign_mask))
-		mp->m_sinoalign = mp->m_dalign;
-	else
-		mp->m_sinoalign = 0;
-}
-
 /*
  * Check that the data (and log if separate) is an ok size.
  */
@@ -639,7 +595,7 @@
 	    (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
 	     !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
 	     mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
-		mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
 
 	/*
 	 * We can safely re-initialise incore superblock counters from the
@@ -654,7 +610,7 @@
 	 */
 	if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
 	     XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
-	    !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY))
+	    !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
 		return 0;
 
 	return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
@@ -676,6 +632,7 @@
 {
 	struct xfs_sb		*sbp = &(mp->m_sb);
 	struct xfs_inode	*rip;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	uint64_t		resblks;
 	uint			quotamount = 0;
 	uint			quotaflags = 0;
@@ -742,12 +699,10 @@
 	xfs_alloc_compute_maxlevels(mp);
 	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
 	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
-	xfs_ialloc_compute_maxlevels(mp);
+	xfs_ialloc_setup_geometry(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
 
-	xfs_set_maxicount(mp);
-
 	/* enable fail_at_unmount as default */
 	mp->m_fail_unmount = true;
 
@@ -781,46 +736,22 @@
 	xfs_set_low_space_thresholds(mp);
 
 	/*
-	 * Set the inode cluster size.
-	 * This may still be overridden by the file system
-	 * block size if it is larger than the chosen cluster size.
-	 *
-	 * For v5 filesystems, scale the cluster size with the inode size to
-	 * keep a constant ratio of inode per cluster buffer, but only if mkfs
-	 * has set the inode alignment value appropriately for larger cluster
-	 * sizes.
-	 */
-	mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		int	new_size = mp->m_inode_cluster_size;
-
-		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
-		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
-			mp->m_inode_cluster_size = new_size;
-	}
-
-	/*
 	 * If enabled, sparse inode chunk alignment is expected to match the
 	 * cluster size. Full inode chunk alignment must match the chunk size,
 	 * but that is checked on sb read verification...
 	 */
 	if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
 	    mp->m_sb.sb_spino_align !=
-			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+			XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
 		xfs_warn(mp,
 	"Sparse inode block alignment (%u) must match cluster size (%llu).",
 			 mp->m_sb.sb_spino_align,
-			 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+			 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
 		error = -EINVAL;
 		goto out_remove_uuid;
 	}
 
 	/*
-	 * Set inode alignment fields
-	 */
-	xfs_set_inoalignment(mp);
-
-	/*
 	 * Check that the data (and log if separate) is an ok size.
 	 */
 	error = xfs_check_sizes(mp);
@@ -1059,6 +990,7 @@
 	 */
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_health_unmount(mp);
  out_log_dealloc:
 	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
 	xfs_log_mount_cancel(mp);
@@ -1095,7 +1027,7 @@
 	uint64_t		resblks;
 	int			error;
 
-	xfs_icache_disable_reclaim(mp);
+	xfs_stop_block_reaping(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
@@ -1141,6 +1073,7 @@
 	 */
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_health_unmount(mp);
 
 	xfs_qm_unmount(mp);
 
@@ -1372,24 +1305,14 @@
  * xfs_getsb() is called to obtain the buffer for the superblock.
  * The buffer is returned locked and read in from disk.
  * The buffer should be released with a call to xfs_brelse().
- *
- * If the flags parameter is BUF_TRYLOCK, then we'll only return
- * the superblock buffer if it can be locked without sleeping.
- * If it can't then we'll return NULL.
  */
 struct xfs_buf *
 xfs_getsb(
-	struct xfs_mount	*mp,
-	int			flags)
+	struct xfs_mount	*mp)
 {
 	struct xfs_buf		*bp = mp->m_sb_bp;
 
-	if (!xfs_buf_trylock(bp)) {
-		if (flags & XBF_TRYLOCK)
-			return NULL;
-		xfs_buf_lock(bp);
-	}
-
+	xfs_buf_lock(bp);
 	xfs_buf_hold(bp);
 	ASSERT(bp->b_flags & XBF_DONE);
 	return bp;
@@ -1436,7 +1359,26 @@
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return;
 
-	spin_lock(&mp->m_sb_lock);
-	mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
-	spin_unlock(&mp->m_sb_lock);
+	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+}
+
+/*
+ * Update the in-core delayed block counter.
+ *
+ * We prefer to update the counter without having to take a spinlock for every
+ * counter update (i.e. batching).  Each change to delayed allocation
+ * reservations can easily exceed the default percpu counter
+ * batching, so we use a larger batch factor here.
+ *
+ * Note that we don't currently have any callers requiring fast summation
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
+ */
+#define XFS_DELALLOC_BATCH	(4096)
+void
+xfs_mod_delalloc(
+	struct xfs_mount	*mp,
+	int64_t			delta)
+{
+	percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+			XFS_DELALLOC_BATCH);
 }
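
percpu_counter_add_batch() keeps updates in per-CPU deltas and only touches the shared count once a delta crosses the batch threshold. A userspace analog with a per-thread delta (C11; illustrative only, not the kernel implementation):

#include <stdatomic.h>

#define DELALLOC_BATCH	4096

static _Atomic long		shared_count;
static _Thread_local long	local_delta;

static void mod_delalloc(long delta)
{
	local_delta += delta;
	/* Flush only when the private delta outgrows the batch. */
	if (local_delta >= DELALLOC_BATCH || local_delta <= -DELALLOC_BATCH) {
		atomic_fetch_add(&shared_count, local_delta);
		local_delta = 0;
	}
}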
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7964513..fdb60e0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,20 @@
 typedef struct xfs_mount {
 	struct super_block	*m_super;
 	xfs_tid_t		m_tid;		/* next unused tid for fs */
+
+	/*
+	 * Bitsets of per-fs metadata that have been checked and/or are sick.
+	 * Callers must hold m_sb_lock to access these two fields.
+	 */
+	uint8_t			m_fs_checked;
+	uint8_t			m_fs_sick;
+	/*
+	 * Bitsets of rt metadata that have been checked and/or are sick.
+	 * Callers must hold m_sb_lock to access these two fields.
+	 */
+	uint8_t			m_rt_checked;
+	uint8_t			m_rt_sick;
+
 	struct xfs_ail		*m_ail;		/* fs active log item list */
 
 	struct xfs_sb		m_sb;		/* copy of fs superblock */
@@ -67,6 +81,12 @@
 	struct percpu_counter	m_icount;	/* allocated inodes counter */
 	struct percpu_counter	m_ifree;	/* free inodes counter */
 	struct percpu_counter	m_fdblocks;	/* free block counter */
+	/*
+	 * Count of data device blocks reserved for delayed allocations,
+	 * including indlen blocks.  Does not include allocated CoW staging
+	 * extents or anything related to the rt device.
+	 */
+	struct percpu_counter	m_delalloc_blks;
 
 	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
 	char			*m_fsname;	/* filesystem name */
@@ -85,10 +105,18 @@
 	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
 	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
 	struct xlog		*m_log;		/* log specific stuff */
+	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
 	int			m_logbufs;	/* number of log buffers */
 	int			m_logbsize;	/* size of each log buffer */
 	uint			m_rsumlevels;	/* rt summary levels */
 	uint			m_rsumsize;	/* size of rt summary, bytes */
+	/*
+	 * Optional cache of rt summary level per bitmap block with the
+	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
+	 * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+	 * inode lock.
+	 */
+	uint8_t			*m_rsum_cache;
 	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
 	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
 	struct xfs_inode	*m_rootip;	/* pointer to root directory */
@@ -99,8 +127,6 @@
 	uint8_t			m_blkbit_log;	/* blocklog + NBBY */
 	uint8_t			m_blkbb_log;	/* blocklog - BBSHIFT */
 	uint8_t			m_agno_log;	/* log #ag's */
-	uint8_t			m_agino_log;	/* #bits for agino in inum */
-	uint			m_inode_cluster_size;/* min inode buf size */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
@@ -108,15 +134,12 @@
 	uint			m_alloc_mnr[2];	/* min alloc btree records */
 	uint			m_bmap_dmxr[2];	/* max bmap btree records */
 	uint			m_bmap_dmnr[2];	/* min bmap btree records */
-	uint			m_inobt_mxr[2];	/* max inobt btree records */
-	uint			m_inobt_mnr[2];	/* min inobt btree records */
 	uint			m_rmap_mxr[2];	/* max rmap btree records */
 	uint			m_rmap_mnr[2];	/* min rmap btree records */
 	uint			m_refc_mxr[2];	/* max refc btree records */
 	uint			m_refc_mnr[2];	/* min refc btree records */
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
-	uint			m_in_maxlevels;	/* max inobt btree levels. */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
 	uint			m_refc_maxlevels; /* max refcount btree level */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -127,21 +150,14 @@
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint64_t		m_flags;	/* global mount flags */
-	bool			m_inotbt_nores; /* no per-AG finobt resv. */
-	int			m_ialloc_inos;	/* inodes in inode allocation */
-	int			m_ialloc_blks;	/* blocks in inode allocation */
-	int			m_ialloc_min_blks;/* min blocks in sparse inode
-						   * allocation */
-	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
+	bool			m_finobt_nores; /* no per-AG finobt resv. */
 	uint			m_qflags;	/* quota status flags */
 	struct xfs_trans_resv	m_resv;		/* precomputed res values */
-	uint64_t		m_maxicount;	/* maximum inode count */
 	uint64_t		m_resblks;	/* total reserved blocks */
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
-	int			m_sinoalign;	/* stripe unit inode alignment */
 	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
 	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
 	const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
@@ -164,11 +180,9 @@
 	struct xstats		m_stats;	/* per-fs stats */
 
 	struct workqueue_struct *m_buf_workqueue;
-	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
 	struct workqueue_struct	*m_cil_workqueue;
 	struct workqueue_struct	*m_reclaim_workqueue;
-	struct workqueue_struct	*m_log_workqueue;
 	struct workqueue_struct *m_eofblocks_workqueue;
 	struct workqueue_struct	*m_sync_workqueue;
 
@@ -183,6 +197,7 @@
 	 */
 	uint32_t		m_generation;
 
+	bool			m_always_cow;
 	bool			m_fail_unmount;
 #ifdef DEBUG
 	/*
@@ -195,6 +210,8 @@
 #endif
 } xfs_mount_t;
 
+#define M_IGEO(mp)		(&(mp)->m_ino_geo)
+
 /*
  * Flags for m_flags.
  */
@@ -202,7 +219,6 @@
 						   must be synchronous except
 						   for space allocations */
 #define XFS_MOUNT_UNMOUNTING	(1ULL << 1)	/* filesystem is unmounting */
-#define XFS_MOUNT_BAD_SUMMARY	(1ULL << 2)	/* summary counters are bad */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
 						   operations, typically for
@@ -311,13 +327,6 @@
 }
 
 /* per-AG block reservation data structures*/
-enum xfs_ag_resv_type {
-	XFS_AG_RESV_NONE = 0,
-	XFS_AG_RESV_AGFL,
-	XFS_AG_RESV_METADATA,
-	XFS_AG_RESV_RMAPBT,
-};
-
 struct xfs_ag_resv {
 	/* number of blocks originally reserved here */
 	xfs_extlen_t			ar_orig_reserved;
@@ -357,6 +366,15 @@
 	xfs_agino_t	pagl_pagino;
 	xfs_agino_t	pagl_leftrec;
 	xfs_agino_t	pagl_rightrec;
+
+	/*
+	 * Bitsets of per-ag metadata that have been checked and/or are sick.
+	 * Callers must hold pag_state_lock to access these two fields.
+	 */
+	uint16_t	pag_checked;
+	uint16_t	pag_sick;
+	spinlock_t	pag_state_lock;
+
 	spinlock_t	pagb_lock;	/* lock for pagb_tree */
 	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
 	unsigned int	pagb_gen;	/* generation count for pagb_tree */
@@ -385,6 +403,13 @@
 
 	/* reference count */
 	uint8_t			pagf_refcount_level;
+
+	/*
+	 * Unlinked inode information.  This incore information reflects
+	 * data stored in the AGI, so callers must hold the AGI buffer lock
+	 * or have some other means to control concurrency.
+	 */
+	struct rhashtable	pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
@@ -419,7 +444,7 @@
 				 bool reserved);
 extern int	xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
 
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
+extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
 extern int	xfs_readsb(xfs_mount_t *, int);
 extern void	xfs_freesb(xfs_mount_t *);
 extern bool	xfs_fs_writable(struct xfs_mount *mp, int level);
@@ -435,5 +460,6 @@
 struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
 		int error_class, int error);
 void xfs_force_summary_recalc(struct xfs_mount *mp);
+void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
 
 #endif	/* __XFS_MOUNT_H__ */
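
For context (not part of the patch): the new sick/checked bitsets pair a
"known bad" bit with a "was examined" bit, both serialized by m_sb_lock per
the comments above.  A hedged sketch of how a marking helper such as
xfs_fs_mark_sick() might update them (the real body lives in
fs/xfs/xfs_health.c and may differ; assumes xfs_mount.h context):

static void demo_fs_mark_sick(
	struct xfs_mount	*mp,
	uint8_t			mask)
{
	/* The struct comments require m_sb_lock around both bitsets. */
	spin_lock(&mp->m_sb_lock);
	mp->m_fs_sick |= mask;		/* this metadata is known bad... */
	mp->m_fs_checked |= mask;	/* ...and has certainly been checked */
	spin_unlock(&mp->m_sb_lock);
}
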
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7473881..a06661d 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@
 	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
 		return -EINVAL;
 
-	if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+	if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
 		return -ENOMEM;
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
-	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
 
 	if (!mru->lists) {
 		err = -ENOMEM;
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d2..b6701b4 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,32 @@
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,	56);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,	20);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,		16);
+
+	/*
+	 * The v5 superblock format extended several v4 header structures with
+	 * additional data. While new fields are only accessible on v5
+	 * superblocks, it's important that the v5 structures place original v4
+	 * fields/headers in the correct location on-disk. For example, we must
+	 * be able to find magic values at the same location in certain blocks
+	 * regardless of superblock version.
+	 *
+	 * The following checks ensure that various v5 data structures place the
+	 * subset of v4 metadata associated with the same type of block at the
+	 * start of the on-disk block. If there is no data structure definition
+	 * for certain types of v4 blocks, traverse down to the first field of
+	 * common metadata (e.g., magic value) and make sure it is at offset
+	 * zero.
+	 */
+	XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr,	0);
+	XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr,	0);
+	XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic,	0);
+	XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic,	0);
+	XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
+
+	XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat,		192);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers,		24);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req,		64);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req,		64);
 }
 
 #endif /* __XFS_ONDISK_H */
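
For context (not part of the patch): XFS_CHECK_OFFSET and
XFS_CHECK_STRUCT_SIZE are compile-time assertions built on offsetof() and
BUILD_BUG_ON_MSG().  A self-contained userspace sketch of the same technique
with hypothetical structures (nested member designators in offsetof() are a
GCC/Clang extension the kernel relies on):

#include <stddef.h>

struct demo_v4_hdr {
	unsigned int		magic;	/* readers probe this first */
};

struct demo_v5_block {
	struct demo_v4_hdr	hdr;	/* v4 header must stay first... */
	unsigned long long	owner;	/* ...v5-only fields follow it */
};

/* Fails the build if a layout change ever moves the magic value. */
_Static_assert(offsetof(struct demo_v5_block, hdr.magic) == 0,
	       "magic must sit at offset zero for all versions");
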
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c359..a339bd5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -2,23 +2,16 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
-#include <linux/iomap.h>
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_log.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
 #include "xfs_iomap.h"
-#include "xfs_shared.h"
-#include "xfs_bit.h"
-#include "xfs_pnfs.h"
 
 /*
  * Ensure that we do not have any outstanding pNFS layouts that can be used by
@@ -39,7 +32,7 @@
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
 
-	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+	while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
 		xfs_iunlock(ip, *iolock);
 		*did_unlock = true;
 		error = break_layout(inode, true);
@@ -185,7 +178,7 @@
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
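
The one-character fix above deserves a note: == binds more tightly than =, so
the old condition assigned the result of the comparison (0 or 1) to error
instead of break_layout()'s return value, silently discarding real error
codes.  A standalone illustration (demo_break_layout is a hypothetical
stand-in):

#include <errno.h>
#include <stdio.h>

/* Stand-in for break_layout(); pretend a layout is in the way. */
static int demo_break_layout(void)
{
	return -EWOULDBLOCK;
}

int main(void)
{
	int error;

	/* Buggy: error receives the comparison result, i.e. 1. */
	error = demo_break_layout() == -EWOULDBLOCK;
	printf("buggy: error = %d\n", error);		/* prints 1 */

	/* Fixed: parenthesize the assignment, then compare. */
	if ((error = demo_break_layout()) == -EWOULDBLOCK)
		printf("fixed: error = %d\n", error);	/* prints the negative errno */
	return 0;
}
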
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
new file mode 100644
index 0000000..4bcc3e6
--- /dev/null
+++ b/fs/xfs/xfs_pwork.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_sysctl.h"
+#include "xfs_pwork.h"
+#include <linux/nmi.h>
+
+/*
+ * Parallel Work Queue
+ * ===================
+ *
+ * Abstract away the details of running a large and "obviously" parallelizable
+ * task across multiple CPUs.  Callers initialize the pwork control object with
+ * a desired level of parallelization and a work function.  Next, they embed
+ * struct xfs_pwork in whatever structure they use to pass work context to a
+ * worker thread and queue that pwork.  The work function will be passed the
+ * pwork item when it is run (from process context) and any returned error will
+ * be recorded in xfs_pwork_ctl.error.  Work functions should check for errors
+ * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not
+ * stop workqueue item processing.
+ *
+ * This is the rough equivalent of the xfsprogs workqueue code, though we can't
+ * reuse that name here.
+ */
+
+/* Invoke our caller's function. */
+static void
+xfs_pwork_work(
+	struct work_struct	*work)
+{
+	struct xfs_pwork	*pwork;
+	struct xfs_pwork_ctl	*pctl;
+	int			error;
+
+	pwork = container_of(work, struct xfs_pwork, work);
+	pctl = pwork->pctl;
+	error = pctl->work_fn(pctl->mp, pwork);
+	if (error && !pctl->error)
+		pctl->error = error;
+	if (atomic_dec_and_test(&pctl->nr_work))
+		wake_up(&pctl->poll_wait);
+}
+
+/*
+ * Set up control data for parallel work.  @work_fn is the function that will
+ * be called.  @tag is used to name the workqueue.  @nr_threads is
+ * the level of parallelism desired, or 0 for no limit.
+ */
+int
+xfs_pwork_init(
+	struct xfs_mount	*mp,
+	struct xfs_pwork_ctl	*pctl,
+	xfs_pwork_work_fn	work_fn,
+	const char		*tag,
+	unsigned int		nr_threads)
+{
+#ifdef DEBUG
+	if (xfs_globals.pwork_threads >= 0)
+		nr_threads = xfs_globals.pwork_threads;
+#endif
+	trace_xfs_pwork_init(mp, nr_threads, current->pid);
+
+	pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+			current->pid);
+	if (!pctl->wq)
+		return -ENOMEM;
+	pctl->work_fn = work_fn;
+	pctl->error = 0;
+	pctl->mp = mp;
+	atomic_set(&pctl->nr_work, 0);
+	init_waitqueue_head(&pctl->poll_wait);
+
+	return 0;
+}
+
+/* Queue some parallel work. */
+void
+xfs_pwork_queue(
+	struct xfs_pwork_ctl	*pctl,
+	struct xfs_pwork	*pwork)
+{
+	INIT_WORK(&pwork->work, xfs_pwork_work);
+	pwork->pctl = pctl;
+	atomic_inc(&pctl->nr_work);
+	queue_work(pctl->wq, &pwork->work);
+}
+
+/* Wait for the work to finish and tear down the control structure. */
+int
+xfs_pwork_destroy(
+	struct xfs_pwork_ctl	*pctl)
+{
+	destroy_workqueue(pctl->wq);
+	pctl->wq = NULL;
+	return pctl->error;
+}
+
+/*
+ * Wait for the work to finish by polling completion status and touch the soft
+ * lockup watchdog.  This is for callers such as mount which hold locks.
+ */
+void
+xfs_pwork_poll(
+	struct xfs_pwork_ctl	*pctl)
+{
+	while (wait_event_timeout(pctl->poll_wait,
+				atomic_read(&pctl->nr_work) == 0, HZ) == 0)
+		touch_softlockup_watchdog();
+}
+
+/*
+ * Return the amount of parallelism that the data device can handle, or 0 for
+ * no limit.
+ */
+unsigned int
+xfs_pwork_guess_datadev_parallelism(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buftarg	*btp = mp->m_ddev_targp;
+
+	/*
+	 * For now we'll go with the most conservative setting possible,
+	 * which is two threads for an SSD and one thread everywhere else.
+	 */
+	return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
new file mode 100644
index 0000000..8133124
--- /dev/null
+++ b/fs/xfs/xfs_pwork.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_PWORK_H__
+#define __XFS_PWORK_H__
+
+struct xfs_pwork;
+struct xfs_mount;
+
+typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
+
+/*
+ * Parallel work coordination structure.
+ */
+struct xfs_pwork_ctl {
+	struct workqueue_struct	*wq;
+	struct xfs_mount	*mp;
+	xfs_pwork_work_fn	work_fn;
+	struct wait_queue_head	poll_wait;
+	atomic_t		nr_work;
+	int			error;
+};
+
+/*
+ * Embed this parallel work control item inside your own work structure,
+ * then queue work with it.
+ */
+struct xfs_pwork {
+	struct work_struct	work;
+	struct xfs_pwork_ctl	*pctl;
+};
+
+#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_ctl_want_abort(
+	struct xfs_pwork_ctl	*pctl)
+{
+	return pctl && pctl->error;
+}
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_want_abort(
+	struct xfs_pwork	*pwork)
+{
+	return xfs_pwork_ctl_want_abort(pwork->pctl);
+}
+
+int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
+		xfs_pwork_work_fn work_fn, const char *tag,
+		unsigned int nr_threads);
+void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
+int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
+void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
+unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
+
+#endif /* __XFS_PWORK_H__ */
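
To make the calling convention concrete, here is a hedged usage sketch (not
part of the patch; every demo_* name is hypothetical, and the real first user
of this API is the parallel inode walk elsewhere in this series; assumes
"xfs.h" and "xfs_pwork.h" context):

/* Embed struct xfs_pwork in the per-item work context. */
struct demo_work {
	struct xfs_pwork	pwork;
	xfs_agnumber_t		agno;	/* per-item payload */
};

/* Runs in process context; a returned error lands in pctl->error. */
static int
demo_work_fn(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct demo_work	*dw;

	dw = container_of(pwork, struct demo_work, pwork);
	if (xfs_pwork_want_abort(pwork))
		return 0;	/* another item already failed; bail out */
	/* ... do the real per-AG work on dw->agno here ... */
	return 0;
}

static int
demo_run(
	struct xfs_mount	*mp,
	struct demo_work	*items,
	unsigned int		nr)
{
	struct xfs_pwork_ctl	pctl;
	unsigned int		i;
	int			error;

	error = xfs_pwork_init(mp, &pctl, demo_work_fn, "demo",
			xfs_pwork_guess_datadev_parallelism(mp));
	if (error)
		return error;
	for (i = 0; i < nr; i++)
		xfs_pwork_queue(&pctl, &items[i].pwork);

	/* destroy_workqueue() drains the queue, so this also waits. */
	return xfs_pwork_destroy(&pctl);
}
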
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 52ed790..ecd8ce1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -13,19 +13,15 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
+#include "xfs_iwalk.h"
 #include "xfs_quota.h"
-#include "xfs_error.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_bmap_util.h"
 #include "xfs_trans.h"
 #include "xfs_trans_space.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_cksum.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -646,7 +642,7 @@
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
 
 	error = list_lru_init(&qinf->qi_lru);
 	if (error)
@@ -982,7 +978,7 @@
 	if (qip->i_d.di_nblocks == 0)
 		return 0;
 
-	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
 
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1118,17 +1114,15 @@
 /* ARGSUSED */
 STATIC int
 xfs_qm_dqusage_adjust(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* not used */
-	int		ubsize,		/* not used */
-	int		*ubused,	/* not used */
-	int		*res)		/* result code value */
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	void			*data)
 {
-	xfs_inode_t	*ip;
-	xfs_qcnt_t	nblks;
-	xfs_filblks_t	rtblks = 0;	/* total rt blks */
-	int		error;
+	struct xfs_inode	*ip;
+	xfs_qcnt_t		nblks;
+	xfs_filblks_t		rtblks = 0;	/* total rt blks */
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
@@ -1136,20 +1130,18 @@
 	 * rootino must have its resources accounted for, not so with the quota
 	 * inodes.
 	 */
-	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
-		*res = BULKSTAT_RV_NOTHING;
-		return -EINVAL;
-	}
+	if (xfs_is_quota_inode(&mp->m_sb, ino))
+		return 0;
 
 	/*
 	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
 	 * at mount time and therefore nobody will be racing chown/chproj.
 	 */
-	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
-	if (error) {
-		*res = BULKSTAT_RV_NOTHING;
+	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
+	if (error == -EINVAL || error == -ENOENT)
+		return 0;
+	if (error)
 		return error;
-	}
 
 	ASSERT(ip->i_delayed_blks == 0);
 
@@ -1157,7 +1149,7 @@
 		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 
 		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-			error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+			error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
 			if (error)
 				goto error0;
 		}
@@ -1200,13 +1192,8 @@
 			goto error0;
 	}
 
-	xfs_irele(ip);
-	*res = BULKSTAT_RV_DIDONE;
-	return 0;
-
 error0:
 	xfs_irele(ip);
-	*res = BULKSTAT_RV_GIVEUP;
 	return error;
 }
 
@@ -1270,18 +1257,13 @@
 xfs_qm_quotacheck(
 	xfs_mount_t	*mp)
 {
-	int			done, count, error, error2;
-	xfs_ino_t		lastino;
-	size_t			structsz;
+	int			error, error2;
 	uint			flags;
 	LIST_HEAD		(buffer_list);
 	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
 	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
 	struct xfs_inode	*pip = mp->m_quotainfo->qi_pquotaip;
 
-	count = INT_MAX;
-	structsz = 1;
-	lastino = 0;
 	flags = 0;
 
 	ASSERT(uip || gip || pip);
@@ -1318,18 +1300,10 @@
 		flags |= XFS_PQUOTA_CHKD;
 	}
 
-	do {
-		/*
-		 * Iterate thru all the inodes in the file system,
-		 * adjusting the corresponding dquot counters in core.
-		 */
-		error = xfs_bulkstat(mp, &lastino, &count,
-				     xfs_qm_dqusage_adjust,
-				     structsz, NULL, &done);
-		if (error)
-			break;
-
-	} while (!done);
+	error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+			NULL);
+	if (error)
+		goto error_return;
 
 	/*
 	 * We've made all the changes that we need to make incore.  Flush them
@@ -1812,7 +1786,8 @@
 	uint			flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	uint			delblks, blkflags, prjflags = 0;
+	uint64_t		delblks;
+	unsigned int		blkflags, prjflags = 0;
 	struct xfs_dquot	*udq_unres = NULL;
 	struct xfs_dquot	*gdq_unres = NULL;
 	struct xfs_dquot	*pdq_unres = NULL;
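
The quotacheck conversion above shows the general shape of an xfs_iwalk user:
a callback with the (mp, tp, ino, data) signature plus a single
xfs_iwalk_threaded() call.  A hedged sketch of a trivial walker (demo_* names
are hypothetical; the callback signature is inferred from the converted
xfs_qm_dqusage_adjust and the argument order from the call site above):

/* Hypothetical iwalk callback: count every allocated inode. */
static int
demo_count_inode(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	atomic64_t		*count = data;

	atomic64_inc(count);
	return 0;	/* a nonzero return would abort the walk */
}

static int
demo_count_all(
	struct xfs_mount	*mp,
	uint64_t		*ret)
{
	atomic64_t		count = ATOMIC64_INIT(0);
	int			error;

	/* Start at inode 0, default batching, poll like quotacheck. */
	error = xfs_iwalk_threaded(mp, 0, 0, demo_count_inode, 0, true,
			&count);
	*ret = atomic64_read(&count);
	return error;
}
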
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3ccf0fb..b41b750 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -113,12 +113,8 @@
 	return NULL;
 }
 
-extern void	xfs_trans_mod_dquot(struct xfs_trans *,
-					struct xfs_dquot *, uint, long);
-extern int	xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
-			struct xfs_mount *, struct xfs_dquot *,
-			struct xfs_dquot *, struct xfs_dquot *,
-			long, long, uint);
+extern void	xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
+				    uint field, int64_t delta);
 extern void	xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
 extern void	xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
 
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 73a1d77..5d72e88 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -5,13 +5,13 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_qm.h"
 
@@ -40,7 +40,7 @@
 		statp->f_files = limit;
 		statp->f_ffree =
 			(statp->f_files > dqp->q_res_icount) ?
-			 (statp->f_ffree - dqp->q_res_icount) : 0;
+			 (statp->f_files - dqp->q_res_icount) : 0;
 	}
 }
 
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index b319089..da7ad03 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -4,7 +4,6 @@
  * All Rights Reserved.
  */
 
-#include <linux/capability.h>
 
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -12,17 +11,13 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_defer.h"
 
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 55b7982..efe42ae 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -56,32 +56,35 @@
  * The structure kept inside the xfs_trans_t keep track of dquot changes
  * within a transaction and apply them later.
  */
-typedef struct xfs_dqtrx {
+struct xfs_dqtrx {
 	struct xfs_dquot *qt_dquot;	  /* the dquot this refers to */
-	ulong		qt_blk_res;	  /* blks reserved on a dquot */
-	ulong		qt_ino_res;	  /* inode reserved on a dquot */
-	ulong		qt_ino_res_used;  /* inodes used from the reservation */
-	long		qt_bcount_delta;  /* dquot blk count changes */
-	long		qt_delbcnt_delta; /* delayed dquot blk count changes */
-	long		qt_icount_delta;  /* dquot inode count changes */
-	ulong		qt_rtblk_res;	  /* # blks reserved on a dquot */
-	ulong		qt_rtblk_res_used;/* # blks used from reservation */
-	long		qt_rtbcount_delta;/* dquot realtime blk changes */
-	long		qt_delrtb_delta;  /* delayed RT blk count changes */
-} xfs_dqtrx_t;
+
+	uint64_t	qt_blk_res;	  /* blks reserved on a dquot */
+	int64_t		qt_bcount_delta;  /* dquot blk count changes */
+	int64_t		qt_delbcnt_delta; /* delayed dquot blk count changes */
+
+	uint64_t	qt_rtblk_res;	  /* # blks reserved on a dquot */
+	uint64_t	qt_rtblk_res_used;/* # blks used from reservation */
+	int64_t		qt_rtbcount_delta;/* dquot realtime blk changes */
+	int64_t		qt_delrtb_delta;  /* delayed RT blk count changes */
+
+	uint64_t	qt_ino_res;	  /* inode reserved on a dquot */
+	uint64_t	qt_ino_res_used;  /* inodes used from the reservation */
+	int64_t		qt_icount_delta;  /* dquot inode count changes */
+};
 
 #ifdef CONFIG_XFS_QUOTA
 extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
 extern void xfs_trans_free_dqinfo(struct xfs_trans *);
 extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
-		uint, long);
+		uint, int64_t);
 extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
 extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
 extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
-		struct xfs_inode *, long, long, uint);
+		struct xfs_inode *, int64_t, long, uint);
 extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
 		struct xfs_mount *, struct xfs_dquot *,
-		struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
+		struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
 
 extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
 		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
@@ -121,14 +124,14 @@
 #define xfs_trans_apply_dquot_deltas(tp)
 #define xfs_trans_unreserve_and_mod_dquots(tp)
 static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
-		struct xfs_inode *ip, long nblks, long ninos, uint flags)
+		struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
 {
 	return 0;
 }
 static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
 		struct xfs_mount *mp, struct xfs_dquot *udqp,
 		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
-		long nblks, long nions, uint flags)
+		int64_t nblks, long ninos, uint flags)
 {
 	return 0;
 }
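
For context (not part of the patch): the point of widening these quota deltas
and prototypes from long/ulong to int64_t/uint64_t is that long is only 32
bits on 32-bit kernels, so large block-count deltas could truncate there.  A
runnable userspace sketch of the failure mode (the printed values differ by
ABI, which is the point):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t	delta = 3000000000LL;	/* ~3 billion blocks */
	long	narrow = 0;		/* 32 bits on ILP32 targets */
	int64_t	wide = 0;		/* 64 bits everywhere */

	narrow += delta;	/* truncates on 32-bit targets */
	wide += delta;		/* always exact */
	printf("narrow=%ld wide=%lld\n", narrow, (long long)wide);
	return 0;
}
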
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a7c0c65..cd6c721 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -4,6 +4,7 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -11,10 +12,8 @@
 #include "xfs_inode.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_qm.h"
-#include <linux/quota.h>
 
 
 static void
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fce38b5..2328268 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -14,7 +14,6 @@
 #include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_refcount_item.h"
 #include "xfs_log.h"
 #include "xfs_refcount.h"
@@ -95,15 +94,6 @@
 }
 
 /*
- * Pinning has no meaning for an cui item, so just return.
- */
-STATIC void
-xfs_cui_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The unpin operation is the last place an CUI is manipulated in the log. It is
  * either inserted in the AIL or aborted in the event of a log I/O error. In
  * either case, the CUI transaction has been successfully committed to make it
@@ -122,71 +112,22 @@
 }
 
 /*
- * CUI items have no locking or pushing.  However, since CUIs are pulled from
- * the AIL when their corresponding CUDs are committed to disk, their situation
- * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log.  This should help in getting the CUI out of
- * the AIL.
- */
-STATIC uint
-xfs_cui_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The CUI has been either committed or aborted if the transaction has been
  * cancelled. If the transaction was cancelled, an CUD isn't going to be
  * constructed and thus we free the CUI here directly.
  */
 STATIC void
-xfs_cui_item_unlock(
+xfs_cui_item_release(
 	struct xfs_log_item	*lip)
 {
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
-		xfs_cui_release(CUI_ITEM(lip));
+	xfs_cui_release(CUI_ITEM(lip));
 }
 
-/*
- * The CUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_cui_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	return lsn;
-}
-
-/*
- * The CUI dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cui_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all cui log items.
- */
 static const struct xfs_item_ops xfs_cui_item_ops = {
 	.iop_size	= xfs_cui_item_size,
 	.iop_format	= xfs_cui_item_format,
-	.iop_pin	= xfs_cui_item_pin,
 	.iop_unpin	= xfs_cui_item_unpin,
-	.iop_unlock	= xfs_cui_item_unlock,
-	.iop_committed	= xfs_cui_item_committed,
-	.iop_push	= xfs_cui_item_push,
-	.iop_committing = xfs_cui_item_committing,
+	.iop_release	= xfs_cui_item_release,
 };
 
 /*
@@ -203,9 +144,9 @@
 	ASSERT(nextents > 0);
 	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
 		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
-				KM_SLEEP);
+				0);
 	else
-		cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+		cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
 
 	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
 	cuip->cui_format.cui_nextents = nextents;
@@ -254,127 +195,251 @@
 }
 
 /*
- * Pinning has no meaning for an cud item, so just return.
- */
-STATIC void
-xfs_cud_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an cud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_cud_item_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an cud item.  It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_cud_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The CUD is either committed or aborted if the transaction is cancelled. If
  * the transaction is cancelled, drop our reference to the CUI and free the
  * CUD.
  */
 STATIC void
-xfs_cud_item_unlock(
+xfs_cud_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
 
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
-		xfs_cui_release(cudp->cud_cuip);
-		kmem_zone_free(xfs_cud_zone, cudp);
-	}
-}
-
-/*
- * When the cud item is committed to disk, all we need to do is delete our
- * reference to our partner cui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_cud_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
-
-	/*
-	 * Drop the CUI reference regardless of whether the CUD has been
-	 * aborted. Once the CUD transaction is constructed, it is the sole
-	 * responsibility of the CUD to release the CUI (even if the CUI is
-	 * aborted due to log I/O error).
-	 */
 	xfs_cui_release(cudp->cud_cuip);
 	kmem_zone_free(xfs_cud_zone, cudp);
-
-	return (xfs_lsn_t)-1;
 }
 
-/*
- * The CUD dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cud_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all cud log items.
- */
 static const struct xfs_item_ops xfs_cud_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
 	.iop_size	= xfs_cud_item_size,
 	.iop_format	= xfs_cud_item_format,
-	.iop_pin	= xfs_cud_item_pin,
-	.iop_unpin	= xfs_cud_item_unpin,
-	.iop_unlock	= xfs_cud_item_unlock,
-	.iop_committed	= xfs_cud_item_committed,
-	.iop_push	= xfs_cud_item_push,
-	.iop_committing = xfs_cud_item_committing,
+	.iop_release	= xfs_cud_item_release,
 };
 
-/*
- * Allocate and initialize an cud item with the given number of extents.
- */
-struct xfs_cud_log_item *
-xfs_cud_init(
-	struct xfs_mount		*mp,
+static struct xfs_cud_log_item *
+xfs_trans_get_cud(
+	struct xfs_trans		*tp,
 	struct xfs_cui_log_item		*cuip)
-
 {
-	struct xfs_cud_log_item	*cudp;
+	struct xfs_cud_log_item		*cudp;
 
-	cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
-	xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
+	cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+	xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+			  &xfs_cud_item_ops);
 	cudp->cud_cuip = cuip;
 	cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
 
+	xfs_trans_add_item(tp, &cudp->cud_item);
 	return cudp;
 }
 
 /*
+ * Finish a refcount update and log it to the CUD.  Note that the
+ * transaction is marked dirty regardless of whether the refcount update
+ * succeeds or fails, in order to support the CUI/CUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_refcount_update(
+	struct xfs_trans		*tp,
+	struct xfs_cud_log_item		*cudp,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	int				error;
+
+	error = xfs_refcount_finish_one(tp, type, startblock,
+			blockcount, new_fsb, new_len, pcur);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the CUI and frees the CUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+	return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_refcount_intent	*ra;
+	struct xfs_refcount_intent	*rb;
+
+	ra = container_of(a, struct xfs_refcount_intent, ri_list);
+	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get a CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	cuip = xfs_cui_init(tp->t_mountp, count);
+	ASSERT(cuip != NULL);
+
+	/*
+	 * Attach the new intent item to the transaction.
+	 */
+	xfs_trans_add_item(tp, &cuip->cui_item);
+	return cuip;
+}
+
+/* Set the phys extent flags for this refcount update. */
+static void
+xfs_trans_set_refcount_flags(
+	struct xfs_phys_extent		*refc,
+	enum xfs_refcount_intent_type	type)
+{
+	refc->pe_flags = 0;
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		refc->pe_flags |= type;
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+	struct xfs_refcount_intent	*refc;
+	uint				next_extent;
+	struct xfs_phys_extent		*ext;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+	ASSERT(next_extent < cuip->cui_format.cui_nextents);
+	ext = &cuip->cui_format.cui_extents[next_extent];
+	ext->pe_startblock = refc->ri_startblock;
+	ext->pe_len = refc->ri_blockcount;
+	xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get a CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_refcount_intent	*refc;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_aglen;
+	int				error;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	error = xfs_trans_log_finish_refcount_update(tp, done_item,
+			refc->ri_type,
+			refc->ri_startblock,
+			refc->ri_blockcount,
+			&new_fsb, &new_aglen,
+			(struct xfs_btree_cur **)state);
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && new_aglen > 0) {
+		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+		       refc->ri_type == XFS_REFCOUNT_DECREASE);
+		refc->ri_startblock = new_fsb;
+		refc->ri_blockcount = new_aglen;
+		return -EAGAIN;
+	}
+	kmem_free(refc);
+	return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+	struct xfs_trans	*tp,
+	void			*state,
+	int			error)
+{
+	struct xfs_btree_cur	*rcur = state;
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+	void				*intent)
+{
+	xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*refc;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	kmem_free(refc);
+}
+
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_refcount_update_diff_items,
+	.create_intent	= xfs_refcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.log_item	= xfs_refcount_update_log_item,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_refcount_update_finish_item,
+	.finish_cleanup = xfs_refcount_update_finish_cleanup,
+	.cancel_item	= xfs_refcount_update_cancel_item,
+};
+
+/*
  * Process a refcount update intent item that was recovered from the log.
  * We need to update the refcountbt.
  */
@@ -490,26 +555,24 @@
 			irec.br_blockcount = new_len;
 			switch (type) {
 			case XFS_REFCOUNT_INCREASE:
-				error = xfs_refcount_increase_extent(tp, &irec);
+				xfs_refcount_increase_extent(tp, &irec);
 				break;
 			case XFS_REFCOUNT_DECREASE:
-				error = xfs_refcount_decrease_extent(tp, &irec);
+				xfs_refcount_decrease_extent(tp, &irec);
 				break;
 			case XFS_REFCOUNT_ALLOC_COW:
-				error = xfs_refcount_alloc_cow_extent(tp,
+				xfs_refcount_alloc_cow_extent(tp,
 						irec.br_startblock,
 						irec.br_blockcount);
 				break;
 			case XFS_REFCOUNT_FREE_COW:
-				error = xfs_refcount_free_cow_extent(tp,
+				xfs_refcount_free_cow_extent(tp,
 						irec.br_startblock,
 						irec.br_blockcount);
 				break;
 			default:
 				ASSERT(0);
 			}
-			if (error)
-				goto abort_error;
 			requeue_only = true;
 		}
 	}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index 3896dcc..e47530f 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -78,8 +78,6 @@
 extern struct kmem_zone	*xfs_cud_zone;
 
 struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
-struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
-		struct xfs_cui_log_item *);
 void xfs_cui_item_free(struct xfs_cui_log_item *);
 void xfs_cui_release(struct xfs_cui_log_item *);
 int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 42ea7ba..0f08153 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -11,21 +11,12 @@
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_ioctl.h"
 #include "xfs_trace.h"
-#include "xfs_log.h"
 #include "xfs_icache.h"
-#include "xfs_pnfs.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_refcount.h"
@@ -33,11 +24,9 @@
 #include "xfs_trans_space.h"
 #include "xfs_bit.h"
 #include "xfs_alloc.h"
-#include "xfs_quota_defs.h"
 #include "xfs_quota.h"
 #include "xfs_reflink.h"
 #include "xfs_iomap.h"
-#include "xfs_rmap_btree.h"
 #include "xfs_sb.h"
 #include "xfs_ag_resv.h"
 
@@ -182,8 +171,7 @@
 xfs_reflink_trim_around_shared(
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*irec,
-	bool			*shared,
-	bool			*trimmed)
+	bool			*shared)
 {
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
@@ -193,7 +181,7 @@
 	int			error = 0;
 
 	/* Holes, unwritten, and delalloc extents cannot be shared */
-	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
 		*shared = false;
 		return 0;
 	}
@@ -209,7 +197,7 @@
 	if (error)
 		return error;
 
-	*shared = *trimmed = false;
+	*shared = false;
 	if (fbno == NULLAGBLOCK) {
 		/* No shared blocks at all. */
 		return 0;
@@ -222,8 +210,6 @@
 		 */
 		irec->br_blockcount = flen;
 		*shared = true;
-		if (flen != aglen)
-			*trimmed = true;
 		return 0;
 	} else {
 		/*
@@ -233,99 +219,63 @@
 		 * start of the shared region.
 		 */
 		irec->br_blockcount = fbno - agbno;
-		*trimmed = true;
 		return 0;
 	}
 }
 
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.  In this case *shared is set to true, else to false.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	bool			*shared)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec	got;
-	int			error = 0;
-	bool			eof = false, trimmed;
-	struct xfs_iext_cursor	icur;
-
-	/*
-	 * Search the COW fork extent list first.  This serves two purposes:
-	 * first this implement the speculative preallocation using cowextisze,
-	 * so that we also unshared block adjacent to shared blocks instead
-	 * of just the shared blocks themselves.  Second the lookup in the
-	 * extent list is generally faster than going out to the shared extent
-	 * tree.
-	 */
-
-	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
-		eof = true;
-	if (!eof && got.br_startoff <= imap->br_startoff) {
-		trace_xfs_reflink_cow_found(ip, imap);
-		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
+	/* We can't update any real extents in always COW mode. */
+	if (xfs_is_always_cow_inode(ip) &&
+	    !isnullstartblock(imap->br_startblock)) {
 		*shared = true;
 		return 0;
 	}
 
 	/* Trim the mapping to the nearest shared extent boundary. */
-	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
-	if (error)
-		return error;
-
-	/* Not shared?  Just report the (potentially capped) extent. */
-	if (!*shared)
-		return 0;
-
-	/*
-	 * Fork all the shared blocks from our write offset until the end of
-	 * the extent.
-	 */
-	error = xfs_qm_dqattach_locked(ip, false);
-	if (error)
-		return error;
-
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-			imap->br_blockcount, 0, &got, &icur, eof);
-	if (error == -ENOSPC || error == -EDQUOT)
-		trace_xfs_reflink_cow_enospc(ip, imap);
-	if (error)
-		return error;
-
-	trace_xfs_reflink_cow_alloc(ip, &got);
-	return 0;
+	return xfs_reflink_trim_around_shared(ip, imap, shared);
 }
 
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
-	struct xfs_inode		*ip,
-	struct xfs_bmbt_irec		*imap,
-	xfs_fileoff_t			offset_fsb,
-	xfs_filblks_t			count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	xfs_filblks_t		count_fsb)
 {
-	int				nimaps = 1;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	struct xfs_btree_cur	*dummy_cur = NULL;
+	int			dummy_logflags;
+	int			error = 0;
 
-	if (imap->br_state == XFS_EXT_NORM)
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 		return 0;
 
-	xfs_trim_extent(imap, offset_fsb, count_fsb);
-	trace_xfs_reflink_convert_cow(ip, imap);
-	if (imap->br_blockcount == 0)
-		return 0;
-	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
-			&nimaps);
+	do {
+		if (got.br_startoff >= offset_fsb + count_fsb)
+			break;
+		if (got.br_state == XFS_EXT_NORM)
+			continue;
+		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+			return -EIO;
+
+		xfs_trim_extent(&got, offset_fsb, count_fsb);
+		if (!got.br_blockcount)
+			continue;
+
+		got.br_state = XFS_EXT_NORM;
+		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+				XFS_COW_FORK, &icur, &dummy_cur, &got,
+				&dummy_logflags);
+		if (error)
+			return error;
+	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+	return error;
 }
 
 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -339,15 +289,12 @@
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
 	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
-	struct xfs_bmbt_irec	imap;
-	int			nimaps = 1, error = 0;
+	int			error;
 
 	ASSERT(count != 0);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
-			XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -368,7 +315,6 @@
 	xfs_filblks_t		count_fsb = imap->br_blockcount;
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	got;
-	bool			trimmed;
 
 	*found = false;
 
@@ -376,9 +322,13 @@
 	 * If we don't find an overlapping extent, trim the range we need to
 	 * allocate to fit the hole we found.
 	 */
-	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
-	    got.br_startoff > offset_fsb)
-		return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+		got.br_startoff = offset_fsb + count_fsb;
+	if (got.br_startoff > offset_fsb) {
+		xfs_trim_extent(imap, imap->br_startoff,
+				got.br_startoff - imap->br_startoff);
+		return xfs_inode_need_cow(ip, imap, shared);
+	}
 
 	*shared = true;
 	if (isnullstartblock(got.br_startblock)) {
@@ -399,7 +349,8 @@
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	bool			*shared,
-	uint			*lockmode)
+	uint			*lockmode,
+	bool			convert_now)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = imap->br_startoff;
@@ -411,7 +362,10 @@
 	xfs_extlen_t		resblks = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(xfs_is_reflink_inode(ip));
+	if (!ip->i_cowfp) {
+		ASSERT(!xfs_is_reflink_inode(ip));
+		xfs_ifork_init_cow(ip);
+	}
 
 	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
 	if (error || !*shared)
@@ -473,7 +427,16 @@
 	if (nimaps == 0)
 		return -ENOSPC;
 convert:
-	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+	xfs_trim_extent(imap, offset_fsb, count_fsb);
+	/*
+	 * COW fork extents are supposed to remain unwritten until we're ready
+	 * to initiate a disk write.  For direct I/O we are going to write the
+	 * data and need the conversion, but for buffered writes we're done.
+	 */
+	if (!convert_now || imap->br_state == XFS_EXT_NORM)
+		return 0;
+	trace_xfs_reflink_convert_cow(ip, imap);
+	return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 
 out_unreserve:
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -532,10 +495,8 @@
 			ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
 
 			/* Free the CoW orphan record. */
-			error = xfs_refcount_free_cow_extent(*tpp,
-					del.br_startblock, del.br_blockcount);
-			if (error)
-				break;
+			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
+					del.br_blockcount);
 
 			xfs_bmap_add_free(*tpp, del.br_startblock,
 					  del.br_blockcount, NULL);
@@ -588,7 +549,7 @@
 	int			error;
 
 	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
-	ASSERT(xfs_is_reflink_inode(ip));
+	ASSERT(ip->i_cowfp);
 
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	if (count == NULLFILEOFF)
@@ -598,7 +559,7 @@
 
 	/* Start a rolling transaction to remove the mappings */
 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
-			0, 0, XFS_TRANS_NOFS, &tp);
+			0, 0, 0, &tp);
 	if (error)
 		goto out;
 
@@ -625,54 +586,47 @@
 }
 
 /*
- * Remap parts of a file's data fork after a successful CoW.
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately.  Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
  */
-int
-xfs_reflink_end_cow(
-	struct xfs_inode		*ip,
-	xfs_off_t			offset,
-	xfs_off_t			count)
+STATIC int
+xfs_reflink_end_cow_extent(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	xfs_fileoff_t		*end_fsb)
 {
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec		got, del;
-	struct xfs_trans		*tp;
-	xfs_fileoff_t			offset_fsb;
-	xfs_fileoff_t			end_fsb;
-	int				error;
-	unsigned int			resblks;
-	xfs_filblks_t			rlen;
-	struct xfs_iext_cursor		icur;
-
-	trace_xfs_reflink_end_cow(ip, offset, count);
+	struct xfs_bmbt_irec	got, del;
+	struct xfs_iext_cursor	icur;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	xfs_filblks_t		rlen;
+	unsigned int		resblks;
+	int			error;
 
 	/* No COW extents?  That's easy! */
-	if (ifp->if_bytes == 0)
+	if (ifp->if_bytes == 0) {
+		*end_fsb = offset_fsb;
 		return 0;
+	}
 
-	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
 
 	/*
-	 * Start a rolling transaction to switch the mappings.  We're
-	 * unlikely ever to have to remap 16T worth of single-block
-	 * extents, so just cap the worst case extent count to 2^32-1.
-	 * Stick a warning in just in case, and avoid 64-bit division.
+	 * Lock the inode.  We have to ijoin without automatic unlock because
+	 * the lead transaction is the refcountbt record deletion; the data
+	 * fork update follows as a deferred log item.
 	 */
-	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
-	if (end_fsb - offset_fsb > UINT_MAX) {
-		error = -EFSCORRUPTED;
-		xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
-		ASSERT(0);
-		goto out;
-	}
-	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
-			(unsigned int)(end_fsb - offset_fsb),
-			XFS_DATA_FORK);
-	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
-			resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
-	if (error)
-		goto out;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
@@ -681,80 +635,126 @@
 	 * left by the time I/O completes for the loser of the race.  In that
 	 * case we are done.
 	 */
-	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+	if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
+	    got.br_startoff + got.br_blockcount <= offset_fsb) {
+		*end_fsb = offset_fsb;
+		goto out_cancel;
+	}
+
+	/*
+	 * Structure copy @got into @del, then trim @del to the range that we
+	 * were asked to remap.  We preserve @got for the eventual CoW fork
+	 * deletion; from now on @del represents the mapping that we're
+	 * actually remapping.
+	 */
+	del = got;
+	xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
+
+	ASSERT(del.br_blockcount > 0);
+
+	/*
+	 * Only remap real extents that contain data.  With AIO, speculative
+	 * preallocations can leak into the range we are called upon, and we
+	 * need to skip them.
+	 */
+	if (!xfs_bmap_is_real_extent(&got)) {
+		*end_fsb = del.br_startoff;
+		goto out_cancel;
+	}
+
+	/* Unmap the old blocks in the data fork. */
+	rlen = del.br_blockcount;
+	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+	if (error)
 		goto out_cancel;
 
-	/* Walk backwards until we're out of the I/O range... */
-	while (got.br_startoff + got.br_blockcount > offset_fsb) {
-		del = got;
-		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+	/* Trim the extent to whatever got unmapped. */
+	xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
+	trace_xfs_reflink_cow_remap(ip, &del);
 
-		/* Extent delete may have bumped ext forward */
-		if (!del.br_blockcount)
-			goto prev_extent;
+	/* Free the CoW orphan record. */
+	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
 
-		/*
-		 * Only remap real extent that contain data.  With AIO
-		 * speculatively preallocations can leak into the range we
-		 * are called upon, and we need to skip them.
-		 */
-		if (!xfs_bmap_is_real_extent(&got))
-			goto prev_extent;
+	/* Map the new blocks into the data fork. */
+	xfs_bmap_map_extent(tp, ip, &del);
 
-		/* Unmap the old blocks in the data fork. */
-		ASSERT(tp->t_firstblock == NULLFSBLOCK);
-		rlen = del.br_blockcount;
-		error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
-		if (error)
-			goto out_cancel;
+	/* Charge this new data fork mapping to the on-disk quota. */
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
+			(long)del.br_blockcount);
 
-		/* Trim the extent to whatever got unmapped. */
-		if (rlen) {
-			xfs_trim_extent(&del, del.br_startoff + rlen,
-				del.br_blockcount - rlen);
-		}
-		trace_xfs_reflink_cow_remap(ip, &del);
-
-		/* Free the CoW orphan record. */
-		error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
-				del.br_blockcount);
-		if (error)
-			goto out_cancel;
-
-		/* Map the new blocks into the data fork. */
-		error = xfs_bmap_map_extent(tp, ip, &del);
-		if (error)
-			goto out_cancel;
-
-		/* Charge this new data fork mapping to the on-disk quota. */
-		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
-				(long)del.br_blockcount);
-
-		/* Remove the mapping from the CoW fork. */
-		xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
-
-		error = xfs_defer_finish(&tp);
-		if (error)
-			goto out_cancel;
-		if (!xfs_iext_get_extent(ifp, &icur, &got))
-			break;
-		continue;
-prev_extent:
-		if (!xfs_iext_prev_extent(ifp, &icur, &got))
-			break;
-	}
+	/* Remove the mapping from the CoW fork. */
+	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
 
 	error = xfs_trans_commit(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error)
-		goto out;
+		return error;
+
+	/* Update the caller about how much progress we made. */
+	*end_fsb = del.br_startoff;
 	return 0;
 
 out_cancel:
 	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
-	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	xfs_off_t			count)
+{
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	int				error = 0;
+
+	trace_xfs_reflink_end_cow(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+	/*
+	 * Walk backwards until we're out of the I/O range.  The loop function
+	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
+	 * extent.
+	 *
+	 * If we're being called by writeback then the pages will still
+	 * have PageWriteback set, which prevents races with reflink remapping
+	 * and truncate.  Reflink remapping prevents races with writeback by
+	 * taking the iolock and mmaplock before flushing the pages and
+	 * remapping, which means there won't be any further writeback or page
+	 * cache dirtying until the reflink completes.
+	 *
+	 * We should never have two threads issuing writeback for the same file
+	 * region.  There are also post-eof checks in the writeback
+	 * preparation code so that we don't bother writing out pages that are
+	 * about to be truncated.
+	 *
+	 * If we're being called as part of directio write completion, the dio
+	 * count is still elevated, which reflink and truncate will wait for.
+	 * Reflink remapping takes the iolock and mmaplock and waits for
+	 * pending dio to finish, which should prevent any directio until the
+	 * remap completes.  Multiple concurrent directio writes to the same
+	 * region are handled by end_cow processing only occurring for the
+	 * threads which succeed; the outcome of multiple overlapping direct
+	 * writes is not well defined anyway.
+	 *
+	 * It's possible that a buffered write and a direct write could collide
+	 * here (the buffered write stumbles in after the dio flushes and
+	 * invalidates the page cache and immediately queues writeback), but we
+	 * have never supported this 100%.  If either disk write succeeds the
+	 * blocks will be remapped.
+	 */
+	while (end_fsb > offset_fsb && !error)
+		error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+
+	if (error)
+		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
 	return error;
 }
 
@@ -916,18 +916,18 @@
 /*
  * Update destination inode size & cowextsize hint, if necessary.
  */
-STATIC int
+int
 xfs_reflink_update_dest(
 	struct xfs_inode	*dest,
 	xfs_off_t		newlen,
 	xfs_extlen_t		cowextsize,
-	bool			is_dedupe)
+	unsigned int		remap_flags)
 {
 	struct xfs_mount	*mp = dest->i_mount;
 	struct xfs_trans	*tp;
 	int			error;
 
-	if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
 		return 0;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -948,10 +948,6 @@
 		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 	}
 
-	if (!is_dedupe) {
-		xfs_trans_ichgtime(tp, dest,
-				   XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
 	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
 
 	error = xfs_trans_commit(tp);
@@ -1067,14 +1063,10 @@
 				uirec.br_blockcount, uirec.br_startblock);
 
 		/* Update the refcount tree */
-		error = xfs_refcount_increase_extent(tp, &uirec);
-		if (error)
-			goto out_cancel;
+		xfs_refcount_increase_extent(tp, &uirec);
 
 		/* Map the new blocks into the data fork. */
-		error = xfs_bmap_map_extent(tp, ip, &uirec);
-		if (error)
-			goto out_cancel;
+		xfs_bmap_map_extent(tp, ip, &uirec);
 
 		/* Update quota accounting. */
 		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
@@ -1115,19 +1107,28 @@
 /*
  * Iteratively remap one file's extents (and holes) to another's.
  */
-STATIC int
+int
 xfs_reflink_remap_blocks(
 	struct xfs_inode	*src,
-	xfs_fileoff_t		srcoff,
+	loff_t			pos_in,
 	struct xfs_inode	*dest,
-	xfs_fileoff_t		destoff,
-	xfs_filblks_t		len,
-	xfs_off_t		new_isize)
+	loff_t			pos_out,
+	loff_t			remap_len,
+	loff_t			*remapped)
 {
 	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		srcoff;
+	xfs_fileoff_t		destoff;
+	xfs_filblks_t		len;
+	xfs_filblks_t		range_len;
+	xfs_filblks_t		remapped_len = 0;
+	xfs_off_t		new_isize = pos_out + remap_len;
 	int			nimaps;
 	int			error = 0;
-	xfs_filblks_t		range_len;
+
+	destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+	srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+	len = XFS_B_TO_FSB(src->i_mount, remap_len);
 
 	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
 	while (len) {
@@ -1142,10 +1143,10 @@
 		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
 		xfs_iunlock(src, lock_mode);
 		if (error)
-			goto err;
+			break;
 		ASSERT(nimaps == 1);
 
-		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
 				&imap);
 
 		/* Translate imap into the destination file. */
@@ -1156,32 +1157,33 @@
 		error = xfs_reflink_remap_extent(dest, &imap, destoff,
 				new_isize);
 		if (error)
-			goto err;
+			break;
 
 		if (fatal_signal_pending(current)) {
 			error = -EINTR;
-			goto err;
+			break;
 		}
 
 		/* Advance drange/srange */
 		srcoff += range_len;
 		destoff += range_len;
 		len -= range_len;
+		remapped_len += range_len;
 	}
 
-	return 0;
-
-err:
-	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+	if (error)
+		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+	*remapped = min_t(loff_t, remap_len,
+			  XFS_FSB_TO_B(src->i_mount, remapped_len));
 	return error;
 }
 
 /*
- * Grab the exclusive iolock for a data copy from src to dest, making
- * sure to abide vfs locking order (lowest pointer value goes first) and
- * breaking the pnfs layout leases on dest before proceeding.  The loop
- * is needed because we cannot call the blocking break_layout() with the
- * src iolock held, and therefore have to back out both locks.
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding.  The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
  */
 static int
 xfs_iolock_two_inodes_and_break_layout(
@@ -1190,38 +1192,49 @@
 {
 	int			error;
 
-retry:
-	if (src < dest) {
-		inode_lock_shared(src);
-		inode_lock_nested(dest, I_MUTEX_NONDIR2);
-	} else {
-		/* src >= dest */
-		inode_lock(dest);
-	}
+	if (src > dest)
+		swap(src, dest);
 
-	error = break_layout(dest, false);
-	if (error == -EWOULDBLOCK) {
-		inode_unlock(dest);
-		if (src < dest)
-			inode_unlock_shared(src);
+retry:
+	/* Wait to break both inodes' layouts before we start locking. */
+	error = break_layout(src, true);
+	if (error)
+		return error;
+	if (src != dest) {
 		error = break_layout(dest, true);
 		if (error)
 			return error;
-		goto retry;
 	}
+
+	/* Lock one inode and make sure nobody got in and leased it. */
+	inode_lock(src);
+	error = break_layout(src, false);
 	if (error) {
-		inode_unlock(dest);
-		if (src < dest)
-			inode_unlock_shared(src);
+		inode_unlock(src);
+		if (error == -EWOULDBLOCK)
+			goto retry;
 		return error;
 	}
-	if (src > dest)
-		inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
+
+	if (src == dest)
+		return 0;
+
+	/* Lock the other inode and make sure nobody got in and leased it. */
+	inode_lock_nested(dest, I_MUTEX_NONDIR2);
+	error = break_layout(dest, false);
+	if (error) {
+		inode_unlock(src);
+		inode_unlock(dest);
+		if (error == -EWOULDBLOCK)
+			goto retry;
+		return error;
+	}
+
 	return 0;
 }
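
Generalized, the function above is: break leases while unlocked, take the locks
in a stable global order (ascending pointer value, via swap()), then revalidate
with a non-blocking check and restart if a lease snuck in. A compilable pthread
sketch of the ordering half; the lease revalidation loop is elided:

#include <pthread.h>

struct obj {
	pthread_mutex_t lock;
};

/*
 * Lock a and b in ascending address order so that two threads locking the
 * same pair with swapped arguments cannot deadlock; lock only once when
 * both arguments point at the same object.
 */
static void lock_two(struct obj *a, struct obj *b)
{
	if (a > b) {
		struct obj *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}
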
 
 /* Unlock both inodes after they've been prepped for a range clone. */
-STATIC void
+void
 xfs_reflink_remap_unlock(
 	struct file		*file_in,
 	struct file		*file_out)
@@ -1234,10 +1247,10 @@
 
 	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
 	if (!same_inode)
-		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+		xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
 	inode_unlock(inode_out);
 	if (!same_inode)
-		inode_unlock_shared(inode_in);
+		inode_unlock(inode_in);
 }
 
 /*
@@ -1289,21 +1302,20 @@
  * stale data in the destination file. Hence we reject these clone attempts with
  * -EINVAL in this case.
  */
-STATIC int
+int
 xfs_reflink_remap_prep(
 	struct file		*file_in,
 	loff_t			pos_in,
 	struct file		*file_out,
 	loff_t			pos_out,
-	u64			*len,
-	bool			is_dedupe)
+	loff_t			*len,
+	unsigned int		remap_flags)
 {
 	struct inode		*inode_in = file_inode(file_in);
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
 	bool			same_inode = (inode_in == inode_out);
-	u64			blkmask = i_blocksize(inode_in) - 1;
 	ssize_t			ret;
 
 	/* Lock both files against IO */
@@ -1313,7 +1325,7 @@
 	if (same_inode)
 		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
 	else
-		xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
+		xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
 				XFS_MMAPLOCK_EXCL);
 
 	/* Check file eligibility and prepare for block sharing. */
@@ -1326,29 +1338,11 @@
 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
 		goto out_unlock;
 
-	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-			len, is_dedupe);
-	if (ret <= 0)
+	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+			len, remap_flags);
+	if (ret < 0 || *len == 0)
 		goto out_unlock;
 
-	/*
-	 * If the dedupe data matches, chop off the partial EOF block
-	 * from the source file so we don't try to dedupe the partial
-	 * EOF block.
-	 */
-	if (is_dedupe) {
-		*len &= ~blkmask;
-	} else if (*len & blkmask) {
-		/*
-		 * The user is attempting to share a partial EOF block,
-		 * if it's inside the destination EOF then reject it.
-		 */
-		if (pos_out + *len < i_size_read(inode_out)) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
-	}
-
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
 	if (ret)
@@ -1367,32 +1361,19 @@
 	if (ret)
 		goto out_unlock;
 
-	/* Zap any page cache for the destination file's range. */
-	truncate_inode_pages_range(&inode_out->i_data, pos_out,
-				   PAGE_ALIGN(pos_out + *len) - 1);
-
-	/* If we're altering the file contents... */
-	if (!is_dedupe) {
-		/*
-		 * ...update the timestamps (which will grab the ilock again
-		 * from xfs_fs_dirty_inode, so we have to call it before we
-		 * take the ilock).
-		 */
-		if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-			ret = file_update_time(file_out);
-			if (ret)
-				goto out_unlock;
-		}
-
-		/*
-		 * ...clear the security bits if the process is not being run
-		 * by root.  This keeps people from modifying setuid and setgid
-		 * binaries.
-		 */
-		ret = file_remove_privs(file_out);
-		if (ret)
-			goto out_unlock;
+	/*
+	 * If pos_out > EOF, we may have dirtied blocks between EOF and
+	 * pos_out. In that case, we need to extend the flush and unmap to cover
+	 * from EOF to the end of the copy length.
+	 */
+	if (pos_out > XFS_ISIZE(dest)) {
+		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
+		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+	} else {
+		ret = xfs_flush_unmap_range(dest, pos_out, *len);
 	}
+	if (ret)
+		goto out_unlock;
 
 	return 1;
 out_unlock:
@@ -1401,72 +1382,6 @@
 }
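
The EOF branch above simply widens the flush window so it always starts at
min(EOF, pos_out) while still ending at pos_out + len. A tiny runnable check of
that arithmetic, with illustrative values only:

#include <assert.h>

int main(void)
{
	long long isize = 100, pos_out = 150, len = 50;
	long long start, flen;

	if (pos_out > isize) {		/* dirty blocks may sit below pos_out */
		start = isize;
		flen = len + (pos_out - isize);	/* 50 + 50 = 100 */
	} else {
		start = pos_out;
		flen = len;
	}
	assert(start + flen == pos_out + len);	/* window still ends at 200 */
	return 0;
}
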
 
 /*
- * Link a range of blocks from one file to another.
- */
-int
-xfs_reflink_remap_range(
-	struct file		*file_in,
-	loff_t			pos_in,
-	struct file		*file_out,
-	loff_t			pos_out,
-	u64			len,
-	bool			is_dedupe)
-{
-	struct inode		*inode_in = file_inode(file_in);
-	struct xfs_inode	*src = XFS_I(inode_in);
-	struct inode		*inode_out = file_inode(file_out);
-	struct xfs_inode	*dest = XFS_I(inode_out);
-	struct xfs_mount	*mp = src->i_mount;
-	xfs_fileoff_t		sfsbno, dfsbno;
-	xfs_filblks_t		fsblen;
-	xfs_extlen_t		cowextsize;
-	ssize_t			ret;
-
-	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	/* Prepare and then clone file data. */
-	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
-			&len, is_dedupe);
-	if (ret <= 0)
-		return ret;
-
-	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
-	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
-	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
-	fsblen = XFS_B_TO_FSB(mp, len);
-	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-			pos_out + len);
-	if (ret)
-		goto out_unlock;
-
-	/*
-	 * Carry the cowextsize hint from src to dest if we're sharing the
-	 * entire source file to the entire destination file, the source file
-	 * has a cowextsize hint, and the destination file does not.
-	 */
-	cowextsize = 0;
-	if (pos_in == 0 && len == i_size_read(inode_in) &&
-	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
-	    pos_out == 0 && len >= i_size_read(inode_out) &&
-	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
-		cowextsize = src->i_d.di_cowextsize;
-
-	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
-			is_dedupe);
-
-out_unlock:
-	xfs_reflink_remap_unlock(file_in, file_out);
-	if (ret)
-		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-	return ret;
-}
-
-/*
  * The user wants to preemptively CoW all shared blocks in this file,
  * which enables us to turn off the reflink flag.  Iterate all
  * extents which are not prealloc/delalloc to see which ranges are
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c585ad9..28a43b7 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+	return ip->i_mount->m_always_cow &&
+		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
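
These two predicates let write paths ask one question instead of two. A
hypothetical call site, for illustration only:

	/* Route any CoW-capable inode through the copy-on-write allocation
	 * path, whether it is reflinked or mounted with always_cow. */
	if (xfs_is_cow_inode(ip))
		use_cow_fork = true;
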
+
 extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
 		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+		struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+		bool *shared);
 
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap, bool *shared);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+		bool convert_now);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
 
@@ -27,13 +39,24 @@
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
-		struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+		struct file *file_out, loff_t pos_out, loff_t len,
+		unsigned int remap_flags);
 extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
 		struct xfs_inode *ip, bool *has_shared);
 extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
 		struct xfs_trans **tpp);
 extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len);
+extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+		struct file *file_out, loff_t pos_out, loff_t *len,
+		unsigned int remap_flags);
+extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+		struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+		loff_t *remapped);
+extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+		xfs_extlen_t cowextsize, unsigned int remap_flags);
+extern void xfs_reflink_remap_unlock(struct file *file_in,
+		struct file *file_out);
 
 #endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 127dc9c..8939e0e 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -14,7 +14,6 @@
 #include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_rmap_item.h"
 #include "xfs_log.h"
 #include "xfs_rmap.h"
@@ -94,15 +93,6 @@
 }
 
 /*
- * Pinning has no meaning for an rui item, so just return.
- */
-STATIC void
-xfs_rui_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The unpin operation is the last place an RUI is manipulated in the log. It is
  * either inserted in the AIL or aborted in the event of a log I/O error. In
  * either case, the RUI transaction has been successfully committed to make it
@@ -121,71 +111,22 @@
 }
 
 /*
- * RUI items have no locking or pushing.  However, since RUIs are pulled from
- * the AIL when their corresponding RUDs are committed to disk, their situation
- * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log.  This should help in getting the RUI out of
- * the AIL.
- */
-STATIC uint
-xfs_rui_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The RUI has been either committed or aborted if the transaction has been
  * cancelled. If the transaction was cancelled, an RUD isn't going to be
  * constructed and thus we free the RUI here directly.
  */
 STATIC void
-xfs_rui_item_unlock(
+xfs_rui_item_release(
 	struct xfs_log_item	*lip)
 {
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
-		xfs_rui_release(RUI_ITEM(lip));
+	xfs_rui_release(RUI_ITEM(lip));
 }
 
-/*
- * The RUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_rui_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-	return lsn;
-}
-
-/*
- * The RUI dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_rui_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all rui log items.
- */
 static const struct xfs_item_ops xfs_rui_item_ops = {
 	.iop_size	= xfs_rui_item_size,
 	.iop_format	= xfs_rui_item_format,
-	.iop_pin	= xfs_rui_item_pin,
 	.iop_unpin	= xfs_rui_item_unpin,
-	.iop_unlock	= xfs_rui_item_unlock,
-	.iop_committed	= xfs_rui_item_committed,
-	.iop_push	= xfs_rui_item_push,
-	.iop_committing = xfs_rui_item_committing,
+	.iop_release	= xfs_rui_item_release,
 };
 
 /*
@@ -201,9 +142,9 @@
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
-		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
 	else
-		ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+		ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
 
 	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
@@ -275,127 +216,272 @@
 }
 
 /*
- * Pinning has no meaning for an rud item, so just return.
- */
-STATIC void
-xfs_rud_item_pin(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an rud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_rud_item_unpin(
-	struct xfs_log_item	*lip,
-	int			remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an rud item.  It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_rud_item_push(
-	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
-{
-	return XFS_ITEM_PINNED;
-}
-
-/*
  * The RUD is either committed or aborted if the transaction is cancelled. If
  * the transaction is cancelled, drop our reference to the RUI and free the
  * RUD.
  */
 STATIC void
-xfs_rud_item_unlock(
+xfs_rud_item_release(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_rud_log_item	*rudp = RUD_ITEM(lip);
 
-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
-		xfs_rui_release(rudp->rud_ruip);
-		kmem_zone_free(xfs_rud_zone, rudp);
+	xfs_rui_release(rudp->rud_ruip);
+	kmem_zone_free(xfs_rud_zone, rudp);
+}
+
+static const struct xfs_item_ops xfs_rud_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
+	.iop_size	= xfs_rud_item_size,
+	.iop_format	= xfs_rud_item_format,
+	.iop_release	= xfs_rud_item_release,
+};
+
+static struct xfs_rud_log_item *
+xfs_trans_get_rud(
+	struct xfs_trans		*tp,
+	struct xfs_rui_log_item		*ruip)
+{
+	struct xfs_rud_log_item		*rudp;
+
+	rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+	xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+			  &xfs_rud_item_ops);
+	rudp->rud_ruip = ruip;
+	rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+	xfs_trans_add_item(tp, &rudp->rud_item);
+	return rudp;
+}
+
+/* Set the map extent flags for this reverse mapping. */
+static void
+xfs_trans_set_rmap_flags(
+	struct xfs_map_extent		*rmap,
+	enum xfs_rmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	rmap->me_flags = 0;
+	if (state == XFS_EXT_UNWRITTEN)
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+	if (whichfork == XFS_ATTR_FORK)
+		rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+	switch (type) {
+	case XFS_RMAP_MAP:
+		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+		break;
+	case XFS_RMAP_MAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+		break;
+	case XFS_RMAP_UNMAP:
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+		break;
+	case XFS_RMAP_CONVERT:
+		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+		break;
+	case XFS_RMAP_ALLOC:
+		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+		break;
+	case XFS_RMAP_FREE:
+		rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+		break;
+	default:
+		ASSERT(0);
 	}
 }
 
 /*
- * When the rud item is committed to disk, all we need to do is delete our
- * reference to our partner rui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
+ * Finish an rmap update and log it to the RUD. Note that the transaction is
+ * marked dirty regardless of whether the rmap update succeeds or fails to
+ * support the RUI/RUD lifecycle rules.
  */
-STATIC xfs_lsn_t
-xfs_rud_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
+static int
+xfs_trans_log_finish_rmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_rud_log_item		*rudp,
+	enum xfs_rmap_intent_type	type,
+	uint64_t			owner,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state,
+	struct xfs_btree_cur		**pcur)
 {
-	struct xfs_rud_log_item	*rudp = RUD_ITEM(lip);
+	int				error;
+
+	error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
+			startblock, blockcount, state, pcur);
 
 	/*
-	 * Drop the RUI reference regardless of whether the RUD has been
-	 * aborted. Once the RUD transaction is constructed, it is the sole
-	 * responsibility of the RUD to release the RUI (even if the RUI is
-	 * aborted due to log I/O error).
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the RUI and frees the RUD
+	 * 2.) shuts down the filesystem
 	 */
-	xfs_rui_release(rudp->rud_ruip);
-	kmem_zone_free(xfs_rud_zone, rudp);
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
 
-	return (xfs_lsn_t)-1;
+	return error;
 }
 
-/*
- * The RUD dependency tracking op doesn't do squat.  It can't because
- * it doesn't know where the free extent is coming from.  The dependency
- * tracking has to be handled by the "enclosing" metadata object.  For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
+/* Sort rmap intents by AG. */
+static int
+xfs_rmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_rmap_intent		*ra;
+	struct xfs_rmap_intent		*rb;
+
+	ra = container_of(a, struct xfs_rmap_intent, ri_list);
+	rb = container_of(b, struct xfs_rmap_intent, ri_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+}
+
+/* Get an RUI. */
+STATIC void *
+xfs_rmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_rui_log_item		*ruip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	ruip = xfs_rui_init(tp->t_mountp, count);
+	ASSERT(ruip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &ruip->rui_item);
+	return ruip;
+}
+
+/* Log rmap updates in the intent item. */
 STATIC void
-xfs_rud_item_committing(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
+xfs_rmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
 {
+	struct xfs_rui_log_item		*ruip = intent;
+	struct xfs_rmap_intent		*rmap;
+	uint				next_extent;
+	struct xfs_map_extent		*map;
+
+	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
+	ASSERT(next_extent < ruip->rui_format.rui_nextents);
+	map = &ruip->rui_format.rui_extents[next_extent];
+	map->me_owner = rmap->ri_owner;
+	map->me_startblock = rmap->ri_bmap.br_startblock;
+	map->me_startoff = rmap->ri_bmap.br_startoff;
+	map->me_len = rmap->ri_bmap.br_blockcount;
+	xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
+			rmap->ri_bmap.br_state);
 }
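
The atomic_inc_return()-minus-one idiom above is the standard lock-free way to
hand each caller a unique array slot. A runnable C11 equivalent:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint next_extent;
static unsigned int slots[16];

int main(void)
{
	/* atomic_fetch_add() returns the value *before* the increment, so no
	 * "- 1" is needed here; the kernel's atomic_inc_return() returns the
	 * value after, hence the subtraction in the code above. */
	unsigned int idx = atomic_fetch_add(&next_extent, 1);

	slots[idx] = 42;
	printf("reserved slot %u\n", idx);
	return 0;
}
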
 
-/*
- * This is the ops vector shared by all rud log items.
- */
-static const struct xfs_item_ops xfs_rud_item_ops = {
-	.iop_size	= xfs_rud_item_size,
-	.iop_format	= xfs_rud_item_format,
-	.iop_pin	= xfs_rud_item_pin,
-	.iop_unpin	= xfs_rud_item_unpin,
-	.iop_unlock	= xfs_rud_item_unlock,
-	.iop_committed	= xfs_rud_item_committed,
-	.iop_push	= xfs_rud_item_push,
-	.iop_committing = xfs_rud_item_committing,
+/* Get an RUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_rmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_rud(tp, intent);
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_rmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_rmap_intent		*rmap;
+	int				error;
+
+	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+	error = xfs_trans_log_finish_rmap_update(tp, done_item,
+			rmap->ri_type,
+			rmap->ri_owner, rmap->ri_whichfork,
+			rmap->ri_bmap.br_startoff,
+			rmap->ri_bmap.br_startblock,
+			rmap->ri_bmap.br_blockcount,
+			rmap->ri_bmap.br_state,
+			(struct xfs_btree_cur **)state);
+	kmem_free(rmap);
+	return error;
+}
+
+/* Clean up after processing deferred rmaps. */
+STATIC void
+xfs_rmap_update_finish_cleanup(
+	struct xfs_trans	*tp,
+	void			*state,
+	int			error)
+{
+	struct xfs_btree_cur	*rcur = state;
+
+	xfs_rmap_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending RUIs. */
+STATIC void
+xfs_rmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_rui_release(intent);
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_rmap_intent		*rmap;
+
+	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+	kmem_free(rmap);
+}
+
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+	.max_items	= XFS_RUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_rmap_update_diff_items,
+	.create_intent	= xfs_rmap_update_create_intent,
+	.abort_intent	= xfs_rmap_update_abort_intent,
+	.log_item	= xfs_rmap_update_log_item,
+	.create_done	= xfs_rmap_update_create_done,
+	.finish_item	= xfs_rmap_update_finish_item,
+	.finish_cleanup = xfs_rmap_update_finish_cleanup,
+	.cancel_item	= xfs_rmap_update_cancel_item,
 };
 
 /*
- * Allocate and initialize an rud item with the given number of extents.
- */
-struct xfs_rud_log_item *
-xfs_rud_init(
-	struct xfs_mount		*mp,
-	struct xfs_rui_log_item		*ruip)
-
-{
-	struct xfs_rud_log_item	*rudp;
-
-	rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
-	xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops);
-	rudp->rud_ruip = ruip;
-	rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
-
-	return rudp;
-}
-
-/*
  * Process an rmap update intent item that was recovered from the log.
  * We need to update the rmapbt.
  */
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 7e482ba..8708e4a 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -78,8 +78,6 @@
 extern struct kmem_zone	*xfs_rud_zone;
 
 struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
-struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *,
-		struct xfs_rui_log_item *);
 int xfs_rui_copy_format(struct xfs_log_iovec *buf,
 		struct xfs_rui_log_format *dst_rui_fmt);
 void xfs_rui_item_free(struct xfs_rui_log_item *);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 926ed31..4a48a8c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -11,17 +11,11 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
 #include "xfs_icache.h"
 #include "xfs_rtalloc.h"
 
@@ -64,8 +58,12 @@
 	int		log;		/* loop counter, log2 of ext. size */
 	xfs_suminfo_t	sum;		/* summary data */
 
+	/* There are no extents at levels < m_rsum_cache[bbno]. */
+	if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
+		low = mp->m_rsum_cache[bbno];
+
 	/*
-	 * Loop over logs of extent sizes.  Order is irrelevant.
+	 * Loop over logs of extent sizes.
 	 */
 	for (log = low; log <= high; log++) {
 		/*
@@ -80,13 +78,17 @@
 		 */
 		if (sum) {
 			*stat = 1;
-			return 0;
+			goto out;
 		}
 	}
 	/*
 	 * Found nothing, return failure.
 	 */
 	*stat = 0;
+out:
+	/* There were no extents at levels < log. */
+	if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
+		mp->m_rsum_cache[bbno] = log;
 	return 0;
 }
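
The m_rsum_cache trick works because the cached value is a conservative lower
bound: raising "low" can never skip a level that has free extents, and
recording "log" after a scan tightens the bound for next time. A runnable
miniature of the same pruning:

#include <stdio.h>

static unsigned char cache[1];		/* per-block lower bound, starts at 0 */
static int has_free[8] = { 0, 0, 0, 0, 1, 0, 0, 0 };	/* level 4 only */

static int search(int bbno, int low, int high, int *stat)
{
	int log;

	if (low < cache[bbno])
		low = cache[bbno];	/* nothing exists below the bound */
	for (log = low; log <= high; log++) {
		if (has_free[log]) {
			*stat = 1;
			goto out;
		}
	}
	*stat = 0;
out:
	if (log > cache[bbno])
		cache[bbno] = log;	/* remember: no extents below "log" */
	return 0;
}

int main(void)
{
	int stat;

	search(0, 0, 7, &stat);		/* scans levels 0..4, caches 4 */
	search(0, 0, 7, &stat);		/* second scan starts at level 4 */
	printf("found=%d bound=%d\n", stat, cache[0]);
	return 0;
}
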
 
@@ -853,6 +855,21 @@
 	return error;
 }
 
+static void
+xfs_alloc_rsum_cache(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_extlen_t	rbmblocks)	/* number of rt bitmap blocks */
+{
+	/*
+	 * The rsum cache is initialized to all zeroes, which is trivially a
+	 * lower bound on the minimum level with any free extents. We can
+	 * continue without the cache if it couldn't be allocated.
+	 */
+	mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+	if (!mp->m_rsum_cache)
+		xfs_warn(mp, "could not allocate realtime summary cache");
+}
+
 /*
  * Visible (exported) functions.
  */
@@ -881,6 +898,7 @@
 	xfs_extlen_t	rsumblocks;	/* current number of rt summary blks */
 	xfs_sb_t	*sbp;		/* old superblock */
 	xfs_fsblock_t	sumbno;		/* summary block number */
+	uint8_t		*rsum_cache;	/* old summary cache */
 
 	sbp = &mp->m_sb;
 	/*
@@ -937,10 +955,15 @@
 	error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
 	if (error)
 		return error;
+
+	rsum_cache = mp->m_rsum_cache;
+	if (nrbmblocks != sbp->sb_rbmblocks)
+		xfs_alloc_rsum_cache(mp, nrbmblocks);
+
 	/*
 	 * Allocate a new (fake) mount/sb.
 	 */
-	nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+	nmp = kmem_alloc(sizeof(*nmp), 0);
 	/*
 	 * Loop over the bitmap blocks.
 	 * We will do everything one bitmap block at a time.
@@ -1062,6 +1085,20 @@
 	 */
 	kmem_free(nmp);
 
+	/*
+	 * If we had to allocate a new rsum_cache, we either need to free the
+	 * old one (if we succeeded) or free the new one and restore the old one
+	 * (if there was an error).
+	 */
+	if (rsum_cache != mp->m_rsum_cache) {
+		if (error) {
+			kmem_free(mp->m_rsum_cache);
+			mp->m_rsum_cache = rsum_cache;
+		} else {
+			kmem_free(rsum_cache);
+		}
+	}
+
 	return error;
 }
 
@@ -1187,8 +1224,8 @@
 }
 
 /*
- * Get the bitmap and summary inodes into the mount structure
- * at mount time.
+ * Get the bitmap and summary inodes and the summary cache into the mount
+ * structure at mount time.
  */
 int					/* error */
 xfs_rtmount_inodes(
@@ -1198,19 +1235,18 @@
 	xfs_sb_t	*sbp;
 
 	sbp = &mp->m_sb;
-	if (sbp->sb_rbmino == NULLFSINO)
-		return 0;
 	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
 	if (error)
 		return error;
 	ASSERT(mp->m_rbmip != NULL);
-	ASSERT(sbp->sb_rsumino != NULLFSINO);
+
 	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
 	if (error) {
 		xfs_irele(mp->m_rbmip);
 		return error;
 	}
 	ASSERT(mp->m_rsumip != NULL);
+	xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
 	return 0;
 }
 
@@ -1218,6 +1254,7 @@
 xfs_rtunmount_inodes(
 	struct xfs_mount	*mp)
 {
+	kmem_free(mp->m_rsum_cache);
 	if (mp->m_rbmip)
 		xfs_irele(mp->m_rbmip);
 	if (mp->m_rsumip)
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 4e44231..113883c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -4,7 +4,6 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include <linux/proc_fs.h>
 
 struct xstats xfsstats;
 
@@ -29,30 +28,30 @@
 		char	*desc;
 		int	endpoint;
 	} xstats[] = {
-		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
-		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
-		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
-		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
-		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
-		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
-		{ "ig",			XFSSTAT_END_INODE_OPS		},
-		{ "log",		XFSSTAT_END_LOG_OPS		},
-		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
-		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
-		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
-		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
-		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
-		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
-		{ "buf",		XFSSTAT_END_BUF			},
-		{ "abtb2",		XFSSTAT_END_ABTB_V2		},
-		{ "abtc2",		XFSSTAT_END_ABTC_V2		},
-		{ "bmbt2",		XFSSTAT_END_BMBT_V2		},
-		{ "ibt2",		XFSSTAT_END_IBT_V2		},
-		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
-		{ "rmapbt",		XFSSTAT_END_RMAP_V2		},
-		{ "refcntbt",		XFSSTAT_END_REFCOUNT		},
+		{ "extent_alloc",	xfsstats_offset(xs_abt_lookup)	},
+		{ "abt",		xfsstats_offset(xs_blk_mapr)	},
+		{ "blk_map",		xfsstats_offset(xs_bmbt_lookup)	},
+		{ "bmbt",		xfsstats_offset(xs_dir_lookup)	},
+		{ "dir",		xfsstats_offset(xs_trans_sync)	},
+		{ "trans",		xfsstats_offset(xs_ig_attempts)	},
+		{ "ig",			xfsstats_offset(xs_log_writes)	},
+		{ "log",		xfsstats_offset(xs_try_logspace)},
+		{ "push_ail",		xfsstats_offset(xs_xstrat_quick)},
+		{ "xstrat",		xfsstats_offset(xs_write_calls)	},
+		{ "rw",			xfsstats_offset(xs_attr_get)	},
+		{ "attr",		xfsstats_offset(xs_iflush_count)},
+		{ "icluster",		xfsstats_offset(vn_active)	},
+		{ "vnodes",		xfsstats_offset(xb_get)		},
+		{ "buf",		xfsstats_offset(xs_abtb_2)	},
+		{ "abtb2",		xfsstats_offset(xs_abtc_2)	},
+		{ "abtc2",		xfsstats_offset(xs_bmbt_2)	},
+		{ "bmbt2",		xfsstats_offset(xs_ibt_2)	},
+		{ "ibt2",		xfsstats_offset(xs_fibt_2)	},
+		{ "fibt2",		xfsstats_offset(xs_rmap_2)	},
+		{ "rmapbt",		xfsstats_offset(xs_refcbt_2)	},
+		{ "refcntbt",		xfsstats_offset(xs_qm_dqreclaims)},
 		/* we print both series of quota information together */
-		{ "qm",			XFSSTAT_END_QM			},
+		{ "qm",			xfsstats_offset(xs_xstrat_bytes)},
 	};
 
 	/* Loop over all stats groups */
@@ -104,6 +103,10 @@
 #ifdef CONFIG_PROC_FS
 /* legacy quota interfaces */
 #ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
 static int xqm_proc_show(struct seq_file *m, void *v)
 {
 	/* maximum; incore; ratio free to inuse; freelist */
@@ -119,7 +122,7 @@
 	int j;
 
 	seq_printf(m, "qm");
-	for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+	for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
 		seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 130db07..34d704f 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -41,17 +41,14 @@
  * XFS global statistics
  */
 struct __xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC	4
 	uint32_t		xs_allocx;
 	uint32_t		xs_allocb;
 	uint32_t		xs_freex;
 	uint32_t		xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE	(XFSSTAT_END_EXTENT_ALLOC+4)
 	uint32_t		xs_abt_lookup;
 	uint32_t		xs_abt_compare;
 	uint32_t		xs_abt_insrec;
 	uint32_t		xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING	(XFSSTAT_END_ALLOC_BTREE+7)
 	uint32_t		xs_blk_mapr;
 	uint32_t		xs_blk_mapw;
 	uint32_t		xs_blk_unmap;
@@ -59,21 +56,17 @@
 	uint32_t		xs_del_exlist;
 	uint32_t		xs_look_exlist;
 	uint32_t		xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE	(XFSSTAT_END_BLOCK_MAPPING+4)
 	uint32_t		xs_bmbt_lookup;
 	uint32_t		xs_bmbt_compare;
 	uint32_t		xs_bmbt_insrec;
 	uint32_t		xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS	(XFSSTAT_END_BLOCK_MAP_BTREE+4)
 	uint32_t		xs_dir_lookup;
 	uint32_t		xs_dir_create;
 	uint32_t		xs_dir_remove;
 	uint32_t		xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS	(XFSSTAT_END_DIRECTORY_OPS+3)
 	uint32_t		xs_trans_sync;
 	uint32_t		xs_trans_async;
 	uint32_t		xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS		(XFSSTAT_END_TRANSACTIONS+7)
 	uint32_t		xs_ig_attempts;
 	uint32_t		xs_ig_found;
 	uint32_t		xs_ig_frecycle;
@@ -81,13 +74,11 @@
 	uint32_t		xs_ig_dup;
 	uint32_t		xs_ig_reclaims;
 	uint32_t		xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS		(XFSSTAT_END_INODE_OPS+5)
 	uint32_t		xs_log_writes;
 	uint32_t		xs_log_blocks;
 	uint32_t		xs_log_noiclogs;
 	uint32_t		xs_log_force;
 	uint32_t		xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING	(XFSSTAT_END_LOG_OPS+10)
 	uint32_t		xs_try_logspace;
 	uint32_t		xs_sleep_logspace;
 	uint32_t		xs_push_ail;
@@ -98,22 +89,17 @@
 	uint32_t		xs_push_ail_flushing;
 	uint32_t		xs_push_ail_restarts;
 	uint32_t		xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT	(XFSSTAT_END_TAIL_PUSHING+2)
 	uint32_t		xs_xstrat_quick;
 	uint32_t		xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS	(XFSSTAT_END_WRITE_CONVERT+2)
 	uint32_t		xs_write_calls;
 	uint32_t		xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS	(XFSSTAT_END_READ_WRITE_OPS+4)
 	uint32_t		xs_attr_get;
 	uint32_t		xs_attr_set;
 	uint32_t		xs_attr_remove;
 	uint32_t		xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER	(XFSSTAT_END_ATTRIBUTE_OPS+3)
 	uint32_t		xs_iflush_count;
 	uint32_t		xs_icluster_flushcnt;
 	uint32_t		xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS		(XFSSTAT_END_INODE_CLUSTER+8)
 	uint32_t		vn_active;	/* # vnodes not on free lists */
 	uint32_t		vn_alloc;	/* # times vn_alloc called */
 	uint32_t		vn_get;		/* # times vn_get called */
@@ -122,7 +108,6 @@
 	uint32_t		vn_reclaim;	/* # times vn_reclaim called */
 	uint32_t		vn_remove;	/* # times vn_remove called */
 	uint32_t		vn_free;	/* # times vn_free called */
-#define XFSSTAT_END_BUF			(XFSSTAT_END_VNODE_OPS+9)
 	uint32_t		xb_get;
 	uint32_t		xb_create;
 	uint32_t		xb_get_locked;
@@ -133,28 +118,19 @@
 	uint32_t		xb_page_found;
 	uint32_t		xb_get_read;
 /* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2		(XFSSTAT_END_BUF + __XBTS_MAX)
 	uint32_t		xs_abtb_2[__XBTS_MAX];
-#define XFSSTAT_END_ABTC_V2		(XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
 	uint32_t		xs_abtc_2[__XBTS_MAX];
-#define XFSSTAT_END_BMBT_V2		(XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
 	uint32_t		xs_bmbt_2[__XBTS_MAX];
-#define XFSSTAT_END_IBT_V2		(XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
 	uint32_t		xs_ibt_2[__XBTS_MAX];
-#define XFSSTAT_END_FIBT_V2		(XFSSTAT_END_IBT_V2 + __XBTS_MAX)
 	uint32_t		xs_fibt_2[__XBTS_MAX];
-#define XFSSTAT_END_RMAP_V2		(XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
 	uint32_t		xs_rmap_2[__XBTS_MAX];
-#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
 	uint32_t		xs_refcbt_2[__XBTS_MAX];
-#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 	uint32_t		xs_qm_dqreclaims;
 	uint32_t		xs_qm_dqreclaim_misses;
 	uint32_t		xs_qm_dquot_dups;
 	uint32_t		xs_qm_dqcachemisses;
 	uint32_t		xs_qm_dqcachehits;
 	uint32_t		xs_qm_dqwants;
-#define XFSSTAT_END_QM			(XFSSTAT_END_XQMSTAT+2)
 	uint32_t		xs_qm_dquot;
 	uint32_t		xs_qm_dquot_unused;
 /* Extra precision counters */
@@ -163,10 +139,12 @@
 	uint64_t		xs_read_bytes;
 };
 
+#define	xfsstats_offset(f)	(offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
 struct xfsstats {
 	union {
 		struct __xfsstats	s;
-		uint32_t		a[XFSSTAT_END_XQMSTAT];
+		uint32_t		a[xfsstats_offset(xs_qm_dquot)];
 	};
 };
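
xfsstats_offset() replaces the hand-maintained XFSSTAT_END_* constants by
computing each field's index in the u32 view of the struct directly from its
byte offset. A self-contained illustration of the same trick:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct stats {
	uint32_t a;
	uint32_t b;
	uint32_t c;
};

#define stats_offset(f)	(offsetof(struct stats, f) / sizeof(uint32_t))

int main(void)
{
	/* The struct and a uint32_t array alias the same layout, so a
	 * field's byte offset divided by 4 is its array index. */
	printf("index of c = %zu\n", stats_offset(c));	/* prints 2 */
	return 0;
}
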
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 207ee30..8d1df9f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -11,18 +11,15 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_alloc.h"
-#include "xfs_error.h"
 #include "xfs_fsops.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
@@ -39,15 +36,7 @@
 #include "xfs_bmap_item.h"
 #include "xfs_reflink.h"
 
-#include <linux/namei.h>
-#include <linux/dax.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
+#include <linux/magic.h>
 #include <linux/parser.h>
 
 static const struct super_operations xfs_super_operations;
@@ -64,7 +53,7 @@
 enum {
 	Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
 	Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
-	Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+	Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
 	Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
 	Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
 	Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
@@ -85,7 +74,6 @@
 	{Opt_sunit,	"sunit=%u"},	/* data volume stripe unit */
 	{Opt_swidth,	"swidth=%u"},	/* data volume stripe width */
 	{Opt_nouuid,	"nouuid"},	/* ignore filesystem UUID */
-	{Opt_mtpt,	"mtpt"},	/* filesystem mount point */
 	{Opt_grpid,	"grpid"},	/* group-ID from parent directory */
 	{Opt_nogrpid,	"nogrpid"},	/* group-ID from current process */
 	{Opt_bsdgroups,	"bsdgroups"},	/* group-ID from parent directory */
@@ -234,9 +222,6 @@
 			if (!mp->m_logname)
 				return -ENOMEM;
 			break;
-		case Opt_mtpt:
-			xfs_warn(mp, "%s option not allowed on this system", p);
-			return -EINVAL;
 		case Opt_rtdev:
 			kfree(mp->m_rtname);
 			mp->m_rtname = match_strdup(args);
@@ -446,7 +431,7 @@
 	char		*str;
 };
 
-STATIC int
+STATIC void
 xfs_showargs(
 	struct xfs_mount	*mp,
 	struct seq_file		*m)
@@ -525,9 +510,8 @@
 
 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, ",noquota");
-
-	return 0;
 }
+
 static uint64_t
 xfs_max_file_offset(
 	unsigned int		blockshift)
@@ -537,26 +521,18 @@
 
 	/* Figure out maximum filesize, on Linux this can depend on
 	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_write_begin does this in an [unsigned] long...
+	 * __block_write_begin does this in an [unsigned] long long...
 	 *      page->index << (PAGE_SHIFT - bbits)
 	 * So, for page sized blocks (4K on 32 bit platforms),
 	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
 	 *      (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
 	 * but for smaller blocksizes it is less (bbits = log2 bsize).
-	 * Note1: get_block_t takes a long (implicit cast from above)
-	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
-	 * can optionally convert the [unsigned] long from above into
-	 * an [unsigned] long long.
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
 	ASSERT(sizeof(sector_t) == 8);
 	pagefactor = PAGE_SIZE;
 	bitshift = BITS_PER_LONG;
-# else
-	pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
-# endif
 #endif
 
 	return (((uint64_t)pagefactor) << bitshift) - 1;
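
	/* Worked example for the comment above, assuming 4 KiB pages and
	 * 32-bit longs:
	 *   MAX_LFS_FILESIZE = ((u64)4096 << 31) - 1 = 2^43 - 1  (~8 TiB)
	 */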
@@ -593,7 +569,7 @@
 	 * Calculate how much should be reserved for inodes to meet
 	 * the max inode percentage.  Used only for inode32.
 	 */
-	if (mp->m_maxicount) {
+	if (M_IGEO(mp)->maxicount) {
 		uint64_t	icount;
 
 		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
@@ -606,7 +582,7 @@
 	}
 
 	/* Get the last possible inode in the filesystem */
-	agino =	XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	agino =	XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
 	/*
@@ -836,18 +812,14 @@
 	if (!mp->m_buf_workqueue)
 		goto out;
 
-	mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
-	if (!mp->m_data_workqueue)
-		goto out_destroy_buf;
-
 	mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
 			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_unwritten_workqueue)
-		goto out_destroy_data_iodone_queue;
+		goto out_destroy_buf;
 
 	mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+			0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
 
@@ -856,16 +828,10 @@
 	if (!mp->m_reclaim_workqueue)
 		goto out_destroy_cil;
 
-	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
-			mp->m_fsname);
-	if (!mp->m_log_workqueue)
-		goto out_destroy_reclaim;
-
 	mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
 			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_eofblocks_workqueue)
-		goto out_destroy_log;
+		goto out_destroy_reclaim;
 
 	mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
 					       mp->m_fsname);
@@ -876,16 +842,12 @@
 
 out_destroy_eofb:
 	destroy_workqueue(mp->m_eofblocks_workqueue);
-out_destroy_log:
-	destroy_workqueue(mp->m_log_workqueue);
 out_destroy_reclaim:
 	destroy_workqueue(mp->m_reclaim_workqueue);
 out_destroy_cil:
 	destroy_workqueue(mp->m_cil_workqueue);
 out_destroy_unwritten:
 	destroy_workqueue(mp->m_unwritten_workqueue);
-out_destroy_data_iodone_queue:
-	destroy_workqueue(mp->m_data_workqueue);
 out_destroy_buf:
 	destroy_workqueue(mp->m_buf_workqueue);
 out:
@@ -898,10 +860,8 @@
 {
 	destroy_workqueue(mp->m_sync_workqueue);
 	destroy_workqueue(mp->m_eofblocks_workqueue);
-	destroy_workqueue(mp->m_log_workqueue);
 	destroy_workqueue(mp->m_reclaim_workqueue);
 	destroy_workqueue(mp->m_cil_workqueue);
-	destroy_workqueue(mp->m_data_workqueue);
 	destroy_workqueue(mp->m_unwritten_workqueue);
 	destroy_workqueue(mp->m_buf_workqueue);
 }
@@ -933,6 +893,32 @@
 	return NULL;
 }
 
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_bmbt_irec	got;
+	struct xfs_iext_cursor	icur;
+
+	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+		return;
+	do {
+		if (isnullstartblock(got.br_startblock)) {
+			xfs_warn(ip->i_mount,
+	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+				ip->i_ino,
+				whichfork == XFS_DATA_FORK ? "data" : "cow",
+				got.br_startoff, got.br_blockcount);
+		}
+	} while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
+#endif
+
 /*
  * Now that the generic code is guaranteed not to be accessing
  * the linux inode, we can inactivate and reclaim the inode.
@@ -951,7 +937,12 @@
 
 	xfs_inactive(ip);
 
-	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
+		xfs_check_delalloc(ip, XFS_DATA_FORK);
+		xfs_check_delalloc(ip, XFS_COW_FORK);
+		ASSERT(0);
+	}
+
 	XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
 	/*
@@ -1097,7 +1088,7 @@
 	xfs_extlen_t		lsize;
 	int64_t			ffree;
 
-	statp->f_type = XFS_SB_MAGIC;
+	statp->f_type = XFS_SUPER_MAGIC;
 	statp->f_namelen = MAXNAMELEN - 1;
 
 	id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
@@ -1117,12 +1108,12 @@
 	statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
 	statp->f_bavail = statp->f_bfree;
 
-	fakeinos = statp->f_bfree << sbp->sb_inopblog;
+	fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
 	statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
-	if (mp->m_maxicount)
+	if (M_IGEO(mp)->maxicount)
 		statp->f_files = min_t(typeof(statp->f_files),
 					statp->f_files,
-					mp->m_maxicount);
+					M_IGEO(mp)->maxicount);
 
 	/* If sb_icount overshot maxicount, report actual allocation */
 	statp->f_files = max_t(typeof(statp->f_files),
@@ -1343,7 +1334,7 @@
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 			return error;
 		}
-		xfs_icache_enable_reclaim(mp);
+		xfs_start_block_reaping(mp);
 
 		/* Create the per-AG metadata reservation pool .*/
 		error = xfs_fs_reserve_ag_blocks(mp);
@@ -1357,7 +1348,7 @@
 		 * Cancel background eofb scanning so it cannot race with the
 		 * final log force+buftarg wait and deadlock the remount.
 		 */
-		xfs_icache_disable_reclaim(mp);
+		xfs_stop_block_reaping(mp);
 
 		/* Get rid of any leftover CoW reservations... */
 		error = xfs_icache_free_cowblocks(mp, NULL);
@@ -1401,7 +1392,7 @@
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	xfs_icache_disable_reclaim(mp);
+	xfs_stop_block_reaping(mp);
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return xfs_sync_sb(mp, true);
@@ -1415,7 +1406,7 @@
 
 	xfs_restore_resvblks(mp);
 	xfs_log_work_queue(mp);
-	xfs_icache_enable_reclaim(mp);
+	xfs_start_block_reaping(mp);
 	return 0;
 }
 
@@ -1424,7 +1415,8 @@
 	struct seq_file		*m,
 	struct dentry		*root)
 {
-	return xfs_showargs(XFS_M(root->d_sb), m);
+	xfs_showargs(XFS_M(root->d_sb), m);
+	return 0;
 }
 
 /*
@@ -1513,8 +1505,14 @@
 	if (error)
 		goto free_ifree;
 
+	error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
+	if (error)
+		goto free_fdblocks;
+
 	return 0;
 
+free_fdblocks:
+	percpu_counter_destroy(&mp->m_fdblocks);
 free_ifree:
 	percpu_counter_destroy(&mp->m_ifree);
 free_icount:
@@ -1538,6 +1536,9 @@
 	percpu_counter_destroy(&mp->m_icount);
 	percpu_counter_destroy(&mp->m_ifree);
 	percpu_counter_destroy(&mp->m_fdblocks);
+	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+	       percpu_counter_sum(&mp->m_delalloc_blks) == 0);
+	percpu_counter_destroy(&mp->m_delalloc_blks);
 }
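
The counter setup above follows the kernel's stacked-unwind convention: each
successful allocation adds one label that tears down exactly the resources
acquired before the failing step. A minimal runnable shape of the idiom, with
hypothetical resources:

#include <stdlib.h>

int setup(void **a, void **b)
{
	*a = malloc(16);
	if (!*a)
		goto fail;
	*b = malloc(16);
	if (!*b)
		goto free_a;	/* undo only what already succeeded */
	return 0;

free_a:
	free(*a);
fail:
	return -1;
}
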
 
 static struct xfs_mount *
@@ -1561,6 +1562,13 @@
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
 	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
+	/*
+	 * We don't create the finobt per-ag space reservation until after log
+	 * recovery, so we must set this to true so that an ifree transaction
+	 * started during log recovery will not depend on space reservations
+	 * for finobt expansion.
+	 */
+	mp->m_finobt_nores = true;
 	return mp;
 }
 
@@ -1650,12 +1658,16 @@
 	 * we must configure the block size in the superblock before we run the
 	 * full mount process as the mount process can lookup and cache inodes.
 	 */
-	sb->s_magic = XFS_SB_MAGIC;
+	sb->s_magic = XFS_SUPER_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
 	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
 	sb->s_max_links = XFS_MAXLINK;
 	sb->s_time_gran = 1;
+	sb->s_time_min = S32_MIN;
+	sb->s_time_max = S32_MAX;
+	sb->s_iflags |= SB_I_CGROUPWB;
+
 	set_posix_acl_flag(sb);
 
 	/* version 5 superblocks support inode version counters. */
@@ -1696,11 +1708,18 @@
 		}
 	}
 
-	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
-		xfs_alert(mp,
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		if (mp->m_sb.sb_rblocks) {
+			xfs_alert(mp,
 	"reflink not compatible with realtime device!");
-		error = -EINVAL;
-		goto out_filestream_unmount;
+			error = -EINVAL;
+			goto out_filestream_unmount;
+		}
+
+		if (xfs_globals.always_cow) {
+			xfs_info(mp, "using DEBUG-only always_cow mode.");
+			mp->m_always_cow = true;
+		}
 	}
 
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
@@ -2053,11 +2072,6 @@
 	printk(KERN_INFO XFS_VERSION_STRING " with "
 			 XFS_BUILD_OPTIONS " enabled\n");
 
-	xfs_extent_free_init_defer_op();
-	xfs_rmap_update_init_defer_op();
-	xfs_refcount_update_init_defer_op();
-	xfs_bmap_update_init_defer_op();
-
 	xfs_dir_startup();
 
 	error = xfs_init_zones();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 21cb49a..763e43d 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -38,6 +38,18 @@
 # define XFS_SCRUB_STRING
 #endif
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+# define XFS_REPAIR_STRING	"repair, "
+#else
+# define XFS_REPAIR_STRING
+#endif
+
+#ifdef CONFIG_XFS_WARN
+# define XFS_WARN_STRING	"verbose warnings, "
+#else
+# define XFS_WARN_STRING
+#endif
+
 #ifdef DEBUG
 # define XFS_DBG_STRING		"debug"
 #else
@@ -49,6 +61,8 @@
 				XFS_SECURITY_STRING \
 				XFS_REALTIME_STRING \
 				XFS_SCRUB_STRING \
+				XFS_REPAIR_STRING \
+				XFS_WARN_STRING \
 				XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a3e98c6..ed66fd2 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -12,23 +12,14 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_defer.h"
 #include "xfs_dir2.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
-#include "xfs_symlink.h"
 #include "xfs_trans.h"
-#include "xfs_log.h"
 
 /* ----- Kernel only functions below ----- */
 int
@@ -192,6 +183,7 @@
 	pathlen = strlen(target_path);
 	if (pathlen >= XFS_SYMLINK_MAXLEN)      /* total string too long */
 		return -ENAMETOOLONG;
+	ASSERT(pathlen > 0);
 
 	udqp = gdqp = NULL;
 	prid = xfs_get_initial_prid(dp);
@@ -378,6 +370,12 @@
 
 /*
  * Free a symlink that has blocks associated with it.
+ *
+ * Note: zero length symlinks are not allowed to exist. When we set the size to
+ * zero, also change it to a regular file so that it does not get written to
+ * disk as a zero length symlink. The inode is already on the unlinked list,
+ * so userspace cannot find it anymore; the change is not user visible, but it
+ * lets us catch corrupt zero-length symlinks in the verifiers.
  */
 STATIC int
 xfs_inactive_symlink_rmt(
@@ -412,13 +410,14 @@
 	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
-	 * Lock the inode, fix the size, and join it to the transaction.
-	 * Hold it so in the normal path, we still have it locked for
-	 * the second transaction.  In the error paths we need it
+	 * Lock the inode, fix the size, turn it into a regular file and join it
+	 * to the transaction.  Hold it so in the normal path, we still have it
+	 * locked for the second transaction.  In the error paths we need it
 	 * held so the cancel won't rele it, see below.
 	 */
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
+	VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -494,17 +493,10 @@
 		return -EIO;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * Zero length symlinks _can_ exist.
-	 */
 	pathlen = (int)ip->i_d.di_size;
-	if (!pathlen) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		return 0;
-	}
+	ASSERT(pathlen);
 
-	if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
+	if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) {
 		xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
 			 __func__, (unsigned long long)ip->i_ino, pathlen);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -512,12 +504,12 @@
 		return -EFSCORRUPTED;
 	}
 
+	/*
+	 * Inline fork state gets removed by xfs_difree() so we have nothing to
+	 * do here in that case.
+	 */
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
-		if (ip->i_df.if_bytes > 0) 
-			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
-					  XFS_DATA_FORK);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		ASSERT(ip->i_df.if_bytes == 0);
 		return 0;
 	}
 
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 0cc034d..31b3bdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -4,10 +4,7 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
 #include "xfs_error.h"
-#include "xfs_stats.h"
 
 static struct ctl_table_header *xfs_table_header;
 
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 1684881..8abf464 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -82,9 +82,13 @@
 extern xfs_param_t	xfs_params;
 
 struct xfs_globals {
+#ifdef DEBUG
+	int	pwork_threads;		/* parallel workqueue threads */
+#endif
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
+	bool	always_cow;		/* use COW fork for all overwrites */
 };
 extern struct xfs_globals	xfs_globals;
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994..f1bc88f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -10,9 +10,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sysfs.h"
-#include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_stats.h"
 #include "xfs_mount.h"
 
 struct xfs_sysfs_attr {
@@ -65,19 +63,6 @@
 	.store = xfs_sysfs_object_store,
 };
 
-/*
- * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
- * that is identified by the fsname under sysfs.
- */
-
-static inline struct xfs_mount *
-to_mp(struct kobject *kobject)
-{
-	struct xfs_kobj *kobj = to_kobj(kobject);
-
-	return container_of(kobj, struct xfs_mount, m_kobj);
-}
-
 static struct attribute *xfs_mp_attrs[] = {
 	NULL,
 };
@@ -183,10 +168,74 @@
 }
 XFS_SYSFS_ATTR_RW(mount_delay);
 
+static ssize_t
+always_cow_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	ssize_t		ret;
+
+	ret = kstrtobool(buf, &xfs_globals.always_cow);
+	if (ret < 0)
+		return ret;
+	return count;
+}
+
+static ssize_t
+always_cow_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
+#ifdef DEBUG
+/*
+ * Override how many threads the parallel work queue is allowed to create.
+ * This has to be a debug-only global (instead of an errortag) because one of
+ * the main users of parallel workqueues is mount time quotacheck.
+ */
+STATIC ssize_t
+pwork_threads_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < -1 || val > num_possible_cpus())
+		return -EINVAL;
+
+	xfs_globals.pwork_threads = val;
+
+	return count;
+}
+
+STATIC ssize_t
+pwork_threads_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+}
+XFS_SYSFS_ATTR_RW(pwork_threads);
+#endif /* DEBUG */
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
 	ATTR_LIST(mount_delay),
+	ATTR_LIST(always_cow),
+#ifdef DEBUG
+	ATTR_LIST(pwork_threads),
+#endif
 	NULL,
 };
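
Both knobs follow this file's existing XFS_SYSFS_ATTR_RW pattern: a _store
handler parses the user buffer, a _show handler formats the current value
into a page-sized buffer, and the attribute is then listed in the relevant
attrs[] array. Stripped to its essentials (example_flag and the example_*
handlers are hypothetical, not XFS symbols):

	static bool example_flag;

	static ssize_t
	example_store(struct kobject *kobject, const char *buf, size_t count)
	{
		ssize_t		ret = kstrtobool(buf, &example_flag);

		return ret < 0 ? ret : count;	/* consume the whole write */
	}

	static ssize_t
	example_show(struct kobject *kobject, char *buf)
	{
		return snprintf(buf, PAGE_SIZE, "%d\n", example_flag);
	}
	XFS_SYSFS_ATTR_RW(example);

From userspace the debug attributes typically surface under
/sys/fs/xfs/debug/, e.g. "echo 1 > /sys/fs/xfs/debug/always_cow".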
 
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index cb6489c..bc85b89 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -15,24 +15,16 @@
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_da_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
 #include "xfs_trans.h"
-#include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_quota.h"
-#include "xfs_iomap.h"
-#include "xfs_aops.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 #include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_filestream.h"
 #include "xfs_fsmap.h"
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3043e5e..eaae275 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@
 struct xlog_ticket;
 struct xlog_recover;
 struct xlog_recover_item;
+struct xlog_rec_header;
 struct xfs_buf_log_format;
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@
 struct xfs_refcount_irec;
 struct xfs_fsmap;
 struct xfs_rmap_irec;
+struct xfs_icreate_log;
+struct xfs_owner_info;
+struct xfs_trans_res;
+struct xfs_inobt_rec_incore;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -280,7 +285,10 @@
 	),
 	TP_fast_assign(
 		__entry->dev = bp->b_target->bt_dev;
-		__entry->bno = bp->b_bn;
+		if (bp->b_bn == XFS_BUF_DADDR_NULL)
+			__entry->bno = bp->b_maps[0].bm_bn;
+		else
+			__entry->bno = bp->b_bn;
 		__entry->nblks = bp->b_length;
 		__entry->hold = atomic_read(&bp->b_hold);
 		__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -472,7 +480,7 @@
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
@@ -637,6 +645,16 @@
 DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(PE_SIZE_PTE);
+TRACE_DEFINE_ENUM(PE_SIZE_PMD);
+TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+
 TRACE_EVENT(xfs_filemap_fault,
 	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
 		 bool write_fault),
@@ -1207,15 +1225,15 @@
 
 DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-		 int type, struct xfs_bmbt_irec *irec),
-	TP_ARGS(ip, offset, count, type, irec),
+		 int whichfork, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, whichfork, irec),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(loff_t, size)
 		__field(loff_t, offset)
 		__field(size_t, count)
-		__field(int, type)
+		__field(int, whichfork)
 		__field(xfs_fileoff_t, startoff)
 		__field(xfs_fsblock_t, startblock)
 		__field(xfs_filblks_t, blockcount)
@@ -1226,33 +1244,33 @@
 		__entry->size = ip->i_d.di_size;
 		__entry->offset = offset;
 		__entry->count = count;
-		__entry->type = type;
+		__entry->whichfork = whichfork;
 		__entry->startoff = irec ? irec->br_startoff : 0;
 		__entry->startblock = irec ? irec->br_startblock : 0;
 		__entry->blockcount = irec ? irec->br_blockcount : 0;
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
-		  "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
 		  __entry->offset,
 		  __entry->count,
-		  __print_symbolic(__entry->type, XFS_IO_TYPES),
+		  __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
 		  __entry->startoff,
 		  (int64_t)__entry->startblock,
 		  __entry->blockcount)
 )
 
-#define DEFINE_IOMAP_EVENT(name)	\
+#define DEFINE_IMAP_EVENT(name)	\
 DEFINE_EVENT(xfs_imap_class, name,	\
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,	\
-		 int type, struct xfs_bmbt_irec *irec),		\
-	TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+		 int whichfork, struct xfs_bmbt_irec *irec),		\
+	TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1882,11 +1900,11 @@
 	{ 0,	"target" }, \
 	{ 1,	"temp" }
 
-#define XFS_INODE_FORMAT_STR \
-	{ 0,	"invalid" }, \
-	{ 1,	"local" }, \
-	{ 2,	"extent" }, \
-	{ 3,	"btree" }
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
 
 DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 	TP_PROTO(struct xfs_inode *ip, int which),
@@ -2175,6 +2193,14 @@
 DEFINE_DISCARD_EVENT(xfs_discard_busy);
 
 /* btree cursor events */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
 DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
 	TP_ARGS(cur, level, bp),
@@ -2194,9 +2220,9 @@
 		__entry->ptr = cur->bc_ptrs[level];
 		__entry->daddr = bp ? bp->b_bn : -1;
 	),
-	TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
+	TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->btnum,
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
 		  __entry->level,
 		  __entry->nlevels,
 		  __entry->ptr,
@@ -2273,7 +2299,7 @@
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
-		__entry->type = dfp->dfp_type->type;
+		__entry->type = dfp->dfp_type;
 		__entry->intent = dfp->dfp_intent;
 		__entry->committed = dfp->dfp_done != NULL;
 		__entry->nr = dfp->dfp_count;
@@ -2402,7 +2428,7 @@
 DECLARE_EVENT_CLASS(xfs_rmap_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
-		 struct xfs_owner_info *oinfo),
+		 const struct xfs_owner_info *oinfo),
 	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
@@ -2437,7 +2463,7 @@
 DEFINE_EVENT(xfs_rmap_class, name, \
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
 		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
-		 struct xfs_owner_info *oinfo), \
+		 const struct xfs_owner_info *oinfo), \
 	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
 
 /* simple AG-based error/%ip tracepoint class */
@@ -2607,10 +2633,9 @@
 #define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
 
 /* ag btree lookup tracepoint class */
-#define XFS_AG_BTREE_CMP_FORMAT_STR \
-	{ XFS_LOOKUP_EQ,	"eq" }, \
-	{ XFS_LOOKUP_LE,	"le" }, \
-	{ XFS_LOOKUP_GE,	"ge" }
+TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
 DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 xfs_agblock_t agbno, xfs_lookup_t dir),
@@ -3052,7 +3077,7 @@
 DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
 DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
 DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
 TRACE_EVENT(xfs_reflink_remap_blocks_loop,
 	TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
 		 xfs_filblks_t len, struct xfs_inode *dest,
@@ -3176,13 +3201,10 @@
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3343,8 +3365,250 @@
 DEFINE_TRANS_EVENT(xfs_trans_free);
 DEFINE_TRANS_EVENT(xfs_trans_roll);
 DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_commit_items);
 DEFINE_TRANS_EVENT(xfs_trans_free_items);
 
+TRACE_EVENT(xfs_iunlink_update_bucket,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, bucket)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->bucket = bucket;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->bucket,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+	TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(xfs_agino_t, old_ptr)
+		__field(xfs_agino_t, new_ptr)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->old_ptr = old_ptr;
+		__entry->new_ptr = new_ptr;
+	),
+	TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agino,
+		  __entry->old_ptr,
+		  __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+	TP_PROTO(struct xfs_inode *ip),
+	TP_ARGS(ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+		__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+	),
+	TP_printk("dev %d:%d agno %u agino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+	TP_PROTO(struct xfs_inode *ip), \
+	TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
+DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
+	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
+	TP_ARGS(mp, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->flags)
+);
+#define DEFINE_FS_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
+	TP_ARGS(mp, flags))
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
+	TP_ARGS(mp, agno, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d agno %u flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->flags)
+);
+#define DEFINE_AG_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 unsigned int flags), \
+	TP_ARGS(mp, agno, flags))
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
+	TP_ARGS(ip, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->flags)
+);
+#define DEFINE_INODE_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
+	TP_ARGS(ip, flags))
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+
+TRACE_EVENT(xfs_iwalk_ag,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t startino),
+	TP_ARGS(mp, agno, startino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = startino;
+	),
+	TP_printk("dev %d:%d agno %d startino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->startino)
+)
+
+TRACE_EVENT(xfs_iwalk_ag_rec,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_inobt_rec_incore *irec),
+	TP_ARGS(mp, agno, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+		__field(uint64_t, freemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = irec->ir_startino;
+		__entry->freemask = irec->ir_free;
+	),
+	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->startino, __entry->freemask)
+)
+
+TRACE_EVENT(xfs_pwork_init,
+	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
+	TP_ARGS(mp, nr_threads, pid),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, nr_threads)
+		__field(pid_t, pid)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->nr_threads = nr_threads;
+		__entry->pid = pid;
+	),
+	TP_printk("dev %d:%d nr_threads %u pid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_threads, __entry->pid)
+)
+
+DECLARE_EVENT_CLASS(xfs_kmem_class,
+	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
+	TP_ARGS(size, flags, caller_ip),
+	TP_STRUCT__entry(
+		__field(ssize_t, size)
+		__field(int, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->size = size;
+		__entry->flags = flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("size %zd flags 0x%x caller %pS",
+		  __entry->size,
+		  __entry->flags,
+		  (char *)__entry->caller_ip)
+)
+
+#define DEFINE_KMEM_EVENT(name) \
+DEFINE_EVENT(xfs_kmem_class, name, \
+	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
+	TP_ARGS(size, flags, caller_ip))
+DEFINE_KMEM_EVENT(kmem_alloc);
+DEFINE_KMEM_EVENT(kmem_alloc_io);
+DEFINE_KMEM_EVENT(kmem_alloc_large);
+DEFINE_KMEM_EVENT(kmem_realloc);
+DEFINE_KMEM_EVENT(kmem_zone_alloc);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
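
A note on the recurring TRACE_DEFINE_ENUM additions throughout this file:
__print_symbolic() is evaluated when the ring buffer is read, not when the
event fires, so any enum constant used in a TP_printk() decode table must
first be exported to the trace format via TRACE_DEFINE_ENUM. A minimal
event following that pattern (the event name is hypothetical; the enum and
macros are the ones used above):

	TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);

	TRACE_EVENT(example_lookup,
		TP_PROTO(int dir),
		TP_ARGS(dir),
		TP_STRUCT__entry(
			__field(int, dir)
		),
		TP_fast_assign(
			__entry->dir = dir;
		),
		TP_printk("dir %s",
			  __print_symbolic(__entry->dir,
					   { XFS_LOOKUP_EQi, "eq" }))
	);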
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 912b42f..f4795fd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -11,7 +11,6 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_inode.h"
 #include "xfs_extent_busy.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
@@ -91,7 +90,7 @@
 
 	trace_xfs_trans_dup(tp, _RET_IP_);
 
-	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
 
 	/*
 	 * Initialize the new transaction structure.
@@ -264,9 +263,7 @@
 	 * GFP_NOFS allocation context so that we avoid lockdep false positives
 	 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
 	 */
-	tp = kmem_zone_zalloc(xfs_trans_zone,
-		(flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
-
+	tp = kmem_zone_zalloc(xfs_trans_zone, 0);
 	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_start_intwrite(mp->m_super);
 
@@ -452,7 +449,7 @@
 	xfs_buf_t	*bp;
 	int		whole = 0;
 
-	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+	bp = xfs_trans_getsb(tp, tp->t_mountp);
 	sbp = XFS_BUF_TO_SBP(bp);
 
 	/*
@@ -767,10 +764,9 @@
 }
 
 /* Detach and unlock all of the items in a transaction */
-void
+static void
 xfs_trans_free_items(
 	struct xfs_trans	*tp,
-	xfs_lsn_t		commit_lsn,
 	bool			abort)
 {
 	struct xfs_log_item	*lip, *next;
@@ -779,11 +775,10 @@
 
 	list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
 		xfs_trans_del_item(lip);
-		if (commit_lsn != NULLCOMMITLSN)
-			lip->li_ops->iop_committing(lip, commit_lsn);
 		if (abort)
 			set_bit(XFS_LI_ABORTED, &lip->li_flags);
-		lip->li_ops->iop_unlock(lip);
+		if (lip->li_ops->iop_release)
+			lip->li_ops->iop_release(lip);
 	}
 }
 
@@ -804,7 +799,8 @@
 	for (i = 0; i < nr_items; i++) {
 		struct xfs_log_item *lip = log_items[i];
 
-		lip->li_ops->iop_unpin(lip, 0);
+		if (lip->li_ops->iop_unpin)
+			lip->li_ops->iop_unpin(lip, 0);
 	}
 }
 
@@ -815,7 +811,7 @@
  *
  * If we are called with the aborted flag set, it is because a log write during
  * a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through iop_commited and iop_unlock, which
+ * checkpoint have already gone through iop_committed and iop_committing, which
  * means that checkpoint commit abort handling is treated exactly the same
  * as an iclog write error even though we haven't started any IO yet. Hence in
  * this case all we need to do is iop_committed processing, followed by an
@@ -833,7 +829,7 @@
 	struct xfs_ail		*ailp,
 	struct xfs_log_vec	*log_vector,
 	xfs_lsn_t		commit_lsn,
-	int			aborted)
+	bool			aborted)
 {
 #define LOG_ITEM_BATCH_SIZE	32
 	struct xfs_log_item	*log_items[LOG_ITEM_BATCH_SIZE];
@@ -852,7 +848,16 @@
 
 		if (aborted)
 			set_bit(XFS_LI_ABORTED, &lip->li_flags);
-		item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+
+		if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
+			lip->li_ops->iop_release(lip);
+			continue;
+		}
+
+		if (lip->li_ops->iop_committed)
+			item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+		else
+			item_lsn = commit_lsn;
 
 		/* item_lsn of -1 means the item needs no further processing */
 		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -864,7 +869,8 @@
 		 */
 		if (aborted) {
 			ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
-			lip->li_ops->iop_unpin(lip, 1);
+			if (lip->li_ops->iop_unpin)
+				lip->li_ops->iop_unpin(lip, 1);
 			continue;
 		}
 
@@ -882,7 +888,8 @@
 				xfs_trans_ail_update(ailp, lip, item_lsn);
 			else
 				spin_unlock(&ailp->ail_lock);
-			lip->li_ops->iop_unpin(lip, 0);
+			if (lip->li_ops->iop_unpin)
+				lip->li_ops->iop_unpin(lip, 0);
 			continue;
 		}
 
@@ -998,7 +1005,7 @@
 		tp->t_ticket = NULL;
 	}
 	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-	xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
+	xfs_trans_free_items(tp, !!error);
 	xfs_trans_free(tp);
 
 	XFS_STATS_INC(mp, xs_trans_empty);
@@ -1060,7 +1067,7 @@
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
-	xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
+	xfs_trans_free_items(tp, dirty);
 	xfs_trans_free(tp);
 }
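
The theme of the xfs_trans.c changes is that log item callbacks become
optional: iop_unlock is gone, intent items are released as soon as they
commit, and anything without an iop_committed callback simply keeps the
checkpoint's commit LSN. Condensed from xfs_trans_committed_bulk() above:

	if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
		/* intent items: drop immediately, never enter the AIL */
		lip->li_ops->iop_release(lip);
		continue;
	}

	if (lip->li_ops->iop_committed)
		item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
	else
		item_lsn = commit_lsn;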
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c3d278e..64d7f17 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,7 @@
 struct xfs_bui_log_item;
 struct xfs_bud_log_item;
 
-typedef struct xfs_log_item {
+struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
 	struct list_head		li_trans;	/* transaction list */
 	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
@@ -48,7 +48,7 @@
 	struct xfs_log_vec		*li_lv;		/* active log vector */
 	struct xfs_log_vec		*li_lv_shadow;	/* standby vector */
 	xfs_lsn_t			li_seq;		/* CIL commit seq */
-} xfs_log_item_t;
+};
 
 /*
  * li_flags use the (set/test/clear)_bit atomic interfaces because updates can
@@ -67,17 +67,24 @@
 	{ (1 << XFS_LI_DIRTY),		"DIRTY" }
 
 struct xfs_item_ops {
-	void (*iop_size)(xfs_log_item_t *, int *, int *);
-	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
-	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int remove);
+	unsigned flags;
+	void (*iop_size)(struct xfs_log_item *, int *, int *);
+	void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
+	void (*iop_pin)(struct xfs_log_item *);
+	void (*iop_unpin)(struct xfs_log_item *, int remove);
 	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
-	void (*iop_unlock)(xfs_log_item_t *);
-	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
-	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-	void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+	void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
+	void (*iop_release)(struct xfs_log_item *);
+	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+	void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
 };
 
+/*
+ * Release the log item as soon as committed.  This is for items that only
+ * log intents and never need to be written back in place.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED	(1 << 0)
+
 void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
 			  int type, const struct xfs_item_ops *ops);
 
@@ -203,7 +210,7 @@
 				      flags, bpp, ops);
 }
 
-struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
 
 void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -220,16 +227,9 @@
 void		xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
 				  uint);
 void		xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool		xfs_trans_buf_is_dirty(struct xfs_buf *bp);
 void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 
-void		xfs_extent_free_init_defer_op(void);
-struct xfs_efd_log_item	*xfs_trans_get_efd(struct xfs_trans *,
-				  struct xfs_efi_log_item *,
-				  uint);
-int		xfs_trans_free_extent(struct xfs_trans *,
-				      struct xfs_efd_log_item *, xfs_fsblock_t,
-				      xfs_extlen_t, struct xfs_owner_info *,
-				      bool);
 int		xfs_trans_commit(struct xfs_trans *);
 int		xfs_trans_roll(struct xfs_trans **);
 int		xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
@@ -244,40 +244,4 @@
 
 extern kmem_zone_t	*xfs_trans_zone;
 
-/* rmap updates */
-enum xfs_rmap_intent_type;
-
-void xfs_rmap_update_init_defer_op(void);
-struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
-		struct xfs_rui_log_item *ruip);
-int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
-		struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
-		uint64_t owner, int whichfork, xfs_fileoff_t startoff,
-		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
-		xfs_exntst_t state, struct xfs_btree_cur **pcur);
-
-/* refcount updates */
-enum xfs_refcount_intent_type;
-
-void xfs_refcount_update_init_defer_op(void);
-struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
-		struct xfs_cui_log_item *cuip);
-int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
-		struct xfs_cud_log_item *cudp,
-		enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
-		xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
-		xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
-
-/* mapping updates */
-enum xfs_bmap_intent_type;
-
-void xfs_bmap_update_init_defer_op(void);
-struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
-		struct xfs_bui_log_item *buip);
-int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
-		struct xfs_bud_log_item *rudp, enum xfs_bmap_intent_type type,
-		struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff,
-		xfs_fsblock_t startblock, xfs_filblks_t *blockcount,
-		xfs_exntst_t state);
-
 #endif	/* __XFS_TRANS_H__ */
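
With the xfs_log_item_t typedef gone and the ops table gaining a flags
word, an intent-style log item can now omit every callback it does not
need. A hypothetical ops table using the new flag (the example_intent_*
handlers are made up for illustration):

	static const struct xfs_item_ops example_intent_ops = {
		.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
		.iop_size	= example_intent_size,
		.iop_format	= example_intent_format,
		.iop_release	= example_intent_release,
		/*
		 * iop_push and iop_unpin omitted: the AIL now treats a
		 * missing iop_push as "pinned" and forces the log instead
		 * (see the xfs_trans_ail.c hunk below).
		 */
	};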
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 55326f9..6ccfd75 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -6,6 +6,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -74,29 +75,29 @@
  * Return a pointer to the last item in the AIL.  If the AIL is empty, then
  * return NULL.
  */
-static xfs_log_item_t *
+static struct xfs_log_item *
 xfs_ail_max(
 	struct xfs_ail  *ailp)
 {
 	if (list_empty(&ailp->ail_head))
 		return NULL;
 
-	return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail);
+	return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail);
 }
 
 /*
  * Return a pointer to the item which follows the given item in the AIL.  If
  * the given item is the last item in the list, then return NULL.
  */
-static xfs_log_item_t *
+static struct xfs_log_item *
 xfs_ail_next(
-	struct xfs_ail  *ailp,
-	xfs_log_item_t  *lip)
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
 {
 	if (lip->li_ail.next == &ailp->ail_head)
 		return NULL;
 
-	return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
+	return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail);
 }
 
 /*
@@ -109,10 +110,10 @@
  */
 xfs_lsn_t
 xfs_ail_min_lsn(
-	struct xfs_ail	*ailp)
+	struct xfs_ail		*ailp)
 {
-	xfs_lsn_t	lsn = 0;
-	xfs_log_item_t	*lip;
+	xfs_lsn_t		lsn = 0;
+	struct xfs_log_item	*lip;
 
 	spin_lock(&ailp->ail_lock);
 	lip = xfs_ail_min(ailp);
@@ -128,10 +129,10 @@
  */
 static xfs_lsn_t
 xfs_ail_max_lsn(
-	struct xfs_ail  *ailp)
+	struct xfs_ail		*ailp)
 {
-	xfs_lsn_t       lsn = 0;
-	xfs_log_item_t  *lip;
+	xfs_lsn_t		lsn = 0;
+	struct xfs_log_item	*lip;
 
 	spin_lock(&ailp->ail_lock);
 	lip = xfs_ail_max(ailp);
@@ -216,13 +217,13 @@
  * ascending traversal.  Pass a @lsn of zero to initialise the cursor to the
  * first item in the AIL. Returns NULL if the list is empty.
  */
-xfs_log_item_t *
+struct xfs_log_item *
 xfs_trans_ail_cursor_first(
 	struct xfs_ail		*ailp,
 	struct xfs_ail_cursor	*cur,
 	xfs_lsn_t		lsn)
 {
-	xfs_log_item_t		*lip;
+	struct xfs_log_item	*lip;
 
 	xfs_trans_ail_cursor_init(ailp, cur);
 
@@ -248,7 +249,7 @@
 	struct xfs_ail		*ailp,
 	xfs_lsn_t		lsn)
 {
-	xfs_log_item_t		*lip;
+	struct xfs_log_item	*lip;
 
 	list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) {
 		if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
@@ -327,8 +328,8 @@
  */
 static void
 xfs_ail_delete(
-	struct xfs_ail  *ailp,
-	xfs_log_item_t  *lip)
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
 {
 	xfs_ail_check(ailp, lip);
 	list_del(&lip->li_ail);
@@ -347,6 +348,14 @@
 	if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
 		return XFS_ITEM_PINNED;
 
+	/*
+	 * Consider the item pinned if a push callback is not defined, so the
+	 * caller will force the log. This should only happen for intent items
+	 * as they are unpinned once the associated done item is committed to
+	 * the on-disk log.
+	 */
+	if (!lip->li_ops->iop_push)
+		return XFS_ITEM_PINNED;
 	return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
 }
 
@@ -356,7 +365,7 @@
 {
 	xfs_mount_t		*mp = ailp->ail_mount;
 	struct xfs_ail_cursor	cur;
-	xfs_log_item_t		*lip;
+	struct xfs_log_item	*lip;
 	xfs_lsn_t		lsn;
 	xfs_lsn_t		target;
 	long			tout;
@@ -531,17 +540,33 @@
 			set_current_state(TASK_INTERRUPTIBLE);
 
 		/*
-		 * Check kthread_should_stop() after we set the task state
-		 * to guarantee that we either see the stop bit and exit or
-		 * the task state is reset to runnable such that it's not
-		 * scheduled out indefinitely and detects the stop bit at
-		 * next iteration.
-		 *
+		 * Check kthread_should_stop() after we set the task state to
+		 * guarantee that we either see the stop bit and exit or the
+		 * task state is reset to runnable such that it's not scheduled
+		 * out indefinitely and detects the stop bit at next iteration.
 		 * A memory barrier is included in above task state set to
 		 * serialize again kthread_stop().
 		 */
 		if (kthread_should_stop()) {
 			__set_current_state(TASK_RUNNING);
+
+			/*
+			 * The caller forces out the AIL before stopping the
+			 * thread in the common case, which means the delwri
+			 * queue is drained. In the shutdown case, the queue may
+			 * still hold relogged buffers that haven't been
+			 * submitted because they were pinned since added to the
+			 * queue.
+			 *
+			 * Log I/O error processing stales the underlying buffer
+			 * and clears the delwri state, expecting the buf to be
+			 * removed on the next submission attempt. That won't
+			 * happen if we're shutting down, so this is the last
+			 * opportunity to release such buffers from the queue.
+			 */
+			ASSERT(list_empty(&ailp->ail_buf_list) ||
+			       XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+			xfs_buf_delwri_cancel(&ailp->ail_buf_list);
 			break;
 		}
 
@@ -595,10 +620,10 @@
  */
 void
 xfs_ail_push(
-	struct xfs_ail	*ailp,
-	xfs_lsn_t	threshold_lsn)
+	struct xfs_ail		*ailp,
+	xfs_lsn_t		threshold_lsn)
 {
-	xfs_log_item_t	*lip;
+	struct xfs_log_item	*lip;
 
 	lip = xfs_ail_min(ailp);
 	if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
@@ -683,7 +708,7 @@
 	int			nr_items,
 	xfs_lsn_t		lsn) __releases(ailp->ail_lock)
 {
-	xfs_log_item_t		*mlip;
+	struct xfs_log_item	*mlip;
 	int			mlip_changed = 0;
 	int			i;
 	LIST_HEAD(tmp);
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
deleted file mode 100644
index 741c558..0000000
--- a/fs/xfs/xfs_trans_bmap.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_bmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_inode.h"
-
-/*
- * This routine is called to allocate a "bmap update done"
- * log item.
- */
-struct xfs_bud_log_item *
-xfs_trans_get_bud(
-	struct xfs_trans		*tp,
-	struct xfs_bui_log_item		*buip)
-{
-	struct xfs_bud_log_item		*budp;
-
-	budp = xfs_bud_init(tp->t_mountp, buip);
-	xfs_trans_add_item(tp, &budp->bud_item);
-	return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_bmap_update(
-	struct xfs_trans		*tp,
-	struct xfs_bud_log_item		*budp,
-	enum xfs_bmap_intent_type	type,
-	struct xfs_inode		*ip,
-	int				whichfork,
-	xfs_fileoff_t			startoff,
-	xfs_fsblock_t			startblock,
-	xfs_filblks_t			*blockcount,
-	xfs_exntst_t			state)
-{
-	int				error;
-
-	error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
-			startblock, blockcount, state);
-
-	/*
-	 * Mark the transaction dirty, even on error. This ensures the
-	 * transaction is aborted, which:
-	 *
-	 * 1.) releases the BUI and frees the BUD
-	 * 2.) shuts down the filesystem
-	 */
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
-	return error;
-}
-
-/* Sort bmap intents by inode. */
-static int
-xfs_bmap_update_diff_items(
-	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
-{
-	struct xfs_bmap_intent		*ba;
-	struct xfs_bmap_intent		*bb;
-
-	ba = container_of(a, struct xfs_bmap_intent, bi_list);
-	bb = container_of(b, struct xfs_bmap_intent, bi_list);
-	return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
-}
-
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_bui_log_item		*buip;
-
-	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
-	ASSERT(tp != NULL);
-
-	buip = xfs_bui_init(tp->t_mountp);
-	ASSERT(buip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &buip->bui_item);
-	return buip;
-}
-
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
-	struct xfs_map_extent		*bmap,
-	enum xfs_bmap_intent_type	type,
-	int				whichfork,
-	xfs_exntst_t			state)
-{
-	bmap->me_flags = 0;
-	switch (type) {
-	case XFS_BMAP_MAP:
-	case XFS_BMAP_UNMAP:
-		bmap->me_flags = type;
-		break;
-	default:
-		ASSERT(0);
-	}
-	if (state == XFS_EXT_UNWRITTEN)
-		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
-	if (whichfork == XFS_ATTR_FORK)
-		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
-/* Log bmap updates in the intent item. */
-STATIC void
-xfs_bmap_update_log_item(
-	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
-{
-	struct xfs_bui_log_item		*buip = intent;
-	struct xfs_bmap_intent		*bmap;
-	uint				next_extent;
-	struct xfs_map_extent		*map;
-
-	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
-	/*
-	 * atomic_inc_return gives us the value after the increment;
-	 * we want to use it as an array index so we need to subtract 1 from
-	 * it.
-	 */
-	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
-	ASSERT(next_extent < buip->bui_format.bui_nextents);
-	map = &buip->bui_format.bui_extents[next_extent];
-	map->me_owner = bmap->bi_owner->i_ino;
-	map->me_startblock = bmap->bi_bmap.br_startblock;
-	map->me_startoff = bmap->bi_bmap.br_startoff;
-	map->me_len = bmap->bi_bmap.br_blockcount;
-	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
-			bmap->bi_bmap.br_state);
-}
-
-/* Get an BUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_bmap_update_create_done(
-	struct xfs_trans		*tp,
-	void				*intent,
-	unsigned int			count)
-{
-	return xfs_trans_get_bud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_bmap_update_finish_item(
-	struct xfs_trans		*tp,
-	struct list_head		*item,
-	void				*done_item,
-	void				**state)
-{
-	struct xfs_bmap_intent		*bmap;
-	xfs_filblks_t			count;
-	int				error;
-
-	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-	count = bmap->bi_bmap.br_blockcount;
-	error = xfs_trans_log_finish_bmap_update(tp, done_item,
-			bmap->bi_type,
-			bmap->bi_owner, bmap->bi_whichfork,
-			bmap->bi_bmap.br_startoff,
-			bmap->bi_bmap.br_startblock,
-			&count,
-			bmap->bi_bmap.br_state);
-	if (!error && count > 0) {
-		ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
-		bmap->bi_bmap.br_blockcount = count;
-		return -EAGAIN;
-	}
-	kmem_free(bmap);
-	return error;
-}
-
-/* Abort all pending BUIs. */
-STATIC void
-xfs_bmap_update_abort_intent(
-	void				*intent)
-{
-	xfs_bui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_bmap_intent		*bmap;
-
-	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-	kmem_free(bmap);
-}
-
-static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
-	.type		= XFS_DEFER_OPS_TYPE_BMAP,
-	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_bmap_update_diff_items,
-	.create_intent	= xfs_bmap_update_create_intent,
-	.abort_intent	= xfs_bmap_update_abort_intent,
-	.log_item	= xfs_bmap_update_log_item,
-	.create_done	= xfs_bmap_update_create_done,
-	.finish_item	= xfs_bmap_update_finish_item,
-	.cancel_item	= xfs_bmap_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_bmap_update_init_defer_op(void)
-{
-	xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 286a287..b5b3a78 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -10,11 +10,9 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
-#include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
-#include "xfs_error.h"
 #include "xfs_trace.h"
 
 /*
@@ -174,8 +172,7 @@
 xfs_buf_t *
 xfs_trans_getsb(
 	xfs_trans_t		*tp,
-	struct xfs_mount	*mp,
-	int			flags)
+	struct xfs_mount	*mp)
 {
 	xfs_buf_t		*bp;
 	struct xfs_buf_log_item	*bip;
@@ -185,7 +182,7 @@
 	 * if tp is NULL.
 	 */
 	if (tp == NULL)
-		return xfs_getsb(mp, flags);
+		return xfs_getsb(mp);
 
 	/*
 	 * If the superblock buffer already has this transaction
@@ -203,7 +200,7 @@
 		return bp;
 	}
 
-	bp = xfs_getsb(mp, flags);
+	bp = xfs_getsb(mp);
 	if (bp == NULL)
 		return NULL;
 
@@ -264,11 +261,39 @@
 			return -EIO;
 		}
 
+		/*
+		 * Check if the caller is trying to read a buffer that is
+		 * already attached to the transaction yet has no buffer ops
+		 * assigned.  Ops are usually attached when the buffer is
+		 * attached to the transaction, or by the read caller under
+		 * special circumstances.  That didn't happen, which is not
+		 * how this is supposed to go.
+		 *
+		 * If the buffer passes verification we'll let this go, but if
+		 * not we have to shut down.  Let the transaction cleanup code
+		 * release this buffer when it kills the transaction.
+		 */
+		ASSERT(bp->b_ops != NULL);
+		error = xfs_buf_reverify(bp, ops);
+		if (error) {
+			xfs_buf_ioerror_alert(bp, __func__);
+
+			if (tp->t_flags & XFS_TRANS_DIRTY)
+				xfs_force_shutdown(tp->t_mountp,
+						SHUTDOWN_META_IO_ERROR);
+
+			/* bad CRC means corrupted metadata */
+			if (error == -EFSBADCRC)
+				error = -EFSCORRUPTED;
+			return error;
+		}
+
 		bip = bp->b_log_item;
 		bip->bli_recur++;
 
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
 		trace_xfs_trans_read_buf_recur(bip);
+		ASSERT(bp->b_ops != NULL || ops == NULL);
 		*bpp = bp;
 		return 0;
 	}
@@ -316,11 +341,25 @@
 		_xfs_trans_bjoin(tp, bp, 1);
 		trace_xfs_trans_read_buf(bp->b_log_item);
 	}
+	ASSERT(bp->b_ops != NULL || ops == NULL);
 	*bpp = bp;
 	return 0;
 
 }
 
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_log_item;
+
+	if (!bip)
+		return false;
+	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+	return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
 /*
  * Release a buffer previously joined to the transaction. If the buffer is
  * modified within this transaction, decrement the recursion count but do not
@@ -386,7 +425,7 @@
 
 /*
  * Mark the buffer as not needing to be unlocked when the buf item's
- * iop_unlock() routine is called.  The buffer must already be locked
+ * iop_committing() routine is called.  The buffer must already be locked
  * and associated with the given transaction.
  */
 /* ARGSUSED */
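
Two behavioural additions in this file deserve a closer look: recursive
reads now re-verify an already-joined buffer with xfs_buf_reverify(), and
the new xfs_trans_buf_is_dirty() helper lets code outside the buf item
internals ask whether a transaction has logged changes to a buffer. A
sketch of the kind of caller the helper enables (hypothetical; the actual
call site is not part of this hunk):

	if (xfs_trans_buf_is_dirty(bp)) {
		/* someone already logged changes here; do not clobber them */
		xfs_trans_brelse(tp, bp);
		return 0;
	}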
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c23257a..1645746 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -11,7 +11,6 @@
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
@@ -29,7 +28,6 @@
 	xfs_trans_t	*tp,
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(dqp->q_transp != tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	ASSERT(dqp->q_logitem.qli_dquot == dqp);
 
@@ -37,15 +35,8 @@
 	 * Get a log_item_desc to point at the new item.
 	 */
 	xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
-
-	/*
-	 * Initialize d_transp so we can later determine if this dquot is
-	 * associated with this transaction.
-	 */
-	dqp->q_transp = tp;
 }
 
-
 /*
  * This is called to mark the dquot as needing
  * to be logged when the transaction is committed.  The dquot must
@@ -61,7 +52,6 @@
 	xfs_trans_t	*tp,
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(dqp->q_transp == tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
@@ -74,13 +64,13 @@
  */
 void
 xfs_trans_dup_dqinfo(
-	xfs_trans_t	*otp,
-	xfs_trans_t	*ntp)
+	struct xfs_trans	*otp,
+	struct xfs_trans	*ntp)
 {
-	xfs_dqtrx_t	*oq, *nq;
-	int		i, j;
-	xfs_dqtrx_t	*oqa, *nqa;
-	ulong		blk_res_used;
+	struct xfs_dqtrx	*oq, *nq;
+	int			i, j;
+	struct xfs_dqtrx	*oqa, *nqa;
+	uint64_t		blk_res_used;
 
 	if (!otp->t_dqinfo)
 		return;
@@ -137,7 +127,7 @@
 	xfs_trans_t	*tp,
 	xfs_inode_t	*ip,
 	uint		field,
-	long		delta)
+	int64_t		delta)
 {
 	xfs_mount_t	*mp = tp->t_mountp;
 
@@ -191,12 +181,12 @@
  */
 void
 xfs_trans_mod_dquot(
-	xfs_trans_t	*tp,
-	xfs_dquot_t	*dqp,
-	uint		field,
-	long		delta)
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	uint			field,
+	int64_t			delta)
 {
-	xfs_dqtrx_t	*qtrx;
+	struct xfs_dqtrx	*qtrx;
 
 	ASSERT(tp);
 	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
@@ -219,14 +209,14 @@
 		 * regular disk blk reservation
 		 */
 	      case XFS_TRANS_DQ_RES_BLKS:
-		qtrx->qt_blk_res += (ulong)delta;
+		qtrx->qt_blk_res += delta;
 		break;
 
 		/*
 		 * inode reservation
 		 */
 	      case XFS_TRANS_DQ_RES_INOS:
-		qtrx->qt_ino_res += (ulong)delta;
+		qtrx->qt_ino_res += delta;
 		break;
 
 		/*
@@ -245,7 +235,7 @@
 		 */
 	      case XFS_TRANS_DQ_ICOUNT:
 		if (qtrx->qt_ino_res && delta > 0) {
-			qtrx->qt_ino_res_used += (ulong)delta;
+			qtrx->qt_ino_res_used += delta;
 			ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
 		}
 		qtrx->qt_icount_delta += delta;
@@ -255,7 +245,7 @@
 		 * rtblk reservation
 		 */
 	      case XFS_TRANS_DQ_RES_RTBLKS:
-		qtrx->qt_rtblk_res += (ulong)delta;
+		qtrx->qt_rtblk_res += delta;
 		break;
 
 		/*
@@ -263,7 +253,7 @@
 		 */
 	      case XFS_TRANS_DQ_RTBCOUNT:
 		if (qtrx->qt_rtblk_res && delta > 0) {
-			qtrx->qt_rtblk_res_used += (ulong)delta;
+			qtrx->qt_rtblk_res_used += delta;
 			ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
 		}
 		qtrx->qt_rtbcount_delta += delta;
@@ -288,8 +278,8 @@
  */
 STATIC void
 xfs_trans_dqlockedjoin(
-	xfs_trans_t	*tp,
-	xfs_dqtrx_t	*q)
+	struct xfs_trans	*tp,
+	struct xfs_dqtrx	*q)
 {
 	ASSERT(q[0].qt_dquot != NULL);
 	if (q[1].qt_dquot == NULL) {
@@ -320,8 +310,8 @@
 	struct xfs_dquot	*dqp;
 	struct xfs_dqtrx	*qtrx, *qa;
 	struct xfs_disk_dquot	*d;
-	long			totalbdelta;
-	long			totalrtbdelta;
+	int64_t			totalbdelta;
+	int64_t			totalrtbdelta;
 
 	if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
 		return;
@@ -347,7 +337,6 @@
 				break;
 
 			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-			ASSERT(dqp->q_transp == tp);
 
 			/*
 			 * adjust the actual number of blocks used
@@ -413,7 +402,7 @@
 			 * reservation that a transaction structure knows of.
 			 */
 			if (qtrx->qt_blk_res != 0) {
-				ulong blk_res_used = 0;
+				uint64_t	blk_res_used = 0;
 
 				if (qtrx->qt_bcount_delta > 0)
 					blk_res_used = qtrx->qt_bcount_delta;
@@ -501,7 +490,7 @@
 {
 	int			i, j;
 	xfs_dquot_t		*dqp;
-	xfs_dqtrx_t		*qtrx, *qa;
+	struct xfs_dqtrx	*qtrx, *qa;
 	bool                    locked;
 
 	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
@@ -585,7 +574,7 @@
 	xfs_trans_t	*tp,
 	xfs_mount_t	*mp,
 	xfs_dquot_t	*dqp,
-	long		nblks,
+	int64_t		nblks,
 	long		ninos,
 	uint		flags)
 {
@@ -745,7 +734,7 @@
 	struct xfs_dquot	*udqp,
 	struct xfs_dquot	*gdqp,
 	struct xfs_dquot	*pdqp,
-	long			nblks,
+	int64_t			nblks,
 	long			ninos,
 	uint			flags)
 {
@@ -804,7 +793,7 @@
 xfs_trans_reserve_quota_nblks(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
-	long			nblks,
+	int64_t			nblks,
 	long			ninos,
 	uint			flags)
 {
@@ -874,7 +863,7 @@
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
 }
 
 void
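
The long/ulong to int64_t/uint64_t conversions in this file are not
cosmetic: on 32-bit kernels long is 32 bits, so a sufficiently large quota
block delta would silently overflow. A one-line illustration (userspace
types, for exposition only):

	/* 2^32 blocks: fits an int64_t, truncated by a 32-bit long */
	int64_t delta = 1LL << 32;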
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
deleted file mode 100644
index 855c0b6..0000000
--- a/fs/xfs/xfs_trans_extfree.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_extfree_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-
-/*
- * This routine is called to allocate an "extent free done"
- * log item that will hold nextents worth of extents.  The
- * caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-struct xfs_efd_log_item *
-xfs_trans_get_efd(struct xfs_trans		*tp,
-		  struct xfs_efi_log_item	*efip,
-		  uint				nextents)
-{
-	struct xfs_efd_log_item			*efdp;
-
-	ASSERT(tp != NULL);
-	ASSERT(nextents > 0);
-
-	efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
-	ASSERT(efdp != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &efdp->efd_item);
-	return efdp;
-}
-
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-int
-xfs_trans_free_extent(
-	struct xfs_trans	*tp,
-	struct xfs_efd_log_item	*efdp,
-	xfs_fsblock_t		start_block,
-	xfs_extlen_t		ext_len,
-	struct xfs_owner_info	*oinfo,
-	bool			skip_discard)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	uint			next_extent;
-	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, start_block);
-	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, start_block);
-	struct xfs_extent	*extp;
-	int			error;
-
-	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
-
-	error = __xfs_free_extent(tp, start_block, ext_len,
-				  oinfo, XFS_AG_RESV_NONE, skip_discard);
-	/*
-	 * Mark the transaction dirty, even on error. This ensures the
-	 * transaction is aborted, which:
-	 *
-	 * 1.) releases the EFI and frees the EFD
-	 * 2.) shuts down the filesystem
-	 */
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
-	next_extent = efdp->efd_next_extent;
-	ASSERT(next_extent < efdp->efd_format.efd_nextents);
-	extp = &(efdp->efd_format.efd_extents[next_extent]);
-	extp->ext_start = start_block;
-	extp->ext_len = ext_len;
-	efdp->efd_next_extent++;
-
-	return error;
-}
-
-/* Sort bmap items by AG. */
-static int
-xfs_extent_free_diff_items(
-	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
-{
-	struct xfs_mount		*mp = priv;
-	struct xfs_extent_free_item	*ra;
-	struct xfs_extent_free_item	*rb;
-
-	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
-	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
-}
-
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_efi_log_item		*efip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	efip = xfs_efi_init(tp->t_mountp, count);
-	ASSERT(efip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &efip->efi_item);
-	return efip;
-}
-
-/* Log a free extent to the intent item. */
-STATIC void
-xfs_extent_free_log_item(
-	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
-{
-	struct xfs_efi_log_item		*efip = intent;
-	struct xfs_extent_free_item	*free;
-	uint				next_extent;
-	struct xfs_extent		*extp;
-
-	free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
-	/*
-	 * atomic_inc_return gives us the value after the increment;
-	 * we want to use it as an array index so we need to subtract 1 from
-	 * it.
-	 */
-	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
-	ASSERT(next_extent < efip->efi_format.efi_nextents);
-	extp = &efip->efi_format.efi_extents[next_extent];
-	extp->ext_start = free->xefi_startblock;
-	extp->ext_len = free->xefi_blockcount;
-}
-
-/* Get an EFD so we can process all the free extents. */
-STATIC void *
-xfs_extent_free_create_done(
-	struct xfs_trans		*tp,
-	void				*intent,
-	unsigned int			count)
-{
-	return xfs_trans_get_efd(tp, intent, count);
-}
-
-/* Process a free extent. */
-STATIC int
-xfs_extent_free_finish_item(
-	struct xfs_trans		*tp,
-	struct list_head		*item,
-	void				*done_item,
-	void				**state)
-{
-	struct xfs_extent_free_item	*free;
-	int				error;
-
-	free = container_of(item, struct xfs_extent_free_item, xefi_list);
-	error = xfs_trans_free_extent(tp, done_item,
-			free->xefi_startblock,
-			free->xefi_blockcount,
-			&free->xefi_oinfo, free->xefi_skip_discard);
-	kmem_free(free);
-	return error;
-}
-
-/* Abort all pending EFIs. */
-STATIC void
-xfs_extent_free_abort_intent(
-	void				*intent)
-{
-	xfs_efi_release(intent);
-}
-
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_extent_free_item	*free;
-
-	free = container_of(item, struct xfs_extent_free_item, xefi_list);
-	kmem_free(free);
-}
-
-static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
-	.type		= XFS_DEFER_OPS_TYPE_FREE,
-	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_extent_free_diff_items,
-	.create_intent	= xfs_extent_free_create_intent,
-	.abort_intent	= xfs_extent_free_abort_intent,
-	.log_item	= xfs_extent_free_log_item,
-	.create_done	= xfs_extent_free_create_done,
-	.finish_item	= xfs_extent_free_finish_item,
-	.cancel_item	= xfs_extent_free_cancel_item,
-};
-
-/*
- * AGFL blocks are accounted differently in the reserve pools and are not
- * inserted into the busy extent list.
- */
-STATIC int
-xfs_agfl_free_finish_item(
-	struct xfs_trans		*tp,
-	struct list_head		*item,
-	void				*done_item,
-	void				**state)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_efd_log_item		*efdp = done_item;
-	struct xfs_extent_free_item	*free;
-	struct xfs_extent		*extp;
-	struct xfs_buf			*agbp;
-	int				error;
-	xfs_agnumber_t			agno;
-	xfs_agblock_t			agbno;
-	uint				next_extent;
-
-	free = container_of(item, struct xfs_extent_free_item, xefi_list);
-	ASSERT(free->xefi_blockcount == 1);
-	agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
-	agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
-
-	trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
-
-	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
-	if (!error)
-		error = xfs_free_agfl_block(tp, agno, agbno, agbp,
-					    &free->xefi_oinfo);
-
-	/*
-	 * Mark the transaction dirty, even on error. This ensures the
-	 * transaction is aborted, which:
-	 *
-	 * 1.) releases the EFI and frees the EFD
-	 * 2.) shuts down the filesystem
-	 */
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
-	next_extent = efdp->efd_next_extent;
-	ASSERT(next_extent < efdp->efd_format.efd_nextents);
-	extp = &(efdp->efd_format.efd_extents[next_extent]);
-	extp->ext_start = free->xefi_startblock;
-	extp->ext_len = free->xefi_blockcount;
-	efdp->efd_next_extent++;
-
-	kmem_free(free);
-	return error;
-}
-
-
-/* sub-type with special handling for AGFL deferred frees */
-static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
-	.type		= XFS_DEFER_OPS_TYPE_AGFL_FREE,
-	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_extent_free_diff_items,
-	.create_intent	= xfs_extent_free_create_intent,
-	.abort_intent	= xfs_extent_free_abort_intent,
-	.log_item	= xfs_extent_free_log_item,
-	.create_done	= xfs_extent_free_create_done,
-	.finish_item	= xfs_agfl_free_finish_item,
-	.cancel_item	= xfs_extent_free_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_extent_free_init_defer_op(void)
-{
-	xfs_defer_init_op_type(&xfs_extent_free_defer_type);
-	xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 091eae9..2e073c1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -16,12 +16,10 @@
 void	xfs_trans_init(struct xfs_mount *);
 void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
-void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-				bool abort);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
-				xfs_lsn_t commit_lsn, int aborted);
+				xfs_lsn_t commit_lsn, bool aborted);
 /*
  * AIL traversal cursor.
  *
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
deleted file mode 100644
index 523c556..0000000
--- a/fs/xfs/xfs_trans_refcount.c
+++ /dev/null
@@ -1,248 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_refcount_item.h"
-#include "xfs_alloc.h"
-#include "xfs_refcount.h"
-
-/*
- * This routine is called to allocate a "refcount update done"
- * log item.
- */
-struct xfs_cud_log_item *
-xfs_trans_get_cud(
-	struct xfs_trans		*tp,
-	struct xfs_cui_log_item		*cuip)
-{
-	struct xfs_cud_log_item		*cudp;
-
-	cudp = xfs_cud_init(tp->t_mountp, cuip);
-	xfs_trans_add_item(tp, &cudp->cud_item);
-	return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_refcount_update(
-	struct xfs_trans		*tp,
-	struct xfs_cud_log_item		*cudp,
-	enum xfs_refcount_intent_type	type,
-	xfs_fsblock_t			startblock,
-	xfs_extlen_t			blockcount,
-	xfs_fsblock_t			*new_fsb,
-	xfs_extlen_t			*new_len,
-	struct xfs_btree_cur		**pcur)
-{
-	int				error;
-
-	error = xfs_refcount_finish_one(tp, type, startblock,
-			blockcount, new_fsb, new_len, pcur);
-
-	/*
-	 * Mark the transaction dirty, even on error. This ensures the
-	 * transaction is aborted, which:
-	 *
-	 * 1.) releases the CUI and frees the CUD
-	 * 2.) shuts down the filesystem
-	 */
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
-	return error;
-}
-
-/* Sort refcount intents by AG. */
-static int
-xfs_refcount_update_diff_items(
-	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
-{
-	struct xfs_mount		*mp = priv;
-	struct xfs_refcount_intent	*ra;
-	struct xfs_refcount_intent	*rb;
-
-	ra = container_of(a, struct xfs_refcount_intent, ri_list);
-	rb = container_of(b, struct xfs_refcount_intent, ri_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
-}
-
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_cui_log_item		*cuip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	cuip = xfs_cui_init(tp->t_mountp, count);
-	ASSERT(cuip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &cuip->cui_item);
-	return cuip;
-}
-
-/* Set the phys extent flags for this reverse mapping. */
-static void
-xfs_trans_set_refcount_flags(
-	struct xfs_phys_extent		*refc,
-	enum xfs_refcount_intent_type	type)
-{
-	refc->pe_flags = 0;
-	switch (type) {
-	case XFS_REFCOUNT_INCREASE:
-	case XFS_REFCOUNT_DECREASE:
-	case XFS_REFCOUNT_ALLOC_COW:
-	case XFS_REFCOUNT_FREE_COW:
-		refc->pe_flags |= type;
-		break;
-	default:
-		ASSERT(0);
-	}
-}
-
-/* Log refcount updates in the intent item. */
-STATIC void
-xfs_refcount_update_log_item(
-	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
-{
-	struct xfs_cui_log_item		*cuip = intent;
-	struct xfs_refcount_intent	*refc;
-	uint				next_extent;
-	struct xfs_phys_extent		*ext;
-
-	refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
-	/*
-	 * atomic_inc_return gives us the value after the increment;
-	 * we want to use it as an array index so we need to subtract 1 from
-	 * it.
-	 */
-	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
-	ASSERT(next_extent < cuip->cui_format.cui_nextents);
-	ext = &cuip->cui_format.cui_extents[next_extent];
-	ext->pe_startblock = refc->ri_startblock;
-	ext->pe_len = refc->ri_blockcount;
-	xfs_trans_set_refcount_flags(ext, refc->ri_type);
-}
-
-/* Get a CUD so we can process all the deferred refcount updates. */
-STATIC void *
-xfs_refcount_update_create_done(
-	struct xfs_trans		*tp,
-	void				*intent,
-	unsigned int			count)
-{
-	return xfs_trans_get_cud(tp, intent);
-}
-
-/* Process a deferred refcount update. */
-STATIC int
-xfs_refcount_update_finish_item(
-	struct xfs_trans		*tp,
-	struct list_head		*item,
-	void				*done_item,
-	void				**state)
-{
-	struct xfs_refcount_intent	*refc;
-	xfs_fsblock_t			new_fsb;
-	xfs_extlen_t			new_aglen;
-	int				error;
-
-	refc = container_of(item, struct xfs_refcount_intent, ri_list);
-	error = xfs_trans_log_finish_refcount_update(tp, done_item,
-			refc->ri_type,
-			refc->ri_startblock,
-			refc->ri_blockcount,
-			&new_fsb, &new_aglen,
-			(struct xfs_btree_cur **)state);
-	/* Did we run out of reservation?  Requeue what we didn't finish. */
-	if (!error && new_aglen > 0) {
-		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
-		       refc->ri_type == XFS_REFCOUNT_DECREASE);
-		refc->ri_startblock = new_fsb;
-		refc->ri_blockcount = new_aglen;
-		return -EAGAIN;
-	}
-	kmem_free(refc);
-	return error;
-}
-
-/* Clean up after processing deferred refcounts. */
-STATIC void
-xfs_refcount_update_finish_cleanup(
-	struct xfs_trans	*tp,
-	void			*state,
-	int			error)
-{
-	struct xfs_btree_cur	*rcur = state;
-
-	xfs_refcount_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending CUIs. */
-STATIC void
-xfs_refcount_update_abort_intent(
-	void				*intent)
-{
-	xfs_cui_release(intent);
-}
-
-/* Cancel a deferred refcount update. */
-STATIC void
-xfs_refcount_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_refcount_intent	*refc;
-
-	refc = container_of(item, struct xfs_refcount_intent, ri_list);
-	kmem_free(refc);
-}
-
-static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
-	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT,
-	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_refcount_update_diff_items,
-	.create_intent	= xfs_refcount_update_create_intent,
-	.abort_intent	= xfs_refcount_update_abort_intent,
-	.log_item	= xfs_refcount_update_log_item,
-	.create_done	= xfs_refcount_update_create_done,
-	.finish_item	= xfs_refcount_update_finish_item,
-	.finish_cleanup = xfs_refcount_update_finish_cleanup,
-	.cancel_item	= xfs_refcount_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_refcount_update_init_defer_op(void)
-{
-	xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
-}
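The deletion above does not remove the refcount deferred-op code from the kernel: in v5.4 these handlers live in fs/xfs/xfs_refcount_item.c, and the xfs_defer_op_type table is wired into a static defer_op_types[] array in fs/xfs/libxfs/xfs_defer.c rather than registered through xfs_refcount_update_init_defer_op(). The contract worth noting is the -EAGAIN requeue in xfs_refcount_update_finish_item(): a handler that returns -EAGAIN must first rewrite the intent in place so the remaining work can be retried. A self-contained toy sketch of that idiom (the toy_* names are illustrative, not XFS API):

	#include <errno.h>

	struct toy_intent {
		unsigned long	start;	/* next block to process */
		unsigned long	count;	/* blocks still to do */
	};

	/* Process up to `budget` blocks; ask to be requeued if work remains. */
	static int toy_finish_item(struct toy_intent *it, unsigned long budget)
	{
		unsigned long done = it->count < budget ? it->count : budget;

		it->start += done;
		it->count -= done;
		return it->count ? -EAGAIN : 0;
	}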
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
deleted file mode 100644
index 05b00e4..0000000
--- a/fs/xfs/xfs_trans_rmap.c
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle.  All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_rmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_rmap.h"
-
-/* Set the map extent flags for this reverse mapping. */
-static void
-xfs_trans_set_rmap_flags(
-	struct xfs_map_extent		*rmap,
-	enum xfs_rmap_intent_type	type,
-	int				whichfork,
-	xfs_exntst_t			state)
-{
-	rmap->me_flags = 0;
-	if (state == XFS_EXT_UNWRITTEN)
-		rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
-	if (whichfork == XFS_ATTR_FORK)
-		rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
-	switch (type) {
-	case XFS_RMAP_MAP:
-		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
-		break;
-	case XFS_RMAP_MAP_SHARED:
-		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
-		break;
-	case XFS_RMAP_UNMAP:
-		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
-		break;
-	case XFS_RMAP_UNMAP_SHARED:
-		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
-		break;
-	case XFS_RMAP_CONVERT:
-		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
-		break;
-	case XFS_RMAP_CONVERT_SHARED:
-		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
-		break;
-	case XFS_RMAP_ALLOC:
-		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
-		break;
-	case XFS_RMAP_FREE:
-		rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
-		break;
-	default:
-		ASSERT(0);
-	}
-}
-
-struct xfs_rud_log_item *
-xfs_trans_get_rud(
-	struct xfs_trans		*tp,
-	struct xfs_rui_log_item		*ruip)
-{
-	struct xfs_rud_log_item		*rudp;
-
-	rudp = xfs_rud_init(tp->t_mountp, ruip);
-	xfs_trans_add_item(tp, &rudp->rud_item);
-	return rudp;
-}
-
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails, to
- * support the RUI/RUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_rmap_update(
-	struct xfs_trans		*tp,
-	struct xfs_rud_log_item		*rudp,
-	enum xfs_rmap_intent_type	type,
-	uint64_t			owner,
-	int				whichfork,
-	xfs_fileoff_t			startoff,
-	xfs_fsblock_t			startblock,
-	xfs_filblks_t			blockcount,
-	xfs_exntst_t			state,
-	struct xfs_btree_cur		**pcur)
-{
-	int				error;
-
-	error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
-			startblock, blockcount, state, pcur);
-
-	/*
-	 * Mark the transaction dirty, even on error. This ensures the
-	 * transaction is aborted, which:
-	 *
-	 * 1.) releases the RUI and frees the RUD
-	 * 2.) shuts down the filesystem
-	 */
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
-	return error;
-}
-
-/* Sort rmap intents by AG. */
-static int
-xfs_rmap_update_diff_items(
-	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
-{
-	struct xfs_mount		*mp = priv;
-	struct xfs_rmap_intent		*ra;
-	struct xfs_rmap_intent		*rb;
-
-	ra = container_of(a, struct xfs_rmap_intent, ri_list);
-	rb = container_of(b, struct xfs_rmap_intent, ri_list);
-	return  XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
-		XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
-}
-
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_rui_log_item		*ruip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	ruip = xfs_rui_init(tp->t_mountp, count);
-	ASSERT(ruip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &ruip->rui_item);
-	return ruip;
-}
-
-/* Log rmap updates in the intent item. */
-STATIC void
-xfs_rmap_update_log_item(
-	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
-{
-	struct xfs_rui_log_item		*ruip = intent;
-	struct xfs_rmap_intent		*rmap;
-	uint				next_extent;
-	struct xfs_map_extent		*map;
-
-	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
-	tp->t_flags |= XFS_TRANS_DIRTY;
-	set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
-	/*
-	 * atomic_inc_return gives us the value after the increment;
-	 * we want to use it as an array index so we need to subtract 1 from
-	 * it.
-	 */
-	next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
-	ASSERT(next_extent < ruip->rui_format.rui_nextents);
-	map = &ruip->rui_format.rui_extents[next_extent];
-	map->me_owner = rmap->ri_owner;
-	map->me_startblock = rmap->ri_bmap.br_startblock;
-	map->me_startoff = rmap->ri_bmap.br_startoff;
-	map->me_len = rmap->ri_bmap.br_blockcount;
-	xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
-			rmap->ri_bmap.br_state);
-}
-
-/* Get an RUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_rmap_update_create_done(
-	struct xfs_trans		*tp,
-	void				*intent,
-	unsigned int			count)
-{
-	return xfs_trans_get_rud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_rmap_update_finish_item(
-	struct xfs_trans		*tp,
-	struct list_head		*item,
-	void				*done_item,
-	void				**state)
-{
-	struct xfs_rmap_intent		*rmap;
-	int				error;
-
-	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-	error = xfs_trans_log_finish_rmap_update(tp, done_item,
-			rmap->ri_type,
-			rmap->ri_owner, rmap->ri_whichfork,
-			rmap->ri_bmap.br_startoff,
-			rmap->ri_bmap.br_startblock,
-			rmap->ri_bmap.br_blockcount,
-			rmap->ri_bmap.br_state,
-			(struct xfs_btree_cur **)state);
-	kmem_free(rmap);
-	return error;
-}
-
-/* Clean up after processing deferred rmaps. */
-STATIC void
-xfs_rmap_update_finish_cleanup(
-	struct xfs_trans	*tp,
-	void			*state,
-	int			error)
-{
-	struct xfs_btree_cur	*rcur = state;
-
-	xfs_rmap_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending RUIs. */
-STATIC void
-xfs_rmap_update_abort_intent(
-	void				*intent)
-{
-	xfs_rui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_rmap_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_rmap_intent		*rmap;
-
-	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-	kmem_free(rmap);
-}
-
-static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
-	.type		= XFS_DEFER_OPS_TYPE_RMAP,
-	.max_items	= XFS_RUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_rmap_update_diff_items,
-	.create_intent	= xfs_rmap_update_create_intent,
-	.abort_intent	= xfs_rmap_update_abort_intent,
-	.log_item	= xfs_rmap_update_log_item,
-	.create_done	= xfs_rmap_update_create_done,
-	.finish_item	= xfs_rmap_update_finish_item,
-	.finish_cleanup = xfs_rmap_update_finish_cleanup,
-	.cancel_item	= xfs_rmap_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_rmap_update_init_defer_op(void)
-{
-	xfs_defer_init_op_type(&xfs_rmap_update_defer_type);
-}
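As with the refcount handlers above, this file's contents were folded into fs/xfs/xfs_rmap_item.c upstream rather than removed. One design point: diff_items comparators such as xfs_rmap_update_diff_items() order intents by allocation group so that AG locks are always taken in ascending order, avoiding deadlocks when a transaction touches several AGs. The defer machinery applies them with list_sort(); roughly:

	/* Sketch: how xfs_defer.c sorts queued work before processing it
	 * (`items` is an assumed struct list_head of pending intents). */
	list_sort(mp, &items, xfs_rmap_update_diff_items);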
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5..cb895b1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -5,15 +5,12 @@
  */
 
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_acl.h"
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
@@ -33,7 +30,7 @@
 		value = NULL;
 	}
 
-	error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+	error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
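The reworked call reflects the v5.4 xfs_attr_get() API: the name is now taken as a const string (so the old cast goes away), and the value is passed as unsigned char ** so the attr code can allocate the buffer on demand when the caller does not supply one. A self-contained toy of the double-pointer idiom (toy_get() is illustrative, not the XFS function):

	#include <stdlib.h>
	#include <string.h>

	/* If *buf is NULL, allocate it; either way the length comes back
	 * through *len and a caller-owned buffer through *buf. */
	static int toy_get(const char *src, unsigned char **buf, int *len)
	{
		size_t n = strlen(src);

		if (!*buf && !(*buf = malloc(n)))
			return -1;
		memcpy(*buf, src, n);
		*len = (int)n;
		return 0;
	}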
@@ -129,6 +126,9 @@
 	char *offset;
 	int arraytop;
 
+	if (context->count < 0 || context->seen_enough)
+		return;
+
 	if (!context->alist)
 		goto compute_size;
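The new guard makes xfs_xattr_put_listent() a no-op once a previous invocation has flagged trouble: a negative count marks an earlier error, and seen_enough means a prior callback already ran out of listxattr buffer space, so later callbacks must not write past the user buffer. The latch pattern, as a self-contained toy (toy_* names are illustrative):

	struct toy_ctx {
		int	count;		/* bytes emitted so far */
		int	bufsize;	/* capacity of the user buffer */
		int	seen_enough;	/* set once we would overflow */
	};

	static void toy_put_listent(struct toy_ctx *c, int namelen)
	{
		if (c->count < 0 || c->seen_enough)
			return;			/* latched: do nothing */
		if (c->count + namelen > c->bufsize) {
			c->seen_enough = 1;	/* latch instead of overflowing */
			return;
		}
		c->count += namelen;		/* normally: copy name, advance */
	}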